import json
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlencode
from urllib.request import Request
from urllib.request import urlopen

# Base URL of the target service
base_url = 'http://192.168.1.153:3000'
# Local directory where the data is stored
base_folder = r'H:\jy'
# Maximum number of threads in the thread pool
thread_count = 3
# User id (no observed effect so far, no need to change it)
user_id = 28

headers = {
    # 'User-Agent': UserAgent().chrome
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36',
    # Cookie for simulated login; when it expires, log in via the web page, grab a fresh cookie and replace it here
    'Cookie': '_iChad_session=BAh7CCIPc2Vzc2lvbl9pZCIlOWExYTE5MGZlZDE1YTRmZGMxMjY4MDRmZTQxN2JmNjciGXdhcmRlbi51c2VyLnVzZXIua2V5WwgiCVVzZXJbBmkhIiIkMmEkMTAkUXRJMkkyNTQ0SGJoRkVCOHo2d0REdSIQX2NzcmZfdG9rZW4iMWRXQkRhL2NjbEdmL3k4ZWZrYXgzVE1rWksrdkxhL0FzclhPQWxSVkRXRmc9--8bf8e98234e136cc2512da7e36b33bc1580fa5da'
}

# Characters that are illegal in file names: / \ : * ? " < > |
rstr = r"[\/\\\:\*\?\"\<\>\|]"


# Write contents to a file
def save_to_file(file_name, contents):
    with open(file_name, 'wb') as fh:
        fh.write(contents.encode('utf-8'))
    print('Finished writing data to [' + file_name + ']')


# Create a directory if it does not exist yet
def mkdir(file_name):
    if not os.path.exists(file_name):
        os.makedirs(file_name)
        print('Created directory ' + file_name)
    else:
        print(file_name + ' already exists, skipping creation')


# Download a URL to a local file
def download(url, file_path):
    print('Start downloading ' + url)
    data = urlopen(url).read()
    with open(file_path, 'wb') as code:
        code.write(data)


# Send a GET request and return the (patched) response text
def get(url):
    # Build the request
    request = Request(url, headers=headers)
    # Send it
    response = urlopen(request)
    # Read the response body
    response_str = response.read()
    # Decode unicode
    # response_str = response_str.decode().encode("utf-8").decode("unicode_escape")
    response_str = response_str.decode()
    # Patch the response into valid JSON (the server returns unquoted keys)
    response_str = response_str.replace('results', '"results"')
    response_str = response_str.replace('rows', '"rows"')
    return response_str


# Send a POST request and return the raw response bytes
def post(url, body):
    # Submit the POST body as form data
    headers['Content-Type'] = 'application/x-www-form-urlencoded;charset=utf-8'
    data = urlencode(body).encode('utf-8')
    request = Request(url, headers=headers, data=data, method='POST')
    response = urlopen(request)
    return response.read()


# Fetch all archive types
def getArchiveTypes():
    api_path = '/desktop/get_dalb_print_grid'
    params = {
        '_dc': 1637722889393,
        'userid': user_id,
        'page': 1,
        'start': 0,
        'limit': 99999
    }
    finally_url = base_url + api_path + '?' + urlencode(params)
    result = get(finally_url)
    response_obj = json.loads(result)
    for index, archive_type in enumerate(response_obj['rows']):
        if index < 3:
            print('Skipping ' + archive_type['lbmc'])
            continue
        print('Start crawling data for [' + archive_type['lbmc'] + ']')
        type_folder = base_folder + '\\' + str(archive_type['id']) + '_' + archive_type['lbmc']
        mkdir(type_folder)
        getArchives(archive_type['id'], type_folder)
    # with ThreadPoolExecutor(max_workers=thread_count) as t:
    #     for archive_type in response_obj['rows']:
    #         print('Start crawling data for [' + archive_type['lbmc'] + ']')
    #         type_folder = base_folder + '\\' + str(archive_type['id']) + '_' + archive_type['lbmc']
    #         mkdir(type_folder)
    #         t.submit(getArchives, archive_type['id'], type_folder)


# Fetch all archive records of a given archive type (paged, 1000 per page)
def getArchives(archive_type, folder):
    api_path = '/desktop/archive_query_jygl'
    for pageIndex in range(5):
        params = {
            "dalb": archive_type,
            "userid": user_id,
            "page": pageIndex + 1,
            "start": pageIndex * 1000,
            "limit": 1000,
        }
        finally_url = base_url + api_path + '?' + urlencode(params)
        result = get(finally_url)
        response_obj = json.loads(result)
        print('Finished reading data of this type, ' + str(response_obj['results']) + ' records in total')
        for index, row in enumerate(response_obj['rows']):
            # Folder for the current record; strip characters that are illegal in file names
            tm = re.sub(rstr, "_", row['tm'])
            data_folder = folder + '\\' + str(index + (pageIndex * 1000)) + '_' + tm
            mkdir(data_folder)
            getArchiveDetails(row, data_folder)
        # with ThreadPoolExecutor(max_workers=thread_count) as t:
        #     for row in response_obj['rows']:
        #         # Folder for the current record
        #         data_folder = folder + '\\' + row['tm']
        #         mkdir(data_folder)
        #         # getArchiveDetails(row, data_folder)
        #         t.submit(getArchiveDetails, row, data_folder)


# Fetch the detailed information of a record by id
def getArchiveDetails(row, data_folder):
    print('Start reading details of [' + row['tm'] + ']')
    local_json_path = data_folder + '\\' + 'data.json'
    if not os.path.exists(local_json_path):
        api_path = '/desktop/get_archivebyid'
        body = {
            'id': row['id'],
            'dh': row['dh'],
            'userid': user_id,
        }
        finally_url = base_url + api_path
        result = post(finally_url, body)
        # Save the data to a file
        save_to_file(local_json_path, result.decode())
    else:
        print(f'{row["tm"]} JSON data already exists, skipping save')
    # Fetch the in-volume catalogue data
    getArchiveInner(row['id'], data_folder)
    print('Start downloading the files of [' + row['tm'] + ']')
    getArchiveFile(row['dh'], data_folder)


# Fetch the set of image files attached to an archive record
def getArchiveFile(dh, data_folder):
    api_path = '/desktop/get_yx_tree?_dc=1637737132431'
    body = {
        'node': 'root',
        'dh': dh
    }
    finally_url = base_url + api_path
    result = post(finally_url, body).decode()
    # The server returns single-quoted JSON; switch to double quotes before parsing
    result = result.replace('\'', '\"')
    response_obj = json.loads(result)
    for file_node in response_obj:
        for index, file_item in enumerate(file_node['children']):
            item_id = file_item['id']
            file_id = item_id[:str(item_id).index('|')]
            file_name = file_item['text']
            cs = 0
            while cs < 3:
                try:
                    getArchiveFileDownloadUrl(file_id, data_folder, str(index) + file_name)
                    cs = 3  # success, exit the retry loop
                except BaseException:
                    print(f'{file_name} failed, sleeping 3 seconds before retrying')
                    cs += 1  # count the failure
                    time.sleep(3)
    # with ThreadPoolExecutor(max_workers=thread_count) as t:
    #     for file_node in response_obj:
    #         for file_item in file_node['children']:
    #             item_id = file_item['id']
    #             file_id = item_id[:str(item_id).index('|')]
    #             file_name = file_item['text']
    #             t.submit(getArchiveFileDownloadUrl, file_id, data_folder, file_name)


# Resolve the download URL of a file and download it
def getArchiveFileDownloadUrl(file_id, data_folder, file_name):
    api_path = '/desktop/get_timage_from_db'
    local_path = data_folder + '\\' + file_name + '.jpg'
    if os.path.exists(local_path):
        print(file_name + ' has already been downloaded, skipping')
        return
    body = {
        'gid': file_id,
        'userid': user_id
    }
    finally_url = base_url + api_path
    response_str = post(finally_url, body)
    img_path = response_str.decode().replace('assets/./', 'assets/')
    download(base_url + img_path, local_path)


# Fetch the in-volume catalogue data of an archive record
def getArchiveInner(archive_id, data_folder):
    print('Start reading the in-volume catalogue data of ' + str(archive_id))
    inner_json_path = data_folder + '\\' + 'inner.json'
    if not os.path.exists(inner_json_path):
        api_path = '/desktop/get_document'
        params = {
            "query": archive_id,
            "page": 1,
            "start": 0,
            "limit": 99999,
        }
        finally_url = base_url + api_path + '?' + urlencode(params)
        response_str = get(finally_url)
        save_to_file(inner_json_path, response_str)
    else:
        print(f'{archive_id} in-volume catalogue JSON already exists, skipping save')


if __name__ == '__main__':
    getArchiveTypes()
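
# --- Optional: threaded crawl (sketch) --------------------------------------
# A minimal sketch assembled from the ThreadPoolExecutor blocks left
# commented out above: crawl each archive type in its own worker thread.
# It is untested against the live system; to try it, move the definition
# above the __main__ guard and call it there instead of getArchiveTypes().
#
# def getArchiveTypesThreaded():
#     api_path = '/desktop/get_dalb_print_grid'
#     params = {'_dc': 1637722889393, 'userid': user_id,
#               'page': 1, 'start': 0, 'limit': 99999}
#     response_obj = json.loads(get(base_url + api_path + '?' + urlencode(params)))
#     with ThreadPoolExecutor(max_workers=thread_count) as t:
#         for archive_type in response_obj['rows']:
#             type_folder = base_folder + '\\' + str(archive_type['id']) + '_' + archive_type['lbmc']
#             mkdir(type_folder)
#             t.submit(getArchives, archive_type['id'], type_folder)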