|
@@ -0,0 +1,244 @@
|
|
|
|
|
+import json
|
|
|
|
|
+import os
|
|
|
|
|
+from concurrent.futures import ThreadPoolExecutor
|
|
|
|
|
+from urllib.parse import urlencode
|
|
|
|
|
+from urllib.request import Request
|
|
|
|
|
+from urllib.request import urlopen
|
|
|
|
|
+import re
|
|
|
|
|
+import time
|
|
|
|
|
+
|
|
|
|
|
# Base URL of the target archive server.
base_url = 'http://192.168.1.153:3000'

# Root directory where crawled data is stored.
# Raw string: the original literal 'H:\jy' used the invalid escape '\j'
# (SyntaxWarning today, a hard error in future CPython); value unchanged.
base_folder = r'H:\jy'

# Maximum worker threads for the (currently commented-out) thread pools.
thread_count = 3

# User id sent with every request (no observed server-side effect).
user_id = 28

headers = {
    # 'User-Agent': UserAgent().chrome
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36',
    # Session cookie used as login; when it expires, log in via the
    # browser and paste the fresh cookie here.
    'Cookie': '_iChad_session=BAh7CCIPc2Vzc2lvbl9pZCIlOWExYTE5MGZlZDE1YTRmZGMxMjY4MDRmZTQxN2JmNjciGXdhcmRlbi51c2VyLnVzZXIua2V5WwgiCVVzZXJbBmkhIiIkMmEkMTAkUXRJMkkyNTQ0SGJoRkVCOHo2d0REdSIQX2NzcmZfdG9rZW4iMWRXQkRhL2NjbEdmL3k4ZWZrYXgzVE1rWksrdkxhL0FzclhPQWxSVkRXRmc9--8bf8e98234e136cc2512da7e36b33bc1580fa5da'
}

# Characters that are illegal in Windows file names: / \ : * ? " < > |
rstr = r"[\/\\\:\*\?\"\<\>\|]"
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Write text to a file.
def save_to_file(file_name, contents):
    """Write *contents* (str) to *file_name*, UTF-8 encoded.

    The file is created or truncated. A context manager is used so the
    handle is closed even when the write raises (the original opened
    the file without `with` and leaked the handle on error).
    """
    with open(file_name, 'wb') as fh:
        fh.write(contents.encode('utf-8'))
    print('写入数据到【' + file_name + '】完成')
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Create a directory.
def mkdir(file_name):
    """Create directory *file_name* (and any missing parents) if absent."""
    if not os.path.exists(file_name):
        # exist_ok=True guards against the race where another thread
        # creates the directory between the check above and this call.
        os.makedirs(file_name, exist_ok=True)
        print('创建目录' + file_name)
    else:
        print(file_name + '目录已经存在,跳过创建')
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Download a URL to a local file.
def download(url, file_path):
    """Fetch *url* and write the raw response bytes to *file_path*.

    The whole payload is buffered in memory before writing; fine for
    the image-sized files this crawler handles.
    """
    print('开始下载' + url)
    # Context manager closes the HTTP response (the original leaked it).
    with urlopen(url) as resp:
        data = resp.read()
    with open(file_path, "wb") as code:
        code.write(data)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Send a GET request.
def get(url):
    """GET *url* with the shared module headers; return the body as text.

    The server replies with pseudo-JSON whose ``results`` and ``rows``
    keys are unquoted, so quotes are spliced in here to make the text
    parseable by ``json.loads``.
    """
    request = Request(url, headers=headers)
    # Context manager closes the HTTP response (the original leaked it).
    with urlopen(request) as response:
        response_str = response.read().decode()
    # Quote the bare keys so the payload becomes valid JSON.
    response_str = response_str.replace('results', '"results"')
    response_str = response_str.replace('rows', '"rows"')
    return response_str
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Send a POST request.
def post(url, body):
    """POST *body* (dict) to *url* as form data; return raw response bytes.

    A copy of the shared headers is used so the Content-Type set here
    does not leak into later GET requests — the original mutated the
    module-level ``headers`` dict on every call.
    """
    post_headers = dict(headers)
    # Form-data submission.
    post_headers['Content-Type'] = 'application/x-www-form-urlencoded;charset=utf-8'
    data = urlencode(body).encode('utf-8')
    request = Request(url, headers=post_headers, data=data, method='POST')
    # Context manager closes the HTTP response (the original leaked it).
    with urlopen(request) as response:
        return response.read()
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Fetch all archive categories and crawl each one.
def getArchiveTypes():
    """Fetch every archive category from the server and crawl each.

    The first three categories are deliberately skipped (already
    fetched in a previous run); adjust the index guard to re-crawl
    them. Crawling is sequential here — a thread-pool variant existed
    but is disabled.
    """
    api_path = '/desktop/get_dalb_print_grid'
    query = urlencode({
        '_dc': 1637722889393,
        'userid': user_id,
        'page': 1,
        'start': 0,
        'limit': 99999,
    })
    response_obj = json.loads(get(base_url + api_path + '?' + query))
    for index, archive_type in enumerate(response_obj['rows']):
        if index < 3:
            print('跳过' + archive_type['lbmc'])
            continue
        print('开始爬取【' + archive_type['lbmc'] + '】数据')
        # One folder per category: <base>\<id>_<category name>.
        type_folder = base_folder + '\\' + str(archive_type['id']) + '_' + archive_type['lbmc']
        mkdir(type_folder)
        getArchives(archive_type['id'], type_folder)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Fetch all archive records for one category.
def getArchives(type, folder):
    """Crawl every archive record of category *type* into *folder*.

    Pages through the listing endpoint 1000 records at a time; the
    page count is hard-coded to 5, i.e. at most 5000 records per
    category. A thread-pool variant existed but is disabled.
    """
    api_path = '/desktop/archive_query_jygl'
    page_size = 1000
    for page in range(5):
        query = urlencode({
            "dalb": type,
            "userid": user_id,
            "page": page + 1,
            "start": page * page_size,
            "limit": page_size,
        })
        response_obj = json.loads(get(base_url + api_path + '?' + query))
        print('读取该类型下数据完成,共有' + str(response_obj['results']) + '条数据')
        for index, row in enumerate(response_obj['rows']):
            # Folder for this record; strip characters that Windows
            # forbids in file names.
            safe_title = re.sub(rstr, "_", row['tm'])
            data_folder = folder + '\\' + str(index + page * page_size) + '_' + safe_title
            mkdir(data_folder)
            getArchiveDetails(row, data_folder)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Fetch the detail record by id.
def getArchiveDetails(row, data_folder):
    """Fetch and persist the detail record for *row*, then its inner
    catalogue and attached files.

    The detail request is skipped when data.json already exists, so an
    interrupted crawl can resume without refetching.
    """
    print('开始读取【' + row['tm'] + '】详细信息')
    local_json_path = data_folder + '\\' + 'data.json'
    if os.path.exists(local_json_path):
        print(f'{row["tm"]} json数据已经存在,跳过保存')
    else:
        payload = {
            'id': row['id'],
            'dh': row['dh'],
            'userid': user_id,
        }
        result = post(base_url + '/desktop/get_archivebyid', payload)
        # Persist the raw detail payload alongside the downloads.
        save_to_file(local_json_path, result.decode())
    # Inner catalogue entries for this archive.
    getArchiveInner(row['id'], data_folder)
    print('开始下载【' + row['tm'] + '】的文件')
    getArchiveFile(row['dh'], data_folder)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Fetch the file tree of an archive and download every file.
def getArchiveFile(dh, data_folder):
    """Download every image file attached to archive number *dh*.

    The tree endpoint emits single-quoted pseudo-JSON, so quotes are
    normalised before parsing. Each file is retried up to 3 times with
    a 3-second pause; after 3 failures the file is skipped.
    """
    api_path = '/desktop/get_yx_tree?_dc=1637737132431'
    body = {
        'node': 'root',
        'dh': dh
    }
    result = post(base_url + api_path, body).decode()
    # Normalise single quotes so json.loads can parse the payload.
    result = result.replace('\'', '\"')
    response_obj = json.loads(result)
    for file in response_obj:
        for index, file_item in enumerate(file['children']):
            node_id = file_item['id']
            # Node id has the form "<file_id>|<extra>"; keep the id part.
            file_id = node_id[:node_id.index('|')]
            file_name = file_item['text']
            attempts = 0
            while attempts < 3:
                try:
                    getArchiveFileDownloadUrl(file_id, data_folder, str(index) + file_name)
                    break  # success: stop retrying
                except Exception:
                    # Exception, not BaseException: the original also
                    # swallowed KeyboardInterrupt/SystemExit, making the
                    # crawl impossible to abort during a retry.
                    attempts += 1
                    print(f'{file_name}出现错误休息3秒 再试')
                    time.sleep(3)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Resolve a file's server path and download it.
def getArchiveFileDownloadUrl(file_id, data_folder, file_name):
    """Resolve the server-side image path for *file_id* and save it as
    ``<data_folder>\\<file_name>.jpg``, skipping files already on disk.
    """
    local_path = data_folder + '\\' + file_name + '.jpg'
    if os.path.exists(local_path):
        print(file_name + '文件已经下载过了,跳过')
        return
    payload = {
        'gid': file_id,
        'userid': user_id
    }
    response_str = post(base_url + '/desktop/get_timage_from_db', payload)
    # The returned path sometimes contains "assets/./"; normalise it.
    img_path = response_str.decode().replace('assets/./', 'assets/')
    download(base_url + img_path, local_path)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Fetch the inner catalogue of an archive.
def getArchiveInner(archive_id, data_folder):
    """Fetch the inner catalogue listing for *archive_id* and save it
    as inner.json, skipping the request when the file already exists.
    """
    print('开始读取' + str(archive_id) + '的卷内目录数据')
    inner_json_path = data_folder + '\\' + 'inner.json'
    if os.path.exists(inner_json_path):
        print(f'{archive_id} 卷内目录json数据已经存在 跳过保存')
        return
    query = urlencode({
        "query": archive_id,
        "page": 1,
        "start": 0,
        "limit": 99999,
    })
    response_str = get(base_url + '/desktop/get_document' + '?' + query)
    save_to_file(inner_json_path, response_str)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Script entry point: crawl every archive category (and all nested
# records and files) from the target server into base_folder.
if __name__ == '__main__':
    getArchiveTypes()
|