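"""Crawler for a local archive-management service at ``base_url``.

Walks every archive type, every archive within a type, each archive's detail
record and inner catalogue, and every attached image file, mirroring the
results under ``base_folder``. Login is cookie-based: refresh the Cookie
header below from a browser session when it expires.
"""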
import json
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlencode
from urllib.request import Request
from urllib.request import urlopen

# Base URL of the archive service
base_url = 'http://192.168.1.153:3000'
# Root directory for the downloaded data (raw string: Windows path)
base_folder = r'H:\jy'
# Maximum number of worker threads for the (optional) thread pool
thread_count = 3
# User id (no observed effect so far, no need to change it)
user_id = 28
headers = {
    # 'User-Agent': UserAgent().chrome
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36',
    # Cookie-based login: when the session expires, log in through the web
    # page, grab the fresh cookie, and replace it here
    'Cookie': '_iChad_session=BAh7CCIPc2Vzc2lvbl9pZCIlOWExYTE5MGZlZDE1YTRmZGMxMjY4MDRmZTQxN2JmNjciGXdhcmRlbi51c2VyLnVzZXIua2V5WwgiCVVzZXJbBmkhIiIkMmEkMTAkUXRJMkkyNTQ0SGJoRkVCOHo2d0REdSIQX2NzcmZfdG9rZW4iMWRXQkRhL2NjbEdmL3k4ZWZrYXgzVE1rWksrdkxhL0FzclhPQWxSVkRXRmc9--8bf8e98234e136cc2512da7e36b33bc1580fa5da'
}

# Characters that are illegal in Windows file names: / \ : * ? " < > |
rstr = r"[\/\\\:\*\?\"\<\>\|]"


# Write text content to a file
def save_to_file(file_name, contents):
    with open(file_name, 'wb') as fh:
        fh.write(contents.encode('utf-8'))
    print('Finished writing data to [' + file_name + ']')


# Create a directory if it does not already exist
def mkdir(file_name):
    if not os.path.exists(file_name):
        os.makedirs(file_name)
        print('Created directory ' + file_name)
    else:
        print(file_name + ' already exists, skipping creation')


# Download a URL to a local file
def download(url, file_path):
    print('Starting download of ' + url)
    data = urlopen(url).read()
    with open(file_path, 'wb') as f:
        f.write(data)


# Send a GET request and return the response body as valid JSON text
def get(url):
    request = Request(url, headers=headers)
    response = urlopen(request)
    response_str = response.read().decode()
    # Convert unicode escapes (not needed at the moment):
    # response_str = response_str.encode('utf-8').decode('unicode_escape')
    # The service returns its `results` and `rows` keys unquoted; quote them
    # so the body parses as strict JSON
    response_str = response_str.replace('results', '"results"')
    response_str = response_str.replace('rows', '"rows"')
    return response_str


# Send a POST request with a form-encoded body and return the raw bytes
def post(url, body):
    # Submit the POST body as form data
    headers['Content-Type'] = 'application/x-www-form-urlencoded;charset=utf-8'
    data = urlencode(body).encode('utf-8')
    request = Request(url, headers=headers, data=data, method='POST')
    response = urlopen(request)
    return response.read()


# Fetch every archive type, then crawl each one
def getArchiveTypes():
    api_path = '/desktop/get_dalb_print_grid'
    params = {
        '_dc': 1637722889393,
        'userid': user_id,
        'page': 1,
        'start': 0,
        'limit': 99999
    }
    finally_url = base_url + api_path + '?' + urlencode(params)
    result = get(finally_url)
    response_obj = json.loads(result)
    for index, archive_type in enumerate(response_obj['rows']):
        if index < 3:
            print('Skipping ' + archive_type['lbmc'])
            continue
        print('Starting to crawl [' + archive_type['lbmc'] + '] data')
        type_folder = base_folder + '\\' + str(archive_type['id']) + '_' + archive_type['lbmc']
        mkdir(type_folder)
        getArchives(archive_type['id'], type_folder)
    # Concurrent variant:
    # with ThreadPoolExecutor(max_workers=thread_count) as t:
    #     for archive_type in response_obj['rows']:
    #         print('Starting to crawl [' + archive_type['lbmc'] + '] data')
    #         type_folder = base_folder + '\\' + str(archive_type['id']) + '_' + archive_type['lbmc']
    #         mkdir(type_folder)
    #         t.submit(getArchives, archive_type['id'], type_folder)


# Fetch every archive of the given type (paged, 1000 records per page)
def getArchives(type_id, folder):
    api_path = '/desktop/archive_query_jygl'
    for pageIndex in range(5):
        params = {
            "dalb": type_id,
            "userid": user_id,
            "page": pageIndex + 1,
            "start": pageIndex * 1000,
            "limit": 1000,
        }
        finally_url = base_url + api_path + '?' + urlencode(params)
        result = get(finally_url)
        response_obj = json.loads(result)
        print('Finished reading this type: ' + str(response_obj['results']) + ' records in total')
        for index, row in enumerate(response_obj['rows']):
            # Folder for the current record; replace characters that are
            # illegal in Windows file names
            tm = re.sub(rstr, "_", row['tm'])
            data_folder = folder + '\\' + str(index + (pageIndex * 1000)) + '_' + tm
            mkdir(data_folder)
            getArchiveDetails(row, data_folder)
    # Concurrent variant:
    # with ThreadPoolExecutor(max_workers=thread_count) as t:
    #     for row in response_obj['rows']:
    #         # Folder for the current record
    #         data_folder = folder + '\\' + row['tm']
    #         mkdir(data_folder)
    #         t.submit(getArchiveDetails, row, data_folder)


# Fetch the detail record for one archive by id
def getArchiveDetails(row, data_folder):
    print('Starting to read details of [' + row['tm'] + ']')
    local_json_path = data_folder + '\\' + 'data.json'
    if not os.path.exists(local_json_path):
        api_path = '/desktop/get_archivebyid'
        body = {
            'id': row['id'],
            'dh': row['dh'],
            'userid': user_id,
        }
        finally_url = base_url + api_path
        result = post(finally_url, body)
        # Save the detail record to disk
        save_to_file(local_json_path, result.decode())
    else:
        print(f'{row["tm"]} JSON data already exists, skipping save')
    # Fetch the inner catalogue data
    getArchiveInner(row['id'], data_folder)
    print('Starting to download files of [' + row['tm'] + ']')
    getArchiveFile(row['dh'], data_folder)


# Fetch the set of files attached to an archive
def getArchiveFile(dh, data_folder):
    api_path = '/desktop/get_yx_tree?_dc=1637737132431'
    body = {
        'node': 'root',
        'dh': dh
    }
    finally_url = base_url + api_path
    result = post(finally_url, body)
    # The response uses single quotes; swap them so it parses as JSON
    result = result.decode().replace('\'', '\"')
    response_obj = json.loads(result)
    for file in response_obj:
        for index, file_item in enumerate(file['children']):
            # The node id has the form '<gid>|...'; keep the part before '|'
            node_id = file_item['id']
            file_id = node_id[:node_id.index('|')]
            file_name = file_item['text']
            attempts = 0
            while attempts < 3:
                try:
                    getArchiveFileDownloadUrl(file_id, data_folder, str(index) + file_name)
                    break
                except BaseException:
                    print(f'{file_name} failed, resting 3 seconds before retrying')
                    attempts += 1  # count the failure
                    time.sleep(3)
    # Concurrent variant:
    # with ThreadPoolExecutor(max_workers=thread_count) as t:
    #     for file in response_obj:
    #         for file_item in file['children']:
    #             node_id = file_item['id']
    #             file_id = node_id[:node_id.index('|')]
    #             file_name = file_item['text']
    #             t.submit(getArchiveFileDownloadUrl, file_id, data_folder, file_name)


# Resolve the download path for one file and fetch it
def getArchiveFileDownloadUrl(file_id, data_folder, file_name):
    api_path = '/desktop/get_timage_from_db'
    local_path = data_folder + '\\' + file_name + '.jpg'
    if os.path.exists(local_path):
        print(file_name + ' has already been downloaded, skipping')
        return
    body = {
        'gid': file_id,
        'userid': user_id
    }
    finally_url = base_url + api_path
    response_str = post(finally_url, body)
    # The returned path contains 'assets/./'; normalize it before downloading
    img_path = response_str.decode().replace('assets/./', 'assets/')
    download(base_url + img_path, local_path)


# Fetch the inner catalogue data of an archive
def getArchiveInner(archive_id, data_folder):
    print('Starting to read inner catalogue data of ' + str(archive_id))
    inner_json_path = data_folder + '\\' + 'inner.json'
    if not os.path.exists(inner_json_path):
        api_path = '/desktop/get_document'
        params = {
            "query": archive_id,
            "page": 1,
            "start": 0,
            "limit": 99999,
        }
        finally_url = base_url + api_path + '?' + urlencode(params)
        response_str = get(finally_url)
        save_to_file(inner_json_path, response_str)
    else:
        print(f'{archive_id} inner catalogue JSON already exists, skipping save')
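

# A minimal sketch of the retry pattern hand-rolled in getArchiveFile, factored
# into a reusable helper. The name retry_call is ours (not in the original);
# like the loop above, it treats any exception as retryable.
def retry_call(fn, *args, attempts=3, delay=3):
    # Call fn(*args) up to `attempts` times, sleeping `delay` seconds after
    # each failure; returns fn's result on success, None if all attempts fail.
    for attempt in range(attempts):
        try:
            return fn(*args)
        except BaseException as exc:
            print(f'attempt {attempt + 1} failed: {exc}, sleeping {delay}s before retrying')
            time.sleep(delay)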


if __name__ == '__main__':
    getArchiveTypes()
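# Usage: set base_url and base_folder, refresh the Cookie header from a
# browser login, then run this file with Python 3; the crawl starts from
# getArchiveTypes() and resumes past anything already saved on disk.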
|