档案数据.py

import json
import os
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlencode
from urllib.request import Request
from urllib.request import urlopen
import re
import time

# Base address of the archive system
base_url = 'http://192.168.1.153:3000'
# Local directory the data is saved under (raw string, so the backslash stays literal)
base_folder = r'H:\jy'
# Maximum number of threads in the thread pool
thread_count = 3
# User id (no observed effect so far, no need to change it)
user_id = 28
headers = {
    # 'User-Agent': UserAgent().chrome
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36',
    # Cookie-based simulated login: when it expires, log in on the web page, capture the cookie and replace it here
    'Cookie': '_iChad_session=BAh7CCIPc2Vzc2lvbl9pZCIlOWExYTE5MGZlZDE1YTRmZGMxMjY4MDRmZTQxN2JmNjciGXdhcmRlbi51c2VyLnVzZXIua2V5WwgiCVVzZXJbBmkhIiIkMmEkMTAkUXRJMkkyNTQ0SGJoRkVCOHo2d0REdSIQX2NzcmZfdG9rZW4iMWRXQkRhL2NjbEdmL3k4ZWZrYXgzVE1rWksrdkxhL0FzclhPQWxSVkRXRmc9--8bf8e98234e136cc2512da7e36b33bc1580fa5da'
}
# Characters Windows forbids in file names: / \ : * ? " < > |
rstr = r"[\/\\\:\*\?\"\<\>\|]"


# Write text contents to a file
def save_to_file(file_name, contents):
    with open(file_name, 'wb') as fh:
        fh.write(contents.encode('utf-8'))
    print('Finished writing data to [' + file_name + ']')


# Create a directory if it does not already exist
def mkdir(file_name):
    if not os.path.exists(file_name):
        os.makedirs(file_name)
        print('Created directory ' + file_name)
    else:
        print(file_name + ' already exists, skipping creation')


# Download a URL to a local file
def download(url, file_path):
    print('Start downloading ' + url)
    f = urlopen(url)
    data = f.read()
    with open(file_path, "wb") as out:
        out.write(data)
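

# A minimal alternative sketch, not used above: stream the response to disk in
# chunks instead of reading the whole body into memory, which matters for large
# scans. The helper name and chunk size are assumptions for illustration only.
def download_streaming(url, file_path, chunk_size=64 * 1024):
    import shutil
    print('Start downloading (streamed) ' + url)
    with urlopen(url) as response, open(file_path, 'wb') as out:
        # copyfileobj reads chunk_size bytes at a time until EOF
        shutil.copyfileobj(response, out, chunk_size)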


# Send a GET request and return the response body as a repaired JSON string
def get(url):
    # Build the request
    request = Request(url, headers=headers)
    # Send it
    response = urlopen(request)
    # Read the response body
    response_str = response.read()
    # Decode the bytes
    # response_str = response_str.decode().encode("utf-8").decode("unicode_escape")
    response_str = response_str.decode()
    # The server returns its keys unquoted; quote them so the result parses as JSON
    response_str = response_str.replace('results', '"results"')
    response_str = response_str.replace('rows', '"rows"')
    return response_str
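

# The string replaces above assume the words 'results' and 'rows' only ever
# appear as unquoted keys; they would corrupt a body in which those words also
# occur inside values. A slightly safer sketch (an assumption about the server's
# output, not a documented behavior): only quote the identifiers when a colon
# follows them.
def normalize_json_keys(text):
    # 'results:' -> '"results":'; already-quoted keys are left untouched
    return re.sub(r'\b(results|rows)\b(?=\s*:)', r'"\1"', text)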


# Send a POST request with a form-encoded body
def post(url, body):
    # Submit the body as form data
    headers['Content-Type'] = 'application/x-www-form-urlencoded;charset=utf-8'
    data = urlencode(body).encode('utf-8')
    request = Request(url, headers=headers, data=data, method='POST')
    response = urlopen(request)
    return response.read()


# Fetch every archive type, then crawl each one
def getArchiveTypes():
    api_path = '/desktop/get_dalb_print_grid'
    params = {
        '_dc': 1637722889393,
        'userid': user_id,
        'page': 1,
        'start': 0,
        'limit': 99999
    }
    finally_url = base_url + api_path + '?' + urlencode(params)
    result = get(finally_url)
    response_obj = json.loads(result)
    for index, archive_type in enumerate(response_obj['rows']):
        if index < 3:
            print('Skipping ' + archive_type['lbmc'])
            continue
        print('Start crawling [' + archive_type['lbmc'] + '] data')
        type_folder = base_folder + '\\' + str(archive_type['id']) + '_' + archive_type['lbmc']
        mkdir(type_folder)
        getArchives(archive_type['id'], type_folder)
    # with ThreadPoolExecutor(max_workers=thread_count) as t:
    #     for archive_type in response_obj['rows']:
    #         print('Start crawling [' + archive_type['lbmc'] + '] data')
    #         type_folder = base_folder + '\\' + str(archive_type['id']) + '_' + archive_type['lbmc']
    #         mkdir(type_folder)
    #         t.submit(getArchives, archive_type['id'], type_folder)


# Fetch all archive records belonging to one archive type
def getArchives(type_id, folder):
    api_path = '/desktop/archive_query_jygl'
    for pageIndex in range(5):
        params = {
            "dalb": type_id,
            "userid": user_id,
            "page": pageIndex + 1,
            "start": pageIndex * 1000,
            "limit": 1000,
        }
        finally_url = base_url + api_path + '?' + urlencode(params)
        result = get(finally_url)
        response_obj = json.loads(result)
        print('Finished reading this type, it holds ' + str(response_obj['results']) + ' records in total')
        for index, row in enumerate(response_obj['rows']):
            # Folder for the current record; replace characters Windows forbids in names
            tm = re.sub(rstr, "_", row['tm'])
            data_folder = folder + '\\' + str(index + (pageIndex * 1000)) + '_' + tm
            mkdir(data_folder)
            getArchiveDetails(row, data_folder)
    # with ThreadPoolExecutor(max_workers=thread_count) as t:
    #     for row in response_obj['rows']:
    #         # Folder for the current record
    #         data_folder = folder + '\\' + row['tm']
    #         mkdir(data_folder)
    #         # getArchiveDetails(row, data_folder)
    #         t.submit(getArchiveDetails, row, data_folder)
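

# The loop above hardcodes five pages of 1000 records, i.e. it assumes no type
# holds more than 5000 archives. A sketch of deriving the page count from the
# 'results' total returned with the first page instead (a hypothetical helper,
# not wired into getArchives):
def page_count_for(total_results, page_size=1000):
    # e.g. 2500 records at a page size of 1000 -> 3 pages
    return (total_results + page_size - 1) // page_size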


# Fetch the detailed record for one archive
def getArchiveDetails(row, data_folder):
    print('Start reading [' + row['tm'] + '] details')
    local_json_path = data_folder + '\\' + 'data.json'
    if not os.path.exists(local_json_path):
        api_path = '/desktop/get_archivebyid'
        body = {
            'id': row['id'],
            'dh': row['dh'],
            'userid': user_id,
        }
        finally_url = base_url + api_path
        result = post(finally_url, body)
        # Save the response to a file
        save_to_file(local_json_path, result.decode())
    else:
        print(f'{row["tm"]} json data already exists, skipping save')
    # Fetch the in-volume catalogue data
    getArchiveInner(row['id'], data_folder)
    print('Start downloading the files of [' + row['tm'] + ']')
    getArchiveFile(row['dh'], data_folder)


# Fetch the set of files belonging to one archive
def getArchiveFile(dh, data_folder):
    api_path = '/desktop/get_yx_tree?_dc=1637737132431'
    body = {
        'node': 'root',
        'dh': dh
    }
    finally_url = base_url + api_path
    result = post(finally_url, body).decode()
    # The server answers with single-quoted pseudo-JSON; swap the quotes so it parses
    result = result.replace('\'', '\"')
    response_obj = json.loads(result)
    for file in response_obj:
        for index, file_item in enumerate(file['children']):
            # The id looks like '<file id>|...'; keep only the part before the '|'
            file_id = file_item['id'][:file_item['id'].index('|')]
            file_name = file_item['text']
            cs = 0
            while cs < 3:
                try:
                    getArchiveFileDownloadUrl(file_id, data_folder, str(index) + file_name)
                    cs = 3  # success, leave the retry loop
                except Exception:
                    print(f'{file_name} failed, resting 3 seconds before retrying')
                    cs += 1  # count the failure
                    time.sleep(3)
    # with ThreadPoolExecutor(max_workers=thread_count) as t:
    #     for file in response_obj:
    #         for file_item in file['children']:
    #             file_id = file_item['id'][:file_item['id'].index('|')]
    #             file_name = file_item['text']
    #             t.submit(getArchiveFileDownloadUrl, file_id, data_folder, file_name)
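

# The cs counter above retries a failed download up to three times with a fixed
# 3-second pause. The same idea as a generic sketch (a hypothetical helper, not
# wired in), which re-raises once the attempts are exhausted instead of silently
# giving up:
def retry(fn, attempts=3, delay_seconds=3):
    for attempt in range(attempts):
        try:
            return fn()
        except Exception:
            if attempt == attempts - 1:
                raise  # give up after the final attempt
            print(f'attempt {attempt + 1} failed, sleeping {delay_seconds}s before retrying')
            time.sleep(delay_seconds)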


# Resolve a file's download address, then download it
def getArchiveFileDownloadUrl(file_id, data_folder, file_name):
    api_path = '/desktop/get_timage_from_db'
    local_path = data_folder + '\\' + file_name + '.jpg'
    if os.path.exists(local_path):
        print(file_name + ' has already been downloaded, skipping')
        return
    body = {
        'gid': file_id,
        'userid': user_id
    }
    finally_url = base_url + api_path
    response_str = post(finally_url, body)
    img_path = response_str.decode().replace('assets/./', 'assets/')
    download(base_url + img_path, local_path)


# Fetch the in-volume catalogue data for one archive
def getArchiveInner(archive_id, data_folder):
    print('Start reading the in-volume catalogue of ' + str(archive_id))
    inner_json_path = data_folder + '\\' + 'inner.json'
    if not os.path.exists(inner_json_path):
        api_path = '/desktop/get_document'
        params = {
            "query": archive_id,
            "page": 1,
            "start": 0,
            "limit": 99999,
        }
        finally_url = base_url + api_path + '?' + urlencode(params)
        response_str = get(finally_url)
        save_to_file(inner_json_path, response_str)
    else:
        print(f'{archive_id} in-volume catalogue json already exists, skipping save')


if __name__ == '__main__':
    getArchiveTypes()
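

# Usage notes, drawn from the configuration at the top of the file:
# 1. Point base_url at the archive system and base_folder at a writable directory.
# 2. Log in through the web page and paste a fresh _iChad_session cookie into
#    headers once the current one expires.
# 3. Run the script directly: python 档案数据.py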