# Forum paste banner (original: "[Python] 纯文本查看 复制代码" — "plain-text view / copy code")
import multiprocessing
import os
import random
import sys
import time
import requests
import json
from bs4 import BeautifulSoup
import xmltodict as xmltodict
import urllib3
# Silence the InsecureRequestWarning spam caused by the verify=False requests below.
urllib3.disable_warnings()
from faker import Faker
# Faker (zh_CN locale) is used to rotate the User-Agent string per request (see get_headers).
fake = Faker(locale='zh_CN')
def get_headers():
    """Build the JSON request headers with a freshly randomized User-Agent."""
    return {
        'Content-Type': 'application/json',
        'Referer': 'https://www.cpppc.org:8082/inforpublic/homepage.html',
        'User-Agent': fake.user_agent(),
    }
def get_payload(page_num, payload_type):
    """Serialize the search payload for one of the cpppc.org endpoints.

    page_num:     1-based page index inserted into the payload.
    payload_type: 'org' (organization search) or 'proj' (project search);
                  any other value returns the empty string, as before.
    """
    if payload_type == 'org':
        body = {
            "name": "",
            "industry": "",
            "pageNumber": page_num,
            "size": 5,
            "service_types": "",
            "level": "",
            "dist_province": "",
            "dist_city": "",
            "dist_code": "",
            "nlpVO": {},
            "org_name_pinyin_order": "asc",
        }
    elif payload_type == 'proj':
        body = {
            "name": "",
            "industry": "",
            "min": 0,
            "max": 10000000000000000,
            "pageNumber": page_num,
            "size": 5,
            "level": "",
            "start": "",
            "end": "",
            "dist_province": "",
            "dist_city": "",
            "dist_code": "",
            "nlpVO": {},
            "created_date_order": "desc",
        }
    else:
        return ''
    return json.dumps(body)
def request_method(request_type, request_url, headers, payload, is_stream, max_retries=None):
    """Issue an HTTP request, retrying until a 200 response is received.

    request_type: HTTP verb, e.g. 'GET' or 'POST'.
    request_url:  target URL.
    headers:      header dict ('' for none).
    payload:      request body ('' for none).
    is_stream:    when True, return the raw Response (used for file
                  downloads); otherwise return the parsed JSON body.
    max_retries:  optional cap on attempts. None (the default) preserves
                  the original retry-forever behavior; setting it turns the
                  previously unbounded retry loop into a RuntimeError.

    NOTE(review): TLS verification is disabled (verify=False), matching the
    original code's behavior against this host.
    """
    with requests.Session() as session:
        attempt = 0
        while True:
            if attempt > 0:
                # Back off a little between attempts to avoid hammering the server.
                time.sleep(random.randint(1, 3))
            if max_retries is not None and attempt >= max_retries:
                raise RuntimeError(f'request failed after {max_retries} attempts: {request_url}')
            attempt += 1
            try:
                resp = session.request(request_type, request_url, headers=headers,
                                       data=payload, timeout=5, stream=is_stream, verify=False)
            except Exception as e:
                print(f'网络异常{e}')
                time.sleep(random.randint(1, 3))
                continue
            if resp.status_code == 200:
                break
        return resp if is_stream else resp.json()
def get_proj(proj_base_url, page_num, msg_queue):
    """Fetch one page of project search results and queue [rid, name] pairs.

    Returns (has_more, next_page_num). When the page is empty, pushes the
    ['finish', 0] sentinel and reports no more pages.
    """
    json_result = request_method("POST", proj_base_url, headers=get_headers(),
                                 payload=get_payload(page_num, 'proj'), is_stream=False)
    # Guard against a malformed body ('data'/'hits' missing or null), which
    # previously crashed on len(None).
    result_list = (json_result.get('data') or {}).get('hits') or []
    if result_list:
        for item in result_list:
            msg_queue.put([item.get('proj_rid'), item.get('proj_name')])
        return True, page_num + 1
    msg_queue.put(['finish', 0])
    return False, page_num
def download_org(msg_queue, org_base_url):
    """Walk the organization index and queue every consulting project.

    Pages through the organization search; for each org, pages through its
    consulting-project list and pushes [projectId, projectName] onto
    msg_queue for the consumer process.
    """
    org_page = 1
    while True:
        json_result = request_method("POST", org_base_url, headers=get_headers(),
                                     payload=get_payload(org_page, 'org'), is_stream=False)
        org_list = (json_result.get('data') or {}).get('hits') or []
        if not org_list:
            break
        for org in org_list:
            org_no = org.get('org_no')
            seen = 0
            # BUG FIX: the original reused the outer page counter for child
            # paging, clobbering organization pagination after the first org.
            child_page = 1
            while True:
                child_url = f"https://www.cpppc.org:8082/api/pub/organization/consulting/project/list?orgNo={org_no}&pageNumber={child_page}&pageSize=10"
                child_json = request_method(request_type='GET', request_url=child_url,
                                            headers='', payload='', is_stream=False)
                data = child_json.get('data') or {}
                proj_list = data.get('currentPageResult') or []
                total_count = data.get('totalCount') or 0
                if not proj_list:
                    # BUG FIX: an empty page stops the loop. The original's
                    # `my_count <= total_count` check spun forever when the
                    # last page landed exactly on total_count.
                    break
                seen += len(proj_list)
                for proj in proj_list:
                    msg_queue.put([proj.get('projectId'), proj.get('projectName')])
                if seen >= total_count:
                    break
                child_page += 1
        org_page += 1
    print(f'下载完毕:{org_base_url}')
def download_data(msg_queue, project_url):
    """Producer: page through one project search endpoint, feeding msg_queue."""
    page, more = 1, True
    while more:
        # Space the page requests out a little.
        time.sleep(random.randint(1, 3))
        more, page = get_proj(proj_base_url=project_url, page_num=page, msg_queue=msg_queue)
    print(f'下载完毕:{project_url}')
def analysis_and_download(msg_queue):
    """Consumer: pull [rid, name] items off the queue and download every
    attachment of the referenced project.

    Runs forever; sleeps briefly while the queue is empty.
    """
    # Resolve the download root once instead of per attachment.
    program_path = os.path.dirname(os.path.realpath(sys.argv[0]))
    while True:
        if msg_queue.empty():
            print('analysis_and_download队列无任务')
            time.sleep(random.randint(1, 3))
            continue
        data = msg_queue.get()
        if data[0] == 'finish':
            # BUG FIX: producers push a ['finish', 0] sentinel; the original
            # fed it into the detail URL and requested a bogus endpoint.
            continue
        page_json = request_method(
            "GET",
            f"https://www.cpppc.org:8082/api/pub/project/prepare-detail/{data[0]}",
            headers=get_headers(), payload='', is_stream=False)
        # The detail JSON is converted to XML so BeautifulSoup can locate the
        # <attachs> nodes wherever they sit in the structure.
        xml = xmltodict.unparse({"root": page_json}, pretty=False)
        bs = BeautifulSoup(xml, 'lxml')
        for attachs in bs.findAll("attachs"):
            fileid = attachs.find('fileid').text
            filename = attachs.find('filename').text
            print(f'文件ID:{fileid}')
            # BUG FIX: the original printed a mangled literal instead of the name.
            print(f'文件名:{filename}')
            replace_project_name = data[1].replace(" ", "_")
            project_dir_path = os.path.join(program_path, '下载', replace_project_name)
            if not os.path.exists(project_dir_path):
                os.makedirs(project_dir_path)
            if not os.path.isfile(os.path.join(project_dir_path, filename)):
                download_url = f'https://www.cpppc.org:8082/api/pdfs/front/download/{fileid}?token=null&appId=public'
                print(f'下载链接:{download_url}')
                download_file(download_url, project_dir_path, filename)
            else:
                print('文件已下载,跳过')
def download_file(download_url, dir_path, file_name):
    """Download download_url and save it as dir_path/file_name.

    BUG FIX: the original requested with stream=True but then read
    r.content, buffering the whole file in memory and defeating the
    streaming. The body is now written to disk in chunks.
    """
    resp = request_method('GET', download_url, headers='', payload='', is_stream=True)
    file_path = os.path.join(dir_path, file_name)
    with open(file_path, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=65536):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)
def start():
    """Spawn the three producer processes and one consumer, sharing a queue.

    Other known endpoints, kept for reference:
      expert pool:     https://www.cpppc.org:8082/api/pub/experts/search
      project reports: https://www.cpppc.org:8082/api/pub/project-report/search
    """
    # Managed-project library.
    search_url = "https://www.cpppc.org:8082/api/pub/project/search"
    # Reserve list.
    search_store_url = "https://www.cpppc.org:8082/api/pub/project/search-store"
    # Organization library.
    org_url = "https://www.cpppc.org:8082/api/pub/organization/search"
    queue = multiprocessing.Queue()
    # Producers feed [rid, name] items into the queue.
    workers = [
        multiprocessing.Process(target=download_data, args=(queue, search_url)),
        multiprocessing.Process(target=download_data, args=(queue, search_store_url)),
        multiprocessing.Process(target=download_org, args=(queue, org_url)),
        # Consumer drains the queue and downloads attachments.
        multiprocessing.Process(target=analysis_and_download, args=(queue,)),
    ]
    for worker in workers:
        worker.start()
if __name__ == '__main__':
    # freeze_support() is needed for frozen Windows executables using multiprocessing.
    if sys.platform.startswith('win'):
        multiprocessing.freeze_support()
    start()