import json
import multiprocessing
import os
import random
import sys
import time

import requests
import urllib3
import xmltodict
from bs4 import BeautifulSoup
from faker import Faker

# Silence the InsecureRequestWarning raised because every request below uses verify=False.
urllib3.disable_warnings()

fake = Faker(locale='zh_CN')
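
# Overview: three producer processes (two project-search endpoints plus the
# organization endpoint) push [project_id, project_name] pairs onto a shared
# multiprocessing.Queue; a single consumer process fetches each project's
# detail JSON, locates its attachment entries, and downloads the files.
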
def get_headers():
    headers = {
        'Content-Type': 'application/json',
        'Referer': 'https://www.cpppc.org:8082/inforpublic/homepage.html',
        'User-Agent': fake.user_agent()
    }
    return headers

def get_payload(page_num, payload_type):
    payload = ''
    if payload_type == 'org':
        payload = json.dumps({
            "name": "",
            "industry": "",
            "pageNumber": page_num,
            "size": 5,
            "service_types": "",
            "level": "",
            "dist_province": "",
            "dist_city": "",
            "dist_code": "",
            "nlpVO": {},
            "org_name_pinyin_order": "asc"
        })
        return payload
    if payload_type == 'proj':
        payload = json.dumps({
            "name": "",
            "industry": "",
            "min": 0,
            "max": 10000000000000000,
            "pageNumber": page_num,
            "size": 5,
            "level": "",
            "start": "",
            "end": "",
            "dist_province": "",
            "dist_city": "",
            "dist_code": "",
            "nlpVO": {},
            "created_date_order": "desc"
        })
        return payload

def request_method(request_type, request_url, headers, payload, is_stream):
    # Retry until the server returns HTTP 200, sleeping 1-3 seconds between attempts.
    with requests.Session() as s:
        status = 0
        count = 0
        while status != 200:
            if count != 0:
                time.sleep(random.randint(1, 3))
            count = count + 1
            try:
                resp = s.request(request_type, request_url, headers=headers, data=payload,
                                 timeout=5, stream=is_stream, verify=False)
                status = resp.status_code
            except Exception as e:
                print(f'Network error: {e}')
                time.sleep(random.randint(1, 3))
        if is_stream:
            return resp
        else:
            return resp.json()

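# Usage sketch (hypothetical page number; mirrors the calls made below):
#   first_page = request_method("POST", "https://www.cpppc.org:8082/api/pub/project/search",
#                               headers=get_headers(), payload=get_payload(1, 'proj'),
#                               is_stream=False)
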
def get_proj(proj_base_url, page_num, msg_queue):
    json_result = request_method("POST", proj_base_url, headers=get_headers(),
                                 payload=get_payload(page_num, 'proj'), is_stream=False)
    result_list = json_result.get('data').get('hits')
    if len(result_list) > 0:
        for item in result_list:
            proj_name = item.get('proj_name')
            proj_rid = item.get('proj_rid')
            msg_queue.put([proj_rid, proj_name])
        page_num = page_num + 1
        return True, page_num
    else:
        # No more hits: enqueue a sentinel so the consumer can tell this producer is done.
        msg_queue.put(['finish', 0])
        return False, page_num

def download_org(msg_queue, org_base_url):
    have_org_more = True
    page_num = 1
    while have_org_more:
        json_result = request_method("POST", org_base_url, headers=get_headers(),
                                     payload=get_payload(page_num, 'org'), is_stream=False)
        org_list = json_result.get('data').get('hits')
        if len(org_list) > 0:
            for org in org_list:
                org_no = org.get('org_no')
                my_count = 0
                # Use a separate counter for the per-organization project pages so the
                # outer organization-list page_num is not clobbered.
                child_page_num = 1
                has_child_more = True
                while has_child_more:
                    child_url = (f"https://www.cpppc.org:8082/api/pub/organization/consulting/project/list"
                                 f"?orgNo={org_no}&pageNumber={child_page_num}&pageSize=10")
                    # Headers must be a dict (or None), not a string, for requests to accept them.
                    child_json = request_method(request_type='GET', request_url=child_url,
                                                headers=get_headers(), payload=None, is_stream=False)
                    proj_list = child_json.get('data').get('currentPageResult')
                    total_count = child_json.get('data').get('totalCount')
                    my_count = my_count + len(proj_list)
                    for proj in proj_list:
                        proj_name = proj.get('projectName')
                        proj_rid = proj.get('projectId')
                        msg_queue.put([proj_rid, proj_name])
                    # Stop once every reported project has been seen or the page came back
                    # empty; the original `my_count <= total_count` test looped forever when
                    # the last page filled the count exactly.
                    if my_count < total_count and len(proj_list) > 0:
                        child_page_num = child_page_num + 1
                    else:
                        has_child_more = False
            page_num = page_num + 1
        else:
            have_org_more = False
    print(f'Finished downloading: {org_base_url}')

def download_data(msg_queue, project_url):
    have_more = True
    page_num = 1
    while have_more:
        time.sleep(random.randint(1, 3))
        have_more, page_num = get_proj(proj_base_url=project_url, page_num=page_num,
                                       msg_queue=msg_queue)
    print(f'Finished downloading: {project_url}')

def analysis_and_download(msg_queue):
    while True:
        if msg_queue.empty():
            print('analysis_and_download: queue is empty, waiting')
            time.sleep(random.randint(1, 3))
        else:
            data = msg_queue.get()
            if data[0] == 'finish':
                # Producer sentinel, not a real project; nothing to download.
                continue
            page_json = request_method(
                "GET", f"https://www.cpppc.org:8082/api/pub/project/prepare-detail/{data[0]}",
                headers=get_headers(), payload='', is_stream=False)
            # Convert the detail JSON to XML so BeautifulSoup can find the nested
            # attachment nodes without walking the structure by hand. The lxml
            # parser lowercases tag names, hence the lowercase lookups below.
            root = {"root": page_json}
            xml = xmltodict.unparse(root, pretty=False)
            bs = BeautifulSoup(xml, 'lxml')
            all_attachs = bs.find_all("attachs")
            for attachs in all_attachs:
                fileid = attachs.find('fileid').text
                filename = attachs.find('filename').text
                print(f'File ID: {fileid}')
                print(f'File name: {filename}')
                program_path = os.path.dirname(os.path.realpath(sys.argv[0]))
                replace_project_name = data[1].replace(" ", "_")
                # '下载' means "downloads"; files land in <script dir>/下载/<project name>/.
                project_dir_path = os.path.join(program_path, '下载', replace_project_name)
                if not os.path.exists(project_dir_path):
                    os.makedirs(project_dir_path)
                if not os.path.isfile(os.path.join(project_dir_path, filename)):
                    download_url = f'https://www.cpppc.org:8082/api/pdfs/front/download/{fileid}?token=null&appId=public'
                    print(f'Download link: {download_url}')
                    download_file(download_url, project_dir_path, filename)
                else:
                    print('File already downloaded, skipping')

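# Note: the JSON -> XML -> BeautifulSoup round trip above is just a recursive-search
# trick. For example (hypothetical payload), a nested structure such as
#   {"a": {"attachs": [{"fileId": "1", "fileName": "x.pdf"}]}}
# unparsed under a "root" element yields XML in which every attachment appears as an
# <attachs> node, so bs.find_all("attachs") reaches them at any depth.
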
def download_file(download_url, dir_path, file_name):
    # Stream the response and write the raw bytes to disk.
    r = request_method('GET', download_url, headers=get_headers(), payload=None, is_stream=True)
    file_path = os.path.join(dir_path, file_name)
    with open(file_path, 'wb') as f:
        f.write(r.content)

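# Usage sketch (the file ID and target directory here are hypothetical):
#   download_file('https://www.cpppc.org:8082/api/pdfs/front/download/abc123?token=null&appId=public',
#                 '/tmp/demo', 'report.pdf')
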
def start():
    search_url = "https://www.cpppc.org:8082/api/pub/project/search"
    search_store_url = "https://www.cpppc.org:8082/api/pub/project/search-store"
    org_url = "https://www.cpppc.org:8082/api/pub/organization/search"
    q1 = multiprocessing.Queue()
    p1 = multiprocessing.Process(target=download_data, args=(q1, search_url))
    p2 = multiprocessing.Process(target=download_data, args=(q1, search_store_url))
    p3 = multiprocessing.Process(target=download_org, args=(q1, org_url))
    customer = multiprocessing.Process(target=analysis_and_download, args=(q1,))
    p1.start()
    p2.start()
    p3.start()
    customer.start()

if __name__ == '__main__':
    if sys.platform.startswith('win'):
        # Required when the script is frozen into a Windows executable.
        multiprocessing.freeze_support()
    start()