import re
import time
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import pandas as pd
import requests
import wordcloud as wc
# Startup banner: the author credit, then a hint (in Chinese) that Ctrl+C
# stops the script. Both lines go to stdout, one per line.
print('created by Zeaf', '若想停止请按ctrl+C', sep='\n')
# Headers sent with every request: a desktop Chrome user-agent string.
user = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
}

# Search-results page on pkulaw.cn. The `keyword` query parameter is the
# %u-escaped form of "行政复议法" and `range=name` is passed through as-is.
response = requests.get(
    'http://www.pkulaw.cn/cluster_form.aspx?Db=news&menu_item=law&EncodingName=&keyword=%u884C%u653F%u590D%u8BAE%u6CD5&range=name&',
    headers=user,
    # requests applies no timeout unless one is given; without this an
    # unresponsive server would block the script forever.
    timeout=10,
)
# Fail loudly on an HTTP error instead of scanning an error page for links.
response.raise_for_status()
html = response.text

# Keep the target of every href that is immediately followed by
# target="_blank"; these are the links fetched one by one afterwards.
urls = re.findall('href="(.*?)" target="_blank"', html)
# Column accumulators for the scraped records. Five distinct empty lists are
# created; position i in each list is meant to describe the same article.
title_in, date_in, place_in, keyword_in, url_in = ([] for _ in range(5))
# Seconds to wait for each article response; requests has no default timeout,
# so without one a stalled connection would hang the whole run.
_ARTICLE_TIMEOUT = 10

# Field patterns, compiled once instead of being looked up on every iteration.
_TITLE_RE = re.compile('<strong>(.*?)</strong>')
_DATE_RE = re.compile('【发布日期】</font> (.*?)</td>')
_PLACE_RE = re.compile('【来源】</font> (.*?)</td>')
_KEYWORD_RE = re.compile('【关键词语】</font> <a href=".*?" target=_blank>(.*?)</a>')


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or '' if none.

    A page that lacks a field yields an empty cell instead of raising
    IndexError, so one incomplete article no longer aborts the run and the
    five column lists always stay the same length.
    """
    found = pattern.search(text)
    return found.group(1) if found else ''


# Visit every link taken from the listing page and record one row per article.
# NOTE(review): the source's indentation was lost; the progress message is
# assumed to be printed per article and the success message once afterwards.
for url in urls:
    time.sleep(1)  # throttle: one-second pause before every article request
    # urljoin copes with hrefs that start with "/" or are already absolute,
    # where plain string concatenation would produce a malformed address.
    url = urljoin('http://www.pkulaw.cn/', url)
    response = requests.get(url, headers=user, timeout=_ARTICLE_TIMEOUT)
    html = response.text

    title_in.append(_first_group(_TITLE_RE, html))
    date_in.append(_first_group(_DATE_RE, html))
    place_in.append(_first_group(_PLACE_RE, html))
    keyword_in.append(_first_group(_KEYWORD_RE, html))
    url_in.append(url)
    print('保存中...')
print('保存成功!')
# Pair each column header with its accumulated list, in output order. The
# headers read: title, publication date, source, keywords, original link.
data = dict(zip(
    ('标题', '发布日期', '来源', '关键词语', '原文链接'),
    (title_in, date_in, place_in, keyword_in, url_in),
))

# Build the table and write it to news.xlsx without the index column.
y = pd.DataFrame(data)
y.to_excel('news.xlsx', index=False)
print('生成表格成功!')
# Render the collected keywords as a word cloud (at most 50 words, drawn with
# the simhei.ttf font), save it to news.jpg and show it in a matplotlib window.
content = ' '.join(keyword_in)
if content.strip():
    wordcloud = wc.WordCloud(max_words=50, font_path='simhei.ttf').generate(content)
    # Save before showing: plt.show() blocks until the window is closed, so
    # writing the file afterwards loses it if the script is interrupted first.
    wordcloud.to_file('news.jpg')
    plt.imshow(wordcloud)
    plt.show()
else:
    # WordCloud.generate raises ValueError when given no words; skip the image
    # instead of crashing after the spreadsheet has already been written.
    # Message: "No keywords collected, skipping word cloud."
    print('没有关键词,跳过词云生成。')