import re
import subprocess
from os import getcwd
from time import sleep

import tomd
from parsel import Selector
from requests import get
def spider_csdn(url, css_head, css_text):
    """Fetch an article page, extract its title and body with the given
    selectors, convert the body to Markdown with tomd, and save it as
    '<title>.md' in the current working directory."""
    title_url = url
    if not title_url:
        print('Error: please enter a URL')
        sleep(5)
        return None
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.52'}
    html = get(url=title_url, headers=head).text
    page = Selector(text=html)
    # Extract the title and keep only Chinese characters, letters and digits
    # so it can be used safely as a file name.
    title = page.css('%s::text' % css_head).get()
    title = re.sub('[^一-龥a-zA-Z0-9]', '', title)
    # The body selector is normally a CSS selector, but the WeChat entry in the
    # dispatcher below passes an XPath expression, so dispatch on the leading '/'.
    if css_text.startswith('/'):
        content = page.xpath(css_text).get()
    else:
        content = page.css(css_text).get()
    # Strip markup that tomd does not convert cleanly.
    content = re.sub('<a.*?a>', '', content)   # drop anchor tags and their text
    content = re.sub('<br>', '', content)
    content = re.sub('<li>', '', content)
    content = re.sub('</li>', '', content)
    content = re.sub('^#', '', content)
    content = re.sub('(<img.*?>)', r'<p>\1</p>', content)  # wrap images in <p> so tomd keeps them
    content = re.sub('loading="lazy"', '', content)
    texts = tomd.Tomd(content).markdown
    with open(title + '.md', mode='w', encoding='utf-8') as f:
        f.write('# ' + title + '\n')
        f.write(texts)
    print('Finished fetching the article.')
    # Open the output directory in Explorer (Windows only).
    addr = getcwd()
    subprocess.Popen('explorer %s' % addr)
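

# Usage sketch: spider_csdn can also be called directly, without the clipboard
# loop below. The URL here is only a placeholder, not a real article.
#
#     spider_csdn('https://blog.csdn.net/<user>/article/details/<id>',
#                 '.title-article', 'article')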

if __name__ == '__main__':
    from pyperclip import paste

    url = paste()
    while True:
        if 'csdn.net/' in url:
            spider_csdn(url, '.title-article', 'article')
        elif 'bbsmax.com/' in url:
            spider_csdn(url, '.title', '.post-content')
        elif 'cnblogs.com/' in url:
            spider_csdn(url, '#cb_post_title_url > span', '.postBody')
        elif 'zhuanlan.zhihu.com/' in url:
            spider_csdn(url,
                        '#root > div > main > div > article > header > h1',
                        '#root > div > main > div > article > div.Post-RichTextContainer')
        elif 'weixin.qq.com/' in url:
            # The body selector here is an XPath expression; spider_csdn
            # detects this from the leading '/'.
            spider_csdn(url, '.rich_media_title', '/html/body/div[1]/div/div[1]/div[2]')
        elif 'juejin.cn/' in url:
            spider_csdn(url, '.article-title', '.article-content')
        else:
            print('The clipboard does not contain a supported article URL '
                  '(supported: CSDN, bbsmax, cnblogs, Zhihu, WeChat, Juejin). '
                  'Clipboard contents:\n%s' % url)
        input('Press Enter to continue ')
        url = paste()
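
# To support another site, add an elif branch above with the site's domain
# fragment and a pair of selectors, e.g. (hypothetical selectors):
#
#     elif 'example.com/' in url:
#         spider_csdn(url, '.entry-title', '.entry-content')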