import os
import re
import sys
import time
from pathlib import Path

import pdfkit
import requests
from bs4 import BeautifulSoup
def get_list():
    """Fetch the complete article list of the `author` column and write
    'id title' pairs, one per line, to zhihu_ids.txt inside `filedir`.

    Pages through the zhihu v4 column API until `paging.is_end` is true.
    Relies on module-level globals: `author`, `headers`, `filedir`.
    """
    url = ('https://www.zhihu.com/api/v4/columns/%s/articles'
           '?include=data.topics&limit=10' % author)
    article_dict = {}
    while True:
        print('fetching', url)
        try:
            resp = requests.get(url, headers=headers)
            j = resp.json()
            data = j['data']
        except (requests.RequestException, ValueError, KeyError):
            # BUG FIX: the original bare `except:` printed a message and then
            # fell through to use `data`/`j` before assignment (NameError).
            # Retry the same page after a pause instead.
            print('get list failed')
            time.sleep(2)
            continue
        for article in data:
            aid = article['id']
            # `in article_dict` — no need to materialize .keys()
            if aid not in article_dict:
                article_dict[aid] = article['title']
        if j['paging']['is_end']:
            break
        url = j['paging']['next']
        time.sleep(2)  # be polite to the API between pages
    # NOTE(review): file uses the platform default encoding, matching the
    # reader in get_details(); switch BOTH to utf-8 together if needed.
    with open(filedir.joinpath('zhihu_ids.txt'), 'w') as f:
        for item in sorted(article_dict.items()):
            f.write('%s %s\n' % item)
def get_html(aid, title, index, encoding='UTF-8'):
    """Download article `aid` from zhuanlan.zhihu.com, extract its rich-text
    body, wrap it in a minimal HTML page with MathJax, and save it as a
    numbered .html file in `filedir`.

    Relies on module-level globals: `headers`, `filedir`.

    aid      -- article id (string), appended to the zhuanlan URL
    title    -- article title; filesystem-unsafe characters become '-'
    index    -- 1-based ordinal used in the '%03d. ' filename prefix
    encoding -- fallback charset; overwritten by the response's encoding
    """
    title = re.sub(r'[\/:*?"<>|]', '-', title)
    print(title)
    file_name = ('%03d. %s.html' % (index, title)).replace(" ", "").strip()
    print('saving', title)
    try:
        url = 'https://zhuanlan.zhihu.com/p/' + aid
        res = requests.get(url, headers=headers)
        encoding = res.encoding
        soup = BeautifulSoup(res.text, 'html.parser')
        content = soup.find(class_='Post-RichText').prettify()
        # Un-lazy images (data-actual-src -> src) and demote the article's
        # own <h1> so the page <h1> below stays the only top heading.
        content = content.replace('data-actual', '')
        content = content.replace('h1>', 'h2>')
        # BUG FIX: prettify() spreads tags across lines, so these patterns
        # must match across newlines (re.S); without it nothing was removed.
        content = re.sub(r'<noscript>.*?</noscript>', '', content, flags=re.S)
        content = re.sub(r'src="data:image.*?"', '', content, flags=re.S)
        strmath = '<script type="text/javascript" async src="https://cdn.staticfile.org/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML" ></script>'
        # BUG FIX: the original `str1 = content2 = content1 + str1` read
        # `str1` before assignment, raising NameError on EVERY call — the
        # bare except swallowed it, so no article was ever saved.
        page = f'<!DOCTYPE html><html><head><meta charset={encoding}></head><body><h1>{title}</h1>{content}{strmath}</body></html>'
        with open(filedir.joinpath(file_name), 'w', encoding='utf-8') as f:
            f.write(page)
    except (requests.RequestException, AttributeError, OSError) as e:
        # BUG FIX: original `print('get %s failed', title)` printed a tuple
        # instead of formatting, and the bare except hid the real error.
        print('get %s failed: %s' % (title, e))
    time.sleep(2)  # throttle between article downloads
def get_details():
    """Read 'id title' lines from zhihu_ids.txt and fetch each article.

    Each line is '<id> <title>'; titles may contain spaces, so every token
    after the first is re-joined with '_' (matching the saved-file naming).
    Relies on module-level global: `filedir`; delegates to get_html().
    """
    with open(filedir.joinpath('zhihu_ids.txt')) as f:
        # enumerate replaces the original hand-rolled `i = 1 / i += 1` counter
        for i, line in enumerate(f, start=1):
            parts = line.strip().split(' ')
            aid = parts[0]
            title = '_'.join(parts[1:])
            get_html(aid, title, i)
def to_pdf():
    """Concatenate every saved .html in `filedir` (sorted by name, i.e. by
    the zero-padded index prefix) into a single <author>.pdf via wkhtmltopdf.

    Relies on module-level globals: `filedir`, `dir_path`, `author`.
    """
    print('exporting PDF...')
    # instance call instead of the original unbound Path.iterdir(filedir)
    htmls = [p.name for p in filedir.iterdir() if p.suffix == ".html"]
    # BUG FIX: the path_wk/config pair was duplicated verbatim; build it once.
    path_wk = r"E:\Software\wkhtmltopdf\bin\wkhtmltopdf.exe"
    config = pdfkit.configuration(wkhtmltopdf=path_wk)
    options = {
        'encoding': "utf-8",
        'orientation': 'Portrait',
        'enable-local-file-access': '--enable-local-file-access',
        'enable-internal-links': '--enable-internal-links',
        'enable-javascript': '--enable-javascript',
        # give MathJax time to typeset before the page is rendered
        'javascript-delay': '10000',
        'no-stop-slow-scripts': '--no-stop-slow-scripts',
        'debug-javascript': '--debug-javascript',
        'enable-forms': '--enable-forms',
        'disable-smart-shrinking': '--disable-smart-shrinking',
    }
    htmls_dir = [filedir.joinpath(name) for name in sorted(htmls)]
    # pathlib join instead of manual '\\' string concatenation
    out_path = str(Path(dir_path) / (author + '.pdf'))
    pdfkit.from_file(htmls_dir, out_path, options=options, configuration=config)
    print('Done')
if __name__ == '__main__':
    # Script configuration: every function above reads these module-level
    # globals (author, headers, filedir, dir_path) directly.
    dir_path = r'E:\Brandon\Desktop\zhihu'  # output root (Windows path)
    author = 'c_1322265113534304256'        # zhihu column id in the API URL
    filedir = Path(dir_path).joinpath(author)
    filedir.mkdir(parents=True, exist_ok=True)
    # Minimal headers so zhuanlan.zhihu.com accepts the requests.
    headers = {
        'origin': 'https://zhuanlan.zhihu.com',
        'referer': 'https://zhuanlan.zhihu.com/%s' % author,
        'User-Agent': ('Mozilla/5.0'),
    }
    get_list()     # step 1: enumerate articles -> zhihu_ids.txt
    get_details()  # step 2: download each article as .html
    # NOTE(review): to_pdf() is defined but never called here — presumably
    # run manually after inspecting the saved HTML; confirm before enabling.