# Python script (pasted from a forum code box): exports the articles of a Zhihu column to HTML and, optionally, PDF.
import time
import re
import os,sys
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import pdfkit
def get_list():
    """Collect the id and title of every article in the column.

    Pages through the Zhihu column API for the module-level ``author``
    (sending the module-level ``headers``) and writes one ``"<id> <title>"``
    line per article, sorted by id, to ``zhihu_ids.txt`` inside the
    module-level ``filedir``.

    If a page cannot be fetched or parsed, paging stops and whatever has been
    collected so far is still written out.
    """
    url = 'https://www.zhihu.com/api/v4/columns/%s/articles?include=data.topics&limit=10' % author
    article_dict = {}
    while True:
        print('fetching', url)
        try:
            resp = requests.get(url, headers=headers)
            j = resp.json()
            data = j['data']
            paging = j['paging']
            is_end = paging['is_end']
        except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
            # Without a valid page there is nothing to iterate and no next
            # URL to follow; continuing would reuse stale data or loop on the
            # same URL forever.
            print('get list failed:', exc)
            break
        for article in data:
            # Keep the first title seen for each article id.
            article_dict.setdefault(article['id'], article['title'])
        if is_end:
            break
        next_url = paging.get('next')
        if not next_url:
            print('get list failed: no next page url')
            break
        url = next_url
        time.sleep(2)  # pause between API requests
    # NOTE: the file is written with the platform default encoding because
    # get_details() reads it back the same way.
    with open(filedir.joinpath('zhihu_ids.txt'), 'w') as f:
        for item in sorted(article_dict.items()):
            f.write('%s %s\n' % item)
def get_html(aid, title, index, encoding='UTF-8'):
    """Download one Zhihu article and save it as a standalone HTML file.

    Args:
        aid: Article id as a string; appended to the article URL.
        title: Article title; used for the file name and the page heading.
        index: Sequence number used as a zero-padded file-name prefix.
        encoding: Encoding used both for the written file and for the
            ``<meta charset>`` declaration, so the two always agree.

    The file is written into the module-level ``filedir``. Any failure for
    this article is reported and swallowed so that the remaining articles can
    still be processed. The function always sleeps two seconds before
    returning.
    """
    # Replace characters that are not allowed in file names (including "\").
    title = re.sub(r'[\\/:*?"<>|]', '-', title)
    print(title)
    file_name = '%03d. %s.html' % (index, title)
    file_name = file_name.replace(" ", "").strip()
    print('saving', title)
    try:
        url = 'https://zhuanlan.zhihu.com/p/' + aid
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.text, 'html.parser')
        post = soup.find(class_='Post-RichText')
        if post is None:
            raise ValueError('article body (class "Post-RichText") not found')
        content = post.prettify()
        content = content.replace('data-actual', '')
        content = content.replace('h1>', 'h2>')
        # prettify() puts tags on separate lines, so "." must also match
        # newlines for the <noscript> blocks to be removed.
        content = re.sub(r'<noscript>.*?</noscript>', '', content, flags=re.DOTALL)
        content = re.sub(r'src="data:image.*?"', '', content)
        # MathJax renders the math formulas in the article.
        strmath = '<script type="text/javascript" async src="https://cdn.staticfile.org/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML" ></script>'
        # CSS comments must use /* */; a "#" comment corrupts the next rule.
        style = """
<style>
body {
margin: 0 50px;
}
p {
text-indent: 2em;
} /* indent the first line of each paragraph by 2em */
img {
width: 100%;
} /* keep images within the page width */
pre {
white-space: pre-wrap;
word-wrap: break-word;
} /* wrap long lines in code blocks */
.ztext-math {
display: inline-block;
}
</style>
"""
        page = (
            f'<!DOCTYPE html><html><head><meta charset={encoding}>{style}</head>'
            f'<body><h1>{title}</h1>{content}{strmath}</body></html>'
        )
        with open(filedir.joinpath(file_name), 'w', encoding=encoding) as f:
            f.write(page)
    except Exception as exc:
        # Best effort: one bad article must not abort the whole export, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        print('get %s failed: %s' % (title, exc))
    time.sleep(2)  # pause between article requests
def get_details():
    """Save every article listed in ``zhihu_ids.txt`` as an HTML file.

    Each line of the list file (read from the module-level ``filedir``) is
    stripped; the text before the first space is the article id and the
    remainder is the title, with its spaces replaced by underscores.
    ``get_html`` is called once per line with a 1-based running index.
    """
    id_file = filedir.joinpath('zhihu_ids.txt')
    with open(id_file) as listing:
        for position, record in enumerate(listing, start=1):
            aid, _, remainder = record.strip().partition(' ')
            get_html(aid, remainder.replace(' ', '_'), position)
def to_pdf(wkhtmltopdf_path=r"E:\Software\wkhtmltopdf\bin\wkhtmltopdf.exe"):
    """Merge all saved HTML files into a single PDF.

    Besides installing ``pdfkit`` with pip, the wkhtmltopdf executable must
    be installed manually, see:
    https://github.com/JazzCore/python-pdfkit/wiki/Installing-wkhtmltopdf
    https://wkhtmltopdf.org/downloads.html

    Args:
        wkhtmltopdf_path: Location of the wkhtmltopdf executable.

    Every ``*.html`` file in the module-level ``filedir`` is included, sorted
    by file name. The result is written to ``<dir_path>/<author>.pdf``, where
    ``dir_path`` and ``author`` are module-level names.
    """
    print('exporting PDF...')
    htmls = sorted(path.name for path in filedir.iterdir() if path.suffix == ".html")
    if not htmls:
        print('no HTML files found in', filedir)
        return
    config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)
    # Options that take no value are passed as None, as pdfkit documents.
    options = {
        'encoding': "utf-8",
        'orientation': 'Portrait',  # or 'Landscape'
        'enable-local-file-access': None,
        'enable-internal-links': None,
        'enable-javascript': None,
        # Give MathJax time to render before the page is captured (ms).
        'javascript-delay': '10000',
        'no-stop-slow-scripts': None,
        'debug-javascript': None,
        'enable-forms': None,
        'disable-smart-shrinking': None,
    }
    html_paths = [str(filedir.joinpath(name)) for name in htmls]
    output_path = str(Path(dir_path).joinpath(author + '.pdf'))
    pdfkit.from_file(html_paths, output_path, options=options, configuration=config)
    print('Done')
if __name__ == '__main__':
    # Column to export and the base directory that receives the output.
    author = 'c_1322265113534304256'
    dir_path = r'E:\Brandon\Desktop\zhihu'

    # Per-column working directory; created if it does not exist yet.
    filedir = Path(dir_path) / author
    filedir.mkdir(parents=True, exist_ok=True)

    # Request headers sent with every HTTP call.
    referer = 'https://zhuanlan.zhihu.com/%s' % author
    headers = {
        'origin': 'https://zhuanlan.zhihu.com',
        'referer': referer,
        'User-Agent': 'Mozilla/5.0',
    }

    get_list()
    get_details()
    #to_pdf()