下载图片逻辑
import requests
from bs4 import BeautifulSoup
import lxml
import os
base_url = 'http://www.nmketang.com/thread/detail_ebook/1829298.html'
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
}
def get_img_src(url):
    """Fetch a detail page and return the ``src`` of its first content image.

    The page at *url* is requested with the module-level ``headers`` and
    parsed with the ``lxml`` parser. The image looked up is the first
    ``<img>`` inside the first ``<div class="t_fsz">`` element.

    Args:
        url: Address of the detail page to scrape.

    Returns:
        The value of the image's ``src`` attribute, or ``None`` when the
        response status is not 200, the container div is missing, the div
        holds no image, or the image has no ``src`` attribute.
    """
    res = requests.get(url, headers=headers)
    if res.status_code != 200:
        return None

    soup = BeautifulSoup(res.text, 'lxml')
    div = soup.find('div', class_='t_fsz')
    if div is None:
        # find() returns None when nothing matches; bail out instead of
        # raising AttributeError on the next lookup.
        return None

    img = div.find('img')
    if img is None:
        return None

    # Tag.get() yields None for a missing attribute instead of KeyError.
    return img.get('src')
def download_img(path, url):
    """Download *url* and write the response body to the file at *path*.

    The parent directory of *path* is created when it does not exist yet.
    The file is only written when the server answers with status 200.

    Args:
        path: Destination file path for the downloaded bytes.
        url: Address of the resource to download. A falsy value (``None``
            or an empty string) is ignored and nothing is downloaded.

    Returns:
        None.
    """
    if not url:
        # A falsy URL cannot be requested; skip instead of raising.
        return

    # os.path.dirname understands the platform's separators, whereas
    # splitting on a literal backslash only works for Windows-style paths
    # (on POSIX it returned the whole file path as the "directory").
    dir_path = os.path.dirname(path)
    print(dir_path)
    if dir_path:
        # exist_ok avoids the check-then-create race of exists()/makedirs().
        os.makedirs(dir_path, exist_ok=True)

    res = requests.get(url, headers=headers)
    if res.status_code == 200:
        with open(path, 'wb') as f:
            f.write(res.content)
# Crawl the index page and download every chapter image to
# <script dir>/files/<year div id>/<tab title>/<chapter name>.jpg.
root_path = os.path.dirname(os.path.abspath(__file__))
res = requests.get('https://www.05wang.com/thread-957565-1-1.html', headers=headers)
if res.status_code == 200:
    soup = BeautifulSoup(res.text, 'lxml')
    divs = soup.find_all('div', class_='box_year')
    for year_div in divs:
        folder_name = year_div.attrs.get('id')
        nav = year_div.find('ul', class_='nav-tabs')
        if not folder_name or nav is None:
            # Without an id or a tab list there is nothing to map to folders.
            continue

        # Each tab link's text is the sub-folder title; its href ("#<id>")
        # names the div that holds the chapters of that tab.
        lis = nav.find_all('li')
        li_list = []
        for li in lis:
            tab_link = li.find('a')
            if tab_link is None or not tab_link.get('href'):
                continue
            li_dict = {'title': tab_link.text, 'id': tab_link.attrs['href'].lstrip('#')}
            li_list.append(li_dict)

        for item in li_list:
            tab_div = year_div.find('div', id=item['id'])
            if tab_div is None:
                # The tab points at a pane that is not present in this year div.
                continue
            div_wrap = tab_div.find_all('div', class_='show_chapter')
            for div in div_wrap:
                a_dom = div.find('a')
                name_li = div.find('li')
                if not a_dom or name_li is None:
                    continue
                a_url = a_dom.get('href')
                if not a_url:
                    continue
                img_path = os.path.join(
                    root_path,
                    'files',
                    folder_name,
                    item['title'],
                    '{}.jpg'.format(name_li.text.strip()),
                )
                img_url = get_img_src(a_url)
                if not img_url:
                    # No image URL could be resolved for this chapter; skip it
                    # rather than passing None on to the downloader.
                    continue
                download_img(img_path, img_url)
生成pdf逻辑
from reportlab.lib.units import inch
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter, landscape
import os
def create_pdf(input_folder_path, output_pdf_path):
    """Bundle the JPG/PNG images of a folder into one landscape-letter PDF.

    Every image becomes one page and is stretched to the full 11 x 8.5 inch
    page. Pages follow the natural order of the file names (digit runs are
    compared numerically, so ``2.jpg`` precedes ``10.jpg``), because
    ``os.listdir`` itself returns entries in arbitrary order.

    Args:
        input_folder_path: Directory whose ``.jpg`` / ``.png`` files
            (extension matched case-insensitively) are added to the PDF.
        output_pdf_path: Path of the PDF file to write.

    Returns:
        None.
    """
    import re

    def natural_key(name):
        # re.split with a capture group alternates text / digit-run tokens,
        # so odd positions are always digit runs and compare as integers.
        tokens = re.split(r'(\d+)', name)
        key = [int(tok) if idx % 2 else tok.lower() for idx, tok in enumerate(tokens)]
        # The raw name breaks ties between names that differ only in case.
        return key, name

    c = canvas.Canvas(output_pdf_path, pagesize=landscape(letter))
    images = sorted(
        (f for f in os.listdir(input_folder_path) if f.lower().endswith(('.jpg', '.png'))),
        key=natural_key,
    )
    for i, image in enumerate(images):
        if i != 0:
            # Close the previous page before drawing the next image.
            c.showPage()
        c.drawImage(
            os.path.join(input_folder_path, image),
            0,
            0,
            width=11 * inch,
            height=8.5 * inch,
        )
    c.save()
# Usage:
# create_pdf('input_folder', 'output.pdf')
# Build one PDF per folder under 'files' whose path contains '册', named
# '<first folder below files>-<last folder>.pdf'.
for path, folder_list, file_list in os.walk('files'):
    if '册' not in path:
        continue
    # Split on the platform separator; a literal backslash only works on
    # Windows and makes the [1] lookup raise IndexError elsewhere.
    parts = os.path.normpath(path).split(os.sep)
    if len(parts) < 2:
        continue
    file_name = parts[-1]
    create_pdf(path, '{}-{}.pdf'.format(parts[1], file_name))
图片太多,下载比较慢。代码已贴出,大家可以自己测试;有想法的道友也欢迎一起交流实现逻辑。