import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# --- Basic configuration ---
# Table-of-contents page of the target novel.
base_url = "https://www.shulalua.com/book_185727/"

# Browser-like User-Agent so the site serves normal pages.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit...",
}
# Fetch the table-of-contents page and collect chapter links.
def get_chapters():
    """Return a list of absolute chapter URLs scraped from the TOC page.

    Fix over the original: many sites emit *relative* hrefs in the chapter
    list; resolving them against ``base_url`` with ``urljoin`` makes the
    returned URLs directly fetchable. Absolute hrefs pass through unchanged,
    so the change is backward-compatible.
    """
    response = requests.get(base_url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    # NOTE(review): '.chapter-list a' must match the live page structure —
    # inspect the page and adjust the selector if it returns nothing.
    return [urljoin(base_url, a['href']) for a in soup.select('.chapter-list a')]
# Download and clean the text of a single chapter.
def get_content(chapter_url):
    """Fetch one chapter page and return its cleaned body text.

    Returns '' on any fetch/parse failure so the caller can continue with
    the remaining chapters (best-effort scraping).

    Fixes over the original:
    - ``raise_for_status()`` so HTTP 4xx/5xx error pages are not silently
      parsed as chapter content;
    - the broad ``except Exception`` is narrowed to network errors and the
      missing-container case, so genuine programming errors surface.
    """
    try:
        res = requests.get(chapter_url, headers=headers)
        res.raise_for_status()
        res.encoding = 'utf-8'  # adjust if the site uses a different encoding
        soup = BeautifulSoup(res.text, 'html.parser')
        # NOTE(review): confirm the body container selector via dev tools.
        node = soup.find('div', class_='content')
        if node is None:
            raise ValueError('content container not found')
        # Collapse double blank lines between paragraphs.
        return node.text.strip().replace('\n\n', '\n')
    except (requests.RequestException, ValueError) as e:
        # Best-effort: report and skip this chapter rather than abort the run.
        print(f'抓取失败:{chapter_url} - {str(e)}')
        return ''
# Script entry point: download every chapter and append them to one file.
if __name__ == '__main__':
    chapters = get_chapters()
    with open('novel.txt', 'w', encoding='utf-8') as outfile:
        for idx, url in enumerate(chapters, start=1):
            content = get_content(url)
            outfile.write(f'第{idx}章\n{content}\n\n')
            print(f'已完成第{idx}章')
            time.sleep(1)  # throttle requests to roughly one per second