# Imports used by the functions below (listed here for completeness; they may already
# appear earlier in the full script). `headers` and `extract_link_suffix` are assumed
# to be defined elsewhere in the source.
import os
import queue
import re
import time

import pandas as pd
import requests
from lxml import etree
from selenium import webdriver

# Search for a novel and let the user pick the one to download
def search_novel():
chrome_options = webdriver.ChromeOptions()
    # Run Chrome headlessly in the background
    chrome_options.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_options)
    # browser = webdriver.Chrome()  # non-headless alternative, useful for debugging
    print('Browser opened')
    name_input = input('Enter a novel title or author name: ')
browser.get(f'http://www.duwanjuan.info/modules/article/search.php?q={name_input}')
    # Wait for the search results page to finish loading
    time.sleep(6)
    # Grab the rendered page source
    html = browser.page_source
    browser.quit()
    # print('Browser closed')
    html = etree.HTML(html)
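    # Keep at most the first 10 search results: titles, latest chapters, links, and authors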
name = html.xpath("//div[@id='jieqi_page_contents']/div[@class='c_row']/div/div/span[@class='c_subject']/a/text()")[:10]
chapter = html.xpath("//div[@class='c_tag']/span[@class='c_value']/a/text()")[:10]
link = html.xpath("//div[@id='jieqi_page_contents']/div[@class='c_row']/div/div/a/@href")[:10]
    # Extract the suffix portion of each link
link_suffixes = [extract_link_suffix(l) for l in link]
author = html.xpath("//div[@class='c_tag']/span[contains(text(), '作者:')]/following-sibling::span[1]/text()")[:10]
    num = list(range(1, len(name) + 1))
    data = {'No.': num, 'Title': name, 'Author': author, 'Latest chapter': chapter, 'Link': link_suffixes}
    df = pd.DataFrame(data)
    if df.empty:
        print('No results found, please search again')
        # Return the recursive call so the caller still receives a link on retry
        return search_novel()
    else:
        print(df)
        sx_input = int(input('Enter the No. of the novel to download: '))
        novel_link = link[sx_input - 1]
        return novel_link
# Fetch chapter names and URLs from the novel's chapter index; visited_urls tracks
# URLs already seen and value numbers the chapters in order
def get_chapter_urls(url, visited_urls, value):
global tot_title
global book_name
response = requests.get(url, headers=headers)
response.encoding = response.apparent_encoding
html = etree.HTML(response.text)
chapter_elements = html.xpath("//div[@class='index']//li[@class='chapter']/a")
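    # Remove the element at index 10 (the 11th link). The reason is site-specific and not
    # documented here; presumably it drops a non-chapter entry in the index. Note that
    # tot_title below keeps all entries, so it may be offset by one from chapter_elements.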
chapter_elements.pop(10)
tot_title = html.xpath("//div[@class='index']//li[@class='chapter']/a/text()")
bk = html.xpath("//div[@class='main']/div[@class='headlink cf']/h1/text()[1]")
    # Pull the title string out of the xpath result list
    if bk:  # make sure bk is not empty
        text = bk[0]  # take the first element of the list
    else:
        text = ""  # fall back to an empty string if bk is empty
    # Regex that matches square brackets and quotes, capturing only the content inside
    pattern = r"\['(.*?)'\]"
    # If the title happens to be wrapped like ['title'], keep only the captured content
    match = re.search(pattern, text)
    if match:
        book_name = match.group(1)  # use the captured group
    else:
        book_name = text  # otherwise keep the original text value
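    # Collect (chapter_name, chapter_url, sequence_number) tuples for chapters not seen before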
chapter_urls = []
for element in chapter_elements:
chapter_name = element.text
chapter_url = element.get('href')
if chapter_url not in visited_urls:
value += 1
chapter_urls.append((chapter_name, chapter_url, value))
visited_urls.add(chapter_url)
return chapter_urls
# Fetch the body text of a single chapter
def get_chapter_content(url):
try:
response = requests.get(url, headers=headers,verify=False,timeout=15)
response.encoding = response.apparent_encoding
html = etree.HTML(response.text)
        content_element = html.xpath("//div[@id='acontent']/text()")
        # Strip whitespace, parentheses and the site watermark from each text node
        pattern = r'读万卷\s*www\.duwanjuan\.info|\s|\(|\)'
        content = [re.sub(pattern, '', sub_text) for sub_text in content_element]
return content
except requests.RequestException as e:
print(f"Error occurred while fetching content from {url}: {e}")
return []
# Worker: pull chapters off the queue and scrape them one by one
def process_chapter(chapter_queue):
global time_start
time_start = time.time()
    while True:
        # get_nowait() avoids a worker blocking forever if another thread drains the
        # queue between an empty() check and a blocking get()
        try:
            chapter_name, chapter_url, value = chapter_queue.get_nowait()
        except queue.Empty:
            break
        print("Fetching chapter:", chapter_name)
        try:
            content = get_chapter_content(chapter_url)
        except Exception as e:
            print(f"Failed to fetch chapter content: {e}")
            content = []
        # Save the chapter to its own numbered file (or post-process it further here)
        folder_path = book_name
        # exist_ok=True avoids a race when several worker threads create the folder at once
        os.makedirs(folder_path, exist_ok=True)
        with open(os.path.join(folder_path, f'{value}.txt'), 'w', encoding='utf-8') as f:
            f.write('\n' + chapter_name + '\n')
            for data in content:
                f.write(data + '\n')
            f.write('\n\n')
chapter_queue.task_done()
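        # Pause between chapters to avoid hammering the site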
time.sleep(6)
# Merge the downloaded per-chapter TXT files into a single file
def merge_txt_files(folder_path, output_file):
txt_files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]
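    # Sort numerically by the chapter number encoded in each filename (e.g. "12.txt")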
txt_files.sort(key=lambda x: int(x[:-4]))
with open(output_file, 'w', encoding='utf-8') as outfile:
for txt_file in txt_files:
with open(os.path.join(folder_path, txt_file), 'r', encoding='utf-8') as infile:
content = infile.read()
outfile.write(content)
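
# --- Minimal driver sketch: one way the functions above could be wired together. ---
# Assumptions (illustrative only): three worker threads, a shared queue.Queue of
# chapter tuples, and an output file named after the book; the real entry point of
# the full script may differ.
if __name__ == '__main__':
    import threading

    novel_link = search_novel()              # pick a novel interactively
    visited_urls = set()
    chapters = get_chapter_urls(novel_link, visited_urls, 0)

    chapter_queue = queue.Queue()
    for item in chapters:
        chapter_queue.put(item)              # (chapter_name, chapter_url, value)

    num_workers = 3                          # assumed; keep small to stay polite to the site
    workers = [threading.Thread(target=process_chapter, args=(chapter_queue,))
               for _ in range(num_workers)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()

    # book_name is set as a global inside get_chapter_urls()
    merge_txt_files(book_name, f'{book_name}.txt')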