本帖最后由 铁板兄长361 于 2024-2-28 20:45 编辑
本人在学习阶段写的一个小小爬虫,还有很多功能没有完善,希望各位前辈指点一二。
目标网站:aHR0cHMlM0EvL3d3dy5qaHE4LmNuLw==
首先打开目标网站分析网页源码,查找网站换页规律,很简单的得知换页规律,这里仅分析网页
得知了网站题目的换页规律(目标站点没有做防护,轻点撸)~。
那么手写一个小虫子 启动即可!!!
[Python] 纯文本查看 复制代码

import queue
import requests
import time
from lxml import etree
import threading
import pymysql
from queue import Queue
# --- global configuration shared by all worker threads ---

# MySQL connection settings (one connection per worker is created from these).
db_config = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'password': '123456',
    'database': '题库',
    'charset': 'utf8',
}

# HTTP headers sent with every request (desktop Chrome user agent).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
}

max_retries = 5   # per-page HTTP retry budget
num_threads = 3   # worker thread count (and pooled DB connection count)
start_page = 1    # first page number to crawl
# Worker: scrape question pages and insert their text into MySQL.
def crawl_and_insert(thread_id, url_queue, db_pool, start_page):
    """Pull page offsets from ``url_queue`` until it is empty; for each list
    page, fetch every question link on it and store the question text.

    Args:
        thread_id:  integer id, used only in log messages.
        url_queue:  Queue of integer offsets; page number = start_page + offset.
        db_pool:    Queue used as a pool of pymysql connections.
        start_page: number of the first page to crawl.
    """
    while True:
        try:
            # Offsets are queued as ints; the timeout lets idle workers exit.
            page_number = start_page + url_queue.get(timeout=1)
        except queue.Empty:
            break  # queue drained — worker is done

        # BUG FIX: the original interpolated a base64 placeholder
        # ('aHR0cHMl...') into the URL, so every request failed. Use the
        # real site root — the same one used for the detail links below.
        url = f'https://www.jhq8.cn/daan/tiku/0/{page_number}.html'
        retries = 0

        db = db_pool.get()
        cursor = db.cursor()
        try:
            # Skip list pages that were already crawled in a previous run.
            cursor.execute("SELECT COUNT(*) FROM crawled_urls WHERE url = %s", (url,))
            if cursor.fetchone()[0] > 0:
                print(f"线程-{thread_id}: URL 已爬取 {url}")
                continue

            # Retry the whole list page on network errors, up to max_retries.
            while retries < max_retries:
                try:
                    res = requests.get(url, headers=headers, timeout=10)
                    # Site serves GB2312; ignore undecodable bytes.
                    content = res.content.decode('GB2312', 'ignore')
                    tree = etree.HTML(content)
                    links = tree.xpath("/html/body/div[1]/div/div/div[4]/div/div[1]/div/ul/li/a/@href")
                    for link in links:
                        full_url = 'https://www.jhq8.cn/' + link
                        # Skip question pages that were already crawled.
                        cursor.execute("SELECT COUNT(*) FROM crawled_urls WHERE url = %s", (full_url,))
                        if cursor.fetchone()[0] > 0:
                            print(f"线程-{thread_id}: URL 已爬取 {full_url}")
                            continue
                        # BUG FIX: the detail request had no timeout and could
                        # hang a worker forever.
                        response = requests.get(full_url, headers=headers, timeout=10)
                        page_content = response.content.decode('GB2312', 'ignore')
                        detail_tree = etree.HTML(page_content)
                        text_list = detail_tree.xpath("/html/body/div[1]/div/div/div[3]/div/div/div[1]/div[2]/p//text()")
                        # Collapse the text nodes and strip all whitespace.
                        formatted_text = ' '.join(text_list)
                        cleaned_text = formatted_text.replace("\n", "").replace("\r", "").replace("\t", "").replace(' ', '')
                        try:
                            # Insert the question, then record the URL as done.
                            sql = "INSERT INTO tiku (题目) VALUES (%s)"
                            cursor.execute(sql, (cleaned_text,))
                            db.commit()
                            cursor.execute("INSERT INTO crawled_urls (url) VALUES (%s)", (full_url,))
                            db.commit()
                        except pymysql.MySQLError as e:
                            print(f"线程-{thread_id}: 数据库错误 {e}")
                            db.rollback()
                    break  # list page crawled successfully — stop retrying
                except requests.RequestException as e:
                    retries += 1
                    print(f"线程-{thread_id}: 请求 URL 失败,正在重试... {e}")
        finally:
            # BUG FIX: always return the connection to the pool. The original
            # only returned it on the success/skip paths, so a page that
            # exhausted its retries leaked the connection and eventually
            # starved the pool, deadlocking the other workers.
            db_pool.put(db)
# --- driver: fill the work queue, start the workers, report total time ---

start_time = time.time()

# Work items are page *offsets*; each worker computes start_page + offset.
url_queue = Queue()
for offset in range(58387 - start_page):  # 58387 presumably the last page id — TODO confirm
    url_queue.put(offset)

# Simple connection pool: one pymysql connection per worker thread.
db_pool = Queue(maxsize=num_threads)
for _ in range(num_threads):
    db_pool.put(pymysql.connect(**db_config))

# Start one crawler thread per pooled connection.
threads = []
for i in range(num_threads):
    worker = threading.Thread(target=crawl_and_insert,
                              args=(i, url_queue, db_pool, start_page))
    worker.start()
    threads.append(worker)

# Wait for every worker to drain the queue and exit.
for worker in threads:
    worker.join()

# BUG FIX: close the pooled connections once all workers are done —
# the original left them open until process exit.
while not db_pool.empty():
    db_pool.get().close()

elapsed_time = time.time() - start_time
print(f"所有线程都已完成。总运行时间:{elapsed_time:.2f} 秒")
|