# Import the libraries needed at run time and set up the basic database configuration.
from selenium.webdriver import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
import pymongo
import time,os
# Connect to the MongoDB server on localhost:27017 and select database "DB".
client = pymongo.MongoClient('localhost', 27017)
db = client['DB']
# Three collections, one per question type: xinli1 holds single-choice,
# xinli2 multiple-choice and xinli3 true/false questions.
xinli1, xinli2, xinli3 = (db[name] for name in ('xinli1', 'xinli2', 'xinli3'))
# When a crawl failed and has to be repeated, empty the collections first
# by enabling the three lines below.
# xinli1.delete_many({})
# xinli2.delete_many({})
# xinli3.delete_many({})
# Open the login page in a new Chrome session.
# NOTE(review): this URL is plain http while the practice page opened later
# uses https — consider switching once confirmed to work.
url = "http://www.zaixiankaoshi.com/login/"
options = webdriver.ChromeOptions()
# Browser initialisation
########################
# Work around SSL certificate errors.
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
# Exclude the 'enable-automation' and 'enable-logging' switches so Chrome
# does not emit useless log output.
options.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])
driver = webdriver.Chrome(options=options)
# The original also stored driver.get_window_size() in a variable that was
# never read; that unused WebDriver round-trip has been dropped.
driver.set_window_rect(x=1300, y=100, width=1250, height=1300)  # window position and size
driver.maximize_window()  # then maximise the browser window
# Implicit wait: element lookups poll for up to 20 seconds while the page
# is still loading its data.
driver.implicitly_wait(20)
driver.get(url)
#########################
## Log in to the account first.
# The credentials may be supplied through the ZXKS_USERNAME / ZXKS_PASSWORD
# environment variables; the literals are kept only as backward-compatible
# defaults.
# NOTE(review): hard-coded credentials should not live in source control.
login_user = os.environ.get("ZXKS_USERNAME", "18203971920")
login_password = os.environ.get("ZXKS_PASSWORD", "118833xx..")
# Absolute XPath of the container shared by the three login-form controls.
login_form_xpath = "/html/body/div[3]/div/div/section/div[2]/div/div[2]/form/div"
driver.find_element(By.XPATH, login_form_xpath + "/div[1]/div/div/div/input").send_keys(login_user)
driver.find_element(By.XPATH, login_form_xpath + "/div[2]/div/div/div/input").send_keys(login_password)
driver.find_element(By.XPATH, login_form_xpath + "/div[3]/button").click()
# Presumably gives the login request time to finish before the next page
# is opened — TODO confirm that one second is always enough.
time.sleep(1)
## ID of the paper (question bank) to crawl.
paperId = 8856290
## Index of the question at which crawling starts.
sequence = 0
# Build the URL of the page that shows this question.
url = (
    "https://www.zaixiankaoshi.com/online/"
    f"?paperId={paperId}"
    "&practice=&modal=1&is_recite=&qtype="
    "&text=%E9%A1%BA%E5%BA%8F%E7%BB%83%E4%B9%A0"
    f"&sequence={sequence}"
    "&is_collect=0"
)
# driver = webdriver.Chrome()
# driver.get(url)
# Open the page through window.open() in the already logged-in browser and
# switch the driver to the most recently opened window handle.
driver.execute_script(f"window.open('{url}'); ")
driver.switch_to.window(driver.window_handles[-1])
## If sequence is not 0, i.e. crawling does not start at the first question,
## the following lines have to be enabled as well.
# time.sleep(1)
# driver.find_element(By.XPATH,"/html/body/div[3]/div/div/section/div[2]/div[1]/div[2]/div[1]/div/div").click()
# time.sleep(1)
# Absolute XPath shared by the question area and the mode switch.
panel_xpath = "/html/body/div[3]/div/div/section/div[2]/div[1]/div[2]"
# Locate the element that contains the question.
start = driver.find_element(By.XPATH, panel_xpath + "/div[1]/div/div")
# Click the "recite mode" control so that the answers are displayed.
driver.find_element(By.XPATH, panel_xpath + "/div[2]/div[2]/div[1]/p[2]/span[2]").click()
########### Main loop ###########
# Scrape the question currently displayed, store it in the collection that
# matches its type, then advance to the next question.
# NOTE(review): the pasted source had lost all indentation; the nesting below
# is reconstructed from the statement order — confirm against the original.
# NOTE(review): the loop has no exit condition, so it only ends when one of
# the calls inside it raises.

# Absolute XPath of the question card; every lookup below is relative to it.
question_xpath = "/html/body/div[3]/div/div/section/div[2]/div[1]/div[2]/div[1]/div[1]"
# Question-type label -> (typeNo, target collection).
collections_by_type = {
    '单选题': (0, xinli1),
    "多选题": (1, xinli2),
    "判断题": (2, xinli3),
}
x = 0  # number of questions stored so far
while True:
    # Correct answer as displayed in recite mode.
    answer = driver.find_element(By.XPATH, question_xpath + "/div[3]/div[1]/div/div[1]/div/b/span").text
    # Question type label.
    question_type = start.find_element(By.XPATH, question_xpath + "/div[1]/div/div[1]/div/span[1]").text
    # Pick the collection for this type; an unrecognised label is treated
    # like a single-choice question, exactly as the original defaults did.
    type_no, collection = collections_by_type.get(question_type, (0, xinli1))
    # Question stem (text only, no images or other special content).
    question = start.find_element(By.XPATH, question_xpath + "/div/div[1]/div/div").text
    # Elements of the individual answer options.
    option_elements = start.find_element(By.XPATH, question_xpath + "/div").find_elements(By.CLASS_NAME, "option")
    # Explanation text; left empty when the page shows none. find_elements
    # returns an empty list instead of raising, which replaces the former
    # bare "except:" that also swallowed unrelated errors.
    analysis_elements = driver.find_elements(By.CLASS_NAME, "answer-analysis")
    explain = analysis_elements[0].text if analysis_elements else ""
    print(f"question={question}")
    # Document to be stored for this question.
    record = {
        "type": question_type,
        "typeNo": type_no,
        "question": question,
        "answer": answer,
        "explain": explain,
    }
    # Add every option under the key "option<label>", then store the record.
    for element in option_elements:
        option_no = element.find_element(By.CLASS_NAME, "before-icon").text
        option_text = element.find_elements(By.TAG_NAME, "span")[1].text
        print(f"optionNo={option_no},option={option_text}")
        record["option" + option_no] = option_text
    collection.insert_one(record)
    x += 1
    print(f"已完成第{x}题的读入")
    # Go to the next question by pressing Enter on the "next" button.
    driver.find_element(By.XPATH, question_xpath + "/div[1]/div/div[3]/button[2]").send_keys(Keys.ENTER)
    # Special handling once the tenth question has been stored.
    # NOTE(review): the pause and the Escape key press are assumed to belong
    # to this branch (presumably dismissing a dialog shown at that point) —
    # confirm, as the source indentation was lost.
    if x == 10:
        print("第十题")
        time.sleep(1)
        driver.find_element(By.TAG_NAME, "body").send_keys(Keys.ESCAPE)
# 已在 Python 3.7、Selenium 4.8.0、PyMongo 4.3.3、MongoDB 5.0.14(x86_64)环境下测试,抓取和写入数据库均正常。
# 主要问题应该就是我在 4 楼说的那个:查找 class 含 option 的 CSS 选择器那部分代码需要修改。