吾爱破解 - 52pojie.cn

 找回密码
 注册[Register]

QQ登录

只需一步,快速开始

查看: 751|回复: 26
收起左侧

[求助] 请教大神们一个Python selenium问题

[复制链接]
xiaojipkhuang1 发表于 2024-8-30 09:57
[Python] 纯文本查看 复制代码
import re
import threading
import time
import driver
import pymysql
import requests
from selenium import webdriver
import time
import random
import xlwt
from selenium.webdriver.common.by import By
import smtplib
from email.mime.text import MIMEText
from email.header import Header
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib3.util import wait
import datetime
from datetime import datetime, timedelta
from apscheduler.schedulers.blocking import BlockingScheduler
import pymssql
import csv
import os


def get_ts(keyword, text):
    """Return the number that follows *keyword* in *text*.

    The match is: the literal keyword, then any run of CJK characters
    (U+4E00..U+9FA5), then a run of digits and dots, which is returned.

    Args:
        keyword: Literal marker to look for (e.g. a currency sign or label).
        text: Text to search.

    Returns:
        The matched digits/dots as a string, or ``"1"`` when nothing matches.
    """
    # Escape the keyword so markers such as "$", "+" or "(" are matched
    # literally instead of being interpreted as regex syntax.
    pattern = re.escape(keyword) + r'[\u4e00-\u9fa5]*([\d\.]+)'
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    return "1"


def get_gs(keyword, text):
    """Return the integer that immediately precedes *keyword* in *text*.

    Args:
        keyword: Literal marker that must directly follow the digits.
        text: Text to search.

    Returns:
        The matched digits as a string, or ``"0"`` when nothing matches.
    """
    # Escape the keyword so brackets, dots, etc. are matched literally
    # rather than being interpreted as regex syntax.
    pattern = r"(\d+)" + re.escape(keyword)
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    return "0"


def get_yp(keyword, text):
    """Return the integer that immediately precedes *keyword* in *text*.

    Args:
        keyword: Literal marker that must directly follow the digits.
        text: Text to search.

    Returns:
        The matched digits as a string, or ``"0"`` when nothing matches.
    """
    # Escape the keyword so regex metacharacters in it are matched literally.
    pattern = r"(\d+)" + re.escape(keyword)
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    return "0"


def get_tnr(keyword, text):
    """Return the number that follows *keyword* in *text*.

    The match is: the literal keyword, then any run of CJK characters
    (U+4E00..U+9FA5), then a run of digits and dots, which is returned.

    Args:
        keyword: Literal marker to look for.
        text: Text to search.

    Returns:
        The matched digits/dots as a string, or ``"0"`` when nothing matches.
    """
    # Escape the keyword so regex metacharacters in it are matched literally.
    pattern = re.escape(keyword) + r'[\u4e00-\u9fa5]*([\d\.]+)'
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    return "0"


# def my_job():
#     # 执行JavaScript来刷新页面
#     driver.execute_script("window.location.reload();")
#     time.sleep(random.randint(2, 3))
#     driver.find_element(by=By.XPATH, value='//*[@id="layout"]/div[2]/div[2]/div[3]/div[1]/div/span[2]/span').click()
#     time.sleep(random.randint(3, 4))
#     driver.find_element(by=By.XPATH, value='//*[@id="layout"]/div[2]/div[2]/div[3]/div[1]/div/span[2]/span').click()
#     get_data()
    # work_book.save("盼之.xls")


def extract_datetime(text):
    """Find the first ``YYYY-MM-DD HH:MM:SS`` timestamp inside *text*.

    Returns:
        The matched timestamp string, or ``None`` if *text* contains none.
    """
    found = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', text)
    return found.group() if found else None

    # 配置邮箱发送信息


def con():
    """Open and return a new PyMySQL connection to the ``mysql`` database.

    No host is passed, so the driver's default host is used
    (presumably localhost — confirm against the PyMySQL docs).

    Returns:
        The open connection. The caller is responsible for closing it.
    """
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from the environment or a config file instead.
    db = pymysql.connect(
        port=3306,
        user='root',
        password='cyj19950610',
        db='mysql',
        charset='utf8'
    )
    # Hand the connection back so it can actually be used and closed;
    # without the return the handle is dropped and the connection leaks.
    return db


# 计算页面文本中包含关键字的数量


class save:
    """Shared MySQL handle used to persist scraped rows.

    The class is used as a plain namespace and is never instantiated.
    The connection and cursor are created once, when the class body is
    executed, and are reached as ``save.db``, ``save.cursor`` and
    ``save.sql``.
    """

    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from the environment or a config file instead.
    # A single connection is opened here; opening an extra one and dropping
    # its handle would only leak it.
    db = pymysql.connect(
        port=3306,
        user='root',
        password='cyj19950610',
        db='mysql',
        charset='utf8'
    )
    cursor = db.cursor()
    # Parameterized insert; IGNORE makes MySQL skip rows that would violate
    # a unique constraint instead of raising.
    sql = 'insert ignore  into db_px_wwqy (bh, nss, amount, rate, info_time) values(%s, %s, %s, %s, %s)'




def get_data():
    """Scrape every listing card on the current page and store it in MySQL.

    Uses the module-level ``driver`` (the Selenium WebDriver created in the
    ``__main__`` section) to locate the cards, and ``save.cursor`` /
    ``save.sql`` / ``save.db`` to insert and commit one row per card.

    For each card the stored columns are:
        bh        -- first 10 characters of the card text
        nss       -- full card text
        amount    -- number following "¥" in the price element (string)
        rate      -- 4% of the number following "无畏点数", rounded
        info_time -- local time at which the card was processed

    Duplicate handling is left to the ``INSERT IGNORE`` in ``save.sql``
    (assumes a unique key on ``bh`` — TODO confirm against the table schema).

    Raises:
        selenium NoSuchElementException: if a card lacks the expected nodes.
        ValueError: if the extracted point count is not a valid number.
    """
    cards = driver.find_elements(by=By.XPATH, value='//div[@class="game_list"]/div[2]/div')
    print(cards)
    for card in cards:
        nss = card.find_element(by=By.XPATH,
                                value='.//div/div[2]/a/div/div[2]').text
        # The leading "." scopes the lookup to this card. An XPath starting
        # with "//" is evaluated from the document root even when called on
        # an element, which would return the first price on the page for
        # every card.
        price_text = card.find_element(by=By.XPATH,
                                       value='.//div/div[3]/div').text
        amount = get_ts("¥", price_text)
        info_time = datetime.now()
        bh = nss[:10]
        ds = get_ts("无畏点数", nss)
        # get_ts may return a decimal string such as "12.5"; float() accepts
        # it where int() would raise.
        rate = round(float(ds) * 0.04)
        print(f"文本: {nss}")
        print(f"金额: {amount}")
        print(f"折损: {rate}")
        save.cursor.execute(save.sql,
                            (bh, nss, amount, rate, info_time))
        # Commit per row so already-scraped cards survive a later failure.
        save.db.commit()

if __name__ == '__main__':
    # Script entry point: start Chrome, open the listing page, click the
    # sort control twice, then scrape the visible cards via get_data().
    #
    # Disabled Excel export, kept for reference:
    # div_list = ["编号", "天赏石", "天霓染", "金额", "天赏均价", "发布时间"]
    # col = 0  # row / column cursor
    # row = 1
    # work_book = xlwt.Workbook(encoding='utf-8')  # create the workbook
    # work_sheet = work_book.add_sheet('tb_shopping')  # create a sheet
    # pattern = xlwt.Pattern()  # cell styling
    # pattern.pattern = xlwt.Pattern.SOLID_PATTERN
    # pattern.pattern_fore_colour = 5
    # for i in range(6):  # write the header row
    #     work_sheet.col(i).width = 4444
    #     work_sheet.write(0, i, div_list[i])

    # All flags go on the single options object that is handed to the
    # driver; flags set on a second, unused ChromeOptions never take effect.
    opts = webdriver.ChromeOptions()
    # Request headless mode through a command-line flag rather than the
    # `headless` property, which newer Selenium releases no longer provide.
    opts.add_argument('--headless')
    opts.add_argument('--ignore-certificate-errors')  # ignore certificate errors
    opts.add_argument("--disable-3d-apis")
    opts.add_argument('log-level=3')
    opts.add_argument('blink-settings=imagesEnabled=false')  # skip image loading
    driver = webdriver.Chrome(options=opts)
    try:
        driver.get('https://www.pxb7.com/selectgame?game_id=148&game_alias=games&gameAlias=wwqy')
        driver.implicitly_wait(1)
        driver.maximize_window()
        sort_xpath = '//*[@id="layout"]/div[2]/div[2]/div[3]/div[1]/div/span[2]/span'
        # The control is clicked twice, pausing after each click so the
        # dynamically rendered list can refresh.
        driver.find_element(by=By.XPATH, value=sort_xpath).click()
        time.sleep(random.randint(2, 3))
        driver.find_element(by=By.XPATH, value=sort_xpath).click()
        time.sleep(random.randint(2, 3))
        get_data()
    finally:
        # Always shut the browser down so no Chrome/chromedriver process is
        # left behind, even when scraping fails.
        driver.quit()
    # Disabled periodic refresh, kept for reference:
    # # create the scheduler
    # scheduler = BlockingScheduler()
    # # add the job: run at a fixed interval
    # scheduler.add_job(my_job, 'interval', seconds=10)
    # scheduler.start()



这是我的代码

这个xpath能定位到这个文本信息,但是为啥内容一直提取出来是空的

这个xpath能定位到这个文本信息,但是为啥内容一直提取出来是空的

免费评分

参与人数 1吾爱币 +1 热心值 +1 收起 理由
cjcmxc + 1 + 1 我很赞同!

查看全部评分

发帖前要善用论坛搜索功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。

numbersi 发表于 2024-8-30 10:09
是不是动态加载的原因
 楼主| xiaojipkhuang1 发表于 2024-8-30 10:11
numbersi 发表于 2024-8-30 10:09
是不是动态加载的原因

动态加载的话但是这个xpath位置都是不变的诶,只是内容变化
numbersi 发表于 2024-8-30 10:16
前端vue写的,动态加载的,dom都是动态生成的,你静态获取是空
 楼主| xiaojipkhuang1 发表于 2024-8-30 10:19
numbersi 发表于 2024-8-30 10:16
前端vue写的,动态加载的,dom都是动态生成的,你静态获取是空

啊,那这种有啥办法可以获取内容吗
uuwatch 发表于 2024-8-30 10:22
//*[@id="exposureType:productList|productId:297|gameId:148|price:60|index:1|rcToken:148_297^1724984233_1724984233_4^57_"]/div[2]/a/span[2]
//div[@class="game_list"]/div[2]/div/div/div[2]/a/div/div[2]
要不你试试用正则匹配?
numbersi 发表于 2024-8-30 10:25
xiaojipkhuang1 发表于 2024-8-30 10:19
啊,那这种有啥办法可以获取内容吗

动态获取你看api啊
 楼主| xiaojipkhuang1 发表于 2024-8-30 10:26
uuwatch 发表于 2024-8-30 10:22
//*[@id="exposureType:productList|productId:297|gameId:148|price:60|index:1|rcToken:148_297^17249842 ...

正则的话这个xpath是动态的
 楼主| xiaojipkhuang1 发表于 2024-8-30 10:27
numbersi 发表于 2024-8-30 10:25
动态获取你看api啊

好吧,动态的话get的地址也是动态变化的,参数我都逆向了,就是请求的地址有个加密的逆不出来
qianxiaohe 发表于 2024-8-30 10:45
可以换掉selenium用试试playwright库。安装库命令pip install playwright,安装对应的浏览器和驱动命令playwright install,录制命令python -m playwright codegen 网址
您需要登录后才可以回帖 登录 | 注册[Register]

本版积分规则

返回列表

RSS订阅|小黑屋|处罚记录|联系我们|吾爱破解 - LCG - LSG ( 京ICP备16042023号 | 京公网安备 11010502030087号 )

GMT+8, 2024-12-11 23:53

Powered by Discuz!

Copyright © 2001-2020, Tencent Cloud.

快速回复 返回顶部 返回列表