# NOTE: forum paste artifact neutralised (was: [Python] 纯文本查看 复制代码)
# -*- coding: utf-8 -*-
"""
中国农药信息网 - 农药登记数据爬虫
目标网站: http://www.chinapesticide.org.cn/zwb/dataCenter (农药登记)
数据接口: https://www.icama.cn/BasicdataSystem/pesticideRegistration/
提取字段: 登记证号、农药名称、农药类别、剂型、总含量、有效期至、登记证持有人、持有人地址
"""
import csv
import re
import time
import logging
import requests
from bs4 import BeautifulSoup
# ── Logging ───────────────────────────────────────────────────────
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
log = logging.getLogger(__name__)

# ── Endpoints ─────────────────────────────────────────────────────
BASE = "https://www.icama.cn/BasicdataSystem/pesticideRegistration"
LIST_URL = f"{BASE}/queryselect.do"    # paginated registration list (POST)
DETAIL_URL = f"{BASE}/viewcompany.do"  # per-certificate holder detail (GET)

# Browser-like headers so the site serves the same HTML a browser would get.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "zh-CN,zh;q=0.9",
}

OUTPUT_FILE = "pesticide_data.csv"
# CSV column order: cert no. | product name | category | formulation |
# total content | valid until | holder | holder address.
CSV_COLUMNS = [
    "登记证号", "农药名称", "农药类别", "剂型",
    "总含量", "有效期至", "登记证持有人", "持有人地址",
]

# ── Shared session (connection pooling + sticky cookies) ──────────
session = requests.Session()
session.headers.update(HEADERS)
def clean(text: str) -> str:
    """Normalise a scraped cell value by removing ALL whitespace.

    The site pads table cells with newlines and runs of spaces; for the
    Chinese field values here, deleting every whitespace character is the
    intended normalisation.  Falsy input (``None``, ``""``) yields ``""``.

    Note: the original chained ``.strip()`` after the substitution was a
    no-op (no whitespace can remain) and has been dropped.
    """
    if not text:
        return ""
    return re.sub(r"\s+", "", text)
# ── List query ────────────────────────────────────────────────────
def fetch_list(page: int = 1, size: int = 20) -> list[dict]:
    """POST the registration-list query and parse one page of results.

    Args:
        page: 1-based page number.
        size: rows per page.

    Returns:
        Row dicts keyed by the Chinese CSV column names, plus a private
        ``_cert_code`` key used later to look up the holder address.
        Returns ``[]`` on network failure, non-200 status, a suspiciously
        short body, or a missing result table — callers treat ``[]`` as
        "no more data".
    """
    # All filter fields are submitted blank: an unfiltered paginated query.
    data = {
        "pageNo": str(page),
        "pageSize": str(size),
        "djzh": "", "nymc": "", "cjmc": "", "sf": "", "nylb": "",
        "zhl": "", "jx": "", "zwmc": "", "fzdx": "", "syff": "",
        "dx": "", "yxcf": "", "yxcf_en": "", "yxcfhl": "",
        "yxcf2": "", "yxcf2_en": "", "yxcf2hl": "",
        "yxcf3": "", "yxcf3_en": "", "yxcf3hl": "",
        "yxqs_start": "", "yxqs_end": "",
        "yxjz_start": "", "yxjz_end": "",
        "accOrfuzzy": "2",
    }
    headers = {**HEADERS, "Content-Type": "application/x-www-form-urlencoded"}
    try:
        resp = session.post(LIST_URL, data=data, headers=headers, timeout=30)
    except requests.RequestException:
        # Consistent with fetch_holder_address: a network error is "no data",
        # not a crash of the whole crawl.
        return []
    resp.encoding = "utf-8"
    if resp.status_code != 200 or len(resp.text) < 200:
        return []

    soup = BeautifulSoup(resp.text, "html.parser")
    table = soup.find("table", id="tab")
    if not table:
        return []

    rows: list[dict] = []
    for idx, tr in enumerate(table.find_all("tr")):
        if idx == 0:
            continue  # skip the header row
        tds = tr.find_all("td")
        if len(tds) < 7:
            continue  # spacer / malformed row
        # 7 columns: cert no | name | category | formulation | content |
        #            valid-until | holder
        texts = [clean(td.get_text()) for td in tds]
        holder_name = texts[6]
        cert_code = texts[0]
        # The holder cell may wrap the name in <a onclick="_viewcompany('CODE')">;
        # that CODE (not the visible cert number) keys the detail endpoint.
        a = tds[6].find("a", onclick=True)
        if a:
            a_text = clean(a.get_text())
            if a_text:
                holder_name = a_text
            m = re.search(r"_viewcompany\('([^']+)'", a["onclick"])
            if m:
                cert_code = m.group(1)
        rows.append({
            "登记证号": texts[0],
            "农药名称": texts[1],
            "农药类别": texts[2],
            "剂型": texts[3],
            "总含量": texts[4],
            "有效期至": texts[5],
            "登记证持有人": holder_name,
            "持有人地址": "",
            "_cert_code": cert_code or texts[0],  # fallback to the cert no.
        })
    return rows
# ── Holder detail ─────────────────────────────────────────────────
def fetch_holder_address(certificate_code: str) -> str:
    """Fetch the registration holder's address via ``viewcompany.do``.

    Args:
        certificate_code: the code extracted from the list page's
            ``_viewcompany('...')`` onclick handler.

    Returns:
        The cleaned address string, or ``""`` on any network error,
        non-200 response, or if no plausible address cell is found.
    """
    url = f"{DETAIL_URL}?certificateCode={certificate_code}"
    try:
        resp = session.get(url, timeout=15)
        resp.encoding = "utf-8"
        if resp.status_code != 200:
            return ""
    except Exception:
        # Best-effort: a failed detail lookup leaves the address blank.
        return ""

    soup = BeautifulSoup(resp.text, "html.parser")
    # The label cell contains "地址"; the value sits in the sibling <td>.
    for td in soup.find_all("td"):
        if "地址" in clean(td.get_text()):
            next_td = td.find_next_sibling("td")
            if next_td:
                addr = clean(next_td.get_text())
                # len > 3 filters out empty / placeholder values.
                if addr and len(addr) > 3:
                    return addr
    return ""
# ── CSV output ────────────────────────────────────────────────────
def save_csv(rows: list[dict], filename: str = OUTPUT_FILE) -> None:
    """Write *rows* to *filename*, overwriting any previous checkpoint.

    Uses ``utf-8-sig`` (UTF-8 with BOM) so Excel opens the Chinese text
    correctly.  Only the declared CSV_COLUMNS are written; private keys
    such as ``_cert_code`` are dropped, and a missing key becomes "" so
    one malformed row cannot abort the whole checkpoint write.
    """
    with open(filename, "w", encoding="utf-8-sig", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_COLUMNS)
        writer.writeheader()
        for r in rows:
            writer.writerow({k: r.get(k, "") for k in CSV_COLUMNS})
# ── Main flow ─────────────────────────────────────────────────────
def main() -> None:
    """Crawl every list page, enrich rows with the holder address, and
    persist everything to ``OUTPUT_FILE`` (checkpointing after each page).
    """
    log.info("=" * 60)
    log.info("中国农药信息网 - 农药登记数据爬虫")
    log.info("=" * 60)

    # Probe page 1 first so we fail fast if the endpoint has changed.
    log.info("正在测试列表接口...")
    test_rows = fetch_list(page=1, size=20)
    if not test_rows:
        log.error("列表接口不可用,请检查网络或接口是否变更")
        return
    log.info("接口可用! 第1页返回 %d 条数据", len(test_rows))

    all_rows: list[dict] = []
    page = 1
    while True:
        log.info("正在抓取第 %d 页...", page)
        # Reuse the probe result for page 1 instead of fetching it twice.
        rows = fetch_list(page=page) if page > 1 else test_rows
        if not rows:
            log.info("第 %d 页无数据,抓取结束", page)
            break
        # Enrich each row with the holder address from the detail page.
        for row in rows:
            code = row.pop("_cert_code", "")
            if code:
                row["持有人地址"] = fetch_holder_address(code)
                time.sleep(0.3)  # be polite to the detail endpoint
            all_rows.append(row)
        log.info(
            " 第 %d 页完成, 本页 %d 条, 累计 %d 条",
            page, len(rows), len(all_rows),
        )
        # Checkpoint every page so an interruption loses at most one page.
        save_csv(all_rows)
        page += 1
        time.sleep(0.5)

    # Final save (harmless if identical to the last checkpoint).
    save_csv(all_rows)
    log.info("抓取完成! 共 %d 条, 已保存至 %s", len(all_rows), OUTPUT_FILE)

    # Log a preview of the first five rows.
    if all_rows:
        log.info("-" * 80)
        for i, r in enumerate(all_rows[:5], 1):
            log.info(
                "[%d] %s | %s | %s | %s | %s | %s | %s | %s",
                i,
                r["登记证号"], r["农药名称"], r["农药类别"],
                r["剂型"], r["总含量"], r["有效期至"],
                r["登记证持有人"], r["持有人地址"],
            )
if __name__ == "__main__":
    main()