# 楼主的代码我的环境下抓不到信息,重写了一下,加了一些注释。
from urllib import request
import urllib,re,chardet
def request_url(url, host):
    """Download *url* and return the response body decoded to text.

    Args:
        url: Address of the page to fetch.
        host: Value sent in the ``Host`` request header.

    Returns:
        The page content as ``str``. The encoding is guessed from the raw
        bytes with ``chardet``; UTF-8 is used when no guess is available,
        and bytes that cannot be decoded are replaced instead of raising.

    Raises:
        Any exception raised by ``urllib.request.urlopen`` propagates to
        the caller unchanged.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:62.0) Gecko/20100101 Firefox/62.0',
        'Host': host,
    }
    req = urllib.request.Request(url, headers=header)
    # Load the page. The context manager closes the connection even when
    # read() fails, which the original code never did.
    with urllib.request.urlopen(req) as de:
        html = de.read()
    # Detect the page encoding from the raw bytes.
    charset = chardet.detect(html)
    # chardet may report None as the encoding when it cannot make a guess
    # (for example on an empty body); decode(None) would raise TypeError.
    encoding = charset['encoding'] or 'utf-8'
    # Decode using the detected encoding. A guessed encoding can be slightly
    # off, so replace undecodable bytes rather than abort the whole page.
    return html.decode(encoding, errors='replace')
def parse_html():
    """Print name, category and download address of each listed movie.

    Fetches the btdx8 home page with ``request_url``, extracts every movie
    entry with a regular expression, fetches each entry's detail page and
    prints one record per download link found on it.

    Returns:
        None. Results are written to standard output.

    Raises:
        ValueError, SyntaxError: If a captured ``href`` value is not a plain
            quoted string literal.
        Any exception raised by ``request_url`` propagates unchanged.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import ast

    url = 'https://www.btdx8.com/'
    host = 'www.btdx8.com'
    list_html = request_url(url, host)

    # Regular expression that extracts the movie list from the home page.
    list_rules = re.compile(
        r"<li ><a href=(.*?) title=(.*?) rel=(.*?)><img src=(.*?) alt=(.*?)\s(.*?)\s/>(.*?)</li>",
        re.S,
    )
    # Regular expression that extracts the download address from a detail
    # page. Compiled once here instead of on every loop iteration.
    down_rules = re.compile(
        r"<div id=\"zdownload\"><a href=\"(.*?)\" rel=\"nofollow\"(.*?)<span>(.*?)种子</span>(.*?).torrent",
        re.S,
    )

    for entry in list_rules.findall(list_html):
        # entry[0] is the href capture *including* its quotes, e.g.
        # '"https://www.btdx8.com/torrent/wljqc_2018.html"'.
        # SECURITY: the original called eval() on this text, which comes
        # straight from a remote page and would execute arbitrary code.
        # ast.literal_eval unquotes a string literal the same way but
        # rejects anything that is not a literal.
        detail_url = ast.literal_eval(entry[0])
        data = request_url(detail_url, host)
        for match in down_rules.findall(data):
            title = match[3]
            category = match[2]
            address = match[0]
            print((' --电影名称:%s\n --类别:%s\n --下载地址:%s\n'%(title, category, address)))
# Run the scraper only when this file is executed as a script.
# (The stray trailing "|" after the call was a syntax error and is removed.)
if __name__ == '__main__':
    parse_html()