from urllib
import
request
import
urllib,re,chardet
def request_url(url, host):
    """Fetch ``url`` and return the response body decoded to ``str``.

    The request is sent with a desktop Firefox ``User-Agent`` and an
    explicit ``Host`` header. The body's character encoding is guessed
    with ``chardet`` and used to decode the raw bytes.

    Args:
        url: Absolute URL of the page to download.
        host: Value to send in the ``Host`` request header.

    Returns:
        The decoded page content as a string.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the
            request fails.
        UnicodeDecodeError: If the body cannot be decoded with the
            detected (or fallback) encoding.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:62.0) Gecko/20100101 Firefox/62.0',
        'Host': host,
    }
    req = urllib.request.Request(url, headers=headers)

    # Load the page. The context manager closes the HTTP response (and its
    # socket) as soon as the body is read, even if read() raises.
    with urllib.request.urlopen(req) as response:
        raw = response.read()

    # Detect the page encoding from the raw bytes.
    detected = chardet.detect(raw)

    # The detector may report no encoding (e.g. for an empty body); passing
    # None to bytes.decode() would raise TypeError, so fall back to UTF-8.
    encoding = detected.get('encoding') or 'utf-8'

    # Decode the page using the detected encoding.
    return raw.decode(encoding)
def parse_html():
    """Scrape the btdx8 front page and print torrent info for each movie.

    Downloads the site's home page, extracts every movie entry with a
    regular expression, fetches each movie's detail page, and prints the
    title, category and download address of every torrent link found there.

    Returns:
        None. Results are written to standard output.
    """
    url = 'https://www.btdx8.com/'
    host = 'www.btdx8.com'
    list_html = request_url(url, host)

    # Extract the movie list with a regular expression.
    list_rules = r"<li ><a href=(.*?) title=(.*?) rel=(.*?)><img src=(.*?) alt=(.*?)\s(.*?)\s/>(.*?)</li>"
    list_data = re.findall(list_rules, list_html, re.S)

    # Regular expression for the download links on a detail page. It is
    # loop-invariant, so it is compiled once here rather than per movie.
    down_rules = r"<div id=\"zdownload\"><a href=\"(.*?)\" rel=\"nofollow\"(.*?)<span>(.*?)种子</span>(.*?).torrent"
    down_pattern = re.compile(down_rules, re.S)

    for entry in list_data:
        # Example of captured values (the surrounding quotes are part of
        # the match because the pattern captures the raw attribute text):
        #   '"https://www.btdx8.com/torrent/wljqc_2018.html"',
        #   '"[未来机器城][HD-720P/1080P-MP4][英语中字][1.92GB/4.38GB][2018] BT种子"'
        #
        # SECURITY: the original code called eval() on this scraped text to
        # drop the quotes, which would execute arbitrary code supplied by
        # the remote page. Strip the quote characters instead.
        detail_url = entry[0].strip().strip('"\'')
        if not detail_url:
            # Nothing usable was captured for this entry; skip it.
            continue

        data = request_url(detail_url, host)

        # Extract the movie download addresses with the regular expression.
        for match in down_pattern.findall(data):
            title = match[3]
            category = match[2]
            address = match[0]
            print(' --电影名称:%s\n --类别:%s\n --下载地址:%s\n' % (title, category, address))
# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    parse_html()