import
re
import
time
import
random
import
requests
from
lxml
import
etree
def get_ip_list():
    """Scrape free proxy addresses from kuaidaili.com's free-proxy listing.

    Returns:
        list[str]: "ip:port" strings collected from every page visited,
        e.g. ['1.2.3.4:8080', ...].
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    # Accumulate across all pages. The original re-created this list and
    # returned inside the page loop, so only the first page was ever kept;
    # hoisting both out makes widening the range(1, 2) actually work.
    proxies_list = []
    for page in range(1, 2):
        print('==========正在获取第{}页ip============'.format(str(page)))
        base_url = 'https://www.kuaidaili.com/free/inha/{}/'.format(str(page))
        response = requests.get(base_url, headers=headers)
        html_data = etree.HTML(response.text)
        # One <tr> per proxy record in the listing table.
        parse_list = html_data.xpath(
            '//table[@class="table table-bordered table-striped"]/tbody/tr'
        )
        for tr in parse_list:
            ip_num = tr.xpath('./td[1]/text()')   # column 1: IP address
            ip_port = tr.xpath('./td[2]/text()')  # column 2: port
            proxies_list.append(ip_num[0] + ':' + ip_port[0])
            time.sleep(0.5)  # throttle so the site isn't hammered per row
    return proxies_list
def check_ip(proxies_list):
    """Filter *proxies_list* down to proxies that can reach sf.taobao.com.

    Args:
        proxies_list: iterable of "ip:port" strings.

    Returns:
        list[str]: usable proxy URLs, each prefixed with "http://".
    """
    headers = {
        'Referer': 'https://baidu.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    can_use = []
    for proxy in proxies_list:
        try:
            proxy_host = "http://" + proxy
            # Map BOTH schemes: the probe URL below is https, so the
            # original "http"-only mapping made requests bypass the proxy
            # entirely and the check validated nothing.
            proxies = {"http": proxy_host, "https": proxy_host}
            # NOTE: timeout=0.1 is very aggressive — only extremely fast
            # proxies will ever pass; kept from the original.
            response = requests.get('https://sf.taobao.com', headers=headers,
                                    proxies=proxies, timeout=0.1)
            if response.status_code == 200:
                can_use.append(proxy_host)
            else:
                print('不可使用')
        except Exception as e:
            # Timeouts/connection errors just mean this proxy is unusable.
            print(e)
    print('总共获得%d条有用ip' % len(can_use))
    return can_use
def get_random_ip(can_use):
    """Pick one proxy at random and wrap it in a requests-style proxies dict.

    Args:
        can_use: non-empty sequence of proxy URLs ("http://ip:port").

    Returns:
        dict: {'http': chosen_proxy}, suitable for requests' ``proxies=``.

    Raises:
        IndexError: if *can_use* is empty (same as the original behavior).
    """
    # The original copied can_use element-by-element into a fresh list
    # before choosing; random.choice works on the sequence directly.
    proxy_ip = random.choice(can_use)
    return {'http': proxy_ip}
# Unused counter (nothing in this file reads it; kept for compatibility).
p = 0
# Number of listing pages next_page() generates URLs for.
curPage = 1
# Seed list of auction detail pages to crawl; parse_url() appends more.
link_list = [
    'https://sf-item.taobao.com/sf_item/640824628883.htm',
    'https://sf-item.taobao.com/sf_item/641502301303.htm',
    'https://sf-item.taobao.com/sf_item/641843794084.htm',
    'https://sf-item.taobao.com/sf_item/642251459421.htm',
    'https://sf-item.taobao.com/sf_item/642193923780.htm',
    'https://sf-item.taobao.com/sf_item/642194171464.htm',
    'https://sf-item.taobao.com/sf_item/642254059021.htm',
    'https://sf-item.taobao.com/sf_item/642633234548.htm',
    'https://sf-item.taobao.com/sf_item/641674674943.htm',
    'https://sf-item.taobao.com/sf_item/641860786177.htm',
    'https://sf-item.taobao.com/sf_item/641833404613.htm',
    'https://sf-item.taobao.com/sf_item/642906910580.htm',
    'https://sf-item.taobao.com/sf_item/642930454182.htm',
    'https://sf-item.taobao.com/sf_item/643246399434.htm',
    'https://sf-item.taobao.com/sf_item/643107141190.htm',
    'https://sf-item.taobao.com/sf_item/641423917301.htm',
    'https://sf-item.taobao.com/sf_item/641126976926.htm',
    'https://sf-item.taobao.com/sf_item/640899648258.htm',
    'https://sf-item.taobao.com/sf_item/641285745911.htm',
    'https://sf-item.taobao.com/sf_item/641870471695.htm',
    'https://sf-item.taobao.com/sf_item/641051516853.htm',
    'https://sf-item.taobao.com/sf_item/641299249850.htm',
    'https://sf-item.taobao.com/sf_item/640900456736.htm',
    'https://sf-item.taobao.com/sf_item/641435389754.htm',
    'https://sf-item.taobao.com/sf_item/642820317425.htm',
    'https://sf-item.taobao.com/sf_item/642186087592.htm',
    'https://sf-item.taobao.com/sf_item/642151179689.htm',
    'https://sf-item.taobao.com/sf_item/640825988107.htm',
    'https://sf-item.taobao.com/sf_item/644021779506.htm',
    'https://sf-item.taobao.com/sf_item/642666508870.htm',
    'https://sf-item.taobao.com/sf_item/643049745347.htm',
    'https://sf-item.taobao.com/sf_item/642662660309.htm',
    'https://sf-item.taobao.com/sf_item/642664828068.htm',
    'https://sf-item.taobao.com/sf_item/642663848703.htm',
    'https://sf-item.taobao.com/sf_item/643740591181.htm',
    'https://sf-item.taobao.com/sf_item/643395154262.htm',
    'https://sf-item.taobao.com/sf_item/643398210037.htm',
    'https://sf-item.taobao.com/sf_item/643047449184.htm',
    'https://sf-item.taobao.com/sf_item/643738223811.htm',
    'https://sf-item.taobao.com/sf_item/643050809962.htm'
]
# Browser-like request headers shared by gethtml()/gethtml_detail().
header = {
    'authority': 'sf.taobao.com',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'zh-CN,zh;q=0.9',
    # NOTE(review): cookie left empty — the site may require a logged-in
    # session cookie for real results; confirm before relying on this.
    'cookie': '',
}
# Query-string parameters for the item-list endpoint. The province value
# is percent-encoded GBK bytes — presumably "重庆"; verify against the site.
params = (
    ('spm', 'a213w.7398504.filter.63.76d93a49nDn8tk'),
    ('auction_source', '0'),
    ('province', '%D6%D8%C7%EC'),
    ('sorder', '1'),
    ('st_param', '-1'),
    ('auction_start_seg', '-1'),
)
def gethtml(url):
    """GET *url* with the module-level header/params and return the body
    decoded as GBK text (the site serves GBK-encoded pages)."""
    resp = requests.get(url, headers=header, params=params)
    return resp.content.decode('gbk')
def gethtml_detail(url):
    """GET a detail page through a randomly chosen working proxy.

    Reads the module-level ``can_use`` list (populated in __main__) and
    returns the response body decoded as GBK text.
    """
    chosen = get_random_ip(can_use)
    print(chosen)
    resp = requests.get(url, headers=header, params=params, proxies=chosen)
    return resp.content.decode('gbk')
def parse_url(html):
    """Extract detail-page paths from a listing page into link_list.

    Args:
        html: raw HTML text of an item-list page.

    Returns:
        list[str]: the module-level ``link_list``, extended in place with
        the newly discovered detail URLs.
    """
    # Match everything from "sf-item" up to (but not including) "?".
    for fragment in re.findall(r'(sf-item[\S]+)\?', html):
        # Strip stray quotes captured from the surrounding HTML attribute.
        link_list.append("https://" + fragment.replace('"', ""))
    return link_list
def parse_url_detail(r):
    """Pull the notice-detail link out of an item detail page.

    Reads the ``data-from`` attribute of the #J_NoticeDetail element and
    prepends the "https:" scheme.
    """
    doc = etree.HTML(r)
    data_from = doc.xpath('//*[@id="J_NoticeDetail"]/@data-from')[0]
    return "https:" + data_from.strip()
def next_page(pages=None):
    """Build item-list URLs for pages 1..pages.

    Args:
        pages: number of listing pages to generate URLs for. Defaults to
            the module-level ``curPage`` (backward compatible with the
            original zero-argument call).

    Returns:
        list[str]: fully formatted listing-page URLs.
    """
    if pages is None:
        pages = curPage
    url_np = 'https://sf.taobao.com/item_list.htm?spm=a213w.7398504.pagination.1.7c773a49a7C9Lp&auction_source=0&province=%D6%D8%C7%EC&st_param=-1&auction_start_seg=-1&page={}'
    # Pages are 1-based in the query string.
    return [url_np.format(i + 1) for i in range(pages)]
def run_AL():
    """Crawl items 2-4 of link_list through a proxy, printing each page's
    HTML and its extracted notice-detail link."""
    for detail_url in link_list[1:4]:
        page_html = gethtml_detail(detail_url)
        print(page_html)
        print(parse_url_detail(page_html))
if __name__ == '__main__':
    # Scrape candidate proxies, keep the ones that respond, then crawl the
    # hard-coded auction detail links through a random working proxy.
    # NOTE: can_use must stay a module-level name — gethtml_detail() reads
    # it as a global.
    proxies_list = get_ip_list()
    can_use = check_ip(proxies_list)
    run_AL()