import logging
import threading
from queue import Queue

import requests
from lxml import etree
# Default HTTP request headers sent with every page fetch. The User-Agent
# identifies the client as Chrome on an Android (Nexus 5) device.
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Mobile Safari/537.36',
}
class TiebaSpider:
    """Threaded crawler that prints thread titles and links from tieba.baidu.com.

    The work flows through two queues:

    * ``url_queen`` holds listing-page URLs waiting to be downloaded.
    * ``response_queen`` holds decoded HTML bodies waiting to be parsed.

    ``get_response`` and ``get_info`` are endless worker loops meant to be
    run as daemon threads; ``run`` wires everything together and blocks until
    both queues have been fully processed.
    """

    # Site root prepended to the (relative) hrefs extracted from a page.
    _BASE_URL = 'http://tieba.baidu.com'
    # Seconds to wait on a request; without a limit a stalled connection
    # would keep ``url_queen.join()`` blocked forever.
    _TIMEOUT = 10
    _log = logging.getLogger(__name__)

    def __init__(self):
        # URL template; ``{}`` receives the ``pn`` paging offset.
        self.url = 'http://tieba.baidu.com/f?&kw=lol&pn={}'
        self.url_queen = Queue()
        self.response_queen = Queue()

    def get_urllist(self, pages=5):
        """Queue the URLs of the first *pages* listing pages.

        The ``pn`` offset advances by 30 for each page.

        Args:
            pages: Number of listing pages to enqueue (default 5).
        """
        for page in range(pages):
            self.url_queen.put(self.url.format(page * 30))

    def get_response(self):
        """Worker loop: download each queued URL and queue its decoded body.

        Never returns; use it as the target of a daemon thread. A failed
        download is logged and skipped so one bad page cannot stall the crawl.
        """
        while True:
            url = self.url_queen.get()
            try:
                response = requests.get(url, headers=headers, timeout=self._TIMEOUT)
                # Put the body *before* task_done() so response_queen already
                # has pending work by the time url_queen is considered drained.
                self.response_queen.put(response.content.decode())
            except (requests.RequestException, UnicodeDecodeError):
                self._log.exception('failed to fetch %s', url)
            finally:
                # Always acknowledge the item, otherwise url_queen.join() hangs.
                self.url_queen.task_done()

    @classmethod
    def _build_rows(cls, titles, hrefs):
        """Return one formatted display line per (title, href) pair.

        ``zip`` stops at the shorter sequence, so a page where the two XPath
        queries yield different counts produces fewer rows instead of raising
        ``IndexError``.
        """
        return [
            '标题:%s 地址:%s' % (title, cls._BASE_URL + href)
            for title, href in zip(titles, hrefs)
        ]

    def get_info(self):
        """Worker loop: parse each queued HTML body and print title/link rows.

        Never returns; use it as the target of a daemon thread. Titles come
        from ``ti_title`` spans whose text does not contain "置顶" and whose
        class does not contain "icon"; links come from anchors under the
        ``threads_list`` element, excluding the filtered classes/texts.
        """
        while True:
            page = self.response_queen.get()
            try:
                html = etree.HTML(page)
                # etree.HTML may yield None for an empty document.
                if html is not None:
                    title = html.xpath(
                        '//div[@class="ti_title"]/span[not(contains(text(),"置顶"))][not(contains(@class,"icon"))]/text()'
                    )
                    title_url = html.xpath(
                        '//ul[@class="threads_list"]//a[not(contains(@class,"ti_item j_click_stats"))][not(contains(text(),"立即查看"))]/@href'
                    )
                    for row in self._build_rows(title, title_url):
                        print(row)
            except (etree.LxmlError, ValueError):
                self._log.exception('failed to parse a downloaded page')
            finally:
                # Always acknowledge the item, otherwise response_queen.join() hangs.
                self.response_queen.task_done()

    def run(self):
        """Crawl the configured pages and block until every page is printed."""
        # Fill the URL queue synchronously. Queue.join() returns immediately
        # while nothing has been put yet, so filling it from a separate thread
        # could let run() finish before any work was even scheduled.
        self.get_urllist()

        workers = [
            threading.Thread(target=self.get_response, daemon=True)
            for _ in range(10)
        ]
        workers += [
            threading.Thread(target=self.get_info, daemon=True)
            for _ in range(3)
        ]
        # Daemon threads: the endless worker loops must not keep the process alive.
        for worker in workers:
            worker.start()

        # Order matters: once url_queen is drained every body has been queued,
        # so waiting on response_queen afterwards covers all pages.
        for pending in (self.url_queen, self.response_queen):
            pending.join()
        print("\33[36m获取结束")
if __name__ == '__main__':
    # Script entry point: build a spider and run the crawl to completion.
    TiebaSpider().run()