import
scrapy
import
sys
from
util
import
get_absolute_path
import
hashlib
from
util
import
now_time
from
util
import
convert_mg_url_to_base64
import
urlparse
from
util
import
lock
from
util
import
unlock
from
lxml
import
etree
# Module-level MD5 hasher.
# NOTE(review): `m` is never referenced anywhere in this file -- looks like
# dead code; candidate for removal once confirmed no other module imports it.
m = hashlib.md5()
# Make the project root importable so the `crawl.scrapy_crawl.items` import
# below resolves regardless of the working directory.
sys.path.append(get_absolute_path())
from
crawl.scrapy_crawl.items
import
ScrapyCrawlItem
class
BookmarkSpider(scrapy.Spider):
name
=
"haoyonghaowan"
allowed_domains
=
[
"haoyonghaowan.com"
]
def
__init__(
self
):
self
.base
=
"http://www.haoyonghaowan.com"
self
.reqeust_url
=
self
.base
self
.urls
=
[]
def
start_requests(
self
):
self
.urls.append(
self
.reqeust_url)
for
url
in
self
.urls:
yield
scrapy.Request(url
=
url, callback
=
self
.parse_link_page)
def
parse_link_page(
self
,response):
parts
=
response.xpath(
'//div[contains(@class,"sitelist")]/ul/li/a/@href'
).extract()
for
url
in
parts:
yield
scrapy.Request(url
=
url, callback
=
self
.parse_item)
def
parse_item(
self
,response):
items
=
response.xpath(
'//article[contains(@class,"sitelist")]/div/ul/li/a'
).extract()
for
it
in
items:
try
:
lock()
item
=
ScrapyCrawlItem()
item[
'category2'
]
=
self
.parse_category2(response)
item[
'category1'
]
=
self
.parse_category1(response)
it
=
etree.fromstring(it)
item[
'name'
]
=
self
.parse_name(it)
item[
'url'
]
=
self
.parse_url(it)
if
item[
'url'
]:
item[
'icon'
]
=
self
.parse_icon(item[
'url'
])
item[
'crawl_time'
]
=
self
.parse_crawl_time(it)
item[
'content'
]
=
self
.parse_content(it)
item[
'origin_website'
]
=
self
.base
item[
'stop'
]
=
False
print
item
if
item[
'name'
]
and
item[
'url'
]
and
item[
'category2'
]
and
(
"http"
in
item[
'url'
]):
yield
item
except
Exception as e:
print
"ee"
,e
finally
:
unlock()
item
=
ScrapyCrawlItem()
item[
'stop'
]
=
True
yield
item
def
parse_name(
self
, it):
name
=
it.xpath(
'//text()'
)
if
name:
return
name[
0
]
return
None
def
parse_url(
self
, it):
href
=
it.xpath(
'//@href'
)
if
href:
return
href[
0
]
return
None
def
parse_category2(
self
, response):
name
=
response.xpath(
'//article[contains(@class,"sitelist")]/header/h2/text()'
).extract()
if
name:
return
name[
0
].replace(
'\n'
,'
').replace('
\t
','
').replace('
','
').replace('
\r
','
')
return
None
def
parse_category1(
self
, response):
return
None
def
parse_icon(
self
,link):
try
:
url
=
"http://www.google.cn/s2/favicons?domain="
+
urlparse.urlparse(link).netloc
return
convert_mg_url_to_base64(url)
except
Exception as e:
print
"errro it"
,e
def
parse_crawl_time(
self
, response):
return
now_time()
def
parse_content(
self
, it):
return
None
if
__name__
=
=
'__main__'
:
spider
=
BookmarkSpider()
print
spider.now_time()