好友
阅读权限10
听众
最后登录1970-1-1
|
利用 Python 快速抓取链家二手房信息;使用前请将代码中的 URL 替换为你需要抓取的链家页面地址。
代码如下:
import traceback
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Browser-like request headers; Lianjia rejects obviously non-browser clients.
# BUG FIX: the original key was "User - Agent" (with spaces), which is not a
# valid HTTP header name, so the intended User-Agent string was never sent
# under the "User-Agent" header. The value's stray spaces are normalized too.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    )
}
def parse_html(url):
    """Fetch one Lianjia (链家) second-hand-housing listing page and parse it.

    :param url: full URL of one listing page, e.g. an sz.lianjia.com/ershoufang page
    :return: list of dicts, one per listing, keyed by the Chinese column names
             written to Excel (address, layout, size, orientation, renovation,
             floor, total/unit price, follow info)
    :raises requests.HTTPError: if the server responds with an error status
    """
    listings = []
    resp = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, 'lxml')
    # Each listing is an <li class="LOGCLICKDATA"> under .sellListContent.
    for sell in soup.select('.sellListContent li.LOGCLICKDATA'):
        # div.houseInfo renders as one '|'-separated string, e.g.
        # "<layout> | <size> | <orientation> | <renovation> | <floor> ..."
        # NOTE(review): field order assumed from the original index mapping —
        # confirm against the live page markup.
        house_info = list(sell.select('div.houseInfo')[0].stripped_strings)
        fields = [f.strip() for f in house_info[0].split('|')]
        # Pad so listings with fewer fields don't raise IndexError.
        fields += [''] * (5 - len(fields))
        layout, size, orientation, renovation, floor = fields[:5]
        # Remaining cells: join all stripped strings so text split across
        # child nodes is concatenated intact.
        position = ''.join(sell.select('div.positionInfo')[0].stripped_strings)
        total_price = ''.join(sell.select('div.totalPrice')[0].stripped_strings)
        unit_price = list(sell.select('div.unitPrice')[0].stripped_strings)[0]
        follow_info = ''.join(sell.select('div.followInfo')[0].stripped_strings)
        listings.append({
            "房屋地址": position,
            "房子类型": layout,
            "面积大小": size,
            "房间朝向": orientation,
            "装修类型": renovation,
            "楼层": floor,
            "房屋总价": total_price,
            "房屋单价": unit_price,
            "关注发布": follow_info,
        })
    return listings
def export_excel(datas, filename="链家龙华3室2手房.xlsx"):
    """Write the scraped listings to an Excel workbook.

    :param datas: list of per-listing dicts (as produced by ``parse_html``);
                  dict keys become the Excel column headers
    :param filename: output path; defaults to the original hard-coded name
                     so existing callers are unchanged
    :return: None
    """
    df = pd.DataFrame(datas)
    # index=False: the row number carries no information for this dataset.
    df.to_excel(filename, index=False)
# --- entry point: scrape Longhua 3-bedroom listings and export to Excel ---
# (variables renamed: the original "movie data" names were copy-paste leftovers)
all_listings = []  # accumulated listing dicts across all pages
# range(1, 2) fetches only page 1; widen it (e.g. range(1, 11)) for more pages.
for page in range(1, 2):
    # URL pattern: pg{n} selects the page number, l3 filters 3-bedroom listings.
    url = 'https://sz.lianjia.com/ershoufang/longhuaqu/pg{}l3/'.format(page)
    page_listings = parse_html(url)
    print(page_listings)
    all_listings += page_listings
export_excel(all_listings)
|
|
发帖前要善用【论坛搜索】功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。 |
|
|
|
|