兴隆台租房-美的什么时候上市的

2023年9月16日发(作者:祁莺)
python爬⾍请求库httpx和parsel解析库的使⽤测评
⽬录
requests + BeautifulSoup组合
requests + parsel组合
httpx同步 + parsel组合
httpx异步+ parsel组合
对⽐与总结
Python网络爬虫领域两个最新的比较火的工具莫过于httpx和parsel了。httpx号称下一代的网络请求库,不仅支持
requests库的所有操作,还能发送异步请求,为编写异步爬虫提供了便利。parsel最初集成在著名Python爬虫框架Scrapy中,
后独立出来成为一个单独的模块,支持XPath选择器、CSS选择器和正则表达式等多种解析提取方式。据说相比于
BeautifulSoup,parsel的解析效率更高。
今天我们就以爬取链家网上的二手房在售房产信息为例,来测评下httpx和parsel这两个库。为了节约时间,我们以爬取上海市
浦东新区总价500万元-800万元的三房房产为例。
requests + BeautifulSoup组合
整个项⽬代码如下所⽰:
# homelink_
# Author: ⼤江狗
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup
import csv
import re
import time
class HomeLinkSpider(object):
    """Crawl Lianjia second-hand listings with requests and BeautifulSoup.

    Parsed listings accumulate in ``self.data`` (one dict per listing) and
    are exported by :meth:`write_csv_file`.
    """

    # Compiled once for the class instead of once per listing.
    _FLOOR_PATTERN = re.compile(r'\d{1,2}')
    _YEAR_PATTERN = re.compile(r'\d{4}')
    _PRICE_PATTERN = re.compile(r'\d+')

    def __init__(self):
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.data = list()
        self.path = "浦东_三房_500_800万.csv"
        # requests needs an absolute URL. "a3p5" presumably encodes the
        # 3-bedroom / 5-8 million filters named in the CSV file -- confirm.
        self.url = "https://sh.lianjia.com/ershoufang/pudong/a3p5/"

    def get_max_page(self):
        """Return the number of result pages, or None when the request fails."""
        import json  # local import: only this method needs it

        response = requests.get(self.url, headers=self.headers)
        if response.status_code != 200:
            print("请求失败 status:{}".format(response.status_code))
            return None
        soup = BeautifulSoup(response.text, 'html.parser')
        a = soup.select('div[class="page-box house-lst-page-box"]')
        # page-data holds a JSON object; parse it instead of eval()-ing
        # text that comes from a remote server.
        return json.loads(a[0].attrs["page-data"])["totalPage"]

    @staticmethod
    def _first_match(pattern, text, default="未知"):
        """Return the first match of ``pattern`` in ``text``, else ``default``."""
        match = pattern.search(text or "")
        return match.group() if match else default

    @classmethod
    def _build_detail(cls, title, house_info, position_info,
                      total_price, unit_price):
        """Build one row dict from the raw text fields of a listing.

        ``house_info`` is the " | "-separated summary (rooms, area,
        direction, decoration, floor, year, type); ``position_info`` is the
        ``[community, district]`` pair.
        """
        # Pad so records with fewer fields fall back to defaults
        # instead of raising IndexError.
        info = (house_info or "").split(" | ") + [""] * 6
        place = list(position_info) + ["", ""]
        return {
            'title': title,
            'bedroom': info[0],
            'area': info[1],
            'direction': info[2],
            'floor': cls._first_match(cls._FLOOR_PATTERN, info[4]),
            'year': cls._first_match(cls._YEAR_PATTERN, info[5]),
            'house': place[0],
            'location': place[1],
            # e.g. "650万" -> "650"
            'total_price': cls._first_match(cls._PRICE_PATTERN, total_price),
            # e.g. "单价64182元/平米" -> "64182"
            'unit_price': cls._first_match(cls._PRICE_PATTERN, unit_price),
        }

    def parse_page(self):
        """Fetch every result page and append one dict per listing to ``self.data``."""
        max_page = self.get_max_page()
        if max_page is None:
            # get_max_page() has already reported the failed request.
            return
        for i in range(1, max_page + 1):
            url = 'https://sh.lianjia.com/ershoufang/pudong/pg{}a3p5/'.format(i)
            response = requests.get(url, headers=self.headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            ul = soup.find_all("ul", class_="sellListContent")
            for li in ul[0].select("li"):
                position_text = li.select('div[class="positionInfo"]')[0].get_text()
                self.data.append(self._build_detail(
                    title=li.select('div[class="title"]')[0].get_text(),
                    house_info=li.select('div[class="houseInfo"]')[0].get_text(),
                    position_info=position_text.split(' - '),
                    total_price=li.select('div[class="totalPrice"]')[0].get_text(),
                    unit_price=li.select('div[class="unitPrice"]')[0].get_text(),
                ))

    def write_csv_file(self):
        """Write ``self.data`` to ``self.path`` as CSV (UTF-8 with BOM)."""
        head = ["标题", "⼩区", "房厅", "⾯积", "朝向", "楼层", "年份",
                "位置", "总价(万)", "单价(元/平⽅⽶)"]
        keys = ["title", "house", "bedroom", "area", "direction",
                "floor", "year", "location",
                "total_price", "unit_price"]
        try:
            with open(self.path, 'w', newline='', encoding='utf_8_sig') as csv_file:
                writer = csv.writer(csv_file, dialect='excel')
                writer.writerow(head)
                writer.writerows([item[k] for k in keys] for item in self.data)
            print("Write a CSV file to path %s Successful." % self.path)
        except (OSError, KeyError) as e:
            print("Fail to write CSV to path: %s, Case: %s" % (self.path, e))
if __name__ == '__main__':
    # Time the whole crawl-and-export run.
    start = time.time()
    home_link_spider = HomeLinkSpider()
    home_link_spider.parse_page()
    home_link_spider.write_csv_file()
    end = time.time()
    print("耗时:{}秒".format(end-start))
注意:我们使⽤了fake_useragent, requests和BeautifulSoup,这些都需要通过pip事先安装好才能⽤。
现在我们来看下爬取结果,耗时约18.5秒,总共爬取580条数据。
requests + parsel组合
这次我们同样采用requests获取目标网页内容,使用parsel库(事先需通过pip安装)来解析。Parsel库的用法和BeautifulSoup相
似,都是先创建实例,然后使用各种选择器提取DOM元素和数据,但语法上稍有不同。BeautifulSoup有自己的语法规则,而
Parsel库支持标准的css选择器和xpath选择器, 通过get方法或getall方法获取文本或属性值,使用起来更方便。
# BeautifulSoup usage
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')
ul = soup.find_all("ul", class_="sellListContent")[0]
# Parsel usage, via the Selector class
from parsel import Selector
selector = Selector(response.text)
ul = selector.css('ul.sellListContent')[0]
# Parsel: reading text or attribute values
selector.css('div.title span::text').get()
selector.css('ul li a::attr(href)').get()
for li in selector.css('ul > li'):
    print(li.xpath('.//@href').get())
注:⽼版的parsel库使⽤extract()或extract_first()⽅法获取⽂本或属性值,在新版中已被get()和getall()⽅法替代。
全部代码如下所⽰:
# homelink_
# Author: ⼤江狗
from fake_useragent import UserAgent
import requests
import csv
import re
import time
from parsel import Selector
class HomeLinkSpider(object):
    """Crawl Lianjia second-hand listings with requests and parsel.

    Parsed listings accumulate in ``self.data`` (one dict per listing) and
    are exported by :meth:`write_csv_file`.
    """

    # Compiled once for the class instead of once per listing.
    _FLOOR_PATTERN = re.compile(r'\d{1,2}')
    _YEAR_PATTERN = re.compile(r'\d{4}')
    _PRICE_PATTERN = re.compile(r'\d+')

    def __init__(self):
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.data = list()
        self.path = "浦东_三房_500_800万.csv"
        # requests needs an absolute URL. "a3p5" presumably encodes the
        # 3-bedroom / 5-8 million filters named in the CSV file -- confirm.
        self.url = "https://sh.lianjia.com/ershoufang/pudong/a3p5/"

    def get_max_page(self):
        """Return the number of result pages, or None when the request fails."""
        import json  # local import: only this method needs it

        response = requests.get(self.url, headers=self.headers)
        if response.status_code != 200:
            print("请求失败 status:{}".format(response.status_code))
            return None
        # Build a Selector over the page and locate the pagination div.
        selector = Selector(response.text)
        a = selector.css('div[class="page-box house-lst-page-box"]')
        # page-data holds a JSON object; parse it instead of eval()-ing
        # text that comes from a remote server. "./" keeps the XPath
        # relative to the matched div rather than the whole document.
        max_page = json.loads(a[0].xpath('./@page-data').get())["totalPage"]
        print("最⼤页码数:{}".format(max_page))
        return max_page

    @staticmethod
    def _first_match(pattern, text, default="未知"):
        """Return the first match of ``pattern`` in ``text``, else ``default``."""
        match = pattern.search(text or "")
        return match.group() if match else default

    @classmethod
    def _build_detail(cls, title, house_info, position_info,
                      total_price, unit_price):
        """Build one row dict from the raw text fields of a listing.

        ``house_info`` is the " | "-separated summary (rooms, area,
        direction, decoration, floor, year, type); ``position_info`` is the
        ``[community, district]`` pair.
        """
        # Pad so records with fewer fields fall back to defaults
        # instead of raising IndexError.
        info = (house_info or "").split(" | ") + [""] * 6
        place = list(position_info) + ["", ""]
        return {
            'title': title,
            'bedroom': info[0],
            'area': info[1],
            'direction': info[2],
            'floor': cls._first_match(cls._FLOOR_PATTERN, info[4]),
            'year': cls._first_match(cls._YEAR_PATTERN, info[5]),
            'house': place[0],
            'location': place[1],
            # e.g. "650万" -> "650"
            'total_price': cls._first_match(cls._PRICE_PATTERN, total_price),
            # e.g. "单价64182元/平米" -> "64182"
            'unit_price': cls._first_match(cls._PRICE_PATTERN, unit_price),
        }

    def parse_page(self):
        """Fetch every result page and append one dict per listing to ``self.data``."""
        max_page = self.get_max_page()
        if max_page is None:
            # get_max_page() has already reported the failed request.
            return
        for i in range(1, max_page + 1):
            url = 'https://sh.lianjia.com/ershoufang/pudong/pg{}a3p5/'.format(i)
            response = requests.get(url, headers=self.headers)
            selector = Selector(response.text)
            ul = selector.css('ul.sellListContent')[0]
            for li in ul.css('li'):
                self.data.append(self._build_detail(
                    title=li.css('div.title a::text').get(),
                    house_info=li.css('div.houseInfo::text').get(),
                    position_info=li.css('div.positionInfo a::text').getall(),
                    total_price=li.css('div.totalPrice span::text').get(),
                    unit_price=li.css('div.unitPrice span::text').get(),
                ))

    def write_csv_file(self):
        """Write ``self.data`` to ``self.path`` as CSV (UTF-8 with BOM)."""
        head = ["标题", "⼩区", "房厅", "⾯积", "朝向", "楼层",
                "年份", "位置", "总价(万)", "单价(元/平⽅⽶)"]
        keys = ["title", "house", "bedroom", "area",
                "direction", "floor", "year", "location",
                "total_price", "unit_price"]
        try:
            with open(self.path, 'w', newline='', encoding='utf_8_sig') as csv_file:
                writer = csv.writer(csv_file, dialect='excel')
                writer.writerow(head)
                writer.writerows([item[k] for k in keys] for item in self.data)
            print("Write a CSV file to path %s Successful." % self.path)
        except (OSError, KeyError) as e:
            print("Fail to write CSV to path: %s, Case: %s" % (self.path, e))
if __name__ == '__main__':
    # Time the whole crawl-and-export run.
    start = time.time()
    home_link_spider = HomeLinkSpider()
    home_link_spider.parse_page()
    home_link_spider.write_csv_file()
    end = time.time()
    print("耗时:{}秒".format(end-start))
现在我们来看下爬取结果,爬取580条数据耗时约16.5秒,节省了2秒时间。可见parsel⽐BeautifulSoup解析效率是要⾼的,
爬取任务少时差别不⼤,任务多的话差别可能会⼤些。
httpx同步 + parsel组合
我们现在来更进⼀步,使⽤httpx替代requests库。httpx发送同步请求的⽅式和requests库基本⼀样,所以我们只需要修改上例
中两⾏代码,把requests替换成httpx即可, 其余代码⼀模⼀样。
from fake_useragent import UserAgent
import csv
import re
import time
from parsel import Selector
import httpx
class HomeLinkSpider(object):
    """Crawl Lianjia second-hand listings with synchronous httpx and parsel.

    Parsed listings accumulate in ``self.data`` (one dict per listing) and
    are exported by :meth:`write_csv_file`.
    """

    # Compiled once for the class instead of once per listing.
    _FLOOR_PATTERN = re.compile(r'\d{1,2}')
    _YEAR_PATTERN = re.compile(r'\d{4}')
    _PRICE_PATTERN = re.compile(r'\d+')

    def __init__(self):
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.data = list()
        self.path = "浦东_三房_500_800万.csv"
        # httpx needs an absolute URL. "a3p5" presumably encodes the
        # 3-bedroom / 5-8 million filters named in the CSV file -- confirm.
        self.url = "https://sh.lianjia.com/ershoufang/pudong/a3p5/"

    def get_max_page(self):
        """Return the number of result pages, or None when the request fails."""
        import json  # local import: only this method needs it

        # httpx.get replaces requests.get here.
        response = httpx.get(self.url, headers=self.headers)
        if response.status_code != 200:
            print("请求失败 status:{}".format(response.status_code))
            return None
        # Build a Selector over the page and locate the pagination div.
        selector = Selector(response.text)
        a = selector.css('div[class="page-box house-lst-page-box"]')
        # page-data holds a JSON object; parse it instead of eval()-ing
        # text that comes from a remote server. "./" keeps the XPath
        # relative to the matched div rather than the whole document.
        max_page = json.loads(a[0].xpath('./@page-data').get())["totalPage"]
        print("最⼤页码数:{}".format(max_page))
        return max_page

    @staticmethod
    def _first_match(pattern, text, default="未知"):
        """Return the first match of ``pattern`` in ``text``, else ``default``."""
        match = pattern.search(text or "")
        return match.group() if match else default

    @classmethod
    def _build_detail(cls, title, house_info, position_info,
                      total_price, unit_price):
        """Build one row dict from the raw text fields of a listing.

        ``house_info`` is the " | "-separated summary (rooms, area,
        direction, decoration, floor, year, type); ``position_info`` is the
        ``[community, district]`` pair.
        """
        # Pad so records with fewer fields fall back to defaults
        # instead of raising IndexError.
        info = (house_info or "").split(" | ") + [""] * 6
        place = list(position_info) + ["", ""]
        return {
            'title': title,
            'bedroom': info[0],
            'area': info[1],
            'direction': info[2],
            'floor': cls._first_match(cls._FLOOR_PATTERN, info[4]),
            'year': cls._first_match(cls._YEAR_PATTERN, info[5]),
            'house': place[0],
            'location': place[1],
            # e.g. "650万" -> "650"
            'total_price': cls._first_match(cls._PRICE_PATTERN, total_price),
            # e.g. "单价64182元/平米" -> "64182"
            'unit_price': cls._first_match(cls._PRICE_PATTERN, unit_price),
        }

    def parse_page(self):
        """Fetch every result page and append one dict per listing to ``self.data``."""
        max_page = self.get_max_page()
        if max_page is None:
            # get_max_page() has already reported the failed request.
            return
        for i in range(1, max_page + 1):
            url = 'https://sh.lianjia.com/ershoufang/pudong/pg{}a3p5/'.format(i)
            # httpx.get replaces requests.get here.
            response = httpx.get(url, headers=self.headers)
            selector = Selector(response.text)
            ul = selector.css('ul.sellListContent')[0]
            for li in ul.css('li'):
                self.data.append(self._build_detail(
                    title=li.css('div.title a::text').get(),
                    house_info=li.css('div.houseInfo::text').get(),
                    position_info=li.css('div.positionInfo a::text').getall(),
                    total_price=li.css('div.totalPrice span::text').get(),
                    unit_price=li.css('div.unitPrice span::text').get(),
                ))

    def write_csv_file(self):
        """Write ``self.data`` to ``self.path`` as CSV (UTF-8 with BOM)."""
        head = ["标题", "⼩区", "房厅", "⾯积", "朝向", "楼层",
                "年份", "位置", "总价(万)", "单价(元/平⽅⽶)"]
        keys = ["title", "house", "bedroom", "area", "direction",
                "floor", "year", "location",
                "total_price", "unit_price"]
        try:
            with open(self.path, 'w', newline='', encoding='utf_8_sig') as csv_file:
                writer = csv.writer(csv_file, dialect='excel')
                writer.writerow(head)
                writer.writerows([item[k] for k in keys] for item in self.data)
            print("Write a CSV file to path %s Successful." % self.path)
        except (OSError, KeyError) as e:
            print("Fail to write CSV to path: %s, Case: %s" % (self.path, e))
if __name__ == '__main__':
    # Time the whole crawl-and-export run.
    start = time.time()
    home_link_spider = HomeLinkSpider()
    home_link_spider.parse_page()
    home_link_spider.write_csv_file()
    end = time.time()
    print("耗时:{}秒".format(end-start))
整个爬取过程耗时16.1秒,可见使⽤httpx发送同步请求时效率和requests基本⽆差别。
注意:Windows上使⽤pip安装httpx可能会出现报错,要求安装Visual Studio C++, 这个下载安装好就没事了。
接下来,我们就要开始王炸了,使⽤httpx和asyncio编写⼀个异步爬⾍看看从链家⽹上爬取580条数据到底需要多长时间。
httpx异步+ parsel组合
httpx厉害的地方就是能发送异步请求。整个异步爬虫的实现原理是:先发送同步请求获取最大页码,把每个单页的爬取和数据
解析变为一个asyncio协程任务(使用async定义),最后交给事件循环(loop)并发执行。
⼤部分代码与同步爬⾍相同,主要变动地⽅有两个:
# Async: coroutine that parses a single page; takes that page's URL.
async def parse_single_page(self, url):
    # Fetch the page with an asynchronous httpx request.
    async with httpx.AsyncClient() as client:
        response = await client.get(url, headers=self.headers)
    selector = Selector(response.text)
    # ... the rest is the same as in the synchronous version

def parse_page(self):
    """Schedule one coroutine per result page and run them concurrently."""
    max_page = self.get_max_page()
    if max_page is None:
        # Nothing to crawl when the page count could not be fetched.
        return
    urls = ['https://sh.lianjia.com/ershoufang/pudong/pg{}a3p5/'.format(i)
            for i in range(1, max_page + 1)]

    async def crawl_all():
        # gather() wraps every coroutine in a task. Single tasks can also
        # be created with asyncio.ensure_future / loop.create_task, or
        # asyncio.create_task from Python 3.7 on.
        await asyncio.gather(*(self.parse_single_page(url) for url in urls))

    # asyncio.run() creates, runs and closes the event loop. Passing bare
    # coroutines to asyncio.wait() raises TypeError on Python 3.11+.
    asyncio.run(crawl_all())
整个项⽬代码如下所⽰:
from fake_useragent import UserAgent
import csv
import re
import time
from parsel import Selector
import httpx
import asyncio
class HomeLinkSpider(object):
    """Crawl Lianjia second-hand listings with asynchronous httpx and parsel.

    The page count is fetched synchronously; every result page is then
    fetched and parsed by its own coroutine. Parsed listings accumulate in
    ``self.data`` and are exported by :meth:`write_csv_file`.
    """

    # Compiled once for the class instead of once per listing.
    _FLOOR_PATTERN = re.compile(r'\d{1,2}')
    _YEAR_PATTERN = re.compile(r'\d{4}')
    _PRICE_PATTERN = re.compile(r'\d+')

    def __init__(self):
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.data = list()
        self.path = "浦东_三房_500_800万.csv"
        # httpx needs an absolute URL. "a3p5" presumably encodes the
        # 3-bedroom / 5-8 million filters named in the CSV file -- confirm.
        self.url = "https://sh.lianjia.com/ershoufang/pudong/a3p5/"

    def get_max_page(self):
        """Return the number of result pages, or None when the request fails."""
        import json  # local import: only this method needs it

        response = httpx.get(self.url, headers=self.headers)
        if response.status_code != 200:
            print("请求失败 status:{}".format(response.status_code))
            return None
        # Build a Selector over the page and locate the pagination div.
        selector = Selector(response.text)
        a = selector.css('div[class="page-box house-lst-page-box"]')
        # page-data holds a JSON object; parse it instead of eval()-ing
        # text that comes from a remote server. "./" keeps the XPath
        # relative to the matched div rather than the whole document.
        max_page = json.loads(a[0].xpath('./@page-data').get())["totalPage"]
        print("最⼤页码数:{}".format(max_page))
        return max_page

    @staticmethod
    def _first_match(pattern, text, default="未知"):
        """Return the first match of ``pattern`` in ``text``, else ``default``."""
        match = pattern.search(text or "")
        return match.group() if match else default

    @classmethod
    def _build_detail(cls, title, house_info, position_info,
                      total_price, unit_price):
        """Build one row dict from the raw text fields of a listing.

        ``house_info`` is the " | "-separated summary (rooms, area,
        direction, decoration, floor, year, type); ``position_info`` is the
        ``[community, district]`` pair.
        """
        # Pad so records with fewer fields fall back to defaults
        # instead of raising IndexError.
        info = (house_info or "").split(" | ") + [""] * 6
        place = list(position_info) + ["", ""]
        return {
            'title': title,
            'bedroom': info[0],
            'area': info[1],
            'direction': info[2],
            'floor': cls._first_match(cls._FLOOR_PATTERN, info[4]),
            'year': cls._first_match(cls._YEAR_PATTERN, info[5]),
            'house': place[0],
            'location': place[1],
            # e.g. "650万" -> "650"
            'total_price': cls._first_match(cls._PRICE_PATTERN, total_price),
            # e.g. "单价64182元/平米" -> "64182"
            'unit_price': cls._first_match(cls._PRICE_PATTERN, unit_price),
        }

    # Async: coroutine that parses a single page; takes that page's URL.
    async def parse_single_page(self, url):
        """Fetch one result page and append its listings to ``self.data``."""
        async with httpx.AsyncClient() as client:
            response = await client.get(url, headers=self.headers)
        selector = Selector(response.text)
        ul = selector.css('ul.sellListContent')[0]
        for li in ul.css('li'):
            self.data.append(self._build_detail(
                title=li.css('div.title a::text').get(),
                house_info=li.css('div.houseInfo::text').get(),
                position_info=li.css('div.positionInfo a::text').getall(),
                total_price=li.css('div.totalPrice span::text').get(),
                unit_price=li.css('div.unitPrice span::text').get(),
            ))

    async def _crawl(self, urls):
        """Run ``parse_single_page`` for every URL concurrently."""
        # gather() wraps each coroutine in a task on the running loop.
        await asyncio.gather(*(self.parse_single_page(url) for url in urls))

    def parse_page(self):
        """Crawl all result pages concurrently on a fresh event loop."""
        max_page = self.get_max_page()
        if max_page is None:
            # get_max_page() has already reported the failed request.
            return
        urls = ['https://sh.lianjia.com/ershoufang/pudong/pg{}a3p5/'.format(i)
                for i in range(1, max_page + 1)]
        # asyncio.run() creates, runs and closes the event loop. Passing bare
        # coroutines to asyncio.wait() raises TypeError on Python 3.11+.
        asyncio.run(self._crawl(urls))

    def write_csv_file(self):
        """Write ``self.data`` to ``self.path`` as CSV (UTF-8 with BOM)."""
        head = ["标题", "⼩区", "房厅", "⾯积", "朝向", "楼层",
                "年份", "位置", "总价(万)", "单价(元/平⽅⽶)"]
        keys = ["title", "house", "bedroom", "area", "direction",
                "floor", "year", "location",
                "total_price", "unit_price"]
        try:
            with open(self.path, 'w', newline='', encoding='utf_8_sig') as csv_file:
                writer = csv.writer(csv_file, dialect='excel')
                writer.writerow(head)
                writer.writerows([item[k] for k in keys] for item in self.data)
            print("Write a CSV file to path %s Successful." % self.path)
        except (OSError, KeyError) as e:
            print("Fail to write CSV to path: %s, Case: %s" % (self.path, e))
if __name__ == '__main__':
    # Time the whole crawl-and-export run.
    start = time.time()
    home_link_spider = HomeLinkSpider()
    home_link_spider.parse_page()
    home_link_spider.write_csv_file()
    end = time.time()
    print("耗时:{}秒".format(end-start))
现在到了见证奇迹的时刻了。从链家⽹上爬取了580条数据,使⽤httpx编写的异步爬⾍仅仅花了2.5秒!!
对⽐与总结
爬取同样的内容,采⽤不同⼯具组合耗时是不⼀样的。httpx异步+parsel组合毫⽆疑问是最⼤的赢家, requests和
BeautifulSoup确实可以功成⾝退啦。
requests + BeautifulSoup: 18.5 秒
requests + parsel: 16.5秒
httpx 同步 + parsel: 16.1秒
httpx 异步 + parsel: 2.5秒
对于Python爬⾍,你还有喜欢的库吗?
以上就是python爬⾍请求库httpx和parsel解析库的使⽤测评的详细内容,更多关于python httpx和parsel的资料请关注其它相关
⽂章!
马鞍山建材市场-snkers专属购买通道在哪

更多推荐
link lianjia com
发布评论