A hands-on review of the Python crawler request library httpx and the parsel parsing library

Contents

requests + BeautifulSoup

requests + parsel

httpx (sync) + parsel

httpx (async) + parsel

Comparison and summary

Two of the hottest new tools in the Python web-scraping world are httpx and parsel. httpx bills itself as the next-generation HTTP request library: it supports everything the requests library can do, and it can also send asynchronous requests, which makes writing async crawlers much easier. parsel was originally bundled inside Scrapy, the well-known Python crawler framework, and was later split out as a standalone module. It supports XPath selectors, CSS selectors and regular expressions as extraction methods, and is said to parse more efficiently than BeautifulSoup.

Today we will put httpx and parsel to the test by scraping the second-hand homes listed for sale on Lianjia (链家). To save time, we will scrape three-bedroom properties priced between 5,000,000 and 8,000,000 yuan in Pudong New Area, Shanghai.

requests + BeautifulSoup

The full project code is shown below:

# homelink_
# Author: 大江狗
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup
import csv
import re
import time


class HomeLinkSpider(object):
    def __init__(self):
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.data = list()
        self.path = "浦东_三房_500_800.csv"
        self.url = "https://sh.lianjia.com/ershoufang/pudong/a3p5/"

    def get_max_page(self):
        response = requests.get(self.url, headers=self.headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            a = soup.select('div[class="page-box house-lst-page-box"]')
            # use eval to turn the page-data string into a dict
            max_page = eval(a[0].attrs["page-data"])["totalPage"]
            return max_page
        else:
            print("Request failed, status: {}".format(response.status_code))
            return None

    def parse_page(self):
        max_page = self.get_max_page()
        for i in range(1, max_page + 1):
            url = 'https://sh.lianjia.com/ershoufang/pudong/pg{}a3p5/'.format(i)
            response = requests.get(url, headers=self.headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            ul = soup.find_all("ul", class_="sellListContent")
            li_list = ul[0].select("li")
            for li in li_list:
                detail = dict()
                detail['title'] = li.select('div[class="title"]')[0].get_text()

                # e.g. "2室1厅 | 74.14平米 | 南 | 精装 | 高楼层(共6层) | 1999年建 | 板楼"
                # (rooms | area | direction | decoration | floor | year built | building type)
                house_info = li.select('div[class="houseInfo"]')[0].get_text()
                house_info_list = house_info.split(" | ")

                detail['bedroom'] = house_info_list[0]
                detail['area'] = house_info_list[1]
                detail['direction'] = house_info_list[2]

                floor_pattern = re.compile(r'\d{1,2}')
                # search anywhere in the string
                match1 = re.search(floor_pattern, house_info_list[4])
                if match1:
                    detail['floor'] = match1.group()
                else:
                    detail['floor'] = "未知"  # i.e. unknown

                # match the build year
                year_pattern = re.compile(r'\d{4}')
                match2 = re.search(year_pattern, house_info_list[5])
                if match2:
                    detail['year'] = match2.group()
                else:
                    detail['year'] = "未知"

                # e.g. "文兰小区 - 塘桥": extract the neighborhood name and the area
                position_info = li.select('div[class="positionInfo"]')[0].get_text().split(' - ')
                detail['house'] = position_info[0]
                detail['location'] = position_info[1]

                # e.g. "650万" -> 650
                price_pattern = re.compile(r'\d+')
                total_price = li.select('div[class="totalPrice"]')[0].get_text()
                detail['total_price'] = re.search(price_pattern, total_price).group()

                # e.g. "单价64182元/平米" -> 64182
                unit_price = li.select('div[class="unitPrice"]')[0].get_text()
                detail['unit_price'] = re.search(price_pattern, unit_price).group()
                self.data.append(detail)

    def write_csv_file(self):
        # CSV headers: title, neighborhood, rooms, area, direction, floor,
        # year, location, total price (10k yuan), unit price (yuan per square meter)
        head = ["标题", "小区", "房厅", "面积", "朝向", "楼层", "年份",
                "位置", "总价(万)", "单价(元/平方米)"]
        keys = ["title", "house", "bedroom", "area", "direction",
                "floor", "year", "location",
                "total_price", "unit_price"]
        try:
            with open(self.path, 'w', newline='', encoding='utf_8_sig') as csv_file:
                writer = csv.writer(csv_file, dialect='excel')
                if head is not None:
                    writer.writerow(head)
                for item in self.data:
                    row_data = []
                    for k in keys:
                        row_data.append(item[k])
                    # print(row_data)
                    writer.writerow(row_data)
                print("Write a CSV file to path %s Successful." % self.path)
        except Exception as e:
            print("Fail to write CSV to path: %s, Case: %s" % (self.path, e))


if __name__ == '__main__':
    start = time.time()
    home_link_spider = HomeLinkSpider()
    home_link_spider.parse_page()
    home_link_spider.write_csv_file()
    end = time.time()
    print("Elapsed: {}".format(end - start))

Note: this code uses fake_useragent, requests and BeautifulSoup, all of which must first be installed with pip.

Now let's look at the result: the crawl took about 18.5 seconds and fetched 580 records in total.

requests + parsel

This time we again use requests to fetch the target pages, but parse them with parsel (which must also be installed with pip first). Parsel is used much like BeautifulSoup: you create an instance first, then extract DOM elements and data with various selectors, though the syntax differs slightly. BeautifulSoup has its own syntax rules, while parsel supports standard CSS selectors and XPath selectors and retrieves text or attribute values through its get and getall methods, which is more convenient.

# BeautifulSoup usage
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')
ul = soup.find_all("ul", class_="sellListContent")[0]

# Parsel usage: the Selector class
from parsel import Selector
selector = Selector(response.text)
ul = selector.css('ul.sellListContent')[0]

# Parsel: getting a text or attribute value
selector.css('div.title span::text').get()
selector.css('ul li a::attr(href)').get()
>>> for li in selector.css('ul > li'):
...     print(li.xpath('.//@href').get())

Note: older versions of parsel used the extract() and extract_first() methods to fetch text or attribute values; in newer versions these have been replaced by get() and getall().
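
The old methods still exist as aliases, so the mapping is easy to verify interactively; a quick sketch (the selectors here are illustrative):

selector.css('div.title span::text').extract_first()  # old-style API
selector.css('div.title span::text').get()            # new-style equivalent
selector.css('ul li a::attr(href)').extract()         # old-style API, returns a list
selector.css('ul li a::attr(href)').getall()          # new-style equivalent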

The full code is shown below:

# homelink_
# Author: 大江狗
from fake_useragent import UserAgent
import requests
import csv
import re
import time
from parsel import Selector


class HomeLinkSpider(object):
    def __init__(self):
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.data = list()
        self.path = "浦东_三房_500_800.csv"
        self.url = "https://sh.lianjia.com/ershoufang/pudong/a3p5/"

    def get_max_page(self):
        response = requests.get(self.url, headers=self.headers)
        if response.status_code == 200:
            # create a Selector instance
            selector = Selector(response.text)
            # use a CSS selector to get the max-page div
            a = selector.css('div[class="page-box house-lst-page-box"]')
            # use eval to turn the page-data JSON string into a dict
            max_page = eval(a[0].xpath('//@page-data').get())["totalPage"]
            print("Max page number: {}".format(max_page))
            return max_page
        else:
            print("Request failed, status: {}".format(response.status_code))
            return None

    def parse_page(self):
        max_page = self.get_max_page()
        for i in range(1, max_page + 1):
            url = 'https://sh.lianjia.com/ershoufang/pudong/pg{}a3p5/'.format(i)
            response = requests.get(url, headers=self.headers)
            selector = Selector(response.text)
            ul = selector.css('ul.sellListContent')[0]
            li_list = ul.css('li')
            for li in li_list:
                detail = dict()
                detail['title'] = li.css('div.title a::text').get()

                # e.g. "2室1厅 | 74.14平米 | 南 | 精装 | 高楼层(共6层) | 1999年建 | 板楼"
                house_info = li.css('div.houseInfo::text').get()
                house_info_list = house_info.split(" | ")

                detail['bedroom'] = house_info_list[0]
                detail['area'] = house_info_list[1]
                detail['direction'] = house_info_list[2]

                floor_pattern = re.compile(r'\d{1,2}')
                # search anywhere in the string
                match1 = re.search(floor_pattern, house_info_list[4])
                if match1:
                    detail['floor'] = match1.group()
                else:
                    detail['floor'] = "未知"  # i.e. unknown

                # match the build year
                year_pattern = re.compile(r'\d{4}')
                match2 = re.search(year_pattern, house_info_list[5])
                if match2:
                    detail['year'] = match2.group()
                else:
                    detail['year'] = "未知"

                # e.g. "文兰小区 - 塘桥": extract the neighborhood name and the area
                position_info = li.css('div.positionInfo a::text').getall()
                detail['house'] = position_info[0]
                detail['location'] = position_info[1]

                # e.g. "650万" -> 650
                price_pattern = re.compile(r'\d+')
                total_price = li.css('div.totalPrice span::text').get()
                detail['total_price'] = re.search(price_pattern, total_price).group()

                # e.g. "单价64182元/平米" -> 64182
                unit_price = li.css('div.unitPrice span::text').get()
                detail['unit_price'] = re.search(price_pattern, unit_price).group()
                self.data.append(detail)

    def write_csv_file(self):
        head = ["标题", "小区", "房厅", "面积", "朝向", "楼层",
                "年份", "位置", "总价(万)", "单价(元/平方米)"]
        keys = ["title", "house", "bedroom", "area",
                "direction", "floor", "year", "location",
                "total_price", "unit_price"]
        try:
            with open(self.path, 'w', newline='', encoding='utf_8_sig') as csv_file:
                writer = csv.writer(csv_file, dialect='excel')
                if head is not None:
                    writer.writerow(head)
                for item in self.data:
                    row_data = []
                    for k in keys:
                        row_data.append(item[k])
                    # print(row_data)
                    writer.writerow(row_data)
                print("Write a CSV file to path %s Successful." % self.path)
        except Exception as e:
            print("Fail to write CSV to path: %s, Case: %s" % (self.path, e))


if __name__ == '__main__':
    start = time.time()
    home_link_spider = HomeLinkSpider()
    home_link_spider.parse_page()
    home_link_spider.write_csv_file()
    end = time.time()
    print("Elapsed: {}".format(end - start))

Now let's look at the result: scraping the same 580 records took about 16.5 seconds, a saving of 2 seconds. So parsel does parse more efficiently than BeautifulSoup; the difference is negligible for small crawls, but it should grow as the workload increases.
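
If you want to measure the parsing gap in isolation, without network time, a micro-benchmark along these lines works; a minimal sketch, assuming "page.html" is any listing page you have saved locally:

import time
from bs4 import BeautifulSoup
from parsel import Selector

# load one saved listing page; "page.html" is a placeholder filename
html = open("page.html", encoding="utf-8").read()

start = time.time()
for _ in range(100):
    # parse and run one CSS query with BeautifulSoup each iteration
    BeautifulSoup(html, "html.parser").select('div[class="title"]')
print("BeautifulSoup: {:.2f}s".format(time.time() - start))

start = time.time()
for _ in range(100):
    # same work with parsel's Selector
    Selector(text=html).css("div.title")
print("parsel: {:.2f}s".format(time.time() - start))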

httpx (sync) + parsel

Now let's go a step further and replace the requests library with httpx. httpx sends synchronous requests in essentially the same way as requests, so we only need to change two lines in the previous example, swapping requests for httpx; the rest of the code is identical.

from fake_useragent import UserAgent
import csv
import re
import time
from parsel import Selector
import httpx


class HomeLinkSpider(object):
    def __init__(self):
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.data = list()
        self.path = "浦东_三房_500_800.csv"
        self.url = "https://sh.lianjia.com/ershoufang/pudong/a3p5/"

    def get_max_page(self):
        # changed here: requests replaced with httpx
        response = httpx.get(self.url, headers=self.headers)
        if response.status_code == 200:
            # create a Selector instance
            selector = Selector(response.text)
            # use a CSS selector to get the max-page div
            a = selector.css('div[class="page-box house-lst-page-box"]')
            # use eval to turn the page-data JSON string into a dict
            max_page = eval(a[0].xpath('//@page-data').get())["totalPage"]
            print("Max page number: {}".format(max_page))
            return max_page
        else:
            print("Request failed, status: {}".format(response.status_code))
            return None

    def parse_page(self):
        max_page = self.get_max_page()
        for i in range(1, max_page + 1):
            url = 'https://sh.lianjia.com/ershoufang/pudong/pg{}a3p5/'.format(i)
            # changed here: requests replaced with httpx
            response = httpx.get(url, headers=self.headers)
            selector = Selector(response.text)
            ul = selector.css('ul.sellListContent')[0]
            li_list = ul.css('li')
            for li in li_list:
                detail = dict()
                detail['title'] = li.css('div.title a::text').get()

                # e.g. "2室1厅 | 74.14平米 | 南 | 精装 | 高楼层(共6层) | 1999年建 | 板楼"
                house_info = li.css('div.houseInfo::text').get()
                house_info_list = house_info.split(" | ")

                detail['bedroom'] = house_info_list[0]
                detail['area'] = house_info_list[1]
                detail['direction'] = house_info_list[2]

                floor_pattern = re.compile(r'\d{1,2}')
                match1 = re.search(floor_pattern, house_info_list[4])  # search anywhere in the string
                if match1:
                    detail['floor'] = match1.group()
                else:
                    detail['floor'] = "未知"  # i.e. unknown

                # match the build year
                year_pattern = re.compile(r'\d{4}')
                match2 = re.search(year_pattern, house_info_list[5])
                if match2:
                    detail['year'] = match2.group()
                else:
                    detail['year'] = "未知"

                # e.g. "文兰小区 - 塘桥": extract the neighborhood name and the area
                position_info = li.css('div.positionInfo a::text').getall()
                detail['house'] = position_info[0]
                detail['location'] = position_info[1]

                # e.g. "650万" -> 650
                price_pattern = re.compile(r'\d+')
                total_price = li.css('div.totalPrice span::text').get()
                detail['total_price'] = re.search(price_pattern, total_price).group()

                # e.g. "单价64182元/平米" -> 64182
                unit_price = li.css('div.unitPrice span::text').get()
                detail['unit_price'] = re.search(price_pattern, unit_price).group()
                self.data.append(detail)

    def write_csv_file(self):
        head = ["标题", "小区", "房厅", "面积", "朝向", "楼层",
                "年份", "位置", "总价(万)", "单价(元/平方米)"]
        keys = ["title", "house", "bedroom", "area", "direction",
                "floor", "year", "location",
                "total_price", "unit_price"]
        try:
            with open(self.path, 'w', newline='', encoding='utf_8_sig') as csv_file:
                writer = csv.writer(csv_file, dialect='excel')
                if head is not None:
                    writer.writerow(head)
                for item in self.data:
                    row_data = []
                    for k in keys:
                        row_data.append(item[k])
                    # print(row_data)
                    writer.writerow(row_data)
                print("Write a CSV file to path %s Successful." % self.path)
        except Exception as e:
            print("Fail to write CSV to path: %s, Case: %s" % (self.path, e))


if __name__ == '__main__':
    start = time.time()
    home_link_spider = HomeLinkSpider()
    home_link_spider.parse_page()
    home_link_spider.write_csv_file()
    end = time.time()
    print("Elapsed: {}".format(end - start))

The whole crawl took 16.1 seconds, so for synchronous requests httpx performs essentially the same as requests.

Note: installing httpx with pip on Windows may fail with an error asking for the Visual Studio C++ build tools; download and install them and the problem goes away.

Next comes the trump card: we will write an asynchronous crawler with httpx and asyncio and see how long it really takes to scrape the same 580 records from Lianjia.

httpx (async) + parsel

httpx's real strength is sending asynchronous requests. The async crawler works like this: first send a synchronous request to obtain the maximum page number, then turn the fetching and parsing of each individual page into an asyncio coroutine task (defined with async), and finally run the tasks on an event loop. Most of the code is the same as the synchronous crawler; there are two main changes:

# async: a coroutine that fetches and parses one page, given its URL
async def parse_single_page(self, url):
    # use httpx to send an async request for the page
    async with httpx.AsyncClient() as client:
        response = await client.get(url, headers=self.headers)
        selector = Selector(response.text)
        # the rest is the same as before

def parse_page(self):
    max_page = self.get_max_page()
    loop = asyncio.get_event_loop()
    # create single coroutine tasks with asyncio.ensure_future or loop.create_task;
    # from Python 3.7 on, asyncio.create_task is also available
    tasks = []
    for i in range(1, max_page + 1):
        url = 'https://sh.lianjia.com/ershoufang/pudong/pg{}a3p5/'.format(i)
        tasks.append(self.parse_single_page(url))
    # asyncio.gather(*tasks) could also be used to add the tasks to the event loop
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()
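
Note that the coroutine above opens a fresh AsyncClient for every page. On Python 3.7+ an arguably cleaner variant shares one client (and its connection pool) across all requests and drives everything with asyncio.run and asyncio.gather; a minimal self-contained sketch, with the page range hard-coded as an assumption rather than taken from get_max_page:

import asyncio
import httpx

# three sample page URLs; in the spider these would come from get_max_page()
URLS = ['https://sh.lianjia.com/ershoufang/pudong/pg{}a3p5/'.format(i)
        for i in range(1, 4)]

async def fetch(client, url):
    # reuse the shared client's connection pool instead of opening one per page
    response = await client.get(url)
    return response.status_code

async def main():
    async with httpx.AsyncClient() as client:
        # gather runs all coroutines concurrently and collects results in order
        results = await asyncio.gather(*(fetch(client, url) for url in URLS))
        print(results)

if __name__ == '__main__':
    asyncio.run(main())  # Python 3.7+ shorthand for the event-loop boilerplate above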

The full project code is shown below:

from fake_useragent import UserAgent
import csv
import re
import time
from parsel import Selector
import httpx
import asyncio


class HomeLinkSpider(object):
    def __init__(self):
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.data = list()
        self.path = "浦东_三房_500_800.csv"
        self.url = "https://sh.lianjia.com/ershoufang/pudong/a3p5/"

    def get_max_page(self):
        response = httpx.get(self.url, headers=self.headers)
        if response.status_code == 200:
            # create a Selector instance
            selector = Selector(response.text)
            # use a CSS selector to get the max-page div
            a = selector.css('div[class="page-box house-lst-page-box"]')
            # use eval to turn the page-data JSON string into a dict
            max_page = eval(a[0].xpath('//@page-data').get())["totalPage"]
            print("Max page number: {}".format(max_page))
            return max_page
        else:
            print("Request failed, status: {}".format(response.status_code))
            return None

    # async: a coroutine that fetches and parses one page, given its URL
    async def parse_single_page(self, url):
        async with httpx.AsyncClient() as client:
            response = await client.get(url, headers=self.headers)
            selector = Selector(response.text)
            ul = selector.css('ul.sellListContent')[0]
            li_list = ul.css('li')
            for li in li_list:
                detail = dict()
                detail['title'] = li.css('div.title a::text').get()

                # e.g. "2室1厅 | 74.14平米 | 南 | 精装 | 高楼层(共6层) | 1999年建 | 板楼"
                house_info = li.css('div.houseInfo::text').get()
                house_info_list = house_info.split(" | ")

                detail['bedroom'] = house_info_list[0]
                detail['area'] = house_info_list[1]
                detail['direction'] = house_info_list[2]

                floor_pattern = re.compile(r'\d{1,2}')
                match1 = re.search(floor_pattern, house_info_list[4])  # search anywhere in the string
                if match1:
                    detail['floor'] = match1.group()
                else:
                    detail['floor'] = "未知"  # i.e. unknown

                # match the build year
                year_pattern = re.compile(r'\d{4}')
                match2 = re.search(year_pattern, house_info_list[5])
                if match2:
                    detail['year'] = match2.group()
                else:
                    detail['year'] = "未知"

                # e.g. "文兰小区 - 塘桥": extract the neighborhood name and the area
                position_info = li.css('div.positionInfo a::text').getall()
                detail['house'] = position_info[0]
                detail['location'] = position_info[1]

                # e.g. "650万" -> 650
                price_pattern = re.compile(r'\d+')
                total_price = li.css('div.totalPrice span::text').get()
                detail['total_price'] = re.search(price_pattern, total_price).group()

                # e.g. "单价64182元/平米" -> 64182
                unit_price = li.css('div.unitPrice span::text').get()
                detail['unit_price'] = re.search(price_pattern, unit_price).group()
                self.data.append(detail)

    def parse_page(self):
        max_page = self.get_max_page()
        loop = asyncio.get_event_loop()
        # create single coroutine tasks with asyncio.ensure_future or loop.create_task;
        # from Python 3.7 on, asyncio.create_task is also available
        tasks = []
        for i in range(1, max_page + 1):
            url = 'https://sh.lianjia.com/ershoufang/pudong/pg{}a3p5/'.format(i)
            tasks.append(self.parse_single_page(url))
        # asyncio.gather(*tasks) could also be used to add the tasks to the event loop
        loop.run_until_complete(asyncio.wait(tasks))
        loop.close()

    def write_csv_file(self):
        head = ["标题", "小区", "房厅", "面积", "朝向", "楼层",
                "年份", "位置", "总价(万)", "单价(元/平方米)"]
        keys = ["title", "house", "bedroom", "area", "direction",
                "floor", "year", "location",
                "total_price", "unit_price"]
        try:
            with open(self.path, 'w', newline='', encoding='utf_8_sig') as csv_file:
                writer = csv.writer(csv_file, dialect='excel')
                if head is not None:
                    writer.writerow(head)
                for item in self.data:
                    row_data = []
                    for k in keys:
                        row_data.append(item[k])
                    writer.writerow(row_data)
                print("Write a CSV file to path %s Successful." % self.path)
        except Exception as e:
            print("Fail to write CSV to path: %s, Case: %s" % (self.path, e))


if __name__ == '__main__':
    start = time.time()
    home_link_spider = HomeLinkSpider()
    home_link_spider.parse_page()
    home_link_spider.write_csv_file()
    end = time.time()
    print("Elapsed: {}".format(end - start))

Now for the moment of truth: the asynchronous crawler written with httpx scraped the same 580 records from Lianjia in a mere 2.5 seconds!
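
One caveat: this fires requests for every page at once, and a real site may throttle or block such bursts. If you need to cap concurrency, asyncio.Semaphore is the usual tool; a small sketch, where the limit of 5 is an arbitrary assumption:

import asyncio
import httpx

async def fetch_limited(client, semaphore, url):
    # at most N coroutines get past this point at the same time
    async with semaphore:
        response = await client.get(url)
        return response.status_code

async def main(urls):
    semaphore = asyncio.Semaphore(5)  # allow at most 5 requests in flight (assumed limit)
    async with httpx.AsyncClient() as client:
        return await asyncio.gather(*(fetch_limited(client, semaphore, u) for u in urls))

# usage: asyncio.run(main(list_of_page_urls))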

Comparison and summary

Scraping the same content with different tool combinations takes noticeably different amounts of time. The httpx (async) + parsel combination is without doubt the biggest winner, and requests and BeautifulSoup can honorably retire:

requests + BeautifulSoup: 18.5 seconds
requests + parsel: 16.5 seconds
httpx (sync) + parsel: 16.1 seconds
httpx (async) + parsel: 2.5 seconds

Do you have another favorite library for Python scraping?

That concludes this hands-on review of the Python crawler request library httpx and the parsel parsing library. For more material on httpx and parsel in Python, see the other related articles!
