scrapy
2024年7月29日大约 1 分钟
安装
pip install scrapy
使用
- 进入一个文件夹,创建一个项目
scrapy startproject tutorial
- 创建一个 spider
# 列出可用的模板, 从中选择一个来创建
scrapy genspider -l
# or 创建一个名称为example的spider, example.com 是allowed_domains
scrapy genspider example example.com
# or 根据模板创建
scrapy genspider -t crawl scrapyorg scrapy.org
- 运行
# 进入项目下
scrapy crawl example
具体示例
动态生成ua
- 安装fake-useragent
- 使用: 先 import UserAgent, 之后调用 UserAgent() 创建实例, 然后用它的 random API 生成随机的 User-Agent
import scrapy
from fake_useragent import UserAgent
# Shared module-level UserAgent instance; the spider reads `ua.random`
# each time it builds request headers.
ua = UserAgent()
class exampleSpider(scrapy.Spider):
    """Spider that searches example.com by keyword and follows result links.

    Each outgoing request carries a freshly generated random ``User-Agent``
    header obtained from the module-level ``ua`` object.
    """

    # Must match the name used on the command line: ``scrapy crawl example``.
    name = 'example'
    allowed_domains = ['example.com']

    def start_requests(self):
        """Yield one search request per keyword."""
        search_names = ['aaa', 'bbb']
        prefix = 'https://example.com?keywords='
        for keyword in search_names:
            yield scrapy.Request(
                url=prefix + keyword,
                callback=self.parse,
                headers=self.get_random_user_agent(),
            )

    def parse(self, response):
        """Follow the first table link if there is one, otherwise emit an item.

        When the page contains a ``//td/a`` link, a new request for that link
        is yielded and parsed by this same method. When it does not, the table
        header texts and the non-empty cell texts are yielded as a dict with
        the keys ``url``, ``names`` and ``data``.
        """
        print(response.request.headers['User-Agent'])

        link = response.xpath('//td/a/@href').extract_first()
        if link:
            # urljoin resolves relative hrefs against the page URL and leaves
            # absolute hrefs intact, unlike plain string concatenation.
            yield scrapy.Request(
                response.urljoin(link),
                callback=self.parse,
                headers=self.get_random_user_agent(),
            )
            return

        names = response.xpath('//tr/th/text()').extract()
        tds = response.xpath(
            '//tr/td/span/text()[not(ancestor::style)]'
        ).extract()
        # Drop newlines and surrounding whitespace, then discard empty cells.
        stripped = (td.replace('\n', '').strip() for td in tds)
        clean_tds = [cell for cell in stripped if cell]

        # If an item pipeline is enabled, this item is handed to it; the
        # export to disk can then be done in pipelines.py.
        yield {
            'url': response.url,
            'names': names,
            'data': clean_tds,
        }

    def get_random_user_agent(self):
        """Return a headers dict holding a random ``User-Agent`` value."""
        return {'User-Agent': ua.random}
设置延迟
在 settings.py 中添加如下配置
# Download-delay options to place in settings.py.
# NOTE(review): the delay is presumably expressed in seconds, and the second
# flag presumably randomizes it — confirm against the Scrapy settings reference.
DOWNLOAD_DELAY = 12
RANDOMIZE_DOWNLOAD_DELAY = True
pipelines中
import csv
import json
class examplePipeline:
    """Item pipeline that writes every scraped item to its own file.

    Items with a non-empty ``data`` list are written as a two-row CSV file
    (header row, value row) under ``file/``. All other items are dumped as
    JSON under ``syz/``. Both target directories must already exist.
    """

    def process_item(self, item, spider):
        """Persist *item* to disk and return it unchanged.

        Args:
            item: Mapping produced by the spider. Must contain ``data``; the
                CSV branch also reads ``names``, the JSON branch reads
                ``medName`` and ``yaozh01Data``.
            spider: The spider that produced the item (unused).

        Returns:
            The same *item*, so that later pipelines can keep processing it.

        Raises:
            KeyError: If a key required by the selected branch is missing.
            OSError: If the output file cannot be opened.
        """
        data = item['data']
        if data:
            # One CSV per item, named after the first data cell. The file is
            # closed as soon as the rows are written, so no handle is leaked
            # and the rows are flushed to disk immediately.
            with open('file/' + data[0] + '.csv', 'w', newline='') as csv_file:
                writer = csv.writer(csv_file)
                writer.writerow(item['names'])  # CSV header
                writer.writerow(data)  # values
        else:
            # NOTE(review): this branch relies on the 'medName' and
            # 'yaozh01Data' keys — confirm the spider actually provides them.
            with open('syz/' + item['medName'] + '.json', 'w',
                      encoding='utf-8') as json_file:
                json.dump(item['yaozh01Data'], json_file, ensure_ascii=False)
        return item

    def close_spider(self, spider):
        """Hook called when the spider finishes.

        Every file is already closed inside ``process_item``, so there is
        nothing left to release here; the method is kept so the pipeline's
        interface stays the same.
        """