You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
82 lines
3.2 KiB
82 lines
3.2 KiB
import datetime
|
|
import json
|
|
|
|
import scrapy
|
|
from scrapy import Request
|
|
from scrapy.http import FormRequest
|
|
|
|
# 爬取富宝咨询的废纸与成品纸价格变动
|
|
from papernews_spider.myfirstPj.spiders import Tools
|
|
|
|
|
|
class PaperPricesSpider(scrapy.Spider):
    """Spider for waste-paper / finished-paper price-change articles on news.f139.com.

    Flow: ``start_requests`` issues the listing-page request ->
    ``parse`` collects article links and diffs them against the
    previously-seen set (via ``Tools.url_manage``) ->
    ``parse_second`` extracts each article's text, stores it locally
    and uploads it to Aliyun.
    """

    # Spider name used by `scrapy crawl` (kept as-is; note it differs
    # from the class name by one letter).
    name = 'PaperPriceSpider'

    # Only follow links on this domain.
    allowed_domains = ['news.f139.com']

    # Listing page for the print/publishing category.
    start_urls = ['http://news.f139.com/list.do?channelID=94&categoryID=27']

    def parse(self, response):
        """Collect article links from the listing page and schedule the
        previously-unseen ones for :meth:`parse_second`.
        """
        # Relative article links on the listing page.
        hrefs = response.xpath('//a[@target="_blank"]/@href').extract()

        # Prepend the domain to build absolute URLs (no aliasing of the
        # extracted list, unlike the index-assignment loop this replaces).
        newlists = ['http://news.f139.com%s' % href for href in hrefs]

        # Append the freshly scraped URLs to priceUrl.txt.
        # NOTE(review): ``Tools.__init__(self)`` returns None, so these
        # helpers are effectively called with self=None — this only works
        # if they ignore ``self``; confirm against Tools' implementation.
        Tools.write_txt(Tools.__init__(self), "priceUrl.txt", newlists)

        # Keep only URLs not present in oldPriceUrl.txt (i.e. new articles).
        newlists = Tools.url_manage(Tools.__init__(self), 'priceUrl.txt', 'oldPriceUrl.txt')

        # Strip newline characters and the long separator line.
        newlists = Tools.cleantxt(Tools.__init__(self), '\n', '', newlists)
        newlists = Tools.cleantxt(
            Tools.__init__(self),
            '===============================================================================================',
            '', newlists)

        # Record the processed URLs so the next run can diff against them.
        Tools.write_txt(Tools.__init__(self), "oldPriceUrl.txt", newlists)

        # Crawl each new article page.
        for url in newlists:
            yield Request(url, callback=self.parse_second)

        # Emit a machine-readable status line once all requests are queued.
        # Guarded so an empty batch prints nothing, matching the original
        # in-loop last-iteration check.
        if newlists:
            data = {"name": self.name, "url": newlists, "status": 200}
            print(json.dumps(data))

    def parse_second(self, response):
        """Extract the article title and body text, clean it, store it
        locally and upload it to Aliyun.
        """
        # Title plus body paragraphs of the article detail page.
        lines = response.xpath(
            "//h1/text() | //div[@id='zhengwen']/text() | //div[@id='zhengwen']//p[not(@*)]/text()").extract()

        # Rebrand the source name in the extracted text
        # (replace "Fubao" with "Qianniao").
        lines = Tools.cleantxt(Tools.__init__(self), '富宝', '千鸟', lines)

        # Drop boilerplate and blank lines by fixed position.
        # NOTE(review): position-based deletion is fragile if the page
        # layout changes — confirm these indices against a live page.
        del lines[1:5]
        del lines[12:18]

        # Persist the cleaned article text locally (appended).
        Tools.write_txt(Tools.__init__(self), "priceText.txt", lines)

        # Upload to Aliyun under a per-day directory (optional).
        now = datetime.datetime.now()
        directory = "spider-information-data/fuBao/wastePaperPrice/" + str(now.strftime("%Y-%m-%d")) + "/"
        Tools.put(Tools.__init__(self), directory, lines)

    def start_requests(self):
        """Issue the initial request for the listing page.

        Overridden so the response is routed to :meth:`parse`; headers
        and cookies are configured in settings rather than here.
        """
        yield Request(self.start_urls[0], callback=self.parse)