import datetime
import json

import scrapy
from scrapy import Request
from scrapy.http import FormRequest

# Scrape waste-paper / finished-paper price-change news from Fubao (f139.com).
from papernews_spider.myfirstPj.spiders import Tools


class PaperPricesSpider(scrapy.Spider):
    """Spider for Fubao paper-price articles.

    Workflow: crawl the category listing page, diff the article URLs against
    the ones seen on previous runs, then fetch each new article, clean its
    text, save it locally and push it to Aliyun OSS.
    """

    # Spider name used by `scrapy crawl`.
    name = 'PaperPriceSpider'
    # Restrict crawling to the news site's domain.
    allowed_domains = ['news.f139.com']
    # Listing page for the printing/publishing news category.
    start_urls = ['http://news.f139.com/list.do?channelID=94&categoryID=27']

    def parse(self, response):
        """Parse the listing page and schedule every not-yet-seen article.

        :param response: listing-page response for ``start_urls[0]``
        :return: yields a :class:`Request` per new article URL
        """
        # Collect all article links on the listing page.
        links = response.xpath('//a[@target="_blank"]/@href').extract()
        # The hrefs are site-relative; prepend the domain.
        newlists = ['http://news.f139.com%s' % link for link in links]

        # One helper instance for file I/O, URL bookkeeping and upload.
        # (Replaces the original `Tools.__init__(self)`-as-self pattern,
        # which passed None into every Tools method.)
        tools = Tools()

        # Persist the freshly scraped URLs (appended to priceUrl.txt).
        tools.write_txt("priceUrl.txt", newlists)
        # Keep only URLs not present in the previous run's record.
        newlists = tools.url_manage('priceUrl.txt', 'oldPriceUrl.txt')
        # Strip noise characters left over from the text files.
        newlists = tools.cleantxt('\n', '', newlists)
        newlists = tools.cleantxt('===============================================================================================', '', newlists)
        # Remember the URLs we are about to crawl so they are skipped next run.
        tools.write_txt("oldPriceUrl.txt", newlists)

        # Crawl every new article page.
        for url in newlists:
            yield Request(url, callback=self.parse_second)

        # After scheduling all requests, emit a one-shot status report
        # (the original printed this on the loop's last iteration only,
        # i.e. only when the list is non-empty).
        if newlists:
            data = {"name": self.name, "url": newlists, "status": 200}
            print(json.dumps(data))

    def parse_second(self, response):
        """Parse one article detail page: extract, clean, store and upload.

        :param response: article detail-page response
        """
        # Title plus the plain-text fragments of the article body.
        lists = response.xpath(
            "//h1/text() | //div[@id='zhengwen']/text() | //div[@id='zhengwen']//p[not(@*)]/text()").extract()

        tools = Tools()
        # Rebrand: replace the source's name "富宝" (Fubao) with "千鸟".
        lists = tools.cleantxt('富宝', '千鸟', lists)
        # Drop boilerplate/blank fragments at fixed positions.
        # NOTE(review): index-based deletion assumes a fixed page layout —
        # fragile if the site's markup changes; verify periodically.
        del lists[1:5]
        del lists[12:18]
        tools.write_txt("priceText.txt", lists)

        # Push the cleaned text to Aliyun OSS under a per-day directory.
        now = datetime.datetime.now()
        directory = ("spider-information-data/fuBao/wastePaperPrice/"
                     + now.strftime("%Y-%m-%d") + "/")
        tools.put(directory, lists)

    def start_requests(self):
        """Overridden entry point: issue the initial request.

        Headers and cookies were moved into the project settings, so only the
        URL is passed here; the response is delivered to :meth:`parse`.
        """
        yield Request(self.start_urls[0], callback=self.parse)