# -*- coding: utf-8 -*-
import datetime

import oss2
import scrapy
from scrapy import Request

from papernews_spider.myfirstPj import settings
from papernews_spider.myfirstPj.spiders import Tools


# Crawl the printing & publishing pages of the paper.com.cn news centre
class PaperNewsSpider(scrapy.Spider):
    # Spider name
    name = 'PaperNews'
    # Allowed crawl domain
    allowed_domains = ['www.paper.com.cn']
    # Domestic news index page
    start_urls = ['http://www.paper.com.cn/news/nation.php?news_type=%B9%FA%C4%DA%D7%CA%D1%B6']

    # Main parse method: collects detail-page links from the index page
    def parse(self, response):
        # Extract the relative links to the news detail pages
        lists = response.xpath('//td[@width="85%"]/a/@href').extract()
        # Prefix each relative link with the site domain
        newlists = ['http://www.paper.com.cn%s' % link for link in lists]
        # Append the collected URLs to a text file
        # (Tools is assumed here to be constructible without arguments)
        Tools().write_txt("newsUrl.txt", newlists)
        # Crawl every detail page collected above
        for url in newlists:
            yield Request(url, callback=self.parse_second)

    # Parse the second-level (detail) pages
    def parse_second(self, response):
        # Extract the body text of the news detail page
        lists = response.xpath('//b/text() | //p/text()').extract()
        # Append the extracted text to a text file
        Tools().write_txt("News.txt", lists)
        print(lists)
        # Push the extracted text to Aliyun OSS under a dated directory
        now = datetime.datetime.now()
        directory = "spider-information-data/paper.com.cn/wastePaperPrice/" + now.strftime("%Y-%m-%d") + "/"
        Tools().put(directory, lists)

    # Override the default request generation
    def start_requests(self):
        """
        Overridden entry point that issues the first Request.
        The request targets self.start_urls[0] and routes the response to the
        parse callback. Headers and cookies were removed from this call and
        are configured in settings.py instead.
        """
        yield Request(self.start_urls[0], callback=self.parse)
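

# ---------------------------------------------------------------------------
# Reference sketch only: the real Tools helper lives in
# papernews_spider.myfirstPj.spiders and is not defined in this file. The
# class below is a minimal, hypothetical illustration of what its write_txt
# (append lines to a local file) and put (upload to Aliyun OSS via oss2)
# methods might look like. The OSS_* names read from settings are assumed
# placeholders, not the project's actual configuration keys.
# ---------------------------------------------------------------------------
class _ToolsSketch(object):
    def write_txt(self, filename, lines):
        # Append each extracted string to a UTF-8 text file, one per line
        with open(filename, 'a', encoding='utf-8') as f:
            for line in lines:
                f.write(line + '\n')

    def put(self, directory, lines):
        # Join the extracted text and upload it to an OSS bucket under the
        # dated directory built in parse_second
        auth = oss2.Auth(settings.OSS_ACCESS_KEY_ID, settings.OSS_ACCESS_KEY_SECRET)
        bucket = oss2.Bucket(auth, settings.OSS_ENDPOINT, settings.OSS_BUCKET_NAME)
        bucket.put_object(directory + 'news.txt', '\n'.join(lines))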