You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
74 lines
2.5 KiB
74 lines
2.5 KiB
# -*- coding: utf-8 -*-
|
|
import datetime
|
|
|
|
import scrapy
|
|
from scrapy import Request
|
|
import oss2
|
|
import os
|
|
|
|
from scrapy import Selector
|
|
|
|
from papernews_spider.myfirstPj.items import MyfirstpjItem
|
|
from papernews_spider.myfirstPj.spiders import Tools
|
|
from papernews_spider.myfirstPj import settings
|
|
|
|
|
|
# 爬取纸业网咨询中心的印刷出版页面
|
|
# Crawls the domestic-news listing of the paper.com.cn news center.
class PaperNewsSpider(scrapy.Spider):
    """Spider for the domestic news section of www.paper.com.cn.

    Flow: start_requests -> parse (listing page) -> parse_second (article
    detail page). Extracted URLs/text are appended to local txt files via
    the project's Tools helper and pushed to Aliyun OSS.
    """

    # Spider name used by `scrapy crawl`.
    name = 'PaperNews'

    # Restrict the crawl to this domain.
    allowed_domains = ['www.paper.com.cn']

    # Domestic-news listing page (query value is GB2312 percent-encoded).
    start_urls = ['http://www.paper.com.cn/news/nation.php?news_type=%B9%FA%C4%DA%D7%CA%D1%B6']  # 07

    def parse(self, response):
        """Parse the listing page: collect article links, record them to a
        file, and schedule a request for each article detail page.
        """
        # Relative article links inside the listing table.
        lists = response.xpath('//td[@width="85%"]/a/@href').extract()

        # Prefix every relative link with the site domain.
        # The original index-juggling while-loop raised IndexError on an
        # empty listing and aliased `newlists = lists`; a comprehension
        # builds a fresh list and handles the empty case naturally.
        newlists = ['http://www.paper.com.cn%s' % href for href in lists]

        # Append the collected URLs to a local file.
        # NOTE(review): passing Tools.__init__(self) as the first argument
        # forwards whatever the Tools initializer returns (normally None);
        # preserved as-is to match the project's Tools API — confirm intent.
        Tools.write_txt(Tools.__init__(self), "newsUrl.txt", newlists)

        # Request each article page (original iterated last-to-first).
        for url in reversed(newlists):
            yield Request(url, callback=self.parse_second)

    def parse_second(self, response):
        """Parse an article detail page and persist its text locally and
        to Aliyun OSS.
        """
        # Bold headings and paragraph text of the article body.
        lists = response.xpath('//b/text() | //p/text()').extract()

        # Append the article text to a local file (see NOTE in parse()).
        Tools.write_txt(Tools.__init__(self), "News.txt", lists)
        print(lists)

        # Push the extracted text to Aliyun OSS under a per-day directory.
        now = datetime.datetime.now()
        directory = ("spider-information-data/paper.com.cn/wastePaperPrice/"
                     + now.strftime("%Y-%m-%d") + "/")
        Tools.put(Tools.__init__(self), directory, lists)

    def start_requests(self):
        """Override the default request generation.

        Issues a single Request for start_urls[0]; headers and cookies are
        configured in settings rather than attached here.
        """
        yield Request(self.start_urls[0], callback=self.parse)
|