commit
7c37472886
18 changed files with 699 additions and 0 deletions
Split View
Diff Options
-
154.gitignore
-
21README.md
-
0papernews_spider/__init__.py
-
0papernews_spider/myfirstPj/__init__.py
-
13papernews_spider/myfirstPj/items.py
-
103papernews_spider/myfirstPj/middlewares.py
-
25papernews_spider/myfirstPj/pipelines.py
-
96papernews_spider/myfirstPj/settings.py
-
25papernews_spider/myfirstPj/spiders/NewsContent.py
-
74papernews_spider/myfirstPj/spiders/PaperNewsSpider.py
-
82papernews_spider/myfirstPj/spiders/PaperPriceSpider.py
-
8papernews_spider/myfirstPj/spiders/Url.py
-
76papernews_spider/myfirstPj/spiders/__init__.py
-
0papernews_spider/myfirstPj/tools/Api.py
-
6papernews_spider/myfirstPj/tools/Run.py
-
5papernews_spider/myfirstPj/tools/SetSpiderName.py
-
0papernews_spider/myfirstPj/tools/__init__.py
-
11papernews_spider/scrapy.cfg
@ -0,0 +1,154 @@ |
|||
### IntelliJ IDEA ### |
|||
.idea |
|||
.gitback |
|||
|
|||
# ---> Python |
|||
# Byte-compiled / optimized / DLL files |
|||
__pycache__/ |
|||
*.py[cod] |
|||
*$py.class |
|||
|
|||
# C extensions |
|||
*.so |
|||
|
|||
# Distribution / packaging |
|||
.Python |
|||
build/ |
|||
develop-eggs/ |
|||
dist/ |
|||
downloads/ |
|||
eggs/ |
|||
.eggs/ |
|||
lib/ |
|||
lib64/ |
|||
parts/ |
|||
sdist/ |
|||
var/ |
|||
wheels/ |
|||
share/python-wheels/ |
|||
*.egg-info/ |
|||
.installed.cfg |
|||
*.egg |
|||
MANIFEST |
|||
|
|||
# PyInstaller |
|||
# Usually these files are written by a python script from a template |
|||
# before PyInstaller builds the exe, so as to inject date/other infos into it. |
|||
*.manifest |
|||
*.spec |
|||
|
|||
# Installer logs |
|||
pip-log.txt |
|||
pip-delete-this-directory.txt |
|||
|
|||
# Unit test / coverage reports |
|||
htmlcov/ |
|||
.tox/ |
|||
.nox/ |
|||
.coverage |
|||
.coverage.* |
|||
.cache |
|||
nosetests.xml |
|||
coverage.xml |
|||
*.cover |
|||
*.py,cover |
|||
.hypothesis/ |
|||
.pytest_cache/ |
|||
cover/ |
|||
|
|||
# Translations |
|||
*.mo |
|||
*.pot |
|||
|
|||
# Django stuff: |
|||
*.log |
|||
local_settings.py |
|||
db.sqlite3 |
|||
db.sqlite3-journal |
|||
|
|||
# Flask stuff: |
|||
instance/ |
|||
.webassets-cache |
|||
|
|||
# Scrapy stuff: |
|||
.scrapy |
|||
|
|||
# Sphinx documentation |
|||
docs/_build/ |
|||
|
|||
# PyBuilder |
|||
.pybuilder/ |
|||
target/ |
|||
|
|||
# Jupyter Notebook |
|||
.ipynb_checkpoints |
|||
|
|||
# IPython |
|||
profile_default/ |
|||
ipython_config.py |
|||
|
|||
# pyenv |
|||
# For a library or package, you might want to ignore these files since the code is |
|||
# intended to run in multiple environments; otherwise, check them in: |
|||
# .python-version |
|||
|
|||
# pipenv |
|||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. |
|||
# However, in case of collaboration, if having platform-specific dependencies or dependencies |
|||
# having no cross-platform support, pipenv may install dependencies that don't work, or not |
|||
# install all needed dependencies. |
|||
#Pipfile.lock |
|||
|
|||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow |
|||
__pypackages__/ |
|||
|
|||
# Celery stuff |
|||
celerybeat-schedule |
|||
celerybeat.pid |
|||
|
|||
# SageMath parsed files |
|||
*.sage.py |
|||
|
|||
# Environments |
|||
.env |
|||
.venv |
|||
env/ |
|||
venv/ |
|||
ENV/ |
|||
env.bak/ |
|||
venv.bak/ |
|||
|
|||
# Spyder project settings |
|||
.spyderproject |
|||
.spyproject |
|||
|
|||
# Rope project settings |
|||
.ropeproject |
|||
|
|||
# mkdocs documentation |
|||
/site |
|||
|
|||
# mypy |
|||
.mypy_cache/ |
|||
.dmypy.json |
|||
dmypy.json |
|||
|
|||
# Pyre type checker |
|||
.pyre/ |
|||
|
|||
# pytype static type analyzer |
|||
.pytype/ |
|||
|
|||
# Cython debug symbols |
|||
cython_debug/ |
|||
|
|||
#文档和日志 |
|||
*.txt |
|||
|
|||
#飞桨 |
|||
/papernews_spider/Module/model_best/ |
|||
/papernews_spider/Module/ |
|||
|
|||
#包含test的测试文件 |
|||
*test* |
|||
/papernews_spider/myfirstPj/spiders/model_best/ |
|||
@ -0,0 +1,21 @@ |
|||
|
|||
## demo |
|||
|
|||
1. 安装Python 3.7 以上版本。 |
|||
2. 安装依赖:命令行执行 `pip install -r requirements.txt`。 |
|||
|
|||
|
|||
### 说明 |
|||
|
|||
+ Run.py是运行爬虫的方法,在SetSpiderName.py里修改要运行的爬虫项目
|||
+ 爬取纸业网资讯中心的印刷出版页面链接以及链接内详情内容。(PaperNewsSpider.py),链接为:http://www.paper.com.cn/news/nation.php?news_type=%D3%A1%CB%A2%B3%F6%B0%E6
|||
+ 文件News.txt是爬取到的资讯数据 |
|||
+ 爬取富宝资讯的纸厂调价栏目,记录的是废纸以及成品纸的价格变动(PaperPriceSpider.py),链接为:http://news.f139.com
|||
+ 已经写好网页去重,并写好可以复用的相关的去重方法和清洗方法 |
|||
|
|||
### 文件说明 |
|||
+ spider文件夹存放的是爬虫文件 |
|||
+ tools文件夹存放的是各种工具类,包含各种测试用的工具类。
|||
+ (现在已经关闭控制台输出) |
|||
|
|||
|
|||
@ -0,0 +1,13 @@ |
|||
# Define here the models for your scraped items |
|||
# |
|||
# See documentation in: |
|||
# https://docs.scrapy.org/en/latest/topics/items.html |
|||
# -*- coding: utf-8 -*- |
|||
import scrapy |
|||
|
|||
# Holds the data extracted by the spiders: once a page has been parsed,
# the values of interest are stored in this Item.
class MyfirstpjItem(scrapy.Item):
    """One scraped record: the extracted text plus its source URL."""

    # Declared Scrapy fields, populated by the spiders.
    text = scrapy.Field()
    url = scrapy.Field()
|||
@ -0,0 +1,103 @@ |
|||
# Define here the models for your spider middleware |
|||
# |
|||
# See documentation in: |
|||
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html |
|||
|
|||
from scrapy import signals |
|||
|
|||
# useful for handling different item types with a single interface |
|||
from itemadapter import is_item, ItemAdapter |
|||
|
|||
|
|||
class MyfirstpjSpiderMiddleware:
    """Default Scrapy spider middleware (project-template boilerplate).

    Every hook is a pure pass-through: responses, results and start
    requests flow onward unchanged. The only observable behaviour is
    the "Spider opened" log line emitted on the spider_opened signal.
    Methods not defined here are treated by Scrapy as no-ops.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy builds the middleware through this factory; wire the
        # spider_opened signal to the fresh instance before returning it.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Let the response continue into the spider untouched
        # (returning None means "keep processing").
        return None

    def process_spider_output(self, response, result, spider):
        # Forward whatever the spider produced, without modification.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # Returning None defers error handling to other middlewares /
        # the framework itself.
        return None

    def process_start_requests(self, start_requests, spider):
        # Hand the start requests onward unchanged (requests only, no
        # items, per the Scrapy contract).
        yield from start_requests

    def spider_opened(self, spider):
        # Log once when the spider starts up.
        spider.logger.info('Spider opened: %s' % spider.name)
|||
|
|||
|
|||
class MyfirstpjDownloaderMiddleware:
    """Default Scrapy downloader middleware (project-template boilerplate).

    All hooks are pass-throughs: requests continue downloading,
    responses are returned as-is, and exceptions fall through to the
    next handler. Only the spider_opened log line is observable.
    Methods not defined here are treated by Scrapy as no-ops.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy; attach the spider_opened signal to the
        # new instance.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        # None -> let the request proceed through the download chain.
        # (Other legal returns: a Response, a Request, or raising
        # IgnoreRequest to trigger process_exception handlers.)
        return None

    def process_response(self, request, response, spider):
        # Pass the downloaded response through unchanged.
        return response

    def process_exception(self, request, exception, spider):
        # None -> continue processing this exception in later
        # middlewares (a Response/Request return would stop the chain).
        return None

    def spider_opened(self, spider):
        # Log once when the spider starts up.
        spider.logger.info('Spider opened: %s' % spider.name)
|||
@ -0,0 +1,25 @@ |
|||
# Define your item pipelines here |
|||
# |
|||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting |
|||
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html |
|||
|
|||
# -*- coding: utf-8 -*- |
|||
# useful for handling different item types with a single interface |
|||
import requests |
|||
from itemadapter import ItemAdapter |
|||
|
|||
|
|||
# Pipeline stage (its position in the chain is set by the priority
# number in ITEM_PIPELINES, see settings.py).
class MyfirstpjPipeline:
    """Item pipeline that currently just echoes each scraped item."""

    def process_item(self, item, spider):
        """Print the item and pass it to the next pipeline stage.

        :param item: the scraped item (dict-like).
        :param spider: the spider that produced the item (unused).
        :return: the item, unchanged.

        Scrapy requires process_item to return the item (or raise
        DropItem); the original returned None, which would silently
        drop every item for any pipeline stage installed after this
        one. Dead commented-out file-writing code was removed.
        """
        print(item)
        return item
|||
@ -0,0 +1,96 @@ |
|||
# Scrapy settings for myfirstPj project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'myfirstPj'

SPIDER_MODULES = ['myfirstPj.spiders']
NEWSPIDER_MODULE = 'myfirstPj.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# NOTE(review): this value is duplicated in DEFAULT_REQUEST_HEADERS below;
# keep the two in sync (or drop one of them).
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1935.94 Safari/537.36'



# Obey robots.txt rules
# NOTE(review): robots.txt is deliberately ignored here - confirm this is
# acceptable for the target sites.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# Scrapy's cookie middleware is off; the session cookie below is sent
# manually through DEFAULT_REQUEST_HEADERS instead.
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# NOTE(review): the hard-coded 'Cookie' looks like a captured f139.com
# session (JSESSIONID + tracking cookies) that will expire - it should be
# refreshed or obtained at runtime rather than committed here.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1935.94 Safari/537.36',
    # Older captured session, kept for reference:
    # 'Cookie': "'_qquc': '6d2af3823bb4829a95200dba06ccb0265939e590873e5cef7edcafe1c96a755abf5e2403097381fa2d9a6c4604d69e51fc7914acc029fb5033a0475182f48025f054a58ca304d4f0925129c937cb6935540e9c8a32c823850e2a4fd1b9e6a7081f424d32347fd2822284431a74dec2c47cf56d87c11ed27ae08743d556ec1fbf41b4668dbfd6df049246d413308d16aed327f1420253934934bbb062de14706171347d330ba71e632c2d6a89b62e833cd2fac9e3fc13e07e94c47dbc159d7fed1db22e3274c3e3f940651d83de34fc405f741b3f69aac578d05fe26961e0125531c4fcb34a62af3e7a288d862f6eb34803c9e144a1661d0f8fc78ef8b87f3bf7ae89672f4ff196aadc60a8eae6483bae2ed065d851f447fc8f9d16bb79a0bd1ffb36ce652538c792e7c05235526d8d2eceed0ab7823ddf4076d2dfe3efa02b5b8f9d16bb79a0bd1ffb36ce652538c792e7c05235526d8d2eceed0ab7823ddf40f32e52a595d5e141717f6417f948aaa279d001d13b7a2bee2460d1d835ce38d700864368eab8f2b10f31642b295093604226def5e00d3d6a929c2e4596344032166b1741ede12384d1e9263bfd40239651284929d15a1aae886b7cf155fbd493', 'Hm_lvt_e11e5fa7b1c17369dacfb3f063d64cae': '1665814075', 'JSESSIONID': '12070D5B8A0173C0509273A9FD2060C5', 'Hm_lpvt_e11e5fa7b1c17369dacfb3f063d64cae': '1666073527'"
    'Cookie': "'JSESSIONID=2E161C24EE80617B11702B4E76A42FF0; _qquc=6d2af3823bb4829a95200dba06ccb0265939e590873e5cef7edcafe1c96a755abf5e2403097381fa2d9a6c4604d69e51fc7914acc029fb5033a0475182f48025f054a58ca304d4f0925129c937cb693595284cd9e2fd46f4ee81e2dc73caad231f424d32347fd2822284431a74dec2c4b83f1b9d8786ebd0d7365bf0935405158f9d16bb79a0bd1f25565e5fdaa4fefb8026fd46b9c3cb6230301904f42b70dac9ff77c32d9228079178f395dd41708e28f30f4223948f25c30255de9bdf9210c2c95a92b6f80aff8ab6cd29261c19c181d4ecb97a9c5e4841b4668dbfd6df042c4e113b1f49a2735a11313cd1738fec5bcb7e4f0ae6ab92005e478fba8f38efe561ea8de8abb858e98715ccff56e8a9b336d1c71d4222cfd4067afd041d5b6002351d15ba1cc3738e7b731a999e72d4eac3bf864dc7c4e26062333f8df1e661d4067afd041d5b6002351d15ba1cc3738e7b731a999e72d4eac3bf864dc7c4e2ed41a28a5d5b372fd4e3cd2cdd4ab3e976652525878b2ab6c6cf81d18a62f4ef6a0d294378ea2f7c792744feca70155430d053014964fed2194a1d90a161c73c6f7567f12bdd69a2d9b81824ced2ccba099beefbb90aee3433a598ff7d0d05e8; Hm_lvt_e11e5fa7b1c17369dacfb3f063d64cae=1649303831,1649383836,1650452961; Hm_lpvt_e11e5fa7b1c17369dacfb3f063d64cae=1650453289'"
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'myfirstPj.middlewares.MyfirstpjSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'myfirstPj.middlewares.MyfirstpjDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'myfirstPj.pipelines.MyfirstpjPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


# Exported feeds are written as UTF-8 (keeps Chinese text readable).
FEED_EXPORT_ENCODING='utf-8'
|||
@ -0,0 +1,25 @@ |
|||
import scrapy |
|||
|
|||
import papernews_spider |
|||
from papernews_spider.myfirstPj.spiders import Tools |
|||
|
|||
|
|||
# 正文爬虫(测试用) |
|||
# import PapernewsSpider |
|||
|
|||
|
|||
class NewscontentSpider(scrapy.Spider):
    """Test spider: dumps article body text from the paper.com.cn
    front page into News.txt (正文爬虫, 测试用).
    """

    name = 'NewsContent'
    allowed_domains = ['www.paper.com.cn']
    start_urls = ['http://www.paper.com.cn/']

    def parse(self, response):
        """Extract <b>/<p> text from the page and append it to News.txt.

        Fixes three defects of the original version:
        - a class-level open('url.txt', 'a') leaked a file handle the
          moment the module was imported (removed);
        - parse() opened a second 'url.txt' handle that was never closed
          (removed);
        - the *file object* (opened in append mode, which iterates as
          empty) was passed to Tools.write_txt instead of the scraped
          text, so nothing was ever written. The scraped list is now
          written instead.
        """
        # Article body text: bold headings plus paragraph text.
        lists = response.xpath('//b/text() | //p/text()').extract()
        # lists = response.xpath("//a[contains(@target, '_blank')]/text()").extract()

        # Append the scraped text to News.txt (Tools methods hold no
        # state, so a throwaway instance is fine).
        Tools().write_txt("News.txt", lists)
|||
@ -0,0 +1,74 @@ |
|||
# -*- coding: utf-8 -*- |
|||
import datetime |
|||
|
|||
import scrapy |
|||
from scrapy import Request |
|||
import oss2 |
|||
import os |
|||
|
|||
from scrapy import Selector |
|||
|
|||
from papernews_spider.myfirstPj.items import MyfirstpjItem |
|||
from papernews_spider.myfirstPj.spiders import Tools |
|||
from papernews_spider.myfirstPj import settings |
|||
|
|||
|
|||
# 爬取纸业网咨询中心的印刷出版页面 |
|||
# Crawls the domestic-news listing of the paper.com.cn information centre.
class PaperNewsSpider(scrapy.Spider):
    """Crawl the domestic-news listing of paper.com.cn: collect the
    article links from the listing page, then fetch and store each
    article's text (and push it to Aliyun OSS).
    """

    # Spider name (used by "scrapy crawl PaperNews").
    name = 'PaperNews'
    # Restrict crawling to the paper.com.cn host.
    allowed_domains = ['www.paper.com.cn']

    # Domestic-news listing page (GBK-escaped query string).
    start_urls = ['http://www.paper.com.cn/news/nation.php?news_type=%B9%FA%C4%DA%D7%CA%D1%B6']  # 07

    def parse(self, response):
        """Collect article links, record them in newsUrl.txt and request
        every article page for parse_second.

        The original walked the list backwards with two manual
        while-loops whose first body executed unconditionally, so an
        empty listing crashed with IndexError; building the list with a
        comprehension and iterating with reversed() preserves the
        original (reverse) request order while handling the empty case.
        """
        # Relative article links in the listing table.
        lists = response.xpath('//td[@width="85%"]/a/@href').extract()

        # Prefix the site domain onto every relative link.
        newlists = ['http://www.paper.com.cn%s' % href for href in lists]

        tools = Tools()
        # Append the harvested links to newsUrl.txt.
        tools.write_txt("newsUrl.txt", newlists)

        # Request every article page (reverse order, as before).
        for url in reversed(newlists):
            yield Request(url, callback=self.parse_second)

    def parse_second(self, response):
        """Extract the article body text, store it and upload to OSS."""
        # Article detail page: bold headings plus paragraph text.
        lists = response.xpath('//b/text() | //p/text()').extract()

        tools = Tools()
        # Append the article text to News.txt.
        tools.write_txt("News.txt", lists)
        print(lists)

        # Push the text to Aliyun OSS under today's date.
        # NOTE(review): the key prefix says "wastePaperPrice" although
        # this is the news spider - probably copied from
        # PaperPriceSpider; confirm the intended OSS layout.
        now = datetime.datetime.now()
        directory = "spider-information-data/paper.com.cn/wastePaperPrice/" + str(now.strftime("%Y-%m-%d")) + "/"
        tools.put(directory, lists)

    def start_requests(self):
        """Overridden entry point: issue a Request for start_urls[0].

        Headers and cookies are supplied via settings.py, so none are
        attached here; the response is delivered to parse().
        """
        yield Request(self.start_urls[0], callback=self.parse)
|||
@ -0,0 +1,82 @@ |
|||
import datetime |
|||
import json |
|||
|
|||
import scrapy |
|||
from scrapy import Request |
|||
from scrapy.http import FormRequest |
|||
|
|||
# 爬取富宝咨询的废纸与成品纸价格变动 |
|||
from papernews_spider.myfirstPj.spiders import Tools |
|||
|
|||
|
|||
class PaperPricesSpider(scrapy.Spider):
    """Scrape the paper-mill price-adjustment column of news.f139.com
    (Fubao): waste-paper and finished-paper price changes.

    Flow: parse() harvests article links from the listing page, keeps
    only the ones not seen before (diff against oldPriceUrl.txt), then
    requests each new article; parse_second() extracts, cleans and
    stores the article text and uploads it to Aliyun OSS.
    """

    # Spider name (used by "scrapy crawl PaperPriceSpider").
    name = 'PaperPriceSpider'

    # Restrict crawling to the Fubao news host.
    allowed_domains = ['news.f139.com']

    # Listing page of the price-adjustment column.
    start_urls = ['http://news.f139.com/list.do?channelID=94&categoryID=27']

    def parse(self, response):
        # Collect all article links (anchors that open in a new tab).
        lists = response.xpath('//a[@target="_blank"]/@href').extract()

        # Prefix the host onto every relative link.
        # NOTE(review): newlists aliases lists, so this rewrites lists
        # in place as well.
        newlists = lists
        for i in range(len(lists)):
            newlists[i] = 'http://news.f139.com%s' % (lists[i])

        # Append the harvested links to priceUrl.txt.
        # NOTE(review): Tools.__init__(self) evaluates to None, which is
        # then passed as Tools' self - this only works because the Tools
        # methods never touch self; Tools().write_txt(...) would be clearer.
        Tools.write_txt(Tools.__init__(self), "priceUrl.txt", newlists)

        # Keep only links not already recorded in oldPriceUrl.txt
        # (url_manage returns a *set*, so ordering below is arbitrary).
        newlists = Tools.url_manage(Tools.__init__(self), 'priceUrl.txt', 'oldPriceUrl.txt')

        # Strip the trailing newlines left by readlines() and the long
        # "=====" separator lines.
        newlists = Tools.cleantxt(Tools.__init__(self), '\n', '', newlists)
        newlists = Tools.cleantxt(Tools.__init__(self),
                                  '===============================================================================================',
                                  '', newlists)

        # print(newlists)
        # Record the newly seen links so they are skipped next run.
        Tools.write_txt(Tools.__init__(self), "oldPriceUrl.txt", newlists)

        # Request every new article page; after scheduling the last one,
        # emit a JSON status line on the console.
        for i in range(len(newlists)):
            yield Request(newlists[i], callback=self.parse_second)
            if i == (len(newlists) - 1):
                data = {"name": PaperPricesSpider.name, "url": newlists, "status": 200}
                print(json.dumps(data))

        # print(response.body.decode('utf-8', 'ignore'))

    def parse_second(self, response):
        # Article detail page: headline plus the <div id="zhengwen"> body.
        lists = response.xpath(
            "//h1/text() | //div[@id='zhengwen']/text() | //div[@id='zhengwen']//p[not(@*)]/text()").extract()
        # lists = response.xpath("normalize-space(.//h1/text() | //div[@id='zhengwen']/text() | //div[@id='zhengwen']//p[not(@*)]/text())").extract_first()

        # Rebrand: replace the source name "富宝" (Fubao) with "千鸟".
        lists = Tools.cleantxt(Tools.__init__(self), '富宝', '千鸟', lists)
        # Drop boilerplate/blank fragments by position.
        # NOTE(review): these hard-coded slices assume a fixed page
        # layout and will silently cut real content if f139.com changes
        # its markup - confirm against a live page.
        del lists[1:5]
        del lists[12:18]  # drop blank lines
        # Append the cleaned article text to priceText.txt.
        Tools.write_txt(Tools.__init__(self), "priceText.txt", lists)
        # Push the cleaned text to Aliyun OSS under today's date (optional).
        now = datetime.datetime.now()
        directory = "spider-information-data/fuBao/wastePaperPrice/" + str(now.strftime("%Y-%m-%d")) + "/"
        Tools.put(Tools.__init__(self), directory, lists)

    # Overridden request entry point.
    def start_requests(self):
        """Overridden entry point: issue a Request for start_urls[0].

        Headers and cookies are supplied via settings.py, so none are
        attached here; the response is delivered to parse().
        """
        yield Request(self.start_urls[0], callback=self.parse)
|||
@ -0,0 +1,8 @@ |
|||
import scrapy |
|||
# Template holding a spider's page configuration.
class UrllRoom():
    # Values intended to be pulled from a database query.
    # NOTE(review): the class name contains a typo ("Urll"); left
    # unchanged because renaming would break any importers.
    name = 'baidu'
    allowed_domains = ['www.baidu.com']
    start_urls = ['http://www.baidu.com/']
|||
|
|||
@ -0,0 +1,76 @@ |
|||
# This package will contain the spiders of your Scrapy project |
|||
# |
|||
# Please refer to the documentation for information on how to create and manage |
|||
# your spiders. |
|||
|
|||
import oss2 |
|||
from papernews_spider.Module.generateID import IdWorker |
|||
|
|||
|
|||
class Tools:
    """Shared helpers for the spiders: Aliyun OSS upload, text-file
    output, URL de-duplication and string cleaning.

    No method touches instance state, so a throwaway Tools() instance
    works everywhere.
    """

    def put(self, directory, datalist):
        """Concatenate *datalist* and upload it to Aliyun OSS as a new
        <snowflake-id>.txt object under *directory*.

        :param directory: OSS key prefix, e.g. "a/b/2022-01-01/".
        :param datalist: iterable of strings forming the object body.
        """
        # SECURITY(review): this AccessKey pair is hard-coded (and now
        # committed); it should be revoked and loaded from environment
        # variables / a RAM-user credential store instead.
        auth = oss2.Auth('LTAI5tRJKZvY8Switqrb3366', 'qAi7Hdrvvc7WLuvOr9n2g5PuBs3Vhn')
        endpoint = 'oss-cn-shenzhen.aliyuncs.com'  # public endpoint
        # endpoint = 'oss-cn-shenzhen-internal.aliyuncs.com'  # intranet endpoint
        bucket = oss2.Bucket(auth, endpoint, 'qn-data-lake')

        # Unique object name from the snowflake-style id generator.
        worker = IdWorker(1, 2, 0)
        object_name = str(worker.get_id()) + ".txt"
        # join() replaces the original quadratic string += loop.
        bucket.put_object(directory + object_name, ''.join(datalist))

    def write_txt(self, filename, listname):
        """Append every non-noise entry of *listname* to *filename*
        (UTF-8), one entry per line.

        :param filename: output file, opened in append mode.
        :param listname: iterable of strings (e.g. an XPath extract list).
        """
        # Whitespace-only fragments the scraped pages are known to emit.
        # NOTE(review): the exact space counts were transcribed from the
        # original equality chain - verify against real page output.
        noise = ('\r\n            ',
                 '\r\n        ',
                 '\r\n   ',
                 '\r\t',
                 '\r\r\n        \r\n            \r\n            ',
                 '\r\r\n        \r\n            ',
                 '\r\n \r\n ',
                 '\r\r\n    \r\n        \r\n        ')
        # "with" guarantees the handle is closed even if a write fails
        # (the original left it open on error).
        with open(filename, 'a', encoding='UTF-8') as file:
            for item in listname:
                if item not in noise:  # skip noise entries
                    file.write(item)
                    file.write("\n")

    def url_manage(self, newlist, oldurl):
        """Diff two URL files and return the links not yet crawled.

        :param newlist: path of the file holding freshly scraped links.
        :param oldurl: path of the file holding already-crawled links.
        :return: set of lines present in *newlist* but absent from
                 *oldurl*. NOTE: a set, so iteration order is arbitrary.
        """
        with open(newlist, 'r', encoding='utf-8') as newfile:
            new = newfile.readlines()
        with open(oldurl, 'r', encoding='utf-8') as oldfile:
            old = oldfile.readlines()
        # Set difference = links that have not been crawled before.
        return set(new).difference(old)

    def cleantxt(self, cleancontent, replacecontent, lists=()):
        """Return a new list with *cleancontent* replaced by
        *replacecontent* in every element of *lists*.

        Fixes the original, which returned from inside the first loop
        iteration (rebuilding the full list but confusingly), returned
        None for empty input, and whose default argument was the
        built-in *list* type itself; an empty input now yields [].

        :param cleancontent: substring to replace.
        :param replacecontent: replacement substring.
        :param lists: iterable of strings to clean.
        :return: new list of cleaned strings.
        """
        return [i.replace(cleancontent, replacecontent) for i in lists]
|||
@ -0,0 +1,6 @@ |
|||
# Entry point: launch the spider selected in SetSpiderName via the
# Scrapy command line, writing all log output to debug.log (this keeps
# the console quiet).
from scrapy import cmdline

import SetSpiderName
# Example of running a fixed spider instead:
# cmdline.execute('scrapy crawl baidu -s LOG_FILE=debug.log'.split())
cmdline.execute(('scrapy crawl %s -s LOG_FILE=debug.log' %(SetSpiderName.SetName.name)).split())
# Variant that logs to the console instead of debug.log:
# cmdline.execute(('scrapy crawl %s' %(SetSpiderName.SetName.name)).split())
|||
@ -0,0 +1,5 @@ |
|||
|
|||
|
|||
# Central switch for which spider tools/Run.py launches.
class SetName():
    # Name of the spider to run; swap the comment to run the
    # paper.com.cn news spider instead.
    # name = "PaperNews"
    name = "PaperPriceSpider"
|||
@ -0,0 +1,11 @@ |
|||
# Automatically created by: scrapy startproject |
|||
# |
|||
# For more information about the [deploy] section see: |
|||
# https://scrapyd.readthedocs.io/en/latest/deploy.html |
|||
|
|||
[settings] |
|||
default = myfirstPj.settings |
|||
|
|||
[deploy] |
|||
#url = http://localhost:6800/ |
|||
project = myfirstPj |
|||
Write
Preview
Loading…
Cancel
Save