commit
7c37472886
18 changed files with 699 additions and 0 deletions
Unified View
Diff Options
-
154.gitignore
-
21README.md
-
0papernews_spider/__init__.py
-
0papernews_spider/myfirstPj/__init__.py
-
13papernews_spider/myfirstPj/items.py
-
103papernews_spider/myfirstPj/middlewares.py
-
25papernews_spider/myfirstPj/pipelines.py
-
96papernews_spider/myfirstPj/settings.py
-
25papernews_spider/myfirstPj/spiders/NewsContent.py
-
74papernews_spider/myfirstPj/spiders/PaperNewsSpider.py
-
82papernews_spider/myfirstPj/spiders/PaperPriceSpider.py
-
8papernews_spider/myfirstPj/spiders/Url.py
-
76papernews_spider/myfirstPj/spiders/__init__.py
-
0papernews_spider/myfirstPj/tools/Api.py
-
6papernews_spider/myfirstPj/tools/Run.py
-
5papernews_spider/myfirstPj/tools/SetSpiderName.py
-
0papernews_spider/myfirstPj/tools/__init__.py
-
11papernews_spider/scrapy.cfg
@ -0,0 +1,154 @@ |
|||||
|
### IntelliJ IDEA ### |
||||
|
.idea |
||||
|
.gitback |
||||
|
|
||||
|
# ---> Python |
||||
|
# Byte-compiled / optimized / DLL files |
||||
|
__pycache__/ |
||||
|
*.py[cod] |
||||
|
*$py.class |
||||
|
|
||||
|
# C extensions |
||||
|
*.so |
||||
|
|
||||
|
# Distribution / packaging |
||||
|
.Python |
||||
|
build/ |
||||
|
develop-eggs/ |
||||
|
dist/ |
||||
|
downloads/ |
||||
|
eggs/ |
||||
|
.eggs/ |
||||
|
lib/ |
||||
|
lib64/ |
||||
|
parts/ |
||||
|
sdist/ |
||||
|
var/ |
||||
|
wheels/ |
||||
|
share/python-wheels/ |
||||
|
*.egg-info/ |
||||
|
.installed.cfg |
||||
|
*.egg |
||||
|
MANIFEST |
||||
|
|
||||
|
# PyInstaller |
||||
|
# Usually these files are written by a python script from a template |
||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it. |
||||
|
*.manifest |
||||
|
*.spec |
||||
|
|
||||
|
# Installer logs |
||||
|
pip-log.txt |
||||
|
pip-delete-this-directory.txt |
||||
|
|
||||
|
# Unit test / coverage reports |
||||
|
htmlcov/ |
||||
|
.tox/ |
||||
|
.nox/ |
||||
|
.coverage |
||||
|
.coverage.* |
||||
|
.cache |
||||
|
nosetests.xml |
||||
|
coverage.xml |
||||
|
*.cover |
||||
|
*.py,cover |
||||
|
.hypothesis/ |
||||
|
.pytest_cache/ |
||||
|
cover/ |
||||
|
|
||||
|
# Translations |
||||
|
*.mo |
||||
|
*.pot |
||||
|
|
||||
|
# Django stuff: |
||||
|
*.log |
||||
|
local_settings.py |
||||
|
db.sqlite3 |
||||
|
db.sqlite3-journal |
||||
|
|
||||
|
# Flask stuff: |
||||
|
instance/ |
||||
|
.webassets-cache |
||||
|
|
||||
|
# Scrapy stuff: |
||||
|
.scrapy |
||||
|
|
||||
|
# Sphinx documentation |
||||
|
docs/_build/ |
||||
|
|
||||
|
# PyBuilder |
||||
|
.pybuilder/ |
||||
|
target/ |
||||
|
|
||||
|
# Jupyter Notebook |
||||
|
.ipynb_checkpoints |
||||
|
|
||||
|
# IPython |
||||
|
profile_default/ |
||||
|
ipython_config.py |
||||
|
|
||||
|
# pyenv |
||||
|
# For a library or package, you might want to ignore these files since the code is |
||||
|
# intended to run in multiple environments; otherwise, check them in: |
||||
|
# .python-version |
||||
|
|
||||
|
# pipenv |
||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. |
||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies |
||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not |
||||
|
# install all needed dependencies. |
||||
|
#Pipfile.lock |
||||
|
|
||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow |
||||
|
__pypackages__/ |
||||
|
|
||||
|
# Celery stuff |
||||
|
celerybeat-schedule |
||||
|
celerybeat.pid |
||||
|
|
||||
|
# SageMath parsed files |
||||
|
*.sage.py |
||||
|
|
||||
|
# Environments |
||||
|
.env |
||||
|
.venv |
||||
|
env/ |
||||
|
venv/ |
||||
|
ENV/ |
||||
|
env.bak/ |
||||
|
venv.bak/ |
||||
|
|
||||
|
# Spyder project settings |
||||
|
.spyderproject |
||||
|
.spyproject |
||||
|
|
||||
|
# Rope project settings |
||||
|
.ropeproject |
||||
|
|
||||
|
# mkdocs documentation |
||||
|
/site |
||||
|
|
||||
|
# mypy |
||||
|
.mypy_cache/ |
||||
|
.dmypy.json |
||||
|
dmypy.json |
||||
|
|
||||
|
# Pyre type checker |
||||
|
.pyre/ |
||||
|
|
||||
|
# pytype static type analyzer |
||||
|
.pytype/ |
||||
|
|
||||
|
# Cython debug symbols |
||||
|
cython_debug/ |
||||
|
|
||||
|
#文档和日志 |
||||
|
*.txt |
!requirements.txt |
||||
|
|
||||
|
#飞桨 |
||||
|
/papernews_spider/Module/model_best/ |
||||
|
/papernews_spider/Module/ |
||||
|
|
||||
|
#包含test的测试文件 |
||||
|
*test* |
||||
|
/papernews_spider/myfirstPj/spiders/model_best/ |
||||
@ -0,0 +1,21 @@ |
|||||
|
|
||||
|
## demo |
||||
|
|
||||
|
1. 安装Python 3.7 以上版本。 |
||||
|
2. 安装依赖:命令行执行 `pip install -r requirements.txt`。 |
||||
|
|
||||
|
|
||||
|
### 说明 |
||||
|
|
||||
|
+ Run.py是运行爬虫的方法,在SetSpiderName.py里修改要运行的爬虫项目 |
||||
|
+ 爬取纸业网咨询中心的印刷出版页面链接以及链接内详情内容。(PaperNewsSpider.py),链接为:http://www.paper.com.cn/news/nation.php?news_type=%D3%A1%CB%A2%B3%F6%B0%E6 |
||||
|
+ 文件News.txt是爬取到的资讯数据 |
||||
|
+ 爬取富宝咨询的纸厂调价栏目,记录的是废纸以及成品纸的价格变动(PaperPriceSpider.py)链接为:http://news.f139.com |
||||
|
+ 已经写好网页去重,并写好可以复用的相关的去重方法和清洗方法 |
||||
|
|
||||
|
### 文件说明 |
||||
|
+ spider文件夹存放的是爬虫文件 |
||||
|
+ tools文件夹存放的是各种工具类,包含各种测试用的工具类, |
||||
|
+ (现在已经关闭控制台输出) |
||||
|
|
||||
|
|
||||
@ -0,0 +1,13 @@ |
|||||
|
# Define here the models for your scraped items |
||||
|
# |
||||
|
# See documentation in: |
||||
|
# https://docs.scrapy.org/en/latest/topics/items.html |
||||
|
# -*- coding: utf-8 -*- |
||||
|
import scrapy |
||||
|
|
||||
|
# Model for the data extracted by the spiders: each parsed page fills one
# of these items, which is then handed to the item pipeline.
class MyfirstpjItem(scrapy.Item):
    """Scraped-page container holding the article text and its source URL."""

    # define the fields for your item here like:
    # Extracted article/body text.
    text = scrapy.Field()
    # URL of the page the text came from.
    url = scrapy.Field()
||||
@ -0,0 +1,103 @@ |
|||||
|
# Define here the models for your spider middleware |
||||
|
# |
||||
|
# See documentation in: |
||||
|
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html |
||||
|
|
||||
|
from scrapy import signals |
||||
|
|
||||
|
# useful for handling different item types with a single interface |
||||
|
from itemadapter import is_item, ItemAdapter |
||||
|
|
||||
|
|
||||
|
class MyfirstpjSpiderMiddleware:
    """Default spider middleware generated by Scrapy.

    Every hook keeps the pass-through behaviour: responses, results and
    start requests are forwarded unchanged; only spider_opened logs.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy: build the middleware and wire signals.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Response on its way into the spider; None means "continue".
        return None

    def process_spider_output(self, response, result, spider):
        # Forward every request/item the spider produced, unchanged.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # No special handling: fall through to Scrapy's default behaviour.
        pass

    def process_start_requests(self, start_requests, spider):
        # Start requests pass straight through (must yield only requests).
        yield from start_requests

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
||||
|
|
||||
|
|
||||
|
class MyfirstpjDownloaderMiddleware:
    """Default downloader middleware generated by Scrapy.

    All hooks keep the pass-through behaviour: requests continue down the
    chain, responses come back untouched; only spider_opened logs.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy: build the middleware and wire signals.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        # Returning None lets the downloader chain keep processing.
        return None

    def process_response(self, request, response, spider):
        # Hand the downloaded response back unchanged.
        return response

    def process_exception(self, request, exception, spider):
        # Returning None lets other middlewares / Scrapy handle the error.
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
||||
@ -0,0 +1,25 @@ |
|||||
|
# Define your item pipelines here |
||||
|
# |
||||
|
# Don't forget to add your pipeline to the ITEM_PIPELINES setting |
||||
|
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html |
||||
|
|
||||
|
# -*- coding: utf-8 -*- |
||||
|
# useful for handling different item types with a single interface |
||||
|
import requests |
||||
|
from itemadapter import ItemAdapter |
||||
|
|
||||
|
|
||||
|
# Item pipeline; its position in the pipeline chain is set by the
# priority number in ITEM_PIPELINES in settings.py.
class MyfirstpjPipeline:
    """Minimal pipeline that echoes every scraped item to the console.

    (An earlier revision wrote items to items.txt with manual encoding
    handling; that code was removed in favour of plain console output.)
    """

    def process_item(self, item, spider):
        """Print ``item`` and return it unchanged.

        Returning the item is required by Scrapy's pipeline contract
        (return the item or raise DropItem); the original returned None,
        which would feed None into any later pipeline stage.
        """
        print(item)
        return item
||||
@ -0,0 +1,96 @@ |
|||||
|
# Scrapy settings for myfirstPj project |
||||
|
# |
||||
|
# For simplicity, this file contains only settings considered important or |
||||
|
# commonly used. You can find more settings consulting the documentation: |
||||
|
# |
||||
|
# https://docs.scrapy.org/en/latest/topics/settings.html |
||||
|
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html |
||||
|
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html |
||||
|
|
||||
|
BOT_NAME = 'myfirstPj' |
||||
|
|
||||
|
SPIDER_MODULES = ['myfirstPj.spiders'] |
||||
|
NEWSPIDER_MODULE = 'myfirstPj.spiders' |
||||
|
|
||||
|
|
||||
|
# Crawl responsibly by identifying yourself (and your website) on the user-agent |
||||
|
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1935.94 Safari/537.36' |
||||
|
|
||||
|
|
||||
|
|
||||
|
# Obey robots.txt rules |
||||
|
ROBOTSTXT_OBEY = False |
||||
|
|
||||
|
# Configure maximum concurrent requests performed by Scrapy (default: 16) |
||||
|
#CONCURRENT_REQUESTS = 32 |
||||
|
|
||||
|
# Configure a delay for requests for the same website (default: 0) |
||||
|
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay |
||||
|
# See also autothrottle settings and docs |
||||
|
#DOWNLOAD_DELAY = 3 |
||||
|
# The download delay setting will honor only one of: |
||||
|
#CONCURRENT_REQUESTS_PER_DOMAIN = 16 |
||||
|
#CONCURRENT_REQUESTS_PER_IP = 16 |
||||
|
|
||||
|
# Disable cookies (enabled by default) |
||||
|
COOKIES_ENABLED = False |
||||
|
|
||||
|
# Disable Telnet Console (enabled by default) |
||||
|
#TELNETCONSOLE_ENABLED = False |
||||
|
|
||||
|
# Override the default request headers: |
||||
|
DEFAULT_REQUEST_HEADERS = { |
||||
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', |
||||
|
'Accept-Language': 'en', |
||||
|
'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1935.94 Safari/537.36', |
||||
|
# 'Cookie': "'_qquc': '6d2af3823bb4829a95200dba06ccb0265939e590873e5cef7edcafe1c96a755abf5e2403097381fa2d9a6c4604d69e51fc7914acc029fb5033a0475182f48025f054a58ca304d4f0925129c937cb6935540e9c8a32c823850e2a4fd1b9e6a7081f424d32347fd2822284431a74dec2c47cf56d87c11ed27ae08743d556ec1fbf41b4668dbfd6df049246d413308d16aed327f1420253934934bbb062de14706171347d330ba71e632c2d6a89b62e833cd2fac9e3fc13e07e94c47dbc159d7fed1db22e3274c3e3f940651d83de34fc405f741b3f69aac578d05fe26961e0125531c4fcb34a62af3e7a288d862f6eb34803c9e144a1661d0f8fc78ef8b87f3bf7ae89672f4ff196aadc60a8eae6483bae2ed065d851f447fc8f9d16bb79a0bd1ffb36ce652538c792e7c05235526d8d2eceed0ab7823ddf4076d2dfe3efa02b5b8f9d16bb79a0bd1ffb36ce652538c792e7c05235526d8d2eceed0ab7823ddf40f32e52a595d5e141717f6417f948aaa279d001d13b7a2bee2460d1d835ce38d700864368eab8f2b10f31642b295093604226def5e00d3d6a929c2e4596344032166b1741ede12384d1e9263bfd40239651284929d15a1aae886b7cf155fbd493', 'Hm_lvt_e11e5fa7b1c17369dacfb3f063d64cae': '1665814075', 'JSESSIONID': '12070D5B8A0173C0509273A9FD2060C5', 'Hm_lpvt_e11e5fa7b1c17369dacfb3f063d64cae': '1666073527'" |
||||
|
'Cookie': "'JSESSIONID=2E161C24EE80617B11702B4E76A42FF0; _qquc=6d2af3823bb4829a95200dba06ccb0265939e590873e5cef7edcafe1c96a755abf5e2403097381fa2d9a6c4604d69e51fc7914acc029fb5033a0475182f48025f054a58ca304d4f0925129c937cb693595284cd9e2fd46f4ee81e2dc73caad231f424d32347fd2822284431a74dec2c4b83f1b9d8786ebd0d7365bf0935405158f9d16bb79a0bd1f25565e5fdaa4fefb8026fd46b9c3cb6230301904f42b70dac9ff77c32d9228079178f395dd41708e28f30f4223948f25c30255de9bdf9210c2c95a92b6f80aff8ab6cd29261c19c181d4ecb97a9c5e4841b4668dbfd6df042c4e113b1f49a2735a11313cd1738fec5bcb7e4f0ae6ab92005e478fba8f38efe561ea8de8abb858e98715ccff56e8a9b336d1c71d4222cfd4067afd041d5b6002351d15ba1cc3738e7b731a999e72d4eac3bf864dc7c4e26062333f8df1e661d4067afd041d5b6002351d15ba1cc3738e7b731a999e72d4eac3bf864dc7c4e2ed41a28a5d5b372fd4e3cd2cdd4ab3e976652525878b2ab6c6cf81d18a62f4ef6a0d294378ea2f7c792744feca70155430d053014964fed2194a1d90a161c73c6f7567f12bdd69a2d9b81824ced2ccba099beefbb90aee3433a598ff7d0d05e8; Hm_lvt_e11e5fa7b1c17369dacfb3f063d64cae=1649303831,1649383836,1650452961; Hm_lpvt_e11e5fa7b1c17369dacfb3f063d64cae=1650453289'" |
||||
|
} |
||||
|
|
||||
|
# Enable or disable spider middlewares |
||||
|
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html |
||||
|
#SPIDER_MIDDLEWARES = { |
||||
|
# 'myfirstPj.middlewares.MyfirstpjSpiderMiddleware': 543, |
||||
|
#} |
||||
|
|
||||
|
# Enable or disable downloader middlewares |
||||
|
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html |
||||
|
#DOWNLOADER_MIDDLEWARES = { |
||||
|
# 'myfirstPj.middlewares.MyfirstpjDownloaderMiddleware': 543, |
||||
|
#} |
||||
|
|
||||
|
# Enable or disable extensions |
||||
|
# See https://docs.scrapy.org/en/latest/topics/extensions.html |
||||
|
#EXTENSIONS = { |
||||
|
# 'scrapy.extensions.telnet.TelnetConsole': None, |
||||
|
#} |
||||
|
|
||||
|
# Configure item pipelines |
||||
|
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html |
||||
|
ITEM_PIPELINES = { |
||||
|
'myfirstPj.pipelines.MyfirstpjPipeline': 300, |
||||
|
} |
||||
|
|
||||
|
# Enable and configure the AutoThrottle extension (disabled by default) |
||||
|
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html |
||||
|
#AUTOTHROTTLE_ENABLED = True |
||||
|
# The initial download delay |
||||
|
#AUTOTHROTTLE_START_DELAY = 5 |
||||
|
# The maximum download delay to be set in case of high latencies |
||||
|
#AUTOTHROTTLE_MAX_DELAY = 60 |
||||
|
# The average number of requests Scrapy should be sending in parallel to |
||||
|
# each remote server |
||||
|
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 |
||||
|
# Enable showing throttling stats for every response received: |
||||
|
#AUTOTHROTTLE_DEBUG = False |
||||
|
|
||||
|
# Enable and configure HTTP caching (disabled by default) |
||||
|
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings |
||||
|
#HTTPCACHE_ENABLED = True |
||||
|
#HTTPCACHE_EXPIRATION_SECS = 0 |
||||
|
#HTTPCACHE_DIR = 'httpcache' |
||||
|
#HTTPCACHE_IGNORE_HTTP_CODES = [] |
||||
|
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' |
||||
|
|
||||
|
|
||||
|
FEED_EXPORT_ENCODING='utf-8' |
||||
@ -0,0 +1,25 @@ |
|||||
|
import scrapy |
||||
|
|
||||
|
import papernews_spider |
||||
|
from papernews_spider.myfirstPj.spiders import Tools |
||||
|
|
||||
|
|
||||
|
# Article-content spider (used for testing detail-page extraction).
# import PapernewsSpider


class NewscontentSpider(scrapy.Spider):
    # Spider name used by `scrapy crawl NewsContent`.
    name = 'NewsContent'
    allowed_domains = ['www.paper.com.cn']
    # NOTE(review): opened at class-definition time and never closed --
    # this leaks a file handle on import, and the attribute is shadowed
    # by the local `urllist` inside parse(); consider removing it.
    urllist = open('url.txt', 'a', encoding='utf-8')
    start_urls = ['http://www.paper.com.cn/']

    def parse(self, response):
        # Article detail page: bold headings plus paragraph text.
        lists = response.xpath('//b/text() | //p/text()').extract()
        # lists = response.xpath("//a[contains(@target, '_blank')]/text()").extract()
        # Open the saved-URL file.
        # NOTE(review): opened in append mode, so iterating it below
        # yields no lines, the handle is never closed, and the extracted
        # `lists` above is never used -- looks like leftover test code.
        urllist = open("url.txt", 'a', encoding='utf-8')

        # Append to News.txt (Tools.__init__ returns None, which
        # write_txt accepts because it never uses its `self` argument).
        Tools.write_txt(Tools.__init__(self), "News.txt", urllist)
||||
@ -0,0 +1,74 @@ |
|||||
|
# -*- coding: utf-8 -*- |
||||
|
import datetime |
||||
|
|
||||
|
import scrapy |
||||
|
from scrapy import Request |
||||
|
import oss2 |
||||
|
import os |
||||
|
|
||||
|
from scrapy import Selector |
||||
|
|
||||
|
from papernews_spider.myfirstPj.items import MyfirstpjItem |
||||
|
from papernews_spider.myfirstPj.spiders import Tools |
||||
|
from papernews_spider.myfirstPj import settings |
||||
|
|
||||
|
|
||||
|
# Crawls the print/publishing pages of the paper.com.cn news centre.
class PaperNewsSpider(scrapy.Spider):
    """Scrapes article links from the domestic-news listing page, then
    fetches each article and stores / uploads its text."""

    # Spider name used by `scrapy crawl PaperNews`.
    name = 'PaperNews'
    # Restrict crawling to the paper.com.cn domain.
    allowed_domains = ['www.paper.com.cn']

    # Domestic-news listing page (news_type is GB2312 percent-encoded).
    start_urls = ['http://www.paper.com.cn/news/nation.php?news_type=%B9%FA%C4%DA%D7%CA%D1%B6']  # 07

    # Main parse callback for the listing page.
    def parse(self, response):

        # Collect the article links from the listing table.
        lists = response.xpath('//td[@width="85%"]/a/@href').extract()

        # Prefix every relative link with the site domain, walking the
        # list from the last index down to 0.
        # NOTE(review): `newlists = lists` aliases (does not copy) the
        # list, and when `lists` is empty, `newlists[-1]` raises
        # IndexError before the loop can terminate.
        lists_num = len(lists) - 1
        newlists = lists
        while True:
            newlists[lists_num] = 'http://www.paper.com.cn%s' % (lists[lists_num])
            lists_num = lists_num - 1
            if lists_num < 0:
                # Reset the counter for the second loop below, then stop.
                lists_num = len(lists) - 1
                break

        # Append the absolute URLs to newsUrl.txt (Tools.__init__
        # returns None; write_txt never uses its `self` argument).
        Tools.write_txt(Tools.__init__(self), "newsUrl.txt", newlists)

        # Crawl every page collected in newlists (again from the last
        # index down to 0, resetting the counter when done).
        while True:
            yield Request(newlists[lists_num], callback=self.parse_second)
            lists_num = lists_num - 1
            if lists_num < 0:
                lists_num = len(lists) - 1
                break

    # Callback for the second-level (article detail) pages.
    def parse_second(self, response):
        # Article detail: bold headings plus paragraph text.
        lists = response.xpath('//b/text() | //p/text()').extract()

        # Append the extracted text to News.txt.
        Tools.write_txt(Tools.__init__(self), "News.txt", lists)
        print(lists)
        # Push the text to Aliyun OSS under a per-day directory.
        now = datetime.datetime.now()
        directory = "spider-information-data/paper.com.cn/wastePaperPrice/" + str(now.strftime("%Y-%m-%d")) + "/"
        Tools.put(Tools.__init__(self), directory, lists)

    # Override the default start_requests.
    def start_requests(self):
        """Issue the initial request for start_urls[0].

        Headers and cookies were moved out of this call and are
        configured in settings.py instead.
        """
        yield Request(self.start_urls[0], callback=self.parse)
||||
@ -0,0 +1,82 @@ |
|||||
|
import datetime |
||||
|
import json |
||||
|
|
||||
|
import scrapy |
||||
|
from scrapy import Request |
||||
|
from scrapy.http import FormRequest |
||||
|
|
||||
|
# 爬取富宝咨询的废纸与成品纸价格变动 |
||||
|
from papernews_spider.myfirstPj.spiders import Tools |
||||
|
|
||||
|
|
||||
|
class PaperPricesSpider(scrapy.Spider):
    """Crawls the paper-mill price-adjustment column of news.f139.com
    (waste-paper and finished-paper price changes), de-duplicates the
    article URLs, and stores / uploads the cleaned article text."""

    # Spider name used by `scrapy crawl PaperPriceSpider`.
    name = 'PaperPriceSpider'

    # Restrict crawling to the f139 news domain.
    allowed_domains = ['news.f139.com']

    # Listing page of the price-adjustment column.
    start_urls = ['http://news.f139.com/list.do?channelID=94&categoryID=27']

    def parse(self, response):
        # Collect the article links from the listing page.
        lists = response.xpath('//a[@target="_blank"]/@href').extract()

        # Prefix every relative link with the site domain.
        # NOTE(review): `newlists = lists` aliases the same list object.

        newlists = lists
        for i in range(len(lists)):
            newlists[i] = 'http://news.f139.com%s' % (lists[i])

        # Append the absolute URLs to priceUrl.txt (Tools.__init__
        # returns None; the helpers never use their `self` argument).
        Tools.write_txt(Tools.__init__(self), "priceUrl.txt", newlists)

        # Keep only URLs not seen before (set difference of the two
        # files).  NOTE(review): url_manage returns a *set*, so the
        # iteration order further down is not deterministic.
        newlists = Tools.url_manage(Tools.__init__(self), 'priceUrl.txt', 'oldPriceUrl.txt')

        # Strip newline characters and the long separator lines.
        newlists = Tools.cleantxt(Tools.__init__(self), '\n', '', newlists)
        newlists = Tools.cleantxt(Tools.__init__(self),
                                  '===============================================================================================',
                                  '', newlists)

        # print(newlists)
        # Record the processed URLs so they are skipped on the next run.
        Tools.write_txt(Tools.__init__(self), "oldPriceUrl.txt", newlists)

        # Crawl every not-yet-seen page; after scheduling the last one,
        # emit a JSON status line on stdout.
        for i in range(len(newlists)):
            yield Request(newlists[i], callback=self.parse_second)
            if i == (len(newlists) - 1):
                data = {"name": PaperPricesSpider.name, "url": newlists, "status": 200}
                print(json.dumps(data))

        # print(response.body.decode('utf-8', 'ignore'))

    # Callback for the second-level (article detail) pages.
    def parse_second(self, response):
        # Article detail: headline plus body paragraphs.
        lists = response.xpath(
            "//h1/text() | //div[@id='zhengwen']/text() | //div[@id='zhengwen']//p[not(@*)]/text()").extract()
        # lists = response.xpath("normalize-space(.//h1/text() | //div[@id='zhengwen']/text() | //div[@id='zhengwen']//p[not(@*)]/text())").extract_first()

        # Rebrand the source name, then drop fixed header/blank rows.
        # NOTE(review): the del-slice offsets assume a fixed page
        # layout; confirm they still match the live site.
        lists = Tools.cleantxt(Tools.__init__(self), '富宝', '千鸟', lists)  # replace the "Fubao" brand with "Qianniao"
        del lists[1:5]
        del lists[12:18]  # delete blank rows
        Tools.write_txt(Tools.__init__(self), "priceText.txt", lists)
        # Push the text to Aliyun OSS under a per-day directory (optional).
        now = datetime.datetime.now()
        directory = "spider-information-data/fuBao/wastePaperPrice/" + str(now.strftime("%Y-%m-%d")) + "/"
        Tools.put(Tools.__init__(self), directory, lists)

    # Override the default start_requests.
    def start_requests(self):
        """Issue the initial request for start_urls[0].

        Headers and cookies were moved out of this call and are
        configured in settings.py instead.
        """
        yield Request(self.start_urls[0], callback=self.parse)
||||
@ -0,0 +1,8 @@ |
|||||
|
import scrapy |
||||
|
# Target-site configuration for a spider.
class UrllRoom():
    """Holds the per-site crawl settings, mirroring the attribute names
    Scrapy spiders expect (name, allowed_domains, start_urls)."""

    # Record to look up (e.g. in the database) for this crawl.
    name = 'baidu'
    # Domain the crawl is restricted to.
    allowed_domains = ['www.baidu.com']
    # Entry URL for the crawl.
    start_urls = ['http://www.baidu.com/']
||||
|
|
||||
@ -0,0 +1,76 @@ |
|||||
|
# This package will contain the spiders of your Scrapy project |
||||
|
# |
||||
|
# Please refer to the documentation for information on how to create and manage |
||||
|
# your spiders. |
||||
|
|
||||
|
import oss2 |
||||
|
from papernews_spider.Module.generateID import IdWorker |
||||
|
|
||||
|
|
||||
|
class Tools:
    """Shared helpers used by the spiders: Aliyun OSS upload, text-file
    output, URL de-duplication and string cleaning.

    Callers throughout this project invoke these as
    ``Tools.method(Tools.__init__(self), ...)``, i.e. with ``self=None``;
    no method reads ``self``, so that calling style stays compatible.
    """

    # Whitespace-only filler strings scraped from the pages that must
    # not be written to the output files.
    # NOTE(review): the exact space counts were recovered from a
    # whitespace-collapsed source -- verify against real scraped data.
    _NOISE = {
        '\r\n ',
        '\r\t',
        '\r\r\n \r\n \r\n ',
        '\r\r\n \r\n ',
        '\r\n \r\n ',
    }

    def put(self, directory, datalist):
        """Concatenate ``datalist`` and upload it as one ``<id>.txt``
        object under ``directory`` in the ``qn-data-lake`` OSS bucket.

        SECURITY: the AccessKey pair below is hard-coded in version
        control -- rotate it and load credentials from the environment
        or a RAM role instead of embedding them in source.
        """
        auth = oss2.Auth('LTAI5tRJKZvY8Switqrb3366', 'qAi7Hdrvvc7WLuvOr9n2g5PuBs3Vhn')
        # Region endpoint of the bucket (Shenzhen, public network).
        endpoint = 'oss-cn-shenzhen.aliyuncs.com'
        # endpoint = 'oss-cn-shenzhen-internal.aliyuncs.com'  # intranet
        bucket = oss2.Bucket(auth, endpoint, 'qn-data-lake')

        # Snowflake-style ID keeps each uploaded object name unique.
        worker = IdWorker(1, 2, 0)
        object_name = str(worker.get_id()) + ".txt"
        # str.join is linear, unlike the original quadratic `+=` loop.
        bucket.put_object(directory + object_name, ''.join(datalist))

    def write_txt(self, filename, listname):
        """Append every non-filler entry of ``listname`` to ``filename``
        (UTF-8), one entry per line.

        The context manager guarantees the handle is closed even if a
        write fails (the original left the file open on error).
        """
        with open(filename, 'a', encoding='UTF-8') as out:
            for item in listname:
                if item not in Tools._NOISE:  # drop whitespace filler
                    out.write(item)
                    out.write("\n")

    def url_manage(self, newlist, oldurl):
        """Return the lines present in file ``newlist`` but absent from
        file ``oldurl`` -- i.e. the URLs not crawled yet.

        :param newlist: path of the file with freshly collected URLs
        :param oldurl: path of the file with already-crawled URLs
        :return: a ``set`` of not-yet-crawled lines (newlines included)
        """
        with open(newlist, 'r', encoding='utf-8') as newfile:
            new = newfile.readlines()
        with open(oldurl, 'r', encoding='utf-8') as oldfile:
            old = oldfile.readlines()
        # Set difference of the two files' lines.
        return set(new).difference(old)

    def cleantxt(self, cleancontent, replacecontent, lists=list):
        """Return a new list with ``cleancontent`` replaced by
        ``replacecontent`` in every entry of ``lists``.

        Fix: the original rebuilt the whole list once per element
        (O(n^2)) and raised an UnboundLocalError on empty input; one
        comprehension handles both.  The odd ``lists=list`` default is
        kept only for signature compatibility.
        """
        return [entry.replace(cleancontent, replacecontent) for entry in lists]
||||
@ -0,0 +1,6 @@ |
|||||
|
# Entry point: launch the spider selected in SetSpiderName through the
# Scrapy command line, writing the log to debug.log.
from scrapy import cmdline

import SetSpiderName

# cmdline.execute('s crapy crawl baidu -s LOG_FILE=debug.log'.split())
command = 'scrapy crawl %s -s LOG_FILE=debug.log' % (SetSpiderName.SetName.name)
cmdline.execute(command.split())
# cmdline.execute(('scrapy crawl %s' %(SetSpiderName.SetName.name)).split())
||||
@ -0,0 +1,5 @@ |
|||||
|
|
||||
|
|
||||
|
class SetName():
    """Single place to choose which spider Run.py launches."""

    # name = "PaperNews"
    # Spider name handed to `scrapy crawl` by Run.py.
    name = "PaperPriceSpider"
||||
@ -0,0 +1,11 @@ |
|||||
|
# Automatically created by: scrapy startproject |
||||
|
# |
||||
|
# For more information about the [deploy] section see: |
||||
|
# https://scrapyd.readthedocs.io/en/latest/deploy.html |
||||
|
|
||||
|
[settings] |
||||
|
default = myfirstPj.settings |
||||
|
|
||||
|
[deploy] |
||||
|
#url = http://localhost:6800/ |
||||
|
project = myfirstPj |
||||
Write
Preview
Loading…
Cancel
Save