commit
7c37472886
18 changed files with 699 additions and 0 deletions
Split View
Diff Options
-
154.gitignore
-
21README.md
-
0papernews_spider/__init__.py
-
0papernews_spider/myfirstPj/__init__.py
-
13papernews_spider/myfirstPj/items.py
-
103papernews_spider/myfirstPj/middlewares.py
-
25papernews_spider/myfirstPj/pipelines.py
-
96papernews_spider/myfirstPj/settings.py
-
25papernews_spider/myfirstPj/spiders/NewsContent.py
-
74papernews_spider/myfirstPj/spiders/PaperNewsSpider.py
-
82papernews_spider/myfirstPj/spiders/PaperPriceSpider.py
-
8papernews_spider/myfirstPj/spiders/Url.py
-
76papernews_spider/myfirstPj/spiders/__init__.py
-
0papernews_spider/myfirstPj/tools/Api.py
-
6papernews_spider/myfirstPj/tools/Run.py
-
5papernews_spider/myfirstPj/tools/SetSpiderName.py
-
0papernews_spider/myfirstPj/tools/__init__.py
-
11papernews_spider/scrapy.cfg
@ -0,0 +1,154 @@ |
|||
### IntelliJ IDEA ### |
|||
.idea |
|||
.gitback |
|||
|
|||
# ---> Python |
|||
# Byte-compiled / optimized / DLL files |
|||
__pycache__/ |
|||
*.py[cod] |
|||
*$py.class |
|||
|
|||
# C extensions |
|||
*.so |
|||
|
|||
# Distribution / packaging |
|||
.Python |
|||
build/ |
|||
develop-eggs/ |
|||
dist/ |
|||
downloads/ |
|||
eggs/ |
|||
.eggs/ |
|||
lib/ |
|||
lib64/ |
|||
parts/ |
|||
sdist/ |
|||
var/ |
|||
wheels/ |
|||
share/python-wheels/ |
|||
*.egg-info/ |
|||
.installed.cfg |
|||
*.egg |
|||
MANIFEST |
|||
|
|||
# PyInstaller |
|||
# Usually these files are written by a python script from a template |
|||
# before PyInstaller builds the exe, so as to inject date/other infos into it. |
|||
*.manifest |
|||
*.spec |
|||
|
|||
# Installer logs |
|||
pip-log.txt |
|||
pip-delete-this-directory.txt |
|||
|
|||
# Unit test / coverage reports |
|||
htmlcov/ |
|||
.tox/ |
|||
.nox/ |
|||
.coverage |
|||
.coverage.* |
|||
.cache |
|||
nosetests.xml |
|||
coverage.xml |
|||
*.cover |
|||
*.py,cover |
|||
.hypothesis/ |
|||
.pytest_cache/ |
|||
cover/ |
|||
|
|||
# Translations |
|||
*.mo |
|||
*.pot |
|||
|
|||
# Django stuff: |
|||
*.log |
|||
local_settings.py |
|||
db.sqlite3 |
|||
db.sqlite3-journal |
|||
|
|||
# Flask stuff: |
|||
instance/ |
|||
.webassets-cache |
|||
|
|||
# Scrapy stuff: |
|||
.scrapy |
|||
|
|||
# Sphinx documentation |
|||
docs/_build/ |
|||
|
|||
# PyBuilder |
|||
.pybuilder/ |
|||
target/ |
|||
|
|||
# Jupyter Notebook |
|||
.ipynb_checkpoints |
|||
|
|||
# IPython |
|||
profile_default/ |
|||
ipython_config.py |
|||
|
|||
# pyenv |
|||
# For a library or package, you might want to ignore these files since the code is |
|||
# intended to run in multiple environments; otherwise, check them in: |
|||
# .python-version |
|||
|
|||
# pipenv |
|||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. |
|||
# However, in case of collaboration, if having platform-specific dependencies or dependencies |
|||
# having no cross-platform support, pipenv may install dependencies that don't work, or not |
|||
# install all needed dependencies. |
|||
#Pipfile.lock |
|||
|
|||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow |
|||
__pypackages__/ |
|||
|
|||
# Celery stuff |
|||
celerybeat-schedule |
|||
celerybeat.pid |
|||
|
|||
# SageMath parsed files |
|||
*.sage.py |
|||
|
|||
# Environments |
|||
.env |
|||
.venv |
|||
env/ |
|||
venv/ |
|||
ENV/ |
|||
env.bak/ |
|||
venv.bak/ |
|||
|
|||
# Spyder project settings |
|||
.spyderproject |
|||
.spyproject |
|||
|
|||
# Rope project settings |
|||
.ropeproject |
|||
|
|||
# mkdocs documentation |
|||
/site |
|||
|
|||
# mypy |
|||
.mypy_cache/ |
|||
.dmypy.json |
|||
dmypy.json |
|||
|
|||
# Pyre type checker |
|||
.pyre/ |
|||
|
|||
# pytype static type analyzer |
|||
.pytype/ |
|||
|
|||
# Cython debug symbols |
|||
cython_debug/ |
|||
|
|||
#文档和日志 |
|||
*.txt |
|||
|
|||
#飞桨 |
|||
/papernews_spider/Module/model_best/ |
|||
/papernews_spider/Module/ |
|||
|
|||
#包含test的测试文件 |
|||
*test* |
|||
/papernews_spider/myfirstPj/spiders/model_best/ |
|||
@ -0,0 +1,21 @@ |
|||
|
|||
## demo |
|||
|
|||
1. 安装Python 3.7 以上版本。 |
|||
2. 安装依赖:命令行执行 `pip install -r requirements.txt`。 |
|||
|
|||
|
|||
### 说明 |
|||
|
|||
+ Run.py是运行爬虫的方法,在SetSpiderName.py里修改要运行的爬虫项目
|||
+ 爬取纸业网资讯中心的印刷出版页面链接以及链接内详情内容。(PaperNewsSpider.py),链接为:http://www.paper.com.cn/news/nation.php?news_type=%D3%A1%CB%A2%B3%F6%B0%E6
|||
+ 文件News.txt是爬取到的资讯数据 |
|||
+ 爬取富宝资讯的纸厂调价栏目,记录的是废纸以及成品纸的价格变动(PaperPriceSpider.py),链接为:http://news.f139.com
|||
+ 已经写好网页去重,并写好可以复用的相关的去重方法和清洗方法 |
|||
|
|||
### 文件说明 |
|||
+ spider文件夹存放的是爬虫文件 |
|||
+ tools文件夹存放的是各种工具类,包含各种测试用的工具类。
|||
+ (现在已经关闭控制台输出) |
|||
|
|||
|
|||
@ -0,0 +1,13 @@ |
|||
# Define here the models for your scraped items |
|||
# |
|||
# See documentation in: |
|||
# https://docs.scrapy.org/en/latest/topics/items.html |
|||
# -*- coding: utf-8 -*- |
|||
import scrapy |
|||
|
|||
# Holds the data extracted by the spiders: once a page has been parsed,
# the values of interest are stored in this Item.
class MyfirstpjItem(scrapy.Item):
    """One scraped record: the extracted text plus its source URL."""

    # Declared Scrapy fields, populated by the spiders.
    text = scrapy.Field()
    url = scrapy.Field()
|||
@ -0,0 +1,103 @@ |
|||
# Define here the models for your spider middleware |
|||
# |
|||
# See documentation in: |
|||
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html |
|||
|
|||
from scrapy import signals |
|||
|
|||
# useful for handling different item types with a single interface |
|||
from itemadapter import is_item, ItemAdapter |
|||
|
|||
|
|||
class MyfirstpjSpiderMiddleware:
    """Default Scrapy spider middleware (project-template boilerplate).

    Every hook is a pure pass-through: responses, results and start
    requests flow onward unchanged. The only observable behaviour is
    the "Spider opened" log line emitted on the spider_opened signal.
    Methods not defined here are treated by Scrapy as no-ops.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy builds the middleware through this factory; wire the
        # spider_opened signal to the fresh instance before returning it.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Let the response continue into the spider untouched
        # (returning None means "keep processing").
        return None

    def process_spider_output(self, response, result, spider):
        # Forward whatever the spider produced, without modification.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # Returning None defers error handling to other middlewares /
        # the framework itself.
        return None

    def process_start_requests(self, start_requests, spider):
        # Hand the start requests onward unchanged (requests only, no
        # items, per the Scrapy contract).
        yield from start_requests

    def spider_opened(self, spider):
        # Log once when the spider starts up.
        spider.logger.info('Spider opened: %s' % spider.name)
|||
|
|||
|
|||
class MyfirstpjDownloaderMiddleware:
    """Default Scrapy downloader middleware (project-template boilerplate).

    All hooks are pass-throughs: requests continue downloading,
    responses are returned as-is, and exceptions fall through to the
    next handler. Only the spider_opened log line is observable.
    Methods not defined here are treated by Scrapy as no-ops.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy; attach the spider_opened signal to the
        # new instance.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        # None -> let the request proceed through the download chain.
        # (Other legal returns: a Response, a Request, or raising
        # IgnoreRequest to trigger process_exception handlers.)
        return None

    def process_response(self, request, response, spider):
        # Pass the downloaded response through unchanged.
        return response

    def process_exception(self, request, exception, spider):
        # None -> continue processing this exception in later
        # middlewares (a Response/Request return would stop the chain).
        return None

    def spider_opened(self, spider):
        # Log once when the spider starts up.
        spider.logger.info('Spider opened: %s' % spider.name)
|||
@ -0,0 +1,25 @@ |
|||
# Define your item pipelines here |
|||
# |
|||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting |
|||
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html |
|||
|
|||
# -*- coding: utf-8 -*- |
|||
# useful for handling different item types with a single interface |
|||
import requests |
|||
from itemadapter import ItemAdapter |
|||
|
|||
|
|||
# Pipeline stage (its position in the chain is set by the priority
# number in ITEM_PIPELINES, see settings.py).
class MyfirstpjPipeline:
    """Item pipeline that currently just echoes each scraped item."""

    def process_item(self, item, spider):
        """Print the item and pass it to the next pipeline stage.

        :param item: the scraped item (dict-like).
        :param spider: the spider that produced the item (unused).
        :return: the item, unchanged.

        Scrapy requires process_item to return the item (or raise
        DropItem); the original returned None, which would silently
        drop every item for any pipeline stage installed after this
        one. Dead commented-out file-writing code was removed.
        """
        print(item)
        return item
|||
@ -0,0 +1,96 @@ |
|||
# Scrapy settings for myfirstPj project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'myfirstPj'

SPIDER_MODULES = ['myfirstPj.spiders']
NEWSPIDER_MODULE = 'myfirstPj.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# NOTE(review): this value is duplicated in DEFAULT_REQUEST_HEADERS below;
# keep the two in sync (or drop one of them).
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1935.94 Safari/537.36'



# Obey robots.txt rules
# NOTE(review): robots.txt is deliberately ignored here - confirm this is
# acceptable for the target sites.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# Scrapy's cookie middleware is off; the session cookie below is sent
# manually through DEFAULT_REQUEST_HEADERS instead.
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# NOTE(review): the hard-coded 'Cookie' looks like a captured f139.com
# session (JSESSIONID + tracking cookies) that will expire - it should be
# refreshed or obtained at runtime rather than committed here.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1935.94 Safari/537.36',
    # Older captured session, kept for reference:
    # 'Cookie': "'_qquc': '6d2af3823bb4829a95200dba06ccb0265939e590873e5cef7edcafe1c96a755abf5e2403097381fa2d9a6c4604d69e51fc7914acc029fb5033a0475182f48025f054a58ca304d4f0925129c937cb6935540e9c8a32c823850e2a4fd1b9e6a7081f424d32347fd2822284431a74dec2c47cf56d87c11ed27ae08743d556ec1fbf41b4668dbfd6df049246d413308d16aed327f1420253934934bbb062de14706171347d330ba71e632c2d6a89b62e833cd2fac9e3fc13e07e94c47dbc159d7fed1db22e3274c3e3f940651d83de34fc405f741b3f69aac578d05fe26961e0125531c4fcb34a62af3e7a288d862f6eb34803c9e144a1661d0f8fc78ef8b87f3bf7ae89672f4ff196aadc60a8eae6483bae2ed065d851f447fc8f9d16bb79a0bd1ffb36ce652538c792e7c05235526d8d2eceed0ab7823ddf4076d2dfe3efa02b5b8f9d16bb79a0bd1ffb36ce652538c792e7c05235526d8d2eceed0ab7823ddf40f32e52a595d5e141717f6417f948aaa279d001d13b7a2bee2460d1d835ce38d700864368eab8f2b10f31642b295093604226def5e00d3d6a929c2e4596344032166b1741ede12384d1e9263bfd40239651284929d15a1aae886b7cf155fbd493', 'Hm_lvt_e11e5fa7b1c17369dacfb3f063d64cae': '1665814075', 'JSESSIONID': '12070D5B8A0173C0509273A9FD2060C5', 'Hm_lpvt_e11e5fa7b1c17369dacfb3f063d64cae': '1666073527'"
    'Cookie': "'JSESSIONID=2E161C24EE80617B11702B4E76A42FF0; _qquc=6d2af3823bb4829a95200dba06ccb0265939e590873e5cef7edcafe1c96a755abf5e2403097381fa2d9a6c4604d69e51fc7914acc029fb5033a0475182f48025f054a58ca304d4f0925129c937cb693595284cd9e2fd46f4ee81e2dc73caad231f424d32347fd2822284431a74dec2c4b83f1b9d8786ebd0d7365bf0935405158f9d16bb79a0bd1f25565e5fdaa4fefb8026fd46b9c3cb6230301904f42b70dac9ff77c32d9228079178f395dd41708e28f30f4223948f25c30255de9bdf9210c2c95a92b6f80aff8ab6cd29261c19c181d4ecb97a9c5e4841b4668dbfd6df042c4e113b1f49a2735a11313cd1738fec5bcb7e4f0ae6ab92005e478fba8f38efe561ea8de8abb858e98715ccff56e8a9b336d1c71d4222cfd4067afd041d5b6002351d15ba1cc3738e7b731a999e72d4eac3bf864dc7c4e26062333f8df1e661d4067afd041d5b6002351d15ba1cc3738e7b731a999e72d4eac3bf864dc7c4e2ed41a28a5d5b372fd4e3cd2cdd4ab3e976652525878b2ab6c6cf81d18a62f4ef6a0d294378ea2f7c792744feca70155430d053014964fed2194a1d90a161c73c6f7567f12bdd69a2d9b81824ced2ccba099beefbb90aee3433a598ff7d0d05e8; Hm_lvt_e11e5fa7b1c17369dacfb3f063d64cae=1649303831,1649383836,1650452961; Hm_lpvt_e11e5fa7b1c17369dacfb3f063d64cae=1650453289'"
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'myfirstPj.middlewares.MyfirstpjSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'myfirstPj.middlewares.MyfirstpjDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'myfirstPj.pipelines.MyfirstpjPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


# Exported feeds are written as UTF-8 (keeps Chinese text readable).
FEED_EXPORT_ENCODING='utf-8'
|||
@ -0,0 +1,25 @@ |
|||
import scrapy |
|||
|
|||
import papernews_spider |
|||
from papernews_spider.myfirstPj.spiders import Tools |
|||
|
|||
|
|||
# 正文爬虫(测试用) |
|||
# import PapernewsSpider |
|||
|
|||
|
|||
class NewscontentSpider(scrapy.Spider):
    """Test spider: dumps article body text from the paper.com.cn
    front page into News.txt (正文爬虫, 测试用).
    """

    name = 'NewsContent'
    allowed_domains = ['www.paper.com.cn']
    start_urls = ['http://www.paper.com.cn/']

    def parse(self, response):
        """Extract <b>/<p> text from the page and append it to News.txt.

        Fixes three defects of the original version:
        - a class-level open('url.txt', 'a') leaked a file handle the
          moment the module was imported (removed);
        - parse() opened a second 'url.txt' handle that was never closed
          (removed);
        - the *file object* (opened in append mode, which iterates as
          empty) was passed to Tools.write_txt instead of the scraped
          text, so nothing was ever written. The scraped list is now
          written instead.
        """
        # Article body text: bold headings plus paragraph text.
        lists = response.xpath('//b/text() | //p/text()').extract()
        # lists = response.xpath("//a[contains(@target, '_blank')]/text()").extract()

        # Append the scraped text to News.txt (Tools methods hold no
        # state, so a throwaway instance is fine).
        Tools().write_txt("News.txt", lists)
|||
@ -0,0 +1,74 @@ |
|||
# -*- coding: utf-8 -*- |
|||
import datetime |
|||
|
|||
import scrapy |
|||
from scrapy import Request |
|||
import oss2 |
|||
import os |
|||
|
|||
from scrapy import Selector |
|||
|
|||
from papernews_spider.myfirstPj.items import MyfirstpjItem |
|||
from papernews_spider.myfirstPj.spiders import Tools |
|||
from papernews_spider.myfirstPj import settings |
|||
|
|||
|
|||
# 爬取纸业网咨询中心的印刷出版页面 |
|||
# Crawls the domestic-news listing of the paper.com.cn information centre.
class PaperNewsSpider(scrapy.Spider):
    """Crawl the domestic-news listing of paper.com.cn: collect the
    article links from the listing page, then fetch and store each
    article's text (and push it to Aliyun OSS).
    """

    # Spider name (used by "scrapy crawl PaperNews").
    name = 'PaperNews'
    # Restrict crawling to the paper.com.cn host.
    allowed_domains = ['www.paper.com.cn']

    # Domestic-news listing page (GBK-escaped query string).
    start_urls = ['http://www.paper.com.cn/news/nation.php?news_type=%B9%FA%C4%DA%D7%CA%D1%B6']  # 07

    def parse(self, response):
        """Collect article links, record them in newsUrl.txt and request
        every article page for parse_second.

        The original walked the list backwards with two manual
        while-loops whose first body executed unconditionally, so an
        empty listing crashed with IndexError; building the list with a
        comprehension and iterating with reversed() preserves the
        original (reverse) request order while handling the empty case.
        """
        # Relative article links in the listing table.
        lists = response.xpath('//td[@width="85%"]/a/@href').extract()

        # Prefix the site domain onto every relative link.
        newlists = ['http://www.paper.com.cn%s' % href for href in lists]

        tools = Tools()
        # Append the harvested links to newsUrl.txt.
        tools.write_txt("newsUrl.txt", newlists)

        # Request every article page (reverse order, as before).
        for url in reversed(newlists):
            yield Request(url, callback=self.parse_second)

    def parse_second(self, response):
        """Extract the article body text, store it and upload to OSS."""
        # Article detail page: bold headings plus paragraph text.
        lists = response.xpath('//b/text() | //p/text()').extract()

        tools = Tools()
        # Append the article text to News.txt.
        tools.write_txt("News.txt", lists)
        print(lists)

        # Push the text to Aliyun OSS under today's date.
        # NOTE(review): the key prefix says "wastePaperPrice" although
        # this is the news spider - probably copied from
        # PaperPriceSpider; confirm the intended OSS layout.
        now = datetime.datetime.now()
        directory = "spider-information-data/paper.com.cn/wastePaperPrice/" + str(now.strftime("%Y-%m-%d")) + "/"
        tools.put(directory, lists)

    def start_requests(self):
        """Overridden entry point: issue a Request for start_urls[0].

        Headers and cookies are supplied via settings.py, so none are
        attached here; the response is delivered to parse().
        """
        yield Request(self.start_urls[0], callback=self.parse)
|||
@ -0,0 +1,82 @@ |
|||
import datetime |
|||
import json |
|||
|
|||
import scrapy |
|||
from scrapy import Request |
|||
from scrapy.http import FormRequest |
|||
|
|||
# 爬取富宝咨询的废纸与成品纸价格变动 |
|||
from papernews_spider.myfirstPj.spiders import Tools |
|||
|
|||
|
|||
class PaperPricesSpider(scrapy.Spider):
    """Scrape the paper-mill price-adjustment column of news.f139.com
    (Fubao): waste-paper and finished-paper price changes.

    Flow: parse() harvests article links from the listing page, keeps
    only the ones not seen before (diff against oldPriceUrl.txt), then
    requests each new article; parse_second() extracts, cleans and
    stores the article text and uploads it to Aliyun OSS.
    """

    # Spider name (used by "scrapy crawl PaperPriceSpider").
    name = 'PaperPriceSpider'

    # Restrict crawling to the Fubao news host.
    allowed_domains = ['news.f139.com']

    # Listing page of the price-adjustment column.
    start_urls = ['http://news.f139.com/list.do?channelID=94&categoryID=27']

    def parse(self, response):
        # Collect all article links (anchors that open in a new tab).
        lists = response.xpath('//a[@target="_blank"]/@href').extract()

        # Prefix the host onto every relative link.
        # NOTE(review): newlists aliases lists, so this rewrites lists
        # in place as well.
        newlists = lists
        for i in range(len(lists)):
            newlists[i] = 'http://news.f139.com%s' % (lists[i])

        # Append the harvested links to priceUrl.txt.
        # NOTE(review): Tools.__init__(self) evaluates to None, which is
        # then passed as Tools' self - this only works because the Tools
        # methods never touch self; Tools().write_txt(...) would be clearer.
        Tools.write_txt(Tools.__init__(self), "priceUrl.txt", newlists)

        # Keep only links not already recorded in oldPriceUrl.txt
        # (url_manage returns a *set*, so ordering below is arbitrary).
        newlists = Tools.url_manage(Tools.__init__(self), 'priceUrl.txt', 'oldPriceUrl.txt')

        # Strip the trailing newlines left by readlines() and the long
        # "=====" separator lines.
        newlists = Tools.cleantxt(Tools.__init__(self), '\n', '', newlists)
        newlists = Tools.cleantxt(Tools.__init__(self),
                                  '===============================================================================================',
                                  '', newlists)

        # print(newlists)
        # Record the newly seen links so they are skipped next run.
        Tools.write_txt(Tools.__init__(self), "oldPriceUrl.txt", newlists)

        # Request every new article page; after scheduling the last one,
        # emit a JSON status line on the console.
        for i in range(len(newlists)):
            yield Request(newlists[i], callback=self.parse_second)
            if i == (len(newlists) - 1):
                data = {"name": PaperPricesSpider.name, "url": newlists, "status": 200}
                print(json.dumps(data))

        # print(response.body.decode('utf-8', 'ignore'))

    def parse_second(self, response):
        # Article detail page: headline plus the <div id="zhengwen"> body.
        lists = response.xpath(
            "//h1/text() | //div[@id='zhengwen']/text() | //div[@id='zhengwen']//p[not(@*)]/text()").extract()
        # lists = response.xpath("normalize-space(.//h1/text() | //div[@id='zhengwen']/text() | //div[@id='zhengwen']//p[not(@*)]/text())").extract_first()

        # Rebrand: replace the source name "富宝" (Fubao) with "千鸟".
        lists = Tools.cleantxt(Tools.__init__(self), '富宝', '千鸟', lists)
        # Drop boilerplate/blank fragments by position.
        # NOTE(review): these hard-coded slices assume a fixed page
        # layout and will silently cut real content if f139.com changes
        # its markup - confirm against a live page.
        del lists[1:5]
        del lists[12:18]  # drop blank lines
        # Append the cleaned article text to priceText.txt.
        Tools.write_txt(Tools.__init__(self), "priceText.txt", lists)
        # Push the cleaned text to Aliyun OSS under today's date (optional).
        now = datetime.datetime.now()
        directory = "spider-information-data/fuBao/wastePaperPrice/" + str(now.strftime("%Y-%m-%d")) + "/"
        Tools.put(Tools.__init__(self), directory, lists)

    # Overridden request entry point.
    def start_requests(self):
        """Overridden entry point: issue a Request for start_urls[0].

        Headers and cookies are supplied via settings.py, so none are
        attached here; the response is delivered to parse().
        """
        yield Request(self.start_urls[0], callback=self.parse)
|||
@ -0,0 +1,8 @@ |
|||
import scrapy |
|||
# Template holding a spider's page configuration.
class UrllRoom():
    # Values intended to be pulled from a database query.
    # NOTE(review): the class name contains a typo ("Urll"); left
    # unchanged because renaming would break any importers.
    name = 'baidu'
    allowed_domains = ['www.baidu.com']
    start_urls = ['http://www.baidu.com/']
|||
|
|||
@ -0,0 +1,76 @@ |
|||
# This package will contain the spiders of your Scrapy project |
|||
# |
|||
# Please refer to the documentation for information on how to create and manage |
|||
# your spiders. |
|||
|
|||
import oss2 |
|||
from papernews_spider.Module.generateID import IdWorker |
|||
|
|||
|
|||
class Tools:
    """Shared helpers for the spiders: Aliyun OSS upload, text-file
    output, URL de-duplication and string cleaning.

    No method touches instance state, so a throwaway Tools() instance
    works everywhere.
    """

    def put(self, directory, datalist):
        """Concatenate *datalist* and upload it to Aliyun OSS as a new
        <snowflake-id>.txt object under *directory*.

        :param directory: OSS key prefix, e.g. "a/b/2022-01-01/".
        :param datalist: iterable of strings forming the object body.
        """
        # SECURITY(review): this AccessKey pair is hard-coded (and now
        # committed); it should be revoked and loaded from environment
        # variables / a RAM-user credential store instead.
        auth = oss2.Auth('LTAI5tRJKZvY8Switqrb3366', 'qAi7Hdrvvc7WLuvOr9n2g5PuBs3Vhn')
        endpoint = 'oss-cn-shenzhen.aliyuncs.com'  # public endpoint
        # endpoint = 'oss-cn-shenzhen-internal.aliyuncs.com'  # intranet endpoint
        bucket = oss2.Bucket(auth, endpoint, 'qn-data-lake')

        # Unique object name from the snowflake-style id generator.
        worker = IdWorker(1, 2, 0)
        object_name = str(worker.get_id()) + ".txt"
        # join() replaces the original quadratic string += loop.
        bucket.put_object(directory + object_name, ''.join(datalist))

    def write_txt(self, filename, listname):
        """Append every non-noise entry of *listname* to *filename*
        (UTF-8), one entry per line.

        :param filename: output file, opened in append mode.
        :param listname: iterable of strings (e.g. an XPath extract list).
        """
        # Whitespace-only fragments the scraped pages are known to emit.
        # NOTE(review): the exact space counts were transcribed from the
        # original equality chain - verify against real page output.
        noise = ('\r\n            ',
                 '\r\n        ',
                 '\r\n   ',
                 '\r\t',
                 '\r\r\n        \r\n            \r\n            ',
                 '\r\r\n        \r\n            ',
                 '\r\n \r\n ',
                 '\r\r\n    \r\n        \r\n        ')
        # "with" guarantees the handle is closed even if a write fails
        # (the original left it open on error).
        with open(filename, 'a', encoding='UTF-8') as file:
            for item in listname:
                if item not in noise:  # skip noise entries
                    file.write(item)
                    file.write("\n")

    def url_manage(self, newlist, oldurl):
        """Diff two URL files and return the links not yet crawled.

        :param newlist: path of the file holding freshly scraped links.
        :param oldurl: path of the file holding already-crawled links.
        :return: set of lines present in *newlist* but absent from
                 *oldurl*. NOTE: a set, so iteration order is arbitrary.
        """
        with open(newlist, 'r', encoding='utf-8') as newfile:
            new = newfile.readlines()
        with open(oldurl, 'r', encoding='utf-8') as oldfile:
            old = oldfile.readlines()
        # Set difference = links that have not been crawled before.
        return set(new).difference(old)

    def cleantxt(self, cleancontent, replacecontent, lists=()):
        """Return a new list with *cleancontent* replaced by
        *replacecontent* in every element of *lists*.

        Fixes the original, which returned from inside the first loop
        iteration (rebuilding the full list but confusingly), returned
        None for empty input, and whose default argument was the
        built-in *list* type itself; an empty input now yields [].

        :param cleancontent: substring to replace.
        :param replacecontent: replacement substring.
        :param lists: iterable of strings to clean.
        :return: new list of cleaned strings.
        """
        return [i.replace(cleancontent, replacecontent) for i in lists]
|||
@ -0,0 +1,6 @@ |
|||
# Entry point: launch the spider selected in SetSpiderName via the
# Scrapy command line, writing all log output to debug.log (this keeps
# the console quiet).
from scrapy import cmdline

import SetSpiderName
# Example of running a fixed spider instead:
# cmdline.execute('scrapy crawl baidu -s LOG_FILE=debug.log'.split())
cmdline.execute(('scrapy crawl %s -s LOG_FILE=debug.log' %(SetSpiderName.SetName.name)).split())
# Variant that logs to the console instead of debug.log:
# cmdline.execute(('scrapy crawl %s' %(SetSpiderName.SetName.name)).split())
|||
@ -0,0 +1,5 @@ |
|||
|
|||
|
|||
# Central switch for which spider tools/Run.py launches.
class SetName():
    # Name of the spider to run; swap the comment to run the
    # paper.com.cn news spider instead.
    # name = "PaperNews"
    name = "PaperPriceSpider"
|||
@ -0,0 +1,11 @@ |
|||
# Automatically created by: scrapy startproject |
|||
# |
|||
# For more information about the [deploy] section see: |
|||
# https://scrapyd.readthedocs.io/en/latest/deploy.html |
|||
|
|||
[settings] |
|||
default = myfirstPj.settings |
|||
|
|||
[deploy] |
|||
#url = http://localhost:6800/ |
|||
project = myfirstPj |
|||
Write
Preview
Loading…
Cancel
Save