commit
7c37472886
18 changed files with 699 additions and 0 deletions
Unified View
Diff Options
-
154.gitignore
-
21README.md
-
0papernews_spider/__init__.py
-
0papernews_spider/myfirstPj/__init__.py
-
13papernews_spider/myfirstPj/items.py
-
103papernews_spider/myfirstPj/middlewares.py
-
25papernews_spider/myfirstPj/pipelines.py
-
96papernews_spider/myfirstPj/settings.py
-
25papernews_spider/myfirstPj/spiders/NewsContent.py
-
74papernews_spider/myfirstPj/spiders/PaperNewsSpider.py
-
82papernews_spider/myfirstPj/spiders/PaperPriceSpider.py
-
8papernews_spider/myfirstPj/spiders/Url.py
-
76papernews_spider/myfirstPj/spiders/__init__.py
-
0papernews_spider/myfirstPj/tools/Api.py
-
6papernews_spider/myfirstPj/tools/Run.py
-
5papernews_spider/myfirstPj/tools/SetSpiderName.py
-
0papernews_spider/myfirstPj/tools/__init__.py
-
11papernews_spider/scrapy.cfg
@ -0,0 +1,154 @@ |
|||||
|
### IntelliJ IDEA ### |
||||
|
.idea |
||||
|
.gitback |
||||
|
|
||||
|
# ---> Python |
||||
|
# Byte-compiled / optimized / DLL files |
||||
|
__pycache__/ |
||||
|
*.py[cod] |
||||
|
*$py.class |
||||
|
|
||||
|
# C extensions |
||||
|
*.so |
||||
|
|
||||
|
# Distribution / packaging |
||||
|
.Python |
||||
|
build/ |
||||
|
develop-eggs/ |
||||
|
dist/ |
||||
|
downloads/ |
||||
|
eggs/ |
||||
|
.eggs/ |
||||
|
lib/ |
||||
|
lib64/ |
||||
|
parts/ |
||||
|
sdist/ |
||||
|
var/ |
||||
|
wheels/ |
||||
|
share/python-wheels/ |
||||
|
*.egg-info/ |
||||
|
.installed.cfg |
||||
|
*.egg |
||||
|
MANIFEST |
||||
|
|
||||
|
# PyInstaller |
||||
|
# Usually these files are written by a python script from a template |
||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it. |
||||
|
*.manifest |
||||
|
*.spec |
||||
|
|
||||
|
# Installer logs |
||||
|
pip-log.txt |
||||
|
pip-delete-this-directory.txt |
||||
|
|
||||
|
# Unit test / coverage reports |
||||
|
htmlcov/ |
||||
|
.tox/ |
||||
|
.nox/ |
||||
|
.coverage |
||||
|
.coverage.* |
||||
|
.cache |
||||
|
nosetests.xml |
||||
|
coverage.xml |
||||
|
*.cover |
||||
|
*.py,cover |
||||
|
.hypothesis/ |
||||
|
.pytest_cache/ |
||||
|
cover/ |
||||
|
|
||||
|
# Translations |
||||
|
*.mo |
||||
|
*.pot |
||||
|
|
||||
|
# Django stuff: |
||||
|
*.log |
||||
|
local_settings.py |
||||
|
db.sqlite3 |
||||
|
db.sqlite3-journal |
||||
|
|
||||
|
# Flask stuff: |
||||
|
instance/ |
||||
|
.webassets-cache |
||||
|
|
||||
|
# Scrapy stuff: |
||||
|
.scrapy |
||||
|
|
||||
|
# Sphinx documentation |
||||
|
docs/_build/ |
||||
|
|
||||
|
# PyBuilder |
||||
|
.pybuilder/ |
||||
|
target/ |
||||
|
|
||||
|
# Jupyter Notebook |
||||
|
.ipynb_checkpoints |
||||
|
|
||||
|
# IPython |
||||
|
profile_default/ |
||||
|
ipython_config.py |
||||
|
|
||||
|
# pyenv |
||||
|
# For a library or package, you might want to ignore these files since the code is |
||||
|
# intended to run in multiple environments; otherwise, check them in: |
||||
|
# .python-version |
||||
|
|
||||
|
# pipenv |
||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. |
||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies |
||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not |
||||
|
# install all needed dependencies. |
||||
|
#Pipfile.lock |
||||
|
|
||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow |
||||
|
__pypackages__/ |
||||
|
|
||||
|
# Celery stuff |
||||
|
celerybeat-schedule |
||||
|
celerybeat.pid |
||||
|
|
||||
|
# SageMath parsed files |
||||
|
*.sage.py |
||||
|
|
||||
|
# Environments |
||||
|
.env |
||||
|
.venv |
||||
|
env/ |
||||
|
venv/ |
||||
|
ENV/ |
||||
|
env.bak/ |
||||
|
venv.bak/ |
||||
|
|
||||
|
# Spyder project settings |
||||
|
.spyderproject |
||||
|
.spyproject |
||||
|
|
||||
|
# Rope project settings |
||||
|
.ropeproject |
||||
|
|
||||
|
# mkdocs documentation |
||||
|
/site |
||||
|
|
||||
|
# mypy |
||||
|
.mypy_cache/ |
||||
|
.dmypy.json |
||||
|
dmypy.json |
||||
|
|
||||
|
# Pyre type checker |
||||
|
.pyre/ |
||||
|
|
||||
|
# pytype static type analyzer |
||||
|
.pytype/ |
||||
|
|
||||
|
# Cython debug symbols |
||||
|
cython_debug/ |
||||
|
|
||||
|
#文档和日志 |
||||
|
*.txt |
!requirements.txt |
||||
|
|
||||
|
#飞桨 |
||||
|
/papernews_spider/Module/model_best/ |
||||
|
/papernews_spider/Module/ |
||||
|
|
||||
|
#包含test的测试文件 |
||||
|
*test* |
||||
|
/papernews_spider/myfirstPj/spiders/model_best/ |
||||
@ -0,0 +1,21 @@ |
|||||
|
|
||||
|
## demo |
||||
|
|
||||
|
1. 安装Python 3.7 以上版本。 |
||||
|
2. 安装依赖:命令行执行 `pip install -r requirements.txt`。 |
||||
|
|
||||
|
|
||||
|
### 说明 |
||||
|
|
||||
|
+ Run.py是运行爬虫的方法,在SetSpiderName.py里修改要运行的爬虫项目 |
||||
|
+ 爬取纸业网咨询中心的印刷出版页面链接以及链接内详情内容。(PaperNewsSpider.py),链接为:http://www.paper.com.cn/news/nation.php?news_type=%D3%A1%CB%A2%B3%F6%B0%E6 |
||||
|
+ 文件News.txt是爬取到的资讯数据 |
||||
|
+ 爬取富宝咨询的纸厂调价栏目,记录的是废纸以及成品纸的价格变动(PaperPriceSpider.py)链接为:http://news.f139.com |
||||
|
+ 已经写好网页去重,并写好可以复用的相关的去重方法和清洗方法 |
||||
|
|
||||
|
### 文件说明 |
||||
|
+ spider文件夹存放的是爬虫文件 |
||||
|
+ tools文件夹存放的是各种工具类,包含各种测试用的工具类, |
||||
|
+ (现在已经关闭控制台输出) |
||||
|
|
||||
|
|
||||
@ -0,0 +1,13 @@ |
|||||
|
# Define here the models for your scraped items |
||||
|
# |
||||
|
# See documentation in: |
||||
|
# https://docs.scrapy.org/en/latest/topics/items.html |
||||
|
# -*- coding: utf-8 -*- |
||||
|
import scrapy |
||||
|
|
||||
|
# Model for the data extracted by the spiders: each parsed page fills one
# of these items, which is then handed to the item pipeline.
class MyfirstpjItem(scrapy.Item):
    """Scraped-page container holding the article text and its source URL."""

    # define the fields for your item here like:
    # Extracted article/body text.
    text = scrapy.Field()
    # URL of the page the text came from.
    url = scrapy.Field()
||||
@ -0,0 +1,103 @@ |
|||||
|
# Define here the models for your spider middleware |
||||
|
# |
||||
|
# See documentation in: |
||||
|
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html |
||||
|
|
||||
|
from scrapy import signals |
||||
|
|
||||
|
# useful for handling different item types with a single interface |
||||
|
from itemadapter import is_item, ItemAdapter |
||||
|
|
||||
|
|
||||
|
class MyfirstpjSpiderMiddleware:
    """Default spider middleware generated by Scrapy.

    Every hook keeps the pass-through behaviour: responses, results and
    start requests are forwarded unchanged; only spider_opened logs.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy: build the middleware and wire signals.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Response on its way into the spider; None means "continue".
        return None

    def process_spider_output(self, response, result, spider):
        # Forward every request/item the spider produced, unchanged.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # No special handling: fall through to Scrapy's default behaviour.
        pass

    def process_start_requests(self, start_requests, spider):
        # Start requests pass straight through (must yield only requests).
        yield from start_requests

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
||||
|
|
||||
|
|
||||
|
class MyfirstpjDownloaderMiddleware:
    """Default downloader middleware generated by Scrapy.

    All hooks keep the pass-through behaviour: requests continue down the
    chain, responses come back untouched; only spider_opened logs.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy: build the middleware and wire signals.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        # Returning None lets the downloader chain keep processing.
        return None

    def process_response(self, request, response, spider):
        # Hand the downloaded response back unchanged.
        return response

    def process_exception(self, request, exception, spider):
        # Returning None lets other middlewares / Scrapy handle the error.
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
||||
@ -0,0 +1,25 @@ |
|||||
|
# Define your item pipelines here |
||||
|
# |
||||
|
# Don't forget to add your pipeline to the ITEM_PIPELINES setting |
||||
|
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html |
||||
|
|
||||
|
# -*- coding: utf-8 -*- |
||||
|
# useful for handling different item types with a single interface |
||||
|
import requests |
||||
|
from itemadapter import ItemAdapter |
||||
|
|
||||
|
|
||||
|
# Item pipeline; its position in the pipeline chain is set by the
# priority number in ITEM_PIPELINES in settings.py.
class MyfirstpjPipeline:
    """Minimal pipeline that echoes every scraped item to the console.

    (An earlier revision wrote items to items.txt with manual encoding
    handling; that code was removed in favour of plain console output.)
    """

    def process_item(self, item, spider):
        """Print ``item`` and return it unchanged.

        Returning the item is required by Scrapy's pipeline contract
        (return the item or raise DropItem); the original returned None,
        which would feed None into any later pipeline stage.
        """
        print(item)
        return item
||||
@ -0,0 +1,96 @@ |
|||||
|
# Scrapy settings for myfirstPj project |
||||
|
# |
||||
|
# For simplicity, this file contains only settings considered important or |
||||
|
# commonly used. You can find more settings consulting the documentation: |
||||
|
# |
||||
|
# https://docs.scrapy.org/en/latest/topics/settings.html |
||||
|
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html |
||||
|
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html |
||||
|
|
||||
|
BOT_NAME = 'myfirstPj' |
||||
|
|
||||
|
SPIDER_MODULES = ['myfirstPj.spiders'] |
||||
|
NEWSPIDER_MODULE = 'myfirstPj.spiders' |
||||
|
|
||||
|
|
||||
|
# Crawl responsibly by identifying yourself (and your website) on the user-agent |
||||
|
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1935.94 Safari/537.36' |
||||
|
|
||||
|
|
||||
|
|
||||
|
# Obey robots.txt rules |
||||
|
ROBOTSTXT_OBEY = False |
||||
|
|
||||
|
# Configure maximum concurrent requests performed by Scrapy (default: 16) |
||||
|
#CONCURRENT_REQUESTS = 32 |
||||
|
|
||||
|
# Configure a delay for requests for the same website (default: 0) |
||||
|
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay |
||||
|
# See also autothrottle settings and docs |
||||
|
#DOWNLOAD_DELAY = 3 |
||||
|
# The download delay setting will honor only one of: |
||||
|
#CONCURRENT_REQUESTS_PER_DOMAIN = 16 |
||||
|
#CONCURRENT_REQUESTS_PER_IP = 16 |
||||
|
|
||||
|
# Disable cookies (enabled by default) |
||||
|
COOKIES_ENABLED = False |
||||
|
|
||||
|
# Disable Telnet Console (enabled by default) |
||||
|
#TELNETCONSOLE_ENABLED = False |
||||
|
|
||||
|
# Override the default request headers: |
||||
|
DEFAULT_REQUEST_HEADERS = { |
||||
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', |
||||
|
'Accept-Language': 'en', |
||||
|
'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1935.94 Safari/537.36', |
||||
|
# 'Cookie': "'_qquc': '6d2af3823bb4829a95200dba06ccb0265939e590873e5cef7edcafe1c96a755abf5e2403097381fa2d9a6c4604d69e51fc7914acc029fb5033a0475182f48025f054a58ca304d4f0925129c937cb6935540e9c8a32c823850e2a4fd1b9e6a7081f424d32347fd2822284431a74dec2c47cf56d87c11ed27ae08743d556ec1fbf41b4668dbfd6df049246d413308d16aed327f1420253934934bbb062de14706171347d330ba71e632c2d6a89b62e833cd2fac9e3fc13e07e94c47dbc159d7fed1db22e3274c3e3f940651d83de34fc405f741b3f69aac578d05fe26961e0125531c4fcb34a62af3e7a288d862f6eb34803c9e144a1661d0f8fc78ef8b87f3bf7ae89672f4ff196aadc60a8eae6483bae2ed065d851f447fc8f9d16bb79a0bd1ffb36ce652538c792e7c05235526d8d2eceed0ab7823ddf4076d2dfe3efa02b5b8f9d16bb79a0bd1ffb36ce652538c792e7c05235526d8d2eceed0ab7823ddf40f32e52a595d5e141717f6417f948aaa279d001d13b7a2bee2460d1d835ce38d700864368eab8f2b10f31642b295093604226def5e00d3d6a929c2e4596344032166b1741ede12384d1e9263bfd40239651284929d15a1aae886b7cf155fbd493', 'Hm_lvt_e11e5fa7b1c17369dacfb3f063d64cae': '1665814075', 'JSESSIONID': '12070D5B8A0173C0509273A9FD2060C5', 'Hm_lpvt_e11e5fa7b1c17369dacfb3f063d64cae': '1666073527'" |
||||
|
'Cookie': "'JSESSIONID=2E161C24EE80617B11702B4E76A42FF0; _qquc=6d2af3823bb4829a95200dba06ccb0265939e590873e5cef7edcafe1c96a755abf5e2403097381fa2d9a6c4604d69e51fc7914acc029fb5033a0475182f48025f054a58ca304d4f0925129c937cb693595284cd9e2fd46f4ee81e2dc73caad231f424d32347fd2822284431a74dec2c4b83f1b9d8786ebd0d7365bf0935405158f9d16bb79a0bd1f25565e5fdaa4fefb8026fd46b9c3cb6230301904f42b70dac9ff77c32d9228079178f395dd41708e28f30f4223948f25c30255de9bdf9210c2c95a92b6f80aff8ab6cd29261c19c181d4ecb97a9c5e4841b4668dbfd6df042c4e113b1f49a2735a11313cd1738fec5bcb7e4f0ae6ab92005e478fba8f38efe561ea8de8abb858e98715ccff56e8a9b336d1c71d4222cfd4067afd041d5b6002351d15ba1cc3738e7b731a999e72d4eac3bf864dc7c4e26062333f8df1e661d4067afd041d5b6002351d15ba1cc3738e7b731a999e72d4eac3bf864dc7c4e2ed41a28a5d5b372fd4e3cd2cdd4ab3e976652525878b2ab6c6cf81d18a62f4ef6a0d294378ea2f7c792744feca70155430d053014964fed2194a1d90a161c73c6f7567f12bdd69a2d9b81824ced2ccba099beefbb90aee3433a598ff7d0d05e8; Hm_lvt_e11e5fa7b1c17369dacfb3f063d64cae=1649303831,1649383836,1650452961; Hm_lpvt_e11e5fa7b1c17369dacfb3f063d64cae=1650453289'" |
||||
|
} |
||||
|
|
||||
|
# Enable or disable spider middlewares |
||||
|
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html |
||||
|
#SPIDER_MIDDLEWARES = { |
||||
|
# 'myfirstPj.middlewares.MyfirstpjSpiderMiddleware': 543, |
||||
|
#} |
||||
|
|
||||
|
# Enable or disable downloader middlewares |
||||
|
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html |
||||
|
#DOWNLOADER_MIDDLEWARES = { |
||||
|
# 'myfirstPj.middlewares.MyfirstpjDownloaderMiddleware': 543, |
||||
|
#} |
||||
|
|
||||
|
# Enable or disable extensions |
||||
|
# See https://docs.scrapy.org/en/latest/topics/extensions.html |
||||
|
#EXTENSIONS = { |
||||
|
# 'scrapy.extensions.telnet.TelnetConsole': None, |
||||
|
#} |
||||
|
|
||||
|
# Configure item pipelines |
||||
|
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html |
||||
|
ITEM_PIPELINES = { |
||||
|
'myfirstPj.pipelines.MyfirstpjPipeline': 300, |
||||
|
} |
||||
|
|
||||
|
# Enable and configure the AutoThrottle extension (disabled by default) |
||||
|
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html |
||||
|
#AUTOTHROTTLE_ENABLED = True |
||||
|
# The initial download delay |
||||
|
#AUTOTHROTTLE_START_DELAY = 5 |
||||
|
# The maximum download delay to be set in case of high latencies |
||||
|
#AUTOTHROTTLE_MAX_DELAY = 60 |
||||
|
# The average number of requests Scrapy should be sending in parallel to |
||||
|
# each remote server |
||||
|
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 |
||||
|
# Enable showing throttling stats for every response received: |
||||
|
#AUTOTHROTTLE_DEBUG = False |
||||
|
|
||||
|
# Enable and configure HTTP caching (disabled by default) |
||||
|
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings |
||||
|
#HTTPCACHE_ENABLED = True |
||||
|
#HTTPCACHE_EXPIRATION_SECS = 0 |
||||
|
#HTTPCACHE_DIR = 'httpcache' |
||||
|
#HTTPCACHE_IGNORE_HTTP_CODES = [] |
||||
|
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' |
||||
|
|
||||
|
|
||||
|
FEED_EXPORT_ENCODING='utf-8' |
||||
@ -0,0 +1,25 @@ |
|||||
|
import scrapy |
||||
|
|
||||
|
import papernews_spider |
||||
|
from papernews_spider.myfirstPj.spiders import Tools |
||||
|
|
||||
|
|
||||
|
# Article-content spider (used for testing detail-page extraction).
# import PapernewsSpider


class NewscontentSpider(scrapy.Spider):
    # Spider name used by `scrapy crawl NewsContent`.
    name = 'NewsContent'
    allowed_domains = ['www.paper.com.cn']
    # NOTE(review): opened at class-definition time and never closed --
    # this leaks a file handle on import, and the attribute is shadowed
    # by the local `urllist` inside parse(); consider removing it.
    urllist = open('url.txt', 'a', encoding='utf-8')
    start_urls = ['http://www.paper.com.cn/']

    def parse(self, response):
        # Article detail page: bold headings plus paragraph text.
        lists = response.xpath('//b/text() | //p/text()').extract()
        # lists = response.xpath("//a[contains(@target, '_blank')]/text()").extract()
        # Open the saved-URL file.
        # NOTE(review): opened in append mode, so iterating it below
        # yields no lines, the handle is never closed, and the extracted
        # `lists` above is never used -- looks like leftover test code.
        urllist = open("url.txt", 'a', encoding='utf-8')

        # Append to News.txt (Tools.__init__ returns None, which
        # write_txt accepts because it never uses its `self` argument).
        Tools.write_txt(Tools.__init__(self), "News.txt", urllist)
||||
@ -0,0 +1,74 @@ |
|||||
|
# -*- coding: utf-8 -*- |
||||
|
import datetime |
||||
|
|
||||
|
import scrapy |
||||
|
from scrapy import Request |
||||
|
import oss2 |
||||
|
import os |
||||
|
|
||||
|
from scrapy import Selector |
||||
|
|
||||
|
from papernews_spider.myfirstPj.items import MyfirstpjItem |
||||
|
from papernews_spider.myfirstPj.spiders import Tools |
||||
|
from papernews_spider.myfirstPj import settings |
||||
|
|
||||
|
|
||||
|
# Crawls the print/publishing pages of the paper.com.cn news centre.
class PaperNewsSpider(scrapy.Spider):
    """Scrapes article links from the domestic-news listing page, then
    fetches each article and stores / uploads its text."""

    # Spider name used by `scrapy crawl PaperNews`.
    name = 'PaperNews'
    # Restrict crawling to the paper.com.cn domain.
    allowed_domains = ['www.paper.com.cn']

    # Domestic-news listing page (news_type is GB2312 percent-encoded).
    start_urls = ['http://www.paper.com.cn/news/nation.php?news_type=%B9%FA%C4%DA%D7%CA%D1%B6']  # 07

    # Main parse callback for the listing page.
    def parse(self, response):

        # Collect the article links from the listing table.
        lists = response.xpath('//td[@width="85%"]/a/@href').extract()

        # Prefix every relative link with the site domain, walking the
        # list from the last index down to 0.
        # NOTE(review): `newlists = lists` aliases (does not copy) the
        # list, and when `lists` is empty, `newlists[-1]` raises
        # IndexError before the loop can terminate.
        lists_num = len(lists) - 1
        newlists = lists
        while True:
            newlists[lists_num] = 'http://www.paper.com.cn%s' % (lists[lists_num])
            lists_num = lists_num - 1
            if lists_num < 0:
                # Reset the counter for the second loop below, then stop.
                lists_num = len(lists) - 1
                break

        # Append the absolute URLs to newsUrl.txt (Tools.__init__
        # returns None; write_txt never uses its `self` argument).
        Tools.write_txt(Tools.__init__(self), "newsUrl.txt", newlists)

        # Crawl every page collected in newlists (again from the last
        # index down to 0, resetting the counter when done).
        while True:
            yield Request(newlists[lists_num], callback=self.parse_second)
            lists_num = lists_num - 1
            if lists_num < 0:
                lists_num = len(lists) - 1
                break

    # Callback for the second-level (article detail) pages.
    def parse_second(self, response):
        # Article detail: bold headings plus paragraph text.
        lists = response.xpath('//b/text() | //p/text()').extract()

        # Append the extracted text to News.txt.
        Tools.write_txt(Tools.__init__(self), "News.txt", lists)
        print(lists)
        # Push the text to Aliyun OSS under a per-day directory.
        now = datetime.datetime.now()
        directory = "spider-information-data/paper.com.cn/wastePaperPrice/" + str(now.strftime("%Y-%m-%d")) + "/"
        Tools.put(Tools.__init__(self), directory, lists)

    # Override the default start_requests.
    def start_requests(self):
        """Issue the initial request for start_urls[0].

        Headers and cookies were moved out of this call and are
        configured in settings.py instead.
        """
        yield Request(self.start_urls[0], callback=self.parse)
||||
@ -0,0 +1,82 @@ |
|||||
|
import datetime |
||||
|
import json |
||||
|
|
||||
|
import scrapy |
||||
|
from scrapy import Request |
||||
|
from scrapy.http import FormRequest |
||||
|
|
||||
|
# 爬取富宝咨询的废纸与成品纸价格变动 |
||||
|
from papernews_spider.myfirstPj.spiders import Tools |
||||
|
|
||||
|
|
||||
|
class PaperPricesSpider(scrapy.Spider):
    """Crawls the paper-mill price-adjustment column of news.f139.com
    (waste-paper and finished-paper price changes), de-duplicates the
    article URLs, and stores / uploads the cleaned article text."""

    # Spider name used by `scrapy crawl PaperPriceSpider`.
    name = 'PaperPriceSpider'

    # Restrict crawling to the f139 news domain.
    allowed_domains = ['news.f139.com']

    # Listing page of the price-adjustment column.
    start_urls = ['http://news.f139.com/list.do?channelID=94&categoryID=27']

    def parse(self, response):
        # Collect the article links from the listing page.
        lists = response.xpath('//a[@target="_blank"]/@href').extract()

        # Prefix every relative link with the site domain.
        # NOTE(review): `newlists = lists` aliases the same list object.

        newlists = lists
        for i in range(len(lists)):
            newlists[i] = 'http://news.f139.com%s' % (lists[i])

        # Append the absolute URLs to priceUrl.txt (Tools.__init__
        # returns None; the helpers never use their `self` argument).
        Tools.write_txt(Tools.__init__(self), "priceUrl.txt", newlists)

        # Keep only URLs not seen before (set difference of the two
        # files).  NOTE(review): url_manage returns a *set*, so the
        # iteration order further down is not deterministic.
        newlists = Tools.url_manage(Tools.__init__(self), 'priceUrl.txt', 'oldPriceUrl.txt')

        # Strip newline characters and the long separator lines.
        newlists = Tools.cleantxt(Tools.__init__(self), '\n', '', newlists)
        newlists = Tools.cleantxt(Tools.__init__(self),
                                  '===============================================================================================',
                                  '', newlists)

        # print(newlists)
        # Record the processed URLs so they are skipped on the next run.
        Tools.write_txt(Tools.__init__(self), "oldPriceUrl.txt", newlists)

        # Crawl every not-yet-seen page; after scheduling the last one,
        # emit a JSON status line on stdout.
        for i in range(len(newlists)):
            yield Request(newlists[i], callback=self.parse_second)
            if i == (len(newlists) - 1):
                data = {"name": PaperPricesSpider.name, "url": newlists, "status": 200}
                print(json.dumps(data))

        # print(response.body.decode('utf-8', 'ignore'))

    # Callback for the second-level (article detail) pages.
    def parse_second(self, response):
        # Article detail: headline plus body paragraphs.
        lists = response.xpath(
            "//h1/text() | //div[@id='zhengwen']/text() | //div[@id='zhengwen']//p[not(@*)]/text()").extract()
        # lists = response.xpath("normalize-space(.//h1/text() | //div[@id='zhengwen']/text() | //div[@id='zhengwen']//p[not(@*)]/text())").extract_first()

        # Rebrand the source name, then drop fixed header/blank rows.
        # NOTE(review): the del-slice offsets assume a fixed page
        # layout; confirm they still match the live site.
        lists = Tools.cleantxt(Tools.__init__(self), '富宝', '千鸟', lists)  # replace the "Fubao" brand with "Qianniao"
        del lists[1:5]
        del lists[12:18]  # delete blank rows
        Tools.write_txt(Tools.__init__(self), "priceText.txt", lists)
        # Push the text to Aliyun OSS under a per-day directory (optional).
        now = datetime.datetime.now()
        directory = "spider-information-data/fuBao/wastePaperPrice/" + str(now.strftime("%Y-%m-%d")) + "/"
        Tools.put(Tools.__init__(self), directory, lists)

    # Override the default start_requests.
    def start_requests(self):
        """Issue the initial request for start_urls[0].

        Headers and cookies were moved out of this call and are
        configured in settings.py instead.
        """
        yield Request(self.start_urls[0], callback=self.parse)
||||
@ -0,0 +1,8 @@ |
|||||
|
import scrapy |
||||
|
# Target-site configuration for a spider.
class UrllRoom():
    """Holds the per-site crawl settings, mirroring the attribute names
    Scrapy spiders expect (name, allowed_domains, start_urls)."""

    # Record to look up (e.g. in the database) for this crawl.
    name = 'baidu'
    # Domain the crawl is restricted to.
    allowed_domains = ['www.baidu.com']
    # Entry URL for the crawl.
    start_urls = ['http://www.baidu.com/']
||||
|
|
||||
@ -0,0 +1,76 @@ |
|||||
|
# This package will contain the spiders of your Scrapy project |
||||
|
# |
||||
|
# Please refer to the documentation for information on how to create and manage |
||||
|
# your spiders. |
||||
|
|
||||
|
import oss2 |
||||
|
from papernews_spider.Module.generateID import IdWorker |
||||
|
|
||||
|
|
||||
|
class Tools:
    """Shared helpers used by the spiders: Aliyun OSS upload, text-file
    output, URL de-duplication and string cleaning.

    Callers throughout this project invoke these as
    ``Tools.method(Tools.__init__(self), ...)``, i.e. with ``self=None``;
    no method reads ``self``, so that calling style stays compatible.
    """

    # Whitespace-only filler strings scraped from the pages that must
    # not be written to the output files.
    # NOTE(review): the exact space counts were recovered from a
    # whitespace-collapsed source -- verify against real scraped data.
    _NOISE = {
        '\r\n ',
        '\r\t',
        '\r\r\n \r\n \r\n ',
        '\r\r\n \r\n ',
        '\r\n \r\n ',
    }

    def put(self, directory, datalist):
        """Concatenate ``datalist`` and upload it as one ``<id>.txt``
        object under ``directory`` in the ``qn-data-lake`` OSS bucket.

        SECURITY: the AccessKey pair below is hard-coded in version
        control -- rotate it and load credentials from the environment
        or a RAM role instead of embedding them in source.
        """
        auth = oss2.Auth('LTAI5tRJKZvY8Switqrb3366', 'qAi7Hdrvvc7WLuvOr9n2g5PuBs3Vhn')
        # Region endpoint of the bucket (Shenzhen, public network).
        endpoint = 'oss-cn-shenzhen.aliyuncs.com'
        # endpoint = 'oss-cn-shenzhen-internal.aliyuncs.com'  # intranet
        bucket = oss2.Bucket(auth, endpoint, 'qn-data-lake')

        # Snowflake-style ID keeps each uploaded object name unique.
        worker = IdWorker(1, 2, 0)
        object_name = str(worker.get_id()) + ".txt"
        # str.join is linear, unlike the original quadratic `+=` loop.
        bucket.put_object(directory + object_name, ''.join(datalist))

    def write_txt(self, filename, listname):
        """Append every non-filler entry of ``listname`` to ``filename``
        (UTF-8), one entry per line.

        The context manager guarantees the handle is closed even if a
        write fails (the original left the file open on error).
        """
        with open(filename, 'a', encoding='UTF-8') as out:
            for item in listname:
                if item not in Tools._NOISE:  # drop whitespace filler
                    out.write(item)
                    out.write("\n")

    def url_manage(self, newlist, oldurl):
        """Return the lines present in file ``newlist`` but absent from
        file ``oldurl`` -- i.e. the URLs not crawled yet.

        :param newlist: path of the file with freshly collected URLs
        :param oldurl: path of the file with already-crawled URLs
        :return: a ``set`` of not-yet-crawled lines (newlines included)
        """
        with open(newlist, 'r', encoding='utf-8') as newfile:
            new = newfile.readlines()
        with open(oldurl, 'r', encoding='utf-8') as oldfile:
            old = oldfile.readlines()
        # Set difference of the two files' lines.
        return set(new).difference(old)

    def cleantxt(self, cleancontent, replacecontent, lists=list):
        """Return a new list with ``cleancontent`` replaced by
        ``replacecontent`` in every entry of ``lists``.

        Fix: the original rebuilt the whole list once per element
        (O(n^2)) and raised an UnboundLocalError on empty input; one
        comprehension handles both.  The odd ``lists=list`` default is
        kept only for signature compatibility.
        """
        return [entry.replace(cleancontent, replacecontent) for entry in lists]
||||
@ -0,0 +1,6 @@ |
|||||
|
# Entry point: launch the spider selected in SetSpiderName through the
# Scrapy command line, writing the log to debug.log.
from scrapy import cmdline

import SetSpiderName

# cmdline.execute('s crapy crawl baidu -s LOG_FILE=debug.log'.split())
command = 'scrapy crawl %s -s LOG_FILE=debug.log' % (SetSpiderName.SetName.name)
cmdline.execute(command.split())
# cmdline.execute(('scrapy crawl %s' %(SetSpiderName.SetName.name)).split())
||||
@ -0,0 +1,5 @@ |
|||||
|
|
||||
|
|
||||
|
class SetName():
    """Single place to choose which spider Run.py launches."""

    # name = "PaperNews"
    # Spider name handed to `scrapy crawl` by Run.py.
    name = "PaperPriceSpider"
||||
@ -0,0 +1,11 @@ |
|||||
|
# Automatically created by: scrapy startproject |
||||
|
# |
||||
|
# For more information about the [deploy] section see: |
||||
|
# https://scrapyd.readthedocs.io/en/latest/deploy.html |
||||
|
|
||||
|
[settings] |
||||
|
default = myfirstPj.settings |
||||
|
|
||||
|
[deploy] |
||||
|
#url = http://localhost:6800/ |
||||
|
project = myfirstPj |
||||
Write
Preview
Loading…
Cancel
Save