commit 7c3747288653c43d19d10fbb68317bba92ec2bc2 Author: wanghui Date: Mon Mar 6 17:27:10 2023 +0800 初始化 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a2b320d --- /dev/null +++ b/.gitignore @@ -0,0 +1,154 @@ +### IntelliJ IDEA ### +.idea +.gitback + +# ---> Python +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +#文档和日志 +*.txt + +#飞桨 +/papernews_spider/Module/model_best/ +/papernews_spider/Module/ + +#包含test的测试文件 +*test* +/papernews_spider/myfirstPj/spiders/model_best/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..69154f5 --- /dev/null +++ b/README.md @@ -0,0 +1,21 @@ + +## demo + +1. 安装Python 3.7 以上版本。 +2. 安装依赖:命令行执行 `pip install -r requirements.txt`。 + + +### 说明 + ++ Run.py是运行爬虫的方法,在SetSpdierName.py里修改要运行的爬虫项目 ++ 爬取纸业网咨询中心的印刷出版页面链接以及链接内详情内容。(PapernewsSpider.py) ,链接接为:http://www.paper.com.cn/news/nation.php?news_type=%D3%A1%CB%A2%B3%F6%B0%E6 ++ 文件News.txt是爬取到的资讯数据 ++ 爬取富宝咨询的纸厂调价栏目,记录的是废纸以及成品纸的价格变动(PaperpriceSpider.py)链接为:http://news.f139.com ++ 已经写好网页去重,并写好可以复用的相关的去重方法和清洗方法 + +### 文件说明 ++ spider文件夹存放的是爬虫文件 ++ tools文件夹存放的是各种工具类,包含各种测试用的工具类, ++ (现在已经关闭控制台输出) + + diff --git a/papernews_spider/__init__.py b/papernews_spider/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/papernews_spider/myfirstPj/__init__.py b/papernews_spider/myfirstPj/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/papernews_spider/myfirstPj/items.py b/papernews_spider/myfirstPj/items.py new file mode 100644 index 0000000..b8be260 --- /dev/null +++ b/papernews_spider/myfirstPj/items.py @@ -0,0 +1,13 @@ +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html +# -*- coding: utf-8 -*- +import scrapy + +# 负责处理被spider提取出来的item。当页面被爬虫解析所需的数据存入Item +class 
class MyfirstpjItem(scrapy.Item):
    """Container for one scraped news entry.

    Fields are populated by the spiders and handed to the item pipeline.
    """
    text = scrapy.Field()  # extracted article text
    url = scrapy.Field()   # source page URL
class MyfirstpjSpiderMiddleware:
    """Spider middleware (Scrapy project template): every hook is a
    pass-through, so responses, results and start requests flow through
    unmodified."""

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy builds the middleware here and wires the spider_opened
        # signal so we can log when crawling starts.
        mw = cls()
        crawler.signals.connect(mw.spider_opened, signal=signals.spider_opened)
        return mw

    def process_spider_input(self, response, spider):
        # Responses enter the spider untouched.
        return None

    def process_spider_output(self, response, result, spider):
        # Forward every request/item the spider produced, unchanged.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # No custom handling: fall back to Scrapy's defaults.
        pass

    def process_start_requests(self, start_requests, spider):
        # Start requests are forwarded as-is (must yield requests only).
        yield from start_requests

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class MyfirstpjDownloaderMiddleware:
    """Downloader middleware (Scrapy project template): a no-op on every
    hook — requests and responses pass straight through."""

    @classmethod
    def from_crawler(cls, crawler):
        # Same wiring as the spider middleware above.
        mw = cls()
        crawler.signals.connect(mw.spider_opened, signal=signals.spider_opened)
        return mw

    def process_request(self, request, spider):
        # Returning None lets the request continue down the chain.
        return None

    def process_response(self, request, response, spider):
        # Responses are returned unchanged.
        return response

    def process_exception(self, request, exception, spider):
        # Returning None keeps Scrapy's default exception processing.
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
class MyfirstpjPipeline:
    """Item pipeline that currently just echoes each scraped item.

    Scrapy's pipeline contract requires process_item() to return the item
    (or raise DropItem); the previous version returned None implicitly,
    which would hand None to any pipeline configured after this one.
    """

    def process_item(self, item, spider):
        """Print the item and pass it on unchanged.

        :param item: the scraped item (dict-like).
        :param spider: the spider that produced it (unused).
        :return: the same item, so later pipelines receive it.
        """
        print(item)
        # Bug fix: the item must be returned per the pipeline contract.
        return item
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'myfirstPj' + +SPIDER_MODULES = ['myfirstPj.spiders'] +NEWSPIDER_MODULE = 'myfirstPj.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1935.94 Safari/537.36' + + + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +DEFAULT_REQUEST_HEADERS = { + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en', + 'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1935.94 Safari/537.36', + # 'Cookie': "'_qquc': 
'6d2af3823bb4829a95200dba06ccb0265939e590873e5cef7edcafe1c96a755abf5e2403097381fa2d9a6c4604d69e51fc7914acc029fb5033a0475182f48025f054a58ca304d4f0925129c937cb6935540e9c8a32c823850e2a4fd1b9e6a7081f424d32347fd2822284431a74dec2c47cf56d87c11ed27ae08743d556ec1fbf41b4668dbfd6df049246d413308d16aed327f1420253934934bbb062de14706171347d330ba71e632c2d6a89b62e833cd2fac9e3fc13e07e94c47dbc159d7fed1db22e3274c3e3f940651d83de34fc405f741b3f69aac578d05fe26961e0125531c4fcb34a62af3e7a288d862f6eb34803c9e144a1661d0f8fc78ef8b87f3bf7ae89672f4ff196aadc60a8eae6483bae2ed065d851f447fc8f9d16bb79a0bd1ffb36ce652538c792e7c05235526d8d2eceed0ab7823ddf4076d2dfe3efa02b5b8f9d16bb79a0bd1ffb36ce652538c792e7c05235526d8d2eceed0ab7823ddf40f32e52a595d5e141717f6417f948aaa279d001d13b7a2bee2460d1d835ce38d700864368eab8f2b10f31642b295093604226def5e00d3d6a929c2e4596344032166b1741ede12384d1e9263bfd40239651284929d15a1aae886b7cf155fbd493', 'Hm_lvt_e11e5fa7b1c17369dacfb3f063d64cae': '1665814075', 'JSESSIONID': '12070D5B8A0173C0509273A9FD2060C5', 'Hm_lpvt_e11e5fa7b1c17369dacfb3f063d64cae': '1666073527'" + 'Cookie': "'JSESSIONID=2E161C24EE80617B11702B4E76A42FF0; 
_qquc=6d2af3823bb4829a95200dba06ccb0265939e590873e5cef7edcafe1c96a755abf5e2403097381fa2d9a6c4604d69e51fc7914acc029fb5033a0475182f48025f054a58ca304d4f0925129c937cb693595284cd9e2fd46f4ee81e2dc73caad231f424d32347fd2822284431a74dec2c4b83f1b9d8786ebd0d7365bf0935405158f9d16bb79a0bd1f25565e5fdaa4fefb8026fd46b9c3cb6230301904f42b70dac9ff77c32d9228079178f395dd41708e28f30f4223948f25c30255de9bdf9210c2c95a92b6f80aff8ab6cd29261c19c181d4ecb97a9c5e4841b4668dbfd6df042c4e113b1f49a2735a11313cd1738fec5bcb7e4f0ae6ab92005e478fba8f38efe561ea8de8abb858e98715ccff56e8a9b336d1c71d4222cfd4067afd041d5b6002351d15ba1cc3738e7b731a999e72d4eac3bf864dc7c4e26062333f8df1e661d4067afd041d5b6002351d15ba1cc3738e7b731a999e72d4eac3bf864dc7c4e2ed41a28a5d5b372fd4e3cd2cdd4ab3e976652525878b2ab6c6cf81d18a62f4ef6a0d294378ea2f7c792744feca70155430d053014964fed2194a1d90a161c73c6f7567f12bdd69a2d9b81824ced2ccba099beefbb90aee3433a598ff7d0d05e8; Hm_lvt_e11e5fa7b1c17369dacfb3f063d64cae=1649303831,1649383836,1650452961; Hm_lpvt_e11e5fa7b1c17369dacfb3f063d64cae=1650453289'" +} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'myfirstPj.middlewares.MyfirstpjSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'myfirstPj.middlewares.MyfirstpjDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'myfirstPj.pipelines.MyfirstpjPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay 
class NewscontentSpider(scrapy.Spider):
    """Article-content spider for paper.com.cn (used for testing).

    Bug fixes vs. the original:
    - removed the class-level ``open('url.txt', 'a')`` which leaked a file
      handle (opened at import time, never closed);
    - ``parse`` previously passed a freshly opened append-mode file object
      (positioned at EOF, so iterating it yields nothing) to write_txt
      instead of the extracted text — nothing was ever written.
    """

    name = 'NewsContent'
    allowed_domains = ['www.paper.com.cn']
    start_urls = ['http://www.paper.com.cn/']

    def parse(self, response):
        """Extract headline (<b>) and paragraph text and append it to News.txt."""
        lists = response.xpath('//b/text() | //p/text()').extract()

        # write_txt ignores its `self` parameter, so None is passed
        # (the old `Tools.__init__(self)` call also evaluated to None).
        Tools.write_txt(None, "News.txt", lists)
class PaperNewsSpider(scrapy.Spider):
    """Crawls the domestic-news listing of paper.com.cn, saves every
    article's text locally and mirrors it to Aliyun OSS."""

    # 爬虫名字 / spider name
    name = 'PaperNews'
    # 爬虫域名 / allowed crawl domain
    allowed_domains = ['www.paper.com.cn']

    # Domestic news listing page (GB2312 percent-encoded query string).
    start_urls = ['http://www.paper.com.cn/news/nation.php?news_type=%B9%FA%C4%DA%D7%CA%D1%B6']  # 07

    def parse(self, response):
        """Collect article links from the listing page and follow each one."""
        # Relative article links inside the listing table.
        lists = response.xpath('//td[@width="85%"]/a/@href').extract()

        # Bug fix: the old index-juggling `while True` loops aliased the
        # list and raised IndexError when the page yielded no links; a
        # comprehension handles the empty case and reads clearly.
        newlists = ['http://www.paper.com.cn%s' % href for href in lists]

        # Append the absolute URLs to newsUrl.txt (write_txt ignores self).
        Tools.write_txt(None, "newsUrl.txt", newlists)

        # Follow every collected article link.
        for url in newlists:
            yield Request(url, callback=self.parse_second)

    def parse_second(self, response):
        """Extract the article body, persist it, and push it to OSS."""
        # Headline (<b>) and paragraph text of the detail page.
        lists = response.xpath('//b/text() | //p/text()').extract()

        Tools.write_txt(None, "News.txt", lists)
        print(lists)
        # Push today's batch to Aliyun OSS under a dated folder.
        now = datetime.datetime.now()
        directory = "spider-information-data/paper.com.cn/wastePaperPrice/" + str(now.strftime("%Y-%m-%d")) + "/"
        Tools.put(None, directory, lists)

    def start_requests(self):
        """Issue the initial request.

        Headers/cookies were moved to settings.py, so only the URL and
        callback are needed here.
        """
        yield Request(self.start_urls[0], callback=self.parse)
class PaperPricesSpider(scrapy.Spider):
    """Scrapes waste-paper / finished-paper price changes from the
    f139.com price-adjustment column."""

    # spider name
    name = 'PaperPriceSpider'
    # allowed crawl domain
    allowed_domains = ['news.f139.com']
    # price-adjustment listing page
    start_urls = ['http://news.f139.com/list.do?channelID=94&categoryID=27']

    def parse(self, response):
        """Collect article links, diff them against history, then follow
        only the previously unseen ones."""
        hrefs = response.xpath('//a[@target="_blank"]/@href').extract()

        # Make each relative link absolute (in place, same list object).
        absolute = hrefs
        for idx in range(len(hrefs)):
            absolute[idx] = 'http://news.f139.com%s' % (hrefs[idx])

        # Append this batch, then keep only the never-seen URLs
        # (Tools methods ignore `self`, so None is passed).
        Tools.write_txt(None, "priceUrl.txt", absolute)
        absolute = Tools.url_manage(None, 'priceUrl.txt', 'oldPriceUrl.txt')

        # Strip newlines and the long separator line left by the files.
        absolute = Tools.cleantxt(None, '\n', '', absolute)
        absolute = Tools.cleantxt(None,
                                  '===============================================================================================',
                                  '', absolute)

        # Record the freshly-seen URLs as handled.
        Tools.write_txt(None, "oldPriceUrl.txt", absolute)

        # Crawl each new page; after the last one, print a status record.
        total = len(absolute)
        for idx in range(total):
            yield Request(absolute[idx], callback=self.parse_second)
            if idx == (total - 1):
                data = {"name": PaperPricesSpider.name, "url": absolute, "status": 200}
                print(json.dumps(data))

    def parse_second(self, response):
        """Extract the article text, clean it, save it and push it to OSS."""
        lists = response.xpath(
            "//h1/text() | //div[@id='zhengwen']/text() | //div[@id='zhengwen']//p[not(@*)]/text()").extract()

        # Re-brand the source name, then drop known blank/boilerplate rows
        # (slice positions are tied to the page layout).
        lists = Tools.cleantxt(None, '富宝', '千鸟', lists)
        del lists[1:5]
        del lists[12:18]
        Tools.write_txt(None, "priceText.txt", lists)
        # Mirror today's batch to Aliyun OSS under a dated folder.
        now = datetime.datetime.now()
        directory = "spider-information-data/fuBao/wastePaperPrice/" + str(now.strftime("%Y-%m-%d")) + "/"
        Tools.put(None, directory, lists)

    def start_requests(self):
        """Send the first request; headers/cookies live in settings.py."""
        yield Request(self.start_urls[0], callback=self.parse)
class Tools:
    """Shared helpers for the spiders: OSS upload, txt persistence,
    URL de-duplication and simple text cleaning.

    NOTE: no method actually uses ``self``; callers historically pass
    ``Tools.__init__(self)`` (which evaluates to None), so these behave
    like static helpers.
    """

    def put(self, directory, datalist):
        """Upload the concatenated text of `datalist` to Aliyun OSS.

        :param directory: OSS key prefix, e.g. ".../2023-03-06/".
        :param datalist: iterable of text fragments to join and upload.
        """
        # SECURITY: hard-coded AccessKey credentials checked into source
        # control — these should be rotated and loaded from the
        # environment / a RAM role instead.
        auth = oss2.Auth('LTAI5tRJKZvY8Switqrb3366', 'qAi7Hdrvvc7WLuvOr9n2g5PuBs3Vhn')
        # Public endpoint for the Shenzhen region; switch to the
        # *-internal endpoint when running inside Aliyun.
        endpoint = 'oss-cn-shenzhen.aliyuncs.com'
        bucket = oss2.Bucket(auth, endpoint, 'qn-data-lake')

        # Snowflake-style id keeps object names unique.
        worker = IdWorker(1, 2, 0)
        object_name = str(worker.get_id()) + ".txt"
        # Bug fix: ''.join() is O(n) instead of the old quadratic
        # `data = data + item` concatenation loop.
        bucket.put_object(directory + object_name, ''.join(datalist))

    def write_txt(self, filename, listname):
        """Append each non-blank entry of `listname` to `filename`
        (UTF-8), one per line.

        Generalized: the old version blacklisted a handful of specific
        '\\r\\n'+spaces fragments; skipping anything that is pure
        whitespace covers all of those and every similar variant.
        """
        # `with` guarantees the handle is closed even if a write fails.
        with open(filename, 'a', encoding='UTF-8') as file:
            for item in listname:
                if item.strip():  # drop whitespace-only scraps
                    file.write(item)
                    file.write("\n")

    def url_manage(self, newlist, oldurl):
        """Return the links listed in file `newlist` that are absent
        from file `oldurl`.

        :param newlist: path of the file holding freshly scraped URLs.
        :param oldurl: path of the file holding already-crawled URLs.
        :return: set of not-yet-crawled lines (newline-terminated).
        """
        with open(newlist, 'r', encoding='utf-8') as newfile:
            new = newfile.readlines()
        with open(oldurl, 'r', encoding='utf-8') as oldfile:
            old = oldfile.readlines()
        # Set difference = URLs we have not seen before.
        return set(new).difference(old)

    def cleantxt(self, cleancontent, replacecontent, lists=()):
        """Return a list with `cleancontent` replaced by
        `replacecontent` in every element of `lists`.

        Bug fixes: the old implementation rebuilt the whole list once
        per element (O(n^2)) and raised NameError on empty input, and
        its default argument was the `list` *type*. One comprehension
        performs the same replacement in a single pass and yields []
        for empty input.
        """
        return [entry.replace(cleancontent, replacecontent) for entry in lists]
class SetName:
    """Selects which spider Run.py launches via `scrapy crawl <name>`."""

    # Swap the assignment below to run the paper-news spider instead.
    # name = "PaperNews"
    name = "PaperPriceSpider"