Browse Source

初始化

master
commit
7c37472886
18 changed files with 699 additions and 0 deletions
  1. 154
      .gitignore
  2. 21
      README.md
  3. 0
      papernews_spider/__init__.py
  4. 0
      papernews_spider/myfirstPj/__init__.py
  5. 13
      papernews_spider/myfirstPj/items.py
  6. 103
      papernews_spider/myfirstPj/middlewares.py
  7. 25
      papernews_spider/myfirstPj/pipelines.py
  8. 96
      papernews_spider/myfirstPj/settings.py
  9. 25
      papernews_spider/myfirstPj/spiders/NewsContent.py
  10. 74
      papernews_spider/myfirstPj/spiders/PaperNewsSpider.py
  11. 82
      papernews_spider/myfirstPj/spiders/PaperPriceSpider.py
  12. 8
      papernews_spider/myfirstPj/spiders/Url.py
  13. 76
      papernews_spider/myfirstPj/spiders/__init__.py
  14. 0
      papernews_spider/myfirstPj/tools/Api.py
  15. 6
      papernews_spider/myfirstPj/tools/Run.py
  16. 5
      papernews_spider/myfirstPj/tools/SetSpiderName.py
  17. 0
      papernews_spider/myfirstPj/tools/__init__.py
  18. 11
      papernews_spider/scrapy.cfg

154
.gitignore

@ -0,0 +1,154 @@
### IntelliJ IDEA ###
.idea
.gitback
# ---> Python
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
#文档和日志
*.txt
#飞桨
/papernews_spider/Module/model_best/
/papernews_spider/Module/
#包含test的测试文件
*test*
/papernews_spider/myfirstPj/spiders/model_best/

21
README.md

@ -0,0 +1,21 @@
## demo
1. 安装Python 3.7 以上版本。
2. 安装依赖:命令行执行 `pip install -r requirements.txt`
### 说明
+ Run.py是运行爬虫的方法,在SetSpiderName.py里修改要运行的爬虫项目
+ 爬取纸业网资讯中心的印刷出版页面链接以及链接内详情内容。(PaperNewsSpider.py) ,链接为:http://www.paper.com.cn/news/nation.php?news_type=%D3%A1%CB%A2%B3%F6%B0%E6
+ 文件News.txt是爬取到的资讯数据
+ 爬取富宝资讯的纸厂调价栏目,记录的是废纸以及成品纸的价格变动(PaperPriceSpider.py)链接为:http://news.f139.com
+ 已经写好网页去重,并写好可以复用的相关的去重方法和清洗方法
### 文件说明
+ spider文件夹存放的是爬虫文件
+ tools文件夹存放的是各种工具类,包含各种测试用的工具类,
+ (现在已经关闭控制台输出)

0
papernews_spider/__init__.py

0
papernews_spider/myfirstPj/__init__.py

13
papernews_spider/myfirstPj/items.py

@ -0,0 +1,13 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
# -*- coding: utf-8 -*-
import scrapy
# Item model: one record extracted by the spiders, handed to the item pipeline.
class MyfirstpjItem(scrapy.Item):
    """Container for a single scraped record: its text content and source URL."""

    text = scrapy.Field()
    url = scrapy.Field()

103
papernews_spider/myfirstPj/middlewares.py

@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class MyfirstpjSpiderMiddleware:
    """Pass-through spider middleware (Scrapy project template defaults).

    Every hook below forwards its input unchanged; Scrapy treats an
    undefined hook the same way, so this class only adds the
    spider_opened log line.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy; wires spider_opened onto the signal bus.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Runs for each response entering the spider; None means "continue".
        return None

    def process_spider_output(self, response, result, spider):
        # Forward every request/item produced by the spider, unmodified.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # Returning None lets other middlewares / default handling proceed.
        return None

    def process_start_requests(self, start_requests, spider):
        # Same pass-through as process_spider_output, but for the start
        # requests (no response is associated with these).
        yield from start_requests

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
class MyfirstpjDownloaderMiddleware:
    """Pass-through downloader middleware (Scrapy project template defaults).

    Requests and responses flow through untouched; only spider_opened adds
    a log line.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy; wires spider_opened onto the signal bus.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        # None = let the rest of the download chain handle the request.
        # (Could instead return a Response/Request or raise IgnoreRequest.)
        return None

    def process_response(self, request, response, spider):
        # Hand the downloaded response onward unchanged.
        return response

    def process_exception(self, request, exception, spider):
        # None = continue processing this exception in other middlewares.
        return None

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

25
papernews_spider/myfirstPj/pipelines.py

@ -0,0 +1,25 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# -*- coding: utf-8 -*-
# useful for handling different item types with a single interface
import requests
from itemadapter import ItemAdapter
# Item pipeline: receives every item the spiders yield (order set by
# ITEM_PIPELINES in settings.py).
class MyfirstpjPipeline:
    """Minimal pipeline that logs each scraped item to the console.

    The commented-out file-persistence code from the original was Python 2
    (``str.decode``) and has been removed.
    """

    def process_item(self, item, spider):
        """Print the item and pass it to the next pipeline stage.

        Bug fix: Scrapy requires process_item to return the item (or raise
        DropItem). The original returned None, which would feed None into
        any subsequently enabled pipeline and break feed exports.
        """
        print(item)
        return item

96
papernews_spider/myfirstPj/settings.py

@ -0,0 +1,96 @@
# Scrapy settings for myfirstPj project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'myfirstPj'
SPIDER_MODULES = ['myfirstPj.spiders']
NEWSPIDER_MODULE = 'myfirstPj.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# NOTE(review): USER_AGENT is duplicated in DEFAULT_REQUEST_HEADERS below;
# the header entry is the one actually sent with those default headers.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1935.94 Safari/537.36'
# Obey robots.txt rules
# NOTE(review): robots.txt is deliberately ignored — confirm the target
# sites tolerate this before scaling the crawl up.
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# Automatic cookie handling is off because a session cookie is injected
# manually via DEFAULT_REQUEST_HEADERS below.
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1935.94 Safari/537.36',
    # SECURITY/STALENESS NOTE(review): a hard-coded session cookie is committed
    # below. It will expire and grants whoever reads this repo the session —
    # rotate it and load it from configuration/environment instead.
    # 'Cookie': "'_qquc': '6d2af3823bb4829a95200dba06ccb0265939e590873e5cef7edcafe1c96a755abf5e2403097381fa2d9a6c4604d69e51fc7914acc029fb5033a0475182f48025f054a58ca304d4f0925129c937cb6935540e9c8a32c823850e2a4fd1b9e6a7081f424d32347fd2822284431a74dec2c47cf56d87c11ed27ae08743d556ec1fbf41b4668dbfd6df049246d413308d16aed327f1420253934934bbb062de14706171347d330ba71e632c2d6a89b62e833cd2fac9e3fc13e07e94c47dbc159d7fed1db22e3274c3e3f940651d83de34fc405f741b3f69aac578d05fe26961e0125531c4fcb34a62af3e7a288d862f6eb34803c9e144a1661d0f8fc78ef8b87f3bf7ae89672f4ff196aadc60a8eae6483bae2ed065d851f447fc8f9d16bb79a0bd1ffb36ce652538c792e7c05235526d8d2eceed0ab7823ddf4076d2dfe3efa02b5b8f9d16bb79a0bd1ffb36ce652538c792e7c05235526d8d2eceed0ab7823ddf40f32e52a595d5e141717f6417f948aaa279d001d13b7a2bee2460d1d835ce38d700864368eab8f2b10f31642b295093604226def5e00d3d6a929c2e4596344032166b1741ede12384d1e9263bfd40239651284929d15a1aae886b7cf155fbd493', 'Hm_lvt_e11e5fa7b1c17369dacfb3f063d64cae': '1665814075', 'JSESSIONID': '12070D5B8A0173C0509273A9FD2060C5', 'Hm_lpvt_e11e5fa7b1c17369dacfb3f063d64cae': '1666073527'"
    'Cookie': "'JSESSIONID=2E161C24EE80617B11702B4E76A42FF0; _qquc=6d2af3823bb4829a95200dba06ccb0265939e590873e5cef7edcafe1c96a755abf5e2403097381fa2d9a6c4604d69e51fc7914acc029fb5033a0475182f48025f054a58ca304d4f0925129c937cb693595284cd9e2fd46f4ee81e2dc73caad231f424d32347fd2822284431a74dec2c4b83f1b9d8786ebd0d7365bf0935405158f9d16bb79a0bd1f25565e5fdaa4fefb8026fd46b9c3cb6230301904f42b70dac9ff77c32d9228079178f395dd41708e28f30f4223948f25c30255de9bdf9210c2c95a92b6f80aff8ab6cd29261c19c181d4ecb97a9c5e4841b4668dbfd6df042c4e113b1f49a2735a11313cd1738fec5bcb7e4f0ae6ab92005e478fba8f38efe561ea8de8abb858e98715ccff56e8a9b336d1c71d4222cfd4067afd041d5b6002351d15ba1cc3738e7b731a999e72d4eac3bf864dc7c4e26062333f8df1e661d4067afd041d5b6002351d15ba1cc3738e7b731a999e72d4eac3bf864dc7c4e2ed41a28a5d5b372fd4e3cd2cdd4ab3e976652525878b2ab6c6cf81d18a62f4ef6a0d294378ea2f7c792744feca70155430d053014964fed2194a1d90a161c73c6f7567f12bdd69a2d9b81824ced2ccba099beefbb90aee3433a598ff7d0d05e8; Hm_lvt_e11e5fa7b1c17369dacfb3f063d64cae=1649303831,1649383836,1650452961; Hm_lpvt_e11e5fa7b1c17369dacfb3f063d64cae=1650453289'"
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'myfirstPj.middlewares.MyfirstpjSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'myfirstPj.middlewares.MyfirstpjDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'myfirstPj.pipelines.MyfirstpjPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Export feeds as UTF-8 so Chinese text is not escaped.
FEED_EXPORT_ENCODING='utf-8'

25
papernews_spider/myfirstPj/spiders/NewsContent.py

@ -0,0 +1,25 @@
import scrapy
import papernews_spider
from papernews_spider.myfirstPj.spiders import Tools
# Article-content spider (used for testing the content XPath).
class NewscontentSpider(scrapy.Spider):
    """Fetches a page and appends its <b>/<p> text fragments to News.txt."""

    name = 'NewsContent'
    allowed_domains = ['www.paper.com.cn']
    start_urls = ['http://www.paper.com.cn/']

    def parse(self, response):
        """Extract the body text of the page and append it to News.txt.

        Bug fix: the original wrote an append-mode file handle to the output
        (iterating a file opened with mode 'a' yields nothing) instead of the
        extracted text, and leaked two never-closed handles on url.txt — one
        opened at class-definition time. Both leaks are removed and the
        scraped text itself is written.
        """
        texts = response.xpath('//b/text() | //p/text()').extract()
        # Tools methods ignore their self slot; the project convention is to
        # pass None (the original passed Tools.__init__(self), which is None).
        Tools.write_txt(None, "News.txt", texts)

74
papernews_spider/myfirstPj/spiders/PaperNewsSpider.py

@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
import datetime
import scrapy
from scrapy import Request
import oss2
import os
from scrapy import Selector
from papernews_spider.myfirstPj.items import MyfirstpjItem
from papernews_spider.myfirstPj.spiders import Tools
from papernews_spider.myfirstPj import settings
# Scrapes the print/publishing news section of the paper.com.cn info centre.
class PaperNewsSpider(scrapy.Spider):
    """Collects article links from the listing page, then appends each
    article's text to News.txt and uploads it to Aliyun OSS."""

    # Spider name (run with `scrapy crawl PaperNews`).
    name = 'PaperNews'
    # Restrict the crawl to the news site.
    allowed_domains = ['www.paper.com.cn']
    # Domestic-news listing page (query string is GB2312 percent-encoded).
    start_urls = ['http://www.paper.com.cn/news/nation.php?news_type=%B9%FA%C4%DA%D7%CA%D1%B6']

    def parse(self, response):
        """Extract article links from the listing and schedule each one."""
        links = response.xpath('//td[@width="85%"]/a/@href').extract()
        # Turn the relative hrefs into absolute URLs.
        # Bug fix: the original reverse while-loop indexed newlists[-1] and
        # raised IndexError whenever the listing produced no links.
        full_links = ['http://www.paper.com.cn%s' % link for link in links]
        # Record the harvested URLs (append mode).
        Tools.write_txt(None, "newsUrl.txt", full_links)
        # Schedule the detail pages, last-to-first, matching the original
        # scheduling order.
        for url in reversed(full_links):
            yield Request(url, callback=self.parse_second)

    def parse_second(self, response):
        """Scrape the article body from a detail page, persist and upload it."""
        texts = response.xpath('//b/text() | //p/text()').extract()
        # Append the article text to News.txt.
        Tools.write_txt(None, "News.txt", texts)
        print(texts)
        # Push the text to OSS under a date-stamped directory.
        # NOTE(review): the path segment says wastePaperPrice although this
        # spider scrapes news articles — confirm the intended OSS layout.
        now = datetime.datetime.now()
        directory = "spider-information-data/paper.com.cn/wastePaperPrice/" + str(now.strftime("%Y-%m-%d")) + "/"
        Tools.put(None, directory, texts)

    def start_requests(self):
        """Issue the initial request.

        Headers and the session cookie come from settings.py
        (DEFAULT_REQUEST_HEADERS), so nothing extra is attached here.
        """
        yield Request(self.start_urls[0], callback=self.parse)

82
papernews_spider/myfirstPj/spiders/PaperPriceSpider.py

@ -0,0 +1,82 @@
import datetime
import json
import scrapy
from scrapy import Request
from scrapy.http import FormRequest
# 爬取富宝咨询的废纸与成品纸价格变动
from papernews_spider.myfirstPj.spiders import Tools
class PaperPricesSpider(scrapy.Spider):
    """Scrapes the paper-mill price-adjustment column of news.f139.com
    (waste-paper and finished-paper price changes), de-duplicating against
    previously seen URLs and uploading the cleaned text to Aliyun OSS."""

    # Spider name (run with `scrapy crawl PaperPriceSpider`).
    name = 'PaperPriceSpider'
    allowed_domains = ['news.f139.com']
    # Price-adjustment listing page.
    start_urls = ['http://news.f139.com/list.do?channelID=94&categoryID=27']

    def parse(self, response):
        """Collect article links, drop already-seen ones, schedule the rest."""
        links = response.xpath('//a[@target="_blank"]/@href').extract()
        # Turn the relative hrefs into absolute URLs (the original mutated
        # the extracted list in place via an index loop).
        full_links = ['http://news.f139.com%s' % link for link in links]
        # Record all harvested URLs (append mode).
        Tools.write_txt(None, "priceUrl.txt", full_links)
        # Keep only URLs not present in the previous run's record.
        fresh = Tools.url_manage(None, 'priceUrl.txt', 'oldPriceUrl.txt')
        # Strip the trailing newlines and separator banners left by the files.
        fresh = Tools.cleantxt(None, '\n', '', fresh)
        fresh = Tools.cleantxt(None,
                               '===============================================================================================',
                               '', fresh)
        # Update the seen-URL record with the new batch.
        Tools.write_txt(None, "oldPriceUrl.txt", fresh)
        # Schedule every fresh detail page; after the last one, emit a JSON
        # status line on stdout.
        for index, url in enumerate(fresh):
            yield Request(url, callback=self.parse_second)
            if index == len(fresh) - 1:
                data = {"name": PaperPricesSpider.name, "url": fresh, "status": 200}
                print(json.dumps(data))

    def parse_second(self, response):
        """Extract the price article text, clean it, persist and upload it."""
        texts = response.xpath(
            "//h1/text() | //div[@id='zhengwen']/text() | //div[@id='zhengwen']//p[not(@*)]/text()").extract()
        # Rebrand the source name inside the article text.
        texts = Tools.cleantxt(None, '富宝', '千鸟', texts)
        # Drop boilerplate and blank rows by position.
        # NOTE(review): these positional deletes assume a fixed page layout
        # and will silently corrupt output if the site changes — verify.
        del texts[1:5]
        del texts[12:18]
        Tools.write_txt(None, "priceText.txt", texts)
        # Upload to OSS under a date-stamped directory (optional step).
        now = datetime.datetime.now()
        directory = "spider-information-data/fuBao/wastePaperPrice/" + str(now.strftime("%Y-%m-%d")) + "/"
        Tools.put(None, directory, texts)

    def start_requests(self):
        """Issue the initial request; headers/cookies come from settings.py."""
        yield Request(self.start_urls[0], callback=self.parse)

8
papernews_spider/myfirstPj/spiders/Url.py

@ -0,0 +1,8 @@
import scrapy
# 配置爬虫的网页
# Placeholder configuration for a spider's target site.
# NOTE(review): the class name 'UrllRoom' looks like a typo for 'UrlRoom';
# it is kept as-is because other modules may reference it by this name.
class UrllRoom():
    # Values to be pulled from / matched against the database (demo: baidu).
    name = 'baidu'
    allowed_domains = ['www.baidu.com']
    start_urls = ['http://www.baidu.com/']

76
papernews_spider/myfirstPj/spiders/__init__.py

@ -0,0 +1,76 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import oss2
from papernews_spider.Module.generateID import IdWorker
class Tools:
    """Shared helpers for the spiders: OSS upload, text persistence,
    URL de-duplication and simple text cleaning.

    None of the methods read instance state; project callers invoke them as
    ``Tools.method(None, ...)``, so the ``self`` slot is kept for
    backward compatibility.
    """

    def put(self, directory, datalist):
        """Concatenate *datalist* and upload it as one .txt object to
        Aliyun OSS under *directory*.

        SECURITY(review): the AccessKey pair below is committed to the
        repository. It must be rotated and loaded from the environment or a
        RAM role — anyone with read access to this repo has full bucket
        access until then.
        """
        auth = oss2.Auth('LTAI5tRJKZvY8Switqrb3366', 'qAi7Hdrvvc7WLuvOr9n2g5PuBs3Vhn')
        endpoint = 'oss-cn-shenzhen.aliyuncs.com'  # public endpoint
        # endpoint = 'oss-cn-shenzhen-internal.aliyuncs.com'  # intranet endpoint
        bucket = oss2.Bucket(auth, endpoint, 'qn-data-lake')
        # Snowflake-style worker produces a unique object name.
        worker = IdWorker(1, 2, 0)
        object_name = str(worker.get_id()) + ".txt"
        # ''.join replaces the original O(n^2) repeated string concatenation.
        bucket.put_object(directory + object_name, ''.join(datalist))

    def write_txt(self, filename, listname):
        """Append each entry of *listname* to *filename* (UTF-8), one per
        line, skipping whitespace-noise fragments.

        Fix: the file is now closed via ``with`` even if a write raises.
        """
        # Whitespace fragments filtered out of the output.
        # NOTE(review): the original listed several variants differing only
        # in trailing-space run length; verify this set against real data.
        noise = {
            '\r\n ',
            '\r\t',
            '\r\r\n \r\n \r\n ',
            '\r\r\n \r\n ',
            '\r\n \r\n ',
        }
        with open(filename, 'a', encoding='UTF-8') as out:
            for item in listname:
                if item not in noise:
                    out.write(item)
                    out.write("\n")

    def url_manage(self, newlist, oldurl):
        """Return the set of lines present in file *newlist* but absent from
        file *oldurl* (i.e. URLs not seen on a previous run)."""
        with open(newlist, 'r', encoding='utf-8') as newfile:
            new = newfile.readlines()
        with open(oldurl, 'r', encoding='utf-8') as oldfile:
            old = oldfile.readlines()
        # Set difference: only the never-before-seen lines survive.
        return set(new).difference(old)

    def cleantxt(self, cleancontent, replacecontent, lists=()):
        """Return a new list with *cleancontent* replaced by
        *replacecontent* in every entry of *lists*.

        Bug fix: the original rebuilt the whole list once per element
        (O(n^2)) and raised NameError on empty input; it also used the
        ``list`` type itself as the default argument. Empty input now
        yields an empty list.
        """
        return [item.replace(cleancontent, replacecontent) for item in lists]

0
papernews_spider/myfirstPj/tools/Api.py

6
papernews_spider/myfirstPj/tools/Run.py

@ -0,0 +1,6 @@
from scrapy import cmdline
import SetSpiderName
# Launch helper: runs the spider selected in SetSpiderName.SetName via the
# scrapy CLI, redirecting console output to debug.log (imported for side
# effect — executing this module starts the crawl).
# cmdline.execute('s crapy crawl baidu -s LOG_FILE=debug.log'.split())
cmdline.execute(('scrapy crawl %s -s LOG_FILE=debug.log' %(SetSpiderName.SetName.name)).split())
# Variant without the log redirect (console output enabled):
# cmdline.execute(('scrapy crawl %s' %(SetSpiderName.SetName.name)).split())

5
papernews_spider/myfirstPj/tools/SetSpiderName.py

@ -0,0 +1,5 @@
# Selects which spider Run.py launches; switch by (un)commenting a name.
class SetName():
    # name = "PaperNews"
    name = "PaperPriceSpider"

0
papernews_spider/myfirstPj/tools/__init__.py

11
papernews_spider/scrapy.cfg

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = myfirstPj.settings
[deploy]
#url = http://localhost:6800/
project = myfirstPj
Loading…
Cancel
Save