You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
76 lines
3.0 KiB
76 lines
3.0 KiB
# This package will contain the spiders of your Scrapy project
|
|
#
|
|
# Please refer to the documentation for information on how to create and manage
|
|
# your spiders.
|
|
|
|
import oss2
|
|
from papernews_spider.Module.generateID import IdWorker
|
|
|
|
|
|
class Tools:
    """Helper routines shared by the spiders.

    Covers uploading text to Alibaba Cloud OSS, appending scraped strings to a
    local text file, diffing two URL list files and bulk string replacement.
    """

    # Whitespace-only fragments that ``write_txt`` drops instead of writing.
    # These are the exact literals the original comparison chain tested for
    # (duplicates removed); anything else is written unchanged.
    _BLANK_FRAGMENTS = (
        '\r\n ',
        '\r\t',
        '\r\r\n \r\n \r\n ',
        '\r\r\n \r\n ',
        '\r\n \r\n ',
    )

    def put(self, directory, datalist):
        """Upload the concatenated ``datalist`` to the OSS bucket as a ``.txt`` object.

        Args:
            directory: Object-key prefix inside the bucket (location and naming
                of the OSS file); the generated file name is appended to it
                verbatim, so include a trailing ``/`` if one is wanted.
            datalist: Iterable of strings that are joined, in order, into the
                uploaded body.

        The object name is ``<id>.txt`` where ``<id>`` comes from
        ``IdWorker(1, 2, 0).get_id()``.
        """
        import os  # local import: keeps this block independent of file-level imports

        # Prefer credentials supplied through the environment. Both variables
        # must be present; a half-configured environment falls back as a pair
        # so an ID is never combined with a mismatched secret.
        access_key_id = os.environ.get('OSS_ACCESS_KEY_ID')
        access_key_secret = os.environ.get('OSS_ACCESS_KEY_SECRET')
        if not (access_key_id and access_key_secret):
            # NOTE(security): legacy hard-coded AccessKey, kept only so existing
            # deployments keep working. An account-level AccessKey has access
            # to every API; create a RAM user, rotate this key and remove it
            # from source control.
            access_key_id = 'LTAI5tRJKZvY8Switqrb3366'
            access_key_secret = 'qAi7Hdrvvc7WLuvOr9n2g5PuBs3Vhn'
        auth = oss2.Auth(access_key_id, access_key_secret)

        # Public (internet) endpoint of the bucket's region. When running
        # inside the Alibaba Cloud intranet the internal endpoint
        # 'oss-cn-shenzhen-internal.aliyuncs.com' can be used instead.
        endpoint = 'oss-cn-shenzhen.aliyuncs.com'
        bucket = oss2.Bucket(auth, endpoint, 'qn-data-lake')

        # Build the object name and the body to upload.
        id_worker = IdWorker(1, 2, 0)
        object_name = str(id_worker.get_id()) + ".txt"
        body = ''.join(datalist)
        bucket.put_object(directory + object_name, body)

    def write_txt(self, filename, listname):
        """Append every meaningful item of ``listname`` to ``filename``, one per line.

        Args:
            filename: Path of the text file; opened in append mode as UTF-8.
            listname: Iterable of strings (e.g. the list returned by an XPath
                query). Items equal to one of the known whitespace-only
                fragments are skipped.
        """
        # ``with`` guarantees the handle is closed even if a write fails.
        with open(filename, 'a', encoding='UTF-8') as out:
            for item in listname:
                if item in self._BLANK_FRAGMENTS:  # data cleaning
                    continue
                out.write(item)
                out.write("\n")

    def url_manage(self, newlist, oldurl):
        """Return the lines of ``newlist`` that do not occur in ``oldurl``.

        Args:
            newlist: Path of the file holding the newly collected URLs.
            oldurl: Path of the file recording previously seen URLs.

        Returns:
            A ``set`` of lines (line terminators included, exactly as read)
            present in ``newlist`` but absent from ``oldurl`` -- i.e. the links
            that have not been crawled yet.
        """
        with open(newlist, 'r', encoding='utf-8') as new_file:
            new_lines = new_file.readlines()
        with open(oldurl, 'r', encoding='utf-8') as old_file:
            old_lines = old_file.readlines()

        # Set difference: entries only in the new file.
        return set(new_lines).difference(old_lines)

    def cleantxt(self, cleancontent, replacecontent, lists=None):
        """Replace ``cleancontent`` with ``replacecontent`` in every string of ``lists``.

        Args:
            cleancontent: Substring to search for.
            replacecontent: Replacement text.
            lists: Iterable of strings to clean. ``None`` (the default) is
                treated as an empty input.

        Returns:
            A new list with the replacement applied to each item; an empty
            list when ``lists`` is empty or ``None``.
        """
        if lists is None:
            return []
        # Single pass; the input is not modified.
        return [text.replace(cleancontent, replacecontent) for text in lists]
|