# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
#
# 76 lines
# 3.0 KiB

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import oss2
from papernews_spider.Module.generateID import IdWorker
class Tools:
    """Helper utilities for the spiders.

    Provides an upload to Alibaba Cloud OSS, a text-file writer, a URL
    difference helper and a simple string-replacement cleaner. None of the
    methods read instance state.
    """

    # Items that write_txt drops instead of writing (whitespace-only text
    # fragments). Compared by equality, exactly as listed.
    _BLANK_ITEMS = (
        '\r\n ',
        '\r\t',
        '\r\r\n \r\n \r\n ',
        '\r\r\n \r\n ',
        '\r\n \r\n ',
    )

    def put(self, directory, datalist):
        """Upload the concatenated contents of *datalist* to the OSS bucket.

        The object key is ``directory`` followed by a freshly generated id
        and the ``.txt`` suffix.

        Args:
            directory: Key prefix (location and naming) of the object in the
                bucket.
            datalist: Iterable of strings; they are joined without a
                separator to form the object body.

        Raises:
            TypeError: If *datalist* contains a non-string item.
        """
        import os

        # SECURITY: an AccessKey pair committed to source control grants full
        # API access to the account. Credentials are therefore taken from the
        # environment when available; the literals remain only as a
        # backward-compatible fallback and should be rotated and removed
        # (prefer a RAM user with limited permissions).
        access_key_id = (os.environ.get('OSS_ACCESS_KEY_ID')
                         or 'LTAI5tRJKZvY8Switqrb3366')
        access_key_secret = (os.environ.get('OSS_ACCESS_KEY_SECRET')
                             or 'qAi7Hdrvvc7WLuvOr9n2g5PuBs3Vhn')
        auth = oss2.Auth(access_key_id, access_key_secret)

        # Public (internet) endpoint of the bucket's region. The intranet
        # endpoint would be 'oss-cn-shenzhen-internal.aliyuncs.com'.
        endpoint = 'oss-cn-shenzhen.aliyuncs.com'
        bucket = oss2.Bucket(auth, endpoint, 'qn-data-lake')

        # Build the object name from a generated id.
        id_worker = IdWorker(1, 2, 0)
        object_name = str(id_worker.get_id()) + ".txt"

        bucket.put_object(directory + object_name, ''.join(datalist))

    def write_txt(self, filename, listname):
        """Append the items of *listname* to *filename*, one per line.

        The file is opened in append mode with UTF-8 encoding. Items equal
        to one of the whitespace-only fragments in ``_BLANK_ITEMS`` are
        skipped; every other item is written followed by a newline.

        Args:
            filename: Path of the file to append to.
            listname: Iterable of strings (e.g. the list returned by an
                XPath query).
        """
        # The context manager closes the file even if a write fails.
        with open(filename, 'a', encoding='UTF-8') as file:
            for item in listname:
                if item in Tools._BLANK_ITEMS:
                    continue
                file.write(item)
                file.write("\n")

    def url_manage(self, newlist, oldurl):
        """Return the links that have not been crawled yet.

        Both arguments are paths of UTF-8 text files holding one entry per
        line. Lines are compared verbatim, including their line terminator.
        This method only reads the files; it does not update the file of
        already-recorded URLs.

        Args:
            newlist: Path of the file with the newly collected URLs.
            oldurl: Path of the file with the previously recorded URLs.

        Returns:
            set: Lines present in *newlist* but absent from *oldurl*.
        """
        with open(newlist, 'r', encoding='utf-8') as new_file:
            new_lines = new_file.readlines()
        with open(oldurl, 'r', encoding='utf-8') as old_file:
            old_lines = old_file.readlines()
        return set(new_lines).difference(old_lines)

    def cleantxt(self, cleancontent, replacecontent, lists=None):
        """Replace *cleancontent* with *replacecontent* in every string.

        Args:
            cleancontent: Substring to look for.
            replacecontent: Text that replaces each occurrence.
            lists: Iterable of strings to clean. When omitted, it is treated
                as empty.

        Returns:
            list: A new list with the replacement applied to each item; an
            empty list when *lists* is empty or omitted.
        """
        if lists is None:
            return []
        return [text.replace(cleancontent, replacecontent) for text in lists]