# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import os

import oss2

from papernews_spider.Module.generateID import IdWorker


class Tools:
    """Utility helpers used by the spiders: Aliyun OSS upload, text-file
    writing with noise filtering, URL de-duplication, and string cleaning."""

    # Whitespace-only scraps produced by xpath extraction that should be
    # dropped when writing results (same literals as the original filter;
    # duplicates collapse naturally in a set).
    _NOISE = frozenset({
        '\r\n ',
        '\r\t',
        '\r\r\n \r\n \r\n ',
        '\r\r\n \r\n ',
        '\r\n \r\n ',
    })

    def put(self, directory, datalist):
        """Join *datalist* into one string and upload it to Aliyun OSS.

        directory: object key prefix inside the bucket (OSS "folder"/name prefix).
        datalist:  iterable of string fragments forming the object body.
        """
        # SECURITY(review): the AccessKey pair was hard-coded in source.
        # Prefer environment variables; the original values remain as a
        # backward-compatible fallback. These keys should be rotated and
        # replaced with a least-privilege RAM user credential.
        access_key_id = os.getenv('OSS_ACCESS_KEY_ID',
                                  'LTAI5tRJKZvY8Switqrb3366')
        access_key_secret = os.getenv('OSS_ACCESS_KEY_SECRET',
                                      'qAi7Hdrvvc7WLuvOr9n2g5PuBs3Vhn')
        auth = oss2.Auth(access_key_id, access_key_secret)
        # Endpoint for the bucket's region (South China 1, Shenzhen).
        endpoint = 'oss-cn-shenzhen.aliyuncs.com'  # public network
        # endpoint = 'oss-cn-shenzhen-internal.aliyuncs.com'  # internal network
        bucket = oss2.Bucket(auth, endpoint, 'qn-data-lake')
        # Generate a unique object name (snowflake-style id + ".txt").
        # Renamed from `id` to avoid shadowing the builtin.
        worker = IdWorker(1, 2, 0)
        new_id = str(worker.get_id()) + ".txt"
        # ''.join is linear; the original `data = data + item` loop was
        # potentially quadratic.
        data = ''.join(datalist)
        bucket.put_object(directory + new_id, data)

    def write_txt(self, filename, listname):
        """Append each item of *listname* to *filename*, one per line,
        skipping whitespace-only noise items.

        filename: path of the file to append to (created if missing).
        listname: iterable of strings, typically an xpath result list.
        """
        # `with` guarantees the file is closed even if a write raises.
        with open(filename, 'a', encoding='UTF-8') as file:
            for item in listname:
                if item not in self._NOISE:  # data cleaning
                    file.write(item)
                    file.write("\n")

    def url_manage(self, newlist, oldurl):
        """URL manager: compare newly scraped URLs against the history file.

        newlist: path of the file holding freshly scraped URLs.
        oldurl:  path of the file holding previously recorded URLs.
        return:  set of lines present in *newlist* but not in *oldurl*
                 (i.e. links not yet crawled).
        """
        with open(newlist, 'r', encoding='utf-8') as newfile:
            new = newfile.readlines()
        with open(oldurl, 'r', encoding='utf-8') as oldfile:
            old = oldfile.readlines()
        # Set difference: URLs we have not seen before.
        return set(new).difference(old)

    def cleantxt(self, cleancontent, replacecontent, lists=()):
        """Return a new list where every occurrence of *cleancontent* in each
        item of *lists* is replaced by *replacecontent*.

        Fixes two defects in the original: it raised NameError when *lists*
        was empty (returned an unbound loop variable), and its default
        argument was the `list` *type*, which is not iterable. It also
        rebuilt the full result once per input element (O(n^2)); a single
        comprehension produces the identical result for non-empty input.
        """
        return [item.replace(cleancontent, replacecontent) for item in lists]