ztb-information-spider/papernews_spider/myfirstPj/spiders/__init__.py


								# This package will contain the spiders of your Scrapy project

								#

								# Please refer to the documentation for information on how to create and manage

								# your spiders.


								import oss2

								from papernews_spider.Module.generateID import IdWorker


								class Tools:
								    """Helper routines shared by the spiders.

								    Covers uploading text to Aliyun OSS, appending extracted text to a local
								    file, diffing two URL list files and bulk string replacement. None of the
								    methods read instance state.
								    """

								    # Whitespace-only fragments that write_txt skips. An item is dropped only
								    # when it equals one of these strings exactly; any other blank-looking
								    # string is still written.
								    _BLANK_FRAGMENTS = (
								        '\r\n        ',
								        '\r\n       ',
								        '\r\n          ',
								        '\r\t',
								        '\r\r\n        \r\n        \r\n        ',
								        '\r\r\n      \r\n          ',
								        '\r\n        \r\n          ',
								        '\r\r\n      \r\n         \r\n     ',
								    )

								    def put(self, directory, datalist):
								        """Upload the concatenated strings of *datalist* to Aliyun OSS.

								        The object key is ``directory`` + a freshly generated id + ``".txt"``;
								        no separator is inserted between ``directory`` and the file name.

								        Args:
								            directory: Key prefix of the object inside the bucket.
								            datalist: Iterable of strings, joined in order to form the body.
								        """
								        # NOTE(review): the AccessKey pair of the Aliyun account is hard-coded
								        # here and grants full API access. It should be rotated and loaded
								        # from configuration / a RAM user instead of living in source control.
								        auth = oss2.Auth('LTAI5tRJKZvY8Switqrb3366', 'qAi7Hdrvvc7WLuvOr9n2g5PuBs3Vhn')
								        # Endpoint of the region hosting the bucket (Shenzhen).
								        endpoint = 'oss-cn-shenzhen.aliyuncs.com'  # public network
								        # endpoint = 'oss-cn-shenzhen-internal.aliyuncs.com'  # internal network
								        bucket = oss2.Bucket(auth, endpoint, 'qn-data-lake')

								        # IdWorker.get_id() supplies the stem of the uploaded file name.
								        id_worker = IdWorker(1, 2, 0)
								        object_name = str(id_worker.get_id()) + ".txt"
								        bucket.put_object(directory + object_name, ''.join(datalist))

								    def write_txt(self, filename, listname):
								        """Append the items of *listname* to *filename*, one per line.

								        Items equal to one of the known whitespace-only fragments are skipped.
								        The file is opened in append mode (UTF-8), so it is created even when
								        nothing is written, and it is closed even if a write fails.

								        Args:
								            filename: Path of the text file to append to.
								            listname: Iterable of strings (e.g. the list returned by an XPath query).
								        """
								        with open(filename, 'a', encoding='UTF-8') as file:
								            for item in listname:
								                if item in Tools._BLANK_FRAGMENTS:
								                    continue
								                file.write(item)
								                file.write("\n")

								    def url_manage(self, newlist, oldurl):
								        """Return the lines of file *newlist* that do not occur in file *oldurl*.

								        Lines are compared exactly as read, including their trailing newline.
								        This method does not update *oldurl*; recording the newly seen URLs is
								        left to the caller.

								        Args:
								            newlist: Path of the file holding the freshly collected URLs.
								            oldurl: Path of the file holding the previously recorded URLs.

								        Returns:
								            A set with the lines present only in *newlist*.
								        """
								        with open(newlist, 'r', encoding='utf-8') as newfile:
								            new = newfile.readlines()
								        with open(oldurl, 'r', encoding='utf-8') as oldfile:
								            old = oldfile.readlines()
								        return set(new).difference(old)

								    def cleantxt(self, cleancontent, replacecontent, lists=None):
								        """Replace *cleancontent* with *replacecontent* in every string of *lists*.

								        Args:
								            cleancontent: Substring to look for.
								            replacecontent: Text substituted for each occurrence.
								            lists: Iterable of strings to clean; ``None`` is treated as empty.

								        Returns:
								            A new list with the cleaned strings (empty for empty input).
								        """
								        if lists is None:
								            return []
								        return [i.replace(cleancontent, replacecontent) for i in lists]