#!/usr/bin/python
# -*- coding: UTF-8 -*-
# File: anjuke.py
# Date: 2019-08-08
# Origin: patch 787f7915 (2019-08-08, chenhong) — "add AnJuKe community crawler class"

"""Crawler for residential-community (小区) listings on anjuke.com.

The AnJuKe class emulates a browser and walks the community listing
pages of a single city, one page at a time.

Create an instance and run it::

    ajk = AnJuKe(province, city, save_func)
    ajk.start()

Arguments:
    province:  written into every output row and used in the output
               file name; it does not influence what is crawled.
    city:      the single city whose communities are crawled (Chinese
               name; converted to pinyin to form the site sub-domain).
    save_func: one of two preset strings —
               'file'  : crawl the site and append rows to a .txt file;
               'mysql' : read that .txt file and bulk-insert its rows
                         into the ``import_estate`` MySQL table.

start() dispatches on ``save_func`` to save_file() or save_mysql().
save_file() fetches pages via get_page(), parses them with parse_page()
and writes rows through write_file().  save_mysql() loads the rows via
read_file() and inserts them via insert_mysql().

Resuming: save_file() starts at page 1 and appends each finished page
to the file.  If the crawl dies midway, the progress print lines show
the last page/community reached, so the start page can be adjusted to
continue from there.
"""

from bs4 import BeautifulSoup
from urllib import parse
from xpinyin import Pinyin
from pymysql import connect
import re
import requests
import time


class AnJuKe:
    """Page-by-page crawler for one city's communities on anjuke.com."""

    def __init__(self, province, city, save_func,
                 file_dir='C:\\Users\\Administrator\\Desktop'):
        """Initialise request headers, target city and output file path.

        ``file_dir`` is the directory the intermediate .txt data file is
        written to / read from; the default preserves the previously
        hard-coded desktop location, so existing callers are unaffected.
        """
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            # NOTE: the original value contained stray spaces
            # ('https: // wuhan.anjuke.com / sale /...') which is not a
            # valid URL; fixed to a well-formed Referer.
            'Referer': 'https://wuhan.anjuke.com/sale/?from=navigation',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
        self.save_func = save_func
        self.province = province
        self.city = city
        self.time = time.strftime("%Y-%m-%d", time.localtime())
        self.file_path = '{}\\anjuke_{}_{}_{}.txt'.format(
            file_dir, self.province, self.city, self.time)

    def get_page(self, city_py, cur_page):
        """Fetch one community listing page and return the Response.

        ``city_py`` is the city name in pinyin (sub-domain of the site),
        ``cur_page`` the 1-based listing page number.
        """
        url = 'https://{}.anjuke.com/community/p{}/'.format(city_py, cur_page)
        return requests.get(url, headers=self.headers, timeout=30)

    def get_location(self, city_py, estate_id):
        """Return (latitude, longitude) strings for one community.

        Fetches the community's detail page and parses the map link's
        fragment parameters ``l1`` (lat) and ``l2`` (lng).  Returns
        ``None`` when no map link is present on the page.
        """
        url = 'https://{}.anjuke.com/community/view/{}'.format(city_py, estate_id)
        response = requests.get(url, headers=self.headers, timeout=30)
        bs = BeautifulSoup(response.text, 'html.parser')
        for title in bs.find_all('div', class_='comm-title'):
            for link in title.find_all('a', class_='map-link'):
                # The coordinates live in the URL fragment as a query
                # string, e.g. '#l1=30.5&l2=114.3' — parse_qs handles it.
                url_data = parse.urlparse(link.get('href'))
                query_dict = parse.parse_qs(url_data.fragment)
                latitude = query_dict['l1'][0]
                longitude = query_dict['l2'][0]
                return latitude, longitude
        return None

    def has_next(self, city_py, cur_page):
        """Return the 'next page' href if the page has one, else None.

        Used as a truthy flag by save_file() to decide whether to keep
        paging.
        """
        response = self.get_page(city_py, cur_page)
        bs = BeautifulSoup(response.text, 'html.parser')
        for page in bs.find_all('div', class_='multi-page'):
            for anxt in page.find_all('a', class_='aNxt'):
                return anxt.attrs['href']
        return None

    def write_file(self, estate_id, estate_name, lat, lng,
                   estate_district, estate_town, estate_street, fw):
        """Append one '^'-separated data row to the open file ``fw``.

        Row layout: province^city^id^name^lat^lng^district^town^street.
        All fields are stripped of surrounding whitespace first.
        """
        # Build the row with str.join instead of chained '+'; also avoids
        # the original's shadowing of the builtin `id`.
        fields = [self.province, self.city, estate_id, estate_name,
                  lat, lng, estate_district, estate_town, estate_street]
        fw.write('^'.join(field.strip() for field in fields) + '\n')

    def read_file(self):
        """Read the .txt data file back into a list of field lists.

        Returns [] when the file does not exist (the original raised
        FileNotFoundError here even though save_mysql() has a dedicated
        'file missing' message for the empty case).
        """
        data_list = []
        try:
            # UTF-8-sig tolerates a BOM written by other tools.
            with open(self.file_path, "r", encoding='UTF-8-sig') as open_file:
                for line in open_file:
                    data_list.append(line.rstrip('\n').split('^'))
        except FileNotFoundError:
            return []
        print(data_list)
        return data_list

    def insert_mysql(self, data_list):
        """Create the target table if needed and bulk-insert the rows.

        SECURITY NOTE(review): database host/user/password are embedded
        in source and were also echoed in a comment in the original —
        these credentials should be moved to configuration and rotated.
        """
        print("连接数据库")
        db = connect(host='47.106.79.88', port=3306, db='1hjz',
                     user='root', passwd='u2018@U2018')
        cursor = db.cursor()
        insert_sql = "INSERT INTO import_estate(province_name,city_name,estate_id,estate_name,lat,lng,estate_district,estate_town,estate_street) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        try:
            cursor.execute('''CREATE TABLE if not exists `import_estate` (
 `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
 `province_name` varchar(255) DEFAULT '' COMMENT '省份名称',
 `city_name` varchar(255) DEFAULT '' COMMENT '城市名称',
 `estate_id` varchar(255) DEFAULT '' COMMENT '安居客的小区id',
 `estate_name` varchar(255) DEFAULT '' COMMENT '小区名称',
 `lat` decimal(10,6) DEFAULT '0.000000' COMMENT '纬度',
 `lng` decimal(10,6) DEFAULT '0.000000' COMMENT '经度',
 `estate_district` varchar(255) DEFAULT '' COMMENT '行政区',
 `estate_town` varchar(255) DEFAULT '' COMMENT '区镇/县',
 `estate_street` varchar(255) DEFAULT '' COMMENT '街道',
 PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=COMPACT COMMENT='小区信息导入表';''')
            cursor.executemany(insert_sql, data_list)
            db.commit()
        except Exception as e:
            # Best-effort batch import: log and roll back, don't crash.
            print(e)
            db.rollback()
        finally:
            # Close resources even when commit/rollback itself fails
            # (the original leaked them on such errors).
            cursor.close()
            db.close()

    def save_mysql(self):
        """Load the .txt file and insert its rows into MySQL."""
        print("等待写入数据库...")
        data_list = self.read_file()
        if data_list:
            self.insert_mysql(data_list)
            print("写入数据库完成!")
        else:
            print("文件不存在或没有数据")

    def save_file(self):
        """Crawl all listing pages of the city and append rows to file.

        Starts from page 1; each parsed page is appended immediately so
        an interrupted crawl loses at most the page in progress.
        """
        # 'a' mode so an interrupted run can be resumed without losing
        # already-written rows.
        with open(self.file_path, "a", encoding='utf-8') as fw:
            print("开始爬取{}小区数据,请稍等...".format(self.city))
            cur_page = 1
            # The site addresses cities by their full pinyin sub-domain.
            pinyin = Pinyin()
            city_py = str(pinyin.get_pinyin(self.city, ''))
            while True:
                self.parse_page(city_py, cur_page, fw)
                if self.has_next(city_py, cur_page):
                    cur_page += 1
                else:
                    break
        print("小区数据已写入文件...")

    def parse_page(self, city_py, cur_page, fw):
        """Parse one listing page and write every community it contains.

        For each community: extract name (h3 title), id (digits in the
        detail-page href), coordinates (via get_location) and the
        district/town/street address, then append a row through
        write_file().  Progress is printed per community so a crashed
        crawl can be resumed at the right page.
        """
        response = self.get_page(city_py, cur_page)
        bs = BeautifulSoup(response.text, 'html.parser')
        for li in bs.find_all('div', class_='li-itemmod'):
            for info in li.find_all('div', class_='li-info'):
                for title in info.find_all('h3'):
                    estate_name = title.get_text().strip()
                    # id is the digit run in a href like
                    # 'https://guangzhou.anjuke.com/community/view/607976'
                    for link in title.find_all('a'):
                        get_link = link.attrs['href']
                        pattern = re.compile('[0-9]{1,}')
                        estate_id = pattern.findall(get_link)[0]
                    estate_location = self.get_location(city_py, estate_id)
                    if estate_location is None:
                        # Detail page had no map link; the original
                        # crashed with TypeError here — record the row
                        # with empty coordinates instead.
                        lat, lng = '', ''
                    else:
                        lat, lng = estate_location
                    # Address text looks like '[district-town] street'.
                    for address in info.find_all('address'):
                        ajk_address = address.get_text().strip().split(']')
                        estate_district = ajk_address[0].split('[')[1].split('-')[0]
                        estate_town = ajk_address[0].split('[')[1].split('-')[1]
                        estate_street = ajk_address[1]
                    self.write_file(estate_id, estate_name, lat, lng,
                                    estate_district, estate_town, estate_street, fw)
                    # Key progress line: shows which page/community was
                    # last reached so a failed crawl can be resumed.
                    print(
                        '省份:' + self.province + '、城市:' + self.city + '、id:' + estate_id + '、小区:' + estate_name + '、纬度:' + lat
                        + '、经度:' + lng + '、行政区:' + estate_district + '、区镇/县:' + estate_town + '、街道:' + estate_street
                        + '、当前页:' + str(cur_page))

    def start(self):
        """Dispatch to save_file() or save_mysql() based on save_func."""
        if self.save_func == 'file':
            self.save_file()
        elif self.save_func == 'mysql':
            self.save_mysql()
        else:
            print('不支持的保存方式')


if __name__ == '__main__':
    # Target cities — Guangdong: Guangzhou, Shenzhen, Qingyuan, Zhanjiang;
    # Hunan: Changsha, Hengyang.
    start_time = time.time()
    # province = '广东省'
    province = '湖南省'
    city = '衡阳'
    # save_func = 'file'
    save_func = 'mysql'
    spider = AnJuKe(province, city, save_func)
    spider.start()
    end_time = time.time()
    print("已完成,用时%.2f秒" % (end_time - start_time))