#!/usr/bin/python
# -*- coding: UTF-8 -*-
# 文件名:anjuke.py
# 日期:2019-08-08

"""
AnJuke类的作用是模拟浏览器,逐页爬取安居客网站上单个城市的小区数据。

通过创建类实例,并调用相应的方法将爬取的数据保存到.txt文件,或读取已有小区数据的.txt文件并插入到指定数据库表。

一、创建AnJuke类的实例:

    ajk = AnJuKe(province, city, save_func)

    province   提供该字段作为保存文件名的参数之一和写进.txt文件的数据源之一
    city       提供该字段指定爬取单个城市的所有小区
    save_func  该字段仅支持两个预设值 :1、file 2、mysql

二、实例调用 ajk.start()方法

    start()方法通过判断 self.save_func的值,决定调用 save_file() 或 save_mysql()方法

    save_file()方法会先调用 get_page()方法获取并解析安居客的小区,再调用 write_file()方法写进.txt文件

    save_mysql()方法会先调用 read_file()方法读取指定的.txt文件,读取数据后调用 insert_mysql(data_list)方法将数据插入到指定数据库表

另:

1、save_file()方法中的cur_page默认为1,即默认从第一页开始爬取,每爬完一页就追加到.txt文件末尾。

2、如果爬取时程序出错停止,则可以依据print打印的当前页数及小区,修改cur_page继续爬取。
"""
import re
import time
from urllib import parse

import requests
from bs4 import BeautifulSoup
from pymysql import connect
from xpinyin import Pinyin


class AnJuKe:
    """Crawl anjuke.com community (小区) data for a single city.

    Depending on ``save_func``:
      * ``'file'``  -- crawl the site page by page and append '^'-separated
        rows to a .txt file on the desktop;
      * ``'mysql'`` -- read that .txt file back and bulk-insert the rows
        into the MySQL table ``import_estate``.
    """

    def __init__(self, province, city, save_func):
        """Initialize request headers, target city and output file path.

        province  -- written into every output row and into the file name
        city      -- Chinese city name; converted to pinyin for the subdomain
        save_func -- 'file' or 'mysql'; selects what start() does
        """
        # Browser-like headers; anjuke rejects obviously non-browser clients.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            # Fixed: the original value contained stray spaces
            # ('https: // wuhan.anjuke.com / sale /?from=navigation')
            # and therefore was not a valid URL.
            'Referer': 'https://wuhan.anjuke.com/sale/?from=navigation',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
        self.save_func = save_func
        self.province = province
        self.city = city
        # Crawl date; used only as part of the output file name.
        self.time = time.strftime("%Y-%m-%d", time.localtime())
        self.file_path = 'C:\\Users\\Administrator\\Desktop\\anjuke_{}_{}_{}.txt'.format(
            self.province, self.city, self.time)

    def get_page(self, city_py, cur_page):
        """Fetch and return the response object for one community list page."""
        url = 'https://{}.anjuke.com/community/p{}/'.format(city_py, cur_page)
        return requests.get(url, headers=self.headers, timeout=30)

    def get_location(self, city_py, estate_id):
        """Return the (latitude, longitude) of a community as strings.

        Scrapes the community detail page and parses the coordinates out of
        the map link's URL fragment (query parameters ``l1`` and ``l2``).
        Fixed: returns ('', '') when no map link is found instead of falling
        through to ``None`` (the caller unpacks the result as a tuple, so the
        original crashed with TypeError on pages without a map link).
        """
        url = 'https://{}.anjuke.com/community/view/{}'.format(city_py, estate_id)
        response = requests.get(url, headers=self.headers, timeout=30)
        bs = BeautifulSoup(response.text, 'html.parser')
        for title in bs.find_all('div', class_='comm-title'):
            for link in title.find_all('a', class_='map-link'):
                # The coordinates live in the URL fragment, e.g.
                # ...#l1=26.9&l2=112.6 -> parse the fragment as a query string.
                url_data = parse.urlparse(link.get('href'))
                query_dict = parse.parse_qs(url_data.fragment)
                latitude = query_dict['l1'][0]
                longitude = query_dict['l2'][0]
                return latitude, longitude
        return '', ''

    def has_next(self, city_py, cur_page):
        """Return the 'next page' link href if the page has one, else None."""
        response = self.get_page(city_py, cur_page)
        bs = BeautifulSoup(response.text, 'html.parser')
        for page in bs.find_all('div', class_='multi-page'):
            for anxt in page.find_all('a', class_='aNxt'):
                # A present href is truthy; callers only test the result.
                return anxt.attrs['href']
        return None

    def write_file(self, estate_id, estate_name, lat, lng, estate_district,
                   estate_town, estate_street, fw):
        """Append one '^'-separated community row to the open file ``fw``.

        All fields are stripped of surrounding whitespace first.
        (Fixed: no longer shadows the builtin ``id``.)
        """
        fields = [self.province, self.city, estate_id, estate_name, lat, lng,
                  estate_district, estate_town, estate_street]
        fw.write('^'.join(field.strip() for field in fields) + '\n')

    def read_file(self):
        """Read self.file_path and return its rows as a list of field lists.

        Fixed: the file handle is closed via ``with`` and a missing file
        yields [] so save_mysql() can print its 'file missing' message
        instead of crashing with FileNotFoundError.
        """
        data_list = []
        try:
            # UTF-8-sig transparently strips a BOM if the file has one.
            with open(self.file_path, "r", encoding='UTF-8-sig') as open_file:
                for line in open_file:
                    data_list.append(line.rstrip('\n').split('^'))
        except FileNotFoundError:
            return []
        print(data_list)
        return data_list

    def insert_mysql(self, data_list):
        """Create the import_estate table if needed and bulk-insert rows.

        On any error the batch is rolled back and the exception printed.
        """
        # SECURITY(review): database credentials are hard-coded in source;
        # move them to a config file or environment variables.
        print("连接数据库")
        db = connect(host='47.106.79.88', port=3306, db='1hjz', user='root', passwd='u2018@U2018')
        cursor = db.cursor()
        insert_sql = "INSERT INTO import_estate(province_name,city_name,estate_id,estate_name,lat,lng,estate_district,estate_town,estate_street) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        try:
            cursor.execute('''CREATE TABLE if not exists `import_estate` (
            `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
            `province_name` varchar(255) DEFAULT '' COMMENT '省份名称',
            `city_name` varchar(255) DEFAULT '' COMMENT '城市名称',
            `estate_id` varchar(255) DEFAULT '' COMMENT '安居客的小区id',
            `estate_name` varchar(255) DEFAULT '' COMMENT '小区名称',
            `lat` decimal(10,6) DEFAULT '0.000000' COMMENT '纬度',
            `lng` decimal(10,6) DEFAULT '0.000000' COMMENT '经度',
            `estate_district` varchar(255) DEFAULT '' COMMENT '行政区',
            `estate_town` varchar(255) DEFAULT '' COMMENT '区镇/县',
            `estate_street` varchar(255) DEFAULT '' COMMENT '街道',
            PRIMARY KEY (`id`)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=COMPACT COMMENT='小区信息导入表';''')
            cursor.executemany(insert_sql, data_list)
            db.commit()
        except Exception as e:
            # Keep the error visible, then undo the partial batch.
            print(e)
            db.rollback()
        finally:
            # Fixed: cursor/connection are closed even if print/rollback raises.
            cursor.close()
            db.close()

    def save_mysql(self):
        """Load the .txt data file and insert its rows into MySQL."""
        print("等待写入数据库...")
        data_list = self.read_file()
        if data_list:
            self.insert_mysql(data_list)
            print("写入数据库完成!")
        else:
            print("文件不存在或没有数据")

    def save_file(self):
        """Crawl every list page of self.city and append rows to the file."""
        # Fixed: the output file is now closed even if crawling raises.
        with open(self.file_path, "a", encoding='utf-8') as fw:
            print("开始爬取{}小区数据,请稍等...".format(self.city))
            # Bump cur_page manually to resume a crawl that stopped on an error.
            cur_page = 1
            # anjuke subdomains are the city's full pinyin, e.g. 衡阳 -> hengyang.
            pinyin = Pinyin()
            city_py = str(pinyin.get_pinyin(self.city, ''))
            while True:
                self.parse_page(city_py, cur_page, fw)
                if self.has_next(city_py, cur_page):
                    cur_page += 1
                else:
                    break
        print("小区数据已写入文件...")

    def parse_page(self, city_py, cur_page, fw):
        """Fetch one list page, extract every community on it and persist it.

        For each community the name, anjuke id, coordinates and the
        district/town/street address parts are scraped, written to ``fw``
        via write_file() and echoed to stdout so an interrupted crawl can
        be resumed from the right page.
        """
        response = self.get_page(city_py, cur_page)
        bs = BeautifulSoup(response.text, 'html.parser')
        # Hoisted out of the loops: the same pattern serves every link.
        id_pattern = re.compile('[0-9]{1,}')
        for li in bs.find_all('div', class_='li-itemmod'):
            for info in li.find_all('div', class_='li-info'):
                # The <h3> title holds the community name and its detail link.
                for title in info.find_all('h3'):
                    estate_name = title.get_text().strip()
                    # The anjuke id is the number embedded in the detail link,
                    # e.g. .../community/view/607976 -> 607976.
                    for link in title.find_all('a'):
                        get_link = link.attrs['href']
                        estate_id = id_pattern.findall(get_link)[0]
                        # Extra request per community for its coordinates.
                        lat, lng = self.get_location(city_py, estate_id)
                    # Address text looks like '[行政区-区镇] 街道'.
                    for address in info.find_all('address'):
                        ajk_address = address.get_text().strip().split(']')
                        district_town = ajk_address[0].split('[')[1].split('-')
                        estate_district = district_town[0]
                        estate_town = district_town[1]
                        estate_street = ajk_address[1]
                        self.write_file(estate_id, estate_name, lat, lng,
                                        estate_district, estate_town, estate_street, fw)
                        # Key progress line: after a crash it identifies the
                        # page/community reached so cur_page can be adjusted.
                        print(
                            '省份:' + self.province + '、城市:' + self.city + '、id:' + estate_id + '、小区:' + estate_name + '、纬度:' + lat +
                            '、经度:' + lng + '、行政区:' + estate_district + '、区镇/县:' + estate_town + '、街道:' + estate_street +
                            '、当前页:' + str(cur_page))

    def start(self):
        """Dispatch to save_file() or save_mysql() based on self.save_func."""
        if self.save_func == 'file':
            self.save_file()
        elif self.save_func == 'mysql':
            self.save_mysql()
        else:
            print('不支持的保存方式')
if __name__ == '__main__':
    # Target cities -- Guangdong: Guangzhou, Shenzhen, Qingyuan, Zhanjiang;
    # Hunan: Changsha, Hengyang.
    started = time.time()

    # Toggle the commented values to pick another run configuration.
    # province = '广东省'
    province = '湖南省'
    city = '衡阳'
    # save_func = 'file'
    save_func = 'mysql'

    AnJuKe(province, city, save_func).start()

    print("已完成,用时%.2f秒" % (time.time() - started))