Browse Source

新增安居客网站小区爬取类

develop
chenhong 6 years ago
parent
commit
787f7915f7
1 changed files with 216 additions and 0 deletions
  1. 216
      anjuke.py

216
anjuke.py

@ -0,0 +1,216 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# 文件名:anjuke.py
# 日期:2019-08-08
"""
AnJuke类的作用是模拟浏览器
.txt文件.txt文件并插入到指定数据库表
AnJuke类的实例
ajk = AnJuKe(province, city, save_func)
province .txt文件的数据源之一
city
save_func 1file 2mysql
ajk.start()
start() self.save_func的值 save_file() save_mysql()
save_file() get_page() write_file().txt文件
save_mysql() read_file().txt文件 insert_mysql(data_list)
1save_file()cur_page默认为1.txt文件末尾
2print打印的当前页数及小区cur_page继续爬取
"""
from bs4 import BeautifulSoup
from urllib import parse
from xpinyin import Pinyin
from pymysql import connect
import re
import requests
import time
class AnJuKe:
    """Crawler for residential-community listings on anjuke.com.

    Fetches paginated community lists for one city, resolves each community's
    latitude/longitude from its detail page, and persists records either to a
    caret-separated .txt file or to a MySQL table (via that .txt file).
    """

    # Compiled once: extracts the numeric community id from a detail-page URL.
    _ID_PATTERN = re.compile('[0-9]{1,}')

    def __init__(self, province, city, save_func):
        """Initialize request headers, target location and output file path.

        province  -- province name written into every record
        city      -- city name (converted to pinyin to build URLs)
        save_func -- 'file' or 'mysql'; selects the persistence path in start()
        """
        # Browser-like headers to reduce the chance of being blocked.
        # Fix: the original Referer contained stray spaces
        # ('https: // wuhan.anjuke.com / sale /?from=navigation'),
        # which is not a valid URL; normalized here.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Referer': 'https://wuhan.anjuke.com/sale/?from=navigation',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        }
        self.save_func = save_func
        self.province = province
        self.city = city
        # Crawl date, used to build a per-day output file name.
        self.time = time.strftime("%Y-%m-%d", time.localtime())
        self.file_path = 'C:\\Users\\Administrator\\Desktop\\anjuke_{}_{}_{}.txt'.format(
            self.province, self.city, self.time)

    def get_page(self, city_py, cur_page):
        """Fetch and return the HTTP response for one community-list page."""
        url = 'https://{}.anjuke.com/community/p{}/'.format(city_py, cur_page)
        return requests.get(url, headers=self.headers, timeout=30)

    def get_location(self, city_py, estate_id):
        """Return (latitude, longitude) scraped from a community detail page.

        The coordinates live in the fragment of the map link as 'l1' / 'l2'
        query parameters. NOTE(review): if the page has no 'comm-title' div or
        no 'map-link' anchor this method falls through and returns None —
        callers unpack the result, so a missing map link raises downstream.
        """
        url = 'https://{}.anjuke.com/community/view/{}'.format(city_py, estate_id)
        response = requests.get(url, headers=self.headers, timeout=30)
        bs = BeautifulSoup(response.text, 'html.parser')
        for title in bs.find_all('div', class_='comm-title'):
            for link in title.find_all('a', class_='map-link'):
                # Parse the URL; coordinates are encoded in the fragment part.
                url_data = parse.urlparse(link.get('href'))
                query_dict = parse.parse_qs(url_data.fragment)
                latitude = query_dict['l1'][0]
                longitude = query_dict['l2'][0]
                return latitude, longitude

    def has_next(self, city_py, cur_page):
        """Return the 'next page' href if the current page has one, else None.

        NOTE(review): this re-fetches the page that parse_page() already
        downloaded, doubling requests per page; kept to preserve behavior.
        """
        response = self.get_page(city_py, cur_page)
        bs = BeautifulSoup(response.text, 'html.parser')
        for page in bs.find_all('div', class_='multi-page'):
            for anxt in page.find_all('a', class_='aNxt'):
                return anxt.attrs['href']

    def write_file(self, estate_id, estate_name, lat, lng, estate_district,
                   estate_town, estate_street, fw):
        """Append one '^'-separated record (province..street) to file object fw."""
        fields = (self.province, self.city, estate_id, estate_name, lat, lng,
                  estate_district, estate_town, estate_street)
        # Single join instead of repeated '+' concatenation; each field is
        # stripped exactly as the original did.
        fw.write('^'.join(field.strip() for field in fields) + '\n')

    def read_file(self):
        """Load the .txt file and return a list of field lists.

        Returns [] when the file does not exist, so save_mysql() can report
        'file missing or empty' instead of crashing with FileNotFoundError.
        """
        try:
            # UTF-8-sig tolerates a BOM left by Windows editors.
            with open(self.file_path, "r", encoding='UTF-8-sig') as open_file:
                data_list = [line.rstrip('\n').split('^') for line in open_file]
        except FileNotFoundError:
            return []
        print(data_list)
        return data_list

    def insert_mysql(self, data_list):
        """Create the target table if needed and bulk-insert data_list.

        SECURITY NOTE(review): database credentials are hard-coded in source;
        they should be moved to configuration/environment variables.
        """
        print("连接数据库")
        db = connect(host='47.106.79.88', port=3306, db='1hjz', user='root', passwd='u2018@U2018')
        cursor = db.cursor()
        insert_sql = "INSERT INTO import_estate(province_name,city_name,estate_id,estate_name,lat,lng,estate_district,estate_town,estate_street) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        try:
            cursor.execute('''CREATE TABLE if not exists `import_estate` (
`id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
`province_name` varchar(255) DEFAULT '' COMMENT '省份名称',
`city_name` varchar(255) DEFAULT '' COMMENT '城市名称',
`estate_id` varchar(255) DEFAULT '' COMMENT '安居客的小区id',
`estate_name` varchar(255) DEFAULT '' COMMENT '小区名称',
`lat` decimal(10,6) DEFAULT '0.000000' COMMENT '纬度',
`lng` decimal(10,6) DEFAULT '0.000000' COMMENT '经度',
`estate_district` varchar(255) DEFAULT '' COMMENT '行政区',
`estate_town` varchar(255) DEFAULT '' COMMENT '区镇/县',
`estate_street` varchar(255) DEFAULT '' COMMENT '街道',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=COMPACT COMMENT='小区信息导入表';''')
            cursor.executemany(insert_sql, data_list)
            db.commit()
        except Exception as e:
            print(e)
            db.rollback()
        finally:
            # Fix: always release DB resources, even when commit itself raises.
            cursor.close()
            db.close()

    def save_mysql(self):
        """Read the .txt file and import its records into MySQL."""
        print("等待写入数据库...")
        data_list = self.read_file()
        if data_list:
            self.insert_mysql(data_list)
            print("写入数据库完成!")
        else:
            print("文件不存在或没有数据")

    def save_file(self):
        """Crawl every list page for the city and append records to the .txt file."""
        print("开始爬取{}小区数据,请稍等...".format(self.city))
        # Full pinyin of the city name forms the URL subdomain.
        city_py = str(Pinyin().get_pinyin(self.city, ''))
        cur_page = 1
        # Fix: context manager guarantees the file is closed even if a page
        # raises mid-crawl (previous records stay flushed for resuming).
        with open(self.file_path, "a", encoding='utf-8') as fw:
            while True:
                self.parse_page(city_py, cur_page, fw)
                if not self.has_next(city_py, cur_page):
                    break
                cur_page += 1
        print("小区数据已写入文件...")

    def parse_page(self, city_py, cur_page, fw):
        """Scrape one list page: extract id, name, coordinates and address,
        write each community to the file and print progress for resuming."""
        response = self.get_page(city_py, cur_page)
        bs = BeautifulSoup(response.text, 'html.parser')
        for li in bs.find_all('div', class_='li-itemmod'):
            for info in li.find_all('div', class_='li-info'):
                for title in info.find_all('h3'):
                    estate_name = title.get_text().strip()
                    # Community id is the number in the detail-page URL, e.g.
                    # https://guangzhou.anjuke.com/community/view/607976
                    for link in title.find_all('a'):
                        estate_id = self._ID_PATTERN.findall(link.attrs['href'])[0]
                    estate_location = self.get_location(city_py, estate_id)
                    lat = estate_location[0]
                    lng = estate_location[1]
                    for address in info.find_all('address'):
                        # NOTE(review): str.split('') raises ValueError — the
                        # delimiter characters were evidently lost when this
                        # file was transcribed; recover the original separators
                        # (likely bracket/space characters) before running.
                        ajk_address = address.get_text().strip().split('')
                        estate_district = ajk_address[0].split('')[1].split('-')[0]
                        estate_town = ajk_address[0].split('')[1].split('-')[1]
                        estate_street = ajk_address[1]
                    self.write_file(estate_id, estate_name, lat, lng,
                                    estate_district, estate_town, estate_street, fw)
                    # Progress line: on failure it shows which page/community
                    # was reached so the crawl can resume from cur_page.
                    print(
                        '省份:' + self.province + '、城市:' + self.city + '、id:' + estate_id + '、小区:' + estate_name + '、纬度:' + lat +
                        '、经度:' + lng + '、行政区:' + estate_district + '、区镇/县:' + estate_town + '、街道:' + estate_street +
                        '、当前页:' + str(cur_page))

    def start(self):
        """Dispatch to save_file() or save_mysql() based on self.save_func."""
        if self.save_func == 'file':
            self.save_file()
        elif self.save_func == 'mysql':
            self.save_mysql()
        else:
            print('不支持的保存方式')
if __name__ == '__main__':
    # Target cities — Guangdong: Guangzhou, Shenzhen, Qingyuan, Zhanjiang;
    # Hunan: Changsha, Hengyang.
    start_time = time.time()
    # province = '广东省'
    province = '湖南省'
    city = '衡阳'
    # save_func = 'file'
    save_func = 'mysql'
    AnJuKe(province, city, save_func).start()
    elapsed = time.time() - start_time
    print("已完成,用时%.2f" % elapsed)
Loading…
Cancel
Save