说明:城市吧(www.city8.com)是以实景地图搜索服务为特色的网站。相对于传统的地图搜索服务,除文字和二维地图的信息,城市吧可以提供每个位置点对应的360度真实场景。
最近通过 city8 爬取全国各城市的道路名称,参考了 Alfred数据室 的项目 Alfred1984-interesting-python-Roads,并将原来的 MongoDB(MongoClient)存储换成了 MySQL。
运行前要先修改代码中的数据库连接信息,并手动创建 city_road 表:字段名为 (city, road),类型为 (text, text)。
import re
import requests
from lxml import etree
import pymysql
class Roads(object):
    """Crawl road names for the cities listed on city8.com into MySQL.

    Typical use is ``get_city_list()``, ``get_alphabet()`` and then
    ``get_city_roads()``.  Rows are written to a ``city_road`` table with
    the columns ``city`` and ``road``; the table must already exist.

    Attributes are plain instance fields:
      * ``city_list`` - dict mapping city name -> site href, or None
        until ``get_city_list()`` has run.
      * ``alphabets`` - list of index letters used in the road listing
        URLs, or None until ``get_alphabet()`` has run.
      * ``na_city``   - names of cities whose site exposes no road index.
      * ``client`` / ``col`` - the pymysql connection and its cursor.
    """

    # Seconds to wait for a single HTTP response; without a timeout a
    # stalled server would block the whole crawl forever.
    REQUEST_TIMEOUT = 10

    # Placeholders are filled in by the driver, so road names containing
    # quotes can neither break the statement nor inject SQL.
    INSERT_SQL = "insert into city_road (city,road) values(%s,%s)"

    def __init__(self, host="119.23.111.11", user="root",
                 password="password", database="cityroad"):
        """Open the MySQL connection used to store crawled roads.

        Args:
            host: MySQL server address.
            user: MySQL user name.
            password: MySQL password.
            database: name of the database holding ``city_road``.

        The defaults reproduce the connection values that were previously
        hard-coded, so ``Roads()`` keeps working unchanged.
        """
        self.city_list = None
        self.alphabets = None
        self.na_city = []
        # Keyword arguments: PyMySQL 1.0+ no longer accepts positional ones.
        # utf8mb4 so that Chinese city/road names are stored correctly.
        self.client = pymysql.connect(
            host=host,
            user=user,
            password=password,
            database=database,
            charset="utf8mb4",
        )
        self.col = self.client.cursor()

    @staticmethod
    def _city_url(href, path=''):
        """Return an absolute http URL for *href* with *path* appended.

        A protocol-relative href such as ``//bj.city8.com`` gets ``http:``
        prepended; an href that already carries a scheme is used as is.
        """
        if href.startswith(('http://', 'https://')):
            return href + path
        return 'http:' + href + path

    def _insert_roads(self, city, roads):
        """Insert every road of *city* in one batch and commit once."""
        rows = [(city, rd.strip()) for rd in roads]
        self.col.executemany(self.INSERT_SQL, rows)
        self.client.commit()

    def get_city_list(self):
        """Download the city8 home page and fill ``self.city_list``.

        City names and hrefs are extracted with two independent regular
        expressions and paired by position.
        NOTE(review): the pairing assumes both patterns match the same
        anchors in the same order - confirm against the live page.
        """
        url = 'http://www.city8.com/'
        res = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        pat1 = r"city8.com/'>(.*?)</a></li>"
        pat2 = r"<a target='_blank' href='(.*?)/'>"
        city = re.findall(string=res.text, pattern=pat1)
        href = re.findall(string=res.text, pattern=pat2)
        self.city_list = dict(zip(city, href))
        print('Got city list!')
        print(self.city_list)

    def get_alphabet(self):
        """Read the index letters from a sample road page into ``self.alphabets``."""
        url = 'http://xm.city8.com/road/A/'
        res = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        parsed = etree.HTML(res.text)
        self.alphabets = parsed.xpath("/html/body/div/div[2]/div[3]/div[1]/div[1]/div[1]/a/text()")
        print('Got alphabets!')
        print(self.alphabets)

    def get_city_roads(self):
        """Crawl the road listing of every city and store it in MySQL.

        For each city the ``/road`` page is probed first; cities whose
        page contains no ``/road/a/`` link are recorded in
        ``self.na_city`` and skipped.  Otherwise one page per index letter
        is fetched and its road names are inserted.

        If ``get_city_list()`` or ``get_alphabet()`` has not been called
        yet, it is called here instead of failing on ``None``.
        """
        if self.city_list is None:
            self.get_city_list()
        if self.alphabets is None:
            self.get_alphabet()

        for city, href in self.city_list.items():
            res_test = requests.get(self._city_url(href, '/road'),
                                    timeout=self.REQUEST_TIMEOUT)
            print(res_test)
            if '/road/a/' not in res_test.text:
                print('There is no road data of city: {}'.format(city))
                self.na_city.append(city)
                continue

            print('Crawling road data of city: {}'.format(city))
            for alpha in self.alphabets:
                res_road = requests.get(self._city_url(href, '/road/' + alpha),
                                        timeout=self.REQUEST_TIMEOUT)
                parsed = etree.HTML(res_road.text)
                roads = parsed.xpath('/html/body/div/div[2]/div[3]/div[1]/div[2]/a/text()')
                if roads:
                    self._insert_roads(city, roads)
                    print('Successfully crawled city: {}, alphabet: {}'.format(city, alpha))
                else:
                    print('City: {} alphabet: {} got no data'.format(city, alpha))

        print('These are cities with no road data: {} \n '
              'You might want to crawl road data of these cities from elsewhere.'.format(self.na_city))
if __name__ == '__main__':
    # Script entry point: build the crawler, then run its three stages in
    # order - city list, index letters, and finally the per-city road crawl.
    crawler = Roads()
    for stage in (crawler.get_city_list,
                  crawler.get_alphabet,
                  crawler.get_city_roads):
        stage()
版权声明:本文为原创文章,版权归 Helo 所有。
本文链接:https://www.ishelo.com/archives/221/
商业转载请联系作者获得授权,非商业转载请注明出处。
Comment here is closed