Get a sample¶
In [6]:
#!/usr/bin/env python
#-*- coding: utf-8 -*-
import xml.etree.ElementTree as ET # Use cElementTree or lxml if too slow
OSM_FILE = "beijing_china.osm" # Replace this with your osm file
SAMPLE_FILE = "beijing_sample2222.osm"
k = 100 # Parameter: take every k-th top level element
def get_element(osm_file, tags=('node', 'way', 'relation')):
"""Yield element if it is the right type of tag
Reference:
http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
"""
context = iter(ET.iterparse(osm_file, events=('start', 'end')))
_, root = next(context)
for event, elem in context:
if event == 'end' and elem.tag in tags:
yield elem
root.clear()
with open(SAMPLE_FILE, 'wb') as output:
output.write('<?xml version="1.0" encoding="UTF-8"?>\n'.encode())
output.write('<osm>\n '.encode())
# Write every kth top level element
for i, element in enumerate(get_element(OSM_FILE)):
if i % k == 0:
output.write(ET.tostring(element, encoding='utf-8'))
output.write('</osm>'.encode())
In [44]:
count_tags('beijing_china.osm')
Out[44]:
In [2]:
#!/usr/bin/env python
#-*- coding: utf-8 -*-
import pandas as pd
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import csv
import codecs
import pprint
import re
from collections import defaultdict
#import goslate
import sqlite3
In [3]:
def tags_info(filename,elem_tag,tag_k):
'''this function can return tag information in ways or relation or nodes. elem_tag=[way,node,relation], tag_k = keys of tags
that you are interested,such as tag_k =[name,name_zh,name_en...]'''
tags_info_list = []
tags_info_k = []
for _,elem in ET.iterparse(filename,events=('start',)):
if elem.tag == elem_tag:
for child in elem:
if child.tag == 'tag':
tags_info_k.append(child.attrib['k'])
if child.attrib['k']==tag_k:
tags_info_list.append(child.attrib['v'])
else:
continue
return tags_info_list
def count_tags(filename):
tags = {}
taglist = []
for _,elem in ET.iterparse(filename,events=('start',)):
taglist.append(elem.tag)
for tag in taglist:
if tag not in tags:
tags[tag] = 1
else:
tags[tag] += 1
return tags
def process():
filename_path = r'beijing_china.osm'
with open(filename_path,'r',encoding='utf-8') as filename:
#tags = count_tags(filename)
tagsinfo = tags_info(filename,'node','cuisine')
return set(tagsinfo)
hh = process()
Audit and Update data set¶
In [34]:
#!/usr/bin/env python
#-*- coding: utf-8 -*-
mapping_dict = {
'Road':'路',
'Expressway':'高速公路',
'Lu':'路',
'lu':'路',
'Hutong':'胡同',
'hutong':'胡同',
'Embassy':'大使馆',
'Coffee':'咖啡厅'
}
def update_road_name(name,mapping_dict):
name_list = name.split()
if name_list[-1] in mapping_dict:
name_list[-1] = mapping_dict[name_list[-1]]
return ' '.join(name_list)
else:
return name
In [27]:
def update_cuisine(cuisine):
return cuisine.split(';')[0]
def update_phone(phone):
if len(phone) == 8:
return '+86010'+phone
elif len(phone) == 11:
return '+86'+phone
elif len(phone) >11:
return phone.replace(' ','')
elif len(phone) <8:
return 'Error'+'+'+phone
elif len(phone) >8 and len(phone) < 11:
return 'Error'+'+'+phone
def update_source(source):
import re
sourcename = re.split(r'[,;_. ]',source)
sourcename_0 = sourcename[0]
reg_bing = re.compile(re.escape('bing'), re.IGNORECASE)
reg_gps = re.compile(re.escape('gps'), re.IGNORECASE)
reg_yahoo = re.compile(re.escape('yahoo'), re.IGNORECASE)
reg_survey = re.compile(re.escape('survey'), re.IGNORECASE)
sourcename_0 = reg_bing.sub('Bing', sourcename_0)
sourcename_0 = reg_gps.sub('GPS', sourcename_0)
sourcename_0 = reg_yahoo.sub('Yahoo', sourcename_0)
sourcename_0 = reg_survey.sub('Survey', sourcename_0)
if sourcename_0 not in ['Bing','GPS','Yahoo','Survey']:
sourcename_0 = 'other'
return sourcename_0
build dict format¶
In [32]:
OSM_PATH = "beijing_china.osm"
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
#SCHEMA = schema.schema
# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']
def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
problem_chars=PROBLEMCHARS, default_tag_type='regular'):
"""Clean and shape node or way XML element to Python dict"""
node_attribs = {}
way_attribs = {}
way_nodes = []
tags = [] # Handle secondary tags the same way for both node and way elements
#tags_dict = {}
way_nodes_dict = {}
# YOUR CODE HERE
if element.tag == 'node':
for i in NODE_FIELDS:
node_attribs[i] = element.attrib[i]
if len(element):# this used to show if the element has child or not
for child in element:
tags_dict = {}
if child.tag == 'tag':
problems = PROBLEMCHARS.match(child.attrib['k'])
if problems:
continue
else:
colon = LOWER_COLON.match(child.attrib['k'])
if colon:
tags_dict['type'] = child.attrib['k'].split(':',1)[0]
tags_dict['key'] = child.attrib['k'].split(':',1)[-1]
else:
tags_dict['type'] = 'regular'
tags_dict['key'] = child.attrib['k']
tags_dict['id'] = element.attrib['id']
tags_dict['value'] = child.attrib['v']
if 'name' in tags_dict.values():
tags_dict['value'] = update_road_name(tags_dict['value'],mapping_dict)
elif 'phone' in tags_dict.values():
tags_dict['value'] = update_phone(tags_dict['value'])
elif 'source' in tags_dict.values():
tags_dict['value'] = update_source(tags_dict['value'])
elif 'cuisine' in tags_dict.values():
tags_dict['value'] = update_cuisine(tags_dict['value'])
#print (tags_dict['value'])
else:
continue
tags.append(tags_dict)
else:
tags = []
#print tags
res33 = {'node': node_attribs, 'node_tags': tags}
return res33
''' #这段代码用于调试
for i in res33['node_tags']:
if i['key'] == 'phone':
print (i['value'])'''
elif element.tag == 'way':
for i in WAY_FIELDS:
way_attribs[i] = element.attrib[i]
tag_list_elem = []
for elem in list(element):
tag_list_elem.append(elem.tag)
#print tag_list_elem
count = 0
if len(element):
for child in element:
tags_dict ={}
nds_dict = {}
if child.tag == 'tag':
problems = PROBLEMCHARS.match(child.attrib['k'])
if problems:
continue
else:
colon = LOWER_COLON.match(child.attrib['k'])
if colon:
tags_dict['type'] = child.attrib['k'].split(':',1)[0]
tags_dict['key'] = child.attrib['k'].split(':',1)[-1]
else:
tags_dict['type'] = 'regular'
tags_dict['key'] = child.attrib['k']
tags_dict['id'] = element.attrib['id']
#tags_dict['value'] = child.attrib['v']
if 'name' in tags_dict.values():
tags_dict['value'] = update_road_name(child.attrib['v'],mapping_dict)
elif 'phone' in tags_dict.values():
tags_dict['value'] = update_phone(child.attrib['v'])
#print (child.attrib['v'])
elif 'source' in tags_dict.values():
tags_dict['value'] = update_source(child.attrib['v'])
elif 'cuisine' in tags_dict.values():
tags_dict['value'] = update_cuisine(child.attrib['v'])
else:
tags_dict['value'] = child.attrib['v']
tags.append(tags_dict)
#pprint.pprint(tags_dict)
if child.tag == 'nd':
nds_dict['id'] = element.attrib['id']
nds_dict['node_id'] = child.attrib['ref']
nds_dict['position'] = count
count += 1
way_nodes.append(nds_dict)
#pprint.pprint(nds_dict)
#elif 'nd' not in tag_list_elem:
#way_nodes.append(nds_dict)
else:
tags = []
#print tags
res = {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}
return res
'''
for i in res['way_tags']:
if i['key'] == 'source':
print (i['value'])'''
#print (res)
Write into CSV¶
In [43]:
def get_element(osm_file, tags=('node', 'way', 'relation')):
"""Yield element if it is the right type of tag"""
context = ET.iterparse(osm_file, events=('start','end'))
_, root = next(context)
for event, elem in context:
if event == 'end' and elem.tag in tags:
yield elem
root.clear()
class UnicodeDictWriter(csv.DictWriter, object):
"""Extend csv.DictWriter to handle Unicode input"""
def writerow(self, row):
super(UnicodeDictWriter, self).writerow({
k: (v.encode('utf-8') if isinstance(v, str) else v) for k, v in row.items()
})
def writerows(self, rows):
for row in rows:
self.writerow(row)
# ================================================== #
# Main Function #
# ================================================== #
def process_map(file_in):
"""Iteratively process each XML element and write to csv(s)"""
with open(NODES_PATH, 'w',encoding='utf-8',newline = '') as nodes_file, \
open(NODE_TAGS_PATH, 'w',encoding='utf-8',newline = '') as nodes_tags_file, \
open(WAYS_PATH, 'w',encoding='utf-8',newline = '') as ways_file, \
open(WAY_NODES_PATH, 'w',encoding='utf-8',newline = '') as way_nodes_file, \
open(WAY_TAGS_PATH, 'w',encoding='utf-8',newline = '') as way_tags_file:
nodes_writer = csv.DictWriter(nodes_file, NODE_FIELDS)
node_tags_writer = csv.DictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
ways_writer = csv.DictWriter(ways_file, WAY_FIELDS)
way_nodes_writer = csv.DictWriter(way_nodes_file, WAY_NODES_FIELDS)
way_tags_writer = csv.DictWriter(way_tags_file, WAY_TAGS_FIELDS)
nodes_writer.writeheader()# means write a row with the field name
node_tags_writer.writeheader()
ways_writer.writeheader()
way_nodes_writer.writeheader()
way_tags_writer.writeheader()
#validator = cerberus.Validator()
for element in get_element(file_in, tags=('node', 'way')):
el = shape_element(element)
if el:
if element.tag == 'node':
# 这一段就是把shape_element得到的字典转化成csv,这里面是要一行一行地进行写入的
nodes_writer.writerow(el['node'])
node_tags_writer.writerows(el['node_tags']) #好奇
elif element.tag == 'way':
ways_writer.writerow(el['way'])
way_nodes_writer.writerows(el['way_nodes'])
way_tags_writer.writerows(el['way_tags'])
process_map(OSM_PATH)
CSV to SQL¶
In [40]:
import sqlite3
conn = sqlite3.connect('p3_project.db')
c = conn.cursor()
In [ ]:
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"
In [41]:
# read csv to database by using pandas and sqlite
df_nodes = pd.read_csv(NODES_PATH)
df_nodes_tags = pd.read_csv(NODE_TAGS_PATH)
df_ways = pd.read_csv(WAYS_PATH)
df_ways_nodes = pd.read_csv(WAY_NODES_PATH)
df_ways_tags = pd.read_csv(WAY_TAGS_PATH)
df_nodes.to_sql('nodes', conn, if_exists='replace', index=False)
df_nodes_tags.to_sql('nodes_tags', conn, if_exists='replace', index=False, schema = NODE_TAGS_FIELDS)
df_ways.to_sql('ways', conn, if_exists='replace', index=False)
df_ways_tags.to_sql('ways_tags', conn, if_exists='replace', index=False)
df_ways_nodes.to_sql('ways_nodes', conn, if_exists='replace', index=False)
In [451]:
c.execute('select value,count(*) as num from nodes_tags where key = "cuisine" group by value order by num desc').fetchall()
Out[451]:
In [439]:
c.execute('select * from nodes_tags where value like "%大使馆"').fetchall()
Out[439]:
In [440]:
c.execute('select user,count(*) as num from nodes group by user order by num desc').fetchall()
Out[440]: