from .utils import remove_non_ascii, fuzzy_match
from collections import Counter
from geograpy.locator import Locator, City, Region
"""
Takes a list of place names and works out each place's designation (country, region, etc.)
and the relationships between places (a city is inside a region, which is inside a country, etc.)
"""
class PlaceContext(Locator):
    '''
    Adds context information to a list of place names

    The place names are classified into countries, regions, cities and
    "other"; the results are stored as attributes on the instance
    (``countries``, ``regions``, ``cities``, ``other``, the ``*_mentions``
    lists, ``country_regions``, ``country_cities`` and ``address_strings``).

    Database access (``self.sqlDB``) and helpers such as ``getCountry``,
    ``normalizePlaces``, ``correct_country_misspelling``, ``db_has_data`` and
    ``populate_db`` are not defined here and are expected to be provided by
    the ``Locator`` base class.
    '''

    def __init__(self, place_names: list, setAll: bool = True, correctMisspelling: bool = False):
        '''
        Constructor

        Args:
            place_names(list): the place names to check
            setAll(bool): True if all context information should immediately be set
            correctMisspelling(bool): stored on the instance; when True,
                get_region_names passes the country name through
                correct_country_misspelling before querying
        '''
        super().__init__()
        self.correctMisspelling = correctMisspelling
        self.places = self.normalizePlaces(place_names)
        # Start with empty results so that the lazy "if not self.xxx" guards
        # in the set_* methods and __str__ also work when setAll is False.
        self.countries = []
        self.regions = []
        self.cities = []
        self.other = []
        self.country_mentions = []
        self.region_mentions = []
        self.city_mentions = []
        self.country_regions = {}
        self.country_cities = {}
        self.address_strings = []
        if setAll:
            self.setAll()

    def __str__(self) -> str:
        '''
        Return a string representation of me

        Returns:
            str: the countries, regions, cities and other places, one category per line
        '''
        return "countries=%s\nregions=%s\ncities=%s\nother=%s" % (
            self.countries, self.regions, self.cities, self.other)

    def getRegions(self, countryName: str) -> list:
        '''
        get a list of regions for the given countryName

        Args:
            countryName(str): the countryName to check (matched exactly against the country name)

        Returns:
            list: one Region (built with Region.fromRecord) per matching region record
        '''
        queryString="""SELECT r.* FROM
COUNTRIES c
JOIN regions r ON r.countryId=c.wikidataid
WHERE c.name=(?)"""
        params = (countryName,)
        regionRecords = self.sqlDB.query(queryString, params)
        return [Region.fromRecord(regionRecord) for regionRecord in regionRecords]

    def get_region_names(self, countryName: str) -> list:
        '''
        get region names for the given country

        The country is looked up with LIKE against both the country name and
        the country labels, so SQL LIKE wildcards in countryName are honoured.

        Args:
            countryName(str): the name of the country

        Returns:
            list: the "name" value of every matching region record
        '''
        if self.correctMisspelling:
            countryName = self.correct_country_misspelling(countryName)
        regionOfCountryQuery="""SELECT name
FROM regions
WHERE countryId IN (
SELECT wikidataid
FROM countries
WHERE name LIKE (?)
OR wikidataid IN (
SELECT wikidataid
FROM country_labels
WHERE label LIKE (?)
)
)"""
        regionRecords = self.sqlDB.query(regionOfCountryQuery, params=(countryName, countryName,))
        return [r.get('name') for r in regionRecords]

    def setAll(self) -> None:
        '''
        Set all context information

        Runs the country, region, city and "other" classification in that order.
        '''
        self.set_countries()
        self.set_regions()
        self.set_cities()
        self.set_other()

    def set_countries(self) -> None:
        '''
        get the country information from my places

        Sets:
            country_mentions: list of (country name, count) tuples, most common first
            countries: list of the distinct country names (order unspecified)
        '''
        countries = []
        for place in self.places:
            country = self.getCountry(place)
            if country is not None:
                countries.append(country.name)
        self.country_mentions = Counter(countries).most_common()
        self.countries = list(set(countries))

    def set_regions(self) -> None:
        '''
        get the region information from my places (limited to the already identified countries)

        Sets:
            country_regions: dict mapping each country to the places matching one of its regions
            region_mentions: list of (place, count) tuples, most common first
            regions: list of the distinct matching places (order unspecified)
        '''
        regions = []
        self.country_regions = {}
        if not self.countries:
            self.set_countries()

        def is_region(place_name: str, region_names: list) -> bool:
            '''
            Check whether the given place name is similar to any of the given region names.

            Both names are stripped of non ascii characters and compared with
            fuzzy_match (the original documentation states a similarity
            threshold of 80% - confirm against utils.fuzzy_match).

            Args:
                place_name(str): place name to check against the regions
                region_names(list): List of valid region names

            Returns:
                bool: True if at least one region name matches, otherwise False
            '''
            ascii_place_name = remove_non_ascii(place_name)
            return any(fuzzy_match(ascii_place_name, remove_non_ascii(region_name))
                       for region_name in region_names)

        # The places do not change while looping over the countries.
        unique_places = set(self.places)
        for country in self.countries:
            region_names = self.get_region_names(country)
            matched_regions = [p for p in unique_places if is_region(p, region_names)]
            regions += matched_regions
            self.country_regions[country] = list(set(matched_regions))
        # NOTE: every place is tested once per country (duplicates were removed
        # above), so these counts reflect in how many countries a place matched
        # a region, not how often the place occurs in self.places.
        self.region_mentions = Counter(regions).most_common()
        self.regions = list(set(regions))

    def set_cities(self) -> None:
        '''
        set the cities information

        Looks up all places in the CityLookup table and processes the hits in
        descending order of population (records without "pop" count as 0).

        Sets:
            cities: distinct city names in processing order
            country_cities: dict mapping each country name to its city names
            address_strings: "city, region, country" for cities whose region
                is among the regions already found for their country
            city_mentions: list of (place, count) tuples, most common first

        Countries of found cities that are not yet known are appended to
        countries and to country_mentions (with a count of 1).
        '''
        self.cities = []
        self.country_cities = {}
        self.address_strings = []
        if not self.countries:
            self.set_countries()
        if not self.regions:
            self.set_regions()
        if not self.db_has_data():
            self.populate_db()
        # ToDo: Duplicate with Locator.city_for_name e.g. extend method to support multiple names
        placesWithoutDuplicates = list(set(self.places))
        if placesWithoutDuplicates:
            params = ",".join("?" * len(placesWithoutDuplicates))
            query = "SELECT * FROM CityLookup WHERE name IN (" + params + ")"
            cityLookupRecords = self.sqlDB.query(query, placesWithoutDuplicates)
        else:
            # Nothing to look up - avoids issuing a query with an empty IN () list.
            cityLookupRecords = []

        def population(cityRecord) -> float:
            '''Return the population of the record as float, 0.0 if it is missing.'''
            pop = cityRecord.get('pop')
            return float(pop) if pop is not None else 0.0

        for cityLookupRecord in sorted(cityLookupRecords, key=population, reverse=True):
            city = City.fromCityLookup(cityLookupRecord)
            if city.name not in self.cities:
                self.cities.append(city.name)
            countryName = city.country.name
            if countryName not in self.countries:
                self.countries.append(countryName)
                self.country_mentions.append((countryName, 1))
            citiesOfCountry = self.country_cities.setdefault(countryName, [])
            if city.name not in citiesOfCountry:
                citiesOfCountry.append(city.name)
            regionName = city.region.name
            if regionName in self.country_regions.get(countryName, ()):
                address = f"{city.name}, {regionName}, {countryName}"
                self.address_strings.append(address)
        # Count every occurrence (including duplicates) of a place that turned out to be a city.
        knownCities = set(self.cities)
        all_cities = [p for p in self.places if p in knownCities]
        self.city_mentions = Counter(all_cities).most_common()

    def set_other(self) -> None:
        '''
        set the places that are neither a known country, city nor region

        Each place is passed through correct_country_misspelling (regardless
        of the correctMisspelling flag) and kept in other when the result is
        not contained in countries, cities or regions.
        '''
        if not self.cities:
            self.set_cities()
        known_names = set(self.countries) | set(self.cities) | set(self.regions)

        def unused(place_name) -> bool:
            '''Return True if the (corrected) place name is in none of the known categories.'''
            return self.correct_country_misspelling(place_name) not in known_names

        self.other = [p for p in self.places if unused(p)]