Source code for geograpy.wikidata

Created on 2020-09-23

@author: wf
import re
import time
from lodstorage.sparql import SPARQL

class Wikidata(object):
    '''
    Wikidata access

    Thin wrapper that sends SPARQL queries for cities, regions and
    countries to a Wikidata endpoint and returns the result rows as
    lists of dicts.
    '''

    # A signed decimal number; a decimal comma is tolerated as well as a point.
    _FLOAT_REGEX = r"[-+]?\d+([.,]\d*)?"
    # WKT point literal as used for coordinates, e.g. 'Point(-118.25 35.05694444)'
    # (longitude first, latitude second).
    _POINT_PATTERN = re.compile(
        fr"Point\((?P<lon>{_FLOAT_REGEX})\s+(?P<lat>{_FLOAT_REGEX})\)"
    )
    # Wikidata item (Q...) or property (P...) identifier.
    _WIKIDATA_ID_PATTERN = re.compile(r"[PQ][1-9]\d*")

    def __init__(self, endpoint='https://query.wikidata.org/sparql'):
        '''
        Constructor

        Args:
            endpoint(str): URL of the SPARQL endpoint to query
        '''
        self.endpoint = endpoint

    def _query(self, queryString: str) -> list:
        '''
        run the given SPARQL query against the configured endpoint

        Args:
            queryString(str): the SPARQL query to execute

        Returns:
            list: the query result converted by SPARQL.asListOfDicts
        '''
        wd = SPARQL(self.endpoint)
        results = wd.query(queryString)
        return wd.asListOfDicts(results)

    def getCityPopulations(self, profile=True):
        '''
        get the city populations from Wikidata

        Args:
            profile(bool): if True show profiling information

        Returns:
            list: one dict per city record returned by the query
        '''
        queryString = """
# get a list of human settlements having a geoName identifier
# to add to geograpy3 library
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wd: <http://www.wikidata.org/entity/>
SELECT ?city ?cityLabel ?cityPop ?geoNameId ?country ?countryLabel ?countryIsoCode ?countryPopulation
WHERE {
  # geoName Identifier
  ?city wdt:P1566 ?geoNameId.
  # instance of human settlement
  ?city wdt:P31/wdt:P279* wd:Q486972 .
  # population of city
  OPTIONAL { ?city wdt:P1082 ?cityPop.}
  # label of the City
  ?city rdfs:label ?cityLabel filter (lang(?cityLabel) = "en").
  # country this city belongs to
  ?city wdt:P17 ?country .
  # label for the country
  ?country rdfs:label ?countryLabel filter (lang(?countryLabel) = "en").
  # ISO 3166-1 alpha-2 code
  ?country wdt:P297 ?countryIsoCode.
  # population of country
  ?country wdt:P1082 ?countryPopulation.
  OPTIONAL {
    ?country wdt:P2132 ?countryGdpPerCapita.
  }
}"""
        if profile:
            print("getting cities with population and geoNamesId from wikidata endpoint %s" % self.endpoint)
            starttime = time.time()
        cityList = self._query(queryString)
        if profile:
            print("Found %d cities in %5.1f s" % (len(cityList), time.time() - starttime))
        return cityList

    def getCities(self, region=None, country=None):
        '''
        get the cities from Wikidata

        Args:
            region: wikidata id or list of wikidata ids of regions.
                Limits the returned cities to the given regions
            country: wikidata id or list of wikidata ids of countries.
                Limits the returned cities to the given countries

        Returns:
            list: one dict per city record, ordered by city label
        '''
        valueClauses = []
        if region is not None:
            valueClauses.append(Wikidata.getValuesClause("region", region))
        if country is not None:
            valueClauses.append(Wikidata.getValuesClause("country", country))
        values = "\n  ".join(valueClauses)
        queryString = """# get a list of cities for the given region
# for geograpy3 library
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wd: <http://www.wikidata.org/entity/>
SELECT DISTINCT ?city ?cityLabel ?geoNameId ?cityPop ?cityCoord ?region ?regionLabel ?regionIsoCode ?country ?countryLabel ?countryIsoCode ?countryPopulation ?countryGdpPerCapita
WHERE {
  # administrative unit of first order
  # example DE-NW Q1198
  %s
  #?region wdt:P31/wdt:P279* wd:Q10864048.
  ?region rdfs:label ?regionLabel filter (lang(?regionLabel) = "en").
  # isocode state/province
  OPTIONAL { ?region wdt:P300 ?regionIsoCode. }
  # country this region belongs to
  ?region wdt:P17 ?country .
  # label for the country
  ?country rdfs:label ?countryLabel filter (lang(?countryLabel) = "en").
  # ISO 3166-1 alpha-2 code
  ?country wdt:P297 ?countryIsoCode.
  # population of country
  ?country wdt:P1082 ?countryPopulation.
  OPTIONAL {
    ?country wdt:P2132 ?countryGdpPerCapita.
  }
  # located in administrative territory
  ?city wdt:P131* ?region.
  # label of the City
  ?city rdfs:label ?cityLabel filter (lang(?cityLabel) = "en").
  # instance of human settlement
  ?city wdt:P31/wdt:P279* wd:Q486972 .
  # geoName Identifier
  ?city wdt:P1566 ?geoNameId.
  # population of city
  OPTIONAL { ?city wdt:P1082 ?cityPop.}
  # get the coordinates
  OPTIONAL {
    ?city wdt:P625 ?cityCoord.
  }
}
ORDER BY ?cityLabel""" % values
        return self._query(queryString)

    def getCountries(self):
        '''
        get a list of countries

        The result is stored in self.countryList and also returned.

        Returns:
            list: one dict per country record, ordered by ISO code
        '''
        queryString = """# get a list of countries
# for geograpy3 library
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
# get City details with Country
SELECT DISTINCT ?country ?countryLabel ?countryIsoCode ?countryCoord ?countryPopulation ?continent ?continentLabel
WHERE {
  # instance of Country
  ?country wdt:P31/wdt:P279* wd:Q6256 .
  # VALUES ?country { wd:Q55}.
  # label for the country
  ?country rdfs:label ?countryLabel filter (lang(?countryLabel) = "en").
  # get the continent (s)
  #OPTIONAL {
  #  ?country wdt:P30 ?continent.
  #  ?continent rdfs:label ?continentLabel filter (lang(?continentLabel) = "en").
  #}
  # get the coordinates
  OPTIONAL {
    ?country wdt:P625 ?countryCoord.
  }
  # ISO 3166-1 alpha-2 code
  ?country wdt:P297 ?countryIsoCode.
  # population of country
  OPTIONAL {
    SELECT ?country (max(?countryPopulationValue) as ?countryPopulation)
    WHERE {
      ?country wdt:P1082 ?countryPopulationValue
    } group by ?country
  }
  # nominal GDP per capita
  # OPTIONAL { ?country wdt:P2132 ?countryGDP_perCapitaValue. }
}
ORDER BY ?countryIsoCode"""
        self.countryList = self._query(queryString)
        return self.countryList

    def getRegions(self):
        '''
        get Regions from Wikidata

        The result is stored in self.regionList and also returned.

        Returns:
            list: one dict per region record, ordered by region ISO code
        '''
        queryString = """# get a list of regions
# for geograpy3 library
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wikibase: <http://wikiba.se/ontology#>
SELECT DISTINCT ?country ?countryLabel ?countryIsoCode ?region ?regionLabel ?regionIsoCode ?regionPopulation ?location
WHERE {
  # administrative unit of first order
  ?region wdt:P31/wdt:P279* wd:Q10864048.
  OPTIONAL {
    ?region rdfs:label ?regionLabel filter (lang(?regionLabel) = "en").
  }
  # isocode state/province (mandatory - filters historic regions while at it ...)
  # filter historic regions
  # FILTER NOT EXISTS {?region wdt:P576 ?end}
  {
    SELECT ?region (max(?regionAlpha2) as ?regionIsoCode) (max(?regionPopulationValue) as ?regionPopulation) (max(?locationValue) as ?location)
    WHERE {
      ?region wdt:P300 ?regionAlpha2.
      # get the population
      OPTIONAL {
        ?region wdt:P1082 ?regionPopulationValue
      }
      # get the location
      OPTIONAL {
        ?region wdt:P625 ?locationValue.
      }
    } GROUP BY ?region
  }
  OPTIONAL {
    ?region wdt:P17 ?country.
    # label for the country
    ?country rdfs:label ?countryLabel filter (lang(?countryLabel) = "en").
    ?country wdt:P297 ?countryIsoCode.
  }
}
ORDER BY ?regionIsoCode"""
        self.regionList = self._query(queryString)
        return self.regionList

    def getCitiesOfRegion(self, regionWikidataId: str, limit: int):
        '''
        Queries the cities of the given region.

        The cities are ordered by population (descending) and the result
        size is limited by the given limit. A failing query is reported
        on stdout and yields an empty list instead of raising.

        Args:
            regionWikidataId(str): wikidata id of the region the cities should be queried for
            limit(int): limits the amount of returned cities

        Returns:
            list: city records of the given region ordered by population;
            presumably keyed by the query variables city, cityLabel,
            cityPop and cityCoord where bound
        '''
        # The property path follows "located in the administrative
        # territorial entity" (P131) backwards for one to four levels.
        query = """
SELECT distinct ?city ?cityLabel ?cityPop ?cityCoord
WHERE {
  VALUES ?possibleCityID {wd:Q1549591 wd:Q515 wd:Q1637706 wd:Q1093829 wd:Q486972}
  wd:%s (^wdt:P131|^wdt:P131/^wdt:P131|^wdt:P131/^wdt:P131/^wdt:P131|^wdt:P131/^wdt:P131/^wdt:P131/^wdt:P131) ?city .
  {
    ?city wdt:P31 ?x .
    ?x wdt:P279 ?possibleCityID .
  }UNION{
    ?city wdt:P31 ?possibleCityID .
  }
  OPTIONAL{
    ?city rdfs:label ?cityLabel .
    FILTER(lang(?cityLabel)="en")
  }
  OPTIONAL{
    ?city wdt:P1082 ?cityPop .
  }
  OPTIONAL{
    ?city wdt:P625 ?cityCoord .
  }
}
ORDER BY DESC(?cityPop)
LIMIT %s
""" % (regionWikidataId, limit)
        askIfCity = """
SELECT *
WHERE{
  VALUES ?possibleCityID {wd:Q1549591 wd:Q515 wd:Q1637706 wd:Q1093829 wd:Q486972 wd:Q133442}
  wd:%s wdt:P31 ?possibleCityID .
}
""" % (regionWikidataId)
        # check if region is city (city-state)
        try:
            # TODO: return region as city once city class is refactored;
            # until then the outcome of this check is deliberately unused
            self._query(askIfCity)
        except Exception as e:
            print(e)
        try:
            return self._query(query)
        except Exception as e:
            # best effort: report the problem and fall through to an empty result
            print(e)
        return []

    @staticmethod
    def getCoordinateComponents(coordinate: str) -> (float, float):
        '''
        Converts the wikidata coordinate representation into its
        subcomponents latitude and longitude

        Example:
            'Point(-118.25 35.05694444)' results in (35.05694444, -118.25)

        Args:
            coordinate(str): coordinate value in the format as returned by wikidata queries

        Returns:
            tuple: (latitude, longitude) as floats, or (None, None) if the
            coordinate is empty or does not have the expected format
        '''
        if not coordinate:
            return None, None
        try:
            cMatch = Wikidata._POINT_PATTERN.search(coordinate)
        except TypeError:
            # not a string - treat like an unparsable coordinate
            cMatch = None
        if cMatch is None:
            # coordinate does not have the expected format
            return None, None
        lat = float(cMatch.group("lat").replace(",", "."))
        lon = float(cMatch.group("lon").replace(",", "."))
        # map longitudes given in the 0..360 range to -180..180
        if lon > 180:
            lon = lon - 360
        return lat, lon

    @staticmethod
    def getWikidataId(wikidataURL: str):
        '''
        Extracts the wikidata id from the given wikidata URL

        Args:
            wikidataURL(str): wikidata URL the id should be extracted from

        Returns:
            str: the first wikidata item (Q...) or property (P...) id found
            in the given wikidata URL, otherwise None
        '''
        if not wikidataURL:
            return None
        wikidataidMatch = Wikidata._WIKIDATA_ID_PATTERN.search(wikidataURL)
        if wikidataidMatch is None:
            return None
        return wikidataidMatch.group(0)

    @staticmethod
    def getValuesClause(varName: str, values, wikidataEntities: bool = True):
        '''
        generates the SPARQL value clause for the given variable name
        containing the given values

        Args:
            varName(str): variable name for the ValuesClause
            values: a single value or a list/tuple/set of values for the clause
            wikidataEntities(bool): if true the wikidata prefix is added to the values
                otherwise it is expected that the given values are proper IRIs

        Returns:
            str: the VALUES clause, e.g. 'VALUES ?region { wd:Q1198  }'
        '''
        if not isinstance(values, (list, tuple, set, frozenset)):
            # a single value is treated like a one-element collection
            values = [values]
        prefix = "wd:" if wikidataEntities else ""
        # each value keeps its trailing blank so the generated text stays
        # identical to what earlier versions produced
        clauseValues = "".join(f"{prefix}{value} " for value in values)
        clause = "VALUES ?%s { %s }" % (varName, clauseValues)
        return clause