From d33df73b340ca2a967c5e81b6b62d66497caca25 Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Mon, 23 Sep 2024 16:55:32 -0400 Subject: [PATCH] Filter city by minimum population only after adding state --- mira/dkg/resources/geonames.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mira/dkg/resources/geonames.py b/mira/dkg/resources/geonames.py index 1334a2683..adfa771e5 100644 --- a/mira/dkg/resources/geonames.py +++ b/mira/dkg/resources/geonames.py @@ -109,16 +109,14 @@ def get_cities(code_to_country, code_to_admin1, code_to_admin2, *, minimum_popul ), ) - cities_df = cities_df[cities_df.population.astype(int) > minimum_population] cities_df.synonyms = cities_df.synonyms.str.split(",") terms = {} for term in code_to_country.values(): terms[term.identifier] = term - - cols = ["geonames_id", "name", "synonyms", "country_code", "admin1", "admin2"] - for identifier, name, synonyms, country, admin1, admin2 in cities_df[cols].values: - terms[identifier] = term = Term.from_triple("geonames", identifier, name) + cols = ["geonames_id", "name", "synonyms", "country_code", "admin1", + "admin2", "population"] + for identifier, name, synonyms, country, admin1, admin2, population in (cities_df[cols].values): if synonyms and not isinstance(synonyms, float): for synoynm in synonyms: term.append_synonym(synoynm) @@ -135,6 +133,11 @@ def get_cities(code_to_country, code_to_admin1, code_to_admin2, *, minimum_popul terms[admin1_term.identifier] = admin1_term + # We skip cities that don't meet the minimum population requirement + if int(population) < minimum_population: + continue + terms[identifier] = term = Term.from_triple("geonames", identifier, + name) if pd.notna(admin2): admin2_full = f"{country}.{admin1}.{admin2}" admin2_term = code_to_admin2.get(admin2_full)