Merge pull request #21 from opengeos/refactor
Refactor to modules
cholmes authored Oct 6, 2023
2 parents ecd498e + a7485c5 commit 7890dda
Showing 18 changed files with 546 additions and 46 deletions.
22 changes: 12 additions & 10 deletions README.md
@@ -10,17 +10,20 @@

## Introduction

This repo is intended to be a set of useful scripts for working with Google's [Open Buildings](https://sites.research.google/open-buildings/)
dataset, specifically to help translate it into [Cloud Native Geospatial](https://cloudnativegeo.org) formats. The outputs will live
at <https://beta.source.coop/cholmes/google-open-buildings> so most people can just make use of those directly. But these are intended to
show the process, and then they've expanded to be a way to benchmark performance. It's an odd mix right now; if I have time I'll try to
factor out an independent 'performance' CLI to compare processes without being specific to Google Open Buildings and mixing in functionality
like splitting multipolygons. The repo is now named 'open-buildings', to allow it to potentially grow into a set of useful scripts to work with
other open buildings datasets.
This repo is intended to be a set of useful scripts for working with open buildings datasets, initially Google's [Open Buildings](https://sites.research.google/open-buildings/)
dataset and Overture's buildings dataset, specifically to help translate them into [Cloud Native Geospatial](https://cloudnativegeo.org) formats and then make use of those. The outputs live
on <https://beta.source.coop>, [here for Google](https://beta.source.coop/cholmes/google-open-buildings) and [here for Overture](https://beta.source.coop/cholmes/overture/), so most people can just use those directly.

The main operation most people will be interested in is the 'get-buildings' command, which
lets you supply a GeoJSON file to the command-line interface; it downloads all the buildings
in the supplied area and outputs them in common GIS formats (GeoPackage, FlatGeobuf, Shapefile, GeoJSON and GeoParquet).
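
If you'd rather call this from Python than the shell, here's a minimal sketch using click's test runner (this assumes the `tools get_buildings` command wired up in this PR and an `aoi.json` polygon in the working directory; the flags mirror the CLI options):

```python
# Sketch: invoke the get_buildings command programmatically.
from click.testing import CliRunner
from open_buildings.cli import main

runner = CliRunner()
result = runner.invoke(main, [
    "tools", "get_buildings", "aoi.json",
    "--dst", "my-buildings.geojson",
    "--format", "geojson",
])
print(result.output)
```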

The rest of the CLIs and scripts are intended to show the process of transforming the data,
and they have since expanded into a way to benchmark performance.

This is basically my first Python project, and certainly my first open source one. It is only possible due to ChatGPT, as I'm not a Python
programmer, and not a great programmer in general (I coded professionally for about 2 years, then shifted to doing lots of other stuff). So
it's likely not great code, but it's been fun to iterate on it and seems like it might be useful to others.
it's likely not great code, but it's been fun to iterate on it and seems like it might be useful to others. And contributions are welcome! I'm working on making the issue tracker accessible, so anyone who wants to try out some open source coding can jump in.

## Installation

@@ -40,10 +40,9 @@ Should print out a help message. You should then be able to run the CLI:


```bash
open_buildings benchmark 36b_buildings.csv test-output-dir --format parquet
open_buildings tools get_buildings aoi.json my-buildings.geojson
```

The only CSV files that will work are those from Google's Open Buildings dataset.

## Functionality

4 changes: 0 additions & 4 deletions docs/open_buildings.md

This file was deleted.

1 change: 0 additions & 1 deletion mkdocs.yml
@@ -82,5 +82,4 @@ nav:
- Examples:
- examples/download_buildings.ipynb
- API Reference:
- open_buildings module: open_buildings.md
- common module: common.md
4 changes: 1 addition & 3 deletions open_buildings/__init__.py
@@ -2,6 +2,4 @@

__author__ = """Chris Holmes"""
__email__ = '[email protected]'
__version__ = '0.0.7'

from .open_buildings import *
__version__ = '0.0.8'
26 changes: 25 additions & 1 deletion open_buildings/cli.py
@@ -3,7 +3,8 @@
import click
import pandas as pd
import matplotlib.pyplot as plt
from open_buildings import process_benchmark, process_geometries
from open_buildings.google.process import process_benchmark, process_geometries
from open_buildings.download_buildings import download as download_buildings
from datetime import datetime, timedelta
from tabulate import tabulate
import boto3 # Required for S3 operations
@@ -23,12 +23,35 @@ def overture():
"""Commands related to Overture operations."""
pass

@click.group()
def tools():
"""Commands useful for working with any buildings data"""
pass

main.add_command(google)
main.add_command(overture)
main.add_command(tools)

def handle_comma_separated(ctx, param, value):
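# click callback: split a comma-separated option value into a list.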
return value.split(',')

@tools.command(name="get_buildings")
@click.argument('geojson_input', type=click.File('r'), required=False)
@click.option('--only-quadkey', is_flag=True, help='Include only the quadkey in the WHERE clause.')
@click.option('--format', default=None, type=click.Choice(['shapefile', 'geojson', 'geopackage', 'flatgeobuf', 'parquet']), help='Output format for the SQL query. Defaults to the extension of the dst file.')
@click.option('--generate-sql', is_flag=True, default=False, help='Generate and print SQL without executing.')
@click.option('--dst', type=str, default="buildings.parquet", help='Destination file name (without extension) or full path for the output.')
@click.option('-s', '--silent', is_flag=True, default=False, help='Suppress all print outputs.')
@click.option('--time-report', is_flag=True, default=True, help='Report how long the operation took to run.')
@click.option('--overwrite', default=False, is_flag=True, help='Overwrite the destination file if it already exists.')
@click.option('--verbose', default=False, is_flag=True, help='Print detailed logs with timestamps.')
@click.option('--run-gpq', is_flag=True, default=True, help='Run gpq conversion to ensure the output is valid GeoParquet.')
@click.option('--data-path', type=str, default="s3://us-west-2.opendata.source.coop/cholmes/overture/geoparquet-country-quad-2/*.parquet", help='Path to the root of the buildings parquet data.')
@click.option('--hive-partitioning', is_flag=True, default=False, help='Use Hive partitioning when reading the parquet data.')
@click.option('--country_iso', type=str, default=None, help='Country ISO code to filter the data by.')
def get_buildings(geojson_input, only_quadkey, format, generate_sql, dst, silent, time_report, overwrite, verbose, run_gpq, data_path, hive_partitioning, country_iso):
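"""Download buildings intersecting the GeoJSON input; a thin wrapper that forwards every CLI flag to download_buildings()."""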
download_buildings(geojson_input, only_quadkey, format, generate_sql, dst, silent, time_report, overwrite, verbose, run_gpq, data_path, hive_partitioning, country_iso)

@google.command('benchmark')
@click.argument('input_path', type=click.Path(exists=True))
@click.argument('output_directory', type=click.Path(exists=True))
16 changes: 1 addition & 15 deletions open_buildings/download_buildings.py
@@ -125,20 +125,6 @@ def quad2json(quadkey_input):
result = quadkey_to_geojson(quadkey_input)
click.echo(json.dumps(result, indent=2))

@click.command(name="download")
@click.argument('geojson_input', type=click.File('r'), required=False)
@click.option('--only-quadkey', is_flag=True, help='Include only the quadkey in the WHERE clause.')
@click.option('--format', default=None, type=click.Choice(['shapefile', 'geojson', 'geopackage', 'flatgeobuf', 'parquet']), help='Output format for the SQL query. Defaults to the extension of the dst file.')
@click.option('--generate-sql', is_flag=True, default=False, help='Generate and print SQL without executing.')
@click.option('--dst', type=str, default="buildings.parquet", help='Destination file name (without extension) or full path for the output.')
@click.option('-s', '--silent', is_flag=True, default=False, help='Suppress all print outputs.')
@click.option('--time-report', is_flag=True, default=True, help='Report how long the operation took to run.')
@click.option('--overwrite', default=False, is_flag=True, help='Overwrite the destination file if it already exists.')
@click.option('--verbose', default=False, is_flag=True, help='Print detailed logs with timestamps.')
@click.option('--run-gpq', is_flag=True, default=True, help='Run gpq conversion to ensure the output is valid GeoParquet.')
@click.option('--data-path', type=str, default="s3://us-west-2.opendata.source.coop/cholmes/overture/geoparquet-country-quad-2/*.parquet", help='Path to the root of the buildings parquet data.')
@click.option('--hive-partitioning', is_flag=True, default=False, help='Use Hive partitioning when reading the parquet data.')
@click.option('--country_iso', type=str, default=None, help='Country ISO code to filter the data by.')

def download(geojson_input, only_quadkey, format, generate_sql, dst, silent, time_report, overwrite, verbose, run_gpq, data_path, hive_partitioning, country_iso):
"""Download buildings data based on the input GeoJSON."""
@@ -257,7 +243,7 @@ def print_timestamped_message(message):
cli.add_command(WKT)
cli.add_command(sql)
cli.add_command(quad2json)
cli.add_command(download)
#cli.add_command(download)
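# (the download command is now exposed as `tools get_buildings` in cli.py)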

if __name__ == '__main__':
cli()
1 change: 1 addition & 0 deletions open_buildings/google/__init.py__
@@ -0,0 +1 @@

@@ -111,7 +111,7 @@ def process_parquet_file(input_parquet_path, output_folder, country_parquet_path
# Write out to Parquet
con.execute(f"COPY (SELECT * FROM buildings ORDER BY quadkey) TO '{output_parquet_path}' WITH (FORMAT Parquet)")

if (True):
if (False):
print(f"Converting to geoparquet: {output_parquet_path}")
# Create a temporary file
temp_file = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False)
@@ -127,7 +127,7 @@

print(f"Processing complete for file {input_parquet_path}")

remove_duckdb = True
remove_duckdb = False

# remove duckdb file
if (remove_duckdb):
206 changes: 206 additions & 0 deletions open_buildings/google/partition.py
@@ -0,0 +1,206 @@
"""
This script takes a DuckDB database with a buildings table and converts it to GeoParquet
files partitioned on first country and then quadkey. The buildings table must have a
country_iso field and quadkey field, populated by overture-buildings-parquet-add-columns.py.
The main function is process_db(), and it will take as input a maximum number of rows per
file and a row group size for the Parquet files. It will then iterate through the countries
in the database and partition the buildings table into GeoParquet files for each country.
If the number of rows for a country is greater than the maximum number of rows per file,
it will partition the country into quadkeys and create GeoParquet files for each quadkey.
Those quadkeys will be further partitioned if necessary until the number of rows for a
quadkey is less than or equal to the maximum number of rows per file.
"""

import duckdb
import datetime
import subprocess
import tempfile
import os
import click
import shutil
import geopandas as gpd
from shapely import wkb
import pandas as pd
import time

def current_time_str():
return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

def print_verbose(msg, verbose):
if verbose:
print(f"[{current_time_str()}] {msg}")

def convert_gpq(input_filename, row_group_size, verbose):
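# Shell out to the `gpq` binary to rewrite plain Parquet as GeoParquet, then
# move the converted temp file over the original. row_group_size is accepted
# but unused here, since gpq chooses its own row groups.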
print_verbose(f"Starting conversion for {input_filename} using gpq (row_group_size ignored).", verbose)

# Create a temporary file
temp_file = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False)
temp_file.close() # Close the file so gpq can open it

# Convert the Parquet file to a GeoParquet file using gpq
gpq_cmd = ['gpq', 'convert', input_filename, temp_file.name]
subprocess.run(gpq_cmd, check=True)

print_verbose(f"Conversion for {input_filename} using gpq finished.", verbose)

# Rename (move) the temp file to the final filename
shutil.move(temp_file.name, input_filename)

# Delete the initial temp file if it still exists
#initial_temp_filename = f'{country_code}_temp.parquet'
#if os.path.exists(initial_temp_filename):
# os.remove(initial_temp_filename)

def convert_pandas(input_filename, rg_size, verbose):
# Convert to GeoParquet by reading with pandas and rewriting with GeoPandas
print_verbose("Starting conversion using pandas.", verbose)
try:
df = pd.read_parquet(input_filename)

# Convert WKB geometry to geopandas geometry
df['geometry'] = df['geometry'].apply(wkb.loads, hex=True)
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")
# Change output file the input_filename with .parquet replaced with _geo.parquet
output_filename = input_filename.replace(".parquet", "_geo.parquet")

gdf.to_parquet(output_filename, row_group_size=rg_size)
# delete the original file
os.remove(input_filename)
# Rename (move) the output file to the input filename
shutil.move(output_filename, input_filename)
print(f"Finished processing {input_filename} at {time.ctime()}")
except Exception as e:
print(f"Error processing {input_filename}: {e}")

#not quite working yet - not sure what's wrong. Should go faster than pandas.
def convert_ogr(input_filename, rg_size, verbose):
fields_to_keep = ['confidence', 'area_in_meters', 'full_plus_code', 'country_iso', 'quadkey']
output_filename = input_filename.replace(".parquet", "_geo.parquet")
rg_cmd = f"ROW_GROUP_SIZE={rg_size}"
cmd = [
'ogr2ogr',
'-f',
'Parquet',
'-select',
','.join(fields_to_keep),
output_filename,
input_filename,
# '-oo',
# rg_cmd,
'-oo',
'GEOM_POSSIBLE_NAMES=geometry',
'-a_srs',
'EPSG:4326', ]

# print the ogr2ogr command that will be run
if verbose:
print("ogr2ogr command:")
print(' '.join(cmd))

# Run the command
subprocess.run(cmd, check=True)

# delete the original file
os.remove(input_filename)
# Rename (move) the output file to the input filename
shutil.move(output_filename, input_filename)
print(f"Finished processing {input_filename} at {time.ctime()}")

if verbose:
print(f"Converted {input_filename} to {output_filename} using ogr2ogr.")



def fetch_quadkeys(conn, table_name, country_code, length, verbose, prev_qk=""):
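# Return the distinct quadkey prefixes of the given length for one country,
# optionally restricted to children of an already-split prefix (prev_qk).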
query = f"SELECT DISTINCT SUBSTR(quadkey, 1, {length}) FROM {table_name} WHERE country_iso = '{country_code}'"
if prev_qk:
query += f" AND SUBSTR(quadkey, 1, {len(prev_qk)}) = '{prev_qk}'"
print_verbose(f'Executing: {query}', verbose)
return conn.execute(query).fetchall()

def convert_to_geoparquet(parquet_path, geo_conversion, row_group_size, verbose):
if geo_conversion == 'gpq':
convert_gpq(parquet_path, row_group_size, verbose)
print_verbose(f"File: {parquet_path} written with gpq", verbose)
elif geo_conversion == 'pandas':
convert_pandas(parquet_path, row_group_size, verbose)
print_verbose(f"File: {parquet_path} written with pandas", verbose)
elif geo_conversion == 'ogr':
convert_ogr(parquet_path, row_group_size, verbose)
print_verbose(f"File: {parquet_path} written with ogr", verbose)
else:
print_verbose(f"File: {parquet_path} written without converting to GeoParquet", verbose)

#TODO: go all the way into the quad to find the smallest quadkey that contains less than max_per_file rows
def process_quadkey_recursive(conn, table_name, country_code, output_folder, length, geo_conversion, row_group_size, verbose, max_per_file, current_qk=""):
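# Split a country's rows by quadkey prefixes of increasing length until each
# prefix holds at most max_per_file rows, then COPY each slice out to Parquet.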
distinct_quadkeys = fetch_quadkeys(conn, table_name, country_code, length, verbose, current_qk)
print_verbose(f"The list of quadkeys for country {country_code} and length {length} is {distinct_quadkeys}", verbose)
#num_distinct_qk = len(distinct_quadkeys)
for qk in distinct_quadkeys:
qk_str = qk[0]
qk_count_query = f"SELECT COUNT(*) FROM {table_name} WHERE country_iso = '{country_code}' AND SUBSTR(quadkey, 1, {length}) = '{qk_str}'"
print_verbose(f'Executing: {qk_count_query}', verbose)
qk_count = conn.execute(qk_count_query).fetchone()[0]
print_verbose(f"Quadkey {qk_str} has {qk_count} rows", verbose)
if qk_count > max_per_file:
process_quadkey_recursive(conn, table_name, country_code, output_folder, length + 1, geo_conversion, row_group_size, verbose, max_per_file, qk_str)
else:
quad_output_filename = os.path.join(output_folder, f'{country_code}_{qk_str}.parquet')
if os.path.exists(quad_output_filename):
print_verbose(f"Output file {quad_output_filename} already exists, skipping...", verbose)
else:
copy_cmd = f"COPY (SELECT * FROM {table_name} WHERE country_iso = '{country_code}' AND SUBSTR(quadkey, 1, {length}) = '{qk_str}' ORDER BY quadkey) TO '{quad_output_filename}' WITH (FORMAT PARQUET);"
print_verbose(f'Executing: {copy_cmd}', verbose)
conn.execute(copy_cmd)
convert_to_geoparquet(quad_output_filename, geo_conversion, row_group_size, verbose)


# TODO: add option for 'hive' output (put things in folder)
# TODO: add option to read duckdb path from an environment variable
# TODO: add row group size option (first works with duckdb)

@click.command()
@click.argument('duckdb-path', type=click.Path(exists=True))
@click.option('--output-folder', default=os.getcwd(), type=click.Path(), help='Folder to store the output files')
@click.option('--geo-conversion', default='gpq', type=click.Choice(['gpq', 'none', 'pandas', 'ogr'], case_sensitive=False))
@click.option('--verbose', is_flag=True, default=False, help='Print verbose output')
@click.option('--max-per-file', default=10000000, type=int, help='Maximum number of rows per file')
@click.option('--row-group-size', default=10000, type=int, help='Row group size for Parquet files')
@click.option('--hive', is_flag=True, default=False, help='Output files in Hive format (folder structure)')
def process_db(duckdb_path, output_folder, geo_conversion, verbose, max_per_file, row_group_size, hive):
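"""Partition the buildings table into one (Geo)Parquet file per country, splitting any country over max_per_file rows by quadkey."""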
table_name = 'buildings'
# create output folder if it does not exist
os.makedirs(output_folder, exist_ok=True)
conn = duckdb.connect(duckdb_path)
conn.execute('LOAD spatial;')
cursor = conn.execute('SELECT DISTINCT country_iso FROM buildings')
countries = cursor.fetchall()

print_verbose(f'Found {len(countries)} unique countries', verbose)
countries.reverse()
for country in countries:
country_code = country[0]
write_folder = output_folder
if (hive):
write_folder = os.path.join(output_folder, f'country_iso={country_code}')
os.makedirs(write_folder, exist_ok=True)
output_filename = os.path.join(write_folder, f'{country_code}.parquet')
if os.path.exists(output_filename):
print_verbose(f"Output file for country {country_code} already exists, skipping...", verbose)
continue

count_query = f"SELECT COUNT(*) FROM {table_name} WHERE country_iso = '{country_code}'"
print_verbose(f'Executing: {count_query}', verbose)
count = conn.execute(count_query).fetchone()[0]
print_verbose(f"Country {country_code} has {count} rows", verbose)

if count <= max_per_file:
copy_cmd = f"COPY (SELECT * FROM {table_name} WHERE country_iso = '{country_code}' ORDER BY quadkey) TO '{output_filename}' WITH (FORMAT PARQUET);"
print_verbose(f'Executing: {copy_cmd}', verbose)
conn.execute(copy_cmd)
convert_to_geoparquet(output_filename, geo_conversion, row_group_size, verbose)
else:
process_quadkey_recursive(conn, table_name, country_code, write_folder, 1, geo_conversion, row_group_size, verbose, max_per_file)

if __name__ == "__main__":
process_db()
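
A sketch of driving the partitioner programmatically (assuming a `buildings.duckdb` file produced by the add-columns script mentioned in the module docstring):

```python
# Sketch: run the partitioning command via click's test runner.
from click.testing import CliRunner
from open_buildings.google.partition import process_db

runner = CliRunner()
result = runner.invoke(process_db, [
    "buildings.duckdb",
    "--output-folder", "out",
    "--geo-conversion", "gpq",
    "--max-per-file", "5000000",
    "--verbose",
])
print(result.output)
```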
File renamed without changes.