diff --git a/scripts/v.dissolve/tests/conftest.py b/scripts/v.dissolve/tests/conftest.py new file mode 100644 index 00000000000..b74969999b1 --- /dev/null +++ b/scripts/v.dissolve/tests/conftest.py @@ -0,0 +1,248 @@ +"""Fixtures for v.dissolve tests""" + +from types import SimpleNamespace + +import pytest + +import grass.script as gs +import grass.script.setup as grass_setup + + +def updates_as_transaction(table, cat_column, column, column_quote, cats, values): + """Create SQL statement for categories and values for a given column""" + sql = ["BEGIN TRANSACTION"] + if column_quote: + quote = "'" + else: + quote = "" + for cat, value in zip(cats, values): + sql.append( + f"UPDATE {table} SET {column} = {quote}{value}{quote} " + f"WHERE {cat_column} = {cat};" + ) + sql.append("END TRANSACTION") + return "\n".join(sql) + + +def value_update_by_category(map_name, layer, column_name, cats, values): + """Update column value for multiple rows based on category""" + db_info = gs.vector_db(map_name)[layer] + table = db_info["table"] + database = db_info["database"] + driver = db_info["driver"] + cat_column = "cat" + column_type = gs.vector_columns(map_name, layer)[column_name] + column_quote = bool(column_type["type"] in ("CHARACTER", "TEXT")) + sql = updates_as_transaction( + table=table, + cat_column=cat_column, + column=column_name, + column_quote=column_quote, + cats=cats, + values=values, + ) + gs.write_command( + "db.execute", input="-", database=database, driver=driver, stdin=sql + ) + + +@pytest.fixture(scope="module") +def dataset(tmp_path_factory): + """Creates a session with a mapset which has vector with a float column""" + tmp_path = tmp_path_factory.mktemp("dataset") + location = "test" + point_map_name = "points" + map_name = "areas" + int_column_name = "int_value" + float_column_name = "double_value" + str_column_name = "str_value" + + cats = [1, 2, 3, 4, 5, 6] + int_values = [10, 10, 10, 5, 24, 5] + float_values = [100.78, 102.78, 109.78, 104.78, 103.78, 105.78] + str_values = ["apples", "oranges", "oranges", "plumbs", "oranges", "plumbs"] + num_points = len(cats) + + gs.core._create_location_xy(tmp_path, location) # pylint: disable=protected-access + with grass_setup.init(tmp_path / location): + gs.run_command("g.region", s=0, n=80, w=0, e=120, b=0, t=50, res=10, res3=10) + gs.run_command("v.random", output=point_map_name, npoints=num_points, seed=42) + gs.run_command("v.voronoi", input=point_map_name, output=map_name) + gs.run_command( + "v.db.addtable", + map=map_name, + columns=[ + f"{int_column_name} integer", + f"{float_column_name} double precision", + f"{str_column_name} text", + ], + ) + value_update_by_category( + map_name=map_name, + layer=1, + column_name=int_column_name, + cats=cats, + values=int_values, + ) + value_update_by_category( + map_name=map_name, + layer=1, + column_name=float_column_name, + cats=cats, + values=float_values, + ) + value_update_by_category( + map_name=map_name, + layer=1, + column_name=str_column_name, + cats=cats, + values=str_values, + ) + yield SimpleNamespace( + vector_name=map_name, + int_column_name=int_column_name, + int_values=int_values, + float_column_name=float_column_name, + float_values=float_values, + str_column_name=str_column_name, + str_column_values=str_values, + ) + + +@pytest.fixture(scope="module") +def discontinuous_dataset(tmp_path_factory): + """Creates a session with a mapset which has vector with a float column""" + tmp_path = tmp_path_factory.mktemp("discontinuous_dataset") + location = "test" + point_map_name = 
"points" + map_name = "areas" + int_column_name = "int_value" + float_column_name = "double_value" + str_column_name = "str_value" + + cats = [1, 2, 3, 4, 5, 6] + int_values = [10, 12, 10, 5, 24, 24] + float_values = [100.78, 102.78, 109.78, 104.78, 103.78, 105.78] + str_values = ["apples", "plumbs", "apples", "plumbs", "oranges", "oranges"] + num_points = len(cats) + + gs.core._create_location_xy(tmp_path, location) # pylint: disable=protected-access + with grass_setup.init(tmp_path / location): + gs.run_command("g.region", s=0, n=80, w=0, e=120, b=0, t=50, res=10, res3=10) + gs.run_command("v.random", output=point_map_name, npoints=num_points, seed=42) + gs.run_command("v.voronoi", input=point_map_name, output=map_name) + gs.run_command( + "v.db.addtable", + map=map_name, + columns=[ + f"{int_column_name} integer", + f"{float_column_name} double precision", + f"{str_column_name} text", + ], + ) + value_update_by_category( + map_name=map_name, + layer=1, + column_name=int_column_name, + cats=cats, + values=int_values, + ) + value_update_by_category( + map_name=map_name, + layer=1, + column_name=float_column_name, + cats=cats, + values=float_values, + ) + value_update_by_category( + map_name=map_name, + layer=1, + column_name=str_column_name, + cats=cats, + values=str_values, + ) + yield SimpleNamespace( + vector_name=map_name, + int_column_name=int_column_name, + int_values=int_values, + float_column_name=float_column_name, + float_values=float_values, + str_column_name=str_column_name, + str_column_values=str_values, + ) + + +@pytest.fixture(scope="module") +def dataset_layer_2(tmp_path_factory): + """Creates a session with a mapset which has vector with a float column""" + tmp_path = tmp_path_factory.mktemp("dataset_layer_2") + location = "test" + point_map_name = "points" + point_map_name_layer_2 = "points2" + map_name = "areas" + int_column_name = "int_value" + float_column_name = "double_value" + str_column_name = "str_value" + + cats = [1, 2, 3, 4, 5, 6] + int_values = [10, 10, 10, 5, 24, 5] + float_values = [100.78, 102.78, 109.78, 104.78, 103.78, 105.78] + str_values = ["apples", "oranges", "oranges", "plumbs", "oranges", "plumbs"] + num_points = len(cats) + + layer = 2 + + gs.core._create_location_xy(tmp_path, location) # pylint: disable=protected-access + with grass_setup.init(tmp_path / location): + gs.run_command("g.region", s=0, n=80, w=0, e=120, b=0, t=50, res=10, res3=10) + gs.run_command("v.random", output=point_map_name, npoints=num_points, seed=42) + gs.run_command( + "v.category", + input=point_map_name, + layer=[1, layer], + output=point_map_name_layer_2, + option="transfer", + ) + gs.run_command( + "v.voronoi", input=point_map_name_layer_2, layer=layer, output=map_name + ) + gs.run_command( + "v.db.addtable", + map=map_name, + layer=layer, + columns=[ + f"{int_column_name} integer", + f"{float_column_name} double precision", + f"{str_column_name} text", + ], + ) + value_update_by_category( + map_name=map_name, + layer=layer, + column_name=int_column_name, + cats=cats, + values=int_values, + ) + value_update_by_category( + map_name=map_name, + layer=layer, + column_name=float_column_name, + cats=cats, + values=float_values, + ) + value_update_by_category( + map_name=map_name, + layer=layer, + column_name=str_column_name, + cats=cats, + values=str_values, + ) + yield SimpleNamespace( + vector_name=map_name, + int_column_name=int_column_name, + int_values=int_values, + float_column_name=float_column_name, + float_values=float_values, + str_column_name=str_column_name, + 
str_column_values=str_values, + ) diff --git a/scripts/v.dissolve/tests/v_dissolve_aggregate_test.py b/scripts/v.dissolve/tests/v_dissolve_aggregate_test.py new file mode 100644 index 00000000000..1c2b6d45123 --- /dev/null +++ b/scripts/v.dissolve/tests/v_dissolve_aggregate_test.py @@ -0,0 +1,405 @@ +"""Test v.dissolve attribute aggregations""" + +import json +import statistics + +import pytest + +import grass.script as gs + + +@pytest.mark.parametrize( + "aggregate_methods", + [ + ["n"], + ["sum"], + ["range"], + ["min", "max", "mean", "variance"], + ["mean_abs", "stddev", "coeff_var"], + ], +) +def test_aggregate_methods(dataset, aggregate_methods): + """All aggregate methods are accepted and their columns generated""" + dissolved_vector = f"test_methods_{'_'.join(aggregate_methods)}" + gs.run_command( + "v.dissolve", + input=dataset.vector_name, + column=dataset.str_column_name, + output=dissolved_vector, + aggregate_column=dataset.float_column_name, + aggregate_method=aggregate_methods, + ) + columns = gs.vector_columns(dissolved_vector) + stats_columns = [ + f"{dataset.float_column_name}_{method}" for method in aggregate_methods + ] + assert sorted(columns.keys()) == sorted( + ["cat", dataset.str_column_name] + stats_columns + ) + + +def test_aggregate_two_columns(dataset): + """Aggregate stats for two columns are generated""" + dissolved_vector = "test_two_columns" + aggregate_methods = ["mean", "stddev"] + aggregate_columns = [dataset.float_column_name, dataset.int_column_name] + gs.run_command( + "v.dissolve", + input=dataset.vector_name, + column=dataset.str_column_name, + output=dissolved_vector, + aggregate_column=aggregate_columns, + aggregate_method=aggregate_methods, + ) + stats_columns = [ + f"{column}_{method}" + for method in aggregate_methods + for column in aggregate_columns + ] + columns = gs.vector_columns(dissolved_vector) + assert sorted(columns.keys()) == sorted( + ["cat", dataset.str_column_name] + stats_columns + ) + + +@pytest.mark.parametrize("backend", [None, "univar", "sql"]) +def test_aggregate_column_result(dataset, backend): + """Check resulting types and values of basic stats with different backends + + It assumes that the univar-like names are translated to SQLite names. 
+ """ + dissolved_vector = f"test_results_{backend}" + stats = ["sum", "n", "min", "max", "mean"] + stats_columns = [f"value_{method}" for method in stats] + aggregate_columns = [dataset.float_column_name] * len(stats) + gs.run_command( + "v.dissolve", + input=dataset.vector_name, + column=dataset.str_column_name, + output=dissolved_vector, + aggregate_column=aggregate_columns, + aggregate_method=stats, + result_column=stats_columns, + aggregate_backend=backend, + ) + + vector_info = gs.vector_info(dissolved_vector) + assert vector_info["level"] == 2 + assert vector_info["centroids"] == 3 + assert vector_info["areas"] == 3 + assert vector_info["num_dblinks"] == 1 + assert vector_info["attribute_primary_key"] == "cat" + + columns = gs.vector_columns(dissolved_vector) + assert len(columns) == len(stats_columns) + 2 + assert sorted(columns.keys()) == sorted( + ["cat", dataset.str_column_name] + stats_columns + ) + for stats_column in stats_columns: + assert stats_column in columns + column_info = columns[stats_column] + if stats_column.endswith("_n"): + correct_type = "integer" + else: + correct_type = "double precision" + assert ( + columns[stats_column]["type"].lower() == correct_type + ), f"{stats_column} has a wrong type" + assert dataset.str_column_name in columns + column_info = columns[dataset.str_column_name] + assert column_info["type"].lower() == "character" + + records = json.loads( + gs.read_command( + "v.db.select", + map=dissolved_vector, + format="json", + ) + )["records"] + ref_unique_values = set(dataset.str_column_values) + actual_values = [record[dataset.str_column_name] for record in records] + assert len(actual_values) == len(ref_unique_values) + assert set(actual_values) == ref_unique_values + + aggregate_n = [record["value_n"] for record in records] + assert sum(aggregate_n) == gs.vector_info(dataset.vector_name)["areas"] + assert sorted(aggregate_n) == [1, 2, 3] + aggregate_sum = [record["value_sum"] for record in records] + assert sorted(aggregate_sum) == [ + dataset.float_values[0], + pytest.approx(dataset.float_values[3] + dataset.float_values[5]), + pytest.approx( + dataset.float_values[1] + dataset.float_values[2] + dataset.float_values[4] + ), + ] + aggregate_max = [record["value_max"] for record in records] + assert sorted(aggregate_max) == [ + dataset.float_values[0], + pytest.approx(max([dataset.float_values[3], dataset.float_values[5]])), + pytest.approx( + max( + [ + dataset.float_values[1], + dataset.float_values[2], + dataset.float_values[4], + ] + ) + ), + ] + aggregate_min = [record["value_min"] for record in records] + assert sorted(aggregate_min) == [ + dataset.float_values[0], + pytest.approx( + min( + [ + dataset.float_values[1], + dataset.float_values[2], + dataset.float_values[4], + ] + ) + ), + pytest.approx(min([dataset.float_values[3], dataset.float_values[5]])), + ] + aggregate_mean = [record["value_mean"] for record in records] + assert sorted(aggregate_mean) == [ + dataset.float_values[0], + pytest.approx( + statistics.mean([dataset.float_values[3], dataset.float_values[5]]) + ), + pytest.approx( + statistics.mean( + [ + dataset.float_values[1], + dataset.float_values[2], + dataset.float_values[4], + ] + ) + ), + ] + + +def test_sqlite_agg_accepted(dataset): + """Numeric SQLite aggregate functions are accepted + + Additionally, it checks: + 1. generated column names + 2. types of columns + 3. 
aggregate counts + """ + dissolved_vector = "test_sqlite" + stats = ["avg", "count", "max", "min", "sum", "total"] + expected_stats_columns = [ + f"{dataset.float_column_name}_{method}" for method in stats + ] + gs.run_command( + "v.dissolve", + input=dataset.vector_name, + column=dataset.str_column_name, + output=dissolved_vector, + aggregate_column=dataset.float_column_name, + aggregate_method=stats, + aggregate_backend="sql", + ) + + vector_info = gs.vector_info(dissolved_vector) + assert vector_info["level"] == 2 + assert vector_info["centroids"] == 3 + assert vector_info["areas"] == 3 + assert vector_info["num_dblinks"] == 1 + assert vector_info["attribute_primary_key"] == "cat" + + columns = gs.vector_columns(dissolved_vector) + assert len(columns) == len(expected_stats_columns) + 2 + assert sorted(columns.keys()) == sorted( + ["cat", dataset.str_column_name] + expected_stats_columns + ), "Unexpected autogenerated column names" + for method, stats_column in zip(stats, expected_stats_columns): + assert stats_column in columns + column_info = columns[stats_column] + if method == "count": + correct_type = "integer" + else: + correct_type = "double precision" + assert ( + columns[stats_column]["type"].lower() == correct_type + ), f"{stats_column} has a wrong type" + assert dataset.str_column_name in columns + column_info = columns[dataset.str_column_name] + assert column_info["type"].lower() == "character" + + records = json.loads( + gs.read_command( + "v.db.select", + map=dissolved_vector, + format="json", + ) + )["records"] + ref_unique_values = set(dataset.str_column_values) + actual_values = [record[dataset.str_column_name] for record in records] + assert len(actual_values) == len(ref_unique_values) + assert set(actual_values) == ref_unique_values + + aggregate_n = [record[f"{dataset.float_column_name}_count"] for record in records] + assert sum(aggregate_n) == gs.vector_info(dataset.vector_name)["areas"] + assert sorted(aggregate_n) == [1, 2, 3] + + +def test_sqlite_concat(dataset): + """SQLite group concat text-returning aggregate function works""" + dissolved_vector = "test_sqlite_concat" + gs.run_command( + "v.dissolve", + input=dataset.vector_name, + column=dataset.str_column_name, + output=dissolved_vector, + aggregate_column=f"group_concat({dataset.int_column_name})", + result_column="concat_values text", + aggregate_backend="sql", + ) + records = json.loads( + gs.read_command( + "v.db.select", + map=dissolved_vector, + format="json", + ) + )["records"] + # Order of records is ignored - they are just sorted. + # Order within values of group_concat is defined as arbitrary by SQLite. 
+ expected_integers = sorted(["10", "10,10,24", "5,5"]) + actual_integers = sorted([record["concat_values"] for record in records]) + for expected, actual in zip(expected_integers, actual_integers): + assert sorted(expected.split(",")) == sorted(actual.split(",")) + + +def test_sqlite_concat_with_two_parameters(dataset): + """SQLite group concat text-returning two-parameter aggregate function works""" + dissolved_vector = "test_sqlite_concat_separator" + separator = "--+--" + gs.run_command( + "v.dissolve", + input=dataset.vector_name, + column=dataset.str_column_name, + output=dissolved_vector, + aggregate_column=f"group_concat({dataset.int_column_name}, '{separator}')", + result_column="concat_values text", + aggregate_backend="sql", + ) + records = json.loads( + gs.read_command( + "v.db.select", + map=dissolved_vector, + format="json", + ) + )["records"] + # Order of records is ignored - they are just sorted. + # Order within values of group_concat is defined as arbitrary by SQLite. + expected_integers = sorted(["10", "10,10,24", "5,5"]) + actual_integers = sorted([record["concat_values"] for record in records]) + for expected, actual in zip(expected_integers, actual_integers): + assert sorted(expected.split(",")) == sorted(actual.split(separator)) + + +def test_duplicate_columns_and_methods_accepted(dataset): + """Duplicate aggregate columns and methods are accepted and deduplicated""" + dissolved_vector = "test_duplicates" + stats = ["count", "count", "n", "min", "min", "n", "sum"] + expected_stats_columns = [ + f"{dataset.float_column_name}_{method}" + for method in ["count", "n", "min", "sum"] + ] + gs.run_command( + "v.dissolve", + input=dataset.vector_name, + column=dataset.str_column_name, + output=dissolved_vector, + aggregate_column=[dataset.float_column_name, dataset.float_column_name], + aggregate_method=stats, + aggregate_backend="sql", + ) + + vector_info = gs.vector_info(dissolved_vector) + assert vector_info["level"] == 2 + assert vector_info["centroids"] == 3 + assert vector_info["areas"] == 3 + assert vector_info["num_dblinks"] == 1 + assert vector_info["attribute_primary_key"] == "cat" + + columns = gs.vector_columns(dissolved_vector) + assert sorted(columns.keys()) == sorted( + ["cat", dataset.str_column_name] + expected_stats_columns + ), "Unexpected autogenerated column names" + + +def test_sql_expressions_accepted(dataset): + """Arbitrary SQL expressions are accepted for columns""" + dissolved_vector = "test_expressions" + aggregate_columns = ( + f"sum({dataset.float_column_name}), " + f"max({dataset.float_column_name}) - min({dataset.float_column_name}), " + f" count({dataset.float_column_name}) " + ) + result_columns = ( + " sum_of_values double, range_of_values double, count_of_rows integer" + ) + expected_stats_columns = ["sum_of_values", "range_of_values", "count_of_rows"] + gs.run_command( + "v.dissolve", + input=dataset.vector_name, + column=dataset.str_column_name, + output=dissolved_vector, + aggregate_column=aggregate_columns, + result_column=result_columns, + aggregate_backend="sql", + ) + + vector_info = gs.vector_info(dissolved_vector) + assert vector_info["level"] == 2 + assert vector_info["centroids"] == 3 + assert vector_info["areas"] == 3 + assert vector_info["num_dblinks"] == 1 + assert vector_info["attribute_primary_key"] == "cat" + + columns = gs.vector_columns(dissolved_vector) + assert sorted(columns.keys()) == sorted( + ["cat", dataset.str_column_name] + expected_stats_columns + ) + + +def 
test_no_methods_with_univar_and_result_columns_fail(dataset): + """Omitting methods as for sql backend is forbiden for univar""" + dissolved_vector = "test_no_method_univar_fails" + + aggregate_columns = dataset.float_column_name + result_columns = ( + "sum_of_values double,range_of_values double, count_of_rows integer" + ) + assert ( + gs.run_command( + "v.dissolve", + input=dataset.vector_name, + column=dataset.str_column_name, + output=dissolved_vector, + aggregate_column=aggregate_columns, + result_column=result_columns, + aggregate_backend="univar", + errors="status", + ) + != 0 + ) + + +def test_int_fails(dataset): + """An integer column fails with aggregates""" + dissolved_vector = "test_int" + assert ( + gs.run_command( + "v.dissolve", + input=dataset.vector_name, + column=dataset.int_column_name, + output=dissolved_vector, + aggregate_column=dataset.float_column_name, + aggregate_method="n", + errors="status", + ) + != 0 + ) diff --git a/scripts/v.dissolve/tests/v_dissolve_geometry_test.py b/scripts/v.dissolve/tests/v_dissolve_geometry_test.py new file mode 100644 index 00000000000..71c950a2141 --- /dev/null +++ b/scripts/v.dissolve/tests/v_dissolve_geometry_test.py @@ -0,0 +1,59 @@ +"""Test v.dissolve with more advanced geometry""" + +import json + +import grass.script as gs + + +def test_dissolve_discontinuous_str(discontinuous_dataset): + """Dissolving of discontinuous areas results in a single attribute record + + Even when the areas are discontinuous, there should be only one row + in the attribute table. + This behavior is assumed by the attribute aggregation functionality. + """ + dataset = discontinuous_dataset + dissolved_vector = "test_discontinuous_str" + gs.run_command( + "v.dissolve", + input=dataset.vector_name, + column=dataset.str_column_name, + output=dissolved_vector, + ) + + vector_info = gs.vector_info(dissolved_vector) + assert vector_info["level"] == 2 + assert vector_info["centroids"] == 5 + assert vector_info["areas"] == 5 + assert vector_info["num_dblinks"] == 1 + assert vector_info["attribute_primary_key"] == "cat" + # Reference values obtained by examining the result. 
+ assert vector_info["north"] == 80 + assert vector_info["south"] == 0 + assert vector_info["east"] == 120 + assert vector_info["west"] == 0 + assert vector_info["nodes"] == 14 + assert vector_info["points"] == 0 + assert vector_info["lines"] == 0 + assert vector_info["boundaries"] == 18 + assert vector_info["islands"] == 1 + assert vector_info["primitives"] == 23 + assert vector_info["map3d"] == 0 + + columns = gs.vector_columns(dissolved_vector) + assert len(columns) == 2 + assert sorted(columns.keys()) == sorted(["cat", dataset.str_column_name]) + column_info = columns[dataset.str_column_name] + assert column_info["type"].lower() == "character" + + records = json.loads( + gs.read_command( + "v.db.select", + map=dissolved_vector, + format="json", + ) + )["records"] + ref_unique_values = set(dataset.str_column_values) + actual_values = [record[dataset.str_column_name] for record in records] + assert len(actual_values) == len(ref_unique_values) + assert set(actual_values) == ref_unique_values diff --git a/scripts/v.dissolve/tests/v_dissolve_layers_test.py b/scripts/v.dissolve/tests/v_dissolve_layers_test.py new file mode 100644 index 00000000000..a13dc93315a --- /dev/null +++ b/scripts/v.dissolve/tests/v_dissolve_layers_test.py @@ -0,0 +1,74 @@ +"""Tests of v.dissolve with layer other than 1""" + +import json + +import grass.script as gs + + +def test_layer_2(dataset_layer_2): + """Numeric SQLite aggregate function are accepted + + Additionally, it checks: + 1. generated column names + 2. types of columns + 3. aggregate counts + """ + dataset = dataset_layer_2 + dissolved_vector = "test_sqlite" + stats = ["avg", "count", "max", "min", "sum", "total"] + expected_stats_columns = [ + f"{dataset.float_column_name}_{method}" for method in stats + ] + gs.run_command( + "v.dissolve", + input=dataset.vector_name, + layer=2, + column=dataset.str_column_name, + output=dissolved_vector, + aggregate_column=dataset.float_column_name, + aggregate_method=stats, + aggregate_backend="sql", + ) + + vector_info = gs.vector_info(dissolved_vector, layer=2) + assert vector_info["level"] == 2 + assert vector_info["centroids"] == 3 + assert vector_info["areas"] == 3 + assert vector_info["num_dblinks"] == 1 + assert vector_info["attribute_primary_key"] == "cat" + + columns = gs.vector_columns(dissolved_vector, layer=2) + assert len(columns) == len(expected_stats_columns) + 2 + assert sorted(columns.keys()) == sorted( + ["cat", dataset.str_column_name] + expected_stats_columns + ), "Unexpected autogenerated column names" + for method, stats_column in zip(stats, expected_stats_columns): + assert stats_column in columns + column_info = columns[stats_column] + if method == "count": + correct_type = "integer" + else: + correct_type = "double precision" + assert ( + columns[stats_column]["type"].lower() == correct_type + ), f"{stats_column} has a wrong type" + assert dataset.str_column_name in columns + column_info = columns[dataset.str_column_name] + assert column_info["type"].lower() == "character" + + records = json.loads( + gs.read_command( + "v.db.select", + map=dissolved_vector, + layer=2, + format="json", + ) + )["records"] + ref_unique_values = set(dataset.str_column_values) + actual_values = [record[dataset.str_column_name] for record in records] + assert len(actual_values) == len(ref_unique_values) + assert set(actual_values) == ref_unique_values + + aggregate_n = [record[f"{dataset.float_column_name}_count"] for record in records] + assert sum(aggregate_n) == gs.vector_info(dataset.vector_name)["areas"] + 
assert sorted(aggregate_n) == [1, 2, 3] diff --git a/scripts/v.dissolve/tests/v_dissolve_test.py b/scripts/v.dissolve/tests/v_dissolve_test.py new file mode 100644 index 00000000000..f5d579f5139 --- /dev/null +++ b/scripts/v.dissolve/tests/v_dissolve_test.py @@ -0,0 +1,82 @@ +"""Test v.dissolve geometry info and basic attributes""" + +import json + +import grass.script as gs + + +def test_dissolve_int(dataset): + """Dissolving works on integer column""" + dissolved_vector = "test_int" + gs.run_command( + "v.dissolve", + input=dataset.vector_name, + column=dataset.int_column_name, + output=dissolved_vector, + ) + + vector_info = gs.vector_info(dissolved_vector) + assert vector_info["level"] == 2 + assert vector_info["centroids"] == 3 + assert vector_info["areas"] == 3 + assert vector_info["num_dblinks"] == 0 + # Reference values obtained by examining the result. + assert vector_info["north"] == 80 + assert vector_info["south"] == 0 + assert vector_info["east"] == 120 + assert vector_info["west"] == 0 + assert vector_info["nodes"] == 14 + assert vector_info["points"] == 0 + assert vector_info["lines"] == 0 + assert vector_info["boundaries"] == 16 + assert vector_info["islands"] == 1 + assert vector_info["primitives"] == 19 + assert vector_info["map3d"] == 0 + + +def test_dissolve_str(dataset): + """Dissolving works on string column and attributes are present""" + dissolved_vector = "test_str" + gs.run_command( + "v.dissolve", + input=dataset.vector_name, + column=dataset.str_column_name, + output=dissolved_vector, + ) + + vector_info = gs.vector_info(dissolved_vector) + assert vector_info["level"] == 2 + assert vector_info["centroids"] == 3 + assert vector_info["areas"] == 3 + assert vector_info["num_dblinks"] == 1 + assert vector_info["attribute_primary_key"] == "cat" + # Reference values obtained by examining the result. + assert vector_info["north"] == 80 + assert vector_info["south"] == 0 + assert vector_info["east"] == 120 + assert vector_info["west"] == 0 + assert vector_info["nodes"] == 13 + assert vector_info["points"] == 0 + assert vector_info["lines"] == 0 + assert vector_info["boundaries"] == 15 + assert vector_info["islands"] == 1 + assert vector_info["primitives"] == 18 + assert vector_info["map3d"] == 0 + + columns = gs.vector_columns(dissolved_vector) + assert len(columns) == 2 + assert sorted(columns.keys()) == sorted(["cat", dataset.str_column_name]) + column_info = columns[dataset.str_column_name] + assert column_info["type"].lower() == "character" + + records = json.loads( + gs.read_command( + "v.db.select", + map=dissolved_vector, + format="json", + ) + )["records"] + ref_unique_values = set(dataset.str_column_values) + actual_values = [record[dataset.str_column_name] for record in records] + assert len(actual_values) == len(ref_unique_values) + assert set(actual_values) == ref_unique_values diff --git a/scripts/v.dissolve/v.dissolve.html b/scripts/v.dissolve/v.dissolve.html index 5896b491aeb..e22290fa064 100644 --- a/scripts/v.dissolve/v.dissolve.html +++ b/scripts/v.dissolve/v.dissolve.html @@ -7,6 +7,104 @@

DESCRIPTION

boundary dissolving. In this case the categories are not retained, only the values of the new key column. See the v.reclass help page for details. +
+ + +

+ Figure: Areas with the same attribute value (first image) are + merged into one (second image) +

+
+ +

Merging behavior

+ +Multiple areas with the same category or the same attribute value +which are not adjacent are merged together into one entity +which consists of multiple areas, i.e., a multipolygon. + +
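A quick way to confirm this behavior is to check that the dissolved map has only one attribute row per unique value of the dissolve column. The following is a minimal sketch using the grass.script Python API, assuming a map towns_from_zipcodes created by dissolving the NC sample zipcodes map on the NAME column (as in the notebook accompanying this module):

import json
import grass.script as gs

# Each unique NAME value should appear exactly once, even when its areas
# are not adjacent (they form one multipolygon entity).
records = json.loads(
    gs.read_command("v.db.select", map="towns_from_zipcodes", format="json")
)["records"]
print(len(records), "attribute rows in the dissolved map")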

Attribute aggregation

+ +

+Attributes of merged areas can be aggregated using various aggregation methods +such as sum and mean. The specific methods available depend +on the backend used for aggregation. Two aggregate backends (specified in +aggregate_backend) are available, univar and sql. +When univar is used, the methods available are the ones +which v.db.univar uses by default, +i.e., n, min, max, range, +mean, mean_abs, variance, stddev, +coeff_var, and sum. +When the sql backend is used, the methods in turn depend on the SQL +database backend used for the attribute table of the input vector. +For SQLite, these are at least the following +built-in aggregate functions: +count, min, max, +avg, sum, and total. +For PostgreSQL, the list of +aggregate functions +is much longer and includes, e.g., count, min, max, +avg, sum, stddev, and variance. +The sql aggregate backend, regardless of the underlying database, +will typically perform significantly better than the univar backend. +
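For example, statistics which are available only with the univar backend, such as stddev or coeff_var, can be requested through the grass.script Python API. This is a minimal sketch, assuming the boundary_municp map and its DOTURBAN_N and ACRES columns from the examples below; adjust the names to your data:

import grass.script as gs

# stddev and coeff_var are univar methods, so the univar backend is used.
gs.run_command(
    "v.dissolve",
    input="boundary_municp",
    column="DOTURBAN_N",
    output="municipalities_univar",
    aggregate_columns="ACRES",
    aggregate_methods="stddev,coeff_var",
    aggregate_backend="univar",
)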

+Aggregate methods are specified by name in aggregate_methods +or using SQL syntax in aggregate_columns. +If result_columns is provided including type information +and the sql backend is used, +aggregate_columns can contain SQL syntax specifying both columns +and the functions applied, e.g., +aggregate_columns="sum(cows) / sum(animals)". +In this case, aggregate_methods should be omitted. +This provides the highest flexibility and is suitable for scripting. +

+The backend is, by default, determined automatically based on the requested +methods. Specifically, the sql backend is used by default, +but when a method is not one of the SQLite built-in aggregate functions +and, at the same time, is available with the univar backend, +the univar backend is used. +The default behavior is intended for interactive use and testing. +For scripting and other automated usage, specifying the backend explicitly +is strongly recommended. +
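For example, a script calling v.dissolve through the grass.script Python API can pin the backend so it does not depend on the selection heuristic. A minimal sketch, assuming the zipcodes map with its NAME and SHAPE_Area columns from the NC sample dataset:

import grass.script as gs

# Request the sql backend explicitly instead of relying on automatic selection.
gs.run_command(
    "v.dissolve",
    input="zipcodes",
    column="NAME",
    output="towns_by_name",
    aggregate_columns="SHAPE_Area",
    aggregate_methods="sum",
    aggregate_backend="sql",
)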

+For convenience, certain methods, namely n, count, +mean, and avg, are converted to the name appropriate +for the selected backend. However, for scripting, specifying the appropriate +method (function) name for the backend is recommended because the conversion +is a heuristic which may change in the future. +

+If only aggregate_columns is provided, methods default to +n, min, max, mean, and sum. +If the univar backend is specified, all the available methods +for the univar backend are used. + +

+If result_columns is not provided, each method is applied to each +specified column, producing result columns for all combinations. These result +columns have auto-generated names based on the aggregate column and method. +If result_columns is provided, each method is applied only once +to the matching column in the aggregate column list and the result will be +available under the name of the matching result column. In other words, the number +of items in aggregate_columns, aggregate_methods (unless omitted), +and result_columns needs to match and no +combinations are created on the fly. +For scripting, it is recommended to specify all resulting column names, +while for interactive use, automatically created combinations are expected +to be beneficial, especially for exploratory analysis. +

+The type of the result column is determined based on the method selected. +For n and count, the type is INTEGER, and for all other +methods, it is DOUBLE. Aggregate methods which produce other types +require the type to be specified as part of result_columns. +A type can be provided in result_columns using the SQL syntax +name type, e.g., sum_of_values double precision. +Type specification is mandatory when SQL syntax is used in +aggregate_columns (and aggregate_methods is omitted).
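Putting these rules together, a call which uses SQL expressions in aggregate_columns omits aggregate_methods and spells out both the name and the type of every result column. A minimal sketch with the hypothetical cows and animals columns from the expression above (the map and key column names are placeholders as well):

import grass.script as gs

# aggregate_methods is omitted, so every result column carries its type.
gs.run_command(
    "v.dissolve",
    input="farms",          # hypothetical input vector
    column="county",        # hypothetical text key column
    output="farms_by_county",
    aggregate_columns="sum(animals),sum(cows) / sum(animals)",
    result_columns="animal_total double precision,cow_share double precision",
    aggregate_backend="sql",
)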

NOTES

GRASS defines a vector area as composite entity consisting of a set of @@ -57,17 +155,158 @@

Dissolving adjacent SHAPE files to remove tile boundaries

v.dissolve input=clc2000_clean output=clc2000_final col=CODE_00 +

Attribute aggregation

+ +While dissolving, we can aggregate attribute values of the original features. +Let's aggregate the area in acres (ACRES) of all municipal boundaries +(boundary_municp) in the full NC dataset while dissolving common boundaries +based on the name in the DOTURBAN_N column +(long lines are split with a backslash marking the continued line, as in Bash): +
+v.dissolve input=boundary_municp column=DOTURBAN_N output=municipalities \
+    aggregate_columns=ACRES
+
+ +To inspect the result, we will use v.db.select retrieving only one row +for DOTURBAN_N == 'Wadesboro': + +
+v.db.select municipalities where="DOTURBAN_N == 'Wadesboro'" separator=tab
+
+ +The resulting table may look like this: + +
+cat  DOTURBAN_N    ACRES_n    ACRES_min    ACRES_max    ACRES_mean    ACRES_sum
+66   Wadesboro     2          634.987      3935.325     2285.156      4570.312
+
+ +The above created multiple columns for each of the statistics computed +by default. We can limit the number of statistics computed by specifying +the method which should be used: + +
+v.dissolve input=boundary_municp column=DOTURBAN_N output=municipalities_2 \
+    aggregate_columns=ACRES aggregate_methods=sum
+
+ +The above gives a single column with the sum for all values in the ACRES column +for each group of original features which had the same value in the DOTURBAN_N +column and are now dissolved (merged) into one. + +

Aggregating multiple attributes

+ +Expanding on the previous example, we can compute values for multiple columns +at once by adding more columns to the aggregate_columns option. +We will compute the average of the values in the NEW_PERC_G column: +
+v.dissolve input=boundary_municp column=DOTURBAN_N output=municipalities_3 \
+    aggregate_columns=ACRES,NEW_PERC_G aggregate_methods=sum,avg
+
+By default, all methods specified in aggregate_methods are applied +to all columns, so the result of the above is four columns. +While this is convenient for getting multiple statistics for similar columns +(e.g., averages and standard deviations of multiple population statistics columns), +in our case, each column is different and each aggregate method should be +applied only to its corresponding column. +

+The v.dissolve module will apply each aggregate method only to the +corresponding column when column names for the results are specified manually +with the result_columns option: + +

+v.dissolve input=boundary_municp column=DOTURBAN_N output=municipalities_4 \
+    aggregate_columns=ACRES,NEW_PERC_G aggregate_methods=sum,avg \
+    result_columns=acres,new_perc_g
+
+ +Now we have full control over what columns are created, but we also need to specify +an aggregate method for each column even when the aggregate methods are the same: + +
+v.dissolve input=boundary_municp column=DOTURBAN_N output=municipalities_5 \
+    aggregate_columns=ACRES,DOTURBAN_N,TEXT_NAME aggregate_methods=sum,count,count \
+    result_columns=acres,number_of_parts,named_parts
+
+ +While it is often not necessary to specify aggregate methods or names for +interactive exploratory analysis, specifying both aggregate_methods +and result_columns manually is a best practice for scripting +(unless SQL syntax is used for aggregate_columns, see below). + +
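In a script, the same kind of call through the grass.script Python API might look as follows; this is a sketch assuming the boundary_municp map used in the examples above, with columns, methods, and result names matched one-to-one and the backend fixed for reproducibility:

import grass.script as gs

gs.run_command(
    "v.dissolve",
    input="boundary_municp",
    column="DOTURBAN_N",
    output="municipalities_scripted",
    aggregate_columns="ACRES,DOTURBAN_N",
    aggregate_methods="sum,count",
    result_columns="acres,number_of_parts",
    aggregate_backend="sql",
)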

Aggregating using SQL syntax

+ +The aggregation can also be done using the full SQL syntax and the set of aggregate +functions available for a given attribute database backend. +Here, we will assume the default SQLite database backend for the attribute table. +

+ +Modifying the previous example, we will now specify the SQL aggregate function calls +explicitly instead of letting v.dissolve generate them for us. +We will compute the sum of the ACRES column using sum(ACRES) +(alternatively, we could use the SQLite-specific total(ACRES) +which returns zero even when all values are NULL). +Further, we will count the number of aggregated (i.e., dissolved) parts using +count(*), which counts all rows regardless of NULL values. +Then, we will count all unique names of parts as distinguished by +the MB_NAME column using count(distinct MB_NAME). +Finally, we will collect all these names into a comma-separated list using +group_concat(MB_NAME): +

+v.dissolve input=boundary_municp column=DOTURBAN_N output=municipalities_6 \
+    aggregate_columns="total(ACRES),count(*),count(distinct MB_NAME),group_concat(MB_NAME)" \
+    result_columns="acres REAL,named_parts INTEGER,unique_names INTEGER,names TEXT"
+
+ +Here, v.dissolve doesn't make any assumptions about the resulting +column types, so we specified both the name and the type of each column. +

+ +When working with general SQL syntax, v.dissolve turns off its checks for +the number of aggregate and result columns to allow all SQL syntax to be used +for aggregate columns. This also allows us to use functions with multiple parameters, +for example, to specify the separator to be used with group_concat: +

+    v.dissolve input=boundary_municp column=DOTURBAN_N output=municipalities_7 \
+        aggregate_columns="group_concat(MB_NAME, ';')" \
+        result_columns="names TEXT"
+
+ +To inspect the result, we will use v.db.select retrieving only one row +for DOTURBAN_N == 'Wadesboro': + +
+v.db.select municipalities_7 where="DOTURBAN_N == 'Wadesboro'" separator=tab
+
+ +The resulting table may look like this: + +
+cat	DOTURBAN_N	names
+66	Wadesboro	Wadesboro;Lilesville
+
+ +

SEE ALSO

v.category, v.centroids, v.extract, -v.reclass +v.reclass, +v.db.univar, +v.db.select

AUTHORS

-module: M. Hamish Bowman, Dept. Marine Science, Otago University, New Zealand
-Markus Neteler for column support
-help page: Trevor Wiens +M. Hamish Bowman, Department of Marine Science, Otago University, New Zealand (module)
+Markus Neteler (column support)
+Trevor Wiens (help page)
+Vaclav Petras, NC State University, Center for Geospatial Analytics, GeoForAll Lab (aggregate statistics) diff --git a/scripts/v.dissolve/v.dissolve.py b/scripts/v.dissolve/v.dissolve.py index a22112cc187..641c48c5b66 100755 --- a/scripts/v.dissolve/v.dissolve.py +++ b/scripts/v.dissolve/v.dissolve.py @@ -2,13 +2,13 @@ ############################################################################ # # MODULE: v.dissolve -# AUTHOR: M. Hamish Bowman, Dept. Marine Science, Otago University, -# New Zealand +# AUTHOR: M. Hamish Bowman, Dept. Marine Science, Otago University # Markus Neteler for column support # Converted to Python by Glynn Clements +# Vaclav Petras (aggregate statistics) # PURPOSE: Dissolve common boundaries between areas with common cat # (frontend to v.extract -d) -# COPYRIGHT: (c) 2006-2014 Hamish Bowman, and the GRASS Development Team +# COPYRIGHT: (c) 2006-2023 Hamish Bowman, and the GRASS Development Team # This program is free software under the GNU General Public # License (>=v2). Read the file COPYING that comes with GRASS # for details. @@ -23,64 +23,583 @@ # % keyword: line # %end # %option G_OPT_V_INPUT +# % guisection: Dissolving # %end # %option G_OPT_V_FIELD # % label: Layer number or name. # % required: no +# % guisection: Dissolving # %end # %option G_OPT_DB_COLUMN # % description: Name of attribute column used to dissolve common boundaries +# % guisection: Dissolving # %end # %option G_OPT_V_OUTPUT +# % guisection: Dissolving # %end +# %option G_OPT_DB_COLUMN +# % key: aggregate_columns +# % label: Names of attribute columns to get aggregate statistics for +# % description: One column name or SQL expression per method if result columns are specified +# % guisection: Aggregation +# % multiple: yes +# %end +# %option +# % key: aggregate_methods +# % label: Aggregate statistics method (e.g., sum) +# % description: Default is all available basic statistics for a given backend (for sql backend: avg, count, max, min, sum) +# % guisection: Aggregation +# % multiple: yes +# %end +# %option G_OPT_DB_COLUMN +# % key: result_columns +# % label: New attribute column names for aggregate statistics results +# % description: Defaults to aggregate column name and statistics name and can contain type +# % guisection: Aggregation +# % multiple: yes +# %end +# %option +# % key: aggregate_backend +# % label: Backend for attribute aggregation +# % description: Default is sql unless the provided aggregate methods are for univar +# % multiple: no +# % required: no +# % options: sql,univar +# % descriptions: sql;Uses SQL attribute database;univar;Uses v.db.univar +# % guisection: Aggregation +# %end +# %rules +# % requires_all: aggregate_methods,aggregate_columns +# % requires_all: result_columns,aggregate_columns +# %end + +"""Dissolve geometries and aggregate attribute values""" -import os import atexit +import json +import subprocess +from collections import defaultdict -import grass.script as grass +import grass.script as gs from grass.exceptions import CalledModuleError -def cleanup(): - nuldev = open(os.devnull, "w") - grass.run_command( +# Methods supported by v.db.univar by default. +UNIVAR_METHODS = [ + "n", + "min", + "max", + "range", + "mean", + "mean_abs", + "variance", + "stddev", + "coeff_var", + "sum", +] + +# Basic SQL aggregate function common between SQLite and PostgreSQL +# (and the SQL standard) using their proper names and order from +# their documentation. +# Notably, this does not include SQLite total which returns zero +# when all values are NULL. 
+STANDARD_SQL_FUNCTIONS = ["avg", "count", "max", "min", "sum"] + + +def get_methods_and_backend(methods, backend, provide_defaults): + """Get methods and backed based on user-provided methods and backend""" + if methods: + if not backend: + in_univar = 0 + neither_in_sql_nor_univar = 0 + for method in methods: + if method not in STANDARD_SQL_FUNCTIONS: + if method in UNIVAR_METHODS: + in_univar += 1 + else: + neither_in_sql_nor_univar += 1 + # If all the non-basic functions are available in univar, use it. + if in_univar and not neither_in_sql_nor_univar: + backend = "univar" + elif provide_defaults: + if backend == "sql": + methods = STANDARD_SQL_FUNCTIONS + elif backend == "univar": + methods = UNIVAR_METHODS + else: + # This is the default SQL functions but using the univar names (and order). + methods = ["n", "min", "max", "mean", "sum"] + backend = "sql" + if not backend: + backend = "sql" + return methods, backend + + +def modify_methods_for_backend(methods, backend): + """Modify list of methods to fit the backend if they do not + + This allows for support of the same method names for both backends. + It works both ways. + """ + new_methods = [] + if backend == "sql": + for method in methods: + if method == "n": + new_methods.append("count") + elif method == "mean": + new_methods.append("avg") + else: + new_methods.append(method) + elif backend == "univar": + for method in methods: + if method == "count": + new_methods.append("n") + elif method == "avg": + new_methods.append("mean") + else: + new_methods.append(method) + return new_methods + + +def quote_from_type(column_type): + """Returns quote if column values need to be quoted based on their type + + Defaults to quoting for unknown types and no quoting for falsely values, + i.e., unknown types are assumed to be in need of quoting while missing type + information is assumed to be associated with numbers which don't need quoting. + """ + # Needs a general solution, e.g., https://github.com/OSGeo/grass/pull/1110 + if not column_type or column_type.upper() in [ + "INT", + "INTEGER", + "SMALLINT", + "REAL", + "DOUBLE", + "DOUBLE PRECISION", + ]: + return "" + return "'" + + +def sql_escape(text): + """Escape string for use in SQL statement. + + If the argument is not string, it is returned as is. + + Simple support for direct creation of SQL statements. This function, + column_value_to_where, and updates_to_sql need a rewrite with a more systematic + solution for generating statements in Python for GRASS GIS attribute engine. 
+ """ + if isinstance(text, str): + return text.replace("'", "''") + return text + + +def updates_to_sql(table, updates): + """Create SQL from a list of dicts with column, value, where""" + sql = ["BEGIN TRANSACTION"] + for update in updates: + quote = quote_from_type(update.get("type", None)) + value = update["value"] + sql_value = f"{quote}{sql_escape(value) if value else 'NULL'}{quote}" + sql.append( + f"UPDATE {table} SET {update['column']} = {sql_value} " + f"WHERE {update['where']};" + ) + sql.append("END TRANSACTION") + return "\n".join(sql) + + +def update_columns(output_name, output_layer, updates, add_columns): + """Update attribute values based on a list of updates""" + if add_columns: + gs.run_command( + "v.db.addcolumn", + map=output_name, + layer=output_layer, + columns=",".join(add_columns), + ) + db_info = gs.vector_db(output_name)[int(output_layer)] + sql = updates_to_sql(table=db_info["table"], updates=updates) + gs.write_command( + "db.execute", + input="-", + database=db_info["database"], + driver=db_info["driver"], + stdin=sql, + ) + + +def column_value_to_where(column, value, *, quote): + """Create SQL where clause without the where keyword for column and its value""" + if value is None: + return f"{column} IS NULL" + if quote: + return f"{column}='{sql_escape(value)}'" + return f"{column}={value}" + + +def check_aggregate_methods_or_fatal(methods, backend): + """Check for known methods if possible or fail""" + if backend == "univar": + if not methods: + gs.fatal( + _( + "At least one method must be provided when backend " + "<{backend}> is used" + ).format(backend=backend) + ) + for method in methods: + if method not in UNIVAR_METHODS: + gs.fatal( + _( + "Method <{method}> is not available for backend <{backend}>" + ).format(method=method, backend=backend) + ) + # We don't have a list of available SQL functions. It is long for PostgreSQL + # and open for SQLite depending on its extensions. + + +def aggregate_columns_exist_or_fatal(vector, layer, columns): + """Check that all columns exist or end with fatal error""" + column_names = gs.vector_columns(vector, layer).keys() + for column in columns: + if column not in column_names: + if "(" in column: + gs.fatal( + _( + "Column <{column}> does not exist in vector <{vector}> " + "(layer <{layer}>). Specify result columns with 'name type' " + "syntax if you are using function calls instead of aggregate " + "column names only." + ).format(vector=vector, layer=layer, column=column) + ) + gs.fatal( + _( + "Column <{column}> selected for aggregation does not exist " + "in vector <{vector}> (layer <{layer}>)" + ).format(vector=vector, layer=layer, column=column) + ) + + +def match_columns_and_methods(columns, methods): + """Return all combinations of columns and methods + + If a column or a method is specified more than once, only the first occurrence + is used. This makes it suitable for interactive use which values convenience + over predictability. 
+ """ + new_columns = [] + new_methods = [] + used_columns = [] + for column in columns: + if column in used_columns: + continue + used_columns.append(column) + used_methods = [] + for method in methods: + if method in used_methods: + continue + used_methods.append(method) + new_columns.append(column) + new_methods.append(method) + return new_columns, new_methods + + +def create_or_check_result_columns_or_fatal( + result_columns, columns_to_aggregate, methods, backend +): + """Create result columns from input if not provided or check them""" + if not result_columns: + return [ + f"{gs.legalize_vector_name(aggregate_column)}_{method}" + for aggregate_column, method in zip(columns_to_aggregate, methods) + ] + + if methods and len(columns_to_aggregate) != len(methods): + gs.fatal( + _( + "When result columns are specified, the number of " + "aggregate columns ({columns_to_aggregate}) needs to be " + "the same as the number of methods ({methods})" + ).format( + columns_to_aggregate=len(columns_to_aggregate), + methods=len(methods), + ) + ) + # When methods are not set with sql backend, we might be dealing with the general + # SQL syntax provided for columns, so we can't parse that easily, so let's not + # check that here. + if (methods or backend != "sql") and len(result_columns) != len( + columns_to_aggregate + ): + gs.fatal( + _( + "The number of result columns ({result_columns}) needs to be " + "the same as the number of aggregate columns " + "({columns_to_aggregate})" + ).format( + result_columns=len(result_columns), + columns_to_aggregate=len(columns_to_aggregate), + ) + ) + if methods and len(result_columns) != len(methods): + gs.fatal( + _( + "The number of result columns ({result_columns}) needs to be " + "the same as the number of aggregation methods ({methods})" + ).format( + result_columns=len(result_columns), + methods=len(methods), + ) + ) + if not methods: + if backend == "sql": + for column in result_columns: + if " " not in column: + gs.fatal( + _( + "Result column '{column}' needs a type " + "specified (using the syntax: 'name type') " + "when no methods are provided with the " + "{option_name} option and aggregation backend is '{backend}'" + ).format( + column=column, + option_name="aggregate_methods", + backend=backend, + ) + ) + else: + gs.fatal( + _( + "Methods must be specified with {backend} backend " + "and with result columns provided" + ).format(backend=backend) + ) + return result_columns + + +def aggregate_attributes_sql( + input_name, + input_layer, + column, + quote_column, + columns_to_aggregate, + methods, + result_columns, +): + """Aggregate values in selected columns grouped by column using SQL backend""" + if methods and len(columns_to_aggregate) != len(result_columns): + raise ValueError( + "Number of columns_to_aggregate and result_columns must be the same" + ) + if methods and len(columns_to_aggregate) != len(methods): + raise ValueError("Number of columns_to_aggregate and methods must be the same") + if not methods: + for result_column in result_columns: + if " " not in result_column: + raise ValueError( + f"Column {result_column} from result_columns without type" + ) + if methods: + select_columns = [ + f"{method}({agg_column})" + for method, agg_column in zip(methods, columns_to_aggregate) + ] + column_types = [ + "INTEGER" if method == "count" else "DOUBLE" for method in methods + ] * len(columns_to_aggregate) + else: + select_columns = columns_to_aggregate + column_types = None + + data = json.loads( + gs.read_command( + "v.db.select", + 
map=input_name, + layer=input_layer, + columns=",".join([column] + select_columns), + group=column, + format="json", + ) + ) + # We added the group column to the select, so we need to skip it here. + select_column_names = [item["name"] for item in data["info"]["columns"]][1:] + updates = [] + add_columns = [] + if column_types: + for result_column, column_type in zip(result_columns, column_types): + add_columns.append(f"{result_column} {column_type}") + else: + # Column types are part of the result column name list. + add_columns = result_columns.copy() # Ensure we have our own copy. + # Split column definitions into two lists. + result_columns = [] + column_types = [] + for definition in add_columns: + column_name, column_type = definition.split(" ", maxsplit=1) + result_columns.append(column_name) + column_types.append(column_type) + for row in data["records"]: + where = column_value_to_where(column, row[column], quote=quote_column) + for ( + result_column, + column_type, + key, + ) in zip(result_columns, column_types, select_column_names): + updates.append( + { + "column": result_column, + "type": column_type, + "value": row[key], + "where": where, + } + ) + return updates, add_columns + + +def aggregate_attributes_univar( + input_name, + input_layer, + column, + quote_column, + columns_to_aggregate, + methods, + result_columns, +): + """Aggregate values in selected columns grouped by column using v.db.univar""" + if len(columns_to_aggregate) != len(methods) != len(result_columns): + raise ValueError( + "Number of columns_to_aggregate, methods, and result_columns " + "must be the same" + ) + records = json.loads( + gs.read_command( + "v.db.select", + map=input_name, + layer=input_layer, + columns=column, + group=column, + format="json", + ) + )["records"] + columns = defaultdict(list) + for agg_column, method, result in zip( + columns_to_aggregate, methods, result_columns + ): + columns[agg_column].append((method, result)) + column_types = [ + "INTEGER" if method == "n" else "DOUBLE" for method in methods + ] * len(columns_to_aggregate) + add_columns = [] + for result_column, column_type in zip(result_columns, column_types): + add_columns.append(f"{result_column} {column_type}") + unique_values = [record[column] for record in records] + updates = [] + for value in unique_values: + where = column_value_to_where(column, value, quote=quote_column) + # for i, aggregate_column in enumerate(columns_to_aggregate): + for aggregate_column, methods_results in columns.items(): + stats = json.loads( + gs.read_command( + "v.db.univar", + map=input_name, + column=aggregate_column, + format="json", + where=where, + ) + )["statistics"] + for method, result_column in methods_results: + updates.append( + { + "column": result_column, + "value": stats[method], + "where": where, + } + ) + return updates, add_columns + + +def cleanup(name): + """Remove temporary vector silently""" + gs.run_command( "g.remove", flags="f", type="vector", - name="%s_%s" % (output, tmp), + name=name, quiet=True, - stderr=nuldev, + stderr=subprocess.DEVNULL, + errors="ignore", ) -def main(): - global output, tmp +def remove_mapset_from_name(name): + """Remove the at-mapset part (if any) from the name""" + return name.split("@", maxsplit=1)[0] + + +def option_as_list(options, name): + """Get value of an option as a list""" + option = options[name] + if not option: + return [] + return [value.strip() for value in option.split(",")] + - input = options["input"] +def main(): + """Run the dissolve operation based on command line 
parameters""" + options, unused_flags = gs.parser() + input_vector = options["input"] output = options["output"] layer = options["layer"] column = options["column"] + aggregate_backend = options["aggregate_backend"] + + columns_to_aggregate = option_as_list(options, "aggregate_columns") + user_aggregate_methods = option_as_list(options, "aggregate_methods") + result_columns = option_as_list(options, "result_columns") - # setup temporary file - tmp = str(os.getpid()) + user_aggregate_methods, aggregate_backend = get_methods_and_backend( + user_aggregate_methods, aggregate_backend, provide_defaults=not result_columns + ) + if not result_columns: + aggregate_columns_exist_or_fatal(input_vector, layer, columns_to_aggregate) + columns_to_aggregate, user_aggregate_methods = match_columns_and_methods( + columns_to_aggregate, user_aggregate_methods + ) + aggregate_methods = modify_methods_for_backend( + user_aggregate_methods, backend=aggregate_backend + ) + check_aggregate_methods_or_fatal(aggregate_methods, backend=aggregate_backend) + result_columns = create_or_check_result_columns_or_fatal( + result_columns=result_columns, + columns_to_aggregate=columns_to_aggregate, + methods=user_aggregate_methods, + backend=aggregate_backend, + ) # does map exist? - if not grass.find_file(input, element="vector")["file"]: - grass.fatal(_("Vector map <%s> not found") % input) + if not gs.find_file(input_vector, element="vector")["file"]: + gs.fatal(_("Vector map <%s> not found") % input_vector) if not column: - grass.warning( + gs.warning( _( "No '%s' option specified. Dissolving based on category values from layer <%s>." ) % ("column", layer) ) - grass.run_command( - "v.extract", flags="d", input=input, output=output, type="area", layer=layer + gs.run_command( + "v.extract", + flags="d", + input=input_vector, + output=output, + type="area", + layer=layer, ) else: if int(layer) == -1: - grass.warning( + gs.warning( _( "Invalid layer number (%d). " "Parameter '%s' specified, assuming layer '1'." @@ -89,20 +608,33 @@ def main(): ) layer = "1" try: - coltype = grass.vector_columns(input, layer)[column] + coltype = gs.vector_columns(input_vector, layer)[column] except KeyError: - grass.fatal(_("Column <%s> not found") % column) + gs.fatal(_("Column <%s> not found") % column) if coltype["type"] not in ("INTEGER", "SMALLINT", "CHARACTER", "TEXT"): - grass.fatal(_("Key column must be of type integer or string")) + gs.fatal(_("Key column must be of type integer or string")) + column_is_str = coltype["type"] in ("CHARACTER", "TEXT") + if columns_to_aggregate and not column_is_str: + gs.fatal( + _( + "Key column type must be string (text) " + "for aggregation method to work, not '{column_type}'" + ).format(column_type=coltype["type"]) + ) - tmpfile = "%s_%s" % (output, tmp) + tmpfile = gs.append_node_pid(remove_mapset_from_name(output)) + atexit.register(cleanup, tmpfile) try: - grass.run_command( - "v.reclass", input=input, output=tmpfile, layer=layer, column=column + gs.run_command( + "v.reclass", + input=input_vector, + output=tmpfile, + layer=layer, + column=column, ) - grass.run_command( + gs.run_command( "v.extract", flags="d", input=tmpfile, @@ -110,21 +642,45 @@ def main(): type="area", layer=layer, ) - except CalledModuleError as e: - grass.fatal( - _( - "Final extraction steps failed." 
- " Check above error messages and" - " see following details:\n%s" + if columns_to_aggregate: + if aggregate_backend == "sql": + updates, add_columns = aggregate_attributes_sql( + input_name=input_vector, + input_layer=layer, + column=column, + quote_column=column_is_str, + columns_to_aggregate=columns_to_aggregate, + methods=aggregate_methods, + result_columns=result_columns, + ) + else: + updates, add_columns = aggregate_attributes_univar( + input_name=input_vector, + input_layer=layer, + column=column, + quote_column=column_is_str, + columns_to_aggregate=columns_to_aggregate, + methods=aggregate_methods, + result_columns=result_columns, + ) + update_columns( + output_name=output, + output_layer=layer, + updates=updates, + add_columns=add_columns, ) - % e + except CalledModuleError as error: + gs.fatal( + _( + "A processing step failed." + " Check the above error messages and" + " see the following details:\n{error}" + ).format(error=error) ) # write cmd history: - grass.vector_history(output) + gs.vector_history(output) if __name__ == "__main__": - options, flags = grass.parser() - atexit.register(cleanup) main() diff --git a/scripts/v.dissolve/v_dissolve.ipynb b/scripts/v.dissolve/v_dissolve.ipynb new file mode 100644 index 00000000000..f90907a704d --- /dev/null +++ b/scripts/v.dissolve/v_dissolve.ipynb @@ -0,0 +1,313 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# v.dissolve\n", + "\n", + "This notebook presents a couple of examples of _v.dissolve_ and an examination of its outputs.\n", + "\n", + "## Setup\n", + "\n", + "We will be using the NC SPM sample location." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import subprocess\n", + "import sys\n", + "\n", + "# Ask GRASS GIS where its Python packages are.\n", + "sys.path.append(\n", + " subprocess.check_output([\"grass\", \"--config\", \"python_path\"], text=True).strip()\n", + ")\n", + "\n", + "# Import GRASS packages\n", + "import grass.script as gs\n", + "import grass.jupyter as gj\n", + "\n", + "# Start GRASS Session\n", + "gj.init(\"~/data/grassdata/nc_basic_spm_grass7/user1\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dissolve by Attribute\n", + "\n", + "We will use ZIP codes to create town boundaries by dissolving boundaries of ZIP code areas. Let's see the ZIP codes:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "zipcodes = \"zipcodes\"\n", + "town_map = gj.Map()\n", + "town_map.d_vect(map=zipcodes)\n", + "town_map.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We dissolve boundaries between ZIP codes that have the same town name, which is stored in the NAME attribute." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "towns = \"towns_from_zipcodes\"\n", + "gs.run_command(\n", + " \"v.dissolve\",\n", + " input=zipcodes,\n", + " column=\"NAME\",\n", + " output=towns,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Color boundaries according to the primary key column called cat and display the result."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gs.run_command(\"v.colors\", map=towns, use=\"attr\", column=\"cat\", color=\"wave\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "town_map.d_vect(map=towns)\n", + "town_map.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "town_map.d_vect(map=zipcodes, fill_color=\"none\")\n", + "town_map.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dissolve with Attribute Aggregation\n", + "\n", + "Now let's count the number of ZIP codes in each town and compute the total area as a sum of an existing column in the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "towns_with_area = \"towns_with_area\"\n", + "gs.run_command(\n", + " \"v.dissolve\",\n", + " input=zipcodes,\n", + " column=\"NAME\",\n", + " output=towns_with_area,\n", + " aggregate_columns=\"SHAPE_Area,SHAPE_Area\",\n", + " aggregate_methods=\"count,sum\",\n", + " result_columns=\"num_zip_codes,town_area\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print the computed attributes:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "table = json.loads(gs.read_command(\"v.db.select\", map=towns_with_area, format=\"json\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for row in table[\"records\"]:\n", + " print(f'{row[\"NAME\"]:<14} {row[\"num_zip_codes\"]:>2} {row[\"town_area\"]:>12.0f}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now color the result using the total area:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gs.run_command(\n", + " \"v.colors\", map=towns_with_area, use=\"attr\", column=\"town_area\", color=\"plasma\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "town_map = gj.Map()\n", + "town_map.d_vect(map=towns_with_area)\n", + "town_map.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Images for Documentation\n", + "\n", + "Here, we use some of the data created above to create images for documentation."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "zip_map = gj.Map()\n", + "zip_map.d_vect(map=towns, flags=\"s\")\n", + "zip_map.d_vect(map=zipcodes, color=\"#222222\", width=2, type=\"boundary\")\n", + "zip_map.d_legend_vect()\n", + "zip_map.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "town_map = gj.Map()\n", + "town_map.d_vect(map=towns, flags=\"s\")\n", + "town_map.d_vect(map=towns_with_area, color=\"#222222\", width=2, type=\"boundary\")\n", + "town_map.d_legend_vect()\n", + "town_map.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This cell requires pngquant and optipng.\n", + "zip_map.save(\"v_dissolve_zipcodes.png\")\n", + "town_map.save(\"v_dissolve_towns.png\")\n", + "for filename in [\"v_dissolve_zipcodes.png\", \"v_dissolve_towns.png\"]:\n", + " !pngquant --ext \".png\" -f {filename}\n", + " !optipng -o7 {filename}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test\n", + "\n", + "For a small dataset, we can easily compute the same attribute values in Python. We do this assuming that all areas (polygons) with the same value will be dissolved (merged) together, possibly creating multipolygons." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "\n", + "# Get the original attribute data.\n", + "zip_table = json.loads(gs.read_command(\"v.db.select\", map=zipcodes, format=\"json\"))\n", + "# Restructure original data for easy lookup of area.\n", + "zip_records_by_town = defaultdict(list)\n", + "for row in zip_table[\"records\"]:\n", + " zip_records_by_town[row[\"NAME\"]].append(row[\"SHAPE_Area\"])\n", + "\n", + "# Check each row in the dissolved (aggregated) table.\n", + "for row in table[\"records\"]:\n", + " town_name = row[\"NAME\"]\n", + " town_area = row[\"town_area\"]\n", + " town_zip_codes = row[\"num_zip_codes\"]\n", + " areas_by_zip = zip_records_by_town[town_name]\n", + " # Check the number of ZIP codes.\n", + " if len(areas_by_zip) != town_zip_codes:\n", + " raise RuntimeError(f'Incorrect number of ZIP codes in town {row[\"NAME\"]}')\n", + " # Check the total area.\n", + " if round(sum(areas_by_zip)) != round(town_area):\n", + " raise RuntimeError(\n", + " f'Incorrect area for {row[\"NAME\"]}: {sum(areas_by_zip)} != {town_area}'\n", + " )\n", + "print(\"No exceptions. Test passed.\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/scripts/v.dissolve/v_dissolve_towns.png b/scripts/v.dissolve/v_dissolve_towns.png new file mode 100644 index 00000000000..52df4e7d67e Binary files /dev/null and b/scripts/v.dissolve/v_dissolve_towns.png differ diff --git a/scripts/v.dissolve/v_dissolve_zipcodes.png b/scripts/v.dissolve/v_dissolve_zipcodes.png new file mode 100644 index 00000000000..50a28233d20 Binary files /dev/null and b/scripts/v.dissolve/v_dissolve_zipcodes.png differ
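Both aggregation backends added in v.dissolve.py above return the same pair of structures: add_columns, a list of "name type" definitions for the new result columns, and updates, a list of dictionaries with column, value, and where keys (plus type in the SQL backend) that are applied to the output table per dissolved group. The snippet below is a self-contained, purely illustrative sketch of how such structures can be mapped to SQL; the helper name updates_to_sql and the generated statements are hypothetical and are not the module's actual update step.

```python
# Illustrative only: a hypothetical helper, not the module's actual update step.
def updates_to_sql(table, add_columns, updates):
    """Build SQL that adds the result columns and fills them per group."""
    statements = []
    for definition in add_columns:
        # Each definition is a "name type" pair, e.g. "town_area DOUBLE".
        statements.append(f"ALTER TABLE {table} ADD COLUMN {definition};")
    for update in updates:
        # Numeric values can be used as-is; text values would additionally need quoting.
        statements.append(
            f"UPDATE {table} SET {update['column']} = {update['value']} "
            f"WHERE {update['where']};"
        )
    return "\n".join(statements)


# Example data shaped like the return values of the aggregation helpers
# (the concrete names and numbers are made up for illustration).
add_columns = ["num_zip_codes INTEGER", "town_area DOUBLE"]
updates = [
    {"column": "num_zip_codes", "value": 5, "where": "NAME = 'CARY'"},
    {"column": "town_area", "value": 1234.5, "where": "NAME = 'CARY'"},
]
print(updates_to_sql("towns_with_area", add_columns, updates))
```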
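The notebook workflow can also be exercised as a plain Python script, for example when regenerating the documentation images or testing by hand. The sketch below is an assumption-laden example, not part of the change set: it assumes an active GRASS session in the NC sample location (for instance, started with grass ~/data/grassdata/nc_basic_spm_grass7/user1 --exec python script.py) and a v.dissolve build that includes the aggregation options added above, using the plural option names read in main().

```python
# Minimal sketch; assumes an active GRASS session in the NC sample location
# and a v.dissolve that includes the aggregation options added above.
import json

import grass.script as gs

# Dissolve ZIP code areas by town name and aggregate SHAPE_Area per town.
gs.run_command(
    "v.dissolve",
    input="zipcodes",
    column="NAME",
    output="towns_with_area",
    aggregate_columns="SHAPE_Area,SHAPE_Area",
    aggregate_methods="count,sum",
    result_columns="num_zip_codes,town_area",
    overwrite=True,
)

# Read the aggregated attribute table back as JSON and print it.
table = json.loads(
    gs.read_command("v.db.select", map="towns_with_area", format="json")
)
for row in table["records"]:
    print(row["NAME"], row["num_zip_codes"], row["town_area"])
```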