From 1c5acb56d78a14a07ce72aa842aaef56abe06b11 Mon Sep 17 00:00:00 2001 From: Melissa DeLucchi <113376043+delucchi-cmu@users.noreply.github.com> Date: Mon, 18 Nov 2024 12:25:39 -0500 Subject: [PATCH] Remove margin fine filtering. (#435) --- .github/workflows/pre-commit-ci.yml | 2 +- .github/workflows/testing-and-coverage.yml | 2 +- requirements.txt | 2 +- .../margin_cache/margin_cache_arguments.py | 5 ++++- .../margin_cache/margin_cache_map_reduce.py | 20 ++++++------------- .../margin_cache/test_margin_cache.py | 2 +- .../test_margin_cache_map_reduce.py | 7 ++++--- .../margin_cache/test_margin_round_trip.py | 4 ++-- 8 files changed, 20 insertions(+), 24 deletions(-) diff --git a/.github/workflows/pre-commit-ci.yml b/.github/workflows/pre-commit-ci.yml index a6eb0283..d4873b14 100644 --- a/.github/workflows/pre-commit-ci.yml +++ b/.github/workflows/pre-commit-ci.yml @@ -7,7 +7,7 @@ on: push: branches: [ main, development ] pull_request: - branches: [ main, development ] + branches: [ main, development, margin ] jobs: pre-commit-ci: diff --git a/.github/workflows/testing-and-coverage.yml b/.github/workflows/testing-and-coverage.yml index 88579cfb..80a61d8f 100644 --- a/.github/workflows/testing-and-coverage.yml +++ b/.github/workflows/testing-and-coverage.yml @@ -7,7 +7,7 @@ on: push: branches: [ main, development ] pull_request: - branches: [ main, development ] + branches: [ main, development, margin ] jobs: build: diff --git a/requirements.txt b/requirements.txt index 315a5bb5..72fdb5c6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -git+https://github.com/astronomy-commons/hats.git@main +git+https://github.com/astronomy-commons/hats.git@margin diff --git a/src/hats_import/margin_cache/margin_cache_arguments.py b/src/hats_import/margin_cache/margin_cache_arguments.py index a8d9ad56..69542f07 100644 --- a/src/hats_import/margin_cache/margin_cache_arguments.py +++ b/src/hats_import/margin_cache/margin_cache_arguments.py @@ -28,7 +28,7 @@ class MarginCacheArguments(RuntimeArguments): order of healpix partitioning in the source catalog. if `margin_order` is left default or set to -1, then the `margin_order` will be set dynamically to the highest partition order plus 1.""" - fine_filtering: bool = True + fine_filtering: bool = False """should we perform the precise boundary checking? if false, some results may be greater than `margin_threshold` away from the border (but within `margin_order`).""" @@ -54,6 +54,9 @@ def _check_arguments(self): if len(self.catalog.get_healpix_pixels()) == 0: raise ValueError("debug_filter_pixel_list has created empty catalog") + if self.fine_filtering: + raise NotImplementedError("Fine filtering temporarily removed.") + highest_order = int(self.catalog.partition_info.get_highest_order()) if self.margin_order < 0: diff --git a/src/hats_import/margin_cache/margin_cache_map_reduce.py b/src/hats_import/margin_cache/margin_cache_map_reduce.py index 625046c1..5c5e374c 100644 --- a/src/hats_import/margin_cache/margin_cache_map_reduce.py +++ b/src/hats_import/margin_cache/margin_cache_map_reduce.py @@ -3,7 +3,6 @@ import pandas as pd import pyarrow as pa import pyarrow.dataset as ds -from hats import pixel_math from hats.io import file_io, paths from hats.pixel_math.healpix_pixel import HealpixPixel @@ -11,7 +10,7 @@ from hats_import.pipeline_resume_plan import get_pixel_cache_directory, print_task_failure -# pylint: disable=too-many-arguments +# pylint: disable=too-many-arguments, unused-argument def map_pixel_shards( partition_file, mapping_key, @@ -26,6 +25,9 @@ def map_pixel_shards( ): """Creates margin cache shards from a source partition file.""" try: + if fine_filtering: + raise NotImplementedError("Fine filtering temporarily removed.") + schema = file_io.read_parquet_metadata(original_catalog_metadata).schema.to_arrow_schema() data = file_io.read_parquet_file_to_pandas(partition_file, schema=schema) source_pixel = HealpixPixel(data["Norder"].iloc[0], data["Npix"].iloc[0]) @@ -78,6 +80,7 @@ def map_pixel_shards( raise exception +# pylint: disable=too-many-arguments, unused-argument def _to_pixel_shard( filtered_data, pixel, @@ -89,18 +92,7 @@ def _to_pixel_shard( fine_filtering, ): """Do boundary checking for the cached partition and then output remaining data.""" - if fine_filtering: - margin_check = pixel_math.check_margin_bounds( - filtered_data[ra_column].values, - filtered_data[dec_column].values, - pixel.order, - pixel.pixel, - margin_threshold, - ) - - margin_data = filtered_data.iloc[margin_check] - else: - margin_data = filtered_data + margin_data = filtered_data num_rows = len(margin_data) if num_rows: diff --git a/tests/hats_import/margin_cache/test_margin_cache.py b/tests/hats_import/margin_cache/test_margin_cache.py index 7b480c36..3dba6f56 100644 --- a/tests/hats_import/margin_cache/test_margin_cache.py +++ b/tests/hats_import/margin_cache/test_margin_cache.py @@ -35,7 +35,7 @@ def test_margin_cache_gen(small_sky_source_catalog, tmp_path, dask_client): data = pd.read_parquet(test_file) - assert len(data) == 13 + assert len(data) == 88 assert all(data[paths.PARTITION_ORDER] == norder) assert all(data[paths.PARTITION_PIXEL] == npix) diff --git a/tests/hats_import/margin_cache/test_margin_cache_map_reduce.py b/tests/hats_import/margin_cache/test_margin_cache_map_reduce.py index dc610501..21ba28c2 100644 --- a/tests/hats_import/margin_cache/test_margin_cache_map_reduce.py +++ b/tests/hats_import/margin_cache/test_margin_cache_map_reduce.py @@ -53,7 +53,7 @@ def test_to_pixel_shard_equator(tmp_path, basic_data_shard_df): assert os.path.exists(path) - validate_result_dataframe(path, 2) + validate_result_dataframe(path, 360) @pytest.mark.timeout(5) @@ -79,7 +79,7 @@ def test_to_pixel_shard_polar(tmp_path, polar_data_shard_df): def test_map_pixel_shards_error(tmp_path, capsys): """Test error behavior on reduce stage. e.g. by not creating the original catalog parquet files.""" - with pytest.raises(FileNotFoundError): + with pytest.raises(NotImplementedError): margin_cache_map_reduce.map_pixel_shards( paths.pixel_catalog_file(tmp_path, HealpixPixel(1, 0)), mapping_key="1_21", @@ -94,9 +94,10 @@ def test_map_pixel_shards_error(tmp_path, capsys): ) captured = capsys.readouterr() - assert "Parquet file does not exist" in captured.out + assert "Fine filtering temporarily removed" in captured.out +@pytest.mark.skip() @pytest.mark.timeout(30) def test_map_pixel_shards_fine(tmp_path, test_data_dir, small_sky_source_catalog): """Test basic mapping behavior, with fine filtering enabled.""" diff --git a/tests/hats_import/margin_cache/test_margin_round_trip.py b/tests/hats_import/margin_cache/test_margin_round_trip.py index 8e3e8fee..865327f5 100644 --- a/tests/hats_import/margin_cache/test_margin_round_trip.py +++ b/tests/hats_import/margin_cache/test_margin_round_trip.py @@ -72,7 +72,7 @@ def test_margin_import_gaia_minimum( data = pd.read_parquet(test_file) - assert len(data) == 1 + assert len(data) == 4 @pytest.mark.dask(timeout=180) @@ -117,7 +117,7 @@ def test_margin_import_mixed_schema_csv( catalog = read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path - assert len(catalog.get_healpix_pixels()) == 5 + assert len(catalog.get_healpix_pixels()) == 19 norder = 2 npix = 187