Skip to content

Commit

Permalink
Index search and unit test data (#123)
Browse files Browse the repository at this point in the history
  • Loading branch information
delucchi-cmu authored Jan 17, 2024
1 parent 2ec29db commit 537f2ea
Show file tree
Hide file tree
Showing 10 changed files with 113 additions and 1 deletion.
19 changes: 18 additions & 1 deletion src/lsdb/catalog/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from lsdb.catalog.dataset.healpix_dataset import HealpixDataset
from lsdb.core.crossmatch.abstract_crossmatch_algorithm import AbstractCrossmatchAlgorithm
from lsdb.core.crossmatch.crossmatch_algorithms import BuiltInCrossmatchAlgorithm
from lsdb.core.search import ConeSearch, PolygonSearch
from lsdb.core.search import ConeSearch, IndexSearch, PolygonSearch
from lsdb.dask.crossmatch_catalog_data import crossmatch_catalog_data
from lsdb.dask.divisions import get_pixels_divisions
from lsdb.dask.join_catalog_data import join_catalog_data_on, join_catalog_data_through
Expand Down Expand Up @@ -218,6 +218,23 @@ def polygon_search(self, vertices: List[SphericalCoordinates]) -> Catalog:
"""
return self._search(PolygonSearch(vertices, self.hc_structure))

def index_search(self, ids, catalog_index: hc.catalog.index.index_catalog.IndexCatalog) -> Catalog:
    """Find rows by ids (or other value indexed by a catalog index).

    Filters partitions in the catalog to those that could contain the ids requested.
    Filters to points that have matching values in the id field.

    NB: This requires a previously-computed catalog index table.

    Args:
        ids: values to search for
        catalog_index: a pre-computed hipscat catalog index

    Returns:
        A new Catalog containing the points filtered to those matching the ids.
    """
    # Delegate to the shared search path, exactly as the other *_search methods do.
    return self._search(IndexSearch(ids, catalog_index))

def _search(self, search):
"""Find rows by reusable search algorithm.
Expand Down
1 change: 1 addition & 0 deletions src/lsdb/core/search/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .cone_search import ConeSearch
from .index_search import IndexSearch
from .polygon_search import PolygonSearch
30 changes: 30 additions & 0 deletions src/lsdb/core/search/index_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from typing import List

import pandas as pd
from hipscat.catalog.index.index_catalog import IndexCatalog
from hipscat.pixel_math import HealpixPixel

from lsdb.core.search.abstract_search import AbstractSearch


class IndexSearch(AbstractSearch):
    """Look up catalog rows through a pre-computed catalog index.

    The index is used to pick the partitions that could contain the requested
    values; each data frame is then filtered to the rows whose indexed column
    holds one of those values.

    NB: This requires a previously-computed catalog index table.
    """

    def __init__(self, ids, catalog_index: IndexCatalog):
        self.catalog_index = catalog_index
        self.ids = ids

    def search_partitions(self, _: List[HealpixPixel]) -> List[HealpixPixel]:
        """Return the partitions the catalog index gives for the requested ids."""
        # The incoming pixel list is not consulted; only the index is queried.
        target_pixels = self.catalog_index.loc_partitions(self.ids)
        return target_pixels

    def search_points(self, frame: pd.DataFrame) -> pd.DataFrame:
        """Keep only the rows of ``frame`` whose indexed column is among the ids."""
        column_name = self.catalog_index.catalog_info.indexing_column
        matches = frame[column_name].isin(self.ids)
        return frame[matches]
5 changes: 5 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@ def small_sky_hipscat_catalog(small_sky_dir):
return hc.catalog.Catalog.read_from_hipscat(small_sky_dir)


@pytest.fixture
def small_sky_order1_id_index_dir(test_data_dir):
    """Path of the ``small_sky_order1_id_index`` directory inside the test data dir."""
    index_dir_name = "small_sky_order1_id_index"
    return os.path.join(test_data_dir, index_dir_name)


@pytest.fixture
def small_sky_catalog(small_sky_dir):
    """Read ``small_sky_dir`` with ``lsdb.read_hipscat``, requesting an lsdb ``Catalog``."""
    catalog_class = lsdb.catalog.Catalog
    return lsdb.read_hipscat(small_sky_dir, catalog_type=catalog_class)
Expand Down
Binary file not shown.
Binary file added tests/data/small_sky_order1_id_index/_metadata
Binary file not shown.
8 changes: 8 additions & 0 deletions tests/data/small_sky_order1_id_index/catalog_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"catalog_name": "small_sky_order1_id_index",
"catalog_type": "index",
"total_rows": 131,
"primary_catalog": "/home/delucchi/git/hipscat/tests/data/small_sky_order1",
"indexing_column": "id",
"extra_columns": []
}
Binary file not shown.
31 changes: 31 additions & 0 deletions tests/data/small_sky_order1_id_index/provenance_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{
"catalog_name": "small_sky_order1_id_index",
"catalog_type": "index",
"total_rows": 131,
"primary_catalog": "/home/delucchi/git/hipscat/tests/data/small_sky_order1",
"indexing_column": "id",
"extra_columns": [],
"version": "0.2.1",
"generation_date": "2024.01.09",
"tool_args": {
"tool_name": "hipscat_import",
"version": "0.2.1",
"runtime_args": {
"catalog_name": "small_sky_order1_id_index",
"output_path": "/home/delucchi/git/hipscat/tests/data/",
"output_artifact_name": "small_sky_order1_id_index",
"tmp_dir": "",
"overwrite": true,
"dask_tmp": "",
"dask_n_workers": 1,
"dask_threads_per_worker": 1,
"catalog_path": "/home/delucchi/git/hipscat/tests/data/small_sky_order1_id_index",
"tmp_path": "/home/delucchi/git/hipscat/tests/data/small_sky_order1_id_index/intermediate",
"input_catalog_path": "/home/delucchi/git/hipscat/tests/data/small_sky_order1",
"indexing_column": "id",
"extra_columns": [],
"include_hipscat_index": "False",
"include_order_pixel": true
}
}
}
20 changes: 20 additions & 0 deletions tests/lsdb/catalog/test_index_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from hipscat.catalog.index.index_catalog import IndexCatalog


def test_index_search(small_sky_order1_catalog, small_sky_order1_id_index_dir, assert_divisions_are_correct):
    """Check the row count and the divisions of index searches for several id lists."""
    catalog_index = IndexCatalog.read_from_hipscat(small_sky_order1_id_index_dir)

    # (ids to search for, expected number of rows), checked in this order:
    # an integer id expected to yield no rows, the string "700" expected to
    # yield no rows, and the integer 700 expected to yield exactly one row.
    cases = [([900], 0), (["700"], 0), ([700], 1)]
    for ids, expected_rows in cases:
        index_search_catalog = small_sky_order1_catalog.index_search(ids, catalog_index)
        index_search_df = index_search_catalog.compute()
        assert len(index_search_df) == expected_rows
        assert_divisions_are_correct(index_search_catalog)

0 comments on commit 537f2ea

Please sign in to comment.