Source code for hats.catalog.index.index_catalog

import numpy as np
import pyarrow.compute as pc
import pyarrow.dataset as pds

from hats.catalog.dataset import Dataset
from hats.io import paths
from hats.pixel_math import HealpixPixel
from hats.pixel_math.healpix_pixel_function import get_pixel_argsort


class IndexCatalog(Dataset):
    """An index into HATS Catalog for enabling fast lookups on non-spatial values.

    Note that this is not a true "HATS Catalog", as it is not partitioned spatially.
    """
    def loc_partitions(self, ids) -> list[HealpixPixel]:
        """Find the set of partitions in the primary catalog for the ids provided.

        Parameters
        ----------
        ids
            values of the indexing column to look up in the primary catalog

        Returns
        -------
        list[HealpixPixel]
            partitions of leaf parquet files in the primary catalog that may
            contain rows for the id values
        """
        metadata_file = paths.get_parquet_metadata_pointer(self.catalog_base_dir)
        dataset = pds.parquet_dataset(metadata_file.path, filesystem=metadata_file.fs)

        # There's a lot happening in a few pyarrow dataset methods:
        # We create a simple pyarrow expression that roughly corresponds to a SQL statement like
        #     WHERE id_column IN (<ids>)
        # We stay in pyarrow to group by Norder/Npix and aggregate down to the unique values.
        # After that, convert into pandas, as this handles the integer type conversions
        # (uint8 and uint64 aren't always friendly between pyarrow and the rest of python),
        # and offers easy iteration to create our HealpixPixel list.
        filtered = dataset.filter(pc.field(self.catalog_info.indexing_column).isin(ids)).to_table()
        unique_pixel_dataframe = filtered.group_by(["Norder", "Npix"]).aggregate([]).to_pandas()

        loc_partitions = [
            HealpixPixel(order, pixel)
            for order, pixel in zip(
                unique_pixel_dataframe["Norder"],
                unique_pixel_dataframe["Npix"],
            )
        ]
        # Put the partitions in stable order (by nested healpix ordering).
        argsort = get_pixel_argsort(loc_partitions)
        loc_partitions = np.array(loc_partitions)[argsort]

        return loc_partitions
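
For context, a minimal usage sketch (not part of the module source): it assumes an index catalog built over a hypothetical indexing column and stored at a placeholder path, and that the top-level hats.read_hats loader resolves index catalog directories to IndexCatalog instances.

import hats

# Hypothetical path; the index directory and its indexing column are placeholders.
index_catalog = hats.read_hats("/data/catalogs/my_survey_id_index")

# Ask which HEALPix partitions of the primary catalog may contain rows for these id values.
pixels = index_catalog.loc_partitions([87, 543])
for pixel in pixels:
    print(pixel.order, pixel.pixel)

The returned pixels identify leaf parquet files in the primary catalog, so a follow-up read can be restricted to just those partitions instead of scanning the whole catalog.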