Source code for hats.pixel_math.partition_stats

"""Utilities for generating and manipulating object count histograms"""

from typing import Sequence

import numpy as np
import pandas as pd

import hats.pixel_math.healpix_shim as hp
from hats.pixel_tree.pixel_tree import PixelTree



[docs]
def empty_histogram(highest_order):
    """Use numpy to create an histogram array with the right shape, filled with zeros.

    Parameters
    ----------
    highest_order : int
        the highest healpix order (e.g. 0-10)

    Returns
    -------
    np.ndarray
        one-dimensional numpy array of long integers, where the length is equal to
        the number of pixels in a healpix map of target order, and all values are set to 0.
    """
    return np.zeros(hp.order2npix(highest_order), dtype=np.int64)




[docs]
def generate_histogram(
    data: pd.DataFrame,
    highest_order,
    ra_column="ra",
    dec_column="dec",
):
    """Generate a histogram of counts for objects found in `data`

    Parameters
    ----------
    data : pd.DataFrame
        tabular object data
    highest_order : int
        the highest healpix order (e.g. 0-10)
    ra_column : str
        where in the input to find the celestial coordinate right ascension (Default value = "ra")
    dec_column : str
        where in the input to find the celestial coordinate declination (Default value = "dec")

    Returns
    -------
    np.ndarray
        one-dimensional numpy array of long integers where the value at each index corresponds

    Raises
    ------
    ValueError
        if the `ra_column` or `dec_column` cannot be found in the input data.
    """
    histogram_result = empty_histogram(highest_order)

    # Verify that the data frame has columns with desired names.
    required_columns = [ra_column, dec_column]
    if not all(x in data.columns for x in required_columns):
        raise ValueError(f"Invalid column names in input: {ra_column}, {dec_column}")
    mapped_pixels = hp.radec2pix(
        highest_order,
        data[ra_column].values,
        data[dec_column].values,
    )
    mapped_pixel, count_at_pixel = np.unique(mapped_pixels, return_counts=True)
    histogram_result[mapped_pixel] += count_at_pixel.astype(np.int64)
    return histogram_result




[docs]
def generate_alignment(
    row_count_histogram,
    highest_order=10,
    lowest_order=0,
    threshold=1_000_000,
    drop_empty_siblings=False,
    mem_size_histogram=None,
):
    """Generate alignment from high order pixels to those of equal or lower order

    We may initially find healpix pixels at order 10, but after aggregating up to the pixel
    threshold, some final pixels are order 4 or 7. This method provides a map from pixels
    at order 10 to their destination pixel. This may be used as an input into later partitioning
    map reduce steps.

    Parameters
    ----------
    row_count_histogram : np.array
        one-dimensional numpy array of long integers where the value at each index corresponds to
        the number of objects found at the healpix pixel.
    highest_order : int
        the highest healpix order (e.g. 5-10) (Default value = 10)
    lowest_order : int
        the lowest healpix order (e.g. 1-5). specifying a lowest order
        constrains the partitioning to prevent spatially large pixels. (Default value = 0)
    threshold : int
        the maximum number of objects allowed in a single pixel (Default value = 1_000_000)
    drop_empty_siblings : bool
        if 3 of 4 pixels are empty, keep only the non-empty pixel (Default value = False)
    mem_size_histogram : np.array or None
        one-dimensional numpy array of long integers where the value at each index corresponds to
        the memory size (in bytes) of objects found at the healpix pixel. If provided, this will be
        used to determine the thresholding instead of the param `histogram`. (Default value = None)

    Returns
    -------
    tuple
        one-dimensional numpy array of integer 3-tuples, where the value at each index corresponds
        to the destination pixel at order less than or equal to the `highest_order`.
        The tuple contains three integers:

        - order of the destination pixel
        - pixel number *at the above order*
        - the number of objects in the pixel

    Raises
    ------
    ValueError
        if the histogram is the wrong size, or some initial histogram bins
        exceed threshold.
    """
    # Validate inputs.
    _validate_alignment_arguments(
        row_count_histogram, mem_size_histogram, highest_order, lowest_order, threshold
    )

    # Generate nested sums.
    nested_sums_row_count = _get_nested_sums(row_count_histogram, highest_order, lowest_order)
    if mem_size_histogram is not None:
        nested_sums_mem_size = _get_nested_sums(mem_size_histogram, highest_order, lowest_order)
    else:
        nested_sums_mem_size = None

    # Generate alignment.
    if drop_empty_siblings:
        return _get_alignment_dropping_siblings(
            nested_sums_row_count, highest_order, lowest_order, threshold, nested_sums_mem_size
        )
    return _get_alignment(nested_sums_row_count, highest_order, lowest_order, threshold, nested_sums_mem_size)



def _validate_alignment_arguments(
    row_count_histogram, mem_size_histogram, highest_order, lowest_order, threshold
):
    if len(row_count_histogram) != hp.order2npix(highest_order):
        raise ValueError("histogram is not the right size")
    if lowest_order > highest_order:
        raise ValueError("lowest_order should be less than highest_order")
    if mem_size_histogram is not None:
        max_bin = np.amax(mem_size_histogram)
        if max_bin > threshold:
            raise ValueError(f"single pixel mem_size {max_bin} exceeds threshold {threshold}")
    else:
        max_bin = np.amax(row_count_histogram)
        if max_bin > threshold:
            raise ValueError(f"single pixel row count {max_bin} exceeds threshold {threshold}")


def _get_nested_sums(histogram, highest_order, lowest_order):
    """Computes the total counts per pixel, for all orders
    between `lowest_order` and `highest_order`"""
    nested_sums = []
    for i in range(0, highest_order):
        nested_sums.append(empty_histogram(i))
    nested_sums.append(histogram)

    # work backward - from the highest order, fill in the sums of lower order pixels
    for read_order in range(highest_order, lowest_order, -1):
        parent_order = read_order - 1
        for index in range(0, len(nested_sums[read_order])):
            parent_pixel = index >> 2
            nested_sums[parent_order][parent_pixel] += nested_sums[read_order][index]

    return nested_sums


def _get_alignment(nested_sums_row_count, highest_order, lowest_order, threshold, nested_sums_mem_size):
    """Method to aggregate pixels up to the threshold.

    Checks from low order (large areas), drilling down into higher orders (smaller areas) to
    find the appropriate order for an area of sky."""
    if nested_sums_mem_size is not None:
        nested_sums = nested_sums_mem_size
    else:
        nested_sums = nested_sums_row_count

    # Initialize our alignment structure.
    nested_alignment = []
    for i in range(0, highest_order + 1):
        nested_alignment.append(np.full(hp.order2npix(i), None))

    # Work forward - determine if we should map to a lower order pixel, this pixel, or keep looking.
    for read_order in range(lowest_order, highest_order + 1):
        parent_order = read_order - 1
        for index in range(0, len(nested_sums[read_order])):
            parent_alignment = None
            if parent_order >= 0:
                parent_pixel = index >> 2
                parent_alignment = nested_alignment[parent_order][parent_pixel]

            if parent_alignment:
                nested_alignment[read_order][index] = parent_alignment
            elif nested_sums[read_order][index] == 0:
                continue
            elif nested_sums[read_order][index] <= threshold:
                # For row_count mode, use tuple of (order, pixel, row_count)
                if not nested_sums_mem_size:
                    nested_alignment[read_order][index] = (
                        read_order,
                        index,
                        nested_sums[read_order][index],
                    )
                # For mem_size mode, use tuple of (order, pixel, row_count, mem_size)
                else:
                    mem_size = nested_sums_mem_size[read_order][index]
                    row_count = nested_sums_row_count[read_order][index]
                    nested_alignment[read_order][index] = (
                        read_order,
                        index,
                        row_count,
                        mem_size,
                    )

    # We no longer need to store mem_size sums once the alignment has been constructed.
    if nested_sums_mem_size is not None:
        nested_alignment[highest_order] = np.array(
            [
                pixel_alignment[:3] if pixel_alignment else None
                for pixel_alignment in nested_alignment[highest_order]
            ],
            dtype="object",
        )

    return nested_alignment[highest_order]


def _get_alignment_dropping_siblings(
    nested_sum_row_count, highest_order, lowest_order, threshold, nested_sums_mem_size
):
    """Method to aggregate pixels up to the threshold that collapses completely empty pixels away.

    Checks from higher order (smaller areas) out to lower order (large areas). In this way, we are able to
    keep spatially isolated areas in pixels of higher order.

    This method can be slower than the above `_get_alignment` method, and so should only be used
    when the smaller area pixels are desired.

    This uses a form of hiearchical agglomeration (building a tree bottom-up). For each cell
    at order n, we look at the counts in all 4 subcells at order (n+1). We have two numeric
    values that are easy to compute that we can refer to easily:

    - quad_sum: the total number of counts in this cell
    - quad_max: the largest count within the 4 subcells

    Our agglomeration criteria (the conditions under which we collapse) must BOTH be met:

    - total number in cell is less than the global threshold (quad_sum <= threshold)
    - more than one subcell contains values (quad_sum != quad_max) (if exactly 1
      subcell contains counts, then all of the quad_sum will come from that single quad_max)

    Inversely, we will NOT collapse when EITHER is true:

    - total number in cell is greater than the threshold
    - only one subcell contains values
    """
    # If nested_sums_mem_size is provided, we're in mem_size mode (and thresholding by memory size).
    # This means we'll want to use the mem_size sums to generate our alignment, but still keep track
    # of the row counts for the output.
    if nested_sums_mem_size is not None:
        nested_sums = nested_sums_mem_size
    else:
        nested_sums = nested_sum_row_count

    # Initialize our order map to the highest order.
    order_map = np.array(
        [highest_order if count > 0 else -1 for count in nested_sums[highest_order]], dtype=np.int32
    )
    for pixel_order in range(highest_order - 1, lowest_order - 1, -1):
        for quad_start_index in range(0, hp.order2npix(pixel_order)):
            quad_sum = nested_sums[pixel_order][quad_start_index]
            quad_max = max(nested_sums[pixel_order + 1][quad_start_index * 4 : quad_start_index * 4 + 4])

            if quad_sum != quad_max and quad_sum <= threshold:
                ## Condition where we want to collapse pixels to the lower order (larger area)
                explosion_factor = 4 ** (highest_order - pixel_order)
                exploded_pixels = [
                    *range(
                        quad_start_index * explosion_factor,
                        (quad_start_index + 1) * explosion_factor,
                    )
                ]
                order_map[exploded_pixels] = pixel_order

    # Construct our results.
    nested_alignment = [
        (
            (intended_order, pixel_high_index >> 2 * (highest_order - intended_order))
            if intended_order >= 0
            else None
        )
        for pixel_high_index, intended_order in enumerate(order_map)
    ]
    # In both row_count and mem_size mode, use tuple of (order, pixel, row_count)
    # as mem_size is no longer needed now that alignment has been constructed.
    nested_alignment = [
        (tup[0], tup[1], nested_sum_row_count[tup[0]][tup[1]]) if tup else None for tup in nested_alignment
    ]

    return np.array(nested_alignment, dtype="object")



[docs]
def generate_incremental_alignment(
    row_count_histogram: np.ndarray,
    existing_pixels: Sequence[tuple[int, int]],
    highest_order: int = 10,
    lowest_order: int = 0,
    threshold: int = 1_000_000,
    mem_size_histogram: np.ndarray | None = None,
):
    """Generate alignment for an incremental catalog.

    We will keep the existing pixels and add new pixels for the points in the
    histogram that fall out of the existing coverage. Those pixels will be the
    largest (non-overlapping) possible that obey to the defined pixel `threshold`.

    Unlike `generate_alignment` there is no global guarantee that the number of
    points per pixel remains under the previous `pixel_threshold`.

    Parameters
    ----------
    row_count_histogram : np.ndarray
        one-dimensional numpy array of long integers where the
        value at each index corresponds to the number of objects
        found at the healpix pixel.
    existing_pixels : Sequence[tuple[int,int]]
        the list of pixels in the existing catalog that we want to keep
    highest_order : int
        the highest healpix order (e.g. 5-10) (Default value = 10)
    lowest_order : int
        the lowest healpix order (e.g. 1-5). specifying a lowest order
        constrains the partitioning to prevent spatially large pixels. (Default value = 0)
    threshold : int
        the maximum number of objects allowed in a single pixel (Default value = 1_000_000)
    mem_size_histogram : np.ndarray or None
        one-dimensional numpy array of long integers where the value at each index corresponds to
        the memory size (in bytes) of objects found at the healpix pixel. If provided, this will be
        used to determine the thresholding instead of the param `histogram`. (Default value = None)

    Returns
    -------
    tuple
        one-dimensional numpy array of integer 3-tuples, where the value at each index corresponds
        to the destination pixel at order less than or equal to the mapping order.
        The tuple contains three integers:

        - order of the destination pixel
        - pixel number *at the above order*
        - the number of objects in the pixel
    """
    _validate_alignment_arguments(
        row_count_histogram, mem_size_histogram, highest_order, lowest_order, threshold
    )

    nested_sums = _get_nested_sums(row_count_histogram, highest_order, lowest_order)

    tree = PixelTree.from_healpix(existing_pixels)
    if tree.tree_order > highest_order:
        raise ValueError("`highest_order` must be >= than existing catalog maximum order")

    nested_mask = _get_nested_mask(tree, lowest_order, highest_order)

    return _get_alignment_incremental_dropping_siblings(
        tree, nested_sums, nested_mask, highest_order, lowest_order, threshold
    )



def _get_nested_mask(tree, lowest_order, highest_order):
    """Generate tree masks for orders between `lowest_order` and `highest_order`"""
    nested_masks = {}
    for order in range(lowest_order, highest_order + 1):
        nested_masks[order] = _get_tree_mask_at_order(tree, order)
    return nested_masks


def _get_tree_mask_at_order(tree, order):
    """Generate boolean mask indicating whether each pixel at order
    `order` is in the `tree` or not."""
    mask = np.full(hp.order2npix(order), False, dtype=bool)
    if tree.tree_order > order:
        aligned_tree = tree.tree >> (2 * (tree.tree_order - order))
    else:
        aligned_tree = tree.tree << (2 * (order - tree.tree_order))
    for start, end in aligned_tree:
        if end - start > 0:
            mask[start:end] = True
        else:
            mask[start] = True
    return mask


def _get_alignment_incremental_dropping_siblings(
    tree, nested_sums, nested_mask, highest_order, lowest_order, threshold
):
    """Method to aggregate pixels up to the threshold that collapses completely empty pixels away.

    Checks from higher order (smaller areas) out to lower order (large areas). In this way, we are able to
    keep spatially isolated areas in pixels of higher order.

    Similar to `get_alignment_dropping_siblings` but with a main difference: we only aggregate the pixels
    that do not already exist in the `nested_mask`."""
    # Create the order map but assigning existing pixels
    order_map = _get_order_map_with_existing_pixels(nested_sums[highest_order], highest_order, tree)

    for pixel_order in range(highest_order - 1, lowest_order - 1, -1):
        for quad_start_index in range(0, hp.order2npix(pixel_order)):
            quad_sum = nested_sums[pixel_order][quad_start_index]
            quad_max = max(nested_sums[pixel_order + 1][quad_start_index * 4 : quad_start_index * 4 + 4])

            if (
                quad_sum != quad_max
                and quad_sum <= threshold
                and not nested_mask[pixel_order][quad_start_index]
            ):
                ## Condition where we want to collapse pixels to the lower order (larger area)
                explosion_factor = 4 ** (highest_order - pixel_order)
                exploded_pixels = [
                    *range(
                        quad_start_index * explosion_factor,
                        (quad_start_index + 1) * explosion_factor,
                    )
                ]
                order_map[exploded_pixels] = pixel_order

    # Construct our results.
    nested_alignment = [
        (
            (intended_order, pixel_high_index >> 2 * (highest_order - intended_order))
            if intended_order >= 0
            else None
        )
        for pixel_high_index, intended_order in enumerate(order_map)
    ]
    nested_alignment = [
        (tup[0], tup[1], nested_sums[tup[0]][tup[1]]) if tup else None for tup in nested_alignment
    ]

    return np.array(nested_alignment, dtype="object")


def _get_order_map_with_existing_pixels(histogram, highest_order, tree):
    """Create order map at the histogram `highest_order` and assign the existing `tree` pixels"""
    order_map = np.array([highest_order if count > 0 else -1 for count in histogram], dtype=np.int32)
    # Align tree to the `highest_order` (that of the histogram) first
    aligned_tree = tree.tree << (2 * (highest_order - tree.tree_order))
    # For each interval in the aligned tree, compute the total counts
    # from the histogram and assign the original pixel order in the map
    for (start, end), pixel in zip(aligned_tree, tree.pixels):
        total_sum = np.sum(histogram[start:end])
        if total_sum > 0:
            order_map[start:end] = pixel[0]
    return order_map