Source code for spateo.segmentation.qc

from typing import Callable, List, Optional, Tuple, Union

import numpy as np
from anndata import AnnData

from ..configuration import SKM
from ..errors import SegmentationError
from ..logging import logger_manager as lm


@SKM.check_adata_is_type(SKM.ADATA_AGG_TYPE)
[docs]def select_qc_regions(
    adata: AnnData,
    regions: Union[List[Tuple[int, int]], List[Tuple[int, int, int, int]]] = None,
    n: int = 4,
    size: int = 2000,
    seed: Optional[int] = None,
    use_scale: bool = True,
    absolute: bool = False,
    weight_func: Optional[Callable[[AnnData], float]] = lambda adata: np.log1p(adata.X.sum()),
):
    """Select regions to use for segmentation quality control purposes.

    Note:
        All coordinates are in terms of "real" coordinates (i.e. the coordinates
        in `adata.obs_names` and `adata.var_names`) so that slicing the
        AnnData retains the regions correctly.

    Args:
        adata: Input AnnData
        regions: List of tuples in the form `(xmin, ymin)` or `(xmin, xmax, ymin, ymax)`.
            If the later, the `size` argument is used to compute the bounding box.
        n: Number of regions to select if `regions` is not provided.
        size: Width and height, in pixels, of each randomly selected region.
        seed: Random seed.
        use_scale: Whether or not the provided `regions` are in scale units.
            This option only has effect when `regions` are
            provided. `False` means the provided coordinates are in terms of
            pixels.
        absolute: Whether or not the provided `regions` are in terms of absolute
            X and Y coordinates. This option only has effect when `regions` are
            provided. `False` means the provided coordinates are relative with
            respect to the coordinates in the provided `adata`.
        weight_func: Weighting function when `regions` is not provided. The probability of
            selecting each `size x size` region will be weighted by this function, which
            accepts a single AnnData (the region) as its argument, and returns a single
            float weight, such that higher weights mean higher probability. By default,
            the log1p of the sum of the counts in the `X` layer is used. Set to `None`
            to weight each region equally.
    """
    if not regions:
        lm.main_info(f"Randomly selecting {n} regions of shape {(size, size)}.")
        _regions = np.zeros((n, 4), dtype=int)
        # Construct grid indices
        indices = np.dstack(
            np.meshgrid(np.arange(0, adata.n_obs - size, size), np.arange(0, adata.n_vars - size, size))
        ).reshape(-1, 2)

        if indices.shape[0] == 0:
            raise SegmentationError("No possible regions found. This may indicate the `size` argument is to big.")

        rng = np.random.default_rng(seed)
        if weight_func is None:
            idx = rng.choice(np.arange(indices.shape[0]), n, replace=False)
        else:
            p = np.zeros(indices.shape[0])
            for i, (x, y) in enumerate(indices):
                p[i] = weight_func(adata[x : x + size, y : y + size])
            idx = rng.choice(np.arange(indices.shape[0]), n, replace=False, p=p / p.sum())
        choices = indices[idx]

        for i, (x, y) in enumerate(choices):
            xmin = int(adata.obs_names[x])
            ymin = int(adata.var_names[y])
            _regions[i] = [xmin, xmin + size, ymin, ymin + size]
    else:
        lm.main_info(f"Using regions provided with `regions` argument.")
        _regions = np.zeros((len(regions), 4), dtype=int)
        adata_bounds = SKM.get_agg_bounds(adata)
        binsize = SKM.get_uns_spatial_attribute(adata, SKM.UNS_SPATIAL_BINSIZE_KEY)
        scale = SKM.get_uns_spatial_attribute(adata, SKM.UNS_SPATIAL_SCALE_KEY) * binsize
        unit = SKM.get_uns_spatial_attribute(adata, SKM.UNS_SPATIAL_SCALE_UNIT_KEY)
        for i, region in enumerate(regions):
            if len(region) == 4:
                xmin, xmax, ymin, ymax = region
            elif len(region) == 2:
                xmin, ymin = region
                xmax = xmin + size
                ymax = ymin + size
            else:
                raise SegmentationError("`regions` must be a list of 4-element or 2-element tuples.")

            if use_scale and unit is not None:
                xmin /= scale
                xmax /= scale
                ymin /= scale
                ymax /= scale

            # If absolute = False, adjust for anndata bounds
            if not absolute:
                xmin += adata_bounds[0]
                xmax += adata_bounds[0]
                ymin += adata_bounds[2]
                ymax += adata_bounds[2]

            if xmin < adata_bounds[0] or xmax >= adata_bounds[1] or ymin < adata_bounds[2] or ymax >= adata_bounds[3]:
                lm.main_warning(f"Region {region} is out of bounds. It will be clipped into bounds.")
            xmin = max(xmin, adata_bounds[0])
            xmax = min(xmax, adata_bounds[1])
            ymin = max(ymin, adata_bounds[2])
            ymax = min(ymax, adata_bounds[3])

            _regions[i] = (xmin, xmax, ymin, ymax)

    _regions = _regions.astype(int)
    SKM.set_uns_spatial_attribute(adata, SKM.UNS_SPATIAL_QC_KEY, _regions)


[docs]def _generate_random_labels(shape: Tuple[int, int], areas: List[int], seed: Optional[int] = None) -> np.ndarray:
    n = np.prod(shape)
    if sum(areas) > n:
        raise SegmentationError("Sum of `areas` exceeds to total area")

    rng = np.random.default_rng(seed)
    labels = np.zeros(n, dtype=int)
    indices = np.arange(n)
    rng.shuffle(indices)
    for i, area in enumerate(areas):
        label = i + 1
        labels[indices[:area]] = label
        indices = indices[area:]
    return labels.reshape(shape)


@SKM.check_adata_is_type(SKM.ADATA_AGG_TYPE)
[docs]def generate_random_labels(
    adata: AnnData,
    areas: List[int],
    seed: Optional[int] = None,
    out_layer: str = "random_labels",
):
    """Create random labels, usually for benchmarking and QC purposes.

    Args:
        adata: Input Anndata
        areas: List of desired areas.
        seed: Random seed.
        out_layer: Layer to save results.
    """
    labels = _generate_random_labels(adata.shape, areas, seed)
    SKM.set_layer_data(adata, out_layer, labels)


@SKM.check_adata_is_type(SKM.ADATA_AGG_TYPE)
[docs]def generate_random_labels_like(
    adata: AnnData,
    layer: str,
    seed: Optional[int] = None,
    out_layer: str = "random_labels",
):
    """Create random labels, using another layer as a template.

    Args:
        adata: Input Anndata
        layer: Layer containing template labels
        seed: Random seed.
        out_layer: Layer to save results.
    """
    labels = SKM.select_layer_data(adata, layer)
    bincount = np.bincount(labels.flatten())
    generate_random_labels(adata, bincount[1:], seed=seed, out_layer=out_layer)