Source code for spateo.io.bgi

"""IO functions for BGI stereo technology.
"""
import gzip
import math
import warnings
from typing import Callable, Dict, List, NamedTuple, Optional, Tuple, Union

import cv2
import numpy as np
import pandas as pd

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    import skimage.io

from anndata import AnnData
from scipy.sparse import csr_matrix
from typing_extensions import Literal

from ..configuration import SKM
from ..logging import logger_manager as lm
from .utils import (
    bin_indices,
    centroids,
    get_bin_props,
    get_coords_labels,
    get_label_props,
    get_points_props,
    in_concave_hull,
)

try:
    import ngs_tools as ngs

[docs] VERSIONS = { "stereo": ngs.chemistry.get_chemistry("Stereo-seq").resolution, }
except ModuleNotFoundError: class SpatialResolution(NamedTuple): scale: float = 1.0 unit: Optional[Literal["nm", "um", "mm"]] = None VERSIONS = {"stereo": SpatialResolution(0.5, "um")}
[docs]COUNT_COLUMN_MAPPING = { SKM.X_LAYER: 3, SKM.SPLICED_LAYER_KEY: 4, SKM.UNSPLICED_LAYER_KEY: 5, }
[docs]def read_bgi_as_dataframe(path: str, label_column: Optional[str] = None) -> pd.DataFrame: """Read a BGI read file as a pandas DataFrame. Args: path: Path to read file. label_column: Column name containing positive cell labels. Returns: Pandas Dataframe with the following standardized column names. * `gene`: Gene name/ID (whatever was used in the original file) * `x`, `y`: X and Y coordinates * `total`, `spliced`, `unspliced`: Counts for each RNA species. The latter two is only present if they are in the original file. """ dtype = { "geneID": "category", # geneID "x": np.uint32, # x "y": np.uint32, # y # Multiple different names for total counts "MIDCounts": np.uint16, "MIDCount": np.uint16, "UMICount": np.uint16, "UMICounts": np.uint16, "EXONIC": np.uint16, # spliced "INTRONIC": np.uint16, # unspliced, } rename = { "MIDCounts": "total", "MIDCount": "total", "UMICount": "total", "UMICounts": "total", "EXONIC": "spliced", "INTRONIC": "unspliced", } # Use first 10 rows for validation. df = pd.read_csv(path, sep="\t", dtype=dtype, comment="#", nrows=10) if label_column: dtype.update({label_column: np.uint32}) rename.update({label_column: "label"}) if label_column not in df.columns: raise IOError(f"Column `{label_column}` is not present.") # If duplicate columns are provided, we don't know which to use! rename_inverse = {} for _from, _to in rename.items(): rename_inverse.setdefault(_to, []).append(_from) for _to, _cols in rename_inverse.items(): if sum(_from in df.columns for _from in _cols) > 1: raise IOError(f"Found multiple columns mapping to `{_to}`.") return pd.read_csv( path, sep="\t", dtype=dtype, comment="#", ).rename(columns=rename)
[docs]def dataframe_to_labels(df: pd.DataFrame, column: str, shape: Optional[Tuple[int, int]] = None) -> np.ndarray: """Convert a BGI dataframe that contains cell labels to a labels matrix. Args: df: Read dataframe, as returned by :func:`read_bgi_as_dataframe`. columns: Column that contains cell labels as positive integers. Any labels that are non-positive are ignored. Returns: Labels matrix """ shape = shape or (df["x"].max() + 1, df["y"].max() + 1) labels = np.zeros(shape, dtype=int) for label, _df in df.drop_duplicates(subset=[column, "x", "y"]).groupby(column): if label <= 0: continue labels[(_df["x"].values, _df["y"].values)] = label return labels
[docs]def dataframe_to_filled_labels(df: pd.DataFrame, column: str, shape: Optional[Tuple[int, int]] = None) -> np.ndarray: """Convert a BGI dataframe that contains cell labels to a (filled) labels matrix. Args: df: Read dataframe, as returned by :func:`read_bgi_as_dataframe`. columns: Column that contains cell labels as positive integers. Any labels that are non-positive are ignored. Returns: Labels matrix """ shape = shape or (df["x"].max() + 1, df["y"].max() + 1) labels = np.zeros(shape, dtype=int) for label, _df in df.drop_duplicates(subset=[column, "x", "y"]).groupby(column): if label <= 0: continue points = _df[["x", "y"]].values.astype(int) min_offset = points.min(axis=0) max_offset = points.max(axis=0) xmin, ymin = min_offset xmax, ymax = max_offset points -= min_offset hull = cv2.convexHull(points, returnPoints=True) mask = cv2.fillConvexPoly(np.zeros((max_offset - min_offset + 1)[::-1], dtype=np.uint8), hull, color=1).T labels[xmin : xmax + 1, ymin : ymax + 1][mask == 1] = label return labels
[docs]def read_bgi_agg( path: str, stain_path: Optional[str] = None, binsize: int = 1, gene_agg: Optional[Dict[str, Union[List[str], Callable[[str], bool]]]] = None, prealigned: bool = False, label_column: Optional[str] = None, version: Literal["stereo"] = "stereo", ) -> AnnData: """Read BGI read file to calculate total number of UMIs observed per coordinate. Args: path: Path to read file. stain_path: Path to nuclei staining image. Must have the same coordinate system as the read file. binsize: Size of pixel bins. gene_agg: Dictionary of layer keys to gene names to aggregate. For example, `{'mito': ['list', 'of', 'mitochondrial', 'genes']}` will yield an AnnData with a layer named "mito" with the aggregate total UMIs of the provided gene list. prealigned: Whether the stain image is already aligned with the minimum x and y RNA coordinates. label_column: Column that contains already-segmented cell labels. version: BGI technology version. Currently only used to set the scale and scale units of each unit coordinate. This may change in the future. Returns: An AnnData object containing the UMIs per coordinate and the nucleus staining image, if provided. The total UMIs are stored as a sparse matrix in `.X`, and spliced and unspliced counts (if present) are stored in `.layers['spliced']` and `.layers['unspliced']` respectively. The nuclei image is stored as a Numpy array in `.layers['nuclei']`. """ lm.main_debug(f"Reading data from {path}.") data = read_bgi_as_dataframe(path, label_column) x_min, y_min = data["x"].min(), data["y"].min() x, y = data["x"].values, data["y"].values x_max, y_max = x.max(), y.max() shape = (x_max + 1, y_max + 1) # Read image and update x,y max if appropriate layers = {} if stain_path: lm.main_debug(f"Reading stain image from {stain_path}.") image = skimage.io.imread(stain_path) if prealigned: lm.main_warning( ( "Assuming stain image was already aligned with the minimum x and y RNA coordinates. " "(prealinged=True)" ) ) image = np.pad(image, ((x_min, 0), (y_min, 0))) x_max = max(x_max, image.shape[0] - 1) y_max = max(y_max, image.shape[1] - 1) shape = (x_max + 1, y_max + 1) # Reshape image to match new x,y max if image.shape != shape: lm.main_warning(f"Padding stain image from {image.shape} to {shape} with zeros.") image = np.pad(image, ((0, shape[0] - image.shape[0]), (0, shape[1] - image.shape[1]))) layers[SKM.STAIN_LAYER_KEY] = image # Construct labels matrix if present labels = None if "label" in data.columns: lm.main_warning("Using the `label_column` option may result in disconnected labels.") labels = dataframe_to_labels(data, "label", shape) layers[SKM.LABELS_LAYER_KEY] = labels if binsize > 1: lm.main_info(f"Binning counts with binsize={binsize}.") shape = (math.ceil(shape[0] / binsize), math.ceil(shape[1] / binsize)) x = bin_indices(x, 0, binsize) y = bin_indices(y, 0, binsize) x_min, y_min = x.min(), y.min() # Resize image if necessary if stain_path: layers[SKM.STAIN_LAYER_KEY] = cv2.resize(image, shape[::-1]) if labels is not None: lm.main_warning("Cell labels were provided, but `binsize` > 1. There may be slight inconsistencies.") layers[SKM.LABELS_LAYER_KEY] = labels[::binsize, ::binsize] # See read_bgi_as_dataframe for standardized column names lm.main_info("Constructing count matrices.") X = csr_matrix((data["total"].values, (x, y)), shape=shape, dtype=np.uint16) if "spliced" in data.columns: layers[SKM.SPLICED_LAYER_KEY] = csr_matrix((data["spliced"].values, (x, y)), shape=shape, dtype=np.uint16) if "unspliced" in data.columns: layers[SKM.UNSPLICED_LAYER_KEY] = csr_matrix((data["unspliced"].values, (x, y)), shape=shape, dtype=np.uint16) # Aggregate gene lists if gene_agg: lm.main_info("Aggregating counts for genes provided by `gene_agg`.") for name, genes in gene_agg.items(): mask = data["geneID"].isin(genes) if isinstance(genes, list) else data["geneID"].map(genes) data_genes = data[mask] _x, _y = data_genes["x"].values, data_genes["y"].values layers[name] = csr_matrix( (data_genes["total"].values, (_x, _y)), shape=shape, dtype=np.uint16, ) adata = AnnData(X=X, layers=layers)[x_min:, y_min:].copy() scale, scale_unit = 1.0, None if version in VERSIONS: resolution = VERSIONS[version] scale, scale_unit = resolution.scale, resolution.unit # Set uns SKM.init_adata_type(adata, SKM.ADATA_AGG_TYPE) SKM.init_uns_pp_namespace(adata) SKM.init_uns_spatial_namespace(adata) SKM.set_uns_spatial_attribute(adata, SKM.UNS_SPATIAL_BINSIZE_KEY, binsize) SKM.set_uns_spatial_attribute(adata, SKM.UNS_SPATIAL_SCALE_KEY, scale) SKM.set_uns_spatial_attribute(adata, SKM.UNS_SPATIAL_SCALE_UNIT_KEY, scale_unit) return adata
@SKM.check_adata_is_type(SKM.ADATA_AGG_TYPE, "segmentation_adata", optional=True)
[docs]def read_bgi( path: str, binsize: Optional[int] = None, segmentation_adata: Optional[AnnData] = None, labels_layer: Optional[str] = None, labels: Optional[Union[np.ndarray, str]] = None, seg_binsize: int = 1, label_column: Optional[str] = None, add_props: bool = True, version: Literal["stereo"] = "stereo", ) -> AnnData: """Read BGI read file as AnnData. Args: path: Path to read file. binsize: Size of pixel bins. Should only be provided when labels (i.e. the `segmentation_adata` and `labels` arguments) are not used. segmentation_adata: AnnData containing segmentation results. labels_layer: Layer name in `segmentation_adata` containing labels. labels: Numpy array or path to numpy array saved with `np.save` that contains labels. seg_binsize: the bin size used in cell segmentation, used in conjunction with `labels` and will be overwritten when `labels_layer` and `segmentation_adata` are not None. label_column: Column that contains already-segmented cell labels. If this column is present, this takes prescedence. add_props: Whether or not to compute label properties, such as area, bounding box, centroid, etc. version: BGI technology version. Currently only used to set the scale and scale units of each unit coordinate. This may change in the future. Returns: Bins x genes or labels x genes AnnData. """ # Check inputs if sum([binsize is not None, segmentation_adata is not None, labels is not None, label_column is not None]) != 1: raise IOError("Exactly one of `segmentation_adata`, `binsize`, `labels`, `label_column` must be provided.") if (segmentation_adata is None) ^ (labels_layer is None): raise IOError("Both `segmentation_adata` and `labels_layer` must be provided.") if segmentation_adata is not None: if SKM.get_adata_type(segmentation_adata) != SKM.ADATA_AGG_TYPE: raise IOError("Only `AGG` type AnnDatas are supported.") if binsize is not None and abs(int(binsize)) != binsize: raise IOError("Positive integer `binsize` must be provided when `segmentation_adata` is not provided.") if isinstance(labels, str): labels = np.load(labels) lm.main_debug(f"Reading data from {path}.") data = read_bgi_as_dataframe(path, label_column) n_columns = data.shape[1] # Obtain total genes from raw data, so that the columns always match # regardless of what method was used. uniq_gene = sorted(data["geneID"].unique()) props = None if label_column is not None: lm.main_info(f"Using cell labels from `{label_column}` column.") binsize = 1 data = data[data["label"] > 0] if add_props: lm.main_warning( "Using `label_column` as cell labels with `add_props=True` may result in incorrect contours." ) props = get_points_props(data[["x", "y", "label"]]) elif binsize is not None: lm.main_info(f"Using binsize={binsize}") if binsize < 2: lm.main_warning("Please consider using a larger bin size.") if binsize > 1: x_bin = bin_indices(data["x"].values, 0, binsize) y_bin = bin_indices(data["y"].values, 0, binsize) data["x"], data["y"] = x_bin, y_bin data["label"] = data["x"].astype(str) + "-" + data["y"].astype(str) if add_props: props = get_bin_props(data[["x", "y", "label"]].drop_duplicates(), binsize) # Use labels. else: binsize = 1 shape = (data["x"].max(), data["y"].max()) if labels is not None: lm.main_info(f"Using labels provided with `labels` argument.") if labels.shape != shape: lm.main_warning(f"Labels matrix {labels.shape} has different shape as data matrix {shape}") else: labels = SKM.select_layer_data(segmentation_adata, labels_layer) label_coords = get_coords_labels(labels) if labels_layer is not None: lm.main_info(f"Using labels provided with `segmentation_adata` and `labels_layer` arguments.") seg_binsize = SKM.get_uns_spatial_attribute(segmentation_adata, SKM.UNS_SPATIAL_BINSIZE_KEY) x_min, y_min = ( int(segmentation_adata.obs_names[0]) * seg_binsize, int(segmentation_adata.var_names[0]) * seg_binsize, ) label_coords["x"] += x_min label_coords["y"] += y_min # When binning was used for segmentation, need to expand indices to cover # every binned pixel. if seg_binsize > 1: lm.main_warning("Binning was used for segmentation.") coords_dfs = [] for i in range(seg_binsize): for j in range(seg_binsize): coords = label_coords.copy() coords["x"] += i coords["y"] += j coords_dfs.append(coords) label_coords = pd.concat(coords_dfs, ignore_index=True) data = pd.merge(data, label_coords, on=["x", "y"], how="inner") if add_props: props = get_label_props(labels) uniq_cell = sorted(data["label"].unique()) shape = (len(uniq_cell), len(uniq_gene)) cell_dict = dict(zip(uniq_cell, range(len(uniq_cell)))) gene_dict = dict(zip(uniq_gene, range(len(uniq_gene)))) x_ind = data["label"].map(cell_dict).astype(int).values y_ind = data["geneID"].map(gene_dict).astype(int).values # See read_bgi_as_dataframe for standardized column names lm.main_info("Constructing count matrices.") X = csr_matrix((data["total"].values, (x_ind, y_ind)), shape=shape) layers = {} if "spliced" in data.columns: layers[SKM.SPLICED_LAYER_KEY] = csr_matrix((data["spliced"].values, (x_ind, y_ind)), shape=shape) if "unspliced" in data.columns: layers[SKM.UNSPLICED_LAYER_KEY] = csr_matrix((data["unspliced"].values, (x_ind, y_ind)), shape=shape) obs = pd.DataFrame(index=uniq_cell) var = pd.DataFrame(index=uniq_gene) adata = AnnData(X=X, obs=obs, var=var, layers=layers) if props is not None: ordered_props = props.loc[adata.obs_names] adata.obs["area"] = ordered_props["area"].values adata.obsm["spatial"] = ordered_props.filter(regex="centroid-").values adata.obsm["contour"] = ordered_props["contour"].values adata.obsm["bbox"] = ordered_props.filter(regex="bbox-").values scale, scale_unit = 1.0, None if version in VERSIONS: resolution = VERSIONS[version] scale, scale_unit = resolution.scale, resolution.unit # Set uns SKM.init_adata_type(adata, SKM.ADATA_UMI_TYPE) SKM.init_uns_pp_namespace(adata) SKM.init_uns_spatial_namespace(adata) SKM.set_uns_spatial_attribute(adata, SKM.UNS_SPATIAL_BINSIZE_KEY, binsize) SKM.set_uns_spatial_attribute(adata, SKM.UNS_SPATIAL_SCALE_KEY, scale) SKM.set_uns_spatial_attribute(adata, SKM.UNS_SPATIAL_SCALE_UNIT_KEY, scale_unit) return adata