"""Functions to generate simulated RNA data.
Adapted from code written by @HailinPan.
"""
from collections import deque
from typing import Optional, Tuple
import cv2
import numpy as np
from anndata import AnnData
from scipy import stats
from ..configuration import SKM
from ..errors import SegmentationError
[docs]def _create_labels(
shape: Tuple[int, int],
xs: np.ndarray,
ys: np.ndarray,
axes1: np.ndarray,
axes2: np.ndarray,
angles: np.ndarray,
shift: int = 3,
) -> np.ndarray:
"""Given simulated cell variables, generate a label array.
Args:
shape: The size of the X and Y axes, in pixels.
xs, ys: Center X and Y coordinates for each cell.
axes1, axes2: Axes lengths for each cell.
angles: Angle for each cell.
shift: Attempt to minimize overlaps between cells by shifting overlapping
cells by this amount (in reality, this is just a scaling factor, so
larger values mean to shift overlapping cells by more). Set to zero
or negative to disable.
Returns:
New Numpy array containing cell labels.
"""
n = xs.size
if n != xs.size or n != axes1.size or n != axes2.size or n != angles.size:
raise SegmentationError(f"All input arrays must have size {n}")
indices_to_add = deque(range(xs.size)) # +1 to get actual label to add!
labels = np.zeros(shape, dtype=np.int32)
i = 0
while indices_to_add:
if i >= n * 100:
raise SegmentationError(
f"Reached iteration {i}. Try reducing the number of cells or turn off shifting by setting `shift=0`."
)
idx = indices_to_add.popleft()
label = idx + 1
x, y, axis1, axis2, angle = xs[idx], ys[idx], axes1[idx], axes2[idx], angles[idx]
prev_labels = labels.copy()
cv2.ellipse(labels, (x, y), (axis1, axis2), angle, 0, 360, label, -1)
if shift > 1:
# Find & remove overlapping labels.
overlapping = np.unique(prev_labels[(labels == label) & (prev_labels > 0)])
labels[np.isin(labels, overlapping)] = 0
# Compute shifted locations for each of the overlapping labels.
for ov_label in overlapping:
ov_idx = ov_label - 1
if ov_idx not in indices_to_add:
indices_to_add.append(int(ov_idx))
ov_x, ov_y = xs[ov_idx], ys[ov_idx]
diff_x = ov_x - x
diff_y = ov_y - y
distance = np.sqrt(diff_x**2 + diff_y**2) + 1e-5
# Shift
xs[ov_idx] = min(max(0, round(ov_x + ((diff_x + 1e-5) / distance * shift))), shape[0])
ys[ov_idx] = min(max(0, round(ov_y + (diff_y + 1e-5) / distance * shift)), shape[1])
i += 1
return labels
[docs]def simulate_cells(
shape: Tuple[int, int],
n: int,
axis1_range: Tuple[int, int] = (7, 15),
axis2_range: Tuple[int, int] = (5, 14),
shift: int = 3,
foreground_params: Tuple[int, int, int] = (0.512, 1.96, 11.4),
background_params: Tuple[int, int, int] = (0.921, 1.08, 1.74),
seed: Optional[int] = None,
) -> AnnData:
"""Create a new AnnData object containing simulated cell labels and UMI
counts.
Cells are simulated as ellipses with the two axes lengths sampled from a
log-uniform distribution. The angle at which the cell is placed is sampled
uniformly at random.
Args:
shape: The size of the X and Y axes, in pixels.
n: Number of cells to simulate.
axis1_range: Range of the first axes.
axis2_range: Range of the second axes.
shift: Attempt to minimize overlaps between cells by shifting overlapping
cells by this amount (in reality, this is just a scaling factor, so
larger values mean to shift overlapping cells by more). Set to zero or
negative to disable.
foreground_params: Parameters for foreground expression, as a 3-element
tuple of (dropout rate, mean, variance).
background_params: Parameters for background expression, as a 3-element
tuple of (dropout rate, mean, variance).
seed: Random seed
Returns:
An Anndata object where X contains the simulated UMI counts and
`.layers['labels']` contains the simulated labels.
"""
def muvar_to_np(mu, var):
n = mu**2 / (var - mu)
p = mu / var
return n, p
f_do, f_mu, f_var = foreground_params
b_do, b_mu, b_var = background_params
if f_var < f_mu or b_var < b_mu:
raise SegmentationError("Variance must be larger than mean.")
f_n, f_p = muvar_to_np(f_mu, f_var)
b_n, b_p = muvar_to_np(b_mu, b_var)
rng = np.random.default_rng(seed)
xs = rng.integers(0, shape[0], n)
ys = rng.integers(0, shape[1], n)
axes1 = stats.loguniform.rvs(axis1_range[0], axis1_range[1], size=n, random_state=rng).astype(np.int32)
axes2 = stats.loguniform.rvs(axis2_range[0], axis2_range[1], size=n, random_state=rng).astype(np.int32)
angles = rng.uniform(0, 360, n)
labels = _create_labels(shape, xs, ys, axes1, axes2, angles, shift=shift)
f_X = stats.nbinom.rvs(f_n, f_p, size=shape, random_state=rng)
b_X = stats.nbinom.rvs(b_n, b_p, size=shape, random_state=rng)
# Dropout
f_X[rng.random(shape) < f_do] = 0
b_X[rng.random(shape) < b_do] = 0
X = np.where(labels > 0, f_X, b_X)
adata = AnnData(X=X, layers={"labels": labels})
SKM.init_adata_type(adata, SKM.ADATA_AGG_TYPE)
SKM.init_uns_pp_namespace(adata)
SKM.init_uns_spatial_namespace(adata)
SKM.set_uns_spatial_attribute(adata, SKM.UNS_SPATIAL_BINSIZE_KEY, None)
SKM.set_uns_spatial_attribute(adata, SKM.UNS_SPATIAL_SCALE_KEY, 1)
SKM.set_uns_spatial_attribute(adata, SKM.UNS_SPATIAL_SCALE_UNIT_KEY, None)
return adata