Spatial Decomposition

Calling cell-type compositions for spot-based spatial transcriptomics data

7 datasets · 13 methods · 2 control methods · 1 metric

Task info Method info Metric info Dataset info Results

Spatial decomposition (also often referred to as spatial deconvolution) is applicable to spatial transcriptomics data where the transcription profile of each capture location (spot, voxel, bead, etc.) does not share a bijective relationship with the cells in the tissue, i.e., multiple cells may contribute to the same capture location. The task of spatial decomposition then refers to estimating the composition of cell types/states that are present at each capture location. The cell type/state estimates are presented as proportion values, representing the proportion of the cells at each capture location that belong to a given cell type.

We distinguish between reference-based decomposition and de novo decomposition, where the former leverages external data (e.g., scRNA-seq or scNuc-seq) to guide the inference process, while the latter only works with the spatial data. We require that all datasets have an associated reference single-cell dataset, but methods are free to ignore this information.

Summary

// Ids that actually occur in `results`, listed in the order of the matching
// *_info table. Each cell builds its membership Set once instead of
// re-mapping `results` for every candidate id inside the filter callback.
poss_dataset_ids = (() => {
  const seen = new Set(results.map(r => r.dataset_id))
  return dataset_info
    .map(d => d.dataset_id)
    .filter(id => seen.has(id))
})()
poss_method_ids = (() => {
  const seen = new Set(results.map(r => r.method_id))
  return method_info
    .map(d => d.method_id)
    .filter(id => seen.has(id))
})()
poss_metric_ids = (() => {
  // A metric counts as present when it is a key of any result's scaled_scores.
  const seen = new Set(results.flatMap(r => Object.keys(r.scaled_scores)))
  return metric_info
    .map(d => d.metric_id)
    .filter(id => seen.has(id))
})()

// Long-format scores: one row per (method, dataset, metric) entry of
// scaled_scores, keeping only rows whose ids are currently selected in the
// method, metric and dataset filters.
results_long = results
  .flatMap(({method_id, dataset_id, scaled_scores}) =>
    Object.entries(scaled_scores).map(([metric_id, score]) => ({method_id, dataset_id, metric_id, score}))
  )
  .filter(row =>
    method_ids.includes(row.method_id) &&
    metric_ids.includes(row.metric_id) &&
    dataset_ids.includes(row.dataset_id)
  )

// One row per result: the method/dataset ids followed by every field of that
// result's `resources` object, flattened onto the same level.
results_resources = results.map(({method_id, dataset_id, resources}) => ({
  method_id,
  dataset_id,
  ...resources
}))

/**
 * Format a duration in seconds as a short label in a single unit.
 *
 * The value is floored to the largest unit that fits (s, m, h, d). Anything
 * of a week or more, and anything that fails every `<` comparison (NaN,
 * undefined), is reported as ">7d".
 *
 * @param {number} time - Duration in seconds.
 * @returns {string} Label such as "0s", "<1s", "42s", "3m", "5h", "2d" or ">7d".
 */
function label_time(time) {
  // [exclusive upper bound in seconds, formatter] pairs, checked in order.
  const tiers = [
    [1e-5, () => "0s"],
    [1, () => "<1s"],
    [60, t => `${Math.floor(t)}s`],
    [3600, t => `${Math.floor(t / 60)}m`],
    [3600 * 24, t => `${Math.floor(t / 3600)}h`],
    [3600 * 24 * 7, t => `${Math.floor(t / 3600 / 24)}d`],
  ];
  const tier = tiers.find(([limit]) => time < limit);
  return tier ? tier[1](time) : ">7d";
}

/**
 * Format an amount given in megabytes as a short label (M, G or T).
 *
 * @param {number} x_mb - Amount in megabytes.
 * @param {boolean} [include_mb=true] - When false, anything below 1e3 MB is
 *   collapsed to "<1G" instead of being reported in megabytes.
 * @returns {string} Label such as "<1M", "512M", "4G", "2T", "<1G" or ">1P".
 *   Values that fail every `<` comparison (1e9 and above, NaN, undefined)
 *   yield ">1P".
 */
function label_memory(x_mb, include_mb = true) {
  const below_gb = x_mb < 1e3;
  if (below_gb && !include_mb) return "<1G";
  if (x_mb < 1) return "<1M";
  // [suffix, divisor, exclusive upper bound in MB], smallest unit first.
  const units = [
    ["M", 1, 1e3],
    ["G", 1e3, 1e6],
    ["T", 1e6, 1e9],
  ];
  for (const [suffix, divisor, limit] of units) {
    if (x_mb < limit) return `${Math.round(x_mb / divisor)}${suffix}`;
  }
  return ">1P";
}

/**
 * Average the `score` fields of a list of result rows.
 *
 * Each score is clamped to [0, 1] first; a score that is undefined or NaN
 * counts as 0, so missing results pull the mean down instead of being skipped.
 *
 * @param {Array<{score: number}>} obj - Rows carrying a `score` field.
 * @returns {number|undefined} Whatever d3.mean returns for the clamped scores.
 */
function aggregate_scores(obj) {
  const clamped = obj.map(({score}) => {
    const missing = score === undefined || isNaN(score);
    return missing ? 0 : Math.min(1, Math.max(0, score));
  });
  return d3.mean(clamped);
}

/**
 * Mean of `x` after dropping every entry for which isNaN() is true.
 *
 * @param {Array} x - Values to average.
 * @returns {number|undefined} Whatever d3.mean returns for the kept values.
 */
function mean_na_rm(x) {
  const usable = x.filter(value => !isNaN(value));
  return d3.mean(usable);
}

/**
 * Turn a list of row objects into an object of column arrays.
 *
 * Column names are read from the first row only: keys that appear only in
 * later rows are ignored, and rows lacking a key contribute `undefined`.
 *
 * @param {Object[]} list - Rows to transpose.
 * @returns {Object<string, Array>} One array per key of the first row, each
 *   with one entry per row; `{}` when `list` is empty.
 */
function transpose_list_of_objects(list) {
  // An empty list has no first row to take keys from; without this guard
  // Object.keys(undefined) throws a TypeError.
  if (list.length === 0) return {};
  const columns = Object.keys(list[0]);
  return Object.fromEntries(columns.map(key => [key, list.map(row => row[key])]));
}

// Overall score per method: aggregate_scores over all of that method's rows
// in results_long.
overall = d3.rollups(results_long, aggregate_scores, row => row.method_id)
  .map(([method_id, mean_score]) => ({method_id, mean_score}))

// Per-method, per-dataset score: one object per method with a
// "dataset_<dataset_id>" key for every dataset that method has rows for.
per_dataset = d3.groups(results_long, row => row.method_id)
  .map(([method_id, method_rows]) => {
    const dataset_scores = Object.fromEntries(
      d3.groups(method_rows, row => row.dataset_id)
        .map(([dataset_id, dataset_rows]) => ["dataset_" + dataset_id, aggregate_scores(dataset_rows)])
    )
    return {method_id, ...dataset_scores}
  })

// Per-method, per-metric score: one object per method with a
// "metric_<metric_id>" key for every metric that method has rows for.
per_metric = d3.groups(results_long, row => row.method_id)
  .map(([method_id, method_rows]) => {
    const metric_scores = Object.fromEntries(
      d3.groups(method_rows, row => row.metric_id)
        .map(([metric_id, metric_rows]) => ["metric_" + metric_id, aggregate_scores(metric_rows)])
    )
    return {method_id, ...metric_scores}
  })

// Per-method resource summary across every run in results_resources:
// exit-code breakdown plus mean CPU, peak memory, disk I/O and duration.
// Each mean is computed once and reused for its *_mb/_sec, *_log and *_str
// fields.
resources = d3.groups(results_resources, d => d.method_id)
  .map(([method_id, values]) => {
    // d3.mean over a boolean accessor gives the fraction of runs matching it.
    const error_pct_oom = d3.mean(values, d => d.exit_code === 137)
    const error_pct_timeout = d3.mean(values, d => d.exit_code === 143)
    // Every other positive exit code is counted as a generic execution error.
    const error_pct_error = d3.mean(values, d => d.exit_code > 0) - error_pct_oom - error_pct_timeout
    const error_pct_ok = 1 - error_pct_oom - error_pct_timeout - error_pct_error
    const mean_cpu_pct = mean_na_rm(values.map(d => d.cpu_pct))
    const mean_peak_memory_mb = mean_na_rm(values.map(d => d.peak_memory_mb))
    const mean_disk_read_mb = mean_na_rm(values.map(d => d.disk_read_mb))
    const mean_disk_write_mb = mean_na_rm(values.map(d => d.disk_write_mb))
    const mean_duration_sec = mean_na_rm(values.map(d => d.duration_sec))
    return ({
      method_id,
      error_pct_error,
      error_pct_oom,
      error_pct_timeout,
      error_pct_ok,
      // Same order as palettes.error_reason.names:
      // memory limit exceeded, time limit exceeded, execution error, no error.
      error_reason: [error_pct_oom, error_pct_timeout, error_pct_error, error_pct_ok],
      mean_cpu_pct,
      mean_peak_memory_mb,
      // The *_log fields are negated log10 values, so lower usage gives a larger number.
      mean_peak_memory_log: -Math.log10(mean_peak_memory_mb),
      mean_peak_memory_str: " " + label_memory(mean_peak_memory_mb) + " ",
      mean_disk_read_mb,
      mean_disk_read_log: -Math.log10(mean_disk_read_mb),
      mean_disk_read_str: " " + label_memory(mean_disk_read_mb) + " ",
      mean_disk_write_mb,
      mean_disk_write_log: -Math.log10(mean_disk_write_mb),
      mean_disk_write_str: " " + label_memory(mean_disk_write_mb) + " ",
      mean_duration_sec,
      mean_duration_log: -Math.log10(mean_duration_sec),
      mean_duration_str: " " + label_time(mean_duration_sec) + " "
    })
  })

// One heatmap row per displayed method: its name, overall score, per-dataset
// and per-metric scores and resource summary, sorted by descending overall
// score. Control (is_baseline) methods are kept only while show_con is on.
summary_all = method_info
  .filter(d => show_con || !d.is_baseline)
  .filter(d => method_ids.includes(d.method_id))
  .map(method => {
    const method_id = method.method_id
    const method_name = method.method_name
    // A selected method can have no rows left in results_long (for example
    // when every dataset or metric is deselected), so `overall` may hold no
    // entry for it; leave the score undefined instead of throwing.
    const mean_score = overall.find(d => d.method_id === method_id)?.mean_score
    const datasets = per_dataset.find(d => d.method_id === method_id)
    const metrics = per_metric.find(d => d.method_id === method_id)
    const resources_ = resources.find(d => d.method_id === method_id)
    // Spreading an undefined lookup result adds no keys.
    return {method_id, method_name, mean_score, ...datasets, ...metrics, ...resources_}
  })
  .sort((a, b) => {
    // Rows without a score sort last; the explicit equality check keeps the
    // comparator from returning NaN when both scores are missing.
    const score_a = a.mean_score ?? -Infinity
    const score_b = b.mean_score ?? -Infinity
    return score_a === score_b ? 0 : score_b - score_a
  })

// Column specification for the heatmap: fixed method/overall columns, one
// column per selected dataset and metric (each sorted by display name), then
// the resource columns.
// Keep every possible key (label, palette, options) on the first entry:
// transpose_list_of_objects takes its column names from the first object only.
column_info = [
  {id: "method_name", name: "Name", label: null, group: "method", geom: "text", palette: null},
  {id: "mean_score", name: "Score", group: "overall", geom: "bar", palette: "overall"},
  {id: "error_reason", name: "Error reason", group: "overall", geom: "pie", palette: "error_reason"},
  ...dataset_info
    .filter(d => dataset_ids.includes(d.dataset_id)).map(d => ({id: "dataset_" + d.dataset_id, name: d.dataset_name, group: "dataset", geom: "funkyrect", palette: "dataset"}))
    .sort((a, b) => a.name.localeCompare(b.name)),
  ...metric_info
    .filter(d => metric_ids.includes(d.metric_id)).map(d => ({id: "metric_" + d.metric_id, name: d.metric_name, group: "metric", geom: "funkyrect", palette: "metric"}))
    .sort((a, b) => a.name.localeCompare(b.name)),
  {id: "mean_cpu_pct", name: "%CPU", group: "resources", geom: "funkyrect", palette: "resources"},
  {id: "mean_peak_memory_log", name: "Peak memory", label: "mean_peak_memory_str", group: "resources", geom: "rect", palette: "resources"},
  {id: "mean_disk_read_log", name: "Disk read", label: "mean_disk_read_str", group: "resources", geom: "rect", palette: "resources"},
  {id: "mean_disk_write_log", name: "Disk write", label: "mean_disk_write_str", group: "resources", geom: "rect", palette: "resources"},
  {id: "mean_duration_log", name: "Duration", label: "mean_duration_str", group: "resources", geom: "rect", palette: "resources"}
].map(d => {
  // Attach layout options: a wide name column and wider bar columns. No
  // column in the list above has id "is_baseline", so there is no case for it.
  if (d.id === "method_name") {
    return {...d, options: {width: 15, hjust: 0}}
  } else if (d.geom === "bar") {
    return {...d, options: {width: 4}}
  } else {
    return d
  }
})

// Header groups for the heatmap columns, written as [group, palette, level1]
// rows and expanded into objects. The dataset/metric headers are blanked when
// fewer than three entries exist in dataset_info / metric_info.
// NOTE(review): these counts use the full *_info tables, not the currently
// selected dataset_ids / metric_ids — confirm that is intended.
column_groups = [
  ["method", null, ""],
  ["overall", "overall", "Overall"],
  ["error_reason", "error_reason", "Error reason"],
  ["dataset", "dataset", dataset_info.length >= 3 ? "Datasets" : ""],
  ["metric", "metric", metric_info.length >= 3 ? "Metrics" : ""],
  ["resources", "resources", "Resources"]
].map(([group, palette, level1]) => ({group, palette, level1}))

// Palette per column group. error_reason pairs each colour with the label at
// the same index. The object literal is parenthesised so that the cell body
// is read as an expression.
palettes = ({
  overall: "Greys",
  dataset: "Blues",
  metric: "Reds",
  resources: "YlOrBr",
  error_reason: {
    colors: ["#8DD3C7", "#FFFFB3", "#BEBADA", "#FFFFFF"],
    names: ["Memory limit exceeded", "Time limit exceeded", "Execution error", "No error"]
  }
})

// Render the summary figure. Row data, column specs and column groups are
// passed column-wise (via transpose_list_of_objects); the display toggles
// color_by_rank and scale_column feed the options object and the last argument.
// NOTE(review): the two empty arrays are presumably the row-info and
// row-group arguments of funkyheatmapjs — confirm against its documentation.
funkyheatmap(
    transpose_list_of_objects(summary_all),
    transpose_list_of_objects(column_info),
    [],
    transpose_list_of_objects(column_groups),
    [],
    palettes,
    {
        fontSize: 14,
        rowHeight: 26,
        rootStyle: 'max-width: none',
        colorByRank: color_by_rank,
        // Colours come from CSS custom properties (--bs-*) rather than fixed values.
        theme: {
            oddRowBackground: 'var(--bs-body-bg)',
            evenRowBackground: 'var(--bs-button-hover)',
            textColor: 'var(--bs-body-color)',
            strokeColor: 'var(--bs-body-color)',
            headerColor: 'var(--bs-body-color)',
            hoverColor: 'var(--bs-body-color)'
        }
    },
    scale_column
);

Figure 1: Overview of the results per method. This figure shows the mean of the scaled scores (group Overall), the mean scores per dataset (group Dataset) and the mean scores per metric (group Metric).

Display settings

// Display toggles for the summary figure:
// - color_by_rank is forwarded as the colorByRank option of funkyheatmap(...);
// - scale_column is passed as the last argument of funkyheatmap(...);
// - show_con decides whether methods flagged is_baseline stay in summary_all.
viewof color_by_rank = Inputs.toggle({label: "Color by rank:", value: true})
viewof scale_column = Inputs.toggle({label: "Minmax column:", value: false})
viewof show_con = Inputs.toggle({label: "Show control methods:", value: true})

Filter datasets

// Dataset filter. Only datasets listed in poss_dataset_ids are offered; each
// box is labelled with dataset_name and contributes dataset_id to the value.
// `value` lists every dataset id — presumably so all boxes start ticked.
viewof dataset_ids = Inputs.checkbox(
  dataset_info.filter(d => poss_dataset_ids.includes(d.dataset_id)),
  {
    keyof: d => d.dataset_name,
    valueof: d => d.dataset_id,
    value: dataset_info.map(d => d.dataset_id),
    label: "Datasets:"
  }
)

Filter methods

// Method filter. Only methods listed in poss_method_ids are offered; each box
// is labelled with method_name and contributes method_id to the value.
// `value` lists every method id — presumably so all boxes start ticked.
viewof method_ids = Inputs.checkbox(
  method_info.filter(d => poss_method_ids.includes(d.method_id)),
  {
    keyof: d => d.method_name,
    valueof: d => d.method_id,
    value: method_info.map(d => d.method_id),
    label: "Methods:"
  }
)

Filter metrics

// Metric filter. Only metrics listed in poss_metric_ids are offered; each box
// is labelled with metric_name and contributes metric_id to the value.
// `value` lists every metric id — presumably so all boxes start ticked.
viewof metric_ids = Inputs.checkbox(
  metric_info.filter(d => poss_metric_ids.includes(d.metric_id)),
  {
    keyof: d => d.metric_name,
    valueof: d => d.metric_id,
    value: metric_info.map(d => d.metric_id),
    label: "Metrics:"
  }
)

// Load d3 v7, publish it and `_` on window, then dynamically import
// funkyheatmapjs 0.2.5 from unpkg and keep its default export as `funkyheatmap`.
// NOTE(review): the globals are set before the import, presumably because
// funkyheatmapjs reads d3 (and `_`, likely lodash) from window — confirm.
funkyheatmap = (await require('d3@7').then(d3 => {
  window.d3 = d3;
  window._ = _;
  return import('https://unpkg.com/funkyheatmapjs@0.2.5');
})).default;

Results

Results table of the scores per method, dataset and metric (after scaling). Use the filters to make a custom subselection of methods and datasets. The “Overall mean” dataset is the mean value across all datasets.

Dataset info

Show

DestVI

scRNA-seq is generated based on learned NB parameters from the DestVI manuscript, leveraging sparsePCA. The number of cells and cell types present in each spatial spot is computed via a combination of a kernel-based parametrization of a categorical distribution and the NB model (Lopez et al. 2022).

Pancreas (alpha=0.5)

Human pancreas cells aggregated from single-cell (Dirichlet alpha=0.5) (Luecken et al. 2021).

Pancreas (alpha=1)

Human pancreas cells aggregated from single-cell (Dirichlet alpha=1) (Luecken et al. 2021).

Pancreas (alpha=5)

Human pancreas cells aggregated from single-cell (Dirichlet alpha=5) (Luecken et al. 2021).

Tabula muris senis (alpha=0.5)

Mouse lung cells aggregated from single-cell (Dirichlet alpha=0.5) (Tabula Muris Consortium 2020).

Tabula muris senis (alpha=1)

Mouse lung cells aggregated from single-cell (Dirichlet alpha=1) (Tabula Muris Consortium 2020).

Tabula muris senis (alpha=5)

Mouse lung cells aggregated from single-cell (Dirichlet alpha=5) (Tabula Muris Consortium 2020).

Method info

Show

Cell2location (alpha=20, amortised, hard-coded)

Cell2location is a decomposition method based on Negative Binomial regression that is able to account for batch effects in estimating the single-cell gene expression signature used for the spatial decomposition step. Note that since batch information is unavailable in this task, here we use either a hard-coded reference, or a negative-binomial learned reference without batch labels. The parameter alpha refers to the detection efficiency prior (Kleshchevnikov et al. 2022). Links: Docs.

Cell2location (alpha=1, reference hard-coded)

Cell2location (alpha=20, reference hard-coded)

Cell2location (alpha=200, reference hard-coded)

Cell2location (alpha=20, NB reference)

DestVI

destVI is a decomposition method that leverages a conditional generative model of spatial transcriptomics down to the sub-cell-type variation level, which is then used to decompose the cell-type proportions determining the spatial organization of a tissue (Lopez et al. 2022). Links: Docs.

Non-Negative Matrix Factorization (NMF)

NMF is a decomposition method based on Non-negative Matrix Factorization (NMF) that reconstructs expression of each spatial location as a weighted combination of cell-type signatures defined by scRNA-seq. It is a simpler baseline than NMFreg as it only performs the NMF step based on mean expression signatures of cell types, returning the weight loadings of the NMF as (normalized) cell type proportions, without the regression step (Cichocki and Phan 2009). Links: Docs.

NMF-reg

NMFreg is a decomposition method based on Non-negative Matrix Factorization Regression (NMFreg) that reconstructs expression of each spatial location as a weighted combination of cell-type signatures defined by scRNA-seq. It was originally developed for Slide-seq data (Rodriques et al. 2019). Links: Docs.

Non-Negative Least Squares

NNLS is a decomposition method based on Non-Negative Least Squares (NNLS) regression. It was originally introduced by the method AutoGeneS (Aliee and Theis 2021). Links: Docs.

RCTD

RCTD (Robust Cell Type Decomposition) is a decomposition method that uses signatures learnt from single-cell data to decompose spatial expression of tissues. It includes a platform effect normalization step, which normalizes the scRNA-seq cell type profiles to match the platform effects of the spatial transcriptomics dataset (Cable et al. 2021). Links: Docs.

SeuratV3

SeuratV3 is a decomposition method that is based on Canonical Correlation Analysis (CCA) (Stuart et al. 2019). Links: Docs.

Stereoscope

Stereoscope is a decomposition method based on Negative Binomial regression. It is similar in scope and implementation to cell2location but less flexible to incorporate additional covariates such as batch effects and other types of experimental design annotations (Andersson et al. 2020). Links: Docs.

Tangram

Tangram is a method to map gene expression signatures from scRNA-seq data to spatial data. It performs the cell type mapping by learning a similarity matrix between single-cell and spatial locations based on gene expression profiles (Biancalani et al. 2021). Links: Docs.

Control method info

Show

Random Proportions

Random assignment of predicted celltype proportions from a Dirichlet distribution

True Proportions

Perfect assignment of predicted celltype proportions from the ground truth

Metric info

Show

r2

R2, or the “coefficient of determination”, reports the fraction of the true proportion values’ variance that can be explained by the predicted proportion values. The best score, and upper bound, is 1.0. There is no fixed lower bound for the metric. The uniform/non-weighted average across all cell types/states is used to summarise performance (Miles 2005).

Quality control results

Show

Category	Name	Value	Condition	Severity
Scaling	Worst score seuratv3 r2	-4.847695	worst_score >= -1	✗✗✗
Scaling	Worst score tangram r2	-2.638332	worst_score >= -1	✗✗

Normalisation visualisation

Show

Authors

Aliee, Hananeh, and Fabian J. Theis. 2021. “AutoGeneS: Automatic Gene Selection Using Multi-Objective Optimization for RNA-Seq Deconvolution.” Cell Systems 12 (7): 706–715.e4. https://doi.org/10.1016/j.cels.2021.05.006.

Andersson, Alma, Joseph Bergenstråhle, Michaela Asp, Ludvig Bergenstråhle, Aleksandra Jurek, José Fernández Navarro, and Joakim Lundeberg. 2020. “Single-Cell and Spatial Transcriptomics Enables Probabilistic Inference of Cell Type Topography.” Communications Biology 3 (1). https://doi.org/10.1038/s42003-020-01247-y.

Biancalani, Tommaso, Gabriele Scalia, Lorenzo Buffoni, Raghav Avasthi, Ziqing Lu, Aman Sanger, Neriman Tokcan, et al. 2021. “Deep Learning and Alignment of Spatially Resolved Single-Cell Transcriptomes with Tangram.” Nature Methods 18 (11): 1352–62. https://doi.org/10.1038/s41592-021-01264-7.

Cable, Dylan M., Evan Murray, Luli S. Zou, Aleksandrina Goeva, Evan Z. Macosko, Fei Chen, and Rafael A. Irizarry. 2021. “Robust Decomposition of Cell Type Mixtures in Spatial Transcriptomics.” Nature Biotechnology 40 (4): 517–26. https://doi.org/10.1038/s41587-021-00830-w.

Cichocki, Andrzej, and Anh-Huy Phan. 2009. “Fast Local Algorithms for Large Scale Nonnegative Matrix and Tensor Factorizations.” IEICE Transactions on Fundamentals of Electronics, Communications and Computer Sciences E92-a (3): 708–21. https://doi.org/10.1587/transfun.e92.a.708.

Kleshchevnikov, Vitalii, Artem Shmatko, Emma Dann, Alexander Aivazidis, Hamish W. King, Tong Li, Rasa Elmentaite, et al. 2022. “Cell2location Maps Fine-Grained Cell Types in Spatial Transcriptomics.” Nature Biotechnology 40 (5): 661–71. https://doi.org/10.1038/s41587-021-01139-4.

Lopez, Romain, Baoguo Li, Hadas Keren-Shaul, Pierre Boyeau, Merav Kedmi, David Pilzer, Adam Jelinski, et al. 2022. “DestVI Identifies Continuums of Cell Types in Spatial Transcriptomics Data.” Nature Biotechnology 40 (9): 1360–69. https://doi.org/10.1038/s41587-022-01272-8.

Luecken, Malte D., M. Büttner, K. Chaichoompu, A. Danese, M. Interlandi, M. F. Mueller, D. C. Strobl, et al. 2021. “Benchmarking Atlas-Level Data Integration in Single-Cell Genomics.” Nature Methods 19 (1): 41–50. https://doi.org/10.1038/s41592-021-01336-8.

Miles, Jeremy. 2005. “Encyclopedia of Statistics in Behavioral Science.” In. John Wiley & Sons, Ltd. https://doi.org/10.1002/0470013192.bsa526.

Rodriques, Samuel G., Robert R. Stickels, Aleksandrina Goeva, Carly A. Martin, Evan Murray, Charles R. Vanderburg, Joshua Welch, Linlin M. Chen, Fei Chen, and Evan Z. Macosko. 2019. “Slide-Seq: A Scalable Technology for Measuring Genome-Wide Expression at High Spatial Resolution.” Science 363 (6434): 1463–67. https://doi.org/10.1126/science.aaw1219.

Stuart, T., A. Butler, P. Hoffman, C. Hafemeister, E. Papalexi, W. M. Mauck, Y. Hao, M. Stoeckius, P. Smibert, and R. Satija. 2019. “Comprehensive Integration of Single-Cell Data.” Cell 177 (7): 1888–1902.e21. https://doi.org/10.1016/j.cell.2019.05.031.

Tabula Muris Consortium. 2020. “A Single-Cell Transcriptomic Atlas Characterizes Ageing Tissues in the Mouse.” Nature 583 (7817): 590–95. https://doi.org/10.1038/s41586-020-2496-1.