Batch integration embed
Removing batch effects while preserving biological variation (embedding output)
3 datasets · 32 methods · 8 control methods · 10 metrics
This is a sub-task of the overall batch integration task. Batch (or data) integration integrates datasets across batches that arise from various biological and technical sources. Methods that integrate batches typically have three different types of output: a corrected feature matrix, a joint embedding across batches, and/or an integrated cell-cell similarity graph (e.g., a kNN graph). This sub-task focuses on all methods that can output joint embeddings, and includes methods that canonically output corrected feature matrices with subsequent postprocessing to generate a joint embedding. Other sub-tasks for batch integration can be found for:
- graphs, and
- corrected features
This sub-task was taken from a benchmarking study of data integration methods.
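To make the three output types concrete, the sketch below shows how they map onto an AnnData object and how a joint embedding can be post-processed into a kNN graph (the input to the graphs sub-task). The toy data, the batch labels, and the slot name `X_emb` are illustrative assumptions, not part of the benchmark.

```python
import numpy as np
import anndata as ad
import scanpy as sc

# Toy example: 200 cells, 50 genes, two batches (hypothetical data).
rng = np.random.default_rng(0)
adata = ad.AnnData(X=rng.poisson(1.0, size=(200, 50)).astype(np.float32))
adata.obs["batch"] = np.repeat(["batch1", "batch2"], 100)

# The three output types of a batch-integration method map onto AnnData slots:
#   corrected feature matrix -> adata.X (or a layer)
#   joint embedding          -> adata.obsm["X_emb"]
#   integrated kNN graph     -> adata.obsp["connectivities"] / adata.obsp["distances"]

# Here we fake a joint embedding with PCA; a real method would fill adata.obsm["X_emb"].
sc.pp.pca(adata, n_comps=20)
adata.obsm["X_emb"] = adata.obsm["X_pca"]

# A joint embedding (the focus of this sub-task) can then be post-processed into a
# kNN graph, which is what the "graphs" sub-task evaluates.
sc.pp.neighbors(adata, use_rep="X_emb")
```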
Summary
Results
Results table of the scores per method, dataset and metric (after scaling). Use the filters to make a custom subselection of methods and datasets. The “Overall mean” dataset is the mean value across all datasets.
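As an illustration of how the table is assembled, the snippet below derives the "Overall mean" column from per-dataset scaled scores with pandas; the dataset and method names are real, but the score values are made up.

```python
import pandas as pd

# Hypothetical scaled scores (rows: methods, columns: datasets); values are illustrative only.
scores = pd.DataFrame(
    {
        "Immune (by batch)": [0.72, 0.65],
        "Lung (Vieira Braga et al.)": [0.61, 0.68],
        "Pancreas (by batch)": [0.70, 0.74],
    },
    index=["Harmony (hvg/unscaled)", "scVI (hvg/unscaled)"],
)

# The "Overall mean" dataset is the mean scaled score across all datasets.
scores["Overall mean"] = scores.mean(axis=1)
print(scores)
```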
Dataset info
Immune (by batch)
Human immune cells from peripheral blood and bone marrow taken from 5 datasets comprising 10 batches across technologies (10X, Smart-seq2) (Luecken et al. 2021).
Lung (Vieira Braga et al.)
Human lung scRNA-seq data from 3 datasets with 32,472 cells. From Vieira Braga et al. Technologies: 10X and Drop-seq (Luecken et al. 2021).
Pancreas (by batch)
Human pancreatic islet scRNA-seq data from 6 datasets across technologies (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, and SMARTER-seq) (Luecken et al. 2021).
Method info
Combat (full/scaled)
ComBat uses an Empirical Bayes (EB) approach to correct for batch effects. It estimates batch-specific parameters by pooling information across genes in each batch and shrinks the estimates towards the overall mean of the batch effect estimates across all genes. These parameters are then used to adjust the data for batch effects, leading to more accurate and reproducible results (Johnson, Li, and Rabinovic 2006). Links: Docs.
Combat (full/unscaled)
ComBat uses an Empirical Bayes (EB) approach to correct for batch effects. It estimates batch-specific parameters by pooling information across genes in each batch and shrinks the estimates towards the overall mean of the batch effect estimates across all genes. These parameters are then used to adjust the data for batch effects, leading to more accurate and reproducible results (Johnson, Li, and Rabinovic 2006). Links: Docs.
Combat (hvg/scaled)
ComBat uses an Empirical Bayes (EB) approach to correct for batch effects. It estimates batch-specific parameters by pooling information across genes in each batch and shrinks the estimates towards the overall mean of the batch effect estimates across all genes. These parameters are then used to adjust the data for batch effects, leading to more accurate and reproducible results (Johnson, Li, and Rabinovic 2006). Links: Docs.
Combat (hvg/unscaled)
ComBat uses an Empirical Bayes (EB) approach to correct for batch effects. It estimates batch-specific parameters by pooling information across genes in each batch and shrinks the estimates towards the overall mean of the batch effect estimates across all genes. These parameters are then used to adjust the data for batch effects, leading to more accurate and reproducible results (Johnson, Li, and Rabinovic 2006). Links: Docs.
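As a rough illustration (not the benchmark's pipeline), scanpy ships a ComBat implementation; combining it with the PCA post-processing used for this embedding sub-task might look as follows, on hypothetical toy data:

```python
import numpy as np
import anndata as ad
import scanpy as sc

rng = np.random.default_rng(0)
adata = ad.AnnData(X=rng.normal(size=(200, 50)).astype(np.float32))
adata.obs["batch"] = np.repeat(["batch1", "batch2"], 100)

# ComBat adjusts adata.X in place for the given batch key.
sc.pp.combat(adata, key="batch")

# ComBat canonically outputs corrected features; for this embedding sub-task the
# corrected matrix is post-processed into a joint embedding, e.g. via PCA.
sc.pp.pca(adata, n_comps=20)
adata.obsm["X_emb"] = adata.obsm["X_pca"]  # "X_emb" is an illustrative slot name
```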
FastMNN embed (full/scaled)
fastMNN performs a multi-sample PCA to reduce dimensionality, identifies MNN pairs in the low-dimensional space, and then corrects the target batch towards the reference using locally weighted correction vectors. The corrected target batch is then merged with the reference, and the process is repeated with the next target batch, except for the PCA step (Lun 2019). Links: Docs.
FastMNN embed (full/unscaled)
fastMNN performs a multi-sample PCA to reduce dimensionality, identifies MNN pairs in the low-dimensional space, and then corrects the target batch towards the reference using locally weighted correction vectors. The corrected target batch is then merged with the reference, and the process is repeated with the next target batch, except for the PCA step (Lun 2019). Links: Docs.
FastMNN embed (hvg/scaled)
fastMNN performs a multi-sample PCA to reduce dimensionality, identifies MNN pairs in the low-dimensional space, and then corrects the target batch towards the reference using locally weighted correction vectors. The corrected target batch is then merged with the reference, and the process is repeated with the next target batch, except for the PCA step (Lun 2019). Links: Docs.
FastMNN embed (hvg/unscaled)
fastMNN performs a multi-sample PCA to reduce dimensionality, identifies MNN pairs in the low-dimensional space, and then corrects the target batch towards the reference using locally weighted correction vectors. The corrected target batch is then merged with the reference, and the process is repeated with the next target batch, except for the PCA step (Lun 2019). Links: Docs.
Harmony (full/scaled)
Harmony is a method that uses PCA to group the cells into multi-dataset clusters, and then computes cluster-specific linear correction factors. Each cell is then corrected by its cell-specific linear factor using the cluster-weighted average. The method keeps iterating these four steps until cell clusters are stable (Korsunsky et al. 2019). Links: Docs.
Harmony (full/unscaled)
Harmony is a method that uses PCA to group the cells into multi-dataset clusters, and then computes cluster-specific linear correction factors. Each cell is then corrected by its cell-specific linear factor using the cluster-weighted average. The method keeps iterating these four steps until cell clusters are stable (Korsunsky et al. 2019). Links: Docs.
Harmony (hvg/scaled)
Harmony is a method that uses PCA to group the cells into multi-dataset clusters, and then computes cluster-specific linear correction factors. Each cell is then corrected by its cell-specific linear factor using the cluster-weighted average. The method keeps iterating these four steps until cell clusters are stable (Korsunsky et al. 2019). Links: Docs.
Harmony (hvg/unscaled)
Harmony is a method that uses PCA to group the cells into multi-dataset clusters, and then computes cluster-specific linear correction factors. Each cell is then corrected by its cell-specific linear factor using the cluster-weighted average. The method keeps iterating these four steps until cell clusters are stable (Korsunsky et al. 2019). Links: Docs.
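A minimal sketch of running Harmony through scanpy's wrapper around the harmonypy package, on toy data and with an illustrative slot name (not the benchmark's exact pipeline):

```python
import numpy as np
import anndata as ad
import scanpy as sc
import scanpy.external as sce

rng = np.random.default_rng(0)
adata = ad.AnnData(X=rng.normal(size=(200, 50)).astype(np.float32))
adata.obs["batch"] = np.repeat(["batch1", "batch2"], 100)

# Harmony iterates on an existing PCA embedding, so compute PCA first.
sc.pp.pca(adata, n_comps=20)

# Requires the `harmonypy` package; the corrected embedding is stored in
# adata.obsm["X_pca_harmony"].
sce.pp.harmony_integrate(adata, key="batch")
adata.obsm["X_emb"] = adata.obsm["X_pca_harmony"]
```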
Liger (full/unscaled)
LIGER (linked inference of genomic experimental relationships) uses integrative non-negative matrix factorization (iNMF), deriving and implementing a novel coordinate descent algorithm to perform the factorization efficiently. Joint clustering is then performed and factor loadings are normalised (Welch et al. 2019). Links: Docs.
Liger (hvg/unscaled)
LIGER (linked inference of genomic experimental relationships) uses integrative non-negative matrix factorization (iNMF), deriving and implementing a novel coordinate descent algorithm to perform the factorization efficiently. Joint clustering is then performed and factor loadings are normalised (Welch et al. 2019). Links: Docs.
MNN (full/scaled)
MNN first detects mutual nearest neighbours in two of the batches and infers a projection of the second onto the first batch. After that, additional batches are added iteratively (Haghverdi et al. 2018). Links: Docs.
MNN (full/unscaled)
MNN first detects mutual nearest neighbours in two of the batches and infers a projection of the second onto the first batch. After that, additional batches are added iteratively (Haghverdi et al. 2018). Links: Docs.
MNN (hvg/scaled)
MNN first detects mutual nearest neighbours in two of the batches and infers a projection of the second onto the first batch. After that, additional batches are added iteratively (Haghverdi et al. 2018). Links: Docs.
MNN (hvg/unscaled)
MNN first detects mutual nearest neighbours in two of the batches and infers a projection of the second onto the first batch. After that, additional batches are added iteratively (Haghverdi et al. 2018). Links: Docs.
SCALEX (full)
SCALEX is a method for integrating heterogeneous single-cell data online using a VAE framework. Its generalised encoder disentangles batch-related components from batch-invariant biological components, which are then projected into a common cell-embedding space (Xiong et al. 2022). Links: Docs.
SCALEX (hvg)
SCALEX is a method for integrating heterogeneous single-cell data online using a VAE framework. Its generalised encoder disentangles batch-related components from batch-invariant biological components, which are then projected into a common cell-embedding space (Xiong et al. 2022). Links: Docs.
Scanorama (full/scaled)
Scanorama is an extension of the MNN method. Unlike MNN, it finds mutual nearest neighbours over all batches and embeds observations into a joint hyperplane (Hie, Bryson, and Berger 2019). Links: Docs.
Scanorama (full/unscaled)
Scanorama is an extension of the MNN method. Unlike MNN, it finds mutual nearest neighbours over all batches and embeds observations into a joint hyperplane (Hie, Bryson, and Berger 2019). Links: Docs.
Scanorama (hvg/scaled)
Scanorama is an extension of the MNN method. Unlike MNN, it finds mutual nearest neighbours over all batches and embeds observations into a joint hyperplane (Hie, Bryson, and Berger 2019). Links: Docs.
Scanorama (hvg/unscaled)
Scanorama is an extension of the MNN method. Unlike MNN, it finds mutual nearest neighbours over all batches and embeds observations into a joint hyperplane (Hie, Bryson, and Berger 2019). Links: Docs.
Scanorama gene output (full/scaled)
Scanorama is an extension of the MNN method. Unlike MNN, it finds mutual nearest neighbours over all batches and embeds observations into a joint hyperplane (Hie, Bryson, and Berger 2019). Links: Docs.
Scanorama gene output (full/unscaled)
Scanorama is an extension of the MNN method. Unlike MNN, it finds mutual nearest neighbours over all batches and embeds observations into a joint hyperplane (Hie, Bryson, and Berger 2019). Links: Docs.
Scanorama gene output (hvg/scaled)
Scanorama is an extension of the MNN method. Unlike MNN, it finds mutual nearest neighbours over all batches and embeds observations into a joint hyperplane (Hie, Bryson, and Berger 2019). Links: Docs.
Scanorama gene output (hvg/unscaled)
Scanorama is an extension of the MNN method. Unlike MNN, it finds mutual nearest neighbours over all batches and embeds observations into a joint hyperplane (Hie, Bryson, and Berger 2019). Links: Docs.
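A minimal sketch of the Scanorama embedding output via scanpy's wrapper (requires the `scanorama` package); the toy data and slot names are illustrative assumptions:

```python
import numpy as np
import anndata as ad
import scanpy as sc
import scanpy.external as sce

rng = np.random.default_rng(0)
adata = ad.AnnData(X=rng.normal(size=(200, 50)).astype(np.float32))
# scanorama_integrate expects cells of the same batch to be contiguous in adata.
adata.obs["batch"] = np.repeat(["batch1", "batch2"], 100)

# Scanorama's embedding output corrects an existing PCA basis.
sc.pp.pca(adata, n_comps=20)

# The joint embedding is stored in adata.obsm["X_scanorama"]. For the
# "gene output" variants, Scanorama's corrected expression matrix would
# instead be post-processed with PCA, as in the ComBat sketch above.
sce.pp.scanorama_integrate(adata, key="batch")
adata.obsm["X_emb"] = adata.obsm["X_scanorama"]
```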
scANVI (full/unscaled)
scANVI is an extension of scVI that uses a Bayesian semi-supervised approach, additionally leveraging cell-type labels for more principled cell annotation (Xu et al. 2021). Links: Docs.
scANVI (hvg/unscaled)
scANVI is an extension of scVI that uses a Bayesian semi-supervised approach, additionally leveraging cell-type labels for more principled cell annotation (Xu et al. 2021). Links: Docs.
scVI (full/unscaled)
scVI combines a variational autoencoder with a hierarchical Bayesian model (Lopez et al. 2018). Links: Docs.
scVI (hvg/unscaled)
scVI combines a variational autoencoder with a hierarchical Bayesian model (Lopez et al. 2018). Links: Docs.
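A minimal sketch of obtaining scVI and scANVI embeddings with scvi-tools, on hypothetical toy counts and with toy training settings (the benchmark's actual hyperparameters differ):

```python
import numpy as np
import anndata as ad
import scvi

rng = np.random.default_rng(0)
adata = ad.AnnData(X=rng.poisson(1.0, size=(200, 50)).astype(np.float32))
adata.obs["batch"] = np.repeat(["batch1", "batch2"], 100)
adata.obs["cell_type"] = rng.choice(["T cell", "B cell"], size=200)

# scVI: train a conditional VAE on raw counts and use its latent space as the embedding.
scvi.model.SCVI.setup_anndata(adata, batch_key="batch")
vae = scvi.model.SCVI(adata)
vae.train(max_epochs=10)  # toy setting; defaults train much longer
adata.obsm["X_scVI"] = vae.get_latent_representation()

# scANVI: extend the trained scVI model with (partially observed) cell-type labels.
lvae = scvi.model.SCANVI.from_scvi_model(
    vae, labels_key="cell_type", unlabeled_category="Unknown"
)
lvae.train(max_epochs=10)
adata.obsm["X_scANVI"] = lvae.get_latent_representation()
```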
Control method info
Random Integration by Batch
Feature values, embedding coordinates, and graph connectivity are all randomly permuted within each batch label
Random Embedding by Celltype
Cells are embedded as a one-hot encoding of celltype labels
Random Embedding by Celltype (with jitter)
Cells are embedded as a one-hot encoding of celltype labels, with a small amount of random noise added to the embedding
Random Graph by Celltype
Cells are embedded as a one-hot encoding of celltype labels. A graph is then built on this embedding
Random Integration by Celltype
Feature values, embedding coordinates, and graph connectivity are all randomly permuted within each celltype label
No Integration
Cells are embedded by PCA on the unintegrated data. A graph is built on this PCA embedding
No Integration by Batch
Cells are embedded by computing PCA independently on each batch
Random Integration
Feature values, embedding coordinates, and graph connectivity are all randomly permuted
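For illustration, here is a hedged sketch of two of these controls on hypothetical toy data: a one-hot cell-type embedding and a random permutation of embedding coordinates within each batch. It is not the benchmark's implementation.

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
cell_type = pd.Series(rng.choice(["T cell", "B cell", "NK cell"], size=200))
batch = pd.Series(rng.choice(["batch1", "batch2"], size=200))
embedding = rng.normal(size=(200, 20))  # a hypothetical integrated embedding

# "Random Embedding by Celltype": a one-hot encoding of the cell-type labels.
onehot_embedding = pd.get_dummies(cell_type).to_numpy(dtype=float)

# "Random Integration by Batch" (embedding part): permute embedding coordinates
# within each batch label, destroying biology while keeping per-batch structure.
permuted = embedding.copy()
for b in batch.unique():
    idx = np.where(batch.to_numpy() == b)[0]
    permuted[idx] = embedding[rng.permutation(idx)]
```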
Metric info
ARI
ARI (Adjusted Rand Index) compares the overlap of two clusterings. It counts both correct clustering overlaps and correct disagreements between the two clusterings (Luecken et al. 2021).
Cell Cycle Score
The cell-cycle conservation score evaluates how well the cell-cycle effect can be captured before and after integration (Luecken et al. 2021).
Graph connectivity
The graph connectivity metric assesses whether the kNN graph representation, G, of the integrated data connects all cells with the same cell identity label (Luecken et al. 2021).
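A minimal sketch of how such a connectivity score can be computed from a sparse kNN connectivity matrix and a vector of cell-identity labels (an illustration of the definition, not the reference scib implementation):

```python
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components

def graph_connectivity(connectivities: csr_matrix, labels: np.ndarray) -> float:
    """For each label, take the fraction of its cells that fall in the largest
    connected component of the label-restricted kNN graph; average over labels."""
    scores = []
    for label in np.unique(labels):
        mask = labels == label
        sub = connectivities[mask][:, mask]
        _, comps = connected_components(sub, directed=False)
        largest = np.bincount(comps).max()
        scores.append(largest / mask.sum())
    return float(np.mean(scores))
```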
Isolated label F1
Isolated cell labels are identified as the labels present in the least number of batches in the integration task. The score evaluates how well these isolated labels separate from other cell identities based on clustering (Luecken et al. 2021).
Isolated label Silhouette
This score evaluates the compactness for the label(s) that is(are) shared by fewest batches. It indicates how well rare cell types can be preserved after integration (Luecken et al. 2021).
kBET
kBET determines whether the label composition of a k nearest neighborhood of a cell is similar to the expected (global) label composition. The test is repeated for a random subset of cells, and the results are summarized as a rejection rate over all tested neighborhoods (Büttner et al. 2018).
NMI
NMI compares the overlap of two clusterings. We used NMI to compare the cell-type labels with Louvain clusters computed on the integrated dataset (Luecken et al. 2021).
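Both ARI (above) and NMI compare a clustering of the integrated data against the cell-type labels. A minimal scikit-learn sketch on hypothetical labels (the benchmark itself clusters the integrated data with Louvain before scoring):

```python
import numpy as np
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# Hypothetical cell-type labels and cluster assignments computed on the integrated data.
cell_type = np.array(["T", "T", "B", "B", "NK", "NK"])
clusters = np.array([0, 0, 1, 1, 1, 2])

ari = adjusted_rand_score(cell_type, clusters)           # ARI
nmi = normalized_mutual_info_score(cell_type, clusters)  # NMI
```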
PC Regression
This metric compares the variance explained by the batch variable before and after integration. It returns a score between 0 and 1 (scaled=True), with 0 meaning the variance contribution has not changed. The larger the score, the more the variance contributions differ before and after integration (Luecken et al. 2021).
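A hedged sketch of the underlying computation: the variance explained by the batch covariate is estimated by regressing each principal component on batch and averaging the R² values, weighted by each component's variance; the before/after values are then compared. Variable names and the exact scaling are illustrative assumptions.

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

def pcr(X: np.ndarray, batch: np.ndarray, n_comps: int = 20) -> float:
    """Variance-weighted mean R^2 of regressing each principal component on batch."""
    pca = PCA(n_components=n_comps).fit(X)
    pcs = pca.transform(X)
    categories = np.unique(batch)
    onehot = np.eye(len(categories))[np.searchsorted(categories, batch)]
    r2 = np.array([
        LinearRegression().fit(onehot, pcs[:, i]).score(onehot, pcs[:, i])
        for i in range(n_comps)
    ])
    return float(np.average(r2, weights=pca.explained_variance_))

# The scaled comparison is then roughly the relative change in batch-explained
# variance, e.g. (pcr_before - pcr_after) / pcr_before, clipped to [0, 1].
```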
Silhouette
The absolute silhouette width is computed on cell identity labels, measuring their compactness (Luecken et al. 2021).
Batch ASW
The absolute silhouette width is computed over batch labels per cell. A score of 0 indicates that batches are well mixed, and any deviation from 0 indicates a batch effect, so we use 1-abs(ASW) to map the score to the scale [0;1] (Luecken et al. 2021).
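A minimal sketch of the core 1-abs(ASW) mapping on a joint embedding (illustrative only; the benchmark additionally computes this per cell-identity group and averages):

```python
import numpy as np
from sklearn.metrics import silhouette_samples

def batch_asw(embedding: np.ndarray, batch: np.ndarray) -> float:
    """Mean of 1 - |silhouette| per cell over batch labels; 1 means well mixed."""
    sil = silhouette_samples(embedding, batch)
    return float(np.mean(1.0 - np.abs(sil)))
```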
Quality control results
✓ All checks succeeded!
Normalisation visualisation
References
Büttner, Maren, Zhichao Miao, F. Alexander Wolf, Sarah A. Teichmann, and Fabian J. Theis. 2018. “A Test Metric for Assessing Single-Cell RNA-Seq Batch Correction.” Nature Methods 16 (1): 43–49. https://doi.org/10.1038/s41592-018-0254-1.
Haghverdi, Laleh, Aaron T L Lun, Michael D Morgan, and John C Marioni. 2018. “Batch Effects in Single-Cell RNA-Sequencing Data Are Corrected by Matching Mutual Nearest Neighbors.” Nature Biotechnology 36 (5): 421–27. https://doi.org/10.1038/nbt.4091.
Hie, Brian, Bryan Bryson, and Bonnie Berger. 2019. “Efficient Integration of Heterogeneous Single-Cell Transcriptomes Using Scanorama.” Nature Biotechnology 37 (6): 685–91. https://doi.org/10.1038/s41587-019-0113-3.
Johnson, W. Evan, Cheng Li, and Ariel Rabinovic. 2006. “Adjusting Batch Effects in Microarray Expression Data Using Empirical Bayes Methods.” Biostatistics 8 (1): 118–27. https://doi.org/10.1093/biostatistics/kxj037.
Korsunsky, Ilya, Nghia Millard, Jean Fan, Kamil Slowikowski, Fan Zhang, Kevin Wei, Yuriy Baglaenko, Michael Brenner, Po-ru Loh, and Soumya Raychaudhuri. 2019. “Fast, Sensitive and Accurate Integration of Single-Cell Data with Harmony.” Nature Methods 16 (12): 1289–96. https://doi.org/10.1038/s41592-019-0619-0.
Lopez, Romain, Jeffrey Regier, Michael B. Cole, Michael I. Jordan, and Nir Yosef. 2018. “Deep Generative Modeling for Single-Cell Transcriptomics.” Nature Methods 15 (12): 1053–58. https://doi.org/10.1038/s41592-018-0229-2.
Luecken, Malte D., M. Büttner, K. Chaichoompu, A. Danese, M. Interlandi, M. F. Mueller, D. C. Strobl, et al. 2021. “Benchmarking Atlas-Level Data Integration in Single-Cell Genomics.” Nature Methods 19 (1): 41–50. https://doi.org/10.1038/s41592-021-01336-8.
Lun, Aaron. 2019. “A Description of the Theory Behind the fastMNN Algorithm.” https://marionilab.github.io/FurtherMNN2018/theory/description.html.
Welch, Joshua D., Velina Kozareva, Ashley Ferreira, Charles Vanderburg, Carly Martin, and Evan Z. Macosko. 2019. “Single-Cell Multi-Omic Integration Compares and Contrasts Features of Brain Cell Identity.” Cell 177 (7): 1873–1887.e17. https://doi.org/10.1016/j.cell.2019.05.006.
Xiong, Lei, Kang Tian, Yuzhe Li, Weixi Ning, Xin Gao, and Qiangfeng Cliff Zhang. 2022. “Online Single-Cell Data Integration Through Projecting Heterogeneous Datasets into a Common Cell-Embedding Space.” Nature Communications 13 (1). https://doi.org/10.1038/s41467-022-33758-z.
Xu, Chenling, Romain Lopez, Edouard Mehlman, Jeffrey Regier, Michael I Jordan, and Nir Yosef. 2021. “Probabilistic Harmonization and Annotation of Single-Cell Transcriptomics Data with Deep Generative Models.” Molecular Systems Biology 17 (1). https://doi.org/10.15252/msb.20209620.