Benchmarking Chimera Detection Methods • tidypq

library(tidypq)
library(MiscMetabar)
library(phyloseq)
library(ggplot2)
library(patchwork)

Overview

This vignette demonstrates how to benchmark chimera detection methods using synthetic chimeric sequences. The create_chimera_pq() function generates artificial chimeras with known parent sequences, allowing you to evaluate detection sensitivity and specificity across three methods:

chimera_removal_vs() (MiscMetabar): vsearch de novo — infers chimeras from abundance patterns within the sample.
chimera_removal_dada2(): dada2 de novo — same abundance-based logic, different algorithm.
chimera_removal_vs_ref(): vsearch reference-based (UCHIME_ref) — compares each query against a trusted reference database, independent of sample composition.

Data Preparation

To reduce computation time, we use a subset of data_fungi filtered by taxonomy. You can adjust the filtering to test on larger or smaller datasets.

data(data_fungi)

# Filter to a single Class for faster computation
# Original data_fungi has 1420 taxa - this reduces it to ~200-400 taxa
data_fungi_subset <- subset_taxa(
  data_fungi,
  Class %in% c(
    "Agaricomycetes", "Dacrymycetes", "Eurotiomycetes",
    "Lecanoromycetes", "Tremellomycetes"
  )
)

data_fungi_subset

# Mini UNITE database bundled with MiscMetabar (used for chimera_removal_vs_ref)
# NOTE: this is a very small subset of the full UNITE database,
# so detection rates here will be lower than with the complete database.
mini_db <- system.file(
  "extdata", "mini_UNITE_fungi.fasta.gz",
  package = "MiscMetabar"
)

Creating Test Data with Synthetic Chimeras

Create a phyloseq object with known chimeric sequences. min_parent_distance ensures that parent sequences are sufficiently different, making chimeras detectable.

df_chim <- create_chimera_pq(
  data_fungi_subset,
  n_chimeras = 50,
  median_abundance_multiplier = 0.05,
  min_parent_distance = 0.05
)
data_fungi_test <- df_chim$physeq
known_chimeras <- df_chim$chimera_names

df_chim$parent_info

Comparing Detection Methods

Helper function for benchmarking

benchmark_detection <- function(
  physeq,
  method_fn,
  method_name,
  known_chimeras,
  ...
) {
  gc()
  mem_before <- sum(gc()[, 2])

  start_time <- Sys.time()
  nochim <- method_fn(physeq, ...)
  end_time <- Sys.time()

  mem_after <- sum(gc()[, 2])

  detected <- known_chimeras[!known_chimeras %in% taxa_names(nochim)]
  missed <- known_chimeras[known_chimeras %in% taxa_names(nochim)]

  list(
    method = method_name,
    physeq_nochim = nochim,
    n_chimeras = length(known_chimeras),
    n_detected = length(detected),
    n_missed = length(missed),
    detection_rate = 100 * length(detected) / length(known_chimeras),
    total_removed = ntaxa(physeq) - ntaxa(nochim),
    time_seconds = as.numeric(difftime(end_time, start_time, units = "secs")),
    memory_mb = mem_after - mem_before,
    detected_names = detected,
    missed_names = missed
  )
}

calc_detection_rate <- function(physeq_nochim, known_chimeras) {
  detected <- sum(!known_chimeras %in% taxa_names(physeq_nochim))
  100 * detected / length(known_chimeras)
}

Run benchmarks

# vsearch de novo
result_vs <- benchmark_detection(
  data_fungi_test,
  chimera_removal_vs,
  "vsearch",
  known_chimeras
)

# dada2 de novo
result_dada2 <- benchmark_detection(
  data_fungi_test,
  chimera_removal_dada2,
  "dada2",
  known_chimeras
)

# vsearch reference-based
result_vs_ref <- benchmark_detection(
  data_fungi_test,
  chimera_removal_vs_ref,
  "vsearch_ref",
  known_chimeras,
  database = mini_db
)

Benchmark Summary Table

benchmark_summary <- data.frame(
  Method = c(
    result_vs$method,
    result_dada2$method,
    result_vs_ref$method
  ),
  `N Chimeras` = c(
    result_vs$n_chimeras,
    result_dada2$n_chimeras,
    result_vs_ref$n_chimeras
  ),
  Detected = c(
    result_vs$n_detected,
    result_dada2$n_detected,
    result_vs_ref$n_detected
  ),
  `Detection Rate (%)` = round(c(
    result_vs$detection_rate,
    result_dada2$detection_rate,
    result_vs_ref$detection_rate
  ), 1),
  `Total Removed` = c(
    result_vs$total_removed,
    result_dada2$total_removed,
    result_vs_ref$total_removed
  ),
  `Time (s)` = round(c(
    result_vs$time_seconds,
    result_dada2$time_seconds,
    result_vs_ref$time_seconds
  ), 2),
  `Memory (MB)` = round(c(
    result_vs$memory_mb,
    result_dada2$memory_mb,
    result_vs_ref$memory_mb
  ), 1),
  check.names = FALSE
)

knitr::kable(benchmark_summary, caption = "Chimera Detection Benchmark Summary")

Analyzing Missed Chimeras

Examine which chimeras were missed and their parent sequences:

if (length(result_vs$missed_names) > 0) {
  missed_info <- df_chim$parent_info[
    df_chim$parent_info$chimera %in% result_vs$missed_names,
  ]
  knitr::kable(missed_info, caption = "Missed by vsearch (de novo)")
}

if (length(result_dada2$missed_names) > 0) {
  missed_info <- df_chim$parent_info[
    df_chim$parent_info$chimera %in% result_dada2$missed_names,
  ]
  knitr::kable(missed_info, caption = "Missed by dada2")
}

if (length(result_vs_ref$missed_names) > 0) {
  missed_info <- df_chim$parent_info[
    df_chim$parent_info$chimera %in% result_vs_ref$missed_names,
  ]
  knitr::kable(missed_info, caption = "Missed by vsearch (reference-based)")
}

Comparing Detection Across Different Chimera Types

Create different types of synthetic chimeras to test detection robustness:

# More variable proportions (harder to detect)
result_hard <- create_chimera_pq(
  data_fungi_subset,
  n_chimeras = 50,
  prop_mean = 0.5,
  prop_sd = 0.25
)

# Biased proportions (70/30 splits)
result_biased <- create_chimera_pq(
  data_fungi_subset,
  n_chimeras = 50,
  prop_mean = 0.7,
  prop_sd = 0.1
)

# Lower abundance chimeras (harder to detect)
result_low_abund <- create_chimera_pq(
  data_fungi_subset,
  n_chimeras = 50,
  median_abundance_multiplier = 0.01
)

Run all three detection methods on all chimera types:

datasets <- list(
  default = list(physeq = data_fungi_test, chimeras = known_chimeras),
  hard = list(
    physeq = result_hard$physeq,
    chimeras = result_hard$chimera_names
  ),
  biased = list(
    physeq = result_biased$physeq,
    chimeras = result_biased$chimera_names
  ),
  low_abund = list(
    physeq = result_low_abund$physeq,
    chimeras = result_low_abund$chimera_names
  )
)

results <- data.frame(
  dataset = character(),
  method = character(),
  detection_rate = numeric(),
  time_seconds = numeric(),
  stringsAsFactors = FALSE
)

for (name in names(datasets)) {
  ds <- datasets[[name]]

  start_time <- Sys.time()
  nochim_vs <- chimera_removal_vs(ds$physeq)
  time_vs <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
  rate_vs <- calc_detection_rate(nochim_vs, ds$chimeras)

  start_time <- Sys.time()
  nochim_dada2 <- chimera_removal_dada2(ds$physeq)
  time_dada2 <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
  rate_dada2 <- calc_detection_rate(nochim_dada2, ds$chimeras)

  start_time <- Sys.time()
  nochim_vs_ref <- chimera_removal_vs_ref(ds$physeq, database = mini_db)
  time_vs_ref <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
  rate_vs_ref <- calc_detection_rate(nochim_vs_ref, ds$chimeras)

  results <- rbind(
    results,
    data.frame(
      dataset = name,
      method = c("vsearch", "dada2", "vsearch_ref"),
      detection_rate = c(rate_vs, rate_dada2, rate_vs_ref),
      time_seconds = c(time_vs, time_dada2, time_vs_ref),
      stringsAsFactors = FALSE
    )
  )
}

results

Summary Table: All Scenarios

results_wide <- reshape(
  results,
  direction = "wide",
  idvar = "dataset",
  timevar = "method",
  v.names = c("detection_rate", "time_seconds")
)
names(results_wide) <- gsub("\\.", " ", names(results_wide))

knitr::kable(
  results_wide,
  caption = "Detection Rate and Time Across Scenarios",
  digits = 1
)

Visualizing Results

METHOD_COLORS <- c(
  "vsearch" = "steelblue",
  "dada2" = "coral",
  "vsearch_ref" = "mediumseagreen"
)

# Distribution of parent1 proportions for each dataset
prop_data <- rbind(
  data.frame(
    dataset = "Default (sd=0.15)",
    prop = df_chim$parent_info$prop_parent1
  ),
  data.frame(
    dataset = "Hard (sd=0.25)",
    prop = result_hard$parent_info$prop_parent1
  ),
  data.frame(
    dataset = "Biased (mean=0.7)",
    prop = result_biased$parent_info$prop_parent1
  ),
  data.frame(
    dataset = "Low abundance",
    prop = result_low_abund$parent_info$prop_parent1
  )
)
prop_data$dataset <- factor(
  prop_data$dataset,
  levels = c(
    "Default (sd=0.15)", "Hard (sd=0.25)",
    "Biased (mean=0.7)", "Low abundance"
  )
)

p_hist <- ggplot(prop_data, aes(x = prop)) +
  geom_histogram(
    binwidth = 0.05, fill = "steelblue", color = "white", alpha = 0.7
  ) +
  facet_wrap(~dataset, ncol = 2) +
  scale_x_continuous(limits = c(0, 1)) +
  labs(
    x = "Proportion from parent1",
    y = "Count",
    title = "Distribution of Parent1 Proportions"
  ) +
  theme_bw()

p_hist

# Detection rate per dataset and method
results$method <- factor(
  results$method,
  levels = c("vsearch", "dada2", "vsearch_ref")
)

p_bar <- ggplot(
  results,
  aes(x = dataset, y = detection_rate, fill = method)
) +
  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
  geom_hline(
    yintercept = seq(0, 100, 20),
    color = "gray", linetype = "dashed", alpha = 0.5
  ) +
  scale_fill_manual(values = METHOD_COLORS) +
  scale_y_continuous(limits = c(0, 100), expand = c(0, 0)) +
  labs(
    x = "Dataset",
    y = "Detection rate (%)",
    title = "Chimera Detection Across Different Scenarios",
    fill = "Method"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p_bar

Effect of Chimera Abundance on Detection

Chimera abundance relative to parent sequences affects detection sensitivity.

abundance_multipliers <- c(0.01, 0.05, 0.1, 0.5)

abundance_results <- data.frame(
  multiplier = numeric(),
  method = character(),
  detection_rate = numeric(),
  time_seconds = numeric(),
  stringsAsFactors = FALSE
)

for (mult in abundance_multipliers) {
  result_abund <- create_chimera_pq(
    data_fungi_subset,
    n_chimeras = 50,
    median_abundance_multiplier = mult,
    seed = 123
  )

  start_time <- Sys.time()
  nochim_vs <- chimera_removal_vs(result_abund$physeq)
  time_vs <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
  rate_vs <- calc_detection_rate(nochim_vs, result_abund$chimera_names)

  start_time <- Sys.time()
  nochim_dada2 <- chimera_removal_dada2(result_abund$physeq)
  time_dada2 <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
  rate_dada2 <- calc_detection_rate(nochim_dada2, result_abund$chimera_names)

  start_time <- Sys.time()
  nochim_vs_ref <- chimera_removal_vs_ref(result_abund$physeq, database = mini_db)
  time_vs_ref <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
  rate_vs_ref <- calc_detection_rate(nochim_vs_ref, result_abund$chimera_names)

  abundance_results <- rbind(
    abundance_results,
    data.frame(
      multiplier = mult,
      method = c("vsearch", "dada2", "vsearch_ref"),
      detection_rate = c(rate_vs, rate_dada2, rate_vs_ref),
      time_seconds = c(time_vs, time_dada2, time_vs_ref),
      stringsAsFactors = FALSE
    )
  )
}

abundance_results

Abundance Summary Table

knitr::kable(
  abundance_results,
  caption = "Detection by Chimera Abundance Level",
  digits = c(2, 0, 1, 2)
)

Visualizing Abundance Effect

p_abund_detection <- ggplot(
  abundance_results,
  aes(x = multiplier, y = detection_rate, color = method)
) +
  geom_line(linewidth = 1) +
  geom_point(size = 3) +
  scale_x_log10() +
  scale_y_continuous(limits = c(0, 100)) +
  scale_color_manual(values = METHOD_COLORS) +
  labs(
    x = "median_abundance_multiplier",
    y = "Detection rate (%)",
    title = "Detection vs Chimera Abundance",
    color = "Method"
  ) +
  theme_bw()

p_abund_detection

abundance_results$multiplier_label <- paste0(
  abundance_results$multiplier * 100, "%"
)
abundance_results$multiplier_label <- factor(
  abundance_results$multiplier_label,
  levels = paste0(sort(unique(abundance_results$multiplier)) * 100, "%")
)

p_abund_bar <- ggplot(
  abundance_results,
  aes(x = multiplier_label, y = detection_rate, fill = method)
) +
  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
  geom_hline(
    yintercept = seq(0, 100, 20),
    color = "gray", linetype = "dashed", alpha = 0.5
  ) +
  scale_fill_manual(values = METHOD_COLORS) +
  scale_y_continuous(limits = c(0, 100), expand = c(0, 0)) +
  labs(
    x = "Chimera abundance (% of median)",
    y = "Detection rate (%)",
    title = "Detection Rate by Chimera Abundance Level",
    fill = "Method"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p_abund_bar

Effect of Parent Sequence Distance

Chimeras created from very similar parent sequences are harder to detect.

distance_results <- data.frame(
  min_param_distance = numeric(),
  method = character(),
  mean_parent_distance = numeric(),
  min_parent_distance = numeric(),
  max_parent_distance = numeric(),
  detection_rate = numeric(),
  nb_detected = numeric(),
  stringsAsFactors = FALSE
)

for (min_dist in c(0, 0.001, 0.01, 0.05, 0.1, 0.2, 0.4)) {
  result_dist <- create_chimera_pq(
    data_fungi_subset,
    n_chimeras = 50,
    min_parent_distance = min_dist,
    seed = 456
  )

  mean_dist <- mean(result_dist$parent_info$parent_distance)
  min_dist_parent <- min(result_dist$parent_info$parent_distance)
  max_dist_parent <- max(result_dist$parent_info$parent_distance)

  nochim_vs <- chimera_removal_vs(result_dist$physeq)
  rate_vs <- calc_detection_rate(nochim_vs, result_dist$chimera_names)

  nochim_dada2 <- chimera_removal_dada2(result_dist$physeq)
  rate_dada2 <- calc_detection_rate(nochim_dada2, result_dist$chimera_names)

  nochim_vs_ref <- chimera_removal_vs_ref(result_dist$physeq, database = mini_db)
  rate_vs_ref <- calc_detection_rate(nochim_vs_ref, result_dist$chimera_names)

  distance_results <- rbind(
    distance_results,
    data.frame(
      min_param_distance = min_dist,
      method = c("vsearch", "dada2", "vsearch_ref"),
      mean_parent_distance = mean_dist,
      min_parent_distance = min_dist_parent,
      max_parent_distance = max_dist_parent,
      detection_rate = c(rate_vs, rate_dada2, rate_vs_ref),
      nb_detected = c(
        ntaxa(result_dist$physeq) - ntaxa(nochim_vs),
        ntaxa(result_dist$physeq) - ntaxa(nochim_dada2),
        ntaxa(result_dist$physeq) - ntaxa(nochim_vs_ref)
      ),
      stringsAsFactors = FALSE
    )
  )
}

knitr::kable(
  distance_results,
  caption = "Detection Rate by Minimum Parent Distance",
  digits = c(2, 0, 3, 4, 0)
)

p_dist <- ggplot(
  distance_results,
  aes(x = min_param_distance, y = detection_rate, color = method)
) +
  geom_line(linewidth = 1) +
  geom_point(size = 3) +
  scale_x_continuous(limits = c(0, 0.5)) +
  scale_y_continuous(limits = c(0, 100)) +
  scale_color_manual(values = METHOD_COLORS) +
  labs(
    x = "Min parameter distance",
    y = "Detection rate (%)",
    title = "Detection vs Min Parent Distance",
    color = "Method"
  ) +
  theme_bw() +
  ggrepel::geom_label_repel(
    aes(label = nb_detected),
    size = 3, show.legend = FALSE
  )

p_dist