Benchmarking Chimera Detection Methods
Source:vignettes/chimera_detection_testing.Rmd
chimera_detection_testing.RmdOverview
This vignette demonstrates how to benchmark chimera detection methods
using synthetic chimeric sequences. The create_chimera_pq()
function generates artificial chimeras with known parent sequences,
allowing you to evaluate detection sensitivity and specificity across
three methods:
-
chimera_removal_vs()(MiscMetabar): vsearch de novo — infers chimeras from abundance patterns within the sample. -
chimera_removal_dada2(): dada2 de novo — same abundance-based logic, different algorithm. -
chimera_removal_vs_ref(): vsearch reference-based (UCHIME_ref) — compares each query against a trusted reference database, independent of sample composition.
Data Preparation
To reduce computation time, we use a subset of
data_fungi filtered by taxonomy. You can adjust the
filtering to test on larger or smaller datasets.
data(data_fungi)
# Filter to a single Class for faster computation
# Original data_fungi has 1420 taxa - this reduces it to ~200-400 taxa
data_fungi_subset <- subset_taxa(
data_fungi,
Class %in% c(
"Agaricomycetes", "Dacrymycetes", "Eurotiomycetes",
"Lecanoromycetes", "Tremellomycetes"
)
)
data_fungi_subset
# Mini UNITE database bundled with MiscMetabar (used for chimera_removal_vs_ref)
# NOTE: this is a very small subset of the full UNITE database,
# so detection rates here will be lower than with the complete database.
mini_db <- system.file(
"extdata", "mini_UNITE_fungi.fasta.gz",
package = "MiscMetabar"
)Creating Test Data with Synthetic Chimeras
Create a phyloseq object with known chimeric sequences.
min_parent_distance ensures that parent sequences are
sufficiently different, making chimeras detectable.
df_chim <- create_chimera_pq(
data_fungi_subset,
n_chimeras = 50,
median_abundance_multiplier = 0.05,
min_parent_distance = 0.05
)
data_fungi_test <- df_chim$physeq
known_chimeras <- df_chim$chimera_names
df_chim$parent_infoComparing Detection Methods
Helper function for benchmarking
benchmark_detection <- function(
physeq,
method_fn,
method_name,
known_chimeras,
...
) {
gc()
mem_before <- sum(gc()[, 2])
start_time <- Sys.time()
nochim <- method_fn(physeq, ...)
end_time <- Sys.time()
mem_after <- sum(gc()[, 2])
detected <- known_chimeras[!known_chimeras %in% taxa_names(nochim)]
missed <- known_chimeras[known_chimeras %in% taxa_names(nochim)]
list(
method = method_name,
physeq_nochim = nochim,
n_chimeras = length(known_chimeras),
n_detected = length(detected),
n_missed = length(missed),
detection_rate = 100 * length(detected) / length(known_chimeras),
total_removed = ntaxa(physeq) - ntaxa(nochim),
time_seconds = as.numeric(difftime(end_time, start_time, units = "secs")),
memory_mb = mem_after - mem_before,
detected_names = detected,
missed_names = missed
)
}
calc_detection_rate <- function(physeq_nochim, known_chimeras) {
detected <- sum(!known_chimeras %in% taxa_names(physeq_nochim))
100 * detected / length(known_chimeras)
}Run benchmarks
# vsearch de novo
result_vs <- benchmark_detection(
data_fungi_test,
chimera_removal_vs,
"vsearch",
known_chimeras
)
# dada2 de novo
result_dada2 <- benchmark_detection(
data_fungi_test,
chimera_removal_dada2,
"dada2",
known_chimeras
)
# vsearch reference-based
result_vs_ref <- benchmark_detection(
data_fungi_test,
chimera_removal_vs_ref,
"vsearch_ref",
known_chimeras,
database = mini_db
)Benchmark Summary Table
benchmark_summary <- data.frame(
Method = c(
result_vs$method,
result_dada2$method,
result_vs_ref$method
),
`N Chimeras` = c(
result_vs$n_chimeras,
result_dada2$n_chimeras,
result_vs_ref$n_chimeras
),
Detected = c(
result_vs$n_detected,
result_dada2$n_detected,
result_vs_ref$n_detected
),
`Detection Rate (%)` = round(c(
result_vs$detection_rate,
result_dada2$detection_rate,
result_vs_ref$detection_rate
), 1),
`Total Removed` = c(
result_vs$total_removed,
result_dada2$total_removed,
result_vs_ref$total_removed
),
`Time (s)` = round(c(
result_vs$time_seconds,
result_dada2$time_seconds,
result_vs_ref$time_seconds
), 2),
`Memory (MB)` = round(c(
result_vs$memory_mb,
result_dada2$memory_mb,
result_vs_ref$memory_mb
), 1),
check.names = FALSE
)
knitr::kable(benchmark_summary, caption = "Chimera Detection Benchmark Summary")Analyzing Missed Chimeras
Examine which chimeras were missed and their parent sequences:
if (length(result_vs$missed_names) > 0) {
missed_info <- df_chim$parent_info[
df_chim$parent_info$chimera %in% result_vs$missed_names,
]
knitr::kable(missed_info, caption = "Missed by vsearch (de novo)")
}Comparing Detection Across Different Chimera Types
Create different types of synthetic chimeras to test detection robustness:
# More variable proportions (harder to detect)
result_hard <- create_chimera_pq(
data_fungi_subset,
n_chimeras = 50,
prop_mean = 0.5,
prop_sd = 0.25
)
# Biased proportions (70/30 splits)
result_biased <- create_chimera_pq(
data_fungi_subset,
n_chimeras = 50,
prop_mean = 0.7,
prop_sd = 0.1
)
# Lower abundance chimeras (harder to detect)
result_low_abund <- create_chimera_pq(
data_fungi_subset,
n_chimeras = 50,
median_abundance_multiplier = 0.01
)Run all three detection methods on all chimera types:
datasets <- list(
default = list(physeq = data_fungi_test, chimeras = known_chimeras),
hard = list(
physeq = result_hard$physeq,
chimeras = result_hard$chimera_names
),
biased = list(
physeq = result_biased$physeq,
chimeras = result_biased$chimera_names
),
low_abund = list(
physeq = result_low_abund$physeq,
chimeras = result_low_abund$chimera_names
)
)
results <- data.frame(
dataset = character(),
method = character(),
detection_rate = numeric(),
time_seconds = numeric(),
stringsAsFactors = FALSE
)
for (name in names(datasets)) {
ds <- datasets[[name]]
start_time <- Sys.time()
nochim_vs <- chimera_removal_vs(ds$physeq)
time_vs <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
rate_vs <- calc_detection_rate(nochim_vs, ds$chimeras)
start_time <- Sys.time()
nochim_dada2 <- chimera_removal_dada2(ds$physeq)
time_dada2 <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
rate_dada2 <- calc_detection_rate(nochim_dada2, ds$chimeras)
start_time <- Sys.time()
nochim_vs_ref <- chimera_removal_vs_ref(ds$physeq, database = mini_db)
time_vs_ref <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
rate_vs_ref <- calc_detection_rate(nochim_vs_ref, ds$chimeras)
results <- rbind(
results,
data.frame(
dataset = name,
method = c("vsearch", "dada2", "vsearch_ref"),
detection_rate = c(rate_vs, rate_dada2, rate_vs_ref),
time_seconds = c(time_vs, time_dada2, time_vs_ref),
stringsAsFactors = FALSE
)
)
}
resultsVisualizing Results
METHOD_COLORS <- c(
"vsearch" = "steelblue",
"dada2" = "coral",
"vsearch_ref" = "mediumseagreen"
)
# Distribution of parent1 proportions for each dataset
prop_data <- rbind(
data.frame(
dataset = "Default (sd=0.15)",
prop = df_chim$parent_info$prop_parent1
),
data.frame(
dataset = "Hard (sd=0.25)",
prop = result_hard$parent_info$prop_parent1
),
data.frame(
dataset = "Biased (mean=0.7)",
prop = result_biased$parent_info$prop_parent1
),
data.frame(
dataset = "Low abundance",
prop = result_low_abund$parent_info$prop_parent1
)
)
prop_data$dataset <- factor(
prop_data$dataset,
levels = c(
"Default (sd=0.15)", "Hard (sd=0.25)",
"Biased (mean=0.7)", "Low abundance"
)
)
p_hist <- ggplot(prop_data, aes(x = prop)) +
geom_histogram(
binwidth = 0.05, fill = "steelblue", color = "white", alpha = 0.7
) +
facet_wrap(~dataset, ncol = 2) +
scale_x_continuous(limits = c(0, 1)) +
labs(
x = "Proportion from parent1",
y = "Count",
title = "Distribution of Parent1 Proportions"
) +
theme_bw()
p_hist
# Detection rate per dataset and method
results$method <- factor(
results$method,
levels = c("vsearch", "dada2", "vsearch_ref")
)
p_bar <- ggplot(
results,
aes(x = dataset, y = detection_rate, fill = method)
) +
geom_col(position = position_dodge(width = 0.8), width = 0.7) +
geom_hline(
yintercept = seq(0, 100, 20),
color = "gray", linetype = "dashed", alpha = 0.5
) +
scale_fill_manual(values = METHOD_COLORS) +
scale_y_continuous(limits = c(0, 100), expand = c(0, 0)) +
labs(
x = "Dataset",
y = "Detection rate (%)",
title = "Chimera Detection Across Different Scenarios",
fill = "Method"
) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
p_barEffect of Chimera Abundance on Detection
Chimera abundance relative to parent sequences affects detection sensitivity.
abundance_multipliers <- c(0.01, 0.05, 0.1, 0.5)
abundance_results <- data.frame(
multiplier = numeric(),
method = character(),
detection_rate = numeric(),
time_seconds = numeric(),
stringsAsFactors = FALSE
)
for (mult in abundance_multipliers) {
result_abund <- create_chimera_pq(
data_fungi_subset,
n_chimeras = 50,
median_abundance_multiplier = mult,
seed = 123
)
start_time <- Sys.time()
nochim_vs <- chimera_removal_vs(result_abund$physeq)
time_vs <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
rate_vs <- calc_detection_rate(nochim_vs, result_abund$chimera_names)
start_time <- Sys.time()
nochim_dada2 <- chimera_removal_dada2(result_abund$physeq)
time_dada2 <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
rate_dada2 <- calc_detection_rate(nochim_dada2, result_abund$chimera_names)
start_time <- Sys.time()
nochim_vs_ref <- chimera_removal_vs_ref(result_abund$physeq, database = mini_db)
time_vs_ref <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
rate_vs_ref <- calc_detection_rate(nochim_vs_ref, result_abund$chimera_names)
abundance_results <- rbind(
abundance_results,
data.frame(
multiplier = mult,
method = c("vsearch", "dada2", "vsearch_ref"),
detection_rate = c(rate_vs, rate_dada2, rate_vs_ref),
time_seconds = c(time_vs, time_dada2, time_vs_ref),
stringsAsFactors = FALSE
)
)
}
abundance_resultsVisualizing Abundance Effect
p_abund_detection <- ggplot(
abundance_results,
aes(x = multiplier, y = detection_rate, color = method)
) +
geom_line(linewidth = 1) +
geom_point(size = 3) +
scale_x_log10() +
scale_y_continuous(limits = c(0, 100)) +
scale_color_manual(values = METHOD_COLORS) +
labs(
x = "median_abundance_multiplier",
y = "Detection rate (%)",
title = "Detection vs Chimera Abundance",
color = "Method"
) +
theme_bw()
p_abund_detection
abundance_results$multiplier_label <- paste0(
abundance_results$multiplier * 100, "%"
)
abundance_results$multiplier_label <- factor(
abundance_results$multiplier_label,
levels = paste0(sort(unique(abundance_results$multiplier)) * 100, "%")
)
p_abund_bar <- ggplot(
abundance_results,
aes(x = multiplier_label, y = detection_rate, fill = method)
) +
geom_col(position = position_dodge(width = 0.8), width = 0.7) +
geom_hline(
yintercept = seq(0, 100, 20),
color = "gray", linetype = "dashed", alpha = 0.5
) +
scale_fill_manual(values = METHOD_COLORS) +
scale_y_continuous(limits = c(0, 100), expand = c(0, 0)) +
labs(
x = "Chimera abundance (% of median)",
y = "Detection rate (%)",
title = "Detection Rate by Chimera Abundance Level",
fill = "Method"
) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
p_abund_barEffect of Parent Sequence Distance
Chimeras created from very similar parent sequences are harder to detect.
distance_results <- data.frame(
min_param_distance = numeric(),
method = character(),
mean_parent_distance = numeric(),
min_parent_distance = numeric(),
max_parent_distance = numeric(),
detection_rate = numeric(),
nb_detected = numeric(),
stringsAsFactors = FALSE
)
for (min_dist in c(0, 0.001, 0.01, 0.05, 0.1, 0.2, 0.4)) {
result_dist <- create_chimera_pq(
data_fungi_subset,
n_chimeras = 50,
min_parent_distance = min_dist,
seed = 456
)
mean_dist <- mean(result_dist$parent_info$parent_distance)
min_dist_parent <- min(result_dist$parent_info$parent_distance)
max_dist_parent <- max(result_dist$parent_info$parent_distance)
nochim_vs <- chimera_removal_vs(result_dist$physeq)
rate_vs <- calc_detection_rate(nochim_vs, result_dist$chimera_names)
nochim_dada2 <- chimera_removal_dada2(result_dist$physeq)
rate_dada2 <- calc_detection_rate(nochim_dada2, result_dist$chimera_names)
nochim_vs_ref <- chimera_removal_vs_ref(result_dist$physeq, database = mini_db)
rate_vs_ref <- calc_detection_rate(nochim_vs_ref, result_dist$chimera_names)
distance_results <- rbind(
distance_results,
data.frame(
min_param_distance = min_dist,
method = c("vsearch", "dada2", "vsearch_ref"),
mean_parent_distance = mean_dist,
min_parent_distance = min_dist_parent,
max_parent_distance = max_dist_parent,
detection_rate = c(rate_vs, rate_dada2, rate_vs_ref),
nb_detected = c(
ntaxa(result_dist$physeq) - ntaxa(nochim_vs),
ntaxa(result_dist$physeq) - ntaxa(nochim_dada2),
ntaxa(result_dist$physeq) - ntaxa(nochim_vs_ref)
),
stringsAsFactors = FALSE
)
)
}
knitr::kable(
distance_results,
caption = "Detection Rate by Minimum Parent Distance",
digits = c(2, 0, 3, 4, 0)
)
p_dist <- ggplot(
distance_results,
aes(x = min_param_distance, y = detection_rate, color = method)
) +
geom_line(linewidth = 1) +
geom_point(size = 3) +
scale_x_continuous(limits = c(0, 0.5)) +
scale_y_continuous(limits = c(0, 100)) +
scale_color_manual(values = METHOD_COLORS) +
labs(
x = "Min parameter distance",
y = "Detection rate (%)",
title = "Detection vs Min Parent Distance",
color = "Method"
) +
theme_bw() +
ggrepel::geom_label_repel(
aes(label = nb_detected),
size = 3, show.legend = FALSE
)
p_dist