Skip to contents

lifecycle-experimental

This function creates synthetic chimeric sequences by combining parts of existing sequences from a phyloseq object. It is useful for benchmarking chimera detection methods such as MiscMetabar::chimera_removal_vs() or MiscMetabar::chimera_removal_dada2().

Usage

create_chimera_pq(
  physeq,
  n_chimeras = 5,
  prop_mean = 0.5,
  prop_sd = 0.15,
  prop_min = 0.1,
  seed = 123,
  median_abundance_multiplier = 0.1,
  min_parent_distance = 0.1
)

Arguments

physeq

(phyloseq, required) A phyloseq object with a refseq slot containing DNA sequences.

n_chimeras

(integer, default: 5) Number of chimeric sequences to create.

prop_mean

(numeric, default: 0.5) Mean of the normal distribution used to sample the proportion of the first parent sequence. A value of 0.5 means chimeras will be centered around 50/50 splits.

prop_sd

(numeric, default: 0.15) Standard deviation of the normal distribution used to sample proportions. Higher values create more variable chimera breakpoints.

prop_min

(numeric, default: 0.1) Minimum proportion threshold. Proportions below this value (or above 1 - prop_min) are resampled to ensure each parent contributes meaningfully to the chimera.

seed

(integer, default: 123) Random seed for reproducibility.

median_abundance_multiplier

(numeric, default: 0.1) Multiplier to set the abundance of chimeric sequences relative to the median abundance of existing sequences. A value of 0.1 means chimeras will have approximately 10% of the median abundance.

min_parent_distance

(numeric, default: 0.1) Minimum sequence distance (proportion of differing positions) between parent1 and parent2. If set to 0, chimeras can be created from very similar parents, which may make them harder to detect; in some cases the resulting chimera may even be identical to one of its parents.

Value

A list containing:

physeq

The new phyloseq object with added chimeric sequences

chimera_names

Character vector of chimera taxa names

parent_info

Data frame with one row per chimera and the columns: chimera (the chimera name), parent1, parent2, parent_distance, prop_parent1, breakpoint, seq_length

params

List of parameters used (prop_mean, prop_sd, prop_min, min_parent_distance)

Author

Adrien Taudiere

Examples

library(MiscMetabar)
data(data_fungi)

# Default: centered around 50% with some variation
result <- create_chimera_pq(data_fungi, n_chimeras = 40)
data_fungi_test <- result$physeq
known_chimeras <- result$chimera_names

# View the parent information and proportions
print(result$parent_info)
#>       chimera parent1 parent2 parent_distance prop_parent1 breakpoint
#> 1   CHIMERA_1   ASV27   ASV33          0.5017        0.679        206
#> 2   CHIMERA_2   ASV22   ASV32          0.5082        0.686        209
#> 3   CHIMERA_3   ASV10   ASV27          0.5233        0.569        171
#> 4   CHIMERA_4   ASV10   ASV33          0.4800        0.754        226
#> 5   CHIMERA_5   ASV19    ASV7          0.5412        0.517        176
#> 6   CHIMERA_6   ASV13   ASV23          0.4502        0.768        223
#> 7   CHIMERA_7   ASV32    ASV8          0.5126        0.205         73
#> 8   CHIMERA_8   ASV23   ASV13          0.4502        0.429        125
#> 9   CHIMERA_9   ASV27   ASV22          0.5410        0.450        137
#> 10 CHIMERA_10   ASV19   ASV22          0.4984        0.487        149
#> 11 CHIMERA_11   ASV13   ASV12          0.4880        0.603        175
#> 12 CHIMERA_12   ASV10   ASV19          0.5167        0.456        137
#> 13 CHIMERA_13   ASV25   ASV32          0.5629        0.632        221
#> 14 CHIMERA_14   ASV12   ASV28          0.5364        0.384        127
#> 15 CHIMERA_15   ASV28   ASV12          0.5364        0.469        155
#> 16 CHIMERA_16   ASV13   ASV29          0.5223        0.360        105
#> 17 CHIMERA_17   ASV31   ASV29          0.5479        0.561        170
#> 18 CHIMERA_18   ASV25   ASV10          0.5400        0.302         91
#> 19 CHIMERA_19   ASV32   ASV26          0.5130        0.705        245
#> 20 CHIMERA_20    ASV7   ASV19          0.5412        0.727        247
#> 21 CHIMERA_21   ASV24   ASV27          0.5162        0.519        176
#> 22 CHIMERA_22    ASV7   ASV27          0.5333        0.557        192
#> 23 CHIMERA_23   ASV13    ASV7          0.4674        0.450        131
#> 24 CHIMERA_24   ASV27   ASV10          0.5233        0.546        164
#> 25 CHIMERA_25   ASV22   ASV32          0.5082        0.508        155
#> 26 CHIMERA_26    ASV6   ASV23          0.4252        0.808        243
#> 27 CHIMERA_27   ASV24   ASV27          0.5162        0.365        124
#> 28 CHIMERA_28   ASV26    ASV7          0.4524        0.397        138
#> 29 CHIMERA_29   ASV32   ASV27          0.4986        0.397        137
#> 30 CHIMERA_30   ASV23   ASV13          0.4502        0.479        139
#> 31 CHIMERA_31   ASV12   ASV27          0.4515        0.444        147
#> 32 CHIMERA_32   ASV13   ASV23          0.4502        0.406        118
#> 33 CHIMERA_33   ASV28   ASV23          0.5238        0.511        161
#> 34 CHIMERA_34   ASV26   ASV33          0.5512        0.406        123
#> 35 CHIMERA_35   ASV19   ASV13          0.5086        0.694        202
#> 36 CHIMERA_36   ASV25   ASV33          0.5446        0.546        165
#> 37 CHIMERA_37   ASV27   ASV13          0.4605        0.562        163
#> 38 CHIMERA_38   ASV18   ASV29          0.5050        0.676        205
#> 39 CHIMERA_39   ASV23   ASV29          0.5380        0.358        109
#> 40 CHIMERA_40    ASV7   ASV10          0.5467        0.250         75
#>    seq_length
#> 1         303
#> 2         305
#> 3         300
#> 4         300
#> 5         340
#> 6         291
#> 7         357
#> 8         291
#> 9         305
#> 10        305
#> 11        291
#> 12        300
#> 13        350
#> 14        330
#> 15        330
#> 16        291
#> 17        303
#> 18        300
#> 19        347
#> 20        340
#> 21        339
#> 22        345
#> 23        291
#> 24        300
#> 25        305
#> 26        301
#> 27        339
#> 28        347
#> 29        345
#> 30        291
#> 31        330
#> 32        291
#> 33        315
#> 34        303
#> 35        291
#> 36        303
#> 37        291
#> 38        303
#> 39        303
#> 40        300

# More variable proportions (wider distribution)
result2 <- create_chimera_pq(data_fungi,
  n_chimeras = 40,
  prop_mean = 0.5, prop_sd = 0.25
)

# Biased toward more of parent1 (e.g., 70/30 splits on average)
result3 <- create_chimera_pq(data_fungi,
  n_chimeras = 40,
  prop_mean = 0.7, prop_sd = 0.1
)

# Benchmark chimera detection methods
if (MiscMetabar::is_vsearch_installed()) {
  nochim_vs <- MiscMetabar::chimera_removal_vs(data_fungi_test)
  detected_vs <- known_chimeras[!known_chimeras %in% phyloseq::taxa_names(nochim_vs)]
  cat(
    "vsearch detected:", length(detected_vs), "/",
    length(known_chimeras), "chimeras\n"
  )
}
#> Filtering for sequences under 100 bp remove a total of 0 ( 0 %) unique sequences for a total of 0 sequences removed ( 0 %)
#> Cleaning suppress 0 taxa (  ) and 0 sample(s) (  ).
#> Number of non-matching ASV 0
#> Number of matching ASV 1460
#> Number of filtered-out ASV 279
#> Number of kept ASV 1181
#> Number of kept samples 185
#> vsearch detected: 39 / 40 chimeras

# Visualize the distribution of proportions
hist(result$parent_info$prop_parent1,
  main = "Distribution of parent1 proportions",
  xlab = "Proportion from parent1", xlim = c(0, 1)
)


# Ensure parents are at least 15% different (more detectable chimeras)
result4 <- create_chimera_pq(data_fungi,
  n_chimeras = 40,
  min_parent_distance = 0.15
)

# Disable parent distance filtering (allows similar parents)
result5 <- create_chimera_pq(data_fungi,
  n_chimeras = 40,
  min_parent_distance = 0
)