Skip to contents

Filter samples

Phyloseq package already propose a function to select samples (subset_samples()), but in some case, the subset internal function is painful. MiscMetabar propose a complementary function ([subset_samples_pq()]) which is more versatile but need to be used with caution because the order of the condition must match the orders of the samples.

data(data_fungi)
cond_samp <- grepl("A1", data_fungi@sam_data[["Sample_names"]])
subset_samples_pq(data_fungi, cond_samp)
#> phyloseq-class experiment-level object
#> otu_table()   OTU Table:         [ 1420 taxa and 9 samples ]
#> sample_data() Sample Data:       [ 9 samples by 7 sample variables ]
#> tax_table()   Taxonomy Table:    [ 1420 taxa by 12 taxonomic ranks ]
#> refseq()      DNAStringSet:      [ 1420 reference sequences ]

Filter Taxa

Filter taxa using condition(s)

Phyloseq package already propose a function to select samples (subset_taxa()), but in some case, the subset internal function is painful. MiscMetabar propose a complementary function ([subset_taxa_pq()]) which is more versatile and is based on the names of the taxa to match the condition and the taxa in the phyloseq object. In the example code above, we filter fungi from the Phylum “Basidiomycota” using phyloseq::subset_taxa() and then select only taxa with more than 1000 nb_sequences using subset_taxa_pq().

df_basidio <- subset_taxa(data_fungi, Phylum == "Basidiomycota")
df_basidio_abundant <- subset_taxa_pq(
  df_basidio,
  colSums(df_basidio@otu_table) > 1000
)

Filter taxa using blast score against a database

In some cases, we want to select only a given clade of taxon. One solution is to select only taxa with information given by taxonomic assignment (e.g. using the function dada2::assignTaxonomy()). However, in some cases, this information lead to false positive and false negative selection. MiscMetabar implement an other method relying on blast software (Altschul et al. 1990 & 1997) and database. The idea is to set a cutoff in four parameters to select only taxa which are close enough to sequences in the database :

  • id_filter (default: 90) cut of in identity percent to keep result.
  • bit_score_filter (default: 50) cut of in bit score to keep result. The higher the bit-score, the better the sequence similarity. The bit-score is the requires size of a sequence database in which the current match could be found just by chance. The bit-score is a log2 scaled and normalized raw-score. Each increase by one doubles the required database size (2bit-score).
  • min_cover_filter (default: 50) cut of in query cover (%) to keep result.
  • e_value_filter (default: 1e-30) cut of in e-value (%) to keep result. The BLAST E-value is the number of expected hits of similar quality (score) that could be found just by chance.
path_db <- system.file("extdata",
  "100_sp_UNITE_sh_general_release_dynamic.fasta",
  package = "MiscMetabar", mustWork = TRUE
)

suppressWarnings(blast_error_or_not <-
  try(system("blastn 2>&1", intern = TRUE), silent = TRUE))

if (!is(blast_error_or_not, "try-error")) {
  df_blast_80 <- filter_asv_blast(df_basidio, fasta_for_db = path_db)
  df_blast_50 <- filter_asv_blast(df_basidio,
    fasta_for_db = path_db,
    id_filter = 50, e_value_filter = 10,
    bit_score_filter = 20, min_cover_filter = 20
  )

  track_formattable <-
    track_wkflow(
      list(
        "raw data" = df_basidio,
        "id_filter = 80" = df_blast_80,
        "id_filter = 50" = df_blast_50
      )
    )
}
if (!is(blast_error_or_not, "try-error")) {
  formattable(
    track_formattable,
    list(
      area(col = nb_sequences) ~ color_bar("cyan", na.rm = TRUE),
      area(col = nb_clusters) ~ normalize_bar("yellowgreen",
        na.rm = TRUE, min = 0.3
      ),
      area(col = nb_samples) ~ normalize_bar("lightpink",
        na.rm = TRUE, min = 0.3
      )
    )
  )
}
nb_sequences nb_clusters nb_samples
raw data 784054 345 185
id_filter = 80 249494 45 127
id_filter = 50 780242 326 172

Filter taxa using a known taxa for control

To filter out contamination, one solution is to add a proportion of a known taxa which is not present in the environment of the study. In that case we can define some threshold for each sample to discard taxon based on pseudo-abundance. In the example code above, we select taxon using the ASV_50 as control through 6 different algorithms.

res_seq <-
  suppressWarnings(
    subset_taxa_tax_control(data_fungi, as.numeric(data_fungi@otu_table[, 50]),
      method = "cutoff_seq"
    )
  )
res_mixt <-
  suppressWarnings(
    subset_taxa_tax_control(data_fungi, as.numeric(data_fungi@otu_table[, 50]),
      method = "cutoff_mixt"
    )
  )
res_diff <- suppressWarnings(
  subset_taxa_tax_control(
    data_fungi,
    as.numeric(data_fungi@otu_table[, 50]),
    method = "cutoff_diff",
    min_diff_for_cutoff = 2
  )
)
res_min <-
  suppressWarnings(
    subset_taxa_tax_control(
      data_fungi,
      as.numeric(data_fungi@otu_table[, 50]),
      method = "min",
      min_diff_for_cutoff = 2
    )
  )
res_max <-
  suppressWarnings(
    subset_taxa_tax_control(
      data_fungi,
      as.numeric(data_fungi@otu_table[, 50]),
      method = "max",
      min_diff_for_cutoff = 2
    )
  )
res_mean <-
  suppressWarnings(
    subset_taxa_tax_control(
      data_fungi,
      as.numeric(data_fungi@otu_table[, 50]),
      method = "mean",
      min_diff_for_cutoff = 2
    )
  )
track_formattable <- track_wkflow(list(
  "raw data" = data_fungi,
  "cutoff_seq" = res_seq,
  "cutoff_mixt" = res_mixt,
  "cutoff_diff" = res_diff,
  "min" = res_min,
  "max" = res_max,
  "mean" = res_mean
))

formattable(
  track_formattable,
  list(
    area(col = nb_sequences) ~ color_bar("cyan"),
    area(col = nb_clusters) ~ normalize_bar("yellowgreen",
      na.rm = TRUE,
      min = 0.3
    ),
    area(col = nb_samples) ~ normalize_bar("lightpink", na.rm = TRUE)
  )
)
nb_sequences nb_clusters nb_samples
raw data 1839124 1420 185
cutoff_seq 1817543 1402 185
cutoff_mixt 1774180 1279 185
cutoff_diff 1805274 1378 185
min 1836158 1417 185
max 1752217 1214 185
mean 1790569 1325 185

Session information

sessionInfo()
#> R version 4.4.2 (2024-10-31)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Debian GNU/Linux 12 (bookworm)
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.11.0 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.11.0
#> 
#> locale:
#>  [1] LC_CTYPE=fr_FR.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=fr_FR.UTF-8        LC_COLLATE=fr_FR.UTF-8    
#>  [5] LC_MONETARY=fr_FR.UTF-8    LC_MESSAGES=fr_FR.UTF-8   
#>  [7] LC_PAPER=fr_FR.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=fr_FR.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Europe/Paris
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] formattable_0.2.1  MiscMetabar_0.11.0 purrr_1.0.2        dplyr_1.1.4       
#> [5] dada2_1.32.0       Rcpp_1.0.13-1      ggplot2_3.5.1      phyloseq_1.48.0   
#> 
#> loaded via a namespace (and not attached):
#>   [1] bitops_1.0-9                pbapply_1.7-2              
#>   [3] deldir_2.0-4                permute_0.9-7              
#>   [5] rlang_1.1.4                 magrittr_2.0.3             
#>   [7] ade4_1.7-22                 matrixStats_1.4.1          
#>   [9] compiler_4.4.2              mgcv_1.9-1                 
#>  [11] png_0.1-8                   systemfonts_1.1.0          
#>  [13] vctrs_0.6.5                 reshape2_1.4.4             
#>  [15] stringr_1.5.1               pwalign_1.0.0              
#>  [17] pkgconfig_2.0.3             crayon_1.5.3               
#>  [19] fastmap_1.2.0               XVector_0.44.0             
#>  [21] utf8_1.2.4                  Rsamtools_2.20.0           
#>  [23] rmarkdown_2.29              UCSC.utils_1.0.0           
#>  [25] ragg_1.3.3                  xfun_0.49                  
#>  [27] zlibbioc_1.50.0             cachem_1.1.0               
#>  [29] GenomeInfoDb_1.40.1         jsonlite_1.8.9             
#>  [31] biomformat_1.32.0           rhdf5filters_1.16.0        
#>  [33] DelayedArray_0.30.1         Rhdf5lib_1.26.0            
#>  [35] BiocParallel_1.38.0         jpeg_0.1-10                
#>  [37] parallel_4.4.2              cluster_2.1.6              
#>  [39] R6_2.5.1                    RColorBrewer_1.1-3         
#>  [41] bslib_0.8.0                 stringi_1.8.4              
#>  [43] GenomicRanges_1.56.2        jquerylib_0.1.4            
#>  [45] SummarizedExperiment_1.34.0 iterators_1.0.14           
#>  [47] knitr_1.49                  mixtools_2.0.0             
#>  [49] IRanges_2.38.1              Matrix_1.7-1               
#>  [51] splines_4.4.2               igraph_2.1.2               
#>  [53] tidyselect_1.2.1            rstudioapi_0.17.1          
#>  [55] abind_1.4-8                 yaml_2.3.10                
#>  [57] vegan_2.6-8                 codetools_0.2-20           
#>  [59] hwriter_1.3.2.1             lattice_0.22-6             
#>  [61] tibble_3.2.1                plyr_1.8.9                 
#>  [63] Biobase_2.64.0              withr_3.0.2                
#>  [65] ShortRead_1.62.0            evaluate_1.0.1             
#>  [67] desc_1.4.3                  survival_3.7-0             
#>  [69] RcppParallel_5.1.9          kernlab_0.9-33             
#>  [71] Biostrings_2.72.1           pillar_1.9.0               
#>  [73] MatrixGenerics_1.16.0       foreach_1.5.2              
#>  [75] stats4_4.4.2                plotly_4.10.4              
#>  [77] generics_0.1.3              S4Vectors_0.42.1           
#>  [79] munsell_0.5.1               scales_1.3.0               
#>  [81] glue_1.8.0                  lazyeval_0.2.2             
#>  [83] tools_4.4.2                 interp_1.1-6               
#>  [85] data.table_1.16.4           GenomicAlignments_1.40.0   
#>  [87] fs_1.6.5                    rhdf5_2.48.0               
#>  [89] grid_4.4.2                  tidyr_1.3.1                
#>  [91] ape_5.8                     latticeExtra_0.6-30        
#>  [93] colorspace_2.1-1            nlme_3.1-166               
#>  [95] GenomeInfoDbData_1.2.12     cli_3.6.3                  
#>  [97] textshaping_0.4.1           fansi_1.0.6                
#>  [99] segmented_2.1-3             viridisLite_0.4.2          
#> [101] S4Arrays_1.4.1              gtable_0.3.6               
#> [103] sass_0.4.9                  digest_0.6.37              
#> [105] BiocGenerics_0.50.0         SparseArray_1.4.8          
#> [107] htmlwidgets_1.6.4           htmltools_0.5.8.1          
#> [109] pkgdown_2.1.1               multtest_2.60.0            
#> [111] lifecycle_1.0.4             httr_1.4.7                 
#> [113] MASS_7.3-61