Get information from a custom csv file using taxonomic names present in a phyloseq object

A function to add information from a custom csv file (e.g. FungalTraits, Taxref, ...) to the tax_table slot of a phyloseq object by joining the taxonomic names from the phyloseq object (column `taxonomic_rank`) with a column of the csv file (`csv_taxonomic_rank`) containing the corresponding taxonomic names. Be careful: the taxonomic names in the csv file must match exactly the taxonomic names in the phyloseq object. For example, if the taxonomic names in the phyloseq object are in the form "Genus species", the taxonomic names in the csv file must be in the same form (not "Genus_species" or "Genus Species Author"...).

Note that the csv file needs to be in wide format, i.e. one line for each distinct value in the `csv_taxonomic_rank` column. You may want to reshape your data.frame using the [tidyr::pivot_wider()] function prior to writing it to a new file.

Usage

tax_info_pq(
  physeq = NULL,
  taxnames = NULL,
  taxonomic_rank = "currentCanonicalSimple",
  file_name = NULL,
  csv_taxonomic_rank = NULL,
  add_to_phyloseq = NULL,
  col_prefix = NULL,
  use_duck_db = FALSE,
  csv_cols_select = NULL,
  sep = ",",
  dec = ".",
  verbose = TRUE,
  discard_genus_alone = taxonomic_rank == "currentCanonicalSimple",
  discard_NA = TRUE
)

Arguments

physeq: (optional) A phyloseq object. Either `physeq` or `taxnames` must be provided, but not both.
taxnames: (optional) A character vector of taxonomic names.
taxonomic_rank: (Character, default "currentCanonicalSimple") The column(s) present in the @tax_table slot of the phyloseq object. Can be a vector of two columns (e.g. c("Genus", "Species")).
file_name: (required) A file path to your csv file.
csv_taxonomic_rank: (required) The name of the column in your csv file containing the taxonomic names. Must match the taxonomic_rank of the phyloseq.
add_to_phyloseq: (logical, default TRUE when physeq is provided, FALSE when taxnames is provided) If TRUE, add new column(s) in the tax_table of the phyloseq object. Automatically set to TRUE when a phyloseq object is provided and FALSE when taxnames is provided. Cannot be TRUE if `taxnames` is provided.
col_prefix: A character string to be added as a prefix to the new columns names added to the tax_table slot of the phyloseq object.
use_duck_db: (logical, default FALSE) If TRUE, use duckdb to handle the join between the csv file and the tax_table of the phyloseq object. Useful for large csv files.
csv_cols_select: A character vector of the column names to select in the csv file.
sep: the field separator character. See [utils::read.csv()].
dec: the character used in the file for decimal points. See [utils::read.csv()].
verbose: (logical, default TRUE) If TRUE, print informative messages.

Value

Either a tibble (if add_to_phyloseq = FALSE) or a new phyloseq object, if add_to_phyloseq = TRUE, with new column(s) in the tax_table.

Author

Adrien Taudiere

Examples

if (FALSE) { # \dontrun{
data_fungi_cleanNames <- gna_verifier_pq(data_fungi,
  data_sources = 210
)

# FUNGAL TRAITS example
# --------------------
fungal_traits <- system.file("extdata", "fun_trait_mini.csv", 
  package = "taxinfo") # minidataset for testing
# fungal_traits <- system.file("extdata", "fungal_traits.csv", package = "taxinfo")
fg_traits <- tax_info_pq(data_fungi_cleanNames,
  taxonomic_rank = "genusEpithet",
  file_name = fungal_traits,
  csv_taxonomic_rank = "GENUS",
  col_prefix = "ft_",
  sep = "\t",
  add_to_phyloseq = FALSE
)

table(fg_traits$ft_primary_lifestyle, fg_traits$Guild) |>
  as.data.frame() |>
  filter(Freq > 0) |>
  arrange(desc(Freq)) |>
  head()

# TAXREF example
# --------------------
TAXREFv18_fungi <- system.file("extdata", "TAXREFv18_fungi.csv", package = "taxinfo")

res_with_R <- tax_info_pq(data_fungi_cleanNames,
  file_name = TAXREFv18_fungi,
  csv_taxonomic_rank = "NOM_VALIDE_SIMPLE",
  col_prefix = "taxref_"
)
res_with_duckDB <- tax_info_pq(
  data_fungi_cleanNames,
  file_name = TAXREFv18_fungi,
  csv_taxonomic_rank = "NOM_VALIDE_SIMPLE",
  use_duck_db = TRUE,
  add_to_phyloseq = FALSE,
  col_prefix = "taxref_",
  csv_cols_select = c(
    "RANG", "HABITAT", "FR", "GF", "MAR", "GUA", "SM", "SB",
    "SPM", "MAY", "EPA", "REU", "SA", "TA", "TAAF", "PF", "NC", "WF", "CLI", "URL"
  )
)

data_fungi_cleanNames_2 <- tax_info_pq(
  data_fungi_cleanNames,
  file_name = TAXREFv18_fungi,
  csv_taxonomic_rank = "NOM_VALIDE_SIMPLE",
  use_duck_db = TRUE,
  col_prefix = "taxref_",
  csv_cols_select = c("RANG", "HABITAT", "FR", "URL", "CD_REF")
)
table(data_fungi_cleanNames_2@tax_table[, "taxref_FR"])
table(data_fungi_cleanNames_2@tax_table[, "taxref_HABITAT"])

# TAXREF example (with status)
# --------------------

taxref_status <- system.file("extdata", "bdc_18_01_wider_mini.csv", package = "taxinfo")
data_fungi_cleanNames_3 <- tax_info_pq(data_fungi_cleanNames_2,
  taxonomic_rank = "taxref_CD_REF",
  file_name = taxref_status,
  csv_taxonomic_rank = "CD_REF",
  col_prefix = "st_",
  use_duck_db = TRUE
)

data_fungi_cleanNames_3@tax_table[, "st_BCD_LRR"] |>
  table(useNA = "always")
data_fungi_cleanNames_3@tax_table[, "st_BCD_ZDET"] |>
  table(useNA = "always")
data_fungi_cleanNames_3@tax_table[, "st_BCD_TAXREF_STATUT_BIOGEO"] |>
  table(useNA = "always")

# EPPO (Pest species) example (https://gd.eppo.int/)
# --------------------
# You can visit https://gd.eppo.int/ to download database for other countries
# than France
EPPO_FR <- system.file("extdata", "EPPO_regulated_FR.csv", package = "taxinfo")

res_with_EPPO_FR <- tax_info_pq(data_fungi_cleanNames,
  file_name = EPPO_FR,
  csv_taxonomic_rank = "organism_prefname",
  col_prefix = "EPPO_"
)

res_with_EPPO_FR@tax_table |>
  as.data.frame() |>
  filter(!is.na(EPPO_qlistlabel))
} # }