Data preparation

library(massdataset)
library(tidyverse)
library(metid)

ms1_data =
  readr::read_csv(file.path(
    system.file("ms1_peak", package = "metid"),
    "ms1.peak.table.csv"
  ))

ms1_data = data.frame(ms1_data, sample1 = 1, sample2 = 2)

expression_data = ms1_data %>%
  dplyr::select(-c(name:rt))

variable_info =
  ms1_data %>%
  dplyr::select(name:rt) %>%
  dplyr::rename(variable_id = name)

sample_info =
  data.frame(
    sample_id = colnames(expression_data),
    injection.order = c(1, 2),
    class = c("Subject", "Subject"),
    group = c("Subject", "Subject")
  )
rownames(expression_data) = variable_info$variable_id

object = create_mass_dataset(
  expression_data = expression_data,
  sample_info = sample_info,
  variable_info = variable_info
)

object
#> -------------------- 
#> massdataset version: 0.01 
#> -------------------- 
#> 1.expression_data:[ 100 x 2 data.frame]
#> 2.sample_info:[ 2 x 4 data.frame]
#> 3.variable_info:[ 100 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> -------------------- 
#> Processing information (extract_process_info())
#> Creation ---------- 
#>       Package         Function.used                Time
#> 1 massdataset create_mass_dataset() 2021-12-27 12:11:00

Add MS2 to mass_dataset object

path = "./example"
dir.create(path)

ms2_data <- system.file("ms2_data", package = "metid")
file.copy(
  from = file.path(ms2_data, "QC1_MSMS_NCE25.mgf"),
  to = path,
  overwrite = TRUE,
  recursive = TRUE
)
#> [1] TRUE

object =
  massdataset::mutate_ms2(
    object = object,
    column = "rp",
    polarity = "positive",
    ms1.ms2.match.mz.tol = 10,
    ms1.ms2.match.rt.tol = 30
  )
#> Reading mgf data...
#> 25 out of 100 variable have MS2 spectra.
#> Selecting the most intense MS2 spectrum for each peak...

object
#> -------------------- 
#> massdataset version: 0.01 
#> -------------------- 
#> 1.expression_data:[ 100 x 2 data.frame]
#> 2.sample_info:[ 2 x 4 data.frame]
#> 3.variable_info:[ 100 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 25 variables x 24 MS2 spectra]
#> -------------------- 
#> Processing information (extract_process_info())
#> Creation ---------- 
#>       Package         Function.used                Time
#> 1 massdataset create_mass_dataset() 2021-12-27 12:11:00
#> update_mass_dataset ---------- 
#>       Package         Function.used                Time
#> 1 massdataset update_mass_dataset() 2021-12-27 12:11:00
#> mutate_ms2 ---------- 
#>       Package Function.used                Time
#> 1 massdataset  mutate_ms2() 2021-12-27 12:11:04

object@ms2_data
#> $QC1_MSMS_NCE25.mgf
#> -------------------- 
#> column: rp 
#> polarity: positive 
#> mz_tol: 10 
#> rt_tol (second): 30 
#> -------------------- 
#> 25 variables:
#> pRPLC_603 pRPLC_722 pRPLC_778 pRPLC_1046 pRPLC_1112...
#> 24 MS2 spectra.
#> mz162.112442157672rt37.9743312 mz181.072050304971rt226.14144 mz289.227264404297rt284.711172 mz181.072050673093rt196.800648 mz209.092155077047rt58.3735608...
#> 

Identify metabolites according to MS1

data("hmdb_ms1_database0.0.3", package = "metid")
hmdb_ms1_database0.0.3
#> 
[33m-----------Base information------------
#> 
[39m
[32mVersion: 0.0.1 
#> 
[39m
[32mSource: HMDB 
#> 
[39m
[32mLink: http://www.hmdb.ca/ 
#> 
[39m
[32mCreater: Xiaotao Shen ( shenxt@stanford.edu )
#> 
[39m
[32mWithout RT informtaion
#> 
[39m
[33m-----------Spectral information------------
#> 
[39m
[32mThere are 40 items of metabolites in database:
#> 
[39m
[32mLab.ID; Compound.name; mz; RT; CAS.ID; HMDB.ID; KEGG.ID; Formula; Biospecimen.locations; Tissue.locations; Status; Secondary.accession.numbers; Synonyms; Average.molecular.weight; IUPQC.name; Traditional.IUPAC.name; SMILES; inchi; inchikey; State; Foodb.ID; Chemspider.ID; Pubchem.compound.ID; Drugbank.ID; CHEBI.ID; PDB.ID; Phenol.explorer.compound.ID; Knapsack.ID; BIGG.ID; Wikipedia.ID; METLIN.ID; Biocyc.ID; Kingdom; Super.class; Class; Sub.class; Source; Pathway; Pathway2; Disease 
#> 
[39m
[32mThere are 114004 metabolites in total
#> 
[39m
[32mThere are 0 metabolites in positive mode with MS2 spectra.
#> 
[39m
[32mThere are 0 metabolites in negative mode with MS2 spectra.
#> 
[39m
[32mCollision energy in positive mode (number:):
#> 
[39m
[32mTotal number: 0 
#> 
[39m
[32m 
#> 
[39m
[32mCollision energy in negative mode:
#> 
[39m
[32mTotal number: 0 
#> 
[39m
[32m 
#> 
[39m
object1 =
  annotate_metabolites_mass_dataset(object = object,
                                    database = hmdb_ms1_database0.0.3)
#> 
[33mNo MS2 data in database, so only use mz and/or RT for matching.
#> 
[39m
[33mYou set rt.match.tol < 10,000, so if the compounds have RT,  RTs will be used for matching
#> 
[39m
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |======================================================================| 100%
#> 
#> 
[41mAll done.
#> 
[49m
object1
#> -------------------- 
#> massdataset version: 0.01 
#> -------------------- 
#> 1.expression_data:[ 100 x 2 data.frame]
#> 2.sample_info:[ 2 x 4 data.frame]
#> 3.variable_info:[ 100 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 25 variables x 24 MS2 spectra]
#> -------------------- 
#> Processing information (extract_process_info())
#> Creation ---------- 
#>       Package         Function.used                Time
#> 1 massdataset create_mass_dataset() 2021-12-27 12:11:00
#> update_mass_dataset ---------- 
#>       Package         Function.used                Time
#> 1 massdataset update_mass_dataset() 2021-12-27 12:11:00
#> mutate_ms2 ---------- 
#>       Package Function.used                Time
#> 1 massdataset  mutate_ms2() 2021-12-27 12:11:04
#> annotate_metabolites_mass_dataset ---------- 
#>   Package                       Function.used                Time
#> 1   metid annotate_metabolites_mass_dataset() 2021-12-27 12:11:12

Identify metabolites according to MS2

data("snyder_database_rplc0.0.3", package = "metid")

object2 =
  annotate_metabolites_mass_dataset(object = object1,
                                    database = snyder_database_rplc0.0.3)
#> 
[33mQC1_MSMS_NCE25.mgf file:
#> 
[39m
[32m25 MS2 spectra.
#> 
[39m
[33mUse all CE values.
#> 
[39m
#> 
[32mIdentifing metabolites with MS/MS database...
#> 
[39m
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |======================================================================| 100%
#> 
#> 
[41mAll done.
#> 
[49m
head(object2@annotation_table)
#> 
[38;5;246m# A tibble: 6 × 18
[39m
#>   variable_id ms2_files_id  ms2_spectrum_id Compound.name CAS.ID HMDB.ID KEGG.ID
#>   
[3m
[38;5;246m<chr>
[39m
[23m       
[3m
[38;5;246m<chr>
[39m
[23m         
[3m
[38;5;246m<chr>
[39m
[23m           
[3m
[38;5;246m<chr>
[39m
[23m         
[3m
[38;5;246m<chr>
[39m
[23m  
[3m
[38;5;246m<chr>
[39m
[23m   
[3m
[38;5;246m<chr>
[39m
[23m  
#> 
[38;5;250m1
[39m pRPLC_10319 
[31mNA
[39m            
[31mNA
[39m              Tyrosyl-Seri… 
[31mNA
[39m     HMDB00… 
[31mNA
[39m     
#> 
[38;5;250m2
[39m pRPLC_10319 
[31mNA
[39m            
[31mNA
[39m              Seryltyrosine 21435… HMDB00… 
[31mNA
[39m     
#> 
[38;5;250m3
[39m pRPLC_10319 
[31mNA
[39m            
[31mNA
[39m              Monoglycerid… 36291… HMDB00… 
[31mNA
[39m     
#> 
[38;5;250m4
[39m pRPLC_1046  QC1_MSMS_NCE… mz181.07205067… Theophylline  611-5… HMDB01… C07130 
#> 
[38;5;250m5
[39m pRPLC_1046  QC1_MSMS_NCE… mz181.07205067… Paraxanthine  611-5… HMDB01… C13747 
#> 
[38;5;250m6
[39m pRPLC_1046  QC1_MSMS_NCE… mz181.07205067… Theophylline  
[31mNA
[39m     HMDB00… 
[31mNA
[39m     
#> 
[38;5;246m# … with 11 more variables: Lab.ID <chr>, Adduct <chr>, mz.error <dbl>,
[39m
#> 
[38;5;246m#   mz.match.score <dbl>, RT.error <dbl>, RT.match.score <dbl>, CE <chr>,
[39m
#> 
[38;5;246m#   SS <dbl>, Total.score <dbl>, Database <chr>, Level <dbl>
[39m
head(extract_variable_info(object = object2))
#>   variable_id       mz      rt                             Compound.name
#> 1   pRPLC_376 472.3032 772.906                           Allocholic acid
#> 2   pRPLC_391 466.3292 746.577                         LysoPA(18:0e/0:0)
#> 3   pRPLC_603 162.1125  33.746                               L-Carnitine
#> 4   pRPLC_629 181.0720  36.360                              Paraxanthine
#> 5   pRPLC_685 230.0701 158.205 1-(Methylsulfinyl)propyl propyl disulfide
#> 6   pRPLC_722 181.0721 228.305                              Theophylline
#>        CAS.ID     HMDB.ID KEGG.ID      Lab.ID        Adduct   mz.error
#> 1   2464-18-8 HMDB0000505  C17737 HMDB0000505 (M+CH3CN+Na)+ 0.28813692
#> 2        <NA> HMDB0011144    <NA> HMDB0011144  (M+CH3CN+H)+ 0.03145203
#> 3    541-15-1   HMDB00062  C00318    RPLC_406        (M+H)+ 1.66789418
#> 4    611-59-6 HMDB0001860  C13747 HMDB0001860        (M+H)+ 0.01530000
#> 5 132216-21-8 HMDB0033061    <NA> HMDB0033061      (M+NH4)+ 0.39101000
#> 6        <NA> HMDB0001889    <NA>    RPLC_443        (M+H)+ 1.68826243
#>   mz.match.score  RT.error RT.match.score    CE        SS Total.score
#> 1      0.9999336        NA             NA  <NA>        NA   0.9999336
#> 2      0.9999992        NA             NA  <NA>        NA   0.9999992
#> 3      0.9977770  1.974331      0.9978368 NCE25 0.6048288   0.8013178
#> 4      0.9999998        NA             NA  <NA>        NA   0.9999998
#> 5      0.9998777        NA             NA  <NA>        NA   0.9998777
#> 6      0.9977224 17.615671      0.8416462 NCE25 0.6071017   0.7633930
#>     Database Level
#> 1 HMDB_0.0.1     3
#> 2 HMDB_0.0.1     3
#> 3   MS_0.0.2     1
#> 4 HMDB_0.0.1     3
#> 5 HMDB_0.0.1     3
#> 6   MS_0.0.2     1

Identify metabolites according another database

data("orbitrap_database0.0.3", package = "metid")

object3 =
  annotate_metabolites_mass_dataset(object = object2,
                                    database = orbitrap_database0.0.3)
#> 
[33mNo RT information in database.
#> The weight of RT have been set as 0.
#> 
[39m
[33mQC1_MSMS_NCE25.mgf file:
#> 
[39m
[32m25 MS2 spectra.
#> 
[39m
[33mUse all CE values.
#> 
[39m
#> 
[32mIdentifing metabolites with MS/MS database...
#> 
[39m
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |======================================================================| 100%
#> 
#> 
[41mAll done.
#> 
[49m
head(extract_variable_info(object = object3))
#>   variable_id       mz      rt                             Compound.name
#> 1   pRPLC_376 472.3032 772.906                           Allocholic acid
#> 2   pRPLC_391 466.3292 746.577                         LysoPA(18:0e/0:0)
#> 3   pRPLC_603 162.1125  33.746                               L-Carnitine
#> 4   pRPLC_629 181.0720  36.360                              Paraxanthine
#> 5   pRPLC_685 230.0701 158.205 1-(Methylsulfinyl)propyl propyl disulfide
#> 6   pRPLC_722 181.0721 228.305                              Theophylline
#>        CAS.ID     HMDB.ID KEGG.ID      Lab.ID        Adduct   mz.error
#> 1   2464-18-8 HMDB0000505  C17737 HMDB0000505 (M+CH3CN+Na)+ 0.28813692
#> 2        <NA> HMDB0011144    <NA> HMDB0011144  (M+CH3CN+H)+ 0.03145203
#> 3    541-15-1   HMDB00062  C00318    RPLC_406        (M+H)+ 1.66789418
#> 4    611-59-6 HMDB0001860  C13747 HMDB0001860        (M+H)+ 0.01530000
#> 5 132216-21-8 HMDB0033061    <NA> HMDB0033061      (M+NH4)+ 0.39101000
#> 6        <NA> HMDB0001889    <NA>    RPLC_443        (M+H)+ 1.68826243
#>   mz.match.score  RT.error RT.match.score    CE        SS Total.score
#> 1      0.9999336        NA             NA  <NA>        NA   0.9999336
#> 2      0.9999992        NA             NA  <NA>        NA   0.9999992
#> 3      0.9977770  1.974331      0.9978368 NCE25 0.6048288   0.8013178
#> 4      0.9999998        NA             NA  <NA>        NA   0.9999998
#> 5      0.9998777        NA             NA  <NA>        NA   0.9998777
#> 6      0.9977224 17.615671      0.8416462 NCE25 0.6071017   0.7633930
#>     Database Level
#> 1 HMDB_0.0.1     3
#> 2 HMDB_0.0.1     3
#> 3   MS_0.0.2     1
#> 4 HMDB_0.0.1     3
#> 5 HMDB_0.0.1     3
#> 6   MS_0.0.2     1

Session information

sessionInfo()
#> R version 4.1.0 (2021-05-18)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Big Sur 10.16
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
#> 
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] metid_1.2.1        forcats_0.5.1      stringr_1.4.0      dplyr_1.0.7       
#>  [5] purrr_0.3.4        readr_2.0.0        tidyr_1.1.3        tibble_3.1.3      
#>  [9] ggplot2_3.3.5      tidyverse_1.3.1    magrittr_2.0.1     tinytools_0.9.1   
#> [13] massdataset_0.99.1
#> 
#> loaded via a namespace (and not attached):
#>   [1] colorspace_2.0-2      rjson_0.2.20          ellipsis_0.3.2       
#>   [4] leaflet_2.0.4.1       rprojroot_2.0.2       circlize_0.4.14      
#>   [7] GlobalOptions_0.1.2   fs_1.5.0              clue_0.3-59          
#>  [10] rstudioapi_0.13       listenv_0.8.0         furrr_0.2.3          
#>  [13] mzR_2.26.1            affyio_1.62.0         fansi_0.5.0          
#>  [16] lubridate_1.7.10      xml2_1.3.2            codetools_0.2-18     
#>  [19] ncdf4_1.17            doParallel_1.0.16     cachem_1.0.5         
#>  [22] impute_1.66.0         knitr_1.33            jsonlite_1.7.2       
#>  [25] Cairo_1.5-12.2        broom_0.7.9           vsn_3.60.0           
#>  [28] cluster_2.1.2         dbplyr_2.1.1          png_0.1-7            
#>  [31] BiocManager_1.30.16   compiler_4.1.0        httr_1.4.2           
#>  [34] rvcheck_0.1.8         backports_1.2.1       assertthat_0.2.1     
#>  [37] fastmap_1.1.0         lazyeval_0.2.2        limma_3.48.1         
#>  [40] cli_3.0.1             htmltools_0.5.2       tools_4.1.0          
#>  [43] affy_1.70.0           gtable_0.3.0          glue_1.4.2           
#>  [46] MALDIquant_1.20       Rcpp_1.0.7            Biobase_2.52.0       
#>  [49] cellranger_1.1.0      jquerylib_0.1.4       pkgdown_2.0.1        
#>  [52] vctrs_0.3.8           preprocessCore_1.54.0 iterators_1.0.13     
#>  [55] crosstalk_1.1.1       xfun_0.24             globals_0.14.0       
#>  [58] openxlsx_4.2.4        rvest_1.0.1           lifecycle_1.0.0      
#>  [61] XML_3.99-0.6          future_1.21.0         zlibbioc_1.38.0      
#>  [64] MASS_7.3-54           scales_1.1.1          MSnbase_2.18.0       
#>  [67] pcaMethods_1.84.0     ragg_1.1.3            hms_1.1.0            
#>  [70] ProtGenerics_1.24.0   parallel_4.1.0        RColorBrewer_1.1-2   
#>  [73] ComplexHeatmap_2.8.0  yaml_2.2.1            memoise_2.0.0        
#>  [76] pbapply_1.4-3         sass_0.4.0            stringi_1.7.3        
#>  [79] S4Vectors_0.30.0      desc_1.3.0            foreach_1.5.1        
#>  [82] BiocGenerics_0.38.0   zip_2.2.0             BiocParallel_1.26.1  
#>  [85] shape_1.4.6           rlang_0.4.11          pkgconfig_2.0.3      
#>  [88] systemfonts_1.0.2     matrixStats_0.60.0    mzID_1.30.0          
#>  [91] evaluate_0.14         lattice_0.20-44       htmlwidgets_1.5.3    
#>  [94] tidyselect_1.1.1      parallelly_1.27.0     ggsci_2.9            
#>  [97] plyr_1.8.6            R6_2.5.0              IRanges_2.26.0       
#> [100] generics_0.1.0        DBI_1.1.1             pillar_1.6.2         
#> [103] haven_2.4.1           withr_2.4.2           MsCoreUtils_1.4.0    
#> [106] modelr_0.1.8          crayon_1.4.1          utf8_1.2.2           
#> [109] plotly_4.9.4.1        tzdb_0.1.2            rmarkdown_2.9        
#> [112] GetoptLong_1.0.5      grid_4.1.0            readxl_1.3.1         
#> [115] data.table_1.14.0     reprex_2.0.0          digest_0.6.27        
#> [118] gridGraphics_0.5-1    textshaping_0.3.6     stats4_4.1.0         
#> [121] munsell_0.5.0         viridisLite_0.4.0     ggplotify_0.0.8      
#> [124] bslib_0.3.1