vignettes/work_with_mass_dataset.Rmd
work_with_mass_dataset.Rmd
library(massdataset)
library(tidyverse)
library(metid)
ms1_data =
readr::read_csv(file.path(
system.file("ms1_peak", package = "metid"),
"ms1.peak.table.csv"
))
ms1_data = data.frame(ms1_data, sample1 = 1, sample2 = 2)
expression_data = ms1_data %>%
dplyr::select(-c(name:rt))
variable_info =
ms1_data %>%
dplyr::select(name:rt) %>%
dplyr::rename(variable_id = name)
sample_info =
data.frame(
sample_id = colnames(expression_data),
injection.order = c(1, 2),
class = c("Subject", "Subject"),
group = c("Subject", "Subject")
)
rownames(expression_data) = variable_info$variable_id
object = create_mass_dataset(
expression_data = expression_data,
sample_info = sample_info,
variable_info = variable_info
)
object
#> --------------------
#> massdataset version: 0.01
#> --------------------
#> 1.expression_data:[ 100 x 2 data.frame]
#> 2.sample_info:[ 2 x 4 data.frame]
#> 3.variable_info:[ 100 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information (extract_process_info())
#> Creation ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2021-12-27 12:11:00
mass_dataset
object
path = "./example"
dir.create(path)
ms2_data <- system.file("ms2_data", package = "metid")
file.copy(
from = file.path(ms2_data, "QC1_MSMS_NCE25.mgf"),
to = path,
overwrite = TRUE,
recursive = TRUE
)
#> [1] TRUE
object =
massdataset::mutate_ms2(
object = object,
column = "rp",
polarity = "positive",
ms1.ms2.match.mz.tol = 10,
ms1.ms2.match.rt.tol = 30
)
#> Reading mgf data...
#> 25 out of 100 variable have MS2 spectra.
#> Selecting the most intense MS2 spectrum for each peak...
object
#> --------------------
#> massdataset version: 0.01
#> --------------------
#> 1.expression_data:[ 100 x 2 data.frame]
#> 2.sample_info:[ 2 x 4 data.frame]
#> 3.variable_info:[ 100 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 25 variables x 24 MS2 spectra]
#> --------------------
#> Processing information (extract_process_info())
#> Creation ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2021-12-27 12:11:00
#> update_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset update_mass_dataset() 2021-12-27 12:11:00
#> mutate_ms2 ----------
#> Package Function.used Time
#> 1 massdataset mutate_ms2() 2021-12-27 12:11:04
object@ms2_data
#> $QC1_MSMS_NCE25.mgf
#> --------------------
#> column: rp
#> polarity: positive
#> mz_tol: 10
#> rt_tol (second): 30
#> --------------------
#> 25 variables:
#> pRPLC_603 pRPLC_722 pRPLC_778 pRPLC_1046 pRPLC_1112...
#> 24 MS2 spectra.
#> mz162.112442157672rt37.9743312 mz181.072050304971rt226.14144 mz289.227264404297rt284.711172 mz181.072050673093rt196.800648 mz209.092155077047rt58.3735608...
#>
data("hmdb_ms1_database0.0.3", package = "metid")
0.3
hmdb_ms1_database0.#>
[33m-----------Base information------------
#>
[39m
[32mVersion: 0.0.1
#>
[39m
[32mSource: HMDB
#>
[39m
[32mLink: http://www.hmdb.ca/
#>
[39m
[32mCreater: Xiaotao Shen ( shenxt@stanford.edu )
#>
[39m
[32mWithout RT informtaion
#>
[39m
[33m-----------Spectral information------------
#>
[39m
[32mThere are 40 items of metabolites in database:
#>
[39m
[32mLab.ID; Compound.name; mz; RT; CAS.ID; HMDB.ID; KEGG.ID; Formula; Biospecimen.locations; Tissue.locations; Status; Secondary.accession.numbers; Synonyms; Average.molecular.weight; IUPQC.name; Traditional.IUPAC.name; SMILES; inchi; inchikey; State; Foodb.ID; Chemspider.ID; Pubchem.compound.ID; Drugbank.ID; CHEBI.ID; PDB.ID; Phenol.explorer.compound.ID; Knapsack.ID; BIGG.ID; Wikipedia.ID; METLIN.ID; Biocyc.ID; Kingdom; Super.class; Class; Sub.class; Source; Pathway; Pathway2; Disease
#>
[39m
[32mThere are 114004 metabolites in total
#>
[39m
[32mThere are 0 metabolites in positive mode with MS2 spectra.
#>
[39m
[32mThere are 0 metabolites in negative mode with MS2 spectra.
#>
[39m
[32mCollision energy in positive mode (number:):
#>
[39m
[32mTotal number: 0
#>
[39m
[32m
#>
[39m
[32mCollision energy in negative mode:
#>
[39m
[32mTotal number: 0
#>
[39m
[32m
#>
[39m
=
object1 annotate_metabolites_mass_dataset(object = object,
database = hmdb_ms1_database0.0.3)
#>
[33mNo MS2 data in database, so only use mz and/or RT for matching.
#>
[39m
[33mYou set rt.match.tol < 10,000, so if the compounds have RT, RTs will be used for matching
#>
[39m
|
| | 0%
|
|======================= | 33%
|
|=============================================== | 67%
|
|======================================================================| 100%
#>
#>
[41mAll done.
#>
[49m
object1
#> --------------------
#> massdataset version: 0.01
#> --------------------
#> 1.expression_data:[ 100 x 2 data.frame]
#> 2.sample_info:[ 2 x 4 data.frame]
#> 3.variable_info:[ 100 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 25 variables x 24 MS2 spectra]
#> --------------------
#> Processing information (extract_process_info())
#> Creation ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2021-12-27 12:11:00
#> update_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset update_mass_dataset() 2021-12-27 12:11:00
#> mutate_ms2 ----------
#> Package Function.used Time
#> 1 massdataset mutate_ms2() 2021-12-27 12:11:04
#> annotate_metabolites_mass_dataset ----------
#> Package Function.used Time
#> 1 metid annotate_metabolites_mass_dataset() 2021-12-27 12:11:12
data("snyder_database_rplc0.0.3", package = "metid")
=
object2 annotate_metabolites_mass_dataset(object = object1,
database = snyder_database_rplc0.0.3)
#>
[33mQC1_MSMS_NCE25.mgf file:
#>
[39m
[32m25 MS2 spectra.
#>
[39m
[33mUse all CE values.
#>
[39m
#>
[32mIdentifing metabolites with MS/MS database...
#>
[39m
|
| | 0%
|
|======================= | 33%
|
|=============================================== | 67%
|
|======================================================================| 100%
#>
#>
[41mAll done.
#>
[49m
head(object2@annotation_table)
#>
[38;5;246m# A tibble: 6 × 18
[39m
#> variable_id ms2_files_id ms2_spectrum_id Compound.name CAS.ID HMDB.ID KEGG.ID
#>
[3m
[38;5;246m<chr>
[39m
[23m
[3m
[38;5;246m<chr>
[39m
[23m
[3m
[38;5;246m<chr>
[39m
[23m
[3m
[38;5;246m<chr>
[39m
[23m
[3m
[38;5;246m<chr>
[39m
[23m
[3m
[38;5;246m<chr>
[39m
[23m
[3m
[38;5;246m<chr>
[39m
[23m
#>
[38;5;250m1
[39m pRPLC_10319
[31mNA
[39m
[31mNA
[39m Tyrosyl-Seri…
[31mNA
[39m HMDB00…
[31mNA
[39m
#>
[38;5;250m2
[39m pRPLC_10319
[31mNA
[39m
[31mNA
[39m Seryltyrosine 21435… HMDB00…
[31mNA
[39m
#>
[38;5;250m3
[39m pRPLC_10319
[31mNA
[39m
[31mNA
[39m Monoglycerid… 36291… HMDB00…
[31mNA
[39m
#>
[38;5;250m4
[39m pRPLC_1046 QC1_MSMS_NCE… mz181.07205067… Theophylline 611-5… HMDB01… C07130
#>
[38;5;250m5
[39m pRPLC_1046 QC1_MSMS_NCE… mz181.07205067… Paraxanthine 611-5… HMDB01… C13747
#>
[38;5;250m6
[39m pRPLC_1046 QC1_MSMS_NCE… mz181.07205067… Theophylline
[31mNA
[39m HMDB00…
[31mNA
[39m
#>
[38;5;246m# … with 11 more variables: Lab.ID <chr>, Adduct <chr>, mz.error <dbl>,
[39m
#>
[38;5;246m# mz.match.score <dbl>, RT.error <dbl>, RT.match.score <dbl>, CE <chr>,
[39m
#>
[38;5;246m# SS <dbl>, Total.score <dbl>, Database <chr>, Level <dbl>
[39m
head(extract_variable_info(object = object2))
#> variable_id mz rt Compound.name
#> 1 pRPLC_376 472.3032 772.906 Allocholic acid
#> 2 pRPLC_391 466.3292 746.577 LysoPA(18:0e/0:0)
#> 3 pRPLC_603 162.1125 33.746 L-Carnitine
#> 4 pRPLC_629 181.0720 36.360 Paraxanthine
#> 5 pRPLC_685 230.0701 158.205 1-(Methylsulfinyl)propyl propyl disulfide
#> 6 pRPLC_722 181.0721 228.305 Theophylline
#> CAS.ID HMDB.ID KEGG.ID Lab.ID Adduct mz.error
#> 1 2464-18-8 HMDB0000505 C17737 HMDB0000505 (M+CH3CN+Na)+ 0.28813692
#> 2 <NA> HMDB0011144 <NA> HMDB0011144 (M+CH3CN+H)+ 0.03145203
#> 3 541-15-1 HMDB00062 C00318 RPLC_406 (M+H)+ 1.66789418
#> 4 611-59-6 HMDB0001860 C13747 HMDB0001860 (M+H)+ 0.01530000
#> 5 132216-21-8 HMDB0033061 <NA> HMDB0033061 (M+NH4)+ 0.39101000
#> 6 <NA> HMDB0001889 <NA> RPLC_443 (M+H)+ 1.68826243
#> mz.match.score RT.error RT.match.score CE SS Total.score
#> 1 0.9999336 NA NA <NA> NA 0.9999336
#> 2 0.9999992 NA NA <NA> NA 0.9999992
#> 3 0.9977770 1.974331 0.9978368 NCE25 0.6048288 0.8013178
#> 4 0.9999998 NA NA <NA> NA 0.9999998
#> 5 0.9998777 NA NA <NA> NA 0.9998777
#> 6 0.9977224 17.615671 0.8416462 NCE25 0.6071017 0.7633930
#> Database Level
#> 1 HMDB_0.0.1 3
#> 2 HMDB_0.0.1 3
#> 3 MS_0.0.2 1
#> 4 HMDB_0.0.1 3
#> 5 HMDB_0.0.1 3
#> 6 MS_0.0.2 1
data("orbitrap_database0.0.3", package = "metid")
=
object3 annotate_metabolites_mass_dataset(object = object2,
database = orbitrap_database0.0.3)
#>
[33mNo RT information in database.
#> The weight of RT have been set as 0.
#>
[39m
[33mQC1_MSMS_NCE25.mgf file:
#>
[39m
[32m25 MS2 spectra.
#>
[39m
[33mUse all CE values.
#>
[39m
#>
[32mIdentifing metabolites with MS/MS database...
#>
[39m
|
| | 0%
|
|======================= | 33%
|
|=============================================== | 67%
|
|======================================================================| 100%
#>
#>
[41mAll done.
#>
[49m
head(extract_variable_info(object = object3))
#> variable_id mz rt Compound.name
#> 1 pRPLC_376 472.3032 772.906 Allocholic acid
#> 2 pRPLC_391 466.3292 746.577 LysoPA(18:0e/0:0)
#> 3 pRPLC_603 162.1125 33.746 L-Carnitine
#> 4 pRPLC_629 181.0720 36.360 Paraxanthine
#> 5 pRPLC_685 230.0701 158.205 1-(Methylsulfinyl)propyl propyl disulfide
#> 6 pRPLC_722 181.0721 228.305 Theophylline
#> CAS.ID HMDB.ID KEGG.ID Lab.ID Adduct mz.error
#> 1 2464-18-8 HMDB0000505 C17737 HMDB0000505 (M+CH3CN+Na)+ 0.28813692
#> 2 <NA> HMDB0011144 <NA> HMDB0011144 (M+CH3CN+H)+ 0.03145203
#> 3 541-15-1 HMDB00062 C00318 RPLC_406 (M+H)+ 1.66789418
#> 4 611-59-6 HMDB0001860 C13747 HMDB0001860 (M+H)+ 0.01530000
#> 5 132216-21-8 HMDB0033061 <NA> HMDB0033061 (M+NH4)+ 0.39101000
#> 6 <NA> HMDB0001889 <NA> RPLC_443 (M+H)+ 1.68826243
#> mz.match.score RT.error RT.match.score CE SS Total.score
#> 1 0.9999336 NA NA <NA> NA 0.9999336
#> 2 0.9999992 NA NA <NA> NA 0.9999992
#> 3 0.9977770 1.974331 0.9978368 NCE25 0.6048288 0.8013178
#> 4 0.9999998 NA NA <NA> NA 0.9999998
#> 5 0.9998777 NA NA <NA> NA 0.9998777
#> 6 0.9977224 17.615671 0.8416462 NCE25 0.6071017 0.7633930
#> Database Level
#> 1 HMDB_0.0.1 3
#> 2 HMDB_0.0.1 3
#> 3 MS_0.0.2 1
#> 4 HMDB_0.0.1 3
#> 5 HMDB_0.0.1 3
#> 6 MS_0.0.2 1
sessionInfo()
#> R version 4.1.0 (2021-05-18)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Big Sur 10.16
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] metid_1.2.1 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.7
#> [5] purrr_0.3.4 readr_2.0.0 tidyr_1.1.3 tibble_3.1.3
#> [9] ggplot2_3.3.5 tidyverse_1.3.1 magrittr_2.0.1 tinytools_0.9.1
#> [13] massdataset_0.99.1
#>
#> loaded via a namespace (and not attached):
#> [1] colorspace_2.0-2 rjson_0.2.20 ellipsis_0.3.2
#> [4] leaflet_2.0.4.1 rprojroot_2.0.2 circlize_0.4.14
#> [7] GlobalOptions_0.1.2 fs_1.5.0 clue_0.3-59
#> [10] rstudioapi_0.13 listenv_0.8.0 furrr_0.2.3
#> [13] mzR_2.26.1 affyio_1.62.0 fansi_0.5.0
#> [16] lubridate_1.7.10 xml2_1.3.2 codetools_0.2-18
#> [19] ncdf4_1.17 doParallel_1.0.16 cachem_1.0.5
#> [22] impute_1.66.0 knitr_1.33 jsonlite_1.7.2
#> [25] Cairo_1.5-12.2 broom_0.7.9 vsn_3.60.0
#> [28] cluster_2.1.2 dbplyr_2.1.1 png_0.1-7
#> [31] BiocManager_1.30.16 compiler_4.1.0 httr_1.4.2
#> [34] rvcheck_0.1.8 backports_1.2.1 assertthat_0.2.1
#> [37] fastmap_1.1.0 lazyeval_0.2.2 limma_3.48.1
#> [40] cli_3.0.1 htmltools_0.5.2 tools_4.1.0
#> [43] affy_1.70.0 gtable_0.3.0 glue_1.4.2
#> [46] MALDIquant_1.20 Rcpp_1.0.7 Biobase_2.52.0
#> [49] cellranger_1.1.0 jquerylib_0.1.4 pkgdown_2.0.1
#> [52] vctrs_0.3.8 preprocessCore_1.54.0 iterators_1.0.13
#> [55] crosstalk_1.1.1 xfun_0.24 globals_0.14.0
#> [58] openxlsx_4.2.4 rvest_1.0.1 lifecycle_1.0.0
#> [61] XML_3.99-0.6 future_1.21.0 zlibbioc_1.38.0
#> [64] MASS_7.3-54 scales_1.1.1 MSnbase_2.18.0
#> [67] pcaMethods_1.84.0 ragg_1.1.3 hms_1.1.0
#> [70] ProtGenerics_1.24.0 parallel_4.1.0 RColorBrewer_1.1-2
#> [73] ComplexHeatmap_2.8.0 yaml_2.2.1 memoise_2.0.0
#> [76] pbapply_1.4-3 sass_0.4.0 stringi_1.7.3
#> [79] S4Vectors_0.30.0 desc_1.3.0 foreach_1.5.1
#> [82] BiocGenerics_0.38.0 zip_2.2.0 BiocParallel_1.26.1
#> [85] shape_1.4.6 rlang_0.4.11 pkgconfig_2.0.3
#> [88] systemfonts_1.0.2 matrixStats_0.60.0 mzID_1.30.0
#> [91] evaluate_0.14 lattice_0.20-44 htmlwidgets_1.5.3
#> [94] tidyselect_1.1.1 parallelly_1.27.0 ggsci_2.9
#> [97] plyr_1.8.6 R6_2.5.0 IRanges_2.26.0
#> [100] generics_0.1.0 DBI_1.1.1 pillar_1.6.2
#> [103] haven_2.4.1 withr_2.4.2 MsCoreUtils_1.4.0
#> [106] modelr_0.1.8 crayon_1.4.1 utf8_1.2.2
#> [109] plotly_4.9.4.1 tzdb_0.1.2 rmarkdown_2.9
#> [112] GetoptLong_1.0.5 grid_4.1.0 readxl_1.3.1
#> [115] data.table_1.14.0 reprex_2.0.0 digest_0.6.27
#> [118] gridGraphics_0.5-1 textshaping_0.3.6 stats4_4.1.0
#> [121] munsell_0.5.0 viridisLite_0.4.0 ggplotify_0.0.8
#> [124] bslib_0.3.1