From 1b23ab7b13793b7945389dc0fd3a2323119de81e Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Sat, 12 Oct 2024 22:09:08 +0200 Subject: [PATCH] remove reference alignment vignette --- .github/workflows/build_deploy.yaml | 9 +- .github/workflows/docker-push.yaml | 1 - vignettes/alignment-to-external-dataset.qmd | 386 -------------------- 3 files changed, 7 insertions(+), 389 deletions(-) delete mode 100644 vignettes/alignment-to-external-dataset.qmd diff --git a/.github/workflows/build_deploy.yaml b/.github/workflows/build_deploy.yaml index cfa25d8..bd11f55 100644 --- a/.github/workflows/build_deploy.yaml +++ b/.github/workflows/build_deploy.yaml @@ -1,8 +1,13 @@ name: pkgdown on: - repository_dispatch: - types: [trigger-workflow-2] + push: + branches: + - jomain + +# on: +# repository_dispatch: +# types: [trigger-workflow-2] jobs: build-and-deploy: diff --git a/.github/workflows/docker-push.yaml b/.github/workflows/docker-push.yaml index 6fd5b80..bec5489 100644 --- a/.github/workflows/docker-push.yaml +++ b/.github/workflows/docker-push.yaml @@ -6,7 +6,6 @@ on: push: branches: - main - - jomain jobs: build-docker-and-push: diff --git a/vignettes/alignment-to-external-dataset.qmd b/vignettes/alignment-to-external-dataset.qmd deleted file mode 100644 index 0603847..0000000 --- a/vignettes/alignment-to-external-dataset.qmd +++ /dev/null @@ -1,386 +0,0 @@ ---- -title: "Seamless Alignment: Merging New Data with and Existing Preprocessed Dataset" -format: html -editor: visual -minimal: true -date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' -vignette: > - %\VignetteIndexEntry{Seamless Alignment: Merging New Data with and Existing Preprocessed Dataset} - %\VignetteEngine{quarto::html} - %\VignetteEncoding{UTF-8} ---- - -```{r setup, include=FALSE} -library(knitr) -library(quarto) -knitr::opts_knit$set(root.dir = './') -``` - -# Introduction - -In certain experiments, aligning datasets recorded at different times is -necessary. This can involve comparing runs of the same samples from different -laboratories or matching MS2 data to an initial MS1 run. Variation in retention -time across laboratories and LC systems often requires an alignment step using -`adjustRtime()` with the `LamaParama` parameter. - -As described in the [data -description](https://rformassspectrometry.github.io/metabonaut/articles/dataset-investigation.html) -vignette, some of our samples were run twice: once in LC-MS mode and again in -LC-MS/MS mode. This tutorial will show how to align the LC-MS/MS run to the -preprocessed LC-MS dataset. - -The following packages are needed: - -```{r, message=FALSE, warning=FALSE} -library(MsIO) -library(MsBackendMetaboLights) -library(xcms) -library(MsExperiment) -library(Spectra) -``` - -Setting parallel processing to improve the efficiency of the process: - -```{r, message=FALSE, warning=FALSE} -register(SerialParam()) -#' Set up parallel processing using 2 cores -## if (.Platform$OS.type == "unix") { -## register(MulticoreParam(2)) -## } else { -## register(SnowParam(2)) -## } - -``` - -## Load preprocessed LC-MS object - -First, let's load our pre-processed LC-MS object, all the steps to get this -object is shown in the [End-to-end -worflow](https://rformassspectrometry.github.io/metabonaut/articles/end-to-end-untargeted-metabolomics.html) -vignette. - -```{r} -load("preprocessed_lcms1.RData") -``` - -## Load unprocessed LC-MS/MS data - -Next, we will load the unprocessed LC-MS/MS data from the MetaboLights database: - -```{r, message=FALSE, warning=FALSE} -#' Load form the MetaboLights Database -param <- MetaboLightsParam(mtblsId = "MTBLS8735", - assayName = paste0("a_MTBLS8735_LC-MSMS_positive_", - "hilic_metabolite_profiling.txt"), - filePattern = ".mzML") - -lcms2 <- readMsObject(MsExperiment(), - param, - keepOntology = FALSE, - keepProtocol = FALSE, - simplify = TRUE) - -``` - -We will adjust the `sampleData()` of the LC-MS/MS object to make it easier to -access: - -```{r} -#| tbl-cap: "Table 10. Samples from the LC-MS/MS data set." -#adjust sampleData -colnames(sampleData(lcms2)) <- c("sample_name", "derived_spectra_data_file", - "metabolite_asssignment_file", - "source_name", - "organism", - "blood_sample_type", - "sample_type", "age", "unit", "phenotype") - -#let's look at the updated sample data -sampleData(lcms2)[, c("derived_spectra_data_file", - "phenotype", "sample_name", "age")] |> - kable(format = "pipe") - -``` - -We will only keep the MS runs (not MS/MS) and remove pooled samples, focusing on -samples A and E that are common to both runs. - -```{r} -# Only keep MS run -lcms2 <- lcms2[!grepl("MSMS", sampleData(lcms2)$derived_spectra_data_file),] -``` - -Before alignment, ensure the retention time (RT) ranges match between the -datasets: - -```{r} -range(rtime(lcms1)) - -range(rtime(lcms2)) -``` - -We need to adjust the RT range for the LC-MS/MS object to match the LC-MS data: - -```{r, message=FALSE, warning=FALSE} -#' Filter the data to the same RT range as the LC-MS run -lcms2 <- filterRt(lcms2, range(rtime(lcms1))) -``` - -## Comparing chromatograms - -To evaluate retention time shifts, we'll plot the base peak chromatogram (BPC): - -```{r} -idx_A <- which(sampleData(lcms1)$sample_name == "A") -idx_E <- which(sampleData(lcms1)$sample_name == "E") -bpc1 <-chromatogram(lcms1[c(idx_A,idx_E)], aggregationFun = "max", - msLevel = 1) -bpc2 <- chromatogram(lcms2, aggregationFun = "max", msLevel = 1) -``` - -Compare run1 sample A with run2 sample A - -```{r ini-bpc1} -plot(bpc1[1,1], col = "#00000080", - main = "BPC sample A LC-MS vs A LC-MS/MS", lwd = 1.5, peakType = "none") -grid() -points(rtime(bpc2[1, 1]), intensity(bpc2[1, 1]), col = "#0000ff80", type = "l") -legend("topleft", col = c("#00000080", "#0000ff80"), - legend = c("LC-MS", "LC-MS/MS"), lty = 1, lwd = 2, horiz = TRUE, bty = "n") -``` - -Similarly, compare the BPC for sample E: - -```{r, ini-bpc2} -plot(bpc1[1, 2], col = "#00000080", - main = "BPC sample E LC-MS vs E LC-MS/MS", lwd = 1.5, peakType = "none") -grid() -points(rtime(bpc2[1, 2]), intensity(bpc2[1, 2]), col = "#0000ff80", type = "l") -legend("topleft", col = c("#00000080", "#0000ff80"), - legend = c("LC-MS", "LC-MS/MS"), lty = 1, lwd = 2, horiz = TRUE, bty = "n") -``` - -## Peak detection - -Perform peak detection and refining before alignment, as detailed in the -end-to-end vignette. The same setting were applied. - -```{r quick-proc, message=FALSE, warning=FALSE} -param <- CentWaveParam(peakwidth = c(1, 8), ppm = 15, integrate = 2) -lcms2 <- findChromPeaks(lcms2, param = param, chunkSize = 2) -param <- MergeNeighboringPeaksParam(expandRt = 2.5, expandMz = 0.0015, - minProp = 0.75) -lcms2 <- refineChromPeaks(lcms2, param = param, chunkSize = 2) -``` - -## Alignment - -Now, we will attempt to align these two samples with the previous dataset. The -first step is to extract landmark features (referred to as *lamas*). To achieve -this, we will identify the features present in every phenotype group of the -`lcms1` dataset. To do so, we will categorize (using `factor()`) our data by -`phenotype` and only retain the QC samples. This variable will be utilized to -filter the features using the `PercentMissingFilter` parameter within the -`filterFeatures()` function. Here, by setting `threshold = 0` we select the -features present in all QC samples. - -```{r} -f <- sampleData(lcms1)$phenotype -f[f != "QC"] <- NA -lcms1 <- filterFeatures(lcms1, PercentMissingFilter(threshold = 0, f = f), - filled = FALSE) -lcms1_mz_rt <- featureDefinitions(lcms1)[, c("mzmed","rtmed")] -head(lcms1_mz_rt) -nrow(lcms1_mz_rt) -``` - -This is what the `lamas` input should look like for alignment. In terms of how -this method works, the alignment algorithm matches chromatographic peaks from -the experimental data to the lamas, fitting a model based on this match to -adjust their retention times and minimize differences between the two datasets. - -Now we can define our `param` object `LamaParama` to prepare for the alignment. -Parameters such as `tolerance`, `toleranceRt`, and `ppm` relate to the matching -between chromatographic peaks and lamas. Other parameters are related to the -type of fitting generated between these data points. More details on each -parameter and the overall method can be found by searching `?adjustRtime`. Below -is an example using default parameters. - -```{r} -param <- LamaParama(lamas = lcms1_mz_rt, method = "loess", span = 0.5, - outlierTolerance = 3, zeroWeight = 10, ppm =20, - tolerance = 0, toleranceRt = 20, bs = "tp") -``` - -The `matchLamaChromPeaks()` function facilitates the assessment of how well the -`lamas` correspond with the chromatographic peaks in each file. We then extract -the matched results using the `matchedRtimes()` function. This will be used -later to evaluate the alignment. - -```{r} -param <- matchLamasChromPeaks(lcms2, param = param) -ref_vs_obs <- matchedRtimes(param) -``` - -Now we can adjust the retention time of the LC-MS/MS dataset using the -`adjustRtime()` function. - -```{r} -#' input into `adjustRtime()` -lcms2 <- adjustRtime(lcms2, param = param) -lcms2 <- applyAdjustedRtime(lcms2) -``` - -## Evaluation - -We extract the base peak chromatogram (BPC) of our aligned object: - -```{r, message=FALSE} -#' evaluate the results with BPC -bpc2_adj <- chromatogram(lcms2, aggregationFun = "max", - msLevel = 1) -``` - -### Visualizing Alignment Quality - -To evaluate the performance of our alignment process, we generate plots -comparing the alignment of the reference dataset (in black) with our LC-MS data -before (in red) and after (in blue) the adjustment. - -```{r, fig.height=6, fig.width=6} -#' BPC of sample A -par(mfrow = c(2, 1), mar = c(2.5, 2.5, 2.5, 0.5), mgp = c(1.5, 0.5, 0)) -plot(bpc1[1, 1], col = "#00000080", main = "Before Alignment", lwd = 1.5, - peakType = "none", xlab = NA) -grid() -points(rtime(bpc2[1,1]), intensity(bpc2[1,1]), - type = "l", - col = "#0000ff80") -legend("topleft", col = c("#00000080", "#0000ff80"), - legend = c("LC-MS", "LC-MS/MS"), lty = 1, lwd = 2, horiz = TRUE, bty = "n") - -plot(bpc1[1, 1], col = "#00000080", main = "After Alignment", lwd = 1.5, - peakType = "none", xlab = "rtime (s)") -grid() -points(rtime(bpc2_adj[1,1]), intensity(bpc2_adj[1,1]), - type = "l", - col = "#0000ff80") -``` - -```{r, fig.height=6, fig.width=6} -#' BPC of sample B -par(mfrow = c(2, 1), mar = c(2.5, 2.5, 2.5, 0.5), mgp = c(1.5, 0.5, 0)) -plot(bpc1[1, 2], col = "#00000080", main = "Before Alignment", lwd = 1.5, - peakType = "none", xlab = NA) -grid() -points(rtime(bpc2[1, 2]), intensity(bpc2[1, 2]), type = "l", - col = "#0000ff80") -legend("topleft", col = c("#00000080", "#0000ff80"), - legend = c("LC-MS", "LC-MS/MS"), lty = 1, lwd = 2, horiz = TRUE, bty = "n") - -plot(bpc1[1, 2], col = "#00000080", main = "After Alignment", lwd = 1.5, - peakType = "none", xlab = "rtime (s)") -grid() -points(rtime(bpc2_adj[1, 2]), intensity(bpc2_adj[1, 2]), type = "l", - col = "#0000ff80") -``` - -Although the overall matching is imperfect due to initial sample issues, certain -regions show significant improvement. The alignment of the signal’s start is -particularly well done. Specifically, the regions right before and after 150 -seconds show substantial improvement. - -Below is a visualization of the distribution of chromatographic peaks matched to -anchor peaks (Lamas) for Sample A. The red vertical lines represent the -positions of these matched peaks. - -```{r, message = FALSE, warning = FALSE} -#' BPC of the first sample with matches to lamas overlay -par(mfrow = c(1, 1)) -plot(bpc1[1, 1], col = "#00000080", main = "Distribution CP matched to Lamas", - lwd = 1.5, - peakType = "none") -points(rtime(bpc2_adj[1, 1]), intensity(bpc2_adj[1, 1]), type = "l", - col = "#0000ff80") -grid() -abline(v = ref_vs_obs[[1]]$obs, col = "#c4114510") -``` - -### Quantitative Evaluation of Alignment - -To quantitatively assess the quality of the alignment, we compute the distance -between the chromatographic peaks in our LC-MS data and the anchor peaks (Lamas) -both before and after the alignment. - -```{r} -# Extract data for sample 3 directly -ref_obs_sample_1 <- ref_vs_obs[["1"]] - -# Calculate distances before and after alignment -dist_before <- abs(ref_obs_sample_1$obs - ref_obs_sample_1$ref) -dist_after <- abs(chromPeaks(lcms2)[ref_obs_sample_1$chromPeaksId, - "rt"] - ref_obs_sample_1$ref) - -# Create a data frame for plotting -distances <- data.frame( - Distance = c(dist_before, dist_after), - Alignment = rep(c("Before", "After"), each = length(dist_before)) -) - -# Set factor levels for Alignment to ensure correct order -distances$Alignment <- factor(distances$Alignment, levels = c("Before", "After")) - -# Plot the violin plot -library(vioplot) -vioplot(Distance ~ Alignment, data = distances, xlab = "", - rectCol = "#c4114580", - lineCol = "white", - col="#17138fe8", - border = "white", - ylab = "Distance (s)", - main = "Distance to Anchor Peaks: Before vs. After Alignment") - -``` - -Furthermore, a more detailed examination of the matching and the model used for -fitting each file is possible. Numerical information can be obtained using the -`summarizeLamaMatch()` function. From this, the percentage of chromatographic -peaks utilized for alignment can be computed relative to the total number of -peaks in the file. Additionally, it is feasible to directly `plot()` the `param` -object for the file of interest, showcasing the distribution of these -chromatographic peaks along with the fitted model line. - -```{r} -#' access summary of matches and model information -summary <- summarizeLamaMatch(param) -summary - -#' coverage for each file -summary$Matched_peaks / summary$Total_peaks * 100 - -#' access the information on the model of for the first file -summary$Model_summary[[1]] - -#' Plot obs vs. lcms1 with fitting line -plot(param, index = 1L, main = "ChromPeaks versus Lamas for sample A", - colPoint = "red") -abline(0, 1, lty = 3, col = "grey") -grid() -``` - -## Conclusion - -This tutorial demonstrated how to align LC-MS and LC-MS/MS datasets to correct -retention time shifts, crucial for handling data from different runs or -platforms. We preprocessed the data, detected chromatographic peaks, and used -landmark features (lamas) from QC samples to adjust retention times via the -adjustRtime() function. Visual comparisons of base peak chromatograms before and -after alignment, along with distance calculations, showed clear improvements in -RT synchronization. - -Ultimately, aligning chromatographic data ensures that subsequent analyses, such -as feature extraction and statistical comparisons, are based on consistent time -points, improving data quality and reliability. This tutorial outlined an -end-to-end workflow that can be adapted to various LC-MS-based metabolomics -studies, helping researchers manage retention time variation effectively.