continues notebook for intersecting phospho sites and total peptides

MRCToxBioinformatics · Feb 6, 2023 · 596e62a · 596e62a
1 parent 2a81ded
commit 596e62a
Show file tree

Hide file tree

Showing 9 changed files with 1,340 additions and 1,808 deletions.
diff --git a/Markdowns/TMT_phospho.Rmd b/Markdowns/TMT_phospho.Rmd
@@ -1,6 +1,6 @@
 ---
-title: "Tandem Mass Tags"
-subtitle: "QC PSM-level quantification and summarisation to protein-level abundance"
+title: "Phosphoproteomics using Tandem Mass Tags"
+subtitle: "QC PSM-level quantification, filtering and summarisation to protein-level abundance"
 author: "Tom Smith"
 date: "`r Sys.Date()`"
 output:
@@ -10,7 +10,7 @@ output:
 bibliography: bib.json
 ---
 
-```{r setup, include = FALSE}
+```{r, message=FALSE, warning=FALSE}
 knitr::opts_chunk$set(
   collapse = TRUE,
   comment = "#>"
@@ -19,7 +19,6 @@ knitr::opts_chunk$set(
 
 ### Preamble
 
-....
 
 ```{r, message=FALSE}
 library(camprotR)
@@ -117,7 +116,7 @@ We can also add the sequence around the phosphosite, which can be useful to e.g
 psm_phospho_data_parsed <- add_site_sequence(psm_phospho_data_parsed, protein_fasta_inf)
 ```
 
-Finally, we need to add the positions of the peptide for the total peptides, so we can intersect the positions of the phosphosite and total peptides for the statistical testing. We can use `add_peptide_positions` for this, though note that this code chunk takes a long time to run.
+Finally, we need to add the positions of the peptide for the total peptides, so we can intersect the positions of the phosphosite and total peptides for the statistical testing. We can use `add_peptide_positions` for this, though note that this code chunk takes a long time to run (the function needs to be optimised).
 
 ```{r}
 psm_total_data_flt <- add_peptide_positions(psm_total_data_flt, proteome_fasta=protein_fasta_inf)
@@ -139,9 +138,12 @@ colnames(psm.total.e) <- gsub('Abundance.', '', colnames(psm.total.e))
 # we don't have 'phenotype' data to add so we just define the 
 # 'expression' data and 'feature' data
 
-psm.total.p <- data.frame(spike=rep(factor(c('x1', 'x2', 'x6')), c(4,3,3)), row.names=colnames(psm.total.e))
+psm.total.p <- data.frame(spike=rep(factor(c('x1', 'x2', 'x6')), c(4,3,3)),
+                          condition=rep(1:3, c(4,3,3)),
+                          row.names=colnames(psm.total.e))
   
 psm.total <- MSnbase::MSnSet(exprs = psm.total.e, fData = psm.total.f, pData=psm.total.p)
+pData(psm.total)
 ```
 
 ```{r}
@@ -158,9 +160,12 @@ colnames(psm.phospho.e) <- gsub('Abundance.', '', colnames(psm.phospho.e))
 # we don't have 'phenotype' data to add so we just define the 
 # 'expression' data and 'feature' data
 
-psm.phospho.p <- data.frame(spike=rep(factor(c('x1', 'x6')), c(4,6)), row.names=colnames(psm.phospho.e))
+psm.phospho.p <- data.frame(spike=rep(factor(c('x1', 'x6')), c(4,6)),
+                            condition=rep(1:3, c(4,3,3)),
+                            row.names=colnames(psm.phospho.e))
   
 psm.phospho <- MSnbase::MSnSet(exprs = psm.phospho.e, fData = psm.phospho.f, pData=psm.phospho.p)
+
 ```
 
 ```{r}
@@ -202,7 +207,7 @@ for(x in names(psm_res)){
 }
 ```
 
-We can also look into this relationship at the tag level using `plot_missing_SN_per_sample`. In this case, there is no tag which appears to have a high proportion of missing values when signal:noise \> 5. If there were, this may warrant further exploration, e.g was one of the sample preparations inadequate such that fewer peptides were labeled?
+We can also look into this relationship at the tag level using `plot_missing_SN_per_sample`. In this case, there is no tag which appears to have a high proportion of missing values when signal:noise > 5. If there were, this may warrant further exploration, e.g was one of the sample preparations inadequate such that fewer peptides were labeled?
 
 ```{r}
 for(x in names(psm_res)){
@@ -216,26 +221,53 @@ for(x in names(psm_res)){
 
 ```
 
-Based on the above, we will filter the PSMs to only retain those with S:N > 10 using `filter_TMT_PSMs`. Using the same function, we will also remove PSMs with interference/co-isolation >50%.
+Based on the above, we will filter the PSMs to only retain those with S:N > 10 using `filter_TMT_PSMs`. Using the same function, we will also remove PSMs with interference/co-isolation >10%.
 
 ```{r}
 psm_filt_sn_int <- psm_res %>% lapply(function(x){
-  filter_TMT_PSMs(x, inter_thresh = 50, sn_thresh = 5)})
+  filter_TMT_PSMs(x, inter_thresh = 10, sn_thresh = 5)})
 ```
+Since we have merged together material from two species, we will perform a couple more 
 
+Finally, we will also filter our PSMs using the Delta.Score, e.g the difference between the score for the top two hits. Where this is 
 ```{r}
-phospho_sites <- psm_filt_sn_int$Phospho %>%
+
+psm_filt_sn_int_delta <- psm_filt_sn_int %>% lapply(function(x){
+  flt <- x[fData(x)$Delta.Score>0.2,]
+  camprotR:::message_parse(fData(flt), column='Master.Protein.Accessions', message='Delta Score filtering')
+  return(flt)
+})
+
+
+```
+
+
+For a typical phospho TMT proteomics experiment, it would likely be reasonable to use a more relaxed filter for the interference (e.g 50%) and to not filter by Delta Score at all.
+
+```{r}
+phospho_sites <- psm_filt_sn_int_delta$Phospho %>%
     MSnbase::combineFeatures(
-      groupBy = paste(fData(psm_filt_sn_int$Phospho)$Master.Protein.Accessions,
-                      fData(psm_filt_sn_int$Phospho)$ptm_position, sep='_'),
+      groupBy = paste(fData(psm_filt_sn_int_delta$Phospho)$Master.Protein.Accessions,
+                      fData(psm_filt_sn_int_delta$Phospho)$ptm_position, sep='_'),
       method = 'sum')
 
+
+print(nrow(phospho_sites))
 ```
 
+Note that our filtering has removed a lot of PSMs, especially for the phosphosites. We've gone from:
+- 5582 PSMs in the input file
+- 5335 PSMs after removing contaminants and PSMs without a unique master protein
+- 4301 PSMs after removing those without a phosphorylation site
+- 2852 PSMs after removing those with a phosphoRS score < 75
+- 1524 PSMs after removing those with interference > 10 or Signal:Noise < 10
+- 1008 PSMs after removing those with Delta Score < 0.2.
+- 666 phosphosites after combining PSMs for the same phosphosite
+
 ```{r}
-total <- psm_filt_sn_int$Total %>%
+total <- psm_filt_sn_int_delta$Total %>%
     MSnbase::combineFeatures(
-      groupBy = fData(psm_filt_sn_int$Total)$Sequence,
+      groupBy = fData(psm_filt_sn_int_delta$Total)$Sequence,
       method = 'sum')
 ```
 

diff --git a/Markdowns/TMT_phospho.html b/Markdowns/TMT_phospho.html
diff --git a/Markdowns/TMT_phospho_stats.Rmd b/Markdowns/TMT_phospho_stats.Rmd
@@ -0,0 +1,163 @@
+---
+title: "Phosphoproteomics using Tandem Mass Tags"
+subtitle: "Intersecting phosphosites and total peptides and statistical testing"
+author: "Tom Smith"
+date: "`r Sys.Date()`"
+output:
+  html_document:
+    code_folding: show
+  pdf_document: default
+bibliography: bib.json
+---
+```{r, message=FALSE, warning=FALSE}
+library(camprotR)
+library(MSnbase)
+library(ggplot2)
+library(tidyr)
+library(dplyr)
+library(here)
+library(limma)
+library(uniprotREST)
+```
+
+
+```{r}
+total <- readRDS(here('results/total.rds'))
+phospho_sites <- readRDS(here('results/phospho_sites.rds'))
+```
+
+
+Below, we identify the total peptides that overlap each phosphosite. Note that each phosphosite may intersect multiple total peptides (due to missed cleavages) and each total peptide may intersect multiple phosphosites (due to multiple phosphosites per peptide). The code chunk takes a while to run..
+```{r}
+phospho_f <- phospho_sites %>% fData() %>% tibble::rownames_to_column('ID')
+total_f <- total %>% fData() %>% tibble::rownames_to_column('ID')
+
+phospho_peptide_to_total_peptides <- vector('list', length=length(phospho_f$ID))
+names(phospho_peptide_to_total_peptides) <- phospho_f$ID
+n <- 0
+for(phospho_id in phospho_f$ID){
+  row <- phospho_f %>% filter(ID==phospho_id)
+  
+  n <- n + 1
+  ptm_positions <- as.numeric(strsplit(row$ptm_position, split='\\|')[[1]])
+  
+  total_peptide_ids <- total_f %>%
+    filter(Master.Protein.Accessions %in% row$Master.Protein.Accessions,
+           peptide_start<=min(ptm_positions),
+           peptide_end>=max(ptm_positions)) %>%
+    pull(ID)  
+  
+  phospho_peptide_to_total_peptides[[row$ID]] <- total_peptide_ids
+}
+
+```
+
+Now, how many overlapping features. Note that for the phospho and total to be considered intersecting, there must be at least one total peptide which overlaps the phosphosite. Out of the `r length(phospho_f$ID)` phosphosites, only `r sum(sapply(phospho_peptide_to_total_peptides, function(x) length(x)>0))` have at least one overlapping total peptide.
+```{r}
+print(length(phospho_f$ID))
+print(table(sapply(phospho_peptide_to_total_peptides, function(x) length(x)>0)))
+print(table(sapply(phospho_peptide_to_total_peptides, length)))
+```
+limma requires a single row per feature with a columns per sample. Here, the phospho and total are separate samples so we need to combine them into a single row.
+```{r}
+combined_exprs <- phospho_peptide_to_total_peptides %>% names() %>% lapply(function(x){
+  total_peptides <- phospho_peptide_to_total_peptides[[x]]
+  
+  if(length(total_peptides)>0) {
+    exprs_values <- c(exprs(phospho_sites[x,]),
+                      colSums(exprs(total[total_peptides,]), na.rm=TRUE))
+    exprs_values <- unname(exprs_values)
+    return(exprs_values)
+  } else { return(NULL) }
+})
+names(combined_exprs) <- names(phospho_peptide_to_total_peptides)
+```
+
+Create the completed matrix of phospho and total.
+```{r}
+
+combined_exprs <- combined_exprs[sapply(combined_exprs, function(x) !is.null(x))]
+all_combined_exprs <- bind_rows(combined_exprs) %>% t() %>% as.matrix
+
+rownames(all_combined_exprs) <- names(combined_exprs)
+
+colnames(all_combined_exprs) <- c(paste0(colnames(phospho_sites), '_phospho'),
+                                  paste0(colnames(total), '_total'))
+
+```
+Inspect the combined quantification matrix. 
+```{r}
+head(all_combined_exprs) #
+```
+
+```{r}
+all_combined_fdata <- fData(phospho_sites[rownames(all_combined_exprs),])[
+  ,c('Master.Protein.Accessions', 'ptm_position')]
+all_combined_fdata$n_total <- sapply(rownames(all_combined_exprs),
+                                     function(x) length(phospho_peptide_to_total_peptides[[x]]))
+
+all_combined_pdata <- rbind((pData(phospho_sites) %>% mutate(type='phospho')),
+                            (pData(total) %>% mutate(type='total')))
+rownames(all_combined_pdata) <- colnames(all_combined_exprs)
+all_combined_res <- MSnSet(as.matrix.data.frame(all_combined_exprs),
+                           all_combined_fdata,
+                           all_combined_pdata)
+
+head(fData(all_combined_res))
+pData(all_combined_res)
+```
+
+Below, we subset to the first 2 'conditions', where condition 1 = tags 1-4 (x1 phospho; x1 total) and condition 2 = tags 5-7 (x6 phospho; x2 total). 
+The ground truth for the difference between the first 4 tags and the next 3 is therefore a 3-fold increase in phosphorylation for the yeast proteins and reduced phosphorylation for the human proteins. For the human proteins, given the amounts of spike in yeast and balancing human proteins to make up the labelled material, we expect a 30% drop in phosphorylation.
+```{r}
+pairwise_comparison_combined_res <- all_combined_res[,pData(all_combined_res)$condition %in% 1:2]
+pData(pairwise_comparison_combined_res)
+```
+
+Below, we perform the limma analysis
+```{r}
+dat <- exprs(pairwise_comparison_combined_res) %>% log(base=2)
+
+type <- factor(pData(pairwise_comparison_combined_res)$type, level=c('total', 'phospho'))
+condition <- factor(pData(pairwise_comparison_combined_res)$condition, levels=1:2)
+
+study.design <- model.matrix(~type*condition)
+
+fit <- lmFit(dat, study.design)
+fit <- eBayes(fit, trend=TRUE)
+
+limma.results <- topTable(fit, coef = colnames(fit$coefficients)[4], n = Inf, confint=TRUE)
+limma.results$sigma <- fit$sigma
+
+```
+
+
+We would like to add information about the species to make sure the fold-changes are in the expected direction
+```{r}
+limma.results$protein <- rownames(limma.results) %>%
+  strsplit(split='_') %>%
+  sapply('[[', 1) 
+
+species_res <-  uniprot_map(
+  ids = unique(limma.results$protein),
+  from = "UniProtKB_AC-ID",
+  to = "UniProtKB",
+  fields = "organism_name") %>%
+  mutate(species=recode(Organism,
+                        "Saccharomyces cerevisiae (strain ATCC 204508 / S288c) (Baker's yeast)"="S.cerevisiae",
+                        "Homo sapiens (Human)"="H.sapiens"))
+```
+
+```{r}
+limma.results %>%
+  merge(species_res, by.x='protein', by.y='From') %>%
+  ggplot(aes(logFC, -log10(P.Value), fill=adj.P.Val<0.01)) +
+  geom_point(pch=21, size=2, colour='grey70') +
+  facet_wrap(~species) +
+  theme_camprot(base_size=15, base_family='sans', border=FALSE) +
+  theme(strip.background=element_blank()) +
+  xlab('Log2 fold change') +
+  ylab('-log10(p-value)') +
+  scale_fill_manual(values=c('grey90', get_cat_palette(2)[2]), name='FDR < 1%')
+```
+OK, so the direction of change is as expected (reduced for human, increased for yeast), but only a subset of the fold-changes reach a 1% FDR threshold for significance.
diff --git a/Markdowns/TMT_phospho_stats.html b/Markdowns/TMT_phospho_stats.html