Fig3_spatialProtSignatures.Rmd

---
title: "Figure 3 Spatial Signatures"
output: html_document
date: "2023-05-05"
---

The goal of this document is to combine proteomic and phosphoproteomic measurements to characterize the spleen data.

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
source("spleenDataFormatting.R")
source('spatialProtUtils.R')
library(pheatmap)
library(PCSF)
```

## First collect and annotate voxels by signature

First, let's combine the signatures to see what we get.


```{r combined}
# Run PCA on the spatial voxel data and on the sorted data; the sorted data is
# additionally passed through spatialDiffEx() with its default arguments.
pumap <- scater::runPCA(spat.prot)
fullmap <- spatialDiffEx(scater::runPCA(global.sorted))

# Pull the limma statistics for the pulp annotation out of the row metadata
# and give the columns short names.
full <- as.data.frame(rowData(fullmap)) %>%
  dplyr::select(
    featureID = 'X',
    logFC = 'pulpAnnotation.limma.logFC',
    adj.P.Val = 'pulpAnnotation.limma.adj.P.Val',
    AveExpr = 'pulpAnnotation.limma.AveExpr'
  )

# Signature proteins: significant (adj. p < 0.01), at least a two-fold change
# on the log scale, and average expression above 1. Positive fold changes are
# used below as the white pulp signature, negative ones as the red pulp one.
upsig <- subset(full, adj.P.Val < 0.01 & logFC > 1 & AveExpr > 1)
downsig <- subset(full, adj.P.Val < 0.01 & logFC < -1 & AveExpr > 1)

# Score every spatial voxel against both signatures.
pumap <- calcSigScore(pumap, rownames(downsig), 'RedPulp')
pumap <- calcSigScore(pumap, rownames(upsig), 'WhitePulp')

print(paste(
  'White pulp signature is', nrow(upsig),
  'proteins and red pulp is', nrow(downsig), 'proteins'
))

```
## Basic diffex and functional enrichment

Based on the annotated voxels, let's compute differentially expressed proteins and phosphosites and store them in a table.


```{r differential expression}

# Label each voxel by its signature score: 'red' when RedPulp > 0.5, otherwise
# 'white' when WhitePulp > 0.5, otherwise 'None' (red wins when both exceed 0.5).
newVals <- colData(pumap) %>%
  as.data.frame() %>%
  mutate(isRed = RedPulp > 0.5, isWhite = WhitePulp > 0.5) %>%
  mutate(pulp = ifelse(isRed, 'red', ifelse(isWhite, 'white', 'None')))

colData(pumap)[['pulp']] <- newVals$pulp

# Differential expression between the pulp labels. rowData() is called without
# the magrittr `.` placeholder: the native pipe already supplies the left-hand
# side as the first argument, and `|> rowData(.)` would expand to
# rowData(x, .), passing an undefined `.` as a second argument.
protDiff <- spatialDiffEx(pumap, column = 'pulp', feat = 'Protein') |>
  rowData() %>%
  as.data.frame() %>%
  dplyr::select(
    featureID = 'Protein',
    logFC = 'pulp.limma.logFC',
    adj.P.Val = 'pulp.limma.adj.P.Val',
    AveExpr = 'pulp.limma.AveExpr'
  )

# Significant proteins with a positive fold change, ordered by effect size.
sigProts <- subset(protDiff, adj.P.Val < 0.05) |>
  subset(logFC > .5) |>
  arrange(logFC)

write.table(protDiff, file = 'proteomicsDiffExWhitevsRed.csv', sep = ',',
            row.names = FALSE)

# drop = FALSE keeps a proteins-by-voxels matrix even if only one protein
# survives the filters (a bare vector would be turned into a single column).
pheatmap(as.matrix(exprs(pumap)[intersect(rownames(sigProts), rownames(rowData(pumap))), ,
                                drop = FALSE]),
         annotation_col = as.data.frame(colData(pumap))[, c('WhitePulp', 'pulp')],
         main = 'White pulp upregulated', cellheight = 10,
         filename = 'suppFullp0.05logfc.5white.pdf')


## now we remove ribosomal proteins
# A logical mask is used instead of negative indices: x[-grep(...)] returns an
# empty vector when nothing matches, which would silently drop every protein.
# NOTE(review): '^RP' also matches non-ribosomal symbols that start with RP
# (ribosomal symbols are usually RPL*/RPS*) -- confirm the pattern is intended.
allprots <- rownames(sigProts)
allprots <- allprots[!grepl('^RP', allprots)]
pheatmap(as.matrix(exprs(pumap)[intersect(allprots, rownames(rowData(pumap))), ,
                                drop = FALSE]),
         clustering_distance_cols = 'correlation', clustering_distance_rows = 'correlation',
         clustering_method = 'ward.D2',
         annotation_col = as.data.frame(colData(pumap))[, c('WhitePulp', 'pulp')],
         main = 'White pulp upregulated', cellheight = 10,
         filename = 'fig3Up_noribo_p0.05logfc.5white.pdf')

###let's do heatmaps of these proteins 


```

## Now we can look for biological pathways

```{r standard enrichment}
library(leapR)
library(org.Hs.eg.db)
data('krbpaths')


#### Can we use MCP-counter?
# Each list element is a cell-population name followed by its marker symbols.
mcptab <- read.table('mcp-genelist.txt', sep = '\t', header = TRUE, check.names = FALSE)
mcplist <- lapply(unique(mcptab$`Cell population`), function(x) {
  c(x, mcptab$`HUGO symbols`[which(mcptab$`Cell population` == x)])
})
maxlength <- max(vapply(mcplist, length, integer(1)))

# Pad every set with "" to a common width so the sets can be row-bound into a
# character matrix, then assemble the list that is handed to leapR as `geneset`.
# NOTE(review): `sizes` counts the leading set name as well as the genes, and
# the name occupies the first matrix column -- confirm this is the layout leapR
# expects for a 'geneset_data' object.
mcpmat <- do.call(rbind,
                  lapply(mcplist, function(x) c(x, rep("", maxlength - length(x)))))
mcp.gl <- list(names = unique(mcptab$`Cell population`), desc = "",
               sizes = vapply(mcplist, length, integer(1)), matrix = mcpmat)
class(mcp.gl) <- c('geneset_data', 'list')

mcp.enrich <- leapR::leapR(geneset = mcp.gl, enrichment_method = 'enrichment_in_sets',
                           id_column = 'featureID', datamatrix = protDiff,
                           primary_columns = 'logFC', greaterthan = TRUE, threshold = 0.5) |>
  subset(ingroup_n > 1)

print(mcp.enrich)

### now try another matrix, LM22
# Long format: one row per gene / cell type pair (columns 2-23 hold the cell types).
lm22 <- read.table('LM22.txt', sep = '\t', header = TRUE, check.names = FALSE) |>
  tidyr::pivot_longer(cols = c(2:23), names_to = 'cellType', values_to = 'weight')

# Attach each gene's maximum and median weight across cell types to every row.
# The join key is spelled out; 'Gene symbol' is the only column the two tables share.
lm22 <- lm22 |>
  group_by(`Gene symbol`) |>
  summarize(maxVal = max(weight), medVal = median(weight)) |>
  right_join(lm22, by = 'Gene symbol')

# Assign each gene to the cell type(s) where its weight is maximal.
# overMed flags weights above the gene's median (it was previously computed as
# weight < medVal, the opposite of its name); it is not used for filtering here.
dlm22 <- lm22 |>
  mutate(overMed = weight > medVal, max = weight == maxVal) |>
  subset(max)

lmlist <- lapply(unique(dlm22$cellType),
                 function(x) c(x, dlm22$`Gene symbol`[which(dlm22$cellType == x)]))
lmsize <- vapply(lmlist, length, integer(1))
maxlength <- max(lmsize)

# Same padded-matrix layout as the MCP-counter sets above.
lmmat <- do.call(rbind,
                 lapply(lmlist, function(x) c(x, rep("", maxlength - length(x)))))
lm22.gl <- list(names = unique(dlm22$cellType), desc = "", sizes = lmsize, matrix = lmmat)
class(lm22.gl) <- c('geneset_data', 'list')

lm22.enrich <- leapR::leapR(geneset = lm22.gl, enrichment_method = 'enrichment_in_sets',
                            id_column = 'featureID', datamatrix = protDiff,
                            primary_columns = 'logFC', greaterthan = TRUE, threshold = 0) |>
  subset(ingroup_n > 1)

print(lm22.enrich)

## what do the curated pathways in krbpaths say?
prot.enrich <- leapR::leapR(geneset = krbpaths,
                            enrichment_method = 'enrichment_in_sets', id_column = 'featureID',
                            datamatrix = protDiff, primary_columns = 'logFC',
                            greaterthan = TRUE, threshold = 0.5) |>
  subset(ingroup_n > 1)

sig.enrich <- prot.enrich %>%
  subset(BH_pvalue < 0.05)

print(sig.enrich)

## how about GO?
# The SYMBOL -> Entrez lookup that used to be built here was never used (and
# its name shadowed purrr::map), so it is no longer created.
gosigs <- leapR::read_gene_sets('GO_Biological_Process_2021.txt')

go.enrich <- leapR::leapR(geneset = gosigs,
                          enrichment_method = 'enrichment_in_sets', id_column = 'featureID',
                          datamatrix = protDiff, primary_columns = 'logFC',
                          greaterthan = TRUE, threshold = 0.5) |>
  subset(ingroup_n > 1)

sig.go <- go.enrich %>%
  subset(BH_pvalue < 0.05)
print(sig.go)


##figure out what to do with this?
```
## Plot enrichment

Now we need to write a function to plot enrichment status

```{r plot enrichment}

# Bar plot of the strongest enrichment results.
#
# enrich_res: data frame of enrichment results with one row per pathway, the
#   pathway name stored in the row names, and at least the columns `oddsratio`
#   and `pvalue`.
#
# Returns a ggplot object: one horizontal bar per pathway (at most 20, largest
# odds ratio at the top), bar length = odds ratio, fill = -log10(pvalue).
# An input with zero rows yields an empty plot.
plotResult <- function(enrich_res) {
  # Attached here so the ggplot2 functions below are available even when the
  # setup code did not load the package.
  library(ggplot2)

  # head() instead of odds[1:min(20, n), ]: with zero rows, 1:0 is c(1, 0) and
  # the old indexing fabricated a single all-NA row.
  top <- enrich_res |>
    arrange(desc(oddsratio)) |>
    head(20) |>
    tibble::rownames_to_column('Pathway') |>
    mutate(logPval = -log10(pvalue))

  # Reverse the level order so that, after coord_flip(), the pathway with the
  # largest odds ratio is drawn at the top.
  top$Pathway <- factor(top$Pathway, levels = rev(top$Pathway))

  ggplot(top, aes(x = Pathway, y = oddsratio, fill = logPval)) +
    geom_bar(stat = 'identity') +
    coord_flip()
}

# One bar plot per enrichment table: GO terms and krbpaths pathways.
goplot <- plotResult(sig.go)
pathplot <- plotResult(sig.enrich)

# Stack the pathway plot above the GO plot in a single column.
p <- cowplot::plot_grid(plotlist = list(pathplot, goplot), ncol = 1)

p

ggsave('allEnrich.pdf', plot = p, width = 10, height = 8)
```

## Focus on enriched PD1 signaling
There are four genes in the PD1 signaling pathway; let's plot those.

```{r PD1}

# PD1 signaling: split the comma-separated in-group members of the enriched
# pathway into individual protein names and draw one feature grid per protein.
prots <- unlist(
  stringr::str_split(sig.enrich['REACTOME_PD1_SIGNALING', 'ingroupnames'], ', ')
)
allfigs <- lapply(prots, function(prot) {
  plotFeatureGrid(pumap, prot, prot, 'pulp')
})

gp <- cowplot::plot_grid(plotlist = allfigs, labels = prots, ncol = 2)
gp

ggsave('pd1Signaling.pdf', gp, width = 12, height = 8)

# Same treatment for the ZAP70 translocation pathway.
prots <- unlist(
  stringr::str_split(
    sig.enrich['REACTOME_TRANSLOCATION_OF_ZAP70_TO_IMMUNOLOGICAL_SYNAPSE', 'ingroupnames'],
    ', '
  )
)
allfigs <- lapply(prots, function(prot) {
  plotFeatureGrid(pumap, prot, prot, 'pulp')
})

gp <- cowplot::plot_grid(plotlist = allfigs, labels = prots, ncol = 2)
gp

ggsave('zap700Transloc.pdf', gp, width = 12, height = 8)

```

## cell type specific expression

The only real cell-type enrichment was in B cells/naive B cells, which makes sense, but there was no other.
```{r cell types}

##now many are in the dataset!
## how many markers are in the dataset?
# Flag LM22 genes that are measured in the spatial proteomics data, both in the
# max-weight assignment table (dlm22) and in the full long table (rlm22).
dlm22 <- dlm22 |>
  mutate(inProt = `Gene symbol` %in% rownames(exprs(pumap)))
rlm22 <- lm22 |>
  mutate(inProt = `Gene symbol` %in% rownames(exprs(pumap)))

# Measured markers only; print the number of rows per cell type.
expLM22 <- subset(rlm22, inProt)
print(
  expLM22 |>
    group_by(cellType) |>
    summarize(n())
)
## only 28 markers of the 547 are expressed in this dataset

# Heatmap of the measured LM22 markers across voxels.
# NOTE(review): the title still reads 'White pulp upregulated' although the
# rows are LM22 markers -- confirm whether that is intended.
pheatmap(
  as.matrix(exprs(pumap)[intersect(expLM22$`Gene symbol`, rownames(rowData(pumap))), ]),
  clustering_distance_cols = 'correlation',
  clustering_distance_rows = 'correlation',
  clustering_method = 'ward.D2',
  annotation_col = as.data.frame(colData(pumap))[, c('WhitePulp', 'pulp')],
  main = 'White pulp upregulated',
  cellheight = 10,
  filename = 'fig3lm22Markers.pdf'
)

### is MCP-counter better?
# Keep only the MCP-counter markers that are measured.
expMcp <- mcptab |>
  mutate(inProt = `HUGO symbols` %in% rownames(exprs(pumap))) |>
  subset(inProt)

### 11 out of 111 mcpcounter markers are present
# One feature grid per cell population, built from all of its measured markers;
# the population name (spaces removed) is passed on as the third argument.
figs <- lapply(unique(expMcp$`Cell population`), function(ctype) {
  print(ctype)
  prots <- expMcp$`HUGO symbols`[which(expMcp$`Cell population` == ctype)]
  plotFeatureGrid(pumap, prots, gsub(" ", '', ctype), 'pulp')
})

imp <- cowplot::plot_grid(
  plotlist = figs,
  labels = unique(expMcp$`Cell population`),
  nrow = 2
)
ggsave('fig3_immuneCells.pdf', imp, width = 12)
```

## Networks show what we are missing with enrichment - mechanism? 


Let's try to build biological networks using the signatures.

```{r build networks,echo=F, warning=F}
library(PCSF)
data("STRING")
ppi <- construct_interactome(STRING)

# Prize vector for the signature proteins: absolute log fold change per protein.
whiteweights <- abs(sigProts$logFC)
names(whiteweights) <- sigProts$featureID

# Signed log fold change for every tested protein.
whiteprots <- protDiff$logFC
names(whiteprots) <- protDiff$featureID
#white.net<-buildNetwork(sig.prot.vals=whiteweights,all.prot.vals=whiteprots,sig.phos.vals=c(),
            #            beta=2,nrand=500,featName='whitePulpDiffex')

#wgraph=white.net$graph
# The network is read back from a GML file instead of being rebuilt (the
# buildNetwork() call above is commented out).
wgraph <- igraph::read_graph('whitePulpDiffexnetwork.gml', format = 'gml')
plot.PCSF(wgraph)

# Namespace-qualified so that the call cannot resolve to the unrelated
# as_data_frame() exported by dplyr/tibble, whichever package was attached last.
nodes <- igraph::as_data_frame(wgraph, what = 'vertices')
# nrow(), not length(): length() of a data frame is its number of columns.
print(paste('built network with', nrow(nodes), 'nodes'))
```
CD4, CD72, CD209, and CD19 are not measured but are implicated in the network. Let's see how they look in the bulk data.


```{r bulk conf}

# Network nodes that are present in the sorted data.
prots <- intersect(rownames(nodes), rownames(exprs(fullmap)))

# Heatmap of those nodes in the sorted data, annotated with node attributes
# (rows) and sample metadata (columns).
pheatmap(
  exprs(fullmap)[prots, ],
  annotation_row = nodes[, c('type', 'nodeType')],
  annotation_col = as.data.frame(colData(fullmap)),
  cellwidth = 10,
  cellheight = 10,
  filename = 'validationHeamap.pdf'
)

# Long-format expression for one dataset, joined to its sample metadata and to
# the network node table; only rows that matched a node (non-missing `type`)
# are kept. Join keys are left to dplyr's default (shared column names).
nodeExpression <- function(dataset) {
  sampleInfo <- tibble::rownames_to_column(as.data.frame(colData(dataset)), 'Patient')
  expToLongForm(dataset, rowname = 'name') |>
    dplyr::rename(Patient = 'Voxel') |>
    left_join(sampleInfo) |>
    left_join(nodes) |>
    subset(!is.na(type))
}

# Spatial voxels: expression by node type, split by pulp label.
spres <- nodeExpression(pumap)
spatex <- ggplot(spres, aes(x = type, y = LogRatio, fill = pulp)) +
  geom_boxplot() +
  scale_fill_viridis_d()

## same view for the sorted data, split by pulp annotation
res <- nodeExpression(fullmap)
insorted <- ggplot(res, aes(x = type, y = LogRatio, fill = pulpAnnotation)) +
  geom_boxplot() +
  scale_fill_viridis_d()


p <- cowplot::plot_grid(spatex, insorted, labels = c('Spatial', 'Sorted'), nrow = 2)
p

ggsave('expressionValidation.pdf', p)
```