diff --git a/mRNAData/mRNADataSetsCLI.py b/mRNAData/mRNADataSetsCLI.py index fa0caea9..9e8a1eab 100644 --- a/mRNAData/mRNADataSetsCLI.py +++ b/mRNAData/mRNADataSetsCLI.py @@ -19,10 +19,10 @@ def main(): dat = cptac.Brca() elif opts.type.lower() == 'ccrcc': dat = cptac.Ccrcc() - elif opts.type.lower() == 'colon': - dat = cptac.Colon() - elif opts.type.lower() == 'endometrial': - dat = cptac.Endometrial() + elif opts.type.lower() == 'coad': + dat = cptac.Coad() + elif opts.type.lower() == 'ucec': + dat = cptac.Ucec() elif opts.type.lower() == 'gbm': dat = cptac.Gbm() elif opts.type.lower() == 'hnscc': @@ -32,7 +32,9 @@ def main(): elif opts.type.lower() == 'luad': dat = cptac.Luad() elif opts.type.lower() == 'ovarian': - dat = cptac.Ovarian() + dat = cptac.Ov() + elif opts.type.loewr() == 'pdac': + dat = cptac.Pdac() else: exit() df = dat.get_transcriptomics() diff --git a/metrics/compare-matrices/compareSigMatrices.py b/metrics/compare-matrices/compareSigMatrices.py index 5abdebb9..14f2dc24 100644 --- a/metrics/compare-matrices/compareSigMatrices.py +++ b/metrics/compare-matrices/compareSigMatrices.py @@ -19,10 +19,17 @@ def get_cor_stat(dat, prot_markers): the correlation of the entitites for a subset of proteins ''' + # we have to add data source + sources = dat.list_data_sources() + psource = [row['Available sources'].split(',')[0] for index,row in sources.iterrows() if row['Data type']=='proteomics'] + tsource = [row['Available sources'].split(',')[0] for index,row in sources.iterrows() if row['Data type']=='transcriptomics'] + genes = [p for p in prot_markers['NAME']] #get the merged data frame from cptac data merged = dat.join_omics_to_omics(df1_name='proteomics', \ df2_name='transcriptomics', \ - genes1=prot_markers['NAME'], genes2=prot_markers['NAME']) + df1_source=psource[0],\ + df2_source=tsource[0],\ + genes1=genes, genes2=genes) if merged.columns.nlevels == 2: merged.columns = merged.columns.droplevel(1) @@ -32,16 +39,18 @@ def get_cor_stat(dat, prot_markers): prots['Name'] = prots.index prots = prots.melt(id_vars="Name", var_name='id', value_name='prot') prots['Gene'] = prots['id'].str.split('_', expand=True)[0] - + prots = prots.drop('id',axis=1) + trans = merged.loc[:, merged.columns.str.contains('transcriptomics')] trans['Name'] = trans.index trans = trans.melt(id_vars="Name", var_name='id', value_name='trans') trans['Gene'] = trans['id'].str.split('_', expand=True)[0] + trans=trans.drop('id',axis=1) #remerge the proteins and transcripts and compute spearman rank correlation both_df = pandas.merge(prots, trans, how='inner', left_on=['Name', 'Gene'],\ right_on=['Name', 'Gene']) - corvals = both_df[['Name', 'Gene', 'prot', 'trans']].groupby('Gene').corr('spearman').unstack()['trans']['prot'] + corvals = both_df[['Gene', 'prot', 'trans']].groupby('Gene').corr('spearman').unstack()['trans']['prot'] corvals = pandas.DataFrame(corvals) corvals["NAME"] = corvals.index #print(corvals.head()) @@ -57,15 +66,24 @@ def get_na_counts(dat, prot_markers): we also want to see how many values are missing between mRNA and protein ''' + # we have to add data source + sources = dat.list_data_sources() + psource = [row['Available sources'].split(',')[0] for index,row in sources.iterrows() if row['Data type']=='proteomics'] + tsource = [row['Available sources'].split(',')[0] for index,row in sources.iterrows() if row['Data type']=='transcriptomics'] + genes = [p for p in prot_markers['NAME']] + #get the merged data frame from cptac data merged = dat.join_omics_to_omics(df1_name='proteomics', \ df2_name='transcriptomics', \ - genes1=prot_markers['NAME'], genes2=prot_markers['NAME']) + df1_source=psource[0],\ + df2_source=tsource[0],\ + genes1=genes, genes2=genes) + if merged.columns.nlevels == 2: merged.columns = merged.columns.droplevel(1) sums = merged.isna().sum()/merged.shape[0] inds = sums.index.str.split('_', expand=False) - sumtab = pandas.DataFrame({'NAs':sums, 'NAME': [i[0] for i in inds], 'Data': [i[1] for i in inds]}) + sumtab = pandas.DataFrame({'NAs':sums, 'NAME': [i[0] for i in inds], 'Data': [i[2] for i in inds]}) counts = sumtab.merge(prot_markers).groupby(['cell_type', 'Data']).mean("NAs") return counts @@ -96,25 +114,29 @@ def main(): ##now we get the cptac datasets from the package dslist = {} countlist = {} - for ds in ['brca', 'ccrcc', 'endometrial', 'colon', - 'ovarian', 'luad', 'hnscc', 'gbm']: - cptac.download(dataset=ds) + for ds in ['brca', 'ccrcc', 'ucec', 'coad','pdac', + 'ovarian', 'luad', 'hnscc', 'gbm','lscc']: + #cptac.download(dataset=ds) if ds == 'brca': dat = cptac.Brca() elif ds == 'ccrcc': dat = cptac.Ccrcc() - elif ds == 'colon': - dat = cptac.Colon() + elif ds == 'coad': + dat = cptac.Coad() elif ds == 'ovarian': - dat = cptac.Ovarian() - elif ds == 'endometrial': - dat = cptac.Endometrial() + dat = cptac.Ov() + elif ds == 'ucec': + dat = cptac.Ucec() elif ds == 'hnscc': dat = cptac.Hnscc() elif ds == 'luad': dat = cptac.Luad() elif ds == 'gbm': dat = cptac.Gbm() + elif ds == 'pdac': + dat = cptac.Pdac() + elif ds == 'lscc': + dat = cptac.Lscc() else: exit() ##now get the data, create one large data frame @@ -124,11 +146,15 @@ def main(): countlist[ds] = get_na_counts(dat, prot_markers) countlist[ds]['Tumor'] = ds + + mval= os.path.splitext(os.path.basename(args.matrix))[0] #then plot cor values by cell type and cancer type fulltab = pandas.concat([dslist[d] for d in dslist.keys()]) fulltab['Tumor'] = fulltab.Tumor.astype('category') fulltab['cell_type'] = fulltab.cell_type.astype('category') - fulltab.to_csv(os.path.splitext(os.path.basename(args.matrix))[0]+'_allCors.csv') + fulltab['sigMatrix'] = mval + fulltab.to_csv(mval+'_allCors.csv') + ##now we can plot the statistics using the plotnine pacakge print(fulltab.head()) @@ -140,7 +166,7 @@ def main(): + ylab("Spearman rank correlation") + theme(axis_text_x=element_text(rotation=90, hjust=1)) ) - fname = os.path.splitext(os.path.basename(args.matrix))[0]+'_sigMatrix_box_and_points.pdf' + fname = mval+'_sigMatrix_box_and_points.pdf' plot.save(fname) plot = ( @@ -150,13 +176,14 @@ def main(): + ylab("Spearman rank correlation") + theme(axis_text_x=element_text(rotation=90, hjust=1)) ) - fname = os.path.splitext(os.path.basename(args.matrix))[0]+'_sigMatrix_boxplot.pdf' + fname = mval+'_sigMatrix_boxplot.pdf' plot.save(fname, height=8, width=10) counttab = pandas.concat([countlist[d] for d in countlist.keys()]) + counttab['SigMatrix']=mval print(counttab.head()) - counttab.to_csv(os.path.splitext(os.path.basename(args.matrix))[0]+'_allNAs.csv') - counttab = pandas.read_csv(os.path.splitext(os.path.basename(args.matrix))[0]+'_allNAs.csv') + counttab.to_csv(mval+'_allNAs.csv') + counttab = pandas.read_csv(mval+'_allNAs.csv') plot = ( ggplot(counttab, aes(x='cell_type', fill='Tumor', y='NAs')) + geom_bar(stat='identity', position='dodge') @@ -165,7 +192,16 @@ def main(): + ylab("Fraction missing data") + theme(axis_text_x=element_text(rotation=90, hjust=1)) ) - fname = os.path.splitext(os.path.basename(args.matrix))[0]+'_nacounts.pdf' + fname = mval+'_nacounts_bar.pdf' + plot.save(fname) + plot = ( + ggplot(counttab, aes(x='cell_type', fill='Data', y='NAs')) + + geom_boxplot(position='dodge') + + xlab('Cell Type') + + ylab("Fraction missing data") + + theme(axis_text_x=element_text(rotation=90, hjust=1)) + ) + fname = mval+'_nacounts_box.pdf' plot.save(fname) diff --git a/metrics/data-sim/runSamplingManually.py b/metrics/data-sim/runSamplingManually.py new file mode 100644 index 00000000..0b18672e --- /dev/null +++ b/metrics/data-sim/runSamplingManually.py @@ -0,0 +1,42 @@ +''' +run sampling manually to avoid having to loop in CWL, whcih turned out to be cumberson + +''' + +import os + +filelist = [] +for i in [0,1,2,3,4]: + estring = 'cwltool simul-data-sampling.cwl --prot-sigs LM9 --simType prot --repNumber '+str(i) + print(estring) + os.system(estring) + fname='combined-cellType-correlation-'+str(i)+'.tsv' + os.rename(fname,'prot-LM9-'+fname) + filelist.append('prot-LM9-'+fname) + + estring = 'cwltool simul-data-sampling.cwl --prot-sigs LM7c --simType prot --repNumber '+str(i) + print(estring) + os.system(estring) + fname='combined-cellType-correlation-'+str(i)+'.tsv' + os.rename(fname,'prot-LM7c-'+fname) + filelist.append('prot-LM7c-'+fname) + + estring = 'cwltool simul-data-sampling.cwl --rna-sigs LM22 --simType mrna --repNumber '+str(i) + print(estring) + os.system(estring) + fname='combined-cellType-correlation-'+str(i)+'.tsv' + os.rename(fname,'mrna-LM22-'+fname) + filelist.append('mrna-LM22-'+fname) + + estring = 'cwltool simul-data-sampling.cwl --rna-sigs PBMC --simType mrna --repNumber '+str(i) + print(estring) + os.system(estring) + fname='combined-cellType-correlation-'+str(i)+'.tsv' + os.rename(fname,'mrna-pbmc-'+fname) + filelist.append('mrna-pbmc-'+fname) + + + + + + diff --git a/metrics/data-sim/simul-data-comparison.cwl b/metrics/data-sim/simul-data-comparison.cwl index bef96d24..4f3edfa0 100644 --- a/metrics/data-sim/simul-data-comparison.cwl +++ b/metrics/data-sim/simul-data-comparison.cwl @@ -33,9 +33,12 @@ inputs: default: ['LM7c','LM9'] simType: type: string - sample: + sample: #how much do want to sample the matrix? type: int default: 100 + repNumber: ##if you wanted to run this numer times + type: int + defalut 0 outputs: cell-cor-tab: diff --git a/metrics/data-sim/simul-data-sampling.cwl b/metrics/data-sim/simul-data-sampling.cwl index fab7e704..f0420f2e 100644 --- a/metrics/data-sim/simul-data-sampling.cwl +++ b/metrics/data-sim/simul-data-sampling.cwl @@ -17,19 +17,19 @@ inputs: # default: 10 samples: type: int[] - default: [20,40,80]#[10,20,40,60,80,100] + default: [10,20,40,60,80,100] mrna-perms: type: string[] - default: ['1','2','3','4','5','6','7','8','9','10','pbmc'] + default: ['1','2','3','4','5']#,'6','7','8','9','10','pbmc'] prot-perms: type: string[] - default: ['1','2','3']##,'4','5'] + default: ['1','2','3','4','5'] prot-algorithms: type: string[] default: - mcpcounter - # - xcell - # - epic + - xcell + - epic - cibersort rna-sigs: type: string[] @@ -39,6 +39,9 @@ inputs: default: ['LM7c','LM9'] simType: type: string + repNumber: + type: int + default: 0 outputs: cell-cor-tab: @@ -50,7 +53,7 @@ outputs: steps: run-all-algs-by-mrna: - run: sim-loop.cwl #call-deconv-on-sim.cwl + run: call-deconv-on-sim.cwl #call-deconv-on-sim.cwl when: $(inputs.simType.trim() == 'mrna') scatter: [protAlg,simulation,signature,sample] scatterMethod: flat_crossproduct @@ -59,15 +62,16 @@ steps: simulation: mrna-perms signature: rna-sigs sample: samples + sampleRep: repNumber sampleType: valueFrom: 'normal' dataType: valueFrom: 'prot' simType: simType out: - [cell-cor-files] + [cell-cor-file] run-all-algs-by-prot: - run: sim-loop.cwl #call-deconv-on-sim.cwl + run: call-deconv-on-sim.cwl when: $(inputs.simType.trim() == 'prot') scatter: [protAlg,simulation,signature,sample] scatterMethod: flat_crossproduct @@ -75,6 +79,7 @@ steps: protAlg: prot-algorithms simulation: prot-perms sample: samples + sampleRep: repNumber signature: prot-sigs sampleType: valueFrom: 'normal' @@ -82,56 +87,19 @@ steps: valueFrom: 'prot' simType: simType out: - [cell-cor-files] - arrayBusiness: - run: - class: ExpressionTool - inputs: - arrayTwoDim: - type: - type: array - items: - type: array - items: File - inputBinding: - loadContents: true - outputs: - array1d: - type: File[] - expression: > - ${ - var newArray= []; - for (var i = 0; i < inputs.arrayTwoDim.length; i++) { - for (var k = 0; k < inputs.arrayTwoDim[i].length; k++) { - newArray.push((inputs.arrayTwoDim[i])[k]); - } - } - return { 'array1d' : newArray } - } - in: - arrayTwoDim: - source: - - run-all-algs-by-mrna/cell-cor-files - - run-all-algs-by-prot/cell-cor-files - linkMerge: merge_flattened - pickValue: all_non_null - out: [array1d] + [cell-cor-file] get-celltype-cors: run: ../figures/plot-figs.cwl in: metricType: valueFrom: "cellType" files: - source: arrayBusiness/array1d - # type: - # type: array - # items: - # type: array - # items: File - # source: - # - run-all-algs-by-mrna/cell-cor-files - # - run-all-algs-by-prot/cell-cor-files - # linkMerge: merge_flattened - # pickValue: all_non_null + source: + - run-all-algs-by-mrna/cell-cor-file + - run-all-algs-by-prot/cell-cor-file + linkMerge: merge_flattened + pickValue: all_non_null + repNumber: + source: repNumber out: [table,fig] diff --git a/metrics/figures/combine_results.R b/metrics/figures/combine_results.R index 910033bf..8ff07dda 100644 --- a/metrics/figures/combine_results.R +++ b/metrics/figures/combine_results.R @@ -5,11 +5,11 @@ #' #' #' to run: -#' Rscript combine_results.R --metricType sample --metric correlation */*corr.tsv -#' Rscript combine_results.R --metricType cellType --metric correlation */*corrXcelltypes.tsv -#' Rscript combine_results.R --metricType sample --metric meanCorrelation */*corr.tsv -#' Rscript combine_results.R --metricType cellType --metric meanCorrelation */*corrXcelltypes.tsv -#' Rscript combine_results.R --metricType js --metric distance */*dist.tsv +#' Rscript combine_results.R --metricType sample --metric correlation --repNumber 0 */*corr.tsv +#' Rscript combine_results.R --metricType cellType --metric correlation --repNumber 0 */*corrXcelltypes.tsv +#' Rscript combine_results.R --metricType sample --metric meanCorrelation --repNumber 0 */*corr.tsv +#' Rscript combine_results.R --metricType cellType --metric meanCorrelation --repNumber 0 */*corrXcelltypes.tsv +#' Rscript combine_results.R --metricType js --metric distance --repNumber 0 */*dist.tsv library(dplyr) library(argparser) @@ -305,25 +305,26 @@ main<-function(){ ##todo: store in synapse argv <- commandArgs(trailingOnly = TRUE) - file.list<-argv[3:length(argv)] + file.list<-argv[4:length(argv)] metric=argv[2] metricType = argv[1] + repNumber = argv[3] if(metric=='correlation' && metricType=='sample'){ tab<-combinePatientCors(file.list,metric) print(dim(tab)) - write.table(tab,paste0('combined-', metricType, '-', metric,'.tsv'),row.names=F,col.names=T) + write.table(tab,paste0('combined-', metricType, '-', metric,'-',repNumber,'.tsv'),row.names=F,col.names=T) }else if(metric=='correlation' && metricType=='cellType'){ tab<-combineCellTypeCors(file.list,metric) print(dim(tab)) - write.table(tab,paste0('combined-', metricType, '-', metric,'.tsv'),row.names=F,col.names=T) + write.table(tab,paste0('combined-', metricType, '-', metric,'-',repNumber,'.tsv'),row.names=F,col.names=T) } else if (metric=='meanCorrelation'){ tab<-combineCorsMean(file.list,metric, metricType) print(dim(tab)) - write.table(tab,paste0('combined-', metricType, '-', metric,'.tsv'),row.names=F,col.names=T) + write.table(tab,paste0('combined-', metricType, '-', metric,'-',repNumber,'.tsv'),row.names=F,col.names=T) }else if(metric=='distance'){ tab<-combineDists(file.list,metric,metricType) print(dim(tab)) - write.table(tab,paste0('combined-', metricType, '-', metric,'.tsv'),row.names=F,col.names=T) + write.table(tab,paste0('combined-', metricType, '-', metric,'-',repNumber,'.tsv'),row.names=F,col.names=T) }else{ print("First argument must be metricType and second must be metric name") diff --git a/metrics/figures/plot-figs.cwl b/metrics/figures/plot-figs.cwl index 98148d34..48380319 100644 --- a/metrics/figures/plot-figs.cwl +++ b/metrics/figures/plot-figs.cwl @@ -26,6 +26,10 @@ inputs: position: 2 files: type: File[] + inputBinding: + position: 4 + repNumber: + type: int inputBinding: position: 3 diff --git a/protData/protDataSetsCLI.py b/protData/protDataSetsCLI.py index 586f37ee..ecdc21dd 100644 --- a/protData/protDataSetsCLI.py +++ b/protData/protDataSetsCLI.py @@ -20,10 +20,10 @@ def main(): dat = cptac.Brca() elif opts.type.lower() == 'ccrcc': dat = cptac.Ccrcc() - elif opts.type.lower() == 'colon': - dat = cptac.Colon() - elif opts.type.lower() == 'endometrial': - dat = cptac.Endometrial() + elif opts.type.lower() == 'coad': + dat = cptac.Coad() + elif opts.type.lower() == 'ucec': + dat = cptac.Ucec() elif opts.type.lower() == 'gbm': dat = cptac.Gbm() elif opts.type.lower() == 'hnscc': @@ -33,7 +33,9 @@ def main(): elif opts.type.lower() == 'luad': dat = cptac.Luad() elif opts.type.lower() == 'ovarian': - dat = cptac.Ovarian() + dat = cptac.Ov() + elif opts.type.loewr() == 'pdac': + dat = cptac.Pdac() else: exit() df = dat.get_proteomics() diff --git a/signature_matrices/getSigMatrices.R b/signature_matrices/getSigMatrices.R index 21db6724..db4dc80c 100644 --- a/signature_matrices/getSigMatrices.R +++ b/signature_matrices/getSigMatrices.R @@ -7,8 +7,8 @@ main<-function(){ argv <- commandArgs(trailingOnly = TRUE) sig_name <- trimws(argv[1]) - sample <-as.numeric(trimws(argv[2])) - print(sample) + sampval <-as.numeric(trimws(argv[2])) + print(samval) #reshape matrisome to adhere to standards if(tolower(sig_name)=='matrisome'){ tab <- readxl::read_xlsx('/Hs_Matrisome_Masterlist_Naba et al_2012.xlsx')[,c(2:3)]|> @@ -25,10 +25,10 @@ main<-function(){ tab <- read.table(paste0('/',sig_name,'.txt'),header=T,sep='\t',row.names=1,check.names=F) num<-nrow(tab) prots<-rownames(tab) - subsamp<-sample(prots,size=round(length(prots)*sample/100),replace=FALSE) + subsamp<-sample(prots,size=round(length(prots)*sampval/100),replace=FALSE) print(paste('reducing signature matrix from',nrow(tab),'to',length(subsamp))) tab<-tab[subsamp,] - sig_name <- paste0(sig_name,'_',sample) + sig_name <- paste0(sig_name,'_',sampval) write.table(tab,paste0('./',sig_name,'.txt'),row.names=T,sep='\t',quote=F) } diff --git a/simulatedData/mapSimDataMatrices.R b/simulatedData/mapSimDataMatrices.R index 0920758d..9c4eac9c 100644 --- a/simulatedData/mapSimDataMatrices.R +++ b/simulatedData/mapSimDataMatrices.R @@ -8,6 +8,7 @@ main <- function(){ print(argv) deconv.mat <- read.table(argv[1],header=T,row.names=1,sep='\t', check.names = F) #matrix to be fixed sig.mat <- argv[2] #signature used to deconvolve matrix + sig.mat <-stringr::str_replace(sig.mat,'_[0-9]*','') ##added this due to sampling changes!!! sim.type <- tolower(argv[3]) #type of simulation performed cell.type <-read.table(argv[4],header=T,row.names=1,sep='\t', check.names = F)