Skip to content

Commit

Permalink
Merge pull request #217 from PNNL-CompBio/dev
Browse files Browse the repository at this point in the history
updated with more issues from sampling
  • Loading branch information
sgosline authored Aug 29, 2023
2 parents 35c723e + 181b6a8 commit 750531e
Show file tree
Hide file tree
Showing 10 changed files with 156 additions and 97 deletions.
12 changes: 7 additions & 5 deletions mRNAData/mRNADataSetsCLI.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@ def main():
dat = cptac.Brca()
elif opts.type.lower() == 'ccrcc':
dat = cptac.Ccrcc()
elif opts.type.lower() == 'colon':
dat = cptac.Colon()
elif opts.type.lower() == 'endometrial':
dat = cptac.Endometrial()
elif opts.type.lower() == 'coad':
dat = cptac.Coad()
elif opts.type.lower() == 'ucec':
dat = cptac.Ucec()
elif opts.type.lower() == 'gbm':
dat = cptac.Gbm()
elif opts.type.lower() == 'hnscc':
Expand All @@ -32,7 +32,9 @@ def main():
elif opts.type.lower() == 'luad':
dat = cptac.Luad()
elif opts.type.lower() == 'ovarian':
dat = cptac.Ovarian()
dat = cptac.Ov()
elif opts.type.loewr() == 'pdac':
dat = cptac.Pdac()
else:
exit()
df = dat.get_transcriptomics()
Expand Down
74 changes: 55 additions & 19 deletions metrics/compare-matrices/compareSigMatrices.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,17 @@ def get_cor_stat(dat, prot_markers):
the correlation of the entitites for a subset of proteins
'''

# we have to add data source
sources = dat.list_data_sources()
psource = [row['Available sources'].split(',')[0] for index,row in sources.iterrows() if row['Data type']=='proteomics']
tsource = [row['Available sources'].split(',')[0] for index,row in sources.iterrows() if row['Data type']=='transcriptomics']
genes = [p for p in prot_markers['NAME']]
#get the merged data frame from cptac data
merged = dat.join_omics_to_omics(df1_name='proteomics', \
df2_name='transcriptomics', \
genes1=prot_markers['NAME'], genes2=prot_markers['NAME'])
df1_source=psource[0],\
df2_source=tsource[0],\
genes1=genes, genes2=genes)
if merged.columns.nlevels == 2:
merged.columns = merged.columns.droplevel(1)

Expand All @@ -32,16 +39,18 @@ def get_cor_stat(dat, prot_markers):
prots['Name'] = prots.index
prots = prots.melt(id_vars="Name", var_name='id', value_name='prot')
prots['Gene'] = prots['id'].str.split('_', expand=True)[0]

prots = prots.drop('id',axis=1)

trans = merged.loc[:, merged.columns.str.contains('transcriptomics')]
trans['Name'] = trans.index
trans = trans.melt(id_vars="Name", var_name='id', value_name='trans')
trans['Gene'] = trans['id'].str.split('_', expand=True)[0]
trans=trans.drop('id',axis=1)

#remerge the proteins and transcripts and compute spearman rank correlation
both_df = pandas.merge(prots, trans, how='inner', left_on=['Name', 'Gene'],\
right_on=['Name', 'Gene'])
corvals = both_df[['Name', 'Gene', 'prot', 'trans']].groupby('Gene').corr('spearman').unstack()['trans']['prot']
corvals = both_df[['Gene', 'prot', 'trans']].groupby('Gene').corr('spearman').unstack()['trans']['prot']
corvals = pandas.DataFrame(corvals)
corvals["NAME"] = corvals.index
#print(corvals.head())
Expand All @@ -57,15 +66,24 @@ def get_na_counts(dat, prot_markers):
we also want to see how many values are missing between mRNA and protein
'''
# we have to add data source
sources = dat.list_data_sources()
psource = [row['Available sources'].split(',')[0] for index,row in sources.iterrows() if row['Data type']=='proteomics']
tsource = [row['Available sources'].split(',')[0] for index,row in sources.iterrows() if row['Data type']=='transcriptomics']
genes = [p for p in prot_markers['NAME']]
#get the merged data frame from cptac data
merged = dat.join_omics_to_omics(df1_name='proteomics', \
df2_name='transcriptomics', \
genes1=prot_markers['NAME'], genes2=prot_markers['NAME'])
df1_source=psource[0],\
df2_source=tsource[0],\
genes1=genes, genes2=genes)

if merged.columns.nlevels == 2:
merged.columns = merged.columns.droplevel(1)

sums = merged.isna().sum()/merged.shape[0]
inds = sums.index.str.split('_', expand=False)
sumtab = pandas.DataFrame({'NAs':sums, 'NAME': [i[0] for i in inds], 'Data': [i[1] for i in inds]})
sumtab = pandas.DataFrame({'NAs':sums, 'NAME': [i[0] for i in inds], 'Data': [i[2] for i in inds]})

counts = sumtab.merge(prot_markers).groupby(['cell_type', 'Data']).mean("NAs")
return counts
Expand Down Expand Up @@ -96,25 +114,29 @@ def main():
##now we get the cptac datasets from the package
dslist = {}
countlist = {}
for ds in ['brca', 'ccrcc', 'endometrial', 'colon',
'ovarian', 'luad', 'hnscc', 'gbm']:
cptac.download(dataset=ds)
for ds in ['brca', 'ccrcc', 'ucec', 'coad','pdac',
'ovarian', 'luad', 'hnscc', 'gbm','lscc']:
#cptac.download(dataset=ds)
if ds == 'brca':
dat = cptac.Brca()
elif ds == 'ccrcc':
dat = cptac.Ccrcc()
elif ds == 'colon':
dat = cptac.Colon()
elif ds == 'coad':
dat = cptac.Coad()
elif ds == 'ovarian':
dat = cptac.Ovarian()
elif ds == 'endometrial':
dat = cptac.Endometrial()
dat = cptac.Ov()
elif ds == 'ucec':
dat = cptac.Ucec()
elif ds == 'hnscc':
dat = cptac.Hnscc()
elif ds == 'luad':
dat = cptac.Luad()
elif ds == 'gbm':
dat = cptac.Gbm()
elif ds == 'pdac':
dat = cptac.Pdac()
elif ds == 'lscc':
dat = cptac.Lscc()
else:
exit()
##now get the data, create one large data frame
Expand All @@ -124,11 +146,15 @@ def main():
countlist[ds] = get_na_counts(dat, prot_markers)
countlist[ds]['Tumor'] = ds


mval= os.path.splitext(os.path.basename(args.matrix))[0]
#then plot cor values by cell type and cancer type
fulltab = pandas.concat([dslist[d] for d in dslist.keys()])
fulltab['Tumor'] = fulltab.Tumor.astype('category')
fulltab['cell_type'] = fulltab.cell_type.astype('category')
fulltab.to_csv(os.path.splitext(os.path.basename(args.matrix))[0]+'_allCors.csv')
fulltab['sigMatrix'] = mval
fulltab.to_csv(mval+'_allCors.csv')


##now we can plot the statistics using the plotnine pacakge
print(fulltab.head())
Expand All @@ -140,7 +166,7 @@ def main():
+ ylab("Spearman rank correlation")
+ theme(axis_text_x=element_text(rotation=90, hjust=1))
)
fname = os.path.splitext(os.path.basename(args.matrix))[0]+'_sigMatrix_box_and_points.pdf'
fname = mval+'_sigMatrix_box_and_points.pdf'
plot.save(fname)

plot = (
Expand All @@ -150,13 +176,14 @@ def main():
+ ylab("Spearman rank correlation")
+ theme(axis_text_x=element_text(rotation=90, hjust=1))
)
fname = os.path.splitext(os.path.basename(args.matrix))[0]+'_sigMatrix_boxplot.pdf'
fname = mval+'_sigMatrix_boxplot.pdf'
plot.save(fname, height=8, width=10)

counttab = pandas.concat([countlist[d] for d in countlist.keys()])
counttab['SigMatrix']=mval
print(counttab.head())
counttab.to_csv(os.path.splitext(os.path.basename(args.matrix))[0]+'_allNAs.csv')
counttab = pandas.read_csv(os.path.splitext(os.path.basename(args.matrix))[0]+'_allNAs.csv')
counttab.to_csv(mval+'_allNAs.csv')
counttab = pandas.read_csv(mval+'_allNAs.csv')
plot = (
ggplot(counttab, aes(x='cell_type', fill='Tumor', y='NAs'))
+ geom_bar(stat='identity', position='dodge')
Expand All @@ -165,7 +192,16 @@ def main():
+ ylab("Fraction missing data")
+ theme(axis_text_x=element_text(rotation=90, hjust=1))
)
fname = os.path.splitext(os.path.basename(args.matrix))[0]+'_nacounts.pdf'
fname = mval+'_nacounts_bar.pdf'
plot.save(fname)
plot = (
ggplot(counttab, aes(x='cell_type', fill='Data', y='NAs'))
+ geom_boxplot(position='dodge')
+ xlab('Cell Type')
+ ylab("Fraction missing data")
+ theme(axis_text_x=element_text(rotation=90, hjust=1))
)
fname = mval+'_nacounts_box.pdf'
plot.save(fname)


Expand Down
42 changes: 42 additions & 0 deletions metrics/data-sim/runSamplingManually.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
'''
run sampling manually to avoid having to loop in CWL, whcih turned out to be cumberson
'''

import os

filelist = []
for i in [0,1,2,3,4]:
estring = 'cwltool simul-data-sampling.cwl --prot-sigs LM9 --simType prot --repNumber '+str(i)
print(estring)
os.system(estring)
fname='combined-cellType-correlation-'+str(i)+'.tsv'
os.rename(fname,'prot-LM9-'+fname)
filelist.append('prot-LM9-'+fname)

estring = 'cwltool simul-data-sampling.cwl --prot-sigs LM7c --simType prot --repNumber '+str(i)
print(estring)
os.system(estring)
fname='combined-cellType-correlation-'+str(i)+'.tsv'
os.rename(fname,'prot-LM7c-'+fname)
filelist.append('prot-LM7c-'+fname)

estring = 'cwltool simul-data-sampling.cwl --rna-sigs LM22 --simType mrna --repNumber '+str(i)
print(estring)
os.system(estring)
fname='combined-cellType-correlation-'+str(i)+'.tsv'
os.rename(fname,'mrna-LM22-'+fname)
filelist.append('mrna-LM22-'+fname)

estring = 'cwltool simul-data-sampling.cwl --rna-sigs PBMC --simType mrna --repNumber '+str(i)
print(estring)
os.system(estring)
fname='combined-cellType-correlation-'+str(i)+'.tsv'
os.rename(fname,'mrna-pbmc-'+fname)
filelist.append('mrna-pbmc-'+fname)






5 changes: 4 additions & 1 deletion metrics/data-sim/simul-data-comparison.cwl
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,12 @@ inputs:
default: ['LM7c','LM9']
simType:
type: string
sample:
sample: #how much do want to sample the matrix?
type: int
default: 100
repNumber: ##if you wanted to run this numer times
type: int
defalut 0

outputs:
cell-cor-tab:
Expand Down
74 changes: 21 additions & 53 deletions metrics/data-sim/simul-data-sampling.cwl
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,19 @@ inputs:
# default: 10
samples:
type: int[]
default: [20,40,80]#[10,20,40,60,80,100]
default: [10,20,40,60,80,100]
mrna-perms:
type: string[]
default: ['1','2','3','4','5','6','7','8','9','10','pbmc']
default: ['1','2','3','4','5']#,'6','7','8','9','10','pbmc']
prot-perms:
type: string[]
default: ['1','2','3']##,'4','5']
default: ['1','2','3','4','5']
prot-algorithms:
type: string[]
default:
- mcpcounter
# - xcell
# - epic
- xcell
- epic
- cibersort
rna-sigs:
type: string[]
Expand All @@ -39,6 +39,9 @@ inputs:
default: ['LM7c','LM9']
simType:
type: string
repNumber:
type: int
default: 0

outputs:
cell-cor-tab:
Expand All @@ -50,7 +53,7 @@ outputs:

steps:
run-all-algs-by-mrna:
run: sim-loop.cwl #call-deconv-on-sim.cwl
run: call-deconv-on-sim.cwl #call-deconv-on-sim.cwl
when: $(inputs.simType.trim() == 'mrna')
scatter: [protAlg,simulation,signature,sample]
scatterMethod: flat_crossproduct
Expand All @@ -59,79 +62,44 @@ steps:
simulation: mrna-perms
signature: rna-sigs
sample: samples
sampleRep: repNumber
sampleType:
valueFrom: 'normal'
dataType:
valueFrom: 'prot'
simType: simType
out:
[cell-cor-files]
[cell-cor-file]
run-all-algs-by-prot:
run: sim-loop.cwl #call-deconv-on-sim.cwl
run: call-deconv-on-sim.cwl
when: $(inputs.simType.trim() == 'prot')
scatter: [protAlg,simulation,signature,sample]
scatterMethod: flat_crossproduct
in:
protAlg: prot-algorithms
simulation: prot-perms
sample: samples
sampleRep: repNumber
signature: prot-sigs
sampleType:
valueFrom: 'normal'
dataType:
valueFrom: 'prot'
simType: simType
out:
[cell-cor-files]
arrayBusiness:
run:
class: ExpressionTool
inputs:
arrayTwoDim:
type:
type: array
items:
type: array
items: File
inputBinding:
loadContents: true
outputs:
array1d:
type: File[]
expression: >
${
var newArray= [];
for (var i = 0; i < inputs.arrayTwoDim.length; i++) {
for (var k = 0; k < inputs.arrayTwoDim[i].length; k++) {
newArray.push((inputs.arrayTwoDim[i])[k]);
}
}
return { 'array1d' : newArray }
}
in:
arrayTwoDim:
source:
- run-all-algs-by-mrna/cell-cor-files
- run-all-algs-by-prot/cell-cor-files
linkMerge: merge_flattened
pickValue: all_non_null
out: [array1d]
[cell-cor-file]
get-celltype-cors:
run: ../figures/plot-figs.cwl
in:
metricType:
valueFrom: "cellType"
files:
source: arrayBusiness/array1d
# type:
# type: array
# items:
# type: array
# items: File
# source:
# - run-all-algs-by-mrna/cell-cor-files
# - run-all-algs-by-prot/cell-cor-files
# linkMerge: merge_flattened
# pickValue: all_non_null
source:
- run-all-algs-by-mrna/cell-cor-file
- run-all-algs-by-prot/cell-cor-file
linkMerge: merge_flattened
pickValue: all_non_null
repNumber:
source: repNumber
out:
[table,fig]
Loading

0 comments on commit 750531e

Please sign in to comment.