Merge pull request #222 from PNNL-CompBio/dev

updated cptac modules to only pull/cache relevant data
PNNL-CompBio · Sep 14, 2023 · 603f4c6 · 603f4c6
2 parents 5f9afb1 + 7f9230a
commit 603f4c6
Show file tree

Hide file tree

Showing 6 changed files with 140 additions and 50 deletions.
diff --git a/mRNAData/getAllDatasets.py b/mRNAData/getAllDatasets.py
@@ -4,5 +4,46 @@
 '''
 
 import cptac
+
+
+def getCancerObj(cancertype):
+    '''
+    Return the cptac dataset object for a cancer type code.
+
+    Accepted codes are 'brca', 'ccrcc', 'coad', 'gbm', 'hnscc', 'lscc',
+    'luad', 'ov', 'pdac' and 'ucec'. 'ovarian' is accepted as an alias of
+    'ov' because the download loop in this script passes that spelling.
+
+    Exits the interpreter with the message 'Wrong cancer type: <code>'
+    (non-zero status) when the code is not recognised.
+    '''
+    # Map each code to the name of the cptac class to instantiate; the
+    # attribute is only looked up for the requested code.
+    class_names = {
+        'brca': 'Brca',
+        'ccrcc': 'Ccrcc',
+        'coad': 'Coad',
+        'gbm': 'Gbm',
+        'hnscc': 'Hnscc',
+        'lscc': 'Lscc',
+        'luad': 'Luad',
+        'ov': 'Ov',
+        'ovarian': 'Ov',
+        'pdac': 'Pdac',
+        'ucec': 'Ucec',
+    }
+    if cancertype not in class_names:
+        # SystemExit with a string prints it to stderr and exits with
+        # status 1, so a bad code no longer looks like a successful run.
+        raise SystemExit('Wrong cancer type: ' + cancertype)
+    return getattr(cptac, class_names[cancertype])()
+
+
 for ds in ['brca', 'ccrcc', 'ucec', 'coad','pdac', 'ovarian', 'luad', 'hnscc', 'gbm','lscc']:
-    cptac.download_cancer(ds)
+    dat=getCancerObj(ds)
+
+    # The cptac call for this changed in a recent version: build a
+    # {data type: available sources} lookup from list_data_sources().
+    dat_list = dat.list_data_sources().set_index('Data type').to_dict()['Available sources']
+    clinsource = dat_list['clinical']
+    # Use the 'harmonized' clinical source when it is listed, otherwise the
+    # first element of the clinical entry.
+    if 'harmonized' in clinsource:
+        cs = 'harmonized'
+    else:
+        cs = clinsource[0]
+    # Return values are discarded; presumably these calls are made so cptac
+    # downloads/caches only these two tables -- TODO confirm.
+    dat.get_clinical(cs)
+    tsource = dat_list['transcriptomics']
+    # Only the first listed transcriptomics source is requested.
+    dat.get_transcriptomics(tsource[0])
diff --git a/mRNAData/mRNADataSetsCLI.py b/mRNAData/mRNADataSetsCLI.py
@@ -37,24 +37,34 @@ def main():
         dat = cptac.Pdac()        
     else:
         exit()
-    df = dat.get_transcriptomics()
+        #this call changed in recent version
+    dat_list = dat.list_data_sources().set_index('Data type').to_dict()['Available sources']
+    clinsource = dat_list['clinical']
+    if 'harmonized' in clinsource:
+        cs = 'harmonized'
+    else:
+        cs = clinsource[0]
+    dat.get_clinical(cs)
+    tsource = dat_list['transcriptomics']
+    df = dat.get_transcriptomics(tsource[0])
+
 
     # Get the sample type specific dataframe
-    if opts.sample.lower() != 'all':
-        meta = dat.get_clinical()
-        if opts.sample.lower() == 'tumor':
-            ind = meta[meta["Sample_Tumor_Normal"] == "Tumor"].index
-            ind = [i for i in ind if i in df.index]
-            df = df.loc[ind]
-        elif opts.sample.lower() == 'normal':
-            nIDs = list(meta[meta["Sample_Tumor_Normal"] == "Normal"].index)
-            nIDs = list(set(nIDs) & set(df.index))
-            df = df.loc[nIDs]
-            df.index = [nID[:-2] if nID[-2:] ==
-                        ".N" else nID for nID in nIDs]
-        else:
-            exit("The sample type, tumor vs normal vs all (default),\
-            is not correctly set!")
+    # if opts.sample.lower() != 'all':
+    #     meta = dat.get_clinical()
+    #     if opts.sample.lower() == 'tumor':
+    #         ind = meta[meta["Sample_Tumor_Normal"] == "Tumor"].index
+    #         ind = [i for i in ind if i in df.index]
+    #         df = df.loc[ind]
+    #     elif opts.sample.lower() == 'normal':
+    #         nIDs = list(meta[meta["Sample_Tumor_Normal"] == "Normal"].index)
+    #         nIDs = list(set(nIDs) & set(df.index))
+    #         df = df.loc[nIDs]
+    #         df.index = [nID[:-2] if nID[-2:] ==
+    #                     ".N" else nID for nID in nIDs]
+    #     else:
+    #         exit("The sample type, tumor vs normal vs all (default),\
+    #         is not correctly set!")
     df.transpose().to_csv(path_or_buf="file.tsv", sep='\t')
 
 

diff --git a/metrics/data-sim/runSamplingManually.py b/metrics/data-sim/runSamplingManually.py
@@ -6,7 +6,7 @@
 import os
 
 filelist = []
-for i in [1,2,3,4]:
+for i in [2,3,4]:
 #    estring = 'cwltool simul-data-sampling.cwl --prot-sigs LM9 --simType prot --repNumber '+str(i)
 #    print(estring)
 #    os.system(estring)

diff --git a/protData/getAllDatasets.py b/protData/getAllDatasets.py
@@ -4,5 +4,46 @@
 '''
 
 import cptac
+
+
+def getCancerObj(cancertype):
+    '''
+    Return the cptac dataset object for a cancer type code.
+
+    Accepted codes are 'brca', 'ccrcc', 'coad', 'gbm', 'hnscc', 'lscc',
+    'luad', 'ov', 'pdac' and 'ucec'. 'ovarian' is accepted as an alias of
+    'ov' because the download loop in this script passes that spelling.
+
+    Exits the interpreter with the message 'Wrong cancer type: <code>'
+    (non-zero status) when the code is not recognised.
+    '''
+    # Map each code to the name of the cptac class to instantiate; the
+    # attribute is only looked up for the requested code.
+    class_names = {
+        'brca': 'Brca',
+        'ccrcc': 'Ccrcc',
+        'coad': 'Coad',
+        'gbm': 'Gbm',
+        'hnscc': 'Hnscc',
+        'lscc': 'Lscc',
+        'luad': 'Luad',
+        'ov': 'Ov',
+        'ovarian': 'Ov',
+        'pdac': 'Pdac',
+        'ucec': 'Ucec',
+    }
+    if cancertype not in class_names:
+        # SystemExit with a string prints it to stderr and exits with
+        # status 1, so a bad code no longer looks like a successful run.
+        raise SystemExit('Wrong cancer type: ' + cancertype)
+    return getattr(cptac, class_names[cancertype])()
+
+
 for ds in ['brca', 'ccrcc', 'ucec', 'coad','pdac', 'ovarian', 'luad', 'hnscc', 'gbm','lscc']:
-    cptac.download_cancer(ds)
+    dat=getCancerObj(ds)
+
+    # The cptac call for this changed in a recent version: build a
+    # {data type: available sources} lookup from list_data_sources().
+    dat_list = dat.list_data_sources().set_index('Data type').to_dict()['Available sources']
+    clinsource = dat_list['clinical']
+    # Use the 'harmonized' clinical source when it is listed, otherwise the
+    # first element of the clinical entry.
+    if 'harmonized' in clinsource:
+        cs = 'harmonized'
+    else:
+        cs = clinsource[0]
+    # Return values are discarded; presumably these calls are made so cptac
+    # downloads/caches only these two tables -- TODO confirm.
+    dat.get_clinical(cs)
+    tsource = dat_list['proteomics']
+    # Only the first listed proteomics source is requested.
+    dat.get_proteomics(tsource[0])
diff --git a/protData/protDataSetsCLI.py b/protData/protDataSetsCLI.py
@@ -4,7 +4,6 @@
 '''
 import argparse
 import cptac
-import numpy as np
 
 
 def main():
@@ -35,39 +34,38 @@ def main():
     elif opts.type.lower() == 'ovarian':
         dat = cptac.Ov()
     elif opts.type.loewr() == 'pdac':
-        dat = cptac.Pdac()
+        dat = cptac.Pdac()        
     else:
         exit()
-    df = dat.get_proteomics()
+        #this call changed in recent version
+    dat_list = dat.list_data_sources().set_index('Data type').to_dict()['Available sources']
+    clinsource = dat_list['clinical']
+    if 'harmonized' in clinsource:
+        cs = 'harmonized'
+    else:
+        cs = clinsource[0]
+    dat.get_clinical(cs)
+    tsource = dat_list['proteomics']
+    df = dat.get_proteomics(tsource[0])
+
 
     # Get the sample type specific dataframe
-    if opts.sample.lower() != 'all':
-        meta = dat.get_clinical()
-        if opts.sample.lower() == 'tumor':
-            ind = meta[meta["Sample_Tumor_Normal"] == "Tumor"].index
-            ind = [i for i in ind if i in df.index]
-            df = df.loc[ind]
-        elif opts.sample.lower() == 'normal':
-            nIDs = list(meta[meta["Sample_Tumor_Normal"] == "Normal"].index)
-            nIDs = list(set(nIDs) & set(df.index))
-            df = df.loc[nIDs]
-            df.index = [nID[:-2] if nID[-2:] ==
-                        ".N" else nID for nID in nIDs]
-        else:
-            exit("The sample type, tumor vs normal vs all (default), \
-            is not correctly set!")
-
-    # some dataset has two level of indices some has only one
-    if df.columns.nlevels == 2:
-        df.columns = df.columns.droplevel(1)
-    elif df.columns.nlevels != 1:
-        print("The number of column levels is larger not 1 or 2!\n")
-        raise
-    dfE = np.exp(df)
-#    dfU = np.log(dfE.sum(axis=1, level=0, min_count=1))
-    dfU = np.log(dfE.groupby(axis=1, level=0).sum())#sum(axis=1, level=0, min_count=1))
-    dfU.dropna(how='all', axis=0, inplace=True)
-    dfU.transpose().to_csv(path_or_buf="file.tsv", sep='\t')
+    # if opts.sample.lower() != 'all':
+    #     meta = dat.get_clinical()
+    #     if opts.sample.lower() == 'tumor':
+    #         ind = meta[meta["Sample_Tumor_Normal"] == "Tumor"].index
+    #         ind = [i for i in ind if i in df.index]
+    #         df = df.loc[ind]
+    #     elif opts.sample.lower() == 'normal':
+    #         nIDs = list(meta[meta["Sample_Tumor_Normal"] == "Normal"].index)
+    #         nIDs = list(set(nIDs) & set(df.index))
+    #         df = df.loc[nIDs]
+    #         df.index = [nID[:-2] if nID[-2:] ==
+    #                     ".N" else nID for nID in nIDs]
+    #     else:
+    #         exit("The sample type, tumor vs normal vs all (default),\
+    #         is not correctly set!")
+    df.transpose().to_csv(path_or_buf="file.tsv", sep='\t')
 
 
 if __name__ == '__main__':

diff --git a/signature_matrices/getSigMatrices.R b/signature_matrices/getSigMatrices.R
@@ -8,7 +8,7 @@ main<-function(){
     sig_name <- trimws(argv[1])
 
     sampval <-as.numeric(trimws(argv[2]))
-    print(samval)    
+    print(sampval)    
     #reshape matrisome to adhere to standards
     if(tolower(sig_name)=='matrisome'){
       tab <- readxl::read_xlsx('/Hs_Matrisome_Masterlist_Naba et al_2012.xlsx')[,c(2:3)]|>