resolved

draeger-lab · Feb 27, 2023 · 012c694 · 012c694
2 parents 2aa727c + e14f52a
commit 012c694
Show file tree

Hide file tree

Showing 24 changed files with 3,578 additions and 78,226 deletions.
diff --git a/config.yaml b/config.yaml
@@ -55,9 +55,6 @@ growth_basis: 'minimal_uptake' # 'default_uptake' or 'minimal_uptake'
 multiple: FALSE
 multiple_paths: ['../C_striatum_GEMs/models/Cstr_14.xml', '../C_striatum_GEMs/models/Cstr_15.xml', '../C_striatum_GEMs/models/Cstr_16.xml', '../C_striatum_GEMs/models/Cstr_17.xml'] #, '../C_striatum_GEMs/models/Cstr_KC-Na-01.xml']
 
-# Path to database which contains metabolites present in different media
-media_db: 'data/media_db.csv' 
-
 # media to simulate growth from, available: SNM, LB, M9, SMM, CGXII, RPMI
 media: ['SNM3', 'RPMI', 'CGXlab'] #'LB', 'M9', 'RPMI', 'CGXII', 'CasA'
 
@@ -73,13 +70,10 @@ output: xlsx #cl, xlsx, csv
 # All parameters are required for all db_to_compare choices except:
 # - organismid which is only required for db_to_compare: 'KEGG'/'KEGG+BioCyc'
 # - and biocyc_tables which is not required for 'KEGG'
-gapfill_analysis: TRUE
+gapfill_analysis: FALSE
 gapfill_analysis_params:
   db_to_compare: 'KEGG'  # One of the choices KEGG|BioCyc|GFF|KEGG+BioCyc
   organismid: 'T05059'  # Needs to be specified for KEGG
-  bigg_dbs:
-    - 'data/bigg_models_reactions.txt'  # Path to BiGG reactions database
-    - 'data/bigg_models_metabolites.txt'  # Path to BiGG metabolites database
   gff_file: 'data/cstr.gff'  # Path to RefSeq GFF file 
   biocyc_files: 
     - 'Path0'  # Path to TXT file containing a SmartTable from BioCyc with the columns 'Accession-2' 'Reaction of gene' (-)
@@ -91,7 +85,7 @@ gapfill_analysis_params:
 # (+) 'Compound' 'Object ID' 'Chemical Formula' 'InChI-Key' 'ChEBI'
 # For all BioCyc files the order should be the same. If the organism does not occur in the BioCyc database the
 # complete tables for reactions can be used with the same columns.
+gapfill_model: FALSE
 
 ### ModelSEED comparison ###
-modelseed: FALSE # set to False if not needed
-modelseedpath: 'data/modelseed_compounds.tsv'
+modelseed: FALSE # set to False if not needed
diff --git a/data/bigg_models_metabolites.txt b/data/bigg_models_metabolites.txt
diff --git a/data/bigg_models_reactions.txt b/data/bigg_models_reactions.txt
diff --git a/data/database/current_bigg_db_version.txt b/data/database/current_bigg_db_version.txt
@@ -0,0 +1 @@
+1.6.0
diff --git a/data/database/data.db b/data/database/data.db
diff --git a/data/sbo/sbo_database.sql → data/database/sbo_media_db.sql b/data/sbo/sbo_database.sql → data/database/sbo_media_db.sql
diff --git a/data/modelseed_compounds.tsv b/data/modelseed_compounds.tsv
diff --git a/data/sbo/sbo_database.db b/data/sbo/sbo_database.db
diff --git a/docs/build/doctrees/environment.pickle b/docs/build/doctrees/environment.pickle
diff --git a/id2loc.csv b/id2loc.csv
diff --git a/main.py b/main.py
@@ -14,6 +14,8 @@ def main():
     print("Author:", __author__)
     today = date.today().strftime("%Y%m%d")
 
+    rg.databases.initialise_database()
+
     with open('config.yaml') as f:
         config = yaml.safe_load(f)
 
@@ -37,7 +39,7 @@ def main():
 
     elif (config['charge_corr']):
         model_libsbml = rg.io.load_model_libsbml(config['model'])
-        rg.charges.correct_charges_modelseed(model_libsbml, config['charge_path'], config['modelseedpath'], config['charge_report_path'])
+        rg.charges.correct_charges_modelseed(model_libsbml, config['charge_path'], config['charge_report_path'])
         model, errors = cobra.io.sbml.validate_sbml_model(config['charge_path'])
         print(errors)
 
@@ -59,7 +61,7 @@ def main():
 
     else:
         if (config['multiple']):
-            growth_all = rg.comparison.simulate_all(config['multiple_paths'], config['media_db'], config['media'], config['growth_basis'])
+            growth_all = rg.comparison.simulate_all(config['multiple_paths'], config['media'], config['growth_basis'])
             growth_all.to_csv(config['out_path'] + 'growth_' + str(today) + '_' + config['growth_basis'] + '.csv', index=False)
 
         try:    
@@ -79,14 +81,18 @@ def main():
             if (config['memote']):
                 score = rg.investigate.run_memote(model_cobra)
 
-            if (config['gapfill_analysis']):
+            if (config['gapfill_analysis'] and config['gapfill_model']):
+                gapfill_analysis = rg.gapfill.gapfill(model_libsbml, config['gapfill_analysis_params'])    
+            elif (config['gapfill_analysis']):
                 gapfill_analysis = rg.gapfill.gapfill_analysis(model_libsbml, config['gapfill_analysis_params'])
+            elif (config['gapfill_model']):
+                rg.gapfill.gapfill_model(model_libsbml, config['gapfill_model_file'])
 
             if(config['modelseed']):
-                charge_mismatch, formula_mismatch = rg.modelseed.compare_to_modelseed(config['modelseedpath'], model_cobra)
+                charge_mismatch, formula_mismatch = rg.modelseed.compare_to_modelseed(model_cobra)
 
-            if (config['media_db'] != None):
-                growth_sim = rg.growth.get_growth_selected_media(model_cobra, config['media_db'], config['media'], config['growth_basis'])
+            if (config['media'] != None):
+                growth_sim = rg.growth.get_growth_selected_media(model_cobra, config['media'], config['growth_basis'])
 
             if (config['output'] == 'cl'):
                 print('---')
@@ -102,7 +108,7 @@ def main():
                 print('Charge unbalanced reactions: ' + str(charge_unbal))
                 print(growth_sim)
                 print(egc)
-                if(config['gapfill_analysis']): 
+                if(config['gapfill_analysis']) or (config['gapfill_analysis'] and config['gapfill_model']): 
                     if type(gapfill_analysis) == tuple:
                         print('BioCyc - Statistics on missing entities:')
                         print(gapfill_analysis[0])
@@ -140,15 +146,16 @@ def main():
                     if(config['modelseed']):
                         charge_mismatch.to_excel(writer, sheet_name='charge mismatches', index=False)
                         formula_mismatch.to_excel(writer, sheet_name='formula mismatches', index=False)
-                if(config['gapfill_analysis']) and type(gapfill_analysis) == tuple: 
-                    with pd.ExcelWriter(config['out_path'] + name + '_gapfill_analysis_' + str(today) + '.xlsx') as writer:
-                        gapfill_analysis[0].to_excel(writer, sheet_name='gap fill statistics', index=False)
-                        gapfill_analysis[1].to_excel(writer, sheet_name='genes', index=False)
-                        gapfill_analysis[2].to_excel(writer, sheet_name='metabolites', index=False)
-                        gapfill_analysis[3].to_excel(writer, sheet_name='metabolites without BiGG IDs', index=False)
-                        gapfill_analysis[4].to_excel(writer, sheet_name='reactions', index=False)
-                        if len(gapfill_analysis) == 6:
-                            gapfill_analysis[5].to_excel(writer, sheet_name='KEGG reactions', index=False)
+                if(config['gapfill_analysis']) or (config['gapfill_analysis'] and config['gapfill_model']): 
+                    if type(gapfill_analysis) == tuple:
+                        with pd.ExcelWriter(config['out_path'] + name + '_gapfill_analysis_' + str(today) + '.xlsx') as writer:
+                            gapfill_analysis[0].to_excel(writer, sheet_name='gap fill statistics', index=False)
+                            gapfill_analysis[1].to_excel(writer, sheet_name='genes', index=False)
+                            gapfill_analysis[2].to_excel(writer, sheet_name='metabolites', index=False)
+                            gapfill_analysis[3].to_excel(writer, sheet_name='metabolites without BiGG IDs', index=False)
+                            gapfill_analysis[4].to_excel(writer, sheet_name='reactions', index=False)
+                            if len(gapfill_analysis) == 6:
+                                gapfill_analysis[5].to_excel(writer, sheet_name='KEGG reactions', index=False)
 
             if (config['output'] == 'csv'): # csv file
                 print('---')
@@ -161,7 +168,7 @@ def main():
                 model_info.to_csv(name + '_modelinfo.csv', index=False)
                 growth_sim.to_csv(name +'_growthsim.csv', index=False)
                 egc.to_csv(name + '_egc.csv', index=False)
-                if(config['gapfill_analysis']):
+                if(config['gapfill_analysis']) or (config['gapfill_analysis'] and config['gapfill_model']):
                     if type(gapfill_analysis) == tuple:
                         gapfill_analysis[0].to_csv(name +'_BioCyc_analysis_statistics.csv', index=False)
                         gapfill_analysis[1].to_csv(name +'_BioCyc_analysis_genes.csv', index=False)

diff --git a/refinegems/__init__.py b/refinegems/__init__.py
@@ -1,5 +1,6 @@
 # declares public API for this module
 # loads functions from subscripts which are needed in main.py
+import refinegems.databases
 import refinegems.growth
 import refinegems.modelseed
 import refinegems.sboann

diff --git a/refinegems/analysis_biocyc.py b/refinegems/analysis_biocyc.py
@@ -139,7 +139,7 @@ def extract_metabolites_from_reactions(missing_reactions: pd.DataFrame):
 
 
 def get_missing_reactions(
-   model_libsbml: Model, bigg_db: str, genes2reaction: pd.DataFrame, inpath: str
+   model_libsbml: Model, genes2reaction: pd.DataFrame, inpath: str
    ) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Subsets the BioCyc table with the following columns: 
    'Reaction' 'Reactants of reaction' 'Products of reaction' 'EC-Number' 'Reaction-Direction' 'Spontaneous?'
@@ -148,7 +148,6 @@ def get_missing_reactions(
 
       Params:
          - model_libsbml (Model):         Model read in with libSBML
-         - bigg_db (str):                 Path to TXT file containing all reactions from the BiGG database
          - genes2reaction (pd.DataFrame): A pandas dataframe containing only 'Accession-2' & 'Reactions' for the 
                                           missing genes
          - inpath (str):                  Path to file from BioCyc containing the following columns:
@@ -178,7 +177,7 @@ def get_missing_reactions(
    statistics_df.loc['Reaction', 'Total'] = len(missing_reactions['Reaction'].unique().tolist())
 
    # Get BiGG BioCyc
-   bigg2biocyc_reacs = get_bigg2other_db(bigg_db, 'BioCyc')
+   bigg2biocyc_reacs = get_bigg2other_db('BioCyc')
 
    # Subset missing_reactions with BiGG BioCyc
    missing_reactions.rename(columns={'Reaction': 'BioCyc'}, inplace=True)
@@ -238,15 +237,14 @@ def get_missing_metabolites_wo_BiGG(
 
 
 def get_missing_metabolites(
-   model_libsbml: Model, bigg_db: str, metabs_from_reacs: pd.DataFrame, inpath: str
+   model_libsbml: Model, metabs_from_reacs: pd.DataFrame, inpath: str
    ) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Subsets the BioCyc table with the following columns: 'Compound' 'Chemical Formula' 'InChI-Key' 'ChEBI'
       to obtain the missing metabolites with all the corresponding data
       & Adds the according BiGG Compound identifiers
       
       Params:
-         - model_libsml (Model):             Model read in with libSBML
-         - bigg_db (str):                    Path to TXT file containing all metabolites from the BiGG database 
+         - model_libsml (Model):             Model read in with libSBML 
          - metabs_from_reacs (pd.DataFrame): A pandas dataframe containing only the metabolites corresponding to the 
                                              missing reactions
          - inpath (str):                     Path to file from BioCyc containing the following columns:
@@ -269,7 +267,7 @@ def get_missing_metabolites(
    statistics_df.loc['Metabolite', 'Total'] = len(missing_metabolites['BioCyc'].unique().tolist())
 
    # Get BiGG BioCyc
-   bigg2biocyc_metabs = get_bigg2other_db(bigg_db, 'BioCyc', True)
+   bigg2biocyc_metabs = get_bigg2other_db('BioCyc', True)
 
    # Subset missing_metabolites with BiGG BioCyc -> To get only metabolites with BiGG IDs
    missing_metabolites = bigg2biocyc_metabs.merge(missing_metabolites, on='BioCyc')
@@ -317,7 +315,7 @@ def add_charges_chemical_formulae_to_metabs(missing_metabs: pd.DataFrame) -> pd.
    """Adds charges & chemical formulae from CHEBI to the provided dataframe
 
       Params:
-         - missing_metabolites (DataFrame): A pandas dataframe containing metabolites & the respective CHEBI IDs
+         - missing_metabs (DataFrame): A pandas dataframe containing metabolites & the respective CHEBI IDs
          
       Returns:
          -> The input pandas dataframe extended with the charges & chemical formulas obtained from CHEBI
@@ -357,23 +355,59 @@ def find_formula(row: pd.Series) -> str:
             chem_formula = chem_form if chem_form != 'nan' else 'No formula'
       return chem_formula
 
-   missing_metabs['charge'] = missing_metabs.apply(find_charge, axis=1)  #str(row['ChEBI']), str(row['bigg_id'])
+   missing_metabs['charge'] = missing_metabs.apply(find_charge, axis=1)
    missing_metabs['New Chemical Formula'] = missing_metabs.apply(find_formula, axis=1)
    missing_metabs['Chemical Formula'] = missing_metabs['New Chemical Formula']
    missing_metabs.drop('New Chemical Formula', axis=1, inplace=True)
 
    return missing_metabs
 
+# Inspired by Dr. Reihaneh Mostolizadeh's function to add BioCyc reactions to a model
+def replace_reaction_direction_with_fluxes(missing_reacs: pd.DataFrame) -> pd.DataFrame:
+   """Extracts the flux lower & upper bounds for each reaction through the entries in column 'Reaction-Direction'
+   
+      Params:
+         - missing_reacs (DataFrame): A pandas dataframe containing reactions & the respective Reaction-Directions
+         
+      Returns:
+         -> The input pandas dataframe extended with the fluxes lower & upper bounds obtained from 
+            the Reaction-Directions
+   """
+
+   def get_fluxes(row: pd.Series) -> dict[str: str]:
+      direction = row['Reaction-Direction']
+      fluxes = {}
+
+      if type(direction) == float:
+         # Use default bounds as described in readthedocs from COBRApy
+         fluxes['lower_bound'] = 'cobra_0_bound'
+         fluxes['upper_bound'] = 'cobra_default_ub'
+      elif 'RIGHT-TO-LEFT' in direction:
+         fluxes['lower_bound'] = 'cobra_default_lb'
+         fluxes['upper_bound'] = 'cobra_0_bound'
+      elif 'LEFT-TO-RIGHT' in direction:
+         fluxes['lower_bound'] = 'cobra_0_bound'
+         fluxes['upper_bound'] = 'cobra_default_ub'
+      elif 'REVERSIBLE' in direction:
+         fluxes['lower_bound'] = 'cobra_default_lb'
+         fluxes['upper_bound'] = 'cobra_default_ub'
+
+      return str(fluxes)
+
+   missing_reacs['fluxes'] = missing_reacs.apply(get_fluxes, axis=1)
+   missing_reacs.drop('Reaction-Direction', axis=1, inplace=True)
+
+   return missing_reacs
+
 
 def biocyc_gene_comp(
-   model_libsbml: Model, biocyc_file_paths: list[str], bigg_dbs: list[str]
+   model_libsbml: Model, biocyc_file_paths: list[str]
    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Main function to retrieve the dataframes for missing genes, metabolites and reactions from BioCyc
    
       Params:
          - model_libsbml (Model):      libSBML Model object
          - biocyc_file_paths (list):   A list of the files required for the BioCyc analysis
-         - bigg_dbs (list):            A list of TXT files where the BiGG metabolites & reactions are stored
          
       Returns: 
          -> Five dataframes (1) - (5):
@@ -385,11 +419,12 @@ def biocyc_gene_comp(
    """
    # Extract missing reactions from all missing genes
    genes2reactions = get_missing_genes2reactions(model_libsbml, biocyc_file_paths[0])
-   metabs_from_reacs, missing_reactions_df = get_missing_reactions(model_libsbml, bigg_dbs[0], genes2reactions, biocyc_file_paths[1])
+   metabs_from_reacs, missing_reactions_df = get_missing_reactions(model_libsbml, genes2reactions, biocyc_file_paths[1])
    missing_reactions_df = add_stoichiometric_values_to_reacs(missing_reactions_df)
+   missing_reactions_df = replace_reaction_direction_with_fluxes(missing_reactions_df)
 
    # Extract missing metabolites that belong to the missing reactions
-   missing_metabolites_df, missing_metabs_wo_BiGG_df = get_missing_metabolites(model_libsbml, bigg_dbs[1], metabs_from_reacs, biocyc_file_paths[2])
+   missing_metabolites_df, missing_metabs_wo_BiGG_df = get_missing_metabolites(model_libsbml, metabs_from_reacs, biocyc_file_paths[2])
    missing_metabolites_df = add_charges_chemical_formulae_to_metabs(missing_metabolites_df)
    # If metabolites should be added due to reactions but no BiGG ID was found
    if len(missing_metabs_wo_BiGG_df.index) > 0: