Skip to content

Commit

Permalink
resolved
Browse files Browse the repository at this point in the history
  • Loading branch information
famosab committed Feb 27, 2023
2 parents 2aa727c + e14f52a commit 012c694
Show file tree
Hide file tree
Showing 24 changed files with 3,578 additions and 78,226 deletions.
12 changes: 3 additions & 9 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,6 @@ growth_basis: 'minimal_uptake' # 'default_uptake' or 'minimal_uptake'
multiple: FALSE
multiple_paths: ['../C_striatum_GEMs/models/Cstr_14.xml', '../C_striatum_GEMs/models/Cstr_15.xml', '../C_striatum_GEMs/models/Cstr_16.xml', '../C_striatum_GEMs/models/Cstr_17.xml'] #, '../C_striatum_GEMs/models/Cstr_KC-Na-01.xml']

# Path to database which contains metabolites present in different media
media_db: 'data/media_db.csv'

# media to simulate growth from, available: SNM, LB, M9, SMM, CGXII, RPMI
media: ['SNM3', 'RPMI', 'CGXlab'] #'LB', 'M9', 'RPMI', 'CGXII', 'CasA'

Expand All @@ -73,13 +70,10 @@ output: xlsx #cl, xlsx, csv
# All parameters are required for all db_to_compare choices except:
# - organismid which is only required for db_to_compare: 'KEGG'/'KEGG+BioCyc'
# - and biocyc_tables which is not required for 'KEGG'
gapfill_analysis: TRUE
gapfill_analysis: FALSE
gapfill_analysis_params:
db_to_compare: 'KEGG' # One of the choices KEGG|BioCyc|GFF|KEGG+BioCyc
organismid: 'T05059' # Needs to be specified for KEGG
bigg_dbs:
- 'data/bigg_models_reactions.txt' # Path to BiGG reactions database
- 'data/bigg_models_metabolites.txt' # Path to BiGG metabolites database
gff_file: 'data/cstr.gff' # Path to RefSeq GFF file
biocyc_files:
- 'Path0' # Path to TXT file containing a SmartTable from BioCyc with the columns 'Accession-2' 'Reaction of gene' (-)
Expand All @@ -91,7 +85,7 @@ gapfill_analysis_params:
# (+) 'Compound' 'Object ID' 'Chemical Formula' 'InChI-Key' 'ChEBI'
# For all BioCyc files the order should be the same. If the organism does not occur in the BioCyc database the
# complete tables for reactions can be used with the same columns.
gapfill_model: FALSE

### ModelSEED comparison ###
modelseed: FALSE # set to False if not needed
modelseedpath: 'data/modelseed_compounds.tsv'
modelseed: FALSE # set to False if not needed
15,725 changes: 0 additions & 15,725 deletions data/bigg_models_metabolites.txt

This file was deleted.

28,302 changes: 0 additions & 28,302 deletions data/bigg_models_reactions.txt

This file was deleted.

1 change: 1 addition & 0 deletions data/database/current_bigg_db_version.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1.6.0
Binary file added data/database/data.db
Binary file not shown.
425 changes: 416 additions & 9 deletions data/sbo/sbo_database.sql → data/database/sbo_media_db.sql

Large diffs are not rendered by default.

33,993 changes: 0 additions & 33,993 deletions data/modelseed_compounds.tsv

This file was deleted.

Binary file removed data/sbo/sbo_database.db
Binary file not shown.
Binary file removed docs/build/doctrees/environment.pickle
Binary file not shown.
2,700 changes: 2,700 additions & 0 deletions id2loc.csv

Large diffs are not rendered by default.

41 changes: 24 additions & 17 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ def main():
print("Author:", __author__)
today = date.today().strftime("%Y%m%d")

rg.databases.initialise_database()

with open('config.yaml') as f:
config = yaml.safe_load(f)

Expand All @@ -37,7 +39,7 @@ def main():

elif (config['charge_corr']):
model_libsbml = rg.io.load_model_libsbml(config['model'])
rg.charges.correct_charges_modelseed(model_libsbml, config['charge_path'], config['modelseedpath'], config['charge_report_path'])
rg.charges.correct_charges_modelseed(model_libsbml, config['charge_path'], config['charge_report_path'])
model, errors = cobra.io.sbml.validate_sbml_model(config['charge_path'])
print(errors)

Expand All @@ -59,7 +61,7 @@ def main():

else:
if (config['multiple']):
growth_all = rg.comparison.simulate_all(config['multiple_paths'], config['media_db'], config['media'], config['growth_basis'])
growth_all = rg.comparison.simulate_all(config['multiple_paths'], config['media'], config['growth_basis'])
growth_all.to_csv(config['out_path'] + 'growth_' + str(today) + '_' + config['growth_basis'] + '.csv', index=False)

try:
Expand All @@ -79,14 +81,18 @@ def main():
if (config['memote']):
score = rg.investigate.run_memote(model_cobra)

if (config['gapfill_analysis']):
if (config['gapfill_analysis'] and config['gapfill_model']):
gapfill_analysis = rg.gapfill.gapfill(model_libsbml, config['gapfill_analysis_params'])
elif (config['gapfill_analysis']):
gapfill_analysis = rg.gapfill.gapfill_analysis(model_libsbml, config['gapfill_analysis_params'])
elif (config['gapfill_model']):
rg.gapfill.gapfill_model(model_libsbml, config['gapfill_model_file'])

if(config['modelseed']):
charge_mismatch, formula_mismatch = rg.modelseed.compare_to_modelseed(config['modelseedpath'], model_cobra)
charge_mismatch, formula_mismatch = rg.modelseed.compare_to_modelseed(model_cobra)

if (config['media_db'] != None):
growth_sim = rg.growth.get_growth_selected_media(model_cobra, config['media_db'], config['media'], config['growth_basis'])
if (config['media'] != None):
growth_sim = rg.growth.get_growth_selected_media(model_cobra, config['media'], config['growth_basis'])

if (config['output'] == 'cl'):
print('---')
Expand All @@ -102,7 +108,7 @@ def main():
print('Charge unbalanced reactions: ' + str(charge_unbal))
print(growth_sim)
print(egc)
if(config['gapfill_analysis']):
if(config['gapfill_analysis']) or (config['gapfill_analysis'] and config['gapfill_model']):
if type(gapfill_analysis) == tuple:
print('BioCyc - Statistics on missing entities:')
print(gapfill_analysis[0])
Expand Down Expand Up @@ -140,15 +146,16 @@ def main():
if(config['modelseed']):
charge_mismatch.to_excel(writer, sheet_name='charge mismatches', index=False)
formula_mismatch.to_excel(writer, sheet_name='formula mismatches', index=False)
if(config['gapfill_analysis']) and type(gapfill_analysis) == tuple:
with pd.ExcelWriter(config['out_path'] + name + '_gapfill_analysis_' + str(today) + '.xlsx') as writer:
gapfill_analysis[0].to_excel(writer, sheet_name='gap fill statistics', index=False)
gapfill_analysis[1].to_excel(writer, sheet_name='genes', index=False)
gapfill_analysis[2].to_excel(writer, sheet_name='metabolites', index=False)
gapfill_analysis[3].to_excel(writer, sheet_name='metabolites without BiGG IDs', index=False)
gapfill_analysis[4].to_excel(writer, sheet_name='reactions', index=False)
if len(gapfill_analysis) == 6:
gapfill_analysis[5].to_excel(writer, sheet_name='KEGG reactions', index=False)
if(config['gapfill_analysis']) or (config['gapfill_analysis'] and config['gapfill_model']):
if type(gapfill_analysis) == tuple:
with pd.ExcelWriter(config['out_path'] + name + '_gapfill_analysis_' + str(today) + '.xlsx') as writer:
gapfill_analysis[0].to_excel(writer, sheet_name='gap fill statistics', index=False)
gapfill_analysis[1].to_excel(writer, sheet_name='genes', index=False)
gapfill_analysis[2].to_excel(writer, sheet_name='metabolites', index=False)
gapfill_analysis[3].to_excel(writer, sheet_name='metabolites without BiGG IDs', index=False)
gapfill_analysis[4].to_excel(writer, sheet_name='reactions', index=False)
if len(gapfill_analysis) == 6:
gapfill_analysis[5].to_excel(writer, sheet_name='KEGG reactions', index=False)

if (config['output'] == 'csv'): # csv file
print('---')
Expand All @@ -161,7 +168,7 @@ def main():
model_info.to_csv(name + '_modelinfo.csv', index=False)
growth_sim.to_csv(name +'_growthsim.csv', index=False)
egc.to_csv(name + '_egc.csv', index=False)
if(config['gapfill_analysis']):
if(config['gapfill_analysis']) or (config['gapfill_analysis'] and config['gapfill_model']):
if type(gapfill_analysis) == tuple:
gapfill_analysis[0].to_csv(name +'_BioCyc_analysis_statistics.csv', index=False)
gapfill_analysis[1].to_csv(name +'_BioCyc_analysis_genes.csv', index=False)
Expand Down
1 change: 1 addition & 0 deletions refinegems/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# declares public API for this module
# loads functions from subscripts which are needed in main.py
import refinegems.databases
import refinegems.growth
import refinegems.modelseed
import refinegems.sboann
Expand Down
61 changes: 48 additions & 13 deletions refinegems/analysis_biocyc.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def extract_metabolites_from_reactions(missing_reactions: pd.DataFrame):


def get_missing_reactions(
model_libsbml: Model, bigg_db: str, genes2reaction: pd.DataFrame, inpath: str
model_libsbml: Model, genes2reaction: pd.DataFrame, inpath: str
) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Subsets the BioCyc table with the following columns:
'Reaction' 'Reactants of reaction' 'Products of reaction' 'EC-Number' 'Reaction-Direction' 'Spontaneous?'
Expand All @@ -148,7 +148,6 @@ def get_missing_reactions(
Params:
- model_libsbml (Model): Model read in with libSBML
- bigg_db (str): Path to TXT file containing all reactions from the BiGG database
- genes2reaction (pd.DataFrame): A pandas dataframe containing only 'Accession-2' & 'Reactions' for the
missing genes
- inpath (str): Path to file from BioCyc containing the following columns:
Expand Down Expand Up @@ -178,7 +177,7 @@ def get_missing_reactions(
statistics_df.loc['Reaction', 'Total'] = len(missing_reactions['Reaction'].unique().tolist())

# Get BiGG BioCyc
bigg2biocyc_reacs = get_bigg2other_db(bigg_db, 'BioCyc')
bigg2biocyc_reacs = get_bigg2other_db('BioCyc')

# Subset missing_reactions with BiGG BioCyc
missing_reactions.rename(columns={'Reaction': 'BioCyc'}, inplace=True)
Expand Down Expand Up @@ -238,15 +237,14 @@ def get_missing_metabolites_wo_BiGG(


def get_missing_metabolites(
model_libsbml: Model, bigg_db: str, metabs_from_reacs: pd.DataFrame, inpath: str
model_libsbml: Model, metabs_from_reacs: pd.DataFrame, inpath: str
) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Subsets the BioCyc table with the following columns: 'Compound' 'Chemical Formula' 'InChI-Key' 'ChEBI'
to obtain the missing metabolites with all the corresponding data
& Adds the according BiGG Compound identifiers
Params:
- model_libsml (Model): Model read in with libSBML
- bigg_db (str): Path to TXT file containing all metabolites from the BiGG database
- model_libsml (Model): Model read in with libSBML
- metabs_from_reacs (pd.DataFrame): A pandas dataframe containing only the metabolites corresponding to the
missing reactions
- inpath (str): Path to file from BioCyc containing the following columns:
Expand All @@ -269,7 +267,7 @@ def get_missing_metabolites(
statistics_df.loc['Metabolite', 'Total'] = len(missing_metabolites['BioCyc'].unique().tolist())

# Get BiGG BioCyc
bigg2biocyc_metabs = get_bigg2other_db(bigg_db, 'BioCyc', True)
bigg2biocyc_metabs = get_bigg2other_db('BioCyc', True)

# Subset missing_metabolites with BiGG BioCyc -> To get only metabolites with BiGG IDs
missing_metabolites = bigg2biocyc_metabs.merge(missing_metabolites, on='BioCyc')
Expand Down Expand Up @@ -317,7 +315,7 @@ def add_charges_chemical_formulae_to_metabs(missing_metabs: pd.DataFrame) -> pd.
"""Adds charges & chemical formulae from CHEBI to the provided dataframe
Params:
- missing_metabolites (DataFrame): A pandas dataframe containing metabolites & the respective CHEBI IDs
- missing_metabs (DataFrame): A pandas dataframe containing metabolites & the respective CHEBI IDs
Returns:
-> The input pandas dataframe extended with the charges & chemical formulas obtained from CHEBI
Expand Down Expand Up @@ -357,23 +355,59 @@ def find_formula(row: pd.Series) -> str:
chem_formula = chem_form if chem_form != 'nan' else 'No formula'
return chem_formula

missing_metabs['charge'] = missing_metabs.apply(find_charge, axis=1) #str(row['ChEBI']), str(row['bigg_id'])
missing_metabs['charge'] = missing_metabs.apply(find_charge, axis=1)
missing_metabs['New Chemical Formula'] = missing_metabs.apply(find_formula, axis=1)
missing_metabs['Chemical Formula'] = missing_metabs['New Chemical Formula']
missing_metabs.drop('New Chemical Formula', axis=1, inplace=True)

return missing_metabs

# Inspired by Dr. Reihaneh Mostolizadeh's function to add BioCyc reactions to a model
def replace_reaction_direction_with_fluxes(missing_reacs: pd.DataFrame) -> pd.DataFrame:
"""Extracts the flux lower & upper bounds for each reaction through the entries in column 'Reaction-Direction'
Params:
- missing_reacs (DataFrame): A pandas dataframe containing reactions & the respective Reaction-Directions
Returns:
-> The input pandas dataframe extended with the fluxes lower & upper bounds obtained from
the Reaction-Directions
"""

def get_fluxes(row: pd.Series) -> dict[str: str]:
direction = row['Reaction-Direction']
fluxes = {}

if type(direction) == float:
# Use default bounds as described in readthedocs from COBRApy
fluxes['lower_bound'] = 'cobra_0_bound'
fluxes['upper_bound'] = 'cobra_default_ub'
elif 'RIGHT-TO-LEFT' in direction:
fluxes['lower_bound'] = 'cobra_default_lb'
fluxes['upper_bound'] = 'cobra_0_bound'
elif 'LEFT-TO-RIGHT' in direction:
fluxes['lower_bound'] = 'cobra_0_bound'
fluxes['upper_bound'] = 'cobra_default_ub'
elif 'REVERSIBLE' in direction:
fluxes['lower_bound'] = 'cobra_default_lb'
fluxes['upper_bound'] = 'cobra_default_ub'

return str(fluxes)

missing_reacs['fluxes'] = missing_reacs.apply(get_fluxes, axis=1)
missing_reacs.drop('Reaction-Direction', axis=1, inplace=True)

return missing_reacs


def biocyc_gene_comp(
model_libsbml: Model, biocyc_file_paths: list[str], bigg_dbs: list[str]
model_libsbml: Model, biocyc_file_paths: list[str]
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""Main function to retrieve the dataframes for missing genes, metabolites and reactions from BioCyc
Params:
- model_libsbml (Model): libSBML Model object
- biocyc_file_paths (list): A list of the files required for the BioCyc analysis
- bigg_dbs (list): A list of TXT files where the BiGG metabolites & reactions are stored
Returns:
-> Five dataframes (1) - (5):
Expand All @@ -385,11 +419,12 @@ def biocyc_gene_comp(
"""
# Extract missing reactions from all missing genes
genes2reactions = get_missing_genes2reactions(model_libsbml, biocyc_file_paths[0])
metabs_from_reacs, missing_reactions_df = get_missing_reactions(model_libsbml, bigg_dbs[0], genes2reactions, biocyc_file_paths[1])
metabs_from_reacs, missing_reactions_df = get_missing_reactions(model_libsbml, genes2reactions, biocyc_file_paths[1])
missing_reactions_df = add_stoichiometric_values_to_reacs(missing_reactions_df)
missing_reactions_df = replace_reaction_direction_with_fluxes(missing_reactions_df)

# Extract missing metabolites that belong to the missing reactions
missing_metabolites_df, missing_metabs_wo_BiGG_df = get_missing_metabolites(model_libsbml, bigg_dbs[1], metabs_from_reacs, biocyc_file_paths[2])
missing_metabolites_df, missing_metabs_wo_BiGG_df = get_missing_metabolites(model_libsbml, metabs_from_reacs, biocyc_file_paths[2])
missing_metabolites_df = add_charges_chemical_formulae_to_metabs(missing_metabolites_df)
# If metabolites should be added due to reactions but no BiGG ID was found
if len(missing_metabs_wo_BiGG_df.index) > 0:
Expand Down
Loading

0 comments on commit 012c694

Please sign in to comment.