Adding new class for wrapping typical ADVI workflow

PNNL-CompBio · Nov 30, 2023 · cfe80dc · cfe80dc
1 parent e0f4445
commit cfe80dc
Showing 1 changed file with 255 additions and 0 deletions.
diff --git a/src/emll/bmca.py b/src/emll/bmca.py
@@ -0,0 +1,255 @@
+"""Code to actually implement the BMCA workflow in emll."""
+
+import gzip
+import os
+
+import cloudpickle
+import cobra
+import numpy as np
+import pandas as pd
+import pymc as pm
+import pytensor.tensor as pt
+
+import emll
+
+os.environ["MKL_THREADING_LAYER"] = "GNU"
+
+
+class BMCA:
+    """Class to build out BMCA experiment."""
+
+    def __init__(
+        self,
+        model_path,
+        v_star_path,
+        metabolite_concentrations_path,
+        boundary_fluxes_path,
+        enzyme_measurements_path,
+        reference_state,
+    ):
+        """Initialize the BMCA (Biochemical Model Calibration and Analysis) instance.
+
+        Parameters
+        ----------
+        model_path : str
+            The path to the YAML file containing the biochemical model.
+
+        v_star_path : str
+            The path to the CSV file containing v_star data.
+
+        metabolite_concentrations_path : str
+            The path to the CSV file containing metabolite concentrations data.
+
+        boundary_fluxes_path : str
+            The path to the CSV file containing boundary fluxes data.
+
+        enzyme_measurements_path : str
+            The path to the CSV file containing enzyme measurements data.
+
+        reference_state : str
+            The reference state identifier for the experiment.
+
+        pt.ributes
+        ----------
+        model : cobra.Model
+            The biochemical model loaded from the provided YAML file.
+
+        v_star : pandas.DataFrame
+            DataFrame containing v_star data loaded from the CSV file.
+
+        x : pandas.DataFrame
+            DataFrame containing metabolite concentrations data loaded from the CSV file.
+
+        v : pandas.DataFrame
+            DataFrame containing boundary fluxes data loaded from the CSV file.
+
+        e : pandas.DataFrame
+            DataFrame containing enzyme measurements data loaded from the CSV file.
+
+        ref_state : str
+            The reference state identifier used in the experiment.
+        """
+        self.model = cobra.io.load_yaml_model(model_path)
+        self.v_star = pd.read_csv(v_star_path, header=None, index_col=0)[1]
+        self.x = pd.read_csv(metabolite_concentrations_path, index_col=0)
+        self.v = pd.read_csv(boundary_fluxes_path, index_col=0)
+        self.e = pd.read_csv(enzyme_measurements_path, index_col=0)
+
+        self.ref_state = reference_state
+
+    def preprocess_data(self):
+        """Read in cobra model as components."""
+        # Establish compartments for reactions and metabolites
+        self.r_compartments = [
+            r.compartments if "e" not in r.compartments else "t" for r in self.model.reactions
+        ]
+
+        self.r_compartments[self.model.reactions.index("SUCCt2r")] = "c"
+        self.r_compartments[self.model.reactions.index("ACt2r")] = "c"
+
+        for rxn in self.model.exchanges:
+            self.r_compartments[self.model.reactions.index(rxn)] = "t"
+
+            self.m_compartments = [m.compartment for m in self.model.metabolites]
+        # Reindex arrays to have the same column ordering
+        to_consider = self.v.columns
+        self.v = self.v.loc[:, to_consider]
+        self.x = self.x.loc[:, to_consider]
+        self.e = self.e.loc[:, to_consider]
+
+        self.n_exp = len(to_consider) - 1
+
+        # Drop reference state
+        self.vn = (
+            self.v.drop(self.ref_state)
+            .divide(self.v.drop(self.ref_state)["flux"], axis=0)
+            .drop("flux", axis=1)
+            .T
+        )
+        self.xn = (self.x.subtract(self.x[self.ref_state], 0) * np.log(2)).T
+        self.en = (2 ** self.e.subtract(self.e[self.ref_state], 0)).T
+
+        # Get indexes for measured values
+        self.x_inds = np.array([self.model.metabolites.index(met) for met in self.xn.columns])
+        self.e_inds = np.array([self.model.reactions.index(rxn) for rxn in self.en.columns])
+        self.v_inds = np.array([self.model.reactions.index(rxn) for rxn in self.vn.columns])
+
+        self.e_laplace_inds = []
+        self.e_zero_inds = []
+
+        for i, rxn in enumerate(self.model.reactions):
+            if rxn.id not in self.en.columns:
+                if ("e" not in rxn.compartments) and (len(rxn.compartments) == 1):
+                    self.e_laplace_inds += [i]
+                else:
+                    self.e_zero_inds += [i]
+
+        self.e_laplace_inds = np.array(self.e_laplace_inds)
+        self.e_zero_inds = np.array(self.e_zero_inds)
+        self.e_indexer = np.hstack([self.e_inds, self.e_laplace_inds, self.e_zero_inds]).argsort()
+
+        self.N = cobra.util.create_stoichiometric_matrix(self.model)
+        self.Ex = emll.util.create_elasticity_matrix(self.model)
+        self.Ey = emll.util.create_Ey_matrix(self.model)
+
+        self.Ex *= 0.1 + 0.8 * np.random.rand(*self.Ex.shape)
+
+        self.ll = emll.LinLogLeastNorm(self.N, self.Ex, self.Ey, self.v_star.values, driver="gelsy")
+
+    def build_pymc_model(self):
+        """Build the PyMC probabilistic model."""
+        with pm.Model() as pymc_model:
+            # Priors on elasticity values
+            self.Ex_t = pm.Deterministic(
+                "Ex",
+                emll.initialize_elasticity(
+                    self.ll.N,
+                    b=0.01,
+                    sigma=1,
+                    alpha=None,
+                    m_compartments=self.m_compartments,
+                    r_compartments=self.r_compartments,
+                ),
+            )
+
+            self.Ey_t = T.as_tensor_variable(self.Ey)
+
+            e_measured = pm.Normal(
+                "log_e_measured",
+                mu=np.log(self.en),
+                sigma=0.2,
+                shape=(self.n_exp, len(self.e_inds)),
+            )
+            e_unmeasured = pm.Laplace(
+                "log_e_unmeasured", mu=0, b=0.1, shape=(self.n_exp, len(self.e_laplace_inds))
+            )
+            log_en_t = pt.concatenate(
+                [e_measured, e_unmeasured, pt.zeros((self.n_exp, len(self.e_zero_inds)))], axis=1
+            )[:, self.e_indexer]
+
+            pm.Deterministic("log_en_t", log_en_t)
+
+            # Priors on external concentrations
+            yn_t = pm.Normal(
+                "yn_t",
+                mu=0,
+                sigma=10,
+                shape=(self.n_exp, self.ll.ny),
+                initval=0.1 * np.random.randn(self.n_exp, self.ll.ny),
+            )
+
+            chi_ss, vn_ss = self.ll.steady_state_pytensor(
+                self.Ex_t, self.Ey_t, pt.exp(log_en_t), yn_t
+            )
+            pm.Deterministic("chi_ss", chi_ss)
+            pm.Deterministic("vn_ss", vn_ss)
+
+            log_vn_ss = pt.log(pt.clip(vn_ss[:, self.v_inds], 1e-8, 1e8))
+            log_vn_ss = pt.clip(log_vn_ss, -1.5, 1.5)
+
+            chi_clip = pt.clip(chi_ss[:, self.x_inds], -1.5, 1.5)
+
+            chi_obs = pm.Normal(
+                "chi_obs", mu=chi_clip, sigma=0.2, observed=self.xn.clip(lower=-1.5, upper=1.5)
+            )
+            log_vn_obs = pm.Normal(
+                "vn_obs",
+                mu=log_vn_ss,
+                sigma=0.1,
+                observed=np.log(self.vn).clip(lower=-1.5, upper=1.5),
+            )
+
+        self.pymc_model = pymc_model
+
+    def run_emll(self):
+        """Build linlog model and run inference."""
+        with self.pymc_model:
+            approx = pm.ADVI()
+            hist = approx.fit(
+                n=40000,
+                obj_optimizer=pm.adagrad_window(learning_rate=0.005),
+                total_grad_norm_constraint=100,
+            )
+
+            # trace = hist.sample(500)
+            # ppc = pm.sample_ppc(trace)
+
+        return approx, hist
+
+    def save_results(self, approx, hist):
+        """Save ADVI results in cloudpickle."""
+        with gzip.open("data/{self.model.name}.pgz", "wb") as f:
+            cloudpickle.dump(
+                {
+                    "approx": approx,
+                    "hist": hist,
+                },
+                f,
+            )
+
+
+def main(
+    model_path,
+    v_star_path,
+    metabolite_concentrations_path,
+    boundary_fluxes_path,
+    enzyme_measurements_path,
+    reference_state,
+):
+    """Run BMCA for the provided model."""
+    emll_model = BMCA(
+        model_path,
+        v_star_path,
+        metabolite_concentrations_path,
+        boundary_fluxes_path,
+        enzyme_measurements_path,
+        reference_state,
+    )
+    emll_model.preprocess_data()
+    emll_model.build_pymc_model()
+    approx, hist = emll_model.run_emll()
+    emll_model.save_results(approx, hist)
+
+
+if __name__ == "__main__":
+    pass