From 024d7206aa325146d49bef95adeda592f30aa220 Mon Sep 17 00:00:00 2001 From: Matthew Evans <7916000+ml-evs@users.noreply.github.com> Date: Tue, 2 Apr 2024 16:37:59 +0100 Subject: [PATCH 1/5] Backwards compatibility of test data with pymatgen (#206) * Enable control of matminer `ignore_errors` arg from MODFeaturizer Linting * Dynamically patch old structural data to work with latest pymatgen when depickling * Refactor tests to allow new featurizer columns to exist as long as old ones are present * Now try to update pymatgen again * Also fix composition container loading * Add tensorflow upper bound from other PR --- modnet/featurizers/featurizers.py | 21 +++++++++++++++++---- modnet/tests/conftest.py | 21 ++++++++++++++++++++- modnet/tests/test_preprocessing.py | 23 +++++++---------------- requirements.txt | 2 +- setup.py | 2 +- 5 files changed, 46 insertions(+), 23 deletions(-) diff --git a/modnet/featurizers/featurizers.py b/modnet/featurizers/featurizers.py index 8de3d5af..ee103fa8 100644 --- a/modnet/featurizers/featurizers.py +++ b/modnet/featurizers/featurizers.py @@ -141,7 +141,10 @@ def _fit_apply_featurizers( _featurizers.set_n_jobs(self._n_jobs) return _featurizers.featurize_dataframe( - df, column, multiindex=True, ignore_errors=True + df, + column, + multiindex=True, + ignore_errors=getattr(self, "ignore_errors", True), ) elif mode == "single": @@ -164,7 +167,10 @@ def _fit_apply_featurizers( ) start = time.monotonic_ns() df = featurizer.featurize_dataframe( - df, column, multiindex=True, ignore_errors=True + df, + column, + multiindex=True, + ignore_errors=getattr(self, "ignore_errors", True), ) LOG.info( f"Applied featurizer {featurizer.__class__.__name__} to column {column!r} in {(time.monotonic_ns() - start) * 1e-9} seconds" @@ -244,7 +250,11 @@ def featurize_composition(self, df: pd.DataFrame) -> pd.DataFrame: else: df = CompositionToOxidComposition( max_sites=-1 if getattr(self, "continuous_only", False) else None - ).featurize_dataframe(df, col_id=col_comp, ignore_errors=True) + ).featurize_dataframe( + df, + col_id=col_comp, + ignore_errors=getattr(self, "ignore_errors", True), + ) df = self._fit_apply_featurizers( df, self.oxid_composition_featurizers, @@ -311,7 +321,10 @@ def featurize_site( fingerprint, stats=self.site_stats ) df = site_stats_fingerprint.featurize_dataframe( - df, "Input data|structure", multiindex=False, ignore_errors=True + df, + "Input data|structure", + multiindex=False, + ignore_errors=getattr(self, "ignore_errors", True), ) if aliases: diff --git a/modnet/tests/conftest.py b/modnet/tests/conftest.py index d5061c90..44e8aa1a 100644 --- a/modnet/tests/conftest.py +++ b/modnet/tests/conftest.py @@ -1,7 +1,9 @@ import pytest from pathlib import Path +from modnet.preprocessing import CompositionContainer from modnet.utils import get_hash_of_file +from pymatgen.core import Structure _TEST_DATA_HASHES = { @@ -41,7 +43,24 @@ def _load_moddata(filename): # what it was when created assert get_hash_of_file(data_file) == _TEST_DATA_HASHES[filename] - return MODData.load(data_file) + moddata = MODData.load(data_file) + # For forwards compatibility with pymatgen, we have to patch our old test data to have the following attributes + # to allow for depickling + # This is hopefully only a temporary solution, and in future, we should serialize pymatgen objects + # with Monty's `from_dict`/`to_dict` to avoid having to hack this private interface + for ind, s in enumerate(moddata.structures): + if isinstance(s, Structure): + # assume all previous data was periodic 
+ moddata.structures[ind].lattice._pbc = [True, True, True] + for jnd, site in enumerate(s.sites): + # assume all of our previous data had ordered sites + moddata.structures[ind].sites[jnd].label = str(next(iter(site.species))) + # required for the global structure.is_ordered to work + moddata.structures[ind].sites[jnd].species._n_atoms = 1.0 + elif isinstance(s, CompositionContainer): + moddata.structures[ind].composition._n_atoms = s.composition._natoms + + return moddata @pytest.fixture(scope="function") diff --git a/modnet/tests/test_preprocessing.py b/modnet/tests/test_preprocessing.py index e27fa477..51c294ca 100644 --- a/modnet/tests/test_preprocessing.py +++ b/modnet/tests/test_preprocessing.py @@ -12,8 +12,14 @@ def check_column_values(new: MODData, reference: MODData, tolerance=0.03): Allows for some columns to be checked more loosely (see inline comment below). """ + new_cols = set(new.df_featurized.columns) + old_cols = set(reference.df_featurized.columns) + + # Check that the new df only adds new columns and is not missing anything + assert not (old_cols - new_cols) + error_cols = set() - for col in new.df_featurized.columns: + for col in old_cols: if not ( np.absolute( ( @@ -349,14 +355,6 @@ def test_small_moddata_featurization(small_moddata_2023, featurizer_mode): featurizer.featurizer_mode = featurizer_mode new = MODData(structures, targets, target_names=names, featurizer=featurizer) new.featurize(fast=False, n_jobs=1) - - new_cols = sorted(new.df_featurized.columns.tolist()) - old_cols = sorted(old.df_featurized.columns.tolist()) - - for i in range(len(old_cols)): - assert new_cols[i] == old_cols[i] - - np.testing.assert_array_equal(old_cols, new_cols) check_column_values(new, old, tolerance=0.03) @@ -376,13 +374,6 @@ def test_small_moddata_composition_featurization( new = MODData(materials=compositions, featurizer=featurizer) new.featurize(fast=False, n_jobs=1) - new_cols = sorted(new.df_featurized.columns.tolist()) - ref_cols = sorted(reference.df_featurized.columns.tolist()) - - for i in range(len(ref_cols)): - # print(new_cols[i], ref_cols[i]) - assert new_cols[i] == ref_cols[i] - # assert relative error below 3 percent check_column_values(new, reference, tolerance=0.03) diff --git a/requirements.txt b/requirements.txt index ff4c4e39..81c0afa6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,5 +3,5 @@ pandas==1.5.2 scikit-learn==1.3.2 matminer==0.9.2 numpy>=1.25 -pymatgen==2023.11.12 +pymatgen==2024.3.1 scikit-learn==1.3.2 diff --git a/setup.py b/setup.py index e650934e..28781da1 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ packages=setuptools.find_packages(), install_requires=[ "pandas~=1.5", - "tensorflow~=2.10", + "tensorflow~=2.10,<2.12", "pymatgen>=2023", "matminer~=0.9", "numpy>=1.24", From 1d08ebfa441ca55420e4343f655067dcc960788a Mon Sep 17 00:00:00 2001 From: Matthew Evans <7916000+ml-evs@users.noreply.github.com> Date: Tue, 2 Apr 2024 16:45:39 +0100 Subject: [PATCH 2/5] Properly handle Bayesian model import failure (#207) * Properly handle Bayesian model import failure * Fix incompat tensorflow version --------- Co-authored-by: Pierre-Paul De Breuck --- modnet/models/bayesian.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modnet/models/bayesian.py b/modnet/models/bayesian.py index 4ef848d4..be11a7af 100644 --- a/modnet/models/bayesian.py +++ b/modnet/models/bayesian.py @@ -15,7 +15,7 @@ try: import tensorflow_probability as tfp except ImportError: - raise RuntimeError( + raise ImportError( "`tensorflow-probability` is 
required for Bayesian models: install modnet[bayesian]." ) From 99c322b9c02a12917fecadbbdb3929211143cadc Mon Sep 17 00:00:00 2001 From: Kyle Daniel Miller Date: Tue, 2 Apr 2024 17:30:38 -0500 Subject: [PATCH 3/5] fixed bugged conditionals in evaluate() (#210) --- modnet/models/vanilla.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modnet/models/vanilla.py b/modnet/models/vanilla.py index 413b03d0..b310f1fd 100644 --- a/modnet/models/vanilla.py +++ b/modnet/models/vanilla.py @@ -825,7 +825,9 @@ def evaluate( f"Loss {loss} not recognized. Use mae, mse or a callable." ) else: - score.append(loss(y_true, y_pred[i])) + pass + + score.append(loss(y_true, y_pred[i])) return np.mean(score) From af70e614dca44c78fccd5f000f8d4a9842b5e6a0 Mon Sep 17 00:00:00 2001 From: Matthew Evans <7916000+ml-evs@users.noreply.github.com> Date: Wed, 3 Apr 2024 17:13:33 +0100 Subject: [PATCH 4/5] Add simple test for evaluate (#211) --- modnet/tests/test_model.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/modnet/tests/test_model.py b/modnet/tests/test_model.py index 970c811c..c4853e07 100644 --- a/modnet/tests/test_model.py +++ b/modnet/tests/test_model.py @@ -1,5 +1,6 @@ #!/usr/bin/env python import pytest +import numpy as np def test_train_small_model_single_target(subset_moddata, tf_session): @@ -21,6 +22,7 @@ def test_train_small_model_single_target(subset_moddata, tf_session): model.fit(data, epochs=2) model.predict(data) + assert not np.isnan(model.evaluate(data)) def test_train_small_model_single_target_classif(subset_moddata, tf_session): @@ -49,6 +51,7 @@ def is_metal(egap): ) model.fit(data, epochs=2) + assert not np.isnan(model.evaluate(data)) def test_train_small_model_multi_target(subset_moddata, tf_session): @@ -70,6 +73,7 @@ def test_train_small_model_multi_target(subset_moddata, tf_session): model.fit(data, epochs=2) model.predict(data) + assert not np.isnan(model.evaluate(data)) def test_train_small_model_presets(subset_moddata, tf_session): @@ -109,6 +113,7 @@ def test_train_small_model_presets(subset_moddata, tf_session): models = results[0] assert len(models) == len(modified_presets) assert len(models[0]) == num_nested + assert not np.isnan(model.evaluate(data)) def test_model_integration(subset_moddata, tf_session): @@ -134,6 +139,7 @@ def test_model_integration(subset_moddata, tf_session): loaded_model = MODNetModel.load("test") assert model.predict(data).equals(loaded_model.predict(data)) + assert not np.isnan(model.evaluate(data)) def test_train_small_bayesian_single_target(subset_moddata, tf_session): @@ -156,6 +162,7 @@ def test_train_small_bayesian_single_target(subset_moddata, tf_session): model.fit(data, epochs=2) model.predict(data) model.predict(data, return_unc=True) + assert not np.isnan(model.evaluate(data)) def test_train_small_bayesian_single_target_classif(subset_moddata, tf_session): @@ -186,6 +193,7 @@ def is_metal(egap): model.fit(data, epochs=2) model.predict(data) model.predict(data, return_unc=True) + assert not np.isnan(model.evaluate(data)) def test_train_small_bayesian_multi_target(subset_moddata, tf_session): @@ -208,6 +216,7 @@ def test_train_small_bayesian_multi_target(subset_moddata, tf_session): model.fit(data, epochs=2) model.predict(data) model.predict(data, return_unc=True) + assert not np.isnan(model.evaluate(data)) def test_train_small_bootstrap_single_target(subset_moddata, tf_session): @@ -232,6 +241,7 @@ def test_train_small_bootstrap_single_target(subset_moddata, tf_session): model.fit(data, epochs=2) 
model.predict(data) model.predict(data, return_unc=True) + assert not np.isnan(model.evaluate(data)) def test_train_small_bootstrap_single_target_classif(small_moddata, tf_session): @@ -264,6 +274,7 @@ def is_metal(egap): model.fit(data, epochs=2) model.predict(data) model.predict(data, return_unc=True) + assert not np.isnan(model.evaluate(data)) def test_train_small_bootstrap_multi_target(small_moddata, tf_session): @@ -333,3 +344,5 @@ def test_train_small_bootstrap_presets(small_moddata, tf_session): models = results[0] assert len(models) == len(modified_presets) assert len(models[0]) == num_nested + + assert not np.isnan(model.evaluate(data)) From f0e2e990cb3ad41308655beda750784956589d9c Mon Sep 17 00:00:00 2001 From: ppdebreuck Date: Fri, 5 Apr 2024 12:41:51 +0200 Subject: [PATCH 5/5] version update --- modnet/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modnet/__init__.py b/modnet/__init__.py index df124332..f6b7e267 100644 --- a/modnet/__init__.py +++ b/modnet/__init__.py @@ -1 +1 @@ -__version__ = "0.4.2" +__version__ = "0.4.3"
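
A minimal, hypothetical usage sketch of the `ignore_errors` passthrough introduced in PATCH 1/5 (not part of the patches above; it assumes the DeBreuck2020Featurizer preset and user-supplied `structures`, `targets` and `target_names` placeholders):

    # Hypothetical sketch, not taken from the patch series above.
    from modnet.preprocessing import MODData
    from modnet.featurizers.presets import DeBreuck2020Featurizer

    featurizer = DeBreuck2020Featurizer()
    # The featurizer methods now read getattr(self, "ignore_errors", True),
    # so setting the attribute to False surfaces matminer featurization
    # errors instead of silently filling the affected rows with NaN.
    featurizer.ignore_errors = False

    data = MODData(structures, targets, target_names=target_names, featurizer=featurizer)
    data.featurize(fast=False, n_jobs=1)

Leaving the attribute unset keeps the previous behaviour, since every featurization call falls back to `ignore_errors=True`.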