Skip to content

Commit

Permalink
Backwards compatibility of test data with pymatgen (#206)
Browse files Browse the repository at this point in the history
* Enable control of matminer `ignore_errors` arg from MODFeaturizer

Linting

* Dynamically patch old structural data to work with latest pymatgen when depickling

* Refactor tests to allow new featurizer columns to exist as long as old ones are present

* Now try to update pymatgen again

* Also fix composition container loading

* Add tensorflow upper bound from other PR
  • Loading branch information
ml-evs authored Apr 2, 2024
1 parent c12f02e commit 024d720
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 23 deletions.
21 changes: 17 additions & 4 deletions modnet/featurizers/featurizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,10 @@ def _fit_apply_featurizers(
_featurizers.set_n_jobs(self._n_jobs)

return _featurizers.featurize_dataframe(
df, column, multiindex=True, ignore_errors=True
df,
column,
multiindex=True,
ignore_errors=getattr(self, "ignore_errors", True),
)
elif mode == "single":

Expand All @@ -164,7 +167,10 @@ def _fit_apply_featurizers(
)
start = time.monotonic_ns()
df = featurizer.featurize_dataframe(
df, column, multiindex=True, ignore_errors=True
df,
column,
multiindex=True,
ignore_errors=getattr(self, "ignore_errors", True),
)
LOG.info(
f"Applied featurizer {featurizer.__class__.__name__} to column {column!r} in {(time.monotonic_ns() - start) * 1e-9} seconds"
Expand Down Expand Up @@ -244,7 +250,11 @@ def featurize_composition(self, df: pd.DataFrame) -> pd.DataFrame:
else:
df = CompositionToOxidComposition(
max_sites=-1 if getattr(self, "continuous_only", False) else None
).featurize_dataframe(df, col_id=col_comp, ignore_errors=True)
).featurize_dataframe(
df,
col_id=col_comp,
ignore_errors=getattr(self, "ignore_errors", True),
)
df = self._fit_apply_featurizers(
df,
self.oxid_composition_featurizers,
Expand Down Expand Up @@ -311,7 +321,10 @@ def featurize_site(
fingerprint, stats=self.site_stats
)
df = site_stats_fingerprint.featurize_dataframe(
df, "Input data|structure", multiindex=False, ignore_errors=True
df,
"Input data|structure",
multiindex=False,
ignore_errors=getattr(self, "ignore_errors", True),
)

if aliases:
Expand Down
21 changes: 20 additions & 1 deletion modnet/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import pytest
from pathlib import Path
from modnet.preprocessing import CompositionContainer

from modnet.utils import get_hash_of_file
from pymatgen.core import Structure


_TEST_DATA_HASHES = {
Expand Down Expand Up @@ -41,7 +43,24 @@ def _load_moddata(filename):
# what it was when created
assert get_hash_of_file(data_file) == _TEST_DATA_HASHES[filename]

return MODData.load(data_file)
moddata = MODData.load(data_file)
# For forwards compatibility with newer pymatgen versions, we patch our old pickled test data after loading
# so that the depickled objects carry the attributes set below
# This is hopefully only a temporary solution, and in future, we should serialize pymatgen objects
# with Monty's `as_dict`/`from_dict` to avoid having to hack this private interface
for ind, s in enumerate(moddata.structures):
if isinstance(s, Structure):
# assume all previous data was periodic
moddata.structures[ind].lattice._pbc = [True, True, True]
for jnd, site in enumerate(s.sites):
# assume all of our previous data had ordered sites
moddata.structures[ind].sites[jnd].label = str(next(iter(site.species)))
# required for the global structure.is_ordered to work
moddata.structures[ind].sites[jnd].species._n_atoms = 1.0
elif isinstance(s, CompositionContainer):
moddata.structures[ind].composition._n_atoms = s.composition._natoms

return moddata


@pytest.fixture(scope="function")
Expand Down
23 changes: 7 additions & 16 deletions modnet/tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,14 @@ def check_column_values(new: MODData, reference: MODData, tolerance=0.03):
Allows for some columns to be checked more loosely (see inline comment below).
"""
new_cols = set(new.df_featurized.columns)
old_cols = set(reference.df_featurized.columns)

# Check that the new df only adds new columns and is not missing anything
assert not (old_cols - new_cols)

error_cols = set()
for col in new.df_featurized.columns:
for col in old_cols:
if not (
np.absolute(
(
Expand Down Expand Up @@ -349,14 +355,6 @@ def test_small_moddata_featurization(small_moddata_2023, featurizer_mode):
featurizer.featurizer_mode = featurizer_mode
new = MODData(structures, targets, target_names=names, featurizer=featurizer)
new.featurize(fast=False, n_jobs=1)

new_cols = sorted(new.df_featurized.columns.tolist())
old_cols = sorted(old.df_featurized.columns.tolist())

for i in range(len(old_cols)):
assert new_cols[i] == old_cols[i]

np.testing.assert_array_equal(old_cols, new_cols)
check_column_values(new, old, tolerance=0.03)


Expand All @@ -376,13 +374,6 @@ def test_small_moddata_composition_featurization(
new = MODData(materials=compositions, featurizer=featurizer)
new.featurize(fast=False, n_jobs=1)

new_cols = sorted(new.df_featurized.columns.tolist())
ref_cols = sorted(reference.df_featurized.columns.tolist())

for i in range(len(ref_cols)):
# print(new_cols[i], ref_cols[i])
assert new_cols[i] == ref_cols[i]

# assert relative error below 3 percent
check_column_values(new, reference, tolerance=0.03)

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ pandas==1.5.2
scikit-learn==1.3.2
matminer==0.9.2
numpy>=1.25
pymatgen==2023.11.12
pymatgen==2024.3.1
scikit-learn==1.3.2
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
packages=setuptools.find_packages(),
install_requires=[
"pandas~=1.5",
"tensorflow~=2.10",
"tensorflow~=2.10,<2.12",
"pymatgen>=2023",
"matminer~=0.9",
"numpy>=1.24",
Expand Down

0 comments on commit 024d720

Please sign in to comment.