Merge pull request #22 from ppdebreuck/ml-evs/readd_cross_nmi
Add ability to load precomputed cross NMI from figshare
ppdebreuck authored Dec 16, 2020
2 parents 242c6d9 + 3b3ae61 commit 84c8e43
Showing 6 changed files with 196 additions and 70 deletions.
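Before the diff itself, a minimal sketch of the headline feature as an end user would call it. The pandas.read_pickle step is an assumption based on the dataset description in modnet/ext_data.py below (a pickled dataframe), not code from this commit:

import pandas as pd

from modnet.ext_data import load_ext_dataset

# First call downloads the precomputed cross NMI dataframe into modnet/data/
# (git-ignored by this commit); later calls reuse the cached file.
path = load_ext_dataset("MP_2018.6_CROSS_NMI", "cross_nmi")
cross_nmi = pd.read_pickle(path)  # assumed: the file unpickles to a DataFrame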
1 change: 1 addition & 0 deletions .gitignore
@@ -3,6 +3,7 @@
 __pycache__/
 *.py[cod]
 *$py.class
+modnet/data/
 
 # Distribution / packaging
 build/
102 changes: 102 additions & 0 deletions modnet/ext_data.py
@@ -0,0 +1,102 @@
# coding: utf-8
# Distributed under the terms of the MIT License.

"""This module defines some remote datasets that can be downloaded
into the user's installation.
"""

import logging
import os
from collections import namedtuple
from enum import Enum, auto
from pathlib import Path
from typing import Union


class Usage(Enum):
    MODData = auto()
    cross_nmi = auto()


Dataset = namedtuple("Dataset", ("url", "description", "filename", "md5", "usage"))

DATASETS = {
    "MP_2018.6": Dataset(
        url="https://ndownloader.figshare.com/files/24364571",
        description=(
            "A MODData that contains all inorganic compounds from the Materials Project (MP) as of June 2018, "
            "decorated with the DeBreuck2020 featurizer preset."
        ),
        filename="MP_2018.6.zip",
        md5="06280c4e539508bbcc5266f07698f8d1",
        usage=Usage["MODData"],
    ),
    "MP_2018.6_CROSS_NMI": Dataset(
        url="https://ndownloader.figshare.com/files/25584803",
        description=(
            "Pickled dataframe containing the Normalized Mutual Information (NMI) between matminer features "
            "computed on the Materials Project."
        ),
        filename="Features_cross",
        md5="b83e0bd43f71ec53c4d69ee0764acfbe",
        usage=Usage["cross_nmi"],
    ),
}


def load_ext_dataset(dataset_name: str, expected_type: Union[Usage, str]):
    """Load one of the preset datasets from the `DATASETS` constant. Will not
    overwrite any existing local data with remote datasets. Checks hashes against
    what is expected and will not unpickle the file if they do not match.

    Parameters:
        dataset_name: The name (key) of the dataset in `DATASETS`.
        expected_type: A string representing the expected usage of the dataset,
            e.g. `'MODData'` or `'cross_nmi'`.

    Returns:
        The path to the downloaded or previously installed dataset.

    """
    import urllib.request
    import urllib.error

    if dataset_name not in DATASETS:
        raise ValueError(
            f"No dataset {dataset_name} found, must be one of {list(DATASETS.keys())}"
        )

    dataset = DATASETS[dataset_name]
    if isinstance(expected_type, str):
        expected_type = Usage[expected_type]
    if dataset.usage != expected_type:
        raise ValueError(
            f"Cannot load {dataset_name} as it has the wrong type {dataset.usage}."
        )

    data_dir = Path(__file__).parent.joinpath("data")
    model_path = data_dir.joinpath(dataset.filename)
    if not model_path.is_file():
        logging.info(
            f"Downloading featurized dataset {dataset_name} from {dataset.url} into {model_path}"
        )
        if not data_dir.is_dir():
            os.makedirs(data_dir)

        try:
            zip_file, response = urllib.request.urlretrieve(dataset.url, model_path)
        except (urllib.error.URLError, urllib.error.HTTPError) as exc:
            raise ValueError(
                f"There was a problem downloading {dataset.url}: {exc.reason}"
            )

    if dataset.md5 is not None:
        from modnet.utils import get_hash_of_file

        file_md5 = get_hash_of_file(model_path, algo="md5")
        if file_md5 != dataset.md5:
            raise RuntimeError(
                f"Precomputed {dataset.usage} did not match the expected MD5 from "
                f"{dataset.url}, so will not unpickle it."
            )

    return model_path
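Two guard rails in load_ext_dataset above are worth noting: requesting a dataset under the wrong usage raises a ValueError before any download starts, and an MD5 mismatch raises a RuntimeError so an unexpected file is never unpickled. A sketch of the first case, using only the names defined above:

from modnet.ext_data import load_ext_dataset

try:
    # MP_2018.6_CROSS_NMI is registered with usage Usage.cross_nmi, so
    # requesting it as a MODData is rejected before any network access.
    load_ext_dataset("MP_2018.6_CROSS_NMI", "MODData")
except ValueError as exc:
    print(exc)  # Cannot load MP_2018.6_CROSS_NMI as it has the wrong type ...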
39 changes: 25 additions & 14 deletions modnet/models.py
@@ -36,7 +36,7 @@ def __init__(
         targets: List,
         weights: Dict[str, float],
         num_neurons=([64], [32], [16], [16]),
-        num_classes: Optional[Dict[str,int]] = None,
+        num_classes: Optional[Dict[str, int]] = None,
         n_feat=300,
         act="relu",
     ):
@@ -76,11 +76,12 @@ def __init__(
         self.targets_flatten = [x for subl in f_temp for x in subl]
         self.num_classes = {name: 0 for name in self.targets_flatten}
         if num_classes is not None:
-            for k,v in num_classes.items():
-                self.num_classes[k] = v
+            self.num_classes.update(num_classes)
         self._multi_target = len(self.targets_flatten) > 1
 
-        self.build_model(targets, n_feat, num_neurons, act=act, num_classes = self.num_classes)
+        self.build_model(
+            targets, n_feat, num_neurons, act=act, num_classes=self.num_classes
+        )
 
     def build_model(
         self,
@@ -156,12 +157,12 @@ def build_model(
                     n = num_classes[group[prop_idx][pi]]
                     if n >= 2:
                         out = keras.layers.Dense(
-                            n,activation='softmax',name=group[prop_idx][pi]
-                            )(previous_layer)
+                            n, activation="softmax", name=group[prop_idx][pi]
+                        )(previous_layer)
                     else:
                         out = keras.layers.Dense(
                             1, activation="linear", name=group[prop_idx][pi]
-                            )(previous_layer)
+                        )(previous_layer)
                     final_out.append(out)
 
         self.model = keras.models.Model(inputs=f_input, outputs=final_out)
@@ -218,16 +219,21 @@ def fit(
         self.optimal_descriptors = training_data.get_optimal_descriptors()
 
         x = training_data.get_featurized_df()[
-            self.optimal_descriptors[:self.n_feat]
+            self.optimal_descriptors[: self.n_feat]
         ].values
 
         y = []
         for targ in self.targets_flatten:
-            if self.num_classes[targ] >= 2: # Classification
-                y_inner = keras.utils.to_categorical(training_data.df_targets[targ].values,num_classes=self.num_classes[targ])
+            if self.num_classes[targ] >= 2:  # Classification
+                y_inner = keras.utils.to_categorical(
+                    training_data.df_targets[targ].values,
+                    num_classes=self.num_classes[targ],
+                )
                 loss = "categorical_crossentropy"
             else:
-                y_inner = training_data.df_targets[targ].values.astype(np.float, copy=False)
+                y_inner = training_data.df_targets[targ].values.astype(
+                    np.float, copy=False
+                )
             y.append(y_inner)
 
         # Scale the input features:
@@ -241,10 +247,14 @@
         x = self._scaler.fit_transform(x)
 
         if val_data is not None:
-            val_x = val_data.get_featurized_df()[self.optimal_descriptors[:self.n_feat]].values
+            val_x = val_data.get_featurized_df()[
+                self.optimal_descriptors[: self.n_feat]
+            ].values
             val_x = np.nan_to_num(val_x)
             val_x = self._scaler.transform(val_x)
-            val_y = list(val_data.get_target_df()[self.targets_flatten].values.transpose())
+            val_y = list(
+                val_data.get_target_df()[self.targets_flatten].values.transpose()
+            )
             validation_data = (val_x, val_y)
         else:
             validation_data = None
@@ -301,7 +311,7 @@ def fit_preset(
         data: MODData,
         presets: List[Dict[str, Any]] = None,
         val_fraction: float = 0.1,
-        verbose: int = 0
+        verbose: int = 0,
     ) -> None:
         """Chooses an optimal hyper-parametered MODNet model from different presets.
@@ -339,6 +349,7 @@ def fit_preset(

         if presets is None:
             from modnet.model_presets import MODNET_PRESETS
+
             presets = MODNET_PRESETS
 
         val_losses = 1e20 * np.ones((len(presets),))
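Taken together, the models.py changes mostly tidy up the classification path, which pairs each multi-class target with a softmax head and categorical cross-entropy. A hypothetical sketch of declaring such a target; the name is_metal, the weight, and the nested targets layout are illustrative assumptions, not taken from this commit:

from modnet.models import MODNetModel

# Targets listed in num_classes with >= 2 classes get a softmax output layer
# and one-hot labels via keras.utils.to_categorical, per the diff above.
model = MODNetModel(
    targets=[[["is_metal"]]],  # nested grouping, matching group[prop_idx][pi]
    weights={"is_metal": 1.0},
    num_classes={"is_metal": 2},
    n_feat=300,
)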