Merge pull request #22 from ppdebreuck/ml-evs/readd_cross_nmi
Add ability to load precomputed cross NMI from figshare
ppdebreuck authored Dec 16, 2020
2 parents 242c6d9 + 3b3ae61 commit 84c8e43
Showing 6 changed files with 196 additions and 70 deletions.
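Before the diff itself, a minimal sketch of the headline feature as an end user would call it. The pandas.read_pickle step is an assumption based on the dataset description in modnet/ext_data.py below (a pickled dataframe), not code from this commit:

import pandas as pd

from modnet.ext_data import load_ext_dataset

# First call downloads the precomputed cross NMI dataframe into modnet/data/
# (git-ignored by this commit); later calls reuse the cached file.
path = load_ext_dataset("MP_2018.6_CROSS_NMI", "cross_nmi")
cross_nmi = pd.read_pickle(path)  # assumed: the file unpickles to a DataFrame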
1 change: 1 addition & 0 deletions .gitignore
@@ -3,6 +3,7 @@
 __pycache__/
 *.py[cod]
 *$py.class
+modnet/data/
 
 # Distribution / packaging
 build/
102 changes: 102 additions & 0 deletions modnet/ext_data.py
@@ -0,0 +1,102 @@
# coding: utf-8
# Distributed under the terms of the MIT License.

"""This module defines some remote datasets that can be downloaded
into the user's installation.
"""

import logging
import os
from collections import namedtuple
from enum import Enum, auto
from pathlib import Path
from typing import Union


class Usage(Enum):
    MODData = auto()
    cross_nmi = auto()


Dataset = namedtuple("Dataset", ("url", "description", "filename", "md5", "usage"))

DATASETS = {
    "MP_2018.6": Dataset(
        url="https://ndownloader.figshare.com/files/24364571",
        description=(
            "A MODData that contains all inorganic compounds from the Materials Project (MP) as of June 2018, "
            "decorated with the DeBreuck2020 featurizer preset."
        ),
        filename="MP_2018.6.zip",
        md5="06280c4e539508bbcc5266f07698f8d1",
        usage=Usage["MODData"],
    ),
    "MP_2018.6_CROSS_NMI": Dataset(
        url="https://ndownloader.figshare.com/files/25584803",
        description=(
            "Pickled dataframe containing the Normalized Mutual Information (NMI) between matminer features "
            "computed on the Materials Project."
        ),
        filename="Features_cross",
        md5="b83e0bd43f71ec53c4d69ee0764acfbe",
        usage=Usage["cross_nmi"],
    ),
}


def load_ext_dataset(dataset_name: str, expected_type: Union[Usage, str]):
    """Load one of the preset datasets from the `DATASETS` constant. Will not
    overwrite any existing local data with remote datasets. Checks hashes against
    what is expected and will not unpickle the file if they do not match.

    Parameters:
        dataset_name: The name (key) of the dataset in `DATASETS`.
        expected_type: A string representing the expected usage of the dataset,
            e.g. `'MODData'` or `'cross_nmi'`.

    Returns:
        The path to the downloaded or previously installed dataset.

    """
    import urllib.request
    import urllib.error

    if dataset_name not in DATASETS:
        raise ValueError(
            f"No dataset {dataset_name} found, must be one of {list(DATASETS.keys())}"
        )

    dataset = DATASETS[dataset_name]
    if isinstance(expected_type, str):
        expected_type = Usage[expected_type]
    if dataset.usage != expected_type:
        raise ValueError(
            f"Cannot load {dataset_name} as it has the wrong type {dataset.usage}."
        )

    data_dir = Path(__file__).parent.joinpath("data")
    model_path = data_dir.joinpath(dataset.filename)
    if not model_path.is_file():
        logging.info(
            f"Downloading featurized dataset {dataset_name} from {dataset.url} into {model_path}"
        )
        if not data_dir.is_dir():
            os.makedirs(data_dir)

        try:
            zip_file, response = urllib.request.urlretrieve(dataset.url, model_path)
        except (urllib.error.URLError, urllib.error.HTTPError) as exc:
            raise ValueError(
                f"There was a problem downloading {dataset.url}: {exc.reason}"
            )

    if dataset.md5 is not None:
        from modnet.utils import get_hash_of_file

        file_md5 = get_hash_of_file(model_path, algo="md5")
        if file_md5 != dataset.md5:
            raise RuntimeError(
                f"Precomputed {dataset.usage} did not match the expected MD5 from "
                f"{dataset.url}, so will not unpickle it."
            )

    return model_path
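Two guard rails in load_ext_dataset above are worth noting: requesting a dataset under the wrong usage raises a ValueError before any download starts, and an MD5 mismatch raises a RuntimeError so an unexpected file is never unpickled. A sketch of the first case, using only the names defined above:

from modnet.ext_data import load_ext_dataset

try:
    # MP_2018.6_CROSS_NMI is registered with usage Usage.cross_nmi, so
    # requesting it as a MODData is rejected before any network access.
    load_ext_dataset("MP_2018.6_CROSS_NMI", "MODData")
except ValueError as exc:
    print(exc)  # Cannot load MP_2018.6_CROSS_NMI as it has the wrong type ...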
39 changes: 25 additions & 14 deletions modnet/models.py
@@ -36,7 +36,7 @@ def __init__(
         targets: List,
         weights: Dict[str, float],
         num_neurons=([64], [32], [16], [16]),
-        num_classes: Optional[Dict[str,int]] = None,
+        num_classes: Optional[Dict[str, int]] = None,
         n_feat=300,
         act="relu",
     ):
@@ -76,11 +76,12 @@ def __init__(
         self.targets_flatten = [x for subl in f_temp for x in subl]
         self.num_classes = {name: 0 for name in self.targets_flatten}
         if num_classes is not None:
-            for k,v in num_classes.items():
-                self.num_classes[k] = v
+            self.num_classes.update(num_classes)
         self._multi_target = len(self.targets_flatten) > 1
 
-        self.build_model(targets, n_feat, num_neurons, act=act, num_classes = self.num_classes)
+        self.build_model(
+            targets, n_feat, num_neurons, act=act, num_classes=self.num_classes
+        )
 
     def build_model(
         self,
@@ -156,12 +157,12 @@ def build_model(
                     n = num_classes[group[prop_idx][pi]]
                     if n >= 2:
                         out = keras.layers.Dense(
-                            n,activation='softmax',name=group[prop_idx][pi]
-                            )(previous_layer)
+                            n, activation="softmax", name=group[prop_idx][pi]
+                        )(previous_layer)
                     else:
                         out = keras.layers.Dense(
                             1, activation="linear", name=group[prop_idx][pi]
-                            )(previous_layer)
+                        )(previous_layer)
                     final_out.append(out)
 
         self.model = keras.models.Model(inputs=f_input, outputs=final_out)
@@ -218,16 +219,21 @@ def fit(
         self.optimal_descriptors = training_data.get_optimal_descriptors()
 
         x = training_data.get_featurized_df()[
-            self.optimal_descriptors[:self.n_feat]
+            self.optimal_descriptors[: self.n_feat]
         ].values
 
         y = []
         for targ in self.targets_flatten:
-            if self.num_classes[targ] >= 2: # Classification
-                y_inner = keras.utils.to_categorical(training_data.df_targets[targ].values,num_classes=self.num_classes[targ])
+            if self.num_classes[targ] >= 2:  # Classification
+                y_inner = keras.utils.to_categorical(
+                    training_data.df_targets[targ].values,
+                    num_classes=self.num_classes[targ],
+                )
                 loss = "categorical_crossentropy"
             else:
-                y_inner = training_data.df_targets[targ].values.astype(np.float, copy=False)
+                y_inner = training_data.df_targets[targ].values.astype(
+                    np.float, copy=False
+                )
             y.append(y_inner)
 
         # Scale the input features:
@@ -241,10 +247,14 @@
         x = self._scaler.fit_transform(x)
 
         if val_data is not None:
-            val_x = val_data.get_featurized_df()[self.optimal_descriptors[:self.n_feat]].values
+            val_x = val_data.get_featurized_df()[
+                self.optimal_descriptors[: self.n_feat]
+            ].values
             val_x = np.nan_to_num(val_x)
             val_x = self._scaler.transform(val_x)
-            val_y = list(val_data.get_target_df()[self.targets_flatten].values.transpose())
+            val_y = list(
+                val_data.get_target_df()[self.targets_flatten].values.transpose()
+            )
             validation_data = (val_x, val_y)
         else:
             validation_data = None
@@ -301,7 +311,7 @@ def fit_preset(
         data: MODData,
         presets: List[Dict[str, Any]] = None,
         val_fraction: float = 0.1,
-        verbose: int = 0
+        verbose: int = 0,
     ) -> None:
         """Chooses an optimal hyper-parametered MODNet model from different presets.
@@ -339,6 +349,7 @@ def fit_preset(

         if presets is None:
             from modnet.model_presets import MODNET_PRESETS
+
             presets = MODNET_PRESETS
 
         val_losses = 1e20 * np.ones((len(presets),))
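Taken together, the models.py changes mostly tidy up the classification path, which pairs each multi-class target with a softmax head and categorical cross-entropy. A hypothetical sketch of declaring such a target; the name is_metal, the weight, and the nested targets layout are illustrative assumptions, not taken from this commit:

from modnet.models import MODNetModel

# Targets listed in num_classes with >= 2 classes get a softmax output layer
# and one-hot labels via keras.utils.to_categorical, per the diff above.
model = MODNetModel(
    targets=[[["is_metal"]]],  # nested grouping, matching group[prop_idx][pi]
    weights={"is_metal": 1.0},
    num_classes={"is_metal": 2},
    n_feat=300,
)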