Merge pull request #464 from RandomDefaultUser/fix_modules
Fix `pqkmeans` and `oapackage` install
RandomDefaultUser authored Jul 4, 2023
2 parents 084fa80 + 61030c4 commit dba1617
Showing 7 changed files with 42 additions and 480 deletions.
1 change: 0 additions & 1 deletion docs/source/usage/basicconcepts.rst
@@ -87,7 +87,6 @@ installed on your system. The output will look something like this:
horovod: not installed Enables training parallelization.
lammps: installed Enables descriptor calculation for data preprocessing and inference.
oapackage: installed Enables usage of OAT method for hyperparameter optimization.
pqkmeans: installed Enables clustering of training data.
total_energy: not installed Enables calculation of total energy.
Attempting to use one of those functionalities without having the modules
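For context, the table excerpted above is printed by MALA's module check. A minimal usage sketch, assuming `check_modules` (defined in mala/common/check_modules.py in this diff) is exported at the package level as the docs excerpt implies:

import mala

# Prints one line per optional dependency with its install status and a
# short description, as in the documentation excerpt above. After this
# commit, pqkmeans no longer appears in that table.
mala.check_modules()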
2 changes: 0 additions & 2 deletions mala/common/check_modules.py
@@ -16,8 +16,6 @@ def check_modules():
"oapackage": {"available": False, "description":
"Enables usage of OAT method for hyperparameter "
"optimization."},
"pqkmeans": {"available": False, "description":
"Enables clustering of training data."},
"total_energy": {"available": False, "description":
"Enables calculation of total energy."},
"asap3": {"available": False, "description":
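A hedged sketch (not MALA's actual implementation) of how an availability dict like the one above can be populated: probe each optional module with importlib and flip its "available" flag. Only the two entries visible in this hunk are reproduced; the pqkmeans entry is the one being removed.

import importlib.util

optional_modules = {
    "oapackage": {"available": False, "description":
                  "Enables usage of OAT method for hyperparameter "
                  "optimization."},
    "total_energy": {"available": False, "description":
                     "Enables calculation of total energy."},
}

for name, info in optional_modules.items():
    # find_spec returns None when the top-level module cannot be found.
    info["available"] = importlib.util.find_spec(name) is not None

for name, info in optional_modules.items():
    status = "installed" if info["available"] else "not installed"
    print(f"{name}: {status}\t{info['description']}")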
21 changes: 0 additions & 21 deletions mala/common/parameters.py
@@ -569,23 +569,6 @@ class ParametersData(ParametersBase):
If True, will use alternative lazy loading path with prefetching
for higher performance
use_clustering : bool
If True, and use_lazy_loading is True as well, the data is clustered,
i.e. not the entire training data is used but rather only a subset
which is determined by a clustering algorithm.
number_of_clusters : int
If use_clustering is True, this is the number of clusters used per
snapshot.
train_ratio : float
If use_clustering is True, this is the ratio of training data used
to train the encoder for the clustering.
sample_ratio : float
If use_clustering is True, this is the ratio of training data used
for sampling per snapshot (according to clustering then, of course).
use_fast_tensor_data_set : bool
If True, then the new, fast TensorDataSet implemented by Josh Romero
will be used.
@@ -603,10 +586,6 @@ def __init__(self):
self.output_rescaling_type = "None"
self.use_lazy_loading = False
self.use_lazy_loading_prefetch = False
self.use_clustering = False
self.number_of_clusters = 40
self.train_ratio = 0.1
self.sample_ratio = 0.5
self.use_fast_tensor_data_set = False
self.shuffling_seed = None

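After this commit the clustering knobs (`use_clustering`, `number_of_clusters`, `train_ratio`, `sample_ratio`) are gone; the surviving data options remain plain attributes on `ParametersData`. An illustrative sketch, assuming the standard `mala.Parameters()` entry point:

import mala

parameters = mala.Parameters()

# Lazy-loading options kept by this commit (attribute names taken from
# the diff above).
parameters.data.use_lazy_loading = True
parameters.data.use_lazy_loading_prefetch = False

# The fast TensorDataSet path is unrelated to clustering and remains.
parameters.data.use_fast_tensor_data_set = True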
114 changes: 41 additions & 73 deletions mala/datahandling/data_handler.py
@@ -16,8 +16,6 @@
from mala.datahandling.data_scaler import DataScaler
from mala.datahandling.snapshot import Snapshot
from mala.datahandling.lazy_load_dataset import LazyLoadDataset
from mala.datahandling.lazy_load_dataset_clustered import \
LazyLoadDatasetClustered
from mala.datahandling.lazy_load_dataset_single import LazyLoadDatasetSingle
from mala.datahandling.fast_tensor_dataset import FastTensorDataset

@@ -570,50 +568,25 @@ def __build_datasets(self):
if self.parameters.use_lazy_loading and not self.parameters.use_lazy_loading_prefetch:

# Create the lazy loading data sets.
if self.parameters.use_clustering:
self.training_data_sets.append(LazyLoadDatasetClustered(
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.grid_dimension, self.grid_size,
self.use_horovod, self.parameters.number_of_clusters,
self.parameters.train_ratio,
self.parameters.sample_ratio))
self.validation_data_sets.append(LazyLoadDataset(
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod))

if self.nr_test_data != 0:
self.test_data_sets.append(LazyLoadDataset(
self.input_dimension,
self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod,
input_requires_grad=True))
self.training_data_sets.append(LazyLoadDataset(
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod))
self.validation_data_sets.append(LazyLoadDataset(
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod))

else:
self.training_data_sets.append(LazyLoadDataset(
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod))
self.validation_data_sets.append(LazyLoadDataset(
self.input_dimension, self.output_dimension,
if self.nr_test_data != 0:
self.test_data_sets.append(LazyLoadDataset(
self.input_dimension,
self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod))

if self.nr_test_data != 0:
self.test_data_sets.append(LazyLoadDataset(
self.input_dimension,
self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod,
input_requires_grad=True))
self.use_horovod,
input_requires_grad=True))

# Add snapshots to the lazy loading data sets.
for snapshot in self.parameters.snapshot_directories_list:
@@ -624,8 +597,6 @@ def __build_datasets(self):
if snapshot.snapshot_function == "te":
self.test_data_sets[0].add_snapshot_to_dataset(snapshot)

if self.parameters.use_clustering:
self.training_data_sets[0].cluster_dataset()
# I don't think we need to mix them here. We can use the standard
# ordering for the first epoch
# and mix it up after.
@@ -634,34 +605,31 @@
# self.test_data_set.mix_datasets()
elif self.parameters.use_lazy_loading and self.parameters.use_lazy_loading_prefetch:
printout("Using lazy loading pre-fetching.", min_verbosity=2)
# Create the lazy loading data sets.
if self.parameters.use_clustering:
raise Exception("clustering not supported in this mode")
else:
# Create LazyLoadDatasetSingle instances per snapshot and add to list
for snapshot in self.parameters.snapshot_directories_list:
if snapshot.snapshot_function == "tr":
self.training_data_sets.append(LazyLoadDatasetSingle(
self.mini_batch_size, snapshot,
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod))
if snapshot.snapshot_function == "va":
self.validation_data_sets.append(LazyLoadDatasetSingle(
self.mini_batch_size, snapshot,
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod))
if snapshot.snapshot_function == "te":
self.test_data_sets.append(LazyLoadDatasetSingle(
self.mini_batch_size, snapshot,
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod,
input_requires_grad=True))
# Create LazyLoadDatasetSingle instances per snapshot and add to
# list.
for snapshot in self.parameters.snapshot_directories_list:
if snapshot.snapshot_function == "tr":
self.training_data_sets.append(LazyLoadDatasetSingle(
self.mini_batch_size, snapshot,
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod))
if snapshot.snapshot_function == "va":
self.validation_data_sets.append(LazyLoadDatasetSingle(
self.mini_batch_size, snapshot,
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod))
if snapshot.snapshot_function == "te":
self.test_data_sets.append(LazyLoadDatasetSingle(
self.mini_batch_size, snapshot,
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod,
input_requires_grad=True))

else:
if self.nr_training_data != 0:
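With the `use_clustering` branches removed, `__build_datasets` reduces to a three-way choice driven by the two lazy-loading flags. A condensed, hypothetical restatement (the class names mirror the diff; the real method constructs the data sets rather than naming them):

from dataclasses import dataclass

@dataclass
class LoadingFlags:
    use_lazy_loading: bool = False
    use_lazy_loading_prefetch: bool = False

def pick_dataset_strategy(flags: LoadingFlags) -> str:
    # Mirrors the branch order in __build_datasets after the commit.
    if flags.use_lazy_loading and not flags.use_lazy_loading_prefetch:
        return "LazyLoadDataset"        # one data set per role (tr/va/te)
    if flags.use_lazy_loading and flags.use_lazy_loading_prefetch:
        return "LazyLoadDatasetSingle"  # one data set per snapshot, prefetched
    return "TensorDataset"              # all data held in RAM (or the fast variant)

# The clustered path no longer exists:
assert pick_dataset_strategy(LoadingFlags(True, False)) == "LazyLoadDataset"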
8 changes: 0 additions & 8 deletions mala/datahandling/data_handler_base.py
@@ -47,11 +47,6 @@ def __init__(self, parameters: Parameters, target_calculator=None,
self.output_dimension = 0
self.nr_snapshots = 0

# Clustering still needs uniform grids
if self.parameters.use_clustering:
self.grid_dimension = [0, 0, 0]
self.grid_size = 0

##############################
# Properties
##############################
@@ -182,9 +177,6 @@ def _check_snapshots(self, comm=None):
snapshot.grid_size = int(np.prod(snapshot.grid_dimension))
if firstsnapshot:
self.input_dimension = tmp_input_dimension
if self.parameters.use_clustering:
self.grid_dimension[0:3] = tmp_grid_dim[0:3]
self.grid_size = np.prod(self.grid_dimension)
else:
if self.input_dimension != tmp_input_dimension:
raise Exception("Invalid snapshot entered at ", snapshot.
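What survives in this second hunk is the cross-snapshot consistency check: the first snapshot fixes the reference input dimension, and every later snapshot must match it. A distilled, hypothetical sketch of that logic:

import numpy as np

def check_snapshot_dimensions(snapshots):
    """snapshots: iterable of (input_dimension, grid_dimension) pairs."""
    input_dimension = None
    grid_sizes = []
    for input_dim, grid_dim in snapshots:
        # Per-snapshot grid size, as computed in the hunk above.
        grid_sizes.append(int(np.prod(grid_dim)))
        if input_dimension is None:
            # The first snapshot sets the reference input dimension.
            input_dimension = input_dim
        elif input_dim != input_dimension:
            raise ValueError("Invalid snapshot entered: %d" % input_dim)
    return input_dimension, grid_sizes

# Two consistent snapshots on a 10x10x10 grid:
print(check_snapshot_dimensions([(91, [10, 10, 10]), (91, [10, 10, 10])]))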
(2 further changed files not shown.)
