Merge pull request #464 from RandomDefaultUser/fix_modules
Fix `pqkmeans` and `oapackage` install
RandomDefaultUser authored Jul 4, 2023
2 parents 084fa80 + 61030c4 commit dba1617
Showing 7 changed files with 42 additions and 480 deletions.
1 change: 0 additions & 1 deletion docs/source/usage/basicconcepts.rst
@@ -87,7 +87,6 @@ installed on your system. The output will look something like this:
horovod: not installed Enables training parallelization.
lammps: installed Enables descriptor calculation for data preprocessing and inference.
oapackage: installed Enables usage of OAT method for hyperparameter optimization.
pqkmeans: installed Enables clustering of training data.
total_energy: not installed Enables calculation of total energy.
Attempting to use one of those functionalities without having the modules
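For context, the table excerpted above is printed by MALA's module check. A minimal usage sketch, assuming `check_modules` (defined in mala/common/check_modules.py in this diff) is exported at the package level as the docs excerpt implies:

import mala

# Prints one line per optional dependency with its install status and a
# short description, as in the documentation excerpt above. After this
# commit, pqkmeans no longer appears in that table.
mala.check_modules()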
2 changes: 0 additions & 2 deletions mala/common/check_modules.py
@@ -16,8 +16,6 @@ def check_modules():
"oapackage": {"available": False, "description":
"Enables usage of OAT method for hyperparameter "
"optimization."},
"pqkmeans": {"available": False, "description":
"Enables clustering of training data."},
"total_energy": {"available": False, "description":
"Enables calculation of total energy."},
"asap3": {"available": False, "description":
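A hedged sketch (not MALA's actual implementation) of how an availability dict like the one above can be populated: probe each optional module with importlib and flip its "available" flag. Only the two entries visible in this hunk are reproduced; the pqkmeans entry is the one being removed.

import importlib.util

optional_modules = {
    "oapackage": {"available": False, "description":
                  "Enables usage of OAT method for hyperparameter "
                  "optimization."},
    "total_energy": {"available": False, "description":
                     "Enables calculation of total energy."},
}

for name, info in optional_modules.items():
    # find_spec returns None when the top-level module cannot be found.
    info["available"] = importlib.util.find_spec(name) is not None

for name, info in optional_modules.items():
    status = "installed" if info["available"] else "not installed"
    print(f"{name}: {status}\t{info['description']}")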
21 changes: 0 additions & 21 deletions mala/common/parameters.py
@@ -569,23 +569,6 @@ class ParametersData(ParametersBase):
If True, will use alternative lazy loading path with prefetching
for higher performance
use_clustering : bool
If True, and use_lazy_loading is True as well, the data is clustered,
i.e. not the entire training data is used but rather only a subset
which is determined by a clustering algorithm.
number_of_clusters : int
If use_clustering is True, this is the number of clusters used per
snapshot.
train_ratio : float
If use_clustering is True, this is the ratio of training data used
to train the encoder for the clustering.
sample_ratio : float
If use_clustering is True, this is the ratio of training data used
for sampling per snapshot (according to clustering then, of course).
use_fast_tensor_data_set : bool
If True, then the new, fast TensorDataSet implemented by Josh Romero
will be used.
@@ -603,10 +586,6 @@ def __init__(self):
self.output_rescaling_type = "None"
self.use_lazy_loading = False
self.use_lazy_loading_prefetch = False
self.use_clustering = False
self.number_of_clusters = 40
self.train_ratio = 0.1
self.sample_ratio = 0.5
self.use_fast_tensor_data_set = False
self.shuffling_seed = None

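After this commit the clustering knobs (`use_clustering`, `number_of_clusters`, `train_ratio`, `sample_ratio`) are gone; the surviving data options remain plain attributes on `ParametersData`. An illustrative sketch, assuming the standard `mala.Parameters()` entry point:

import mala

parameters = mala.Parameters()

# Lazy-loading options kept by this commit (attribute names taken from
# the diff above).
parameters.data.use_lazy_loading = True
parameters.data.use_lazy_loading_prefetch = False

# The fast TensorDataSet path is unrelated to clustering and remains.
parameters.data.use_fast_tensor_data_set = True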
114 changes: 41 additions & 73 deletions mala/datahandling/data_handler.py
@@ -16,8 +16,6 @@
from mala.datahandling.data_scaler import DataScaler
from mala.datahandling.snapshot import Snapshot
from mala.datahandling.lazy_load_dataset import LazyLoadDataset
from mala.datahandling.lazy_load_dataset_clustered import \
LazyLoadDatasetClustered
from mala.datahandling.lazy_load_dataset_single import LazyLoadDatasetSingle
from mala.datahandling.fast_tensor_dataset import FastTensorDataset

@@ -570,50 +568,25 @@ def __build_datasets(self):
if self.parameters.use_lazy_loading and not self.parameters.use_lazy_loading_prefetch:

# Create the lazy loading data sets.
if self.parameters.use_clustering:
self.training_data_sets.append(LazyLoadDatasetClustered(
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.grid_dimension, self.grid_size,
self.use_horovod, self.parameters.number_of_clusters,
self.parameters.train_ratio,
self.parameters.sample_ratio))
self.validation_data_sets.append(LazyLoadDataset(
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod))

if self.nr_test_data != 0:
self.test_data_sets.append(LazyLoadDataset(
self.input_dimension,
self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod,
input_requires_grad=True))
self.training_data_sets.append(LazyLoadDataset(
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod))
self.validation_data_sets.append(LazyLoadDataset(
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod))

else:
self.training_data_sets.append(LazyLoadDataset(
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod))
self.validation_data_sets.append(LazyLoadDataset(
self.input_dimension, self.output_dimension,
if self.nr_test_data != 0:
self.test_data_sets.append(LazyLoadDataset(
self.input_dimension,
self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod))

if self.nr_test_data != 0:
self.test_data_sets.append(LazyLoadDataset(
self.input_dimension,
self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod,
input_requires_grad=True))
self.use_horovod,
input_requires_grad=True))

# Add snapshots to the lazy loading data sets.
for snapshot in self.parameters.snapshot_directories_list:
@@ -624,8 +597,6 @@ def __build_datasets(self):
if snapshot.snapshot_function == "te":
self.test_data_sets[0].add_snapshot_to_dataset(snapshot)

if self.parameters.use_clustering:
self.training_data_sets[0].cluster_dataset()
# I don't think we need to mix them here. We can use the standard
# ordering for the first epoch
# and mix it up after.
@@ -634,34 +605,31 @@
# self.test_data_set.mix_datasets()
elif self.parameters.use_lazy_loading and self.parameters.use_lazy_loading_prefetch:
printout("Using lazy loading pre-fetching.", min_verbosity=2)
# Create the lazy loading data sets.
if self.parameters.use_clustering:
raise Exception("clustering not supported in this mode")
else:
# Create LazyLoadDatasetSingle instances per snapshot and add to list
for snapshot in self.parameters.snapshot_directories_list:
if snapshot.snapshot_function == "tr":
self.training_data_sets.append(LazyLoadDatasetSingle(
self.mini_batch_size, snapshot,
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod))
if snapshot.snapshot_function == "va":
self.validation_data_sets.append(LazyLoadDatasetSingle(
self.mini_batch_size, snapshot,
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod))
if snapshot.snapshot_function == "te":
self.test_data_sets.append(LazyLoadDatasetSingle(
self.mini_batch_size, snapshot,
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod,
input_requires_grad=True))
# Create LazyLoadDatasetSingle instances per snapshot and add to
# list.
for snapshot in self.parameters.snapshot_directories_list:
if snapshot.snapshot_function == "tr":
self.training_data_sets.append(LazyLoadDatasetSingle(
self.mini_batch_size, snapshot,
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod))
if snapshot.snapshot_function == "va":
self.validation_data_sets.append(LazyLoadDatasetSingle(
self.mini_batch_size, snapshot,
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod))
if snapshot.snapshot_function == "te":
self.test_data_sets.append(LazyLoadDatasetSingle(
self.mini_batch_size, snapshot,
self.input_dimension, self.output_dimension,
self.input_data_scaler, self.output_data_scaler,
self.descriptor_calculator, self.target_calculator,
self.use_horovod,
input_requires_grad=True))

else:
if self.nr_training_data != 0:
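With the `use_clustering` branches removed, `__build_datasets` reduces to a three-way choice driven by the two lazy-loading flags. A condensed, hypothetical restatement (the class names mirror the diff; the real method constructs the data sets rather than naming them):

from dataclasses import dataclass

@dataclass
class LoadingFlags:
    use_lazy_loading: bool = False
    use_lazy_loading_prefetch: bool = False

def pick_dataset_strategy(flags: LoadingFlags) -> str:
    # Mirrors the branch order in __build_datasets after the commit.
    if flags.use_lazy_loading and not flags.use_lazy_loading_prefetch:
        return "LazyLoadDataset"        # one data set per role (tr/va/te)
    if flags.use_lazy_loading and flags.use_lazy_loading_prefetch:
        return "LazyLoadDatasetSingle"  # one data set per snapshot, prefetched
    return "TensorDataset"              # all data held in RAM (or the fast variant)

# The clustered path no longer exists:
assert pick_dataset_strategy(LoadingFlags(True, False)) == "LazyLoadDataset"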
8 changes: 0 additions & 8 deletions mala/datahandling/data_handler_base.py
@@ -47,11 +47,6 @@ def __init__(self, parameters: Parameters, target_calculator=None,
self.output_dimension = 0
self.nr_snapshots = 0

# Clustering still needs uniform grids
if self.parameters.use_clustering:
self.grid_dimension = [0, 0, 0]
self.grid_size = 0

##############################
# Properties
##############################
@@ -182,9 +177,6 @@ def _check_snapshots(self, comm=None):
snapshot.grid_size = int(np.prod(snapshot.grid_dimension))
if firstsnapshot:
self.input_dimension = tmp_input_dimension
if self.parameters.use_clustering:
self.grid_dimension[0:3] = tmp_grid_dim[0:3]
self.grid_size = np.prod(self.grid_dimension)
else:
if self.input_dimension != tmp_input_dimension:
raise Exception("Invalid snapshot entered at ", snapshot.
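What survives in this second hunk is the cross-snapshot consistency check: the first snapshot fixes the reference input dimension, and every later snapshot must match it. A distilled, hypothetical sketch of that logic:

import numpy as np

def check_snapshot_dimensions(snapshots):
    """snapshots: iterable of (input_dimension, grid_dimension) pairs."""
    input_dimension = None
    grid_sizes = []
    for input_dim, grid_dim in snapshots:
        # Per-snapshot grid size, as computed in the hunk above.
        grid_sizes.append(int(np.prod(grid_dim)))
        if input_dimension is None:
            # The first snapshot sets the reference input dimension.
            input_dimension = input_dim
        elif input_dim != input_dimension:
            raise ValueError("Invalid snapshot entered: %d" % input_dim)
    return input_dimension, grid_sizes

# Two consistent snapshots on a 10x10x10 grid:
print(check_snapshot_dimensions([(91, [10, 10, 10]), (91, [10, 10, 10])]))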
(2 further changed files not shown.)
