Add multi-GPU dataparallel #74

Merged (64 commits) on Mar 28, 2024
Commits
abc66f8
make matscipy default neighbour list
ilyes319 Jan 6, 2023
73b234c
on the fly data loading
davkovacs Feb 8, 2023
e8b5bac
add multi-GPU dataparrallel
ilyes319 Feb 9, 2023
83733f6
threaded loading and improved speed
davkovacs Feb 9, 2023
e966331
implement statistics parsing and improve run_train
davkovacs Feb 9, 2023
7604f23
refactoring plus cleaned up structure
davkovacs Feb 13, 2023
4a5309e
small bugfix
davkovacs Feb 13, 2023
5303cb4
parse statistics from json file
davkovacs Feb 14, 2023
f3f14f2
still a little slow data loading
davkovacs Feb 14, 2023
da23dba
performance good, but no shuffling of training set during training!
davkovacs Feb 14, 2023
a290466
data loading works with shuffling
davkovacs Feb 14, 2023
b7a097a
document on-line data loading
davkovacs Feb 15, 2023
652a96e
Update README.md
davkovacs Feb 15, 2023
d619d9d
Merge pull request #73 from davkovacs/on_the_fly_dataloading
ilyes319 Feb 15, 2023
262a1fb
lower CPU memroy during preprocessing
davkovacs Feb 15, 2023
f769417
Merge branch 'on_the_fly_dataloading' of github.com:davkovacs/mace in…
davkovacs Feb 15, 2023
cde62d3
update README
davkovacs Feb 15, 2023
d662854
save jsons stats as strings
davkovacs Feb 15, 2023
3b4162b
fix test set h5 save name
davkovacs Feb 16, 2023
2dee6d1
Merge pull request #81 from davkovacs/on_the_fly_dataloading
ilyes319 Feb 16, 2023
bc8f04a
improve half periodic matscipy
ilyes319 Feb 24, 2023
5ab28a6
Merge pull request #64 from ACEsuit/52-matscipy-neighbour-list-as-def…
ilyes319 Apr 6, 2023
e552cc5
implement on the fly graph creation
ilyes319 Apr 6, 2023
5d90030
add hf5 test
ilyes319 Apr 6, 2023
511f591
fix stuff
ilyes319 Apr 6, 2023
fca0bb3
Merge pull request #99 from davkovacs/multi-GPU
ilyes319 Apr 6, 2023
56b3814
fix import
ilyes319 Apr 6, 2023
8c0eb6f
add flags
ilyes319 Apr 6, 2023
8bd47ec
fix cpu
ilyes319 Apr 6, 2023
2009b0c
Update run_train.py
ilyes319 Apr 6, 2023
0f3ed94
add correct batching
ilyes319 Apr 6, 2023
3b15205
debug
davkovacs Apr 7, 2023
717c8f4
node attrs req grad
davkovacs May 7, 2023
bb4d51e
Multi-node, multi-GPU data parallel training.
samwaltonnorwood May 22, 2023
4971d90
Only open hdf5 file when it is used and remove it from the state for …
sivonxay Jun 15, 2023
e0c94fb
remove new file creation in HDF5ChainDataset
sivonxay Jun 15, 2023
d07469a
Distributed evaluation.
samwaltonnorwood Jun 16, 2023
583b1c3
Merge pull request #117 from sivonxay/multi-GPU
ilyes319 Jun 17, 2023
643a18b
Update .gitignore
mavaylon1 Jun 28, 2023
b900ee4
Update .gitignore
mavaylon1 Jun 28, 2023
37762e9
Update preprocess_data.py
mavaylon1 Jun 28, 2023
99baf11
Update utils.py
mavaylon1 Jun 28, 2023
7876875
cpu/4
mavaylon1 Jun 28, 2023
3c13f10
compute statistics only once
ilyes319 Jul 6, 2023
a28c4fb
Merge branch 'multi-GPU' into multi-GPU
mavaylon1 Jul 17, 2023
2f54450
some reverts
mavaylon1 Jul 20, 2023
1d67e69
clean
mavaylon1 Jul 20, 2023
7befa8f
Merge pull request #105 from samwaltonnorwood/distributed
ilyes319 Jul 24, 2023
354c23d
test
mavaylon1 Jul 31, 2023
3df66cc
test cleanm
mavaylon1 Jul 31, 2023
741f9aa
Merge branch 'ACEsuit:multi-GPU' into multi-GPU
mavaylon1 Aug 1, 2023
41aeede
removal
mavaylon1 Aug 1, 2023
42ffd94
clean up
mavaylon1 Aug 1, 2023
3ce9b08
clean up
mavaylon1 Aug 1, 2023
916d759
clean up
mavaylon1 Aug 1, 2023
8e7d174
clean up
mavaylon1 Aug 1, 2023
2a74e38
args
mavaylon1 Aug 1, 2023
5da3b6a
fix
mavaylon1 Aug 8, 2023
729b1ad
parse/test
mavaylon1 Aug 8, 2023
a07a0ff
clean
mavaylon1 Aug 20, 2023
9f553f6
type hints
mavaylon1 Aug 20, 2023
5d27636
remove
mavaylon1 Sep 5, 2023
d8e236a
Merge pull request #133 from mavaylon1/multi-GPU
ilyes319 Sep 29, 2023
113b839
fix bug avg neighbors
ilyes319 Oct 10, 2023
6 changes: 6 additions & 0 deletions .gitignore
@@ -18,3 +18,9 @@ build/
.vscode/
logs/MACE_run-5.log
*.txt

# Jupyter Notebook
.ipynb_checkpoints

# DS_Store
.DS_Store
48 changes: 47 additions & 1 deletion README.md
@@ -76,7 +76,7 @@ python ./mace/scripts/run_train.py \

To give a specific validation set, use the argument `--valid_file`. To set a larger batch size for evaluating the validation set, specify `--valid_batch_size`.

To control the model's size, you need to change `--hidden_irreps`. For most applications, the recommended default model size is `--hidden_irreps='256x0e'` (meaning 256 invariant messages) or `--hidden_irreps='128x0e + 128x1o'`. If the model is not accurate enough, you can include higher-order features, e.g., `128x0e + 128x1o + 128x2e`, or increase the number of channels to `256`. It is also possible to specify the model size using the `--num_channels=128` and `--max_L=1` flags.

It is usually preferred to add the isolated atoms to the training set, rather than reading in their energies through the command line like in the example above. To label them in the training set, set `config_type=IsolatedAtom` in their info fields. If you prefer not to use or do not know the energies of the isolated atoms, you can use the option `--E0s="average"` which estimates the atomic energies using least squares regression.

@@ -105,6 +105,52 @@

You can run our [Colab tutorial](https://colab.research.google.com/drive/1D6EtMUjQPey_GkuxUAbPgld6_9ibIa-V?authuser=1#scrollTo=Z10787RE1N8T) to quickly get started with MACE.

## On-line data loading for large datasets

If you have a large dataset that might not fit into GPU memory, it is recommended to preprocess the data on a CPU and use on-line data loading to train the model. To preprocess a dataset specified as an xyz file, run the `preprocess_data.py` script. An example is given here:

```sh
mkdir processed_data
python ./mace/scripts/preprocess_data.py \
--train_file="/path/to/train_large.xyz" \
--valid_fraction=0.05 \
--test_file="/path/to/test_large.xyz" \
--atomic_numbers="[1, 6, 7, 8, 9, 15, 16, 17, 35, 53]" \
--r_max=4.5 \
--h5_prefix="processed_data/" \
--compute_statistics \
--E0s="average" \
--seed=123
```
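The preprocessed files are plain HDF5, so they can be inspected directly. Below is a minimal sketch; the `train.h5` name follows the training example later in this README, and the `config_batch_*`/`config_*` group layout is assumed from the `HDF5Dataset` reader added in this PR:

```python
import h5py

# Assumed layout: top-level groups are batches ("config_batch_0", ...),
# each holding configurations ("config_0", ...) with per-configuration
# datasets such as atomic_numbers, positions, energy, forces.
with h5py.File("processed_data/train.h5", "r") as f:
    batch = f["config_batch_0"]
    config = batch["config_0"]
    print(list(f.keys())[:3])
    print(list(batch.keys())[:3])
    print(config["atomic_numbers"][()], config["energy"][()])
```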

To see all options with a short description of each, run `python ./mace/scripts/preprocess_data.py --help`. The script will create a number of HDF5 files in the `processed_data` folder which can be used for training: one file for training, one for validation, and a separate one for each `config_type` in the test set. To train the model, use the `run_train.py` script as follows:

```sh
python ./mace/scripts/run_train.py \
--name="MACE_on_big_data" \
--num_workers=16 \
--train_file="./processed_data/train.h5" \
--valid_file="./processed_data/valid.h5" \
--test_dir="./processed_data" \
--statistics_file="./processed_data/statistics.json" \
--model="ScaleShiftMACE" \
--num_interactions=2 \
--num_channels=128 \
--max_L=1 \
--correlation=3 \
--batch_size=32 \
--valid_batch_size=32 \
--max_num_epochs=100 \
--swa \
--start_swa=60 \
--ema \
--ema_decay=0.99 \
--amsgrad \
--error_table='PerAtomMAE' \
--device=cuda \
--seed=123
```

## Weights and Biases for experiment tracking

If you would like to use MACE with Weights and Biases to log your experiments, simply install with
(diff truncated)
8 changes: 8 additions & 0 deletions mace/data/__init__.py
@@ -8,8 +8,12 @@
config_from_atoms_list,
load_from_xyz,
random_train_valid_split,
save_configurations_as_HDF5,
test_config_types,
save_dataset_as_HDF5,
save_AtomicData_to_HDF5,
)
from .hdf5_dataset import HDF5Dataset, dataset_from_sharded_hdf5

__all__ = [
"get_neighborhood",
@@ -22,4 +26,8 @@
"config_from_atoms_list",
"AtomicData",
"compute_average_E0s",
"save_dataset_as_HDF5",
"HDF5Dataset",
"save_AtomicData_to_HDF5",
"save_configurations_as_HDF5",
]
170 changes: 170 additions & 0 deletions mace/data/hdf5_dataset.py
@@ -0,0 +1,170 @@
from glob import glob

import h5py
from torch.utils.data import ChainDataset, ConcatDataset, Dataset, IterableDataset

from mace import data
from mace.data.utils import Configuration
from mace.tools.utils import AtomicNumberTable


class HDF5ChainDataset(ChainDataset):
def __init__(self, file_path, r_max, z_table, **kwargs):
super(HDF5ChainDataset, self).__init__()
self.file_path = file_path
self._file = None

self.length = len(self.file.keys())
self.r_max = r_max
self.z_table = z_table

@property
def file(self):
if self._file is None:
# If a file has not already been opened, open one here
self._file = h5py.File(self.file_path, "r")
return self._file

def __getstate__(self):
_d = dict(self.__dict__)

# An opened h5py.File cannot be pickled, so we must exclude it from the state
_d["_file"] = None
return _d

def __call__(self):
datasets = []
for i in range(self.length):
grp = self.file["config_" + str(i)]
datasets.append(
HDF5IterDataset(
iter_group=grp,
r_max=self.r_max,
z_table=self.z_table,
)
)
return ChainDataset(datasets)


class HDF5IterDataset(IterableDataset):
def __init__(self, iter_group, r_max, z_table, **kwargs):
super(HDF5IterDataset, self).__init__()
# it might be dangerous to open the file here
# move opening of file to __getitem__?
self.iter_group = iter_group
self.length = len(self.iter_group.keys())
self.r_max = r_max
self.z_table = z_table
# self.file = file
# self.length = len(h5py.File(file, 'r').keys())

def __len__(self):
return self.length

def __iter__(self):
# file = h5py.File(self.file, 'r')
# grp = file["config_" + str(index)]
grp = self.iter_group
len_subgrp = len(grp.keys())
grp_list = []
for i in range(len_subgrp):
subgrp = grp["config_" + str(i)]
config = Configuration(
atomic_numbers=subgrp["atomic_numbers"][()],
positions=subgrp["positions"][()],
energy=subgrp["energy"][()],
forces=subgrp["forces"][()],
stress=subgrp["stress"][()],
virials=subgrp["virials"][()],
dipole=subgrp["dipole"][()],
charges=subgrp["charges"][()],
weight=subgrp["weight"][()],
energy_weight=subgrp["energy_weight"][()],
forces_weight=subgrp["forces_weight"][()],
stress_weight=subgrp["stress_weight"][()],
virials_weight=subgrp["virials_weight"][()],
config_type=subgrp["config_type"][()],
pbc=subgrp["pbc"][()],
cell=subgrp["cell"][()],
)
atomic_data = data.AtomicData.from_config(
config, z_table=self.z_table, cutoff=self.r_max
)
grp_list.append(atomic_data)

return iter(grp_list)


class HDF5Dataset(Dataset):
def __init__(self, file_path, r_max, z_table, **kwargs):
super(HDF5Dataset, self).__init__()
self.file_path = file_path
self._file = None
batch_key = list(self.file.keys())[0]
self.batch_size = len(self.file[batch_key].keys())
self.length = len(self.file.keys()) * self.batch_size
self.r_max = r_max
self.z_table = z_table
try:
self.drop_last = bool(self.file.attrs["drop_last"])
except KeyError:
self.drop_last = False

@property
def file(self):
if self._file is None:
# If a file has not already been opened, open one here
self._file = h5py.File(self.file_path, "r")
return self._file

def __getstate__(self):
_d = dict(self.__dict__)

# An opened h5py.File cannot be pickled, so we must exclude it from the state
_d["_file"] = None
return _d

def __len__(self):
return self.length

def __getitem__(self, index):
# compute the index of the batch
batch_index = index // self.batch_size
config_index = index % self.batch_size
grp = self.file["config_batch_" + str(batch_index)]
subgrp = grp["config_" + str(config_index)]
config = Configuration(
atomic_numbers=subgrp["atomic_numbers"][()],
positions=subgrp["positions"][()],
energy=unpack_value(subgrp["energy"][()]),
forces=unpack_value(subgrp["forces"][()]),
stress=unpack_value(subgrp["stress"][()]),
virials=unpack_value(subgrp["virials"][()]),
dipole=unpack_value(subgrp["dipole"][()]),
charges=unpack_value(subgrp["charges"][()]),
weight=unpack_value(subgrp["weight"][()]),
energy_weight=unpack_value(subgrp["energy_weight"][()]),
forces_weight=unpack_value(subgrp["forces_weight"][()]),
stress_weight=unpack_value(subgrp["stress_weight"][()]),
virials_weight=unpack_value(subgrp["virials_weight"][()]),
config_type=unpack_value(subgrp["config_type"][()]),
pbc=unpack_value(subgrp["pbc"][()]),
cell=unpack_value(subgrp["cell"][()]),
)
atomic_data = data.AtomicData.from_config(
config, z_table=self.z_table, cutoff=self.r_max
)
return atomic_data

def dataset_from_sharded_hdf5(files: str, z_table: AtomicNumberTable, r_max: float):
    # `files` is the path to a directory of HDF5 shards, one per preprocessing worker
    files = glob(files + "/*")
datasets = []
for file in files:
datasets.append(data.HDF5Dataset(file, z_table=z_table, r_max=r_max))
full_dataset = ConcatDataset(datasets)
return full_dataset

def unpack_value(value):
value = value.decode("utf-8") if isinstance(value, bytes) else value
return None if str(value) == "None" else value
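For orientation, here is a minimal usage sketch of the classes above. The element list and file paths are hypothetical, `AtomicNumberTable` comes from `mace.tools.utils` as in the module's imports, and a plain PyTorch `DataLoader` with an identity collate stands in for MACE's own graph-aware loader:

```python
from torch.utils.data import DataLoader

from mace.data.hdf5_dataset import HDF5Dataset, dataset_from_sharded_hdf5
from mace.tools.utils import AtomicNumberTable

z_table = AtomicNumberTable([1, 6, 8])  # hypothetical element set

# A single preprocessed file: a flat index is mapped to
# (config_batch_i, config_j) groups and an AtomicData graph is built on the fly.
train_set = HDF5Dataset("processed_data/train.h5", r_max=4.5, z_table=z_table)
first_graph = train_set[0]

# For a directory of shards (one HDF5 file per preprocessing worker):
# train_set = dataset_from_sharded_hdf5("processed_data/train", z_table=z_table, r_max=4.5)

# Batching sketch only: real training would use a collate function that
# merges AtomicData graphs, not this identity stand-in.
loader = DataLoader(train_set, batch_size=4, collate_fn=lambda batch: batch)
for graphs in loader:
    break  # `graphs` is a list of 4 AtomicData objects
```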
31 changes: 19 additions & 12 deletions mace/data/neighborhood.py
@@ -1,13 +1,7 @@
-###########################################################################################
-# Neighborhood construction
-# Authors: Ilyes Batatia, Gregor Simm
-# This program is distributed under the MIT License (see MIT.md)
-###########################################################################################
-
from typing import Optional, Tuple

-import ase.neighborlist
import numpy as np
+from matscipy.neighbours import neighbour_list


def get_neighborhood(
@@ -16,24 +10,37 @@ def get_neighborhood(
    pbc: Optional[Tuple[bool, bool, bool]] = None,
    cell: Optional[np.ndarray] = None,  # [3, 3]
    true_self_interaction=False,
-) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+) -> Tuple[np.ndarray, np.ndarray]:
    if pbc is None:
        pbc = (False, False, False)

    if cell is None or cell.any() == np.zeros((3, 3)).any():
        cell = np.identity(3, dtype=float)

    assert len(pbc) == 3 and all(isinstance(i, (bool, np.bool_)) for i in pbc)
    assert cell.shape == (3, 3)

+    pbc_x = pbc[0]
+    pbc_y = pbc[1]
+    pbc_z = pbc[2]
+    identity = np.identity(3, dtype=float)
+    max_positions = np.max(np.absolute(positions)) + 1
+    # Extend cell in non-periodic directions
+    if not pbc_x:
+        cell[:, 0] = max_positions * 5 * cutoff * identity[:, 0]
+    if not pbc_y:
+        cell[:, 1] = max_positions * 5 * cutoff * identity[:, 1]
+    if not pbc_z:
+        cell[:, 2] = max_positions * 5 * cutoff * identity[:, 2]

-    sender, receiver, unit_shifts = ase.neighborlist.primitive_neighbor_list(
+    sender, receiver, unit_shifts = neighbour_list(
        quantities="ijS",
        pbc=pbc,
        cell=cell,
        positions=positions,
        cutoff=cutoff,
-        self_interaction=True,  # we want edges from atom to itself in different periodic images
-        use_scaled_positions=False,  # positions are not scaled positions
+        # self_interaction=True,  # we want edges from atom to itself in different periodic images
+        # use_scaled_positions=False,  # positions are not scaled positions
    )

if not true_self_interaction:
(diff truncated)
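To illustrate the change, here is a minimal sketch that calls the rewritten function on a fully non-periodic configuration. The coordinates are made up, and the two-array unpacking follows the updated return annotation shown above:

```python
import numpy as np

from mace.data import get_neighborhood

# Three atoms with open boundaries; the function extends the cell internally
# so matscipy's neighbour_list can run without real periodic images.
positions = np.array(
    [[0.0, 0.0, 0.0], [0.96, 0.0, 0.0], [-0.24, 0.93, 0.0]]
)
edge_index, shifts = get_neighborhood(positions=positions, cutoff=4.5)
print(edge_index.shape)  # expected (2, n_edges): sender/receiver indices
```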