0.6.0 #429

Merged: 168 commits, May 10, 2024

Commits (168)

c49cee4
bump
Linux-cpp-lisp Dec 19, 2022
585c5bd
`model_dtype` initial
Linux-cpp-lisp Dec 19, 2022
3cb9854
GraphModel
Linux-cpp-lisp Dec 19, 2022
3705fbf
promote dtype in loss
Linux-cpp-lisp Dec 19, 2022
09bb5db
Working `model_dtype`
Linux-cpp-lisp Dec 19, 2022
c1d68a4
Fix equivariance tests
Linux-cpp-lisp Dec 19, 2022
5fa3e49
passing tests
Linux-cpp-lisp Dec 19, 2022
d563759
Use Cholesky for solver
Linux-cpp-lisp Dec 20, 2022
617d3fc
docstring
Linux-cpp-lisp Dec 20, 2022
0657fef
bump
Linux-cpp-lisp Dec 20, 2022
2ce69d4
seed changes
Linux-cpp-lisp Dec 20, 2022
20c2529
better logging default
Linux-cpp-lisp Dec 20, 2022
1c63525
ssp -> silu default
Linux-cpp-lisp Dec 20, 2022
631904c
missing test seed
Linux-cpp-lisp Dec 20, 2022
c281f56
updated tests for float64/float32 mixed
Linux-cpp-lisp Dec 21, 2022
28df610
FMA with version check
Linux-cpp-lisp Dec 21, 2022
2a6d27c
Merge branch 'develop' into model-dtype
Linux-cpp-lisp Dec 21, 2022
15edf66
remove fixed_fields machinery
Linux-cpp-lisp Dec 21, 2022
0d40579
Merge branch 'develop' into model-dtype
Linux-cpp-lisp Dec 21, 2022
50539bc
1.10 compat
Linux-cpp-lisp Dec 21, 2022
dcf77ae
Merge branch 'develop' into model-dtype
Linux-cpp-lisp Dec 21, 2022
86857c3
lint
Linux-cpp-lisp Dec 21, 2022
77b8971
ensure dtype reset if error
Linux-cpp-lisp Dec 21, 2022
9ddd616
fix type promotion in scaling
Linux-cpp-lisp Dec 21, 2022
9b95404
fix to new return format
Linux-cpp-lisp Dec 21, 2022
d1f4da3
make tests more efficient
Linux-cpp-lisp Dec 21, 2022
1a4a3ca
lint
Linux-cpp-lisp Dec 21, 2022
75d9286
cheaper? dtype promotion
Linux-cpp-lisp Dec 21, 2022
aaa061c
Run tests on multiple GPUs when available
Linux-cpp-lisp Dec 21, 2022
9c8a998
multi gpu pytest
Linux-cpp-lisp Dec 21, 2022
075d2b0
Merge branch 'develop' into model-dtype
Linux-cpp-lisp Dec 21, 2022
436e5cf
Fix tests
Linux-cpp-lisp Dec 22, 2022
ed9a328
more info in equivar test failure
Linux-cpp-lisp Dec 22, 2022
5973497
warn on default_dtype=float32
Linux-cpp-lisp Dec 22, 2022
f244deb
get_device() helper
Linux-cpp-lisp Dec 22, 2022
66e9707
graph_model model builders
Linux-cpp-lisp Dec 22, 2022
dd6c7ba
More robust embedding cutoff test
Linux-cpp-lisp Jan 9, 2023
d0f9fc5
adding Tensorboard as logger (#289)
nw13slx Jan 10, 2023
284808c
Merge branch 'develop' into model-dtype
Linux-cpp-lisp Jan 10, 2023
33f6751
partial epochs
Linux-cpp-lisp Jan 11, 2023
34dc79f
match tolerance to dtype
Linux-cpp-lisp Jan 11, 2023
96d6fee
Work with `wandb>=0.13.8`
Linux-cpp-lisp Jan 12, 2023
9713ede
docs
Linux-cpp-lisp Jan 12, 2023
d43a725
handle data type
Linux-cpp-lisp Jan 18, 2023
63099ab
benchmark with explicit CUDA sync too
Linux-cpp-lisp Jan 18, 2023
ee0b15d
remove torch in setup.py
Linux-cpp-lisp Jan 19, 2023
9da6bb0
version bump
Linux-cpp-lisp Jan 19, 2023
922b622
add batch ptr key to avoid .max() calls
Linux-cpp-lisp Jan 23, 2023
3db6752
looping for partialsampler
Linux-cpp-lisp Jan 24, 2023
703142a
update with plugin section
Linux-cpp-lisp Jan 24, 2023
ad2b2f1
fix with graphmodel
Linux-cpp-lisp Jan 24, 2023
032031c
thresholds
Linux-cpp-lisp Feb 2, 2023
fd74bc6
improve NequIP numerics
Linux-cpp-lisp Feb 2, 2023
f695c87
Merge branch 'develop' into model-dtype
Linux-cpp-lisp Feb 2, 2023
75976a0
update changelog
Linux-cpp-lisp Feb 2, 2023
15d1b5e
Merge branch 'main' into develop
Linux-cpp-lisp Feb 2, 2023
9e983cd
lint
Linux-cpp-lisp Feb 2, 2023
8f13ecb
fix trainer test
Linux-cpp-lisp Feb 2, 2023
c651445
remove related_scale/shift_keys
Linux-cpp-lisp Feb 3, 2023
e471c8d
stress units note
Linux-cpp-lisp Feb 3, 2023
580a76a
test PartialSampler
Linux-cpp-lisp Feb 4, 2023
f052903
add force test
Linux-cpp-lisp Feb 4, 2023
0c4baad
improved batch indexing in stress
Linux-cpp-lisp Feb 4, 2023
cdc36b5
test wrapped vs unwrapped consistent
Linux-cpp-lisp Feb 4, 2023
efa9c20
RDF
Linux-cpp-lisp Feb 4, 2023
d569419
remove unnecessary sum
Linux-cpp-lisp Feb 7, 2023
4b870ca
remove weird print
Linux-cpp-lisp Feb 7, 2023
005104d
don't batch / scatter unnecessarily
Linux-cpp-lisp Feb 7, 2023
6b4c0d5
addmm
Linux-cpp-lisp Feb 7, 2023
9cdf325
JIT
Linux-cpp-lisp Feb 7, 2023
b59d8cf
double check for valid autograd graph
Linux-cpp-lisp Feb 7, 2023
289dd13
fix dimensions in special case
Linux-cpp-lisp Feb 7, 2023
f5c19c9
text wrapped more
Linux-cpp-lisp Feb 7, 2023
265fa01
initial pair potentials
Linux-cpp-lisp Feb 8, 2023
abdc6f2
fix tests
Linux-cpp-lisp Feb 8, 2023
20dddf9
Merge branch 'develop' into pair-potential
Linux-cpp-lisp Feb 8, 2023
fcb921f
Use less data nequip-benchmark
Linux-cpp-lisp Feb 10, 2023
721777b
Merge branch 'develop' into pair-potential
Linux-cpp-lisp Feb 10, 2023
9fbee91
warn on override of default dtype
Linux-cpp-lisp Feb 10, 2023
3167afe
record and restore model and default dtype in deployment
Linux-cpp-lisp Feb 10, 2023
98a510f
refactor
Linux-cpp-lisp Feb 10, 2023
8ab8895
fix when cell not present
Linux-cpp-lisp Feb 10, 2023
25c42d7
fix RDF to give self-self RDFs
Linux-cpp-lisp Feb 10, 2023
7e0ecf8
add plotting script
Linux-cpp-lisp Feb 12, 2023
9e0e572
pair
Linux-cpp-lisp Feb 12, 2023
a9866aa
rescale only when there is a scale
Linux-cpp-lisp Feb 13, 2023
e941b4f
Test ZBL against LAMMPS
Linux-cpp-lisp Feb 13, 2023
e36afb5
ensure config
Linux-cpp-lisp Feb 13, 2023
e9c7a8a
less indexing
Linux-cpp-lisp Feb 13, 2023
5a7f328
allow empty tensors past tests
Linux-cpp-lisp Feb 13, 2023
1c056cf
test force smoothness
Linux-cpp-lisp Feb 13, 2023
9a71e15
test ZBL thoroughly
Linux-cpp-lisp Feb 13, 2023
a232c88
Test with pair potential
Linux-cpp-lisp Feb 13, 2023
73e4142
GPU OOM offloading mode (#300)
Linux-cpp-lisp Feb 13, 2023
2dbcd6f
Merge branch 'develop' into pair-potential
Linux-cpp-lisp Feb 13, 2023
f5a19f4
lint
Linux-cpp-lisp Feb 13, 2023
ead9c5d
--output-fields-from-original-dataset
Linux-cpp-lisp Feb 14, 2023
cb3a347
Add parity plot example script
Linux-cpp-lisp Feb 14, 2023
90d7c0c
Warn/error on unused keys (#301)
Linux-cpp-lisp Feb 15, 2023
3808249
fix unused error for LR, early stopping, etc. options
Linux-cpp-lisp Feb 20, 2023
e1ee4c6
remove default run name
Linux-cpp-lisp Feb 20, 2023
29f089a
more aggressive test to compensate for nondet numerics
Linux-cpp-lisp Feb 21, 2023
91e498f
backward compatibility
Linux-cpp-lisp Feb 21, 2023
3ac367b
backwards compat, again
Linux-cpp-lisp Feb 21, 2023
28c0643
fix relaxed atol to be in both checks
Linux-cpp-lisp Feb 23, 2023
6fa97f3
add error
Linux-cpp-lisp Feb 28, 2023
8f3e6f3
StressForceOutput default
Linux-cpp-lisp Feb 28, 2023
373e120
better version parsing
Linux-cpp-lisp Feb 28, 2023
aecf025
global options fuse
Linux-cpp-lisp Feb 28, 2023
57789df
absmax
Linux-cpp-lisp Feb 28, 2023
190a9aa
document absmax
Linux-cpp-lisp Feb 28, 2023
9f99f8e
better statistics N<2 error than nan
Linux-cpp-lisp Mar 1, 2023
d88687d
fix default
Linux-cpp-lisp Mar 1, 2023
539a1a4
document
Linux-cpp-lisp Mar 1, 2023
ff2d2c6
fix on CPU
Linux-cpp-lisp Mar 2, 2023
3a6ae34
fix data count errors
Linux-cpp-lisp Mar 2, 2023
08a35be
fix load_model_state with CUDA to CPU
Linux-cpp-lisp Mar 3, 2023
54805d2
Fix torchscript when no shifts/scales
Linux-cpp-lisp Mar 6, 2023
3d14cbe
Add HDF5 based dataset option (#227)
peastman Mar 17, 2023
93f2112
slightly relax test numerics in float32
Linux-cpp-lisp Mar 19, 2023
a3f7536
fix tests
Linux-cpp-lisp Mar 20, 2023
12d3da9
better message
Linux-cpp-lisp Mar 23, 2023
73b0c6f
fix adjacency test
Linux-cpp-lisp Mar 24, 2023
327a250
allow registered extra metadata
Linux-cpp-lisp Mar 24, 2023
e30ce3e
remove stress warning
Linux-cpp-lisp Mar 24, 2023
0d8a567
fix type converstion for type_to_chemical_symbol
Linux-cpp-lisp Mar 27, 2023
c3ab697
consistency with minimal.yaml
Linux-cpp-lisp Mar 27, 2023
bc162cc
add NEQUIP_ERROR_ON_NO_EDGES
Linux-cpp-lisp Mar 27, 2023
18c37a2
doc
Linux-cpp-lisp Mar 28, 2023
f1e0b74
add freeze option
Linux-cpp-lisp Apr 6, 2023
671c369
EDGE_CUTOFF_KEY
Linux-cpp-lisp Apr 7, 2023
8310052
lint
Linux-cpp-lisp Apr 7, 2023
6967c1b
fix typo
Linux-cpp-lisp Apr 14, 2023
621f57c
typo
Linux-cpp-lisp Apr 16, 2023
a9c96a4
fix lr sched docs
Linux-cpp-lisp Apr 16, 2023
da3c2b7
add logging
Linux-cpp-lisp Apr 21, 2023
26e2645
better error message
Linux-cpp-lisp May 8, 2023
4673bc4
GMM Uncertainty Quantification (#310)
albertzhu01 May 12, 2023
f156438
allow_tf32 default true
Linux-cpp-lisp May 12, 2023
00f4da8
remove _params suffix
Linux-cpp-lisp May 12, 2023
47375ef
docs updates
Linux-cpp-lisp May 12, 2023
9e94e99
style docs
Linux-cpp-lisp May 12, 2023
5bddd74
favicon
Linux-cpp-lisp May 12, 2023
c1d7cd9
docs, again
Linux-cpp-lisp May 12, 2023
d1cddec
add more printed warnings
Linux-cpp-lisp May 25, 2023
15f036d
don't require sklearn for whole package
Linux-cpp-lisp May 31, 2023
dfbce31
warnings on version mismatch
Linux-cpp-lisp Jun 2, 2023
1473cc8
Added `edge_energy` to `ALL_ENERGY_KEYS` subjecting it to global rescale
Linux-cpp-lisp Jun 5, 2023
32bad0c
add simple LJ
Linux-cpp-lisp Jun 22, 2023
2f43aa8
put the right versions in deployed models
Linux-cpp-lisp Jun 22, 2023
0b02c41
No negative volumes in rare cases
Linux-cpp-lisp Jun 28, 2023
3f03c77
set PYTORCH_JIT_USE_NNC_NOT_NVFUSER by default
Linux-cpp-lisp Jul 29, 2023
4aabe9f
Add nequip-deploy build --checkpoint
Linux-cpp-lisp Oct 11, 2023
3fd2213
nequip-deploy --override
Linux-cpp-lisp Oct 11, 2023
2185c7a
more complete memory summary
Linux-cpp-lisp Jan 29, 2024
04d272d
warn on unsupported types in AtomicData
Linux-cpp-lisp Jan 29, 2024
bf54de8
fix type warning
Linux-cpp-lisp Jan 30, 2024
bffd533
add training blowup sanity threshold to example.yaml
Linux-cpp-lisp Jan 30, 2024
e96ccd4
Update `.readthedocs.yaml` (#418)
kavanase Apr 19, 2024
4bf8820
Fix docs dependencies (#420)
kavanase Apr 19, 2024
c310ad6
add matscipy neighborlist option (#1)
cw-tan Apr 30, 2024
9b5b17c
Fix dataset unit rescaling of per-species shifts (#2)
Linux-cpp-lisp Apr 30, 2024
7fcd45d
remove unused
Linux-cpp-lisp May 1, 2024
9ba1d5f
Add SimpleLossSchedule
Linux-cpp-lisp May 1, 2024
03a4b45
Merge branch 'main' into develop
Linux-cpp-lisp May 10, 2024
f2a40fe
Cleanup
Linux-cpp-lisp May 10, 2024
0cc2e31
Bump version CHANGELOG
Linux-cpp-lisp May 10, 2024
ef79965
Update PyTorch version for tests
Linux-cpp-lisp May 10, 2024

Files changed

2 changes: 1 addition & 1 deletion .github/workflows/lint.yaml
@@ -29,7 +29,7 @@ jobs:
python-version: '3.x'
- name: Install flake8
run: |
pip install flake8==4.0.1
pip install flake8==7.0.0
- name: run flake8
run: |
flake8 . --count --show-source --statistics
3 changes: 2 additions & 1 deletion .github/workflows/tests.yml
@@ -16,7 +16,7 @@ jobs:
strategy:
matrix:
python-version: [3.9]
torch-version: [1.11.0, 1.12.1]
torch-version: [1.13.1, "2.*"]

steps:
- uses: actions/checkout@v2
@@ -32,6 +32,7 @@ jobs:
python -m pip install --upgrade pip
pip install setuptools wheel
pip install torch==${TORCH} -f https://download.pytorch.org/whl/cpu/torch_stable.html
pip install h5py scikit-learn # install packages that aren't required dependencies but that the tests do need
pip install --upgrade-strategy only-if-needed .
- name: Install pytest
run: |
3 changes: 2 additions & 1 deletion .github/workflows/tests_develop.yml
@@ -16,7 +16,7 @@ jobs:
strategy:
matrix:
python-version: [3.9]
torch-version: [1.12.1]
torch-version: ["2.*"]

steps:
- uses: actions/checkout@v2
@@ -32,6 +32,7 @@ jobs:
python -m pip install --upgrade pip
pip install setuptools wheel
pip install torch==${TORCH} -f https://download.pytorch.org/whl/cpu/torch_stable.html
pip install h5py scikit-learn # install packages that aren't required dependencies but that the tests do need
pip install --upgrade-strategy only-if-needed .
- name: Install pytest
run: |
20 changes: 20 additions & 0 deletions .readthedocs.yaml
@@ -0,0 +1,20 @@
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file for details

# Required
version: 2

build:
os: ubuntu-22.04
tools:
python: "3.9"

# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/conf.py

# Optionally declare the Python requirements required to build your docs
python:
install:
- requirements: docs/requirements.txt
46 changes: 46 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,51 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

Most recent change on the bottom.

## Unreleased

## [0.6.0] - 2024-5-10
### Added
- add Tensorboard as logger option
- [Breaking] Refactor overall model logic into `GraphModel` top-level module
- [Breaking] Added `model_dtype`
- `BATCH_PTR_KEY` in `AtomicDataDict`
- `AtomicInMemoryDataset.rdf()` and `examples/rdf.py`
- `type_to_chemical_symbol`
- Pair potential terms
- `nequip-evaluate --output-fields-from-original-dataset`
- Error (or warn) on unused options in YAML that likely indicate typos
- `dataset_*_absmax` statistics option
- `HDF5Dataset` (#227)
- `include_file_as_baseline_config` for simple modifications of existing configs
- `nequip-deploy --using-dataset` to support data-dependent deployment steps
- Support for Gaussian Mixture Model uncertainty quantification (https://doi.org/10.1063/5.0136574)
- `start_of_epoch_callbacks`
- `nequip.train.callbacks.loss_schedule.SimpleLossSchedule` for changing the loss coefficients at specified epochs
- `nequip-deploy build --checkpoint` and `--override` to avoid many largely duplicated YAML files
- matscipy neighborlist support enabled with `NEQUIP_MATSCIPY_NL` environment variable

### Changed
- Always require explicit `seed`
- [Breaking] Set `dataset_seed` to `seed` if it is not explicitly provided
- Don't log as often by default
- [Breaking] Default nonlinearities are `silu` (`e`) and `tanh` (`o`)
- Will not reproduce previous versions' data shuffling order (for all practical purposes this does not matter, the `shuffle` option is unchanged)
- [Breaking] `default_dtype` defaults to `float64` (`model_dtype` default `float32`, `allow_tf32: true` by default--- see https://arxiv.org/abs/2304.10061)
- `nequip-benchmark` now only uses `--n-data` frames to build the model
- [Breaking] By default models now use `StressForceOutput`, not `ForceOutput`
- Added `edge_energy` to `ALL_ENERGY_KEYS` subjecting it to global rescale

### Fixed
- Work with `wandb>=0.13.8`
- Better error for standard deviation with too few data
- `load_model_state` GPU -> CPU
- No negative volumes in rare cases

### Removed
- [Breaking] `fixed_fields` machinery (`npz_fixed_field_keys` is still supported, but through a more straightforward implementation)
- Default run name/WandB project name of `NequIP`, they must now always be provided explicitly
- [Breaking] Removed `_params` as an allowable subconfiguration suffix (i.e. instead of `optimizer_params` now only `optimizer_kwargs` is valid, not both)
- [Breaking] Removed `per_species_rescale_arguments_in_dataset_units`

## [0.5.6] - 2022-12-19
### Added
@@ -14,6 +59,7 @@ Most recent change on the bottom.
- `nequip-benchmark --no-compile` and `--verbose` and `--memory-summary`
- `nequip-benchmark --pdb` for debugging model (builder) errors
- More information in `nequip-deploy info`
- GPU OOM offloading mode

### Changed
- Minimum e3nn is now 0.4.4
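For concreteness, the precision-related entries in the 0.6.0 changelog above map onto config keys like the following. This is a minimal sketch that simply restates values taken from the updated `configs/example.yaml` further down in this diff; it adds nothing beyond what the PR itself shows.

```yaml
# sketch of the 0.6.0 precision and seed settings (values mirror configs/example.yaml below)
seed: 123                # must now always be given explicitly
dataset_seed: 456        # if omitted, falls back to `seed`
default_dtype: float64   # data and statistics are handled in float64 ...
model_dtype: float32     # ... while the model itself runs in float32
allow_tf32: true         # new default; see https://arxiv.org/abs/2304.10061
```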
12 changes: 10 additions & 2 deletions README.md
@@ -13,11 +13,13 @@ NequIP is an open-source code for building E(3)-equivariant interatomic potentia
NequIP requires:

* Python >= 3.7
* PyTorch >= 1.8, !=1.9, <=1.11.*. PyTorch can be installed following the [instructions from their documentation](https://pytorch.org/get-started/locally/). Note that neither `torchvision` nor `torchaudio`, included in the default install command, are needed for NequIP.
* PyTorch == `1.11.*` or `1.13.*` or later (do **not** use `1.12`). (Some users have observed silent issues with PyTorch 2+, as reported in #311. Please report any similar issues you encounter.) PyTorch can be installed following the [instructions from their documentation](https://pytorch.org/get-started/locally/). Note that neither `torchvision` nor `torchaudio`, included in the default install command, are needed for NequIP.

**You must install PyTorch before installing NequIP, however it is not marked as a dependency of `nequip` to prevent `pip` from trying to overwrite your PyTorch installation.**

To install:

* We use [Weights&Biases](https://wandb.ai) to keep track of experiments. This is not a strict requirement — you can use our package without it — but it may make your life easier. If you want to use it, create an account [here](https://wandb.ai) and install the Python package:
* We use [Weights&Biases](https://wandb.ai) (or TensorBoard) to keep track of experiments. This is not a strict requirement — you can use our package without it — but it may make your life easier. If you want to use it, create an account [here](https://wandb.ai) and install the Python package:

```
pip install wandb
@@ -130,6 +132,12 @@ pair_coeff * * deployed.pth <NequIP type for LAMMPS type 1> <NequIP type for LAM

For installation instructions, please see the [`pair_nequip` repository](https://github.com/mir-group/pair_nequip).

## Plugins / extending `nequip`

`nequip` is a modular framework and extension packages can provide new model components, architectures, etc. The main extension package(s) currently available are:
- [Allegro](https://github.com/mir-group/allegro): implements the highly parallelizable Allegro model architecture.

Details on writing and using plugins can be found in the [Allegro tutorial](https://colab.research.google.com/drive/1yq2UwnET4loJYg_Fptt9kpklVaZvoHnq) and in [`nequip-example-extension`](https://github.com/mir-group/nequip-example-extension/).

## References & citing

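Since the README insists that PyTorch be installed before `nequip` (it is deliberately not declared as a pip dependency), a typical CPU-only setup looks roughly like this. The torch line is a sketch adapted from the CI workflows above; pick the wheel matching your platform/CUDA version per the PyTorch instructions, and `wandb` remains optional.

```bash
# sketch of the install order described in the README (CPU wheel shown, as in the CI workflows)
pip install "torch==2.*" -f https://download.pytorch.org/whl/cpu/torch_stable.html  # PyTorch first
pip install wandb    # optional experiment tracking
pip install nequip   # then NequIP itself (or `pip install .` from a source checkout)
```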
22 changes: 12 additions & 10 deletions configs/example.yaml
@@ -9,7 +9,11 @@ run_name: example-run-toluene
seed: 123 # model seed
dataset_seed: 456 # data set seed
append: true # set true if a restarted run should append to the previous log file
default_dtype: float32 # type of float to use, e.g. float32 and float64

# see https://arxiv.org/abs/2304.10061 for discussion of numerical precision
default_dtype: float64
model_dtype: float32
allow_tf32: true # consider setting to false if you plan to mix training/inference over any devices that are not NVIDIA Ampere or later

# network
r_max: 4.0 # cutoff radius in length units, here Angstrom, this is an important hyperparamter to scan
@@ -68,7 +72,7 @@ wandb: true
wandb_project: toluene-example # project name used in wandb

verbose: info # the same as python logging, e.g. warning, info, debug, error; case insensitive
log_batch_freq: 10 # batch frequency, how often to print training errors within the same epoch
log_batch_freq: 100 # batch frequency, how often to print training errors within the same epoch
log_epoch_freq: 1 # epoch frequency, how often to print
save_checkpoint_freq: -1 # frequency to save the intermediate checkpoint. no saving of intermediate checkpoints when the value is not positive.
save_ema_checkpoint_freq: -1 # frequency to save the intermediate ema checkpoint. no saving of intermediate checkpoints when the value is not positive.
@@ -95,6 +99,9 @@ early_stopping_patiences:
early_stopping_lower_bounds: # stop early if a metric value is lower than the bound
LR: 1.0e-5

early_stopping_upper_bounds: # stop early if the training appears to have exploded
validation_loss: 1.0e+4

# loss function
loss_coeffs:
forces: 1 # if using PerAtomMSELoss, a default weight of 1:1 on each should work well
@@ -141,17 +148,12 @@ lr_scheduler_factor: 0.5
# the default is to scale the atomic energy and forces by scaling them by the force standard deviation and to shift the energy by the mean atomic energy
# in certain cases, it can be useful to have a trainable shift/scale and to also have species-dependent shifts/scales for each atom

# whether the shifts and scales are trainable. Defaults to False. Optional
per_species_rescale_shifts_trainable: false
per_species_rescale_scales_trainable: false

# initial atomic energy shift for each species. default to the mean of per atom energy. Optional
# the value can be a constant float value, an array for each species, or a string that defines a statistics over the training dataset
# if numbers are explicitly provided, they must be in the same energy units as the training data
per_species_rescale_shifts: dataset_per_atom_total_energy_mean

# initial atomic energy scale for each species. Optional.
# the value can be a constant float value, an array for each species, or a string
per_species_rescale_scales: dataset_forces_rms

# if explicit numbers are given for the shifts/scales, this parameter must specify whether the given numbers are unitless shifts/scales or are in the units of the dataset. If ``True``, any global rescalings will correctly be applied to the per-species values.
# per_species_rescale_arguments_in_dataset_units: True
# if numbers are explicitly provided, they must be in the same energy units as the training data
per_species_rescale_scales: null
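Note that `per_species_rescale_scales` above now defaults to `null`; if a dataset-derived scale is still wanted, the new `dataset_*_absmax` statistics added in this release can be used in its place. A minimal sketch, using only option strings documented in `configs/full.yaml` below:

```yaml
# sketch: opting back into dataset-derived per-species scales via the new absmax statistic
per_species_rescale_shifts: dataset_per_atom_total_energy_mean
per_species_rescale_scales: dataset_forces_absmax   # new in 0.6.0; see the option list in configs/full.yaml
```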
51 changes: 35 additions & 16 deletions configs/full.yaml
@@ -13,9 +13,11 @@ run_name: example-run-toluene
seed: 123 # model seed
dataset_seed: 456 # data set seed
append: true # set true if a restarted run should append to the previous log file
default_dtype: float32 # type of float to use, e.g. float32 and float64
allow_tf32: false # whether to use TensorFloat32 if it is available
# device: cuda # which device to use. Default: automatically detected cuda or "cpu"

# see https://arxiv.org/abs/2304.10061 for discussion of numerical precision
default_dtype: float64
model_dtype: float32
allow_tf32: true # consider setting to false if you plan to mix training/inference over any devices that are not NVIDIA Ampere or later

# == network ==

@@ -161,14 +163,17 @@ wandb: true
wandb_project: toluene-example # project name used in wandb
wandb_watch: false

# # using tensorboard for logging
# tensorboard: true

# see https://docs.wandb.ai/ref/python/watch
# wandb_watch_kwargs:
# log: all
# log_freq: 1
# log_graph: true

verbose: info # the same as python logging, e.g. warning, info, debug, error. case insensitive
log_batch_freq: 1 # batch frequency, how often to print training errors withinin the same epoch
log_batch_freq: 100 # batch frequency, how often to print training errors within the same epoch
log_epoch_freq: 1 # epoch frequency, how often to print
save_checkpoint_freq: -1 # frequency to save the intermediate checkpoint. no saving of intermediate checkpoints when the value is not positive.
save_ema_checkpoint_freq: -1 # frequency to save the intermediate ema checkpoint. no saving of intermediate checkpoints when the value is not positive.
@@ -207,9 +212,9 @@ early_stopping_upper_bounds:

# loss function
loss_coeffs: # different weights to use in a weighted loss functions
forces: 1 # if using PerAtomMSELoss, a default weight of 1:1 on each should work well
forces: 1.0 # if using PerAtomMSELoss, a default weight of 1:1 on each should work well
total_energy:
- 1
- 1.0
- PerAtomMSELoss
# note that the ratio between force and energy loss matters for the training process. One may consider using 1:1 with the PerAtomMSELoss. If the energy loss still significantly dominates the loss function in the initial epochs, tuning the energy loss weight lower helps the training a lot.

@@ -244,6 +249,15 @@ loss_coeffs:
# - L1Loss
# forces: 1.0

# You can schedule changes in the loss coefficients using a callback:
# In the "schedule" key each entry is a two-element list of:
# - the 1-based epoch index at which to start the new loss coefficients
# - the new loss coefficients as a dict
#
# start_of_epoch_callbacks:
# - !!python/object:nequip.train.callbacks.loss_schedule.SimpleLossSchedule {"schedule": [[2, {"forces": 0.0, "total_energy": 1.0}]]}
#

# output metrics
metrics_components:
- - forces # key
@@ -282,8 +296,9 @@ optimizer_weight_decay: 0
# setting to inf or null disables it
max_gradient_norm: null

# lr scheduler, currently only supports the two options listed below, if you need more please file an issue
# lr scheduler
# first: on-plateau, reduce lr by a factor of lr_scheduler_factor if metrics_key hasn't improved for lr_scheduler_patience epochs
# you can also set other options of the underlying PyTorch scheduler, for example lr_scheduler_threshold
lr_scheduler_name: ReduceLROnPlateau
lr_scheduler_patience: 100
lr_scheduler_factor: 0.5
@@ -304,35 +319,42 @@ per_species_rescale_scales_trainable: false
# whether the scales are trainable. Defaults to False. Optional
per_species_rescale_shifts_trainable: false
# whether the shifts are trainable. Defaults to False. Optional

per_species_rescale_shifts: dataset_per_atom_total_energy_mean
# initial atomic energy shift for each species. default to the mean of per atom energy. Optional
# the value can be a constant float value, an array for each species, or a string
# if numbers are explicitly provided, they must be in the same energy units as the training data
# string option include:
# * "dataset_per_atom_total_energy_mean", which computes the per atom average
# * "dataset_per_species_total_energy_mean", which automatically compute the per atom energy mean using a GP model
per_species_rescale_scales: dataset_forces_rms

per_species_rescale_scales: null
# initial atomic energy scale for each species. Optional.
# the value can be a constant float value, an array for each species, or a string
# if numbers are explicitly provided, they must be in the same energy units as the training data
# string option include:
# * "dataset_forces_absmax", which computes the dataset maxmimum force component magnitude
# * "dataset_per_atom_total_energy_std", which computes the per atom energy std
# * "dataset_per_species_total_energy_std", which uses the GP model uncertainty
# * "dataset_per_species_forces_rms", which compute the force rms for each species
# If not provided, defaults to dataset_per_species_force_rms or dataset_per_atom_total_energy_std, depending on whether forces are being trained.
# If not provided, defaults to null.

# per_species_rescale_kwargs:
# total_energy:
# alpha: 0.001
# max_iteration: 20
# stride: 100
# keywords for ridge regression decomposition of per specie energy. Optional. Defaults to 0.001. The value should be in the range of 1e-3 to 1e-2
# per_species_rescale_arguments_in_dataset_units: True
# if explicit numbers are given for the shifts/scales, this parameter must specify whether the given numbers are unitless shifts/scales or are in the units of the dataset. If ``True``, any global rescalings will correctly be applied to the per-species values.
# keywords for ridge regression decomposition of per species energy. Optional. Defaults to 0.001. The value should be in the range of 1e-3 to 1e-2

# global energy shift and scale
# When "dataset_total_energy_mean", the mean energy of the dataset. When None, disables the global shift. When a number, used directly.
# Warning: if this value is not None, the model is no longer size extensive
global_rescale_shift: null

# global energy scale. When "dataset_force_rms", the RMS of force components in the dataset. When "dataset_total_energy_std", the stdev of energies in the dataset. When null, disables the global scale. When a number, used directly.
# global energy scale. When "dataset_force_rms", the RMS of force components in the dataset.
# When "dataset_forces_absmax", the maximum force component magnitude in the dataset.
# When "dataset_total_energy_std", the stdev of energies in the dataset.
# When null, disables the global scale. When a number, used directly.
# If not provided, defaults to either dataset_force_rms or dataset_total_energy_std, depending on whether forces are being trained.
global_rescale_scale: dataset_forces_rms

@@ -361,6 +383,3 @@ global_rescale_scale_trainable: false
# per_species_rescale_shifts: null
# per_species_rescale_scales: null

# Options for e3nn's set_optimization_defaults. A dict:
# e3nn_optimization_defaults:
# explicit_backward: True