mir-group · stevetorr · Jun 19, 2020 · Jun 18, 2020 · Jun 18, 2020 · Jun 18, 2020
diff --git a/flare/gp.py b/flare/gp.py
@@ -13,7 +13,7 @@
 from numpy.random import random
 from scipy.linalg import solve_triangular
 from scipy.optimize import minimize
-from typing import List, Callable, Union, Tuple
+from typing import List, Callable, Union, Tuple, Sequence
 
 from flare.env import AtomicEnvironment
 from flare.gp_algebra import get_like_from_mats, get_neg_like_grad, \
@@ -59,7 +59,7 @@ class GaussianProcess:
         name (str, optional): Name for the GP instance.
     """
 
-    def __init__(self, kernels: list = ['two', 'three'],
+    def __init__(self, kernels: List[str] = ['two', 'three'],
                  component: str = 'mc',
                  hyps: 'ndarray' = None, cutoffs={},
                  hyps_mask: dict = {},
@@ -152,6 +152,9 @@ def __init__(self, kernels: list = ['two', 'three'],
         self.likelihood_gradient = None
         self.bounds = None
 
+        # File used for reading / writing model if model is large
+        self.ky_mat_file = None
+
         self.check_instantiation()
 
     def check_instantiation(self):
@@ -670,20 +673,24 @@ def from_dict(dictionary):
         new_gp.n_envs_prev = len(new_gp.training_data)
 
         # Save time by attempting to load in computed attributes
-        if len(new_gp.training_data) > 5000:
+        if dictionary.get('ky_mat_file'):
             try:
                 new_gp.ky_mat = np.load(dictionary['ky_mat_file'])
                 new_gp.compute_matrices()
+                new_gp.ky_mat_file = None
+
             except FileNotFoundError:
                 new_gp.ky_mat = None
                 new_gp.l_mat = None
                 new_gp.alpha = None
                 new_gp.ky_mat_inv = None
-                filename = dictionary['ky_mat_file']
-                logger = logging.getLogger(self.logger_name)
+                filename = dictionary.get('ky_mat_file')
+                logger = logging.getLogger(new_gp.logger_name)
                 logger.warning("the covariance matrices are not loaded"
                                f"because {filename} cannot be found")
         else:
+            new_gp.ky_mat = np.array(dictionary['ky_mat']) \
+                if dictionary.get('ky_mat') is not None else None
             new_gp.ky_mat_inv = np.array(dictionary['ky_mat_inv']) \
                 if dictionary.get('ky_mat_inv') is not None else None
             new_gp.ky_mat = np.array(dictionary['ky_mat']) \
@@ -702,14 +709,21 @@ def compute_matrices(self):
         :return:
         """
         ky_mat = self.ky_mat
-        l_mat = np.linalg.cholesky(ky_mat)
-        l_mat_inv = np.linalg.inv(l_mat)
-        ky_mat_inv = l_mat_inv.T @ l_mat_inv
-        alpha = np.matmul(ky_mat_inv, self.all_labels)
 
-        self.l_mat = l_mat
-        self.alpha = alpha
-        self.ky_mat_inv = ky_mat_inv
+        if ky_mat is None or \
+                (isinstance(ky_mat, np.ndarray) and not np.any(
+                ky_mat)):
+            Warning("Warning: Covariance matrix was not loaded but "
+                    "compute_matrices was called. Computing covariance "
+                    "matrix and proceeding...")
+            self.set_L_alpha()
+
+        else:
+            self.l_mat = np.linalg.cholesky(ky_mat)
+            self.l_mat_inv = np.linalg.inv(self.l_mat)
+            self.ky_mat_inv = self.l_mat_inv.T @ self.l_mat_inv
+            self.alpha = np.matmul(self.ky_mat_inv, self.all_labels)
+
 
     def adjust_cutoffs(self, new_cutoffs: Union[list, tuple, 'np.ndarray'],
                        reset_L_alpha=True, train=True, new_hyps_mask=None):
@@ -783,6 +797,9 @@ def remove_force_data(self, indexes: Union[int, List[int]],
         if max(indexes) > len(self.training_data):
             raise ValueError("Index out of range of data")
 
+        if len(indexes) == 0:
+            return [], []
+
         # Get in reverse order so that modifying higher indexes doesn't affect
         # lower indexes
         indexes.sort(reverse=True)
@@ -807,15 +824,24 @@ def remove_force_data(self, indexes: Union[int, List[int]],
 
         return removed_data, removed_labels
 
-    def write_model(self, name: str, format: str = 'json'):
+    def write_model(self, name: str, format: str = None,
+                    split_matrix_size_cutoff: int = 5000):
         """
         Write model in a variety of formats to a file for later re-use.
+        JSON files are open to visual inspection and are easier to use 
+        across different versions of FLARE or GP implementations. However,
+        they are larger and loading them in takes longer (by setting up a
+        new GP from the specifications). Pickled files can be faster to
+        read & write, and they take up less memory.
+
         Args:
             name (str): Output name.
             format (str): Output format.
+            split_matrix_size_cutoff (int): If the training set has more
+                points than this number, save the matrices separately.
         """
 
-        if len(self.training_data) > 5000:
+        if len(self.training_data) > split_matrix_size_cutoff:
             np.save(f"{name}_ky_mat.npy", self.ky_mat)
             self.ky_mat_file = f"{name}_ky_mat.npy"
 
@@ -829,21 +855,35 @@ def write_model(self, name: str, format: str = 'json'):
             self.alpha = None
             self.ky_mat_inv = None
 
+        # Automatically detect output format from name variable
+
+        for detect in ['json','pickle','binary']:
+            if detect in name.lower():
+                format = detect
+                break
+
+        if format is None:
+            format = 'json'
+
         supported_formats = ['json', 'pickle', 'binary']
 
         if format.lower() == 'json':
-            with open(f'{name}.json', 'w') as f:
+            if '.json' != name[-5:]:
+                name += '.json'
+            with open(name, 'w') as f:
                 json.dump(self.as_dict(), f, cls=NumpyEncoder)
 
         elif format.lower() == 'pickle' or format.lower() == 'binary':
-            with open(f'{name}.pickle', 'wb') as f:
+            if '.pickle' != name[-7:]:
+                name += '.pickle'
+            with open(name, 'wb') as f:
                 pickle.dump(self, f)
 
         else:
             raise ValueError("Output format not supported: try from "
                              "{}".format(supported_formats))
 
-        if len(self.training_data) > 5000:
+        if len(self.training_data) > split_matrix_size_cutoff:
             self.ky_mat = temp_ky_mat
             self.l_mat = temp_l_mat
             self.alpha = temp_alpha
@@ -875,7 +915,7 @@ def from_file(filename: str, format: str = ''):
 
                 GaussianProcess.backward_attributes(gp_model.__dict__)
 
-                if len(gp_model.training_data) > 5000:
+                if hasattr(gp_model, 'ky_mat_file') and gp_model.ky_mat_file:
                     try:
                         gp_model.ky_mat = np.load(gp_model.ky_mat_file,
                                                   allow_pickle=True)

diff --git a/tests/test_gp.py b/tests/test_gp.py
@@ -240,7 +240,6 @@ class TestIO():
     def test_representation_method(self, all_gps, multihyps):
         test_gp = all_gps[multihyps]
         the_str = str(test_gp)
-        print(the_str)
         assert 'GaussianProcess Object' in the_str
         assert 'Kernel: [\'twobody\', \'threebody\', \'manybody\']' in the_str
         assert 'Cutoffs: {\'twobody\': 0.8, \'threebody\': 0.8, \'manybody\': 0.8}' in the_str
@@ -302,6 +301,15 @@ def test_load_and_reload(self, all_gps, validation_env, multihyps):
         with raises(ValueError):
             test_gp.write_model('test_gp_write', 'cucumber')
 
+        # Test logic for auto-detecting format in write command
+        for format in ['json', 'pickle']:
+            write_string = 'format_write_test.'+format
+            if os.path.exists(write_string):
+                os.remove(write_string)
+
+            test_gp.write_model(write_string)
+            assert os.path.exists(write_string)
+            os.remove(write_string)
 
     def test_load_reload_huge(self, all_gps):
         """
@@ -312,19 +320,22 @@ def test_load_reload_huge(self, all_gps):
         test_gp = deepcopy(all_gps[False])
         test_gp.set_L_alpha()
         dummy_gp = deepcopy(test_gp)
-        dummy_gp.training_data = [1]*5001
 
+        N_data = len(dummy_gp.training_data)
         prev_ky_mat = deepcopy(dummy_gp.ky_mat)
         prev_l_mat = deepcopy(dummy_gp.l_mat)
 
-        dummy_gp.training_data = [1]*5001
-        test_gp.write_model('test_gp_write', 'json')
-        new_gp = GaussianProcess.from_file('test_gp_write.json')
-        assert np.array_equal(prev_ky_mat, new_gp.ky_mat)
-        assert np.array_equal(prev_l_mat, new_gp.l_mat)
-        assert new_gp.training_data is not test_gp.training_data
+        for model_format in ['pickle','json']:
+            dummy_gp.write_model('test_gp_write', model_format, N_data-1)
+            new_gp = GaussianProcess.from_file(f'test_gp_write.{model_format}')
+            assert np.allclose(prev_ky_mat, new_gp.ky_mat)
+            assert np.allclose(prev_l_mat, new_gp.l_mat)
+            assert new_gp.training_data is not test_gp.training_data
 
-        os.remove('test_gp_write.json')
+            os.remove(f'test_gp_write.{model_format}')
+            dummy_gp = deepcopy(test_gp)
+
+        os.remove(f'test_gp_write_ky_mat.npy')
 
 
 def dumpcompare(obj1, obj2):