Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Steven/feature/gp write choice #188

Merged
merged 8 commits into from
Jun 19, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 58 additions & 18 deletions flare/gp.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from numpy.random import random
from scipy.linalg import solve_triangular
from scipy.optimize import minimize
from typing import List, Callable, Union, Tuple
from typing import List, Callable, Union, Tuple, Sequence

from flare.env import AtomicEnvironment
from flare.gp_algebra import get_like_from_mats, get_neg_like_grad, \
Expand Down Expand Up @@ -59,7 +59,7 @@ class GaussianProcess:
name (str, optional): Name for the GP instance.
"""

def __init__(self, kernels: list = ['two', 'three'],
def __init__(self, kernels: List[str] = ['two', 'three'],
component: str = 'mc',
hyps: 'ndarray' = None, cutoffs={},
hyps_mask: dict = {},
Expand Down Expand Up @@ -152,6 +152,9 @@ def __init__(self, kernels: list = ['two', 'three'],
self.likelihood_gradient = None
self.bounds = None

# File used for reading / writing model if model is large
self.ky_mat_file = None

self.check_instantiation()

def check_instantiation(self):
Expand Down Expand Up @@ -670,20 +673,24 @@ def from_dict(dictionary):
new_gp.n_envs_prev = len(new_gp.training_data)

# Save time by attempting to load in computed attributes
if len(new_gp.training_data) > 5000:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for removing this arbitrary criterion.

if dictionary.get('ky_mat_file'):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe next time this part can be written as a function? It is repeated in the from_dict method.

try:
new_gp.ky_mat = np.load(dictionary['ky_mat_file'])
new_gp.compute_matrices()
new_gp.ky_mat_file = None

except FileNotFoundError:
new_gp.ky_mat = None
new_gp.l_mat = None
new_gp.alpha = None
new_gp.ky_mat_inv = None
filename = dictionary['ky_mat_file']
logger = logging.getLogger(self.logger_name)
filename = dictionary.get('ky_mat_file')
logger = logging.getLogger(new_gp.logger_name)
logger.warning("the covariance matrices are not loaded"
f"because {filename} cannot be found")
else:
new_gp.ky_mat = np.array(dictionary['ky_mat']) \
if dictionary.get('ky_mat') is not None else None
new_gp.ky_mat_inv = np.array(dictionary['ky_mat_inv']) \
if dictionary.get('ky_mat_inv') is not None else None
new_gp.ky_mat = np.array(dictionary['ky_mat']) \
Expand All @@ -702,14 +709,21 @@ def compute_matrices(self):
:return:
"""
ky_mat = self.ky_mat
l_mat = np.linalg.cholesky(ky_mat)
l_mat_inv = np.linalg.inv(l_mat)
ky_mat_inv = l_mat_inv.T @ l_mat_inv
alpha = np.matmul(ky_mat_inv, self.all_labels)

self.l_mat = l_mat
self.alpha = alpha
self.ky_mat_inv = ky_mat_inv
if ky_mat is None or \
(isinstance(ky_mat, np.ndarray) and not np.any(
ky_mat)):
Warning("Warning: Covariance matrix was not loaded but "
"compute_matrices was called. Computing covariance "
"matrix and proceeding...")
self.set_L_alpha()

else:
self.l_mat = np.linalg.cholesky(ky_mat)
self.l_mat_inv = np.linalg.inv(self.l_mat)
self.ky_mat_inv = self.l_mat_inv.T @ self.l_mat_inv
self.alpha = np.matmul(self.ky_mat_inv, self.all_labels)


def adjust_cutoffs(self, new_cutoffs: Union[list, tuple, 'np.ndarray'],
reset_L_alpha=True, train=True, new_hyps_mask=None):
Expand Down Expand Up @@ -783,6 +797,9 @@ def remove_force_data(self, indexes: Union[int, List[int]],
if max(indexes) > len(self.training_data):
raise ValueError("Index out of range of data")

if len(indexes) == 0:
return [], []

# Get in reverse order so that modifying higher indexes doesn't affect
# lower indexes
indexes.sort(reverse=True)
Expand All @@ -807,15 +824,24 @@ def remove_force_data(self, indexes: Union[int, List[int]],

return removed_data, removed_labels

def write_model(self, name: str, format: str = 'json'):
def write_model(self, name: str, format: str = None,
split_matrix_size_cutoff: int = 5000):
"""
Write model in a variety of formats to a file for later re-use.
JSON files are open to visual inspection and are easier to use
across different versions of FLARE or GP implementations. However,
they are larger and loading them in takes longer (by setting up a
new GP from the specifications). Pickled files can be faster to
read & write, and they take up less memory.

Args:
name (str): Output name.
format (str): Output format.
split_matrix_size_cutoff (int): If there are more than this
number of training points in the set, save the matrices seperately.
"""

if len(self.training_data) > 5000:
if len(self.training_data) > split_matrix_size_cutoff:
np.save(f"{name}_ky_mat.npy", self.ky_mat)
self.ky_mat_file = f"{name}_ky_mat.npy"

Expand All @@ -829,21 +855,35 @@ def write_model(self, name: str, format: str = 'json'):
self.alpha = None
self.ky_mat_inv = None

# Automatically detect output format from name variable

for detect in ['json','pickle','binary']:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is very thoughtful.

if detect in name.lower():
format = detect
break

if format is None:
format = 'json'

supported_formats = ['json', 'pickle', 'binary']

if format.lower() == 'json':
with open(f'{name}.json', 'w') as f:
if '.json' != name[-5:]:
name += '.json'
with open(name, 'w') as f:
json.dump(self.as_dict(), f, cls=NumpyEncoder)

elif format.lower() == 'pickle' or format.lower() == 'binary':
with open(f'{name}.pickle', 'wb') as f:
if '.pickle' != name[-7:]:
name += '.pickle'
with open(name, 'wb') as f:
pickle.dump(self, f)

else:
raise ValueError("Output format not supported: try from "
"{}".format(supported_formats))

if len(self.training_data) > 5000:
if len(self.training_data) > split_matrix_size_cutoff:
self.ky_mat = temp_ky_mat
self.l_mat = temp_l_mat
self.alpha = temp_alpha
Expand Down Expand Up @@ -875,7 +915,7 @@ def from_file(filename: str, format: str = ''):

GaussianProcess.backward_attributes(gp_model.__dict__)

if len(gp_model.training_data) > 5000:
if hasattr(gp_model, 'ky_mat_file') and gp_model.ky_mat_file:
try:
gp_model.ky_mat = np.load(gp_model.ky_mat_file,
allow_pickle=True)
Expand Down
29 changes: 20 additions & 9 deletions tests/test_gp.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,6 @@ class TestIO():
def test_representation_method(self, all_gps, multihyps):
test_gp = all_gps[multihyps]
the_str = str(test_gp)
print(the_str)
assert 'GaussianProcess Object' in the_str
assert 'Kernel: [\'twobody\', \'threebody\', \'manybody\']' in the_str
assert 'Cutoffs: {\'twobody\': 0.8, \'threebody\': 0.8, \'manybody\': 0.8}' in the_str
Expand Down Expand Up @@ -302,6 +301,15 @@ def test_load_and_reload(self, all_gps, validation_env, multihyps):
with raises(ValueError):
test_gp.write_model('test_gp_write', 'cucumber')

# Test logic for auto-detecting format in write command
for format in ['json', 'pickle']:
write_string = 'format_write_test.'+format
if os.path.exists(write_string):
os.remove(write_string)

test_gp.write_model(write_string)
assert os.path.exists(write_string)
os.remove(write_string)

def test_load_reload_huge(self, all_gps):
"""
Expand All @@ -312,19 +320,22 @@ def test_load_reload_huge(self, all_gps):
test_gp = deepcopy(all_gps[False])
test_gp.set_L_alpha()
dummy_gp = deepcopy(test_gp)
dummy_gp.training_data = [1]*5001

N_data = len(dummy_gp.training_data)
prev_ky_mat = deepcopy(dummy_gp.ky_mat)
prev_l_mat = deepcopy(dummy_gp.l_mat)

dummy_gp.training_data = [1]*5001
test_gp.write_model('test_gp_write', 'json')
new_gp = GaussianProcess.from_file('test_gp_write.json')
assert np.array_equal(prev_ky_mat, new_gp.ky_mat)
assert np.array_equal(prev_l_mat, new_gp.l_mat)
assert new_gp.training_data is not test_gp.training_data
for model_format in ['pickle','json']:
dummy_gp.write_model('test_gp_write', model_format, N_data-1)
new_gp = GaussianProcess.from_file(f'test_gp_write.{model_format}')
assert np.allclose(prev_ky_mat, new_gp.ky_mat)
assert np.allclose(prev_l_mat, new_gp.l_mat)
assert new_gp.training_data is not test_gp.training_data

os.remove('test_gp_write.json')
os.remove(f'test_gp_write.{model_format}')
dummy_gp = deepcopy(test_gp)

os.remove(f'test_gp_write_ky_mat.npy')


def dumpcompare(obj1, obj2):
Expand Down