Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CI for openPMD #449

Merged
merged 13 commits into from
Jun 6, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/cpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,8 @@ jobs:
with:
repository: mala-project/test-data
path: mala_data
ref: v1.6.0
lfs: false
ref: v1.7.0
lfs: true

- name: Test mala
shell: 'bash -c "docker exec -i mala-cpu bash < {0}"'
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ RUN conda env create -f mala_${DEVICE}_environment.yml && rm -rf /opt/conda/pkgs
RUN /opt/conda/envs/mala-${DEVICE}/bin/pip install --no-input --no-cache-dir \
pytest \
oapackage==2.6.8 \
openpmd-api==0.14.5 \
openpmd-api==0.15.1 \
pqkmeans

RUN echo "source activate mala-${DEVICE}" > ~/.bashrc
Expand Down
10 changes: 9 additions & 1 deletion docs/source/usage/preprocessing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,15 @@ MALA can be used to process raw data into ready-to-use data fro the surrogate mo
For this, the ``DataConverter`` class can be used; see example ``ex02_preprocess_data``.
If you are not sure which descriptor hyperparameters to use (e.g.: "Which cutoff
radius do I need?") MALA provides a fast analysis that does not involve
model tuning. See ``ex13_acsd``
model tuning. See ``ex13_acsd``.

By default, MALA saves its data as numpy ``.npy`` files. However, for
storing large amounts of volumetric data (plus metadata), libraries such as
`OpenPMD <https://github.com/openPMD/openPMD-api>`_ are more suitable.
MALA provides a full OpenPMD interface that is currently being tested in
production. We recommend using the OpenPMD interface, which will become the
new default in upcoming versions. The examples on data processing and general
workflow usage include commented-out lines that showcase how to use OpenPMD
within MALA.

Using input and output data
###########################
Expand Down
8 changes: 8 additions & 0 deletions examples/ex01_train_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,14 @@
"Be_snapshot0.out.npy", data_path, "tr")
data_handler.add_snapshot("Be_snapshot1.in.npy", data_path,
"Be_snapshot1.out.npy", data_path, "va")
# New feature: You can switch the lines above for these to use the new,
# more powerful OpenPMD interface for MALA!
# data_handler.add_snapshot("Be_snapshot0.in.h5", data_path,
# "Be_snapshot0.out.h5", data_path, "tr",
# snapshot_type="openpmd")
# data_handler.add_snapshot("Be_snapshot1.in.h5", data_path,
# "Be_snapshot1.out.h5", data_path, "va",
# snapshot_type="openpmd")
data_handler.prepare_data()
printout("Read data: DONE.")

Expand Down
13 changes: 13 additions & 0 deletions examples/ex02_test_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,19 @@
"Be_snapshot3.out.npy", data_path, "te",
calculation_output_file=
os.path.join(data_path, "Be_snapshot3.out"))
# New feature: You can switch the lines above for these to use the new,
# more powerful OpenPMD interface for MALA!
# data_handler.add_snapshot("Be_snapshot2.in.h5", data_path,
# "Be_snapshot2.out.h5", data_path, "te",
# calculation_output_file=
# os.path.join(data_path, "Be_snapshot2.out"),
# snapshot_type="openpmd")
# data_handler.add_snapshot("Be_snapshot3.in.h5", data_path,
# "Be_snapshot3.out.h5", data_path, "te",
# calculation_output_file=
# os.path.join(data_path, "Be_snapshot3.out"),
# snapshot_type="openpmd")

data_handler.prepare_data(reparametrize_scaler=False)


Expand Down
6 changes: 6 additions & 0 deletions examples/ex03_preprocess_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,12 @@
target_save_path="./",
additional_info_save_path="./",
naming_scheme="Be_snapshot*.npy")
# New feature: You can switch the lines above for these to use the new,
# more powerful OpenPMD interface for MALA!
# data_converter.convert_snapshots(descriptor_save_path="./",
# target_save_path="./",
# additional_info_save_path="./",
# naming_scheme="Be_snapshot*.h5")

# If parts of the data have already been processed, the DataConverter class can
# also be used to convert the rest.
Expand Down
5 changes: 5 additions & 0 deletions examples/ex04_postprocess_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@
ldos = mala.LDOS.from_numpy_file(test_parameters,
os.path.join(data_path,
"Be_snapshot0.out.npy"))
# New feature: You can switch the lines above for these to use the new,
# more powerful OpenPMD interface for MALA!
# ldos = mala.LDOS.from_openpmd_file(test_parameters,
# os.path.join(data_path,
# "Be_snapshot0.out.h5"))

# Read additional information about the calculation.
# By doing this, the calculator is able to know e.g. the temperature
Expand Down
12 changes: 12 additions & 0 deletions examples/ex18_shuffle_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,23 @@
"Be_snapshot0.out.npy", data_path)
data_shuffler.add_snapshot("Be_snapshot1.in.npy", data_path,
"Be_snapshot1.out.npy", data_path)
# New feature: You can switch the lines above for these to use the new,
# more powerful OpenPMD interface for MALA!
# data_shuffler.add_snapshot("Be_snapshot0.in.h5", data_path,
# "Be_snapshot0.out.h5", data_path,
# snapshot_type="openpmd")
# data_shuffler.add_snapshot("Be_snapshot1.in.h5", data_path,
# "Be_snapshot1.out.h5", data_path,
# snapshot_type="openpmd")

# After shuffling, these snapshots can be loaded as regular snapshots for
# lazily loaded training. Both OpenPMD and numpy can be used as the save
# format for the data.
data_shuffler.shuffle_snapshots(complete_save_path="./",
save_name="Be_shuffled*")
# New feature: You can switch the lines above for these to use the new,
# more powerful OpenPMD interface for MALA!
# data_shuffler.shuffle_snapshots(complete_save_path="./",
# save_name="Be_shuffled*.h5")


17 changes: 12 additions & 5 deletions mala/datahandling/data_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,11 +502,18 @@ def __convert_single_snapshot(self, snapshot_number,
self.descriptor_calculator.\
write_to_numpy_file(input_path, tmp_input)
else:
tmp_input, local_offset, local_reach = \
self.descriptor_calculator.convert_local_to_3d(tmp_input)
self.descriptor_calculator.\
write_to_openpmd_iteration(input_iteration,
tmp_input, local_offset=local_offset, local_reach=local_reach)
if self.parameters._configuration["mpi"]:
tmp_input, local_offset, local_reach = \
self.descriptor_calculator.convert_local_to_3d(tmp_input)
self.descriptor_calculator. \
write_to_openpmd_iteration(input_iteration,
tmp_input,
local_offset=local_offset,
local_reach=local_reach)
else:
self.descriptor_calculator. \
write_to_openpmd_iteration(input_iteration,
tmp_input)
del tmp_input

###########
Expand Down
10 changes: 5 additions & 5 deletions mala/datahandling/data_shuffler.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,10 @@ def __shuffle_numpy(self, number_of_new_snapshots, shuffle_dimensions,
new_targets)
else:
# We check above that in the non-numpy case, OpenPMD will work.
self.descriptor_calculator.grid_dimensions = shuffle_dimensions
self.target_calculator.grid_dimensions = shuffle_dimensions
self.descriptor_calculator.grid_dimensions = \
list(shuffle_dimensions)
self.target_calculator.grid_dimensions = \
list(shuffle_dimensions)
self.descriptor_calculator.\
write_to_openpmd_file(descriptor_name+".in."+file_ending,
new_descriptors,
Expand Down Expand Up @@ -174,7 +176,6 @@ def __init__(self, save_path, npy_directory, npy_file, calculator,
self.name_infix = name_infix
self.dimension = dimension


def __shuffle_openpmd(self, dot: __DescriptorOrTarget,
number_of_new_snapshots, shuffle_dimensions,
save_name, permutations, file_ending):
Expand Down Expand Up @@ -227,7 +228,7 @@ def from_chunk_i(i, n, dset, slice_dimension=0):
# snapshots would be opened one after another in parallel
for i in range(0, number_of_new_snapshots):
# We check above that in the non-numpy case, OpenPMD will work.
dot.calculator.grid_dimensions = shuffle_dimensions
dot.calculator.grid_dimensions = list(shuffle_dimensions)
name_prefix = os.path.join(dot.save_path,
save_name.replace("*", str(i)))
shuffled_snapshot_series = dot.calculator.\
Expand Down Expand Up @@ -277,7 +278,6 @@ def from_chunk_i(i, n, dset, slice_dimension=0):
shuffle_dimensions)
shuffled_snapshot_series.close()


def shuffle_snapshots(self,
complete_save_path=None,
descriptor_save_path=None,
Expand Down
2 changes: 1 addition & 1 deletion mala/targets/density.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,7 @@ def read_from_cube(self, path, units="1/A^3", **kwargs):
data, meta = read_cube(path)
data *= self.convert_units(1, in_units=units)
self.density = data
self.grid_dimensions = np.shape(data)[0:3]
self.grid_dimensions = list(np.shape(data)[0:3])
return data

def read_from_xsf(self, path, units="1/A^3", **kwargs):
Expand Down
2 changes: 1 addition & 1 deletion mala/targets/ldos.py
Original file line number Diff line number Diff line change
Expand Up @@ -1475,7 +1475,7 @@ def _read_from_qe_files(self, path_scheme, units,
# Convert and then append the LDOS data.
data = data*self.convert_units(1, in_units=units)
ldos_data[:, :, :, i-start_index] = data[:, :, :]
self.grid_dimensions = np.shape(ldos_data)[0:3]
self.grid_dimensions = list(np.shape(ldos_data)[0:3])

# We have to gather the LDOS either file based or not.
if self.parameters._configuration["mpi"]:
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ optuna
scipy
pandas
tensorboard
openpmd-api>=0.15
2 changes: 1 addition & 1 deletion test/clean.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

# Remove artifact files that some example scripts write.

rm -rv *.pth *.pkl ex09.db *.pw* __pycache__ *.cube ex10_vis *.tmp *.npy *.json
rm -rv *.pth *.pkl ex09.db *.pw* __pycache__ *.cube ex10_vis *.tmp *.npy *.json *.h5 *.bp
160 changes: 160 additions & 0 deletions test/shuffling_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def test_seed(self):
# for lazily loaded training.
data_shuffler.shuffle_snapshots("./", save_name="Be_shuffled*")

test_parameters = mala.Parameters()
test_parameters.data.shuffling_seed = 1234
data_shuffler = mala.DataShuffler(test_parameters)

Expand All @@ -48,6 +49,64 @@ def test_seed(self):
new = np.load("Be_REshuffled1.out.npy")
assert np.isclose(np.sum(np.abs(old-new)), 0.0, atol=accuracy)

def test_seed_openpmd(self):
"""Test that the shuffling is handled correctly internally."""
RandomDefaultUser marked this conversation as resolved.
Show resolved Hide resolved
test_parameters = mala.Parameters()
test_parameters.openpmd_configuration = {
'adios2': {
'engine': {
'parameters': {
'StatsLevel': 1
}
}
}
}
RandomDefaultUser marked this conversation as resolved.
Show resolved Hide resolved
test_parameters.data.shuffling_seed = 1234
data_shuffler = mala.DataShuffler(test_parameters)

# Add the snapshots we want to use to the list.
data_shuffler.add_snapshot("Be_snapshot0.in.h5", data_path,
"Be_snapshot0.out.h5", data_path,
snapshot_type="openpmd")
data_shuffler.add_snapshot("Be_snapshot1.in.h5", data_path,
"Be_snapshot1.out.h5", data_path,
snapshot_type="openpmd")

# After shuffling, these snapshots can be loaded as regular snapshots
# for lazily loaded training.
data_shuffler.shuffle_snapshots("./", save_name="Be_shuffled*.h5")

test_parameters = mala.Parameters()
test_parameters.openpmd_configuration = {
'adios2': {
'engine': {
'parameters': {
'StatsLevel': 1
}
}
}
}
test_parameters.data.shuffling_seed = 1234
data_shuffler = mala.DataShuffler(test_parameters)

# Add the snapshots we want to use to the list.
data_shuffler.add_snapshot("Be_snapshot0.in.npy", data_path,
"Be_snapshot0.out.npy", data_path,
snapshot_type="numpy")
data_shuffler.add_snapshot("Be_snapshot1.in.npy", data_path,
"Be_snapshot1.out.npy", data_path,
snapshot_type="numpy")

# After shuffling, these snapshots can be loaded as regular snapshots
# for lazily loaded training.
data_shuffler.shuffle_snapshots("./", save_name="Be_REshuffled*.h5")

old = data_shuffler.target_calculator.\
read_from_openpmd_file("Be_shuffled1.out.h5")
new = data_shuffler.target_calculator.\
read_from_openpmd_file("Be_REshuffled1.out.h5")
assert np.isclose(np.sum(np.abs(old-new)), 0.0, atol=accuracy)

def test_training(self):
test_parameters = mala.Parameters()
test_parameters.data.data_splitting_type = "by_snapshot"
Expand Down Expand Up @@ -79,6 +138,7 @@ def test_training(self):
old_loss = test_trainer.final_validation_loss

# Shuffle.
test_parameters = mala.Parameters()
test_parameters.data.shuffling_seed = 1234
data_shuffler = mala.DataShuffler(test_parameters)

Expand Down Expand Up @@ -108,3 +168,103 @@ def test_training(self):
test_trainer.train_network()
new_loss = test_trainer.final_validation_loss
assert old_loss > new_loss

def test_training_openpmd(self):
test_parameters = mala.Parameters()
test_parameters.openpmd_configuration = {
'adios2': {
'engine': {
'parameters': {
'StatsLevel': 1
}
}
}
}
test_parameters.data.data_splitting_type = "by_snapshot"
test_parameters.data.input_rescaling_type = "feature-wise-standard"
test_parameters.data.output_rescaling_type = "normal"
test_parameters.network.layer_activations = ["ReLU"]
test_parameters.running.max_number_epochs = 50
test_parameters.running.mini_batch_size = 40
test_parameters.running.learning_rate = 0.00001
test_parameters.running.trainingtype = "Adam"
test_parameters.verbosity = 1
test_parameters.data.use_lazy_loading = True

# Train without shuffling.
data_handler = mala.DataHandler(test_parameters)
data_handler.add_snapshot("Be_snapshot0.in.h5", data_path,
"Be_snapshot0.out.h5", data_path, "tr",
snapshot_type="openpmd")
data_handler.add_snapshot("Be_snapshot1.in.h5", data_path,
"Be_snapshot1.out.h5", data_path, "va",
snapshot_type="openpmd")
data_handler.prepare_data()

test_parameters.network.layer_sizes = [data_handler.input_dimension,
100,
data_handler.output_dimension]
test_network = mala.Network(test_parameters)
test_trainer = mala.Trainer(test_parameters, test_network,
data_handler)
test_trainer.train_network()
old_loss = test_trainer.final_validation_loss

# Shuffle.
test_parameters = mala.Parameters()
test_parameters.openpmd_configuration = {
'adios2': {
'engine': {
'parameters': {
'StatsLevel': 1
}
}
}
}
test_parameters.data.shuffling_seed = 1234
test_parameters.data.data_splitting_type = "by_snapshot"
test_parameters.data.input_rescaling_type = "feature-wise-standard"
test_parameters.data.output_rescaling_type = "normal"
test_parameters.network.layer_activations = ["ReLU"]
test_parameters.running.max_number_epochs = 50
test_parameters.running.mini_batch_size = 40
test_parameters.running.learning_rate = 0.00001
test_parameters.running.trainingtype = "Adam"
test_parameters.verbosity = 1
test_parameters.data.use_lazy_loading = True

data_shuffler = mala.DataShuffler(test_parameters)

# Add the snapshots we want to use to the list.
data_shuffler.add_snapshot("Be_snapshot0.in.h5", data_path,
"Be_snapshot0.out.h5", data_path,
snapshot_type="openpmd")
data_shuffler.add_snapshot("Be_snapshot1.in.h5", data_path,
"Be_snapshot1.out.h5", data_path,
snapshot_type="openpmd")

# After shuffling, these snapshots can be loaded as regular snapshots
# for lazily loaded training.
data_shuffler.shuffle_snapshots("./", save_name="Be_shuffled*.h5")
test_parameters.descriptors.descriptors_contain_xyz = True

# Train with shuffling.
data_handler = mala.DataHandler(test_parameters)
# Add the shuffled snapshots we want to use to the list.
data_handler.add_snapshot("Be_shuffled0.in.h5", ".",
"Be_shuffled0.out.h5", ".", "tr",
snapshot_type="openpmd")
data_handler.add_snapshot("Be_shuffled1.in.h5", ".",
"Be_shuffled1.out.h5", ".", "va",
snapshot_type="openpmd")
data_handler.prepare_data()
test_parameters.network.layer_sizes = [data_handler.input_dimension,
100,
data_handler.output_dimension]

test_network = mala.Network(test_parameters)
test_trainer = mala.Trainer(test_parameters, test_network,
data_handler)
test_trainer.train_network()
new_loss = test_trainer.final_validation_loss
assert old_loss > new_loss
Loading