From 695f7e1b97bead99ffe5c7f7dc8c8f30959c240b Mon Sep 17 00:00:00 2001
From: Josh Borrow
Date: Wed, 18 Sep 2024 16:58:15 +0200
Subject: [PATCH 1/7] Added refactoring

---
 swiftsimio/masks.py                           |    5 +-
 .../metadata/metadata/metadata_fields.py      |    2 +
 swiftsimio/metadata/objects.py                | 1247 +++++++++++++++++
 swiftsimio/reader.py                          | 1125 +-------------
 4 files changed, 1255 insertions(+), 1124 deletions(-)
 create mode 100644 swiftsimio/metadata/objects.py

diff --git a/swiftsimio/masks.py b/swiftsimio/masks.py
index dde09d3f..1c65f6cf 100644
--- a/swiftsimio/masks.py
+++ b/swiftsimio/masks.py
@@ -50,9 +50,8 @@ def __init__(self, metadata: SWIFTMetadata, spatial_only=True):
         self.units = metadata.units
         self.spatial_only = spatial_only
 
-        if self.metadata.filetype == "FOF":
-            # No virtual snapshots or cells metadata for fof currently
-            raise NotImplementedError("Masking not supported for FOF filetype")
+        if not self.metadata.masking_valid:
+            raise NotImplementedError(f"Masking not supported for {self.metadata.output_type} filetype")
 
         if self.metadata.partial_snapshot:
             raise InvalidSnapshot(
diff --git a/swiftsimio/metadata/metadata/metadata_fields.py b/swiftsimio/metadata/metadata/metadata_fields.py
index 2e2b2f99..182a3907 100644
--- a/swiftsimio/metadata/metadata/metadata_fields.py
+++ b/swiftsimio/metadata/metadata/metadata_fields.py
@@ -23,6 +23,8 @@
 header_unpack_arrays = {
     "BoxSize": "boxsize",
     "NumPart_ThisFile": "num_part",
+    "NumGroup_ThisFile": "num_group",
+    "NumSubhalos_ThisFile": "num_subhalo",
     "CanHaveTypes": "has_type",
     "NumFilesPerSnapshot": "num_files_per_snapshot",
     "OutputType": "output_type",
diff --git a/swiftsimio/metadata/objects.py b/swiftsimio/metadata/objects.py
new file mode 100644
index 00000000..cdccc478
--- /dev/null
+++ b/swiftsimio/metadata/objects.py
@@ -0,0 +1,1247 @@
+import numpy as np
+import unyt
+
+import h5py
+from swiftsimio.conversions import swift_cosmology_to_astropy
+from swiftsimio.objects import cosmo_factor, a
+from swiftsimio import metadata
+from abc import ABC, abstractmethod
+
+import re
+import warnings
+
+from datetime import datetime
+from pathlib import Path
+
+from typing import List, Optional
+
+
+class MassTable(object):
+    """
+    Extracts a mass table to local variables based on the
+    particle type names.
+    """
+
+    def __init__(self, base_mass_table: np.array, mass_units: unyt.unyt_quantity):
+        """
+        Parameters
+        ----------
+
+        base_mass_table : np.array
+            Mass table of the same length as the number of particle types.
+
+        mass_units : unyt_quantity
+            Base mass units for the simulation.
+        """
+
+        # TODO: Extract these names from the files themselves if possible.
+
+        for index, name in metadata.particle_types.particle_name_underscores.items():
+            try:
+                setattr(
+                    self,
+                    name,
+                    unyt.unyt_quantity(base_mass_table[index], units=mass_units),
+                )
+            except IndexError:
+                # Backwards compatible.
+                setattr(self, name, None)
+
+        return
+
+    def __str__(self):
+        return f"Mass table for {' '.join(metadata.particle_types.particle_name_underscores.values())}"
+
+    def __repr__(self):
+        return self.__str__()
+
+
+class MappingTable(object):
+    """
+    A mapping table from one named column instance to the other.
+    Initially designed for the mapping between dust and elements.
+    """
+
+    def __init__(
+        self,
+        data: np.ndarray,
+        named_columns_x: List[str],
+        named_columns_y: List[str],
+        named_columns_x_name: str,
+        named_columns_y_name: str,
+    ):
+        """
+        Parameters
+        ----------
+
+        data: np.ndarray
+            The data array providing the mapping between the named
+            columns. 
Should be of size N x M, where N is the number + of elements in ``named_columns_x`` and M the number + of elements in ``named_columns_y``. + + named_columns_x: List[str] + The names of the columns in the first axis. + + named_columns_y: List[str] + The names of the columns in the second axis. + + named_columns_x_name: str + The name of the first mapping. + + named_columns_y_name: str + The name of the second mapping. + """ + + self.data = data + self.named_columns_x = named_columns_x + self.named_columns_y = named_columns_y + self.named_columns_x_name = named_columns_x_name + self.named_columns_y_name = named_columns_y_name + + for x, name_x in enumerate(named_columns_x): + for y, name_y in enumerate(named_columns_y): + setattr(self, f"{name_x.lower()}_to_{name_y.lower()}", data[x][y]) + + return + + def __str__(self): + return ( + f"Mapping table from {self.named_columns_x_name} to " + f"{self.named_columns_y_name}, containing {len(self.data)} " + f"by {len(self.data[0])} elements." + ) + + def __repr__(self): + return f"{self.__str__()}. Raw data: " "\n" f"{self.data}." + + + +class SWIFTGroupMetadata(object): + """ + Object that contains the metadata for one hdf5 group. + + This, for instance, could be part type 0, or 'gas'. This will load in + the names of all datasets, their units, possible named fields, + and their cosmology, and present them for use in the actual i/o routines. + + Methods + ------- + load_metadata(self): + Loads the required metadata. + load_field_names(self): + Loads in the field names. + load_field_units(self): + Loads in the units from each dataset. + load_field_descriptions(self): + Loads in descriptions of the fields for each dataset. + load_field_compressions(self): + Loads in compressions of the fields for each dataset. + load_cosmology(self): + Loads in the field cosmologies. + load_physical(self): + Loads in whether the field is saved as comoving or physical. + load_valid_transforms(self): + Loads in whether the field can be converted to comoving. + load_named_columns(self): + Loads the named column data for relevant fields. + """ + + def __init__( + self, group: str, group_name: str, metadata: "SWIFTMetadata", scale_factor: float + ): + """ + Constructor for SWIFTGroupMetadata class + + Parameters + ---------- + group: str + the name of the group in the hdf5 file + group_name : str + the corresponding group name for swiftsimio + metadata : SWIFTMetadata + the snapshot metadata + scale_factor : float + the snapshot scale factor + """ + self.group = group + self.group_name = group_name + self.metadata = metadata + self.units = metadata.units + self.scale_factor = scale_factor + + self.filename = metadata.filename + + self.load_metadata() + + return + + def __str__(self): + return f"Metadata class for {self.group} ({self.group_name})" + + def __repr__(self): + return self.__str__() + + def load_metadata(self): + """ + Loads the required metadata. + + This includes loading the field names, units and descriptions, as well as the + cosmology metadata and any custom named columns + """ + + self.load_field_names() + self.load_field_units() + self.load_field_descriptions() + self.load_field_compressions() + self.load_cosmology() + self.load_physical() + self.load_valid_transforms() + self.load_named_columns() + + def load_field_names(self): + """ + Loads in only the field names. 
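+
+        Dataset names are converted from SWIFT's CamelCase convention to
+        snake_case attribute names; for example, a dataset called
+        ``InternalEnergies`` is exposed as ``internal_energies``.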
+ """ + + # regular expression for camel case to snake case + # https://stackoverflow.com/a/1176023 + def convert(name): + return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower() + + # Skip fields which are groups themselves + self.field_paths = [] + self.field_names = [] + for item in self.metadata.handle[f"{self.group}"].keys(): + # Skip fields which are groups themselves + if f"{self.group}/{item}" not in self.metadata.present_groups: + self.field_paths.append(f"{self.group}/{item}") + self.field_names.append(convert(item)) + + return + + def load_field_units(self): + """ + Loads in the units from each dataset. + """ + + unit_dict = { + "I": self.units.current, + "L": self.units.length, + "M": self.units.mass, + "T": self.units.temperature, + "t": self.units.time, + } + + def get_units(unit_attribute): + units = 1.0 + + for exponent, unit in unit_dict.items(): + # We store the 'unit exponent' in the SWIFT metadata. This corresponds + # to the power we need to raise each unit to, to return the correct units + try: + # Need to check if the exponent is 0 manually because of float precision + unit_exponent = unit_attribute[f"U_{exponent} exponent"][0] + if unit_exponent != 0.0: + units *= unit ** unit_exponent + except KeyError: + # Can't load that data! + # We should probably warn the user here... + pass + + # Deal with case where we _really_ have a dimensionless quantity. Comparing with + # 1.0 doesn't work, beacause in these cases unyt reverts to a floating point + # comparison. + try: + units.units + except AttributeError: + units = None + + return units + + self.field_units = [ + get_units(self.metadata.handle[x].attrs) for x in self.field_paths + ] + + return + + def load_field_descriptions(self): + """ + Loads in the text descriptions of the fields for each dataset. + For SOAP filetypes a description of the mask is included. + """ + + def get_desc(dataset): + try: + description = dataset.attrs["Description"].decode("utf-8") + except AttributeError: + # Description is saved as a string not bytes + description = dataset.attrs["Description"] + except KeyError: + # Can't load description! + description = "No description available" + + if self.metadata.filetype != "SOAP": + return description + + try: + is_masked = dataset.attrs["Masked"] + except KeyError: + is_masked = False + if not is_masked: + return description + " Not masked." + + # TODO: Update for https://github.com/SWIFTSIM/SOAP/pull/81 + # TODO: Also need to add metadata for each halo type for that PR + # If is_masked is true then these other attributes should exist + mask_datasets = dataset.attrs["Mask Datasets"] + mask_threshold = dataset.attrs["Mask Threshold"] + if len(mask_datasets) == 1: + mask_str = f" Only computed for objects with {mask_datasets[0]} >= {mask_threshold}." + else: + mask_str = f' Only computed for objects where {" + ".join(mask_datasets)} >= {mask_threshold}.' + return description + mask_str + + self.field_descriptions = [ + get_desc(self.metadata.handle[x]) for x in self.field_paths + ] + + return + + def load_field_compressions(self): + """ + Loads in the string describing the compression filters of the fields for each dataset. 
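+
+        The string is the name of the lossy filter recorded by SWIFT (for
+        example ``FMantissa9``), "Not compressed." for uncompressed SOAP
+        datasets, or a placeholder when no compression metadata is available.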
+ """ + + def get_comp(dataset): + try: + # SOAP catalogues can be compressed/uncompressed + is_compressed = dataset.attrs["Is Compressed"] + except KeyError: + is_compressed = True + try: + comp = dataset.attrs["Lossy compression filter"].decode("utf-8") + except AttributeError: + # Compression is saved as str not bytes + comp = dataset.attrs["Lossy compression filter"] + except KeyError: + # Can't load compression string! + comp = "No compression info available" + + return comp if is_compressed else "Not compressed." + + self.field_compressions = [ + get_comp(self.metadata.handle[x]) for x in self.field_paths + ] + + return + + def load_cosmology(self): + """ + Loads in the field cosmologies. + """ + + current_scale_factor = self.scale_factor + + def get_cosmo(dataset): + try: + cosmo_exponent = dataset.attrs["a-scale exponent"][0] + except: + # Can't load, 'graceful' fallback. + cosmo_exponent = 0.0 + + a_factor_this_dataset = a ** cosmo_exponent + + return cosmo_factor(a_factor_this_dataset, current_scale_factor) + + self.field_cosmologies = [ + get_cosmo(self.metadata.handle[x]) for x in self.field_paths + ] + + return + + def load_physical(self): + """ + Loads in whether the field is saved as comoving or physical. + """ + + def get_physical(dataset): + try: + physical = dataset.attrs["Value stored as physical"][0] == 1 + except: + physical = False + return physical + + self.field_physicals = [ + get_physical(self.metadata.handle[x]) for x in self.field_paths + ] + + return + + def load_valid_transforms(self): + """ + Loads in whether the field can be converted to comoving. + """ + + def get_valid_transform(dataset): + try: + valid_transform = ( + dataset.attrs["Property can be converted to comoving"][0] == 1 + ) + except: + valid_transform = True + return valid_transform + + self.field_valid_transforms = [ + get_valid_transform(self.metadata.handle[x]) for x in self.field_paths + ] + + return + + def load_named_columns(self): + """ + Loads the named column data for relevant fields. + """ + + named_columns = {} + + for field in self.field_paths: + property_name = field.split("/")[-1] + + if property_name in self.metadata.named_columns.keys(): + field_names = self.metadata.named_columns[property_name] + + # Now need to make a decision on capitalisation. If we have a set of + # words with only one capital in them, then it's likely that they are + # element names or something similar, so they should be lower case. + # If on average we have many more capitals, then they are likely to be + # ionized fractions (e.g. HeII) and so we want to leave them with their + # original capitalisation. + + num_capitals = lambda x: sum(1 for c in x if c.isupper()) + mean_num_capitals = sum(map(num_capitals, field_names)) / len( + field_names + ) + + if mean_num_capitals < 1.01: + # Decapitalise them as they are likely individual element names + formatted_field_names = [x.lower() for x in field_names] + else: + formatted_field_names = field_names + + named_columns[field] = formatted_field_names + else: + named_columns[field] = None + + self.named_columns = named_columns + + return + + +class SWIFTUnits(object): + """ + Generates a unyt system that can then be used with the SWIFT data. + + These give the unit mass, length, time, current, and temperature as + unyt unit variables in simulation units. I.e. you can take any value + that you get out of the code and multiply it by the appropriate values + to get it 'unyt-ified' with the correct units. 
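+
+    A minimal usage sketch (assuming a SWIFT snapshot file called
+    ``snapshot.hdf5`` exists)::
+
+        from pathlib import Path
+
+        units = SWIFTUnits(Path("snapshot.hdf5"))
+        # Base units compose into derived simulation units as unyt quantities
+        velocity_unit = units.length / units.time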
+ + Attributes + ---------- + mass : float + unit for mass used + length : float + unit for length used + time : float + unit for time used + current : float + unit for current used + temperature : float + unit for temperature used + + """ + + def __init__(self, filename: Path, handle: Optional[h5py.File] = None): + """ + SWIFTUnits constructor + + Sets filename for file to read units from and gets unit dictionary + + Parameters + ---------- + + filename : Path + Name of file to read units from + + handle: h5py.File, optional + The h5py file handle, optional. Will open a new handle with the + filename if required. + + """ + self.filename = filename + self._handle = handle + + self.get_unit_dictionary() + + return + + @property + def handle(self): + """ + Property that gets the file handle, which can be shared + with other objects for efficiency reasons. + """ + if isinstance(self._handle, h5py.File): + # Can be open or closed, let's test. + try: + file = self._handle.file + + return self._handle + except ValueError: + # This will be the case if there is no active file handle + pass + + self._handle = h5py.File(self.filename, "r") + + return self._handle + + def get_unit_dictionary(self): + """ + Store unit data and metadata + + Length 1 arrays are used to store the unit data. This dictionary + also contains the metadata information that connects the unyt + objects to the names that are stored in the SWIFT snapshots. + """ + + self.units = { + name: unyt.unyt_quantity( + value[0], units=metadata.unit_types.unit_names_to_unyt[name] + ) + for name, value in self.handle["Units"].attrs.items() + } + + # We now unpack this into variables. + self.mass = metadata.unit_types.find_nearest_base_unit( + self.units["Unit mass in cgs (U_M)"], "mass" + ) + self.length = metadata.unit_types.find_nearest_base_unit( + self.units["Unit length in cgs (U_L)"], "length" + ) + self.time = metadata.unit_types.find_nearest_base_unit( + self.units["Unit time in cgs (U_t)"], "time" + ) + self.current = metadata.unit_types.find_nearest_base_unit( + self.units["Unit current in cgs (U_I)"], "current" + ) + self.temperature = metadata.unit_types.find_nearest_base_unit( + self.units["Unit temperature in cgs (U_T)"], "temperature" + ) + + def __del__(self): + if isinstance(self._handle, h5py.File): + self._handle.close() + + + +def metadata_discriminator(filename: str, units: SWIFTUnits) -> "SWIFTMetadata": + """ + Discriminates between the different types of metadata objects read from SWIFT-compatible + files. + + Parameters + ---------- + + filename : str + Name of the file to read metadata from + + units : SWIFTUnits + The units object associated with the file + + + Returns + ------- + + SWIFTMetadata + The appropriate metadata object for the file type + """ + # Old snapshots did not have this attribute, so we need to default to FullVolume + file_type = units.handle["Header"].attrs.get("OutputType", "FullVolume") + + if isinstance(file_type, bytes): + file_type = file_type.decode("utf-8") + + if file_type in ["FullVolume"]: + return SWIFTSnapshotMetadata(filename, units) + elif file_type in ["SOAP"]: + return SWIFTSOAPMetadata(filename, units) + elif file_type in ["FOF"]: + return SWIFTFOFMetadata(filename, units) + else: + raise ValueError(f"File type {file_type} not recognised.") + + + +class SWIFTMetadata(ABC): + """ + An abstract base class for all SWIFT-related file metadata. 
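+
+    Concrete subclasses (``SWIFTSnapshotMetadata``, ``SWIFTFOFMetadata``, and
+    ``SWIFTSOAPMetadata`` below) must implement ``present_groups`` and
+    ``present_group_names``; the shared machinery defined here handles header
+    parsing, group loading, and cosmology extraction.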
+ """ + + filename: str + units: SWIFTUnits + header: dict + masking_valid: bool = False + + @abstractmethod + def __init__(self, filename, units: SWIFTUnits): + raise NotImplementedError + + @property + def handle(self): + # Handle, which is shared with units. Units handles + # file opening and closing. + return self.units.handle + + def load_groups(self): + """ + Loads the groups and metadata into objects: + + metadata._properties + + This contains eight arrays, + + metadata._properties.field_names + metadata._properties.field_paths + metadata._properties.field_units + metadata._properties.field_cosmologies + metadata._properties.field_descriptions + metadata._properties.field_compressions + metadata._properties.field_physicals + metadata._properties.field_valid_transforms + + As well as some more information about the group. + """ + + for group, name in zip(self.present_groups, self.present_group_names): + filetype_metadata = SWIFTGroupMetadata( + group=group, + group_name=name, + metadata=self, + scale_factor=self.scale_factor, + ) + setattr(self, f"{name}_properties", filetype_metadata) + + return + + def get_metadata(self): + """ + Loads the metadata as specified in metadata.metadata_fields. + """ + + for field, name in metadata.metadata_fields.metadata_fields_to_read.items(): + try: + setattr(self, name, dict(self.handle[field].attrs)) + except KeyError: + setattr(self, name, None) + + return + + def postprocess_header(self): + """ + Some minor postprocessing on the header to local variables. + """ + + # These are just read straight in to variables + header_unpack_arrays_units = metadata.metadata_fields.generate_units_header_unpack_arrays( + m=self.units.mass, + l=self.units.length, + t=self.units.time, + I=self.units.current, + T=self.units.temperature, + ) + + for field, name in metadata.metadata_fields.header_unpack_arrays.items(): + try: + if name in header_unpack_arrays_units.keys(): + setattr( + self, + name, + unyt.unyt_array( + self.header[field], units=header_unpack_arrays_units[name] + ), + ) + # This is required or we automatically get everything in CGS! + getattr(self, name).convert_to_units( + header_unpack_arrays_units[name] + ) + else: + # Must not have any units! Oh well. + setattr(self, name, self.header[field]) + except KeyError: + # Must not be present, just skip it + continue + + # Now unpack the 'mass table' type items: + for field, name in metadata.metadata_fields.header_unpack_mass_tables.items(): + try: + setattr( + self, + name, + MassTable( + base_mass_table=self.header[field], mass_units=self.units.mass + ), + ) + except KeyError: + setattr( + self, + name, + MassTable( + base_mass_table=np.zeros( + len(metadata.particle_types.particle_name_underscores) + ), + mass_units=self.units.mass, + ), + ) + + # These must be unpacked as 'real' strings (i.e. 
converted to utf-8) + + for field, name in metadata.metadata_fields.header_unpack_string.items(): + try: + # Deal with h5py's quirkiness that fixed-sized and variable-sized + # strings are read as strings or bytes + # See: https://github.com/h5py/h5py/issues/2172 + raw = self.header[field] + try: + string = raw.decode("utf-8") + except AttributeError: + string = raw + setattr(self, name, string) + except KeyError: + # Must not be present, just skip it + setattr(self, name, "") + + # These must be unpacked as they are stored as length-1 arrays + + header_unpack_float_units = metadata.metadata_fields.generate_units_header_unpack_single_float( + m=self.units.mass, + l=self.units.length, + t=self.units.time, + I=self.units.current, + T=self.units.temperature, + ) + + for field, names in metadata.metadata_fields.header_unpack_single_float.items(): + try: + if isinstance(names, list): + # Sometimes we store a list in case we have multiple names, for example + # Redshift -> metadata.redshift AND metadata.z. Can't just do the iteration + # because we may loop over the letters in the string. + for variable in names: + if variable in header_unpack_float_units.keys(): + # We have an associated unit! + unit = header_unpack_float_units[variable] + setattr( + self, + variable, + unyt.unyt_quantity(self.header[field][0], units=unit), + ) + else: + # No unit + setattr(self, variable, self.header[field][0]) + else: + # We can just check for the unit and set the attribute + variable = names + if variable in header_unpack_float_units.keys(): + # We have an associated unit! + unit = header_unpack_float_units[variable] + setattr( + self, + variable, + unyt.unyt_quantity(self.header[field][0], units=unit), + ) + else: + # No unit + setattr(self, variable, self.header[field][0]) + except KeyError: + # Must not be present, just skip it + continue + + # These are special cases, sorry! + # Date and time of snapshot dump + try: + try: + # Try and decode bytes, otherwise save raw string + snapshot_date = self.header.get( + "SnapshotDate", self.header.get("Snapshot date", b"") + ).decode("utf-8") + except AttributeError: + snapshot_date = self.header.get( + "SnapshotDate", self.header.get("Snapshot date", "") + ) + try: + self.snapshot_date = datetime.strptime( + snapshot_date, "%H:%M:%S %Y-%m-%d %Z" + ) + except ValueError: + # Backwards compatibility; this was used previously due to simplicity + # but is not portable between regions. So if you ran a simulation on + # a British (en_GB) machine, and then tried to read on a Dutch + # machine (nl_NL), this would _not_ work because %c is different. + try: + self.snapshot_date = datetime.strptime(snapshot_date, "%c\n") + except ValueError: + # Oh dear this has gone _very_wrong. Let's just keep it as a string. 
+ self.snapshot_date = snapshot_date + except KeyError: + # Old file + pass + + # get photon group edges RT dataset from the SubgridScheme group + try: + self.photon_group_edges = ( + self.handle["SubgridScheme/PhotonGroupEdges"][:] / self.units.time + ) + except KeyError: + self.photon_group_edges = None + + # get reduced speed of light RT dataset from the SubgridScheme group + try: + self.reduced_lightspeed = ( + self.handle["SubgridScheme/ReducedLightspeed"][0] + * self.units.length + / self.units.time + ) + except KeyError: + self.reduced_lightspeed = None + + # Store these separately as self.n_gas = number of gas particles for example + for (part_number, (_, part_name)) in enumerate( + metadata.particle_types.particle_name_underscores.items() + ): + try: + setattr(self, f"n_{part_name}", self.num_part[part_number]) + except IndexError: + # Backwards compatibility; mass/number table can change size. + setattr(self, f"n_{part_name}", 0) + + # Need to unpack the gas gamma for cosmology + try: + self.gas_gamma = self.hydro_scheme["Adiabatic index"] + except (KeyError, TypeError): + # We can set a default and print a message whenever we require this value + self.gas_gamma = None + + try: + self.a = self.scale_factor + except AttributeError: + # These must always be present for the initialisation of cosmology properties + self.a = 1.0 + self.scale_factor = 1.0 + + return + + + def extract_cosmology(self): + """ + Creates an astropy.cosmology object from the internal cosmology system. + + This will be saved as ``self.cosmology``. + """ + + if self.cosmology_raw is not None: + cosmo = self.cosmology_raw + else: + cosmo = {"Cosmological run": 0} + + if cosmo.get("Cosmological run", 0): + self.cosmology = swift_cosmology_to_astropy(cosmo, units=self.units) + else: + self.cosmology = None + + return + + @property + @abstractmethod + def present_groups(self): + raise NotImplementedError + + @property + @abstractmethod + def present_group_names(self): + raise NotImplementedError + + @property + def partial_snapshot(self) -> bool: + return False + +class SWIFTSnapshotMetadata(SWIFTMetadata): + """ + Loads all metadata (apart from Units, those are handled by SWIFTUnits) + into dictionaries. + + This also does some extra parsing on some well-used metadata. + """ + + # Name of the file that has been read from + filename: str + # Unit instance associated with this file + units: SWIFTUnits + # Header dictionary, metadata about snapshot. + header: dict + masking_valid: bool = True + + def __init__(self, filename, units: SWIFTUnits): + """ + Constructor for SWIFTMetadata object + + Parameters + ---------- + + filename : str + name of file to read from + + units : SWIFTUnits + the units being used + """ + self.filename = filename + self.units = units + + self.get_metadata() + self.get_named_column_metadata() + self.get_mapping_metadata() + + self.postprocess_header() + + self.load_groups() + self.extract_cosmology() + + # After we've loaded all this metadata, we can safely release the file handle. + self.handle.close() + + return + + def get_named_column_metadata(self): + """ + Loads the custom named column metadata (if it exists) from + SubgridScheme/NamedColumns. 
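+
+        The result is stored as ``self.named_columns``, a dictionary mapping
+        each dataset name to its list of column names, or an empty dictionary
+        if the file carries no named columns.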
+ """ + + try: + data = self.handle["SubgridScheme/NamedColumns"] + + self.named_columns = { + k: [x.decode("utf-8") for x in data[k][:]] for k in data.keys() + } + except KeyError: + self.named_columns = {} + + return + + def get_mapping_metadata(self): + """ + Gets the mappings based on the named columns (must have already been read), + from the form: + + SubgridScheme/{X}To{Y}Mapping. + + Includes a hack of `Dust` -> `Grains` that will be deprecated. + """ + + try: + possible_keys = self.handle["SubgridScheme"].keys() + + available_keys = [key for key in possible_keys if key.endswith("Mapping")] + available_data = [ + self.handle[f"SubgridScheme/{key}"][:] for key in available_keys + ] + except KeyError: + available_keys = [] + available_data = [] + + # Keys have form {X}To{Y}Mapping + + # regular expression for camel case to snake case + # https://stackoverflow.com/a/1176023 + def convert(name): + return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower() + + regex = r"([a-zA-Z]*)To([a-zA-Z]*)Mapping" + compiled = re.compile(regex) + + for key, data in zip(available_keys, available_data): + match = compiled.match(key) + snake_case = convert(key) + + if match: + x = match.group(1) + y = match.group(2) + + if x == "Grain": + warnings.warn( + "Use of the GrainToElementMapping is deprecated, please use a newer " + "version of SWIFT to run this simulation.", + DeprecationWarning, + ) + + x = "Dust" + + named_column_name_x = [ + key for key in self.named_columns.keys() if key.startswith(x) + ][0] + named_column_name_y = [ + key for key in self.named_columns.keys() if key.startswith(y) + ][0] + + setattr( + self, + snake_case, + MappingTable( + data=data, + named_columns_x=self.named_columns[named_column_name_x], + named_columns_y=self.named_columns[named_column_name_y], + named_columns_x_name=named_column_name_x, + named_columns_y_name=named_column_name_y, + ), + ) + + return + + + @property + def present_groups(self): + """ + The groups containing datasets that are present in the file. + """ + types = np.where(np.array(getattr(self, "has_type", self.num_part)) != 0)[0] + return [f"PartType{i}" for i in types] + + @property + def present_group_names(self): + """ + The names of the groups that we want to expose. 
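+
+        These are the swiftsimio underscore names, so a file containing
+        ``PartType0`` and ``PartType1`` exposes ``gas`` and ``dark_matter``.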
+ """ + + return [ + metadata.particle_types.particle_name_underscores[x] + for x in self.present_groups + ] + + @property + def code_info(self) -> str: + """ + Gets a nicely printed set of code information with: + + Name (Git Branch) + Git Revision + Git Date + """ + + def get_string(x): + return self.code[x].decode("utf-8") + + output = ( + f"{get_string('Code')} ({get_string('Git Branch')})\n" + f"{get_string('Git Revision')}\n" + f"{get_string('Git Date')}" + ) + + return output + + @property + def compiler_info(self) -> str: + """ + Gets information about the compiler and formats it as: + + Compiler Name (Compiler Version) + MPI library + """ + + def get_string(x): + return self.code[x].decode("utf-8") + + output = ( + f"{get_string('Compiler Name')} ({get_string('Compiler Version')})\n" + f"{get_string('MPI library')}" + ) + + return output + + @property + def library_info(self) -> str: + """ + Gets information about the libraries used and formats it as: + + FFTW vFFTW library version + GSL vGSL library version + HDF5 vHDF5 library version + """ + + def get_string(x): + return self.code[f"{x} library version"].decode("utf-8") + + output = ( + f"FFTW v{get_string('FFTW')}\n" + f"GSL v{get_string('GSL')}\n" + f"HDF5 v{get_string('HDF5')}" + ) + + return output + + @property + def hydro_info(self) -> str: + r""" + Gets information about the hydro scheme and formats it as: + + Scheme + Kernel function in DimensionD + $\eta$ = Kernel eta (Kernel target N_ngb $N_{ngb}$) + $C_{\rm CFL}$ = CFL parameter + """ + + def get_float(x): + return "{:4.2f}".format(self.hydro_scheme[x][0]) + + def get_int(x): + return int(self.hydro_scheme[x][0]) + + def get_string(x): + return self.hydro_scheme[x].decode("utf-8") + + output = ( + f"{get_string('Scheme')}\n" + f"{get_string('Kernel function')} in {get_int('Dimension')}D\n" + rf"$\eta$ = {get_float('Kernel eta')} " + rf"({get_float('Kernel target N_ngb')} $N_{{ngb}}$)" + "\n" + rf"$C_{{\rm CFL}}$ = {get_float('CFL parameter')}" + ) + + return output + + @property + def viscosity_info(self) -> str: + r""" + Gets information about the viscosity scheme and formats it as: + + Viscosity Model + $\alpha_{V, 0}$ = Alpha viscosity, $\ell_V$ = Viscosity decay length [internal units], $\beta_V$ = Beta viscosity + Alpha viscosity (min) < $\alpha_V$ < Alpha viscosity (max) + """ + + def get_float(x): + return "{:4.2f}".format(self.hydro_scheme[x][0]) + + def get_string(x): + return self.hydro_scheme[x].decode("utf-8") + + output = ( + f"{get_string('Viscosity Model')}\n" + rf"$\alpha_{{V, 0}}$ = {get_float('Alpha viscosity')}, " + rf"$\ell_V$ = {get_float('Viscosity decay length [internal units]')}, " + rf"$\beta_V$ = {get_float('Beta viscosity')}" + "\n" + rf"{get_float('Alpha viscosity (min)')} < $\alpha_V$ < {get_float('Alpha viscosity (max)')}" + ) + + return output + + @property + def diffusion_info(self) -> str: + """ + Gets information about the diffusion scheme and formats it as: + + $\alpha_{D, 0}$ = Diffusion alpha, $\beta_D$ = Diffusion beta + Diffusion alpha (min) < $\alpha_D$ < Diffusion alpha (max) + """ + + def get_float(x): + return "{:4.2f}".format(self.hydro_scheme[x][0]) + + output = ( + rf"$\alpha_{{D, 0}}$ = {get_float('Diffusion alpha')}, " + rf"$\beta_D$ = {get_float('Diffusion beta')}" + "\n" + rf"${get_float('Diffusion alpha (min)')} < " + rf"\alpha_D < {get_float('Diffusion alpha (max)')}$" + ) + + return output + + @property + def partial_snapshot(self) -> bool: + """ + Whether or not this snapshot is partial (e.g. 
a "x.0.hdf5" file), or
+        a file describing an entire snapshot.
+        """
+
+        # Partial snapshots have num_files_per_snapshot set to more than 1. Virtual
+        # snapshots collating multiple sub-snapshots together have num_files_per_snapshot = 1.
+
+        return self.num_files_per_snapshot > 1
+
+
+class SWIFTFOFMetadata(SWIFTMetadata):
+    masking_valid: bool = False
+
+    def __init__(self, filename: str, units: SWIFTUnits):
+        self.filename = filename
+        self.units = units
+
+        self.get_metadata()
+        self.postprocess_header()
+
+        self.load_groups()
+
+        # After we've loaded all this metadata, we can safely release the file handle.
+        self.handle.close()
+
+        return
+
+    @property
+    def present_groups(self):
+        """
+        The groups containing datasets that are present in the file.
+        """
+        return ["Groups"]
+
+    @property
+    def present_group_names(self):
+        """
+        The names of the groups that we want to expose.
+        """
+        return ["fof_groups"]
+
+
+class SWIFTSOAPMetadata(SWIFTMetadata):
+    masking_valid: bool = True
+
+    def __init__(self, filename: str, units: SWIFTUnits):
+        self.filename = filename
+        self.units = units
+
+        self.get_metadata()
+        self.postprocess_header()
+
+        self.load_groups()
+
+        # After we've loaded all this metadata, we can safely release the file handle.
+        self.handle.close()
+
+        return
+
+    @property
+    def present_groups(self):
+        """
+        The groups containing datasets that are present in the file.
+        """
+        return self.subhalo_types
+
+    @property
+    def present_group_names(self):
+        """
+        The names of the groups that we want to expose.
+        """
+        return [
+            metadata.soap_types.get_soap_name_underscore(x)
+            for x in self.present_groups
+        ]
\ No newline at end of file
diff --git a/swiftsimio/reader.py b/swiftsimio/reader.py
index abaa5345..2fee88df 100644
--- a/swiftsimio/reader.py
+++ b/swiftsimio/reader.py
@@ -11,10 +11,11 @@
 + SWIFTDataset, a container class for all of the above.
 """
 
-from swiftsimio import metadata
+
 from swiftsimio.accelerated import read_ranges_from_file
 from swiftsimio.objects import cosmo_array, cosmo_factor, a
-from swiftsimio.conversions import swift_cosmology_to_astropy
+
+from swiftsimio.metadata.objects import metadata_discriminator, SWIFTUnits, SWIFTGroupMetadata, SWIFTMetadata
 
 import re
 import h5py
@@ -28,1124 +29,6 @@
 from typing import Union, Callable, List, Optional
 
 
-class MassTable(object):
-    """
-    Extracts a mass table to local variables based on the
-    particle type names.
-    """
-
-    def __init__(self, base_mass_table: np.array, mass_units: unyt.unyt_quantity):
-        """
-        Parameters
-        ----------
-
-        base_mass_table : np.array
-            Mass table of the same length as the number of particle types.
-
-        mass_units : unyt_quantity
-            Base mass units for the simulation.
-        """
-
-        for index, name in metadata.particle_types.particle_name_underscores.items():
-            try:
-                setattr(
-                    self,
-                    name,
-                    unyt.unyt_quantity(base_mass_table[index], units=mass_units),
-                )
-            except IndexError:
-                # Backwards compatible.
-                setattr(self, name, None)
-
-        return
-
-    def __str__(self):
-        return f"Mass table for {' '.join(metadata.particle_types.particle_name_underscores.values())}"
-
-    def __repr__(self):
-        return self.__str__()
-
-
-class MappingTable(object):
-    """
-    A mapping table from one named column instance to the other.
-    Initially designed for the mapping between dust and elements.
- """ - - def __init__( - self, - data: np.ndarray, - named_columns_x: List[str], - named_columns_y: List[str], - named_columns_x_name: str, - named_columns_y_name: str, - ): - """ - Parameters - ---------- - - data: np.ndarray - The data array providing the mapping between the named - columns. Should be of size N x M, where N is the number - of elements in ``named_columns_x`` and M the number - of elements in ``named_columns_y``. - - named_columns_x: List[str] - The names of the columns in the first axis. - - named_columns_y: List[str] - The names of the columns in the second axis. - - named_columns_x_name: str - The name of the first mapping. - - named_columns_y_name: str - The name of the second mapping. - """ - - self.data = data - self.named_columns_x = named_columns_x - self.named_columns_y = named_columns_y - self.named_columns_x_name = named_columns_x_name - self.named_columns_y_name = named_columns_y_name - - for x, name_x in enumerate(named_columns_x): - for y, name_y in enumerate(named_columns_y): - setattr(self, f"{name_x.lower()}_to_{name_y.lower()}", data[x][y]) - - return - - def __str__(self): - return ( - f"Mapping table from {self.named_columns_x_name} to " - f"{self.named_columns_y_name}, containing {len(self.data)} " - f"by {len(self.data[0])} elements." - ) - - def __repr__(self): - return f"{self.__str__()}. Raw data: " "\n" f"{self.data}." - - -class SWIFTUnits(object): - """ - Generates a unyt system that can then be used with the SWIFT data. - - These give the unit mass, length, time, current, and temperature as - unyt unit variables in simulation units. I.e. you can take any value - that you get out of the code and multiply it by the appropriate values - to get it 'unyt-ified' with the correct units. - - Attributes - ---------- - mass : float - unit for mass used - length : float - unit for length used - time : float - unit for time used - current : float - unit for current used - temperature : float - unit for temperature used - - """ - - def __init__(self, filename: Path, handle: Optional[h5py.File] = None): - """ - SWIFTUnits constructor - - Sets filename for file to read units from and gets unit dictionary - - Parameters - ---------- - - filename : Path - Name of file to read units from - - handle: h5py.File, optional - The h5py file handle, optional. Will open a new handle with the - filename if required. - - """ - self.filename = filename - self._handle = handle - - self.get_unit_dictionary() - - return - - @property - def handle(self): - """ - Property that gets the file handle, which can be shared - with other objects for efficiency reasons. - """ - if isinstance(self._handle, h5py.File): - # Can be open or closed, let's test. - try: - file = self._handle.file - - return self._handle - except ValueError: - # This will be the case if there is no active file handle - pass - - self._handle = h5py.File(self.filename, "r") - - return self._handle - - def get_unit_dictionary(self): - """ - Store unit data and metadata - - Length 1 arrays are used to store the unit data. This dictionary - also contains the metadata information that connects the unyt - objects to the names that are stored in the SWIFT snapshots. - """ - - self.units = { - name: unyt.unyt_quantity( - value[0], units=metadata.unit_types.unit_names_to_unyt[name] - ) - for name, value in self.handle["Units"].attrs.items() - } - - # We now unpack this into variables. 
- self.mass = metadata.unit_types.find_nearest_base_unit( - self.units["Unit mass in cgs (U_M)"], "mass" - ) - self.length = metadata.unit_types.find_nearest_base_unit( - self.units["Unit length in cgs (U_L)"], "length" - ) - self.time = metadata.unit_types.find_nearest_base_unit( - self.units["Unit time in cgs (U_t)"], "time" - ) - self.current = metadata.unit_types.find_nearest_base_unit( - self.units["Unit current in cgs (U_I)"], "current" - ) - self.temperature = metadata.unit_types.find_nearest_base_unit( - self.units["Unit temperature in cgs (U_T)"], "temperature" - ) - - def __del__(self): - if isinstance(self._handle, h5py.File): - self._handle.close() - - -class SWIFTMetadata(object): - """ - Loads all metadata (apart from Units, those are handled by SWIFTUnits) - into dictionaries. - - This also does some extra parsing on some well-used metadata. - """ - - # Name of the file that has been read from - filename: str - # Unit instance associated with this file - units: SWIFTUnits - # Header dictionary, metadata about snapshot. - header: dict - - def __init__(self, filename, units: SWIFTUnits): - """ - Constructor for SWIFTMetadata object - - Parameters - ---------- - - filename : str - name of file to read from - - units : SWIFTUnits - the units being used - """ - self.filename = filename - self.units = units - - self.get_metadata() - self.get_named_column_metadata() - self.get_mapping_metadata() - - self.postprocess_header() - - self.get_filetype() - - self.load_groups() - self.extract_cosmology() - - # After we've loaded all this metadata, we can safely release the file handle. - self.handle.close() - - return - - @property - def handle(self): - # Handle, which is shared with units. Units handles - # file opening and closing. - return self.units.handle - - def get_metadata(self): - """ - Loads the metadata as specified in metadata.metadata_fields. - """ - - for field, name in metadata.metadata_fields.metadata_fields_to_read.items(): - try: - setattr(self, name, dict(self.handle[field].attrs)) - except KeyError: - setattr(self, name, None) - - return - - def get_named_column_metadata(self): - """ - Loads the custom named column metadata (if it exists) from - SubgridScheme/NamedColumns. - """ - - try: - data = self.handle["SubgridScheme/NamedColumns"] - - self.named_columns = { - k: [x.decode("utf-8") for x in data[k][:]] for k in data.keys() - } - except KeyError: - self.named_columns = {} - - return - - def get_mapping_metadata(self): - """ - Gets the mappings based on the named columns (must have already been read), - from the form: - - SubgridScheme/{X}To{Y}Mapping. - - Includes a hack of `Dust` -> `Grains` that will be deprecated. 
- """ - - try: - possible_keys = self.handle["SubgridScheme"].keys() - - available_keys = [key for key in possible_keys if key.endswith("Mapping")] - available_data = [ - self.handle[f"SubgridScheme/{key}"][:] for key in available_keys - ] - except KeyError: - available_keys = [] - available_data = [] - - # Keys have form {X}To{Y}Mapping - - # regular expression for camel case to snake case - # https://stackoverflow.com/a/1176023 - def convert(name): - return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower() - - regex = r"([a-zA-Z]*)To([a-zA-Z]*)Mapping" - compiled = re.compile(regex) - - for key, data in zip(available_keys, available_data): - match = compiled.match(key) - snake_case = convert(key) - - if match: - x = match.group(1) - y = match.group(2) - - if x == "Grain": - warnings.warn( - "Use of the GrainToElementMapping is deprecated, please use a newer " - "version of SWIFT to run this simulation.", - DeprecationWarning, - ) - - x = "Dust" - - named_column_name_x = [ - key for key in self.named_columns.keys() if key.startswith(x) - ][0] - named_column_name_y = [ - key for key in self.named_columns.keys() if key.startswith(y) - ][0] - - setattr( - self, - snake_case, - MappingTable( - data=data, - named_columns_x=self.named_columns[named_column_name_x], - named_columns_y=self.named_columns[named_column_name_y], - named_columns_x_name=named_column_name_x, - named_columns_y_name=named_column_name_y, - ), - ) - - return - - def postprocess_header(self): - """ - Some minor postprocessing on the header to local variables. - """ - - # These are just read straight in to variables - header_unpack_arrays_units = metadata.metadata_fields.generate_units_header_unpack_arrays( - m=self.units.mass, - l=self.units.length, - t=self.units.time, - I=self.units.current, - T=self.units.temperature, - ) - - for field, name in metadata.metadata_fields.header_unpack_arrays.items(): - try: - if name in header_unpack_arrays_units.keys(): - setattr( - self, - name, - unyt.unyt_array( - self.header[field], units=header_unpack_arrays_units[name] - ), - ) - # This is required or we automatically get everything in CGS! - getattr(self, name).convert_to_units( - header_unpack_arrays_units[name] - ) - else: - # Must not have any units! Oh well. - setattr(self, name, self.header[field]) - except KeyError: - # Must not be present, just skip it - continue - - # Now unpack the 'mass table' type items: - for field, name in metadata.metadata_fields.header_unpack_mass_tables.items(): - try: - setattr( - self, - name, - MassTable( - base_mass_table=self.header[field], mass_units=self.units.mass - ), - ) - except KeyError: - setattr( - self, - name, - MassTable( - base_mass_table=np.zeros( - len(metadata.particle_types.particle_name_underscores) - ), - mass_units=self.units.mass, - ), - ) - - # These must be unpacked as 'real' strings (i.e. 
converted to utf-8) - - for field, name in metadata.metadata_fields.header_unpack_string.items(): - try: - # Deal with h5py's quirkiness that fixed-sized and variable-sized - # strings are read as strings or bytes - # See: https://github.com/h5py/h5py/issues/2172 - raw = self.header[field] - try: - string = raw.decode("utf-8") - except AttributeError: - string = raw - setattr(self, name, string) - except KeyError: - # Must not be present, just skip it - setattr(self, name, "") - - # These must be unpacked as they are stored as length-1 arrays - - header_unpack_float_units = metadata.metadata_fields.generate_units_header_unpack_single_float( - m=self.units.mass, - l=self.units.length, - t=self.units.time, - I=self.units.current, - T=self.units.temperature, - ) - - for field, names in metadata.metadata_fields.header_unpack_single_float.items(): - try: - if isinstance(names, list): - # Sometimes we store a list in case we have multiple names, for example - # Redshift -> metadata.redshift AND metadata.z. Can't just do the iteration - # because we may loop over the letters in the string. - for variable in names: - if variable in header_unpack_float_units.keys(): - # We have an associated unit! - unit = header_unpack_float_units[variable] - setattr( - self, - variable, - unyt.unyt_quantity(self.header[field][0], units=unit), - ) - else: - # No unit - setattr(self, variable, self.header[field][0]) - else: - # We can just check for the unit and set the attribute - variable = names - if variable in header_unpack_float_units.keys(): - # We have an associated unit! - unit = header_unpack_float_units[variable] - setattr( - self, - variable, - unyt.unyt_quantity(self.header[field][0], units=unit), - ) - else: - # No unit - setattr(self, variable, self.header[field][0]) - except KeyError: - # Must not be present, just skip it - continue - - # These are special cases, sorry! - # Date and time of snapshot dump - try: - try: - # Try and decode bytes, otherwise save raw string - snapshot_date = self.header.get( - "SnapshotDate", self.header.get("Snapshot date", b"") - ).decode("utf-8") - except AttributeError: - snapshot_date = self.header.get( - "SnapshotDate", self.header.get("Snapshot date", "") - ) - try: - self.snapshot_date = datetime.strptime( - snapshot_date, "%H:%M:%S %Y-%m-%d %Z" - ) - except ValueError: - # Backwards compatibility; this was used previously due to simplicity - # but is not portable between regions. So if you ran a simulation on - # a British (en_GB) machine, and then tried to read on a Dutch - # machine (nl_NL), this would _not_ work because %c is different. - try: - self.snapshot_date = datetime.strptime(snapshot_date, "%c\n") - except ValueError: - # Oh dear this has gone _very_wrong. Let's just keep it as a string. 
- self.snapshot_date = snapshot_date - except KeyError: - # Old file - pass - - # get photon group edges RT dataset from the SubgridScheme group - try: - self.photon_group_edges = ( - self.handle["SubgridScheme/PhotonGroupEdges"][:] / self.units.time - ) - except KeyError: - self.photon_group_edges = None - - # get reduced speed of light RT dataset from the SubgridScheme group - try: - self.reduced_lightspeed = ( - self.handle["SubgridScheme/ReducedLightspeed"][0] - * self.units.length - / self.units.time - ) - except KeyError: - self.reduced_lightspeed = None - - # Store these separately as self.n_gas = number of gas particles for example - for (part_number, (_, part_name)) in enumerate( - metadata.particle_types.particle_name_underscores.items() - ): - try: - setattr(self, f"n_{part_name}", self.num_part[part_number]) - except IndexError: - # Backwards compatibility; mass/number table can change size. - setattr(self, f"n_{part_name}", 0) - - # Need to unpack the gas gamma for cosmology - try: - self.gas_gamma = self.hydro_scheme["Adiabatic index"] - except (KeyError, TypeError): - # We can set a default and print a message whenever we require this value - self.gas_gamma = None - - try: - self.a = self.scale_factor - except AttributeError: - # These must always be present for the initialisation of cosmology properties - self.a = 1.0 - self.scale_factor = 1.0 - - return - - def get_filetype(self): - """ - Determine what type of file we are reading - """ - if self.output_type == "FOF": - self.filetype = "FOF" - elif self.output_type == "SOAP": - self.filetype = "SOAP" - else: - # Default to snapshot - self.filetype = "snapshot" - - def load_groups(self): - """ - Loads the groups and metadata into objects: - - metadata._properties - - This contains eight arrays, - - metadata._properties.field_names - metadata._properties.field_paths - metadata._properties.field_units - metadata._properties.field_cosmologies - metadata._properties.field_descriptions - metadata._properties.field_compressions - metadata._properties.field_physicals - metadata._properties.field_valid_transforms - - As well as some more information about the group. - """ - - for group, name in zip(self.present_groups, self.present_group_names): - filetype_metadata = SWIFTGroupMetadata( - group=group, - group_name=name, - metadata=self, - scale_factor=self.scale_factor, - ) - setattr(self, f"{name}_properties", filetype_metadata) - - return - - def extract_cosmology(self): - """ - Creates an astropy.cosmology object from the internal cosmology system. - - This will be saved as ``self.cosmology``. - """ - - if self.cosmology_raw is not None: - cosmo = self.cosmology_raw - else: - cosmo = {"Cosmological run": 0} - - if cosmo.get("Cosmological run", 0): - self.cosmology = swift_cosmology_to_astropy(cosmo, units=self.units) - else: - self.cosmology = None - - return - - @property - def present_groups(self): - """ - The groups containing datasets that are present in the file. - """ - if self.filetype == "snapshot": - types = np.where(np.array(getattr(self, "has_type", self.num_part)) != 0)[0] - return [f"PartType{i}" for i in types] - elif self.filetype == "FOF": - return ["Groups"] - elif self.filetype == "SOAP": - return self.subhalo_types - - @property - def present_group_names(self): - """ - The names of the groups that we want to expose. 
- """ - - if self.filetype == "snapshot": - return [ - metadata.particle_types.particle_name_underscores[x] - for x in self.present_groups - ] - elif self.filetype == "FOF": - return ["fof_groups"] - elif self.filetype == "SOAP": - return [ - metadata.soap_types.get_soap_name_underscore(x) - for x in self.present_groups - ] - - @property - def code_info(self) -> str: - """ - Gets a nicely printed set of code information with: - - Name (Git Branch) - Git Revision - Git Date - """ - - def get_string(x): - return self.code[x].decode("utf-8") - - output = ( - f"{get_string('Code')} ({get_string('Git Branch')})\n" - f"{get_string('Git Revision')}\n" - f"{get_string('Git Date')}" - ) - - return output - - @property - def compiler_info(self) -> str: - """ - Gets information about the compiler and formats it as: - - Compiler Name (Compiler Version) - MPI library - """ - - def get_string(x): - return self.code[x].decode("utf-8") - - output = ( - f"{get_string('Compiler Name')} ({get_string('Compiler Version')})\n" - f"{get_string('MPI library')}" - ) - - return output - - @property - def library_info(self) -> str: - """ - Gets information about the libraries used and formats it as: - - FFTW vFFTW library version - GSL vGSL library version - HDF5 vHDF5 library version - """ - - def get_string(x): - return self.code[f"{x} library version"].decode("utf-8") - - output = ( - f"FFTW v{get_string('FFTW')}\n" - f"GSL v{get_string('GSL')}\n" - f"HDF5 v{get_string('HDF5')}" - ) - - return output - - @property - def hydro_info(self) -> str: - r""" - Gets information about the hydro scheme and formats it as: - - Scheme - Kernel function in DimensionD - $\eta$ = Kernel eta (Kernel target N_ngb $N_{ngb}$) - $C_{\rm CFL}$ = CFL parameter - """ - - def get_float(x): - return "{:4.2f}".format(self.hydro_scheme[x][0]) - - def get_int(x): - return int(self.hydro_scheme[x][0]) - - def get_string(x): - return self.hydro_scheme[x].decode("utf-8") - - output = ( - f"{get_string('Scheme')}\n" - f"{get_string('Kernel function')} in {get_int('Dimension')}D\n" - rf"$\eta$ = {get_float('Kernel eta')} " - rf"({get_float('Kernel target N_ngb')} $N_{{ngb}}$)" - "\n" - rf"$C_{{\rm CFL}}$ = {get_float('CFL parameter')}" - ) - - return output - - @property - def viscosity_info(self) -> str: - r""" - Gets information about the viscosity scheme and formats it as: - - Viscosity Model - $\alpha_{V, 0}$ = Alpha viscosity, $\ell_V$ = Viscosity decay length [internal units], $\beta_V$ = Beta viscosity - Alpha viscosity (min) < $\alpha_V$ < Alpha viscosity (max) - """ - - def get_float(x): - return "{:4.2f}".format(self.hydro_scheme[x][0]) - - def get_string(x): - return self.hydro_scheme[x].decode("utf-8") - - output = ( - f"{get_string('Viscosity Model')}\n" - rf"$\alpha_{{V, 0}}$ = {get_float('Alpha viscosity')}, " - rf"$\ell_V$ = {get_float('Viscosity decay length [internal units]')}, " - rf"$\beta_V$ = {get_float('Beta viscosity')}" - "\n" - rf"{get_float('Alpha viscosity (min)')} < $\alpha_V$ < {get_float('Alpha viscosity (max)')}" - ) - - return output - - @property - def diffusion_info(self) -> str: - """ - Gets information about the diffusion scheme and formats it as: - - $\alpha_{D, 0}$ = Diffusion alpha, $\beta_D$ = Diffusion beta - Diffusion alpha (min) < $\alpha_D$ < Diffusion alpha (max) - """ - - def get_float(x): - return "{:4.2f}".format(self.hydro_scheme[x][0]) - - output = ( - rf"$\alpha_{{D, 0}}$ = {get_float('Diffusion alpha')}, " - rf"$\beta_D$ = {get_float('Diffusion beta')}" - "\n" - rf"${get_float('Diffusion alpha 
(min)')} < " - rf"\alpha_D < {get_float('Diffusion alpha (max)')}$" - ) - - return output - - @property - def partial_snapshot(self) -> bool: - """ - Whether or not this snapshot is partial (e.g. a "x.0.hdf5" file), or - a file describing an entire snapshot. - """ - - # Partial snapshots have num_files_per_snapshot set to 1. Virtual snapshots - # collating multiple sub-snapshots together have num_files_per_snapshot = 1. - - return self.num_files_per_snapshot > 1 - - -class SWIFTGroupMetadata(object): - """ - Object that contains the metadata for one hdf5 group. - - This, for instance, could be part type 0, or 'gas'. This will load in - the names of all datasets, their units, possible named fields, - and their cosmology, and present them for use in the actual i/o routines. - - Methods - ------- - load_metadata(self): - Loads the required metadata. - load_field_names(self): - Loads in the field names. - load_field_units(self): - Loads in the units from each dataset. - load_field_descriptions(self): - Loads in descriptions of the fields for each dataset. - load_field_compressions(self): - Loads in compressions of the fields for each dataset. - load_cosmology(self): - Loads in the field cosmologies. - load_physical(self): - Loads in whether the field is saved as comoving or physical. - load_valid_transforms(self): - Loads in whether the field can be converted to comoving. - load_named_columns(self): - Loads the named column data for relevant fields. - """ - - def __init__( - self, group: str, group_name: str, metadata: SWIFTMetadata, scale_factor: float - ): - """ - Constructor for SWIFTGroupMetadata class - - Parameters - ---------- - group: str - the name of the group in the hdf5 file - group_name : str - the corresponding group name for swiftsimio - metadata : SWIFTMetadata - the snapshot metadata - scale_factor : float - the snapshot scale factor - """ - self.group = group - self.group_name = group_name - self.metadata = metadata - self.units = metadata.units - self.scale_factor = scale_factor - - self.filename = metadata.filename - - self.load_metadata() - - return - - def __str__(self): - return f"Metadata class for {self.group} ({self.group_name})" - - def __repr__(self): - return self.__str__() - - def load_metadata(self): - """ - Loads the required metadata. - - This includes loading the field names, units and descriptions, as well as the - cosmology metadata and any custom named columns - """ - - self.load_field_names() - self.load_field_units() - self.load_field_descriptions() - self.load_field_compressions() - self.load_cosmology() - self.load_physical() - self.load_valid_transforms() - self.load_named_columns() - - def load_field_names(self): - """ - Loads in only the field names. - """ - - # regular expression for camel case to snake case - # https://stackoverflow.com/a/1176023 - def convert(name): - return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower() - - # Skip fields which are groups themselves - self.field_paths = [] - self.field_names = [] - for item in self.metadata.handle[f"{self.group}"].keys(): - # Skip fields which are groups themselves - if f"{self.group}/{item}" not in self.metadata.present_groups: - self.field_paths.append(f"{self.group}/{item}") - self.field_names.append(convert(item)) - - return - - def load_field_units(self): - """ - Loads in the units from each dataset. 
- """ - - unit_dict = { - "I": self.units.current, - "L": self.units.length, - "M": self.units.mass, - "T": self.units.temperature, - "t": self.units.time, - } - - def get_units(unit_attribute): - units = 1.0 - - for exponent, unit in unit_dict.items(): - # We store the 'unit exponent' in the SWIFT metadata. This corresponds - # to the power we need to raise each unit to, to return the correct units - try: - # Need to check if the exponent is 0 manually because of float precision - unit_exponent = unit_attribute[f"U_{exponent} exponent"][0] - if unit_exponent != 0.0: - units *= unit ** unit_exponent - except KeyError: - # Can't load that data! - # We should probably warn the user here... - pass - - # Deal with case where we _really_ have a dimensionless quantity. Comparing with - # 1.0 doesn't work, beacause in these cases unyt reverts to a floating point - # comparison. - try: - units.units - except AttributeError: - units = None - - return units - - self.field_units = [ - get_units(self.metadata.handle[x].attrs) for x in self.field_paths - ] - - return - - def load_field_descriptions(self): - """ - Loads in the text descriptions of the fields for each dataset. - For SOAP filetypes a description of the mask is included. - """ - - def get_desc(dataset): - try: - description = dataset.attrs["Description"].decode("utf-8") - except AttributeError: - # Description is saved as a string not bytes - description = dataset.attrs["Description"] - except KeyError: - # Can't load description! - description = "No description available" - - if self.metadata.filetype != "SOAP": - return description - - try: - is_masked = dataset.attrs["Masked"] - except KeyError: - is_masked = False - if not is_masked: - return description + " Not masked." - - # TODO: Update for https://github.com/SWIFTSIM/SOAP/pull/81 - # TODO: Also need to add metadata for each halo type for that PR - # If is_masked is true then these other attributes should exist - mask_datasets = dataset.attrs["Mask Datasets"] - mask_threshold = dataset.attrs["Mask Threshold"] - if len(mask_datasets) == 1: - mask_str = f" Only computed for objects with {mask_datasets[0]} >= {mask_threshold}." - else: - mask_str = f' Only computed for objects where {" + ".join(mask_datasets)} >= {mask_threshold}.' - return description + mask_str - - self.field_descriptions = [ - get_desc(self.metadata.handle[x]) for x in self.field_paths - ] - - return - - def load_field_compressions(self): - """ - Loads in the string describing the compression filters of the fields for each dataset. - """ - - def get_comp(dataset): - try: - # SOAP catalogues can be compressed/uncompressed - is_compressed = dataset.attrs["Is Compressed"] - except KeyError: - is_compressed = True - try: - comp = dataset.attrs["Lossy compression filter"].decode("utf-8") - except AttributeError: - # Compression is saved as str not bytes - comp = dataset.attrs["Lossy compression filter"] - except KeyError: - # Can't load compression string! - comp = "No compression info available" - - return comp if is_compressed else "Not compressed." - - self.field_compressions = [ - get_comp(self.metadata.handle[x]) for x in self.field_paths - ] - - return - - def load_cosmology(self): - """ - Loads in the field cosmologies. - """ - - current_scale_factor = self.scale_factor - - def get_cosmo(dataset): - try: - cosmo_exponent = dataset.attrs["a-scale exponent"][0] - except: - # Can't load, 'graceful' fallback. 
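
The exponent scheme implemented by get_units above deserves a concrete illustration: each dataset stores one attribute per base unit ("U_I exponent", "U_L exponent", "U_M exponent", "U_T exponent", "U_t exponent"), and the unit expression is the product of the base units raised to those powers. A minimal sketch, with made-up exponent values standing in for the dataset attributes:

    import unyt

    # Base units as a snapshot might define them (cgs-like, illustrative).
    base_units = {"I": unyt.A, "L": unyt.cm, "M": unyt.g, "T": unyt.K, "t": unyt.s}
    # A velocity-like dataset would store U_L exponent = 1, U_t exponent = -1.
    exponents = {"I": 0.0, "L": 1.0, "M": 0.0, "T": 0.0, "t": -1.0}

    units = 1.0
    for symbol, base in base_units.items():
        # Skip zero exponents explicitly, exactly as the reader does, to
        # avoid float-precision noise creeping into the unit expression.
        if exponents[symbol] != 0.0:
            units = units * base ** exponents[symbol]

    print(units)  # -> 1.0 cm/s; stays a plain 1.0 if every exponent is zero
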
- cosmo_exponent = 0.0 - - a_factor_this_dataset = a ** cosmo_exponent - - return cosmo_factor(a_factor_this_dataset, current_scale_factor) - - self.field_cosmologies = [ - get_cosmo(self.metadata.handle[x]) for x in self.field_paths - ] - - return - - def load_physical(self): - """ - Loads in whether the field is saved as comoving or physical. - """ - - def get_physical(dataset): - try: - physical = dataset.attrs["Value stored as physical"][0] == 1 - except: - physical = False - return physical - - self.field_physicals = [ - get_physical(self.metadata.handle[x]) for x in self.field_paths - ] - - return - - def load_valid_transforms(self): - """ - Loads in whether the field can be converted to comoving. - """ - - def get_valid_transform(dataset): - try: - valid_transform = ( - dataset.attrs["Property can be converted to comoving"][0] == 1 - ) - except: - valid_transform = True - return valid_transform - - self.field_valid_transforms = [ - get_valid_transform(self.metadata.handle[x]) for x in self.field_paths - ] - - return - - def load_named_columns(self): - """ - Loads the named column data for relevant fields. - """ - - named_columns = {} - - for field in self.field_paths: - property_name = field.split("/")[-1] - - if property_name in self.metadata.named_columns.keys(): - field_names = self.metadata.named_columns[property_name] - - # Now need to make a decision on capitalisation. If we have a set of - # words with only one capital in them, then it's likely that they are - # element names or something similar, so they should be lower case. - # If on average we have many more capitals, then they are likely to be - # ionized fractions (e.g. HeII) and so we want to leave them with their - # original capitalisation. - - num_capitals = lambda x: sum(1 for c in x if c.isupper()) - mean_num_capitals = sum(map(num_capitals, field_names)) / len( - field_names - ) - - if mean_num_capitals < 1.01: - # Decapitalise them as they are likely individual element names - formatted_field_names = [x.lower() for x in field_names] - else: - formatted_field_names = field_names - - named_columns[field] = formatted_field_names - else: - named_columns[field] = None - - self.named_columns = named_columns - - return - def generate_getter( filename, @@ -1722,7 +605,7 @@ def get_metadata(self): this function again if you mess things up. 
""" - self.metadata = SWIFTMetadata(self.filename, self.units) + self.metadata = metadata_discriminator(self.filename, self.units) return From 6b26291641e3300005039513e2887968fc18f467 Mon Sep 17 00:00:00 2001 From: Josh Borrow Date: Wed, 18 Sep 2024 17:03:58 +0200 Subject: [PATCH 2/7] Added test loading a SOAP catalogue --- tests/test_soap.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 tests/test_soap.py diff --git a/tests/test_soap.py b/tests/test_soap.py new file mode 100644 index 00000000..f835474b --- /dev/null +++ b/tests/test_soap.py @@ -0,0 +1,13 @@ +""" +Tests that we can open SOAP files +""" + +from tests.helper import requires + +from swiftsimio import load + +@requires("soap_example.hdf5") +def test_soap_can_load(filename): + data = load(filename) + + return From 988dc284693b26537bacb5481c6244cf8fc9c01f Mon Sep 17 00:00:00 2001 From: robjmcgibbon Date: Wed, 18 Sep 2024 16:15:32 +0100 Subject: [PATCH 3/7] Nice names --- swiftsimio/metadata/objects.py | 24 +++++++++++++----------- swiftsimio/reader.py | 8 +------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/swiftsimio/metadata/objects.py b/swiftsimio/metadata/objects.py index cdccc478..4624c01b 100644 --- a/swiftsimio/metadata/objects.py +++ b/swiftsimio/metadata/objects.py @@ -5,6 +5,7 @@ import h5py from swiftsimio.conversions import swift_cosmology_to_astropy from swiftsimio import metadata +from swiftsimio.objects import cosmo_array, cosmo_factor, a from abc import ABC, abstractmethod import re @@ -280,19 +281,10 @@ def get_desc(dataset): # Can't load description! description = "No description available" - if self.metadata.filetype != "SOAP": - return description - - try: - is_masked = dataset.attrs["Masked"] - except KeyError: - is_masked = False + is_masked = dataset.attrs.get("Masked", False) if not is_masked: return description + " Not masked." 
- # TODO: Update for https://github.com/SWIFTSIM/SOAP/pull/81 - # TODO: Also need to add metadata for each halo type for that PR - # If is_masked is true then these other attributes should exist mask_datasets = dataset.attrs["Mask Datasets"] mask_threshold = dataset.attrs["Mask Threshold"] if len(mask_datasets) == 1: @@ -1179,6 +1171,9 @@ def partial_snapshot(self) -> bool: return self.num_files_per_snapshot > 1 + @staticmethod + def get_nice_name(group): + return metadata.particle_types.particle_name_class[group] class SWIFTFOFMetadata(SWIFTMetadata): masking_valid: bool = False @@ -1211,6 +1206,9 @@ def present_group_names(self): """ return ["fof_groups"] + @staticmethod + def get_nice_name(group): + return "FOFGroups" class SWIFTSOAPMetadata(SWIFTMetadata): masking_valid: bool = True @@ -1244,4 +1242,8 @@ def present_group_names(self): return [ metadata.soap_types.get_soap_name_underscore(x) for x in self.present_groups - ] \ No newline at end of file + ] + + @staticmethod + def get_nice_name(group): + return metadata.soap_types.get_soap_name_nice(group) diff --git a/swiftsimio/reader.py b/swiftsimio/reader.py index 2fee88df..6a9f0654 100644 --- a/swiftsimio/reader.py +++ b/swiftsimio/reader.py @@ -395,13 +395,7 @@ def generate_datasets(group_metadata: SWIFTGroupMetadata, mask): group = group_metadata.group group_name = group_metadata.group_name - # Set nice name for group - if group_metadata.metadata.filetype == "snapshot": - group_nice_name = metadata.particle_types.particle_name_class[group] - elif group_metadata.metadata.filetype == "FOF": - group_nice_name = "FOFGroups" - elif group_metadata.metadata.filetype == "SOAP": - group_nice_name = metadata.soap_types.get_soap_name_nice(group) + group_nice_name = group_metadata.metadata.get_nice_name(group) # Mask is an object that contains all masks for all possible datasets. if mask is not None: From 276f4fed3de98f02392063199aa3683efefd9607 Mon Sep 17 00:00:00 2001 From: Josh Borrow Date: Wed, 18 Sep 2024 17:12:02 +0200 Subject: [PATCH 4/7] Packaging changes --- pyproject.toml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 52dbbecd..8407d060 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,28 +12,27 @@ packages = [ "swiftsimio.metadata.particle", "swiftsimio.metadata.unit", "swiftsimio.metadata.writer", + "swiftsimio.metadata.soap", "swiftsimio.visualisation", "swiftsimio.visualisation.projection_backends", "swiftsimio.visualisation.slice_backends", "swiftsimio.visualisation.tools", "swiftsimio.visualisation.smoothing_length", - "swiftsimio.metadata.soap" ] [project] name = "swiftsimio" -version="8.0.1" +version="9.0.0" authors = [ { name="Josh Borrow", email="josh@joshborrow.com" }, ] -description="SWIFTsim (swift.dur.ac.uk) i/o routines for python." +description="SWIFTsim (swiftsim.com) i/o routines for python." 
readme = "README.md" -requires-python = ">3.8.0" +requires-python = ">3.10.0" classifiers = [ - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)", "Operating System :: OS Independent", ] From 828d649e04d14338c52ca69dd93bbc59e1644788 Mon Sep 17 00:00:00 2001 From: Josh Borrow Date: Wed, 18 Sep 2024 17:19:31 +0200 Subject: [PATCH 5/7] Added optional named column metadata --- swiftsimio/metadata/objects.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/swiftsimio/metadata/objects.py b/swiftsimio/metadata/objects.py index 4624c01b..14928bfd 100644 --- a/swiftsimio/metadata/objects.py +++ b/swiftsimio/metadata/objects.py @@ -399,7 +399,10 @@ def load_named_columns(self): for field in self.field_paths: property_name = field.split("/")[-1] - if property_name in self.metadata.named_columns.keys(): + # Not all datasets have named columns + named_columns_metadata = getattr(self.metadata, "named_columns", {}) + + if property_name in named_columns_metadata.keys(): field_names = self.metadata.named_columns[property_name] # Now need to make a decision on capitalisation. If we have a set of From aca50486b32ffb3c61cb72b1a5c855713d9e2d3a Mon Sep 17 00:00:00 2001 From: robjmcgibbon Date: Wed, 18 Sep 2024 16:29:05 +0100 Subject: [PATCH 6/7] Add shared cell counts --- swiftsimio/__init__.py | 2 +- swiftsimio/masks.py | 8 ++++---- swiftsimio/metadata/objects.py | 2 ++ 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/swiftsimio/__init__.py b/swiftsimio/__init__.py index ba19d453..4df92f4d 100644 --- a/swiftsimio/__init__.py +++ b/swiftsimio/__init__.py @@ -75,7 +75,7 @@ def mask(filename, spatial_only=True) -> SWIFTMask: """ units = SWIFTUnits(filename) - metadata = SWIFTMetadata(filename, units) + metadata = metadata_discriminator(filename, units) return SWIFTMask(metadata=metadata, spatial_only=spatial_only) diff --git a/swiftsimio/masks.py b/swiftsimio/masks.py index 1c65f6cf..e403a4c2 100644 --- a/swiftsimio/masks.py +++ b/swiftsimio/masks.py @@ -110,12 +110,12 @@ def _unpack_cell_metadata(self): for group, group_name in zip( self.metadata.present_groups, self.metadata.present_group_names ): - if self.metadata.filetype == "SOAP": - counts = count_handle["Subhalos"][:] - offsets = offset_handle["Subhalos"][:] - elif self.metadata.filetype == "snapshot": + if self.metadata.shared_cell_counts is None: counts = count_handle[group][:] offsets = offset_handle[group][:] + else: + counts = count_handle[self.metadata.shared_cell_counts][:] + offsets = offset_handle[self.metadata.shared_cell_counts][:] # When using MPI, we cannot assume that these are sorted. 
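
The counts/offsets branch above is the substance of this commit: SOAP catalogues store a single top-level set of cell counts and offsets (keyed "Subhalos") shared by every halo type, whereas snapshots store one set per particle group. Schematically, and assuming h5py-style group handles, the dispatch reduces to the sketch below; the MPI sort handling that follows it is unchanged.

    def cell_counts_and_offsets(metadata, count_handle, offset_handle, group):
        # shared_cell_counts is None for snapshots (per-group arrays) and a
        # key such as "Subhalos" for SOAP (one array shared by all types).
        key = metadata.shared_cell_counts or group
        return count_handle[key][:], offset_handle[key][:]
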
if sort is None: diff --git a/swiftsimio/metadata/objects.py b/swiftsimio/metadata/objects.py index 14928bfd..95dad3da 100644 --- a/swiftsimio/metadata/objects.py +++ b/swiftsimio/metadata/objects.py @@ -586,6 +586,7 @@ class SWIFTMetadata(ABC): units: SWIFTUnits header: dict masking_valid: bool = False + shared_cell_counts: str|None = None @abstractmethod def __init__(self, filename, units: SWIFTUnits): @@ -1215,6 +1216,7 @@ def get_nice_name(group): class SWIFTSOAPMetadata(SWIFTMetadata): masking_valid: bool = True + shared_cell_counts: str|None = "Subhalos" def __init__(self, filename: str, units: SWIFTUnits): self.filename = filename From e6b4a622b928245bdb5052b49072c065ad30be41 Mon Sep 17 00:00:00 2001 From: Josh Borrow Date: Thu, 19 Sep 2024 08:56:02 +0200 Subject: [PATCH 7/7] Formatting and cleanup --- swiftsimio/masks.py | 4 +- swiftsimio/metadata/objects.py | 875 +++++++++++++++++---------------- swiftsimio/reader.py | 8 +- tests/test_soap.py | 1 + 4 files changed, 471 insertions(+), 417 deletions(-) diff --git a/swiftsimio/masks.py b/swiftsimio/masks.py index e403a4c2..b3f406ca 100644 --- a/swiftsimio/masks.py +++ b/swiftsimio/masks.py @@ -51,7 +51,9 @@ def __init__(self, metadata: SWIFTMetadata, spatial_only=True): self.spatial_only = spatial_only if not self.metadata.masking_valid: - raise NotImplementedError(f"Masking not supported for {self.metadata.output_type} filetype") + raise NotImplementedError( + f"Masking not supported for {self.metadata.output_type} filetype" + ) if self.metadata.partial_snapshot: raise InvalidSnapshot( diff --git a/swiftsimio/metadata/objects.py b/swiftsimio/metadata/objects.py index 95dad3da..e3bd3b00 100644 --- a/swiftsimio/metadata/objects.py +++ b/swiftsimio/metadata/objects.py @@ -1,3 +1,11 @@ +""" +Objects describing the metadata in SWIFTsimIO files. There is a main +abstract class, ``SWIFTMetadata``, that contains the required base +methods to correctly represent the internal representation of an +HDF5 file to what SWIFTsimIO expects to be able to unpack into the +object notation (e.g. PartType0/Coordinates -> gas.coordinates). +""" + import numpy as np import unyt @@ -16,6 +24,334 @@ from typing import List, Optional + +class SWIFTMetadata(ABC): + """ + An abstract base class for all SWIFT-related file metadata. + """ + + # Underlying path to the file that this metadata is associated with. + filename: str + # The units object associated with this file. All SWIFT metadata objects + # must use this units system. + units: "SWIFTUnits" + # The header dictionary which will later be unpackaged according to the + # metadata fields. + header: dict + # Whether this type of file can be masked or not (this is a fixed parameter + # that should probably not be changed at run-time). + masking_valid: bool = False + # Whether this file uses shared metadata cell counts for all particle types + # (as is the case in SOAP) or whether each type (e.g. Gas, Dark Matter, etc.) + # has its own top-level cell grid counts. + shared_cell_counts: str | None = None + + @abstractmethod + def __init__(self, filename, units: "SWIFTUnits"): + raise NotImplementedError + + @property + def handle(self): + # Handle, which is shared with units. Units handles + # file opening and closing. 
+ return self.units.handle + + def load_groups(self): + """ + Loads the groups and metadata into objects: + + metadata._properties + + This contains eight arrays, + + metadata._properties.field_names + metadata._properties.field_paths + metadata._properties.field_units + metadata._properties.field_cosmologies + metadata._properties.field_descriptions + metadata._properties.field_compressions + metadata._properties.field_physicals + metadata._properties.field_valid_transforms + + As well as some more information about the group. + """ + + for group, name in zip(self.present_groups, self.present_group_names): + filetype_metadata = SWIFTGroupMetadata( + group=group, + group_name=name, + metadata=self, + scale_factor=self.scale_factor, + ) + setattr(self, f"{name}_properties", filetype_metadata) + + return + + def get_metadata(self): + """ + Loads the metadata as specified in metadata.metadata_fields. + """ + + for field, name in metadata.metadata_fields.metadata_fields_to_read.items(): + try: + setattr(self, name, dict(self.handle[field].attrs)) + except KeyError: + setattr(self, name, None) + + return + + def postprocess_header(self): + """ + Some minor postprocessing on the header to local variables. + """ + + # These are just read straight in to variables + header_unpack_arrays_units = metadata.metadata_fields.generate_units_header_unpack_arrays( + m=self.units.mass, + l=self.units.length, + t=self.units.time, + I=self.units.current, + T=self.units.temperature, + ) + + for field, name in metadata.metadata_fields.header_unpack_arrays.items(): + try: + if name in header_unpack_arrays_units.keys(): + setattr( + self, + name, + unyt.unyt_array( + self.header[field], units=header_unpack_arrays_units[name] + ), + ) + # This is required or we automatically get everything in CGS! + getattr(self, name).convert_to_units( + header_unpack_arrays_units[name] + ) + else: + # Must not have any units! Oh well. + setattr(self, name, self.header[field]) + except KeyError: + # Must not be present, just skip it + continue + + # Now unpack the 'mass table' type items: + for field, name in metadata.metadata_fields.header_unpack_mass_tables.items(): + try: + setattr( + self, + name, + MassTable( + base_mass_table=self.header[field], mass_units=self.units.mass + ), + ) + except KeyError: + setattr( + self, + name, + MassTable( + base_mass_table=np.zeros( + len(metadata.particle_types.particle_name_underscores) + ), + mass_units=self.units.mass, + ), + ) + + # These must be unpacked as 'real' strings (i.e. converted to utf-8) + + for field, name in metadata.metadata_fields.header_unpack_string.items(): + try: + # Deal with h5py's quirkiness that fixed-sized and variable-sized + # strings are read as strings or bytes + # See: https://github.com/h5py/h5py/issues/2172 + raw = self.header[field] + try: + string = raw.decode("utf-8") + except AttributeError: + string = raw + setattr(self, name, string) + except KeyError: + # Must not be present, just skip it + setattr(self, name, "") + + # These must be unpacked as they are stored as length-1 arrays + + header_unpack_float_units = metadata.metadata_fields.generate_units_header_unpack_single_float( + m=self.units.mass, + l=self.units.length, + t=self.units.time, + I=self.units.current, + T=self.units.temperature, + ) + + for field, names in metadata.metadata_fields.header_unpack_single_float.items(): + try: + if isinstance(names, list): + # Sometimes we store a list in case we have multiple names, for example + # Redshift -> metadata.redshift AND metadata.z. 
Can't just do the iteration + # because we may loop over the letters in the string. + for variable in names: + if variable in header_unpack_float_units.keys(): + # We have an associated unit! + unit = header_unpack_float_units[variable] + setattr( + self, + variable, + unyt.unyt_quantity(self.header[field][0], units=unit), + ) + else: + # No unit + setattr(self, variable, self.header[field][0]) + else: + # We can just check for the unit and set the attribute + variable = names + if variable in header_unpack_float_units.keys(): + # We have an associated unit! + unit = header_unpack_float_units[variable] + setattr( + self, + variable, + unyt.unyt_quantity(self.header[field][0], units=unit), + ) + else: + # No unit + setattr(self, variable, self.header[field][0]) + except KeyError: + # Must not be present, just skip it + continue + + # These are special cases, sorry! + # Date and time of snapshot dump + try: + try: + # Try and decode bytes, otherwise save raw string + snapshot_date = self.header.get( + "SnapshotDate", self.header.get("Snapshot date", b"") + ).decode("utf-8") + except AttributeError: + snapshot_date = self.header.get( + "SnapshotDate", self.header.get("Snapshot date", "") + ) + try: + self.snapshot_date = datetime.strptime( + snapshot_date, "%H:%M:%S %Y-%m-%d %Z" + ) + except ValueError: + # Backwards compatibility; this was used previously due to simplicity + # but is not portable between regions. So if you ran a simulation on + # a British (en_GB) machine, and then tried to read on a Dutch + # machine (nl_NL), this would _not_ work because %c is different. + try: + self.snapshot_date = datetime.strptime(snapshot_date, "%c\n") + except ValueError: + # Oh dear this has gone _very_wrong. Let's just keep it as a string. + self.snapshot_date = snapshot_date + except KeyError: + # Old file + pass + + # get photon group edges RT dataset from the SubgridScheme group + try: + self.photon_group_edges = ( + self.handle["SubgridScheme/PhotonGroupEdges"][:] / self.units.time + ) + except KeyError: + self.photon_group_edges = None + + # get reduced speed of light RT dataset from the SubgridScheme group + try: + self.reduced_lightspeed = ( + self.handle["SubgridScheme/ReducedLightspeed"][0] + * self.units.length + / self.units.time + ) + except KeyError: + self.reduced_lightspeed = None + + # Store these separately as self.n_gas = number of gas particles for example + for (part_number, (_, part_name)) in enumerate( + metadata.particle_types.particle_name_underscores.items() + ): + try: + setattr(self, f"n_{part_name}", self.num_part[part_number]) + except IndexError: + # Backwards compatibility; mass/number table can change size. + setattr(self, f"n_{part_name}", 0) + + # Need to unpack the gas gamma for cosmology + try: + self.gas_gamma = self.hydro_scheme["Adiabatic index"] + except (KeyError, TypeError): + # We can set a default and print a message whenever we require this value + self.gas_gamma = None + + try: + self.a = self.scale_factor + except AttributeError: + # These must always be present for the initialisation of cosmology properties + self.a = 1.0 + self.scale_factor = 1.0 + + return + + def extract_cosmology(self): + """ + Creates an astropy.cosmology object from the internal cosmology system. + + This will be saved as ``self.cosmology``. 
+ """ + + if self.cosmology_raw is not None: + cosmo = self.cosmology_raw + else: + cosmo = {"Cosmological run": 0} + + if cosmo.get("Cosmological run", 0): + self.cosmology = swift_cosmology_to_astropy(cosmo, units=self.units) + else: + self.cosmology = None + + return + + @property + @abstractmethod + def present_groups(self) -> list[str]: + """ + A property giving the present particle groups in the file to be unpackaged + into top-level properties. For instance, in a regular snapshot, this would be + ["PartType0", "PartType1", "PartType4", ...]. In SOAP, this would be + ["SO/200_crit", "SO/200_mean", ...], i.e. one per aperture. + """ + raise NotImplementedError + + @property + @abstractmethod + def present_group_names(self) -> list[str]: + """ + A property giving the mapping for the names in ``present_groups`` to what the + objects are called on the SWIFTsimIO objects. For instance, in a regular snapshot, + this would be ["gas", "dark_matter", "stars", ...]. In SOAP, this would be + ["spherical_overdensity_200_crit", ...]. + """ + raise NotImplementedError + + @property + def partial_snapshot(self) -> bool: + """ + A property defining whether this is a partial snapshot (e.g. a `.0.hdf5` file) or + a full/virtual snapsoht covering all particles. This must be computed at run-time. + """ + return False + + @staticmethod + @abstractmethod + def get_nice_name(group: str) -> str: + """ + Converts the group name to a 'nice name' (i.e. for printing) for the SWIFTsimIO objects. + """ + raise NotImplementedError + + class MassTable(object): """ Extracts a mass table to local variables based on the @@ -114,7 +450,6 @@ def __str__(self): def __repr__(self): return f"{self.__str__()}. Raw data: " "\n" f"{self.data}." - class SWIFTGroupMetadata(object): @@ -148,7 +483,11 @@ class SWIFTGroupMetadata(object): """ def __init__( - self, group: str, group_name: str, metadata: "SWIFTMetadata", scale_factor: float + self, + group: str, + group_name: str, + metadata: "SWIFTMetadata", + scale_factor: float, ): """ Constructor for SWIFTGroupMetadata class @@ -460,430 +799,128 @@ def __init__(self, filename: Path, handle: Optional[h5py.File] = None): """ SWIFTUnits constructor - Sets filename for file to read units from and gets unit dictionary - - Parameters - ---------- - - filename : Path - Name of file to read units from - - handle: h5py.File, optional - The h5py file handle, optional. Will open a new handle with the - filename if required. - - """ - self.filename = filename - self._handle = handle - - self.get_unit_dictionary() - - return - - @property - def handle(self): - """ - Property that gets the file handle, which can be shared - with other objects for efficiency reasons. - """ - if isinstance(self._handle, h5py.File): - # Can be open or closed, let's test. - try: - file = self._handle.file - - return self._handle - except ValueError: - # This will be the case if there is no active file handle - pass - - self._handle = h5py.File(self.filename, "r") - - return self._handle - - def get_unit_dictionary(self): - """ - Store unit data and metadata - - Length 1 arrays are used to store the unit data. This dictionary - also contains the metadata information that connects the unyt - objects to the names that are stored in the SWIFT snapshots. - """ - - self.units = { - name: unyt.unyt_quantity( - value[0], units=metadata.unit_types.unit_names_to_unyt[name] - ) - for name, value in self.handle["Units"].attrs.items() - } - - # We now unpack this into variables. 
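
The abstract interface laid out above (present_groups, present_group_names, get_nice_name, plus the masking_valid and shared_cell_counts class attributes) is everything a new output type has to provide. A minimal, hypothetical subclass mirroring the initialisation order used by the concrete readers further down; this is a sketch of the contract, not part of the patch:

    class SWIFTExampleMetadata(SWIFTMetadata):
        # No cells metadata in this hypothetical output type, so no masking.
        masking_valid: bool = False

        def __init__(self, filename, units):
            self.filename = filename
            self.units = units

            self.get_metadata()
            self.postprocess_header()
            self.load_groups()

            self.handle.close()

        @property
        def present_groups(self):
            # HDF5 group names as they appear in the file.
            return ["Examples"]

        @property
        def present_group_names(self):
            # Attribute names exposed on the loaded SWIFTsimIO object.
            return ["examples"]

        @staticmethod
        def get_nice_name(group):
            return "Examples"
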
- self.mass = metadata.unit_types.find_nearest_base_unit( - self.units["Unit mass in cgs (U_M)"], "mass" - ) - self.length = metadata.unit_types.find_nearest_base_unit( - self.units["Unit length in cgs (U_L)"], "length" - ) - self.time = metadata.unit_types.find_nearest_base_unit( - self.units["Unit time in cgs (U_t)"], "time" - ) - self.current = metadata.unit_types.find_nearest_base_unit( - self.units["Unit current in cgs (U_I)"], "current" - ) - self.temperature = metadata.unit_types.find_nearest_base_unit( - self.units["Unit temperature in cgs (U_T)"], "temperature" - ) - - def __del__(self): - if isinstance(self._handle, h5py.File): - self._handle.close() - - - -def metadata_discriminator(filename: str, units: SWIFTUnits) -> "SWIFTMetadata": - """ - Discriminates between the different types of metadata objects read from SWIFT-compatible - files. - - Parameters - ---------- - - filename : str - Name of the file to read metadata from - - units : SWIFTUnits - The units object associated with the file - - - Returns - ------- - - SWIFTMetadata - The appropriate metadata object for the file type - """ - # Old snapshots did not have this attribute, so we need to default to FullVolume - file_type = units.handle["Header"].attrs.get("OutputType", "FullVolume") - - if isinstance(file_type, bytes): - file_type = file_type.decode("utf-8") - - if file_type in ["FullVolume"]: - return SWIFTSnapshotMetadata(filename, units) - elif file_type in ["SOAP"]: - return SWIFTSOAPMetadata(filename, units) - elif file_type in ["FOF"]: - return SWIFTFOFMetadata(filename, units) - else: - raise ValueError(f"File type {file_type} not recognised.") - - - -class SWIFTMetadata(ABC): - """ - An abstract base class for all SWIFT-related file metadata. - """ - - filename: str - units: SWIFTUnits - header: dict - masking_valid: bool = False - shared_cell_counts: str|None = None - - @abstractmethod - def __init__(self, filename, units: SWIFTUnits): - raise NotImplementedError - - @property - def handle(self): - # Handle, which is shared with units. Units handles - # file opening and closing. - return self.units.handle - - def load_groups(self): - """ - Loads the groups and metadata into objects: - - metadata._properties - - This contains eight arrays, - - metadata._properties.field_names - metadata._properties.field_paths - metadata._properties.field_units - metadata._properties.field_cosmologies - metadata._properties.field_descriptions - metadata._properties.field_compressions - metadata._properties.field_physicals - metadata._properties.field_valid_transforms - - As well as some more information about the group. - """ - - for group, name in zip(self.present_groups, self.present_group_names): - filetype_metadata = SWIFTGroupMetadata( - group=group, - group_name=name, - metadata=self, - scale_factor=self.scale_factor, - ) - setattr(self, f"{name}_properties", filetype_metadata) - - return - - def get_metadata(self): - """ - Loads the metadata as specified in metadata.metadata_fields. - """ - - for field, name in metadata.metadata_fields.metadata_fields_to_read.items(): - try: - setattr(self, name, dict(self.handle[field].attrs)) - except KeyError: - setattr(self, name, None) - - return - - def postprocess_header(self): - """ - Some minor postprocessing on the header to local variables. 
- """ - - # These are just read straight in to variables - header_unpack_arrays_units = metadata.metadata_fields.generate_units_header_unpack_arrays( - m=self.units.mass, - l=self.units.length, - t=self.units.time, - I=self.units.current, - T=self.units.temperature, - ) - - for field, name in metadata.metadata_fields.header_unpack_arrays.items(): - try: - if name in header_unpack_arrays_units.keys(): - setattr( - self, - name, - unyt.unyt_array( - self.header[field], units=header_unpack_arrays_units[name] - ), - ) - # This is required or we automatically get everything in CGS! - getattr(self, name).convert_to_units( - header_unpack_arrays_units[name] - ) - else: - # Must not have any units! Oh well. - setattr(self, name, self.header[field]) - except KeyError: - # Must not be present, just skip it - continue - - # Now unpack the 'mass table' type items: - for field, name in metadata.metadata_fields.header_unpack_mass_tables.items(): - try: - setattr( - self, - name, - MassTable( - base_mass_table=self.header[field], mass_units=self.units.mass - ), - ) - except KeyError: - setattr( - self, - name, - MassTable( - base_mass_table=np.zeros( - len(metadata.particle_types.particle_name_underscores) - ), - mass_units=self.units.mass, - ), - ) - - # These must be unpacked as 'real' strings (i.e. converted to utf-8) - - for field, name in metadata.metadata_fields.header_unpack_string.items(): - try: - # Deal with h5py's quirkiness that fixed-sized and variable-sized - # strings are read as strings or bytes - # See: https://github.com/h5py/h5py/issues/2172 - raw = self.header[field] - try: - string = raw.decode("utf-8") - except AttributeError: - string = raw - setattr(self, name, string) - except KeyError: - # Must not be present, just skip it - setattr(self, name, "") - - # These must be unpacked as they are stored as length-1 arrays + Sets filename for file to read units from and gets unit dictionary - header_unpack_float_units = metadata.metadata_fields.generate_units_header_unpack_single_float( - m=self.units.mass, - l=self.units.length, - t=self.units.time, - I=self.units.current, - T=self.units.temperature, - ) + Parameters + ---------- - for field, names in metadata.metadata_fields.header_unpack_single_float.items(): - try: - if isinstance(names, list): - # Sometimes we store a list in case we have multiple names, for example - # Redshift -> metadata.redshift AND metadata.z. Can't just do the iteration - # because we may loop over the letters in the string. - for variable in names: - if variable in header_unpack_float_units.keys(): - # We have an associated unit! - unit = header_unpack_float_units[variable] - setattr( - self, - variable, - unyt.unyt_quantity(self.header[field][0], units=unit), - ) - else: - # No unit - setattr(self, variable, self.header[field][0]) - else: - # We can just check for the unit and set the attribute - variable = names - if variable in header_unpack_float_units.keys(): - # We have an associated unit! - unit = header_unpack_float_units[variable] - setattr( - self, - variable, - unyt.unyt_quantity(self.header[field][0], units=unit), - ) - else: - # No unit - setattr(self, variable, self.header[field][0]) - except KeyError: - # Must not be present, just skip it - continue + filename : Path + Name of file to read units from - # These are special cases, sorry! 
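
Scalar header unpacking above follows a single pattern: SWIFT writes scalars as length-1 arrays, one field may fan out to several attribute names (Redshift becomes both metadata.redshift and metadata.z), and a unit is attached whenever the target name has one registered. A reduced sketch of that pattern, with illustrative field and unit tables:

    import unyt

    header = {"Redshift": [2.0], "Time": [0.25]}  # attrs as read from HDF5
    name_map = {"Redshift": ["redshift", "z"], "Time": "time"}
    unit_map = {"time": unyt.s}  # only some target names carry units

    unpacked = {}
    for field, names in name_map.items():
        # Normalise the one-or-many mapping to a list so a bare string is
        # not iterated letter by letter, then unpack the length-1 array.
        for variable in names if isinstance(names, list) else [names]:
            value = header[field][0]
            if variable in unit_map:
                value = unyt.unyt_quantity(value, units=unit_map[variable])
            unpacked[variable] = value

    print(unpacked["z"], unpacked["time"])  # -> 2.0 0.25 s
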
- # Date and time of snapshot dump - try: - try: - # Try and decode bytes, otherwise save raw string - snapshot_date = self.header.get( - "SnapshotDate", self.header.get("Snapshot date", b"") - ).decode("utf-8") - except AttributeError: - snapshot_date = self.header.get( - "SnapshotDate", self.header.get("Snapshot date", "") - ) - try: - self.snapshot_date = datetime.strptime( - snapshot_date, "%H:%M:%S %Y-%m-%d %Z" - ) - except ValueError: - # Backwards compatibility; this was used previously due to simplicity - # but is not portable between regions. So if you ran a simulation on - # a British (en_GB) machine, and then tried to read on a Dutch - # machine (nl_NL), this would _not_ work because %c is different. - try: - self.snapshot_date = datetime.strptime(snapshot_date, "%c\n") - except ValueError: - # Oh dear this has gone _very_wrong. Let's just keep it as a string. - self.snapshot_date = snapshot_date - except KeyError: - # Old file - pass + handle: h5py.File, optional + The h5py file handle, optional. Will open a new handle with the + filename if required. - # get photon group edges RT dataset from the SubgridScheme group - try: - self.photon_group_edges = ( - self.handle["SubgridScheme/PhotonGroupEdges"][:] / self.units.time - ) - except KeyError: - self.photon_group_edges = None + """ + self.filename = filename + self._handle = handle - # get reduced speed of light RT dataset from the SubgridScheme group - try: - self.reduced_lightspeed = ( - self.handle["SubgridScheme/ReducedLightspeed"][0] - * self.units.length - / self.units.time - ) - except KeyError: - self.reduced_lightspeed = None + self.get_unit_dictionary() - # Store these separately as self.n_gas = number of gas particles for example - for (part_number, (_, part_name)) in enumerate( - metadata.particle_types.particle_name_underscores.items() - ): + return + + @property + def handle(self): + """ + Property that gets the file handle, which can be shared + with other objects for efficiency reasons. + """ + if isinstance(self._handle, h5py.File): + # Can be open or closed, let's test. try: - setattr(self, f"n_{part_name}", self.num_part[part_number]) - except IndexError: - # Backwards compatibility; mass/number table can change size. - setattr(self, f"n_{part_name}", 0) + file = self._handle.file - # Need to unpack the gas gamma for cosmology - try: - self.gas_gamma = self.hydro_scheme["Adiabatic index"] - except (KeyError, TypeError): - # We can set a default and print a message whenever we require this value - self.gas_gamma = None + return self._handle + except ValueError: + # This will be the case if there is no active file handle + pass - try: - self.a = self.scale_factor - except AttributeError: - # These must always be present for the initialisation of cosmology properties - self.a = 1.0 - self.scale_factor = 1.0 + self._handle = h5py.File(self.filename, "r") - return - + return self._handle - def extract_cosmology(self): + def get_unit_dictionary(self): """ - Creates an astropy.cosmology object from the internal cosmology system. + Store unit data and metadata - This will be saved as ``self.cosmology``. + Length 1 arrays are used to store the unit data. This dictionary + also contains the metadata information that connects the unyt + objects to the names that are stored in the SWIFT snapshots. 
""" - if self.cosmology_raw is not None: - cosmo = self.cosmology_raw - else: - cosmo = {"Cosmological run": 0} + self.units = { + name: unyt.unyt_quantity( + value[0], units=metadata.unit_types.unit_names_to_unyt[name] + ) + for name, value in self.handle["Units"].attrs.items() + } - if cosmo.get("Cosmological run", 0): - self.cosmology = swift_cosmology_to_astropy(cosmo, units=self.units) - else: - self.cosmology = None + # We now unpack this into variables. + self.mass = metadata.unit_types.find_nearest_base_unit( + self.units["Unit mass in cgs (U_M)"], "mass" + ) + self.length = metadata.unit_types.find_nearest_base_unit( + self.units["Unit length in cgs (U_L)"], "length" + ) + self.time = metadata.unit_types.find_nearest_base_unit( + self.units["Unit time in cgs (U_t)"], "time" + ) + self.current = metadata.unit_types.find_nearest_base_unit( + self.units["Unit current in cgs (U_I)"], "current" + ) + self.temperature = metadata.unit_types.find_nearest_base_unit( + self.units["Unit temperature in cgs (U_T)"], "temperature" + ) + + def __del__(self): + if isinstance(self._handle, h5py.File): + self._handle.close() + + +def metadata_discriminator(filename: str, units: SWIFTUnits) -> "SWIFTMetadata": + """ + Discriminates between the different types of metadata objects read from SWIFT-compatible + files. + + Parameters + ---------- + + filename : str + Name of the file to read metadata from + + units : SWIFTUnits + The units object associated with the file - return - - @property - @abstractmethod - def present_groups(self): - raise NotImplementedError - - @property - @abstractmethod - def present_group_names(self): - raise NotImplementedError - @property - def partial_snapshot(self) -> bool: - return False + Returns + ------- -class SWIFTSnapshotMetadata(SWIFTMetadata): + SWIFTMetadata + The appropriate metadata object for the file type """ - Loads all metadata (apart from Units, those are handled by SWIFTUnits) - into dictionaries. + # Old snapshots did not have this attribute, so we need to default to FullVolume + file_type = units.handle["Header"].attrs.get("OutputType", "FullVolume") + + if isinstance(file_type, bytes): + file_type = file_type.decode("utf-8") + + if file_type in ["FullVolume"]: + return SWIFTSnapshotMetadata(filename, units) + elif file_type in ["SOAP"]: + return SWIFTSOAPMetadata(filename, units) + elif file_type in ["FOF"]: + return SWIFTFOFMetadata(filename, units) + else: + raise ValueError(f"File type {file_type} not recognised.") + - This also does some extra parsing on some well-used metadata. +class SWIFTSnapshotMetadata(SWIFTMetadata): + """ + SWIFT Metadata for a snapshot-style file containing particle + information. For more documentation, see the main :cls:`SWIFTMetadata` + class. """ - # Name of the file that has been read from - filename: str - # Unit instance associated with this file - units: SWIFTUnits - # Header dictionary, metadata about snapshot. - header: dict masking_valid: bool = True def __init__(self, filename, units: SWIFTUnits): @@ -1002,7 +1039,6 @@ def convert(name): return - @property def present_groups(self): """ @@ -1174,13 +1210,18 @@ def partial_snapshot(self) -> bool: # collating multiple sub-snapshots together have num_files_per_snapshot = 1. return self.num_files_per_snapshot > 1 - + @staticmethod def get_nice_name(group): return metadata.particle_types.particle_name_class[group] + class SWIFTFOFMetadata(SWIFTMetadata): - masking_valid: bool = False + """ + SWIFT Metadata for a snapshot-style file containing particle + information. 
For more documentation, see the main :cls:`SWIFTMetadata` + class. + """ def __init__(self, filename: str, units: SWIFTUnits): self.filename = filename @@ -1195,28 +1236,35 @@ def __init__(self, filename: str, units: SWIFTUnits): self.handle.close() return - + @property def present_groups(self): """ The groups containing datasets that are present in the file. """ return ["Groups"] - + @property def present_group_names(self): """ The names of the groups that we want to expose. """ return ["fof_groups"] - + @staticmethod def get_nice_name(group): return "FOFGroups" + class SWIFTSOAPMetadata(SWIFTMetadata): + """ + SWIFT Metadata for a snapshot-style file containing particle + information. For more documentation, see the main :cls:`SWIFTMetadata` + class. + """ + masking_valid: bool = True - shared_cell_counts: str|None = "Subhalos" + shared_cell_counts: str = "Subhalos" def __init__(self, filename: str, units: SWIFTUnits): self.filename = filename @@ -1231,22 +1279,21 @@ def __init__(self, filename: str, units: SWIFTUnits): self.handle.close() return - + @property def present_groups(self): """ The groups containing datasets that are present in the file. """ return self.subhalo_types - + @property def present_group_names(self): """ The names of the groups that we want to expose. """ return [ - metadata.soap_types.get_soap_name_underscore(x) - for x in self.present_groups + metadata.soap_types.get_soap_name_underscore(x) for x in self.present_groups ] @staticmethod diff --git a/swiftsimio/reader.py b/swiftsimio/reader.py index 6a9f0654..cc06f571 100644 --- a/swiftsimio/reader.py +++ b/swiftsimio/reader.py @@ -15,7 +15,12 @@ from swiftsimio.accelerated import read_ranges_from_file from swiftsimio.objects import cosmo_array, cosmo_factor, a -from swiftsimio.metadata.objects import metadata_discriminator, SWIFTUnits, SWIFTGroupMetadata, SWIFTMetadata +from swiftsimio.metadata.objects import ( + metadata_discriminator, + SWIFTUnits, + SWIFTGroupMetadata, + SWIFTMetadata, +) import re import h5py @@ -29,7 +34,6 @@ from typing import Union, Callable, List, Optional - def generate_getter( filename, name: str, diff --git a/tests/test_soap.py b/tests/test_soap.py index f835474b..0a6a4b0d 100644 --- a/tests/test_soap.py +++ b/tests/test_soap.py @@ -6,6 +6,7 @@ from swiftsimio import load + @requires("soap_example.hdf5") def test_soap_can_load(filename): data = load(filename)
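
As a closing illustration of what the series achieves end to end: load() now routes through metadata_discriminator, so snapshot, SOAP, and FOF files all come back with the appropriate metadata class, group names, and nice names, with no filetype branching left in the reader. A usage sketch in the spirit of the new test (the FOF filename is hypothetical):

    from swiftsimio import load

    # SOAP catalogue: groups appear under underscore names such as
    # halos.spherical_overdensity_200_crit.
    halos = load("soap_example.hdf5")
    print(type(halos.metadata).__name__)  # -> SWIFTSOAPMetadata

    # FOF catalogue: the single "Groups" HDF5 group appears as fof.fof_groups.
    fof = load("fof_example.hdf5")
    print(type(fof.metadata).__name__)  # -> SWIFTFOFMetadata
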