Initial implementation of working TESCAN parser

FAIRmat-NFDI · Aug 7, 2024 · d054a90 · d054a90
1 parent bcec9ea
commit d054a90
Show file tree

Hide file tree

Showing 5 changed files with 298 additions and 15 deletions.
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -13,8 +13,7 @@
             "args": ["convert",
                      // "examples/eln_data.yaml",
                      // "examples/em.oasis.specific.yaml",
-                     "../ebic_dm3_goette/documents-export-2024-06-06/SEM/20240227_A1_2m_0_FA3_1.txt",
-                     "../ebic_dm3_goette/documents-export-2024-06-06/SEM/20240227_A1_2m_0_FA3_1.tif",
+                     "../tescan/CZ04-2_102_Pic_2.tif",
                      "--reader",
                      "em",
                      "--nxdl",

diff --git a/src/pynxtools_em/configurations/image_tiff_tescan_cfg.py b/src/pynxtools_em/configurations/image_tiff_tescan_cfg.py
@@ -0,0 +1,54 @@
+#
+# Copyright The NOMAD Authors.
+#
+# This file is part of NOMAD. See https://nomad-lab.eu for further info.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Configuration of the image_tiff_tescan parser."""
+
+from pint import UnitRegistry
+
+ureg = UnitRegistry()
+
+
+TESCAN_VARIOUS_DYNAMIC_TO_NX_EM = {
+    "prefix_trg": "/ENTRY[entry*]/measurement/EVENT_DATA_EM_SET[event_data_em_set]/EVENT_DATA_EM[event_data_em*]",
+    "prefix_src": "",
+    "map_to_f8": [
+        ("em_lab/OPTICAL_SYSTEM_EM[optical_system_em]/magnification", "Magnification"),
+        (
+            "em_lab/OPTICAL_SYSTEM_EM[optical_system_em]/working_distance",
+            ureg.centimeter,
+            "WD",
+            ureg.meter,
+        ),
+        (
+            "em_lab/EBEAM_COLUMN[ebeam_column]/electron_source/voltage",
+            ureg.millivolt,
+            "HV",
+            ureg.kilovolt,
+        ),
+    ],
+}
+
+
+TESCAN_VARIOUS_STATIC_TO_NX_EM = {
+    "prefix_trg": "/ENTRY[entry*]/measurement/em_lab",
+    "prefix_src": "",
+    "use": [("FABRICATION[fabrication]/vendor", "TESCAN")],
+    "map": [
+        ("FABRICATION[fabrication]/model", "Device"),
+        ("FABRICATION[fabrication]/identifier", "SerialNumber"),
+    ],
+}
diff --git a/src/pynxtools_em/parsers/image_tiff_jeol.py b/src/pynxtools_em/parsers/image_tiff_jeol.py
@@ -120,12 +120,12 @@ def check_if_tiff_jeol(self):
                             self.tmp["flat_dict_meta"][tmp[0]] = pint.Quantity(tmp[1])
                     else:
                         raise KeyError(f"Found duplicated key {tmp[0]} !")
-                else:  # len(tmp) > 2:
+                else:
                     print(f"WARNING::{line} is currently ignored !")
 
-            # report metadata just for verbose purposes right now
-            for key, value in self.tmp["flat_dict_meta"].items():
-                print(f"{key}______{type(value)}____{value}")
+            if self.verbose:
+                for key, value in self.tmp["flat_dict_meta"].items():
+                    print(f"{key}______{type(value)}____{value}")
 
             if (
                 self.tmp["flat_dict_meta"]["SEM_DATA_VERSION"] == 1

diff --git a/src/pynxtools_em/parsers/image_tiff_tescan.py b/src/pynxtools_em/parsers/image_tiff_tescan.py
@@ -0,0 +1,222 @@
+#
+# Copyright The NOMAD Authors.
+#
+# This file is part of NOMAD. See https://nomad-lab.eu for further info.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Subparser for harmonizing TESCAN-specific content in TIFF files."""
+
+import mmap
+from typing import Dict
+
+import flatdict as fd
+import numpy as np
+from PIL import Image, ImageSequence
+from pynxtools_em.concepts.mapping_functors_pint import add_specific_metadata_pint
+
+# from pynxtools_em.configurations.image_tiff_tescan_cfg import (
+#     TESCAN_VARIOUS_DYNAMIC_TO_NX_EM,
+#     TESCAN_VARIOUS_STATIC_TO_NX_EM
+# )
+from pynxtools_em.parsers.image_tiff import TiffParser
+from pynxtools_em.utils.string_conversions import string_to_number
+
+
+class TescanTiffParser(TiffParser):
+    def __init__(self, file_path: str = "", entry_id: int = 1):
+        super().__init__(file_path)
+        self.entry_id = entry_id
+        self.event_id = 1
+        self.prfx = None
+        self.tmp: Dict = {"data": None, "flat_dict_meta": fd.FlatDict({})}
+        self.supported_version: Dict = {}
+        self.version: Dict = {}
+        self.tags: Dict = {}
+        self.supported = False
+        self.check_if_tiff_tescan()
+
+    def check_if_tiff_tescan(self):
+        """Check if resource behind self.file_path is a TaggedImageFormat file.
+
+        This also loads the metadata first if possible as these contain details
+        about which TESCAN device wrote the image data, e.g. TIMA.
+        """
+        self.supported = 0  # voting-based
+        with open(self.file_path, "rb", 0) as file:
+            s = mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ)
+            magic = s.read(4)
+            if magic == b"II*\x00":  # https://en.wikipedia.org/wiki/TIFF
+                self.supported += 1
+            else:
+                self.supported = False
+                print(
+                    f"Parser {self.__class__.__name__} finds no content in {self.file_path} that it supports"
+                )
+                return
+        with Image.open(self.file_path, mode="r") as fp:
+            tescan_keys = [50431]
+            for tescan_key in tescan_keys:
+                if tescan_key in fp.tag_v2:
+                    payload = fp.tag_v2[tescan_key]
+                    pos = payload.find(bytes("Description", "utf8"))
+                    txt = payload[pos:].decode("utf8")
+                    del payload
+
+                    self.tmp["flat_dict_meta"] = fd.FlatDict({}, "/")
+                    for line in txt.split():
+                        tmp = [value.strip() for value in line.split("=")]
+                        if len(tmp) == 1:
+                            print(f"Ignore line {line} !")
+                        elif len(tmp) == 2:
+                            if tmp[0] and tmp[0] not in self.tmp["flat_dict_meta"]:
+                                self.tmp["flat_dict_meta"][tmp[0]] = string_to_number(
+                                    tmp[1]
+                                )
+                        else:
+                            print(f"Ignore line {line} !")
+
+        if self.verbose:
+            for key, value in self.tmp["flat_dict_meta"].items():
+                print(f"{key}____{type(value)}____{value}")
+
+        # check if the file was written by a supported TESCAN device
+        supported_versions = ["TIMA"]
+        if "Device" in self.tmp["flat_dict_meta"]:
+            if self.tmp["flat_dict_meta"]["Device"] in supported_versions:
+                self.supported += 1
+            # but this is quite a weak test, more instance data are required
+            # with TESCAN-specific concept names to make this here more robust
+        if self.supported == 2:
+            self.supported = True
+        else:
+            self.supported = False
+            print(
+                f"Parser {self.__class__.__name__} finds no content in {self.file_path} that it supports"
+            )
+
+    def parse_and_normalize(self):
+        """Perform actual parsing filling cache self.tmp."""
+        if self.supported is True:
+            print(f"Parsing via TESCAN-specific metadata...")
+            # metadata have at this point already been collected into an fd.FlatDict
+        else:
+            print(
+                f"{self.file_path} is not a TESCAN-specific TIFF file that this parser can process !"
+            )
+
+    def process_into_template(self, template: dict) -> dict:
+        if self.supported is True:
+            self.process_event_data_em_metadata(template)
+            self.process_event_data_em_data(template)
+        return template
+
+    def process_event_data_em_data(self, template: dict) -> dict:
+        """Add respective heavy data."""
+        # default display of the image(s) representing the data collected in this event
+        print(f"Writing TESCAN image data to the respective NeXus concept instances...")
+        # read image in-place
+        image_identifier = 1
+        with Image.open(self.file_path, mode="r") as fp:
+            for img in ImageSequence.Iterator(fp):
+                nparr = np.array(img)
+                print(
+                    f"Processing image {image_identifier} ... {type(nparr)}, {np.shape(nparr)}, {nparr.dtype}"
+                )
+                # possibly similar open discussion points as were raised for the tiff_tfs parser
+                trg = (
+                    f"/ENTRY[entry{self.entry_id}]/measurement/event_data_em_set/"
+                    f"EVENT_DATA_EM[event_data_em{self.event_id}]/"
+                    f"IMAGE_SET[image_set{image_identifier}]/image_twod"
+                )
+                template[f"{trg}/title"] = f"Image"
+                template[f"{trg}/@signal"] = "real"
+                dims = ["i", "j"]  # i == x (fastest), j == y (slowest)
+                idx = 0
+                for dim in dims:
+                    template[f"{trg}/@AXISNAME_indices[axis_{dim}_indices]"] = (
+                        np.uint32(idx)
+                    )
+                    idx += 1
+                template[f"{trg}/@axes"] = []
+                for dim in dims[::-1]:
+                    template[f"{trg}/@axes"].append(f"axis_{dim}")
+                template[f"{trg}/real"] = {"compress": np.array(fp), "strength": 1}
+                #  0 is y while 1 is x for 2d, 0 is z, 1 is y, while 2 is x for 3d
+                template[f"{trg}/real/@long_name"] = f"Signal"
+
+                sxy = {"i": 1.0, "j": 1.0}
+                scan_unit = {"i": "m", "j": "m"}
+                if ("PixelSizeX" in self.tmp["flat_dict_meta"]) and (
+                    "PixelSizeY" in self.tmp["flat_dict_meta"]
+                ):
+                    sxy = {
+                        "i": self.tmp["flat_dict_meta"]["PixelSizeX"],
+                        "j": self.tmp["flat_dict_meta"]["PixelSizeY"],
+                    }
+                else:
+                    print("WARNING: Assuming pixel width and height unit is meter!")
+                nxy = {"i": np.shape(np.array(fp))[1], "j": np.shape(np.array(fp))[0]}
+                # TODO::be careful, we assume here a very specific coordinate system;
+                # however, these assumptions need to be confirmed by TESCAN,
+                # see also the points already discussed in comments to the TFS TIFF reader
+                for dim in dims:
+                    template[f"{trg}/AXISNAME[axis_{dim}]"] = {
+                        "compress": np.asarray(
+                            np.linspace(0, nxy[dim] - 1, num=nxy[dim], endpoint=True)
+                            * sxy[dim],
+                            np.float64,
+                        ),
+                        "strength": 1,
+                    }
+                    template[f"{trg}/AXISNAME[axis_{dim}]/@long_name"] = (
+                        f"Coordinate along {dim}-axis ({scan_unit[dim]})"
+                    )
+                    template[f"{trg}/AXISNAME[axis_{dim}]/@units"] = f"{scan_unit[dim]}"
+                image_identifier += 1
+        return template
+
+    def add_various_dynamic(self, template: dict) -> dict:
+        identifier = [self.entry_id, self.event_id, 1]
+        """
+        add_specific_metadata_pint(
+            TESCAN_VARIOUS_DYNAMIC_TO_NX_EM,
+            self.tmp["flat_dict_meta"],
+            identifier,
+            template,
+        )
+        """
+        return template
+
+    def add_various_static(self, template: dict) -> dict:
+        identifier = [self.entry_id, self.event_id, 1]
+        """
+        add_specific_metadata_pint(
+            TESCAN_VARIOUS_STATIC_TO_NX_EM,
+            self.tmp["flat_dict_meta"],
+            identifier,
+            template,
+        )
+        """
+        return template
+
+    def process_event_data_em_metadata(self, template: dict) -> dict:
+        """Add respective metadata."""
+        # contextualization to understand how the image relates to the EM session
+        print(
+            f"Mapping some of the point electronic DISS metadata on respective NeXus concepts..."
+        )
+        self.add_various_dynamic(template)
+        self.add_various_static(template)
+        # ... add more as required ...
+        return template
diff --git a/src/pynxtools_em/parsers/nxs_imgs.py b/src/pynxtools_em/parsers/nxs_imgs.py
@@ -19,6 +19,7 @@
 
 from pynxtools_em.parsers.image_png_protochips import ProtochipsPngSetParser
 from pynxtools_em.parsers.image_tiff_point_electronic import PointElectronicTiffParser
+from pynxtools_em.parsers.image_tiff_tescan import TescanTiffParser
 from pynxtools_em.parsers.image_tiff_tfs import TfsTiffParser
 
 
@@ -41,6 +42,9 @@ def identify_image_type(self):
         img = TfsTiffParser(self.file_path)
         if img.supported:
             return "single_tiff_tfs"
+        img = TescanTiffParser(self.file_path)
+        if img.supported:
+            return "single_tiff_tescan"
         img = PointElectronicTiffParser(self.file_path)
         if img.supported:
             return "tiff_point_electronic"
@@ -60,17 +64,21 @@ def parse(self, template: dict) -> dict:
         # see also comments for respective nxs_pyxem parser
         # and its interaction with tech-partner-specific hfive_* parsers
         if image_parser_type == "single_tiff_tfs":
-            tiff = TfsTiffParser(self.file_path, self.entry_id)
-            tiff.parse_and_normalize()
-            tiff.process_into_template(template)
+            tfs = TfsTiffParser(self.file_path, self.entry_id)
+            tfs.parse_and_normalize()
+            tfs.process_into_template(template)
+        elif image_parser_type == "single_tiff_tescan":
+            tsc = TescanTiffParser(self.file_path, self.entry_id)
+            tsc.parse_and_normalize()
+            tsc.process_into_template(template)
         elif image_parser_type == "tiff_point_electronic":
-            diss = PointElectronicTiffParser(self.file_path, self.entry_id)
-            diss.parse_and_normalize()
-            diss.process_into_template(template)
+            pe = PointElectronicTiffParser(self.file_path, self.entry_id)
+            pe.parse_and_normalize()
+            pe.process_into_template(template)
         elif image_parser_type == "set_of_zipped_png_protochips":
-            pngs = ProtochipsPngSetParser(self.file_path, self.entry_id)
-            pngs.parse_and_normalize()
-            pngs.process_into_template(template)
+            axon = ProtochipsPngSetParser(self.file_path, self.entry_id)
+            axon.parse_and_normalize()
+            axon.process_into_template(template)
         # add here further specific content (sub-)parsers for formats from other
         # tech partner or other custom parsing of images
         return template