From c8fa4fb56bda9db214f102761c8ae9123f8972ca Mon Sep 17 00:00:00 2001
From: SiQube <reich.davidr@gmail.com>
Date: Thu, 19 Sep 2024 00:21:03 +0200
Subject: [PATCH 01/31] add code comprehension dataset

---
 docs/source/bibliography.bib                  |  19 ++
 src/pymovements/datasets/__init__.py          |   3 +
 src/pymovements/datasets/codecomprehension.py | 168 ++++++++++++++++++
 3 files changed, 190 insertions(+)
 create mode 100644 src/pymovements/datasets/codecomprehension.py

diff --git a/docs/source/bibliography.bib b/docs/source/bibliography.bib
index 46f3c7309..0d9bfc901 100644
--- a/docs/source/bibliography.bib
+++ b/docs/source/bibliography.bib
@@ -1,3 +1,22 @@
+@article{CodeComprehension,
+author = {Alakmeh, Tarek and Reich, David and J\"{a}ger, Lena and Fritz, Thomas},
+title = {Predicting Code Comprehension: A Novel Approach to Align Human Gaze with Code using Deep Neural Networks},
+year = {2024},
+issue_date = {July 2024},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+volume = {1},
+number = {FSE},
+url = {https://doi.org/10.1145/3660795},
+doi = {10.1145/3660795},
+abstract = {The better the code quality and the less complex the code, the easier it is for software developers to comprehend and evolve it. Yet, how do we best detect quality concerns in the code? Existing measures to assess code quality, such as McCabe’s cyclomatic complexity, are decades old and neglect the human aspect. Research has shown that considering how a developer reads and experiences the code can be an indicator of its quality. In our research, we built on these insights and designed, trained, and evaluated the first deep neural network that aligns a developer’s eye gaze with the code tokens the developer looks at to predict code comprehension and perceived difficulty. To train and analyze our approach, we performed an experiment in which 27 participants worked on a range of 16 short code comprehension tasks while we collected fine-grained gaze data using an eye tracker. The results of our evaluation show that our deep neural sequence model that integrates both the human gaze and the stimulus code, can predict (a) code comprehension and (b) the perceived code difficulty significantly better than current state-of-the-art reference methods. We also show that aligning human gaze with code leads to better performance than models that rely solely on either code or human gaze. We discuss potential applications and propose future work to build better human-inclusive code evaluation systems.},
+journal = {Proc. ACM Softw. Eng.},
+month = {jul},
+articleno = {88},
+numpages = {23},
+keywords = {code comprehension, code-fixation attention, eye-tracking, lab experiment, neural networks}
+}
+
 @inproceedings{CopCoL1Hollenstein,
     title = "The Copenhagen Corpus of Eye Tracking Recordings from Natural Reading of {D}anish Texts",
     author = {Hollenstein, Nora  and
diff --git a/src/pymovements/datasets/__init__.py b/src/pymovements/datasets/__init__.py
index d68e6c1bd..81e45c14c 100644
--- a/src/pymovements/datasets/__init__.py
+++ b/src/pymovements/datasets/__init__.py
@@ -25,6 +25,7 @@
    :toctree:
    :template: class.rst
 
+    pymovements.datasets.CodeComprehension
     pymovements.datasets.CopCo
     pymovements.datasets.DIDEC
     pymovements.datasets.EMTeC
@@ -47,6 +48,7 @@
     pymovements.datasets.ToyDataset
     pymovements.datasets.ToyDatasetEyeLink
 """
+from pymovements.datasets.codecomprehension import CodeComprehension
 from pymovements.datasets.copco import CopCo
 from pymovements.datasets.didec import DIDEC
 from pymovements.datasets.emtec import EMTeC
@@ -64,6 +66,7 @@
 
 
 __all__ = [
+    'CodeComprehension',
     'CopCo',
     'DIDEC',
     'EMTeC',
diff --git a/src/pymovements/datasets/codecomprehension.py b/src/pymovements/datasets/codecomprehension.py
new file mode 100644
index 000000000..4a2ce372a
--- /dev/null
+++ b/src/pymovements/datasets/codecomprehension.py
@@ -0,0 +1,168 @@
+# Copyright (c) 2022-2024 The pymovements Project Authors
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""Provides a definition for the CodeComprehension dataset."""
+from __future__ import annotations
+
+from dataclasses import dataclass
+from dataclasses import field
+from typing import Any
+
+import polars as pl
+
+from pymovements.dataset.dataset_definition import DatasetDefinition
+from pymovements.dataset.dataset_library import register_dataset
+from pymovements.gaze.experiment import Experiment
+
+
+@dataclass
+@register_dataset
+class CodeComprehension(DatasetDefinition):
+    """CodeComprehension dataset :cite:p:`CodeComprehension`.
+
+    This dataset includes eye-tracking-while-code-reading data from participants in a single
+    session. Eye movements are recorded at a sampling frequency of 1,000 Hz using an
+    EyeLink 1000 eye tracker and are provided as pixel coordinates.
+
+    The participant is instructed to read the code snippet and answer a code comprehension question.
+
+    Attributes
+    ----------
+    name : str
+        The name of the dataset.
+
+    mirrors : tuple[str, ...]
+        A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
+
+    resources : tuple[dict[str, str], ...]
+        A tuple of dataset gaze_resources. Each list entry must be a dictionary with the following
+        keys:
+        - `resource`: The url suffix of the resource. This will be concatenated with the mirror.
+        - `filename`: The filename under which the file is saved as.
+        - `md5`: The MD5 checksum of the respective file.
+
+    experiment : Experiment
+        The experiment definition.
+
+    filename_format : str
+        Regular expression which will be matched before trying to load the file. Namedgroups will
+        appear in the `fileinfo` dataframe.
+
+    filename_format_dtypes : dict[str, type], optional
+        If named groups are present in the `filename_format`, this makes it possible to cast
+        specific named groups to a particular datatype.
+
+    column_map : dict[str, str]
+        The keys are the columns to read, the values are the names to which they should be renamed.
+
+    custom_read_kwargs : dict[str, Any], optional
+        If specified, these keyword arguments will be passed to the file reading function.
+
+    Examples
+    --------
+    Initialize your :py:class:`~pymovements.dataset.Dataset` object with the
+    :py:class:`~pymovements.datasets.CodeComprehension` definition:
+
+    >>> import pymovements as pm
+    >>>
+    >>> dataset = pm.Dataset("CodeComprehension", path='data/CodeComprehension')
+
+    Download the dataset resources:
+
+    >>> dataset.download()# doctest: +SKIP
+
+    Load the data into memory:
+
+    >>> dataset.load()# doctest: +SKIP
+    """
+
+    # pylint: disable=similarities
+    # The DatasetDefinition child classes potentially share code chunks for definitions.
+
+    name: str = 'CodeComprehension'
+
+    has_files: dict[str, bool] = field(
+        default_factory=lambda: {
+            'gaze': False,
+            'precomputed_events': True,
+        },
+    )
+    extract: dict[str, bool] = field(default_factory=lambda: {'precomputed_events': True})
+    mirrors: dict[str, tuple[str, ...]] = field(
+        default_factory=lambda: {
+            'precomputed_events': ('https://zenodo.org/',),
+        },
+    )
+
+    resources: dict[str, tuple[dict[str, str], ...]] = field(
+        default_factory=lambda: {
+            'precomputed_events': (
+                {
+                    'resource':
+                    'records/11123101/files/Predicting%20Code%20Comprehension%20Package'
+                    '.zip?download=1',
+                    'filename': 'data.zip',
+                    'md5': '3a3c6fb96550bc2c2ddcf5d458fb12a2',
+                },
+            ),
+        },
+    )
+
+    # TODO
+    experiment: Experiment = Experiment(
+        screen_width_px=1920,
+        screen_height_px=1080,
+        screen_width_cm=59.,
+        screen_height_cm=33.5,
+        distance_cm=85,
+        origin='center',
+        sampling_rate=1000,
+    )
+
+    filename_format: dict[str, str] = field(
+        default_factory=lambda: {
+            'precomputed_events': r'fix_report_P{subject_id:s}.txt',
+        },
+    )
+
+    filename_format_dtypes: dict[str, dict[str, type]] = field(
+        default_factory=lambda: {
+            'precomputed_events': {'subject_id': pl.Utf8},
+        },
+    )
+
+    trial_columns: list[str] = field(default_factory=lambda: [])
+
+    time_column: str = ''
+
+    time_unit: str = ''
+
+    pixel_columns: list[str] = field(default_factory=lambda: [])
+
+    column_map: dict[str, str] = field(default_factory=lambda: {})
+
+    custom_read_kwargs: dict[str, Any] = field(
+        default_factory=lambda: {
+            'precomputed_events': {
+                'separator': '\t',
+                'null_values': '.',
+                'quote_char': '"',
+            },
+        },
+    )

From 20b69ab445e0e6b0dfb096d617533d76a2693644 Mon Sep 17 00:00:00 2001
From: SiQube <reich.davidr@gmail.com>
Date: Thu, 19 Sep 2024 00:21:37 +0200
Subject: [PATCH 02/31] add code comprehension dataset tests

---
 src/pymovements/datasets/codecomprehension.py | 16 ++++++++--------
 tests/unit/datasets/datasets_test.py          |  1 +
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/pymovements/datasets/codecomprehension.py b/src/pymovements/datasets/codecomprehension.py
index 4a2ce372a..eeacc9222 100644
--- a/src/pymovements/datasets/codecomprehension.py
+++ b/src/pymovements/datasets/codecomprehension.py
@@ -101,6 +101,7 @@ class CodeComprehension(DatasetDefinition):
         default_factory=lambda: {
             'gaze': False,
             'precomputed_events': True,
+            'precomputed_reading_measures': False,
         },
     )
     extract: dict[str, bool] = field(default_factory=lambda: {'precomputed_events': True})
@@ -124,15 +125,14 @@ class CodeComprehension(DatasetDefinition):
         },
     )
 
-    # TODO
     experiment: Experiment = Experiment(
-        screen_width_px=1920,
-        screen_height_px=1080,
-        screen_width_cm=59.,
-        screen_height_cm=33.5,
-        distance_cm=85,
-        origin='center',
-        sampling_rate=1000,
+        screen_width_px=None,
+        screen_height_px=None,
+        screen_width_cm=None,
+        screen_height_cm=None,
+        distance_cm=None,
+        origin=None,
+        sampling_rate=None,
     )
 
     filename_format: dict[str, str] = field(
diff --git a/tests/unit/datasets/datasets_test.py b/tests/unit/datasets/datasets_test.py
index b71700410..9f71da7ac 100644
--- a/tests/unit/datasets/datasets_test.py
+++ b/tests/unit/datasets/datasets_test.py
@@ -31,6 +31,7 @@
     ('public_dataset', 'dataset_name'),
     # XXX: add public dataset in alphabetical order
     [
+        pytest.param(pm.datasets.CodeComprehension, 'CodeComprehension', id='CodeComprehension'),
         pytest.param(pm.datasets.CopCo, 'CopCo', id='CopCo'),
         pytest.param(pm.datasets.DIDEC, 'DIDEC', id='DIDEC'),
         pytest.param(pm.datasets.EMTeC, 'EMTeC', id='EMTeC'),

From 268966fda56671b13d988bbcf2ce9c17a3cccedc Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Fri, 27 Sep 2024 09:24:40 +0200
Subject: [PATCH 03/31] update docstring of utils downloads (#838)

---
 src/pymovements/utils/downloads.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/src/pymovements/utils/downloads.py b/src/pymovements/utils/downloads.py
index ad729063f..9b7958802 100644
--- a/src/pymovements/utils/downloads.py
+++ b/src/pymovements/utils/downloads.py
@@ -219,19 +219,6 @@ class _DownloadProgressBar(tqdm):  # pylint: disable=inconsistent-mro
     Parameters
     ----------
     **kwargs : Any
-
-    Attributes
-    ----------
-    unit: str
-        Unit of progress bar.
-    unit_scale: bool
-        If True, scale progress bar to unit.
-    unit_divisor: int
-        Divisor of progress bar.
-    miniters: int
-        Minimum number of iterations between updates.
-    **kwargs: Any
-        Keyword arguments passed to `tqdm.tqdm`.
     """
 
     def __init__(self, **kwargs: Any):

From 14a03f851cd1e970031da873e458b3f1c36e0e68 Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Fri, 27 Sep 2024 10:04:08 +0200
Subject: [PATCH 04/31] update docstring of copco dataset definition (#821)

---
 src/pymovements/datasets/copco.py | 57 +++++++++++++++++++++++--------
 1 file changed, 42 insertions(+), 15 deletions(-)

diff --git a/src/pymovements/datasets/copco.py b/src/pymovements/datasets/copco.py
index 53b466435..b6603c644 100644
--- a/src/pymovements/datasets/copco.py
+++ b/src/pymovements/datasets/copco.py
@@ -47,34 +47,60 @@ class CopCo(DatasetDefinition):
 
     Attributes
     ----------
-    name : str
+    name: str
         The name of the dataset.
 
-    mirrors : tuple[str, ...]
+    has_files: dict[str, bool]
+        Indicate whether the dataset contains 'gaze', 'precomputed_events', and
+        'precomputed_reading_measures'.
+
+    mirrors: dict[str, tuple[str, ...]]
         A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
 
-    resources : tuple[dict[str, str], ...]
+    resources: dict[str, tuple[dict[str, str | None], ...]]
         A tuple of dataset gaze_resources. Each list entry must be a dictionary with the following
         keys:
         - `resource`: The url suffix of the resource. This will be concatenated with the mirror.
         - `filename`: The filename under which the file is saved as.
         - `md5`: The MD5 checksum of the respective file.
 
-    experiment : Experiment
+    experiment: Experiment
         The experiment definition.
 
-    filename_format : str
+    extract: dict[str, bool]
+        Decide whether to extract the data.
+
+    filename_format: dict[str, str]
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe.
 
-    filename_format_dtypes : dict[str, type], optional
+    filename_format_dtypes: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
-
-    column_map : dict[str, str]
+    trial_columns: list[str]
+        The name of the trial columns in the input data frame. If the list is empty or None,
+        the input data frame is assumed to contain only one trial. If the list is not empty,
+        the input data frame is assumed to contain multiple trials and the transformation
+        methods will be applied to each trial separately.
+    time_column: str
+        The name of the timestamp column in the input data frame. This column will be renamed to
+        ``time``.
+
+    time_unit: str
+        The unit of the timestamps in the timestamp column in the input data frame. Supported
+        units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is
+        'step' the experiment definition must be specified. All timestamps will be converted to
+        milliseconds.
+
+    pixel_columns: list[str]
+        The name of the pixel position columns in the input data frame. These columns will be
+        nested into the column ``pixel``. If the list is empty or None, the nested ``pixel``
+        column will not be created.
+
+    column_map: dict[str, str]
         The keys are the columns to read, the values are the names to which they should be renamed.
 
-    custom_read_kwargs : dict[str, Any], optional
+    custom_read_kwargs: dict[str, Any]
         If specified, these keyword arguments will be passed to the file reading function.
 
     Examples
@@ -107,12 +133,6 @@ class CopCo(DatasetDefinition):
             'precomputed_reading_measures': True,
         },
     )
-    extract: dict[str, bool] = field(
-        default_factory=lambda: {
-            'precomputed_events': True,
-            'precomputed_reading_measures': True,
-        },
-    )
     mirrors: dict[str, tuple[str, ...]] = field(
         default_factory=lambda: {
             'precomputed_events': ('https://files.de-1.osf.io/',),
@@ -150,6 +170,13 @@ class CopCo(DatasetDefinition):
         sampling_rate=1000,
     )
 
+    extract: dict[str, bool] = field(
+        default_factory=lambda: {
+            'precomputed_events': True,
+            'precomputed_reading_measures': True,
+        },
+    )
+
     filename_format: dict[str, str] = field(
         default_factory=lambda: {
             'precomputed_events': r'FIX_report_P{subject_id:d}.txt',

From 580eae7525079bb12b714b03b3e19c074455fc06 Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Fri, 27 Sep 2024 11:11:03 +0200
Subject: [PATCH 05/31] update docstring of dataset definition (#820)

---
 src/pymovements/dataset/dataset_definition.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/pymovements/dataset/dataset_definition.py b/src/pymovements/dataset/dataset_definition.py
index 40168e9cf..eaff6e13b 100644
--- a/src/pymovements/dataset/dataset_definition.py
+++ b/src/pymovements/dataset/dataset_definition.py
@@ -35,6 +35,9 @@ class DatasetDefinition:
     ----------
     name: str
         The name of the dataset. (default: '.')
+    has_files: dict[str, bool]
+        Indicate whether the dataset contains 'gaze', 'precomputed_events', and
+        'precomputed_reading_measures'.
     mirrors: dict[str, tuple[str, ...]]
         A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
         (default: field(default_factory=dict))
@@ -44,16 +47,16 @@ class DatasetDefinition:
         - `filename`: The filename under which the file is saved as.
         - `md5`: The MD5 checksum of the respective file.
         (default: field(default_factory=dict))
-    experiment: Experiment
+    experiment: Experiment | None
         The experiment definition. (default: None)
+    extract: dict[str, bool]
+        Decide whether to extract the data.
     filename_format: dict[str, str]
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe. (default: field(default_factory=dict))
     filename_format_dtypes: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype. (default: field(default_factory=dict))
-    extract: dict[str, bool]
-        Decide whether to extract the data.
     custom_read_kwargs: dict[str, dict[str, Any]]
         If specified, these keyword arguments will be passed to the file reading function. The
         behavior of this argument depends on the file extension of the dataset files.

From 4532de24123f3800f83ba2a15b7cada473cef406 Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Fri, 27 Sep 2024 12:47:18 +0200
Subject: [PATCH 06/31] update docstring of fakenews dataset definition (#824)

---
 src/pymovements/datasets/fakenews.py | 52 +++++++++++++++++++++++-----
 1 file changed, 43 insertions(+), 9 deletions(-)

diff --git a/src/pymovements/datasets/fakenews.py b/src/pymovements/datasets/fakenews.py
index c352dbf27..f964fd8f2 100644
--- a/src/pymovements/datasets/fakenews.py
+++ b/src/pymovements/datasets/fakenews.py
@@ -44,31 +44,54 @@ class FakeNewsPerception(DatasetDefinition):
 
     Attributes
     ----------
-    name : str
+    name: str
         The name of the dataset.
-    mirrors : tuple[str, ...]
+    has_files: dict[str, bool]
+        Indicate whether the dataset contains 'gaze', 'precomputed_events', and
+        'precomputed_reading_measures'.
+    mirrors: dict[str, tuple[str, ...]]
         A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
-    resources : tuple[dict[str, str], ...]
+    resources: dict[str, tuple[dict[str, str], ...]]
         A tuple of dataset gaze_resources. Each list entry must be a dictionary with the following
         keys:
         - `resource`: The url suffix of the resource. This will be concatenated with the mirror.
         - `filename`: The filename under which the file is saved as.
         - `md5`: The MD5 checksum of the respective file.
-    experiment : Experiment
+    experiment: Experiment
         The experiment definition.
-    filename_format : str
+    extract: dict[str, bool]
+        Decide whether to extract the data.
+    filename_format: dict[str, str]
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe.
-    filename_format_dtypes : dict[str, type], optional
+    filename_format_dtypes: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
-    column_map : dict[str, str]
+    trial_columns: list[str]
+        The name of the trial columns in the input data frame. If the list is empty or None,
+        the input data frame is assumed to contain only one trial. If the list is not empty,
+        the input data frame is assumed to contain multiple trials and the transformation
+        methods will be applied to each trial separately.
+    time_column: str
+        The name of the timestamp column in the input data frame. This column will be renamed to
+        ``time``.
+    time_unit: str
+        The unit of the timestamps in the timestamp column in the input data frame. Supported
+        units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is
+        'step' the experiment definition must be specified. All timestamps will be converted to
+        milliseconds.
+    pixel_columns: list[str]
+        The name of the pixel position columns in the input data frame. These columns will be
+        nested into the column ``pixel``. If the list is empty or None, the nested ``pixel``
+        column will not be created.
+    column_map: dict[str, str]
         The keys are the columns to read, the values are the names to which they should be renamed.
-    custom_read_kwargs : dict[str, Any], optional
+    custom_read_kwargs: dict[str, Any]
         If specified, these keyword arguments will be passed to the file reading function.
     """
 
     name: str = 'FakeNewsPerception'
+
     has_files: dict[str, bool] = field(
         default_factory=lambda: {
             'gaze': False,
@@ -76,12 +99,13 @@ class FakeNewsPerception(DatasetDefinition):
             'precomputed_reading_measures': False,
         },
     )
-    extract: dict[str, bool] = field(default_factory=lambda: {'precomputed_events': True})
+
     mirrors: dict[str, tuple[str, ...]] = field(
         default_factory=lambda: {
             'precomputed_events': ('https://doi.org/10.7910/DVN/C1UD2A',),
         },
     )
+
     resources: dict[str, tuple[dict[str, str], ...]] = field(
         default_factory=lambda: {
             'precomputed_events': (
@@ -93,6 +117,7 @@ class FakeNewsPerception(DatasetDefinition):
             ),
         },
     )
+
     experiment: Experiment = Experiment(
         screen_width_px=1920,
         screen_height_px=1080,
@@ -103,21 +128,30 @@ class FakeNewsPerception(DatasetDefinition):
         sampling_rate=600,
     )
 
+    extract: dict[str, bool] = field(default_factory=lambda: {'precomputed_events': True})
+
     filename_format: dict[str, str] = field(
         default_factory=lambda: {
             'precomputed_events': r'P{subject_id:d}_{session_id:d}_{truth_value:s}.csv',
         },
     )
+
     filename_format_dtypes: dict[str, dict[str, type]] = field(
         default_factory=lambda: {
             'precomputed_events': {'subject_id': int, 'session_id': int, 'truth_value': str},
         },
     )
+
     trial_columns: list[str] = field(default_factory=lambda: [])
+
     time_column: str = 'starttime'
+
     time_unit: str = 'milliseconds'
+
     pixel_columns: list[str] = field(default_factory=lambda: [])
+
     column_map: dict[str, str] = field(default_factory=lambda: {})
+
     custom_read_kwargs: dict[str, Any] = field(
         default_factory=lambda: {
             'precomputed_events': {

From 625184909dcf04bb5c9cfa4e388cafb9633c4160 Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Fri, 27 Sep 2024 13:14:24 +0200
Subject: [PATCH 07/31] update docstring of gaze screen (#837)

---
 src/pymovements/gaze/screen.py | 23 -----------------------
 1 file changed, 23 deletions(-)

diff --git a/src/pymovements/gaze/screen.py b/src/pymovements/gaze/screen.py
index ea4458898..9ae21e270 100644
--- a/src/pymovements/gaze/screen.py
+++ b/src/pymovements/gaze/screen.py
@@ -51,29 +51,6 @@ class Screen:
         Specifies the screen location of the origin of the pixel
         coordinate system. (default: 'upper left')
 
-    Attributes
-    ----------
-    width_px: int
-        Screen width in pixels
-    height_px: int
-        Screen height in pixels
-    width_cm: float
-        Screen width in centimeters
-    height_cm: float
-        Screen height in centimeters
-    distance_cm: float
-        Eye-to-screen distance in centimeters
-    origin: str
-        Specifies the screen location of the origin of the pixel coordinate system.
-    x_max_dva: float
-        Maximum screen x-coordinate in degrees of visual angle
-    y_max_dva: float
-        Minimum screen y-coordinate in degrees of visual angle
-    x_min_dva: float
-        Maximum screen x-coordinate in degrees of visual angle
-    y_min_dva: float
-        Minimum screen y-coordinate in degrees of visual angle
-
     Examples
     --------
     >>> screen = Screen(

From 5e0297bb7a9efd435870e26d0a36f908d7a20b11 Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Fri, 27 Sep 2024 13:29:27 +0200
Subject: [PATCH 08/31] update docstring of gaze experiment (#836)

---
 src/pymovements/gaze/experiment.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/pymovements/gaze/experiment.py b/src/pymovements/gaze/experiment.py
index f51da37e2..d8015f1be 100644
--- a/src/pymovements/gaze/experiment.py
+++ b/src/pymovements/gaze/experiment.py
@@ -82,14 +82,6 @@ class Experiment:
     -12.42...
     >>> experiment.screen.y_max_dva# doctest:+ELLIPSIS
     12.42...
-
-
-    Attributes
-    ----------
-    screen: Screen
-        Screen object for experiment
-    eyetracker : EyeTracker | None
-        Eye tracker for experiment
     """
 
     def __init__(

From be0501c08ea3d824efcd2c28d002fe38254d88e1 Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Fri, 27 Sep 2024 14:06:05 +0200
Subject: [PATCH 09/31] update docstring of events processing (#835)

---
 src/pymovements/events/processing.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/src/pymovements/events/processing.py b/src/pymovements/events/processing.py
index 7e97346a8..517765558 100644
--- a/src/pymovements/events/processing.py
+++ b/src/pymovements/events/processing.py
@@ -35,11 +35,6 @@
 class EventProcessor:
     """Processes event and gaze dataframes.
 
-    Attributes
-    ----------
-    event_properties: list[str]
-        A list of property names.
-
     Parameters
     ----------
     event_properties: str | list[str]
@@ -95,12 +90,6 @@ def process(self, events: EventDataFrame) -> pl.DataFrame:
 class EventGazeProcessor:
     """Processes event and gaze dataframes.
 
-    Attributes
-    ----------
-    event_properties: list[str]
-        A list of property names.
-
-
     Parameters
     ----------
     event_properties: str | tuple[str, dict[str, Any]] | list[str | tuple[str, dict[str, Any]]]

From d3eecee9eb1da574c58631921793a25a7984d82e Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Fri, 27 Sep 2024 14:22:46 +0200
Subject: [PATCH 10/31] update docstring of toy_dataset_eyelink dataset
 definition (#834)

---
 .../datasets/toy_dataset_eyelink.py           | 37 +++++++++++++++++--
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/src/pymovements/datasets/toy_dataset_eyelink.py b/src/pymovements/datasets/toy_dataset_eyelink.py
index 38e7afd58..1dcd2a661 100644
--- a/src/pymovements/datasets/toy_dataset_eyelink.py
+++ b/src/pymovements/datasets/toy_dataset_eyelink.py
@@ -48,15 +48,23 @@ class ToyDatasetEyeLink(DatasetDefinition):
     name: str
         The name of the dataset.
 
+    has_files: dict[str, bool]
+        Indicate whether the dataset contains 'gaze', 'precomputed_events', and
+        'precomputed_reading_measures'.
+
     mirrors: dict[str, tuple[str, ...]]
         A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
 
     resources: dict[str, tuple[dict[str, str], ...]]
-        A tuple of dataset resources. Each list entry must be a dictionary with the following keys:
+        A tuple of dataset gaze_resources. Each list entry must be a dictionary with the following
+        keys:
         - `resource`: The url suffix of the resource. This will be concatenated with the mirror.
         - `filename`: The filename under which the file is saved as.
         - `md5`: The MD5 checksum of the respective file.
 
+    extract: dict[str, bool]
+        Decide whether to extract the data.
+
     experiment: Experiment
         The experiment definition.
 
@@ -68,6 +76,27 @@ class ToyDatasetEyeLink(DatasetDefinition):
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
+    trial_columns: list[str]
+        The name of the trial columns in the input data frame. If the list is empty or None,
+        the input data frame is assumed to contain only one trial. If the list is not empty,
+        the input data frame is assumed to contain multiple trials and the transformation
+        methods will be applied to each trial separately.
+
+    time_column: str
+        The name of the timestamp column in the input data frame. This column will be renamed to
+        ``time``.
+
+    time_unit: str
+        The unit of the timestamps in the timestamp column in the input data frame. Supported
+        units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is
+        'step' the experiment definition must be specified. All timestamps will be converted to
+        milliseconds.
+
+    pixel_columns: list[str]
+        The name of the pixel position columns in the input data frame. These columns will be
+        nested into the column ``pixel``. If the list is empty or None, the nested ``pixel``
+        column will not be created.
+
     column_map: dict[str, str]
         The keys are the columns to read, the values are the names to which they should be renamed.
 
@@ -104,6 +133,7 @@ class ToyDatasetEyeLink(DatasetDefinition):
             'precomputed_reading_measures': False,
         },
     )
+
     mirrors: dict[str, tuple[str, ...]] = field(
         default_factory=lambda: {
             'gaze': (
@@ -124,6 +154,7 @@ class ToyDatasetEyeLink(DatasetDefinition):
                 ),
         },
     )
+
     extract: dict[str, bool] = field(default_factory=lambda: {'gaze': True})
 
     experiment: Experiment = Experiment(
@@ -157,8 +188,6 @@ class ToyDatasetEyeLink(DatasetDefinition):
         },
     )
 
-    column_map: dict[str, str] = field(default_factory=lambda: {})
-
     trial_columns: list[str] = field(default_factory=lambda: ['subject_id', 'session_id'])
 
     time_column: str = 'time'
@@ -167,6 +196,8 @@ class ToyDatasetEyeLink(DatasetDefinition):
 
     pixel_columns: list[str] = field(default_factory=lambda: ['x_pix', 'y_pix'])
 
+    column_map: dict[str, str] = field(default_factory=lambda: {})
+
     custom_read_kwargs: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
             'gaze': {

From fd8dad49adf7eb126f63542b904387a77b6753ec Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Fri, 27 Sep 2024 14:48:23 +0200
Subject: [PATCH 11/31] update docstring of toy_dataset dataset definition
 (#833)

---
 src/pymovements/datasets/toy_dataset.py | 31 ++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/src/pymovements/datasets/toy_dataset.py b/src/pymovements/datasets/toy_dataset.py
index 553b5d0fe..7f45ea72c 100644
--- a/src/pymovements/datasets/toy_dataset.py
+++ b/src/pymovements/datasets/toy_dataset.py
@@ -47,15 +47,23 @@ class ToyDataset(DatasetDefinition):
     name: str
         The name of the dataset.
 
+    has_files: dict[str, bool]
+        Indicate whether the dataset contains 'gaze', 'precomputed_events', and
+        'precomputed_reading_measures'.
+
     mirrors: dict[str, tuple[str, ...]]
         A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
 
     resources: dict[str, tuple[dict[str, str], ...]]
-        A tuple of dataset resources. Each list entry must be a dictionary with the following keys:
+        A tuple of dataset resources. Each list entry must be a dictionary with the following
+        keys:
         - `resource`: The url suffix of the resource. This will be concatenated with the mirror.
         - `filename`: The filename under which the file is saved as.
         - `md5`: The MD5 checksum of the respective file.
 
+    extract: dict[str, bool]
+        Decide whether to extract the data.
+
     experiment: Experiment
         The experiment definition.
 
@@ -67,6 +75,27 @@ class ToyDataset(DatasetDefinition):
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
+    trial_columns: list[str]
+        The name of the trial columns in the input data frame. If the list is empty or None,
+        the input data frame is assumed to contain only one trial. If the list is not empty,
+        the input data frame is assumed to contain multiple trials and the transformation
+        methods will be applied to each trial separately.
+
+    time_column: str
+        The name of the timestamp column in the input data frame. This column will be renamed to
+        ``time``.
+
+    time_unit: str
+        The unit of the timestamps in the timestamp column in the input data frame. Supported
+        units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is
+        'step' the experiment definition must be specified. All timestamps will be converted to
+        milliseconds.
+
+    pixel_columns: list[str]
+        The name of the pixel position columns in the input data frame. These columns will be
+        nested into the column ``pixel``. If the list is empty or None, the nested ``pixel``
+        column will not be created.
+
     column_map: dict[str, str]
         The keys are the columns to read, the values are the names to which they should be renamed.
 

From 1ad8db75e49eb9d8aaa68672b2429cd7b5801d4c Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Fri, 27 Sep 2024 18:09:44 +0200
Subject: [PATCH 12/31] update docstring of sb_sat dataset definition (#832)

---
 src/pymovements/datasets/sb_sat.py | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/src/pymovements/datasets/sb_sat.py b/src/pymovements/datasets/sb_sat.py
index a4b1c4e0e..1974bd77b 100644
--- a/src/pymovements/datasets/sb_sat.py
+++ b/src/pymovements/datasets/sb_sat.py
@@ -49,6 +49,10 @@ class SBSAT(DatasetDefinition):
     name: str
         The name of the dataset.
 
+    has_files: dict[str, bool]
+        Indicate whether the dataset contains 'gaze', 'precomputed_events', and
+        'precomputed_reading_measures'.
+
     mirrors: dict[str, tuple[str, ...]]
         A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
 
@@ -59,6 +63,9 @@ class SBSAT(DatasetDefinition):
         - `filename`: The filename under which the file is saved as.
         - `md5`: The MD5 checksum of the respective file.
 
+    extract: dict[str, bool]
+        Decide whether to extract the data.
+
     experiment: Experiment
         The experiment definition.
 
@@ -70,10 +77,31 @@ class SBSAT(DatasetDefinition):
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
+    trial_columns: list[str]
+        The name of the trial columns in the input data frame. If the list is empty or None,
+        the input data frame is assumed to contain only one trial. If the list is not empty,
+        the input data frame is assumed to contain multiple trials and the transformation
+        methods will be applied to each trial separately.
+
+    time_column: str
+        The name of the timestamp column in the input data frame. This column will be renamed to
+        ``time``.
+
+    time_unit: str
+        The unit of the timestamps in the timestamp column in the input data frame. Supported
+        units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is
+        'step' the experiment definition must be specified. All timestamps will be converted to
+        milliseconds.
+
+    pixel_columns: list[str]
+        The name of the pixel position columns in the input data frame. These columns will be
+        nested into the column ``pixel``. If the list is empty or None, the nested ``pixel``
+        column will not be created.
+
     column_map: dict[str, str]
         The keys are the columns to read, the values are the names to which they should be renamed.
 
-    custom_read_kwargs: dict[str, dict[str, dict[str, Any]]]
+    custom_read_kwargs: dict[str, dict[str, Any]]
         If specified, these keyword arguments will be passed to the file reading function.
 
     Examples

From 5903c9355e32a9cc5ae3f0fb2fd50514a2e87f11 Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Fri, 27 Sep 2024 18:29:34 +0200
Subject: [PATCH 13/31] update docstring of potec dataset definition (#831)

---
 src/pymovements/datasets/potec.py | 34 +++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/src/pymovements/datasets/potec.py b/src/pymovements/datasets/potec.py
index 12dc9a64e..d26cb3160 100644
--- a/src/pymovements/datasets/potec.py
+++ b/src/pymovements/datasets/potec.py
@@ -59,15 +59,23 @@ class PoTeC(DatasetDefinition):
     name: str
         The name of the dataset.
 
+    has_files: dict[str, bool]
+        Indicate whether the dataset contains 'gaze', 'precomputed_events', and
+        'precomputed_reading_measures'.
+
     mirrors: dict[str, tuple[str, ...]]
         A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
 
     resources: dict[str, tuple[dict[str, str], ...]]
-        A tuple of dataset resources. Each list entry must be a dictionary with the following keys:
+        A tuple of dataset resources. Each list entry must be a dictionary with the following
+        keys:
         - `resource`: The url suffix of the resource. This will be concatenated with the mirror.
         - `filename`: The filename under which the file is saved as.
         - `md5`: The MD5 checksum of the respective file.
 
+    extract: dict[str, bool]
+        Decide whether to extract the data.
+
     experiment: Experiment
         The experiment definition.
 
@@ -79,8 +87,26 @@ class PoTeC(DatasetDefinition):
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
-    column_map: dict[str, str]
-        The keys are the columns to read, the values are the names to which they should be renamed.
+    trial_columns: list[str]
+        The name of the trial columns in the input data frame. If the list is empty or None,
+        the input data frame is assumed to contain only one trial. If the list is not empty,
+        the input data frame is assumed to contain multiple trials and the transformation
+        methods will be applied to each trial separately.
+
+    time_column: str
+        The name of the timestamp column in the input data frame. This column will be renamed to
+        ``time``.
+
+    time_unit: str
+        The unit of the timestamps in the timestamp column in the input data frame. Supported
+        units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is
+        'step' the experiment definition must be specified. All timestamps will be converted to
+        milliseconds.
+
+    pixel_columns: list[str]
+        The name of the pixel position columns in the input data frame. These columns will be
+        nested into the column ``pixel``. If the list is empty or None, the nested ``pixel``
+        column will not be created.
 
     custom_read_kwargs: dict[str, dict[str, Any]]
         If specified, these keyword arguments will be passed to the file reading function.
@@ -175,7 +201,7 @@ class PoTeC(DatasetDefinition):
         ],
     )
 
-    custom_read_kwargs: dict[str, Any] = field(
+    custom_read_kwargs: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
             'gaze': {
                 'dtypes': {

From d9e2b2af7a7e2d37d3ec26ffb74d070d3b3b5cac Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Fri, 27 Sep 2024 18:42:51 +0200
Subject: [PATCH 14/31] update docstring of judo1000 dataset definition (#830)

---
 src/pymovements/datasets/judo1000.py | 34 ++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/src/pymovements/datasets/judo1000.py b/src/pymovements/datasets/judo1000.py
index c2fd57d94..dfbf1f663 100644
--- a/src/pymovements/datasets/judo1000.py
+++ b/src/pymovements/datasets/judo1000.py
@@ -49,15 +49,23 @@ class JuDo1000(DatasetDefinition):
     name: str
         The name of the dataset.
 
+    has_files: dict[str, bool]
+        Indicate whether the dataset contains 'gaze', 'precomputed_events', and
+        'precomputed_reading_measures'.
+
     mirrors: dict[str, tuple[str, ...]]
         A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
 
     resources: dict[str, tuple[dict[str, str], ...]]
-        A tuple of dataset resources. Each list entry must be a dictionary with the following keys:
+        A tuple of dataset resources. Each list entry must be a dictionary with the following
+        keys:
         - `resource`: The url suffix of the resource. This will be concatenated with the mirror.
         - `filename`: The filename under which the file is saved as.
         - `md5`: The MD5 checksum of the respective file.
 
+    extract: dict[str, bool]
+        Decide whether to extract the data.
+
     experiment: Experiment
         The experiment definition.
 
@@ -69,12 +77,34 @@ class JuDo1000(DatasetDefinition):
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
+    trial_columns: list[str]
+        The name of the trial columns in the input data frame. If the list is empty or None,
+        the input data frame is assumed to contain only one trial. If the list is not empty,
+        the input data frame is assumed to contain multiple trials and the transformation
+        methods will be applied to each trial separately.
+
+    time_column: str
+        The name of the timestamp column in the input data frame. This column will be renamed to
+        ``time``.
+
+    time_unit: str
+        The unit of the timestamps in the timestamp column in the input data frame. Supported
+        units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is
+        'step' the experiment definition must be specified. All timestamps will be converted to
+        milliseconds.
+
+    pixel_columns: list[str]
+        The name of the pixel position columns in the input data frame. These columns will be
+        nested into the column ``pixel``. If the list is empty or None, the nested ``pixel``
+        column will not be created.
+
     column_map: dict[str, str]
         The keys are the columns to read, the values are the names to which they should be renamed.
 
     custom_read_kwargs: dict[str, dict[str, Any]]
         If specified, these keyword arguments will be passed to the file reading function.
 
+
     Examples
     --------
     Initialize your :py:class:`~pymovements.PublicDataset` object with the
@@ -172,7 +202,7 @@ class JuDo1000(DatasetDefinition):
         },
     )
 
-    custom_read_kwargs: dict[str, Any] = field(
+    custom_read_kwargs: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
             'gaze': {
                 'dtypes': {

From fdab5de4d4ae99df0b7ceeb92e8213e6f3db5470 Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Fri, 27 Sep 2024 23:05:50 +0200
Subject: [PATCH 15/31] update docstring of hbn dataset definition (#829)

---
 src/pymovements/datasets/hbn.py | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/src/pymovements/datasets/hbn.py b/src/pymovements/datasets/hbn.py
index 8c16b314b..0ef7b0922 100644
--- a/src/pymovements/datasets/hbn.py
+++ b/src/pymovements/datasets/hbn.py
@@ -50,15 +50,23 @@ class HBN(DatasetDefinition):
     name: str
         The name of the dataset.
 
+    has_files: dict[str, bool]
+        Indicate whether the dataset contains 'gaze', 'precomputed_events', and
+        'precomputed_reading_measures'.
+
     mirrors: dict[str, tuple[str, ...]]
         A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
 
     resources: dict[str, tuple[dict[str, str], ...]]
-        A tuple of dataset resources. Each list entry must be a dictionary with the following keys:
+        A tuple of dataset resources. Each list entry must be a dictionary with the following
+        keys:
         - `resource`: The url suffix of the resource. This will be concatenated with the mirror.
         - `filename`: The filename under which the file is saved as.
         - `md5`: The MD5 checksum of the respective file.
 
+    extract: dict[str, bool]
+        Decide whether to extract the data.
+
     experiment: Experiment
         The experiment definition.
 
@@ -70,6 +78,27 @@ class HBN(DatasetDefinition):
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
+    trial_columns: list[str]
+        The name of the trial columns in the input data frame. If the list is empty or None,
+        the input data frame is assumed to contain only one trial. If the list is not empty,
+        the input data frame is assumed to contain multiple trials and the transformation
+        methods will be applied to each trial separately.
+
+    time_column: str
+        The name of the timestamp column in the input data frame. This column will be renamed to
+        ``time``.
+
+    time_unit: str
+        The unit of the timestamps in the timestamp column in the input data frame. Supported
+        units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is
+        'step' the experiment definition must be specified. All timestamps will be converted to
+        milliseconds.
+
+    pixel_columns: list[str]
+        The name of the pixel position columns in the input data frame. These columns will be
+        nested into the column ``pixel``. If the list is empty or None, the nested ``pixel``
+        column will not be created.
+
     column_map: dict[str, str]
         The keys are the columns to read, the values are the names to which they should be renamed.
 

From 242aabe1e64205b51a3ea1e76ad02768a6e31e8c Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Fri, 27 Sep 2024 23:32:24 +0200
Subject: [PATCH 16/31] update docstring of gazebasevr dataset definition
 (#828)

---
 src/pymovements/datasets/gazebasevr.py | 36 +++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/src/pymovements/datasets/gazebasevr.py b/src/pymovements/datasets/gazebasevr.py
index bea0a4928..ef22e7c8e 100644
--- a/src/pymovements/datasets/gazebasevr.py
+++ b/src/pymovements/datasets/gazebasevr.py
@@ -56,15 +56,23 @@ class GazeBaseVR(DatasetDefinition):
     name: str
         The name of the dataset.
 
-    gaze_mirrors: dict[str, tuple[str, ...]]
+    has_files: dict[str, bool]
+        Indicate whether the dataset contains 'gaze', 'precomputed_events', and
+        'precomputed_reading_measures'.
+
+    mirrors: dict[str, tuple[str, ...]]
         A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
 
-    gaze_resources: dict[str, tuple[dict[str, str], ...]]
-        A tuple of dataset resources. Each list entry must be a dictionary with the following keys:
+    resources: dict[str, tuple[dict[str, str], ...]]
+        A tuple of dataset resources. Each list entry must be a dictionary with the following
+        keys:
         - `resource`: The url suffix of the resource. This will be concatenated with the mirror.
         - `filename`: The filename under which the file is saved as.
         - `md5`: The MD5 checksum of the respective file.
 
+    extract: dict[str, bool]
+        Decide whether to extract the data.
+
     experiment: Experiment
         The experiment definition.
 
@@ -76,12 +84,34 @@ class GazeBaseVR(DatasetDefinition):
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
+    trial_columns: list[str]
+        The name of the trial columns in the input data frame. If the list is empty or None,
+        the input data frame is assumed to contain only one trial. If the list is not empty,
+        the input data frame is assumed to contain multiple trials and the transformation
+        methods will be applied to each trial separately.
+
+    time_column: str
+        The name of the timestamp column in the input data frame. This column will be renamed to
+        ``time``.
+
+    time_unit: str
+        The unit of the timestamps in the timestamp column in the input data frame. Supported
+        units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is
+        'step' the experiment definition must be specified. All timestamps will be converted to
+        milliseconds.
+
+    position_columns: list[str]
+        The name of the dva position columns in the input data frame. These columns will be
+        nested into the column ``position``. If the list is empty or None, the nested
+        ``position`` column will not be created.
+
     column_map: dict[str, str]
         The keys are the columns to read, the values are the names to which they should be renamed.
 
     custom_read_kwargs: dict[str, dict[str, Any]]
         If specified, these keyword arguments will be passed to the file reading function.
 
+
     Examples
     --------
     Initialize your :py:class:`~pymovements.PublicDataset` object with the

From 02f701ff3781892d36cc13c44f6b86b25ab05a32 Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Fri, 27 Sep 2024 23:44:32 +0200
Subject: [PATCH 17/31] update docstring of gazebase dataset definition (#827)

---
 src/pymovements/datasets/gazebase.py | 32 +++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/src/pymovements/datasets/gazebase.py b/src/pymovements/datasets/gazebase.py
index 865c9d583..a9835b74a 100644
--- a/src/pymovements/datasets/gazebase.py
+++ b/src/pymovements/datasets/gazebase.py
@@ -55,15 +55,23 @@ class GazeBase(DatasetDefinition):
     name: str
         The name of the dataset.
 
+    has_files: dict[str, bool]
+        Indicate whether the dataset contains 'gaze', 'precomputed_events', and
+        'precomputed_reading_measures'.
+
     mirrors: dict[str, tuple[str, ...]]
         A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
 
     resources: dict[str, tuple[dict[str, str], ...]]
-        A tuple of dataset resources. Each list entry must be a dictionary with the following keys:
+        A tuple of dataset resources. Each list entry must be a dictionary with the following
+        keys:
         - `resource`: The url suffix of the resource. This will be concatenated with the mirror.
         - `filename`: The filename under which the file is saved as.
         - `md5`: The MD5 checksum of the respective file.
 
+    extract: dict[str, bool]
+        Decide whether to extract the data.
+
     experiment: Experiment
         The experiment definition.
 
@@ -75,12 +83,34 @@ class GazeBase(DatasetDefinition):
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
+    trial_columns: list[str]
+        The name of the trial columns in the input data frame. If the list is empty or None,
+        the input data frame is assumed to contain only one trial. If the list is not empty,
+        the input data frame is assumed to contain multiple trials and the transformation
+        methods will be applied to each trial separately.
+
+    time_column: str
+        The name of the timestamp column in the input data frame. This column will be renamed to
+        ``time``.
+
+    time_unit: str
+        The unit of the timestamps in the timestamp column in the input data frame. Supported
+        units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is
+        'step' the experiment definition must be specified. All timestamps will be converted to
+        milliseconds.
+
+    position_columns: list[str]
+        The name of the dva position columns in the input data frame. These columns will be
+        nested into the column ``position``. If the list is empty or None, the nested
+        ``position`` column will not be created.
+
     column_map: dict[str, str]
         The keys are the columns to read, the values are the names to which they should be renamed.
 
     custom_read_kwargs: dict[str, dict[str, Any]]
         If specified, these keyword arguments will be passed to the file reading function.
 
+
     Examples
     --------
     Initialize your :py:class:`~pymovements.PublicDataset` object with the

From fa3025dfb4df45f816d4f2b240b00fb35f0174e0 Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Fri, 27 Sep 2024 23:59:40 +0200
Subject: [PATCH 18/31] update docstring of gazes_on_faces dataset definition
 (#826)

---
 src/pymovements/datasets/gaze_on_faces.py | 47 ++++++++++++++++++-----
 1 file changed, 38 insertions(+), 9 deletions(-)

diff --git a/src/pymovements/datasets/gaze_on_faces.py b/src/pymovements/datasets/gaze_on_faces.py
index 45722a866..131549b87 100644
--- a/src/pymovements/datasets/gaze_on_faces.py
+++ b/src/pymovements/datasets/gaze_on_faces.py
@@ -48,33 +48,62 @@ class GazeOnFaces(DatasetDefinition):
 
     Attributes
     ----------
-    name : str
+    name: str
         The name of the dataset.
 
-    mirrors : dict[str, tuple[str, ...]]
+    has_files: dict[str, bool]
+        Indicate whether the dataset contains 'gaze', 'precomputed_events', and
+        'precomputed_reading_measures'.
+
+    mirrors: dict[str, tuple[str, ...]]
         A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
 
-    resources : dict[str, tuple[dict[str, str], ...]]
-        A tuple of dataset resources. Each list entry must be a dictionary with the following keys:
+    resources: dict[str, tuple[dict[str, str], ...]]
+        A tuple of dataset resources. Each list entry must be a dictionary with the following
+        keys:
         - `resource`: The url suffix of the resource. This will be concatenated with the mirror.
         - `filename`: The filename under which the file is saved as.
         - `md5`: The MD5 checksum of the respective file.
 
-    experiment : Experiment
+    extract: dict[str, bool]
+        Decide whether to extract the data.
+
+    experiment: Experiment
         The experiment definition.
 
-    filename_format : dict[str, str]
+    filename_format: dict[str, str]
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe.
 
-    filename_format_dtypes : dict[str, dict[str, type]]
+    filename_format_dtypes: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
-    column_map : dict[str, str]
+    trial_columns: list[str]
+        The name of the trial columns in the input data frame. If the list is empty or None,
+        the input data frame is assumed to contain only one trial. If the list is not empty,
+        the input data frame is assumed to contain multiple trials and the transformation
+        methods will be applied to each trial separately.
+
+    time_column: Any
+        The name of the timestamp column in the input data frame. This column will be renamed to
+        ``time``.
+
+    time_unit: Any
+        The unit of the timestamps in the timestamp column in the input data frame. Supported
+        units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is
+        'step' the experiment definition must be specified. All timestamps will be converted to
+        milliseconds.
+
+    pixel_columns: list[str]
+        The name of the pixel position columns in the input data frame. These columns will be
+        nested into the column ``pixel``. If the list is empty or None, the nested ``pixel``
+        column will not be created.
+
+    column_map: dict[str, str]
         The keys are the columns to read, the values are the names to which they should be renamed.
 
-    custom_read_kwargs : dict[str, dict[str, Any]]
+    custom_read_kwargs: dict[str, dict[str, Any]]
         If specified, these keyword arguments will be passed to the file reading function.
 
     Examples

From 6d22c5ae81b4295cc6bdd8d4b49576ee2cc41a93 Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Sat, 28 Sep 2024 09:17:54 +0200
Subject: [PATCH 19/31] update docstring of gaze_graph dataset definition
 (#825)

---
 src/pymovements/datasets/gaze_graph.py | 36 +++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/src/pymovements/datasets/gaze_graph.py b/src/pymovements/datasets/gaze_graph.py
index 995363f7c..e8b146da0 100644
--- a/src/pymovements/datasets/gaze_graph.py
+++ b/src/pymovements/datasets/gaze_graph.py
@@ -53,6 +53,10 @@ class GazeGraph(DatasetDefinition):
     name: str
         The name of the dataset.
 
+    has_files: dict[str, bool]
+        Indicate whether the dataset contains 'gaze', 'precomputed_events', and
+        'precomputed_reading_measures'.
+
     mirrors: dict[str, tuple[str, ...]]
         A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
 
@@ -66,18 +70,42 @@ class GazeGraph(DatasetDefinition):
     experiment: Experiment
         The experiment definition.
 
+    extract: dict[str, bool]
+        Decide whether to extract the data.
+
     filename_format: dict[str, str]
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe.
 
-    filename_format_dtypes: dict[str, Any]
+    filename_format_dtypes: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
+    trial_columns: list[str]
+        The name of the trial columns in the input data frame. If the list is empty or None,
+        the input data frame is assumed to contain only one trial. If the list is not empty,
+        the input data frame is assumed to contain multiple trials and the transformation
+        methods will be applied to each trial separately.
+
+    time_column: Any
+        The name of the timestamp column in the input data frame. This column will be renamed to
+        ``time``.
+
+    time_unit: Any
+        The unit of the timestamps in the timestamp column in the input data frame. Supported
+        units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is
+        'step' the experiment definition must be specified. All timestamps will be converted to
+        milliseconds.
+
+    pixel_columns: list[str]
+        The name of the pixel position columns in the input data frame. These columns will be
+        nested into the column ``pixel``. If the list is empty or None, the nested ``pixel``
+        column will not be created.
+
     column_map: dict[str, str]
         The keys are the columns to read, the values are the names to which they should be renamed.
 
-    custom_read_kwargs: dict[str, Any]
+    custom_read_kwargs: dict[str, dict[str, Any]]
         If specified, these keyword arguments will be passed to the file reading function.
 
     Examples
@@ -117,8 +145,6 @@ class GazeGraph(DatasetDefinition):
         },
     )
 
-    extract: dict[str, bool] = field(default_factory=lambda: {'gaze': True})
-
     resources: dict[str, tuple[dict[str, str], ...]] = field(
         default_factory=lambda: {
             'gaze': (
@@ -142,6 +168,8 @@ class GazeGraph(DatasetDefinition):
         sampling_rate=30,
     )
 
+    extract: dict[str, bool] = field(default_factory=lambda: {'gaze': True})
+
     filename_format: dict[str, str] = field(
         default_factory=lambda: {
             'gaze': r'P{subject_id}_{task}.csv',

From d31b236e9e845ddb1fed32ad7af6c3001d6ab320 Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Sat, 28 Sep 2024 11:47:19 +0200
Subject: [PATCH 20/31] update docstring of emtec dataset definition (#823)

---
 src/pymovements/datasets/emtec.py | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/src/pymovements/datasets/emtec.py b/src/pymovements/datasets/emtec.py
index a97a8a129..69432149c 100644
--- a/src/pymovements/datasets/emtec.py
+++ b/src/pymovements/datasets/emtec.py
@@ -47,6 +47,10 @@ class EMTeC(DatasetDefinition):
     name: str
         The name of the dataset.
 
+    has_files: dict[str, bool]
+        Indicate whether the dataset contains 'gaze', 'precomputed_events', and
+        'precomputed_reading_measures'.
+
     mirrors: dict[str, tuple[str, ...]]
         A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
 
@@ -57,6 +61,9 @@ class EMTeC(DatasetDefinition):
         - `filename`: The filename under which the file is saved as.
         - `md5`: The MD5 checksum of the respective file.
 
+    extract: dict[str, bool]
+        Decide whether to extract the data.
+
     experiment: Experiment
         The experiment definition.
 
@@ -68,10 +75,28 @@ class EMTeC(DatasetDefinition):
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
-    column_map: dict[str, str]
-        The keys are the columns to read, the values are the names to which they should be renamed.
+    trial_columns: list[str]
+        The name of the trial columns in the input data frame. If the list is empty or None,
+        the input data frame is assumed to contain only one trial. If the list is not empty,
+        the input data frame is assumed to contain multiple trials and the transformation
+        methods will be applied to each trial separately.
+
+    time_column: str
+        The name of the timestamp column in the input data frame. This column will be renamed to
+        ``time``.
+
+    time_unit: str
+        The unit of the timestamps in the timestamp column in the input data frame. Supported
+        units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is
+        'step' the experiment definition must be specified. All timestamps will be converted to
+        milliseconds.
+
+    pixel_columns: list[str]
+        The name of the pixel position columns in the input data frame. These columns will be
+        nested into the column ``pixel``. If the list is empty or None, the nested ``pixel``
+        column will not be created.
 
-    custom_read_kwargs: dict[str, dict[str, dict[str, Any]]]
+    custom_read_kwargs: dict[str, dict[str, Any]]
         If specified, these keyword arguments will be passed to the file reading function.
 
     Examples

From eba70205af5a79246d9ca05d9ac41cc2c486de3a Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Sat, 28 Sep 2024 16:54:05 +0200
Subject: [PATCH 21/31] update docstring of didec dataset definition (#822)

---
 .../dataset/.dataset_definition.py.swp        | Bin 0 -> 16384 bytes
 src/pymovements/datasets/didec.py             |  32 ++++++++++++++++--
 2 files changed, 30 insertions(+), 2 deletions(-)
 create mode 100644 src/pymovements/dataset/.dataset_definition.py.swp

diff --git a/src/pymovements/dataset/.dataset_definition.py.swp b/src/pymovements/dataset/.dataset_definition.py.swp
new file mode 100644
index 0000000000000000000000000000000000000000..2795a03c563f2cbd1a91215d4de1074262011281
GIT binary patch
literal 16384
zcmeHO&5t8T74KvdLIOz;IUynPGD|eOlE<^zB?l};!87hzx8kvl?b*pFtn6}kdE899
zyXoql87CwlBse5oxbh((B*cM3NW>pNB*fPZ2^MkT%rTdUBm7=<x7*|W$ZSADpl<DN
z+Fc*7-uu<7S5;lT<106;UHW46IRn?H4C4>CKA61sqt6=OdD<|P`?X{&Ipf}!y<wlL
z+#9LDO;3zBg`CR`{t?b+!Lgi4KUSM6vd=s_A{I)<o7sKn$cgK_u^af+aDH!{%G{EH
zl7UqQ9yhLCeeMdUUR-;Ip8E9OYJu{(WT0fAWT0fAWT0fAWT0fAWT0f=|CoU|ykL9-
z_C1){cs;v6uyX%rc5P?r&6WHM+4ED`{jHV!?6JI+43rF%43rF%43rF%43rF%43rF%
z43rF%43rG~7Z?zZVeF&bw|K*k^Z$JR|C=Wb<JUj}JPX`<!Z6+hwtz=~hk;){ZW!MJ
zz6o3d{{BhB_&)Gu-~r&BPZ-8efo}t2KmacS&j61Bj{>(IGmJk2e*%6F`~vWRBj78*
zF7P7oB=G*DhVf(I6zBr)f7~$M1HKDf0)G1l^Z}Z{XMlfv3^o8i16~96fe#)wjGq8U
zz!SjR4;jYmz~_M9;U&gvz|+7V@j~Q#z!=y9{sw;D2AHQenXCW&MY!U~72ePeykzF9
z8hNgYUr}*%lfF*vz?TM>;p-AUg%F7s*J$EO&&eN$6Jf_eG=ITy?fCL#Lzl&$!Bo<b
zoGTKMq!<rb3ls+wN#Rs<;<4}&S(Nd`3{NO=GIi}Kmza~928rj;SfU!|J5}9ADsDIv
zA=QZ6W~D2{nlE(7bv&oaeJNpb3=JkhM6&>jJ2Doorzr4g8l13ZiDGkH*7-2uZ%9&g
z*z6P6^H|SNC?y>(Tnp#5N#fhJ(b6|3)g*?<YA7P*`iJ^1hxj(dbN!=HA?<ASz(esW
z6F0Qg@u*7HL=!l1JxTI3mcGI_D&(qBRVaI;^FaSe5IGA?)q<G2Ri;surQf;sihT=V
zV>uPaE*J|~r6p#-UMPJ>v&SB=qN^?riqkJ{bgnAQhl(ncwhL!VeFePVX3&aCO!=ZA
z<_i20^RK_(VVBfO+lh*U*-(>tbb@|_+ZP^G{Hpu&n}suF+SsI>kPf3D3AL{Zba)iI
zc7iU_*2MErR_}Mbz}_>0dDw)JRPZozeQnl=3FJaU_~%T-BOXKGdnS(Hvu;dbpp-j?
zLx2Zcs90OOQfcV0dUvsyvzAO2VJKnKa`Ub~2}at=Nd!&q+svLk)I7;kHctC&I@a^S
z1H0@gM$cHnNLUihxf5ZK7YK~0%#Etl=1`Dz0@uBht;^kdCNMPMSdK)Lkq{WMbPS;g
zflN2R9+^5~mbxrw90XqB6mTh24Q1qVD1gH+z7|{Re-5K*U@VH($SkV;dy#g7Ml<L6
zQB8;3-Ny3-{_G=_%+dkLag%*Fc8}8`Sd1~$%)KM4o<CCJJ0NBehSRYNUsK{3og<b8
zDukecAc<_52}vT4R5F>kr%M{>0#)b;A$echfo}_hF2F%ztRp_n+$aj7Ld!bENeHJ#
zyvSok-k_?IDKJs+q($g}XihT>ZVtji4{5|ify9OEN#J?G340XJO*QQ_3t`LMtebqL
zhz0CE6{=RNOT*(nr<Wx=QWFsv(;%Lil^Ju^4laUGNh4@5BH=r*Gb2mT`sRAdQzr0s
z3s7rS?RAViJD7zDSl2-ibPQ)ws02)8@(7pZ!+dm2F-+j>c@LoLFf;m!)(^$2a=k)l
zq{?F$p^yrFHOEt-hU{Bs;?C_uiG$m1q!cu-SJ%^V#N}!{*K6S$i%UP&V=QULuud8J
zd2Ma2u^5*;*7XOS!gF6`FX1WL_Aq1DMsF-n(v5Ul!i21IDSIv?)r>DO89Em$vq==p
zXjuUBQY{8Nxw9~c;-z7q)j{vFKW5pOb5c<Y*&N!_s(RgZ*W5*gOzv<vNn&*AaG2K@
zz8}Pb=R##LDXE2pMOgAUi6^VbGcHoUvoqMQ_slAOclHL<*`ZF4tTt_T+HG@tV0GHG
zZw;C>Xqt<{T!f$Z`X>ME;CGK2X1!&#ulKn!5+JW-)wiscH8`lye$&*2kXOgQR4&>=
zD}x>$D>Ufz(0V3p-|Cwcs`souB+^c=vs;0@`re?~>GiqPcBgyLv#vJ>)a<kxCK9(y
z-CoP29pu;B2ejR)Te}r%)OYLGO<hYX3PC&8pl$a1X-VqVdjo5GuT}3+cdys&z^Hn=
zLG4c4YVY)*fVpe72Uv@>kw)eXJW#(`Z?!Trt=(?RG8+{X*lz7LSmP})+3pOW(XKTx
z8!%4u1)_WP_P}Coc4*h^Z8!0llLW<>)VxQ?>QlGZxnVURV66^b`)eQ)eZl>di8^3G
zZ0t)`8yc7|cYADDr)OM*I6DYkI2IOdDjBO-_%SvCSU+GL1MkR(U_Y3|SRYAt0iNl^
z7~f5E-Ff65PGf>Qxi*p!kn}<X^We-)C-p2#O+~sUc75AR9E5o+CXl3+m?V+!DxT~y
zD4l>*fc1Ma{wn4wPSJDyB3>ilIQlzmMD0TVG=AoK>UhIdwIWy#$w;v|NI98ec(TO2
zESAcUgB&dAE83<C`@cJx<1V8^B!<Zt6U(%MnFOImhZL=#2cvzi1;aO)Ldgc@%cMmE
zJm~~U%+oJw>osTvg_s8(@Oif9&fIiVYW*~`teT!1c*e~-LDP&l71*wxFP*^F`wu;!
z)73LwVhh)D4c0DWfe2=$VD-91=Rhi!J#sw9Di&SO3+zdR%@3Nir&tfeG}IIRJY$f8
zF$a6rT4<_t_iZoy;F5iru3ou%Z3BP4pgm<N5K=b^beGl>@DVA4_y51bUj6gfzw`b-
zKfnJA_WVBtegGT*F9R2V2Z497_kRb70R=RHXMu-+ze3K>f!BeT0G4|ZD6f)%l7W(e
zl7W(el7W(el7W(el7W(e{{;pJ7w<ZSZ_Dg#PQUNU4&~^D;%rXXw)AAAw}E#*##^p&
zEIn)@<N5hY<4{5GyU$tLJ(c%3O<YzZ#yj(2g8y8-V%hpkt?L(ex<Go)w^C$rF!>>e
zmw9tqTybK!{F)Sd^P4Dkk2B5mE#iG08l6XA{;EAY%sI!o%tw&=Q7paNp~r_b&n{cM
N?!3eK2I}j6`xlPYB{cv5

literal 0
HcmV?d00001

diff --git a/src/pymovements/datasets/didec.py b/src/pymovements/datasets/didec.py
index b9ead4c65..c27faddc6 100644
--- a/src/pymovements/datasets/didec.py
+++ b/src/pymovements/datasets/didec.py
@@ -47,6 +47,10 @@ class DIDEC(DatasetDefinition):
     name: str
         The name of the dataset.
 
+    has_files: dict[str, bool]
+        Indicate whether the dataset contains 'gaze', 'precomputed_events', and
+        'precomputed_reading_measures'.
+
     mirrors: dict[str, tuple[str, ...]]
         A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
 
@@ -59,6 +63,9 @@ class DIDEC(DatasetDefinition):
     experiment: Experiment
         The experiment definition.
 
+    extract: dict[str, bool]
+        Decide whether to extract the data.
+
     filename_format: dict[str, str]
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe.
@@ -67,6 +74,27 @@ class DIDEC(DatasetDefinition):
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
+    trial_columns: list[str]
+        The name of the trial columns in the input data frame. If the list is empty or None,
+        the input data frame is assumed to contain only one trial. If the list is not empty,
+        the input data frame is assumed to contain multiple trials and the transformation
+        methods will be applied to each trial separately.
+
+    time_column: str
+        The name of the timestamp column in the input data frame. This column will be renamed to
+        ``time``.
+
+    time_unit: str
+        The unit of the timestamps in the timestamp column in the input data frame. Supported
+        units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is
+        'step' the experiment definition must be specified. All timestamps will be converted to
+        milliseconds.
+
+    pixel_columns: list[str]
+        The name of the pixel position columns in the input data frame. These columns will be
+        nested into the column ``pixel``. If the list is empty or None, the nested ``pixel``
+        column will not be created.
+
     column_map: dict[str, str]
         The keys are the columns to read, the values are the names to which they should be renamed.
 
@@ -122,8 +150,6 @@ class DIDEC(DatasetDefinition):
         },
     )
 
-    extract: dict[str, bool] = field(default_factory=lambda: {'gaze': True})
-
     experiment: Experiment = Experiment(
         screen_width_px=1680,
         screen_height_px=1050,
@@ -134,6 +160,8 @@ class DIDEC(DatasetDefinition):
         sampling_rate=1000,
     )
 
+    extract: dict[str, bool] = field(default_factory=lambda: {'gaze': True})
+
     filename_format: dict[str, str] = field(
         default_factory=lambda: {
             'gaze':

From cc67adb1ee992455781855181bf4ed87ac27288e Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Sat, 28 Sep 2024 18:25:17 +0200
Subject: [PATCH 22/31] update transforms tests to account for adding single
 line (#815)

---
 tests/unit/gaze/transforms/deg2pix_test.py | 7 ++++---
 tests/unit/gaze/transforms/pix2deg_test.py | 7 ++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/tests/unit/gaze/transforms/deg2pix_test.py b/tests/unit/gaze/transforms/deg2pix_test.py
index 465fcd214..aed92830f 100644
--- a/tests/unit/gaze/transforms/deg2pix_test.py
+++ b/tests/unit/gaze/transforms/deg2pix_test.py
@@ -508,9 +508,10 @@ def test_deg2pix_returns(kwargs, series, expected_df, distance_as_column):
         # unit of distance values has to be in mm when passing as column
         distance_value = kwargs['distance'] * 10
 
-        df = df.with_columns(
-            pl.Series('distance', [distance_value], pl.Float64),
-        )
+        try:
+            df = df.with_columns(pl.Series('distance', [distance_value], pl.Float64))
+        except pl.exceptions.InvalidOperationError:
+            df = df.with_columns(distance=distance_value)
 
         kwargs['distance'] = 'distance'
 
diff --git a/tests/unit/gaze/transforms/pix2deg_test.py b/tests/unit/gaze/transforms/pix2deg_test.py
index 7466a6568..afff2eeed 100644
--- a/tests/unit/gaze/transforms/pix2deg_test.py
+++ b/tests/unit/gaze/transforms/pix2deg_test.py
@@ -518,9 +518,10 @@ def test_pix2deg_returns(kwargs, series, expected_df, distance_as_column):
         # unit of distance values has to be in mm when passing as column
         distance_value = kwargs['distance'] * 10
 
-        df = df.with_columns(
-            pl.Series('distance', [distance_value], pl.Float64),
-        )
+        try:
+            df = df.with_columns(pl.Series('distance', [distance_value], pl.Float64))
+        except pl.exceptions.InvalidOperationError:
+            df = df.with_columns(distance=distance_value)
 
         kwargs['distance'] = 'distance'
 

From 4792904ac737d03c32cdca9d4aa96077d60317ea Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Sat, 28 Sep 2024 18:37:53 +0200
Subject: [PATCH 23/31] update binocular example, space raises converting
 problems for polars1+ (#814)

---
 tests/files/binocular_example.csv | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/tests/files/binocular_example.csv b/tests/files/binocular_example.csv
index a1f3c95f6..feaec595a 100644
--- a/tests/files/binocular_example.csv
+++ b/tests/files/binocular_example.csv
@@ -1,11 +1,11 @@
 time,x_left_pix,y_left_pix,x_right_pix,y_right_pix,x_left_pos,y_left_pos,x_right_pos,y_right_pos
-0,0,0,0,0,-23.104783, -13.489493,-23.104783, -13.489493
-1,0,0,0,0,-23.104783, -13.489493,-23.104783, -13.489493
-2,0,0,0,0,-23.104783, -13.489493,-23.104783, -13.489493
-3,0,0,0,0,-23.104783, -13.489493,-23.104783, -13.489493
-4,0,0,0,0,-23.104783, -13.489493,-23.104783, -13.489493
-5,0,0,0,0,-23.104783, -13.489493,-23.104783, -13.489493
-6,0,0,0,0,-23.104783, -13.489493,-23.104783, -13.489493
-7,0,0,0,0,-23.104783, -13.489493,-23.104783, -13.489493
-8,0,0,0,0,-23.104783, -13.489493,-23.104783, -13.489493
-9,0,0,0,0,-23.104783, -13.489493,-23.104783, -13.489493
+0,0,0,0,0,-23.104783,-13.489493,-23.104783,-13.489493
+1,0,0,0,0,-23.104783,-13.489493,-23.104783,-13.489493
+2,0,0,0,0,-23.104783,-13.489493,-23.104783,-13.489493
+3,0,0,0,0,-23.104783,-13.489493,-23.104783,-13.489493
+4,0,0,0,0,-23.104783,-13.489493,-23.104783,-13.489493
+5,0,0,0,0,-23.104783,-13.489493,-23.104783,-13.489493
+6,0,0,0,0,-23.104783,-13.489493,-23.104783,-13.489493
+7,0,0,0,0,-23.104783,-13.489493,-23.104783,-13.489493
+8,0,0,0,0,-23.104783,-13.489493,-23.104783,-13.489493
+9,0,0,0,0,-23.104783,-13.489493,-23.104783,-13.489493

From ba0773848ba871ab19a04b3353989555f16c5733 Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Sat, 28 Sep 2024 18:53:34 +0200
Subject: [PATCH 24/31] missed ColumnNotFoundError (#816)

---
 src/pymovements/gaze/gaze_dataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pymovements/gaze/gaze_dataframe.py b/src/pymovements/gaze/gaze_dataframe.py
index 5a74b68d4..aec927a58 100644
--- a/src/pymovements/gaze/gaze_dataframe.py
+++ b/src/pymovements/gaze/gaze_dataframe.py
@@ -703,7 +703,7 @@ def detect(
                 if trial_column not in self.events.frame.columns
             ]
             if missing_trial_columns:
-                raise pl.ColumnNotFoundError(
+                raise pl.exceptions.ColumnNotFoundError(
                     f'trial columns {missing_trial_columns} missing from events, '
                     f'available columns: {self.events.frame.columns}',
                 )

From 81b8518335f5b815ffe76b800ff29e0f41976326 Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Sat, 28 Sep 2024 19:28:09 +0200
Subject: [PATCH 25/31] upgrade to polars 1+ (#809)

* dtypes now called schema_overrides

* preview window updated and arctan2d=>arctan2

* polars improved inference

* upgrade to polars 1+
---
 docs/source/tutorials/local-dataset.ipynb     |  4 +-
 pyproject.toml                                |  2 +-
 src/pymovements/dataset/dataset_definition.py |  8 ++--
 src/pymovements/dataset/dataset_files.py      | 31 +++++++-------
 src/pymovements/datasets/copco.py             |  6 ++-
 src/pymovements/datasets/didec.py             |  4 +-
 src/pymovements/datasets/emtec.py             |  6 +--
 src/pymovements/datasets/fakenews.py          |  4 +-
 src/pymovements/datasets/gaze_graph.py        |  6 +--
 src/pymovements/datasets/gaze_on_faces.py     |  6 +--
 src/pymovements/datasets/gazebase.py          |  6 +--
 src/pymovements/datasets/gazebasevr.py        |  6 +--
 src/pymovements/datasets/hbn.py               |  6 +--
 src/pymovements/datasets/judo1000.py          |  6 +--
 src/pymovements/datasets/potec.py             |  6 +--
 src/pymovements/datasets/sb_sat.py            |  6 +--
 src/pymovements/datasets/toy_dataset.py       |  6 +--
 .../datasets/toy_dataset_eyelink.py           |  4 +-
 src/pymovements/gaze/gaze_dataframe.py        |  2 +-
 src/pymovements/gaze/integration.py           |  6 +++
 src/pymovements/gaze/io.py                    | 42 +++++++++++--------
 src/pymovements/gaze/transforms.py            |  4 +-
 src/pymovements/utils/parsing.py              |  5 +--
 tests/functional/dataset_processing_test.py   | 28 ++++++-------
 tests/unit/dataset/dataset_download_test.py   |  2 +-
 tests/unit/dataset/dataset_files_test.py      |  2 +-
 tests/unit/dataset/dataset_test.py            | 15 ++++---
 tests/unit/datasets/datasets_test.py          |  6 +--
 tests/unit/events/frame_test.py               |  2 +-
 tests/unit/gaze/io/csv_test.py                | 10 ++---
 30 files changed, 132 insertions(+), 115 deletions(-)

diff --git a/docs/source/tutorials/local-dataset.ipynb b/docs/source/tutorials/local-dataset.ipynb
index dbbed32d3..f48f133ac 100644
--- a/docs/source/tutorials/local-dataset.ipynb
+++ b/docs/source/tutorials/local-dataset.ipynb
@@ -142,7 +142,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "filename_format_dtypes = {'gaze': {\n",
+    "filename_format_schema_overrides = {'gaze': {\n",
     "    'text_id': int,\n",
     "    'page_id': int,\n",
     "},\n",
@@ -254,7 +254,7 @@
     "    has_files={'gaze': True, 'precomputed_events': False, 'precomputed_reading_measures': False},\n",
     "    experiment=experiment,\n",
     "    filename_format=filename_format,\n",
-    "    filename_format_dtypes=filename_format_dtypes,\n",
+    "    filename_format_schema_overrides=filename_format_schema_overrides,\n",
     "    custom_read_kwargs=custom_read_kwargs,\n",
     "    time_column=time_column,\n",
     "    time_unit=time_unit,\n",
diff --git a/pyproject.toml b/pyproject.toml
index ab4511458..075d01c49 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,7 +35,7 @@ dependencies = [
   "matplotlib>=3.8.0,<3.10",
   "numpy>=1.22.4,<3",
   "pandas>=2.1.4,<3",
-  "polars>=0.20.1,<0.20.3",
+  "polars>=1.8.2,<2",
   "pyarrow>=11.0.0,<18",
   "pyopenssl>=16.0.0,<25.0.0",
   "scipy>=1.8.0,<2",
diff --git a/src/pymovements/dataset/dataset_definition.py b/src/pymovements/dataset/dataset_definition.py
index eaff6e13b..e8737b933 100644
--- a/src/pymovements/dataset/dataset_definition.py
+++ b/src/pymovements/dataset/dataset_definition.py
@@ -54,7 +54,7 @@ class DatasetDefinition:
     filename_format: dict[str, str]
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe. (default: field(default_factory=dict))
-    filename_format_dtypes: dict[str, dict[str, type]]
+    filename_format_schema_overrides: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype. (default: field(default_factory=dict))
     custom_read_kwargs: dict[str, dict[str, Any]]
@@ -122,10 +122,10 @@ class DatasetDefinition:
     3. Specifying column datatypes
     ``polars.read_csv`` infers data types from a fixed number of rows, which might not be accurate
     for the entire dataset. To ensure correct data types, you can pass a dictionary to the
-    ``dtypes`` keyword argument in ``gaze_custom_read_kwargs``.
+    ``schema_overrides`` keyword argument in ``gaze_custom_read_kwargs``.
     Use data types from the `polars` library.
     For instance:
-    ``gaze_custom_read_kwargs={'dtypes': {'col1': polars.Int64, 'col2': polars.Float64}}``
+    ``gaze_custom_read_kwargs={'schema_overrides': {'col1': polars.Int64, 'col2': polars.Float64}}``
     """
 
     # pylint: disable=too-many-instance-attributes
@@ -141,7 +141,7 @@ class DatasetDefinition:
 
     filename_format: dict[str, str] = field(default_factory=dict)
 
-    filename_format_dtypes: dict[str, dict[str, type]] = field(default_factory=dict)
+    filename_format_schema_overrides: dict[str, dict[str, type]] = field(default_factory=dict)
 
     custom_read_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
 
diff --git a/src/pymovements/dataset/dataset_files.py b/src/pymovements/dataset/dataset_files.py
index 1e5be0008..dfc4c40eb 100644
--- a/src/pymovements/dataset/dataset_files.py
+++ b/src/pymovements/dataset/dataset_files.py
@@ -74,8 +74,8 @@ def scan_dataset(definition: DatasetDefinition, paths: DatasetPaths) -> pl.DataF
 
         fileinfo_df = pl.from_dicts(data=fileinfo_dicts, infer_schema_length=1)
         fileinfo_df = fileinfo_df.sort(by='filepath')
-        if definition.filename_format_dtypes['gaze']:
-            items = definition.filename_format_dtypes['gaze'].items()
+        if definition.filename_format_schema_overrides['gaze']:
+            items = definition.filename_format_schema_overrides['gaze'].items()
             fileinfo_df = fileinfo_df.with_columns([
                 pl.col(fileinfo_key).cast(fileinfo_dtype)
                 for fileinfo_key, fileinfo_dtype in items
@@ -92,31 +92,33 @@ def scan_dataset(definition: DatasetDefinition, paths: DatasetPaths) -> pl.DataF
             raise RuntimeError(f'no matching files found in {paths.precomputed_events}')
         fileinfo_df = pl.from_dicts(data=fileinfo_dicts, infer_schema_length=1)
         fileinfo_df = fileinfo_df.sort(by='filepath')
-        if definition.filename_format_dtypes['precomputed_events']:
-            items = definition.filename_format_dtypes['precomputed_events'].items()
+        if definition.filename_format_schema_overrides['precomputed_events']:
+            items = definition.filename_format_schema_overrides['precomputed_events'].items()
             fileinfo_df = fileinfo_df.with_columns([
                 pl.col(fileinfo_key).cast(fileinfo_dtype)
                 for fileinfo_key, fileinfo_dtype in items
             ])
         _fileinfo_dicts['precomputed_events'] = fileinfo_df
 
-    if definition.has_files['precomputed_reading_measures']:
+    pc_rm = 'precomputed_reading_measures'
+    if definition.has_files[pc_rm]:
         fileinfo_dicts = match_filepaths(
             path=paths.precomputed_reading_measures,
-            regex=curly_to_regex(definition.filename_format['precomputed_reading_measures']),
+            regex=curly_to_regex(definition.filename_format[pc_rm]),
             relative=True,
         )
         if not fileinfo_dicts:
             raise RuntimeError(f'no matching files found in {paths.precomputed_reading_measures}')
         fileinfo_df = pl.from_dicts(data=fileinfo_dicts, infer_schema_length=1)
         fileinfo_df = fileinfo_df.sort(by='filepath')
-        if definition.filename_format_dtypes['precomputed_reading_measures']:
-            items = definition.filename_format_dtypes['precomputed_reading_measures'].items()
+        if definition.filename_format_schema_overrides[pc_rm]:
+            _schema_overrides = definition.filename_format_schema_overrides[pc_rm]
+            items = _schema_overrides.items()
             fileinfo_df = fileinfo_df.with_columns([
                 pl.col(fileinfo_key).cast(fileinfo_dtype)
                 for fileinfo_key, fileinfo_dtype in items
             ])
-        _fileinfo_dicts['precomputed_reading_measures'] = fileinfo_df
+        _fileinfo_dicts[pc_rm] = fileinfo_df
 
     return _fileinfo_dicts
 
@@ -316,7 +318,7 @@ def load_gaze_file(
                 trial_columns=definition.trial_columns,
                 time_unit=time_unit,
                 add_columns=add_columns,
-                column_dtypes=definition.filename_format_dtypes['gaze'],
+                column_schema_overrides=definition.filename_format_schema_overrides['gaze'],
             )
 
             # suffixes as ordered after using GazeDataFrame.unnest()
@@ -364,7 +366,7 @@ def load_gaze_file(
                 trial_columns=definition.trial_columns,
                 column_map=definition.column_map,
                 add_columns=add_columns,
-                column_dtypes=definition.filename_format_dtypes['gaze'],
+                column_schema_overrides=definition.filename_format_schema_overrides['gaze'],
                 **custom_read_kwargs,
             )
     elif filepath.suffix == '.feather':
@@ -372,14 +374,14 @@ def load_gaze_file(
             filepath,
             experiment=definition.experiment,
             add_columns=add_columns,
-            column_dtypes=definition.filename_format_dtypes['gaze'],
+            column_schema_overrides=definition.filename_format_schema_overrides['gaze'],
         )
     elif filepath.suffix == '.asc':
         gaze_df = from_asc(
             filepath,
             experiment=definition.experiment,
             add_columns=add_columns,
-            column_dtypes=definition.filename_format_dtypes['gaze'],
+            column_schema_overrides=definition.filename_format_schema_overrides['gaze'],
             **custom_read_kwargs,
         )
     else:
@@ -556,9 +558,10 @@ def add_fileinfo(
     )
 
     # Cast columns from fileinfo according to specification.
+    _schema_overrides = definition.filename_format_schema_overrides['gaze']
     df = df.with_columns([
         pl.col(fileinfo_key).cast(fileinfo_dtype)
-        for fileinfo_key, fileinfo_dtype in definition.filename_format_dtypes['gaze'].items()
+        for fileinfo_key, fileinfo_dtype in _schema_overrides.items()
     ])
     return df
 
diff --git a/src/pymovements/datasets/copco.py b/src/pymovements/datasets/copco.py
index b6603c644..52e3400aa 100644
--- a/src/pymovements/datasets/copco.py
+++ b/src/pymovements/datasets/copco.py
@@ -74,14 +74,16 @@ class CopCo(DatasetDefinition):
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe.
 
-    filename_format_dtypes: dict[str, dict[str, type]]
+    filename_format_schema_overrides: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
+
     trial_columns: list[str]
             The name of the trial columns in the input data frame. If the list is empty or None,
             the input data frame is assumed to contain only one trial. If the list is not empty,
             the input data frame is assumed to contain multiple trials and the transformation
             methods will be applied to each trial separately.
+
     time_column: str
         The name of the timestamp column in the input data frame. This column will be renamed to
         ``time``.
@@ -184,7 +186,7 @@ class CopCo(DatasetDefinition):
         },
     )
 
-    filename_format_dtypes: dict[str, dict[str, type]] = field(
+    filename_format_schema_overrides: dict[str, dict[str, type]] = field(
         default_factory=lambda: {
             'precomputed_events': {},
             'precomputed_reading_measures': {},
diff --git a/src/pymovements/datasets/didec.py b/src/pymovements/datasets/didec.py
index c27faddc6..496e3e7e6 100644
--- a/src/pymovements/datasets/didec.py
+++ b/src/pymovements/datasets/didec.py
@@ -70,7 +70,7 @@ class DIDEC(DatasetDefinition):
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe.
 
-    filename_format_dtypes: dict[str, dict[str, type]]
+    filename_format_schema_overrides: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
@@ -174,7 +174,7 @@ class DIDEC(DatasetDefinition):
         },
     )
 
-    filename_format_dtypes: dict[str, dict[str, type]] = field(
+    filename_format_schema_overrides: dict[str, dict[str, type]] = field(
         default_factory=lambda: {
             'gaze': {
                 'experiment': int,
diff --git a/src/pymovements/datasets/emtec.py b/src/pymovements/datasets/emtec.py
index 69432149c..1efe552a1 100644
--- a/src/pymovements/datasets/emtec.py
+++ b/src/pymovements/datasets/emtec.py
@@ -71,7 +71,7 @@ class EMTeC(DatasetDefinition):
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe.
 
-    filename_format_dtypes: dict[str, dict[str, type]]
+    filename_format_schema_overrides: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
@@ -183,7 +183,7 @@ class EMTeC(DatasetDefinition):
             },
     )
 
-    filename_format_dtypes: dict[str, dict[str, type]] = field(
+    filename_format_schema_overrides: dict[str, dict[str, type]] = field(
         default_factory=lambda:
             {
                 'gaze': {'subject_id': int},
@@ -219,7 +219,7 @@ class EMTeC(DatasetDefinition):
                     'y',
                     'pupil_right',
                 ],
-                'dtypes': {
+                'schema_overrides': {
                     'item_id': pl.Utf8,
                     'TRIAL_ID': pl.Int64,
                     'Trial_Index_': pl.Int64,
diff --git a/src/pymovements/datasets/fakenews.py b/src/pymovements/datasets/fakenews.py
index f964fd8f2..64c2cf9f9 100644
--- a/src/pymovements/datasets/fakenews.py
+++ b/src/pymovements/datasets/fakenews.py
@@ -64,7 +64,7 @@ class FakeNewsPerception(DatasetDefinition):
     filename_format: dict[str, str]
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe.
-    filename_format_dtypes: dict[str, dict[str, type]]
+    filename_format_schema_overrides: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
     trial_columns: list[str]
@@ -136,7 +136,7 @@ class FakeNewsPerception(DatasetDefinition):
         },
     )
 
-    filename_format_dtypes: dict[str, dict[str, type]] = field(
+    filename_format_schema_overrides: dict[str, dict[str, type]] = field(
         default_factory=lambda: {
             'precomputed_events': {'subject_id': int, 'session_id': int, 'truth_value': str},
         },
diff --git a/src/pymovements/datasets/gaze_graph.py b/src/pymovements/datasets/gaze_graph.py
index e8b146da0..530fc046d 100644
--- a/src/pymovements/datasets/gaze_graph.py
+++ b/src/pymovements/datasets/gaze_graph.py
@@ -77,7 +77,7 @@ class GazeGraph(DatasetDefinition):
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe.
 
-    filename_format_dtypes: dict[str, dict[str, type]]
+    filename_format_schema_overrides: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
@@ -176,7 +176,7 @@ class GazeGraph(DatasetDefinition):
         },
     )
 
-    filename_format_dtypes: dict[str, dict[str, type]] = field(
+    filename_format_schema_overrides: dict[str, dict[str, type]] = field(
         default_factory=lambda: {
             'gaze': {
                 'subject_id': int,
@@ -201,7 +201,7 @@ class GazeGraph(DatasetDefinition):
                 'separator': ',',
                 'has_header': False,
                 'new_columns': ['x', 'y'],
-                'dtypes': [pl.Float32, pl.Float32],
+                'schema_overrides': [pl.Float32, pl.Float32],
             },
         },
     )
diff --git a/src/pymovements/datasets/gaze_on_faces.py b/src/pymovements/datasets/gaze_on_faces.py
index 131549b87..0195bb0f0 100644
--- a/src/pymovements/datasets/gaze_on_faces.py
+++ b/src/pymovements/datasets/gaze_on_faces.py
@@ -75,7 +75,7 @@ class GazeOnFaces(DatasetDefinition):
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe.
 
-    filename_format_dtypes: dict[str, dict[str, type]]
+    filename_format_schema_overrides: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
@@ -175,7 +175,7 @@ class GazeOnFaces(DatasetDefinition):
         },
     )
 
-    filename_format_dtypes: dict[str, dict[str, type]] = field(
+    filename_format_schema_overrides: dict[str, dict[str, type]] = field(
         default_factory=lambda: {
             'gaze': {
                 'sub_id': int,
@@ -200,7 +200,7 @@ class GazeOnFaces(DatasetDefinition):
                 'separator': ',',
                 'has_header': False,
                 'new_columns': ['x', 'y'],
-                'dtypes': [pl.Float32, pl.Float32],
+                'schema_overrides': [pl.Float32, pl.Float32],
             },
         },
     )
diff --git a/src/pymovements/datasets/gazebase.py b/src/pymovements/datasets/gazebase.py
index a9835b74a..508327996 100644
--- a/src/pymovements/datasets/gazebase.py
+++ b/src/pymovements/datasets/gazebase.py
@@ -79,7 +79,7 @@ class GazeBase(DatasetDefinition):
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe.
 
-    filename_format_dtypes: dict[str, dict[str, type]]
+    filename_format_schema_overrides: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
@@ -183,7 +183,7 @@ class GazeBase(DatasetDefinition):
         },
     )
 
-    filename_format_dtypes: dict[str, dict[str, type]] = field(
+    filename_format_schema_overrides: dict[str, dict[str, type]] = field(
         default_factory=lambda: {
             'gaze': {'round_id': int, 'subject_id': int, 'session_id': int},
         },
@@ -211,7 +211,7 @@ class GazeBase(DatasetDefinition):
         default_factory=lambda: {
             'gaze': {
                 'null_values': 'NaN',
-                'dtypes': {
+                'schema_overrides': {
                     'n': pl.Int64,
                     'x': pl.Float32,
                     'y': pl.Float32,
diff --git a/src/pymovements/datasets/gazebasevr.py b/src/pymovements/datasets/gazebasevr.py
index ef22e7c8e..94737fe04 100644
--- a/src/pymovements/datasets/gazebasevr.py
+++ b/src/pymovements/datasets/gazebasevr.py
@@ -80,7 +80,7 @@ class GazeBaseVR(DatasetDefinition):
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe.
 
-    filename_format_dtypes: dict[str, dict[str, type]]
+    filename_format_schema_overrides: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
@@ -184,7 +184,7 @@ class GazeBaseVR(DatasetDefinition):
         },
     )
 
-    filename_format_dtypes: dict[str, dict[str, type]] = field(
+    filename_format_schema_overrides: dict[str, dict[str, type]] = field(
         default_factory=lambda: {
             'gaze': {
                 'round_id': int,
@@ -214,7 +214,7 @@ class GazeBaseVR(DatasetDefinition):
     custom_read_kwargs: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
             'gaze': {
-                'dtypes': {
+                'schema_overrides': {
                     'n': pl.Float32,
                     'x': pl.Float32,
                     'y': pl.Float32,
diff --git a/src/pymovements/datasets/hbn.py b/src/pymovements/datasets/hbn.py
index 0ef7b0922..2841cd8ea 100644
--- a/src/pymovements/datasets/hbn.py
+++ b/src/pymovements/datasets/hbn.py
@@ -74,7 +74,7 @@ class HBN(DatasetDefinition):
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe.
 
-    filename_format_dtypes: dict[str, dict[str, type]]
+    filename_format_schema_overrides: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
@@ -173,7 +173,7 @@ class HBN(DatasetDefinition):
         },
     )
 
-    filename_format_dtypes: dict[str, dict[str, type]] = field(
+    filename_format_schema_overrides: dict[str, dict[str, type]] = field(
         default_factory=lambda: {
             'gaze': {
                 'subject_id': str,
@@ -197,7 +197,7 @@ class HBN(DatasetDefinition):
             'gaze': {
                 'separator': ',',
                 'columns': ['time', 'x_pix', 'y_pix'],
-                'dtypes': {
+                'schema_overrides': {
                     'time': pl.Int64,
                     'x_pix': pl.Float32,
                     'y_pix': pl.Float32,
diff --git a/src/pymovements/datasets/judo1000.py b/src/pymovements/datasets/judo1000.py
index dfbf1f663..e953bbdb4 100644
--- a/src/pymovements/datasets/judo1000.py
+++ b/src/pymovements/datasets/judo1000.py
@@ -73,7 +73,7 @@ class JuDo1000(DatasetDefinition):
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe.
 
-    filename_format_dtypes: dict[str, dict[str, type]]
+    filename_format_schema_overrides: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
@@ -172,7 +172,7 @@ class JuDo1000(DatasetDefinition):
         },
     )
 
-    filename_format_dtypes: dict[str, dict[str, type]] = field(
+    filename_format_schema_overrides: dict[str, dict[str, type]] = field(
         default_factory=lambda: {
             'gaze': {
                 'subject_id': int,
@@ -205,7 +205,7 @@ class JuDo1000(DatasetDefinition):
     custom_read_kwargs: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
             'gaze': {
-                'dtypes': {
+                'schema_overrides': {
                     'trialId': pl.Int64,
                     'pointId': pl.Int64,
                     'time': pl.Int64,
diff --git a/src/pymovements/datasets/potec.py b/src/pymovements/datasets/potec.py
index d26cb3160..aad21c2b8 100644
--- a/src/pymovements/datasets/potec.py
+++ b/src/pymovements/datasets/potec.py
@@ -83,7 +83,7 @@ class PoTeC(DatasetDefinition):
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe.
 
-    filename_format_dtypes: dict[str, dict[str, type]]
+    filename_format_schema_overrides: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
@@ -178,7 +178,7 @@ class PoTeC(DatasetDefinition):
         },
     )
 
-    filename_format_dtypes: dict[str, dict[str, type]] = field(
+    filename_format_schema_overrides: dict[str, dict[str, type]] = field(
         default_factory=lambda: {
             'gaze': {
                 'subject_id': int,
@@ -204,7 +204,7 @@ class PoTeC(DatasetDefinition):
     custom_read_kwargs: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
             'gaze': {
-                'dtypes': {
+                'schema_overrides': {
                     'time': pl.Int64,
                     'x': pl.Float32,
                     'y': pl.Float32,
diff --git a/src/pymovements/datasets/sb_sat.py b/src/pymovements/datasets/sb_sat.py
index 1974bd77b..39095853f 100644
--- a/src/pymovements/datasets/sb_sat.py
+++ b/src/pymovements/datasets/sb_sat.py
@@ -73,7 +73,7 @@ class SBSAT(DatasetDefinition):
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe.
 
-    filename_format_dtypes: dict[str, dict[str, type]]
+    filename_format_schema_overrides: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
@@ -188,7 +188,7 @@ class SBSAT(DatasetDefinition):
             },
     )
 
-    filename_format_dtypes: dict[str, dict[str, type]] = field(
+    filename_format_schema_overrides: dict[str, dict[str, type]] = field(
         default_factory=lambda:
             {
                 'gaze': {'subject_id': int},
@@ -218,7 +218,7 @@ class SBSAT(DatasetDefinition):
                 'gaze': {
                     'separator': '\t',
                     'columns': ['time', 'book_name', 'screen_id', 'x_left', 'y_left', 'pupil_left'],
-                    'dtypes': {
+                    'schema_overrides': {
                         'time': pl.Int64,
                         'book_name': pl.Utf8,
                         'screen_id': pl.Int64,
diff --git a/src/pymovements/datasets/toy_dataset.py b/src/pymovements/datasets/toy_dataset.py
index 7f45ea72c..b8ca4ad6e 100644
--- a/src/pymovements/datasets/toy_dataset.py
+++ b/src/pymovements/datasets/toy_dataset.py
@@ -71,7 +71,7 @@ class ToyDataset(DatasetDefinition):
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe.
 
-    filename_format_dtypes: dict[str, dict[str, type]]
+    filename_format_schema_overrides: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
@@ -166,7 +166,7 @@ class ToyDataset(DatasetDefinition):
         default_factory=lambda: {'gaze': r'trial_{text_id:d}_{page_id:d}.csv'},
     )
 
-    filename_format_dtypes: dict[str, dict[str, type]] = field(
+    filename_format_schema_overrides: dict[str, dict[str, type]] = field(
         default_factory=lambda: {
             'gaze': {
                 'text_id': int,
@@ -189,7 +189,7 @@ class ToyDataset(DatasetDefinition):
         default_factory=lambda: {
             'gaze': {
                 'columns': ['timestamp', 'x', 'y', 'stimuli_x', 'stimuli_y'],
-                'dtypes': {
+                'schema_overrides': {
                     'timestamp': pl.Float64,
                     'x': pl.Float64,
                     'y': pl.Float64,
diff --git a/src/pymovements/datasets/toy_dataset_eyelink.py b/src/pymovements/datasets/toy_dataset_eyelink.py
index 1dcd2a661..4d70501d1 100644
--- a/src/pymovements/datasets/toy_dataset_eyelink.py
+++ b/src/pymovements/datasets/toy_dataset_eyelink.py
@@ -72,7 +72,7 @@ class ToyDatasetEyeLink(DatasetDefinition):
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe.
 
-    filename_format_dtypes: dict[str, dict[str, type]]
+    filename_format_schema_overrides: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
@@ -179,7 +179,7 @@ class ToyDatasetEyeLink(DatasetDefinition):
         },
     )
 
-    filename_format_dtypes: dict[str, dict[str, type]] = field(
+    filename_format_schema_overrides: dict[str, dict[str, type]] = field(
         default_factory=lambda: {
             'gaze': {
                 'subject_id': int,
diff --git a/src/pymovements/gaze/gaze_dataframe.py b/src/pymovements/gaze/gaze_dataframe.py
index aec927a58..b7adc3d31 100644
--- a/src/pymovements/gaze/gaze_dataframe.py
+++ b/src/pymovements/gaze/gaze_dataframe.py
@@ -713,7 +713,7 @@ def detect(
             for group_identifier, group_gaze in grouped_frames.items():
                 # Create filter expression for selecting respective group rows.
                 if len(self.trial_columns) == 1:
-                    group_filter_expression = pl.col(self.trial_columns[0]) == group_identifier
+                    group_filter_expression = pl.col(self.trial_columns[0]) == group_identifier[0]
                 else:
                     group_filter_expression = pl.col(self.trial_columns[0]) == group_identifier[0]
                     for name, value in zip(self.trial_columns[1:], group_identifier[1:]):
diff --git a/src/pymovements/gaze/integration.py b/src/pymovements/gaze/integration.py
index 06ee617be..f05b22e31 100644
--- a/src/pymovements/gaze/integration.py
+++ b/src/pymovements/gaze/integration.py
@@ -157,11 +157,13 @@ def from_numpy(
     │ 0    ┆ [0.0, 0.0] │
     │ 0    ┆ [0.0, 0.0] │
     │ 0    ┆ [0.0, 0.0] │
+    │ 0    ┆ [0.0, 0.0] │
     │ …    ┆ …          │
     │ 0    ┆ [0.0, 0.0] │
     │ 0    ┆ [0.0, 0.0] │
     │ 0    ┆ [0.0, 0.0] │
     │ 0    ┆ [0.0, 0.0] │
+    │ 0    ┆ [0.0, 0.0] │
     └──────┴────────────┘
 
     Use the ``orient`` keyword argument to specify the layout of your array.
@@ -187,11 +189,13 @@ def from_numpy(
     │ 0    ┆ [0.0, 0.0] │
     │ 0    ┆ [0.0, 0.0] │
     │ 0    ┆ [0.0, 0.0] │
+    │ 0    ┆ [0.0, 0.0] │
     │ …    ┆ …          │
     │ 0    ┆ [0.0, 0.0] │
     │ 0    ┆ [0.0, 0.0] │
     │ 0    ┆ [0.0, 0.0] │
     │ 0    ┆ [0.0, 0.0] │
+    │ 0    ┆ [0.0, 0.0] │
     └──────┴────────────┘
 
     Pass the data explicitly via the specific keyword arguments, without having to specify a schema.
@@ -212,11 +216,13 @@ def from_numpy(
     │ 0    ┆ [0.0, 0.0] │
     │ 0    ┆ [0.0, 0.0] │
     │ 0    ┆ [0.0, 0.0] │
+    │ 0    ┆ [0.0, 0.0] │
     │ …    ┆ …          │
     │ 0    ┆ [0.0, 0.0] │
     │ 0    ┆ [0.0, 0.0] │
     │ 0    ┆ [0.0, 0.0] │
     │ 0    ┆ [0.0, 0.0] │
+    │ 0    ┆ [0.0, 0.0] │
     └──────┴────────────┘
     """
     # Either data or {time, pixel, position, velocity, acceleration} must be None.
diff --git a/src/pymovements/gaze/io.py b/src/pymovements/gaze/io.py
index 9b3b133f5..722477491 100644
--- a/src/pymovements/gaze/io.py
+++ b/src/pymovements/gaze/io.py
@@ -44,7 +44,7 @@ def from_csv(
         distance_column: str | None = None,
         column_map: dict[str, str] | None = None,
         add_columns: dict[str, str] | None = None,
-        column_dtypes: dict[str, type] | None = None,
+        column_schema_overrides: dict[str, type] | None = None,
         **read_csv_kwargs: Any,
 ) -> GazeDataFrame:
     """Initialize a :py:class:`pymovements.gaze.gaze_dataframe.GazeDataFrame`.
@@ -94,7 +94,7 @@ def from_csv(
     add_columns: dict[str, str] | None
         Dictionary containing columns to add to loaded data frame.
         (default: None)
-    column_dtypes:  dict[str, type] | None
+    column_schema_overrides: dict[str, type] | None
         Dictionary containing types for columns.
         (default: None)
     **read_csv_kwargs: Any
@@ -141,7 +141,8 @@ def from_csv(
     │ 1    ┆ 0          ┆ 0          │
     │ 2    ┆ 0          ┆ 0          │
     │ 3    ┆ 0          ┆ 0          │
-    │ …    ┆ …          ┆ …          │
+    │ 4    ┆ 0          ┆ 0          │
+    │ 5    ┆ 0          ┆ 0          │
     │ 6    ┆ 0          ┆ 0          │
     │ 7    ┆ 0          ┆ 0          │
     │ 8    ┆ 0          ┆ 0          │
@@ -171,7 +172,8 @@ def from_csv(
     │ 1    ┆ [0, 0]    │
     │ 2    ┆ [0, 0]    │
     │ 3    ┆ [0, 0]    │
-    │ …    ┆ …         │
+    │ 4    ┆ [0, 0]    │
+    │ 5    ┆ [0, 0]    │
     │ 6    ┆ [0, 0]    │
     │ 7    ┆ [0, 0]    │
     │ 8    ┆ [0, 0]    │
@@ -180,7 +182,7 @@ def from_csv(
 
     Please be aware that data types are inferred from a fixed number of rows. To ensure
     correct data types, you can pass a dictionary of column names and data types to the
-    `dtypes` keyword argument of :py:func:`polars.read_csv`:
+    `schema_overrides` keyword argument of :py:func:`polars.read_csv`:
 
     >>> from pymovements.gaze.io import from_csv
     >>> import polars as pl
@@ -189,7 +191,7 @@ def from_csv(
     ...     time_column = 'time',
     ...     time_unit='ms',
     ...     pixel_columns = ['x_left_pix','y_left_pix'],
-    ...     dtypes = {'time': pl.Int64, 'x_left_pix': pl.Int64, 'y_left_pix': pl.Int64},
+    ...     schema_overrides = {'time': pl.Int64, 'x_left_pix': pl.Int64, 'y_left_pix': pl.Int64},
     ... )
     >>> gaze.frame
     shape: (10, 2)
@@ -202,7 +204,8 @@ def from_csv(
     │ 1    ┆ [0, 0]    │
     │ 2    ┆ [0, 0]    │
     │ 3    ┆ [0, 0]    │
-    │ …    ┆ …         │
+    │ 4    ┆ [0, 0]    │
+    │ 5    ┆ [0, 0]    │
     │ 6    ┆ [0, 0]    │
     │ 7    ┆ [0, 0]    │
     │ 8    ┆ [0, 0]    │
@@ -243,10 +246,10 @@ def from_csv(
                 pl.col(column).cast(pl.Float64),
             ])
 
-    if column_dtypes is not None:
+    if column_schema_overrides is not None:
         gaze_data = gaze_data.with_columns([
             pl.col(fileinfo_key).cast(fileinfo_dtype)
-            for fileinfo_key, fileinfo_dtype in column_dtypes.items()
+            for fileinfo_key, fileinfo_dtype in column_schema_overrides.items()
         ])
 
     # Create gaze data frame.
@@ -272,7 +275,7 @@ def from_asc(
         schema: dict[str, Any] | None = None,
         experiment: Experiment | None = None,
         add_columns: dict[str, str] | None = None,
-        column_dtypes: dict[str, type] | None = None,
+        column_schema_overrides: dict[str, type] | None = None,
 ) -> GazeDataFrame:
     """Initialize a :py:class:`pymovements.gaze.gaze_dataframe.GazeDataFrame`.
 
@@ -290,7 +293,7 @@ def from_asc(
     add_columns: dict[str, str] | None
         Dictionary containing columns to add to loaded data frame.
         (default: None)
-    column_dtypes:  dict[str, type] | None
+    column_schema_overrides: dict[str, type] | None
         Dictionary containing types for columns.
         (default: None)
 
@@ -317,7 +320,9 @@ def from_asc(
     │ 2154557 ┆ 778.0 ┆ [138.2, 132.7] │
     │ 2154560 ┆ 777.0 ┆ [137.9, 131.6] │
     │ 2154564 ┆ 778.0 ┆ [138.1, 131.0] │
+    │ 2154596 ┆ 784.0 ┆ [139.6, 132.1] │
     │ …       ┆ …     ┆ …              │
+    │ 2339246 ┆ 622.0 ┆ [629.9, 531.9] │
     │ 2339271 ┆ 617.0 ┆ [639.4, 531.9] │
     │ 2339272 ┆ 617.0 ┆ [639.0, 531.9] │
     │ 2339290 ┆ 618.0 ┆ [637.6, 531.4] │
@@ -342,10 +347,10 @@ def from_asc(
             if column not in gaze_data.columns
         ])
 
-    if column_dtypes is not None:
+    if column_schema_overrides is not None:
         gaze_data = gaze_data.with_columns([
             pl.col(fileinfo_key).cast(fileinfo_dtype)
-            for fileinfo_key, fileinfo_dtype in column_dtypes.items()
+            for fileinfo_key, fileinfo_dtype in column_schema_overrides.items()
         ])
 
     # Create gaze data frame.
@@ -364,7 +369,7 @@ def from_ipc(
         experiment: Experiment | None = None,
         column_map: dict[str, str] | None = None,
         add_columns: dict[str, str] | None = None,
-        column_dtypes: dict[str, type] | None = None,
+        column_schema_overrides: dict[str, type] | None = None,
         **read_ipc_kwargs: Any,
 ) -> GazeDataFrame:
     """Initialize a :py:class:`pymovements.gaze.gaze_dataframe.GazeDataFrame`.
@@ -382,7 +387,7 @@ def from_ipc(
     add_columns: dict[str, str] | None
         Dictionary containing columns to add to loaded data frame.
         (default: None)
-    column_dtypes:  dict[str, type] | None
+    column_schema_overrides: dict[str, type] | None
         Dictionary containing types for columns.
         (default: None)
     **read_ipc_kwargs: Any
@@ -411,7 +416,8 @@ def from_ipc(
     │ 1    ┆ [0, 0]    │
     │ 2    ┆ [0, 0]    │
     │ 3    ┆ [0, 0]    │
-    │ …    ┆ …         │
+    │ 4    ┆ [0, 0]    │
+    │ 5    ┆ [0, 0]    │
     │ 6    ┆ [0, 0]    │
     │ 7    ┆ [0, 0]    │
     │ 8    ┆ [0, 0]    │
@@ -438,10 +444,10 @@ def from_ipc(
             if column not in gaze_data.columns
         ])
 
-    if column_dtypes is not None:
+    if column_schema_overrides is not None:
         gaze_data = gaze_data.with_columns([
             pl.col(fileinfo_key).cast(fileinfo_dtype)
-            for fileinfo_key, fileinfo_dtype in column_dtypes.items()
+            for fileinfo_key, fileinfo_dtype in column_schema_overrides.items()
         ])
 
     # Create gaze data frame.
diff --git a/src/pymovements/gaze/transforms.py b/src/pymovements/gaze/transforms.py
index 5b925e2ef..8e5916920 100644
--- a/src/pymovements/gaze/transforms.py
+++ b/src/pymovements/gaze/transforms.py
@@ -280,9 +280,9 @@ def pix2deg(
     ])
 
     degree_components = [
-        pl.arctan2d(
+        pl.arctan2(
             centered_pixels.list.get(component), distance_pixels.list.get(component),
-        )
+        ).degrees()
         for component in range(n_components)
     ]
 
diff --git a/src/pymovements/utils/parsing.py b/src/pymovements/utils/parsing.py
index f9a60feb6..a2e12f43b 100755
--- a/src/pymovements/utils/parsing.py
+++ b/src/pymovements/utils/parsing.py
@@ -381,10 +381,7 @@ def parse_eyelink(
         for column, dtype in schema.items():
             schema_overrides[column] = dtype
 
-    df = pl.from_dict(
-        data=samples,
-        schema_overrides=schema_overrides,
-    )
+    df = pl.from_dict(data=samples).cast(schema_overrides)
 
     return df, pre_processed_metadata
 
diff --git a/tests/functional/dataset_processing_test.py b/tests/functional/dataset_processing_test.py
index 48abacb20..3c3dfdbaa 100644
--- a/tests/functional/dataset_processing_test.py
+++ b/tests/functional/dataset_processing_test.py
@@ -55,7 +55,7 @@ def fixture_dataset_init_kwargs(request):
             pixel_columns=['x_left_pix', 'y_left_pix'],
             experiment=pm.Experiment(1024, 768, 38, 30, 60, 'center', 1000),
             filename_format={'gaze': 'monocular_example.csv'},
-            filename_format_dtypes={'gaze': {}},
+            filename_format_schema_overrides={'gaze': {}},
             custom_read_kwargs={'gaze': {}},
         ),
         'csv_binocular': pm.dataset.DatasetDefinition(
@@ -70,7 +70,7 @@ def fixture_dataset_init_kwargs(request):
             pixel_columns=['x_left_pix', 'y_left_pix', 'x_right_pix', 'y_right_pix'],
             position_columns=['x_left_pos', 'y_left_pos', 'x_right_pos', 'y_right_pos'],
             experiment=pm.Experiment(1024, 768, 38, 30, 60, 'center', 1000),
-            filename_format_dtypes={'gaze': {}},
+            filename_format_schema_overrides={'gaze': {}},
             custom_read_kwargs={'gaze': {}},
         ),
         'ipc_monocular': pm.dataset.DatasetDefinition(
@@ -81,7 +81,7 @@ def fixture_dataset_init_kwargs(request):
             },
             filename_format={'gaze': 'monocular_example.feather'},
             experiment=pm.Experiment(1024, 768, 38, 30, 60, 'center', 1000),
-            filename_format_dtypes={'gaze': {}},
+            filename_format_schema_overrides={'gaze': {}},
             custom_read_kwargs={'gaze': {}},
         ),
         'ipc_binocular': pm.dataset.DatasetDefinition(
@@ -92,7 +92,7 @@ def fixture_dataset_init_kwargs(request):
             },
             filename_format={'gaze': 'binocular_example.feather'},
             experiment=pm.Experiment(1024, 768, 38, 30, 60, 'center', 1000),
-            filename_format_dtypes={'gaze': {}},
+            filename_format_schema_overrides={'gaze': {}},
             custom_read_kwargs={'gaze': {}},
         ),
         'emtec': pm.datasets.EMTeC(
@@ -104,7 +104,7 @@ def fixture_dataset_init_kwargs(request):
             filename_format={'gaze': 'emtec_example.csv'},
             time_column=pm.datasets.EMTeC().time_column,
             time_unit=pm.datasets.EMTeC().time_unit,
-            filename_format_dtypes={'gaze': {}},
+            filename_format_schema_overrides={'gaze': {}},
             trial_columns=None,
         ),
         'didec': pm.datasets.DIDEC(
@@ -116,7 +116,7 @@ def fixture_dataset_init_kwargs(request):
             filename_format={'gaze': 'didec_example.txt'},
             time_column=pm.datasets.DIDEC().time_column,
             time_unit=pm.datasets.DIDEC().time_unit,
-            filename_format_dtypes={'gaze': {}},
+            filename_format_schema_overrides={'gaze': {}},
             trial_columns=None,
         ),
         'hbn': pm.datasets.HBN(
@@ -128,7 +128,7 @@ def fixture_dataset_init_kwargs(request):
             filename_format={'gaze': 'hbn_example.csv'},
             time_column=pm.datasets.HBN().time_column,
             time_unit=pm.datasets.HBN().time_unit,
-            filename_format_dtypes={'gaze': {}},
+            filename_format_schema_overrides={'gaze': {}},
             trial_columns=None,
         ),
         'sbsat': pm.datasets.SBSAT(
@@ -140,7 +140,7 @@ def fixture_dataset_init_kwargs(request):
             filename_format={'gaze': 'sbsat_example.csv'},
             time_column=pm.datasets.SBSAT().time_column,
             time_unit=pm.datasets.SBSAT().time_unit,
-            filename_format_dtypes={'gaze': {}},
+            filename_format_schema_overrides={'gaze': {}},
             trial_columns=None,
         ),
         'gaze_on_faces': pm.datasets.GazeOnFaces(
@@ -152,7 +152,7 @@ def fixture_dataset_init_kwargs(request):
             filename_format={'gaze': 'gaze_on_faces_example.csv'},
             time_column=pm.datasets.GazeOnFaces().time_column,
             time_unit=pm.datasets.GazeOnFaces().time_unit,
-            filename_format_dtypes={'gaze': {}},
+            filename_format_schema_overrides={'gaze': {}},
             trial_columns=None,
         ),
         'gazebase': pm.datasets.GazeBase(
@@ -164,7 +164,7 @@ def fixture_dataset_init_kwargs(request):
             filename_format={'gaze': 'gazebase_example.csv'},
             time_column=pm.datasets.GazeBase().time_column,
             time_unit=pm.datasets.GazeBase().time_unit,
-            filename_format_dtypes={'gaze': {}},
+            filename_format_schema_overrides={'gaze': {}},
             trial_columns=None,
         ),
         'gazebase_vr': pm.datasets.GazeBaseVR(
@@ -176,7 +176,7 @@ def fixture_dataset_init_kwargs(request):
             filename_format={'gaze': 'gazebase_vr_example.csv'},
             time_column=pm.datasets.GazeBaseVR().time_column,
             time_unit=pm.datasets.GazeBaseVR().time_unit,
-            filename_format_dtypes={'gaze': {}},
+            filename_format_schema_overrides={'gaze': {}},
             trial_columns=None,
         ),
         'gazegraph': pm.datasets.GazeGraph(
@@ -188,7 +188,7 @@ def fixture_dataset_init_kwargs(request):
             filename_format={'gaze': 'gazegraph_example.csv'},
             time_column=pm.datasets.GazeGraph().time_column,
             time_unit=pm.datasets.GazeGraph().time_unit,
-            filename_format_dtypes={'gaze': {}},
+            filename_format_schema_overrides={'gaze': {}},
             trial_columns=None,
         ),
         'judo1000': pm.datasets.JuDo1000(
@@ -200,7 +200,7 @@ def fixture_dataset_init_kwargs(request):
             filename_format={'gaze': 'judo1000_example.csv'},
             time_column=pm.datasets.JuDo1000().time_column,
             time_unit=pm.datasets.JuDo1000().time_unit,
-            filename_format_dtypes={'gaze': {}},
+            filename_format_schema_overrides={'gaze': {}},
             trial_columns=['trial_id'],
         ),
         'potec': pm.datasets.PoTeC(
@@ -212,7 +212,7 @@ def fixture_dataset_init_kwargs(request):
             filename_format={'gaze': 'potec_example.tsv'},
             time_column=pm.datasets.PoTeC().time_column,
             time_unit=pm.datasets.PoTeC().time_unit,
-            filename_format_dtypes={'gaze': {}},
+            filename_format_schema_overrides={'gaze': {}},
             trial_columns=None,
         ),
     }
diff --git a/tests/unit/dataset/dataset_download_test.py b/tests/unit/dataset/dataset_download_test.py
index 37e516eae..8ba7f5b64 100644
--- a/tests/unit/dataset/dataset_download_test.py
+++ b/tests/unit/dataset/dataset_download_test.py
@@ -1164,7 +1164,7 @@ def test_public_dataset_registered_correct_attributes(tmp_path, dataset_definiti
     assert dataset.definition.resources == dataset_definition.resources
     assert dataset.definition.experiment == dataset_definition.experiment
     assert dataset.definition.filename_format == dataset_definition.filename_format
-    assert dataset.definition.filename_format_dtypes == dataset_definition.filename_format_dtypes
+    assert dataset.definition.filename_format_schema_overrides == dataset_definition.filename_format_schema_overrides  # noqa: E501
     assert dataset.definition.has_files == dataset_definition.has_files
 
 
diff --git a/tests/unit/dataset/dataset_files_test.py b/tests/unit/dataset/dataset_files_test.py
index 341c8e7dd..80af569a2 100644
--- a/tests/unit/dataset/dataset_files_test.py
+++ b/tests/unit/dataset/dataset_files_test.py
@@ -207,7 +207,7 @@ def test_load_eyelink_file(tmp_path, read_kwargs):
         fileinfo_row={},
         definition=DatasetDefinition(
             experiment=pm.Experiment(1024, 768, 38, 30, None, 'center', 100),
-            filename_format_dtypes={'gaze': {}, 'precomputed_events': {}},
+            filename_format_schema_overrides={'gaze': {}, 'precomputed_events': {}},
         ),
         custom_read_kwargs=read_kwargs,
     )
diff --git a/tests/unit/dataset/dataset_test.py b/tests/unit/dataset/dataset_test.py
index 97d13d679..7bb144707 100644
--- a/tests/unit/dataset/dataset_test.py
+++ b/tests/unit/dataset/dataset_test.py
@@ -109,14 +109,14 @@ def mock_toy(
             'precomputed_reading_measures': False,
         },
         extract={'gaze': True, 'precomputed_events': True},
-        filename_format_dtypes={
+        filename_format_schema_overrides={
             'gaze': {'subject_id': pl.Int64},
             'precomputed_events': {'subject_id': pl.Int64},
             'precomputed_reading_measures': {'subject_id': pl.Int64},
         },
 ):
 
-    if filename_format_dtypes['precomputed_events']:
+    if filename_format_schema_overrides['precomputed_events']:
         subject_ids = list(range(1, 21))
         fileinfo = pl.DataFrame(
             data={'subject_id': subject_ids},
@@ -330,7 +330,7 @@ def mock_toy(
             'precomputed_events': r'{subject_id:d}.' + raw_fileformat,
             'precomputed_reading_measures': r'{subject_id:d}.' + raw_fileformat,
         },
-        filename_format_dtypes=filename_format_dtypes,
+        filename_format_schema_overrides=filename_format_schema_overrides,
         custom_read_kwargs={
             'gaze': {},
             'precomputed_events': {},
@@ -739,7 +739,7 @@ def test_clip(gaze_dataset_configuration):
 
     original_schema = dataset.gaze[0].schema
 
-    dataset.clip(-1000, 1000, input_column='pixel', output_column='pixel_clipped', n_components=4)
+    dataset.clip(-1000, 1000, input_column='pixel', output_column='pixel_clipped', n_components=2)
 
     expected_schema = {**original_schema, 'pixel_clipped': pl.List(pl.Float64)}
     for result_gaze_df in dataset.gaze:
@@ -1802,7 +1802,7 @@ def precomputed_fixture_dataset(request, tmp_path):
                 'precomputed_reading_measures': False,
             },
             extract={'precomputed_events': False},
-            filename_format_dtypes={'precomputed_events': {}},
+            filename_format_schema_overrides={'precomputed_events': {}},
         )
     else:
         raise ValueError(f'{request.param} not supported as dataset mock')
@@ -1874,7 +1874,10 @@ def precomputed_rm_fixture_dataset(request, tmp_path):
                 'precomputed_reading_measures': True,
             },
             extract={'precomputed_reading_measures': False},
-            filename_format_dtypes={'precomputed_events': {}, 'precomputed_reading_measures': {}},
+            filename_format_schema_overrides={
+                'precomputed_events': {},
+                'precomputed_reading_measures': {},
+            },
         )
     else:
         raise ValueError(f'{request.param} not supported as dataset mock')
diff --git a/tests/unit/datasets/datasets_test.py b/tests/unit/datasets/datasets_test.py
index 9f71da7ac..6a3e02d58 100644
--- a/tests/unit/datasets/datasets_test.py
+++ b/tests/unit/datasets/datasets_test.py
@@ -105,7 +105,7 @@ def test_public_dataset_registered(public_dataset, dataset_name, dataset_path, d
         assert dataset_definition.resources['gaze'] == registered_definition.resources['gaze']
         assert dataset_definition.experiment == registered_definition.experiment
         assert dataset_definition.filename_format['gaze'] == registered_definition.filename_format['gaze']  # noqa: E501
-        assert dataset_definition.filename_format_dtypes['gaze'] == registered_definition.filename_format_dtypes['gaze']  # noqa: E501
+        assert dataset_definition.filename_format_schema_overrides['gaze'] == registered_definition.filename_format_schema_overrides['gaze']  # noqa: E501
         assert dataset_definition.custom_read_kwargs['gaze'] == registered_definition.custom_read_kwargs['gaze']  # noqa: E501
 
     if dataset_definition.has_files['precomputed_events']:
@@ -113,7 +113,7 @@ def test_public_dataset_registered(public_dataset, dataset_name, dataset_path, d
         assert dataset_definition.resources['precomputed_events'] == registered_definition.resources['precomputed_events']  # noqa: E501
         assert dataset_definition.experiment == registered_definition.experiment
         assert dataset_definition.filename_format['precomputed_events'] == registered_definition.filename_format['precomputed_events']  # noqa: E501
-        assert dataset_definition.filename_format_dtypes['precomputed_events'] == registered_definition.filename_format_dtypes['precomputed_events']  # noqa: E501
+        assert dataset_definition.filename_format_schema_overrides['precomputed_events'] == registered_definition.filename_format_schema_overrides['precomputed_events']  # noqa: E501
         assert dataset_definition.custom_read_kwargs['precomputed_events'] == registered_definition.custom_read_kwargs['precomputed_events']  # noqa: E501
 
     if dataset_definition.has_files['precomputed_reading_measures']:
@@ -121,7 +121,7 @@ def test_public_dataset_registered(public_dataset, dataset_name, dataset_path, d
         assert dataset_definition.resources['precomputed_reading_measures'] == registered_definition.resources['precomputed_reading_measures']  # noqa: E501
         assert dataset_definition.experiment == registered_definition.experiment
         assert dataset_definition.filename_format['precomputed_reading_measures'] == registered_definition.filename_format['precomputed_reading_measures']  # noqa: E501
-        assert dataset_definition.filename_format_dtypes['precomputed_reading_measures'] == registered_definition.filename_format_dtypes['precomputed_reading_measures']  # noqa: E501
+        assert dataset_definition.filename_format_schema_overrides['precomputed_reading_measures'] == registered_definition.filename_format_schema_overrides['precomputed_reading_measures']  # noqa: E501
         assert dataset_definition.custom_read_kwargs['precomputed_reading_measures'] == registered_definition.custom_read_kwargs['precomputed_reading_measures']  # noqa: E501
 
     dataset, expected_paths = construct_public_dataset(
diff --git a/tests/unit/events/frame_test.py b/tests/unit/events/frame_test.py
index c967284af..5ea0c609b 100644
--- a/tests/unit/events/frame_test.py
+++ b/tests/unit/events/frame_test.py
@@ -379,7 +379,7 @@ def test_event_dataframe_init_expected_trial_column_list(kwargs, expected_trial_
                 ),
                 'trial_columns': 'trial',
             },
-            pl.DataFrame({'trial': [1, 1]}),
+            pl.DataFrame({'trial': [1, 1]}, schema_overrides={'trial': pl.Int32}),
             id='two_rows_plain_trial',
         ),
     ],
diff --git a/tests/unit/gaze/io/csv_test.py b/tests/unit/gaze/io/csv_test.py
index 0efdb0546..ace86e563 100644
--- a/tests/unit/gaze/io/csv_test.py
+++ b/tests/unit/gaze/io/csv_test.py
@@ -121,7 +121,7 @@ def test_shapes(kwargs, shape):
 
 
 @pytest.mark.parametrize(
-    ('kwargs', 'dtypes'),
+    ('kwargs', 'schema_overrides'),
     [
         pytest.param(
             {
@@ -131,7 +131,7 @@ def test_shapes(kwargs, shape):
                 'pixel_columns': ['x_left_pix', 'y_left_pix'],
             },
             [pl.Int64, pl.List(pl.Int64)],
-            id='csv_mono_dtypes',
+            id='csv_mono_schema_overrides',
         ),
         pytest.param(
             {
@@ -142,10 +142,10 @@ def test_shapes(kwargs, shape):
                 'position_columns': ['position_x', 'position_y'],
             },
             [pl.Int64, pl.List(pl.Float64), pl.List(pl.Float64)],
-            id='csv_missing_values_dtypes',
+            id='csv_missing_values_schema_overrides',
         ),
     ],
 )
-def test_dtypes(kwargs, dtypes):
+def test_schema_overrides(kwargs, schema_overrides):
     gaze_dataframe = pm.gaze.from_csv(**kwargs)
-    assert gaze_dataframe.frame.dtypes == dtypes
+    assert gaze_dataframe.frame.dtypes == schema_overrides

From 08a1399159e441dbc4b6c434f445b811287c0ac3 Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Sat, 28 Sep 2024 20:25:02 +0200
Subject: [PATCH 26/31] manually update pre-commit config (#818)

---
 .pre-commit-config.yaml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b0f9d6466..d3aef8533 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -13,7 +13,7 @@ repos:
     hooks:
     -   id: add-trailing-comma
 -   repo: https://github.com/asottile/pyupgrade
-    rev: v3.16.0
+    rev: v3.17.0
     hooks:
     -   id: pyupgrade
         args: [--py39-plus]
@@ -58,7 +58,7 @@ repos:
     - id: pydoclint
       args: ["--config=pyproject.toml"]
 -   repo: https://github.com/nbQA-dev/nbQA
-    rev: 1.8.5
+    rev: 1.8.7
     hooks:
     -   id: nbqa-autopep8
     -   id: nbqa-flake8
@@ -68,7 +68,7 @@ repos:
     -   id: nbqa-pyupgrade
         args: ["--py39-plus"]
 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.10.0
+    rev: v1.11.2
     hooks:
     -   id: mypy
         additional_dependencies: [pandas-stubs, types-tqdm]
@@ -90,7 +90,7 @@ repos:
     -   id: requirements-txt-fixer
     -   id: trailing-whitespace
 -   repo: https://github.com/hhatto/autopep8
-    rev: v2.3.0
+    rev: v2.3.1
     hooks:
     -   id: autopep8
 -   repo: https://github.com/PyCQA/autoflake
@@ -98,11 +98,11 @@ repos:
     hooks:
     -   id: autoflake
 -   repo: https://github.com/PyCQA/doc8
-    rev: v1.1.1
+    rev: v1.1.2
     hooks:
     -   id: doc8
 -   repo: https://github.com/PyCQA/flake8
-    rev: 7.1.0
+    rev: 7.1.1
     hooks:
     -   id: flake8
 -   repo: https://github.com/pycqa/pydocstyle
@@ -121,7 +121,7 @@ repos:
             '--ignore=D103,D107,D213',
           ]
 -   repo: https://github.com/PyCQA/pylint
-    rev: v3.2.3
+    rev: v3.3.1
     hooks:
       - id: pylint
         name: pylint

From ee4400695b37e2af2d2d3bef6b2bfb916cafd34c Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sat, 28 Sep 2024 19:09:41 +0000
Subject: [PATCH 27/31] build: update nbconvert requirement from <7.14,>=7.0.0
 to >=7.16.4,<7.17 (#741)

Updates the requirements on [nbconvert](https://github.com/jupyter/nbconvert) to permit the latest version.
- [Release notes](https://github.com/jupyter/nbconvert/releases)
- [Changelog](https://github.com/jupyter/nbconvert/blob/main/CHANGELOG.md)
- [Commits](https://github.com/jupyter/nbconvert/compare/7.0.0...v7.16.4)

---
updated-dependencies:
- dependency-name: nbconvert
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 075d01c49..420a8dc3e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,7 +47,7 @@ dynamic = ["version"]
 [project.optional-dependencies]
 docs = [
   "ipykernel>=6.13.0",
-  "nbconvert>=7.0.0,<7.14",
+  "nbconvert>=7.0.0,<7.17",
   "nbsphinx>=0.8.8,<0.9.5",
   "pandoc",
   "pybtex",

From eee4f76672fe1060c39c6f3c03adc4601e9f6906 Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Sun, 29 Sep 2024 17:46:24 +0200
Subject: [PATCH 28/31] update pydoclint in pre-commit config (#819)

---
 .pre-commit-config.yaml                | 2 +-
 src/pymovements/datasets/gaze_graph.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d3aef8533..bb3345214 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -53,7 +53,7 @@ repos:
         args: [--use-current-year]
         types: [python]
 -   repo: https://github.com/jsh9/pydoclint
-    rev: 0.4.2
+    rev: 0.5.8
     hooks:
     - id: pydoclint
       args: ["--config=pyproject.toml"]
diff --git a/src/pymovements/datasets/gaze_graph.py b/src/pymovements/datasets/gaze_graph.py
index 530fc046d..88b50c75e 100644
--- a/src/pymovements/datasets/gaze_graph.py
+++ b/src/pymovements/datasets/gaze_graph.py
@@ -77,7 +77,7 @@ class GazeGraph(DatasetDefinition):
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe.
 
-    filename_format_schema_overrides: dict[str, dict[str, Any]]
+    filename_format_schema_overrides: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 

From dbad9c17137ae5e6685dc1d27df315b353bcf3b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20S=C3=A4uberli?=
 <38892775+saeub@users.noreply.github.com>
Date: Sun, 29 Sep 2024 21:37:22 +0200
Subject: [PATCH 29/31] feat!: Custom patterns for parsing logged metadata in
 ASC files (#767)

* Return metadata from from_asc()

* Parse metadata from ASC files based on custom patterns

* Refactor

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Refactor

* Fix test coverage

* Fix docstrings

* Fix docstring

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: David R. Reich <43832476+SiQube@users.noreply.github.com>
---
 src/pymovements/dataset/dataset_files.py      |   2 +-
 src/pymovements/gaze/io.py                    |  31 +--
 src/pymovements/utils/parsing.py              | 198 +++++++++---------
 tests/functional/gaze_file_processing_test.py |   2 +-
 tests/unit/gaze/io/asc_test.py                |  38 +++-
 tests/unit/utils/parsing_test.py              |  16 ++
 6 files changed, 173 insertions(+), 114 deletions(-)

diff --git a/src/pymovements/dataset/dataset_files.py b/src/pymovements/dataset/dataset_files.py
index dfc4c40eb..1553fddb9 100644
--- a/src/pymovements/dataset/dataset_files.py
+++ b/src/pymovements/dataset/dataset_files.py
@@ -377,7 +377,7 @@ def load_gaze_file(
             column_schema_overrides=definition.filename_format_schema_overrides['gaze'],
         )
     elif filepath.suffix == '.asc':
-        gaze_df = from_asc(
+        gaze_df, _ = from_asc(
             filepath,
             experiment=definition.experiment,
             add_columns=add_columns,
diff --git a/src/pymovements/gaze/io.py b/src/pymovements/gaze/io.py
index 722477491..18868e3dd 100644
--- a/src/pymovements/gaze/io.py
+++ b/src/pymovements/gaze/io.py
@@ -271,21 +271,25 @@ def from_csv(
 def from_asc(
         file: str | Path,
         *,
-        patterns: str | list | None = 'eyelink',
+        patterns: str | list[dict[str, Any] | str] | None = 'eyelink',
+        metadata_patterns: list[dict[str, Any] | str] | None = None,
         schema: dict[str, Any] | None = None,
         experiment: Experiment | None = None,
         add_columns: dict[str, str] | None = None,
-        column_schema_overrides: dict[str, type] | None = None,
-) -> GazeDataFrame:
+        column_schema_overrides: dict[str, Any] | None = None,
+) -> tuple[GazeDataFrame, dict[str, Any]]:
     """Initialize a :py:class:`pymovements.gaze.gaze_dataframe.GazeDataFrame`.
 
     Parameters
     ----------
     file: str | Path
         Path of IPC/feather file.
-    patterns: str | list | None
-        list of patterns to match for additional columns or a key identifier of eye tracker specific
+    patterns: str | list[dict[str, Any] | str] | None
+        List of patterns to match for additional columns or a key identifier of eye tracker specific
         default patterns. Supported values are: eyelink. (default: 'eyelink')
+    metadata_patterns: list[dict[str, Any] | str] | None
+        List of patterns to match for extracting metadata from custom logged messages.
+        (default: None)
     schema: dict[str, Any] | None
         Dictionary to optionally specify types of columns parsed by patterns. (default: None)
     experiment: Experiment | None
@@ -293,14 +297,14 @@ def from_asc(
     add_columns: dict[str, str] | None
         Dictionary containing columns to add to loaded data frame.
         (default: None)
-    column_schema_overrides:  dict[str, type] | None
+    column_schema_overrides: dict[str, Any] | None
         Dictionary containing types for columns.
         (default: None)
 
     Returns
     -------
-    GazeDataFrame
-        The gaze data frame read from the asc file.
+    tuple[GazeDataFrame, dict[str, Any]]
+        The gaze data frame and a metadata dictionary read from the asc file.
 
     Examples
     --------
@@ -308,7 +312,7 @@ def from_asc(
     We can then load the data into a ``GazeDataFrame``:
 
     >>> from pymovements.gaze.io import from_asc
-    >>> gaze = from_asc(file='tests/files/eyelink_monocular_example.asc', patterns='eyelink')
+    >>> gaze, metadata = from_asc(file='tests/files/eyelink_monocular_example.asc')
     >>> gaze.frame
     shape: (16, 3)
     ┌─────────┬───────┬────────────────┐
@@ -328,7 +332,8 @@ def from_asc(
     │ 2339290 ┆ 618.0 ┆ [637.6, 531.4] │
     │ 2339291 ┆ 618.0 ┆ [637.3, 531.2] │
     └─────────┴───────┴────────────────┘
-
+    >>> metadata['sampling_rate']
+    1000.0
     """
     if isinstance(patterns, str):
         if patterns == 'eyelink':
@@ -338,7 +343,9 @@ def from_asc(
             raise ValueError(f"unknown pattern key '{patterns}'. Supported keys are: eyelink")
 
     # Read data.
-    gaze_data, _ = parse_eyelink(file, patterns=patterns, schema=schema)
+    gaze_data, metadata = parse_eyelink(
+        file, patterns=patterns, schema=schema, metadata_patterns=metadata_patterns,
+    )
 
     if add_columns is not None:
         gaze_data = gaze_data.with_columns([
@@ -361,7 +368,7 @@ def from_asc(
         time_unit='ms',
         pixel_columns=['x_pix', 'y_pix'],
     )
-    return gaze_df
+    return gaze_df, metadata
 
 
 def from_ipc(
diff --git a/src/pymovements/utils/parsing.py b/src/pymovements/utils/parsing.py
index a2e12f43b..849cb8641 100755
--- a/src/pymovements/utils/parsing.py
+++ b/src/pymovements/utils/parsing.py
@@ -40,58 +40,58 @@
 )
 
 EYELINK_META_REGEXES = [
-    {'pattern': r'\*\*\s+VERSION:\s+(?P<version_1>.*)\s+'},
-    {
-        'pattern': r'\*\*\s+DATE:\s+(?P<weekday>[A-Z,a-z]+)\s+(?P<month>[A-Z,a-z]+)'
-                   r'\s+(?P<day>\d\d?)\s+(?P<time>\d\d:\d\d:\d\d)\s+(?P<year>\d{4})\s*',
-    },
-    {'pattern': r'\*\*\s+(?P<version_2>EYELINK.*)'},
-    {
-        'pattern': r'MSG\s+\d+[.]?\d*\s+DISPLAY_COORDS\s+(?P<resolution>.*)',
-    },
-    {
-        'pattern': r'MSG\s+\d+[.]?\d*\s+RECCFG\s+(?P<tracking_mode>[A-Z,a-z]+)\s+'
-                   r'(?P<sampling_rate>\d+)\s+'
-                   r'(?P<file_sample_filter>(0|1|2))\s+'
-                   r'(?P<link_sample_filter>(0|1|2))\s+'
-                   r'(?P<tracked_eye>(L|R|LR))\s*',
-    },
-    {
-        'pattern': r'PUPIL\s+(?P<pupil_data_type>(AREA|DIAMETER))\s*',
-    },
-    {
-        'pattern': r'MSG\s+\d+[.]?\d*\s+ELCLCFG\s+(?P<mount_configuration>.*)',
-    },
+    {'pattern': re.compile(regex)} for regex in (
+        r'\*\*\s+VERSION:\s+(?P<version_1>.*)\s+',
+        (
+            r'\*\*\s+DATE:\s+(?P<weekday>[A-Z,a-z]+)\s+(?P<month>[A-Z,a-z]+)'
+            r'\s+(?P<day>\d\d?)\s+(?P<time>\d\d:\d\d:\d\d)\s+(?P<year>\d{4})\s*'
+        ),
+        r'\*\*\s+(?P<version_2>EYELINK.*)',
+        r'MSG\s+\d+[.]?\d*\s+DISPLAY_COORDS\s+(?P<resolution>.*)',
+        (
+            r'MSG\s+\d+[.]?\d*\s+RECCFG\s+(?P<tracking_mode>[A-Z,a-z]+)\s+'
+            r'(?P<sampling_rate>\d+)\s+'
+            r'(?P<file_sample_filter>(0|1|2))\s+'
+            r'(?P<link_sample_filter>(0|1|2))\s+'
+            r'(?P<tracked_eye>(L|R|LR))\s*'
+        ),
+        r'PUPIL\s+(?P<pupil_data_type>(AREA|DIAMETER))\s*',
+        r'MSG\s+\d+[.]?\d*\s+ELCLCFG\s+(?P<mount_configuration>.*)',
+    )
 ]
 
-VALIDATION_REGEX = (
+VALIDATION_REGEX = re.compile(
     r'MSG\s+(?P<timestamp>\d+[.]?\d*)\s+!CAL\s+VALIDATION\s+HV'
     r'(?P<num_points>\d\d?).*'
     r'(?P<tracked_eye>LEFT|RIGHT)\s+'
     r'(?P<error>\D*)\s+'
     r'(?P<validation_score_avg>\d.\d\d)\s+avg\.\s+'
-    r'(?P<validation_score_max>\d.\d\d)\s+max'
+    r'(?P<validation_score_max>\d.\d\d)\s+max',
 )
 
-BLINK_START_REGEX = r'SBLINK\s+(R|L)\s+(?P<timestamp>(\d+[.]?\d*))\s*'
-BLINK_STOP_REGEX = (
+BLINK_START_REGEX = re.compile(r'SBLINK\s+(R|L)\s+(?P<timestamp>(\d+[.]?\d*))\s*')
+BLINK_STOP_REGEX = re.compile(
     r'EBLINK\s+(R|L)\s+(?P<timestamp_start>(\d+[.]?\d*))\s+'
-    r'(?P<timestamp_end>(\d+[.]?\d*))\s+(?P<duration_ms>(\d+[.]?\d*))\s*'
+    r'(?P<timestamp_end>(\d+[.]?\d*))\s+(?P<duration_ms>(\d+[.]?\d*))\s*',
+)
+INVALID_SAMPLE_REGEX = re.compile(
+    r'(?P<timestamp>(\d+[.]?\d*))\s+\.\s+\.\s+0\.0\s+0\.0\s+\.\.\.\s*',
 )
-INVALID_SAMPLE_REGEX = r'(?P<timestamp>(\d+[.]?\d*))\s+\.\s+\.\s+0\.0\s+0\.0\s+\.\.\.\s*'
 
-CALIBRATION_TIMESTAMP_REGEX = r'MSG\s+(?P<timestamp>\d+[.]?\d*)\s+!CAL\s*\n'
+CALIBRATION_TIMESTAMP_REGEX = re.compile(r'MSG\s+(?P<timestamp>\d+[.]?\d*)\s+!CAL\s*\n')
 
-CALIBRATION_REGEX = (
+CALIBRATION_REGEX = re.compile(
     r'>+\s+CALIBRATION\s+\(HV(?P<num_points>\d\d?),'
     r'(?P<type>.*)\).*'
-    r'(?P<tracked_eye>RIGHT|LEFT):\s+<{9}'
+    r'(?P<tracked_eye>RIGHT|LEFT):\s+<{9}',
 )
 
-START_RECORDING_REGEX = r'START\s+(?P<timestamp>(\d+[.]?\d*))\s+(RIGHT|LEFT)\s+(?P<types>.*)'
-STOP_RECORDING_REGEX = (
+START_RECORDING_REGEX = re.compile(
+    r'START\s+(?P<timestamp>(\d+[.]?\d*))\s+(RIGHT|LEFT)\s+(?P<types>.*)',
+)
+STOP_RECORDING_REGEX = re.compile(
     r'END\s+(?P<timestamp>(\d+[.]?\d*))\s+\s+(?P<types>.*)\s+RES\s+'
-    r'(?P<xres>[\d\.]*)\s+(?P<yres>[\d\.]*)\s*'
+    r'(?P<xres>[\d\.]*)\s+(?P<yres>[\d\.]*)\s*',
 )
 
 
@@ -115,12 +115,12 @@ def check_nan(sample_location: str) -> float:
     return ret
 
 
-def compile_patterns(patterns: list[dict[str, Any]]) -> list[dict[str, Any]]:
+def compile_patterns(patterns: list[dict[str, Any] | str]) -> list[dict[str, Any]]:
     """Compile patterns from strings.
 
     Parameters
     ----------
-    patterns: list[dict[str, Any]]
+    patterns: list[dict[str, Any] | str]
         The list of patterns to compile.
 
     Returns
@@ -161,24 +161,25 @@ def compile_patterns(patterns: list[dict[str, Any]]) -> list[dict[str, Any]]:
     return compiled_patterns
 
 
-def get_additional_columns(compiled_patterns: list[dict[str, Any]]) -> set[str]:
-    """Get additionally needed columns from compiled patterns."""
-    additional_columns = set()
+def get_pattern_keys(compiled_patterns: list[dict[str, Any]], pattern_key: str) -> set[str]:
+    """Get capture group names and ``pattern_key`` values of compiled patterns."""
+    keys = set()
 
     for compiled_pattern_dict in compiled_patterns:
-        if 'column' in compiled_pattern_dict:
-            additional_columns.add(compiled_pattern_dict['column'])
+        if pattern_key in compiled_pattern_dict:
+            keys.add(compiled_pattern_dict[pattern_key])
 
-        for column in compiled_pattern_dict['pattern'].groupindex.keys():
-            additional_columns.add(column)
+        for key in compiled_pattern_dict['pattern'].groupindex.keys():
+            keys.add(key)
 
-    return additional_columns
+    return keys
 
 
 def parse_eyelink(
         filepath: Path | str,
-        patterns: list[dict[str, Any]] | None = None,
+        patterns: list[dict[str, Any] | str] | None = None,
         schema: dict[str, Any] | None = None,
+        metadata_patterns: list[dict[str, Any] | str] | None = None,
 ) -> tuple[pl.DataFrame, dict[str, Any]]:
     """Process EyeLink asc file.
 
@@ -186,10 +187,12 @@ def parse_eyelink(
     ----------
     filepath: Path | str
         file name of ascii file to convert.
-    patterns: list[dict[str, Any]] | None
-        list of patterns to match for additional columns. (default: None)
+    patterns: list[dict[str, Any] | str] | None
+        List of patterns to match for additional columns. (default: None)
     schema: dict[str, Any] | None
         Dictionary to optionally specify types of columns parsed by patterns. (default: None)
+    metadata_patterns: list[dict[str, Any] | str] | None
+        List of patterns to match for additional metadata. (default: None)
 
     Returns
     -------
@@ -205,7 +208,11 @@ def parse_eyelink(
         patterns = []
     compiled_patterns = compile_patterns(patterns)
 
-    additional_columns = get_additional_columns(compiled_patterns)
+    if metadata_patterns is None:
+        metadata_patterns = []
+    compiled_metadata_patterns = compile_patterns(metadata_patterns)
+
+    additional_columns = get_pattern_keys(compiled_patterns, 'column')
     additional: dict[str, list[Any]] = {
         additional_column: [] for additional_column in additional_columns
     }
@@ -227,21 +234,14 @@ def parse_eyelink(
     # will return an empty string if the key does not exist
     metadata: defaultdict = defaultdict(str)
 
-    compiled_metadata_patterns = []
-    for metadata_pattern in EYELINK_META_REGEXES:
-        compiled_metadata_patterns.append({'pattern': re.compile(metadata_pattern['pattern'])})
+    # metadata keys specified by the user should have a default value of None
+    metadata_keys = get_pattern_keys(compiled_metadata_patterns, 'key')
+    for key in metadata_keys:
+        metadata[key] = None
 
-    compiled_validation_pattern = re.compile(VALIDATION_REGEX)
-    compiled_calibration_pattern = re.compile(CALIBRATION_REGEX)
-    compiled_calibration_timestamp = re.compile(CALIBRATION_TIMESTAMP_REGEX)
-    cal_timestamp = ''
-
-    compiled_blink_start = re.compile(BLINK_START_REGEX)
-    compiled_blink_stop = re.compile(BLINK_STOP_REGEX)
-    compiled_invalid_sample = re.compile(INVALID_SAMPLE_REGEX)
+    compiled_metadata_patterns.extend(EYELINK_META_REGEXES)
 
-    compiled_recording_start = re.compile(START_RECORDING_REGEX)
-    compiled_recording_stop = re.compile(STOP_RECORDING_REGEX)
+    cal_timestamp = ''
 
     validations = []
     calibrations = []
@@ -264,8 +264,7 @@ def parse_eyelink(
                     current_additional[current_column] = pattern_dict['value']
 
                 else:
-                    for column, value in match.groupdict().items():
-                        current_additional[column] = value
+                    current_additional.update(match.groupdict())
 
         if cal_timestamp:
             # if a calibration timestamp has been found, the next line will be a
@@ -277,15 +276,15 @@ def parse_eyelink(
                     'timestamp': cal_timestamp,
                     **match.groupdict(),
                 }
-                if (match := compiled_calibration_pattern.match(line))
+                if (match := CALIBRATION_REGEX.match(line))
                 else {'timestamp': cal_timestamp},
             )
             cal_timestamp = ''
 
-        elif compiled_blink_start.match(line):
+        elif BLINK_START_REGEX.match(line):
             blink = True
 
-        elif match := compiled_blink_stop.match(line):
+        elif match := BLINK_STOP_REGEX.match(line):
             blink = False
             parsed_blink = match.groupdict()
             blink_info = {
@@ -297,10 +296,10 @@ def parse_eyelink(
             num_blink_samples = 0
             blinks.append(blink_info)
 
-        elif match := compiled_recording_start.match(line):
+        elif match := START_RECORDING_REGEX.match(line):
             start_recording_timestamp = match.groupdict()['timestamp']
 
-        elif match := compiled_recording_stop.match(line):
+        elif match := STOP_RECORDING_REGEX.match(line):
             stop_recording_timestamp = match.groupdict()['timestamp']
             block_duration = float(stop_recording_timestamp) - float(start_recording_timestamp)
 
@@ -326,50 +325,52 @@ def parse_eyelink(
             for additional_column in additional_columns:
                 samples[additional_column].append(current_additional[additional_column])
 
-            if (match := compiled_invalid_sample.match(line)) and not blink:
-                invalid_samples.append(match.groupdict()['timestamp'])
-            elif compiled_invalid_sample.match(line) and blink:
-                num_blink_samples += 1
+            if match := INVALID_SAMPLE_REGEX.match(line):
+                if blink:
+                    num_blink_samples += 1
+                else:
+                    invalid_samples.append(match.groupdict()['timestamp'])
 
-        elif match := compiled_calibration_timestamp.match(line):
+        elif match := CALIBRATION_TIMESTAMP_REGEX.match(line):
             cal_timestamp = match.groupdict()['timestamp']
 
-        elif match := compiled_validation_pattern.match(line):
+        elif match := VALIDATION_REGEX.match(line):
             validations.append(match.groupdict())
 
         elif compiled_metadata_patterns:
             for pattern_dict in compiled_metadata_patterns.copy():
                 if match := pattern_dict['pattern'].match(line):
-                    for column, value in match.groupdict().items():
-                        metadata[column] = value
+                    if 'value' in pattern_dict:
+                        metadata[pattern_dict['key']] = pattern_dict['value']
+
+                    else:
+                        metadata.update(match.groupdict())
 
                     # each metadata pattern should only match once
                     compiled_metadata_patterns.remove(pattern_dict)
 
-    if metadata:
-        # if the sampling rate is not found, we cannot calculate the data loss
-        actual_number_of_samples = len(samples['time'])
-
-        data_loss_ratio, data_loss_ratio_blinks = _calculate_data_loss(
-            blinks=blinks,
-            invalid_samples=invalid_samples,
-            actual_num_samples=actual_number_of_samples,
-            total_rec_duration=total_recording_duration,
-            sampling_rate=metadata['sampling_rate'],
-        )
+    if not metadata:
+        raise Warning('No metadata found. Please check the file for errors.')
 
-        pre_processed_metadata: dict[str, Any] = _pre_process_metadata(metadata)
+    # if the sampling rate is not found, we cannot calculate the data loss
+    actual_number_of_samples = len(samples['time'])
 
-        # is not yet pre-processed but should be
-        pre_processed_metadata['calibrations'] = calibrations
-        pre_processed_metadata['validations'] = validations
-        pre_processed_metadata['blinks'] = blinks
-        pre_processed_metadata['data_loss_ratio'] = data_loss_ratio
-        pre_processed_metadata['data_loss_ratio_blinks'] = data_loss_ratio_blinks
-        pre_processed_metadata['total_recording_duration_ms'] = total_recording_duration
+    data_loss_ratio, data_loss_ratio_blinks = _calculate_data_loss(
+        blinks=blinks,
+        invalid_samples=invalid_samples,
+        actual_num_samples=actual_number_of_samples,
+        total_rec_duration=total_recording_duration,
+        sampling_rate=metadata['sampling_rate'],
+    )
 
-    else:
-        raise Warning('No metadata found. Please check the file for errors.')
+    pre_processed_metadata: dict[str, Any] = _pre_process_metadata(metadata)
+    # the entries below are stored as-is: they are not yet pre-processed but should be
+    pre_processed_metadata['calibrations'] = calibrations
+    pre_processed_metadata['validations'] = validations
+    pre_processed_metadata['blinks'] = blinks
+    pre_processed_metadata['data_loss_ratio'] = data_loss_ratio
+    pre_processed_metadata['data_loss_ratio_blinks'] = data_loss_ratio_blinks
+    pre_processed_metadata['total_recording_duration_ms'] = total_recording_duration
 
     schema_overrides = {
         'time': pl.Float64,
@@ -378,8 +379,7 @@ def parse_eyelink(
         'pupil': pl.Float64,
     }
     if schema is not None:
-        for column, dtype in schema.items():
-            schema_overrides[column] = dtype
+        schema_overrides.update(schema)
 
     df = pl.from_dict(data=samples).cast(schema_overrides)
 
diff --git a/tests/functional/gaze_file_processing_test.py b/tests/functional/gaze_file_processing_test.py
index b824469ef..219fcaf91 100644
--- a/tests/functional/gaze_file_processing_test.py
+++ b/tests/functional/gaze_file_processing_test.py
@@ -156,7 +156,7 @@ def test_gaze_file_processing(gaze_from_kwargs):
     elif file_extension in {'.feather', '.ipc'}:
         gaze = pm.gaze.from_ipc(**gaze_from_kwargs)
     elif file_extension == '.asc':
-        gaze = pm.gaze.from_asc(**gaze_from_kwargs)
+        gaze, _ = pm.gaze.from_asc(**gaze_from_kwargs)
 
     assert gaze is not None
 
diff --git a/tests/unit/gaze/io/asc_test.py b/tests/unit/gaze/io/asc_test.py
index 58e0ef4f9..40afb2b67 100644
--- a/tests/unit/gaze/io/asc_test.py
+++ b/tests/unit/gaze/io/asc_test.py
@@ -131,11 +131,47 @@
     ],
 )
 def test_from_asc_has_shape_and_schema(kwargs, expected_frame):
-    gaze = pm.gaze.from_asc(**kwargs)
+    gaze, _ = pm.gaze.from_asc(**kwargs)
 
     assert_frame_equal(gaze.frame, expected_frame, check_column_order=False)
 
 
+@pytest.mark.parametrize(
+    ('kwargs', 'expected_metadata'),
+    [
+        pytest.param(
+            {
+                'file': 'tests/files/eyelink_monocular_example.asc',
+                'metadata_patterns': [
+                    {'pattern': r'!V TRIAL_VAR SUBJECT_ID (?P<subject_id>-?\d+)'},
+                    r'!V TRIAL_VAR STIMULUS_COMBINATION_ID (?P<stimulus_combination_id>.+)',
+                ],
+            },
+            {
+                'subject_id': '-1',
+                'stimulus_combination_id': 'start',
+            },
+            id='eyelink_asc_metadata_patterns',
+        ),
+        pytest.param(
+            {
+                'file': 'tests/files/eyelink_monocular_example.asc',
+                'metadata_patterns': [r'inexistent pattern (?P<value>-?\d+)'],
+            },
+            {
+                'value': None,
+            },
+            id='eyelink_asc_metadata_pattern_not_found',
+        ),
+    ],
+)
+def test_from_asc_metadata_patterns(kwargs, expected_metadata):
+    _, metadata = pm.gaze.from_asc(**kwargs)
+
+    for key, value in expected_metadata.items():
+        assert metadata[key] == value
+
+
 @pytest.mark.parametrize(
     ('kwargs', 'exception', 'message'),
     [
diff --git a/tests/unit/utils/parsing_test.py b/tests/unit/utils/parsing_test.py
index 0b8347bf4..f33e89fd8 100644
--- a/tests/unit/utils/parsing_test.py
+++ b/tests/unit/utils/parsing_test.py
@@ -65,6 +65,7 @@
 START	10000004 	RIGHT	SAMPLES	EVENTS
 10000004	  850.7	  717.5	  714.0	    0.0	...
 END	10000005 	SAMPLES	EVENTS	RES	  38.54	  31.12
+MSG 10000005 METADATA_1 123
 MSG 10000005 START_B
 the next line now should have the task column set to B
 START	10000006 	RIGHT	SAMPLES	EVENTS
@@ -85,11 +86,14 @@
 MSG 10000013 START_TRIAL_3
 the next line now should have the trial column set to 3
 START	10000014 	RIGHT	SAMPLES	EVENTS
+MSG 10000014 METADATA_2 abc
+MSG 10000014 METADATA_1 456
 10000014	  850.7	  717.5	  714.0	    0.0	...
 END	10000015 	SAMPLES	EVENTS	RES	  38.54	  31.12
 MSG 10000015 STOP_TRIAL_3
 MSG 10000016 STOP_B
 task and trial should be set to None again
+MSG 10000017 METADATA_3
 START	10000017 	RIGHT	SAMPLES	EVENTS
 10000017	  850.7	  717.5	  .	    0.0	...
 SBLINK R 10000018
@@ -125,6 +129,13 @@
     },
 ]
 
+METADATA_PATTERNS = [
+    r'METADATA_1 (?P<metadata_1>\d+)',
+    {'pattern': r'METADATA_2 (?P<metadata_2>\w+)'},
+    {'pattern': r'METADATA_3', 'key': 'metadata_3', 'value': True},
+    {'pattern': r'METADATA_4', 'key': 'metadata_4', 'value': True},
+]
+
 EXPECTED_DF = pl.from_dict(
     {
         'time': [
@@ -174,6 +185,10 @@
         'eyes_recorded': 'binocular / monocular',
         'short_name': 'BTABLER',
     },
+    'metadata_1': '123',
+    'metadata_2': 'abc',
+    'metadata_3': True,
+    'metadata_4': None,
 }
 
 
@@ -184,6 +199,7 @@ def test_parse_eyelink(tmp_path):
     df, metadata = pm.utils.parsing.parse_eyelink(
         filepath,
         patterns=PATTERNS,
+        metadata_patterns=METADATA_PATTERNS,
     )
 
     assert_frame_equal(df, EXPECTED_DF, check_column_order=False)

From 8500aa923787eda82191434c6c40f2634453eedc Mon Sep 17 00:00:00 2001
From: "David R. Reich" <43832476+SiQube@users.noreply.github.com>
Date: Mon, 30 Sep 2024 07:52:17 +0200
Subject: [PATCH 30/31] enable downloading precomputed events for copco dataset
 (#840)

---
 src/pymovements/datasets/copco.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pymovements/datasets/copco.py b/src/pymovements/datasets/copco.py
index 52e3400aa..c3423c2b5 100644
--- a/src/pymovements/datasets/copco.py
+++ b/src/pymovements/datasets/copco.py
@@ -131,7 +131,7 @@ class CopCo(DatasetDefinition):
     has_files: dict[str, bool] = field(
         default_factory=lambda: {
             'gaze': False,
-            'precomputed_events': False,
+            'precomputed_events': True,
             'precomputed_reading_measures': True,
         },
     )

From 504e398ed38d7d303c6a9cf911f79f7314266094 Mon Sep 17 00:00:00 2001
From: SiQube <reich.davidr@gmail.com>
Date: Mon, 30 Sep 2024 08:20:04 +0200
Subject: [PATCH 31/31] update sampling rate due to mutually exclusive sampling
 rate None and eyetracker None

---
 src/pymovements/datasets/codecomprehension.py | 54 ++++++++++++++-----
 1 file changed, 42 insertions(+), 12 deletions(-)

diff --git a/src/pymovements/datasets/codecomprehension.py b/src/pymovements/datasets/codecomprehension.py
index eeacc9222..c46748643 100644
--- a/src/pymovements/datasets/codecomprehension.py
+++ b/src/pymovements/datasets/codecomprehension.py
@@ -44,34 +44,62 @@ class CodeComprehension(DatasetDefinition):
 
     Attributes
     ----------
-    name : str
+    name: str
         The name of the dataset.
 
-    mirrors : tuple[str, ...]
+    has_files: dict[str, bool]
+        Indicate whether the dataset contains 'gaze', 'precomputed_events', and
+        'precomputed_reading_measures'.
+
+    mirrors: dict[str, tuple[str, ...]]
         A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
 
-    resources : tuple[dict[str, str], ...]
+    resources: dict[str, tuple[dict[str, str], ...]]
         A tuple of dataset gaze_resources. Each list entry must be a dictionary with the following
         keys:
         - `resource`: The url suffix of the resource. This will be concatenated with the mirror.
         - `filename`: The filename under which the file is saved as.
         - `md5`: The MD5 checksum of the respective file.
 
-    experiment : Experiment
+    extract: dict[str, bool]
+        Decide, per resource type (e.g. 'precomputed_events'), whether to extract the data.
+
+    experiment: Experiment
         The experiment definition.
 
-    filename_format : str
+    filename_format: dict[str, str]
         Regular expression which will be matched before trying to load the file. Namedgroups will
         appear in the `fileinfo` dataframe.
 
-    filename_format_dtypes : dict[str, type], optional
+    filename_format_schema_overrides: dict[str, dict[str, type]]
         If named groups are present in the `filename_format`, this makes it possible to cast
         specific named groups to a particular datatype.
 
-    column_map : dict[str, str]
+    trial_columns: list[str]
+        The name of the trial columns in the input data frame. If the list is empty or None,
+        the input data frame is assumed to contain only one trial. If the list is not empty,
+        the input data frame is assumed to contain multiple trials and the transformation
+        methods will be applied to each trial separately.
+
+    time_column: str
+        The name of the timestamp column in the input data frame. This column will be renamed to
+        ``time``.
+
+    time_unit: str
+        The unit of the timestamps in the timestamp column in the input data frame. Supported
+        units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is
+        'step' the experiment definition must be specified. All timestamps will be converted to
+        milliseconds.
+
+    pixel_columns: list[str]
+        The name of the pixel position columns in the input data frame. These columns will be
+        nested into the column ``pixel``. If the list is empty or None, the nested ``pixel``
+        column will not be created.
+
+    column_map: dict[str, str]
         The keys are the columns to read, the values are the names to which they should be renamed.
 
-    custom_read_kwargs : dict[str, Any], optional
+    custom_read_kwargs: dict[str, dict[str, Any]]
         If specified, these keyword arguments will be passed to the file reading function.
 
     Examples
@@ -104,7 +132,7 @@ class CodeComprehension(DatasetDefinition):
             'precomputed_reading_measures': False,
         },
     )
-    extract: dict[str, bool] = field(default_factory=lambda: {'precomputed_events': True})
+
     mirrors: dict[str, tuple[str, ...]] = field(
         default_factory=lambda: {
             'precomputed_events': ('https://zenodo.org/',),
@@ -125,6 +153,8 @@ class CodeComprehension(DatasetDefinition):
         },
     )
 
+    extract: dict[str, bool] = field(default_factory=lambda: {'precomputed_events': True})
+
     experiment: Experiment = Experiment(
         screen_width_px=None,
         screen_height_px=None,
@@ -132,7 +162,7 @@ class CodeComprehension(DatasetDefinition):
         screen_height_cm=None,
         distance_cm=None,
         origin=None,
-        sampling_rate=None,
+        sampling_rate=2000,
     )
 
     filename_format: dict[str, str] = field(
@@ -141,7 +171,7 @@ class CodeComprehension(DatasetDefinition):
         },
     )
 
-    filename_format_dtypes: dict[str, dict[str, type]] = field(
+    filename_format_schema_overrides: dict[str, dict[str, type]] = field(
         default_factory=lambda: {
             'precomputed_events': {'subject_id': pl.Utf8},
         },
@@ -157,7 +187,7 @@ class CodeComprehension(DatasetDefinition):
 
     column_map: dict[str, str] = field(default_factory=lambda: {})
 
-    custom_read_kwargs: dict[str, Any] = field(
+    custom_read_kwargs: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
             'precomputed_events': {
                 'separator': '\t',