From c8fa4fb56bda9db214f102761c8ae9123f8972ca Mon Sep 17 00:00:00 2001 From: SiQube Date: Thu, 19 Sep 2024 00:21:03 +0200 Subject: [PATCH 01/31] add code comprehension dataset --- docs/source/bibliography.bib | 19 ++ src/pymovements/datasets/__init__.py | 3 + src/pymovements/datasets/codecomprehension.py | 168 ++++++++++++++++++ 3 files changed, 190 insertions(+) create mode 100644 src/pymovements/datasets/codecomprehension.py diff --git a/docs/source/bibliography.bib b/docs/source/bibliography.bib index 46f3c7309..0d9bfc901 100644 --- a/docs/source/bibliography.bib +++ b/docs/source/bibliography.bib @@ -1,3 +1,22 @@ +@article{CodeComprehension, +author = {Alakmeh, Tarek and Reich, David and J\"{a}ger, Lena and Fritz, Thomas}, +title = {Predicting Code Comprehension: A Novel Approach to Align Human Gaze with Code using Deep Neural Networks}, +year = {2024}, +issue_date = {July 2024}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +volume = {1}, +number = {FSE}, +url = {https://doi.org/10.1145/3660795}, +doi = {10.1145/3660795}, +abstract = {The better the code quality and the less complex the code, the easier it is for software developers to comprehend and evolve it. Yet, how do we best detect quality concerns in the code? Existing measures to assess code quality, such as McCabe’s cyclomatic complexity, are decades old and neglect the human aspect. Research has shown that considering how a developer reads and experiences the code can be an indicator of its quality. In our research, we built on these insights and designed, trained, and evaluated the first deep neural network that aligns a developer’s eye gaze with the code tokens the developer looks at to predict code comprehension and perceived difficulty. To train and analyze our approach, we performed an experiment in which 27 participants worked on a range of 16 short code comprehension tasks while we collected fine-grained gaze data using an eye tracker. 
The results of our evaluation show that our deep neural sequence model that integrates both the human gaze and the stimulus code, can predict (a) code comprehension and (b) the perceived code difficulty significantly better than current state-of-the-art reference methods. We also show that aligning human gaze with code leads to better performance than models that rely solely on either code or human gaze. We discuss potential applications and propose future work to build better human-inclusive code evaluation systems.}, +journal = {Proc. ACM Softw. Eng.}, +month = {jul}, +articleno = {88}, +numpages = {23}, +keywords = {code comprehension, code-fixation attention, eye-tracking, lab experiment, neural networks} +} + @inproceedings{CopCoL1Hollenstein, title = "The Copenhagen Corpus of Eye Tracking Recordings from Natural Reading of {D}anish Texts", author = {Hollenstein, Nora and diff --git a/src/pymovements/datasets/__init__.py b/src/pymovements/datasets/__init__.py index d68e6c1bd..81e45c14c 100644 --- a/src/pymovements/datasets/__init__.py +++ b/src/pymovements/datasets/__init__.py @@ -25,6 +25,7 @@ :toctree: :template: class.rst + pymovements.datasets.CodeComprehension pymovements.datasets.CopCo pymovements.datasets.DIDEC pymovements.datasets.EMTeC @@ -47,6 +48,7 @@ pymovements.datasets.ToyDataset pymovements.datasets.ToyDatasetEyeLink """ +from pymovements.datasets.codecomprehension import CodeComprehension from pymovements.datasets.copco import CopCo from pymovements.datasets.didec import DIDEC from pymovements.datasets.emtec import EMTeC @@ -64,6 +66,7 @@ __all__ = [ + 'CodeComprehension', 'CopCo', 'DIDEC', 'EMTeC', diff --git a/src/pymovements/datasets/codecomprehension.py b/src/pymovements/datasets/codecomprehension.py new file mode 100644 index 000000000..4a2ce372a --- /dev/null +++ b/src/pymovements/datasets/codecomprehension.py @@ -0,0 +1,168 @@ +# Copyright (c) 2022-2024 The pymovements Project Authors +# +# Permission is hereby granted, free of charge, 
to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +"""Provides a definition for the CodeComprehension dataset.""" +from __future__ import annotations + +from dataclasses import dataclass +from dataclasses import field +from typing import Any + +import polars as pl + +from pymovements.dataset.dataset_definition import DatasetDefinition +from pymovements.dataset.dataset_library import register_dataset +from pymovements.gaze.experiment import Experiment + + +@dataclass +@register_dataset +class CodeComprehension(DatasetDefinition): + """CodeComprehension dataset :cite:p:`CodeComprehension`. + + This dataset includes eye-tracking-while-code-reading data from participants in a single + session. Eye movements are recorded at a sampling frequency of 1,000 Hz using an + EyeLink 1000 eye tracker and are provided as pixel coordinates. + + The participant is instructed to read the code snippet and answer a code comprehension question. + + Attributes + ---------- + name : str + The name of the dataset.
+ + mirrors : tuple[str, ...] + A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'. + + resources : tuple[dict[str, str], ...] + A tuple of dataset gaze_resources. Each list entry must be a dictionary with the following + keys: + - `resource`: The url suffix of the resource. This will be concatenated with the mirror. + - `filename`: The filename under which the file is saved as. + - `md5`: The MD5 checksum of the respective file. + + experiment : Experiment + The experiment definition. + + filename_format : str + Regular expression which will be matched before trying to load the file. Namedgroups will + appear in the `fileinfo` dataframe. + + filename_format_dtypes : dict[str, type], optional + If named groups are present in the `filename_format`, this makes it possible to cast + specific named groups to a particular datatype. + + column_map : dict[str, str] + The keys are the columns to read, the values are the names to which they should be renamed. + + custom_read_kwargs : dict[str, Any], optional + If specified, these keyword arguments will be passed to the file reading function. + + Examples + -------- + Initialize your :py:class:`~pymovements.PublicDataset` object with the + :py:class:`~pymovements.CodeComprehension` definition: + + >>> import pymovements as pm + >>> + >>> dataset = pm.Dataset("CodeComprehension", path='data/CodeComprehension') + + Download the dataset resources: + + >>> dataset.download()# doctest: +SKIP + + Load the data into memory: + + >>> dataset.load()# doctest: +SKIP + """ + + # pylint: disable=similarities + # The PublicDatasetDefinition child classes potentially share code chunks for definitions. 
+ + name: str = 'CodeComprehension' + + has_files: dict[str, bool] = field( + default_factory=lambda: { + 'gaze': False, + 'precomputed_events': True, + }, + ) + extract: dict[str, bool] = field(default_factory=lambda: {'precomputed_events': True}) + mirrors: dict[str, tuple[str, ...]] = field( + default_factory=lambda: { + 'precomputed_events': ('https://zenodo.org/',), + }, + ) + + resources: dict[str, tuple[dict[str, str], ...]] = field( + default_factory=lambda: { + 'precomputed_events': ( + { + 'resource': + 'records/11123101/files/Predicting%20Code%20Comprehension%20Package' + '.zip?download=1', + 'filename': 'data.zip', + 'md5': '3a3c6fb96550bc2c2ddcf5d458fb12a2', + }, + ), + }, + ) + + # TODO + experiment: Experiment = Experiment( + screen_width_px=1920, + screen_height_px=1080, + screen_width_cm=59., + screen_height_cm=33.5, + distance_cm=85, + origin='center', + sampling_rate=1000, + ) + + filename_format: dict[str, str] = field( + default_factory=lambda: { + 'precomputed_events': r'fix_report_P{subject_id:s}.txt', + }, + ) + + filename_format_dtypes: dict[str, dict[str, type]] = field( + default_factory=lambda: { + 'precomputed_events': {'subject_id': pl.Utf8}, + }, + ) + + trial_columns: list[str] = field(default_factory=lambda: []) + + time_column: str = '' + + time_unit: str = '' + + pixel_columns: list[str] = field(default_factory=lambda: []) + + column_map: dict[str, str] = field(default_factory=lambda: {}) + + custom_read_kwargs: dict[str, Any] = field( + default_factory=lambda: { + 'precomputed_events': { + 'separator': '\t', + 'null_values': '.', + 'quote_char': '"', + }, + }, + ) From 20b69ab445e0e6b0dfb096d617533d76a2693644 Mon Sep 17 00:00:00 2001 From: SiQube Date: Thu, 19 Sep 2024 00:21:37 +0200 Subject: [PATCH 02/31] add code comprehension dataset tests --- src/pymovements/datasets/codecomprehension.py | 16 ++++++++-------- tests/unit/datasets/datasets_test.py | 1 + 2 files changed, 9 insertions(+), 8 deletions(-) diff --git 
a/src/pymovements/datasets/codecomprehension.py b/src/pymovements/datasets/codecomprehension.py index 4a2ce372a..eeacc9222 100644 --- a/src/pymovements/datasets/codecomprehension.py +++ b/src/pymovements/datasets/codecomprehension.py @@ -101,6 +101,7 @@ class CodeComprehension(DatasetDefinition): default_factory=lambda: { 'gaze': False, 'precomputed_events': True, + 'precomputed_reading_measures': False, }, ) extract: dict[str, bool] = field(default_factory=lambda: {'precomputed_events': True}) @@ -124,15 +125,14 @@ class CodeComprehension(DatasetDefinition): }, ) - # TODO experiment: Experiment = Experiment( - screen_width_px=1920, - screen_height_px=1080, - screen_width_cm=59., - screen_height_cm=33.5, - distance_cm=85, - origin='center', - sampling_rate=1000, + screen_width_px=None, + screen_height_px=None, + screen_width_cm=None, + screen_height_cm=None, + distance_cm=None, + origin=None, + sampling_rate=None, ) filename_format: dict[str, str] = field( diff --git a/tests/unit/datasets/datasets_test.py b/tests/unit/datasets/datasets_test.py index b71700410..9f71da7ac 100644 --- a/tests/unit/datasets/datasets_test.py +++ b/tests/unit/datasets/datasets_test.py @@ -31,6 +31,7 @@ ('public_dataset', 'dataset_name'), # XXX: add public dataset in alphabetical order [ + pytest.param(pm.datasets.CodeComprehension, 'CodeComprehension', id='CodeComprehension'), pytest.param(pm.datasets.CopCo, 'CopCo', id='CopCo'), pytest.param(pm.datasets.DIDEC, 'DIDEC', id='DIDEC'), pytest.param(pm.datasets.EMTeC, 'EMTeC', id='EMTeC'), From 268966fda56671b13d988bbcf2ce9c17a3cccedc Mon Sep 17 00:00:00 2001 From: "David R. 
Reich" <43832476+SiQube@users.noreply.github.com> Date: Fri, 27 Sep 2024 09:24:40 +0200 Subject: [PATCH 03/31] update docstring of utils downloads (#838) --- src/pymovements/utils/downloads.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/pymovements/utils/downloads.py b/src/pymovements/utils/downloads.py index ad729063f..9b7958802 100644 --- a/src/pymovements/utils/downloads.py +++ b/src/pymovements/utils/downloads.py @@ -219,19 +219,6 @@ class _DownloadProgressBar(tqdm): # pylint: disable=inconsistent-mro Parameters ---------- **kwargs : Any - - Attributes - ---------- - unit: str - Unit of progress bar. - unit_scale: bool - If True, scale progress bar to unit. - unit_divisor: int - Divisor of progress bar. - miniters: int - Minimum number of iterations between updates. - **kwargs: Any - Keyword arguments passed to `tqdm.tqdm`. """ def __init__(self, **kwargs: Any): From 14a03f851cd1e970031da873e458b3f1c36e0e68 Mon Sep 17 00:00:00 2001 From: "David R. Reich" <43832476+SiQube@users.noreply.github.com> Date: Fri, 27 Sep 2024 10:04:08 +0200 Subject: [PATCH 04/31] update docstring of copco dataset definition (#821) --- src/pymovements/datasets/copco.py | 57 +++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/src/pymovements/datasets/copco.py b/src/pymovements/datasets/copco.py index 53b466435..b6603c644 100644 --- a/src/pymovements/datasets/copco.py +++ b/src/pymovements/datasets/copco.py @@ -47,34 +47,60 @@ class CopCo(DatasetDefinition): Attributes ---------- - name : str + name: str The name of the dataset. - mirrors : tuple[str, ...] + has_files: dict[str, bool] + Indicate whether the dataset contains 'gaze', 'precomputed_events', and + 'precomputed_reading_measures'. + + mirrors: dict[str, tuple[str, ...]] A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'. - resources : tuple[dict[str, str], ...] 
+ resources: dict[str, tuple[dict[str, str | None], ...]] A tuple of dataset gaze_resources. Each list entry must be a dictionary with the following keys: - `resource`: The url suffix of the resource. This will be concatenated with the mirror. - `filename`: The filename under which the file is saved as. - `md5`: The MD5 checksum of the respective file. - experiment : Experiment + experiment: Experiment The experiment definition. - filename_format : str + extract: dict[str, bool] + Decide whether to extract the data. + + filename_format: dict[str, str] Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. - filename_format_dtypes : dict[str, type], optional + filename_format_dtypes: dict[str, dict[str, type]] If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. - - column_map : dict[str, str] + trial_columns: list[str] + The name of the trial columns in the input data frame. If the list is empty or None, + the input data frame is assumed to contain only one trial. If the list is not empty, + the input data frame is assumed to contain multiple trials and the transformation + methods will be applied to each trial separately. + time_column: str + The name of the timestamp column in the input data frame. This column will be renamed to + ``time``. + + time_unit: str + The unit of the timestamps in the timestamp column in the input data frame. Supported + units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is + 'step' the experiment definition must be specified. All timestamps will be converted to + milliseconds. + + pixel_columns: list[str] + The name of the pixel position columns in the input data frame. These columns will be + nested into the column ``pixel``. If the list is empty or None, the nested ``pixel`` + column will not be created. 
+ + column_map: dict[str, str] The keys are the columns to read, the values are the names to which they should be renamed. - custom_read_kwargs : dict[str, Any], optional + custom_read_kwargs: dict[str, Any] If specified, these keyword arguments will be passed to the file reading function. Examples @@ -107,12 +133,6 @@ class CopCo(DatasetDefinition): 'precomputed_reading_measures': True, }, ) - extract: dict[str, bool] = field( - default_factory=lambda: { - 'precomputed_events': True, - 'precomputed_reading_measures': True, - }, - ) mirrors: dict[str, tuple[str, ...]] = field( default_factory=lambda: { 'precomputed_events': ('https://files.de-1.osf.io/',), @@ -150,6 +170,13 @@ class CopCo(DatasetDefinition): sampling_rate=1000, ) + extract: dict[str, bool] = field( + default_factory=lambda: { + 'precomputed_events': True, + 'precomputed_reading_measures': True, + }, + ) + filename_format: dict[str, str] = field( default_factory=lambda: { 'precomputed_events': r'FIX_report_P{subject_id:d}.txt', From 580eae7525079bb12b714b03b3e19c074455fc06 Mon Sep 17 00:00:00 2001 From: "David R. Reich" <43832476+SiQube@users.noreply.github.com> Date: Fri, 27 Sep 2024 11:11:03 +0200 Subject: [PATCH 05/31] update docstring of dataset definition (#820) --- src/pymovements/dataset/dataset_definition.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/pymovements/dataset/dataset_definition.py b/src/pymovements/dataset/dataset_definition.py index 40168e9cf..eaff6e13b 100644 --- a/src/pymovements/dataset/dataset_definition.py +++ b/src/pymovements/dataset/dataset_definition.py @@ -35,6 +35,9 @@ class DatasetDefinition: ---------- name: str The name of the dataset. (default: '.') + has_files: dict[str, bool] + Indicate whether the dataset contains 'gaze', 'precomputed_events', and + 'precomputed_reading_measures'. mirrors: dict[str, tuple[str, ...]] A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'. 
(default: field(default_factory=dict)) @@ -44,16 +47,16 @@ class DatasetDefinition: - `filename`: The filename under which the file is saved as. - `md5`: The MD5 checksum of the respective file. (default: field(default_factory=dict)) - experiment: Experiment + experiment: Experiment | None The experiment definition. (default: None) + extract: dict[str, bool] + Decide whether to extract the data. filename_format: dict[str, str] Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. (default: field(default_factory=dict)) filename_format_dtypes: dict[str, dict[str, type]] If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. (default: field(default_factory=dict)) - extract: dict[str, bool] - Decide whether to extract the data. custom_read_kwargs: dict[str, dict[str, Any]] If specified, these keyword arguments will be passed to the file reading function. The behavior of this argument depends on the file extension of the dataset files. From 4532de24123f3800f83ba2a15b7cada473cef406 Mon Sep 17 00:00:00 2001 From: "David R. Reich" <43832476+SiQube@users.noreply.github.com> Date: Fri, 27 Sep 2024 12:47:18 +0200 Subject: [PATCH 06/31] update docstring of fakenews dataset definition (#824) --- src/pymovements/datasets/fakenews.py | 52 +++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/src/pymovements/datasets/fakenews.py b/src/pymovements/datasets/fakenews.py index c352dbf27..f964fd8f2 100644 --- a/src/pymovements/datasets/fakenews.py +++ b/src/pymovements/datasets/fakenews.py @@ -44,31 +44,54 @@ class FakeNewsPerception(DatasetDefinition): Attributes ---------- - name : str + name: str The name of the dataset. - mirrors : tuple[str, ...] + has_files: dict[str, bool] + Indicate whether the dataset contains 'gaze', 'precomputed_events', and + 'precomputed_reading_measures'. 
+ mirrors: dict[str, tuple[str, ...]] A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'. - resources : tuple[dict[str, str], ...] + resources: dict[str, tuple[dict[str, str], ...]] A tuple of dataset gaze_resources. Each list entry must be a dictionary with the following keys: - `resource`: The url suffix of the resource. This will be concatenated with the mirror. - `filename`: The filename under which the file is saved as. - `md5`: The MD5 checksum of the respective file. - experiment : Experiment + experiment: Experiment The experiment definition. - filename_format : str + extract: dict[str, bool] + Decide whether to extract the data. + filename_format: dict[str, str] Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. - filename_format_dtypes : dict[str, type], optional + filename_format_dtypes: dict[str, dict[str, type]] If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. - column_map : dict[str, str] + trial_columns: list[str] + The name of the trial columns in the input data frame. If the list is empty or None, + the input data frame is assumed to contain only one trial. If the list is not empty, + the input data frame is assumed to contain multiple trials and the transformation + methods will be applied to each trial separately. + time_column: str + The name of the timestamp column in the input data frame. This column will be renamed to + ``time``. + time_unit: str + The unit of the timestamps in the timestamp column in the input data frame. Supported + units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is + 'step' the experiment definition must be specified. All timestamps will be converted to + milliseconds. + pixel_columns: list[str] + The name of the pixel position columns in the input data frame. 
These columns will be + nested into the column ``pixel``. If the list is empty or None, the nested ``pixel`` + column will not be created. + column_map: dict[str, str] The keys are the columns to read, the values are the names to which they should be renamed. - custom_read_kwargs : dict[str, Any], optional + custom_read_kwargs: dict[str, Any] If specified, these keyword arguments will be passed to the file reading function. """ name: str = 'FakeNewsPerception' + has_files: dict[str, bool] = field( default_factory=lambda: { 'gaze': False, @@ -76,12 +99,13 @@ class FakeNewsPerception(DatasetDefinition): 'precomputed_reading_measures': False, }, ) - extract: dict[str, bool] = field(default_factory=lambda: {'precomputed_events': True}) + mirrors: dict[str, tuple[str, ...]] = field( default_factory=lambda: { 'precomputed_events': ('https://doi.org/10.7910/DVN/C1UD2A',), }, ) + resources: dict[str, tuple[dict[str, str], ...]] = field( default_factory=lambda: { 'precomputed_events': ( @@ -93,6 +117,7 @@ class FakeNewsPerception(DatasetDefinition): ), }, ) + experiment: Experiment = Experiment( screen_width_px=1920, screen_height_px=1080, @@ -103,21 +128,30 @@ class FakeNewsPerception(DatasetDefinition): sampling_rate=600, ) + extract: dict[str, bool] = field(default_factory=lambda: {'precomputed_events': True}) + filename_format: dict[str, str] = field( default_factory=lambda: { 'precomputed_events': r'P{subject_id:d}_{session_id:d}_{truth_value:s}.csv', }, ) + filename_format_dtypes: dict[str, dict[str, type]] = field( default_factory=lambda: { 'precomputed_events': {'subject_id': int, 'session_id': int, 'truth_value': str}, }, ) + trial_columns: list[str] = field(default_factory=lambda: []) + time_column: str = 'starttime' + time_unit: str = 'milliseconds' + pixel_columns: list[str] = field(default_factory=lambda: []) + column_map: dict[str, str] = field(default_factory=lambda: {}) + custom_read_kwargs: dict[str, Any] = field( default_factory=lambda: { 
'precomputed_events': { From 625184909dcf04bb5c9cfa4e388cafb9633c4160 Mon Sep 17 00:00:00 2001 From: "David R. Reich" <43832476+SiQube@users.noreply.github.com> Date: Fri, 27 Sep 2024 13:14:24 +0200 Subject: [PATCH 07/31] update docstring of gaze screen (#837) --- src/pymovements/gaze/screen.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/src/pymovements/gaze/screen.py b/src/pymovements/gaze/screen.py index ea4458898..9ae21e270 100644 --- a/src/pymovements/gaze/screen.py +++ b/src/pymovements/gaze/screen.py @@ -51,29 +51,6 @@ class Screen: Specifies the screen location of the origin of the pixel coordinate system. (default: 'upper left') - Attributes - ---------- - width_px: int - Screen width in pixels - height_px: int - Screen height in pixels - width_cm: float - Screen width in centimeters - height_cm: float - Screen height in centimeters - distance_cm: float - Eye-to-screen distance in centimeters - origin: str - Specifies the screen location of the origin of the pixel coordinate system. - x_max_dva: float - Maximum screen x-coordinate in degrees of visual angle - y_max_dva: float - Minimum screen y-coordinate in degrees of visual angle - x_min_dva: float - Maximum screen x-coordinate in degrees of visual angle - y_min_dva: float - Minimum screen y-coordinate in degrees of visual angle - Examples -------- >>> screen = Screen( From 5e0297bb7a9efd435870e26d0a36f908d7a20b11 Mon Sep 17 00:00:00 2001 From: "David R. Reich" <43832476+SiQube@users.noreply.github.com> Date: Fri, 27 Sep 2024 13:29:27 +0200 Subject: [PATCH 08/31] update docstring of gaze experiment (#836) --- src/pymovements/gaze/experiment.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/pymovements/gaze/experiment.py b/src/pymovements/gaze/experiment.py index f51da37e2..d8015f1be 100644 --- a/src/pymovements/gaze/experiment.py +++ b/src/pymovements/gaze/experiment.py @@ -82,14 +82,6 @@ class Experiment: -12.42... 
>>> experiment.screen.y_max_dva# doctest:+ELLIPSIS 12.42... - - - Attributes - ---------- - screen: Screen - Screen object for experiment - eyetracker : EyeTracker | None - Eye tracker for experiment """ def __init__( From be0501c08ea3d824efcd2c28d002fe38254d88e1 Mon Sep 17 00:00:00 2001 From: "David R. Reich" <43832476+SiQube@users.noreply.github.com> Date: Fri, 27 Sep 2024 14:06:05 +0200 Subject: [PATCH 09/31] update docstring of events processing (#835) --- src/pymovements/events/processing.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/pymovements/events/processing.py b/src/pymovements/events/processing.py index 7e97346a8..517765558 100644 --- a/src/pymovements/events/processing.py +++ b/src/pymovements/events/processing.py @@ -35,11 +35,6 @@ class EventProcessor: """Processes event and gaze dataframes. - Attributes - ---------- - event_properties: list[str] - A list of property names. - Parameters ---------- event_properties: str | list[str] @@ -95,12 +90,6 @@ def process(self, events: EventDataFrame) -> pl.DataFrame: class EventGazeProcessor: """Processes event and gaze dataframes. - Attributes - ---------- - event_properties: list[str] - A list of property names. - - Parameters ---------- event_properties: str | tuple[str, dict[str, Any]] | list[str | tuple[str, dict[str, Any]]] From d3eecee9eb1da574c58631921793a25a7984d82e Mon Sep 17 00:00:00 2001 From: "David R. 
Reich" <43832476+SiQube@users.noreply.github.com> Date: Fri, 27 Sep 2024 14:22:46 +0200 Subject: [PATCH 10/31] update docstring of toy_dataset_eyelink dataset definition (#834) --- .../datasets/toy_dataset_eyelink.py | 37 +++++++++++++++++-- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/src/pymovements/datasets/toy_dataset_eyelink.py b/src/pymovements/datasets/toy_dataset_eyelink.py index 38e7afd58..1dcd2a661 100644 --- a/src/pymovements/datasets/toy_dataset_eyelink.py +++ b/src/pymovements/datasets/toy_dataset_eyelink.py @@ -48,15 +48,23 @@ class ToyDatasetEyeLink(DatasetDefinition): name: str The name of the dataset. + has_files: dict[str, bool] + Indicate whether the dataset contains 'gaze', 'precomputed_events', and + 'precomputed_reading_measures'. + mirrors: dict[str, tuple[str, ...]] A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'. resources: dict[str, tuple[dict[str, str], ...]] - A tuple of dataset resources. Each list entry must be a dictionary with the following keys: + A tuple of dataset gaze_resources. Each list entry must be a dictionary with the following + keys: - `resource`: The url suffix of the resource. This will be concatenated with the mirror. - `filename`: The filename under which the file is saved as. - `md5`: The MD5 checksum of the respective file. + extract: dict[str, bool] + Decide whether to extract the data. + experiment: Experiment The experiment definition. @@ -68,6 +76,27 @@ class ToyDatasetEyeLink(DatasetDefinition): If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. + trial_columns: list[str] + The name of the trial columns in the input data frame. If the list is empty or None, + the input data frame is assumed to contain only one trial. If the list is not empty, + the input data frame is assumed to contain multiple trials and the transformation + methods will be applied to each trial separately. 
+ + time_column: str + The name of the timestamp column in the input data frame. This column will be renamed to + ``time``. + + time_unit: str + The unit of the timestamps in the timestamp column in the input data frame. Supported + units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is + 'step' the experiment definition must be specified. All timestamps will be converted to + milliseconds. + + pixel_columns: list[str] + The name of the pixel position columns in the input data frame. These columns will be + nested into the column ``pixel``. If the list is empty or None, the nested ``pixel`` + column will not be created. + column_map: dict[str, str] The keys are the columns to read, the values are the names to which they should be renamed. @@ -104,6 +133,7 @@ class ToyDatasetEyeLink(DatasetDefinition): 'precomputed_reading_measures': False, }, ) + mirrors: dict[str, tuple[str, ...]] = field( default_factory=lambda: { 'gaze': ( @@ -124,6 +154,7 @@ class ToyDatasetEyeLink(DatasetDefinition): ), }, ) + extract: dict[str, bool] = field(default_factory=lambda: {'gaze': True}) experiment: Experiment = Experiment( @@ -157,8 +188,6 @@ class ToyDatasetEyeLink(DatasetDefinition): }, ) - column_map: dict[str, str] = field(default_factory=lambda: {}) - trial_columns: list[str] = field(default_factory=lambda: ['subject_id', 'session_id']) time_column: str = 'time' @@ -167,6 +196,8 @@ class ToyDatasetEyeLink(DatasetDefinition): pixel_columns: list[str] = field(default_factory=lambda: ['x_pix', 'y_pix']) + column_map: dict[str, str] = field(default_factory=lambda: {}) + custom_read_kwargs: dict[str, dict[str, Any]] = field( default_factory=lambda: { 'gaze': { From fd8dad49adf7eb126f63542b904387a77b6753ec Mon Sep 17 00:00:00 2001 From: "David R. 
Reich" <43832476+SiQube@users.noreply.github.com> Date: Fri, 27 Sep 2024 14:48:23 +0200 Subject: [PATCH 11/31] update docstring of toy_dataset dataset definition (#833) --- src/pymovements/datasets/toy_dataset.py | 31 ++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/src/pymovements/datasets/toy_dataset.py b/src/pymovements/datasets/toy_dataset.py index 553b5d0fe..7f45ea72c 100644 --- a/src/pymovements/datasets/toy_dataset.py +++ b/src/pymovements/datasets/toy_dataset.py @@ -47,15 +47,23 @@ class ToyDataset(DatasetDefinition): name: str The name of the dataset. + has_files: dict[str, bool] + Indicate whether the dataset contains 'gaze', 'precomputed_events', and + 'precomputed_reading_measures'. + mirrors: dict[str, tuple[str, ...]] A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'. resources: dict[str, tuple[dict[str, str], ...]] - A tuple of dataset resources. Each list entry must be a dictionary with the following keys: + A tuple of dataset gaze_resources. Each list entry must be a dictionary with the following + keys: - `resource`: The url suffix of the resource. This will be concatenated with the mirror. - `filename`: The filename under which the file is saved as. - `md5`: The MD5 checksum of the respective file. + extract: dict[str, bool] + Decide whether to extract the data. + experiment: Experiment The experiment definition. @@ -67,6 +75,27 @@ class ToyDataset(DatasetDefinition): If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. + trial_columns: list[str] + The name of the trial columns in the input data frame. If the list is empty or None, + the input data frame is assumed to contain only one trial. If the list is not empty, + the input data frame is assumed to contain multiple trials and the transformation + methods will be applied to each trial separately. 
+ + time_column: str + The name of the timestamp column in the input data frame. This column will be renamed to + ``time``. + + time_unit: str + The unit of the timestamps in the timestamp column in the input data frame. Supported + units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is + 'step' the experiment definition must be specified. All timestamps will be converted to + milliseconds. + + pixel_columns: list[str] + The name of the pixel position columns in the input data frame. These columns will be + nested into the column ``pixel``. If the list is empty or None, the nested ``pixel`` + column will not be created. + column_map: dict[str, str] The keys are the columns to read, the values are the names to which they should be renamed. From 1ad8db75e49eb9d8aaa68672b2429cd7b5801d4c Mon Sep 17 00:00:00 2001 From: "David R. Reich" <43832476+SiQube@users.noreply.github.com> Date: Fri, 27 Sep 2024 18:09:44 +0200 Subject: [PATCH 12/31] update docstring of sb_sat dataset definition (#832) --- src/pymovements/datasets/sb_sat.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/src/pymovements/datasets/sb_sat.py b/src/pymovements/datasets/sb_sat.py index a4b1c4e0e..1974bd77b 100644 --- a/src/pymovements/datasets/sb_sat.py +++ b/src/pymovements/datasets/sb_sat.py @@ -49,6 +49,10 @@ class SBSAT(DatasetDefinition): name: str The name of the dataset. + has_files: dict[str, bool] + Indicate whether the dataset contains 'gaze', 'precomputed_events', and + 'precomputed_reading_measures'. + mirrors: dict[str, tuple[str, ...]] A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'. @@ -59,6 +63,9 @@ class SBSAT(DatasetDefinition): - `filename`: The filename under which the file is saved as. - `md5`: The MD5 checksum of the respective file. + extract: dict[str, bool] + Decide whether to extract the data. + experiment: Experiment The experiment definition. 
@@ -70,10 +77,31 @@ class SBSAT(DatasetDefinition): If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. + trial_columns: list[str] + The name of the trial columns in the input data frame. If the list is empty or None, + the input data frame is assumed to contain only one trial. If the list is not empty, + the input data frame is assumed to contain multiple trials and the transformation + methods will be applied to each trial separately. + + time_column: str + The name of the timestamp column in the input data frame. This column will be renamed to + ``time``. + + time_unit: str + The unit of the timestamps in the timestamp column in the input data frame. Supported + units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is + 'step' the experiment definition must be specified. All timestamps will be converted to + milliseconds. + + pixel_columns: list[str] + The name of the pixel position columns in the input data frame. These columns will be + nested into the column ``pixel``. If the list is empty or None, the nested ``pixel`` + column will not be created. + column_map: dict[str, str] The keys are the columns to read, the values are the names to which they should be renamed. - custom_read_kwargs: dict[str, dict[str, dict[str, Any]]] + custom_read_kwargs: dict[str, dict[str, Any]] If specified, these keyword arguments will be passed to the file reading function. Examples From 5903c9355e32a9cc5ae3f0fb2fd50514a2e87f11 Mon Sep 17 00:00:00 2001 From: "David R. 
Reich" <43832476+SiQube@users.noreply.github.com> Date: Fri, 27 Sep 2024 18:29:34 +0200 Subject: [PATCH 13/31] update docstring of potec dataset definition (#831) --- src/pymovements/datasets/potec.py | 34 +++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/src/pymovements/datasets/potec.py b/src/pymovements/datasets/potec.py index 12dc9a64e..d26cb3160 100644 --- a/src/pymovements/datasets/potec.py +++ b/src/pymovements/datasets/potec.py @@ -59,15 +59,23 @@ class PoTeC(DatasetDefinition): name: str The name of the dataset. + has_files: dict[str, bool] + Indicate whether the dataset contains 'gaze', 'precomputed_events', and + 'precomputed_reading_measures'. + mirrors: dict[str, tuple[str, ...]] A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'. resources: dict[str, tuple[dict[str, str], ...]] - A tuple of dataset resources. Each list entry must be a dictionary with the following keys: + A tuple of dataset resources. Each list entry must be a dictionary with the following + keys: - `resource`: The url suffix of the resource. This will be concatenated with the mirror. - `filename`: The filename under which the file is saved as. - `md5`: The MD5 checksum of the respective file. + extract: dict[str, bool] + Decide whether to extract the data. + experiment: Experiment The experiment definition. @@ -79,8 +87,26 @@ class PoTeC(DatasetDefinition): If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. - column_map: dict[str, str] - The keys are the columns to read, the values are the names to which they should be renamed. + trial_columns: list[str] + The name of the trial columns in the input data frame. If the list is empty or None, + the input data frame is assumed to contain only one trial. 
If the list is not empty, + the input data frame is assumed to contain multiple trials and the transformation + methods will be applied to each trial separately. + + time_column: str + The name of the timestamp column in the input data frame. This column will be renamed to + ``time``. + + time_unit: str + The unit of the timestamps in the timestamp column in the input data frame. Supported + units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is + 'step' the experiment definition must be specified. All timestamps will be converted to + milliseconds. + + pixel_columns: list[str] + The name of the pixel position columns in the input data frame. These columns will be + nested into the column ``pixel``. If the list is empty or None, the nested ``pixel`` + column will not be created. custom_read_kwargs: dict[str, dict[str, Any]] If specified, these keyword arguments will be passed to the file reading function. @@ -175,7 +201,7 @@ class PoTeC(DatasetDefinition): ], ) - custom_read_kwargs: dict[str, Any] = field( + custom_read_kwargs: dict[str, dict[str, Any]] = field( default_factory=lambda: { 'gaze': { 'dtypes': { From d9e2b2af7a7e2d37d3ec26ffb74d070d3b3b5cac Mon Sep 17 00:00:00 2001 From: "David R. Reich" <43832476+SiQube@users.noreply.github.com> Date: Fri, 27 Sep 2024 18:42:51 +0200 Subject: [PATCH 14/31] update docstring of judo1000 dataset definition (#830) --- src/pymovements/datasets/judo1000.py | 34 ++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/src/pymovements/datasets/judo1000.py b/src/pymovements/datasets/judo1000.py index c2fd57d94..dfbf1f663 100644 --- a/src/pymovements/datasets/judo1000.py +++ b/src/pymovements/datasets/judo1000.py @@ -49,15 +49,23 @@ class JuDo1000(DatasetDefinition): name: str The name of the dataset. + has_files: dict[str, bool] + Indicate whether the dataset contains 'gaze', 'precomputed_events', and + 'precomputed_reading_measures'. 
+ mirrors: dict[str, tuple[str, ...]] A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'. resources: dict[str, tuple[dict[str, str], ...]] - A tuple of dataset resources. Each list entry must be a dictionary with the following keys: + A tuple of dataset resources. Each list entry must be a dictionary with the following + keys: - `resource`: The url suffix of the resource. This will be concatenated with the mirror. - `filename`: The filename under which the file is saved as. - `md5`: The MD5 checksum of the respective file. + extract: dict[str, bool] + Decide whether to extract the data. + experiment: Experiment The experiment definition. @@ -69,12 +77,34 @@ class JuDo1000(DatasetDefinition): If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. + trial_columns: list[str] + The name of the trial columns in the input data frame. If the list is empty or None, + the input data frame is assumed to contain only one trial. If the list is not empty, + the input data frame is assumed to contain multiple trials and the transformation + methods will be applied to each trial separately. + + time_column: str + The name of the timestamp column in the input data frame. This column will be renamed to + ``time``. + + time_unit: str + The unit of the timestamps in the timestamp column in the input data frame. Supported + units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is + 'step' the experiment definition must be specified. All timestamps will be converted to + milliseconds. + + pixel_columns: list[str] + The name of the pixel position columns in the input data frame. These columns will be + nested into the column ``pixel``. If the list is empty or None, the nested ``pixel`` + column will not be created. + column_map: dict[str, str] The keys are the columns to read, the values are the names to which they should be renamed. 
custom_read_kwargs: dict[str, dict[str, Any]] If specified, these keyword arguments will be passed to the file reading function. + Examples -------- Initialize your :py:class:`~pymovements.PublicDataset` object with the @@ -172,7 +202,7 @@ class JuDo1000(DatasetDefinition): }, ) - custom_read_kwargs: dict[str, Any] = field( + custom_read_kwargs: dict[str, dict[str, Any]] = field( default_factory=lambda: { 'gaze': { 'dtypes': { From fdab5de4d4ae99df0b7ceeb92e8213e6f3db5470 Mon Sep 17 00:00:00 2001 From: "David R. Reich" <43832476+SiQube@users.noreply.github.com> Date: Fri, 27 Sep 2024 23:05:50 +0200 Subject: [PATCH 15/31] update docstring of hbn dataset definition (#829) --- src/pymovements/datasets/hbn.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/src/pymovements/datasets/hbn.py b/src/pymovements/datasets/hbn.py index 8c16b314b..0ef7b0922 100644 --- a/src/pymovements/datasets/hbn.py +++ b/src/pymovements/datasets/hbn.py @@ -50,15 +50,23 @@ class HBN(DatasetDefinition): name: str The name of the dataset. + has_files: dict[str, bool] + Indicate whether the dataset contains 'gaze', 'precomputed_events', and + 'precomputed_reading_measures'. + mirrors: dict[str, tuple[str, ...]] A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'. resources: dict[str, tuple[dict[str, str], ...]] - A tuple of dataset resources. Each list entry must be a dictionary with the following keys: + A tuple of dataset resources. Each list entry must be a dictionary with the following + keys: - `resource`: The url suffix of the resource. This will be concatenated with the mirror. - `filename`: The filename under which the file is saved as. - `md5`: The MD5 checksum of the respective file. + extract: dict[str, bool] + Decide whether to extract the data. + experiment: Experiment The experiment definition. 
@@ -70,6 +78,27 @@ class HBN(DatasetDefinition): If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. + trial_columns: list[str] + The name of the trial columns in the input data frame. If the list is empty or None, + the input data frame is assumed to contain only one trial. If the list is not empty, + the input data frame is assumed to contain multiple trials and the transformation + methods will be applied to each trial separately. + + time_column: str + The name of the timestamp column in the input data frame. This column will be renamed to + ``time``. + + time_unit: str + The unit of the timestamps in the timestamp column in the input data frame. Supported + units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is + 'step' the experiment definition must be specified. All timestamps will be converted to + milliseconds. + + pixel_columns: list[str] + The name of the pixel position columns in the input data frame. These columns will be + nested into the column ``pixel``. If the list is empty or None, the nested ``pixel`` + column will not be created. + column_map: dict[str, str] The keys are the columns to read, the values are the names to which they should be renamed. From 242aabe1e64205b51a3ea1e76ad02768a6e31e8c Mon Sep 17 00:00:00 2001 From: "David R. Reich" <43832476+SiQube@users.noreply.github.com> Date: Fri, 27 Sep 2024 23:32:24 +0200 Subject: [PATCH 16/31] update docstring of gazebasevr dataset definition (#828) --- src/pymovements/datasets/gazebasevr.py | 36 +++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/src/pymovements/datasets/gazebasevr.py b/src/pymovements/datasets/gazebasevr.py index bea0a4928..ef22e7c8e 100644 --- a/src/pymovements/datasets/gazebasevr.py +++ b/src/pymovements/datasets/gazebasevr.py @@ -56,15 +56,23 @@ class GazeBaseVR(DatasetDefinition): name: str The name of the dataset. 
- gaze_mirrors: dict[str, tuple[str, ...]] + has_files: dict[str, bool] + Indicate whether the dataset contains 'gaze', 'precomputed_events', and + 'precomputed_reading_measures'. + + mirrors: dict[str, tuple[str, ...]] A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'. - gaze_resources: dict[str, tuple[dict[str, str], ...]] - A tuple of dataset resources. Each list entry must be a dictionary with the following keys: + resources: dict[str, tuple[dict[str, str], ...]] + A tuple of dataset resources. Each list entry must be a dictionary with the following + keys: - `resource`: The url suffix of the resource. This will be concatenated with the mirror. - `filename`: The filename under which the file is saved as. - `md5`: The MD5 checksum of the respective file. + extract: dict[str, bool] + Decide whether to extract the data. + experiment: Experiment The experiment definition. @@ -76,12 +84,34 @@ class GazeBaseVR(DatasetDefinition): If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. + trial_columns: list[str] + The name of the trial columns in the input data frame. If the list is empty or None, + the input data frame is assumed to contain only one trial. If the list is not empty, + the input data frame is assumed to contain multiple trials and the transformation + methods will be applied to each trial separately. + + time_column: str + The name of the timestamp column in the input data frame. This column will be renamed to + ``time``. + + time_unit: str + The unit of the timestamps in the timestamp column in the input data frame. Supported + units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is + 'step' the experiment definition must be specified. All timestamps will be converted to + milliseconds. + + position_columns: list[str] + The name of the dva position columns in the input data frame. 
These columns will be + nested into the column ``position``. If the list is empty or None, the nested + ``position`` column will not be created. + column_map: dict[str, str] The keys are the columns to read, the values are the names to which they should be renamed. custom_read_kwargs: dict[str, dict[str, Any]] If specified, these keyword arguments will be passed to the file reading function. + Examples -------- Initialize your :py:class:`~pymovements.PublicDataset` object with the From 02f701ff3781892d36cc13c44f6b86b25ab05a32 Mon Sep 17 00:00:00 2001 From: "David R. Reich" <43832476+SiQube@users.noreply.github.com> Date: Fri, 27 Sep 2024 23:44:32 +0200 Subject: [PATCH 17/31] update docstring of gazebase dataset definition (#827) --- src/pymovements/datasets/gazebase.py | 32 +++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/src/pymovements/datasets/gazebase.py b/src/pymovements/datasets/gazebase.py index 865c9d583..a9835b74a 100644 --- a/src/pymovements/datasets/gazebase.py +++ b/src/pymovements/datasets/gazebase.py @@ -55,15 +55,23 @@ class GazeBase(DatasetDefinition): name: str The name of the dataset. + has_files: dict[str, bool] + Indicate whether the dataset contains 'gaze', 'precomputed_events', and + 'precomputed_reading_measures'. + mirrors: dict[str, tuple[str, ...]] A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'. resources: dict[str, tuple[dict[str, str], ...]] - A tuple of dataset resources. Each list entry must be a dictionary with the following keys: + A tuple of dataset resources. Each list entry must be a dictionary with the following + keys: - `resource`: The url suffix of the resource. This will be concatenated with the mirror. - `filename`: The filename under which the file is saved as. - `md5`: The MD5 checksum of the respective file. + extract: dict[str, bool] + Decide whether to extract the data. + experiment: Experiment The experiment definition. 
@@ -75,12 +83,34 @@ class GazeBase(DatasetDefinition): If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. + trial_columns: list[str] + The name of the trial columns in the input data frame. If the list is empty or None, + the input data frame is assumed to contain only one trial. If the list is not empty, + the input data frame is assumed to contain multiple trials and the transformation + methods will be applied to each trial separately. + + time_column: str + The name of the timestamp column in the input data frame. This column will be renamed to + ``time``. + + time_unit: str + The unit of the timestamps in the timestamp column in the input data frame. Supported + units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is + 'step' the experiment definition must be specified. All timestamps will be converted to + milliseconds. + + position_columns: list[str] + The name of the dva position columns in the input data frame. These columns will be + nested into the column ``position``. If the list is empty or None, the nested + ``position`` column will not be created. + column_map: dict[str, str] The keys are the columns to read, the values are the names to which they should be renamed. custom_read_kwargs: dict[str, dict[str, Any]] If specified, these keyword arguments will be passed to the file reading function. + Examples -------- Initialize your :py:class:`~pymovements.PublicDataset` object with the From fa3025dfb4df45f816d4f2b240b00fb35f0174e0 Mon Sep 17 00:00:00 2001 From: "David R. 
Reich" <43832476+SiQube@users.noreply.github.com> Date: Fri, 27 Sep 2024 23:59:40 +0200 Subject: [PATCH 18/31] update docstring of gazes_on_faces dataset definition (#826) --- src/pymovements/datasets/gaze_on_faces.py | 47 ++++++++++++++++++----- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/src/pymovements/datasets/gaze_on_faces.py b/src/pymovements/datasets/gaze_on_faces.py index 45722a866..131549b87 100644 --- a/src/pymovements/datasets/gaze_on_faces.py +++ b/src/pymovements/datasets/gaze_on_faces.py @@ -48,33 +48,62 @@ class GazeOnFaces(DatasetDefinition): Attributes ---------- - name : str + name: str The name of the dataset. - mirrors : dict[str, tuple[str, ...]] + has_files: dict[str, bool] + Indicate whether the dataset contains 'gaze', 'precomputed_events', and + 'precomputed_reading_measures'. + + mirrors: dict[str, tuple[str, ...]] A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'. - resources : dict[str, tuple[dict[str, str], ...]] - A tuple of dataset resources. Each list entry must be a dictionary with the following keys: + resources: dict[str, tuple[dict[str, str], ...]] + A tuple of dataset resources. Each list entry must be a dictionary with the following + keys: - `resource`: The url suffix of the resource. This will be concatenated with the mirror. - `filename`: The filename under which the file is saved as. - `md5`: The MD5 checksum of the respective file. - experiment : Experiment + extract: dict[str, bool] + Decide whether to extract the data. + + experiment: Experiment The experiment definition. - filename_format : dict[str, str] + filename_format: dict[str, str] Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. 
- filename_format_dtypes : dict[str, dict[str, type]] + filename_format_dtypes: dict[str, dict[str, type]] If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. - column_map : dict[str, str] + trial_columns: list[str] + The name of the trial columns in the input data frame. If the list is empty or None, + the input data frame is assumed to contain only one trial. If the list is not empty, + the input data frame is assumed to contain multiple trials and the transformation + methods will be applied to each trial separately. + + time_column: Any + The name of the timestamp column in the input data frame. This column will be renamed to + ``time``. + + time_unit: Any + The unit of the timestamps in the timestamp column in the input data frame. Supported + units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is + 'step' the experiment definition must be specified. All timestamps will be converted to + milliseconds. + + pixel_columns: list[str] + The name of the pixel position columns in the input data frame. These columns will be + nested into the column ``pixel``. If the list is empty or None, the nested ``pixel`` + column will not be created. + + column_map: dict[str, str] The keys are the columns to read, the values are the names to which they should be renamed. - custom_read_kwargs : dict[str, dict[str, Any]] + custom_read_kwargs: dict[str, dict[str, Any]] If specified, these keyword arguments will be passed to the file reading function. Examples From 6d22c5ae81b4295cc6bdd8d4b49576ee2cc41a93 Mon Sep 17 00:00:00 2001 From: "David R. 
Reich" <43832476+SiQube@users.noreply.github.com> Date: Sat, 28 Sep 2024 09:17:54 +0200 Subject: [PATCH 19/31] update docstring of gaze_graph dataset definition (#825) --- src/pymovements/datasets/gaze_graph.py | 36 +++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/src/pymovements/datasets/gaze_graph.py b/src/pymovements/datasets/gaze_graph.py index 995363f7c..e8b146da0 100644 --- a/src/pymovements/datasets/gaze_graph.py +++ b/src/pymovements/datasets/gaze_graph.py @@ -53,6 +53,10 @@ class GazeGraph(DatasetDefinition): name: str The name of the dataset. + has_files: dict[str, bool] + Indicate whether the dataset contains 'gaze', 'precomputed_events', and + 'precomputed_reading_measures'. + mirrors: dict[str, tuple[str, ...]] A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'. @@ -66,18 +70,42 @@ class GazeGraph(DatasetDefinition): experiment: Experiment The experiment definition. + extract: dict[str, bool] + Decide whether to extract the data. + filename_format: dict[str, str] Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. - filename_format_dtypes: dict[str, Any] + filename_format_dtypes: dict[str, dict[str, type]] If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. + trial_columns: list[str] + The name of the trial columns in the input data frame. If the list is empty or None, + the input data frame is assumed to contain only one trial. If the list is not empty, + the input data frame is assumed to contain multiple trials and the transformation + methods will be applied to each trial separately. + + time_column: Any + The name of the timestamp column in the input data frame. This column will be renamed to + ``time``. + + time_unit: Any + The unit of the timestamps in the timestamp column in the input data frame. 
Supported + units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is + 'step' the experiment definition must be specified. All timestamps will be converted to + milliseconds. + + pixel_columns: list[str] + The name of the pixel position columns in the input data frame. These columns will be + nested into the column ``pixel``. If the list is empty or None, the nested ``pixel`` + column will not be created. + column_map: dict[str, str] The keys are the columns to read, the values are the names to which they should be renamed. - custom_read_kwargs: dict[str, Any] + custom_read_kwargs: dict[str, dict[str, Any]] If specified, these keyword arguments will be passed to the file reading function. Examples @@ -117,8 +145,6 @@ class GazeGraph(DatasetDefinition): }, ) - extract: dict[str, bool] = field(default_factory=lambda: {'gaze': True}) - resources: dict[str, tuple[dict[str, str], ...]] = field( default_factory=lambda: { 'gaze': ( @@ -142,6 +168,8 @@ class GazeGraph(DatasetDefinition): sampling_rate=30, ) + extract: dict[str, bool] = field(default_factory=lambda: {'gaze': True}) + filename_format: dict[str, str] = field( default_factory=lambda: { 'gaze': r'P{subject_id}_{task}.csv', From d31b236e9e845ddb1fed32ad7af6c3001d6ab320 Mon Sep 17 00:00:00 2001 From: "David R. Reich" <43832476+SiQube@users.noreply.github.com> Date: Sat, 28 Sep 2024 11:47:19 +0200 Subject: [PATCH 20/31] update docstring of emtec dataset definition (#823) --- src/pymovements/datasets/emtec.py | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/src/pymovements/datasets/emtec.py b/src/pymovements/datasets/emtec.py index a97a8a129..69432149c 100644 --- a/src/pymovements/datasets/emtec.py +++ b/src/pymovements/datasets/emtec.py @@ -47,6 +47,10 @@ class EMTeC(DatasetDefinition): name: str The name of the dataset. 
+ has_files: dict[str, bool] + Indicate whether the dataset contains 'gaze', 'precomputed_events', and + 'precomputed_reading_measures'. + mirrors: dict[str, tuple[str, ...]] A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'. @@ -57,6 +61,9 @@ class EMTeC(DatasetDefinition): - `filename`: The filename under which the file is saved as. - `md5`: The MD5 checksum of the respective file. + extract: dict[str, bool] + Decide whether to extract the data. + experiment: Experiment The experiment definition. @@ -68,10 +75,28 @@ class EMTeC(DatasetDefinition): If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. - column_map: dict[str, str] - The keys are the columns to read, the values are the names to which they should be renamed. + trial_columns: list[str] + The name of the trial columns in the input data frame. If the list is empty or None, + the input data frame is assumed to contain only one trial. If the list is not empty, + the input data frame is assumed to contain multiple trials and the transformation + methods will be applied to each trial separately. + + time_column: str + The name of the timestamp column in the input data frame. This column will be renamed to + ``time``. + + time_unit: str + The unit of the timestamps in the timestamp column in the input data frame. Supported + units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is + 'step' the experiment definition must be specified. All timestamps will be converted to + milliseconds. + + pixel_columns: list[str] + The name of the pixel position columns in the input data frame. These columns will be + nested into the column ``pixel``. If the list is empty or None, the nested ``pixel`` + column will not be created. 
- custom_read_kwargs: dict[str, dict[str, dict[str, Any]]] + custom_read_kwargs: dict[str, dict[str, Any]] If specified, these keyword arguments will be passed to the file reading function. Examples From eba70205af5a79246d9ca05d9ac41cc2c486de3a Mon Sep 17 00:00:00 2001 From: "David R. Reich" <43832476+SiQube@users.noreply.github.com> Date: Sat, 28 Sep 2024 16:54:05 +0200 Subject: [PATCH 21/31] update docstring of didec dataset definition (#822) --- .../dataset/.dataset_definition.py.swp | Bin 0 -> 16384 bytes src/pymovements/datasets/didec.py | 32 ++++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 src/pymovements/dataset/.dataset_definition.py.swp diff --git a/src/pymovements/dataset/.dataset_definition.py.swp b/src/pymovements/dataset/.dataset_definition.py.swp new file mode 100644 index 0000000000000000000000000000000000000000..2795a03c563f2cbd1a91215d4de1074262011281 GIT binary patch literal 16384 zcmeHO&5t8T74KvdLIOz;IUynPGD|eOlE<^zB?l};!87hzx8kvl?b*pFtn6}kdE899 zyXoql87CwlBse5oxbh((B*cM3NW>pNB*fPZ2^MkT%rTdUBm7=CKA61sqt6=OdD<|P`?X{&Ipf}!y(IGmJk2e*%6F`~vWRBj78* zF7P7oB=G*DhVf(I6zBr)f7~$M1HKDf0)G1l^Z}Z{XMlfv3^o8i16~96fe#)wjGq8U zz!SjR4;jYmz~_M9;U&gvz|+7V@j~Q#z!=y9{sw;D2AHQenXCW&MY!U~72ePeykzF9 z8hNgYUr}*%lfF*vz?TM>;p-AUg%F7s*J$EO&&eN$6Jf_eG=ITy?fCL#Lzl&$!BojJ2Doorzr4g8l13ZiDGkH*7-2uZ%9&g z*z6P6^H|SNC?y>(Tnp#5N#fhJ(b6|3)g*?uPaE*J|~r6p#-UMPJ>v&SB=qN^?riqkJ{bgnAQhl(ncwhL!VeFePVX3&aCO!=ZA z<_i20^RK_(VVBfO+lh*U*-(>tbb@|_+ZP^G{Hpu&n}suF+SsI>kPf3D3AL{Zba)iI zc7iU_*2MErR_}Mbz}_>0dDw)JRPZozeQnl=3FJaU_~%T-BOXKGdnS(Hvu;dbpp-j? 
zLx2Zcs90OOQfcV0dUvsyvzAO2VJKnKa`Ub~2}at=Nd!&q+svLk)I7;kHctC&I@a^S z1H0@gM$cHnNLUihxf5ZK7YK~0%#Etl=1`Dz0@uBht;^kdCNMPMSdK)Lkq{WMbPS;g zflN2R9+^5~mbxrw90XqB6mTh24Q1qVD1gH+z7|{Re-5K*U@VH($SkV;dy#g7Mlkr%M{>0#)b;A$echfo}_hF2F%ztRp_n+$aj7Ld!bENeHJ# zyvSok-k_?IDKJs+q($g}XihT>ZVtji4{5|ify9OEN#J?G340XJO*QQ_3t`LMtebqL zhz0CE6{=RNOT*(nrc@LoLFf;m!)(^$2a=k)l zq{?F$p^yrFHOEt-hU{Bs;?C_uiG$m1q!cu-SJ%^V#N}!{*K6S$i%UP&V=QULuud8J zd2Ma2u^5*;*7XOS!gF6`FX1WL_Aq1DMsF-n(v5Ul!i21IDSIv?)r>DO89Em$vq==p zXjuUBQY{8Nxw9~c;-z7q)j{vFKW5pOb5cXqt<{T!f$Z`X>ME;CGK2X1!&#ulKn!5+JW-)wiscH8`lye$&*2kXOgQR4&>= zD}x>$D>Ufz(0V3p-|Cwcs`souB+^c=vs;0@`re?~>GiqPcBgyLv#vJ>)akw)eXJW#(`Z?!Trt=(?RG8+{X*lz7LSmP})+3pOW(XKTx z8!%4u1)_WP_P}Coc4*h^Z8!0llLW<>)VxQ?>QlGZxnVURV66^b`)eQ)eZl>di8^3G zZ0t)`8yc7|cYADDr)OM*I6DYkI2IOdDjBO-_%SvCSU+GL1MkR(U_Y3|SRYAt0iNl^ z7~f5E-Ff65PGf>Qxi*p!kn}*fc1Ma{wn4wPSJDyB3>ilIQlzmMD0TVG=AoK>UhIdwIWy#$w;v|NI98ec(TO2 zESAcUgB&dAE83osTvg_s8(@Oif9&fIiVYW*~`teT!1c*e~-LDP&l71*wxFP*^F`wu;! z)73LwVhh)D4c0DWfe2=$VD-91=Rhi!J#sw9Di&SO3+zdR%@3Nir&tfeG}IIRJY$f8 zF$a6rT4<_t_iZoy;F5iru3ou%Z3BP4pgm@DVA4_y51bUj6gfzw`b- zKfnJA_WVBtegGT*F9R2V2Z497_kRb70R=RHXMu-+ze3K>f!BeT0G4|ZD6f)%l7W(e zl7W(el7W(el7W(el7W(e{{;pJ7w>e zmw9tqTybK!{F)Sd^P4Dkk2B5mE#iG08l6XA{;EAY%sI!o%tw&=Q7paNp~r_b&n{cM N?!3eK2I}j6`xlPYB{cv5 literal 0 HcmV?d00001 diff --git a/src/pymovements/datasets/didec.py b/src/pymovements/datasets/didec.py index b9ead4c65..c27faddc6 100644 --- a/src/pymovements/datasets/didec.py +++ b/src/pymovements/datasets/didec.py @@ -47,6 +47,10 @@ class DIDEC(DatasetDefinition): name: str The name of the dataset. + has_files: dict[str, bool] + Indicate whether the dataset contains 'gaze', 'precomputed_events', and + 'precomputed_reading_measures'. + mirrors: dict[str, tuple[str, ...]] A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'. @@ -59,6 +63,9 @@ class DIDEC(DatasetDefinition): experiment: Experiment The experiment definition. 
+ extract: dict[str, bool] + Decide whether to extract the data. + filename_format: dict[str, str] Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. @@ -67,6 +74,27 @@ class DIDEC(DatasetDefinition): If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. + trial_columns: list[str] + The name of the trial columns in the input data frame. If the list is empty or None, + the input data frame is assumed to contain only one trial. If the list is not empty, + the input data frame is assumed to contain multiple trials and the transformation + methods will be applied to each trial separately. + + time_column: str + The name of the timestamp column in the input data frame. This column will be renamed to + ``time``. + + time_unit: str + The unit of the timestamps in the timestamp column in the input data frame. Supported + units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is + 'step' the experiment definition must be specified. All timestamps will be converted to + milliseconds. + + pixel_columns: list[str] + The name of the pixel position columns in the input data frame. These columns will be + nested into the column ``pixel``. If the list is empty or None, the nested ``pixel`` + column will not be created. + column_map: dict[str, str] The keys are the columns to read, the values are the names to which they should be renamed. 
@@ -122,8 +150,6 @@ class DIDEC(DatasetDefinition): }, ) - extract: dict[str, bool] = field(default_factory=lambda: {'gaze': True}) - experiment: Experiment = Experiment( screen_width_px=1680, screen_height_px=1050, @@ -134,6 +160,8 @@ class DIDEC(DatasetDefinition): sampling_rate=1000, ) + extract: dict[str, bool] = field(default_factory=lambda: {'gaze': True}) + filename_format: dict[str, str] = field( default_factory=lambda: { 'gaze': From cc67adb1ee992455781855181bf4ed87ac27288e Mon Sep 17 00:00:00 2001 From: "David R. Reich" <43832476+SiQube@users.noreply.github.com> Date: Sat, 28 Sep 2024 18:25:17 +0200 Subject: [PATCH 22/31] update transforms tests to account for adding single line (#815) --- tests/unit/gaze/transforms/deg2pix_test.py | 7 ++++--- tests/unit/gaze/transforms/pix2deg_test.py | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/unit/gaze/transforms/deg2pix_test.py b/tests/unit/gaze/transforms/deg2pix_test.py index 465fcd214..aed92830f 100644 --- a/tests/unit/gaze/transforms/deg2pix_test.py +++ b/tests/unit/gaze/transforms/deg2pix_test.py @@ -508,9 +508,10 @@ def test_deg2pix_returns(kwargs, series, expected_df, distance_as_column): # unit of distance values has to be in mm when passing as column distance_value = kwargs['distance'] * 10 - df = df.with_columns( - pl.Series('distance', [distance_value], pl.Float64), - ) + try: + df = df.with_columns(pl.Series('distance', [distance_value], pl.Float64)) + except pl.exceptions.InvalidOperationError: + df = df.with_columns(distance=distance_value) kwargs['distance'] = 'distance' diff --git a/tests/unit/gaze/transforms/pix2deg_test.py b/tests/unit/gaze/transforms/pix2deg_test.py index 7466a6568..afff2eeed 100644 --- a/tests/unit/gaze/transforms/pix2deg_test.py +++ b/tests/unit/gaze/transforms/pix2deg_test.py @@ -518,9 +518,10 @@ def test_pix2deg_returns(kwargs, series, expected_df, distance_as_column): # unit of distance values has to be in mm when passing as column 
distance_value = kwargs['distance'] * 10 - df = df.with_columns( - pl.Series('distance', [distance_value], pl.Float64), - ) + try: + df = df.with_columns(pl.Series('distance', [distance_value], pl.Float64)) + except pl.exceptions.InvalidOperationError: + df = df.with_columns(distance=distance_value) kwargs['distance'] = 'distance' From 4792904ac737d03c32cdca9d4aa96077d60317ea Mon Sep 17 00:00:00 2001 From: "David R. Reich" <43832476+SiQube@users.noreply.github.com> Date: Sat, 28 Sep 2024 18:37:53 +0200 Subject: [PATCH 23/31] update binocular example, space raises converting problems for polars1+ (#814) --- tests/files/binocular_example.csv | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/files/binocular_example.csv b/tests/files/binocular_example.csv index a1f3c95f6..feaec595a 100644 --- a/tests/files/binocular_example.csv +++ b/tests/files/binocular_example.csv @@ -1,11 +1,11 @@ time,x_left_pix,y_left_pix,x_right_pix,y_right_pix,x_left_pos,y_left_pos,x_right_pos,y_right_pos -0,0,0,0,0,-23.104783, -13.489493,-23.104783, -13.489493 -1,0,0,0,0,-23.104783, -13.489493,-23.104783, -13.489493 -2,0,0,0,0,-23.104783, -13.489493,-23.104783, -13.489493 -3,0,0,0,0,-23.104783, -13.489493,-23.104783, -13.489493 -4,0,0,0,0,-23.104783, -13.489493,-23.104783, -13.489493 -5,0,0,0,0,-23.104783, -13.489493,-23.104783, -13.489493 -6,0,0,0,0,-23.104783, -13.489493,-23.104783, -13.489493 -7,0,0,0,0,-23.104783, -13.489493,-23.104783, -13.489493 -8,0,0,0,0,-23.104783, -13.489493,-23.104783, -13.489493 -9,0,0,0,0,-23.104783, -13.489493,-23.104783, -13.489493 +0,0,0,0,0,-23.104783,-13.489493,-23.104783,-13.489493 +1,0,0,0,0,-23.104783,-13.489493,-23.104783,-13.489493 +2,0,0,0,0,-23.104783,-13.489493,-23.104783,-13.489493 +3,0,0,0,0,-23.104783,-13.489493,-23.104783,-13.489493 +4,0,0,0,0,-23.104783,-13.489493,-23.104783,-13.489493 +5,0,0,0,0,-23.104783,-13.489493,-23.104783,-13.489493 +6,0,0,0,0,-23.104783,-13.489493,-23.104783,-13.489493 
+7,0,0,0,0,-23.104783,-13.489493,-23.104783,-13.489493 +8,0,0,0,0,-23.104783,-13.489493,-23.104783,-13.489493 +9,0,0,0,0,-23.104783,-13.489493,-23.104783,-13.489493 From ba0773848ba871ab19a04b3353989555f16c5733 Mon Sep 17 00:00:00 2001 From: "David R. Reich" <43832476+SiQube@users.noreply.github.com> Date: Sat, 28 Sep 2024 18:53:34 +0200 Subject: [PATCH 24/31] missed ColumnNotFoundError (#816) --- src/pymovements/gaze/gaze_dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pymovements/gaze/gaze_dataframe.py b/src/pymovements/gaze/gaze_dataframe.py index 5a74b68d4..aec927a58 100644 --- a/src/pymovements/gaze/gaze_dataframe.py +++ b/src/pymovements/gaze/gaze_dataframe.py @@ -703,7 +703,7 @@ def detect( if trial_column not in self.events.frame.columns ] if missing_trial_columns: - raise pl.ColumnNotFoundError( + raise pl.exceptions.ColumnNotFoundError( f'trial columns {missing_trial_columns} missing from events, ' f'available columns: {self.events.frame.columns}', ) From 81b8518335f5b815ffe76b800ff29e0f41976326 Mon Sep 17 00:00:00 2001 From: "David R. 
Reich" <43832476+SiQube@users.noreply.github.com> Date: Sat, 28 Sep 2024 19:28:09 +0200 Subject: [PATCH 25/31] upgrade to polars 1+ (#809) * dtypes now called schema_overrides * preview window updated and arctan2d=>arctan2 * polars improved inference * upgrade to polars 1+ --- docs/source/tutorials/local-dataset.ipynb | 4 +- pyproject.toml | 2 +- src/pymovements/dataset/dataset_definition.py | 8 ++-- src/pymovements/dataset/dataset_files.py | 31 +++++++------- src/pymovements/datasets/copco.py | 6 ++- src/pymovements/datasets/didec.py | 4 +- src/pymovements/datasets/emtec.py | 6 +-- src/pymovements/datasets/fakenews.py | 4 +- src/pymovements/datasets/gaze_graph.py | 6 +-- src/pymovements/datasets/gaze_on_faces.py | 6 +-- src/pymovements/datasets/gazebase.py | 6 +-- src/pymovements/datasets/gazebasevr.py | 6 +-- src/pymovements/datasets/hbn.py | 6 +-- src/pymovements/datasets/judo1000.py | 6 +-- src/pymovements/datasets/potec.py | 6 +-- src/pymovements/datasets/sb_sat.py | 6 +-- src/pymovements/datasets/toy_dataset.py | 6 +-- .../datasets/toy_dataset_eyelink.py | 4 +- src/pymovements/gaze/gaze_dataframe.py | 2 +- src/pymovements/gaze/integration.py | 6 +++ src/pymovements/gaze/io.py | 42 +++++++++++-------- src/pymovements/gaze/transforms.py | 4 +- src/pymovements/utils/parsing.py | 5 +-- tests/functional/dataset_processing_test.py | 28 ++++++------- tests/unit/dataset/dataset_download_test.py | 2 +- tests/unit/dataset/dataset_files_test.py | 2 +- tests/unit/dataset/dataset_test.py | 15 ++++--- tests/unit/datasets/datasets_test.py | 6 +-- tests/unit/events/frame_test.py | 2 +- tests/unit/gaze/io/csv_test.py | 10 ++--- 30 files changed, 132 insertions(+), 115 deletions(-) diff --git a/docs/source/tutorials/local-dataset.ipynb b/docs/source/tutorials/local-dataset.ipynb index dbbed32d3..f48f133ac 100644 --- a/docs/source/tutorials/local-dataset.ipynb +++ b/docs/source/tutorials/local-dataset.ipynb @@ -142,7 +142,7 @@ "metadata": {}, "outputs": [], "source": [ - 
"filename_format_dtypes = {'gaze': {\n", + "filename_format_schema_overrides = {'gaze': {\n", " 'text_id': int,\n", " 'page_id': int,\n", "},\n", @@ -254,7 +254,7 @@ " has_files={'gaze': True, 'precomputed_events': False, 'precomputed_reading_measures': False},\n", " experiment=experiment,\n", " filename_format=filename_format,\n", - " filename_format_dtypes=filename_format_dtypes,\n", + " filename_format_schema_overrides=filename_format_schema_overrides,\n", " custom_read_kwargs=custom_read_kwargs,\n", " time_column=time_column,\n", " time_unit=time_unit,\n", diff --git a/pyproject.toml b/pyproject.toml index ab4511458..075d01c49 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ dependencies = [ "matplotlib>=3.8.0,<3.10", "numpy>=1.22.4,<3", "pandas>=2.1.4,<3", - "polars>=0.20.1,<0.20.3", + "polars>=1.8.2,<2", "pyarrow>=11.0.0,<18", "pyopenssl>=16.0.0,<25.0.0", "scipy>=1.8.0,<2", diff --git a/src/pymovements/dataset/dataset_definition.py b/src/pymovements/dataset/dataset_definition.py index eaff6e13b..e8737b933 100644 --- a/src/pymovements/dataset/dataset_definition.py +++ b/src/pymovements/dataset/dataset_definition.py @@ -54,7 +54,7 @@ class DatasetDefinition: filename_format: dict[str, str] Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. (default: field(default_factory=dict)) - filename_format_dtypes: dict[str, dict[str, type]] + filename_format_schema_overrides: dict[str, dict[str, type]] If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. (default: field(default_factory=dict)) custom_read_kwargs: dict[str, dict[str, Any]] @@ -122,10 +122,10 @@ class DatasetDefinition: 3. Specifying column datatypes ``polars.read_csv`` infers data types from a fixed number of rows, which might not be accurate for the entire dataset. 
To ensure correct data types, you can pass a dictionary to the - ``dtypes`` keyword argument in ``gaze_custom_read_kwargs``. + ``schema_overrides`` keyword argument in ``gaze_custom_read_kwargs``. Use data types from the `polars` library. For instance: - ``gaze_custom_read_kwargs={'dtypes': {'col1': polars.Int64, 'col2': polars.Float64}}`` + ``gaze_custom_read_kwargs={'schema_overrides': {'col1': polars.Int64, 'col2': polars.Float64}}`` """ # pylint: disable=too-many-instance-attributes @@ -141,7 +141,7 @@ class DatasetDefinition: filename_format: dict[str, str] = field(default_factory=dict) - filename_format_dtypes: dict[str, dict[str, type]] = field(default_factory=dict) + filename_format_schema_overrides: dict[str, dict[str, type]] = field(default_factory=dict) custom_read_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) diff --git a/src/pymovements/dataset/dataset_files.py b/src/pymovements/dataset/dataset_files.py index 1e5be0008..dfc4c40eb 100644 --- a/src/pymovements/dataset/dataset_files.py +++ b/src/pymovements/dataset/dataset_files.py @@ -74,8 +74,8 @@ def scan_dataset(definition: DatasetDefinition, paths: DatasetPaths) -> pl.DataF fileinfo_df = pl.from_dicts(data=fileinfo_dicts, infer_schema_length=1) fileinfo_df = fileinfo_df.sort(by='filepath') - if definition.filename_format_dtypes['gaze']: - items = definition.filename_format_dtypes['gaze'].items() + if definition.filename_format_schema_overrides['gaze']: + items = definition.filename_format_schema_overrides['gaze'].items() fileinfo_df = fileinfo_df.with_columns([ pl.col(fileinfo_key).cast(fileinfo_dtype) for fileinfo_key, fileinfo_dtype in items @@ -92,31 +92,33 @@ def scan_dataset(definition: DatasetDefinition, paths: DatasetPaths) -> pl.DataF raise RuntimeError(f'no matching files found in {paths.precomputed_events}') fileinfo_df = pl.from_dicts(data=fileinfo_dicts, infer_schema_length=1) fileinfo_df = fileinfo_df.sort(by='filepath') - if 
definition.filename_format_dtypes['precomputed_events']: - items = definition.filename_format_dtypes['precomputed_events'].items() + if definition.filename_format_schema_overrides['precomputed_events']: + items = definition.filename_format_schema_overrides['precomputed_events'].items() fileinfo_df = fileinfo_df.with_columns([ pl.col(fileinfo_key).cast(fileinfo_dtype) for fileinfo_key, fileinfo_dtype in items ]) _fileinfo_dicts['precomputed_events'] = fileinfo_df - if definition.has_files['precomputed_reading_measures']: + pc_rm = 'precomputed_reading_measures' + if definition.has_files[pc_rm]: fileinfo_dicts = match_filepaths( path=paths.precomputed_reading_measures, - regex=curly_to_regex(definition.filename_format['precomputed_reading_measures']), + regex=curly_to_regex(definition.filename_format[pc_rm]), relative=True, ) if not fileinfo_dicts: raise RuntimeError(f'no matching files found in {paths.precomputed_reading_measures}') fileinfo_df = pl.from_dicts(data=fileinfo_dicts, infer_schema_length=1) fileinfo_df = fileinfo_df.sort(by='filepath') - if definition.filename_format_dtypes['precomputed_reading_measures']: - items = definition.filename_format_dtypes['precomputed_reading_measures'].items() + if definition.filename_format_schema_overrides[pc_rm]: + _schema_overrides = definition.filename_format_schema_overrides[pc_rm] + items = _schema_overrides.items() fileinfo_df = fileinfo_df.with_columns([ pl.col(fileinfo_key).cast(fileinfo_dtype) for fileinfo_key, fileinfo_dtype in items ]) - _fileinfo_dicts['precomputed_reading_measures'] = fileinfo_df + _fileinfo_dicts[pc_rm] = fileinfo_df return _fileinfo_dicts @@ -316,7 +318,7 @@ def load_gaze_file( trial_columns=definition.trial_columns, time_unit=time_unit, add_columns=add_columns, - column_dtypes=definition.filename_format_dtypes['gaze'], + column_schema_overrides=definition.filename_format_schema_overrides['gaze'], ) # suffixes as ordered after using GazeDataFrame.unnest() @@ -364,7 +366,7 @@ def 
load_gaze_file( trial_columns=definition.trial_columns, column_map=definition.column_map, add_columns=add_columns, - column_dtypes=definition.filename_format_dtypes['gaze'], + column_schema_overrides=definition.filename_format_schema_overrides['gaze'], **custom_read_kwargs, ) elif filepath.suffix == '.feather': @@ -372,14 +374,14 @@ def load_gaze_file( filepath, experiment=definition.experiment, add_columns=add_columns, - column_dtypes=definition.filename_format_dtypes['gaze'], + column_schema_overrides=definition.filename_format_schema_overrides['gaze'], ) elif filepath.suffix == '.asc': gaze_df = from_asc( filepath, experiment=definition.experiment, add_columns=add_columns, - column_dtypes=definition.filename_format_dtypes['gaze'], + column_schema_overrides=definition.filename_format_schema_overrides['gaze'], **custom_read_kwargs, ) else: @@ -556,9 +558,10 @@ def add_fileinfo( ) # Cast columns from fileinfo according to specification. + _schema_overrides = definition.filename_format_schema_overrides['gaze'] df = df.with_columns([ pl.col(fileinfo_key).cast(fileinfo_dtype) - for fileinfo_key, fileinfo_dtype in definition.filename_format_dtypes['gaze'].items() + for fileinfo_key, fileinfo_dtype in _schema_overrides.items() ]) return df diff --git a/src/pymovements/datasets/copco.py b/src/pymovements/datasets/copco.py index b6603c644..52e3400aa 100644 --- a/src/pymovements/datasets/copco.py +++ b/src/pymovements/datasets/copco.py @@ -74,14 +74,16 @@ class CopCo(DatasetDefinition): Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. - filename_format_dtypes: dict[str, dict[str, type]] + filename_format_schema_overrides: dict[str, dict[str, type]] If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. + trial_columns: list[str] The name of the trial columns in the input data frame. 
If the list is empty or None, the input data frame is assumed to contain only one trial. If the list is not empty, the input data frame is assumed to contain multiple trials and the transformation methods will be applied to each trial separately. + time_column: str The name of the timestamp column in the input data frame. This column will be renamed to ``time``. @@ -184,7 +186,7 @@ class CopCo(DatasetDefinition): }, ) - filename_format_dtypes: dict[str, dict[str, type]] = field( + filename_format_schema_overrides: dict[str, dict[str, type]] = field( default_factory=lambda: { 'precomputed_events': {}, 'precomputed_reading_measures': {}, diff --git a/src/pymovements/datasets/didec.py b/src/pymovements/datasets/didec.py index c27faddc6..496e3e7e6 100644 --- a/src/pymovements/datasets/didec.py +++ b/src/pymovements/datasets/didec.py @@ -70,7 +70,7 @@ class DIDEC(DatasetDefinition): Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. - filename_format_dtypes: dict[str, dict[str, type]] + filename_format_schema_overrides: dict[str, dict[str, type]] If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. @@ -174,7 +174,7 @@ class DIDEC(DatasetDefinition): }, ) - filename_format_dtypes: dict[str, dict[str, type]] = field( + filename_format_schema_overrides: dict[str, dict[str, type]] = field( default_factory=lambda: { 'gaze': { 'experiment': int, diff --git a/src/pymovements/datasets/emtec.py b/src/pymovements/datasets/emtec.py index 69432149c..1efe552a1 100644 --- a/src/pymovements/datasets/emtec.py +++ b/src/pymovements/datasets/emtec.py @@ -71,7 +71,7 @@ class EMTeC(DatasetDefinition): Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. 
- filename_format_dtypes: dict[str, dict[str, type]] + filename_format_schema_overrides: dict[str, dict[str, type]] If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. @@ -183,7 +183,7 @@ class EMTeC(DatasetDefinition): }, ) - filename_format_dtypes: dict[str, dict[str, type]] = field( + filename_format_schema_overrides: dict[str, dict[str, type]] = field( default_factory=lambda: { 'gaze': {'subject_id': int}, @@ -219,7 +219,7 @@ class EMTeC(DatasetDefinition): 'y', 'pupil_right', ], - 'dtypes': { + 'schema_overrides': { 'item_id': pl.Utf8, 'TRIAL_ID': pl.Int64, 'Trial_Index_': pl.Int64, diff --git a/src/pymovements/datasets/fakenews.py b/src/pymovements/datasets/fakenews.py index f964fd8f2..64c2cf9f9 100644 --- a/src/pymovements/datasets/fakenews.py +++ b/src/pymovements/datasets/fakenews.py @@ -64,7 +64,7 @@ class FakeNewsPerception(DatasetDefinition): filename_format: dict[str, str] Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. - filename_format_dtypes: dict[str, dict[str, type]] + filename_format_schema_overrides: dict[str, dict[str, type]] If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. 
trial_columns: list[str] @@ -136,7 +136,7 @@ class FakeNewsPerception(DatasetDefinition): }, ) - filename_format_dtypes: dict[str, dict[str, type]] = field( + filename_format_schema_overrides: dict[str, dict[str, type]] = field( default_factory=lambda: { 'precomputed_events': {'subject_id': int, 'session_id': int, 'truth_value': str}, }, diff --git a/src/pymovements/datasets/gaze_graph.py b/src/pymovements/datasets/gaze_graph.py index e8b146da0..530fc046d 100644 --- a/src/pymovements/datasets/gaze_graph.py +++ b/src/pymovements/datasets/gaze_graph.py @@ -77,7 +77,7 @@ class GazeGraph(DatasetDefinition): Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. - filename_format_dtypes: dict[str, dict[str, type]] + filename_format_schema_overrides: dict[str, dict[str, Any]] If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. @@ -176,7 +176,7 @@ class GazeGraph(DatasetDefinition): }, ) - filename_format_dtypes: dict[str, dict[str, type]] = field( + filename_format_schema_overrides: dict[str, dict[str, type]] = field( default_factory=lambda: { 'gaze': { 'subject_id': int, @@ -201,7 +201,7 @@ class GazeGraph(DatasetDefinition): 'separator': ',', 'has_header': False, 'new_columns': ['x', 'y'], - 'dtypes': [pl.Float32, pl.Float32], + 'schema_overrides': [pl.Float32, pl.Float32], }, }, ) diff --git a/src/pymovements/datasets/gaze_on_faces.py b/src/pymovements/datasets/gaze_on_faces.py index 131549b87..0195bb0f0 100644 --- a/src/pymovements/datasets/gaze_on_faces.py +++ b/src/pymovements/datasets/gaze_on_faces.py @@ -75,7 +75,7 @@ class GazeOnFaces(DatasetDefinition): Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. 
- filename_format_dtypes: dict[str, dict[str, type]] + filename_format_schema_overrides : dict[str, dict[str, type]] If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. @@ -175,7 +175,7 @@ class GazeOnFaces(DatasetDefinition): }, ) - filename_format_dtypes: dict[str, dict[str, type]] = field( + filename_format_schema_overrides: dict[str, dict[str, type]] = field( default_factory=lambda: { 'gaze': { 'sub_id': int, @@ -200,7 +200,7 @@ class GazeOnFaces(DatasetDefinition): 'separator': ',', 'has_header': False, 'new_columns': ['x', 'y'], - 'dtypes': [pl.Float32, pl.Float32], + 'schema_overrides': [pl.Float32, pl.Float32], }, }, ) diff --git a/src/pymovements/datasets/gazebase.py b/src/pymovements/datasets/gazebase.py index a9835b74a..508327996 100644 --- a/src/pymovements/datasets/gazebase.py +++ b/src/pymovements/datasets/gazebase.py @@ -79,7 +79,7 @@ class GazeBase(DatasetDefinition): Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. - filename_format_dtypes: dict[str, dict[str, type]] + filename_format_schema_overrides: dict[str, dict[str, type]] If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. 
@@ -183,7 +183,7 @@ class GazeBase(DatasetDefinition): }, ) - filename_format_dtypes: dict[str, dict[str, type]] = field( + filename_format_schema_overrides: dict[str, dict[str, type]] = field( default_factory=lambda: { 'gaze': {'round_id': int, 'subject_id': int, 'session_id': int}, }, @@ -211,7 +211,7 @@ class GazeBase(DatasetDefinition): default_factory=lambda: { 'gaze': { 'null_values': 'NaN', - 'dtypes': { + 'schema_overrides': { 'n': pl.Int64, 'x': pl.Float32, 'y': pl.Float32, diff --git a/src/pymovements/datasets/gazebasevr.py b/src/pymovements/datasets/gazebasevr.py index ef22e7c8e..94737fe04 100644 --- a/src/pymovements/datasets/gazebasevr.py +++ b/src/pymovements/datasets/gazebasevr.py @@ -80,7 +80,7 @@ class GazeBaseVR(DatasetDefinition): Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. - filename_format_dtypes: dict[str, dict[str, type]] + filename_format_schema_overrides: dict[str, dict[str, type]] If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. @@ -184,7 +184,7 @@ class GazeBaseVR(DatasetDefinition): }, ) - filename_format_dtypes: dict[str, dict[str, type]] = field( + filename_format_schema_overrides: dict[str, dict[str, type]] = field( default_factory=lambda: { 'gaze': { 'round_id': int, @@ -214,7 +214,7 @@ class GazeBaseVR(DatasetDefinition): custom_read_kwargs: dict[str, dict[str, Any]] = field( default_factory=lambda: { 'gaze': { - 'dtypes': { + 'schema_overrides': { 'n': pl.Float32, 'x': pl.Float32, 'y': pl.Float32, diff --git a/src/pymovements/datasets/hbn.py b/src/pymovements/datasets/hbn.py index 0ef7b0922..2841cd8ea 100644 --- a/src/pymovements/datasets/hbn.py +++ b/src/pymovements/datasets/hbn.py @@ -74,7 +74,7 @@ class HBN(DatasetDefinition): Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. 
- filename_format_dtypes: dict[str, dict[str, type]] + filename_format_schema_overrides: dict[str, dict[str, type]] If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. @@ -173,7 +173,7 @@ class HBN(DatasetDefinition): }, ) - filename_format_dtypes: dict[str, dict[str, type]] = field( + filename_format_schema_overrides: dict[str, dict[str, type]] = field( default_factory=lambda: { 'gaze': { 'subject_id': str, @@ -197,7 +197,7 @@ class HBN(DatasetDefinition): 'gaze': { 'separator': ',', 'columns': ['time', 'x_pix', 'y_pix'], - 'dtypes': { + 'schema_overrides': { 'time': pl.Int64, 'x_pix': pl.Float32, 'y_pix': pl.Float32, diff --git a/src/pymovements/datasets/judo1000.py b/src/pymovements/datasets/judo1000.py index dfbf1f663..e953bbdb4 100644 --- a/src/pymovements/datasets/judo1000.py +++ b/src/pymovements/datasets/judo1000.py @@ -73,7 +73,7 @@ class JuDo1000(DatasetDefinition): Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. - filename_format_dtypes: dict[str, dict[str, type]] + filename_format_schema_overrides: dict[str, dict[str, type]] If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. 
@@ -172,7 +172,7 @@ class JuDo1000(DatasetDefinition): }, ) - filename_format_dtypes: dict[str, dict[str, type]] = field( + filename_format_schema_overrides: dict[str, dict[str, type]] = field( default_factory=lambda: { 'gaze': { 'subject_id': int, @@ -205,7 +205,7 @@ class JuDo1000(DatasetDefinition): custom_read_kwargs: dict[str, dict[str, Any]] = field( default_factory=lambda: { 'gaze': { - 'dtypes': { + 'schema_overrides': { 'trialId': pl.Int64, 'pointId': pl.Int64, 'time': pl.Int64, diff --git a/src/pymovements/datasets/potec.py b/src/pymovements/datasets/potec.py index d26cb3160..aad21c2b8 100644 --- a/src/pymovements/datasets/potec.py +++ b/src/pymovements/datasets/potec.py @@ -83,7 +83,7 @@ class PoTeC(DatasetDefinition): Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. - filename_format_dtypes: dict[str, dict[str, type]] + filename_format_schema_overrides: dict[str, dict[str, type]] If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. @@ -178,7 +178,7 @@ class PoTeC(DatasetDefinition): }, ) - filename_format_dtypes: dict[str, dict[str, type]] = field( + filename_format_schema_overrides: dict[str, dict[str, type]] = field( default_factory=lambda: { 'gaze': { 'subject_id': int, @@ -204,7 +204,7 @@ class PoTeC(DatasetDefinition): custom_read_kwargs: dict[str, dict[str, Any]] = field( default_factory=lambda: { 'gaze': { - 'dtypes': { + 'schema_overrides': { 'time': pl.Int64, 'x': pl.Float32, 'y': pl.Float32, diff --git a/src/pymovements/datasets/sb_sat.py b/src/pymovements/datasets/sb_sat.py index 1974bd77b..39095853f 100644 --- a/src/pymovements/datasets/sb_sat.py +++ b/src/pymovements/datasets/sb_sat.py @@ -73,7 +73,7 @@ class SBSAT(DatasetDefinition): Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. 
- filename_format_dtypes: dict[str, dict[str, type]] + filename_format_schema_overrides: dict[str, dict[str, type]] If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. @@ -188,7 +188,7 @@ class SBSAT(DatasetDefinition): }, ) - filename_format_dtypes: dict[str, dict[str, type]] = field( + filename_format_schema_overrides: dict[str, dict[str, type]] = field( default_factory=lambda: { 'gaze': {'subject_id': int}, @@ -218,7 +218,7 @@ class SBSAT(DatasetDefinition): 'gaze': { 'separator': '\t', 'columns': ['time', 'book_name', 'screen_id', 'x_left', 'y_left', 'pupil_left'], - 'dtypes': { + 'schema_overrides': { 'time': pl.Int64, 'book_name': pl.Utf8, 'screen_id': pl.Int64, diff --git a/src/pymovements/datasets/toy_dataset.py b/src/pymovements/datasets/toy_dataset.py index 7f45ea72c..b8ca4ad6e 100644 --- a/src/pymovements/datasets/toy_dataset.py +++ b/src/pymovements/datasets/toy_dataset.py @@ -71,7 +71,7 @@ class ToyDataset(DatasetDefinition): Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. - filename_format_dtypes: dict[str, dict[str, type]] + filename_format_schema_overrides: dict[str, dict[str, type]] If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. 
@@ -166,7 +166,7 @@ class ToyDataset(DatasetDefinition): default_factory=lambda: {'gaze': r'trial_{text_id:d}_{page_id:d}.csv'}, ) - filename_format_dtypes: dict[str, dict[str, type]] = field( + filename_format_schema_overrides: dict[str, dict[str, type]] = field( default_factory=lambda: { 'gaze': { 'text_id': int, @@ -189,7 +189,7 @@ class ToyDataset(DatasetDefinition): default_factory=lambda: { 'gaze': { 'columns': ['timestamp', 'x', 'y', 'stimuli_x', 'stimuli_y'], - 'dtypes': { + 'schema_overrides': { 'timestamp': pl.Float64, 'x': pl.Float64, 'y': pl.Float64, diff --git a/src/pymovements/datasets/toy_dataset_eyelink.py b/src/pymovements/datasets/toy_dataset_eyelink.py index 1dcd2a661..4d70501d1 100644 --- a/src/pymovements/datasets/toy_dataset_eyelink.py +++ b/src/pymovements/datasets/toy_dataset_eyelink.py @@ -72,7 +72,7 @@ class ToyDatasetEyeLink(DatasetDefinition): Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. - filename_format_dtypes: dict[str, dict[str, type]] + filename_format_schema_overrides: dict[str, dict[str, type]] If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. @@ -179,7 +179,7 @@ class ToyDatasetEyeLink(DatasetDefinition): }, ) - filename_format_dtypes: dict[str, dict[str, type]] = field( + filename_format_schema_overrides: dict[str, dict[str, type]] = field( default_factory=lambda: { 'gaze': { 'subject_id': int, diff --git a/src/pymovements/gaze/gaze_dataframe.py b/src/pymovements/gaze/gaze_dataframe.py index aec927a58..b7adc3d31 100644 --- a/src/pymovements/gaze/gaze_dataframe.py +++ b/src/pymovements/gaze/gaze_dataframe.py @@ -713,7 +713,7 @@ def detect( for group_identifier, group_gaze in grouped_frames.items(): # Create filter expression for selecting respective group rows. 
if len(self.trial_columns) == 1: - group_filter_expression = pl.col(self.trial_columns[0]) == group_identifier + group_filter_expression = pl.col(self.trial_columns[0]) == group_identifier[0] else: group_filter_expression = pl.col(self.trial_columns[0]) == group_identifier[0] for name, value in zip(self.trial_columns[1:], group_identifier[1:]): diff --git a/src/pymovements/gaze/integration.py b/src/pymovements/gaze/integration.py index 06ee617be..f05b22e31 100644 --- a/src/pymovements/gaze/integration.py +++ b/src/pymovements/gaze/integration.py @@ -157,11 +157,13 @@ def from_numpy( │ 0 ┆ [0.0, 0.0] │ │ 0 ┆ [0.0, 0.0] │ │ 0 ┆ [0.0, 0.0] │ + │ 0 ┆ [0.0, 0.0] │ │ … ┆ … │ │ 0 ┆ [0.0, 0.0] │ │ 0 ┆ [0.0, 0.0] │ │ 0 ┆ [0.0, 0.0] │ │ 0 ┆ [0.0, 0.0] │ + │ 0 ┆ [0.0, 0.0] │ └──────┴────────────┘ Use the ``orient`` keyword argument to specify the layout of your array. @@ -187,11 +189,13 @@ def from_numpy( │ 0 ┆ [0.0, 0.0] │ │ 0 ┆ [0.0, 0.0] │ │ 0 ┆ [0.0, 0.0] │ + │ 0 ┆ [0.0, 0.0] │ │ … ┆ … │ │ 0 ┆ [0.0, 0.0] │ │ 0 ┆ [0.0, 0.0] │ │ 0 ┆ [0.0, 0.0] │ │ 0 ┆ [0.0, 0.0] │ + │ 0 ┆ [0.0, 0.0] │ └──────┴────────────┘ Pass the data explicitly via the specific keyword arguments, without having to specify a schema. @@ -212,11 +216,13 @@ def from_numpy( │ 0 ┆ [0.0, 0.0] │ │ 0 ┆ [0.0, 0.0] │ │ 0 ┆ [0.0, 0.0] │ + │ 0 ┆ [0.0, 0.0] │ │ … ┆ … │ │ 0 ┆ [0.0, 0.0] │ │ 0 ┆ [0.0, 0.0] │ │ 0 ┆ [0.0, 0.0] │ │ 0 ┆ [0.0, 0.0] │ + │ 0 ┆ [0.0, 0.0] │ └──────┴────────────┘ """ # Either data or {time, pixel, position, velocity, acceleration} must be None. 
diff --git a/src/pymovements/gaze/io.py b/src/pymovements/gaze/io.py index 9b3b133f5..722477491 100644 --- a/src/pymovements/gaze/io.py +++ b/src/pymovements/gaze/io.py @@ -44,7 +44,7 @@ def from_csv( distance_column: str | None = None, column_map: dict[str, str] | None = None, add_columns: dict[str, str] | None = None, - column_dtypes: dict[str, type] | None = None, + column_schema_overrides: dict[str, type] | None = None, **read_csv_kwargs: Any, ) -> GazeDataFrame: """Initialize a :py:class:`pymovements.gaze.gaze_dataframe.GazeDataFrame`. @@ -94,7 +94,7 @@ def from_csv( add_columns: dict[str, str] | None Dictionary containing columns to add to loaded data frame. (default: None) - column_dtypes: dict[str, type] | None + column_schema_overrides: dict[str, type] | None Dictionary containing types for columns. (default: None) **read_csv_kwargs: Any @@ -141,7 +141,8 @@ def from_csv( │ 1 ┆ 0 ┆ 0 │ │ 2 ┆ 0 ┆ 0 │ │ 3 ┆ 0 ┆ 0 │ - │ … ┆ … ┆ … │ + │ 4 ┆ 0 ┆ 0 │ + │ 5 ┆ 0 ┆ 0 │ │ 6 ┆ 0 ┆ 0 │ │ 7 ┆ 0 ┆ 0 │ │ 8 ┆ 0 ┆ 0 │ @@ -171,7 +172,8 @@ def from_csv( │ 1 ┆ [0, 0] │ │ 2 ┆ [0, 0] │ │ 3 ┆ [0, 0] │ - │ … ┆ … │ + │ 4 ┆ [0, 0] │ + │ 5 ┆ [0, 0] │ │ 6 ┆ [0, 0] │ │ 7 ┆ [0, 0] │ │ 8 ┆ [0, 0] │ @@ -180,7 +182,7 @@ def from_csv( Please be aware that data types are inferred from a fixed number of rows. To ensure correct data types, you can pass a dictionary of column names and data types to the - `dtypes` keyword argument of :py:func:`polars.read_csv`: + `schema_overrides` keyword argument of :py:func:`polars.read_csv`: >>> from pymovements.gaze.io import from_csv >>> import polars as pl @@ -189,7 +191,7 @@ def from_csv( ... time_column = 'time', ... time_unit='ms', ... pixel_columns = ['x_left_pix','y_left_pix'], - ... dtypes = {'time': pl.Int64, 'x_left_pix': pl.Int64, 'y_left_pix': pl.Int64}, + ... schema_overrides = {'time': pl.Int64, 'x_left_pix': pl.Int64, 'y_left_pix': pl.Int64}, ... 
) >>> gaze.frame shape: (10, 2) @@ -202,7 +204,8 @@ def from_csv( │ 1 ┆ [0, 0] │ │ 2 ┆ [0, 0] │ │ 3 ┆ [0, 0] │ - │ … ┆ … │ + │ 4 ┆ [0, 0] │ + │ 5 ┆ [0, 0] │ │ 6 ┆ [0, 0] │ │ 7 ┆ [0, 0] │ │ 8 ┆ [0, 0] │ @@ -243,10 +246,10 @@ def from_csv( pl.col(column).cast(pl.Float64), ]) - if column_dtypes is not None: + if column_schema_overrides is not None: gaze_data = gaze_data.with_columns([ pl.col(fileinfo_key).cast(fileinfo_dtype) - for fileinfo_key, fileinfo_dtype in column_dtypes.items() + for fileinfo_key, fileinfo_dtype in column_schema_overrides.items() ]) # Create gaze data frame. @@ -272,7 +275,7 @@ def from_asc( schema: dict[str, Any] | None = None, experiment: Experiment | None = None, add_columns: dict[str, str] | None = None, - column_dtypes: dict[str, type] | None = None, + column_schema_overrides: dict[str, type] | None = None, ) -> GazeDataFrame: """Initialize a :py:class:`pymovements.gaze.gaze_dataframe.GazeDataFrame`. @@ -290,7 +293,7 @@ def from_asc( add_columns: dict[str, str] | None Dictionary containing columns to add to loaded data frame. (default: None) - column_dtypes: dict[str, type] | None + column_schema_overrides: dict[str, type] | None Dictionary containing types for columns. (default: None) @@ -317,7 +320,9 @@ def from_asc( │ 2154557 ┆ 778.0 ┆ [138.2, 132.7] │ │ 2154560 ┆ 777.0 ┆ [137.9, 131.6] │ │ 2154564 ┆ 778.0 ┆ [138.1, 131.0] │ + │ 2154596 ┆ 784.0 ┆ [139.6, 132.1] │ │ … ┆ … ┆ … │ + │ 2339246 ┆ 622.0 ┆ [629.9, 531.9] │ │ 2339271 ┆ 617.0 ┆ [639.4, 531.9] │ │ 2339272 ┆ 617.0 ┆ [639.0, 531.9] │ │ 2339290 ┆ 618.0 ┆ [637.6, 531.4] │ @@ -342,10 +347,10 @@ def from_asc( if column not in gaze_data.columns ]) - if column_dtypes is not None: + if column_schema_overrides is not None: gaze_data = gaze_data.with_columns([ pl.col(fileinfo_key).cast(fileinfo_dtype) - for fileinfo_key, fileinfo_dtype in column_dtypes.items() + for fileinfo_key, fileinfo_dtype in column_schema_overrides.items() ]) # Create gaze data frame. 
@@ -364,7 +369,7 @@ def from_ipc( experiment: Experiment | None = None, column_map: dict[str, str] | None = None, add_columns: dict[str, str] | None = None, - column_dtypes: dict[str, type] | None = None, + column_schema_overrides: dict[str, type] | None = None, **read_ipc_kwargs: Any, ) -> GazeDataFrame: """Initialize a :py:class:`pymovements.gaze.gaze_dataframe.GazeDataFrame`. @@ -382,7 +387,7 @@ def from_ipc( add_columns: dict[str, str] | None Dictionary containing columns to add to loaded data frame. (default: None) - column_dtypes: dict[str, type] | None + column_schema_overrides: dict[str, type] | None Dictionary containing types for columns. (default: None) **read_ipc_kwargs: Any @@ -411,7 +416,8 @@ def from_ipc( │ 1 ┆ [0, 0] │ │ 2 ┆ [0, 0] │ │ 3 ┆ [0, 0] │ - │ … ┆ … │ + │ 4 ┆ [0, 0] │ + │ 5 ┆ [0, 0] │ │ 6 ┆ [0, 0] │ │ 7 ┆ [0, 0] │ │ 8 ┆ [0, 0] │ @@ -438,10 +444,10 @@ def from_ipc( if column not in gaze_data.columns ]) - if column_dtypes is not None: + if column_schema_overrides is not None: gaze_data = gaze_data.with_columns([ pl.col(fileinfo_key).cast(fileinfo_dtype) - for fileinfo_key, fileinfo_dtype in column_dtypes.items() + for fileinfo_key, fileinfo_dtype in column_schema_overrides.items() ]) # Create gaze data frame. 
diff --git a/src/pymovements/gaze/transforms.py b/src/pymovements/gaze/transforms.py index 5b925e2ef..8e5916920 100644 --- a/src/pymovements/gaze/transforms.py +++ b/src/pymovements/gaze/transforms.py @@ -280,9 +280,9 @@ def pix2deg( ]) degree_components = [ - pl.arctan2d( + pl.arctan2( centered_pixels.list.get(component), distance_pixels.list.get(component), - ) + ).degrees() for component in range(n_components) ] diff --git a/src/pymovements/utils/parsing.py b/src/pymovements/utils/parsing.py index f9a60feb6..a2e12f43b 100755 --- a/src/pymovements/utils/parsing.py +++ b/src/pymovements/utils/parsing.py @@ -381,10 +381,7 @@ def parse_eyelink( for column, dtype in schema.items(): schema_overrides[column] = dtype - df = pl.from_dict( - data=samples, - schema_overrides=schema_overrides, - ) + df = pl.from_dict(data=samples).cast(schema_overrides) return df, pre_processed_metadata diff --git a/tests/functional/dataset_processing_test.py b/tests/functional/dataset_processing_test.py index 48abacb20..3c3dfdbaa 100644 --- a/tests/functional/dataset_processing_test.py +++ b/tests/functional/dataset_processing_test.py @@ -55,7 +55,7 @@ def fixture_dataset_init_kwargs(request): pixel_columns=['x_left_pix', 'y_left_pix'], experiment=pm.Experiment(1024, 768, 38, 30, 60, 'center', 1000), filename_format={'gaze': 'monocular_example.csv'}, - filename_format_dtypes={'gaze': {}}, + filename_format_schema_overrides={'gaze': {}}, custom_read_kwargs={'gaze': {}}, ), 'csv_binocular': pm.dataset.DatasetDefinition( @@ -70,7 +70,7 @@ def fixture_dataset_init_kwargs(request): pixel_columns=['x_left_pix', 'y_left_pix', 'x_right_pix', 'y_right_pix'], position_columns=['x_left_pos', 'y_left_pos', 'x_right_pos', 'y_right_pos'], experiment=pm.Experiment(1024, 768, 38, 30, 60, 'center', 1000), - filename_format_dtypes={'gaze': {}}, + filename_format_schema_overrides={'gaze': {}}, custom_read_kwargs={'gaze': {}}, ), 'ipc_monocular': pm.dataset.DatasetDefinition( @@ -81,7 +81,7 @@ def 
fixture_dataset_init_kwargs(request): }, filename_format={'gaze': 'monocular_example.feather'}, experiment=pm.Experiment(1024, 768, 38, 30, 60, 'center', 1000), - filename_format_dtypes={'gaze': {}}, + filename_format_schema_overrides={'gaze': {}}, custom_read_kwargs={'gaze': {}}, ), 'ipc_binocular': pm.dataset.DatasetDefinition( @@ -92,7 +92,7 @@ def fixture_dataset_init_kwargs(request): }, filename_format={'gaze': 'binocular_example.feather'}, experiment=pm.Experiment(1024, 768, 38, 30, 60, 'center', 1000), - filename_format_dtypes={'gaze': {}}, + filename_format_schema_overrides={'gaze': {}}, custom_read_kwargs={'gaze': {}}, ), 'emtec': pm.datasets.EMTeC( @@ -104,7 +104,7 @@ def fixture_dataset_init_kwargs(request): filename_format={'gaze': 'emtec_example.csv'}, time_column=pm.datasets.EMTeC().time_column, time_unit=pm.datasets.EMTeC().time_unit, - filename_format_dtypes={'gaze': {}}, + filename_format_schema_overrides={'gaze': {}}, trial_columns=None, ), 'didec': pm.datasets.DIDEC( @@ -116,7 +116,7 @@ def fixture_dataset_init_kwargs(request): filename_format={'gaze': 'didec_example.txt'}, time_column=pm.datasets.DIDEC().time_column, time_unit=pm.datasets.DIDEC().time_unit, - filename_format_dtypes={'gaze': {}}, + filename_format_schema_overrides={'gaze': {}}, trial_columns=None, ), 'hbn': pm.datasets.HBN( @@ -128,7 +128,7 @@ def fixture_dataset_init_kwargs(request): filename_format={'gaze': 'hbn_example.csv'}, time_column=pm.datasets.HBN().time_column, time_unit=pm.datasets.HBN().time_unit, - filename_format_dtypes={'gaze': {}}, + filename_format_schema_overrides={'gaze': {}}, trial_columns=None, ), 'sbsat': pm.datasets.SBSAT( @@ -140,7 +140,7 @@ def fixture_dataset_init_kwargs(request): filename_format={'gaze': 'sbsat_example.csv'}, time_column=pm.datasets.SBSAT().time_column, time_unit=pm.datasets.SBSAT().time_unit, - filename_format_dtypes={'gaze': {}}, + filename_format_schema_overrides={'gaze': {}}, trial_columns=None, ), 'gaze_on_faces': 
pm.datasets.GazeOnFaces( @@ -152,7 +152,7 @@ def fixture_dataset_init_kwargs(request): filename_format={'gaze': 'gaze_on_faces_example.csv'}, time_column=pm.datasets.GazeOnFaces().time_column, time_unit=pm.datasets.GazeOnFaces().time_unit, - filename_format_dtypes={'gaze': {}}, + filename_format_schema_overrides={'gaze': {}}, trial_columns=None, ), 'gazebase': pm.datasets.GazeBase( @@ -164,7 +164,7 @@ def fixture_dataset_init_kwargs(request): filename_format={'gaze': 'gazebase_example.csv'}, time_column=pm.datasets.GazeBase().time_column, time_unit=pm.datasets.GazeBase().time_unit, - filename_format_dtypes={'gaze': {}}, + filename_format_schema_overrides={'gaze': {}}, trial_columns=None, ), 'gazebase_vr': pm.datasets.GazeBaseVR( @@ -176,7 +176,7 @@ def fixture_dataset_init_kwargs(request): filename_format={'gaze': 'gazebase_vr_example.csv'}, time_column=pm.datasets.GazeBaseVR().time_column, time_unit=pm.datasets.GazeBaseVR().time_unit, - filename_format_dtypes={'gaze': {}}, + filename_format_schema_overrides={'gaze': {}}, trial_columns=None, ), 'gazegraph': pm.datasets.GazeGraph( @@ -188,7 +188,7 @@ def fixture_dataset_init_kwargs(request): filename_format={'gaze': 'gazegraph_example.csv'}, time_column=pm.datasets.GazeGraph().time_column, time_unit=pm.datasets.GazeGraph().time_unit, - filename_format_dtypes={'gaze': {}}, + filename_format_schema_overrides={'gaze': {}}, trial_columns=None, ), 'judo1000': pm.datasets.JuDo1000( @@ -200,7 +200,7 @@ def fixture_dataset_init_kwargs(request): filename_format={'gaze': 'judo1000_example.csv'}, time_column=pm.datasets.JuDo1000().time_column, time_unit=pm.datasets.JuDo1000().time_unit, - filename_format_dtypes={'gaze': {}}, + filename_format_schema_overrides={'gaze': {}}, trial_columns=['trial_id'], ), 'potec': pm.datasets.PoTeC( @@ -212,7 +212,7 @@ def fixture_dataset_init_kwargs(request): filename_format={'gaze': 'potec_example.tsv'}, time_column=pm.datasets.PoTeC().time_column, time_unit=pm.datasets.PoTeC().time_unit, - 
filename_format_dtypes={'gaze': {}}, + filename_format_schema_overrides={'gaze': {}}, trial_columns=None, ), } diff --git a/tests/unit/dataset/dataset_download_test.py b/tests/unit/dataset/dataset_download_test.py index 37e516eae..8ba7f5b64 100644 --- a/tests/unit/dataset/dataset_download_test.py +++ b/tests/unit/dataset/dataset_download_test.py @@ -1164,7 +1164,7 @@ def test_public_dataset_registered_correct_attributes(tmp_path, dataset_definiti assert dataset.definition.resources == dataset_definition.resources assert dataset.definition.experiment == dataset_definition.experiment assert dataset.definition.filename_format == dataset_definition.filename_format - assert dataset.definition.filename_format_dtypes == dataset_definition.filename_format_dtypes + assert dataset.definition.filename_format_schema_overrides == dataset_definition.filename_format_schema_overrides # noqa: E501 assert dataset.definition.has_files == dataset_definition.has_files diff --git a/tests/unit/dataset/dataset_files_test.py b/tests/unit/dataset/dataset_files_test.py index 341c8e7dd..80af569a2 100644 --- a/tests/unit/dataset/dataset_files_test.py +++ b/tests/unit/dataset/dataset_files_test.py @@ -207,7 +207,7 @@ def test_load_eyelink_file(tmp_path, read_kwargs): fileinfo_row={}, definition=DatasetDefinition( experiment=pm.Experiment(1024, 768, 38, 30, None, 'center', 100), - filename_format_dtypes={'gaze': {}, 'precomputed_events': {}}, + filename_format_schema_overrides={'gaze': {}, 'precomputed_events': {}}, ), custom_read_kwargs=read_kwargs, ) diff --git a/tests/unit/dataset/dataset_test.py b/tests/unit/dataset/dataset_test.py index 97d13d679..7bb144707 100644 --- a/tests/unit/dataset/dataset_test.py +++ b/tests/unit/dataset/dataset_test.py @@ -109,14 +109,14 @@ def mock_toy( 'precomputed_reading_measures': False, }, extract={'gaze': True, 'precomputed_events': True}, - filename_format_dtypes={ + filename_format_schema_overrides={ 'gaze': {'subject_id': pl.Int64}, 'precomputed_events': 
{'subject_id': pl.Int64}, 'precomputed_reading_measures': {'subject_id': pl.Int64}, }, ): - if filename_format_dtypes['precomputed_events']: + if filename_format_schema_overrides['precomputed_events']: subject_ids = list(range(1, 21)) fileinfo = pl.DataFrame( data={'subject_id': subject_ids}, @@ -330,7 +330,7 @@ def mock_toy( 'precomputed_events': r'{subject_id:d}.' + raw_fileformat, 'precomputed_reading_measures': r'{subject_id:d}.' + raw_fileformat, }, - filename_format_dtypes=filename_format_dtypes, + filename_format_schema_overrides=filename_format_schema_overrides, custom_read_kwargs={ 'gaze': {}, 'precomputed_events': {}, @@ -739,7 +739,7 @@ def test_clip(gaze_dataset_configuration): original_schema = dataset.gaze[0].schema - dataset.clip(-1000, 1000, input_column='pixel', output_column='pixel_clipped', n_components=4) + dataset.clip(-1000, 1000, input_column='pixel', output_column='pixel_clipped', n_components=2) expected_schema = {**original_schema, 'pixel_clipped': pl.List(pl.Float64)} for result_gaze_df in dataset.gaze: @@ -1802,7 +1802,7 @@ def precomputed_fixture_dataset(request, tmp_path): 'precomputed_reading_measures': False, }, extract={'precomputed_events': False}, - filename_format_dtypes={'precomputed_events': {}}, + filename_format_schema_overrides={'precomputed_events': {}}, ) else: raise ValueError(f'{request.param} not supported as dataset mock') @@ -1874,7 +1874,10 @@ def precomputed_rm_fixture_dataset(request, tmp_path): 'precomputed_reading_measures': True, }, extract={'precomputed_reading_measures': False}, - filename_format_dtypes={'precomputed_events': {}, 'precomputed_reading_measures': {}}, + filename_format_schema_overrides={ + 'precomputed_events': {}, + 'precomputed_reading_measures': {}, + }, ) else: raise ValueError(f'{request.param} not supported as dataset mock') diff --git a/tests/unit/datasets/datasets_test.py b/tests/unit/datasets/datasets_test.py index 9f71da7ac..6a3e02d58 100644 --- a/tests/unit/datasets/datasets_test.py 
+++ b/tests/unit/datasets/datasets_test.py @@ -105,7 +105,7 @@ def test_public_dataset_registered(public_dataset, dataset_name, dataset_path, d assert dataset_definition.resources['gaze'] == registered_definition.resources['gaze'] assert dataset_definition.experiment == registered_definition.experiment assert dataset_definition.filename_format['gaze'] == registered_definition.filename_format['gaze'] # noqa: E501 - assert dataset_definition.filename_format_dtypes['gaze'] == registered_definition.filename_format_dtypes['gaze'] # noqa: E501 + assert dataset_definition.filename_format_schema_overrides['gaze'] == registered_definition.filename_format_schema_overrides['gaze'] # noqa: E501 assert dataset_definition.custom_read_kwargs['gaze'] == registered_definition.custom_read_kwargs['gaze'] # noqa: E501 if dataset_definition.has_files['precomputed_events']: @@ -113,7 +113,7 @@ def test_public_dataset_registered(public_dataset, dataset_name, dataset_path, d assert dataset_definition.resources['precomputed_events'] == registered_definition.resources['precomputed_events'] # noqa: E501 assert dataset_definition.experiment == registered_definition.experiment assert dataset_definition.filename_format['precomputed_events'] == registered_definition.filename_format['precomputed_events'] # noqa: E501 - assert dataset_definition.filename_format_dtypes['precomputed_events'] == registered_definition.filename_format_dtypes['precomputed_events'] # noqa: E501 + assert dataset_definition.filename_format_schema_overrides['precomputed_events'] == registered_definition.filename_format_schema_overrides['precomputed_events'] # noqa: E501 assert dataset_definition.custom_read_kwargs['precomputed_events'] == registered_definition.custom_read_kwargs['precomputed_events'] # noqa: E501 if dataset_definition.has_files['precomputed_reading_measures']: @@ -121,7 +121,7 @@ def test_public_dataset_registered(public_dataset, dataset_name, dataset_path, d assert 
dataset_definition.resources['precomputed_reading_measures'] == registered_definition.resources['precomputed_reading_measures'] # noqa: E501 assert dataset_definition.experiment == registered_definition.experiment assert dataset_definition.filename_format['precomputed_reading_measures'] == registered_definition.filename_format['precomputed_reading_measures'] # noqa: E501 - assert dataset_definition.filename_format_dtypes['precomputed_reading_measures'] == registered_definition.filename_format_dtypes['precomputed_reading_measures'] # noqa: E501 + assert dataset_definition.filename_format_schema_overrides['precomputed_reading_measures'] == registered_definition.filename_format_schema_overrides['precomputed_reading_measures'] # noqa: E501 assert dataset_definition.custom_read_kwargs['precomputed_reading_measures'] == registered_definition.custom_read_kwargs['precomputed_reading_measures'] # noqa: E501 dataset, expected_paths = construct_public_dataset( diff --git a/tests/unit/events/frame_test.py b/tests/unit/events/frame_test.py index c967284af..5ea0c609b 100644 --- a/tests/unit/events/frame_test.py +++ b/tests/unit/events/frame_test.py @@ -379,7 +379,7 @@ def test_event_dataframe_init_expected_trial_column_list(kwargs, expected_trial_ ), 'trial_columns': 'trial', }, - pl.DataFrame({'trial': [1, 1]}), + pl.DataFrame({'trial': [1, 1]}, schema_overrides={'trial': pl.Int32}), id='two_rows_plain_trial', ), ], diff --git a/tests/unit/gaze/io/csv_test.py b/tests/unit/gaze/io/csv_test.py index 0efdb0546..ace86e563 100644 --- a/tests/unit/gaze/io/csv_test.py +++ b/tests/unit/gaze/io/csv_test.py @@ -121,7 +121,7 @@ def test_shapes(kwargs, shape): @pytest.mark.parametrize( - ('kwargs', 'dtypes'), + ('kwargs', 'schema_overrides'), [ pytest.param( { @@ -131,7 +131,7 @@ def test_shapes(kwargs, shape): 'pixel_columns': ['x_left_pix', 'y_left_pix'], }, [pl.Int64, pl.List(pl.Int64)], - id='csv_mono_dtypes', + id='csv_mono_schema_overrides', ), pytest.param( { @@ -142,10 +142,10 @@ 
def test_shapes(kwargs, shape): 'position_columns': ['position_x', 'position_y'], }, [pl.Int64, pl.List(pl.Float64), pl.List(pl.Float64)], - id='csv_missing_values_dtypes', + id='csv_missing_values_schema_overrides', ), ], ) -def test_dtypes(kwargs, dtypes): +def test_schema_overrides(kwargs, schema_overrides): gaze_dataframe = pm.gaze.from_csv(**kwargs) - assert gaze_dataframe.frame.dtypes == dtypes + assert gaze_dataframe.frame.dtypes == schema_overrides From 08a1399159e441dbc4b6c434f445b811287c0ac3 Mon Sep 17 00:00:00 2001 From: "David R. Reich" <43832476+SiQube@users.noreply.github.com> Date: Sat, 28 Sep 2024 20:25:02 +0200 Subject: [PATCH 26/31] manually update pre-commit config (#818) --- .pre-commit-config.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b0f9d6466..d3aef8533 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,7 +13,7 @@ repos: hooks: - id: add-trailing-comma - repo: https://github.com/asottile/pyupgrade - rev: v3.16.0 + rev: v3.17.0 hooks: - id: pyupgrade args: [--py39-plus] @@ -58,7 +58,7 @@ repos: - id: pydoclint args: ["--config=pyproject.toml"] - repo: https://github.com/nbQA-dev/nbQA - rev: 1.8.5 + rev: 1.8.7 hooks: - id: nbqa-autopep8 - id: nbqa-flake8 @@ -68,7 +68,7 @@ repos: - id: nbqa-pyupgrade args: ["--py39-plus"] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.10.0 + rev: v1.11.2 hooks: - id: mypy additional_dependencies: [pandas-stubs, types-tqdm] @@ -90,7 +90,7 @@ repos: - id: requirements-txt-fixer - id: trailing-whitespace - repo: https://github.com/hhatto/autopep8 - rev: v2.3.0 + rev: v2.3.1 hooks: - id: autopep8 - repo: https://github.com/PyCQA/autoflake @@ -98,11 +98,11 @@ repos: hooks: - id: autoflake - repo: https://github.com/PyCQA/doc8 - rev: v1.1.1 + rev: v1.1.2 hooks: - id: doc8 - repo: https://github.com/PyCQA/flake8 - rev: 7.1.0 + rev: 7.1.1 hooks: - id: flake8 - repo: 
https://github.com/pycqa/pydocstyle @@ -121,7 +121,7 @@ repos: '--ignore=D103,D107,D213', ] - repo: https://github.com/PyCQA/pylint - rev: v3.2.3 + rev: v3.3.1 hooks: - id: pylint name: pylint From ee4400695b37e2af2d2d3bef6b2bfb916cafd34c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 28 Sep 2024 19:09:41 +0000 Subject: [PATCH 27/31] build: update nbconvert requirement from <7.14,>=7.0.0 to >=7.16.4,<7.17 (#741) Updates the requirements on [nbconvert](https://github.com/jupyter/nbconvert) to permit the latest version. - [Release notes](https://github.com/jupyter/nbconvert/releases) - [Changelog](https://github.com/jupyter/nbconvert/blob/main/CHANGELOG.md) - [Commits](https://github.com/jupyter/nbconvert/compare/7.0.0...v7.16.4) --- updated-dependencies: - dependency-name: nbconvert dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 075d01c49..420a8dc3e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,7 @@ dynamic = ["version"] [project.optional-dependencies] docs = [ "ipykernel>=6.13.0", - "nbconvert>=7.0.0,<7.14", + "nbconvert>=7.0.0,<7.17", "nbsphinx>=0.8.8,<0.9.5", "pandoc", "pybtex", From eee4f76672fe1060c39c6f3c03adc4601e9f6906 Mon Sep 17 00:00:00 2001 From: "David R. 
Reich" <43832476+SiQube@users.noreply.github.com> Date: Sun, 29 Sep 2024 17:46:24 +0200 Subject: [PATCH 28/31] update pydoclint in pre-commit config (#819) --- .pre-commit-config.yaml | 2 +- src/pymovements/datasets/gaze_graph.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d3aef8533..bb3345214 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -53,7 +53,7 @@ repos: args: [--use-current-year] types: [python] - repo: https://github.com/jsh9/pydoclint - rev: 0.4.2 + rev: 0.5.8 hooks: - id: pydoclint args: ["--config=pyproject.toml"] diff --git a/src/pymovements/datasets/gaze_graph.py b/src/pymovements/datasets/gaze_graph.py index 530fc046d..88b50c75e 100644 --- a/src/pymovements/datasets/gaze_graph.py +++ b/src/pymovements/datasets/gaze_graph.py @@ -77,7 +77,7 @@ class GazeGraph(DatasetDefinition): Regular expression which will be matched before trying to load the file. Namedgroups will appear in the `fileinfo` dataframe. - filename_format_schema_overrides: dict[str, dict[str, Any]] + filename_format_schema_overrides: dict[str, dict[str, type]] If named groups are present in the `filename_format`, this makes it possible to cast specific named groups to a particular datatype. From dbad9c17137ae5e6685dc1d27df315b353bcf3b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20S=C3=A4uberli?= <38892775+saeub@users.noreply.github.com> Date: Sun, 29 Sep 2024 21:37:22 +0200 Subject: [PATCH 29/31] feat!: Custom patterns for parsing logged metadata in ASC files (#767) * Return metadata from from_asc() * Parse metadata from ASC files based on custom patterns * Refactor * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Refactor * Fix test coverage * Fix docstrings * Fix docstring --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: David R. 
Reich <43832476+SiQube@users.noreply.github.com> --- src/pymovements/dataset/dataset_files.py | 2 +- src/pymovements/gaze/io.py | 31 +-- src/pymovements/utils/parsing.py | 198 +++++++++--------- tests/functional/gaze_file_processing_test.py | 2 +- tests/unit/gaze/io/asc_test.py | 38 +++- tests/unit/utils/parsing_test.py | 16 ++ 6 files changed, 173 insertions(+), 114 deletions(-) diff --git a/src/pymovements/dataset/dataset_files.py b/src/pymovements/dataset/dataset_files.py index dfc4c40eb..1553fddb9 100644 --- a/src/pymovements/dataset/dataset_files.py +++ b/src/pymovements/dataset/dataset_files.py @@ -377,7 +377,7 @@ def load_gaze_file( column_schema_overrides=definition.filename_format_schema_overrides['gaze'], ) elif filepath.suffix == '.asc': - gaze_df = from_asc( + gaze_df, _ = from_asc( filepath, experiment=definition.experiment, add_columns=add_columns, diff --git a/src/pymovements/gaze/io.py b/src/pymovements/gaze/io.py index 722477491..18868e3dd 100644 --- a/src/pymovements/gaze/io.py +++ b/src/pymovements/gaze/io.py @@ -271,21 +271,25 @@ def from_csv( def from_asc( file: str | Path, *, - patterns: str | list | None = 'eyelink', + patterns: str | list[dict[str, Any] | str] | None = 'eyelink', + metadata_patterns: list[dict[str, Any] | str] | None = None, schema: dict[str, Any] | None = None, experiment: Experiment | None = None, add_columns: dict[str, str] | None = None, - column_schema_overrides: dict[str, type] | None = None, -) -> GazeDataFrame: + column_schema_overrides: dict[str, Any] | None = None, +) -> tuple[GazeDataFrame, dict[str, Any]]: """Initialize a :py:class:`pymovements.gaze.gaze_dataframe.GazeDataFrame`. Parameters ---------- file: str | Path Path of IPC/feather file. 
- patterns: str | list | None - list of patterns to match for additional columns or a key identifier of eye tracker specific + patterns: str | list[dict[str, Any] | str] | None + List of patterns to match for additional columns or a key identifier of eye tracker specific default patterns. Supported values are: eyelink. (default: 'eyelink') + metadata_patterns: list[dict[str, Any] | str] | None + List of patterns to match for extracting metadata from custom logged messages. + (default: None) schema: dict[str, Any] | None Dictionary to optionally specify types of columns parsed by patterns. (default: None) experiment: Experiment | None @@ -293,14 +297,14 @@ def from_asc( add_columns: dict[str, str] | None Dictionary containing columns to add to loaded data frame. (default: None) - column_schema_overrides: dict[str, type] | None + column_schema_overrides: dict[str, Any] | None Dictionary containing types for columns. (default: None) Returns ------- - GazeDataFrame - The gaze data frame read from the asc file. + tuple[GazeDataFrame, dict[str, Any]] + The gaze data frame and a metadata dictionary read from the asc file. Examples -------- @@ -308,7 +312,7 @@ def from_asc( We can then load the data into a ``GazeDataFrame``: >>> from pymovements.gaze.io import from_asc - >>> gaze = from_asc(file='tests/files/eyelink_monocular_example.asc', patterns='eyelink') + >>> gaze, metadata = from_asc(file='tests/files/eyelink_monocular_example.asc') >>> gaze.frame shape: (16, 3) ┌─────────┬───────┬────────────────┐ @@ -328,7 +332,8 @@ def from_asc( │ 2339290 ┆ 618.0 ┆ [637.6, 531.4] │ │ 2339291 ┆ 618.0 ┆ [637.3, 531.2] │ └─────────┴───────┴────────────────┘ - + >>> metadata['sampling_rate'] + 1000.0 """ if isinstance(patterns, str): if patterns == 'eyelink': @@ -338,7 +343,9 @@ def from_asc( raise ValueError(f"unknown pattern key '{patterns}'. Supported keys are: eyelink") # Read data. 
- gaze_data, _ = parse_eyelink(file, patterns=patterns, schema=schema) + gaze_data, metadata = parse_eyelink( + file, patterns=patterns, schema=schema, metadata_patterns=metadata_patterns, + ) if add_columns is not None: gaze_data = gaze_data.with_columns([ @@ -361,7 +368,7 @@ def from_asc( time_unit='ms', pixel_columns=['x_pix', 'y_pix'], ) - return gaze_df + return gaze_df, metadata def from_ipc( diff --git a/src/pymovements/utils/parsing.py b/src/pymovements/utils/parsing.py index a2e12f43b..849cb8641 100755 --- a/src/pymovements/utils/parsing.py +++ b/src/pymovements/utils/parsing.py @@ -40,58 +40,58 @@ ) EYELINK_META_REGEXES = [ - {'pattern': r'\*\*\s+VERSION:\s+(?P.*)\s+'}, - { - 'pattern': r'\*\*\s+DATE:\s+(?P[A-Z,a-z]+)\s+(?P[A-Z,a-z]+)' - r'\s+(?P\d\d?)\s+(?P