Merge remote-tracking branch upstream/main into feat/asc-metadata
saeub committed Sep 29, 2024
2 parents 58417ce + 84f1416 commit b2ce27f
Showing 46 changed files with 652 additions and 265 deletions.
16 changes: 8 additions & 8 deletions .pre-commit-config.yaml
@@ -13,7 +13,7 @@ repos:
hooks:
- id: add-trailing-comma
- repo: https://github.com/asottile/pyupgrade
-rev: v3.16.0
+rev: v3.17.0
hooks:
- id: pyupgrade
args: [--py39-plus]
@@ -53,12 +53,12 @@ repos:
args: [--use-current-year]
types: [python]
- repo: https://github.com/jsh9/pydoclint
-rev: 0.4.2
+rev: 0.5.8
hooks:
- id: pydoclint
args: ["--config=pyproject.toml"]
- repo: https://github.com/nbQA-dev/nbQA
-rev: 1.8.5
+rev: 1.8.7
hooks:
- id: nbqa-autopep8
- id: nbqa-flake8
@@ -68,7 +68,7 @@
- id: nbqa-pyupgrade
args: ["--py39-plus"]
- repo: https://github.com/pre-commit/mirrors-mypy
-rev: v1.10.0
+rev: v1.11.2
hooks:
- id: mypy
additional_dependencies: [pandas-stubs, types-tqdm]
@@ -90,19 +90,19 @@
- id: requirements-txt-fixer
- id: trailing-whitespace
- repo: https://github.com/hhatto/autopep8
-rev: v2.3.0
+rev: v2.3.1
hooks:
- id: autopep8
- repo: https://github.com/PyCQA/autoflake
rev: v2.3.1
hooks:
- id: autoflake
- repo: https://github.com/PyCQA/doc8
-rev: v1.1.1
+rev: v1.1.2
hooks:
- id: doc8
- repo: https://github.com/PyCQA/flake8
-rev: 7.1.0
+rev: 7.1.1
hooks:
- id: flake8
- repo: https://github.com/pycqa/pydocstyle
@@ -121,7 +121,7 @@
'--ignore=D103,D107,D213',
]
- repo: https://github.com/PyCQA/pylint
-rev: v3.2.3
+rev: v3.3.1
hooks:
- id: pylint
name: pylint
4 changes: 2 additions & 2 deletions docs/source/tutorials/local-dataset.ipynb
@@ -142,7 +142,7 @@
"metadata": {},
"outputs": [],
"source": [
"filename_format_dtypes = {'gaze': {\n",
"filename_format_schema_overrides = {'gaze': {\n",
" 'text_id': int,\n",
" 'page_id': int,\n",
"},\n",
@@ -254,7 +254,7 @@
" has_files={'gaze': True, 'precomputed_events': False, 'precomputed_reading_measures': False},\n",
" experiment=experiment,\n",
" filename_format=filename_format,\n",
" filename_format_dtypes=filename_format_dtypes,\n",
" filename_format_schema_overrides=filename_format_schema_overrides,\n",
" custom_read_kwargs=custom_read_kwargs,\n",
" time_column=time_column,\n",
" time_unit=time_unit,\n",
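For reference, a minimal sketch of the renamed keyword as the tutorial now uses it, assuming the `pm.DatasetDefinition` constructor shown in this diff; the dataset name and filename pattern here are illustrative, not taken from the notebook:

import pymovements as pm

# Hypothetical definition exercising the renamed keyword
# (formerly `filename_format_dtypes`).
definition = pm.DatasetDefinition(
    name='my_dataset',
    filename_format={'gaze': r'{text_id:d}_{page_id:d}.csv'},
    filename_format_schema_overrides={'gaze': {'text_id': int, 'page_id': int}},
)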
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -35,7 +35,7 @@ dependencies = [
"matplotlib>=3.8.0,<3.10",
"numpy>=1.22.4,<3",
"pandas>=2.1.4,<3",
"polars>=0.20.1,<0.20.3",
"polars>=1.8.2,<2",
"pyarrow>=11.0.0,<18",
"pyopenssl>=16.0.0,<25.0.0",
"scipy>=1.8.0,<2",
@@ -47,7 +47,7 @@ dynamic = ["version"]
[project.optional-dependencies]
docs = [
"ipykernel>=6.13.0",
"nbconvert>=7.0.0,<7.14",
"nbconvert>=7.0.0,<7.17",
"nbsphinx>=0.8.8,<0.9.5",
"pandoc",
"pybtex",
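The polars bump from the 0.20 line to >=1.8.2,<2 is what drives the renames in the rest of this commit: polars 1.x replaced the `dtypes` keyword of `pl.read_csv` with `schema_overrides`. A minimal sketch of the new spelling, with a hypothetical file and hypothetical column names:

import polars as pl

# polars >= 1.0: `schema_overrides` replaces the old `dtypes` keyword.
df = pl.read_csv(
    'gaze.csv',  # hypothetical input file
    schema_overrides={'text_id': pl.Int64, 'page_id': pl.Int64},
)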
Binary file not shown.
17 changes: 10 additions & 7 deletions src/pymovements/dataset/dataset_definition.py
@@ -35,6 +35,9 @@ class DatasetDefinition:
----------
name: str
The name of the dataset. (default: '.')
+has_files: dict[str, bool]
+    Indicate whether the dataset contains 'gaze', 'precomputed_events', and
+    'precomputed_reading_measures'.
mirrors: dict[str, tuple[str, ...]]
A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
(default: field(default_factory=dict))
@@ -44,16 +47,16 @@
- `filename`: The filename under which the file is saved as.
- `md5`: The MD5 checksum of the respective file.
(default: field(default_factory=dict))
-experiment: Experiment
+experiment: Experiment | None
The experiment definition. (default: None)
-extract: dict[str, bool]
-    Decide whether to extract the data.
filename_format: dict[str, str]
Regular expression which will be matched before trying to load the file. Named groups will
appear in the `fileinfo` dataframe. (default: field(default_factory=dict))
-filename_format_dtypes: dict[str, dict[str, type]]
+filename_format_schema_overrides: dict[str, dict[str, type]]
If named groups are present in the `filename_format`, this makes it possible to cast
specific named groups to a particular datatype. (default: field(default_factory=dict))
+extract: dict[str, bool]
+    Decide whether to extract the data.
custom_read_kwargs: dict[str, dict[str, Any]]
If specified, these keyword arguments will be passed to the file reading function. The
behavior of this argument depends on the file extension of the dataset files.
@@ -119,10 +122,10 @@ class DatasetDefinition:
3. Specifying column datatypes
``polars.read_csv`` infers data types from a fixed number of rows, which might not be accurate
for the entire dataset. To ensure correct data types, you can pass a dictionary to the
-``dtypes`` keyword argument in ``gaze_custom_read_kwargs``.
+``schema_overrides`` keyword argument in ``gaze_custom_read_kwargs``.
Use data types from the `polars` library.
For instance:
-``gaze_custom_read_kwargs={'dtypes': {'col1': polars.Int64, 'col2': polars.Float64}}``
+``gaze_custom_read_kwargs={'schema_overrides': {'col1': polars.Int64, 'col2': polars.Float64}}``
"""

# pylint: disable=too-many-instance-attributes
Expand All @@ -138,7 +141,7 @@ class DatasetDefinition:

filename_format: dict[str, str] = field(default_factory=dict)

-filename_format_dtypes: dict[str, dict[str, type]] = field(default_factory=dict)
+filename_format_schema_overrides: dict[str, dict[str, type]] = field(default_factory=dict)

custom_read_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)

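The docstring's third note above can be exercised like this; a sketch assuming the `custom_read_kwargs` field from this diff and reusing the docstring's placeholder column names 'col1' and 'col2':

import polars as pl
import pymovements as pm

# Override inferred CSV dtypes for gaze files via the read kwargs;
# the dataset name and column names are placeholders.
definition = pm.DatasetDefinition(
    name='my_dataset',
    custom_read_kwargs={
        'gaze': {'schema_overrides': {'col1': pl.Int64, 'col2': pl.Float64}},
    },
)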
31 changes: 17 additions & 14 deletions src/pymovements/dataset/dataset_files.py
@@ -74,8 +74,8 @@ def scan_dataset(definition: DatasetDefinition, paths: DatasetPaths) -> pl.DataF

fileinfo_df = pl.from_dicts(data=fileinfo_dicts, infer_schema_length=1)
fileinfo_df = fileinfo_df.sort(by='filepath')
-if definition.filename_format_dtypes['gaze']:
-items = definition.filename_format_dtypes['gaze'].items()
+if definition.filename_format_schema_overrides['gaze']:
+items = definition.filename_format_schema_overrides['gaze'].items()
fileinfo_df = fileinfo_df.with_columns([
pl.col(fileinfo_key).cast(fileinfo_dtype)
for fileinfo_key, fileinfo_dtype in items
@@ -92,31 +92,33 @@
raise RuntimeError(f'no matching files found in {paths.precomputed_events}')
fileinfo_df = pl.from_dicts(data=fileinfo_dicts, infer_schema_length=1)
fileinfo_df = fileinfo_df.sort(by='filepath')
-if definition.filename_format_dtypes['precomputed_events']:
-items = definition.filename_format_dtypes['precomputed_events'].items()
+if definition.filename_format_schema_overrides['precomputed_events']:
+items = definition.filename_format_schema_overrides['precomputed_events'].items()
fileinfo_df = fileinfo_df.with_columns([
pl.col(fileinfo_key).cast(fileinfo_dtype)
for fileinfo_key, fileinfo_dtype in items
])
_fileinfo_dicts['precomputed_events'] = fileinfo_df

-if definition.has_files['precomputed_reading_measures']:
+pc_rm = 'precomputed_reading_measures'
+if definition.has_files[pc_rm]:
fileinfo_dicts = match_filepaths(
path=paths.precomputed_reading_measures,
-regex=curly_to_regex(definition.filename_format['precomputed_reading_measures']),
+regex=curly_to_regex(definition.filename_format[pc_rm]),
relative=True,
)
if not fileinfo_dicts:
raise RuntimeError(f'no matching files found in {paths.precomputed_reading_measures}')
fileinfo_df = pl.from_dicts(data=fileinfo_dicts, infer_schema_length=1)
fileinfo_df = fileinfo_df.sort(by='filepath')
-if definition.filename_format_dtypes['precomputed_reading_measures']:
-items = definition.filename_format_dtypes['precomputed_reading_measures'].items()
+if definition.filename_format_schema_overrides[pc_rm]:
+_schema_overrides = definition.filename_format_schema_overrides[pc_rm]
+items = _schema_overrides.items()
fileinfo_df = fileinfo_df.with_columns([
pl.col(fileinfo_key).cast(fileinfo_dtype)
for fileinfo_key, fileinfo_dtype in items
])
-_fileinfo_dicts['precomputed_reading_measures'] = fileinfo_df
+_fileinfo_dicts[pc_rm] = fileinfo_df

return _fileinfo_dicts

@@ -316,7 +318,7 @@ def load_gaze_file(
trial_columns=definition.trial_columns,
time_unit=time_unit,
add_columns=add_columns,
-column_dtypes=definition.filename_format_dtypes['gaze'],
+column_schema_overrides=definition.filename_format_schema_overrides['gaze'],
)

# suffixes as ordered after using GazeDataFrame.unnest()
@@ -364,22 +366,22 @@ def load_gaze_file(
trial_columns=definition.trial_columns,
column_map=definition.column_map,
add_columns=add_columns,
-column_dtypes=definition.filename_format_dtypes['gaze'],
+column_schema_overrides=definition.filename_format_schema_overrides['gaze'],
**custom_read_kwargs,
)
elif filepath.suffix == '.feather':
gaze_df = from_ipc(
filepath,
experiment=definition.experiment,
add_columns=add_columns,
-column_dtypes=definition.filename_format_dtypes['gaze'],
+column_schema_overrides=definition.filename_format_schema_overrides['gaze'],
)
elif filepath.suffix == '.asc':
gaze_df, _ = from_asc(
filepath,
experiment=definition.experiment,
add_columns=add_columns,
-column_dtypes=definition.filename_format_dtypes['gaze'],
+column_schema_overrides=definition.filename_format_schema_overrides['gaze'],
**custom_read_kwargs,
)
else:
@@ -556,9 +558,10 @@ def add_fileinfo(
)

# Cast columns from fileinfo according to specification.
+_schema_overrides = definition.filename_format_schema_overrides['gaze']
df = df.with_columns([
pl.col(fileinfo_key).cast(fileinfo_dtype)
-for fileinfo_key, fileinfo_dtype in definition.filename_format_dtypes['gaze'].items()
+for fileinfo_key, fileinfo_dtype in _schema_overrides.items()
])
return df

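The cast idiom that `scan_dataset` and `add_fileinfo` apply to the parsed fileinfo columns, shown in isolation; the data frame and the overrides mapping below are stand-ins for illustration:

import polars as pl

fileinfo_df = pl.DataFrame({'subject_id': ['1', '2'], 'filepath': ['a.csv', 'b.csv']})
schema_overrides = {'subject_id': pl.Int64}  # stand-in for the definition's overrides

# Cast each named-group column to its configured dtype.
fileinfo_df = fileinfo_df.with_columns([
    pl.col(key).cast(dtype)
    for key, dtype in schema_overrides.items()
])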
59 changes: 44 additions & 15 deletions src/pymovements/datasets/copco.py
@@ -47,34 +47,62 @@ class CopCo(DatasetDefinition):
Attributes
----------
-name : str
+name: str
The name of the dataset.
-mirrors : tuple[str, ...]
+has_files: dict[str, bool]
+    Indicate whether the dataset contains 'gaze', 'precomputed_events', and
+    'precomputed_reading_measures'.
+mirrors: dict[str, tuple[str, ...]]
A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
-resources : tuple[dict[str, str], ...]
+resources: dict[str, tuple[dict[str, str | None], ...]]
A tuple of dataset gaze_resources. Each list entry must be a dictionary with the following
keys:
- `resource`: The url suffix of the resource. This will be concatenated with the mirror.
- `filename`: The filename under which the file is saved as.
- `md5`: The MD5 checksum of the respective file.
-experiment : Experiment
+experiment: Experiment
The experiment definition.
-filename_format : str
+extract: dict[str, bool]
+    Decide whether to extract the data.
+filename_format: dict[str, str]
Regular expression which will be matched before trying to load the file. Named groups will
appear in the `fileinfo` dataframe.
-filename_format_dtypes : dict[str, type], optional
+filename_format_schema_overrides: dict[str, dict[str, type]]
If named groups are present in the `filename_format`, this makes it possible to cast
specific named groups to a particular datatype.
-column_map : dict[str, str]
+trial_columns: list[str]
+    The name of the trial columns in the input data frame. If the list is empty or None,
+    the input data frame is assumed to contain only one trial. If the list is not empty,
+    the input data frame is assumed to contain multiple trials and the transformation
+    methods will be applied to each trial separately.
+time_column: str
+    The name of the timestamp column in the input data frame. This column will be renamed to
+    ``time``.
+time_unit: str
+    The unit of the timestamps in the timestamp column in the input data frame. Supported
+    units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is
+    'step' the experiment definition must be specified. All timestamps will be converted to
+    milliseconds.
+pixel_columns: list[str]
+    The name of the pixel position columns in the input data frame. These columns will be
+    nested into the column ``pixel``. If the list is empty or None, the nested ``pixel``
+    column will not be created.
+column_map: dict[str, str]
The keys are the columns to read, the values are the names to which they should be renamed.
-custom_read_kwargs : dict[str, Any], optional
+custom_read_kwargs: dict[str, Any]
If specified, these keyword arguments will be passed to the file reading function.
Examples
@@ -107,12 +135,6 @@
'precomputed_reading_measures': True,
},
)
-extract: dict[str, bool] = field(
-default_factory=lambda: {
-'precomputed_events': True,
-'precomputed_reading_measures': True,
-},
-)
mirrors: dict[str, tuple[str, ...]] = field(
default_factory=lambda: {
'precomputed_events': ('https://files.de-1.osf.io/',),
@@ -150,14 +172,21 @@
sampling_rate=1000,
)

+extract: dict[str, bool] = field(
+default_factory=lambda: {
+'precomputed_events': True,
+'precomputed_reading_measures': True,
+},
+)
+
filename_format: dict[str, str] = field(
default_factory=lambda: {
'precomputed_events': r'FIX_report_P{subject_id:d}.txt',
'precomputed_reading_measures': r'P{subject_id:d}.csv',
},
)

-filename_format_dtypes: dict[str, dict[str, type]] = field(
+filename_format_schema_overrides: dict[str, dict[str, type]] = field(
default_factory=lambda: {
'precomputed_events': {},
'precomputed_reading_measures': {},
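The truncated Examples section above conventionally shows the definition in use; the usual pattern is sketched below, with an illustrative download path:

import pymovements as pm

# Initialize the public dataset definition, then fetch and load its files.
dataset = pm.Dataset('CopCo', path='data/CopCo')
dataset.download()
dataset.load()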
[Diff truncated; the remaining changed files are not shown.]
