Merge remote-tracking branch upstream/main into feat/asc-metadata
saeub committed Sep 29, 2024
2 parents 58417ce + 84f1416 commit b2ce27f
Showing 46 changed files with 652 additions and 265 deletions.
16 changes: 8 additions & 8 deletions .pre-commit-config.yaml
@@ -13,7 +13,7 @@ repos:
hooks:
- id: add-trailing-comma
- repo: https://github.com/asottile/pyupgrade
-rev: v3.16.0
+rev: v3.17.0
hooks:
- id: pyupgrade
args: [--py39-plus]
@@ -53,12 +53,12 @@ repos:
args: [--use-current-year]
types: [python]
- repo: https://github.com/jsh9/pydoclint
-rev: 0.4.2
+rev: 0.5.8
hooks:
- id: pydoclint
args: ["--config=pyproject.toml"]
- repo: https://github.com/nbQA-dev/nbQA
-rev: 1.8.5
+rev: 1.8.7
hooks:
- id: nbqa-autopep8
- id: nbqa-flake8
@@ -68,7 +68,7 @@
- id: nbqa-pyupgrade
args: ["--py39-plus"]
- repo: https://github.com/pre-commit/mirrors-mypy
-rev: v1.10.0
+rev: v1.11.2
hooks:
- id: mypy
additional_dependencies: [pandas-stubs, types-tqdm]
@@ -90,19 +90,19 @@
- id: requirements-txt-fixer
- id: trailing-whitespace
- repo: https://github.com/hhatto/autopep8
-rev: v2.3.0
+rev: v2.3.1
hooks:
- id: autopep8
- repo: https://github.com/PyCQA/autoflake
rev: v2.3.1
hooks:
- id: autoflake
- repo: https://github.com/PyCQA/doc8
-rev: v1.1.1
+rev: v1.1.2
hooks:
- id: doc8
- repo: https://github.com/PyCQA/flake8
-rev: 7.1.0
+rev: 7.1.1
hooks:
- id: flake8
- repo: https://github.com/pycqa/pydocstyle
@@ -121,7 +121,7 @@
'--ignore=D103,D107,D213',
]
- repo: https://github.com/PyCQA/pylint
-rev: v3.2.3
+rev: v3.3.1
hooks:
- id: pylint
name: pylint
4 changes: 2 additions & 2 deletions docs/source/tutorials/local-dataset.ipynb
@@ -142,7 +142,7 @@
"metadata": {},
"outputs": [],
"source": [
"filename_format_dtypes = {'gaze': {\n",
"filename_format_schema_overrides = {'gaze': {\n",
" 'text_id': int,\n",
" 'page_id': int,\n",
"},\n",
@@ -254,7 +254,7 @@
" has_files={'gaze': True, 'precomputed_events': False, 'precomputed_reading_measures': False},\n",
" experiment=experiment,\n",
" filename_format=filename_format,\n",
" filename_format_dtypes=filename_format_dtypes,\n",
" filename_format_schema_overrides=filename_format_schema_overrides,\n",
" custom_read_kwargs=custom_read_kwargs,\n",
" time_column=time_column,\n",
" time_unit=time_unit,\n",
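For reference, a minimal sketch of the renamed keyword as the tutorial now uses it, assuming the `pm.DatasetDefinition` constructor shown in this diff; the dataset name and filename pattern here are illustrative, not taken from the notebook:

import pymovements as pm

# Hypothetical definition exercising the renamed keyword
# (formerly `filename_format_dtypes`).
definition = pm.DatasetDefinition(
    name='my_dataset',
    filename_format={'gaze': r'{text_id:d}_{page_id:d}.csv'},
    filename_format_schema_overrides={'gaze': {'text_id': int, 'page_id': int}},
)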
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -35,7 +35,7 @@ dependencies = [
"matplotlib>=3.8.0,<3.10",
"numpy>=1.22.4,<3",
"pandas>=2.1.4,<3",
"polars>=0.20.1,<0.20.3",
"polars>=1.8.2,<2",
"pyarrow>=11.0.0,<18",
"pyopenssl>=16.0.0,<25.0.0",
"scipy>=1.8.0,<2",
@@ -47,7 +47,7 @@ dynamic = ["version"]
[project.optional-dependencies]
docs = [
"ipykernel>=6.13.0",
"nbconvert>=7.0.0,<7.14",
"nbconvert>=7.0.0,<7.17",
"nbsphinx>=0.8.8,<0.9.5",
"pandoc",
"pybtex",
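The polars bump from the 0.20 line to >=1.8.2,<2 is what drives the renames in the rest of this commit: polars 1.x replaced the `dtypes` keyword of `pl.read_csv` with `schema_overrides`. A minimal sketch of the new spelling, with a hypothetical file and hypothetical column names:

import polars as pl

# polars >= 1.0: `schema_overrides` replaces the old `dtypes` keyword.
df = pl.read_csv(
    'gaze.csv',  # hypothetical input file
    schema_overrides={'text_id': pl.Int64, 'page_id': pl.Int64},
)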
Binary file not shown.
17 changes: 10 additions & 7 deletions src/pymovements/dataset/dataset_definition.py
@@ -35,6 +35,9 @@ class DatasetDefinition:
----------
name: str
The name of the dataset. (default: '.')
+has_files: dict[str, bool]
+    Indicate whether the dataset contains 'gaze', 'precomputed_events', and
+    'precomputed_reading_measures'.
mirrors: dict[str, tuple[str, ...]]
A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
(default: field(default_factory=dict))
@@ -44,16 +47,16 @@
- `filename`: The filename under which the file is saved as.
- `md5`: The MD5 checksum of the respective file.
(default: field(default_factory=dict))
-experiment: Experiment
+experiment: Experiment | None
The experiment definition. (default: None)
-extract: dict[str, bool]
-    Decide whether to extract the data.
filename_format: dict[str, str]
Regular expression which will be matched before trying to load the file. Named groups will
appear in the `fileinfo` dataframe. (default: field(default_factory=dict))
-filename_format_dtypes: dict[str, dict[str, type]]
+filename_format_schema_overrides: dict[str, dict[str, type]]
If named groups are present in the `filename_format`, this makes it possible to cast
specific named groups to a particular datatype. (default: field(default_factory=dict))
+extract: dict[str, bool]
+    Decide whether to extract the data.
custom_read_kwargs: dict[str, dict[str, Any]]
If specified, these keyword arguments will be passed to the file reading function. The
behavior of this argument depends on the file extension of the dataset files.
@@ -119,10 +122,10 @@ class DatasetDefinition:
3. Specifying column datatypes
``polars.read_csv`` infers data types from a fixed number of rows, which might not be accurate
for the entire dataset. To ensure correct data types, you can pass a dictionary to the
-``dtypes`` keyword argument in ``gaze_custom_read_kwargs``.
+``schema_overrides`` keyword argument in ``gaze_custom_read_kwargs``.
Use data types from the `polars` library.
For instance:
-``gaze_custom_read_kwargs={'dtypes': {'col1': polars.Int64, 'col2': polars.Float64}}``
+``gaze_custom_read_kwargs={'schema_overrides': {'col1': polars.Int64, 'col2': polars.Float64}}``
"""

# pylint: disable=too-many-instance-attributes
Expand All @@ -138,7 +141,7 @@ class DatasetDefinition:

filename_format: dict[str, str] = field(default_factory=dict)

-filename_format_dtypes: dict[str, dict[str, type]] = field(default_factory=dict)
+filename_format_schema_overrides: dict[str, dict[str, type]] = field(default_factory=dict)

custom_read_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)

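The docstring's third note above can be exercised like this; a sketch assuming the `custom_read_kwargs` field from this diff and reusing the docstring's placeholder column names 'col1' and 'col2':

import polars as pl
import pymovements as pm

# Override inferred CSV dtypes for gaze files via the read kwargs;
# the dataset name and column names are placeholders.
definition = pm.DatasetDefinition(
    name='my_dataset',
    custom_read_kwargs={
        'gaze': {'schema_overrides': {'col1': pl.Int64, 'col2': pl.Float64}},
    },
)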
31 changes: 17 additions & 14 deletions src/pymovements/dataset/dataset_files.py
@@ -74,8 +74,8 @@ def scan_dataset(definition: DatasetDefinition, paths: DatasetPaths) -> pl.DataF

fileinfo_df = pl.from_dicts(data=fileinfo_dicts, infer_schema_length=1)
fileinfo_df = fileinfo_df.sort(by='filepath')
-if definition.filename_format_dtypes['gaze']:
-items = definition.filename_format_dtypes['gaze'].items()
+if definition.filename_format_schema_overrides['gaze']:
+items = definition.filename_format_schema_overrides['gaze'].items()
fileinfo_df = fileinfo_df.with_columns([
pl.col(fileinfo_key).cast(fileinfo_dtype)
for fileinfo_key, fileinfo_dtype in items
@@ -92,31 +92,33 @@
raise RuntimeError(f'no matching files found in {paths.precomputed_events}')
fileinfo_df = pl.from_dicts(data=fileinfo_dicts, infer_schema_length=1)
fileinfo_df = fileinfo_df.sort(by='filepath')
-if definition.filename_format_dtypes['precomputed_events']:
-items = definition.filename_format_dtypes['precomputed_events'].items()
+if definition.filename_format_schema_overrides['precomputed_events']:
+items = definition.filename_format_schema_overrides['precomputed_events'].items()
fileinfo_df = fileinfo_df.with_columns([
pl.col(fileinfo_key).cast(fileinfo_dtype)
for fileinfo_key, fileinfo_dtype in items
])
_fileinfo_dicts['precomputed_events'] = fileinfo_df

-if definition.has_files['precomputed_reading_measures']:
+pc_rm = 'precomputed_reading_measures'
+if definition.has_files[pc_rm]:
fileinfo_dicts = match_filepaths(
path=paths.precomputed_reading_measures,
-regex=curly_to_regex(definition.filename_format['precomputed_reading_measures']),
+regex=curly_to_regex(definition.filename_format[pc_rm]),
relative=True,
)
if not fileinfo_dicts:
raise RuntimeError(f'no matching files found in {paths.precomputed_reading_measures}')
fileinfo_df = pl.from_dicts(data=fileinfo_dicts, infer_schema_length=1)
fileinfo_df = fileinfo_df.sort(by='filepath')
-if definition.filename_format_dtypes['precomputed_reading_measures']:
-items = definition.filename_format_dtypes['precomputed_reading_measures'].items()
+if definition.filename_format_schema_overrides[pc_rm]:
+_schema_overrides = definition.filename_format_schema_overrides[pc_rm]
+items = _schema_overrides.items()
fileinfo_df = fileinfo_df.with_columns([
pl.col(fileinfo_key).cast(fileinfo_dtype)
for fileinfo_key, fileinfo_dtype in items
])
-_fileinfo_dicts['precomputed_reading_measures'] = fileinfo_df
+_fileinfo_dicts[pc_rm] = fileinfo_df

return _fileinfo_dicts

@@ -316,7 +318,7 @@ def load_gaze_file(
trial_columns=definition.trial_columns,
time_unit=time_unit,
add_columns=add_columns,
-column_dtypes=definition.filename_format_dtypes['gaze'],
+column_schema_overrides=definition.filename_format_schema_overrides['gaze'],
)

# suffixes as ordered after using GazeDataFrame.unnest()
@@ -364,22 +366,22 @@ def load_gaze_file(
trial_columns=definition.trial_columns,
column_map=definition.column_map,
add_columns=add_columns,
-column_dtypes=definition.filename_format_dtypes['gaze'],
+column_schema_overrides=definition.filename_format_schema_overrides['gaze'],
**custom_read_kwargs,
)
elif filepath.suffix == '.feather':
gaze_df = from_ipc(
filepath,
experiment=definition.experiment,
add_columns=add_columns,
-column_dtypes=definition.filename_format_dtypes['gaze'],
+column_schema_overrides=definition.filename_format_schema_overrides['gaze'],
)
elif filepath.suffix == '.asc':
gaze_df, _ = from_asc(
filepath,
experiment=definition.experiment,
add_columns=add_columns,
-column_dtypes=definition.filename_format_dtypes['gaze'],
+column_schema_overrides=definition.filename_format_schema_overrides['gaze'],
**custom_read_kwargs,
)
else:
@@ -556,9 +558,10 @@ def add_fileinfo(
)

# Cast columns from fileinfo according to specification.
+_schema_overrides = definition.filename_format_schema_overrides['gaze']
df = df.with_columns([
pl.col(fileinfo_key).cast(fileinfo_dtype)
-for fileinfo_key, fileinfo_dtype in definition.filename_format_dtypes['gaze'].items()
+for fileinfo_key, fileinfo_dtype in _schema_overrides.items()
])
return df

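The cast idiom that `scan_dataset` and `add_fileinfo` apply to the parsed fileinfo columns, shown in isolation; the data frame and the overrides mapping below are stand-ins for illustration:

import polars as pl

fileinfo_df = pl.DataFrame({'subject_id': ['1', '2'], 'filepath': ['a.csv', 'b.csv']})
schema_overrides = {'subject_id': pl.Int64}  # stand-in for the definition's overrides

# Cast each named-group column to its configured dtype.
fileinfo_df = fileinfo_df.with_columns([
    pl.col(key).cast(dtype)
    for key, dtype in schema_overrides.items()
])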
59 changes: 44 additions & 15 deletions src/pymovements/datasets/copco.py
@@ -47,34 +47,62 @@ class CopCo(DatasetDefinition):
Attributes
----------
-name : str
+name: str
The name of the dataset.
-mirrors : tuple[str, ...]
+has_files: dict[str, bool]
+    Indicate whether the dataset contains 'gaze', 'precomputed_events', and
+    'precomputed_reading_measures'.
+mirrors: dict[str, tuple[str, ...]]
A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
-resources : tuple[dict[str, str], ...]
+resources: dict[str, tuple[dict[str, str | None], ...]]
A tuple of dataset gaze_resources. Each list entry must be a dictionary with the following
keys:
- `resource`: The url suffix of the resource. This will be concatenated with the mirror.
- `filename`: The filename under which the file is saved as.
- `md5`: The MD5 checksum of the respective file.
-experiment : Experiment
+experiment: Experiment
The experiment definition.
-filename_format : str
+extract: dict[str, bool]
+    Decide whether to extract the data.
+filename_format: dict[str, str]
Regular expression which will be matched before trying to load the file. Named groups will
appear in the `fileinfo` dataframe.
-filename_format_dtypes : dict[str, type], optional
+filename_format_schema_overrides: dict[str, dict[str, type]]
If named groups are present in the `filename_format`, this makes it possible to cast
specific named groups to a particular datatype.
-column_map : dict[str, str]
+trial_columns: list[str]
+    The name of the trial columns in the input data frame. If the list is empty or None,
+    the input data frame is assumed to contain only one trial. If the list is not empty,
+    the input data frame is assumed to contain multiple trials and the transformation
+    methods will be applied to each trial separately.
+time_column: str
+    The name of the timestamp column in the input data frame. This column will be renamed to
+    ``time``.
+time_unit: str
+    The unit of the timestamps in the timestamp column in the input data frame. Supported
+    units are 's' for seconds, 'ms' for milliseconds and 'step' for steps. If the unit is
+    'step' the experiment definition must be specified. All timestamps will be converted to
+    milliseconds.
+pixel_columns: list[str]
+    The name of the pixel position columns in the input data frame. These columns will be
+    nested into the column ``pixel``. If the list is empty or None, the nested ``pixel``
+    column will not be created.
+column_map: dict[str, str]
The keys are the columns to read, the values are the names to which they should be renamed.
-custom_read_kwargs : dict[str, Any], optional
+custom_read_kwargs: dict[str, Any]
If specified, these keyword arguments will be passed to the file reading function.
Examples
@@ -107,12 +135,6 @@
'precomputed_reading_measures': True,
},
)
-extract: dict[str, bool] = field(
-default_factory=lambda: {
-'precomputed_events': True,
-'precomputed_reading_measures': True,
-},
-)
mirrors: dict[str, tuple[str, ...]] = field(
default_factory=lambda: {
'precomputed_events': ('https://files.de-1.osf.io/',),
@@ -150,14 +172,21 @@
sampling_rate=1000,
)

+extract: dict[str, bool] = field(
+default_factory=lambda: {
+'precomputed_events': True,
+'precomputed_reading_measures': True,
+},
+)
+
filename_format: dict[str, str] = field(
default_factory=lambda: {
'precomputed_events': r'FIX_report_P{subject_id:d}.txt',
'precomputed_reading_measures': r'P{subject_id:d}.csv',
},
)

-filename_format_dtypes: dict[str, dict[str, type]] = field(
+filename_format_schema_overrides: dict[str, dict[str, type]] = field(
default_factory=lambda: {
'precomputed_events': {},
'precomputed_reading_measures': {},
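The truncated Examples section above conventionally shows the definition in use; the usual pattern is sketched below, with an illustrative download path:

import pymovements as pm

# Initialize the public dataset definition, then fetch and load its files.
dataset = pm.Dataset('CopCo', path='data/CopCo')
dataset.download()
dataset.load()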
[Diff truncated; the remaining changed files are not shown.]
