Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[TEST][REF] Move functions specific to OASIS1 converter in a dedicated utils file and add unit tests #1313

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 1 addition & 185 deletions clinica/iotools/bids_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,146 +464,6 @@ def create_participants_df(
return participant_df


def create_sessions_dict_oasis(
    clinical_data_dir: Path,
    bids_dir: Path,
    study_name: StudyName,
    clinical_specifications_folder: Path,
    bids_ids: list[str],
    name_column_ids: str,
    subj_to_remove: Optional[list[str]] = None,
    participants_df: Optional[pd.DataFrame] = None,
) -> dict:
    """Extract the session-level clinical data and store it in a dictionary.

    The file ``sessions.tsv`` of the specifications folder lists, for each
    BIDS field, the name of the corresponding field in the study and the
    clinical file (optionally followed by ``/<sheet>``) holding it. Every
    row of these clinical files is matched to a BIDS id, and the value of
    the field is stored for each session returned by ``get_bids_sess_list``
    for that subject.

    Parameters
    ----------
    clinical_data_dir : Path
        The path to the folder containing the clinical files.

    bids_dir : Path
        The path to the BIDS directory.

    study_name : StudyName
        The name of the study. Its value selects the columns of
        ``sessions.tsv`` that are read.

    clinical_specifications_folder : Path
        The path to the folder containing ``sessions.tsv``.

    bids_ids : list of str
        The list of BIDS ids.

    name_column_ids : str
        The name of the column where the subject ids are stored.

    subj_to_remove : list of str, optional
        Subject ids for which no message is emitted when they have no
        matching BIDS id.

    participants_df : pd.DataFrame, optional
        The participants data, with columns ``participant_id`` and
        ``age_bl`` (only read for OASIS3, when the ``age`` field is processed).

    Returns
    -------
    dict :
        ``{bids_id: {session_label: {"session_id": ..., <bids field>: value}}}``
        where ``session_label`` is the session name without its ``ses-`` prefix.

    Raises
    ------
    ValueError
        If a clinical file is neither a ``.xlsx`` nor a ``.csv`` file.
    """
    import numpy as np

    from clinica.utils.stream import cprint

    subj_to_remove = subj_to_remove or []
    is_oasis3 = study_name == StudyName.OASIS3
    specifications = pd.read_csv(
        clinical_specifications_folder / "sessions.tsv", sep="\t"
    )
    sessions_dict = {}
    # Several fields usually live in the same clinical file: read each
    # (file, sheet) pair only once. The loaded frames are never modified.
    loaded_files = {}

    for source_field, field_location, bids_field in zip(
        specifications[study_name.value],
        specifications[f"{study_name.value} location"],
        specifications["BIDS CLINICA"],
    ):
        # Fields that are not available for this study are skipped.
        if pd.isnull(source_field):
            continue

        # The location is written as '<file name>[/<sheet name>]'.
        location_parts = field_location.split("/")
        file_name = location_parts[0]
        sheet = location_parts[1] if len(location_parts) > 1 else ""

        cache_key = (file_name, sheet)
        if cache_key not in loaded_files:
            file_to_read_path = clinical_data_dir / file_name
            file_ext = os.path.splitext(file_name)[1]
            if file_ext == ".xlsx":
                loaded_files[cache_key] = pd.read_excel(
                    file_to_read_path, sheet_name=sheet
                )
            elif file_ext == ".csv":
                loaded_files[cache_key] = pd.read_csv(file_to_read_path)
            else:
                raise ValueError(
                    f"Unknown file extension {file_ext}. Expecting either .xlsx or .csv."
                )
        file_to_read = loaded_files[cache_key]

        for r in range(len(file_to_read)):
            current_row = file_to_read.iloc[r]
            subj_id = current_row[name_column_ids]
            # Integer ids are converted to text before being sliced.
            if hasattr(subj_id, "dtype") and subj_id.dtype == np.int64:
                subj_id = str(subj_id)
            # Build the alphanumeric id used in BIDS names, e.g. an id shaped
            # like 'OAS1_0001_MR1' gives 'OASIS10001'.
            subj_id_alpha = str(subj_id[0:3] + "IS" + subj_id[3] + subj_id[5:9])

            matching_bids_ids = [s for s in bids_ids if subj_id_alpha in s]
            if not matching_bids_ids:
                # Only report subjects that were not excluded on purpose.
                if subj_id not in subj_to_remove:
                    cprint(
                        f"{source_field} for {subj_id} not found in the BIDS converted.",
                        "info",
                    )
                continue

            subj_bids = matching_bids_ids[0]
            for session in get_bids_sess_list(bids_dir / subj_bids):
                s_name = session.replace("ses-", "")
                if is_oasis3:
                    # For OASIS3 the row is selected from the 'MR ID' column,
                    # which must start with the subject id and end with the
                    # session label.
                    row = file_to_read[
                        file_to_read["MR ID"].str.startswith(subj_id)
                        & file_to_read["MR ID"].str.endswith(s_name)
                    ].iloc[0]
                else:
                    row = current_row

                subject_sessions = sessions_dict.setdefault(subj_bids, {})
                session_data = subject_sessions.setdefault(
                    s_name, {"session_id": session}
                )
                session_data[bids_field] = row[source_field]

                # Calculate the difference in months for OASIS3 only
                if is_oasis3 and bids_field == "age":
                    # Take the scalar explicitly: calling float() on a
                    # one-element Series is deprecated in pandas.
                    baseline_age = participants_df[
                        participants_df["participant_id"] == subj_bids
                    ]["age_bl"].iloc[0]
                    diff_years = float(session_data["age"]) - float(baseline_age)
                    session_data["diff_months"] = round(diff_years * 12)

    return sessions_dict


def create_scans_dict(
clinical_data_dir: Path,
study_name: StudyName,
Expand Down Expand Up @@ -836,51 +696,6 @@ def write_modality_agnostic_files(
_write_bidsignore(bids_dir)


# TODO: consider moving this function to the OASIS-specific utils module.
def write_sessions_tsv(bids_dir: Path, sessions_dict: dict) -> None:
    """Create <participant_id>_sessions.tsv files.

    One TSV file is written in each ``sub-*`` folder of the BIDS directory.
    The file is indexed by ``session_id``, and missing values are written
    as ``n/a``.

    Parameters
    ----------
    bids_dir : Path
        The path to the BIDS directory.

    sessions_dict : dict
        Dictionary containing sessions metadata, keyed by participant id and
        then by session label. Each session entry is a dictionary which must
        contain a ``session_id`` key.

        Subjects that are absent from this dictionary, or whose entry is
        empty, get a file holding the single session ``M000``.

    See also
    --------
    write_scans_tsv
    """
    for subject_path in bids_dir.glob("sub-*"):
        # 'sub-*' can also match stray files at the root of the BIDS
        # directory; only subject folders can receive a sessions file.
        if not subject_path.is_dir():
            continue
        subject_sessions = sessions_dict.get(subject_path.name)
        if subject_sessions:
            session_df = pd.DataFrame.from_dict(subject_sessions, orient="index")
            # Rotate the columns so that the last one comes first
            # (kept as is to preserve the column order of existing outputs).
            cols = session_df.columns.tolist()
            session_df = session_df[cols[-1:] + cols[:-1]]
        else:
            print(f"No session data available for {subject_path}")
            session_df = pd.DataFrame({"session_id": ["M000"]})
        session_df = session_df.set_index("session_id").fillna("n/a")
        session_df.to_csv(
            subject_path / f"{subject_path.name}_sessions.tsv",
            sep="\t",
            encoding="utf8",
        )


def _get_pet_tracer_from_filename(filename: str) -> Tracer:
"""Return the PET tracer from the provided filename.

Expand Down Expand Up @@ -1227,6 +1042,7 @@ def identify_modality(filename: str) -> Optional[str]:
return np.nan


# TODO: consider using this helper more widely when writing TSV files?
def write_to_tsv(df: pd.DataFrame, buffer: Union[Path, BinaryIO]) -> None:
"""Save dataframe as a BIDS-compliant TSV file.

Expand Down
12 changes: 4 additions & 8 deletions clinica/iotools/converters/oasis_to_bids/oasis_to_bids.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,20 +108,16 @@ def _create_sessions_tsv(
bids_dir: Path,
bids_ids: list[str],
) -> dict:
from clinica.iotools.bids_utils import (
StudyName,
create_sessions_dict_oasis,
write_sessions_tsv,
)
from .oasis_to_bids_utils import create_sessions_dict, write_sessions_tsv

sessions_dict = create_sessions_dict_oasis(
sessions_dict = create_sessions_dict(
clinical_data_dir=clinical_data_dir,
bids_dir=bids_dir,
study_name=StudyName.OASIS,
clinical_specifications_folder=Path(__file__).parents[1] / "specifications",
bids_ids=bids_ids,
name_column_ids="ID",
)

# TODO: once tested, move this into create_sessions_dict since it is specific to OASIS1.
for bids_id in bids_ids:
sessions_dict[bids_id]["M000"]["diagnosis"] = (
"AD" if sessions_dict[bids_id]["M000"]["diagnosis"] > 0 else "CN"
Expand Down
148 changes: 148 additions & 0 deletions clinica/iotools/converters/oasis_to_bids/oasis_to_bids_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import os
from pathlib import Path
from typing import Iterable

import numpy as np
import pandas as pd

from clinica.iotools.bids_utils import StudyName, get_bids_sess_list
from clinica.utils.stream import cprint

__all__ = ["create_sessions_dict", "write_sessions_tsv"]


def create_sessions_dict(
    clinical_data_dir: Path,
    bids_dir: Path,
    clinical_specifications_folder: Path,
    bids_ids: Iterable[str],
) -> dict:
    """Extract the session-level clinical data of OASIS and store it in a dictionary.

    The file ``sessions.tsv`` of the specifications folder lists, for each
    BIDS field, the name of the corresponding OASIS field and the clinical
    file (optionally followed by ``/<sheet>``) holding it. Every row of
    these clinical files is matched to a BIDS id through its ``ID`` column,
    and the value of the field is stored for each session returned by
    ``get_bids_sess_list`` for that subject.

    Parameters
    ----------
    clinical_data_dir : Path
        The path to the folder containing the clinical files.

    bids_dir : Path
        The path to the BIDS directory.

    clinical_specifications_folder : Path
        The path to the folder containing ``sessions.tsv``.

    bids_ids : iterable of str
        The BIDS ids of the converted subjects.

    Returns
    -------
    dict :
        ``{bids_id: {session_label: {"session_id": ..., <bids field>: value}}}``
        where ``session_label`` is the session name without its ``ses-`` prefix.
        Subjects without a matching BIDS id are left out.

    Raises
    ------
    ValueError
        If a clinical file is neither a ``.xlsx`` nor a ``.csv`` file.
    """
    study = StudyName.OASIS.value
    # The ids are scanned once per row of every clinical file: materialize
    # them so that a one-shot iterable is not exhausted after the first row.
    bids_ids = list(bids_ids)
    specifications = pd.read_csv(
        clinical_specifications_folder / "sessions.tsv", sep="\t"
    )
    sessions_dict = {}
    # Several fields usually live in the same clinical file: read each
    # (file, sheet) pair only once. The loaded frames are never modified.
    loaded_files = {}

    for source_field, field_location, bids_field in zip(
        specifications[study],
        specifications[f"{study} location"],
        specifications["BIDS CLINICA"],
    ):
        # Fields that are not available for this study are skipped.
        if pd.isnull(source_field):
            continue

        # The location is written as '<file name>[/<sheet name>]'.
        location_parts = field_location.split("/")
        file_name = location_parts[0]
        sheet = location_parts[1] if len(location_parts) > 1 else ""

        cache_key = (file_name, sheet)
        if cache_key not in loaded_files:
            file_to_read_path = clinical_data_dir / file_name
            file_ext = os.path.splitext(file_name)[1]
            if file_ext == ".xlsx":
                loaded_files[cache_key] = pd.read_excel(
                    file_to_read_path, sheet_name=sheet
                )
            elif file_ext == ".csv":
                loaded_files[cache_key] = pd.read_csv(file_to_read_path)
            else:
                raise ValueError(
                    f"Unknown file extension {file_ext}. Expecting either .xlsx or .csv."
                )
        file_to_read = loaded_files[cache_key]

        for r in range(len(file_to_read)):
            row = file_to_read.iloc[r]
            subj_id = row["ID"]
            # Integer ids are converted to text before being sliced.
            if hasattr(subj_id, "dtype") and subj_id.dtype == np.int64:
                subj_id = str(subj_id)
            # Build the alphanumeric id used in BIDS names, e.g. an id shaped
            # like 'OAS1_0001_MR1' gives 'OASIS10001'.
            subj_id_alpha = str(subj_id[0:3] + "IS" + subj_id[3] + subj_id[5:9])

            matching_bids_ids = [s for s in bids_ids if subj_id_alpha in s]
            if not matching_bids_ids:
                continue

            subj_bids = matching_bids_ids[0]
            for session in get_bids_sess_list(bids_dir / subj_bids):
                s_name = session.replace("ses-", "")
                subject_sessions = sessions_dict.setdefault(subj_bids, {})
                session_data = subject_sessions.setdefault(
                    s_name, {"session_id": session}
                )
                session_data[bids_field] = row[source_field]

    return sessions_dict


def write_sessions_tsv(bids_dir: Path, sessions_dict: dict) -> None:
    """Create <participant_id>_sessions.tsv files.

    One TSV file is written in each ``sub-*`` folder of the BIDS directory.
    The file is indexed by ``session_id``, and missing values are written
    as ``n/a``.

    Parameters
    ----------
    bids_dir : Path
        The path to the BIDS directory.

    sessions_dict : dict
        Dictionary containing sessions metadata, keyed by participant id and
        then by session label. Each session entry is a dictionary which must
        contain a ``session_id`` key.

        Subjects that are absent from this dictionary, or whose entry is
        empty, get a file holding the single session ``M000``.

        .. note::
            This is the output of the function `create_sessions_dict`
            defined in this module.

    See also
    --------
    create_sessions_dict
    """
    for subject_path in bids_dir.glob("sub-*"):
        # 'sub-*' can also match stray files at the root of the BIDS
        # directory; only subject folders can receive a sessions file.
        if not subject_path.is_dir():
            continue
        subject_sessions = sessions_dict.get(subject_path.name)
        if subject_sessions:
            session_df = pd.DataFrame.from_dict(subject_sessions, orient="index")
            # Rotate the columns so that the last one comes first
            # (kept as is to preserve the column order of existing outputs).
            cols = session_df.columns.tolist()
            session_df = session_df[cols[-1:] + cols[:-1]]
        else:
            print(f"No session data available for {subject_path}")
            session_df = pd.DataFrame({"session_id": ["M000"]})
        session_df = session_df.set_index("session_id").fillna("n/a")
        session_df.to_csv(
            subject_path / f"{subject_path.name}_sessions.tsv",
            sep="\t",
            encoding="utf8",
        )
Loading
Loading