
Commit

First pass
AliceJoubert committed Oct 10, 2024
1 parent 6f81685 commit 4186106
Showing 2 changed files with 36 additions and 76 deletions.
88 changes: 28 additions & 60 deletions clinica/iotools/converters/oasis_to_bids/oasis_to_bids_utils.py
@@ -1,12 +1,9 @@
import os
from pathlib import Path
from typing import Iterable

import numpy as np
import pandas as pd

from clinica.iotools.bids_utils import StudyName, get_bids_sess_list
from clinica.utils.stream import cprint
from clinica.iotools.bids_utils import StudyName, bids_id_factory

__all__ = ["create_sessions_dict", "write_sessions_tsv"]

@@ -28,7 +25,7 @@ def create_sessions_dict(
The path to the BIDS directory.
clinical_specifications_folder : Path
The path to the clinical file.
The path to the clinical file folder.
bids_ids : list of str
The list of bids ids.
@@ -39,63 +36,34 @@
Session dict.
"""

location = f"{StudyName.OASIS.value} location"
sessions = pd.read_csv(clinical_specifications_folder / "sessions.tsv", sep="\t")
sessions_fields = sessions[StudyName.OASIS.value]
field_location = sessions[location]
sessions_fields_bids = sessions["BIDS CLINICA"]
fields_dataset = []
fields_bids = []
study = StudyName.OASIS.value
location = f"{study} location"
spec = pd.read_csv(clinical_specifications_folder / "sessions.tsv", sep="\t")[
[study, location, "BIDS CLINICA"]
].dropna()
sessions_dict = {}

for i in range(0, len(sessions_fields)):
if not pd.isnull(sessions_fields[i]):
fields_bids.append(sessions_fields_bids[i])
fields_dataset.append(sessions_fields[i])

for i in range(0, len(sessions_fields)):
# If the i-th field is available
if not pd.isnull(sessions_fields[i]):
# Load the file
tmp = field_location[i].split("/")
location = tmp[0]
sheet = tmp[1] if len(tmp) > 1 else 0
file_to_read_path = clinical_data_dir / location
file_ext = os.path.splitext(location)[1]
if file_ext == ".xlsx":
file_to_read = pd.read_excel(file_to_read_path, sheet_name=sheet)
elif file_ext == ".csv":
file_to_read = pd.read_csv(file_to_read_path)
else:
raise ValueError(
f"Unknown file extension {file_ext}. Expecting either .xlsx or .csv."
)

for r in range(0, len(file_to_read.values)):
# Extracts the subject ids columns from the dataframe
subj_id = file_to_read.iloc[r]["ID"]
if hasattr(subj_id, "dtype"):
if subj_id.dtype == np.int64:
subj_id = str(subj_id)
# Removes all the - from
subj_id_alpha = str(subj_id[0:3] + "IS" + subj_id[3] + subj_id[5:9])

# Extract the corresponding BIDS id and create the output file if doesn't exist
subj_bids = [s for s in bids_ids if subj_id_alpha in s]
if subj_bids:
subj_bids = subj_bids[0]
subj_dir = bids_dir / subj_bids
session_names = get_bids_sess_list(subj_dir)
for s in session_names:
s_name = s.replace("ses-", "")
row = file_to_read.iloc[r]
if subj_bids not in sessions_dict:
sessions_dict.update({subj_bids: {}})
if s_name not in sessions_dict[subj_bids].keys():
sessions_dict[subj_bids].update({s_name: {"session_id": s}})
(sessions_dict[subj_bids][s_name]).update(
{sessions_fields_bids[i]: row[sessions_fields[i]]}
)
for loc in spec[location].unique():
file = pd.read_excel(clinical_data_dir / loc)
file["BIDS ID"] = file.ID.apply(
lambda x: bids_id_factory(StudyName.OASIS).from_original_study_id(x)
)
file.set_index("BIDS ID", drop=True, inplace=True)
result = pd.DataFrame()
for _, row in spec[spec[location] == loc].iterrows():
result[row["BIDS CLINICA"]] = file[row[[study]]]

# todo : what happens if one subject is not in the metadata ? at this point, I could add a line
# but I have to be sure that it has a corresponding image OR that the bids_ids list was properly
# managed before

result = result.loc[bids_ids]
result["session_id"] = "ses-M000"

for bids_id, row in result.iterrows():
sessions_dict.update(
{bids_id: {"M000": {label: value for label, value in row.items()}}}
)

return sessions_dict
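
For orientation, a minimal standalone sketch (not part of the commit) of how the nested dict built above could be flattened into per-subject sessions.tsv content. The {bids_id: {visit_label: {column: value}}} layout and the "ses-M000" session id come from the code above; the clinical column names and the output path convention are assumptions taken from the unit tests below and from what write_sessions_tsv is presumed to do.

import pandas as pd

# Shape produced by create_sessions_dict: {bids_id: {visit_label: {column: value}}}
sessions_dict = {
    "sub-OASIS10001": {
        "M000": {"session_id": "ses-M000", "cdr_global": 0, "MMS": 29, "diagnosis": 0}
    }
}

for bids_id, visits in sessions_dict.items():
    # One row per visit, with session_id as the first column.
    frame = pd.DataFrame.from_dict(visits, orient="index")
    frame = frame[["session_id"] + [c for c in frame.columns if c != "session_id"]]
    # write_sessions_tsv would presumably write this to
    # <bids_dir>/<bids_id>/<bids_id>_sessions.tsv (the path is an assumption).
    print(frame.to_csv(sep="\t", index=False))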

24 changes: 8 additions & 16 deletions (second changed file: unit tests for create_sessions_dict)
@@ -21,6 +21,8 @@ def clinical_data_path(tmp_path: Path) -> Path:
def _build_clinical_data(clinical_data_path: Path) -> None:
clinical_data_path.mkdir()

# todo :what happens if nan instead of value ? (handling of float...)

df = pd.DataFrame(
{
"ID": ["OAS1_0001_MR1", "OAS1_0002_MR1"],
@@ -37,9 +39,9 @@ def _build_clinical_data(clinical_data_path: Path) -> None:
"Delay": [float("nan"), float("nan")],
}
)
df.to_csv(clinical_data_path / "oasis_cross-sectional.csv", index=False)

# todo : future with excel
df.to_excel(
clinical_data_path / "oasis_cross-sectional-5708aa0a98d82080.xlsx", index=False
)
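
A quick aside on the NaN question raised in the todo above: a small standalone pandas example (independent of this commit) showing why an integer-looking column comes back as float once any cell is missing in the Excel round trip.

import pandas as pd

df = pd.DataFrame({"ID": ["OAS1_0001_MR1", "OAS1_0002_MR1"], "MMSE": [29, None]})
df.to_excel("tmp.xlsx", index=False)  # needs openpyxl installed
back = pd.read_excel("tmp.xlsx")

# The empty cell is read back as NaN and the whole column is promoted to
# float64, so 29 becomes 29.0 unless the converter casts explicitly or uses
# a nullable Int64 dtype.
print(back["MMSE"].dtype)     # float64
print(back["MMSE"].tolist())  # [29.0, nan]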


@pytest.fixture
@@ -57,9 +59,9 @@ def _build_spec_sessions_success(sessions_path_success: Path) -> None:
"ADNI": [np.nan, np.nan, np.nan, "foo"],
"OASIS": ["CDR", "MMSE", "CDR", np.nan],
"OASIS location": [
"oasis_cross-sectional.csv",
"oasis_cross-sectional.csv",
"oasis_cross-sectional.csv",
"oasis_cross-sectional-5708aa0a98d82080.xlsx",
"oasis_cross-sectional-5708aa0a98d82080.xlsx",
"oasis_cross-sectional-5708aa0a98d82080.xlsx",
np.nan,
],
}
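
To make the spec-driven selection in the refactored create_sessions_dict concrete, a small illustration mirroring this fixture of what survives the .dropna() on the three columns the function keeps. The "BIDS CLINICA" names are assumed from the expected dict below; the other values are copied from the fixture.

import numpy as np
import pandas as pd

spec = pd.DataFrame(
    {
        "BIDS CLINICA": ["cdr_global", "MMS", "diagnosis", "adni_only"],  # last name hypothetical
        "ADNI": [np.nan, np.nan, np.nan, "foo"],
        "OASIS": ["CDR", "MMSE", "CDR", np.nan],
        "OASIS location": ["oasis_cross-sectional-5708aa0a98d82080.xlsx"] * 3 + [np.nan],
    }
)

kept = spec[["OASIS", "OASIS location", "BIDS CLINICA"]].dropna()
# Only the three OASIS rows remain: the ADNI-only row is dropped because its
# OASIS and OASIS location cells are NaN, so it never reaches the Excel lookup.
print(kept)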
@@ -111,12 +113,6 @@ def expected() -> dict:
"MMS": 29,
"diagnosis": 0,
},
"M006": {
"session_id": "ses-M006",
"cdr_global": 0,
"MMS": 29,
"diagnosis": 0,
},
},
"sub-OASIS10002": {
"M000": {
@@ -138,8 +134,6 @@ def test_create_sessions_dict_success(
sessions_path_success: Path,
expected: dict,
):
# todo : how does it handle nan inside excel/csv ? verify with excel

result = create_sessions_dict(
clinical_data_path,
bids_dir,
@@ -157,8 +151,6 @@ def test_create_sessions_dict_error(
sessions_path_error: Path,
expected: dict,
):
# todo : how does it handle nan inside excel/csv ? verify with excel

with pytest.raises(FileNotFoundError):
create_sessions_dict(
clinical_data_path,
