
Commit

First pass
AliceJoubert committed Oct 10, 2024
1 parent 6f81685 commit 4186106
Showing 2 changed files with 36 additions and 76 deletions.
88 changes: 28 additions & 60 deletions clinica/iotools/converters/oasis_to_bids/oasis_to_bids_utils.py
@@ -1,12 +1,9 @@
import os
from pathlib import Path
from typing import Iterable

import numpy as np
import pandas as pd

from clinica.iotools.bids_utils import StudyName, get_bids_sess_list
from clinica.utils.stream import cprint
from clinica.iotools.bids_utils import StudyName, bids_id_factory

__all__ = ["create_sessions_dict", "write_sessions_tsv"]

@@ -28,7 +25,7 @@ def create_sessions_dict(
The path to the BIDS directory.
clinical_specifications_folder : Path
The path to the clinical file.
The path to the clinical file folder.
bids_ids : list of str
The list of bids ids.
@@ -39,63 +36,34 @@
Session dict.
"""

location = f"{StudyName.OASIS.value} location"
sessions = pd.read_csv(clinical_specifications_folder / "sessions.tsv", sep="\t")
sessions_fields = sessions[StudyName.OASIS.value]
field_location = sessions[location]
sessions_fields_bids = sessions["BIDS CLINICA"]
fields_dataset = []
fields_bids = []
study = StudyName.OASIS.value
location = f"{study} location"
spec = pd.read_csv(clinical_specifications_folder / "sessions.tsv", sep="\t")[
[study, location, "BIDS CLINICA"]
].dropna()
sessions_dict = {}

for i in range(0, len(sessions_fields)):
if not pd.isnull(sessions_fields[i]):
fields_bids.append(sessions_fields_bids[i])
fields_dataset.append(sessions_fields[i])

for i in range(0, len(sessions_fields)):
# If the i-th field is available
if not pd.isnull(sessions_fields[i]):
# Load the file
tmp = field_location[i].split("/")
location = tmp[0]
sheet = tmp[1] if len(tmp) > 1 else 0
file_to_read_path = clinical_data_dir / location
file_ext = os.path.splitext(location)[1]
if file_ext == ".xlsx":
file_to_read = pd.read_excel(file_to_read_path, sheet_name=sheet)
elif file_ext == ".csv":
file_to_read = pd.read_csv(file_to_read_path)
else:
raise ValueError(
f"Unknown file extension {file_ext}. Expecting either .xlsx or .csv."
)

for r in range(0, len(file_to_read.values)):
# Extracts the subject ids columns from the dataframe
subj_id = file_to_read.iloc[r]["ID"]
if hasattr(subj_id, "dtype"):
if subj_id.dtype == np.int64:
subj_id = str(subj_id)
# Removes all the - from
subj_id_alpha = str(subj_id[0:3] + "IS" + subj_id[3] + subj_id[5:9])

# Extract the corresponding BIDS id and create the output file if doesn't exist
subj_bids = [s for s in bids_ids if subj_id_alpha in s]
if subj_bids:
subj_bids = subj_bids[0]
subj_dir = bids_dir / subj_bids
session_names = get_bids_sess_list(subj_dir)
for s in session_names:
s_name = s.replace("ses-", "")
row = file_to_read.iloc[r]
if subj_bids not in sessions_dict:
sessions_dict.update({subj_bids: {}})
if s_name not in sessions_dict[subj_bids].keys():
sessions_dict[subj_bids].update({s_name: {"session_id": s}})
(sessions_dict[subj_bids][s_name]).update(
{sessions_fields_bids[i]: row[sessions_fields[i]]}
)
for loc in spec[location].unique():
file = pd.read_excel(clinical_data_dir / loc)
file["BIDS ID"] = file.ID.apply(
lambda x: bids_id_factory(StudyName.OASIS).from_original_study_id(x)
)
file.set_index("BIDS ID", drop=True, inplace=True)
result = pd.DataFrame()
for _, row in spec[spec[location] == loc].iterrows():
result[row["BIDS CLINICA"]] = file[row[[study]]]

# todo : what happens if one subject is not in the metadata ? at this point, I could add a line
# but I have to be sure that it has a corresponding image OR that the bids_ids list was properly
# managed before

result = result.loc[bids_ids]
result["session_id"] = "ses-M000"

for bids_id, row in result.iterrows():
sessions_dict.update(
{bids_id: {"M000": {label: value for label, value in row.items()}}}
)

return sessions_dict
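
For orientation, a minimal standalone sketch (not part of the commit) of how the nested dict built above could be flattened into per-subject sessions.tsv content. The {bids_id: {visit_label: {column: value}}} layout and the "ses-M000" session id come from the code above; the clinical column names and the output path convention are assumptions taken from the unit tests below and from what write_sessions_tsv is presumed to do.

import pandas as pd

# Shape produced by create_sessions_dict: {bids_id: {visit_label: {column: value}}}
sessions_dict = {
    "sub-OASIS10001": {
        "M000": {"session_id": "ses-M000", "cdr_global": 0, "MMS": 29, "diagnosis": 0}
    }
}

for bids_id, visits in sessions_dict.items():
    # One row per visit, with session_id as the first column.
    frame = pd.DataFrame.from_dict(visits, orient="index")
    frame = frame[["session_id"] + [c for c in frame.columns if c != "session_id"]]
    # write_sessions_tsv would presumably write this to
    # <bids_dir>/<bids_id>/<bids_id>_sessions.tsv (the path is an assumption).
    print(frame.to_csv(sep="\t", index=False))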

24 changes: 8 additions & 16 deletions (second changed file: unit tests for create_sessions_dict)
@@ -21,6 +21,8 @@ def clinical_data_path(tmp_path: Path) -> Path:
def _build_clinical_data(clinical_data_path: Path) -> None:
clinical_data_path.mkdir()

# todo :what happens if nan instead of value ? (handling of float...)

df = pd.DataFrame(
{
"ID": ["OAS1_0001_MR1", "OAS1_0002_MR1"],
@@ -37,9 +39,9 @@ def _build_clinical_data(clinical_data_path: Path) -> None:
"Delay": [float("nan"), float("nan")],
}
)
df.to_csv(clinical_data_path / "oasis_cross-sectional.csv", index=False)

# todo : future with excel
df.to_excel(
clinical_data_path / "oasis_cross-sectional-5708aa0a98d82080.xlsx", index=False
)
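
A quick aside on the NaN question raised in the todo above: a small standalone pandas example (independent of this commit) showing why an integer-looking column comes back as float once any cell is missing in the Excel round trip.

import pandas as pd

df = pd.DataFrame({"ID": ["OAS1_0001_MR1", "OAS1_0002_MR1"], "MMSE": [29, None]})
df.to_excel("tmp.xlsx", index=False)  # needs openpyxl installed
back = pd.read_excel("tmp.xlsx")

# The empty cell is read back as NaN and the whole column is promoted to
# float64, so 29 becomes 29.0 unless the converter casts explicitly or uses
# a nullable Int64 dtype.
print(back["MMSE"].dtype)     # float64
print(back["MMSE"].tolist())  # [29.0, nan]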


@pytest.fixture
@@ -57,9 +59,9 @@ def _build_spec_sessions_success(sessions_path_success: Path) -> None:
"ADNI": [np.nan, np.nan, np.nan, "foo"],
"OASIS": ["CDR", "MMSE", "CDR", np.nan],
"OASIS location": [
"oasis_cross-sectional.csv",
"oasis_cross-sectional.csv",
"oasis_cross-sectional.csv",
"oasis_cross-sectional-5708aa0a98d82080.xlsx",
"oasis_cross-sectional-5708aa0a98d82080.xlsx",
"oasis_cross-sectional-5708aa0a98d82080.xlsx",
np.nan,
],
}
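
To make the spec-driven selection in the refactored create_sessions_dict concrete, a small illustration mirroring this fixture of what survives the .dropna() on the three columns the function keeps. The "BIDS CLINICA" names are assumed from the expected dict below; the other values are copied from the fixture.

import numpy as np
import pandas as pd

spec = pd.DataFrame(
    {
        "BIDS CLINICA": ["cdr_global", "MMS", "diagnosis", "adni_only"],  # last name hypothetical
        "ADNI": [np.nan, np.nan, np.nan, "foo"],
        "OASIS": ["CDR", "MMSE", "CDR", np.nan],
        "OASIS location": ["oasis_cross-sectional-5708aa0a98d82080.xlsx"] * 3 + [np.nan],
    }
)

kept = spec[["OASIS", "OASIS location", "BIDS CLINICA"]].dropna()
# Only the three OASIS rows remain: the ADNI-only row is dropped because its
# OASIS and OASIS location cells are NaN, so it never reaches the Excel lookup.
print(kept)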
@@ -111,12 +113,6 @@ def expected() -> dict:
"MMS": 29,
"diagnosis": 0,
},
"M006": {
"session_id": "ses-M006",
"cdr_global": 0,
"MMS": 29,
"diagnosis": 0,
},
},
"sub-OASIS10002": {
"M000": {
@@ -138,8 +134,6 @@ def test_create_sessions_dict_success(
sessions_path_success: Path,
expected: dict,
):
# todo : how does it handle nan inside excel/csv ? verify with excel

result = create_sessions_dict(
clinical_data_path,
bids_dir,
@@ -157,8 +151,6 @@ def test_create_sessions_dict_error(
sessions_path_error: Path,
expected: dict,
):
# todo : how does it handle nan inside excel/csv ? verify with excel

with pytest.raises(FileNotFoundError):
create_sessions_dict(
clinical_data_path,
