From 96ccb417f8d354fb7f43c21451acd6188d0d610e Mon Sep 17 00:00:00 2001 From: AliceJoubert Date: Wed, 30 Oct 2024 17:31:49 +0100 Subject: [PATCH] End of proposition --- .../converters/aibl_to_bids/utils/clinical.py | 145 ++++-------- .../aibl_to_bids/test_aibl_utils.py | 214 ++++++++++++++---- 2 files changed, 216 insertions(+), 143 deletions(-) diff --git a/clinica/iotools/converters/aibl_to_bids/utils/clinical.py b/clinica/iotools/converters/aibl_to_bids/utils/clinical.py index 77e1138df..919c4b77a 100644 --- a/clinica/iotools/converters/aibl_to_bids/utils/clinical.py +++ b/clinica/iotools/converters/aibl_to_bids/utils/clinical.py @@ -1,6 +1,5 @@ -from ast import Index from pathlib import Path -from typing import List, Optional, Union +from typing import Optional, Tuple import numpy as np import pandas as pd @@ -11,8 +10,6 @@ "create_sessions_tsv_file", ] -from sqlalchemy.sql.operators import from_ - def create_participants_tsv_file( input_path: Path, @@ -153,7 +150,6 @@ def _load_specifications( def _mapping_diagnosis(diagnosis: int) -> str: - # todo : what if str(1) ? if diagnosis == 1: return "CN" elif diagnosis == 2: @@ -167,7 +163,6 @@ def _mapping_diagnosis(diagnosis: int) -> str: def _extract_metadata_df( input_df: pd.DataFrame, source_id: int, bids_metadata: str, source_metadata: str ) -> pd.DataFrame: - # todo :test from clinica.iotools.converter_utils import viscode_to_session extract = input_df.loc[(input_df["RID"] == source_id), ["VISCODE", source_metadata]] @@ -182,32 +177,15 @@ def _extract_metadata_df( def _compute_age_at_exam( - birth_date: Union[float, str], exam_date: Union[float, str] -) -> float: - """Compute the ages of the patient at each exam date. - - Parameters - ---------- - birth_date : str - Date of birth of patient ("/%Y" format) - - exam_date : str - Exam date ("%m/%d/%Y" format) - - Return - ------ - float - Age of the patient at exam date. - """ - # todo : test + birth_date: Optional[str], exam_date: Optional[str] +) -> Optional[float]: from datetime import datetime - if not isinstance(birth_date, float) and not isinstance(exam_date, float): + if birth_date and exam_date: date_of_birth = datetime.strptime(birth_date, "/%Y") exam_date = datetime.strptime(exam_date, "%m/%d/%Y") return exam_date.year - date_of_birth.year - else: - return np.nan + return None def create_sessions_tsv_file( @@ -228,6 +206,7 @@ def create_sessions_tsv_file( clinical_specifications_folder : Path The path to the folder containing the clinical specification files. """ + # todo :rename test import glob from clinica.iotools.bids_utils import ( @@ -263,15 +242,21 @@ def create_sessions_tsv_file( test.sort_index(inplace=True) # -4 are considered missing values in AIBL - test.replace([-4, "-4"], np.nan, inplace=True) + test.replace([-4, "-4", np.nan], None, inplace=True) test["diagnosis"] = test.diagnosis.apply(lambda x: _mapping_diagnosis(x)) + test["examination_date"] = test.apply( + lambda x: _complete_examination_dates( + rid, x.session_id, x.examination_date, clinical_data_dir + ), + axis=1, + ) - # todo : handle exam date, see function below # in general age metadata is present only for baseline session test["age"] = test["age"].ffill() test["age"] = test.apply( - lambda row: _compute_age_at_exam(row.age, row.examination_date), axis=1 + lambda x: _compute_age_at_exam(x.age, x.examination_date), axis=1 ) + # in case there is a session in clinical data that was not actually converted test.dropna(subset=["session_id"], inplace=True) test.fillna("n/a", inplace=True) @@ -285,86 +270,56 @@ def create_sessions_tsv_file( ) -def _find_exam_dates( - df: pd.DataFrame, session_id: str, rid: int, clinical_data_dir: Path -) -> str: - # todo :test - - # todo : finish from csv - # todo : finish interaction between 2 - - from_csv = _find_exam_date_in_other_csv_files(rid, session_id, clinical_data_dir) - - try: - baseline_date = df.loc["ses-M000"]["examination_date"] - except KeyError: - from_baseline = None - else: - from_baseline = _compute_exam_date_from_baseline(session_id, baseline_date) - - date = from_csv or from_baseline - return from_baseline +def _complete_examination_dates( + rid: int, + session_id: Optional[str], + examination_date: Optional[str], + clinical_data_dir: Path, +) -> Optional[str]: + if examination_date: + return examination_date + if session_id: + return _find_exam_date_in_other_csv_files(rid, session_id, clinical_data_dir) + return None def _find_exam_date_in_other_csv_files( - rid: int, visit_code: str, clinical_data_dir: Path + rid: int, session_id: str, clinical_data_dir: Path ) -> Optional[str]: """Try to find an alternative exam date by searching in other CSV files.""" - # todo (LATER) : refactor to use session_id - for csv_file in _get_csv_files(clinical_data_dir): - if "aibl_flutemeta" in csv_file: - csv_data = pd.read_csv( - csv_file, low_memory=False, usecols=list(range(0, 36)) - ) - else: - csv_data = pd.read_csv(csv_file, low_memory=False) - exam_date = csv_data[(csv_data.RID == rid) & (csv_data.VISCODE == visit_code)] + from clinica.iotools.converter_utils import viscode_to_session + + for csv in _get_csv_paths(clinical_data_dir): + csv_data = pd.read_csv(csv, low_memory=False) + csv_data["SESSION"] = csv_data.VISCODE.apply(lambda x: viscode_to_session(x)) + exam_date = csv_data[(csv_data.RID == rid) & (csv_data.SESSION == session_id)] if not exam_date.empty and exam_date.iloc[0].EXAMDATE != "-4": return exam_date.iloc[0].EXAMDATE return None -def _get_csv_files(clinical_data_dir: Path) -> List[str]: +def _get_csv_paths(clinical_data_dir: Path) -> Tuple[str]: """Return a list of paths to CSV files in which an alternative exam date could be found.""" import glob - # todo (LATER) : would be better to use a function similar to load_clinical_csv from ADNI - # bc there it does not check for existence and can return anything - - return [ - glob.glob(str(clinical_data_dir / pattern))[0] - for pattern in ( - "aibl_mri3meta_*.csv", - "aibl_mrimeta_*.csv", - "aibl_cdr_*.csv", - "aibl_flutemeta_*.csv", - "aibl_mmse_*.csv", - "aibl_pibmeta_*.csv", - ) - ] + pattern_list = ( + "aibl_mri3meta_*.csv", + "aibl_mrimeta_*.csv", + "aibl_cdr_*.csv", + "aibl_flutemeta_*.csv", + "aibl_mmse_*.csv", + "aibl_pibmeta_*.csv", + ) -def _compute_exam_date_from_baseline( - session_id: str, - baseline_date: str, -) -> Optional[str]: - import re - from datetime import datetime + paths_list = () - from dateutil.relativedelta import relativedelta - - if session_id != "ses-M000": - if not bool(np.isnan(baseline_date)): - try: - months = int(re.match(r"ses-M(\d*)", session_id).group(1)) - except AttributeError: - raise ValueError( - f"Unexpected visit code {session_id}. Should be in format ses-MX :" - "Ex: ses-M000, ses-M006, ses-M012..." - ) - baseline_date = datetime.strptime(baseline_date, "%m/%d/%Y") - exam_date = baseline_date + relativedelta(months=+months) - return exam_date.strftime("%m/%d/%Y") - return None + for pattern in pattern_list: + try: + path = glob.glob(str(clinical_data_dir / pattern))[0] + paths_list += (path,) + except IndexError: + pass + return paths_list def create_scans_tsv_file( diff --git a/test/unittests/iotools/converters/aibl_to_bids/test_aibl_utils.py b/test/unittests/iotools/converters/aibl_to_bids/test_aibl_utils.py index 958a8a42f..6e8277ad1 100644 --- a/test/unittests/iotools/converters/aibl_to_bids/test_aibl_utils.py +++ b/test/unittests/iotools/converters/aibl_to_bids/test_aibl_utils.py @@ -1,3 +1,4 @@ +from distutils.command.build import build from pathlib import Path import numpy as np @@ -6,6 +7,18 @@ from pandas.testing import assert_frame_equal +@pytest.mark.parametrize( + "birth_date, exam_date, expected", + [(None, "foo", None), ("foo", None, None), ("/2000", "01/01/2012", 12)], +) +def test_compute_age_at_exam(birth_date, exam_date, expected): + from clinica.iotools.converters.aibl_to_bids.utils.clinical import ( + _compute_age_at_exam, + ) + + assert _compute_age_at_exam(birth_date, exam_date) == expected + + @pytest.mark.parametrize( "diagnosis, expected", [ @@ -14,6 +27,7 @@ (2, "MCI"), (3, "AD"), (0, "n/a"), + (None, "n/a"), ], ) def test_mapping_diagnosis(diagnosis, expected): @@ -24,35 +38,6 @@ def test_mapping_diagnosis(diagnosis, expected): assert _mapping_diagnosis(diagnosis) == expected -@pytest.mark.parametrize( - "session_id, baseline_date, expected", - [ - ("ses-M000", "foo", None), - ("ses-M010", "01/01/2000", "11/01/2000"), - ("ses-M010", np.nan, None), - ], -) -def test_compute_exam_date_from_baseline_success(session_id, baseline_date, expected): - from clinica.iotools.converters.aibl_to_bids.utils.clinical import ( - _compute_exam_date_from_baseline, - ) - - assert _compute_exam_date_from_baseline(session_id, baseline_date) == expected - - -def test_compute_exam_date_from_baseline_raiseValue(): - from clinica.iotools.converters.aibl_to_bids.utils.clinical import ( - _compute_exam_date_from_baseline, - ) - - with pytest.raises( - ValueError, - match=f"Unexpected visit code foo. Should be in format mX :" - "Ex: m0, m6, m12, m048...", - ): - _compute_exam_date_from_baseline("foo", [], []) - - def test_load_specifications_success(tmp_path): from clinica.iotools.converters.aibl_to_bids.utils.clinical import ( _load_specifications, @@ -148,6 +133,8 @@ def build_bids_dir(tmp_path: Path) -> Path: (bids_dir / "sub-AIBL1" / "ses-M000").mkdir(parents=True) (bids_dir / "sub-AIBL100" / "ses-M000").mkdir(parents=True) (bids_dir / "sub-AIBL100" / "ses-M012").mkdir(parents=True) + (bids_dir / "sub-AIBL109" / "ses-M000").mkdir(parents=True) + (bids_dir / "sub-AIBL109" / "ses-M006").mkdir(parents=True) return bids_dir @@ -157,14 +144,16 @@ def build_clinical_data(tmp_path: Path) -> Path: neuro = pd.DataFrame( { - "RID": [1, 2, 12, 100, 100], # %m/%d/%Y - "VISCODE": ["bl", "bl", "bl", "bl", "m12"], + "RID": [1, 2, 12, 100, 100, 109, 109], # %m/%d/%Y + "VISCODE": ["bl", "bl", "bl", "bl", "m12", "bl", "m06"], "EXAMDATE": [ "01/01/2001", "01/01/2002", "01/01/2012", "01/01/2100", "12/01/2100", + "01/01/2109", + -4, ], } ) @@ -181,34 +170,148 @@ def build_clinical_data(tmp_path: Path) -> Path: cdr = pd.DataFrame( { - "RID": [1, 2, 12, 100, 100], - "VISCODE": ["bl", "bl", "bl", "bl", "m12"], - "CDGLOBAL": [-4, 1, 0.5, 0, 0], + "RID": [1, 2, 12, 100, 100, 109, 109], + "VISCODE": ["bl", "bl", "bl", "bl", "m12", "bl", "m06"], + "CDGLOBAL": [-4, 1, 0.5, 0, 0, 0, 0], + "EXAMDATE": [ + "01/01/2001", + "01/01/2002", + "01/01/2012", + "01/01/2100", + "12/01/2100", + "01/01/2109", + -4, + ], } ) # rq:float cdr.to_csv(data_path / "aibl_cdr_230ct2024.csv", index=False) mmse = pd.DataFrame( { - "RID": [1, 2, 12, 100, 100], - "VISCODE": ["bl", "bl", "bl", "bl", "m12"], - "MMSCORE": [-4, 10, 10, 30, 29], + "RID": [1, 2, 12, 100, 100, 109, 109], + "VISCODE": ["bl", "bl", "bl", "bl", "m12", "bl", "m06"], + "MMSCORE": [-4, 10, 10, 30, 29, 10, 10], + "EXAMDATE": [ + "01/01/2001", + "01/01/2002", + "01/01/2012", + "01/01/2100", + "12/01/2100", + "01/01/2109", + -4, + ], } - ) # rq:int + ) mmse.to_csv(data_path / "aibl_mmse_230ct2024.csv", index=False) pdx = pd.DataFrame( { - "RID": [1, 2, 12, 100, 100], - "VISCODE": ["bl", "bl", "bl", "bl", "m12"], - "DXCURREN": [-4, 0, 0, 1, 3], + "RID": [1, 2, 12, 100, 100, 109, 109], + "VISCODE": ["bl", "bl", "bl", "bl", "m12", "bl", "m06"], + "DXCURREN": [-4, 0, 0, 1, 3, 2, 2], } - ) # rq : int + ) pdx.to_csv(data_path / "aibl_pdxconv_230ct2024.csv", index=False) + mri3 = pd.DataFrame( + { + "RID": [1, 2, 12, 100, 100, 109, 109], # %m/%d/%Y + "VISCODE": ["bl", "bl", "bl", "bl", "m12", "bl", "m06"], + "EXAMDATE": [ + "01/01/2001", + "01/01/2002", + "01/01/2012", + "01/01/2100", + "12/01/2100", + "01/01/2109", + -4, + ], + } + ) + mri3.to_csv(data_path / "aibl_mri3meta_230ct2024.csv", index=False) return data_path +def test_extract_metadata_df(tmp_path): + from clinica.iotools.converters.aibl_to_bids.utils.clinical import ( + _extract_metadata_df, + ) + + clinical_dir = build_clinical_data(tmp_path) + expected = pd.DataFrame( + { + "session_id": ["ses-M000", "ses-M006"], + "examination_date": ["01/01/2109", "-4"], + } + ).set_index("session_id", drop=True) + result = _extract_metadata_df( + pd.read_csv(clinical_dir / "aibl_neurobat_230ct2024.csv", dtype={"text": str}), + 109, + bids_metadata="examination_date", + source_metadata="EXAMDATE", + ) + + assert_frame_equal(expected, result) + + +@pytest.mark.parametrize( + "source_id, session_id, expected", + [ + (109, "ses-M000", "01/01/2109"), + (109, "ses-M006", None), + (109, "ses-M014", None), + (0, "ses-M014", None), + ], +) +def test_find_exam_date_in_other_csv_files(tmp_path, source_id, session_id, expected): + from clinica.iotools.converters.aibl_to_bids.utils.clinical import ( + _find_exam_date_in_other_csv_files, + ) + + clinical_dir = build_clinical_data(tmp_path) + assert ( + _find_exam_date_in_other_csv_files(source_id, session_id, clinical_dir) + == expected + ) + + +def test_get_csv_paths(tmp_path): + from clinica.iotools.converters.aibl_to_bids.utils.clinical import _get_csv_paths + + assert _get_csv_paths(tmp_path) == () + + clinical_dir = build_clinical_data(tmp_path) + csv_paths = [Path(path).name for path in _get_csv_paths(clinical_dir)] + + assert set(csv_paths) == { + "aibl_cdr_230ct2024.csv", + "aibl_mmse_230ct2024.csv", + "aibl_mri3meta_230ct2024.csv", + } + + +@pytest.mark.parametrize( + "rid, session_id, exam_date, expected", + [ + (0, "foo", "01/01/2000", "01/01/2000"), + (0, None, "01/01/2000", "01/01/2000"), + (0, None, None, None), + (109, "ses-M000", None, "01/01/2109"), + (109, "ses-M006", None, None), + ], +) +def test_complete_examination_dates(tmp_path, rid, session_id, exam_date, expected): + from clinica.iotools.converters.aibl_to_bids.utils.clinical import ( + _complete_examination_dates, + ) + + clinical_dir = build_clinical_data(tmp_path) + assert ( + _complete_examination_dates(rid, session_id, exam_date, clinical_dir) + == expected + ) + + def test_create_sessions_tsv(tmp_path): from clinica.iotools.converters.aibl_to_bids.utils.clinical import ( create_sessions_tsv_file, @@ -223,17 +326,20 @@ def test_create_sessions_tsv(tmp_path): ) result_sub100_list = list(bids_path.rglob("*sub-AIBL100_sessions.tsv")) result_sub1_list = list(bids_path.rglob("*sub-AIBL1_sessions.tsv")) + result_sub109_list = list(bids_path.rglob("*sub-AIBL109_sessions.tsv")) + assert len(result_sub109_list) == 1 assert len(result_sub100_list) == 1 assert len(result_sub1_list) == 1 - result_sub100 = pd.read_csv(result_sub100_list[0], sep="\t") - result_sub1 = pd.read_csv(result_sub1_list[0], sep="\t") + result_sub109 = pd.read_csv(result_sub109_list[0], sep="\t", keep_default_na=False) + result_sub100 = pd.read_csv(result_sub100_list[0], sep="\t", keep_default_na=False) + result_sub1 = pd.read_csv(result_sub1_list[0], sep="\t", keep_default_na=False) expected_sub100 = pd.DataFrame( { "session_id": ["ses-M000", "ses-M012"], - "age": [np.nan, np.nan], + "age": ["n/a", "n/a"], "MMS": [30, 29], "cdr_global": [0.0, 0.0], "diagnosis": ["CN", "AD"], @@ -245,12 +351,24 @@ def test_create_sessions_tsv(tmp_path): { "session_id": ["ses-M000"], "age": [100], - "MMS": [np.nan], - "cdr_global": [np.nan], - "diagnosis": [np.nan], + "MMS": ["n/a"], + "cdr_global": ["n/a"], + "diagnosis": ["n/a"], "examination_date": ["01/01/2001"], } ) + expected_sub109 = pd.DataFrame( + { + "session_id": ["ses-M000", "ses-M006"], + "age": ["n/a", "n/a"], + "MMS": [10, 10], + "cdr_global": [0.0, 0.0], + "diagnosis": ["MCI", "MCI"], + "examination_date": ["01/01/2109", "n/a"], + } + ) + assert_frame_equal(result_sub1, expected_sub1, check_like=True) assert_frame_equal(result_sub100, expected_sub100, check_like=True) + assert_frame_equal(result_sub109, expected_sub109, check_like=True)