From 9a57060a3c75444fb8d8d2bbb9b49aff08fb0836 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roque=20L=C3=B3pez?= Date: Fri, 12 Jul 2024 11:29:11 -0400 Subject: [PATCH] Update preview_domain() to include value names and descriptions (#68) --- bdikit/api.py | 89 ++- bdikit/utils.py | 30 +- docs/source/examples.rst | 5 +- docs/source/getting-started.ipynb | 177 ++++-- examples/analyzing_one_attribute.ipynb | 751 ++++++++++++++++++------- requirements.txt | 3 +- 6 files changed, 738 insertions(+), 317 deletions(-) diff --git a/bdikit/api.py b/bdikit/api.py index e2295e44..2db3ab28 100644 --- a/bdikit/api.py +++ b/bdikit/api.py @@ -5,7 +5,7 @@ import copy import pandas as pd import numpy as np -from bdikit.utils import get_gdc_data +from bdikit.utils import get_gdc_data, get_gdc_metadata from bdikit.mapping_algorithms.column_mapping.algorithms import ( BaseSchemaMatcher, SimFloodSchemaMatcher, @@ -470,75 +470,60 @@ def preview_value_mappings( return result -def preview_domains( - dataset: pd.DataFrame, - column_mapping: Tuple[str, str], - target: Union[str, pd.DataFrame] = "gdc", +def preview_domain( + dataset: Union[str, pd.DataFrame], + column: str, limit: Optional[int] = None, ) -> pd.DataFrame: """ - Preview the domain (set of unique values) of the given columns in the source and target - dataset (or target data dictionary). + Preview the domain, i.e. set of unique values, column description and value description + (if applicable) of the given column of the source or target dataset. Args: - dataset (pd.DataFrame): The source dataset containing the columns to preview. - column_mapping (Tuple[str, str]): The mapping between the source and target columns. - The first and second positions should contain the names of the source and target - columns respectively. - target (Union[str, pd.DataFrame], optional): The target dataset or standard vocabulary name. - If a string is provided and it is equal to "gdc", the target domain will be retrieved + dataset (Union[str, pd.DataFrame], optional): The dataset or standard vocabulary name + containing the column to preview. + If a string is provided and it is equal to "gdc", the domain will be retrieved from the GDC data. - If a DataFrame is provided, the target domain will be retrieved from the specified DataFrame. - Defaults to "gdc". + If a DataFrame is provided, the domain will be retrieved from the specified DataFrame. + column(str): The column name to show the domain. limit (int, optional): The maximum number of unique values to include in the preview. - Defaults to 10. + Defaults to None. Returns: - pd.DataFrame: A DataFrame containing the source and target domain values (or a sample of - them if the parameter `limit` was specified). The DataFrame will have two columns: - "source_domain" and "target_domain". + pd.DataFrame: A DataFrame containing the unique domain values (or a sample of + them if the parameter `limit` was specified), column description and value description + (if applicable). """ - source_column, target_column = column_mapping - - source_domain = dataset[source_column].unique() - if isinstance(target, str) and target == "gdc": - gdc_col_domain = get_gdc_data([target_column])[target_column] - target_domain = ( - np.array([]) if gdc_col_domain is None else np.array(gdc_col_domain) - ) - elif isinstance(target, pd.DataFrame): - target_domain = target[target_column].unique() + if isinstance(dataset, str) and dataset == "gdc": + gdc_metadata = get_gdc_metadata() + value_names = gdc_metadata[column]["value_names"] + value_descriptions = gdc_metadata[column]["value_descriptions"] + column_description = gdc_metadata[column]["description"] + assert len(value_names) == len(value_descriptions) + elif isinstance(dataset, pd.DataFrame): + value_names = dataset[column].unique() + value_descriptions = [] + column_description = "" else: raise ValueError( - "The target must be a DataFrame or a standard vocabulary name." + "The dataset must be a DataFrame or a standard vocabulary name." ) - # Find the final output size based on the the largest domain size and limit parameter - largest_domain_size = max(len(source_domain), len(target_domain)) - output_size = ( - largest_domain_size if limit is None else min(largest_domain_size, limit) - ) + if isinstance(limit, int): + value_names = value_names[:limit] + value_descriptions = value_descriptions[:limit] - # Truncate the domains to the output size if they are larger - if len(source_domain) > output_size: - source_domain = source_domain[:output_size] - if len(target_domain) > output_size: - target_domain = target_domain[:output_size] + domain = {"value_name": value_names} - # Fill the domains with empty strings if they are smaller than the output size - if len(source_domain) < output_size: - source_domain = np.append( - source_domain, np.full(output_size - len(source_domain), "") - ) - if len(target_domain) < output_size: - target_domain = np.append( - target_domain, np.full(output_size - len(target_domain), "") - ) + if len(value_descriptions) > 0: + domain["value_description"] = value_descriptions - return pd.DataFrame( - {"source_domain": source_domain, "target_domain": target_domain} - ) + if len(column_description) > 0: + empty_rows_size = len(value_names) - 1 + domain["column_description"] = [column_description] + [""] * empty_rows_size + + return pd.DataFrame(domain) ValueMatchingLike = Union[List[ValueMatchingResult], List[Dict], pd.DataFrame] diff --git a/bdikit/utils.py b/bdikit/utils.py index bf876e56..8827987d 100644 --- a/bdikit/utils.py +++ b/bdikit/utils.py @@ -35,22 +35,28 @@ def get_gdc_values(column_name, gdc_schema): return None -def get_all_gdc_columns(): - all_columns = [] - gdc_schema = read_gdc_schema() - for key, values in gdc_schema.items(): - for key in values["properties"].keys(): - all_columns.append(key) - return all_columns - - def get_gdc_metadata(): metadata = {} gdc_schema = read_gdc_schema() - for key, values in gdc_schema.items(): - for key, data in values["properties"].items(): - metadata[key] = data + for attrib_data in gdc_schema.values(): + for attrib_name, attrib_properties in attrib_data["properties"].items(): + metadata[attrib_name] = {} + attrib_description = attrib_properties.get("description", "") + metadata[attrib_name]["description"] = attrib_description + + value_names = attrib_properties.get("enum", []) + metadata[attrib_name]["value_names"] = value_names + + descriptions = attrib_properties.get("enumDef", {}) + value_descriptions = [] + for value_name in value_names: + description = "" + if value_name in descriptions: + description = descriptions[value_name].get("description", "") + value_descriptions.append(description) + + metadata[attrib_name]["value_descriptions"] = value_descriptions return metadata diff --git a/docs/source/examples.rst b/docs/source/examples.rst index 39893ec7..f1dafa4a 100644 --- a/docs/source/examples.rst +++ b/docs/source/examples.rst @@ -1,6 +1,7 @@ Examples -========== +========= Here can find different Jupyter notebook examples about how to use `bdi-kit`: -- `API overview: mapping data to the GDC vocabulary `__ +- `Analyzing one attribute/column at a time `__ +- `Exploring schema and value matching through a visualization tool `__ diff --git a/docs/source/getting-started.ipynb b/docs/source/getting-started.ipynb index 34ce42dd..ac63459f 100644 --- a/docs/source/getting-started.ipynb +++ b/docs/source/getting-started.ipynb @@ -24,7 +24,7 @@ "import pandas as pd\n", "\n", "import flair, torch\n", - "flair.device = torch.device('cpu') " + "flair.device = torch.device(\"cpu\") " ] }, { @@ -239,7 +239,7 @@ } ], "source": [ - "dataset = pd.read_csv('./datasets/dou.csv')\n", + "dataset = pd.read_csv(\"../../examples/datasets/dou.csv\")\n", "\n", "columns = [\n", " \"Country\",\n", @@ -403,7 +403,7 @@ } ], "source": [ - "column_mappings = bdi.match_schema(dataset[columns], target='gdc', method='two_phase')\n", + "column_mappings = bdi.match_schema(dataset[columns], target=\"gdc\", method=\"two_phase\")\n", "column_mappings" ] }, @@ -856,7 +856,7 @@ } ], "source": [ - "value_mappings = bdi.match_values(dataset, column_mapping=column_mappings, target='gdc', method='tfidf')\n", + "value_mappings = bdi.match_values(dataset, column_mapping=column_mappings, target=\"gdc\", method=\"tfidf\")\n", "bdi.materialize_mapping(dataset, value_mappings)" ] }, @@ -1027,7 +1027,7 @@ } ], "source": [ - "hist_type_matches = bdi.top_matches(dataset, columns=['Histologic_type'], target='gdc')\n", + "hist_type_matches = bdi.top_matches(dataset, columns=[\"Histologic_type\"], target=\"gdc\")\n", "hist_type_matches" ] }, @@ -1068,97 +1068,200 @@ " \n", " \n", " \n", - " source_domain\n", - " target_domain\n", + " value_name\n", " \n", " \n", " \n", " \n", " 0\n", " Endometrioid\n", - " Abdominal desmoid\n", " \n", " \n", " 1\n", " Carcinosarcoma\n", - " Abdominal fibromatosis\n", " \n", " \n", " 2\n", " Serous\n", - " Achromic nevus\n", " \n", " \n", " 3\n", " Clear cell\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + " value_name\n", + "0 Endometrioid\n", + "1 Carcinosarcoma\n", + "2 Serous\n", + "3 Clear cell" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bdi.preview_domain(dataset, \"Histologic_type\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", "
value_namevalue_descriptioncolumn_description
0Abdominal desmoidAn insidious poorly circumscribed neoplasm ari...Text term used to describe the patient's histo...
1Abdominal fibromatosisAn insidious poorly circumscribed neoplasm ari...
2Achromic nevusA benign nevus characterized by the absence of...
3Acidophil adenocarcinomaA malignant epithelial neoplasm of the anterio...
4Acidophil adenomaAn epithelial neoplasm of the anterior pituita...
............
2620Wolffian duct tumorAn epithelial neoplasm of the female reproduct...
2621XanthofibromaA benign neoplasm composed of fibroblastic spi...
2622Yolk sac tumorA non-seminomatous malignant germ cell tumor c...
2623UnknownNot known, not observed, not recorded, or refu...
2624Not ReportedNot provided or available.
\n", - "

2625 rows × 2 columns

\n", + "

2625 rows × 3 columns

\n", "
" ], "text/plain": [ - " source_domain target_domain\n", - "0 Endometrioid Abdominal desmoid\n", - "1 Carcinosarcoma Abdominal fibromatosis\n", - "2 Serous Achromic nevus\n", - "3 Clear cell Acidophil adenocarcinoma\n", - "4 Acidophil adenoma\n", - "... ... ...\n", - "2620 Wolffian duct tumor\n", - "2621 Xanthofibroma\n", - "2622 Yolk sac tumor\n", - "2623 Unknown\n", - "2624 Not Reported\n", - "\n", - "[2625 rows x 2 columns]" + " value_name \\\n", + "0 Abdominal desmoid \n", + "1 Abdominal fibromatosis \n", + "2 Achromic nevus \n", + "3 Acidophil adenocarcinoma \n", + "4 Acidophil adenoma \n", + "... ... \n", + "2620 Wolffian duct tumor \n", + "2621 Xanthofibroma \n", + "2622 Yolk sac tumor \n", + "2623 Unknown \n", + "2624 Not Reported \n", + "\n", + " value_description \\\n", + "0 An insidious poorly circumscribed neoplasm ari... \n", + "1 An insidious poorly circumscribed neoplasm ari... \n", + "2 A benign nevus characterized by the absence of... \n", + "3 A malignant epithelial neoplasm of the anterio... \n", + "4 An epithelial neoplasm of the anterior pituita... \n", + "... ... \n", + "2620 An epithelial neoplasm of the female reproduct... \n", + "2621 A benign neoplasm composed of fibroblastic spi... \n", + "2622 A non-seminomatous malignant germ cell tumor c... \n", + "2623 Not known, not observed, not recorded, or refu... \n", + "2624 Not provided or available. \n", + "\n", + " column_description \n", + "0 Text term used to describe the patient's histo... \n", + "1 \n", + "2 \n", + "3 \n", + "4 \n", + "... ... \n", + "2620 \n", + "2621 \n", + "2622 \n", + "2623 \n", + "2624 \n", + "\n", + "[2625 rows x 3 columns]" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "bdi.preview_domains(\n", - " dataset, column_mapping=(\"Histologic_type\", \"primary_diagnosis\"), target=\"gdc\"\n", - ")" + "bdi.preview_domain(\"gdc\", \"primary_diagnosis\")" ] }, { @@ -1273,7 +1376,7 @@ } ], "source": [ - "column_mappings.loc[column_mappings['source'] == 'Histologic_type', 'target'] = 'primary_diagnosis'\n", + "column_mappings.loc[column_mappings[\"source\"] == \"Histologic_type\", \"target\"] = \"primary_diagnosis\"\n", "column_mappings" ] }, @@ -2372,7 +2475,7 @@ } ], "source": [ - "race_vmap = race_vmap[race_vmap['similarity'] >= 1.0]\n", + "race_vmap = race_vmap[race_vmap[\"similarity\"] >= 1.0]\n", "race_vmap" ] }, diff --git a/examples/analyzing_one_attribute.ipynb b/examples/analyzing_one_attribute.ipynb index e0477c81..1b6fe70f 100644 --- a/examples/analyzing_one_attribute.ipynb +++ b/examples/analyzing_one_attribute.ipynb @@ -260,7 +260,7 @@ "text": [ "Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.11it/s]\n" + "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.49it/s]\n" ] }, { @@ -274,8 +274,8 @@ "name": "stderr", "output_type": "stream", "text": [ - " 0%| | 0/734 [00:000\n", " FIGO_stage\n", " figo_stage\n", - " 0.768761\n", + " 0.765089\n", " \n", " \n", " 1\n", " FIGO_stage\n", - " uicc_pathologic_stage\n", - " 0.715802\n", + " ajcc_clinical_stage\n", + " 0.645107\n", " \n", " \n", " 2\n", " FIGO_stage\n", " uicc_clinical_stage\n", - " 0.711589\n", + " 0.636324\n", " \n", " \n", " 3\n", " FIGO_stage\n", - " ajcc_clinical_stage\n", - " 0.686146\n", + " uicc_pathologic_stage\n", + " 0.633208\n", " \n", " \n", " 4\n", " FIGO_stage\n", - " ajcc_pathologic_stage\n", - " 0.652875\n", + " irs_group\n", + " 0.559433\n", " \n", " \n", " 5\n", " FIGO_stage\n", - " enneking_msts_stage\n", - " 0.606072\n", + " ajcc_pathologic_stage\n", + " 0.519987\n", " \n", " \n", " 6\n", " FIGO_stage\n", - " iss_stage\n", - " 0.602410\n", + " inss_stage\n", + " 0.494000\n", " \n", " \n", " 7\n", " FIGO_stage\n", - " irs_group\n", - " 0.551544\n", + " iss_stage\n", + " 0.460177\n", " \n", " \n", " 8\n", " FIGO_stage\n", - " masaoka_stage\n", - " 0.506091\n", + " cog_liver_stage\n", + " 0.441571\n", " \n", " \n", " 9\n", " FIGO_stage\n", - " inss_stage\n", - " 0.464668\n", + " shortest_dimension\n", + " 0.424832\n", " \n", " \n", "\n", @@ -385,16 +385,16 @@ ], "text/plain": [ " source target similarity\n", - "0 FIGO_stage figo_stage 0.768761\n", - "1 FIGO_stage uicc_pathologic_stage 0.715802\n", - "2 FIGO_stage uicc_clinical_stage 0.711589\n", - "3 FIGO_stage ajcc_clinical_stage 0.686146\n", - "4 FIGO_stage ajcc_pathologic_stage 0.652875\n", - "5 FIGO_stage enneking_msts_stage 0.606072\n", - "6 FIGO_stage iss_stage 0.602410\n", - "7 FIGO_stage irs_group 0.551544\n", - "8 FIGO_stage masaoka_stage 0.506091\n", - "9 FIGO_stage inss_stage 0.464668" + "0 FIGO_stage figo_stage 0.765089\n", + "1 FIGO_stage ajcc_clinical_stage 0.645107\n", + "2 FIGO_stage uicc_clinical_stage 0.636324\n", + "3 FIGO_stage uicc_pathologic_stage 0.633208\n", + "4 FIGO_stage irs_group 0.559433\n", + "5 FIGO_stage ajcc_pathologic_stage 0.519987\n", + "6 FIGO_stage inss_stage 0.494000\n", + "7 FIGO_stage iss_stage 0.460177\n", + "8 FIGO_stage cog_liver_stage 0.441571\n", + "9 FIGO_stage shortest_dimension 0.424832" ] }, "execution_count": 3, @@ -411,7 +411,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "From the above outputs, we can see that the highest similarity is `FIGO_stage` -> `figo_stage`. To have more context about these columns, let's see their unique values (method `preview_domains`)." + "From the above outputs, we can see that the highest similarity is `FIGO_stage` -> `figo_stage`. To have more context about these columns, let's see their unique values (method `preview_domain`)." ] }, { @@ -440,231 +440,393 @@ " \n", " \n", " \n", - " source_domain\n", - " target_domain\n", + " value_name\n", " \n", " \n", " \n", " \n", " 0\n", " IA\n", - " Stage 0\n", " \n", " \n", " 1\n", " NaN\n", - " Stage I\n", " \n", " \n", " 2\n", " IIIA\n", - " Stage IA\n", " \n", " \n", " 3\n", " IIIC2\n", - " Stage IA1\n", " \n", " \n", " 4\n", " IB\n", - " Stage IA2\n", " \n", " \n", " 5\n", " II\n", - " Stage IB\n", " \n", " \n", " 6\n", " IIIC1\n", - " Stage IB1\n", " \n", " \n", " 7\n", " IVB\n", - " Stage IB2\n", " \n", " \n", " 8\n", " IIIB\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + " value_name\n", + "0 IA\n", + "1 NaN\n", + "2 IIIA\n", + "3 IIIC2\n", + "4 IB\n", + "5 II\n", + "6 IIIC1\n", + "7 IVB\n", + "8 IIIB" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bdi.preview_domain(dataset, 'FIGO_stage')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", "
value_namevalue_descriptioncolumn_description
0Stage 0A FIGO stage term that applies to gynecologic ...The extent of a cervical or endometrial cancer...
1Stage IA FIGO stage term that applies to gynecologic ...
2Stage IAInvasive cancer confined to the original anato...
3Stage IA1A FIGO stage term that applies to gynecologic ...
4Stage IA2A FIGO stage term that applies to gynecologic ...
5Stage IBA FIGO stage term that applies to gynecologic ...
6Stage IB1A FIGO stage term that applies to gynecologic ...
7Stage IB2A FIGO stage term that applies to gynecologic ...
8Stage ICA FIGO stage term that applies to gynecologic ...
9Stage IC1A FIGO stage term that applies to ovarian canc...
10Stage IC2A FIGO stage term that applies to ovarian canc...
11Stage IC3A FIGO stage term that applies to ovarian canc...
12Stage IIA FIGO stage term that applies to gynecologic ...
13Stage IIAA FIGO stage term that applies to gynecologic ...
14Stage IIA1A FIGO stage term that applies to gynecologic ...
15Stage IIA2A FIGO stage term that applies to gynecologic ...
16Stage IIBA FIGO stage term that applies to gynecologic ...
17Stage IICA cancer stage generally indicating the invasi...
18Stage IIIA FIGO stage term that applies to gynecologic ...
19Stage IIIAA FIGO stage term that applies to gynecologic ...
20Stage IIIA1A FIGO stage term that applies to ovarian canc...
21Stage IIIA2A FIGO stage term that applies to ovarian canc...
22Stage IIIAiA FIGO stage term that applies to ovarian canc...
23Stage IIIAiiA FIGO stage term that applies to ovarian canc...
24Stage IIIBA FIGO stage term that applies to gynecologic ...
25Stage IIICA FIGO stage term that applies to gynecologic ...
26Stage IIIC1A FIGO stage term that applies to gynecologic ...
27Stage IIIC2A FIGO stage term that applies to gynecologic ...
28Stage IVA FIGO stage term that applies to gynecologic ...
29Stage IVAA FIGO stage term that applies to gynecologic ...
30Stage IVBA FIGO stage term that applies to gynecologic ...
31UnknownNot known, not observed, not recorded, or refu...
32Not ReportedNot provided or available.
33Not Allowed To CollectAn indicator that specifies that a collection ...
\n", "
" ], "text/plain": [ - " source_domain target_domain\n", - "0 IA Stage 0\n", - "1 NaN Stage I\n", - "2 IIIA Stage IA\n", - "3 IIIC2 Stage IA1\n", - "4 IB Stage IA2\n", - "5 II Stage IB\n", - "6 IIIC1 Stage IB1\n", - "7 IVB Stage IB2\n", - "8 IIIB Stage IC\n", - "9 Stage IC1\n", - "10 Stage IC2\n", - "11 Stage IC3\n", - "12 Stage II\n", - "13 Stage IIA\n", - "14 Stage IIA1\n", - "15 Stage IIA2\n", - "16 Stage IIB\n", - "17 Stage IIC\n", - "18 Stage III\n", - "19 Stage IIIA\n", - "20 Stage IIIA1\n", - "21 Stage IIIA2\n", - "22 Stage IIIAi\n", - "23 Stage IIIAii\n", - "24 Stage IIIB\n", - "25 Stage IIIC\n", - "26 Stage IIIC1\n", - "27 Stage IIIC2\n", - "28 Stage IV\n", - "29 Stage IVA\n", - "30 Stage IVB\n", - "31 Unknown\n", - "32 Not Reported\n", - "33 Not Allowed To Collect" + " value_name value_description \\\n", + "0 Stage 0 A FIGO stage term that applies to gynecologic ... \n", + "1 Stage I A FIGO stage term that applies to gynecologic ... \n", + "2 Stage IA Invasive cancer confined to the original anato... \n", + "3 Stage IA1 A FIGO stage term that applies to gynecologic ... \n", + "4 Stage IA2 A FIGO stage term that applies to gynecologic ... \n", + "5 Stage IB A FIGO stage term that applies to gynecologic ... \n", + "6 Stage IB1 A FIGO stage term that applies to gynecologic ... \n", + "7 Stage IB2 A FIGO stage term that applies to gynecologic ... \n", + "8 Stage IC A FIGO stage term that applies to gynecologic ... \n", + "9 Stage IC1 A FIGO stage term that applies to ovarian canc... \n", + "10 Stage IC2 A FIGO stage term that applies to ovarian canc... \n", + "11 Stage IC3 A FIGO stage term that applies to ovarian canc... \n", + "12 Stage II A FIGO stage term that applies to gynecologic ... \n", + "13 Stage IIA A FIGO stage term that applies to gynecologic ... \n", + "14 Stage IIA1 A FIGO stage term that applies to gynecologic ... \n", + "15 Stage IIA2 A FIGO stage term that applies to gynecologic ... \n", + "16 Stage IIB A FIGO stage term that applies to gynecologic ... \n", + "17 Stage IIC A cancer stage generally indicating the invasi... \n", + "18 Stage III A FIGO stage term that applies to gynecologic ... \n", + "19 Stage IIIA A FIGO stage term that applies to gynecologic ... \n", + "20 Stage IIIA1 A FIGO stage term that applies to ovarian canc... \n", + "21 Stage IIIA2 A FIGO stage term that applies to ovarian canc... \n", + "22 Stage IIIAi A FIGO stage term that applies to ovarian canc... \n", + "23 Stage IIIAii A FIGO stage term that applies to ovarian canc... \n", + "24 Stage IIIB A FIGO stage term that applies to gynecologic ... \n", + "25 Stage IIIC A FIGO stage term that applies to gynecologic ... \n", + "26 Stage IIIC1 A FIGO stage term that applies to gynecologic ... \n", + "27 Stage IIIC2 A FIGO stage term that applies to gynecologic ... \n", + "28 Stage IV A FIGO stage term that applies to gynecologic ... \n", + "29 Stage IVA A FIGO stage term that applies to gynecologic ... \n", + "30 Stage IVB A FIGO stage term that applies to gynecologic ... \n", + "31 Unknown Not known, not observed, not recorded, or refu... \n", + "32 Not Reported Not provided or available. \n", + "33 Not Allowed To Collect An indicator that specifies that a collection ... \n", + "\n", + " column_description \n", + "0 The extent of a cervical or endometrial cancer... \n", + "1 \n", + "2 \n", + "3 \n", + "4 \n", + "5 \n", + "6 \n", + "7 \n", + "8 \n", + "9 \n", + "10 \n", + "11 \n", + "12 \n", + "13 \n", + "14 \n", + "15 \n", + "16 \n", + "17 \n", + "18 \n", + "19 \n", + "20 \n", + "21 \n", + "22 \n", + "23 \n", + "24 \n", + "25 \n", + "26 \n", + "27 \n", + "28 \n", + "29 \n", + "30 \n", + "31 \n", + "32 \n", + "33 " ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "preview_domains = bdi.preview_domains(dataset, ('FIGO_stage', 'figo_stage'))\n", - "preview_domains" + "bdi.preview_domain('gdc', 'figo_stage')" ] }, { @@ -676,7 +838,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -688,7 +850,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -789,7 +951,7 @@ "8 nan Unknown 0.350" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -819,21 +981,25 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def map_columns_values(dataset, columns, target='gdc', match_index=0):\n", - " top_matches = bdi.top_matches(dataset, columns=columns, target=target) # Call top_matches\n", + " #top_matches = bdi.top_matches(dataset, columns=columns, target=target) # Call top_matches\n", " print('Top matches:')\n", " display(top_matches)\n", " selected_match = top_matches.iloc[[match_index]]\n", " column_mapping = selected_match.drop(columns=['similarity']).iloc[0]\n", " column_mapping = tuple(column_mapping)\n", - " preview_domains = bdi.preview_domains(dataset, column_mapping)\n", + " preview_domain_source = bdi.preview_domain(dataset, column_mapping[0])\n", + " preview_domain_target = bdi.preview_domain('gdc', column_mapping[1])\n", + " \n", + " print(f'Preview domain {column_mapping[0]} (source):')\n", + " display(preview_domain_source)\n", " \n", - " print(f'Preview domain {column_mapping}:')\n", - " display(preview_domains)\n", + " print(f'Preview domain {column_mapping[0]} (target):')\n", + " display(preview_domain_target)\n", " \n", " print(f'Value mappings {column_mapping}:')\n", " value_mappings = bdi.preview_value_mappings(dataset, column_mapping, target=target)\n", @@ -844,7 +1010,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -885,61 +1051,61 @@ " 0\n", " FIGO_stage\n", " figo_stage\n", - " 0.768761\n", + " 0.765089\n", " \n", " \n", " 1\n", " FIGO_stage\n", - " uicc_pathologic_stage\n", - " 0.715802\n", + " ajcc_clinical_stage\n", + " 0.645107\n", " \n", " \n", " 2\n", " FIGO_stage\n", " uicc_clinical_stage\n", - " 0.711589\n", + " 0.636324\n", " \n", " \n", " 3\n", " FIGO_stage\n", - " ajcc_clinical_stage\n", - " 0.686146\n", + " uicc_pathologic_stage\n", + " 0.633208\n", " \n", " \n", " 4\n", " FIGO_stage\n", - " ajcc_pathologic_stage\n", - " 0.652875\n", + " irs_group\n", + " 0.559433\n", " \n", " \n", " 5\n", " FIGO_stage\n", - " enneking_msts_stage\n", - " 0.606072\n", + " ajcc_pathologic_stage\n", + " 0.519987\n", " \n", " \n", " 6\n", " FIGO_stage\n", - " iss_stage\n", - " 0.602410\n", + " inss_stage\n", + " 0.494000\n", " \n", " \n", " 7\n", " FIGO_stage\n", - " irs_group\n", - " 0.551544\n", + " iss_stage\n", + " 0.460177\n", " \n", " \n", " 8\n", " FIGO_stage\n", - " masaoka_stage\n", - " 0.506091\n", + " cog_liver_stage\n", + " 0.441571\n", " \n", " \n", " 9\n", " FIGO_stage\n", - " inss_stage\n", - " 0.464668\n", + " shortest_dimension\n", + " 0.424832\n", " \n", " \n", "\n", @@ -947,16 +1113,16 @@ ], "text/plain": [ " source target similarity\n", - "0 FIGO_stage figo_stage 0.768761\n", - "1 FIGO_stage uicc_pathologic_stage 0.715802\n", - "2 FIGO_stage uicc_clinical_stage 0.711589\n", - "3 FIGO_stage ajcc_clinical_stage 0.686146\n", - "4 FIGO_stage ajcc_pathologic_stage 0.652875\n", - "5 FIGO_stage enneking_msts_stage 0.606072\n", - "6 FIGO_stage iss_stage 0.602410\n", - "7 FIGO_stage irs_group 0.551544\n", - "8 FIGO_stage masaoka_stage 0.506091\n", - "9 FIGO_stage inss_stage 0.464668" + "0 FIGO_stage figo_stage 0.765089\n", + "1 FIGO_stage ajcc_clinical_stage 0.645107\n", + "2 FIGO_stage uicc_clinical_stage 0.636324\n", + "3 FIGO_stage uicc_pathologic_stage 0.633208\n", + "4 FIGO_stage irs_group 0.559433\n", + "5 FIGO_stage ajcc_pathologic_stage 0.519987\n", + "6 FIGO_stage inss_stage 0.494000\n", + "7 FIGO_stage iss_stage 0.460177\n", + "8 FIGO_stage cog_liver_stage 0.441571\n", + "9 FIGO_stage shortest_dimension 0.424832" ] }, "metadata": {}, @@ -966,7 +1132,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Preview domain ('FIGO_stage', 'figo_stage'):\n" + "Preview domain FIGO_stage (source):\n" ] }, { @@ -990,221 +1156,380 @@ " \n", " \n", " \n", - " source_domain\n", - " target_domain\n", + " value_name\n", " \n", " \n", " \n", " \n", " 0\n", " IA\n", - " Stage 0\n", " \n", " \n", " 1\n", " NaN\n", - " Stage I\n", " \n", " \n", " 2\n", " IIIA\n", - " Stage IA\n", " \n", " \n", " 3\n", " IIIC2\n", - " Stage IA1\n", " \n", " \n", " 4\n", " IB\n", - " Stage IA2\n", " \n", " \n", " 5\n", " II\n", - " Stage IB\n", " \n", " \n", " 6\n", " IIIC1\n", - " Stage IB1\n", " \n", " \n", " 7\n", " IVB\n", - " Stage IB2\n", " \n", " \n", " 8\n", " IIIB\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + " value_name\n", + "0 IA\n", + "1 NaN\n", + "2 IIIA\n", + "3 IIIC2\n", + "4 IB\n", + "5 II\n", + "6 IIIC1\n", + "7 IVB\n", + "8 IIIB" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preview domain FIGO_stage (target):\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", "
value_namevalue_descriptioncolumn_description
0Stage 0A FIGO stage term that applies to gynecologic ...The extent of a cervical or endometrial cancer...
1Stage IA FIGO stage term that applies to gynecologic ...
2Stage IAInvasive cancer confined to the original anato...
3Stage IA1A FIGO stage term that applies to gynecologic ...
4Stage IA2A FIGO stage term that applies to gynecologic ...
5Stage IBA FIGO stage term that applies to gynecologic ...
6Stage IB1A FIGO stage term that applies to gynecologic ...
7Stage IB2A FIGO stage term that applies to gynecologic ...
8Stage ICA FIGO stage term that applies to gynecologic ...
9Stage IC1A FIGO stage term that applies to ovarian canc...
10Stage IC2A FIGO stage term that applies to ovarian canc...
11Stage IC3A FIGO stage term that applies to ovarian canc...
12Stage IIA FIGO stage term that applies to gynecologic ...
13Stage IIAA FIGO stage term that applies to gynecologic ...
14Stage IIA1A FIGO stage term that applies to gynecologic ...
15Stage IIA2A FIGO stage term that applies to gynecologic ...
16Stage IIBA FIGO stage term that applies to gynecologic ...
17Stage IICA cancer stage generally indicating the invasi...
18Stage IIIA FIGO stage term that applies to gynecologic ...
19Stage IIIAA FIGO stage term that applies to gynecologic ...
20Stage IIIA1A FIGO stage term that applies to ovarian canc...
21Stage IIIA2A FIGO stage term that applies to ovarian canc...
22Stage IIIAiA FIGO stage term that applies to ovarian canc...
23Stage IIIAiiA FIGO stage term that applies to ovarian canc...
24Stage IIIBA FIGO stage term that applies to gynecologic ...
25Stage IIICA FIGO stage term that applies to gynecologic ...
26Stage IIIC1A FIGO stage term that applies to gynecologic ...
27Stage IIIC2A FIGO stage term that applies to gynecologic ...
28Stage IVA FIGO stage term that applies to gynecologic ...
29Stage IVAA FIGO stage term that applies to gynecologic ...
30Stage IVBA FIGO stage term that applies to gynecologic ...
31UnknownNot known, not observed, not recorded, or refu...
32Not ReportedNot provided or available.
33Not Allowed To CollectAn indicator that specifies that a collection ...
\n", "
" ], "text/plain": [ - " source_domain target_domain\n", - "0 IA Stage 0\n", - "1 NaN Stage I\n", - "2 IIIA Stage IA\n", - "3 IIIC2 Stage IA1\n", - "4 IB Stage IA2\n", - "5 II Stage IB\n", - "6 IIIC1 Stage IB1\n", - "7 IVB Stage IB2\n", - "8 IIIB Stage IC\n", - "9 Stage IC1\n", - "10 Stage IC2\n", - "11 Stage IC3\n", - "12 Stage II\n", - "13 Stage IIA\n", - "14 Stage IIA1\n", - "15 Stage IIA2\n", - "16 Stage IIB\n", - "17 Stage IIC\n", - "18 Stage III\n", - "19 Stage IIIA\n", - "20 Stage IIIA1\n", - "21 Stage IIIA2\n", - "22 Stage IIIAi\n", - "23 Stage IIIAii\n", - "24 Stage IIIB\n", - "25 Stage IIIC\n", - "26 Stage IIIC1\n", - "27 Stage IIIC2\n", - "28 Stage IV\n", - "29 Stage IVA\n", - "30 Stage IVB\n", - "31 Unknown\n", - "32 Not Reported\n", - "33 Not Allowed To Collect" + " value_name value_description \\\n", + "0 Stage 0 A FIGO stage term that applies to gynecologic ... \n", + "1 Stage I A FIGO stage term that applies to gynecologic ... \n", + "2 Stage IA Invasive cancer confined to the original anato... \n", + "3 Stage IA1 A FIGO stage term that applies to gynecologic ... \n", + "4 Stage IA2 A FIGO stage term that applies to gynecologic ... \n", + "5 Stage IB A FIGO stage term that applies to gynecologic ... \n", + "6 Stage IB1 A FIGO stage term that applies to gynecologic ... \n", + "7 Stage IB2 A FIGO stage term that applies to gynecologic ... \n", + "8 Stage IC A FIGO stage term that applies to gynecologic ... \n", + "9 Stage IC1 A FIGO stage term that applies to ovarian canc... \n", + "10 Stage IC2 A FIGO stage term that applies to ovarian canc... \n", + "11 Stage IC3 A FIGO stage term that applies to ovarian canc... \n", + "12 Stage II A FIGO stage term that applies to gynecologic ... \n", + "13 Stage IIA A FIGO stage term that applies to gynecologic ... \n", + "14 Stage IIA1 A FIGO stage term that applies to gynecologic ... \n", + "15 Stage IIA2 A FIGO stage term that applies to gynecologic ... \n", + "16 Stage IIB A FIGO stage term that applies to gynecologic ... \n", + "17 Stage IIC A cancer stage generally indicating the invasi... \n", + "18 Stage III A FIGO stage term that applies to gynecologic ... \n", + "19 Stage IIIA A FIGO stage term that applies to gynecologic ... \n", + "20 Stage IIIA1 A FIGO stage term that applies to ovarian canc... \n", + "21 Stage IIIA2 A FIGO stage term that applies to ovarian canc... \n", + "22 Stage IIIAi A FIGO stage term that applies to ovarian canc... \n", + "23 Stage IIIAii A FIGO stage term that applies to ovarian canc... \n", + "24 Stage IIIB A FIGO stage term that applies to gynecologic ... \n", + "25 Stage IIIC A FIGO stage term that applies to gynecologic ... \n", + "26 Stage IIIC1 A FIGO stage term that applies to gynecologic ... \n", + "27 Stage IIIC2 A FIGO stage term that applies to gynecologic ... \n", + "28 Stage IV A FIGO stage term that applies to gynecologic ... \n", + "29 Stage IVA A FIGO stage term that applies to gynecologic ... \n", + "30 Stage IVB A FIGO stage term that applies to gynecologic ... \n", + "31 Unknown Not known, not observed, not recorded, or refu... \n", + "32 Not Reported Not provided or available. \n", + "33 Not Allowed To Collect An indicator that specifies that a collection ... \n", + "\n", + " column_description \n", + "0 The extent of a cervical or endometrial cancer... \n", + "1 \n", + "2 \n", + "3 \n", + "4 \n", + "5 \n", + "6 \n", + "7 \n", + "8 \n", + "9 \n", + "10 \n", + "11 \n", + "12 \n", + "13 \n", + "14 \n", + "15 \n", + "16 \n", + "17 \n", + "18 \n", + "19 \n", + "20 \n", + "21 \n", + "22 \n", + "23 \n", + "24 \n", + "25 \n", + "26 \n", + "27 \n", + "28 \n", + "29 \n", + "30 \n", + "31 \n", + "32 \n", + "33 " ] }, "metadata": {}, diff --git a/requirements.txt b/requirements.txt index b9e8a81d..627d62c5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,5 @@ panel!=1.4.3 Levenshtein autofj natsort -jupyter_bokeh \ No newline at end of file +jupyter_bokeh +conllu<5.0.0 \ No newline at end of file