Update preview_domain() to include value names and descriptions (#68)
roquelopez authored Jul 12, 2024
1 parent 59440f6 commit 9a57060
Showing 6 changed files with 738 additions and 317 deletions.
89 changes: 37 additions & 52 deletions bdikit/api.py
@@ -5,7 +5,7 @@
 import copy
 import pandas as pd
 import numpy as np
-from bdikit.utils import get_gdc_data
+from bdikit.utils import get_gdc_data, get_gdc_metadata
 from bdikit.mapping_algorithms.column_mapping.algorithms import (
     BaseSchemaMatcher,
     SimFloodSchemaMatcher,
@@ -470,75 +470,60 @@ def preview_value_mappings(
     return result


-def preview_domains(
-    dataset: pd.DataFrame,
-    column_mapping: Tuple[str, str],
-    target: Union[str, pd.DataFrame] = "gdc",
+def preview_domain(
+    dataset: Union[str, pd.DataFrame],
+    column: str,
     limit: Optional[int] = None,
 ) -> pd.DataFrame:
     """
-    Preview the domain (set of unique values) of the given columns in the source and target
-    dataset (or target data dictionary).
+    Preview the domain, i.e. set of unique values, column description and value description
+    (if applicable) of the given column of the source or target dataset.
     Args:
-        dataset (pd.DataFrame): The source dataset containing the columns to preview.
-        column_mapping (Tuple[str, str]): The mapping between the source and target columns.
-            The first and second positions should contain the names of the source and target
-            columns respectively.
-        target (Union[str, pd.DataFrame], optional): The target dataset or standard vocabulary name.
-            If a string is provided and it is equal to "gdc", the target domain will be retrieved
+        dataset (Union[str, pd.DataFrame], optional): The dataset or standard vocabulary name
+            containing the column to preview.
+            If a string is provided and it is equal to "gdc", the domain will be retrieved
             from the GDC data.
-            If a DataFrame is provided, the target domain will be retrieved from the specified DataFrame.
-            Defaults to "gdc".
+            If a DataFrame is provided, the domain will be retrieved from the specified DataFrame.
+        column(str): The column name to show the domain.
         limit (int, optional): The maximum number of unique values to include in the preview.
-            Defaults to 10.
+            Defaults to None.
     Returns:
-        pd.DataFrame: A DataFrame containing the source and target domain values (or a sample of
-        them if the parameter `limit` was specified). The DataFrame will have two columns:
-        "source_domain" and "target_domain".
+        pd.DataFrame: A DataFrame containing the unique domain values (or a sample of
+        them if the parameter `limit` was specified), column description and value description
+        (if applicable).
     """
-    source_column, target_column = column_mapping
-
-    source_domain = dataset[source_column].unique()
-
-    if isinstance(target, str) and target == "gdc":
-        gdc_col_domain = get_gdc_data([target_column])[target_column]
-        target_domain = (
-            np.array([]) if gdc_col_domain is None else np.array(gdc_col_domain)
-        )
-    elif isinstance(target, pd.DataFrame):
-        target_domain = target[target_column].unique()
+    if isinstance(dataset, str) and dataset == "gdc":
+        gdc_metadata = get_gdc_metadata()
+        value_names = gdc_metadata[column]["value_names"]
+        value_descriptions = gdc_metadata[column]["value_descriptions"]
+        column_description = gdc_metadata[column]["description"]
+        assert len(value_names) == len(value_descriptions)
+    elif isinstance(dataset, pd.DataFrame):
+        value_names = dataset[column].unique()
+        value_descriptions = []
+        column_description = ""
     else:
         raise ValueError(
-            "The target must be a DataFrame or a standard vocabulary name."
+            "The dataset must be a DataFrame or a standard vocabulary name."
         )

-    # Find the final output size based on the the largest domain size and limit parameter
-    largest_domain_size = max(len(source_domain), len(target_domain))
-    output_size = (
-        largest_domain_size if limit is None else min(largest_domain_size, limit)
-    )
+    if isinstance(limit, int):
+        value_names = value_names[:limit]
+        value_descriptions = value_descriptions[:limit]

-    # Truncate the domains to the output size if they are larger
-    if len(source_domain) > output_size:
-        source_domain = source_domain[:output_size]
-    if len(target_domain) > output_size:
-        target_domain = target_domain[:output_size]
+    domain = {"value_name": value_names}

-    # Fill the domains with empty strings if they are smaller than the output size
-    if len(source_domain) < output_size:
-        source_domain = np.append(
-            source_domain, np.full(output_size - len(source_domain), "")
-        )
-    if len(target_domain) < output_size:
-        target_domain = np.append(
-            target_domain, np.full(output_size - len(target_domain), "")
-        )
+    if len(value_descriptions) > 0:
+        domain["value_description"] = value_descriptions

-    return pd.DataFrame(
-        {"source_domain": source_domain, "target_domain": target_domain}
-    )
+    if len(column_description) > 0:
+        empty_rows_size = len(value_names) - 1
+        domain["column_description"] = [column_description] + [""] * empty_rows_size
+
+    return pd.DataFrame(domain)


 ValueMatchingLike = Union[List[ValueMatchingResult], List[Dict], pd.DataFrame]
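For context, here is a minimal usage sketch of the updated preview_domain() (not part of this commit). It assumes the package and the GDC schema files are available; the input file name and column names below are purely illustrative.

import pandas as pd
from bdikit.api import preview_domain

# Local dataset: only unique values are available, so the result has a
# single "value_name" column.
dataset = pd.read_csv("my_dataset.csv")  # illustrative input file
print(preview_domain(dataset, "histology"))

# GDC vocabulary: value names, value descriptions, and the column description
# are read from the GDC schema metadata; `limit` caps the number of rows.
print(preview_domain("gdc", "primary_diagnosis", limit=5))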
30 changes: 18 additions & 12 deletions bdikit/utils.py
@@ -35,22 +35,28 @@ def get_gdc_values(column_name, gdc_schema):
     return None


-def get_all_gdc_columns():
-    all_columns = []
-    gdc_schema = read_gdc_schema()
-    for key, values in gdc_schema.items():
-        for key in values["properties"].keys():
-            all_columns.append(key)
-    return all_columns
-
-
 def get_gdc_metadata():
     metadata = {}
     gdc_schema = read_gdc_schema()

-    for key, values in gdc_schema.items():
-        for key, data in values["properties"].items():
-            metadata[key] = data
+    for attrib_data in gdc_schema.values():
+        for attrib_name, attrib_properties in attrib_data["properties"].items():
+            metadata[attrib_name] = {}
+            attrib_description = attrib_properties.get("description", "")
+            metadata[attrib_name]["description"] = attrib_description
+
+            value_names = attrib_properties.get("enum", [])
+            metadata[attrib_name]["value_names"] = value_names
+
+            descriptions = attrib_properties.get("enumDef", {})
+            value_descriptions = []
+            for value_name in value_names:
+                description = ""
+                if value_name in descriptions:
+                    description = descriptions[value_name].get("description", "")
+                value_descriptions.append(description)
+
+            metadata[attrib_name]["value_descriptions"] = value_descriptions

     return metadata

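For context, a short sketch (not part of this commit) of how the enriched metadata returned by get_gdc_metadata() can be inspected: each GDC attribute maps to a column description plus two index-aligned lists of value names and value descriptions. The attribute name used below is illustrative.

from bdikit.utils import get_gdc_metadata

metadata = get_gdc_metadata()

# Each entry has the shape:
# {"description": str, "value_names": list, "value_descriptions": list},
# where a value without an enumDef entry gets an empty-string description.
entry = metadata["primary_diagnosis"]  # illustrative GDC attribute name
print(entry["description"])
for name, desc in zip(entry["value_names"], entry["value_descriptions"]):
    print(f"{name}: {desc or '(no description)'}")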
5 changes: 3 additions & 2 deletions docs/source/examples.rst
@@ -1,6 +1,7 @@
 Examples
-==========
+=========

 Here can find different Jupyter notebook examples about how to use `bdi-kit`:

 - `API overview: mapping data to the GDC vocabulary <https://github.com/VIDA-NYU/bdi-kit/blob/devel/examples/dou_gdc_harmonization.ipynb>`__
+- `Analyzing one attribute/column at a time <https://github.com/VIDA-NYU/bdi-kit/blob/devel/examples/analyzing_one_attribute.ipynb>`__
 - `Exploring schema and value matching through a visualization tool <https://github.com/VIDA-NYU/bdi-kit/blob/devel/examples/schema_matching_heatmap.ipynb>`__
