From 2ae05e1f7e90ea0792828e88724bccf6b946373e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?A=C3=A9cio=20Santos?= Date: Thu, 1 Aug 2024 18:44:27 -0400 Subject: [PATCH] docs: Add documentation for schema mapping methods --- bdikit/api.py | 8 ++--- docs/source/_static/css/custom.css | 8 +++++ docs/source/api.rst | 4 +-- docs/source/conf.py | 11 ++++++- docs/source/index.rst | 19 ++++++++++-- docs/source/schema-matching.rst | 48 ++++++++++++++++++++++++++++++ 6 files changed, 89 insertions(+), 9 deletions(-) create mode 100644 docs/source/_static/css/custom.css create mode 100644 docs/source/schema-matching.rst diff --git a/bdikit/api.py b/bdikit/api.py index ca4bc5cc..eecee604 100644 --- a/bdikit/api.py +++ b/bdikit/api.py @@ -507,7 +507,7 @@ def merge_mappings( Defaults to None. Returns: - List: The data harmonization plan that can be used as input to the :py:func:`~bdikit.materialize_mapping()` + List: The data harmonization plan that can be used as input to the :py:func:`~bdikit.api.materialize_mapping()` function. Concretely, the harmonization plan is a list of dictionaries, where each dictionary contains the source column, target column, and mapper object that will be used to transform the input to the output data. @@ -794,17 +794,17 @@ def _create_mapper_from_value_matches(matches: List[ValueMatch]) -> DictionaryMa - `source`: The name of the source column. - `target`: The name of the target column. - `mapper` (optional): A ValueMapper instance or an object that can be used to - create one using :py:func:`~bdikit.create_mapper()`). Examples of valid objects + create one using :py:func:`~bdikit.api.create_mapper()`. Examples of valid objects are Python functions or lambda functions. If empty, an IdentityValueMapper is used by default. - `matches` (optional): Specifies the value mappings. 
It can be a DataFrame containing - the matches (returned by :py:func:`~bdikit.match_values()`), a list of ValueMatch + the matches (returned by :py:func:`~bdikit.api.match_values()`), a list of ValueMatch objects, or a list of tuples (<source_value>, <target_value>). Alternatively, the list can contain DataFrames. In this case, the DataFrames must contain not only the value mappings (as described in the `matches` key above) but also the `source` and `target` columns as DataFrame attributes. The DataFrames created -by :py:func:`~bdikit.match_values()` include this information by default. +by :py:func:`~bdikit.api.match_values()` include this information by default. If the mapping specification is a DataFrame, it must be compatible with the dictionaries above and contain `source`, `target`, and `mapper` or `matcher` columns. diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css new file mode 100644 index 00000000..454cac45 --- /dev/null +++ b/docs/source/_static/css/custom.css @@ -0,0 +1,8 @@ +.wy-nav-content { + max-width: 1280px; +} + +.wy-table-responsive table td, +.wy-table-responsive table th { + white-space: collapse; +} \ No newline at end of file diff --git a/docs/source/api.rst b/docs/source/api.rst index c72e61f1..8d86f217 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -1,5 +1,5 @@ -API Reference -============= +`bdikit.api` (Module) +===================== .. automodule:: bdikit.api diff --git a/docs/source/conf.py b/docs/source/conf.py index 9a4f6c49..12e49b93 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -93,4 +93,13 @@ "rapidfuzz", ] -autodoc_type_aliases = {'MappingSpecLike': 'MappingSpecLike'} \ No newline at end of file +autodoc_type_aliases = {'MappingSpecLike': 'MappingSpecLike'} + +# These folders are copied to the documentation's HTML output +html_static_path = ['_static'] + +# These paths are either relative to html_static_path +# or fully qualified paths (e.g. https://...) 
+html_css_files = [ + 'css/custom.css', +] \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index f890d2d4..a7371e85 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,16 +1,31 @@ Overview ========= -This project aims to assist users in performing data integration on biomedical data. It provides tools to streamline the process of integrating disparate biomedical datasets. +The bdi-kit is a library that assists users in performing data harmonization. +It provides state-of-the-art tools to streamline the process of integrating and +transforming disparate datasets (with a focus on biomedical data), and includes +APIs and visualizations for performing tasks such as: + +- Schema matching +- Value matching +- Data transformation to a target schema/standard + +**Warning:** bdi-kit is currently in *alpha* stage and under heavy development. Expect APIs to change. You can find the source code in our `GitHub repository <https://github.com/VIDA-NYU/bdi-kit>`__. .. toctree:: :maxdepth: 2 - :caption: Contents: + :caption: Getting Started installation getting-started examples + +.. toctree:: + :maxdepth: 1 + :caption: API Reference + api + schema-matching diff --git a/docs/source/schema-matching.rst b/docs/source/schema-matching.rst new file mode 100644 index 00000000..c09b9a93 --- /dev/null +++ b/docs/source/schema-matching.rst @@ -0,0 +1,48 @@ +Schema Matching Methods +======================= + +This page provides an overview of all schema matching methods available in the `bdikit` library. +Some methods reuse the implementation of other libraries such as `Valentine <https://github.com/delftdata/valentine>`_ (e.g., `similarity_flooding`, `coma` and `cupid`) while others are implemented originally for bdikit (e.g., `gpt`, `ct_learning`, and `two_phase`). +To see how to use these methods, please refer to the documentation of :py:func:`~bdikit.api.match_schema()` in the :py:mod:`~bdikit.api` module. + +.. ``bdikit module `. + + + +.. 
list-table:: bdikit methods + :header-rows: 1 + + * - Method + - Class + - Description + * - ``ct_learning`` + - :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.ContrastiveLearningSchemaMatcher` + - | Uses a contrastive (CT) learning model to learn embeddings for columns and retrieves the best-matching (most similar) columns using the cosine similarity between the column embeddings. + * - ``two_phase`` + - :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.TwoPhaseSchemaMatcher` + - | The two-phase schema matching method first uses a top-k column matcher (e.g., `ct_learning`) to prune the search space (keeping only the top-k most likely matches), and then uses another column matcher to choose the best match from the pruned search space. + * - ``gpt`` + - :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.GPTSchemaMatcher` + - | This method uses the `ct_learning` method to prune the search space and then uses a large language model (GPT-4) to choose the best column match, given a set of top-k most likely candidates retrieved using the `ct_learning` method in the first phase. + +.. list-table:: Methods from other libraries + :header-rows: 1 + + * - Method + - Class + - Description + * - ``similarity_flooding`` + - :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.SimFloodSchemaMatcher` + - | Similarity Flooding transforms schemas into directed graphs and merges them into a propagation graph. The algorithm iteratively propagates similarity scores to neighboring nodes until convergence. This algorithm was proposed by Sergey Melnik, Hector Garcia-Molina, and Erhard Rahm in "Similarity Flooding: A Versatile Graph Matching Algorithm and Its Application to Schema Matching" (ICDE, 2002). + * - ``coma`` + - :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.ComaSchemaMatcher` + - | COMA is a matcher that combines multiple schema-based matchers, representing schemas as rooted directed acyclic graphs. 
This algorithm was proposed by Do, Hong-Hai, and Erhard Rahm in "COMA — a system for flexible combination of schema matching approaches." (VLDB 2002). *This algorithm requires Java to be installed on the system.* + * - ``cupid`` + - :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.CupidSchemaMatcher` + - | Cupid is a schema-based approach that translates schemas into tree structures. It calculates overall similarity using linguistic and structural similarities, with tree transformations helping to compute context-based similarity. This algorithm was proposed by Madhavan et al. in "Generic Schema Matching with Cupid" (VLDB, 2001). + * - ``distribution_based`` + - :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.DistributionBasedSchemaMatcher` + - | Distribution-based Matching compares the distribution of data values in columns using the Earth Mover's Distance. It clusters relational attributes based on these comparisons. This algorithm was proposed by Zhang et al. in "Automatic discovery of attributes in relational databases" (SIGMOD 2011). + * - ``jaccard_distance`` + - :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.JaccardSchemaMatcher` + - | This algorithm computes pairwise column similarities using Jaccard similarity, treating values as identical if their Levenshtein distance is below a threshold. The algorithm was proposed by Koutras et al. in "Valentine: Evaluating matching techniques for dataset discovery" (ICDE 2021).