From 2da92efe88b90f498d5ea32368e92403d1306a4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?A=C3=A9cio=20Santos?=
Date: Thu, 23 May 2024 18:21:25 -0400
Subject: [PATCH 1/4] Setup CI infrastructure for automated tests

---
 .github/workflows/build.yml  | 30 +++++++++++++++++++++++++++
 Makefile                     |  6 ++++++
 tests/__init__.py            |  0
 tests/test_column_mapping.py | 39 ++++++++++++++++++++++++++++++++++++
 4 files changed, 75 insertions(+)
 create mode 100644 .github/workflows/build.yml
 create mode 100644 Makefile
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_column_mapping.py

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 00000000..d55495de
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,30 @@
+name: Tests
+
+on: [push, pull_request]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10"]
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install pytest jupyter
+
+    - name: Install the bdikit package
+      run: |
+        pip install -e .
+
+    - name: Test with pytest
+      run: |
+        pytest
diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..9f265b28
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,6 @@
+all: test
+
+.PHONY: test
+
+test:
+	python3 -m pytest
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_column_mapping.py b/tests/test_column_mapping.py
new file mode 100644
index 00000000..c8755712
--- /dev/null
+++ b/tests/test_column_mapping.py
@@ -0,0 +1,39 @@
+import unittest
+import pandas as pd
+from bdikit.mapping_algorithms.column_mapping.algorithms import (
+    SimFloodAlgorithm,
+    JaccardDistanceAlgorithm,
+    DistributionBasedAlgorithm,
+    ComaAlgorithm,
+    CupidAlgorithm,
+)
+
+
+class ColumnMappingTest(unittest.TestCase):
+    def test_basic_column_mapping_algorithms(self):
+        for ColumnMatcher in [
+            SimFloodAlgorithm,
+            JaccardDistanceAlgorithm,
+            DistributionBasedAlgorithm,
+            ComaAlgorithm,
+            CupidAlgorithm,
+        ]:
+            # given
+            table1 = pd.DataFrame(
+                {"column_1": ["a1", "b1", "c1"], "col_2": ["a2", "b2", "c2"]}
+            )
+            table2 = pd.DataFrame(
+                {"column_1a": ["a1", "b1", "c1"], "col2": ["a2", "b2", "c2"]}
+            )
+            column_matcher = ColumnMatcher(dataset=table1, global_table=table2)
+
+            # when
+            mapping = column_matcher.map()
+
+            # then
+            print(mapping)
+            self.assertEqual(
+                {"column_1": "column_1a", "col_2": "col2"},
+                mapping,
+                msg=f"{ColumnMatcher.__name__} failed to map columns",
+            )

From 39ab9eb9a9b1e69b8fe61764faaedb614c935a25 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?A=C3=A9cio=20Santos?=
Date: Thu, 23 May 2024 19:13:14 -0400
Subject: [PATCH 2/4] Temporarily use scipy<1.13 to avoid import error of triu package

Pinning scipy to version lower than 1.13 to avoid the following error:

ImportError: cannot import name 'triu' from 'scipy.linalg' (/opt/hostedtoolcache/Python/3.9.19/x64/lib/python3.9/site-packages/scipy/linalg/__init__.py)

This version restriction can be removed when packages that depend on
triu are fixed. See e.g.:
- https://github.com/piskvorky/gensim/pull/3524
- https://github.com/scipy/scipy/issues/20402
---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index b7bd6650..c847408a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,3 +8,4 @@ scikit-learn
 tabulate
 flair
 requests
+scipy<1.13

From 6ce7ec6cca56f951fab556b33c51595ca74b4cad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?A=C3=A9cio=20Santos?=
Date: Thu, 23 May 2024 19:47:14 -0400
Subject: [PATCH 3/4] Temporarily use matplotlib<3.9 to avoid PolyFuzz error

PolyFuzz is being installed as a transitive dependency, and the
following error happens due to the removal of `matplotlib.cm.get_cmap`
in matplotlib 3.9. We can remove the restriction when PolyFuzz releases
a version that fixes the issue.

/opt/hostedtoolcache/Python/3.9.19/x64/lib/python3.9/site-packages/polyfuzz/__init__.py:1: in <module>
    from .polyfuzz import PolyFuzz
/opt/hostedtoolcache/Python/3.9.19/x64/lib/python3.9/site-packages/polyfuzz/polyfuzz.py:9: in <module>
    from polyfuzz.metrics import precision_recall_curve, visualize_precision_recall
/opt/hostedtoolcache/Python/3.9.19/x64/lib/python3.9/site-packages/polyfuzz/metrics.py:8: in <module>
    from matplotlib.cm import get_cmap
E   ImportError: cannot import name 'get_cmap' from 'matplotlib.cm' (/opt/hostedtoolcache/Python/3.9.19/x64/lib/python3.9/site-packages/matplotlib/cm.py)
---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index c847408a..53b55247 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,3 +9,4 @@ tabulate
 flair
 requests
 scipy<1.13
+matplotlib<3.9
\ No newline at end of file

From 7f66dfe5948a1d33d4f0c5b5d8aaf26af03eed2c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?A=C3=A9cio=20Santos?=
Date: Thu, 23 May 2024 20:11:09 -0400
Subject: [PATCH 4/4] Add Python 3.11 to CI build script

---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index d55495de..22de4e3f 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -7,7 +7,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
 
     steps:
     - uses: actions/checkout@v3