diff --git a/deepform/data/add_features.py b/deepform/data/add_features.py
index 7d0e477..b434cb2 100644
--- a/deepform/data/add_features.py
+++ b/deepform/data/add_features.py
@@ -94,6 +94,9 @@ def extend_and_write_docs(
 
     logger.debug(f"Writing document index to {pq_index}...")
     doc_index = pd.DataFrame(doc_results).set_index("slug", drop=True)
+
+    # Avoid mixed dtypes, which can cause errors in pyarrow while exporting to parquet
+    doc_index["gross_amount"] = doc_index.gross_amount.astype(str)
     doc_index.to_parquet(pq_index)
 
 
diff --git a/deepform/util.py b/deepform/util.py
index 3e1ff8b..1f7860c 100644
--- a/deepform/util.py
+++ b/deepform/util.py
@@ -36,7 +36,9 @@ def default_similarity(lhs, rhs):
 
 def is_dollar_amount(s):
     try:
-        return num_digits(s) > 0 and bool(re.match(r"^\$?\d*(,\d\d\d)*(\.\d\d)?$", s))
+        return num_digits(s) > 0 and bool(
+            re.match(r"^\$?\d*(,\d\d\d)*(\.[\d]{1,2})?$", s)
+        )
     except TypeError:
         return False
 
diff --git a/tests/test_util.py b/tests/test_util.py
index bb118ed..d9d23bd 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -25,6 +25,8 @@ def test_is_dollar_amount():
     assert is_dollar_amount("3")
     assert is_dollar_amount("04")
     assert is_dollar_amount("9,000")
+    assert is_dollar_amount("10.0")
+    assert is_dollar_amount("$50.0")
     assert not is_dollar_amount("")
     assert not is_dollar_amount("$")
     assert not is_dollar_amount(",")
@@ -43,6 +45,7 @@ def test_dollar_amount():
     assert dollar_amount("3") == 3
     assert dollar_amount("04") == 4
     assert dollar_amount("9,000") == 9000
+    assert dollar_amount("1,300.0") == 1300.0
     assert dollar_amount("") is None
     assert dollar_amount("C") is None
     assert dollar_amount("$x") is None