From 08435a0c336048c4926139d9b5b132ee298ecd71 Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Mon, 12 Aug 2024 08:28:28 +1000 Subject: [PATCH] Re-enable Polars as a supported library for jsonl --- .../job_runners/dataset/compatible_libraries.py | 6 ++---- .../dataset/test_compatible_libraries.py | 16 ---------------- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/services/worker/src/worker/job_runners/dataset/compatible_libraries.py b/services/worker/src/worker/job_runners/dataset/compatible_libraries.py index a9af8dd22..54e061f47 100644 --- a/services/worker/src/worker/job_runners/dataset/compatible_libraries.py +++ b/services/worker/src/worker/job_runners/dataset/compatible_libraries.py @@ -697,15 +697,13 @@ def fmt_code( elif builder_name == "json": first_file = f"datasets/{dataset}/" + next(iter(loading_codes[0]["arguments"]["splits"].values())) - if "*" in first_file: - # The pattern 'datasets/[dataset]/**/*' is not supported yet - return None is_json_lines = ".jsonl" in first_file or HfFileSystem(token=hf_token).open(first_file, "r").read(1) != "[" if is_json_lines: read_func = "read_ndjson" else: - read_func = "read_json" + # JSON not yet supported by Polars + return None compatible_library["function"] = f"pl.{read_func}" else: diff --git a/services/worker/tests/job_runners/dataset/test_compatible_libraries.py b/services/worker/tests/job_runners/dataset/test_compatible_libraries.py index dcfe739a4..2ce541487 100644 --- a/services/worker/tests/job_runners/dataset/test_compatible_libraries.py +++ b/services/worker/tests/job_runners/dataset/test_compatible_libraries.py @@ -547,19 +547,3 @@ def test_get_builder_configs_with_simplified_data_files( assert module_name in get_compatible_library_for_builder compatible_library = get_compatible_library_for_builder[module_name](dataset, hf_token, login_required) assert compatible_library["library"] == expected_library - - -@pytest.mark.integration -@pytest.mark.parametrize( - "dataset,builder_name", - [ - ("tcor0005/langchain-docs-400-chunksize", "json"), - ], -) -def test_get_polars_compatible_library( - use_hub_prod_endpoint: pytest.MonkeyPatch, - dataset: str, - builder_name: str, -) -> None: - v = get_polars_compatible_library(builder_name=builder_name, dataset=dataset, hf_token=None, login_required=False) - assert v is None