Skip to content

Commit

Permalink
add optional empty lines filter in read_text (ray-project#22298)
Browse files Browse the repository at this point in the history
ray.data.read_text() currently doesn't take care of empty lines; this pr adds a flag to enable the empty line filter; 
with this change, read_text will only return non-empty line by default, unless otherwise setting drop_empty_line to False.

Co-authored-by: Eric Liang <[email protected]>
Co-authored-by: Jialin Liu <[email protected]>
  • Loading branch information
3 people authored and simonsays1980 committed Feb 27, 2022
1 parent fca047e commit 57e4548
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 2 deletions.
9 changes: 8 additions & 1 deletion python/ray/data/read_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,7 @@ def read_text(
*,
encoding: str = "utf-8",
errors: str = "ignore",
drop_empty_lines: bool = True,
filesystem: Optional["pyarrow.fs.FileSystem"] = None,
parallelism: int = 200,
arrow_open_stream_args: Optional[Dict[str, Any]] = None,
Expand Down Expand Up @@ -496,12 +497,18 @@ def read_text(
Dataset holding lines of text read from the specified paths.
"""

def to_text(s):
lines = s.decode(encoding).split("\n")
if drop_empty_lines:
lines = [line for line in lines if line.strip() != ""]
return lines

return read_binary_files(
paths,
filesystem=filesystem,
parallelism=parallelism,
arrow_open_stream_args=arrow_open_stream_args,
).flat_map(lambda x: x.decode(encoding, errors=errors).split("\n"))
).flat_map(to_text)


@PublicAPI(stability="beta")
Expand Down
6 changes: 5 additions & 1 deletion python/ray/data/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1003,8 +1003,12 @@ def test_read_text(ray_start_regular_shared, tmp_path):
f.write("world")
with open(os.path.join(path, "file2.txt"), "w") as f:
f.write("goodbye")
with open(os.path.join(path, "file3.txt"), "w") as f:
f.write("ray\n")
ds = ray.data.read_text(path)
assert sorted(ds.take()) == ["goodbye", "hello", "world"]
assert sorted(ds.take()) == ["goodbye", "hello", "ray", "world"]
ds = ray.data.read_text(path, drop_empty_lines=False)
assert ds.count() == 5


@pytest.mark.parametrize("pipelined", [False, True])
Expand Down

0 comments on commit 57e4548

Please sign in to comment.