Skip to content

Commit

Permalink
Include a read_to_polars function
Browse files Browse the repository at this point in the history
  • Loading branch information
AdrianDAlessandro committed Oct 1, 2024
1 parent ddc67f4 commit 6096feb
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 1 deletion.
2 changes: 2 additions & 0 deletions csvy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
"""
Python reader/writer for CSV files with YAML header information.
"""

__version__ = "0.2.2"
from .readers import ( # noqa: F401
read_header,
read_metadata,
read_to_array,
read_to_dataframe,
read_to_polars,
)
from .writers import Writer, write, write_header # noqa: F401
60 changes: 59 additions & 1 deletion csvy/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,17 @@
except ModuleNotFoundError:
DataFrame = None # type: ignore
logging.getLogger().debug(
"Pandas is not installed. Reading into a DataFrame will not work."
"Pandas is not installed. Reading into a pd.DataFrame will not work."
)

try:
from polars import DataFrame as PolarsDataFrame
from polars import LazyFrame
except ModuleNotFoundError:
LazyFrame = None # type: ignore
PolarsDataFrame = None # type: ignore
logging.getLogger().debug(
"Polars is not installed. Reading into a pl.DataFrame will not work."
)


Expand Down Expand Up @@ -168,6 +178,54 @@ def read_to_dataframe(
return pd.read_csv(filename, **options), header


def read_to_polars(
    filename: Union[Path, str],
    marker: str = "---",
    csv_options: Optional[Dict[str, Any]] = None,
    yaml_options: Optional[Dict[str, Any]] = None,
    eager: bool = False,
) -> Tuple[Union[LazyFrame, PolarsDataFrame], Dict[str, Any]]:
    """Read a CSVY file into a Polars frame plus its header dictionary.

    The YAML header is parsed with `read_header`; the tabular part is then handed
    to `polars.scan_csv`, which yields a LazyFrame. When `eager` is `True` the
    LazyFrame is collected before being returned, so a polars DataFrame is
    returned instead.

    Any 'skip_rows' or 'comment_prefix' entries supplied in `csv_options` are
    overwritten with values derived from the header.

    Args:
        filename: Name of the file to read.
        marker: The marker characters that indicate the yaml header.
        csv_options: Options to pass to pl.scan_csv. The dictionary passed in is
            copied, not modified.
        yaml_options: Options forwarded as keyword arguments to read_header.
        eager: Whether to collect the LazyFrame before returning it.

    Raises:
        ModuleNotFoundError: If polars is not found.

    Returns:
        Tuple containing: The polars LazyFrame (or the collected DataFrame when
        `eager` is set) and the header as a dictionary.
    """
    # Guard clause: the module-level import fallback leaves LazyFrame as None
    # when polars could not be imported.
    if LazyFrame is None:
        raise ModuleNotFoundError(
            "Module polars is not present. Install it to read data into DataFrame."
        )
    import polars as pl

    if yaml_options is None:
        yaml_options = {}
    header, nlines, comment = read_header(filename, marker=marker, **yaml_options)

    # Work on a copy so the caller's dictionary is never mutated.
    if csv_options is None:
        scan_kwargs = {}
    else:
        scan_kwargs = csv_options.copy()

    # Only the first character of the detected comment string is used as prefix.
    if len(comment) > 0:
        prefix = comment[0]
    else:
        prefix = None
    scan_kwargs["skip_rows"] = nlines
    scan_kwargs["comment_prefix"] = prefix

    frame = pl.scan_csv(filename, **scan_kwargs)
    if not eager:
        return frame, header
    return frame.collect(), header


def read_to_list(
filename: Union[Path, str],
marker: str = "---",
Expand Down
23 changes: 23 additions & 0 deletions tests/test_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,29 @@ def test_read_to_dataframe(data_path):
read_to_dataframe(data_path)


def test_read_to_polars(data_path):
    """Check lazy and eager reads with polars, and the missing-polars error path.

    Args:
        data_path: Fixture providing the path of the CSVY file to read.
    """
    import polars as pl
    from polars.testing import assert_frame_equal

    import csvy.readers as readers
    from csvy.readers import read_to_polars

    # Default call is lazy: a LazyFrame plus a non-empty header dictionary.
    lazy_data, header = read_to_polars(data_path)
    assert isinstance(lazy_data, pl.LazyFrame)
    assert tuple(lazy_data.columns) == ("Date", "WTI")
    assert isinstance(header, dict)
    assert len(header) > 0

    # The eager result must match collecting the lazy result.
    eager_data, _ = read_to_polars(data_path, eager=True)
    assert_frame_equal(lazy_data.collect(), eager_data)

    # Simulate polars being unavailable. The module attribute is restored in a
    # finally clause so the override cannot leak into tests that run afterwards.
    original_lazy_frame = readers.LazyFrame
    readers.LazyFrame = None
    try:
        with pytest.raises(ModuleNotFoundError):
            read_to_polars(data_path)
    finally:
        readers.LazyFrame = original_lazy_frame


def test_read_to_list(array_data_path):
from csvy.readers import read_to_list

Expand Down

0 comments on commit 6096feb

Please sign in to comment.