From 6096feb11ceb356b0077eddecbc4f71cf281b0a0 Mon Sep 17 00:00:00 2001 From: Adrian D'Alessandro Date: Tue, 1 Oct 2024 13:29:54 +0100 Subject: [PATCH] Include a read_to_polars function --- csvy/__init__.py | 2 ++ csvy/readers.py | 60 +++++++++++++++++++++++++++++++++++++++++++++- tests/test_read.py | 23 ++++++++++++++++++ 3 files changed, 84 insertions(+), 1 deletion(-) diff --git a/csvy/__init__.py b/csvy/__init__.py index adb2a2b..9996fa1 100644 --- a/csvy/__init__.py +++ b/csvy/__init__.py @@ -1,11 +1,13 @@ """ Python reader/writer for CSV files with YAML header information. """ + __version__ = "0.2.2" from .readers import ( # noqa: F401 read_header, read_metadata, read_to_array, read_to_dataframe, + read_to_polars, ) from .writers import Writer, write, write_header # noqa: F401 diff --git a/csvy/readers.py b/csvy/readers.py index e51bcd9..90f6dc3 100644 --- a/csvy/readers.py +++ b/csvy/readers.py @@ -17,7 +17,17 @@ except ModuleNotFoundError: DataFrame = None # type: ignore logging.getLogger().debug( - "Pandas is not installed. Reading into a DataFrame will not work." + "Pandas is not installed. Reading into a pd.DataFrame will not work." + ) + +try: + from polars import DataFrame as PolarsDataFrame + from polars import LazyFrame +except ModuleNotFoundError: + LazyFrame = None # type: ignore + PolarsDataFrame = None # type: ignore + logging.getLogger().debug( + "Polars is not installed. Reading into a pl.DataFrame will not work." ) @@ -168,6 +178,54 @@ def read_to_dataframe( return pd.read_csv(filename, **options), header +def read_to_polars( + filename: Union[Path, str], + marker: str = "---", + csv_options: Optional[Dict[str, Any]] = None, + yaml_options: Optional[Dict[str, Any]] = None, + eager: bool = False, +) -> Tuple[Union[LazyFrame, PolarsDataFrame], Dict[str, Any]]: + """Reads a CSVY file into dict with the header and a Polars LazyFrame with the data. + + This uses the `scan_csv` method from Polars to read the data. 
This returns a polars + LazyFrame, which means the data is not loaded into memory until it is needed. To + load the data into memory, set the `eager` parameter to `True`. + + Any 'skip_rows' and 'comment_prefix' arguments provided in the 'csv_options' + dictionary will be ignored. + + Args: + filename: Name of the file to read. + marker: The marker characters that indicate the yaml header. + csv_options: Options to pass to pl.scan_csv. + yaml_options: Options to pass to yaml.safe_load. + eager: Whether to load the data into memory. + + Raises: + ModuleNotFoundError: If polars is not found. + + Returns: + Tuple containing: The polars LazyFrame and the header as a dictionary. + """ + if LazyFrame is None: + raise ModuleNotFoundError( + "Module polars is not present. Install it to read data into DataFrame." + ) + import polars as pl + + yaml_options = yaml_options if yaml_options is not None else {} + header, nlines, comment = read_header(filename, marker=marker, **yaml_options) + + options = csv_options.copy() if csv_options is not None else {} + options["skip_rows"] = nlines + options["comment_prefix"] = comment[0] if len(comment) >= 1 else None + + lf = pl.scan_csv(filename, **options) + if eager: + return lf.collect(), header + return lf, header + + def read_to_list( filename: Union[Path, str], marker: str = "---", diff --git a/tests/test_read.py b/tests/test_read.py index 32bf32d..df39077 100644 --- a/tests/test_read.py +++ b/tests/test_read.py @@ -84,6 +84,29 @@ def test_read_to_dataframe(data_path): read_to_dataframe(data_path) +def test_read_to_polars(data_path): + import polars as pl + from polars.testing import assert_frame_equal + + from csvy.readers import read_to_polars + + lazy_data, header = read_to_polars(data_path) + assert isinstance(lazy_data, pl.LazyFrame) + assert tuple(lazy_data.columns) == ("Date", "WTI") + assert isinstance(header, dict) + assert len(header) > 0 + + eager_data, _ = read_to_polars(data_path, eager=True) + 
assert_frame_equal(lazy_data.collect(), eager_data) + + import csvy.readers as readers + + readers.LazyFrame = None + + with pytest.raises(ModuleNotFoundError): + read_to_polars(data_path) + + def test_read_to_list(array_data_path): from csvy.readers import read_to_list