FEAT: add CsvProcessor to python bindings

NickCrews · Jul 29, 2023 · a27a74e · a27a74e
1 parent e3e7a42
commit a27a74e
Show file tree

Hide file tree

Showing 9 changed files with 114 additions and 42 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -2,9 +2,9 @@
 members = ["crates/*"]
 
 [workspace.package]
-version = "0.4.0"
+# KEEP IN SYNC WITH python/pyproject.toml
+version = "0.4.1"
 edition = "2021"
 rust-version = "1.69"
 homepage = "https://github.com/NickCrews/feco3"
-documentation = "https://github.com/NickCrews/feco3"
 repository = "https://github.com/NickCrews/feco3"
diff --git a/README.md b/README.md
@@ -1,10 +1,21 @@
 # FECo3
 
-A .FEC file parser in rust, with python bindings
+A .FEC file parser in Rust, with Python bindings. The Rust code is intended to
+be extensible, easy to maintain, and performant. The Python API is intended to
+be easy to use, type-hinted, and extensible, and to integrate well with the
+rest of the Python data ecosystem.
 
 Still in alpha.
 
-## Example
+## Links
+
+- [Python docs](https://nickcrews.github.io/feco3/), if you want to use the Python API.
+- [Rust docs](https://docs.rs/feco3), if you want to use the Rust API.
+- [.fec file format reference](https://github.com/NickCrews/feco3/wiki/.fec-File-Format)
+  if you want to know more about the .fec file format or are interested in writing
+  your own parser or improving this one.
+
+## Example Python
 
 ```python
 import pyarrow as pa
@@ -13,10 +24,17 @@ import feco3
 # ruff: noqa: E501
 
 # You can supply a URL or a path to a file.
+# Possibly in the future we'll support reading from a file-like object.
 src = "https://docquery.fec.gov/dcdev/posted/1002596.fec"
 # src = "path/to/file.fec"
 # src = pathlib.Path("path/to/file.fec")
 
+# The straightforward way is to just parse to a directory of files,
+# one file for each itemization type, eg "csvs/SA11AI.csv", etc
+feco3.FecFile(src).to_csvs("csvs/")
+feco3.FecFile(src).to_parquets("parquets/")
+
+# Or, you can look at the file at a lower level.
 # This doesn't actually read or parse any data yet
 fec = feco3.FecFile(src)
 print(fec)
@@ -63,13 +81,7 @@ for batch in batcher:
 
 ```
 
-## Documentation
 
-- [Python docs](https://nickcrews.github.io/feco3/), if you want to use the Python API
-- [Rust docs](https://docs.rs/feco3), if you want to write to the Rust API
-- [.fec file format reference](https://github.com/NickCrews/feco3/wiki/.fec-File-Format)
-  if you want to know more about the .fec file format or are interested in writing
-  your own parser or improving this one.
 
 ## Related projects
 

diff --git a/crates/feco3_python/src/lib.rs b/crates/feco3_python/src/lib.rs
@@ -111,6 +111,24 @@ impl ParquetProcessor {
     }
 }
 
+/// Python-visible wrapper around [`feco3::writers::csv::CSVProcessor`].
+///
+/// Registered on the `_feco3` extension module so that Python code can
+/// construct it with an output directory and run it over a `FecFile`.
+#[pyclass]
+struct CsvProcessor(feco3::writers::csv::CSVProcessor);
+
+#[pymethods]
+impl CsvProcessor {
+    /// Creates a processor configured with `out_dir` as its output directory.
+    #[new]
+    fn new(out_dir: PathBuf) -> Self {
+        Self(feco3::writers::csv::CSVProcessor::new(out_dir))
+    }
+
+    /// Runs the wrapped CSV processor over `fec_file`.
+    ///
+    /// # Errors
+    ///
+    /// Any error reported by the underlying processor is converted with
+    /// `to_py_err` and returned as the `Err` side of the `PyResult`.
+    fn process(&mut self, fec_file: &mut FecFile) -> PyResult<()> {
+        // `map_err` replaces a `match` that only re-wrapped the error value.
+        self.0.process(&mut fec_file.0).map_err(to_py_err)
+    }
+}
+
 #[pyclass]
 struct PyarrowBatcher(feco3::writers::arrow::RecordBatchProcessor);
 
@@ -144,6 +162,7 @@ fn _feco3(_py: Python, m: &PyModule) -> PyResult<()> {
     pyo3_log::init();
     m.add_class::<FecFile>()?;
     m.add_class::<ParquetProcessor>()?;
+    m.add_class::<CsvProcessor>()?;
     m.add_class::<PyarrowBatcher>()?;
     Ok(())
 }

diff --git a/python/example.py b/python/example.py
@@ -1,5 +1,5 @@
-import pyarrow as pa
 import feco3
+import pyarrow as pa
 
 # ruff: noqa: E501
 
@@ -8,6 +8,12 @@
 # src = "path/to/file.fec"
 # src = pathlib.Path("path/to/file.fec")
 
+# The straightforward way is to just parse to a directory of files,
+# one file for each itemization type, eg "csvs/SA11AI.csv", etc
+feco3.FecFile(src).to_csvs("csvs/")
+feco3.FecFile(src).to_parquets("parquets/")
+
+# Or, you can look at the file at a lower level.
 # This doesn't actually read or parse any data yet
 fec = feco3.FecFile(src)
 print(fec)

diff --git a/python/pyproject.toml b/python/pyproject.toml
@@ -5,7 +5,8 @@ build-backend = "maturin"
 [project]
 name = "feco3"
 # TODO make this dynamic
-version = "0.4.0"
+# KEEP IN SYNC WITH ../Cargo.toml
+version = "0.4.1"
 description = "A Rust-backed Python library for parsing .fec files."
 requires-python = ">=3.7"
 readme = "README.md"

diff --git a/python/src/feco3/__init__.py b/python/src/feco3/__init__.py
@@ -1,16 +1,13 @@
 """FECo3: Python bindings to a .fec file parser written in Rust."""
 
 from __future__ import annotations
-from functools import cached_property
-from typing import NamedTuple
 
 import os
+from functools import cached_property
 from pathlib import Path
-from typing import TYPE_CHECKING
-
+from typing import TYPE_CHECKING, NamedTuple
 
-from . import _version
-from . import _feco3
+from . import _feco3, _version
 
 if TYPE_CHECKING:
     import pyarrow as pa
@@ -104,14 +101,22 @@ def cover(self) -> Cover:
             filer_committee_id=c.filer_committee_id,
         )
 
-    def to_parquet(self, out_dir: str | os.PathLike) -> None:
+    def to_parquets(self, out_dir: str | os.PathLike) -> None:
         """Write all itemizations in this FEC file to parquet files.
 
         There will be one parquet file for each record type, eg. ``sa11.parquet``.
         """
         parser = _feco3.ParquetProcessor(out_dir)
         parser.process(self._wrapped)
 
+    def to_csvs(self, out_dir: str | os.PathLike) -> None:
+        """Dump every itemization in this FEC file into CSV files.
+
+        One CSV file is produced per record type, eg. ``sa11.csv``.
+
+        Args:
+            out_dir: Destination passed to the underlying ``_feco3.CsvProcessor``.
+        """
+        # NOTE(review): the README example names the output by full itemization
+        # code (``csvs/SA11AI.csv``) rather than ``sa11.csv`` -- confirm which
+        # naming is accurate.
+        _feco3.CsvProcessor(out_dir).process(self._wrapped)
+
     def __repr__(self) -> str:
         src_str = f"src={self._src!r}"
         return f"{self.__class__.__name__}({src_str})"
@@ -139,7 +144,14 @@ class PyarrowBatcher:
     Iterates an [FecFile][feco3.FecFile] and yields [ItemizationBatch][feco3.ItemizationBatch]s of itemizations.
     """  # noqa: E501
 
-    def __init__(self, fec_file: FecFile, max_batch_size: int | None = None):
+    def __init__(self, fec_file: FecFile, max_batch_size: int | None = None) -> None:
+        """Create a new PyarrowBatcher.
+
+        Args:
+            fec_file: The [FecFile][feco3.FecFile] to iterate.
+            max_batch_size: The max rows per [pyarrow.RecordBatch][pyarrow.RecordBatch].
+                Defaults to 1024 * 1024, which is what rust parquet uses.
+        """
         self._fec_file = fec_file
         if max_batch_size is None:
             max_batch_size = DEFAULT_PYARROW_RECORD_BATCH_MAX_SIZE

diff --git a/python/test/test_pyarrow.py b/python/test/test_pyarrow.py
diff --git a/python/test/test_writers.py b/python/test/test_writers.py
@@ -0,0 +1,42 @@
+from pathlib import Path
+
+import feco3
+import pyarrow as pa
+
+from . import common
+
+
+def test_pyarrow_batches():
+    """PyarrowBatcher yields well-formed batches and drains the FecFile."""
+    fec_file = feco3.FecFile(common.get_case_path("slash_form.fec"))
+
+    # A batcher is a plain iterable, so it can be materialized into a list.
+    batches = list(feco3.PyarrowBatcher(fec_file))
+    assert len(batches) > 1
+
+    for batch in batches:
+        assert isinstance(batch, feco3.ItemizationBatch)
+        assert isinstance(batch.code, str)
+        assert isinstance(batch.records, pa.RecordBatch)
+        assert batch.records.num_rows > 0
+        assert batch.records.num_columns > 0
+
+    codes = {batch.code for batch in batches}
+    assert codes == {"SA11AI", "SD10", "SC2/10", "SC/10", "SB17"}
+
+    # The first pass consumed the fec file, so a second batcher over the
+    # same object has no itemizations left to yield.
+    assert list(feco3.PyarrowBatcher(fec_file)) == []
+
+
+def test_csvs(tmp_path: Path):
+    """Writing the sample file as CSVs leaves five ``*.csv`` files in ``tmp_path``."""
+    fec_file = feco3.FecFile(common.get_case_path("slash_form.fec"))
+    fec_file.to_csvs(tmp_path)
+    written = list(tmp_path.glob("*.csv"))
+    assert len(written) == 5
+
+
+def test_parquets(tmp_path: Path):
+    """Writing the sample file as parquet leaves five ``*.parquet`` files in ``tmp_path``."""
+    fec_file = feco3.FecFile(common.get_case_path("slash_form.fec"))
+    fec_file.to_parquets(tmp_path)
+    written = list(tmp_path.glob("*.parquet"))
+    assert len(written) == 5