Skip to content

Commit

Permalink
FEAT: add CsvProcessor to python bindings
Browse files Browse the repository at this point in the history
  • Loading branch information
NickCrews committed Jul 29, 2023
1 parent e3e7a42 commit a27a74e
Show file tree
Hide file tree
Showing 9 changed files with 114 additions and 42 deletions.
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
members = ["crates/*"]

[workspace.package]
version = "0.4.0"
# KEEP IN SYNC WITH python/pyproject.toml
version = "0.4.1"
edition = "2021"
rust-version = "1.69"
homepage = "https://github.com/NickCrews/feco3"
documentation = "https://github.com/NickCrews/feco3"
repository = "https://github.com/NickCrews/feco3"
28 changes: 20 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,21 @@
# FECo3

A .FEC file parser in rust, with python bindings
A .FEC file parser in Rust, with Python bindings. The Rust is intended to
be extendable, easy to maintain, and performant. The Python is intended to
be easy to use, with type hints, possible to extend, and to
integrate with the rest of the Python data ecosystem.

Still in alpha.

## Example
## Links

- [Python docs](https://nickcrews.github.io/feco3/), if you want to use the Python API.
- [Rust docs](https://docs.rs/feco3), if you want to use the Rust API.
- [.fec file format reference](https://github.com/NickCrews/feco3/wiki/.fec-File-Format)
if you want to know more about the .fec file format or are interested in writing
your own parser or improving this one.

## Example Python

```python
import pyarrow as pa
Expand All @@ -13,10 +24,17 @@ import feco3
# ruff: noqa: E501

# You can supply a URL or a path to a file.
# Possibly in the future we'll support reading from a file-like object.
src = "https://docquery.fec.gov/dcdev/posted/1002596.fec"
# src = "path/to/file.fec"
# src = pathlib.Path("path/to/file.fec")

# The straightforward way is to just parse to a directory of files,
# one file for each itemization type, eg "csvs/SA11AI.csv", etc
feco3.FecFile(src).to_csvs("csvs/")
feco3.FecFile(src).to_parquets("parquets/")

# Or, you can look at the file at a lower level.
# This doesn't actually read or parse any data yet
fec = feco3.FecFile(src)
print(fec)
Expand Down Expand Up @@ -63,13 +81,7 @@ for batch in batcher:

```

## Documentation

- [Python docs](https://nickcrews.github.io/feco3/), if you want to use the Python API
- [Rust docs](https://docs.rs/feco3), if you want to write to the Rust API
- [.fec file format reference](https://github.com/NickCrews/feco3/wiki/.fec-File-Format)
if you want to know more about the .fec file format or are interested in writing
your own parser or improving this one.

## Related projects

Expand Down
19 changes: 19 additions & 0 deletions crates/feco3_python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,24 @@ impl ParquetProcessor {
}
}

#[pyclass]
struct CsvProcessor(feco3::writers::csv::CSVProcessor);

#[pymethods]
impl CsvProcessor {
#[new]
fn new(out_dir: PathBuf) -> Self {
Self(feco3::writers::csv::CSVProcessor::new(out_dir))
}

fn process(&mut self, fec_file: &mut FecFile) -> PyResult<()> {
match self.0.process(&mut fec_file.0) {
Ok(()) => Ok(()),
Err(e) => Err(to_py_err(e)),
}
}
}

#[pyclass]
struct PyarrowBatcher(feco3::writers::arrow::RecordBatchProcessor);

Expand Down Expand Up @@ -144,6 +162,7 @@ fn _feco3(_py: Python, m: &PyModule) -> PyResult<()> {
pyo3_log::init();
m.add_class::<FecFile>()?;
m.add_class::<ParquetProcessor>()?;
m.add_class::<CsvProcessor>()?;
m.add_class::<PyarrowBatcher>()?;
Ok(())
}
Expand Down
8 changes: 7 additions & 1 deletion python/example.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pyarrow as pa
import feco3
import pyarrow as pa

# ruff: noqa: E501

Expand All @@ -8,6 +8,12 @@
# src = "path/to/file.fec"
# src = pathlib.Path("path/to/file.fec")

# The straightforward way is to just parse to a directory of files,
# one file for each itemization type, eg "csvs/SA11AI.csv", etc
feco3.FecFile(src).to_csvs("csvs/")
feco3.FecFile(src).to_parquets("parquets/")

# Or, you can look at the file at a lower level.
# This doesn't actually read or parse any data yet
fec = feco3.FecFile(src)
print(fec)
Expand Down
3 changes: 2 additions & 1 deletion python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ build-backend = "maturin"
[project]
name = "feco3"
# TODO make this dynamic
version = "0.4.0"
# KEEP IN SYNC WITH ../Cargo.toml
version = "0.4.1"
description = "A Rust-backed Python library for parsing .fec files."
requires-python = ">=3.7"
readme = "README.md"
Expand Down
28 changes: 20 additions & 8 deletions python/src/feco3/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,13 @@
"""FECo3: Python bindings to a .fec file parser written in Rust."""

from __future__ import annotations
from functools import cached_property
from typing import NamedTuple

import os
from functools import cached_property
from pathlib import Path
from typing import TYPE_CHECKING

from typing import TYPE_CHECKING, NamedTuple

from . import _version
from . import _feco3
from . import _feco3, _version

if TYPE_CHECKING:
import pyarrow as pa
Expand Down Expand Up @@ -104,14 +101,22 @@ def cover(self) -> Cover:
filer_committee_id=c.filer_committee_id,
)

def to_parquet(self, out_dir: str | os.PathLike) -> None:
def to_parquets(self, out_dir: str | os.PathLike) -> None:
    """Write every itemization in this FEC file out as parquet.

    One parquet file is produced per record type, eg. ``sa11.parquet``.

    Args:
        out_dir: Passed to the underlying ``ParquetProcessor`` as its
            output directory.
    """
    _feco3.ParquetProcessor(out_dir).process(self._wrapped)

def to_csvs(self, out_dir: str | os.PathLike) -> None:
    """Write every itemization in this FEC file out as CSV.

    One CSV file is produced per record type, eg. ``sa11.csv``.

    Args:
        out_dir: Passed to the underlying ``CsvProcessor`` as its
            output directory.
    """
    _feco3.CsvProcessor(out_dir).process(self._wrapped)

def __repr__(self) -> str:
src_str = f"src={self._src!r}"
return f"{self.__class__.__name__}({src_str})"
Expand Down Expand Up @@ -139,7 +144,14 @@ class PyarrowBatcher:
Iterates an [FecFile][feco3.FecFile] and yields [ItemizationBatch][feco3.ItemizationBatch]s of itemizations.
""" # noqa: E501

def __init__(self, fec_file: FecFile, max_batch_size: int | None = None):
def __init__(self, fec_file: FecFile, max_batch_size: int | None = None) -> None:
"""Create a new PyarrowBatcher.
Args:
fec_file: The [FecFile][feco3.FecFile] to iterate.
max_batch_size: The max rows per [pyarrow.RecordBatch][pyarrow.RecordBatch].
Defaults to 1024 * 1024, which is what rust parquet uses.
"""
self._fec_file = fec_file
if max_batch_size is None:
max_batch_size = DEFAULT_PYARROW_RECORD_BATCH_MAX_SIZE
Expand Down
20 changes: 0 additions & 20 deletions python/test/test_pyarrow.py

This file was deleted.

42 changes: 42 additions & 0 deletions python/test/test_writers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from pathlib import Path

import feco3
import pyarrow as pa

from . import common


def test_pyarrow_batches():
    """PyarrowBatcher yields typed, non-empty batches and exhausts the file."""
    case_path = common.get_case_path("slash_form.fec")
    fec = feco3.FecFile(case_path)
    # Can convert to list
    batches = list(feco3.PyarrowBatcher(fec))
    assert len(batches) > 1
    seen_codes = set()
    for batch in batches:
        assert isinstance(batch, feco3.ItemizationBatch)
        assert isinstance(batch.code, str)
        assert isinstance(batch.records, pa.RecordBatch)
        assert batch.records.num_rows > 0
        assert batch.records.num_columns > 0
        seen_codes.add(batch.code)

    assert seen_codes == {"SA11AI", "SD10", "SC2/10", "SC/10", "SB17"}

    # We have used up the fec file, so iterating again finds no itemizations
    assert list(feco3.PyarrowBatcher(fec)) == []


def test_csvs(tmp_path: Path):
    """Writing the sample file to CSVs produces five ``*.csv`` files."""
    case_path = common.get_case_path("slash_form.fec")
    feco3.FecFile(case_path).to_csvs(tmp_path)
    written = list(tmp_path.glob("*.csv"))
    assert len(written) == 5


def test_parquets(tmp_path: Path):
    """Writing the sample file to parquet produces five ``*.parquet`` files."""
    case_path = common.get_case_path("slash_form.fec")
    feco3.FecFile(case_path).to_parquets(tmp_path)
    written = list(tmp_path.glob("*.parquet"))
    assert len(written) == 5

0 comments on commit a27a74e

Please sign in to comment.