Skip to content

Commit

Permalink
Factor out BeKindRewind
Browse files Browse the repository at this point in the history
  • Loading branch information
cmutel committed Jun 14, 2024
1 parent aa87400 commit 12ff9da
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 123 deletions.
73 changes: 73 additions & 0 deletions bw_simapro_csv/csv_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import itertools
from collections.abc import Iterator
from typing import List
import re

import ftfy

# Characters matched by this pattern are deleted outright by `clean`.
# NOTE(review): these five code points look like the positions that are left
# undefined in Windows-1252 - confirm.
UNDEFINED = re.compile("[\x8d\x81\x8f\x90\x9d]")
# Control characters in the range \x00-\x1f, also deleted by `clean`. Tab (\x09)
# and line feed (\x0a) are not listed, so they are not matched by this pattern.
CONTROL_CHARACTERS = re.compile(
"[\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16"
+ "\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f]"
)
# `clean` only calls `ftfy.fix_text` when at least one of these characters is
# present in the string. Presumably these are typical mojibake markers - TODO confirm.
WARNING_CHARS = "À˜â¿"


def clean(s: str) -> str:
    """Normalize a single raw CSV element.

    In order: the DEL character (U+007F) is turned into a newline, undefined
    and control characters are dropped, the text is passed through
    `ftfy.fix_text` if it contains any of `WARNING_CHARS`, and finally
    surrounding whitespace is stripped.
    """
    # U+007F is the "delete" control character (https://www.ascii-code.com/grid),
    # but SimaPro uses it as a linebreak inside a CSV line, so it is translated
    # rather than removed. olca-simapro-csv does the same thing:
    # https://github.com/GreenDelta/olca-simapro-csv/blob/c11e40e7722f2ecaf62e813eebcc8d0793c8c3ff/src/test/java/org/openlca/simapro/csv/CsvLineTest.java#L53
    text = s.replace("\x7f", "\n")
    text = CONTROL_CHARACTERS.sub("", UNDEFINED.sub("", text))
    looks_garbled = any(marker in text for marker in WARNING_CHARS)
    repaired = ftfy.fix_text(text) if looks_garbled else text
    return repaired.strip()


class BeKindRewind(Iterator):
    """CSV reader which acts as a line by line iterator but which allows for one step backwards.

    Needed because the file we are consuming will sometimes indicate that a logical block is
    finished by using the control word `End`, but other times won't. In that case, our iterator
    is already in a new block. To make it simple to pass the iterator to the next function
    consuming the new block, we rewind it one line.

    Internally this is implemented by caching the last line read, and using `itertools.chain`
    when needed to prepend the cached line to the iterator.

    Parameters
    ----------
    data_iterable : collections.abc.Iterator
        Iterator which returns lists of strings.
    clean_elements : bool, optional
        Do `[clean(elem) for elem in line]` when returning a new line
    offset : int, optional
        Added to the starting line number; `line_no` begins at `offset + 1`.
    """

    def __init__(self, data_iterable: Iterator, clean_elements: bool = True, offset: int = 0):
        self.data_iterable = data_iterable
        # Last line handed out by `__next__`; `None` when there is nothing to rewind to
        self.current = None
        self.clean_elements = clean_elements
        # Line numbers are 1-indexed. This is the number of the line that the
        # next call to `__next__` will return.
        self.line_no = offset + 1

    def __next__(self) -> List[str]:
        """Return the next line, cleaning each element if `clean_elements` is set.

        `StopIteration` from the underlying iterator propagates unchanged.
        """
        self.current = next(self.data_iterable)
        self.line_no += 1
        if self.clean_elements:
            self.current = [clean(elem) for elem in self.current]
        return self.current

    def rewind(self) -> None:
        """Rewinds the iterator by one step, retrieving the element that was
        just returned by the previous call to `__next__`.

        Calling this before the first read, or several times in a row, has no
        further effect: only one line is cached, so only one step back is possible.
        """
        if self.current is None:
            # Nothing to push back, so the iterator position does not change and
            # `line_no` must not change either.
            return
        self.line_no -= 1
        # The cached line is prepended as-is; if `clean_elements` is set it has
        # already been cleaned and will pass through `clean` again when re-read.
        self.data_iterable = itertools.chain((self.current,), self.data_iterable)
        self.current = None
4 changes: 2 additions & 2 deletions bw_simapro_csv/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
substitute_in_formulas,
)
from .units import normalize_units
from .utils import BeKindRewind
from .csv_reader import BeKindRewind


def dummy(data, *args):
Expand Down Expand Up @@ -91,7 +91,7 @@ def __init__(
path_or_stream: Path | StringIO,
encoding: str = "sloppy-windows-1252",
stderr_logs: bool = True,
write_logs: bool = True
write_logs: bool = True,
):
"""Read a SimaPro CSV file object, and parse the contents.
Expand Down
70 changes: 0 additions & 70 deletions bw_simapro_csv/utils.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,11 @@
import itertools
import re
from collections.abc import Iterator
from copy import copy
from datetime import date
from numbers import Number
from typing import Iterable, List, Pattern

import ftfy
from dateutil.parser import parse as dtparse

# Characters matched by this pattern are deleted outright by `clean`.
# NOTE(review): these five code points look like the positions that are left
# undefined in Windows-1252 - confirm.
UNDEFINED = re.compile("[\x8d\x81\x8f\x90\x9d]")
# Control characters in the range \x00-\x1f, also deleted by `clean`. Tab (\x09)
# and line feed (\x0a) are not listed, so they are not matched by this pattern.
CONTROL_CHARACTERS = re.compile(
"[\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16"
+ "\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f]"
)
# `clean` only calls `ftfy.fix_text` when at least one of these characters is
# present in the string. Presumably these are typical mojibake markers - TODO confirm.
WARNING_CHARS = "À˜â¿"


def clean(s: str) -> str:
    """Normalize a single raw CSV element.

    In order: the DEL character (U+007F) is turned into a newline, undefined
    and control characters are dropped, the text is passed through
    `ftfy.fix_text` if it contains any of `WARNING_CHARS`, and finally
    surrounding whitespace is stripped.
    """
    # U+007F is the "delete" control character (https://www.ascii-code.com/grid),
    # but SimaPro uses it as a linebreak inside a CSV line, so it is translated
    # rather than removed. olca-simapro-csv does the same thing:
    # https://github.com/GreenDelta/olca-simapro-csv/blob/c11e40e7722f2ecaf62e813eebcc8d0793c8c3ff/src/test/java/org/openlca/simapro/csv/CsvLineTest.java#L53
    text = s.replace("\x7f", "\n")
    text = CONTROL_CHARACTERS.sub("", UNDEFINED.sub("", text))
    looks_garbled = any(marker in text for marker in WARNING_CHARS)
    repaired = ftfy.fix_text(text) if looks_garbled else text
    return repaired.strip()


def nobraces(s: str) -> str:
"""Remove braces from header section elements"""
Expand Down Expand Up @@ -152,50 +126,6 @@ def alternating_key_value(data: List[list]) -> List[tuple]:
return processed


class BeKindRewind(Iterator):
    """CSV reader which acts as a line by line iterator but which allows for one step backwards.

    Needed because the file we are consuming will sometimes indicate that a logical block is
    finished by using the control word `End`, but other times won't. In that case, our iterator
    is already in a new block. To make it simple to pass the iterator to the next function
    consuming the new block, we rewind it one line.

    Internally this is implemented by caching the last line read, and using `itertools.chain`
    when needed to prepend the cached line to the iterator.

    Parameters
    ----------
    data_iterable : collections.abc.Iterator
        Iterator which returns lists of strings.
    clean_elements : bool, optional
        Do `[clean(elem) for elem in line]` when returning a new line
    offset : int, optional
        Added to the starting line number; `line_no` begins at `offset + 1`.
    """

    def __init__(self, data_iterable: Iterator, clean_elements: bool = True, offset: int = 0):
        self.data_iterable = data_iterable
        # Last line handed out by `__next__`; `None` when there is nothing to rewind to
        self.current = None
        self.clean_elements = clean_elements
        # Line numbers are 1-indexed. This is the number of the line that the
        # next call to `__next__` will return.
        self.line_no = offset + 1

    def __next__(self) -> List[str]:
        """Return the next line, cleaning each element if `clean_elements` is set.

        `StopIteration` from the underlying iterator propagates unchanged.
        """
        self.current = next(self.data_iterable)
        self.line_no += 1
        if self.clean_elements:
            self.current = [clean(elem) for elem in self.current]
        return self.current

    def rewind(self) -> None:
        """Rewinds the iterator by one step, retrieving the element that was
        just returned by the previous call to `__next__`.

        Calling this before the first read, or several times in a row, has no
        further effect: only one line is cached, so only one step back is possible.
        """
        if self.current is None:
            # Nothing to push back, so the iterator position does not change and
            # `line_no` must not change either.
            return
        self.line_no -= 1
        # The cached line is prepended as-is; if `clean_elements` is set it has
        # already been cleaned and will pass through `clean` again when re-read.
        self.data_iterable = itertools.chain((self.current,), self.data_iterable)
        self.current = None


def get_numbers_re(separator: str) -> Pattern:
if separator == ".":
separator = ""
Expand Down
55 changes: 55 additions & 0 deletions tests/test_csv_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import pytest

from bw_simapro_csv.csv_reader import (
BeKindRewind,
clean,
)


def test_rewindable_generator():
    """A single rewind replays the most recently returned element exactly once."""
    reader = BeKindRewind(iter((1, 2, 3, 4, 5)), clean_elements=False)
    for expected in (1, 2, 3):
        assert next(reader) == expected
    reader.rewind()
    for expected in (3, 4, 5):
        assert next(reader) == expected
    # The underlying iterator is exhausted now
    with pytest.raises(StopIteration):
        next(reader)


def test_rewindable_generator_idempotent():
    """Several rewinds in a row step back by one element only."""
    reader = BeKindRewind(iter((1, 2, 3, 4, 5)), clean_elements=False)
    assert next(reader) == 1
    for _ in range(3):
        reader.rewind()
    assert next(reader) == 1
    assert next(reader) == 2


def test_rewindable_generator_rewind_before_iteration():
    """Rewinding before anything was read leaves iteration unaffected."""
    reader = BeKindRewind(iter((1, 2, 3, 4, 5)), clean_elements=False)
    reader.rewind()
    for expected in (1, 2):
        assert next(reader) == expected


def test_rewindable_generator_strip():
    """With the default `clean_elements`, each element of a line is cleaned."""
    rows = [(" a ", "\tb ", "c"), (" 2", "1 ", "3")]
    reader = BeKindRewind(iter(rows))
    first_line = next(reader)
    assert first_line == ["a", "b", "c"]
    second_line = next(reader)
    assert second_line == ["2", "1", "3"]


def test_clean():
    """Exercise `clean` on mis-encoded text, padding, and special characters."""
    # Text containing encoding damage is repaired
    assert clean("ï¾µg") == "ᄉg"
    # Surrounding whitespace is stripped
    assert clean(" \t foo") == "foo"
    assert clean(" \t foo") == "foo"
    # \x8d is one of the characters removed as undefined
    assert clean("Â\x8dg") == "Âg"
    # \x1a is removed as a control character
    assert clean("CO2\x1a") == "CO2"
    # DEL becomes a newline; spelled as an escape because the raw character is invisible
    assert clean("CO\x7f2") == "CO\n2"
51 changes: 0 additions & 51 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@
import pytest

from bw_simapro_csv.utils import (
BeKindRewind,
asnumber,
clean,
get_key_multiline_values,
get_numbers_re,
jump_to_nonempty,
Expand All @@ -14,46 +12,6 @@
)


def test_rewindable_generator():
    """A single rewind replays the most recently returned element exactly once."""
    reader = BeKindRewind(iter((1, 2, 3, 4, 5)), clean_elements=False)
    for expected in (1, 2, 3):
        assert next(reader) == expected
    reader.rewind()
    for expected in (3, 4, 5):
        assert next(reader) == expected
    # The underlying iterator is exhausted now
    with pytest.raises(StopIteration):
        next(reader)


def test_rewindable_generator_idempotent():
    """Several rewinds in a row step back by one element only."""
    reader = BeKindRewind(iter((1, 2, 3, 4, 5)), clean_elements=False)
    assert next(reader) == 1
    for _ in range(3):
        reader.rewind()
    assert next(reader) == 1
    assert next(reader) == 2


def test_rewindable_generator_rewind_before_iteration():
    """Rewinding before anything was read leaves iteration unaffected."""
    reader = BeKindRewind(iter((1, 2, 3, 4, 5)), clean_elements=False)
    reader.rewind()
    for expected in (1, 2):
        assert next(reader) == expected


def test_rewindable_generator_strip():
    """With the default `clean_elements`, each element of a line is cleaned."""
    rows = [(" a ", "\tb ", "c"), (" 2", "1 ", "3")]
    reader = BeKindRewind(iter(rows))
    first_line = next(reader)
    assert first_line == ["a", "b", "c"]
    second_line = next(reader)
    assert second_line == ["2", "1", "3"]


def test_asnumber():
assert asnumber("4.2", ".") == 4.2
assert asnumber("400_404.2", ".") == 400404.2
Expand All @@ -74,15 +32,6 @@ def test_asnumber_error():
asnumber("foo")


def test_clean():
    """Exercise `clean` on mis-encoded text, padding, and special characters."""
    # Text containing encoding damage is repaired
    assert clean("ï¾µg") == "ᄉg"
    # Surrounding whitespace is stripped
    assert clean(" \t foo") == "foo"
    assert clean(" \t foo") == "foo"
    # \x8d is one of the characters removed as undefined
    assert clean("Â\x8dg") == "Âg"
    # \x1a is removed as a control character
    assert clean("CO2\x1a") == "CO2"
    # DEL becomes a newline; spelled as an escape because the raw character is invisible
    assert clean("CO\x7f2") == "CO\n2"


def test_normalize_number_in_formula():
assert normalize_number_in_formula("400_404;2", ";") == "400404.2"
assert normalize_number_in_formula("400_404?2", "?") == "400404.2"
Expand Down

0 comments on commit 12ff9da

Please sign in to comment.