Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use read-only openpyxl representation to reduce memory usage #596

Merged
merged 6 commits into from
Mar 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion dev_requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ isort
yapf
black
formencode
lxml
lxml
psutil
1 change: 1 addition & 0 deletions dev_requirements.pip
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ pathspec==0.9.0 # via black
pep517==0.11.0 # via pip-tools
pip-tools==6.3.0 # via -r dev_requirements.in
platformdirs==2.3.0 # via black, pylint
psutil==5.9.0 # via -r dev_requirements.in
pycodestyle==2.7.0 # via flake8
pyflakes==2.3.1 # via flake8
pylint==2.11.1 # via -r dev_requirements.in
Expand Down
4 changes: 2 additions & 2 deletions pyxform/question.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
import re

from pyxform.constants import (
EXTERNAL_INSTANCE_EXTENSIONS,
EXTERNAL_CHOICES_ITEMSET_REF_LABEL,
EXTERNAL_CHOICES_ITEMSET_REF_LABEL_GEOJSON,
EXTERNAL_CHOICES_ITEMSET_REF_VALUE,
EXTERNAL_CHOICES_ITEMSET_REF_VALUE_GEOJSON,
EXTERNAL_CHOICES_ITEMSET_REF_LABEL_GEOJSON,
EXTERNAL_INSTANCE_EXTENSIONS,
)
from pyxform.errors import PyXFormError
from pyxform.question_type_dictionary import QUESTION_TYPE_DICT
Expand Down
7 changes: 3 additions & 4 deletions pyxform/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,13 +190,12 @@ def xls_sheet_to_csv(workbook_path, csv_path, sheet_name):


def xlsx_sheet_to_csv(workbook_path, csv_path, sheet_name):
wb = openpyxl.open(workbook_path)
wb = openpyxl.open(workbook_path, read_only=True, data_only=True)
try:
sheet = wb.get_sheet_by_name(sheet_name)
except KeyError:
return False
if sheet.max_row < 2:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK 👍 Since we can create XLSX from code now with openpyxl I would prefer to create test fixtures from code where possible. Even if it's using openpyxl directly for edge cases that the Markdown methods don't like.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you were saying I shouldn't have created the empty_sheets.xlsx file and instead built it programmatically. I've removed that test because it actually wasn't relevant. I ended up thinking this was about any empty sheet but it's only for select_one_external and the external_choices sheet.

This code is never reached if the external_choices sheet is either empty or has only a header. So I think it's ok not to build a real XLSX file to check those two cases. It seems very strange to me that this re-opens the XLSX again for parsing but I didn't look any deeper.

Copy link
Contributor Author

@lognaturel lognaturel Mar 15, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, right, but the empty sheet case is not supported by md to json. So I'll have the two cases use a real xlsx so it's really what users experience.

return False

with open(csv_path, "w", newline="") as f:
writer = csv.writer(f, quoting=csv.QUOTE_ALL)
mask = [not is_empty(cell.value) for cell in sheet[1]]
Expand All @@ -212,7 +211,7 @@ def xlsx_sheet_to_csv(workbook_path, csv_path, sheet_name):
except TypeError:
continue
writer.writerow(csv_data)

wb.close()
return True


Expand Down
3 changes: 1 addition & 2 deletions pyxform/xls2json.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@
from typing import TYPE_CHECKING

from pyxform import aliases, constants
from pyxform.constants import ROW_FORMAT_STRING
from pyxform.constants import EXTERNAL_INSTANCE_EXTENSIONS
from pyxform.constants import EXTERNAL_INSTANCE_EXTENSIONS, ROW_FORMAT_STRING
from pyxform.errors import PyXFormError
from pyxform.utils import default_is_dynamic, is_valid_xml_tag, levenshtein_distance
from pyxform.validators.pyxform import select_from_file_params
Expand Down
21 changes: 14 additions & 7 deletions pyxform/xls2json_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,15 +175,17 @@ def xlsx_to_dict(path_or_file):
All the keys and leaf elements are strings.
"""
try:
workbook = openpyxl.open(filename=path_or_file, data_only=True)
workbook = openpyxl.open(filename=path_or_file, read_only=True, data_only=True)
except (OSError, BadZipFile, KeyError) as error:
raise PyXFormError("Error reading .xlsx file: %s" % error)

def xlsx_to_dict_normal_sheet(sheet):

# Check for duplicate column headers
column_header_list = list()
for cell in sheet[1]:

first_row = next(sheet.rows, [])
for cell in first_row:
column_header = cell.value
# xls file with 3 columns mostly have a 3 more columns that are
# blank by default or something, skip during check
Expand All @@ -204,16 +206,20 @@ def xlsx_to_dict_normal_sheet(sheet):
if key is None:
continue

value = row[column].value
if isinstance(value, str):
value = value.strip()
try:
value = row[column].value
if isinstance(value, str):
value = value.strip()

if not is_empty(value):
row_dict[key] = xlsx_value_to_str(value)
if not is_empty(value):
row_dict[key] = xlsx_value_to_str(value)
except IndexError:
pass # rows may not have values for every column
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wasn't able to reproduce this IndexError. It seems that openpyxl emits the same length row tuple even if some cells are empty. It would be problematic if it did emit different length tuples, since the columns are accessed by index. Can this try/except be removed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It depends on the structure of the XLSX document. I assume it's more likely to happen with machine-written files (e.g. the one from test_itemset_csv_generated_from_external_choices). I'll add a test explicitly targeting it.


result.append(row_dict)

column_header_list = [key for key in column_header_list if key is not None]

return result, _list_to_dict_list(column_header_list)

result = OrderedDict()
Expand All @@ -236,6 +242,7 @@ def xlsx_to_dict_normal_sheet(sheet):
result[f"{sheetname}_header"],
) = xlsx_to_dict_normal_sheet(sheet)

workbook.close()
return result


Expand Down
Binary file added tests/example_xls/extra_columns.xlsx
Binary file not shown.
Binary file modified tests/example_xls/group.xls
Binary file not shown.
Binary file modified tests/example_xls/group.xlsx
Binary file not shown.
Binary file modified tests/example_xls/specify_other.xls
Binary file not shown.
Binary file modified tests/example_xls/specify_other.xlsx
Binary file not shown.
72 changes: 57 additions & 15 deletions tests/test_external_instances_for_selects.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from dataclasses import dataclass, field

from pyxform.constants import EXTERNAL_INSTANCE_EXTENSIONS
from pyxform.errors import PyXFormError
from pyxform.xls2xform import get_xml_path, xls2xform_convert
from tests.pyxform_test_case import PyxformTestCase
from tests.test_utils.md_table import md_table_to_workbook
Expand Down Expand Up @@ -383,28 +384,16 @@ def test_itemset_csv_generated_from_external_choices(self):
| | select_one state | state | State | |
| | select_one_external city | city | City | state=${state} |
| | select_one_external suburb | suburb | Suburb | state=${state} and city=${city} |
| choices | | | |
| | list_name | name | label |
| | state | nsw | NSW |
| | state | vic | VIC |
| external_choices | | | | | |
| | list_name | name | state | | city |
| | city | Sydney | nsw | | |
| | city | Melbourne | vic | | |
| | suburb | Balmain | nsw | | sydney |
| | suburb | Footscray | vic | empty header | melbourne |
"""
wb = md_table_to_workbook(md)
wb = md_table_to_workbook(md + self.all_choices)
with get_temp_dir() as tmp:
wb_path = os.path.join(tmp, "select_one_external.xlsx")
wb.save(wb_path)
wb.close()
with self.assertLogs("pyxform") as log:
xls2xform_convert(
xlsform_path=wb_path,
xform_path=get_xml_path(wb_path),
validate=True,
pretty_print=False,
enketo=False,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a dealbreaker but I am wondering why these arguments are removed from the test? I specified them so that the test was explicit about how the file should be generated.

Copy link
Contributor Author

@lognaturel lognaturel Mar 15, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mostly wanted to make sure to remove the validate=True because it slows the tests a lot. We always run with Validate before release. It seemed to me the others didn't really matter but I'm happy to bring them back. How strongly do you feel? In this case the pretty_print and enketo values don't change the output, right?

Copy link
Contributor Author

@lognaturel lognaturel Mar 16, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We always run with Validate before release.

Ah, except that these aren't pyxform test cases so the flag change won't affect them. I still think that's better because it runs the tests faster. I also happen to know that the entire "fast external itemset" feature is ignored by JavaRosa/Validate.

)

# Should have written the itemsets.csv file as part of XLSForm conversion.
Expand All @@ -420,6 +409,59 @@ def test_itemset_csv_generated_from_external_choices(self):
# Should have excluded column with "empty header" in the last row.
self.assertEqual('"suburb","Footscray","vic","melbourne"\n', rows[-1])

def test_empty_external_choices__errors(self):
lognaturel marked this conversation as resolved.
Show resolved Hide resolved
md = """
| survey | | | | |
| | type | name | label |choice_filter |
| | select_one state | state | State | |
| | select_one_external city | city | City |state=${state} |
| choices | | | |
| | list_name | name | label |
| | state | nsw | NSW |
| external_choices | | | |
"""
wb = md_table_to_workbook(md)
with get_temp_dir() as tmp:
wb_path = os.path.join(tmp, "empty_sheet.xlsx")
wb.save(wb_path)
wb.close()
try:
xls2xform_convert(
xlsform_path=wb_path,
xform_path=get_xml_path(wb_path),
)
except PyXFormError as e:
self.assertContains(
str(e), "should be an external_choices sheet in this xlsform"
)

def test_external_choices_with_only_header__errors(self):
md = """
| survey | | | | |
| | type | name | label |choice_filter |
| | select_one state | state | State | |
| | select_one_external city | city | City |state=${state} |
| choices | | | |
| | list_name | name | label |
| | state | nsw | NSW |
| external_choices | | | |
| | list_name | name | state | city |
"""
wb = md_table_to_workbook(md)
with get_temp_dir() as tmp:
wb_path = os.path.join(tmp, "empty_sheet.xlsx")
wb.save(wb_path)
wb.close()
try:
xls2xform_convert(
xlsform_path=wb_path,
xform_path=get_xml_path(wb_path),
)
except PyXFormError as e:
self.assertContains(
str(e), "should be an external_choices sheet in this xlsform"
)


class TestInvalidExternalFileInstances(PyxformTestCase):
def test_external_other_extension_instances(self):
Expand All @@ -433,7 +475,7 @@ def test_external_other_extension_instances(self):
| | select_multiple_from_file neighbourhoods.pdf | neighbourhoods | Neighbourhoods |
""", # noqa
errored=True,
error_contains=["should be a choices sheet in this xlsform"],
error__contains=["should be a choices sheet in this xlsform"],
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A not so great finding: error_contains passes no matter what the expected error message is.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I looks like this particular test assertion has been that way for some years. This sort of thing happened a few months ago as well (for warnings__contains, I think). It could be avoided if assertPyxformXform had explicit keyword arguments instead of accepting **kwargs only. Would that refactor be OK with you? I could put it in backlog ticket for now.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a big priority but agreed an issue would be good! 🙏

)

def test_external_choices_sheet_included_instances(self):
Expand Down
42 changes: 40 additions & 2 deletions tests/test_xls2json.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
import os

import psutil
lognaturel marked this conversation as resolved.
Show resolved Hide resolved

from pyxform.xls2json_backends import xlsx_to_dict
from pyxform.xls2xform import xls2xform_convert
from pyxform.xls2xform import get_xml_path, xls2xform_convert
from tests import example_xls, test_output
from tests.pyxform_test_case import PyxformTestCase
from tests.test_utils.md_table import md_table_to_workbook
from tests.utils import get_temp_dir

# Common XLSForms used in below TestCases
CHOICES = """
Expand Down Expand Up @@ -41,7 +45,6 @@


class TestXLS2JSONSheetNameHeuristics(PyxformTestCase):

err_similar_found = "the following sheets with similar names were found"
err_survey_required = "You must have a sheet named 'survey'."
err_choices_required = "There should be a choices sheet in this xlsform."
Expand Down Expand Up @@ -601,6 +604,27 @@ def test_workbook_to_json__optional_sheets_ok(self):
warnings_count=0,
)

def test_xls2xform_convert__e2e_row_with_no_column_value(self):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you remove the code you commented on here, this test should fail.

"""Programmatically-created XLSX files may have rows without column values"""
md = """
| survey | | | | |
| | type | name | label | hint |
| | text | state | State | |
| | text | city | City | A hint |
"""
wb = md_table_to_workbook(md)
with get_temp_dir() as tmp:
wb_path = os.path.join(tmp, "empty_cell.xlsx")
wb.save(wb_path)
wb.close()
xls2xform_convert(
xlsform_path=wb_path,
xform_path=get_xml_path(wb_path),
)

xform_path = os.path.join(tmp, "empty_cell.xml")
self.assertTrue(os.path.exists(xform_path))

def test_xls2xform_convert__e2e_with_settings_misspelling(self):
"""Should warn about settings misspelling when running full pipeline."""
file_name = "extra_sheet_names"
Expand All @@ -617,6 +641,20 @@ def test_xls2xform_convert__e2e_with_settings_misspelling(self):
)
self.assertIn(expected, "\n".join(warnings))

def test_xls2xform_convert__e2e_with_extra_columns__does_not_use_excessive_memory(
self,
):
"""Degenerate form with many blank columns"""
process = psutil.Process(os.getpid())
pre_mem = process.memory_info().rss
xls2xform_convert(
xlsform_path=os.path.join(example_xls.PATH, "extra_columns.xlsx"),
xform_path=os.path.join(test_output.PATH, "extra_columns.xml"),
)
post_mem = process.memory_info().rss
# in v1.8.0, memory usage grew by over 16x
lognaturel marked this conversation as resolved.
Show resolved Hide resolved
self.assertLess(post_mem, pre_mem * 2)

def test_xlsx_to_dict__extra_sheet_names_are_returned_by_parser(self):
"""Should return all sheet names so that later steps can do spellcheck."""
d = xlsx_to_dict(os.path.join(example_xls.PATH, "extra_sheet_names.xlsx"))
Expand Down
47 changes: 44 additions & 3 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,50 @@ def get_temp_file():

@contextmanager
def get_temp_dir():
temp_dir = tempfile.mkdtemp()
temp_dir_prefix = "pyxform_tmp_"
if os.name == "nt":
cleanup_pyxform_temp_files(prefix=temp_dir_prefix)

temp_dir = tempfile.mkdtemp(prefix=temp_dir_prefix)
try:
yield temp_dir
finally:
if os.path.exists(temp_dir):
shutil.rmtree(temp_dir)
try:
if os.path.exists(temp_dir):
shutil.rmtree(temp_dir)
except PermissionError:
truncate_temp_files(temp_dir=temp_dir)


def truncate_temp_files(temp_dir):
"""
Truncate files in a folder, recursing into directories.
"""
# If we can't delete, at least the files can be truncated,
# so that they don't take up disk space until next cleanup.
# Seems to be a Windows-specific error for newly-created files.
temp_root = tempfile.gettempdir()
if os.path.exists(temp_dir):
for f in os.scandir(temp_dir):
if os.path.isdir(f.path):
truncate_temp_files(f.path)
else:
# Check still in temp directory
if f.path.startswith(temp_root):
with open(f.path, mode="w") as _:
pass


def cleanup_pyxform_temp_files(prefix: str):
"""
Try to clean up temp pyxform files from previous test runs.
"""
temp_root = tempfile.gettempdir()
if os.path.exists(temp_root):
for f in os.scandir(temp_root):
if os.path.isdir(f.path):
if f.name.startswith(prefix) and f.path.startswith(temp_root):
try:
shutil.rmtree(f.path)
except PermissionError:
pass