XLSForm · lognaturel · Mar 16, 2022 · Mar 14, 2022 · Mar 14, 2022 · Mar 16, 2022
diff --git a/dev_requirements.in b/dev_requirements.in
@@ -9,4 +9,5 @@ isort
 yapf
 black
 formencode
-lxml
+lxml
+psutil
diff --git a/dev_requirements.pip b/dev_requirements.pip
@@ -23,6 +23,7 @@ pathspec==0.9.0           # via black
 pep517==0.11.0            # via pip-tools
 pip-tools==6.3.0          # via -r dev_requirements.in
 platformdirs==2.3.0       # via black, pylint
+psutil==5.9.0             # via -r dev_requirements.in
 pycodestyle==2.7.0        # via flake8
 pyflakes==2.3.1           # via flake8
 pylint==2.11.1            # via -r dev_requirements.in

diff --git a/pyxform/question.py b/pyxform/question.py
@@ -6,11 +6,11 @@
 import re
 
 from pyxform.constants import (
-    EXTERNAL_INSTANCE_EXTENSIONS,
     EXTERNAL_CHOICES_ITEMSET_REF_LABEL,
+    EXTERNAL_CHOICES_ITEMSET_REF_LABEL_GEOJSON,
     EXTERNAL_CHOICES_ITEMSET_REF_VALUE,
     EXTERNAL_CHOICES_ITEMSET_REF_VALUE_GEOJSON,
-    EXTERNAL_CHOICES_ITEMSET_REF_LABEL_GEOJSON,
+    EXTERNAL_INSTANCE_EXTENSIONS,
 )
 from pyxform.errors import PyXFormError
 from pyxform.question_type_dictionary import QUESTION_TYPE_DICT

diff --git a/pyxform/utils.py b/pyxform/utils.py
@@ -190,13 +190,12 @@ def xls_sheet_to_csv(workbook_path, csv_path, sheet_name):
 
 
 def xlsx_sheet_to_csv(workbook_path, csv_path, sheet_name):
-    wb = openpyxl.open(workbook_path)
+    wb = openpyxl.open(workbook_path, read_only=True, data_only=True)
     try:
         sheet = wb.get_sheet_by_name(sheet_name)
     except KeyError:
         return False
-    if sheet.max_row < 2:
-        return False
+
     with open(csv_path, "w", newline="") as f:
         writer = csv.writer(f, quoting=csv.QUOTE_ALL)
         mask = [not is_empty(cell.value) for cell in sheet[1]]
@@ -212,7 +211,7 @@ def xlsx_sheet_to_csv(workbook_path, csv_path, sheet_name):
             except TypeError:
                 continue
             writer.writerow(csv_data)
-
+    wb.close()
     return True
 
 

diff --git a/pyxform/xls2json.py b/pyxform/xls2json.py
@@ -11,8 +11,7 @@
 from typing import TYPE_CHECKING
 
 from pyxform import aliases, constants
-from pyxform.constants import ROW_FORMAT_STRING
-from pyxform.constants import EXTERNAL_INSTANCE_EXTENSIONS
+from pyxform.constants import EXTERNAL_INSTANCE_EXTENSIONS, ROW_FORMAT_STRING
 from pyxform.errors import PyXFormError
 from pyxform.utils import default_is_dynamic, is_valid_xml_tag, levenshtein_distance
 from pyxform.validators.pyxform import select_from_file_params

diff --git a/pyxform/xls2json_backends.py b/pyxform/xls2json_backends.py
@@ -175,15 +175,17 @@ def xlsx_to_dict(path_or_file):
     All the keys and leaf elements are strings.
     """
     try:
-        workbook = openpyxl.open(filename=path_or_file, data_only=True)
+        workbook = openpyxl.open(filename=path_or_file, read_only=True, data_only=True)
     except (OSError, BadZipFile, KeyError) as error:
         raise PyXFormError("Error reading .xlsx file: %s" % error)
 
     def xlsx_to_dict_normal_sheet(sheet):
 
         # Check for duplicate column headers
         column_header_list = list()
-        for cell in sheet[1]:
+
+        first_row = next(sheet.rows, [])
+        for cell in first_row:
             column_header = cell.value
             # xls file with 3 columns mostly have a 3 more columns that are
             # blank by default or something, skip during check
@@ -204,16 +206,20 @@ def xlsx_to_dict_normal_sheet(sheet):
                 if key is None:
                     continue
 
-                value = row[column].value
-                if isinstance(value, str):
-                    value = value.strip()
+                try:
+                    value = row[column].value
+                    if isinstance(value, str):
+                        value = value.strip()
 
-                if not is_empty(value):
-                    row_dict[key] = xlsx_value_to_str(value)
+                    if not is_empty(value):
+                        row_dict[key] = xlsx_value_to_str(value)
+                except IndexError:
+                    pass  # rows may not have values for every column
 
             result.append(row_dict)
 
         column_header_list = [key for key in column_header_list if key is not None]
+
         return result, _list_to_dict_list(column_header_list)
 
     result = OrderedDict()
@@ -236,6 +242,7 @@ def xlsx_to_dict_normal_sheet(sheet):
                 result[f"{sheetname}_header"],
             ) = xlsx_to_dict_normal_sheet(sheet)
 
+    workbook.close()
     return result
 
 

diff --git a/tests/example_xls/extra_columns.xlsx b/tests/example_xls/extra_columns.xlsx
diff --git a/tests/example_xls/group.xls b/tests/example_xls/group.xls
diff --git a/tests/example_xls/group.xlsx b/tests/example_xls/group.xlsx
diff --git a/tests/example_xls/specify_other.xls b/tests/example_xls/specify_other.xls
diff --git a/tests/example_xls/specify_other.xlsx b/tests/example_xls/specify_other.xlsx
diff --git a/tests/test_external_instances_for_selects.py b/tests/test_external_instances_for_selects.py
@@ -8,6 +8,7 @@
 from dataclasses import dataclass, field
 
 from pyxform.constants import EXTERNAL_INSTANCE_EXTENSIONS
+from pyxform.errors import PyXFormError
 from pyxform.xls2xform import get_xml_path, xls2xform_convert
 from tests.pyxform_test_case import PyxformTestCase
 from tests.test_utils.md_table import md_table_to_workbook
@@ -383,28 +384,16 @@ def test_itemset_csv_generated_from_external_choices(self):
         |        | select_one state           | state  | State  |                                 |
         |        | select_one_external city   | city   | City   | state=${state}                  |
         |        | select_one_external suburb | suburb | Suburb | state=${state} and city=${city} |
-        | choices |           |      |       |
-        |         | list_name | name | label |
-        |         | state     | nsw  | NSW   |
-        |         | state     | vic  | VIC   |
-        | external_choices |           |           |       |              |           |
-        |                  | list_name | name      | state |              | city      |
-        |                  | city      | Sydney    | nsw   |              |           |
-        |                  | city      | Melbourne | vic   |              |           |
-        |                  | suburb    | Balmain   | nsw   |              | sydney    |
-        |                  | suburb    | Footscray | vic   | empty header | melbourne |
         """
-        wb = md_table_to_workbook(md)
+        wb = md_table_to_workbook(md + self.all_choices)
         with get_temp_dir() as tmp:
             wb_path = os.path.join(tmp, "select_one_external.xlsx")
             wb.save(wb_path)
+            wb.close()
             with self.assertLogs("pyxform") as log:
                 xls2xform_convert(
                     xlsform_path=wb_path,
                     xform_path=get_xml_path(wb_path),
-                    validate=True,
-                    pretty_print=False,
-                    enketo=False,
                 )
 
             # Should have written the itemsets.csv file as part of XLSForm conversion.
@@ -420,6 +409,59 @@ def test_itemset_csv_generated_from_external_choices(self):
             # Should have excluded column with "empty header" in the last row.
             self.assertEqual('"suburb","Footscray","vic","melbourne"\n', rows[-1])
 
+    def test_empty_external_choices__errors(self):
+        """Should raise an error when the external_choices sheet has no rows."""
+        md = """
+        | survey           |                          |       |       |               |
+        |                  | type                     | name  | label |choice_filter  |
+        |                  | select_one state         | state | State |               |
+        |                  | select_one_external city | city  | City  |state=${state} |
+        | choices          |                          |       |       |
+        |                  | list_name                | name  | label |
+        |                  | state                    | nsw   | NSW   |
+        | external_choices |                          |       |       |
+        """
+        wb = md_table_to_workbook(md)
+        with get_temp_dir() as tmp:
+            wb_path = os.path.join(tmp, "empty_sheet.xlsx")
+            wb.save(wb_path)
+            wb.close()
+            # Unlike try/except, this also fails the test when nothing is raised.
+            with self.assertRaisesRegex(
+                PyXFormError, "should be an external_choices sheet in this xlsform"
+            ):
+                xls2xform_convert(
+                    xlsform_path=wb_path,
+                    xform_path=get_xml_path(wb_path),
+                )
+
+    def test_external_choices_with_only_header__errors(self):
+        """Should raise an error when external_choices has only a header row."""
+        md = """
+        | survey           |                          |       |       |               |
+        |                  | type                     | name  | label |choice_filter  |
+        |                  | select_one state         | state | State |               |
+        |                  | select_one_external city | city  | City  |state=${state} |
+        | choices          |                          |       |       |
+        |                  | list_name                | name  | label |
+        |                  | state                    | nsw   | NSW   |
+        | external_choices |                          |       |       |
+        |                  | list_name                | name  | state | city          |
+        """
+        wb = md_table_to_workbook(md)
+        with get_temp_dir() as tmp:
+            wb_path = os.path.join(tmp, "empty_sheet.xlsx")
+            wb.save(wb_path)
+            wb.close()
+            # Unlike try/except, this also fails the test when nothing is raised.
+            with self.assertRaisesRegex(
+                PyXFormError, "should be an external_choices sheet in this xlsform"
+            ):
+                xls2xform_convert(
+                    xlsform_path=wb_path,
+                    xform_path=get_xml_path(wb_path),
+                )
+
 
 class TestInvalidExternalFileInstances(PyxformTestCase):
     def test_external_other_extension_instances(self):
@@ -433,7 +475,7 @@ def test_external_other_extension_instances(self):
             |        | select_multiple_from_file neighbourhoods.pdf | neighbourhoods | Neighbourhoods |
             """,  # noqa
             errored=True,
-            error_contains=["should be a choices sheet in this xlsform"],
+            error__contains=["should be a choices sheet in this xlsform"],
         )
 
     def test_external_choices_sheet_included_instances(self):

diff --git a/tests/test_xls2json.py b/tests/test_xls2json.py
@@ -1,9 +1,13 @@
 import os
 
+import psutil
+
 from pyxform.xls2json_backends import xlsx_to_dict
-from pyxform.xls2xform import xls2xform_convert
+from pyxform.xls2xform import get_xml_path, xls2xform_convert
 from tests import example_xls, test_output
 from tests.pyxform_test_case import PyxformTestCase
+from tests.test_utils.md_table import md_table_to_workbook
+from tests.utils import get_temp_dir
 
 # Common XLSForms used in below TestCases
 CHOICES = """
@@ -41,7 +45,6 @@
 
 
 class TestXLS2JSONSheetNameHeuristics(PyxformTestCase):
-
     err_similar_found = "the following sheets with similar names were found"
     err_survey_required = "You must have a sheet named 'survey'."
     err_choices_required = "There should be a choices sheet in this xlsform."
@@ -601,6 +604,27 @@ def test_workbook_to_json__optional_sheets_ok(self):
             warnings_count=0,
         )
 
+    def test_xls2xform_convert__e2e_row_with_no_column_value(self):
+        """Should convert an XLSX whose rows may be missing values for some columns."""
+        md = """
+        | survey |        |        |        |         |
+        |        | type   | name   | label  | hint    |
+        |        | text   | state  | State  |         |
+        |        | text   | city   | City   | A hint  |
+        """
+        workbook = md_table_to_workbook(md)
+        with get_temp_dir() as tmp_dir:
+            xlsx_path = os.path.join(tmp_dir, "empty_cell.xlsx")
+            workbook.save(xlsx_path)
+            workbook.close()
+            xls2xform_convert(
+                xlsform_path=xlsx_path,
+                xform_path=get_xml_path(xlsx_path),
+            )
+
+            # The test only checks that the output XForm file was written.
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, "empty_cell.xml")))
+
     def test_xls2xform_convert__e2e_with_settings_misspelling(self):
         """Should warn about settings misspelling when running full pipeline."""
         file_name = "extra_sheet_names"
@@ -617,6 +641,20 @@ def test_xls2xform_convert__e2e_with_settings_misspelling(self):
         )
         self.assertIn(expected, "\n".join(warnings))
 
+    def test_xls2xform_convert__e2e_with_extra_columns__does_not_use_excessive_memory(
+        self,
+    ):
+        """Converting a form with many blank columns should not balloon memory use."""
+        proc = psutil.Process(os.getpid())
+        rss_before = proc.memory_info().rss
+        xls2xform_convert(
+            xlsform_path=os.path.join(example_xls.PATH, "extra_columns.xlsx"),
+            xform_path=os.path.join(test_output.PATH, "extra_columns.xml"),
+        )
+        rss_after = proc.memory_info().rss
+        # Regression guard: in v1.8.0, memory usage grew by over 16x.
+        self.assertLess(rss_after, rss_before * 2)
+
     def test_xlsx_to_dict__extra_sheet_names_are_returned_by_parser(self):
         """Should return all sheet names so that later steps can do spellcheck."""
         d = xlsx_to_dict(os.path.join(example_xls.PATH, "extra_sheet_names.xlsx"))

diff --git a/tests/utils.py b/tests/utils.py
@@ -66,9 +66,50 @@ def get_temp_file():
 
 @contextmanager
 def get_temp_dir():
-    temp_dir = tempfile.mkdtemp()
+    """Yield a new temp dir; delete it on exit, truncating files on PermissionError."""
+    temp_dir_prefix = "pyxform_tmp_"
+    if os.name == "nt":
+        cleanup_pyxform_temp_files(prefix=temp_dir_prefix)
+    temp_dir = tempfile.mkdtemp(prefix=temp_dir_prefix)
     try:
         yield temp_dir
     finally:
-        if os.path.exists(temp_dir):
-            shutil.rmtree(temp_dir)
+        try:
+            if os.path.exists(temp_dir):
+                shutil.rmtree(temp_dir)
+        except PermissionError:
+            truncate_temp_files(temp_dir=temp_dir)
+
+
+def truncate_temp_files(temp_dir):
+    """
+    Empty every file under temp_dir, descending into sub-directories.
+
+    Fallback for when deletion fails (seems to be a Windows-specific error for
+    newly-created files): truncated files take up no disk space until next cleanup.
+    """
+    if not os.path.exists(temp_dir):
+        return
+    temp_root = tempfile.gettempdir()
+    for entry in os.scandir(temp_dir):
+        if os.path.isdir(entry.path):
+            truncate_temp_files(entry.path)
+        elif entry.path.startswith(temp_root):  # only touch files in the temp root
+            # Opening for write truncates the file to zero length.
+            with open(entry.path, mode="w"):
+                pass
+
+
+def cleanup_pyxform_temp_files(prefix: str):
+    """Best-effort removal of temp directories left behind by previous test runs."""
+    temp_root = tempfile.gettempdir()
+    if not os.path.exists(temp_root):
+        return
+    for entry in os.scandir(temp_root):
+        if not os.path.isdir(entry.path):
+            continue
+        if entry.name.startswith(prefix) and entry.path.startswith(temp_root):
+            try:
+                shutil.rmtree(entry.path)
+            except PermissionError:
+                pass  # best-effort: skip directories that can't be removed