Add validate_csv function

onaio · Nov 22, 2019 · 7703687 · 7703687
1 parent c0f5c3a
commit 7703687
Show file tree

Hide file tree

Showing 2 changed files with 98 additions and 76 deletions.
diff --git a/onadata/libs/tests/utils/test_csv_import.py b/onadata/libs/tests/utils/test_csv_import.py
@@ -384,8 +384,7 @@ def test_excel_date_conversion(self):
             xl_dates.append(row.get('tdate'))
             xl_datetime.append(row.get('now'))
 
-        result = csv_import.submit_csv(self.user.username, xform, date_csv)
-        import ipdb; ipdb.set_trace()
+        csv_import.submit_csv(self.user.username, xform, date_csv)
         # converted dates
         conv_dates = [instance.json.get('tdate')
                       for instance in Instance.objects.filter(
@@ -400,7 +399,8 @@ def test_excel_date_conversion(self):
             [u'6/12/2020 13:20', u'2019-03-11T16:00:51.147+02:00'])
         self.assertEqual(
             conv_datetime,
-            [u'2020-06-12T13:20:00.000000', u'2019-03-11T16:00:51.147+02:00'])
+            [u'2020-06-12T13:20:00.000000',
+             u'2019-03-11T16:00:51.147000+0200'])
         self.assertEqual(conv_dates, ['2019-03-01', '2019-02-26'])
 
     def test_enforces_data_type(self):
@@ -420,16 +420,16 @@ def test_enforces_data_type(self):
                                        bad_integer_csv)
         self.assertEqual(
             result.get('error'),
-            'Unknown integer format: 20.85')
+            'Unknown integer format(s): 20.85')
 
-        # Test date and datetime constraint is enforced
+        # Test datetime constraint is enforced
         bad_date_csv = open(
-            os.path.join(self.fixtures_dir, 'bad_date.csv'), 'rb')
+            os.path.join(self.fixtures_dir, 'bad_datetime.csv'), 'rb')
         result = csv_import.submit_csv(
             self.user.username, self.xform, bad_date_csv)
         self.assertEqual(
             result.get('error'),
-            'Unknown date format: 2014-0900. Supported format YYYY-mm-dd')
+            'Unknown datetime format(s): 2931093293232')
 
         # Test decimal constraint is enforced
         xls_file_path = os.path.join(self.fixtures_dir, 'bad_decimal.xlsx')
@@ -442,4 +442,4 @@ def test_enforces_data_type(self):
                                        bad_decimal_csv)
         self.assertEqual(
             result.get('error'),
-            'Unknown decimal format: sdsa')
+            'Unknown decimal format(s): sdsa')
diff --git a/onadata/libs/utils/csv_import.py b/onadata/libs/utils/csv_import.py
@@ -18,6 +18,7 @@
 from celery import current_task, task
 from celery.backends.amqp import BacklogLimitExceeded
 from celery.result import AsyncResult
+from dateutil.parser import parse
 from django.conf import settings
 from django.contrib.auth.models import User
 from django.core.files.storage import default_storage
@@ -217,20 +218,16 @@ def submit_csv(username, xform, csv_file, overwrite=False):
     # Get the data dictionary
     xform_header = xform.get_headers()
 
-    missing_col = set(xform_header).difference(csv_header)
-    addition_col = set(csv_header).difference(xform_header)
+    missing_col = list(set(xform_header).difference(csv_header))
+    addition_col = list(set(csv_header).difference(xform_header))
 
-    # change to list
-    missing_col = list(missing_col)
-    addition_col = list(addition_col)
     # remove all metadata columns
-    missing = [
+    missing_col = [
         col for col in missing_col
-        if not col.startswith("_") and col not in IGNORED_COLUMNS
-    ]
-
+        if not col.startswith('_') and col not in IGNORED_COLUMNS
+        ]
     # remove all metadata inside groups
-    missing = [col for col in missing if '/_' not in col]
+    missing_col = [col for col in missing_col if '/_' not in col]
 
     # ignore if is multiple select question
     for col in csv_header:
@@ -239,21 +236,19 @@ def submit_csv(username, xform, csv_file, overwrite=False):
         if survey_element and \
                 survey_element.get('type') == MULTIPLE_SELECT_TYPE:
             # remove from the missing and additional list
-            missing = [x for x in missing if not x.startswith(col)]
-
+            missing_col = [x for x in missing_col if not x.startswith(col)]
             addition_col.remove(col)
 
     # remove headers for repeats that might be missing from csv
-    missing = sorted([m for m in missing if m.find('[') == -1])
-
+    missing_col = sorted([m for m in missing_col if m.find('[') == -1])
     # Include additional repeats
     addition_col = [a for a in addition_col if a.find('[') == -1]
 
-    if missing:
+    if missing_col:
         return async_status(
             FAILED, u"Sorry uploaded file does not match the form. "
             u"The file is missing the column(s): "
-            u"{0}.".format(', '.join(missing)))
+            u"{0}.".format(', '.join(missing_col)))
 
     if overwrite:
         xform.instances.filter(deleted_at__isnull=True)\
@@ -267,61 +262,26 @@ def submit_csv(username, xform, csv_file, overwrite=False):
     additions = duplicates = inserts = 0
 
     x_json = json.loads(xform.json)
-    xl_date_columns = [
-        dt.get('name') for dt in x_json.get('children')
-        if dt.get('type') in XLS_DATE_FIELDS]
-    xl_datetime_columns = [
-        dt.get('name') for dt in x_json.get('children')
-        if dt.get('type') in XLS_DATETIME_FIELDS]
-    xl_integer_columns = [
-        dt.get('name') for dt in x_json.get('children')
-        if dt.get('type') == 'integer'
-    ]
-    xl_decimal_columns = [
-        dt.get('name') for dt in x_json.get('children')
-        if dt.get('type') == 'decimal'
-    ]
+
+    def get_column_by_type(field_list):
+        """
+        """
+        return [
+            dt.get('name') for dt in x_json.get('children')
+            if dt.get('type') in field_list
+        ]
+
+    columns = {
+        'date': (get_column_by_type(XLS_DATE_FIELDS), parse),
+        'datetime': (get_column_by_type(XLS_DATETIME_FIELDS), parse),
+        'integer': (get_column_by_type(['integer']), int),
+        'decimal': (get_column_by_type(['decimal']), float)
+    }
 
     try:
         for row in csv_reader:
-            _check_datatype_constraint(
-                row, xl_integer_columns,
-                err_msg='Unknown integer format: ',
-                constraint_check=lambda x: int(x))
-            _check_datatype_constraint(
-                row, xl_decimal_columns,
-                err_msg='Unknown decimal format: ',
-                constraint_check=lambda x: float(x))
-
-            # convert some excel dates, replace / with -
-            for key in xl_date_columns:
-                val = row.get(key, '')
-
-                if val:
-                    try:
-                        date = datetime.strptime(val, '%m/%d/%Y')
-                    except ValueError:
-                        # TODO: Enforce date datatype constraint
-                        pass
-                    else:
-                        str_date = datetime.strftime(date, '%Y-%m-%d')
-                        row.update({key: str_date})
-
-            # convert some excel dates time, replace / with -
-            for key in xl_datetime_columns:
-                val = row.get(key, '')
-                if val:
-                    try:
-                        date_time = datetime.strptime(
-                            val, '%m/%d/%Y %H:%M')
-                    except ValueError:
-                        # TODO: Enforce datetime datatype constraint
-                        pass
-                    else:
-                        str_date_time = datetime.strftime(
-                            date_time, '%Y-%m-%dT%H:%M:%S.%f')
-                        row.update({key: str_date_time})
-
+            # TODO: Validate all rows and return errors for the rows with issues only
+            row = validate_csv(row, columns)
             # remove the additional columns
             for index in addition_col:
                 del row[index]
@@ -516,3 +476,65 @@ def submission_xls_to_csv(xls_file):
         csv_writer.writerow(row_values)
 
     return csv_file
+
+
+def validate_csv(row, columns):
+    """Validate and normalise the typed columns of a single CSV row.
+
+    Every non-empty value found under a column listed in ``columns`` is
+    passed to that datatype's constraint check. Values that pass replace
+    the originals in ``row``; dates are rewritten as ``YYYY-mm-dd`` and
+    datetimes as ``YYYY-mm-ddTHH:MM:SS.ffffff`` followed by any UTC offset.
+
+    :param dict row: A CSV row mapping column names to values; it is
+        updated in place.
+    :param dict columns: Maps a datatype name ('date', 'datetime',
+        'integer', 'decimal') to a ``(column_names, constraint_check)``
+        tuple, where ``constraint_check`` converts a value or raises.
+    :return: The same ``row`` with the validated values written back.
+    :rtype: dict
+    :raises Exception: When any value fails its datatype's check; the
+        message lists the offending values.
+    """
+    def validate_column_data(column, constraint_check):
+        """Return ``(True, converted)`` or ``(False, invalid_values)``."""
+        invalid_data = []
+        validated_data = {}
+
+        for key in column:
+            value = row.get(key, '')
+            if value:
+                # dateutil's parse raises OverflowError for out-of-range dates
+                try:
+                    validated_data[key] = constraint_check(value)
+                except (ValueError, OverflowError):
+                    invalid_data.append(value)
+
+        if invalid_data:
+            return (False, invalid_data)
+        return (True, validated_data)
+
+    # strftime patterns used to serialise the parsed date(time) values
+    output_formats = {
+        'date': '%Y-%m-%d',
+        'datetime': '%Y-%m-%dT%H:%M:%S.%f%z',
+    }
+
+    # Check data doesn't infringe on XForm data constraints
+    for datatype in columns:
+        column, constraint_check = columns[datatype]
+        valid, data = validate_column_data(column, constraint_check)
+
+        if not valid:
+            raise Exception(
+                'Unknown {} format(s): {}'.format(
+                    datatype, ', '.join(data)))
+
+        output_format = output_formats.get(datatype)
+        if output_format:
+            data = {key: value.strftime(output_format)
+                    for key, value in data.items()}
+
+        row.update(data)
+
+    return row