diff --git a/onadata/libs/tests/utils/test_csv_import.py b/onadata/libs/tests/utils/test_csv_import.py index 52f2587bec..52d8bffa18 100644 --- a/onadata/libs/tests/utils/test_csv_import.py +++ b/onadata/libs/tests/utils/test_csv_import.py @@ -384,8 +384,7 @@ def test_excel_date_conversion(self): xl_dates.append(row.get('tdate')) xl_datetime.append(row.get('now')) - result = csv_import.submit_csv(self.user.username, xform, date_csv) - import ipdb; ipdb.set_trace() + csv_import.submit_csv(self.user.username, xform, date_csv) # converted dates conv_dates = [instance.json.get('tdate') for instance in Instance.objects.filter( @@ -400,7 +399,8 @@ def test_excel_date_conversion(self): [u'6/12/2020 13:20', u'2019-03-11T16:00:51.147+02:00']) self.assertEqual( conv_datetime, - [u'2020-06-12T13:20:00.000000', u'2019-03-11T16:00:51.147+02:00']) + [u'2020-06-12T13:20:00.000000', + u'2019-03-11T16:00:51.147000+0200']) self.assertEqual(conv_dates, ['2019-03-01', '2019-02-26']) def test_enforces_data_type(self): @@ -420,16 +420,16 @@ def test_enforces_data_type(self): bad_integer_csv) self.assertEqual( result.get('error'), - 'Unknown integer format: 20.85') + 'Unknown integer format(s): 20.85') - # Test date and datetime constraint is enforced + # Test datetime constraint is enforced bad_date_csv = open( - os.path.join(self.fixtures_dir, 'bad_date.csv'), 'rb') + os.path.join(self.fixtures_dir, 'bad_datetime.csv'), 'rb') result = csv_import.submit_csv( self.user.username, self.xform, bad_date_csv) self.assertEqual( result.get('error'), - 'Unknown date format: 2014-0900. 
Supported format YYYY-mm-dd') + 'Unknown datetime format(s): 2931093293232') # Test decimal constraint is enforced xls_file_path = os.path.join(self.fixtures_dir, 'bad_decimal.xlsx') @@ -442,4 +442,4 @@ def test_enforces_data_type(self): bad_decimal_csv) self.assertEqual( result.get('error'), - 'Unknown decimal format: sdsa') + 'Unknown decimal format(s): sdsa') diff --git a/onadata/libs/utils/csv_import.py b/onadata/libs/utils/csv_import.py index 2a8c5025d8..0b64091eee 100644 --- a/onadata/libs/utils/csv_import.py +++ b/onadata/libs/utils/csv_import.py @@ -18,6 +18,7 @@ from celery import current_task, task from celery.backends.amqp import BacklogLimitExceeded from celery.result import AsyncResult +from dateutil.parser import parse from django.conf import settings from django.contrib.auth.models import User from django.core.files.storage import default_storage @@ -217,20 +218,16 @@ def submit_csv(username, xform, csv_file, overwrite=False): # Get the data dictionary xform_header = xform.get_headers() - missing_col = set(xform_header).difference(csv_header) - addition_col = set(csv_header).difference(xform_header) + missing_col = list(set(xform_header).difference(csv_header)) + addition_col = list(set(csv_header).difference(xform_header)) - # change to list - missing_col = list(missing_col) - addition_col = list(addition_col) # remove all metadata columns - missing = [ + missing_col = [ col for col in missing_col - if not col.startswith("_") and col not in IGNORED_COLUMNS - ] - + if not col.startswith('_') and col not in IGNORED_COLUMNS + ] # remove all metadata inside groups - missing = [col for col in missing if '/_' not in col] + missing_col = [col for col in missing_col if '/_' not in col] # ignore if is multiple select question for col in csv_header: @@ -239,21 +236,19 @@ def submit_csv(username, xform, csv_file, overwrite=False): if survey_element and \ survey_element.get('type') == MULTIPLE_SELECT_TYPE: # remove from the missing and additional list - missing 
= [x for x in missing if not x.startswith(col)] - + missing_col = [x for x in missing_col if not x.startswith(col)] addition_col.remove(col) # remove headers for repeats that might be missing from csv - missing = sorted([m for m in missing if m.find('[') == -1]) - + missing_col = sorted([m for m in missing_col if m.find('[') == -1]) # Include additional repeats addition_col = [a for a in addition_col if a.find('[') == -1] - if missing: + if missing_col: return async_status( FAILED, u"Sorry uploaded file does not match the form. " u"The file is missing the column(s): " - u"{0}.".format(', '.join(missing))) + u"{0}.".format(', '.join(missing_col))) if overwrite: xform.instances.filter(deleted_at__isnull=True)\ @@ -267,61 +262,26 @@ def submit_csv(username, xform, csv_file, overwrite=False): additions = duplicates = inserts = 0 x_json = json.loads(xform.json) - xl_date_columns = [ - dt.get('name') for dt in x_json.get('children') - if dt.get('type') in XLS_DATE_FIELDS] - xl_datetime_columns = [ - dt.get('name') for dt in x_json.get('children') - if dt.get('type') in XLS_DATETIME_FIELDS] - xl_integer_columns = [ - dt.get('name') for dt in x_json.get('children') - if dt.get('type') == 'integer' - ] - xl_decimal_columns = [ - dt.get('name') for dt in x_json.get('children') - if dt.get('type') == 'decimal' - ] + + def get_column_by_type(field_list): + """ + """ + return [ + dt.get('name') for dt in x_json.get('children') + if dt.get('type') in field_list + ] + + columns = { + 'date': (get_column_by_type(XLS_DATE_FIELDS), parse), + 'datetime': (get_column_by_type(XLS_DATETIME_FIELDS), parse), + 'integer': (get_column_by_type(['integer']), int), + 'decimal': (get_column_by_type(['decimal']), float) + } try: for row in csv_reader: - _check_datatype_constraint( - row, xl_integer_columns, - err_msg='Unknown integer format: ', - constraint_check=lambda x: int(x)) - _check_datatype_constraint( - row, xl_decimal_columns, - err_msg='Unknown decimal format: ', - 
constraint_check=lambda x: float(x)) - - # convert some excel dates, replace / with - - for key in xl_date_columns: - val = row.get(key, '') - - if val: - try: - date = datetime.strptime(val, '%m/%d/%Y') - except ValueError: - # TODO: Enforce date datatype constraint - pass - else: - str_date = datetime.strftime(date, '%Y-%m-%d') - row.update({key: str_date}) - - # convert some excel dates time, replace / with - - for key in xl_datetime_columns: - val = row.get(key, '') - if val: - try: - date_time = datetime.strptime( - val, '%m/%d/%Y %H:%M') - except ValueError: - # TODO: Enforce datetime datatype constraint - pass - else: - str_date_time = datetime.strftime( - date_time, '%Y-%m-%dT%H:%M:%S.%f') - row.update({key: str_date_time}) - + # TODO: Validate all rows and return errors for the rows with issues only + row = validate_csv(row, columns) # remove the additional columns for index in addition_col: del row[index] @@ -516,3 +476,65 @@ def submission_xls_to_csv(xls_file): csv_writer.writerow(row_values) return csv_file + + +def validate_csv(row, columns): + """Validates CSV data according to constraints present on the XForm + + Takes a CSVReader object and an XForms JSON data and validates + that the date, datetime, integer and decimal constraints are enforced + within the CSV data + + :param (str or file): A CSV formatted file with submission rows. + :param dict xform_json: The XForms JSON representation + :return: A tuple containing a boolean representing the validity + of the data and a dict of validated data if successful + otherwise a list of invalid data. 
+    :rtype: tuple
+    """
+    def validate_column_data(column, constraint_check):
+        """Return (True, {key: coerced value}) when every value passes
+        constraint_check, else (False, [list of invalid values])."""
+        invalid_data = []
+        validated_data = {}
+
+        for key in column:
+            value = row.get(key, '')
+
+            if value:
+                try:
+                    value = constraint_check(value)
+                except ValueError:
+                    invalid_data.append(value)
+                else:
+                    validated_data[key] = value
+
+        if invalid_data:
+            return (False, invalid_data)
+        else:
+            return (True, validated_data)
+
+    # Check data doesn't infringe on XForm data constraints
+    for datatype in columns:
+        column, constraint_check = columns.get(datatype)
+        valid, data = validate_column_data(column, constraint_check)
+
+        if valid:
+            if datatype == 'date':
+                for key in data:
+                    value = datetime.strftime(data.get(key), '%Y-%m-%d')
+                    data.update({key: value})
+
+            elif datatype == 'datetime':
+                for key in data:
+                    value = datetime.strftime(
+                        data.get(key), '%Y-%m-%dT%H:%M:%S.%f%z')
+                    data.update({key: value})
+
+            row.update(data)
+        else:
+            raise Exception(
+                'Unknown {} format(s): {}'.format(
+                    datatype, ', '.join(data)))
+
+    return row