Skip to content

Commit

Permalink
Add validate_csv function
Browse files Browse the repository at this point in the history
  • Loading branch information
DavisRayM committed Nov 22, 2019
1 parent c0f5c3a commit 7703687
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 76 deletions.
16 changes: 8 additions & 8 deletions onadata/libs/tests/utils/test_csv_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,8 +384,7 @@ def test_excel_date_conversion(self):
xl_dates.append(row.get('tdate'))
xl_datetime.append(row.get('now'))

result = csv_import.submit_csv(self.user.username, xform, date_csv)
import ipdb; ipdb.set_trace()
csv_import.submit_csv(self.user.username, xform, date_csv)
# converted dates
conv_dates = [instance.json.get('tdate')
for instance in Instance.objects.filter(
Expand All @@ -400,7 +399,8 @@ def test_excel_date_conversion(self):
[u'6/12/2020 13:20', u'2019-03-11T16:00:51.147+02:00'])
self.assertEqual(
conv_datetime,
[u'2020-06-12T13:20:00.000000', u'2019-03-11T16:00:51.147+02:00'])
[u'2020-06-12T13:20:00.000000',
u'2019-03-11T16:00:51.147000+0200'])
self.assertEqual(conv_dates, ['2019-03-01', '2019-02-26'])

def test_enforces_data_type(self):
Expand All @@ -420,16 +420,16 @@ def test_enforces_data_type(self):
bad_integer_csv)
self.assertEqual(
result.get('error'),
'Unknown integer format: 20.85')
'Unknown integer format(s): 20.85')

# Test date and datetime constraint is enforced
# Test datetime constraint is enforced
bad_date_csv = open(
os.path.join(self.fixtures_dir, 'bad_date.csv'), 'rb')
os.path.join(self.fixtures_dir, 'bad_datetime.csv'), 'rb')
result = csv_import.submit_csv(
self.user.username, self.xform, bad_date_csv)
self.assertEqual(
result.get('error'),
'Unknown date format: 2014-0900. Supported format YYYY-mm-dd')
'Unknown datetime format(s): 2931093293232')

# Test decimal constraint is enforced
xls_file_path = os.path.join(self.fixtures_dir, 'bad_decimal.xlsx')
Expand All @@ -442,4 +442,4 @@ def test_enforces_data_type(self):
bad_decimal_csv)
self.assertEqual(
result.get('error'),
'Unknown decimal format: sdsa')
'Unknown decimal format(s): sdsa')
158 changes: 90 additions & 68 deletions onadata/libs/utils/csv_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from celery import current_task, task
from celery.backends.amqp import BacklogLimitExceeded
from celery.result import AsyncResult
from dateutil.parser import parse
from django.conf import settings
from django.contrib.auth.models import User
from django.core.files.storage import default_storage
Expand Down Expand Up @@ -217,20 +218,16 @@ def submit_csv(username, xform, csv_file, overwrite=False):
# Get the data dictionary
xform_header = xform.get_headers()

missing_col = set(xform_header).difference(csv_header)
addition_col = set(csv_header).difference(xform_header)
missing_col = list(set(xform_header).difference(csv_header))
addition_col = list(set(csv_header).difference(xform_header))

# change to list
missing_col = list(missing_col)
addition_col = list(addition_col)
# remove all metadata columns
missing = [
missing_col = [
col for col in missing_col
if not col.startswith("_") and col not in IGNORED_COLUMNS
]

if not col.startswith('_') and col not in IGNORED_COLUMNS
]
# remove all metadata inside groups
missing = [col for col in missing if '/_' not in col]
missing_col = [col for col in missing_col if '/_' not in col]

# ignore if is multiple select question
for col in csv_header:
Expand All @@ -239,21 +236,19 @@ def submit_csv(username, xform, csv_file, overwrite=False):
if survey_element and \
survey_element.get('type') == MULTIPLE_SELECT_TYPE:
# remove from the missing and additional list
missing = [x for x in missing if not x.startswith(col)]

missing_col = [x for x in missing_col if not x.startswith(col)]
addition_col.remove(col)

# remove headers for repeats that might be missing from csv
missing = sorted([m for m in missing if m.find('[') == -1])

missing_col = sorted([m for m in missing_col if m.find('[') == -1])
# Include additional repeats
addition_col = [a for a in addition_col if a.find('[') == -1]

if missing:
if missing_col:
return async_status(
FAILED, u"Sorry uploaded file does not match the form. "
u"The file is missing the column(s): "
u"{0}.".format(', '.join(missing)))
u"{0}.".format(', '.join(missing_col)))

if overwrite:
xform.instances.filter(deleted_at__isnull=True)\
Expand All @@ -267,61 +262,26 @@ def submit_csv(username, xform, csv_file, overwrite=False):
additions = duplicates = inserts = 0

x_json = json.loads(xform.json)
xl_date_columns = [
dt.get('name') for dt in x_json.get('children')
if dt.get('type') in XLS_DATE_FIELDS]
xl_datetime_columns = [
dt.get('name') for dt in x_json.get('children')
if dt.get('type') in XLS_DATETIME_FIELDS]
xl_integer_columns = [
dt.get('name') for dt in x_json.get('children')
if dt.get('type') == 'integer'
]
xl_decimal_columns = [
dt.get('name') for dt in x_json.get('children')
if dt.get('type') == 'decimal'
]

def get_column_by_type(field_list):
    """Return the names of the form fields whose type is in ``field_list``.

    Only the direct entries of ``x_json['children']`` are inspected.

    :param field_list: Collection of XForm field type names to match.
    :return: List with the ``name`` of every matching child, in form order.
    :rtype: list
    """
    matching_names = []
    for child in x_json.get('children'):
        if child.get('type') in field_list:
            matching_names.append(child.get('name'))
    return matching_names

columns = {
'date': (get_column_by_type(XLS_DATE_FIELDS), parse),
'datetime': (get_column_by_type(XLS_DATETIME_FIELDS), parse),
'integer': (get_column_by_type(['integer']), int),
'decimal': (get_column_by_type(['decimal']), float)
}

try:
for row in csv_reader:
_check_datatype_constraint(
row, xl_integer_columns,
err_msg='Unknown integer format: ',
constraint_check=lambda x: int(x))
_check_datatype_constraint(
row, xl_decimal_columns,
err_msg='Unknown decimal format: ',
constraint_check=lambda x: float(x))

# convert some excel dates, replace / with -
for key in xl_date_columns:
val = row.get(key, '')

if val:
try:
date = datetime.strptime(val, '%m/%d/%Y')
except ValueError:
# TODO: Enforce date datatype constraint
pass
else:
str_date = datetime.strftime(date, '%Y-%m-%d')
row.update({key: str_date})

# convert some excel dates time, replace / with -
for key in xl_datetime_columns:
val = row.get(key, '')
if val:
try:
date_time = datetime.strptime(
val, '%m/%d/%Y %H:%M')
except ValueError:
# TODO: Enforce datetime datatype constraint
pass
else:
str_date_time = datetime.strftime(
date_time, '%Y-%m-%dT%H:%M:%S.%f')
row.update({key: str_date_time})

# TODO: Validate all rows and return errors for the rows with issues only
row = validate_csv(row, columns)
# remove the additional columns
for index in addition_col:
del row[index]
Expand Down Expand Up @@ -516,3 +476,65 @@ def submission_xls_to_csv(xls_file):
csv_writer.writerow(row_values)

return csv_file


def validate_csv(row, columns):
    """Validate one CSV row against the typed columns of an XForm.

    Every non-empty value found in a typed column is passed through that
    datatype's constraint check. When all values of a datatype pass, the row
    is updated in place with the converted values: ``date`` values are
    rendered as ``YYYY-mm-dd``, ``datetime`` values as
    ``YYYY-mm-ddTHH:MM:SS.ffffff`` followed by the UTC offset when the parsed
    value carries one, and any other datatype keeps whatever the constraint
    check returned (e.g. ``int``/``float`` objects).

    :param dict row: A single CSV row keyed by column name. Mutated in place.
    :param dict columns: Maps a datatype label (e.g. ``'date'``,
        ``'integer'``) to a ``(column_names, constraint_check)`` tuple, where
        ``constraint_check`` is a callable that converts a raw value and
        raises ``ValueError`` or ``OverflowError`` when it is malformed.
    :return: The same ``row`` object with validated values converted.
    :rtype: dict
    :raises Exception: With the message
        ``'Unknown <datatype> format(s): <values>'`` listing every value of
        the first datatype that failed its constraint check.
    """
    # Parsed temporal values are serialised back to strings; other datatypes
    # have no entry here and are stored as returned by the constraint check.
    output_formats = {
        'date': '%Y-%m-%d',
        'datetime': '%Y-%m-%dT%H:%M:%S.%f%z',
    }

    for datatype, (column_names, constraint_check) in columns.items():
        validated = {}
        invalid = []

        for key in column_names:
            raw_value = row.get(key, '')
            if not raw_value:
                # Missing or empty cells are not subject to the constraint.
                continue

            try:
                validated[key] = constraint_check(raw_value)
            except (ValueError, OverflowError):
                # OverflowError is raised by some parsers (e.g. dateutil) for
                # out-of-range input; report it like any other bad value.
                invalid.append(str(raw_value))

        if invalid:
            # Plain ``Exception`` is kept so existing callers that catch it
            # and surface the message continue to work unchanged.
            raise Exception(
                'Unknown {} format(s): {}'.format(
                    datatype, ', '.join(invalid)))

        output_format = output_formats.get(datatype)
        if output_format is not None:
            validated = {
                key: value.strftime(output_format)
                for key, value in validated.items()
            }

        row.update(validated)

    return row

0 comments on commit 7703687

Please sign in to comment.