
Commit fff6f5b
Removing errors informed from tests. Adding additional test cases including ones starting from an existing schema.
abroglesc committed Nov 9, 2020
1 parent 96ca4ae commit fff6f5b
Showing 3 changed files with 381 additions and 74 deletions.
13 changes: 1 addition & 12 deletions tests/data_reader.py
@@ -49,9 +49,6 @@ class DataReader:
ERRORS
line: msg
...
-ERRORS INFORMED
-line: msg
-...
SCHEMA
bigquery_schema
END
@@ -65,7 +62,6 @@ class DataReader:
* an optional EXISTING_SCHEMA section contains the existing base
BigQuery schema to build off of
* an optional ERRORS section containing the expected error messages
-* an optional ERRORS INFORMED section containing the expected error
-  messages when the schema is known to schema decoder in advance
* a SCHEMA section containing the expected BigQuery schema
* comment lines start with a '#' character.
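
For orientation, here is a minimal sketch of one test-data chunk in the post-change format (the ERRORS INFORMED section is gone; EXISTING_SCHEMA and ERRORS remain optional). The records, error message, and schema are invented, and the assumption that DataReader accepts a file-like object follows the tests' usage rather than a documented API:

import io
from data_reader import DataReader  # tests/data_reader.py

# A hypothetical chunk in the format described by the docstring above.
chunk_text = '''\
DATA
{ "name": "alice", "age": "30" }
ERRORS
1: illustrative error message for input line 1
SCHEMA
[
  { "mode": "NULLABLE", "name": "age", "type": "INTEGER" },
  { "mode": "NULLABLE", "name": "name", "type": "STRING" }
]
END
'''

reader = DataReader(io.StringIO(chunk_text))  # file-like argument is an assumption
chunk = reader.read_chunk()
print(chunk['records'], chunk['error_map'], chunk['schema'])
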
@@ -136,11 +132,7 @@ def read_chunk(self):
error_flags, errors = self.read_errors_section()
if errors and error_flags:
raise Exception("Unexpected error flags in the first ERRORS section")
-informed_error_flags, informed_errors = self.read_errors_section()
-if informed_errors and "INFORMED" not in informed_error_flags:
-raise Exception("Expected INFORMED flag in the second ERRORS section")
error_map = self.process_errors(errors or [])
-informed_error_map = self.process_errors(informed_errors or [])
schema = self.read_schema_section()
self.read_end_marker()
self.chunk_count += 1
@@ -153,8 +145,6 @@ def read_chunk(self):
'existing_schema': existing_schema,
'errors': errors or [],
'error_map': error_map,
-'informed_errors': informed_errors,
-'informed_error_map': informed_error_map,
'schema': schema
}

@@ -191,7 +181,7 @@ def read_data_section(self):
return (data_flags, records, lineno)

def read_existing_schema_section(self):
"""Returns the JSON string of the schema section.
"""Returns the JSON string of the existing_schema section.
"""

# The next tag must be 'EXISTING_SCHEMA'
@@ -213,7 +203,6 @@ def read_existing_schema_section(self):
self.push_back(line)
break
schema_lines.append(line)
-
return ''.join(schema_lines)
else:
self.push_back(tag_line)
68 changes: 12 additions & 56 deletions tests/test_generate_schema.py
@@ -425,10 +425,10 @@ def chunks(self):

class TestDataChunksFromFile(unittest.TestCase):
def test_all_data_chunks(self):
+self.maxDiff = None
for chunk in ChunksFromDataFile().chunks():
try:
self.verify_data_chunk(chunk)
-self.verify_data_chunk_informed(chunk)
except AssertionError as e:
print("\nError when processing chunk on line {}\n".format(chunk['line']))
raise e
@@ -446,16 +446,22 @@ def verify_data_chunk(self, chunk):
expected_errors = chunk['errors']
expected_error_map = chunk['error_map']
expected_schema = chunk['schema']
+existing_schema = chunk['existing_schema']

print("Test chunk %s, line %s: First record: %s" % (chunk_count, line, records[0]))
print("Test chunk %s, line %s: First record: %s" %
(chunk_count, line, records[0]))
# Generate schema.
generator = SchemaGenerator(
input_format=input_format,
infer_mode=infer_mode,
keep_nulls=keep_nulls,
quoted_values_are_strings=quoted_values_are_strings,
sanitize_names=sanitize_names)
-schema_map, error_logs = generator.deduce_schema(records)
+existing_schema_map = None
+if existing_schema:
+existing_schema_map = bq_schema_to_map(json.loads(existing_schema))
+schema_map, error_logs = generator.deduce_schema(
+records, schema_map=existing_schema_map)
schema = generator.flatten_schema(schema_map)

# Check the schema, preserving order
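
Condensed, the new path through verify_data_chunk amounts to the following sketch; the import locations of SchemaGenerator and bq_schema_to_map are assumptions, and the record and existing schema are invented:

import json
from bigquery_schema_generator.generate_schema import SchemaGenerator  # path assumed
from test_generate_schema import bq_schema_to_map  # helper location assumed

existing_schema = '[{"mode": "NULLABLE", "name": "age", "type": "INTEGER"}]'
records = ['{ "age": "30", "name": "alice" }']  # illustrative JSON lines

generator = SchemaGenerator(input_format='json')
# Seed deduction with the existing schema instead of an empty schema_map.
existing_schema_map = bq_schema_to_map(json.loads(existing_schema))
schema_map, error_logs = generator.deduce_schema(
    records, schema_map=existing_schema_map)
schema = generator.flatten_schema(schema_map)  # ordered BigQuery schema
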
@@ -466,52 +472,6 @@ def verify_data_chunk(self, chunk):
self.assertEqual(len(expected_errors), len(error_logs))
self.assert_error_messages(expected_error_map, error_logs)

-def verify_data_chunk_informed(self, chunk):
-chunk_count = chunk['chunk_count']
-line = chunk['line']
-data_flags = chunk['data_flags']
-input_format = 'csv' if ('csv' in data_flags) else 'json'
-keep_nulls = ('keep_nulls' in data_flags)
-infer_mode = ('infer_mode' in data_flags)
-quoted_values_are_strings = ('quoted_values_are_strings' in data_flags)
-sanitize_names = ('sanitize_names' in data_flags)
-records = chunk['records']
-expected_schema = chunk['schema']
-expected_errors = chunk['informed_errors']
-expected_error_map = chunk['informed_error_map']
-if expected_errors is None:
-expected_errors = chunk['errors']
-expected_error_map = chunk['error_map']
-
-# Check the schema, preserving order
-expected = json.loads(expected_schema, object_pairs_hook=OrderedDict)
-
-print("Test informed chunk %s, line %s: First record: %s" % (chunk_count, line, records[0]))
-
-# Test deduction with preloaded schema
-
-expected_map = bq_schema_to_map(expected)
-generator = SchemaGenerator(
-input_format=input_format,
-infer_mode=infer_mode,
-keep_nulls=keep_nulls,
-quoted_values_are_strings=quoted_values_are_strings,
-sanitize_names=sanitize_names)
-schema_map, error_logs = generator.deduce_schema(records, schema_map=expected_map)
-schema = generator.flatten_schema(schema_map)
-
-# Check the schema, preserving order
-self.assertEqual(expected, schema)
-
-print('informed_expected_errors=',expected_errors,'error_logs=',error_logs)
-self.assertEqual(len(expected_errors), len(error_logs))
-self.assert_error_messages(expected_error_map, error_logs)
-
-# Test roundtrip of schema -> schema_map -> schema
-expected_map = bq_schema_to_map(expected)
-schema = generator.flatten_schema(expected_map)
-self.assertEqual(expected, schema)
-
def assert_error_messages(self, expected_error_map, error_logs):
# Convert the list of errors into a map
error_map = {}
@@ -523,19 +483,15 @@ def assert_error_messages(self, expected_error_map, error_logs):
error_map[line_number] = messages
messages.append(error['msg'])

-# Check that each entry in 'error_logs' is expected. Currently checks
-# only that the number of errors matches on a per line basis.
-# TODO: Look deeper and verify that the error message strings match as
-# well.
for line_number, messages in sorted(error_map.items()):
expected_entry = expected_error_map.get(line_number)
self.assertIsNotNone(expected_entry)
expected_messages = expected_entry['msgs']
self.assertEqual(len(expected_messages), len(messages))
self.assertEqual(expected_messages, messages)
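
For orientation, the two structures compared above have roughly this shape; the message texts are invented, but the 'line' and 'msg' keys are the ones the loop reads:

# One entry per logged problem, as produced by deduce_schema().
error_logs = [
    {'line': 1, 'msg': 'illustrative message about a type mismatch'},
    {'line': 1, 'msg': 'illustrative message about a NULL value'},
]

# The loop above groups entries by line number:
# error_map == {1: ['illustrative message about a type mismatch',
#                   'illustrative message about a NULL value']}
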


class TestBigQuerySchemaToSchemaMap(unittest.TestCase):
-def test_bq_schema_to_map_permutations(self):
+def test_bq_schema_to_map_round_trip_permutations(self):
''' This checks that each possible type of constituted schema, when generated,
then converted to a schema_map, then back to the schema, remains equal.
@@ -562,7 +518,7 @@ def test_bq_schema_to_map_permutations(self):
quoted_values_are_strings=quote_value_are_strings)
flattened = generator.flatten_schema(schema_map)
try:
-self.assertEquals(schema, flattened)
+self.assertEqual(schema, flattened)
except AssertionError as e:
print("test_bq_schema_to_map_permutations failed for case where: "
"bq_entry={}\nschema_generator created with values:"
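
The round-trip property the renamed test exercises can be stated in a few lines (a sketch under the same assumed import locations as above; the schema entry is invented):

from bigquery_schema_generator.generate_schema import SchemaGenerator  # path assumed
from test_generate_schema import bq_schema_to_map  # helper location assumed

schema = [{'mode': 'NULLABLE', 'name': 'height', 'type': 'FLOAT'}]

generator = SchemaGenerator()  # the test permutes the generator flags; defaults here
schema_map = bq_schema_to_map(schema)
flattened = generator.flatten_schema(schema_map)
assert flattened == schema  # schema -> schema_map -> schema is lossless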