Skip to content

Commit

Permalink
Reformatted dataprofiler/tests using black. (#511)
Browse files Browse the repository at this point in the history
  • Loading branch information
jakleh authored Jul 7, 2022
1 parent 69ddc11 commit 69d7173
Show file tree
Hide file tree
Showing 64 changed files with 10,886 additions and 8,400 deletions.
139 changes: 84 additions & 55 deletions dataprofiler/tests/data_readers/test_avro_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,86 +11,117 @@


class TestAVRODataClass(unittest.TestCase):

@classmethod
def setUpClass(cls):
    """Build the shared AVRO fixtures for the test class.

    Each entry pairs a fixture path with its expected record count; every
    file is additionally mirrored as an in-memory ``BytesIO`` buffer so
    the tests exercise both file-path and byte-stream inputs.
    """
    cls.input_file_path = None
    cls.output_file_path = None
    cls.ss = None

    test_dir = os.path.join(test_root_path, "data")
    cls.input_file_names = [
        dict(path=os.path.join(test_dir, "avro/users.avro"), count=4),
        dict(path=os.path.join(test_dir, "avro/userdata1.avro"), count=1000),
        dict(
            path=os.path.join(
                test_dir, "avro/userdata1_intentionally_mislabled_file.parquet"
            ),
            count=1000,
        ),
        dict(
            path=os.path.join(
                test_dir, "avro/userdata1_intentionally_mislabled_file.csv"
            ),
            count=1000,
        ),
        dict(
            path=os.path.join(
                test_dir, "avro/userdata1_intentionally_mislabled_file.json"
            ),
            count=1000,
        ),
        dict(
            path=os.path.join(
                test_dir, "avro/userdata1_intentionally_mislabled_file.txt"
            ),
            count=1000,
        ),
        dict(
            path=os.path.join(
                test_dir,
                "avro/deflate_compressed_intentionally_mislabeled_file.csv",
            ),
            count=4,
        ),
        dict(
            path=os.path.join(
                test_dir, "avro/snappy_compressed_intentionally_mislabeled_file.csv"
            ),
            count=4,
        ),
    ]

    cls.buffer_list = []
    for input_file in cls.input_file_names:
        # mirror each file as a BytesIO buffer so byte-stream input is covered
        buffer_info = input_file.copy()
        with open(input_file["path"], "rb") as fp:
            buffer_info["path"] = BytesIO(fp.read())
        cls.buffer_list.append(buffer_info)

    cls.file_or_buf_list = cls.input_file_names + cls.buffer_list

@classmethod
def setUp(cls):
    """Rewind every shared BytesIO buffer so each test reads from offset 0."""
    for buffer in cls.buffer_list:
        buffer["path"].seek(0)

def test_is_match(self):
    """
    Determine if the avro file can be automatically identified from
    byte stream or file path
    """
    for input_file in self.file_or_buf_list:
        self.assertTrue(AVROData.is_match(input_file["path"]))

def test_avro_file_identification(self):
    """
    Determine if the avro file can be automatically identified
    """
    for input_file in self.file_or_buf_list:
        input_data_obj = Data(input_file["path"])
        self.assertEqual(input_data_obj.data_type, "avro")

def test_specifying_data_type(self):
    """
    Determine if the avro file can be loaded with manual data_type setting
    """
    for input_file in self.file_or_buf_list:
        input_data_obj = Data(input_file["path"], data_type="avro")
        self.assertEqual(input_data_obj.data_type, "avro")

def test_reload_data(self):
    """
    Determine if the avro file can be reloaded
    """
    for input_file in self.file_or_buf_list:
        input_data_obj = Data(input_file["path"])
        input_data_obj.reload(input_file["path"])
        self.assertEqual(input_data_obj.data_type, "avro")
        # reload must preserve the original input path/buffer reference
        self.assertEqual(input_file["path"], input_data_obj.input_file_path)

def test_data_formats(self):
"""
Determine if the avro file data_formats can be used
"""
for input_file in self.file_or_buf_list:
input_data_obj = Data(input_file['path'])
input_data_obj = Data(input_file["path"])
for data_format in list(input_data_obj._data_formats.keys()):
input_data_obj.data_format = data_format
self.assertEqual(input_data_obj.data_format, data_format)
data = input_data_obj.data
if data_format == "dataframe":
import pandas as pd

self.assertIsInstance(data, pd.DataFrame)
elif data_format in ["records", "json"]:
self.assertIsInstance(data, list)
def test_nested_keys(self):
    """
    Determine if nested keys can be extracted and converted to an AVRO schema
    """
    # renamed from `dict` to avoid shadowing the builtin
    input_dicts = [
        {"name": 1, "favorite_number": 1},
        {"favorite_color": 1, "address": {"streetaddress": 1, "city": 1}},
    ]
    nested_keys = AVROData._get_nested_keys_from_dicts(input_dicts)
    self.assertIsNotNone(nested_keys)
    schema_avro = {
        "namespace": "avro_namespace",
        "name": "avro_filename",
        "type": "record",
        "fields": [
            {"name": "name", "type": ["string", "null"]},
            {"name": "favorite_number", "type": ["string", "null"]},
            {"name": "favorite_color", "type": ["string", "null"]},
            {
                "name": "address",
                "type": [
                    {
                        "namespace": "avro_namespace",
                        "name": "address",
                        "type": "record",
                        "fields": [
                            {"name": "streetaddress", "type": ["string", "null"]},
                            {"name": "city", "type": ["string", "null"]},
                        ],
                    },
                    "null",
                ],
            },
        ],
    }
    schema_avro = AVROData._get_schema_avro(nested_keys, schema_avro)
    self.assertIsNotNone(schema_avro)

def test_len_data(self):
    """Verify len(data) and data.length both report the fixture's record count."""
    for input_file in self.file_or_buf_list:
        data = Data(input_file["path"])
        self.assertEqual(input_file["count"], len(data), msg=input_file["path"])
        self.assertEqual(input_file["count"], data.length, msg=input_file["path"])

def test_file_encoding(self):
"""Tests to ensure file_encoding set to None"""
Expand Down Expand Up @@ -175,5 +204,5 @@ def test_is_structured(self):
self.assertFalse(data.is_structured)


# Run the test suite when this module is executed directly.
if __name__ == "__main__":
    unittest.main()
90 changes: 57 additions & 33 deletions dataprofiler/tests/data_readers/test_base_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,37 +18,58 @@ class TestBaseDataClass(unittest.TestCase):

@classmethod
def setUpClass(cls) -> None:
    """Build fixtures spanning several data types and encodings.

    Each entry records a fixture path with its known encoding and data
    type; every file is also mirrored as a ``BytesIO`` buffer so the
    tests exercise both file-path and byte-stream inputs.
    """
    test_dir = os.path.join(test_root_path, "data")
    cls.input_file_names = [
        dict(
            path=os.path.join(test_dir, "csv/diamonds.csv"),
            encoding="utf-8",
            data_type="csv",
        ),
        dict(
            path=os.path.join(test_dir, "avro/users.avro"),
            encoding=None,
            data_type="avro",
        ),
        dict(
            path=os.path.join(test_dir, "json/iris-utf-16.json"),
            encoding="utf-16",
            data_type="json",
        ),
        dict(
            path=os.path.join(test_dir, "json/iris-utf-32.json"),
            encoding="utf-32",
            data_type="json",
        ),
        dict(
            path=os.path.join(test_dir, "parquet/iris.parq"),
            encoding=None,
            data_type="parquet",
        ),
        dict(
            path=os.path.join(test_dir, "txt/code.txt"),
            encoding="utf-8",
            data_type="text",
        ),
        dict(
            path=os.path.join(test_dir, "txt/empty.txt"),
            encoding="utf-8",
            data_type="text",
        ),
    ]

    for input_file in cls.input_file_names:
        # mirror each file as a BytesIO buffer so byte-stream input is covered
        buffer_info = input_file.copy()
        with open(input_file["path"], "rb") as fp:
            buffer_info["path"] = BytesIO(fp.read())
        # NOTE(review): buffer_list appears to be a class attribute declared
        # outside this view — confirm it is initialized before setUpClass runs
        cls.buffer_list.append(buffer_info)

    cls.file_or_buf_list = cls.input_file_names + cls.buffer_list

@classmethod
def setUp(cls):
    """Rewind every shared BytesIO buffer so each test reads from offset 0."""
    for buffer in cls.buffer_list:
        buffer["path"].seek(0)

def test_can_apply_data_functions(self):
class FakeDataClass:
Expand All @@ -63,11 +84,12 @@ def func1(self):

# if the function exists in BaseData fail the test because the results
# may become inaccurate.
self.assertFalse(hasattr(BaseData, 'func1'))
self.assertFalse(hasattr(BaseData, "func1"))

with self.assertRaisesRegex(AttributeError,
"Neither 'BaseData' nor 'FakeDataClass' "
"objects have attribute 'test'"):
with self.assertRaisesRegex(
AttributeError,
"Neither 'BaseData' nor 'FakeDataClass' " "objects have attribute 'test'",
):
data.test

# validate it will take BaseData attribute over the data attribute
def test_file_encoding(self):
    """Ensure the detected file_encoding matches each fixture's known encoding."""
    for input_file in self.file_or_buf_list:
        # do not test StringIO, avro, parquet
        if isinstance(input_file["path"], StringIO) or input_file["data_type"] in [
            "avro",
            "parquet",
        ]:
            continue

        data = BaseData(input_file_path=input_file["path"], data=None, options={})
        self.assertEqual(
            input_file["encoding"].lower(),
            data.file_encoding.lower(),
            input_file["path"],
        )

    # test when data is specified without input_file_object
    file_encoding = locale.getpreferredencoding(False)
    if file_encoding.lower() in ["ascii", "ansi_x3.4-1968"]:
        file_encoding = "utf-8"
    data = BaseData(input_file_path=None, data=[], options={})
    self.assertEqual(file_encoding, data.file_encoding)
Loading

0 comments on commit 69d7173

Please sign in to comment.