Reformatted dataprofiler/tests using black. #511

Merged: 2 commits, Jul 7, 2022
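This PR applies the black autoformatter to the test suite; the substantive changes in the diffs below are single-quoted strings becoming double-quoted and long call expressions being reflowed with trailing commas. A rough sketch of how to reproduce the reformatting with black's standard CLI (the exact black version and any flags used for this PR are not recorded on this page):

    pip install black
    black dataprofiler/tests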
139 changes: 84 additions & 55 deletions dataprofiler/tests/data_readers/test_avro_data.py
@@ -11,86 +11,117 @@
 
 
 class TestAVRODataClass(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
         cls.input_file_path = None
         cls.output_file_path = None
         cls.ss = None
 
-        test_dir = os.path.join(test_root_path, 'data')
+        test_dir = os.path.join(test_root_path, "data")
         cls.input_file_names = [
-            dict(path=os.path.join(test_dir, 'avro/users.avro'), count=4),
-            dict(path=os.path.join(test_dir, 'avro/userdata1.avro'), count=1000),
-            dict(path=os.path.join(test_dir, 'avro/userdata1_intentionally_mislabled_file.parquet'), count=1000),
-            dict(path=os.path.join(test_dir, 'avro/userdata1_intentionally_mislabled_file.csv'), count=1000),
-            dict(path=os.path.join(test_dir, 'avro/userdata1_intentionally_mislabled_file.json'), count=1000),
-            dict(path=os.path.join(test_dir, 'avro/userdata1_intentionally_mislabled_file.txt'), count=1000),
-            dict(path=os.path.join(test_dir, 'avro/deflate_compressed_intentionally_mislabeled_file.csv'), count=4),
-            dict(path=os.path.join(test_dir, 'avro/snappy_compressed_intentionally_mislabeled_file.csv'), count=4),
+            dict(path=os.path.join(test_dir, "avro/users.avro"), count=4),
+            dict(path=os.path.join(test_dir, "avro/userdata1.avro"), count=1000),
+            dict(
+                path=os.path.join(
+                    test_dir, "avro/userdata1_intentionally_mislabled_file.parquet"
+                ),
+                count=1000,
+            ),
+            dict(
+                path=os.path.join(
+                    test_dir, "avro/userdata1_intentionally_mislabled_file.csv"
+                ),
+                count=1000,
+            ),
+            dict(
+                path=os.path.join(
+                    test_dir, "avro/userdata1_intentionally_mislabled_file.json"
+                ),
+                count=1000,
+            ),
+            dict(
+                path=os.path.join(
+                    test_dir, "avro/userdata1_intentionally_mislabled_file.txt"
+                ),
+                count=1000,
+            ),
+            dict(
+                path=os.path.join(
+                    test_dir,
+                    "avro/deflate_compressed_intentionally_mislabeled_file.csv",
+                ),
+                count=4,
+            ),
+            dict(
+                path=os.path.join(
+                    test_dir, "avro/snappy_compressed_intentionally_mislabeled_file.csv"
+                ),
+                count=4,
+            ),
         ]
 
         cls.buffer_list = []
         for input_file in cls.input_file_names:
             # add BytesIO
             buffer_info = input_file.copy()
-            with open(input_file['path'], 'rb') as fp:
-                buffer_info['path'] = BytesIO(fp.read())
+            with open(input_file["path"], "rb") as fp:
+                buffer_info["path"] = BytesIO(fp.read())
             cls.buffer_list.append(buffer_info)
 
         cls.file_or_buf_list = cls.input_file_names + cls.buffer_list
 
     @classmethod
     def setUp(cls):
         for buffer in cls.buffer_list:
-            buffer['path'].seek(0)
+            buffer["path"].seek(0)
 
     def test_is_match(self):
         """
         Determine if the avro file can be automatically identified from
         byte stream or file path
         """
         for input_file in self.file_or_buf_list:
-            self.assertTrue(AVROData.is_match(input_file['path']))
+            self.assertTrue(AVROData.is_match(input_file["path"]))
 
     def test_avro_file_identification(self):
         """
         Determine if the avro file can be automatically identified
         """
         for input_file in self.file_or_buf_list:
-            input_data_obj = Data(input_file['path'])
-            self.assertEqual(input_data_obj.data_type, 'avro')
+            input_data_obj = Data(input_file["path"])
+            self.assertEqual(input_data_obj.data_type, "avro")
 
     def test_specifying_data_type(self):
         """
         Determine if the avro file can be loaded with manual data_type setting
         """
         for input_file in self.file_or_buf_list:
-            input_data_obj = Data(input_file['path'], data_type='avro')
-            self.assertEqual(input_data_obj.data_type, 'avro')
+            input_data_obj = Data(input_file["path"], data_type="avro")
+            self.assertEqual(input_data_obj.data_type, "avro")
 
     def test_reload_data(self):
         """
         Determine if the avro file can be reloaded
         """
         for input_file in self.file_or_buf_list:
-            input_data_obj = Data(input_file['path'])
-            input_data_obj.reload(input_file['path'])
-            self.assertEqual(input_data_obj.data_type, 'avro')
-            self.assertEqual(input_file['path'], input_data_obj.input_file_path)
+            input_data_obj = Data(input_file["path"])
+            input_data_obj.reload(input_file["path"])
+            self.assertEqual(input_data_obj.data_type, "avro")
+            self.assertEqual(input_file["path"], input_data_obj.input_file_path)
 
     def test_data_formats(self):
         """
         Determine if the avro file data_formats can be used
         """
         for input_file in self.file_or_buf_list:
-            input_data_obj = Data(input_file['path'])
+            input_data_obj = Data(input_file["path"])
             for data_format in list(input_data_obj._data_formats.keys()):
                 input_data_obj.data_format = data_format
                 self.assertEqual(input_data_obj.data_format, data_format)
                 data = input_data_obj.data
                 if data_format == "dataframe":
                     import pandas as pd
+
                     self.assertIsInstance(data, pd.DataFrame)
                 elif data_format in ["records", "json"]:
                     self.assertIsInstance(data, list)
@@ -100,35 +131,37 @@ def test_nested_keys(self):
         """
         Determine if the avro file data_formats can be used
         """
-        dict =[
-            {'name': 1, 'favorite_number': 1},
-            {'favorite_color': 1, 'address': {'streetaddress': 1, 'city': 1}}]
+        dict = [
+            {"name": 1, "favorite_number": 1},
+            {"favorite_color": 1, "address": {"streetaddress": 1, "city": 1}},
+        ]
         nested_keys = AVROData._get_nested_keys_from_dicts(dict)
         self.assertIsNotNone(nested_keys)
         schema_avro = {
-            'namespace': 'avro_namespace',
-            'name': 'avro_filename',
-            'type': 'record',
-            'fields': [
-                {'name': 'name', 'type': ['string', 'null']},
-                {'name': 'favorite_number', 'type': ['string', 'null']},
-                {'name': 'favorite_color', 'type': ['string', 'null']},
+            "namespace": "avro_namespace",
+            "name": "avro_filename",
+            "type": "record",
+            "fields": [
+                {"name": "name", "type": ["string", "null"]},
+                {"name": "favorite_number", "type": ["string", "null"]},
+                {"name": "favorite_color", "type": ["string", "null"]},
                 {
-                    'name': 'address',
-                    'type': [{
-                        'namespace': 'avro_namespace',
-                        'name': 'address',
-                        'type': 'record',
-                        'fields': [
-                            {'name': 'streetaddress', 'type': ['string', 'null']},
-                            {'name': 'city', 'type': ['string', 'null']}
-                        ]
-                    },
-                        'null'
-                    ]
-                }
-            ]
-        }
+                    "name": "address",
+                    "type": [
+                        {
+                            "namespace": "avro_namespace",
+                            "name": "address",
+                            "type": "record",
+                            "fields": [
+                                {"name": "streetaddress", "type": ["string", "null"]},
+                                {"name": "city", "type": ["string", "null"]},
+                            ],
+                        },
+                        "null",
+                    ],
+                },
+            ],
+        }
         schema_avro = AVROData._get_schema_avro(nested_keys, schema_avro)
         self.assertIsNotNone(schema_avro)
 
@@ -140,12 +173,8 @@ def test_len_data(self):
 
         for input_file in self.file_or_buf_list:
             data = Data(input_file["path"])
-            self.assertEqual(input_file['count'],
-                             len(data),
-                             msg=input_file['path'])
-            self.assertEqual(input_file['count'],
-                             data.length,
-                             msg=input_file['path'])
+            self.assertEqual(input_file["count"], len(data), msg=input_file["path"])
+            self.assertEqual(input_file["count"], data.length, msg=input_file["path"])
 
     def test_file_encoding(self):
         """Tests to ensure file_encoding set to None"""
@@ -175,5 +204,5 @@ def test_is_structured(self):
         self.assertFalse(data.is_structured)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
90 changes: 57 additions & 33 deletions dataprofiler/tests/data_readers/test_base_data.py
@@ -18,37 +18,58 @@ class TestBaseDataClass(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls) -> None:
-        test_dir = os.path.join(test_root_path, 'data')
+        test_dir = os.path.join(test_root_path, "data")
         cls.input_file_names = [
-            dict(path=os.path.join(test_dir, 'csv/diamonds.csv'),
-                 encoding='utf-8', data_type='csv'),
-            dict(path=os.path.join(test_dir, 'avro/users.avro'),
-                 encoding=None, data_type='avro'),
-            dict(path=os.path.join(test_dir, 'json/iris-utf-16.json'),
-                 encoding='utf-16', data_type='json'),
-            dict(path=os.path.join(test_dir, 'json/iris-utf-32.json'),
-                 encoding='utf-32', data_type='json'),
-            dict(path=os.path.join(test_dir, 'parquet/iris.parq'),
-                 encoding=None, data_type='parquet'),
-            dict(path=os.path.join(test_dir, 'txt/code.txt'),
-                 encoding='utf-8', data_type='text'),
-            dict(path=os.path.join(test_dir, 'txt/empty.txt'),
-                 encoding='utf-8', data_type='text'),
+            dict(
+                path=os.path.join(test_dir, "csv/diamonds.csv"),
+                encoding="utf-8",
+                data_type="csv",
+            ),
+            dict(
+                path=os.path.join(test_dir, "avro/users.avro"),
+                encoding=None,
+                data_type="avro",
+            ),
+            dict(
+                path=os.path.join(test_dir, "json/iris-utf-16.json"),
+                encoding="utf-16",
+                data_type="json",
+            ),
+            dict(
+                path=os.path.join(test_dir, "json/iris-utf-32.json"),
+                encoding="utf-32",
+                data_type="json",
+            ),
+            dict(
+                path=os.path.join(test_dir, "parquet/iris.parq"),
+                encoding=None,
+                data_type="parquet",
+            ),
+            dict(
+                path=os.path.join(test_dir, "txt/code.txt"),
+                encoding="utf-8",
+                data_type="text",
+            ),
+            dict(
+                path=os.path.join(test_dir, "txt/empty.txt"),
+                encoding="utf-8",
+                data_type="text",
+            ),
         ]
 
         for input_file in cls.input_file_names:
             # add BytesIO
             buffer_info = input_file.copy()
-            with open(input_file['path'], 'rb') as fp:
-                buffer_info['path'] = BytesIO(fp.read())
+            with open(input_file["path"], "rb") as fp:
+                buffer_info["path"] = BytesIO(fp.read())
             cls.buffer_list.append(buffer_info)
 
         cls.file_or_buf_list = cls.input_file_names + cls.buffer_list
 
     @classmethod
     def setUp(cls):
         for buffer in cls.buffer_list:
-            buffer['path'].seek(0)
+            buffer["path"].seek(0)
 
     def test_can_apply_data_functions(self):
         class FakeDataClass:
@@ -63,11 +84,12 @@ def func1(self):
 
         # if the function exists in BaseData fail the test because the results
         # may become inaccurate.
-        self.assertFalse(hasattr(BaseData, 'func1'))
+        self.assertFalse(hasattr(BaseData, "func1"))
 
-        with self.assertRaisesRegex(AttributeError,
-                                    "Neither 'BaseData' nor 'FakeDataClass' "
-                                    "objects have attribute 'test'"):
+        with self.assertRaisesRegex(
+            AttributeError,
+            "Neither 'BaseData' nor 'FakeDataClass' " "objects have attribute 'test'",
+        ):
             data.test
 
         # validate it will take BaseData attribute over the data attribute
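Note on the hunk above: black leaves the two adjacent string literals unmerged, because joining implicitly concatenated strings is outside its scope; Python's parser concatenates adjacent literals into a single string at compile time. A minimal illustration (standalone snippet, not code from this repository):

    # Adjacent string literals are joined by the parser into one string.
    msg = "Neither 'BaseData' nor 'FakeDataClass' " "objects have attribute 'test'"
    assert msg == "Neither 'BaseData' nor 'FakeDataClass' objects have attribute 'test'"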
@@ -83,20 +105,22 @@ def test_file_encoding(self):
         """
         for input_file in self.file_or_buf_list:
             # do not test StringIO, avro, parquet
-            if (isinstance(input_file['path'], StringIO)
-                    or input_file['data_type'] in ['avro', 'parquet']):
+            if isinstance(input_file["path"], StringIO) or input_file["data_type"] in [
+                "avro",
+                "parquet",
+            ]:
                 continue
 
-            data = BaseData(input_file_path=input_file['path'], data=None,
-                            options={})
-            self.assertEqual(input_file['encoding'].lower(),
-                             data.file_encoding.lower(),
-                             input_file['path'])
+            data = BaseData(input_file_path=input_file["path"], data=None, options={})
+            self.assertEqual(
+                input_file["encoding"].lower(),
+                data.file_encoding.lower(),
+                input_file["path"],
+            )
 
         # test when data is specified without input_file_object
         file_encoding = locale.getpreferredencoding(False)
-        if file_encoding.lower() in ['ascii', 'ansi_x3.4-1968']:
-            file_encoding = 'utf-8'
-        data = BaseData(input_file_path=None, data=[],
-                        options={})
+        if file_encoding.lower() in ["ascii", "ansi_x3.4-1968"]:
+            file_encoding = "utf-8"
+        data = BaseData(input_file_path=None, data=[], options={})
         self.assertEqual(file_encoding, data.file_encoding)
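To keep a reformat like this from regressing, black can also run in check-only mode, which fails without rewriting files; a sketch, assuming the same target directory (whether this project wires it into CI or pre-commit is not shown in this diff):

    black --check dataprofiler/tests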