Reformatted dataprofiler/tests using black. #511

Merged: 2 commits, Jul 7, 2022
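This PR applies the black autoformatter to the test suite; the substantive changes in the diffs below are single-quoted strings becoming double-quoted and long call expressions being reflowed with trailing commas. A rough sketch of how to reproduce the reformatting with black's standard CLI (the exact black version and any flags used for this PR are not recorded on this page):

    pip install black
    black dataprofiler/tests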
139 changes: 84 additions & 55 deletions dataprofiler/tests/data_readers/test_avro_data.py
@@ -11,86 +11,117 @@
 
 
 class TestAVRODataClass(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
         cls.input_file_path = None
         cls.output_file_path = None
         cls.ss = None
 
-        test_dir = os.path.join(test_root_path, 'data')
+        test_dir = os.path.join(test_root_path, "data")
         cls.input_file_names = [
-            dict(path=os.path.join(test_dir, 'avro/users.avro'), count=4),
-            dict(path=os.path.join(test_dir, 'avro/userdata1.avro'), count=1000),
-            dict(path=os.path.join(test_dir, 'avro/userdata1_intentionally_mislabled_file.parquet'), count=1000),
-            dict(path=os.path.join(test_dir, 'avro/userdata1_intentionally_mislabled_file.csv'), count=1000),
-            dict(path=os.path.join(test_dir, 'avro/userdata1_intentionally_mislabled_file.json'), count=1000),
-            dict(path=os.path.join(test_dir, 'avro/userdata1_intentionally_mislabled_file.txt'), count=1000),
-            dict(path=os.path.join(test_dir, 'avro/deflate_compressed_intentionally_mislabeled_file.csv'), count=4),
-            dict(path=os.path.join(test_dir, 'avro/snappy_compressed_intentionally_mislabeled_file.csv'), count=4),
+            dict(path=os.path.join(test_dir, "avro/users.avro"), count=4),
+            dict(path=os.path.join(test_dir, "avro/userdata1.avro"), count=1000),
+            dict(
+                path=os.path.join(
+                    test_dir, "avro/userdata1_intentionally_mislabled_file.parquet"
+                ),
+                count=1000,
+            ),
+            dict(
+                path=os.path.join(
+                    test_dir, "avro/userdata1_intentionally_mislabled_file.csv"
+                ),
+                count=1000,
+            ),
+            dict(
+                path=os.path.join(
+                    test_dir, "avro/userdata1_intentionally_mislabled_file.json"
+                ),
+                count=1000,
+            ),
+            dict(
+                path=os.path.join(
+                    test_dir, "avro/userdata1_intentionally_mislabled_file.txt"
+                ),
+                count=1000,
+            ),
+            dict(
+                path=os.path.join(
+                    test_dir,
+                    "avro/deflate_compressed_intentionally_mislabeled_file.csv",
+                ),
+                count=4,
+            ),
+            dict(
+                path=os.path.join(
+                    test_dir, "avro/snappy_compressed_intentionally_mislabeled_file.csv"
+                ),
+                count=4,
+            ),
         ]
 
         cls.buffer_list = []
         for input_file in cls.input_file_names:
             # add BytesIO
             buffer_info = input_file.copy()
-            with open(input_file['path'], 'rb') as fp:
-                buffer_info['path'] = BytesIO(fp.read())
+            with open(input_file["path"], "rb") as fp:
+                buffer_info["path"] = BytesIO(fp.read())
             cls.buffer_list.append(buffer_info)
 
         cls.file_or_buf_list = cls.input_file_names + cls.buffer_list
 
     @classmethod
     def setUp(cls):
         for buffer in cls.buffer_list:
-            buffer['path'].seek(0)
+            buffer["path"].seek(0)
 
     def test_is_match(self):
         """
         Determine if the avro file can be automatically identified from
         byte stream or file path
         """
         for input_file in self.file_or_buf_list:
-            self.assertTrue(AVROData.is_match(input_file['path']))
+            self.assertTrue(AVROData.is_match(input_file["path"]))
 
     def test_avro_file_identification(self):
         """
         Determine if the avro file can be automatically identified
         """
         for input_file in self.file_or_buf_list:
-            input_data_obj = Data(input_file['path'])
-            self.assertEqual(input_data_obj.data_type, 'avro')
+            input_data_obj = Data(input_file["path"])
+            self.assertEqual(input_data_obj.data_type, "avro")
 
     def test_specifying_data_type(self):
         """
         Determine if the avro file can be loaded with manual data_type setting
         """
         for input_file in self.file_or_buf_list:
-            input_data_obj = Data(input_file['path'], data_type='avro')
-            self.assertEqual(input_data_obj.data_type, 'avro')
+            input_data_obj = Data(input_file["path"], data_type="avro")
+            self.assertEqual(input_data_obj.data_type, "avro")
 
     def test_reload_data(self):
         """
         Determine if the avro file can be reloaded
         """
         for input_file in self.file_or_buf_list:
-            input_data_obj = Data(input_file['path'])
-            input_data_obj.reload(input_file['path'])
-            self.assertEqual(input_data_obj.data_type, 'avro')
-            self.assertEqual(input_file['path'], input_data_obj.input_file_path)
+            input_data_obj = Data(input_file["path"])
+            input_data_obj.reload(input_file["path"])
+            self.assertEqual(input_data_obj.data_type, "avro")
+            self.assertEqual(input_file["path"], input_data_obj.input_file_path)
 
     def test_data_formats(self):
         """
         Determine if the avro file data_formats can be used
         """
         for input_file in self.file_or_buf_list:
-            input_data_obj = Data(input_file['path'])
+            input_data_obj = Data(input_file["path"])
             for data_format in list(input_data_obj._data_formats.keys()):
                 input_data_obj.data_format = data_format
                 self.assertEqual(input_data_obj.data_format, data_format)
                 data = input_data_obj.data
                 if data_format == "dataframe":
                     import pandas as pd
+
                     self.assertIsInstance(data, pd.DataFrame)
                 elif data_format in ["records", "json"]:
                     self.assertIsInstance(data, list)
@@ -100,35 +131,37 @@ def test_nested_keys(self):
         """
         Determine if the avro file data_formats can be used
         """
-        dict =[
-            {'name': 1, 'favorite_number': 1},
-            {'favorite_color': 1, 'address': {'streetaddress': 1, 'city': 1}}]
+        dict = [
+            {"name": 1, "favorite_number": 1},
+            {"favorite_color": 1, "address": {"streetaddress": 1, "city": 1}},
+        ]
         nested_keys = AVROData._get_nested_keys_from_dicts(dict)
         self.assertIsNotNone(nested_keys)
         schema_avro = {
-            'namespace': 'avro_namespace',
-            'name': 'avro_filename',
-            'type': 'record',
-            'fields': [
-                {'name': 'name', 'type': ['string', 'null']},
-                {'name': 'favorite_number', 'type': ['string', 'null']},
-                {'name': 'favorite_color', 'type': ['string', 'null']},
+            "namespace": "avro_namespace",
+            "name": "avro_filename",
+            "type": "record",
+            "fields": [
+                {"name": "name", "type": ["string", "null"]},
+                {"name": "favorite_number", "type": ["string", "null"]},
+                {"name": "favorite_color", "type": ["string", "null"]},
                 {
-                    'name': 'address',
-                    'type': [{
-                        'namespace': 'avro_namespace',
-                        'name': 'address',
-                        'type': 'record',
-                        'fields': [
-                            {'name': 'streetaddress', 'type': ['string', 'null']},
-                            {'name': 'city', 'type': ['string', 'null']}
-                        ]
-                    },
-                        'null'
-                    ]
-                }
-            ]
-        }
+                    "name": "address",
+                    "type": [
+                        {
+                            "namespace": "avro_namespace",
+                            "name": "address",
+                            "type": "record",
+                            "fields": [
+                                {"name": "streetaddress", "type": ["string", "null"]},
+                                {"name": "city", "type": ["string", "null"]},
+                            ],
+                        },
+                        "null",
+                    ],
+                },
+            ],
+        }
         schema_avro = AVROData._get_schema_avro(nested_keys, schema_avro)
         self.assertIsNotNone(schema_avro)
 
@@ -140,12 +173,8 @@ def test_len_data(self):
 
         for input_file in self.file_or_buf_list:
             data = Data(input_file["path"])
-            self.assertEqual(input_file['count'],
-                             len(data),
-                             msg=input_file['path'])
-            self.assertEqual(input_file['count'],
-                             data.length,
-                             msg=input_file['path'])
+            self.assertEqual(input_file["count"], len(data), msg=input_file["path"])
+            self.assertEqual(input_file["count"], data.length, msg=input_file["path"])
 
     def test_file_encoding(self):
         """Tests to ensure file_encoding set to None"""
@@ -175,5 +204,5 @@ def test_is_structured(self):
         self.assertFalse(data.is_structured)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
90 changes: 57 additions & 33 deletions dataprofiler/tests/data_readers/test_base_data.py
@@ -18,37 +18,58 @@ class TestBaseDataClass(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls) -> None:
-        test_dir = os.path.join(test_root_path, 'data')
+        test_dir = os.path.join(test_root_path, "data")
         cls.input_file_names = [
-            dict(path=os.path.join(test_dir, 'csv/diamonds.csv'),
-                 encoding='utf-8', data_type='csv'),
-            dict(path=os.path.join(test_dir, 'avro/users.avro'),
-                 encoding=None, data_type='avro'),
-            dict(path=os.path.join(test_dir, 'json/iris-utf-16.json'),
-                 encoding='utf-16', data_type='json'),
-            dict(path=os.path.join(test_dir, 'json/iris-utf-32.json'),
-                 encoding='utf-32', data_type='json'),
-            dict(path=os.path.join(test_dir, 'parquet/iris.parq'),
-                 encoding=None, data_type='parquet'),
-            dict(path=os.path.join(test_dir, 'txt/code.txt'),
-                 encoding='utf-8', data_type='text'),
-            dict(path=os.path.join(test_dir, 'txt/empty.txt'),
-                 encoding='utf-8', data_type='text'),
+            dict(
+                path=os.path.join(test_dir, "csv/diamonds.csv"),
+                encoding="utf-8",
+                data_type="csv",
+            ),
+            dict(
+                path=os.path.join(test_dir, "avro/users.avro"),
+                encoding=None,
+                data_type="avro",
+            ),
+            dict(
+                path=os.path.join(test_dir, "json/iris-utf-16.json"),
+                encoding="utf-16",
+                data_type="json",
+            ),
+            dict(
+                path=os.path.join(test_dir, "json/iris-utf-32.json"),
+                encoding="utf-32",
+                data_type="json",
+            ),
+            dict(
+                path=os.path.join(test_dir, "parquet/iris.parq"),
+                encoding=None,
+                data_type="parquet",
+            ),
+            dict(
+                path=os.path.join(test_dir, "txt/code.txt"),
+                encoding="utf-8",
+                data_type="text",
+            ),
+            dict(
+                path=os.path.join(test_dir, "txt/empty.txt"),
+                encoding="utf-8",
+                data_type="text",
+            ),
         ]
 
         for input_file in cls.input_file_names:
             # add BytesIO
             buffer_info = input_file.copy()
-            with open(input_file['path'], 'rb') as fp:
-                buffer_info['path'] = BytesIO(fp.read())
+            with open(input_file["path"], "rb") as fp:
+                buffer_info["path"] = BytesIO(fp.read())
             cls.buffer_list.append(buffer_info)
 
         cls.file_or_buf_list = cls.input_file_names + cls.buffer_list
 
     @classmethod
     def setUp(cls):
         for buffer in cls.buffer_list:
-            buffer['path'].seek(0)
+            buffer["path"].seek(0)
 
     def test_can_apply_data_functions(self):
         class FakeDataClass:
@@ -63,11 +84,12 @@ def func1(self):
 
         # if the function exists in BaseData fail the test because the results
         # may become inaccurate.
-        self.assertFalse(hasattr(BaseData, 'func1'))
+        self.assertFalse(hasattr(BaseData, "func1"))
 
-        with self.assertRaisesRegex(AttributeError,
-                                    "Neither 'BaseData' nor 'FakeDataClass' "
-                                    "objects have attribute 'test'"):
+        with self.assertRaisesRegex(
+            AttributeError,
+            "Neither 'BaseData' nor 'FakeDataClass' " "objects have attribute 'test'",
+        ):
             data.test
 
         # validate it will take BaseData attribute over the data attribute
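Note on the hunk above: black leaves the two adjacent string literals unmerged, because joining implicitly concatenated strings is outside its scope; Python's parser concatenates adjacent literals into a single string at compile time. A minimal illustration (standalone snippet, not code from this repository):

    # Adjacent string literals are joined by the parser into one string.
    msg = "Neither 'BaseData' nor 'FakeDataClass' " "objects have attribute 'test'"
    assert msg == "Neither 'BaseData' nor 'FakeDataClass' objects have attribute 'test'"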
@@ -83,20 +105,22 @@ def test_file_encoding(self):
         """
         for input_file in self.file_or_buf_list:
             # do not test StringIO, avro, parquet
-            if (isinstance(input_file['path'], StringIO)
-                    or input_file['data_type'] in ['avro', 'parquet']):
+            if isinstance(input_file["path"], StringIO) or input_file["data_type"] in [
+                "avro",
+                "parquet",
+            ]:
                 continue
 
-            data = BaseData(input_file_path=input_file['path'], data=None,
-                            options={})
-            self.assertEqual(input_file['encoding'].lower(),
-                             data.file_encoding.lower(),
-                             input_file['path'])
+            data = BaseData(input_file_path=input_file["path"], data=None, options={})
+            self.assertEqual(
+                input_file["encoding"].lower(),
+                data.file_encoding.lower(),
+                input_file["path"],
+            )
 
         # test when data is specified without input_file_object
         file_encoding = locale.getpreferredencoding(False)
-        if file_encoding.lower() in ['ascii', 'ansi_x3.4-1968']:
-            file_encoding = 'utf-8'
-        data = BaseData(input_file_path=None, data=[],
-                        options={})
+        if file_encoding.lower() in ["ascii", "ansi_x3.4-1968"]:
+            file_encoding = "utf-8"
+        data = BaseData(input_file_path=None, data=[], options={})
         self.assertEqual(file_encoding, data.file_encoding)
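To keep a reformat like this from regressing, black can also run in check-only mode, which fails without rewriting files; a sketch, assuming the same target directory (whether this project wires it into CI or pre-commit is not shown in this diff):

    black --check dataprofiler/tests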