From 3f85a5c7910160cb010300b2019fa23ad186639f Mon Sep 17 00:00:00 2001
From: Sanketh Varamballi
Date: Tue, 20 Sep 2022 17:08:27 -0400
Subject: [PATCH 1/8] added type annotations to avro_data.py

---
 dataprofiler/data_readers/avro_data.py | 34 +++++++++++++++++---------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/dataprofiler/data_readers/avro_data.py b/dataprofiler/data_readers/avro_data.py
index 9f9723dae..5bf2857ed 100644
--- a/dataprofiler/data_readers/avro_data.py
+++ b/dataprofiler/data_readers/avro_data.py
@@ -1,4 +1,7 @@
 """Contains class for saving and loading spreadsheet data."""
+from io import BytesIO, StringIO
+from optparse import Option
+from typing import Any, Dict, List, Optional, Union
 import fastavro
 
 from dataprofiler.data_readers.filepath_or_buffer import FileOrBufferHandler
@@ -11,9 +14,9 @@ class AVROData(JSONData, BaseData):
 
     """AVROData class to save and load spreadsheet data."""
 
-    data_type = "avro"
+    data_type: Optional[str] = "avro"
 
-    def __init__(self, input_file_path=None, data=None, options=None):
+    def __init__(self, input_file_path: Optional[str]=None, data: Optional[Any]=None, options: Optional[Dict]=None) -> None:
         """
         Initialize Data class for loading datasets of type AVRO.
 
@@ -40,25 +43,30 @@ def __init__(self, input_file_path=None, data=None, options=None):
         JSONData.__init__(self, input_file_path, data, options)
 
     @property
-    def file_encoding(self):
+    def file_encoding(self) -> Optional[str]:
         """Set file encoding to None since not detected for avro."""
         return None
 
-    def _load_data_from_file(self, input_file_path):
+    @file_encoding.setter
+    def file_encoding(self, value: Any) -> None:
+        """Does nothing but required by mypy because the inherited self.file_encoding is read-write."""
+        pass
+
+    def _load_data_from_file(self, input_file_path: str) -> List:
         """Load data from file."""
         with FileOrBufferHandler(input_file_path, "rb") as input_file:
             # Currently, string reading with 'r' option has the unicode issue,
             # even when the option encoding='utf-8' is added. It may come from
             # some special compression codec, e.g., snappy. Then, binary mode
             # reading is currently used to get the dict-formatted lines.
-            df_reader = fastavro.reader(input_file)
-            lines = list()
+            df_reader: fastavro.reader = fastavro.reader(input_file)
+            lines: List = list()
             for line in df_reader:
                 lines.append(line)
         return lines
 
     @classmethod
-    def is_match(cls, file_path, options=None):
+    def is_match(cls, file_path: Union[str, StringIO, BytesIO], options: Optional[Dict]=None):
         """
         Test the given file to check if the file has valid AVRO format or not.
 
@@ -74,18 +82,20 @@ def is_match(cls, file_path, options=None):
 
         # get current position of stream
         if data_utils.is_stream_buffer(file_path):
+            assert not isinstance(file_path, str)
             starting_location = file_path.tell()
 
         is_valid_avro = fastavro.is_avro(file_path)
 
         # return to original position in stream
         if data_utils.is_stream_buffer(file_path):
+            assert not isinstance(file_path, str)
             file_path.seek(starting_location, 0)
 
         return is_valid_avro
 
     @classmethod
-    def _get_nested_key(cls, dict_line, nested_key):
+    def _get_nested_key(cls, dict_line: Dict, nested_key: Dict) -> Dict:
         """
         Update nested keys from a dictionary and the current nested key.
 
@@ -113,7 +123,7 @@ def _get_nested_key(cls, dict_line, nested_key):
         return nested_key
 
     @classmethod
-    def _get_nested_keys_from_dicts(cls, dicts):
+    def _get_nested_keys_from_dicts(cls, dicts: List[Dict]) -> Dict:
         """
         Extract nested keys from a list of dictionaries.
 
@@ -125,13 +135,13 @@ def _get_nested_keys_from_dicts(cls, dicts):
         :type dicts: list(dict)
         :return: a dictionary containing nested keys
         """
-        nested_keys = {}
+        nested_keys: Dict
        for dict_line in dicts:
             nested_keys = cls._get_nested_key(dict_line, nested_keys)
         return nested_keys
 
     @classmethod
-    def _get_schema_avro(cls, nested_keys, schema_avro):
+    def _get_schema_avro(cls, nested_keys: Dict, schema_avro: Dict) -> Dict:
         """
         Update avro schema from the nested keys and the current avro schema.
 
@@ -172,7 +182,7 @@ def _get_schema_avro(cls, nested_keys, schema_avro):
             if type(value) is dict:
                 # here, the null option to specify keys not required
                 # for every lines
-                schema_avro_temp = {
+                schema_avro_temp: Dict[str, Any] = {
                     "name": key,
                     "type": [{"name": key, "type": "record", "fields": []}, "null"],
                 }
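Note on [PATCH 1/8]: the no-op file_encoding setter exists purely for the
type checker. BaseData declares file_encoding as a read-write property
(getter plus setter), and mypy rejects a subclass that overrides it with a
getter-only property, reporting "Read-only property cannot override
read-write property". A minimal sketch of the pattern, assuming nothing
beyond that mypy behavior (class and attribute names here are illustrative,
not from the library):

    from typing import Optional

    class Base:
        def __init__(self) -> None:
            self._encoding: Optional[str] = "ascii"

        @property
        def encoding(self) -> Optional[str]:
            return self._encoding

        @encoding.setter
        def encoding(self, value: str) -> None:
            self._encoding = value

    class Child(Base):
        @property
        def encoding(self) -> Optional[str]:
            # This subclass never detects an encoding.
            return None

        @encoding.setter
        def encoding(self, value: str) -> None:
            # No-op setter: without it, mypy flags the override because the
            # inherited property is read-write.
            pass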
From b3e5ebc2709b4713c8b75e8010b0699c58d6443e Mon Sep 17 00:00:00 2001
From: Sanketh Varamballi
Date: Tue, 20 Sep 2022 17:10:56 -0400
Subject: [PATCH 2/8] added missing return type

---
 dataprofiler/data_readers/avro_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataprofiler/data_readers/avro_data.py b/dataprofiler/data_readers/avro_data.py
index 5bf2857ed..78ae5e3ea 100644
--- a/dataprofiler/data_readers/avro_data.py
+++ b/dataprofiler/data_readers/avro_data.py
@@ -66,7 +66,7 @@ def _load_data_from_file(self, input_file_path: str) -> List:
         return lines
 
     @classmethod
-    def is_match(cls, file_path: Union[str, StringIO, BytesIO], options: Optional[Dict]=None):
+    def is_match(cls, file_path: Union[str, StringIO, BytesIO], options: Optional[Dict]=None) -> bool:
         """
         Test the given file to check if the file has valid AVRO format or not.

From e7c7cc831cc4e67b2f0b719cf7f3d279a5dda10c Mon Sep 17 00:00:00 2001
From: Sanketh Varamballi
Date: Tue, 20 Sep 2022 17:14:14 -0400
Subject: [PATCH 3/8] removed extra import

---
 dataprofiler/data_readers/avro_data.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dataprofiler/data_readers/avro_data.py b/dataprofiler/data_readers/avro_data.py
index 78ae5e3ea..3d646b72f 100644
--- a/dataprofiler/data_readers/avro_data.py
+++ b/dataprofiler/data_readers/avro_data.py
@@ -1,6 +1,5 @@
 """Contains class for saving and loading spreadsheet data."""
 from io import BytesIO, StringIO
-from optparse import Option
 from typing import Any, Dict, List, Optional, Union
 import fastavro

From 18c015f0df46b5914eadd6c020ad0f4470e0ed30 Mon Sep 17 00:00:00 2001
From: Sanketh Varamballi
Date: Mon, 26 Sep 2022 10:26:14 -0400
Subject: [PATCH 4/8] changed assertions to if statements

---
 dataprofiler/data_readers/avro_data.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/dataprofiler/data_readers/avro_data.py b/dataprofiler/data_readers/avro_data.py
index 3d646b72f..912b3255a 100644
--- a/dataprofiler/data_readers/avro_data.py
+++ b/dataprofiler/data_readers/avro_data.py
@@ -80,15 +80,13 @@ def is_match(cls, file_path: Union[str, StringIO, BytesIO], options: Optional[Di
             options = dict()
 
         # get current position of stream
-        if data_utils.is_stream_buffer(file_path):
-            assert not isinstance(file_path, str)
+        if data_utils.is_stream_buffer(file_path) and not isinstance(file_path, str):
             starting_location = file_path.tell()
 
         is_valid_avro = fastavro.is_avro(file_path)
 
         # return to original position in stream
-        if data_utils.is_stream_buffer(file_path):
-            assert not isinstance(file_path, str)
+        if data_utils.is_stream_buffer(file_path) and not isinstance(file_path, str):
             file_path.seek(starting_location, 0)
 
         return is_valid_avro
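Note on [PATCH 4/8]: merging the isinstance check into the if condition does
the same work as the dropped assert: both narrow file_path from
Union[str, StringIO, BytesIO] to Union[StringIO, BytesIO], so the .tell()
and .seek() calls type-check. Runtime behavior only changes if
is_stream_buffer could ever return True for a str. A standalone sketch of
isinstance narrowing (the function name is illustrative, not from the
library):

    from io import BytesIO, StringIO
    from typing import Optional, Union

    def current_position(file_path: Union[str, StringIO, BytesIO]) -> Optional[int]:
        if not isinstance(file_path, str):
            # In this branch mypy narrows file_path to Union[StringIO, BytesIO],
            # so .tell() is accepted.
            return file_path.tell()
        return None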
From fc9acc58941c4efd467c8e4053d4cd33ca5854ac Mon Sep 17 00:00:00 2001
From: Sanketh Varamballi
Date: Tue, 27 Sep 2022 19:11:56 -0400
Subject: [PATCH 5/8] added static typing to base_data and json_data

---
 dataprofiler/data_readers/avro_data.py | 21 ++++++--
 dataprofiler/data_readers/base_data.py | 61 +++++++++++----------
 dataprofiler/data_readers/json_data.py | 73 ++++++++++++++++----------
 3 files changed, 94 insertions(+), 61 deletions(-)

diff --git a/dataprofiler/data_readers/avro_data.py b/dataprofiler/data_readers/avro_data.py
index 912b3255a..cdc52bb02 100644
--- a/dataprofiler/data_readers/avro_data.py
+++ b/dataprofiler/data_readers/avro_data.py
@@ -1,6 +1,7 @@
 """Contains class for saving and loading spreadsheet data."""
 from io import BytesIO, StringIO
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
+
 import fastavro
 
 from dataprofiler.data_readers.filepath_or_buffer import FileOrBufferHandler
@@ -15,7 +16,12 @@ class AVROData(JSONData, BaseData):
 
     data_type: Optional[str] = "avro"
 
-    def __init__(self, input_file_path: Optional[str]=None, data: Optional[Any]=None, options: Optional[Dict]=None) -> None:
+    def __init__(
+        self,
+        input_file_path: Optional[str] = None,
+        data: Optional[Any] = None,
+        options: Optional[Dict] = None,
+    ) -> None:
         """
         Initialize Data class for loading datasets of type AVRO.
 
@@ -48,7 +54,10 @@ def file_encoding(self) -> Optional[str]:
 
     @file_encoding.setter
     def file_encoding(self, value: Any) -> None:
-        """Does nothing but required by mypy because the inherited self.file_encoding is read-write."""
+        """Do nothing.
+
+        Required by mypy because the inherited self.file_encoding is read-write.
+        """
         pass
 
     def _load_data_from_file(self, input_file_path: str) -> List:
@@ -65,7 +74,9 @@ def _load_data_from_file(self, input_file_path: str) -> List:
         return lines
 
     @classmethod
-    def is_match(cls, file_path: Union[str, StringIO, BytesIO], options: Optional[Dict]=None) -> bool:
+    def is_match(
+        cls, file_path: Union[str, StringIO, BytesIO], options: Optional[Dict] = None
+    ) -> bool:
         """
         Test the given file to check if the file has valid AVRO format or not.
 
@@ -132,7 +143,7 @@ def _get_nested_keys_from_dicts(cls, dicts: List[Dict]) -> Dict:
         :type dicts: list(dict)
         :return: a dictionary containing nested keys
         """
-        nested_keys: Dict
+        nested_keys: Dict = {}
         for dict_line in dicts:
             nested_keys = cls._get_nested_key(dict_line, nested_keys)
         return nested_keys
diff --git a/dataprofiler/data_readers/base_data.py b/dataprofiler/data_readers/base_data.py
index 45422cb6c..97bda59f8 100644
--- a/dataprofiler/data_readers/base_data.py
+++ b/dataprofiler/data_readers/base_data.py
@@ -3,6 +3,7 @@
 import sys
 from collections import OrderedDict
 from io import StringIO
+from typing import Any, Dict, Generator, List, Optional, Union
 
 import numpy as np
 import pandas as pd
@@ -16,10 +17,12 @@ class BaseData(object):
     """Abstract class for data loading and saving."""
 
-    data_type = None
-    info = None
+    data_type: Optional[str] = None
+    info: Optional[str] = None
 
-    def __init__(self, input_file_path, data, options):
+    def __init__(
+        self, input_file_path: Optional[str], data: Any, options: Dict
+    ) -> None:
         """
         Initialize Base class for loading a dataset.
@@ -39,7 +42,7 @@ def __init__(self, input_file_path, data, options):
 
         # Public properties
         self.input_file_path = input_file_path
-        self.options = options
+        self.options: Optional[Dict] = options
 
         # 'Private' properties
         #   _data_formats: dict containing data_formats (key) and function
@@ -53,12 +56,12 @@ def __init__(self, input_file_path, data, options):
         #        constant across function calls.
         #   _tmp_file_name: randomly set variables for file name usable by system
         #   _file_encoding: contains the suggested file encoding for reading data
-        self._data_formats = OrderedDict()
-        self._selected_data_format = None
-        self._data = data
-        self._batch_info = dict(perm=list(), iter=0)
-        self._tmp_file_name = None
-        self._file_encoding = options.get("encoding", None)
+        self._data_formats: Dict[str, Any] = OrderedDict()
+        self._selected_data_format: Optional[str] = None
+        self._data: Optional[Any] = data
+        self._batch_info: Dict = dict(perm=list(), iter=0)
+        self._tmp_file_name: Optional[str] = None
+        self._file_encoding: Optional[str] = options.get("encoding", None)
 
     @property
     def data(self):
@@ -79,17 +82,12 @@ def data(self):
         )
 
     @property
-    def data_format(self):
+    def data_format(self) -> Optional[str]:
         """Return data format."""
         return self._selected_data_format
 
-    @property
-    def is_structured(self):
-        """Determine compatibility with StructuredProfiler."""
-        raise NotImplementedError
-
     @data_format.setter
-    def data_format(self, value):
+    def data_format(self, value: str):
         allowed_data_formats = list(self._data_formats.keys())
         if value.lower() not in allowed_data_formats:
             raise ValueError(
@@ -100,7 +98,12 @@ def data_format(self, value):
         self._selected_data_format = value.lower()
 
     @property
-    def file_encoding(self):
+    def is_structured(self) -> bool:
+        """Determine compatibility with StructuredProfiler."""
+        raise NotImplementedError
+
+    @property
+    def file_encoding(self) -> Optional[str]:
         """Return file encoding."""
         if not self._file_encoding:
             # get system default, but if set to ascii, just update to utf-8
@@ -122,7 +125,7 @@ def file_encoding(self):
         return self._file_encoding
 
     @file_encoding.setter
-    def file_encoding(self, value):
+    def file_encoding(self, value: str) -> None:
         """Set file encoding."""
         valid_user_set_encodings = ["ascii", "utf-8", "utf-16", "utf-32"]
         if not value or value.lower() not in valid_user_set_encodings:
@@ -134,7 +137,7 @@ def file_encoding(self, value):
         self._file_encoding = value
 
     @staticmethod
-    def _check_and_return_options(options):
+    def _check_and_return_options(options: Optional[Dict]) -> Dict:
         """Return options or raise error."""
         if not options:
             options = dict()
@@ -142,11 +145,13 @@ def _check_and_return_options(options):
             raise ValueError("Options must be a dictionary.")
         return options
 
-    def _load_data(self, data=None):
+    def _load_data(self, data: Optional[Any] = None) -> None:
         """Load data."""
         raise NotImplementedError()
 
-    def get_batch_generator(self, batch_size):
+    def get_batch_generator(
+        self, batch_size: int
+    ) -> Generator[Union[pd.DataFrame, List], None, None]:
         """Get batch generator."""
         data_length = len(self.data)
         indices = np.random.permutation(data_length)
@@ -157,11 +162,13 @@ def get_batch_generator(self, batch_size):
             yield list(self.data[k] for k in indices[i : i + batch_size])
 
     @classmethod
-    def is_match(cls, input_file_path, options):
+    def is_match(cls, input_file_path: str, options: Optional[Dict]) -> bool:
         """Return true if match, false otherwise."""
         raise NotImplementedError()
 
-    def reload(self, input_file_path, data, options):
+    def reload(
+        self, input_file_path: Optional[str], data: Any, options: Optional[Dict]
+    ) -> None:
         """
         Reload the data class with a new dataset.
 
@@ -185,7 +192,7 @@ def reload(self, input_file_path, data, options):
         self.options = None
         self._batch_info = dict(perm=list(), iter=0)
 
-    def __len__(self):
+    def __len__(self) -> int:
         """
         Return the length of the dataset which is loaded.
 
@@ -194,7 +201,7 @@ def __len__(self):
         return len(self.data)
 
     @property
-    def length(self):
+    def length(self) -> int:
         """
         Return the length of the dataset which is loaded.
 
@@ -202,7 +209,7 @@ def length(self):
         """
         return len(self)
 
-    def __getattribute__(self, name):
+    def __getattribute__(self, name: Any) -> Any:
         """
         Override getattr for the class.
 
diff --git a/dataprofiler/data_readers/json_data.py b/dataprofiler/data_readers/json_data.py
index 959f59d95..463569228 100644
--- a/dataprofiler/data_readers/json_data.py
+++ b/dataprofiler/data_readers/json_data.py
@@ -2,6 +2,7 @@
 import json
 import warnings
 from collections import OrderedDict
+from typing import Dict, List, Optional, Union
 
 import numpy as np
 import pandas as pd
@@ -16,9 +17,14 @@ class JSONData(SpreadSheetDataMixin, BaseData):
 
     """SpreadsheetData class to save and load spreadsheet data."""
 
-    data_type = "json"
+    data_type: Optional[str] = "json"
 
-    def __init__(self, input_file_path=None, data=None, options=None):
+    def __init__(
+        self,
+        input_file_path: Optional[str] = None,
+        data: Optional[Union[str, pd.DataFrame]] = None,
+        options: Optional[Dict] = None,
+    ):
         """
         Initialize Data class for loading datasets of type JSON.
 
@@ -66,30 +72,32 @@ def __init__(self, input_file_path=None, data=None, options=None):
         self._data_formats[
             "flattened_dataframe"
         ] = self._get_data_as_flattened_dataframe
-        self._selected_data_format = options.get("data_format", "flattened_dataframe")
-        self._payload_keys = options.get("payload_keys", ["data", "payload"])
+        self._selected_data_format: str = options.get(
+            "data_format", "flattened_dataframe"
+        )
+        self._payload_keys: List[str] = options.get("payload_keys", ["data", "payload"])
         if not isinstance(self._payload_keys, list):
             self._payload_keys = [self._payload_keys]
-        self._key_separator = options.get("key_separator", ".")
-        self._selected_keys = options.get("selected_keys", list())
-        self._metadata = None
+        self._key_separator: str = options.get("key_separator", ".")
+        self._selected_keys: Optional[List[str]] = options.get("selected_keys", list())
+        self._metadata: Optional[pd.DataFrame] = None
         if data is not None:
             self._load_data(data)
 
     @property
-    def selected_keys(self):
+    def selected_keys(self) -> Optional[List[str]]:
         """Return selected keys."""
         return self._selected_keys
 
     @property
-    def metadata(self):
+    def metadata(self) -> Optional[pd.DataFrame]:
         """Return a data frame that contains the metadata."""
         if self._metadata is None or self._metadata.empty:
             warnings.warn("No metadata was detected.")
         return self._metadata
 
     @property
-    def data_and_metadata(self):
+    def data_and_metadata(self) -> Optional[pd.DataFrame]:
         """Return a data frame that joins the data and the metadata."""
         data = self.data
         if self._metadata is not None and not self._metadata.empty:
@@ -227,13 +235,13 @@ def _get_data_as_flattened_dataframe(self, json_lines):
 
         return data
 
-    def _load_data_from_str(self, data_as_str):
+    def _load_data_from_str(self, data_as_str: str) -> List:
         """
         Load the data from a string.
 
         :param data_as_str: data in string format.
         :type data_as_str: str
-        :return:
+        :return: dict
         """
         try:
             data = json.loads(data_as_str)
@@ -246,7 +254,7 @@ def _load_data_from_str(self, data_as_str):
             )
         return data
 
-    def _load_data_from_file(self, input_file_path):
+    def _load_data_from_file(self, input_file_path: str) -> List:
         """
         Load the data from a file.
 
@@ -268,7 +276,7 @@ def _load_data_from_file(self, input_file_path):
             )
         return data
 
-    def _get_data_as_records(self, data):
+    def _get_data_as_records(self, data: List) -> List[str]:
         """
         Extract the data as a record format.
 
@@ -276,15 +284,15 @@ def _get_data_as_records(self, data):
         :type data: list
         :return: dataframe in record format
         """
-        data = self._get_data_as_df(data)
-        data = data.to_dict(orient="records", into=OrderedDict)
-        for i, sample in enumerate(data):
-            data[i] = json.dumps(
+        data_df: pd.DataFrame = self._get_data_as_df(data)
+        data_records: List = data_df.to_dict(orient="records", into=OrderedDict)
+        for i, sample in enumerate(data_records):
+            data_records[i] = json.dumps(
                 self._convert_flat_to_nested_cols(sample), ensure_ascii=False
             )
-        return super(JSONData, self)._get_data_as_records(data)
+        return super(JSONData, self)._get_data_as_records(data_records)
 
-    def _get_data_as_json(self, data):
+    def _get_data_as_json(self, data: List) -> List[str]:
         """
         Extract the data as a json format.
 
@@ -292,12 +300,12 @@ def _get_data_as_json(self, data):
         :type data: list
         :return: dataframe in json format
         """
-        data = self._get_data_as_df(data)
-        data = data.to_json(orient="records")
-        char_per_line = min(len(data), self.SAMPLES_PER_LINE_DEFAULT)
-        return list(map("".join, zip(*[iter(data)] * char_per_line)))
+        data_df: pd.DataFrame = self._get_data_as_df(data)
+        data_json = data_df.to_json(orient="records")
+        char_per_line = min(len(data_json), self.SAMPLES_PER_LINE_DEFAULT)
+        return list(map("".join, zip(*[iter(data_json)] * char_per_line)))
 
-    def _get_data_as_df(self, data):
+    def _get_data_as_df(self, data: Union[pd.DataFrame, Dict, List]) -> pd.DataFrame:
         """
         Extract the data as pandas formats it.
 
@@ -316,7 +324,7 @@ def _get_data_as_df(self, data):
         return data
 
     @classmethod
-    def _convert_flat_to_nested_cols(cls, dic, separator="."):
+    def _convert_flat_to_nested_cols(cls, dic: Dict, separator: str = ".") -> Dict:
         """
         Convert a flat dict to nested dict.
 
@@ -350,7 +358,9 @@ def _convert_flat_to_nested_cols(cls, dic, separator="."):
         return dic
 
     @classmethod
-    def is_match(cls, file_path, options=None):
+    def is_match(
+        cls, file_path: Union[str, StringIO], options: Optional[Dict] = None
+    ) -> bool:
         """
         Test whether first 1000 lines of file has valid JSON format or not.
 
@@ -402,7 +412,12 @@ def is_match(cls, file_path, options=None):
         else:
             return False
 
-    def reload(self, input_file_path=None, data=None, options=None):
+    def reload(
+        self,
+        input_file_path: Optional[str] = None,
+        data: Optional[Union[str, pd.DataFrame]] = None,
+        options: Optional[Dict] = None,
+    ) -> None:
         """
         Reload the data class with a new dataset.
@@ -419,4 +434,4 @@ def reload(self, input_file_path=None, data=None, options=None):
         """
         self._selected_keys = None
         super(JSONData, self).reload(input_file_path, data, options)
-        self.__init__(self.input_file_path, data, options)
+        JSONData.__init__(self, self.input_file_path, data, options)

From f276b3479ad4f030df38311c4206df31c00f5cb0 Mon Sep 17 00:00:00 2001
From: Sanketh Varamballi
Date: Tue, 4 Oct 2022 10:46:47 -0400
Subject: [PATCH 6/8] fixed issue with data being multiple types

---
 dataprofiler/data_readers/json_data.py        | 20 ++++++++++---------
 .../data_readers/structured_mixins.py         |  6 ++++--
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/dataprofiler/data_readers/json_data.py b/dataprofiler/data_readers/json_data.py
index 463569228..49e6fd922 100644
--- a/dataprofiler/data_readers/json_data.py
+++ b/dataprofiler/data_readers/json_data.py
@@ -284,13 +284,14 @@ def _get_data_as_records(self, data: List) -> List[str]:
         :type data: list
         :return: dataframe in record format
         """
-        data_df: pd.DataFrame = self._get_data_as_df(data)
-        data_records: List = data_df.to_dict(orient="records", into=OrderedDict)
-        for i, sample in enumerate(data_records):
-            data_records[i] = json.dumps(
+        _data: Union[pd.DataFrame, List]
+        _data = self._get_data_as_df(data)
+        _data = _data.to_dict(orient="records", into=OrderedDict)
+        for i, sample in enumerate(_data):
+            _data[i] = json.dumps(
                 self._convert_flat_to_nested_cols(sample), ensure_ascii=False
             )
-        return super(JSONData, self)._get_data_as_records(data_records)
+        return super(JSONData, self)._get_data_as_records(_data)
 
     def _get_data_as_json(self, data: List) -> List[str]:
         """
@@ -300,10 +301,11 @@ def _get_data_as_json(self, data: List) -> List[str]:
         :type data: list
         :return: dataframe in json format
         """
-        data_df: pd.DataFrame = self._get_data_as_df(data)
-        data_json = data_df.to_json(orient="records")
-        char_per_line = min(len(data_json), self.SAMPLES_PER_LINE_DEFAULT)
-        return list(map("".join, zip(*[iter(data_json)] * char_per_line)))
+        _data: Union[pd.DataFrame, List]
+        _data = self._get_data_as_df(data)
+        _data = _data.to_json(orient="records")
+        char_per_line = min(len(_data), self.SAMPLES_PER_LINE_DEFAULT)
+        return list(map("".join, zip(*[iter(_data)] * char_per_line)))
 
     def _get_data_as_df(self, data: Union[pd.DataFrame, Dict, List]) -> pd.DataFrame:
         """
diff --git a/dataprofiler/data_readers/structured_mixins.py b/dataprofiler/data_readers/structured_mixins.py
index cd468dc8c..3331ecbb4 100644
--- a/dataprofiler/data_readers/structured_mixins.py
+++ b/dataprofiler/data_readers/structured_mixins.py
@@ -25,12 +25,14 @@ class SpreadSheetDataMixin(object):
     :return: None
     """
 
-    def __init__(self, input_file_path: str, data: Any, options: Dict) -> None:
+    def __init__(
+        self, input_file_path: Optional[str], data: Any, options: Dict
+    ) -> None:
         """Initialize spreadsheet mixin object."""
         self._data_formats: Dict = dict()
         self._data_formats["dataframe"] = self._get_data_as_df
         self._original_df_dtypes: Optional[pd.Series]
-        self.input_file_path: str = input_file_path
+        self.input_file_path: Optional[str] = input_file_path
         if data is not None and isinstance(data, pd.DataFrame):
             self._original_df_dtypes = data.dtypes
         else:
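Note on [PATCH 6/8]: PATCH 5/8 gave each intermediate value its own name
(data_df, data_records, data_json) so every variable kept a single type;
this patch returns to reusing one name by declaring it up front as a Union.
mypy normally fixes a variable's type at its first assignment, so
re-binding the same name to a DataFrame and then to a list or string needs
the explicit annotation, after which mypy narrows the declared union at
each assignment. A small sketch of the same idiom, assuming mypy's default
redefinition rules (names are illustrative, not from the library):

    from typing import List, Union

    def upper_fields(csv_row: str) -> List[str]:
        # Without this annotation, mypy infers str from the first assignment
        # and rejects the later re-binding to a list.
        fields: Union[str, List[str]]
        fields = csv_row.upper()    # fields is narrowed to str here
        fields = fields.split(",")  # and to List[str] here
        return fields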
From 318129e1cc04ad8f43031f314415a30ed47f0bf8 Mon Sep 17 00:00:00 2001
From: Sanketh Varamballi
Date: Tue, 4 Oct 2022 10:50:23 -0400
Subject: [PATCH 7/8] changed call in reload back to self.__init__ with type: ignore

---
 dataprofiler/data_readers/json_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataprofiler/data_readers/json_data.py b/dataprofiler/data_readers/json_data.py
index 49e6fd922..5a561917c 100644
--- a/dataprofiler/data_readers/json_data.py
+++ b/dataprofiler/data_readers/json_data.py
@@ -436,4 +436,4 @@ def reload(
         """
         self._selected_keys = None
         super(JSONData, self).reload(input_file_path, data, options)
-        JSONData.__init__(self, self.input_file_path, data, options)
+        self.__init__(self.input_file_path, data, options)  # type: ignore

From af9436a0884dcd3dbddca495a37a60cd590202ce Mon Sep 17 00:00:00 2001
From: Sanketh Varamballi
Date: Tue, 4 Oct 2022 10:53:39 -0400
Subject: [PATCH 8/8] changed isinstance to cast in avro_data

---
 dataprofiler/data_readers/avro_data.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/dataprofiler/data_readers/avro_data.py b/dataprofiler/data_readers/avro_data.py
index cdc52bb02..3680ef09d 100644
--- a/dataprofiler/data_readers/avro_data.py
+++ b/dataprofiler/data_readers/avro_data.py
@@ -1,6 +1,6 @@
 """Contains class for saving and loading spreadsheet data."""
 from io import BytesIO, StringIO
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union, cast
 
 import fastavro
 
@@ -91,13 +91,19 @@ def is_match(
             options = dict()
 
         # get current position of stream
-        if data_utils.is_stream_buffer(file_path) and not isinstance(file_path, str):
+        if data_utils.is_stream_buffer(file_path):
+            file_path = cast(
+                Union[StringIO, BytesIO], file_path
+            )  # guaranteed by is_stream_buffer
             starting_location = file_path.tell()
 
         is_valid_avro = fastavro.is_avro(file_path)
 
         # return to original position in stream
-        if data_utils.is_stream_buffer(file_path) and not isinstance(file_path, str):
+        if data_utils.is_stream_buffer(file_path):
+            file_path = cast(
+                Union[StringIO, BytesIO], file_path
+            )  # guaranteed by is_stream_buffer
             file_path.seek(starting_location, 0)
 
         return is_valid_avro
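Note on [PATCH 8/8]: typing.cast performs no runtime check or conversion; it
returns its argument unchanged and only tells the type checker to treat the
value as the stated type. That fits this call site, where
data_utils.is_stream_buffer already guarantees a stream at runtime but mypy
cannot see that guarantee through the function boundary. Compared with the
isinstance form from PATCH 4/8, cast adds no runtime branch, but it is
unchecked: a wrong guarantee would surface later as an AttributeError
rather than being caught by the condition. A standalone sketch (the
function name is illustrative, not from the library):

    from io import BytesIO, StringIO
    from typing import Union, cast

    def stream_position(file_path: Union[str, StringIO, BytesIO]) -> int:
        # cast() is a static-only assertion: if file_path were actually a str,
        # the .tell() call would still fail at runtime with AttributeError.
        stream = cast(Union[StringIO, BytesIO], file_path)
        return stream.tell()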