Skip to content

Commit

Permalink
Static Typing profilers/column profile (#661)
Browse files (browse the repository at this point in the history)
* Add typing to column_profile_compilers.py

* Add typing to data_labeler_column_profile.py

* Add typing to order_column_profile.py
  • Loading branch information
tonywu315 authored Sep 22, 2022
1 parent 32f83cd commit a4decf0
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 95 deletions.
74 changes: 45 additions & 29 deletions dataprofiler/profilers/column_profile_compilers.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
"""For generating a report."""
from __future__ import annotations

import abc
from collections import OrderedDict
from multiprocessing.pool import Pool
from typing import Dict, List, Optional, Type

from future.utils import with_metaclass
from pandas import Series

from . import utils
from .categorical_column_profile import CategoricalColumn
Expand All @@ -11,39 +16,44 @@
from .float_column_profile import FloatColumn
from .int_column_profile import IntColumn
from .order_column_profile import OrderColumn
from .profiler_options import StructuredOptions, UnstructuredOptions
from .profiler_options import BaseOption, StructuredOptions, UnstructuredOptions
from .text_column_profile import TextColumn
from .unstructured_labeler_profile import UnstructuredLabelerProfile
from .unstructured_text_profile import TextProfiler


class BaseCompiler(with_metaclass(abc.ABCMeta, object)):
class BaseCompiler(with_metaclass(abc.ABCMeta, object)): # type: ignore
"""Abstract class for generating a report."""

# NOTE: these profilers are ordered. Test functionality if changed.
_profilers = list()
_profilers: List = list() # type: ignore[assignment]

_option_class = None
_option_class: Type[BaseOption] = None # type: ignore[assignment]

def __repr__(self):
def __repr__(self) -> str:
"""Represent object as a string."""
return self.__class__.__name__

def __init__(self, df_series=None, options=None, pool=None):
def __init__(
self,
df_series: Series = None,
options: StructuredOptions = None,
pool: Pool = None,
) -> None:
"""Initialize BaseCompiler object."""
if not self._profilers:
raise NotImplementedError("Must add profilers.")

if self._option_class is None:
raise NotImplementedError("Must set the expected OptionClass.")

self._profiles = OrderedDict()
self._profiles: Dict = OrderedDict()
if df_series is not None:
self.name = df_series.name
self._create_profile(df_series, options, pool)

@abc.abstractmethod
def report(self, remove_disabled_flag=False):
def report(self, remove_disabled_flag: bool = False) -> Dict:
"""
Return report.
Expand All @@ -54,11 +64,13 @@ def report(self, remove_disabled_flag=False):
raise NotImplementedError()

@property
def profile(self):
def profile(self) -> Dict:
"""Return the profile of the column."""
return self.report(remove_disabled_flag=False)

def _create_profile(self, df_series, options=None, pool=None):
def _create_profile(
self, df_series: Series, options: StructuredOptions = None, pool=None
) -> None:
"""
Initialize and evaluate all profilers for the given dataframe.
Expand All @@ -72,7 +84,7 @@ def _create_profile(self, df_series, options=None, pool=None):
if not self._profilers:
return

enabled_profiles = None
enabled_profiles: Optional[List[str]] = None
if options and isinstance(options, self._option_class):
enabled_profiles = options.enabled_profiles

Expand All @@ -96,7 +108,7 @@ def _create_profile(self, df_series, options=None, pool=None):
# Update profile after creation
self.update_profile(df_series, pool)

def __add__(self, other):
def __add__(self, other: BaseCompiler) -> BaseCompiler:
"""
Merge two profile compilers together overriding the `+` operator.
Expand Down Expand Up @@ -129,7 +141,7 @@ def __add__(self, other):
)
return merged_profile_compiler

def diff(self, other, options=None):
def diff(self, other: BaseCompiler, options: Dict = None) -> Dict:
"""
Find the difference between 2 compilers and return the report.
Expand All @@ -145,7 +157,9 @@ def diff(self, other, options=None):
)
return {}

def update_profile(self, df_series, pool=None):
def update_profile(
self, df_series: Series, pool: Pool = None
) -> Optional[BaseCompiler]:
"""
Update the profiles from the data frames.
Expand All @@ -157,7 +171,7 @@ def update_profile(self, df_series, pool=None):
:rtype: BaseCompiler
"""
if not self._profilers:
return
return None

# If single process, loop and return
if pool is None:
Expand Down Expand Up @@ -217,15 +231,15 @@ class ColumnPrimitiveTypeProfileCompiler(BaseCompiler):
]
_option_class = StructuredOptions

def report(self, remove_disabled_flag=False):
def report(self, remove_disabled_flag: bool = False) -> Dict:
"""
Return report.
:param remove_disabled_flag:
flag to determine if disabled options should be excluded in report.
:type remove_disabled_flag: boolean
"""
profile = {
profile: Dict = {
"data_type_representation": dict(),
"data_type": None,
"statistics": dict(),
Expand All @@ -247,27 +261,29 @@ def report(self, remove_disabled_flag=False):
return profile

@property
def profile(self):
def profile(self) -> Dict:
"""Return the profile of the column."""
return self.report(remove_disabled_flag=False)

@property
def selected_data_type(self):
def selected_data_type(self) -> Optional[str]:
"""
Find the selected data_type in a primitive compiler.
:return: name of the selected data type
:rtype: str
"""
matched_profile = None
matched_profile: Optional[str] = None
if self._profiles:
for key, profiler in self._profiles.items():
if matched_profile is None and profiler.data_type_ratio == 1.0:
matched_profile = key
return matched_profile
return matched_profile

def diff(self, other, options=None):
def diff(
self, other: ColumnPrimitiveTypeProfileCompiler, options: Dict = None
) -> Dict:
"""
Find the difference between 2 compilers and return the report.
Expand Down Expand Up @@ -327,7 +343,7 @@ class ColumnStatsProfileCompiler(BaseCompiler):
]
_option_class = StructuredOptions

def report(self, remove_disabled_flag=False):
def report(self, remove_disabled_flag: bool = False) -> Dict:
"""
Return report.
Expand All @@ -340,7 +356,7 @@ def report(self, remove_disabled_flag=False):
report.update(profiler.report(remove_disabled_flag))
return report

def diff(self, other, options=None):
def diff(self, other: ColumnStatsProfileCompiler, options: Dict = None) -> Dict:
"""
Find the difference between 2 compilers and return the report.
Expand Down Expand Up @@ -369,23 +385,23 @@ class ColumnDataLabelerCompiler(BaseCompiler):
_profilers = [DataLabelerColumn]
_option_class = StructuredOptions

def report(self, remove_disabled_flag=False):
def report(self, remove_disabled_flag: bool = False) -> Dict:
"""
Return report.
:param remove_disabled_flag:
flag to determine if disabled options should be excluded in report.
:type remove_disabled_flag: boolean
"""
report = {"data_label": None, "statistics": dict()}
report: Dict = {"data_label": None, "statistics": dict()}
# TODO: Only works for last profiler. Abstracted for now.
for _, profiler in self._profiles.items():
col_profile = profiler.report(remove_disabled_flag)
report["data_label"] = col_profile.pop("data_label")
report["statistics"].update(col_profile)
return report

def diff(self, other, options=None):
def diff(self, other: ColumnDataLabelerCompiler, options: Dict = None) -> Dict:
"""
Find the difference between 2 compilers and return the report.
Expand Down Expand Up @@ -424,9 +440,9 @@ class UnstructuredCompiler(BaseCompiler):

_option_class = UnstructuredOptions

def report(self, remove_disabled_flag=False):
def report(self, remove_disabled_flag: bool = False) -> Dict:
"""Report profile attrs of class and potentially pop val from self.profile."""
profile = {"data_label": dict(), "statistics": dict()}
profile: Dict = {"data_label": dict(), "statistics": dict()}
if UnstructuredLabelerProfile.type in self._profiles:
profile["data_label"] = self._profiles[
UnstructuredLabelerProfile.type
Expand All @@ -437,7 +453,7 @@ def report(self, remove_disabled_flag=False):
)
return profile

def diff(self, other, options=None):
def diff(self, other: UnstructuredCompiler, options: Dict = None) -> Dict:
"""
Find the difference between 2 compilers and return the report.
Expand Down
Loading

0 comments on commit a4decf0

Please sign in to comment.