This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Updating PreparerStep, creating PreparerMapping. CR changes to DataCleaner. #111

Merged: 57 commits on Aug 2, 2019
711ccea
Outline for DataPreparer object as defined pipeline object for Foresh…
cchoquette Jul 11, 2019
2647236
Extended 1.DataPreparer functionality; 2. Base Class for each step of…
cchoquette Jul 15, 2019
7413c7f
Fixing pytest.ini file for command line usage.
cchoquette Jul 16, 2019
d59e4bb
Removing modeler from data_preparer
cchoquette Jul 16, 2019
2625e87
Switching _none_to_dict to single kwarg
cchoquette Jul 16, 2019
5c2033e
Making unprivate imports
cchoquette Jul 17, 2019
58fdb8b
Fixing __init__'s
cchoquette Jul 17, 2019
80bd55a
Adding financial_cleaner example
cchoquette Jul 18, 2019
75decdf
Committing prior to merging in development.
cchoquette Jul 19, 2019
cd13449
New PreparerStep base class. Minimal tests and minimal DataCleaner im…
cchoquette Jul 22, 2019
dd910b2
Cleaned up PreparerStep base class code and added thorough doc strings.
cchoquette Jul 22, 2019
ed30660
Fixing pytest.ini
cchoquette Jul 22, 2019
c7d14d0
DataCleaner working without full tests less creating new columns
cchoquette Jul 22, 2019
d64a524
Finished DataPreparer and DataCleaner without tests/DropTransform ful…
cchoquette Jul 23, 2019
5f78dd9
Adding test and fixing DataCleaner pipeline creation to include step …
cchoquette Jul 23, 2019
ef97754
Merge remote-tracking branch 'remotes/origin/development' into data_p…
cchoquette Jul 23, 2019
a67d197
merge in data_preparer
cchoquette Jul 23, 2019
2e67a71
Merge branch 'data_preparer' into data_cleaner
cchoquette Jul 23, 2019
a3e81d4
Merge branch 'development' into data_preparer
cchoquette Jul 23, 2019
d4bfeae
Almost fully working solution.
cchoquette Jul 23, 2019
9b5018d
DataCleaner working for a DataFrame with 1 column.
cchoquette Jul 23, 2019
ef73579
Working PreparerSteps and DataCleaner, with a couple tests to show.
cchoquette Jul 24, 2019
b2e409a
Making smart still fit aggregate transformer, but return self.
cchoquette Jul 24, 2019
9576014
Changed column naming scheme and added column_sharer across the project.
cchoquette Jul 24, 2019
85633dc
Fully working solution with DropTransform. Minimal Tests.
cchoquette Jul 25, 2019
a8f85ab
Flaked.
cchoquette Jul 25, 2019
de2b0ff
isorted.
cchoquette Jul 25, 2019
0971f4c
Removed DropMixin
cchoquette Jul 25, 2019
b0cda87
Data Preparer and PreparerSteps base class and Data Cleaner(#98)
cchoquette Jul 25, 2019
02cbf32
DataPreparer, Baseclass for PreparerSteps, DataCleaner.
cchoquette Jul 25, 2019
f6a9660
Newsfragment.
cchoquette Jul 25, 2019
b42bbf3
Creating DynamicPipeline
cchoquette Jul 29, 2019
d25fd7c
Creating DynamicPipeline with exec for licensing. Fixing Numerical in…
cchoquette Jul 29, 2019
3597d53
Merge branch 'development' into data_preparer
cchoquette Jul 29, 2019
02f87ad
Code Review changes
cchoquette Jul 29, 2019
631a396
DataCleaner changes
cchoquette Jul 29, 2019
8ebe7b8
Switching to internal PreparerMapping
cchoquette Jul 30, 2019
be33eff
Flaked.
cchoquette Jul 30, 2019
272c302
Fixing documentation, adding PreparerMapping
cchoquette Jul 30, 2019
126854a
Code Review changes.
cchoquette Jul 31, 2019
4046bcd
Code Review changes. Some Test refactoring, import refactoring.
cchoquette Jul 31, 2019
d4ab362
Merge branch 'development' into data_preparer and some minor import c…
cchoquette Aug 1, 2019
629d940
Project restructure.
cchoquette Aug 1, 2019
b644abc
Project restructure.
cchoquette Aug 1, 2019
5027969
Partial project restructure.
cchoquette Aug 1, 2019
84b9516
Partial project restructure.
cchoquette Aug 1, 2019
a8550b3
Final Project restructure:
cchoquette Aug 2, 2019
7dd156c
foreshadow.concrete import rollup complete.
cchoquette Aug 2, 2019
9bca84b
Partial flake
cchoquette Aug 2, 2019
016e7a0
Flake complete.
cchoquette Aug 2, 2019
3bddf65
Fixing setup.cfg
cchoquette Aug 2, 2019
81b6fbf
Code review changes.
cchoquette Aug 2, 2019
3133b1f
Flaked.
cchoquette Aug 2, 2019
a96f841
More flake.
cchoquette Aug 2, 2019
7c27697
Skipping 5 important tests for next sprint.
cchoquette Aug 2, 2019
34ebff0
skipping integration.
cchoquette Aug 2, 2019
0afce1e
skipping integration.
cchoquette Aug 2, 2019
4 changes: 2 additions & 2 deletions doc/api.rst
@@ -9,9 +9,9 @@ Foreshadow
:members:
:undoc-members:

-Preprocessor
+dp
 ------------
-.. automodule:: foreshadow.preprocessor
+.. automodule:: foreshadow.dp
:members:
:undoc-members:

16 changes: 8 additions & 8 deletions doc/users.rst
@@ -119,8 +119,8 @@ You now have an initial pipeline. Lets see how it did and serialize it to a JSON
shadow.score(X_test, y_test)

# Serialize the pipeline
-x_proc = shadow.X_preprocessor.serialize()
-y_proc = shadow.y_preprocessor.serialize()
+x_proc = shadow.X_preparer.serialize()
+y_proc = shadow.y_preparer.serialize()

# Write the serialized pipelines to file
json.dump(x_proc, open("x_proc.json", "w"), indent=2)
@@ -156,7 +156,7 @@ Now let's re-create the Foreshadow object with your changes.
y_processor = fs.Preprocessor(from_json=y_proc)

# Create the foreshadow object
-shadow = fs.Foreshadow(X_preprocessor=x_processor, y_preprocessor=y_processor, estimator=XGBRegressor())
+shadow = fs.Foreshadow(X_preparer=x_processor, y_preparer=y_processor, estimator=XGBRegressor())

# Fit the foreshadow object
shadow.fit(X_train, y_train)
@@ -181,7 +181,7 @@ Once you add a combinations section to figure out the best parameters, create th
y_processor = Preprocessor(from_json=y_proc_combo)

# Create the foreshadow object
-shadow = fs.Foreshadow(X_preprocessor=x_processor, y_preprocessor=y_processor, estimator=XGBRegressor(), optimizer=GridSearchCV)
+shadow = fs.Foreshadow(X_preparer=x_processor, y_preparer=y_processor, estimator=XGBRegressor(), optimizer=GridSearchCV)

# Fit the foreshadow object
shadow.fit(X_train, y_train)
@@ -198,8 +198,8 @@ Once you add a combinations section to figure out the best parameters, create th
# Export the best pipelines

# Serialize the pipeline
-x_proc_best = shadow.X_preprocessor.serialize()
-y_proc_best = shadow.y_preprocessor.serialize()
+x_proc_best = shadow.X_preparer.serialize()
+y_proc_best = shadow.y_preparer.serialize()

# Write the serialized pipelines to file
json.dump(x_proc_best, open("x_proc_best.json", "w"), indent=2)
@@ -236,7 +236,7 @@ Here is an example of a fully defined :py:obj:`Foreshadow <foreshadow.foreshadow

.. code-block:: python

-shadow = fs.Foreshadow(X_preprocessor=Preprocessor(), y_preprocessor=Preprocessor(), estimator=AutoEstimator(), optimizer=None)
+shadow = fs.Foreshadow(X_preparer=Preprocessor(), y_preparer=Preprocessor(), estimator=AutoEstimator(), optimizer=None)

This code is equivalent to the :code:`fs.Foreshadow()` definition but explicitly defines each component. In order to disable one or more
of these components simply pass :code:`False` to the named parameter (Note that the default :code:`None` automatically initializes the above).
@@ -513,7 +513,7 @@ prepend a dollar sign to the column name. For example :code:`["$age_scale_0", "$


Through overriding these various components, any combination of feature engineering can be achieved. To generate this configuration dictionary after fitting a Preprocessor or a
-Foreshadow object, run the :code:`serialize()` method on the Preprocessor object or on :code:`Foreshadow.X_preprocessor` or :code:`y_preprocessor`. That dictionary can be programmatically modified in python
+Foreshadow object, run the :code:`serialize()` method on the Preprocessor object or on :code:`Foreshadow.X_preparer` or :code:`y_preparer`. That dictionary can be programmatically modified in python
or can be serialized to JSON where it can be modified by hand. By default the output of :code:`serialize()` will fix all
feature engineering to be constant. To only enforce sections of the configuration output from :code:`serialize()` simply copy and paste the relevant sections into a new JSON file.

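The serialize → edit → reload loop this section describes can be sketched with the standard `json` module alone. The config dictionary below is a made-up stand-in for whatever `shadow.X_preparer.serialize()` actually returns, not Foreshadow's real schema:

```python
import json

# Hypothetical stand-in for shadow.X_preparer.serialize(); the real
# schema is whatever Foreshadow emits, not this toy dict.
x_proc = {"columns": {"age": ["StandardScaler"]}}

# Dump to JSON (as users.rst does with json.dump) and load it back,
# as if a user had opened the file to edit it by hand.
serialized = json.dumps(x_proc, indent=2)
edited = json.loads(serialized)

# Programmatic override: swap the scaler applied to the "age" column.
edited["columns"]["age"] = ["MinMaxScaler"]
```

In the real workflow the edited dictionary would then be fed back through `Preprocessor(from_json=...)` as shown earlier in this file.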
8 changes: 4 additions & 4 deletions examples/adult_1.py
@@ -27,16 +27,16 @@
print("Accuracy = %f" % accuracy_score(y_test, y_pred))

# Serialize the pipeline
-x_proc = model.X_preprocessor.serialize()
-y_proc = model.y_preprocessor.serialize()
+x_proc = model.X_preparer.serialize()
+y_proc = model.y_preparer.serialize()

# Write the serialized pipelines to file
json.dump(x_proc, open("adult_x_proc.json", "w"), indent=4)
json.dump(y_proc, open("adult_y_proc.json", "w"), indent=4)

summary = {
"x_summary": model.X_preprocessor.summarize(X_train),
"y_summary": model.y_preprocessor.summarize(y_train),
"x_summary": model.X_preparer.summarize(X_train),
"y_summary": model.y_preparer.summarize(y_train),
}

json.dump(summary, open("adult_summary.json", "w"), indent=4)
4 changes: 2 additions & 2 deletions examples/adult_2.py
@@ -39,8 +39,8 @@
print("Accuracy = %f" % accuracy_score(y_test, y_pred))

# Serialize the pipeline
-x_proc = model.X_preprocessor.serialize()
-y_proc = model.y_preprocessor.serialize()
+x_proc = model.X_preparer.serialize()
+y_proc = model.y_preparer.serialize()

# Write the serialized pipelines to file
json.dump(x_proc, open("adult_x_proc_searched.json", "w"), indent=4)
29 changes: 4 additions & 25 deletions foreshadow/__init__.py
@@ -1,25 +1,8 @@
"""An AutoML package to streamline the data science work flow."""

-# # Make sure to remove temporary F401
-# from foreshadow.foreshadow import Foreshadow
-# from foreshadow.preprocessor import Preprocessor
-# from foreshadow import console
-
-# This is temporary
-import foreshadow.cleaners  # noqa: F401
-import foreshadow.config  # noqa: F401
-import foreshadow.console  # noqa: F401
-import foreshadow.core  # noqa: F401
-import foreshadow.estimators  # noqa: F401
-import foreshadow.exceptions  # noqa: F401
-import foreshadow.foreshadow  # noqa: F401
-import foreshadow.intents  # noqa: F401
-import foreshadow.metrics  # noqa: F401
-import foreshadow.newintents  # noqa: F401
-import foreshadow.optimizers  # noqa: F401
-import foreshadow.preprocessor  # noqa: F401
-import foreshadow.transformers  # noqa: F401
-import foreshadow.utils  # noqa: F401
+from foreshadow import console
+from foreshadow.foreshadow import Foreshadow
+from foreshadow.preparer.preparer import DataPreparer


__doc__ = """
@@ -44,9 +27,5 @@ def get_version():


__version__ = get_version()

-# __all__ = ["Foreshadow", "Preprocessor", "console", "__version__"]
-__all__ = ["Foreshadow", "Preprocessor", "console", "__version__"]
+__all__ = ["Foreshadow", "DataPreparer", "console", "__version__"]
del get_version
10 changes: 0 additions & 10 deletions foreshadow/cleaners/__init__.py

This file was deleted.

37 changes: 0 additions & 37 deletions foreshadow/cleaners/internals/__init__.py

This file was deleted.

9 changes: 9 additions & 0 deletions foreshadow/concrete/__init__.py
@@ -0,0 +1,9 @@
"""All the concrete transformers provided by foreshadow."""

from foreshadow.concrete.externals import * # noqa: F403, F401
from foreshadow.concrete.externals import __all__ as e_all
from foreshadow.concrete.internals import * # noqa: F403, F401
from foreshadow.concrete.internals import __all__ as i_all


__all__ = i_all + e_all
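The rollup in this `__init__.py` is plain list concatenation over each submodule's `__all__`. A minimal self-contained sketch of the same mechanic, using throwaway stand-in modules (the names and exports are invented, not the real internals/externals contents):

```python
import types

# Two throwaway stand-ins for the internals and externals submodules;
# the exported names are illustrative only.
internals = types.ModuleType("internals")
internals.__all__ = ["BoxCox", "DropFeature"]

externals = types.ModuleType("externals")
externals.__all__ = ["StandardScaler", "PCA"]

# The package-level __all__ is just the concatenation, as in i_all + e_all.
combined_all = internals.__all__ + externals.__all__
```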
82 changes: 82 additions & 0 deletions foreshadow/concrete/externals.py
@@ -0,0 +1,82 @@
"""External transformers.

All sklearn transformers imported here will be wrapped and made available in
the module :mod:`foreshadow.transformers.concrete`

"""

from category_encoders import HashingEncoder, OneHotEncoder # noqa: F401
from sklearn.decomposition import PCA # noqa: F401
from sklearn.feature_extraction.text import ( # noqa: F401
TfidfTransformer,
TfidfVectorizer,
)
from sklearn.preprocessing import ( # noqa: F401
Imputer,
MinMaxScaler,
RobustScaler,
StandardScaler,
)

from foreshadow.utils import is_transformer
from foreshadow.wrapper import pandas_wrap


no_serialize_params = {"OneHotEncoder": ["cols"], "HashingEncoder": ["cols"]}


def _get_modules(classes, globals_, mname): # TODO auto import all
# TODO sklearn transformers and test each one generically.
"""Import sklearn transformers from transformers directory.

Searches transformers directory for classes implementing BaseEstimator and
TransformerMixin and duplicates them, wraps their init methods and public
functions to support pandas dataframes, and exposes them as
foreshadow.transformers.[name]

Args:
classes: A list of classes
globals_: The globals in the callee's context
mname: The module name

Returns:
The list of wrapped transformers.

"""
transformers = [
cls
for cls in classes
if is_transformer(cls, method="issubclass") # noqa: F821
] # flake does not detect due to del.

for t in transformers:
copied_t = type(t.__name__, (t, *t.__bases__), dict(t.__dict__))
copied_t.__module__ = mname
globals_[copied_t.__name__] = pandas_wrap( # noqa: F821
copied_t # noqa: F821
)
# flake does not detect due to del.

return [t.__name__ for t in transformers]


def _get_classes():
"""Return a list of classes found in transforms directory.

Returns:
list of classes found in transforms directory.

"""
import inspect

return [c for c in globals().values() if inspect.isclass(c)]


__all__ = _get_modules(_get_classes(), globals(), __name__) + [
"no_serialize_params"
]

del pandas_wrap
del is_transformer
del _get_classes
del _get_modules
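The `type(t.__name__, (t, *t.__bases__), dict(t.__dict__))` call above duplicates each transformer class before wrapping it, so the original sklearn class is never mutated. A stdlib-only sketch of that mechanic follows; `Scaler` and `fake_wrap` are toy stand-ins (the real `pandas_wrap` rewrites methods to accept and return DataFrames):

```python
class Scaler:
    """Stand-in for an external sklearn transformer class."""

    def transform(self, values):
        return [v * 2 for v in values]


def fake_wrap(cls):
    """Toy stand-in for pandas_wrap: tags the class instead of wrapping I/O."""
    cls.wrapped = True
    return cls


# Mirror _get_modules: copy the class, reassign its module, wrap the copy.
CopiedScaler = type(Scaler.__name__, (Scaler, *Scaler.__bases__), dict(Scaler.__dict__))
CopiedScaler.__module__ = "foreshadow.concrete"
WrappedScaler = fake_wrap(CopiedScaler)
```

Because only the copy is wrapped, the original `Scaler` keeps its own `__module__` and never gains the `wrapped` attribute.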
51 changes: 51 additions & 0 deletions foreshadow/concrete/internals/__init__.py
@@ -0,0 +1,51 @@
"""Custom foreshadow defined transformers."""
from foreshadow.concrete.internals.boxcox import BoxCox # noqa: F401
from foreshadow.concrete.internals.cleaners import * # noqa: F403, F401
from foreshadow.concrete.internals.cleaners import __all__ as c_all
from foreshadow.concrete.internals.dropfeature import DropFeature # noqa: F401
from foreshadow.concrete.internals.dummyencoder import ( # noqa: F403, F401
DummyEncoder,
)
from foreshadow.concrete.internals.fancyimpute import ( # noqa: F403, F401
FancyImputer,
)
from foreshadow.concrete.internals.financial import ( # noqa: F401
ConvertFinancial,
PrepareFinancial,
)
from foreshadow.concrete.internals.htmlremover import HTMLRemover # noqa: F401
from foreshadow.concrete.internals.labelencoder import ( # noqa: F403, F401
FixedLabelEncoder,
)
from foreshadow.concrete.internals.notransform import NoTransform # noqa: F401
from foreshadow.concrete.internals.tfidf import ( # noqa: F403, F401
FixedTfidfVectorizer,
)
from foreshadow.concrete.internals.tostring import ToString # noqa: F401
from foreshadow.concrete.internals.uncommonremover import ( # noqa: F403, F401
UncommonRemover,
)


# TODO flake fails here, figure out why.
# hypothesis: flake8 uses the __repr__ which is modified to be
# DFTransformer.HTMLRemover etc.

__all__ = [
"BoxCox",
"DropFeature",
"DummyEncoder",
"FancyImputer",
"ConvertFinancial",
"PrepareFinancial",
"HTMLRemover",
"FixedLabelEncoder",
"FixedTfidfVectorizer",
"ToString",
"UncommonRemover",
"YYYYMMDDDateCleaner",
"DollarFinancialCleaner",
"DropCleaner",
"StandardJsonFlattener",
"NoTransform",
] + c_all
foreshadow/concrete/internals/boxcox.py
@@ -7,7 +7,10 @@
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted

from foreshadow.wrapper import pandas_wrap


@pandas_wrap
class BoxCox(BaseEstimator, TransformerMixin):
"""Perform BoxCox transformation on continuous numeric data."""

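The BoxCox hunk above applies `pandas_wrap` directly as a class decorator. That placement can be illustrated with a stdlib-only toy: `toy_wrap` below rewraps `transform` to label its output, where the real `pandas_wrap` would instead coerce inputs and outputs to pandas DataFrames.

```python
import functools


def toy_wrap(cls):
    """Toy class decorator standing in for pandas_wrap."""
    original = cls.transform

    @functools.wraps(original)
    def transform(self, values):
        # The real wrapper would build a DataFrame; here we just label output.
        return {"transformed": original(self, values)}

    cls.transform = transform
    return cls


@toy_wrap
class ToyBoxCox:
    def transform(self, values):
        # Placeholder arithmetic, not the real Box-Cox math.
        return [v + 1 for v in values]
```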
19 changes: 19 additions & 0 deletions foreshadow/concrete/internals/cleaners/__init__.py
@@ -0,0 +1,19 @@
"""Internal cleaners for handling the cleaning and shaping of data."""
from foreshadow.concrete.internals.cleaners.datetimes import (
YYYYMMDDDateCleaner,
)
from foreshadow.concrete.internals.cleaners.drop import DropCleaner
from foreshadow.concrete.internals.cleaners.financial_cleaner import (
DollarFinancialCleaner,
)
from foreshadow.concrete.internals.cleaners.json_flattener import (
StandardJsonFlattener,
)


__all__ = [
"YYYYMMDDDateCleaner",
"DropCleaner",
"DollarFinancialCleaner",
"StandardJsonFlattener",
]