update feature transformers and fix a bug in RegexTokenizer
mengxr committed May 17, 2015
1 parent 9ca44fb commit 0138fc3
Showing 3 changed files with 36 additions and 23 deletions.
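
The bug being fixed: the Scala RegexTokenizer registered its minimum-token-length param under the name "minLength", even though the val (and the public Python param) is called minTokenLength; this commit renames it to "minTokenLength" on both sides. Below is a minimal usage sketch of the corrected parameter, assuming a Spark 1.4-era environment with an active SparkContext named sc; the toy DataFrame is illustrative only.

from pyspark.sql import SQLContext
from pyspark.ml.feature import RegexTokenizer

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame([("ab c de",)], ["text"])

# With the default pattern, "ab c de" tokenizes to ["ab", "c", "de"];
# minTokenLength=2 (now consistently named on both sides) drops the
# single-character token "c".
tokenizer = RegexTokenizer(minTokenLength=2, inputCol="text", outputCol="words")
tokenizer.transform(df).head().words  # expected: ['ab', 'de']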
@@ -61,7 +61,7 @@ class RegexTokenizer(override val uid: String)
* Default: 1, to avoid returning empty strings
* @group param
*/
val minTokenLength: IntParam = new IntParam(this, "minLength", "minimum token length (>= 0)",
val minTokenLength: IntParam = new IntParam(this, "minTokenLength", "minimum token length (>= 0)",
ParamValidators.gtEq(0))

/** @group setParam */
55 changes: 34 additions & 21 deletions python/pyspark/ml/feature.py
@@ -104,7 +104,6 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol):
0.0
"""

_java_class = "org.apache.spark.ml.feature.Bucketizer"
# a placeholder to make it appear in the generated doc
splits = \
Param(Params._dummy(), "splits",
@@ -121,6 +120,7 @@ def __init__(self, splits=None, inputCol=None, outputCol=None):
__init__(self, splits=None, inputCol=None, outputCol=None)
"""
super(Bucketizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Bucketizer", self.uid)
#: param for Splitting points for mapping continuous features into buckets. With n+1 splits,
# there are n buckets. A bucket defined by splits x,y holds values in the range [x,y)
# except the last bucket, which also includes y. The splits should be strictly increasing.
@@ -177,14 +177,13 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures):
SparseVector(5, {2: 1.0, 3: 1.0, 4: 1.0})
"""

_java_class = "org.apache.spark.ml.feature.HashingTF"

@keyword_only
def __init__(self, numFeatures=1 << 18, inputCol=None, outputCol=None):
"""
__init__(self, numFeatures=1 << 18, inputCol=None, outputCol=None)
"""
super(HashingTF, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.HashingTF", self.uid)
self._setDefault(numFeatures=1 << 18)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -256,6 +255,8 @@ def getMinDocFreq(self):
"""
return self.getOrDefault(self.minDocFreq)

def _create_model(self, java_model):
return IDFModel(java_model)

class IDFModel(JavaModel):
"""
@@ -284,14 +285,13 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol):
# a placeholder to make it appear in the generated doc
p = Param(Params._dummy(), "p", "the p norm value.")

_java_class = "org.apache.spark.ml.feature.Normalizer"

@keyword_only
def __init__(self, p=2.0, inputCol=None, outputCol=None):
"""
__init__(self, p=2.0, inputCol=None, outputCol=None)
"""
super(Normalizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Normalizer", self.uid)
self.p = Param(self, "p", "the p norm value.")
self._setDefault(p=2.0)
kwargs = self.__init__._input_kwargs
@@ -346,8 +346,6 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol):
SparseVector(3, {0: 1.0})
"""

_java_class = "org.apache.spark.ml.feature.OneHotEncoder"

# a placeholder to make it appear in the generated doc
includeFirst = Param(Params._dummy(), "includeFirst", "include first category")

@@ -357,6 +355,7 @@ def __init__(self, includeFirst=True, inputCol=None, outputCol=None):
__init__(self, includeFirst=True, inputCol=None, outputCol=None)
"""
super(OneHotEncoder, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.OneHotEncoder", self.uid)
self.includeFirst = Param(self, "includeFirst", "include first category")
self._setDefault(includeFirst=True)
kwargs = self.__init__._input_kwargs
@@ -403,8 +402,6 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol):
DenseVector([0.5, 0.25, 2.0, 1.0, 4.0])
"""

_java_class = "org.apache.spark.ml.feature.PolynomialExpansion"

# a placeholder to make it appear in the generated doc
degree = Param(Params._dummy(), "degree", "the polynomial degree to expand (>= 1)")

@@ -414,6 +411,8 @@ def __init__(self, degree=2, inputCol=None, outputCol=None):
__init__(self, degree=2, inputCol=None, outputCol=None)
"""
super(PolynomialExpansion, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.feature.PolynomialExpansion", self.uid)
self.degree = Param(self, "degree", "the polynomial degree to expand (>= 1)")
self._setDefault(degree=2)
kwargs = self.__init__._input_kwargs
@@ -470,7 +469,6 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
TypeError: Method setParams forces keyword arguments.
"""

_java_class = "org.apache.spark.ml.feature.RegexTokenizer"
# a placeholder to make it appear in the generated doc
minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)")
gaps = Param(Params._dummy(), "gaps", "Set regex to match gaps or tokens")
@@ -484,7 +482,8 @@ def __init__(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+"
inputCol=None, outputCol=None)
"""
super(RegexTokenizer, self).__init__()
self.minTokenLength = Param(self, "minLength", "minimum token length (>= 0)")
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RegexTokenizer", self.uid)
self.minTokenLength = Param(self, "minTokenLength", "minimum token length (>= 0)")
self.gaps = Param(self, "gaps", "Set regex to match gaps or tokens")
self.pattern = Param(self, "pattern", "regex pattern used for tokenizing")
self._setDefault(minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+")
@@ -556,8 +555,6 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol):
DenseVector([1.4142])
"""

_java_class = "org.apache.spark.ml.feature.StandardScaler"

# a placeholder to make it appear in the generated doc
withMean = Param(Params._dummy(), "withMean", "Center data with mean")
withStd = Param(Params._dummy(), "withStd", "Scale to unit standard deviation")
@@ -568,6 +565,7 @@ def __init__(self, withMean=False, withStd=True, inputCol=None, outputCol=None):
__init__(self, withMean=False, withStd=True, inputCol=None, outputCol=None)
"""
super(StandardScaler, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StandardScaler", self.uid)
self.withMean = Param(self, "withMean", "Center data with mean")
self.withStd = Param(self, "withStd", "Scale to unit standard deviation")
self._setDefault(withMean=False, withStd=True)
@@ -609,6 +607,9 @@ def getWithStd(self):
"""
return self.getOrDefault(self.withStd)

def _create_model(self, java_model):
return StandardScalerModel(java_model)


class StandardScalerModel(JavaModel):
"""
@@ -632,14 +633,13 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol):
[(0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)]
"""

_java_class = "org.apache.spark.ml.feature.StringIndexer"

@keyword_only
def __init__(self, inputCol=None, outputCol=None):
"""
__init__(self, inputCol=None, outputCol=None)
"""
super(StringIndexer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StringIndexer", self.uid)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)

@@ -652,6 +652,9 @@ def setParams(self, inputCol=None, outputCol=None):
kwargs = self.setParams._input_kwargs
return self._set(**kwargs)

def _create_model(self, java_model):
return StringIndexerModel(java_model)


class StringIndexerModel(JavaModel):
"""
@@ -685,14 +688,13 @@ class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol):
TypeError: Method setParams forces keyword arguments.
"""

_java_class = "org.apache.spark.ml.feature.Tokenizer"

@keyword_only
def __init__(self, inputCol=None, outputCol=None):
"""
__init__(self, inputCol=None, outputCol=None)
"""
super(Tokenizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Tokenizer", self.uid)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)

@@ -722,14 +724,13 @@ class VectorAssembler(JavaTransformer, HasInputCols, HasOutputCol):
DenseVector([0.0, 1.0])
"""

_java_class = "org.apache.spark.ml.feature.VectorAssembler"

@keyword_only
def __init__(self, inputCols=None, outputCol=None):
"""
__init__(self, inputCols=None, outputCol=None)
"""
super(VectorAssembler, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorAssembler", self.uid)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)

@@ -796,7 +797,6 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol):
DenseVector([1.0, 0.0])
"""

_java_class = "org.apache.spark.ml.feature.VectorIndexer"
# a placeholder to make it appear in the generated doc
maxCategories = Param(Params._dummy(), "maxCategories",
"Threshold for the number of values a categorical feature can take " +
@@ -809,6 +809,7 @@ def __init__(self, maxCategories=20, inputCol=None, outputCol=None):
__init__(self, maxCategories=20, inputCol=None, outputCol=None)
"""
super(VectorIndexer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorIndexer", self.uid)
self.maxCategories = Param(self, "maxCategories",
"Threshold for the number of values a categorical feature " +
"can take (>= 2). If a feature is found to have " +
@@ -839,6 +840,15 @@ def getMaxCategories(self):
"""
return self.getOrDefault(self.maxCategories)

def _create_model(self, java_model):
return VectorIndexerModel(java_model)


class VectorIndexerModel(JavaModel):
"""
Model fitted by VectorIndexer.
"""


@inherit_doc
@ignore_unicode_prefix
@@ -854,7 +864,6 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has
DenseVector([-0.0422, -0.5138, -0.2546, 0.6885, 0.276])
"""

_java_class = "org.apache.spark.ml.feature.Word2Vec"
# a placeholder to make it appear in the generated doc
vectorSize = Param(Params._dummy(), "vectorSize",
"the dimension of codes after transforming from words")
@@ -872,6 +881,7 @@ def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025,
seed=42, inputCol=None, outputCol=None)
"""
super(Word2Vec, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid)
self.vectorSize = Param(self, "vectorSize",
"the dimension of codes after transforming from words")
self.numPartitions = Param(self, "numPartitions",
@@ -934,6 +944,9 @@ def getMinCount(self):
"""
return self.getOrDefault(self.minCount)

def _create_model(self, java_model):
return Word2VecModel(java_model)


class Word2VecModel(JavaModel):
"""
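The other recurring change in feature.py: each JavaEstimator now overrides _create_model, so fit() wraps the fitted JVM model in the matching Python class (IDFModel, StandardScalerModel, StringIndexerModel, VectorIndexerModel, Word2VecModel). A minimal sketch of the StringIndexer case, assuming an active Spark 1.4-era session with a SQLContext named sqlContext; the toy DataFrame is an assumption chosen to be consistent with the doctest output shown above.

from pyspark.ml.feature import StringIndexer

df = sqlContext.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")], ["id", "label"])

indexer = StringIndexer(inputCol="label", outputCol="indexed")
model = indexer.fit(df)        # a StringIndexerModel, built via _create_model
model.transform(df).collect()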
2 changes: 1 addition & 1 deletion python/pyspark/ml/wrapper.py
@@ -51,7 +51,7 @@ class JavaWrapper(Params):
_java_obj = None

@staticmethod
def _new_java_obj(self, java_class, *args):
def _new_java_obj(java_class, *args):
"""
Construct a new Java object.
"""
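Taken together with the feature.py changes, this turns _new_java_obj into a plain static factory: each Python wrapper now constructs its Java peer eagerly in __init__ instead of declaring a _java_class string for the base class to resolve later. A sketch of the resulting pattern, re-wrapping the existing Tokenizer JVM class under a hypothetical MyTokenizer name (requires an active SparkContext to build the Java object):

from pyspark.ml.util import keyword_only
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.ml.wrapper import JavaTransformer


class MyTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
    """Illustrative wrapper following the pattern introduced in this commit."""

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        super(MyTokenizer, self).__init__()
        # _new_java_obj is now a @staticmethod: (java_class, *constructor_args)
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.feature.Tokenizer", self.uid)
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None):
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)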
