From 01cd26f856d7236035faf0c42f1f8f01ebbb2ce7 Mon Sep 17 00:00:00 2001 From: Augustin Borsu Date: Tue, 10 Feb 2015 10:52:47 +0100 Subject: [PATCH 01/15] RegExTokenizer A more complex tokenizer that extracts tokens based on a regex. It also allows to turn lowerCasing on and off, adding a minimum token length and a list of stop words to exclude. --- .../apache/spark/ml/feature/Tokenizer.scala | 53 ++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index 0b1f90daa7d8e..d05cd1904504e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.feature import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.UnaryTransformer -import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.{ParamMap,IntParam,BooleanParam} import org.apache.spark.sql.types.{DataType, StringType, ArrayType} /** @@ -39,3 +39,54 @@ class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] { override protected def outputDataType: DataType = new ArrayType(StringType, false) } + + +/** + * :: AlphaComponent :: + * A regex based tokenizer that extracts tokens using a regex. + * Optional additional parameters include enabling lowercase stabdarization, a minimum character + * size for tokens as well as an array of stop words to remove from the results. + */ +@AlphaComponent +class RegexTokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] { + + val lowercase = new BooleanParam(this, "numFeatures", "number of features", Some(true)) + def setLowercase(value: Boolean) = set(lowercase, value) + def getLowercase: Boolean = get(lowercase) + + val minLength = new IntParam(this, "numFeatures", "number of features", Some(0)) + def setMinLength(value: Int) = set(minLength, value) + def getMinLength: Int = get(minLength) + + val regEx = "\\p{L}+|[^\\p{L}\\s]+".r + // def setRegex(value: scala.util.matching.Regex) = set(regEx, value) + // def getRegex: scala.util.matching.Regex = get(regEx) + + val stopWords = Array[String]() + // def setStopWords(value: Array[String]) = set(stopWords, value) + // def getStopWords: Array[String] = get(stopWords) + + + override protected def createTransformFunc(paramMap: ParamMap): String => Seq[String] = { x => + + var string = x + if (paramMap(lowercase)) { + string = string.toLowerCase + } + var tokens = (regEx findAllIn string).toList + + if(paramMap(minLength) > 0){ + tokens = tokens.filter(_.length > paramMap(minLength)) + } + if(stopWords.length > 0){ + tokens = tokens.filter(!stopWords.contains(_)) + } + tokens + } + + override protected def validateInputType(inputType: DataType): Unit = { + require(inputType == StringType, s"Input type must be string type but got $inputType.") + } + + override protected def outputDataType: DataType = new ArrayType(StringType, false) +} From 9547e9df7f64c74f33526b26b92f6f1ef841ae3c Mon Sep 17 00:00:00 2001 From: Augustin Borsu Date: Tue, 10 Feb 2015 11:39:39 +0100 Subject: [PATCH 02/15] RegEx Tokenizer A more complex tokenizer that extracts tokens based on a regex. It also allows to turn lowerCasing on and off, adding a minimum token length and a list of stop words to exclude. --- .../main/scala/org/apache/spark/ml/feature/Tokenizer.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index d05cd1904504e..45725788e3219 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.feature import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.UnaryTransformer -import org.apache.spark.ml.param.{ParamMap,IntParam,BooleanParam} +import org.apache.spark.ml.param.{ParamMap, IntParam, BooleanParam} import org.apache.spark.sql.types.{DataType, StringType, ArrayType} /** @@ -48,7 +48,7 @@ class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] { * size for tokens as well as an array of stop words to remove from the results. */ @AlphaComponent -class RegexTokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] { +class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenizer] { val lowercase = new BooleanParam(this, "numFeatures", "number of features", Some(true)) def setLowercase(value: Boolean) = set(lowercase, value) From 9f8685aed3e0449dee4cc47dcc7ad0de73859af8 Mon Sep 17 00:00:00 2001 From: Augustin Borsu Date: Wed, 11 Feb 2015 10:36:31 +0100 Subject: [PATCH 03/15] RegexTokenizer Regex and stopwords parameters are now part of the parametergrid --- .../apache/spark/ml/feature/Tokenizer.scala | 44 ++++++++++++------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index 45725788e3219..99fc5eddb8d18 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.feature import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.UnaryTransformer -import org.apache.spark.ml.param.{ParamMap, IntParam, BooleanParam} +import org.apache.spark.ml.param.{ParamMap, IntParam, BooleanParam, Param} import org.apache.spark.sql.types.{DataType, StringType, ArrayType} /** @@ -50,36 +50,48 @@ class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] { @AlphaComponent class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenizer] { - val lowercase = new BooleanParam(this, "numFeatures", "number of features", Some(true)) - def setLowercase(value: Boolean) = set(lowercase, value) - def getLowercase: Boolean = get(lowercase) - - val minLength = new IntParam(this, "numFeatures", "number of features", Some(0)) + val lowerCase = new BooleanParam(this, + "lowerCase", + "enable case folding to lower case", + Some(true)) + def setLowercase(value: Boolean) = set(lowerCase, value) + def getLowercase: Boolean = get(lowerCase) + + val minLength = new IntParam(this, + "minLength", + "minimum token length (excluded)", + Some(0)) def setMinLength(value: Int) = set(minLength, value) def getMinLength: Int = get(minLength) - val regEx = "\\p{L}+|[^\\p{L}\\s]+".r - // def setRegex(value: scala.util.matching.Regex) = set(regEx, value) - // def getRegex: scala.util.matching.Regex = get(regEx) + val regEx = new Param(this, + "regEx", + "RegEx used for tokenizing", + Some("\\p{L}+|[^\\p{L}\\s]+".r)) + def setRegex(value: scala.util.matching.Regex) = set(regEx, value) + def getRegex: scala.util.matching.Regex = get(regEx) - val stopWords = Array[String]() - // def setStopWords(value: Array[String]) = set(stopWords, value) - // def getStopWords: Array[String] = get(stopWords) + val stopWords = new Param(this, + "stopWords", + "array of tokens to filter from results", + Some(Array[String]())) + def setStopWords(value: Array[String]) = set(stopWords, value) + def getStopWords: Array[String] = get(stopWords) override protected def createTransformFunc(paramMap: ParamMap): String => Seq[String] = { x => var string = x - if (paramMap(lowercase)) { + if (paramMap(lowerCase)) { string = string.toLowerCase } - var tokens = (regEx findAllIn string).toList + var tokens = (paramMap(regEx) findAllIn string).toList if(paramMap(minLength) > 0){ tokens = tokens.filter(_.length > paramMap(minLength)) } - if(stopWords.length > 0){ - tokens = tokens.filter(!stopWords.contains(_)) + if(paramMap(stopWords).length > 0){ + tokens = tokens.filter(!paramMap(stopWords).contains(_)) } tokens } From 9082fc3249e340cb6d99990e33b8a2156fa7861f Mon Sep 17 00:00:00 2001 From: Augustin Borsu Date: Tue, 3 Mar 2015 01:14:56 +0100 Subject: [PATCH 04/15] Removed stopwords parameters and updated doc Still need to add unit test. --- .../apache/spark/ml/feature/Tokenizer.scala | 61 +++++++------------ 1 file changed, 22 insertions(+), 39 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index 99fc5eddb8d18..53a89394b48f8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -43,57 +43,40 @@ class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] { /** * :: AlphaComponent :: - * A regex based tokenizer that extracts tokens using a regex. - * Optional additional parameters include enabling lowercase stabdarization, a minimum character - * size for tokens as well as an array of stop words to remove from the results. + * A regex based tokenizer that extracts tokens either by repeatedly matching the regex(default) + * or using it to split the text (set matching to false). Optional parameters also allow to fold + * the text to lowercase prior to it being tokenized and to filer tokens using a minimal length. + * It returns an array of strings that can be empty. + * The default parameters are regex = "\\p{L}+|[^\\p{L}\\s]+", matching = true, + * lowercase = false, minTokenLength = 1 */ @AlphaComponent class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenizer] { - val lowerCase = new BooleanParam(this, - "lowerCase", - "enable case folding to lower case", - Some(true)) + val lowerCase = new BooleanParam(this, "lowerCase", "Folds case to lower case", Some(false)) def setLowercase(value: Boolean) = set(lowerCase, value) def getLowercase: Boolean = get(lowerCase) - val minLength = new IntParam(this, - "minLength", - "minimum token length (excluded)", - Some(0)) - def setMinLength(value: Int) = set(minLength, value) - def getMinLength: Int = get(minLength) + val minTokenLength = new IntParam(this, "minLength", "minimum token length", Some(1)) + def setMinTokenLength(value: Int) = set(minTokenLength, value) + def getMinTokenLength: Int = get(minTokenLength) - val regEx = new Param(this, - "regEx", - "RegEx used for tokenizing", - Some("\\p{L}+|[^\\p{L}\\s]+".r)) - def setRegex(value: scala.util.matching.Regex) = set(regEx, value) - def getRegex: scala.util.matching.Regex = get(regEx) - - val stopWords = new Param(this, - "stopWords", - "array of tokens to filter from results", - Some(Array[String]())) - def setStopWords(value: Array[String]) = set(stopWords, value) - def getStopWords: Array[String] = get(stopWords) + val matching = new BooleanParam(this, "matching", "Sets regex to matching or split", Some(true)) + def setMatching(value: Boolean) = set(matching, value) + def getMatching: Boolean = get(matching) + val regex = new Param(this, "regex", "regex used for tokenizing", Some("\\p{L}+|[^\\p{L}\\s]+")) + def setRegex(value: String) = set(regex, value) + def getRegex: String = get(regex) override protected def createTransformFunc(paramMap: ParamMap): String => Seq[String] = { x => - var string = x - if (paramMap(lowerCase)) { - string = string.toLowerCase - } - var tokens = (paramMap(regEx) findAllIn string).toList - - if(paramMap(minLength) > 0){ - tokens = tokens.filter(_.length > paramMap(minLength)) - } - if(paramMap(stopWords).length > 0){ - tokens = tokens.filter(!paramMap(stopWords).contains(_)) - } - tokens + val str = if (paramMap(lowerCase)) x.toLowerCase else x + + val re = paramMap(regex) + val tokens = if(paramMap(matching))(re.r.findAllIn(str)).toList else str.split(re).toList + + tokens.filter(_.length >= paramMap(minTokenLength)) } override protected def validateInputType(inputType: DataType): Unit = { From d3ef6d32ee4e9da8feb2db5291763dbb8b53b1ea Mon Sep 17 00:00:00 2001 From: Augustin Borsu Date: Tue, 3 Mar 2015 01:41:48 +0100 Subject: [PATCH 05/15] Added doc to RegexTokenizer Added @groups {param, setParam, getParam} documentation to RegexTokenizer --- .../apache/spark/ml/feature/Tokenizer.scala | 36 +++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index 53a89394b48f8..824196c772d06 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -53,20 +53,52 @@ class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] { @AlphaComponent class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenizer] { - val lowerCase = new BooleanParam(this, "lowerCase", "Folds case to lower case", Some(false)) + /** + * param to enable/disable code folding to lowercase prior tokenization + * @group param + */ + val lowerCase = new BooleanParam(this, "lowerCase", "Fold case to lower case", Some(false)) + + /** @group setParam */ def setLowercase(value: Boolean) = set(lowerCase, value) + + /** @group getParam */ def getLowercase: Boolean = get(lowerCase) + /** + * param for minimum token length + * @group param + */ val minTokenLength = new IntParam(this, "minLength", "minimum token length", Some(1)) + + /** @group setParam */ def setMinTokenLength(value: Int) = set(minTokenLength, value) + + /** @group getParam */ def getMinTokenLength: Int = get(minTokenLength) - val matching = new BooleanParam(this, "matching", "Sets regex to matching or split", Some(true)) + /** + * param sets regex as matching (true) or splitting (false) + * @group param + */ + val matching = new BooleanParam(this, "matching", "Set regex to matching or split", Some(true)) + + /** @group setParam */ def setMatching(value: Boolean) = set(matching, value) + + /** @group getParam */ def getMatching: Boolean = get(matching) + /** + * param sets regex used by tokenizer + * @group param + */ val regex = new Param(this, "regex", "regex used for tokenizing", Some("\\p{L}+|[^\\p{L}\\s]+")) + + /** @group setParam */ def setRegex(value: String) = set(regex, value) + + /** @group getParam */ def getRegex: String = get(regex) override protected def createTransformFunc(paramMap: ParamMap): String => Seq[String] = { x => From 132b00bd236121f90e7c3aa9270a64d3ede4417d Mon Sep 17 00:00:00 2001 From: Augustin Borsu Date: Tue, 17 Mar 2015 16:16:14 +0100 Subject: [PATCH 06/15] Changed matching to gaps and removed case folding --- .../apache/spark/ml/feature/Tokenizer.scala | 26 +++++-------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index 824196c772d06..93118f38133ed 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -53,18 +53,6 @@ class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] { @AlphaComponent class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenizer] { - /** - * param to enable/disable code folding to lowercase prior tokenization - * @group param - */ - val lowerCase = new BooleanParam(this, "lowerCase", "Fold case to lower case", Some(false)) - - /** @group setParam */ - def setLowercase(value: Boolean) = set(lowerCase, value) - - /** @group getParam */ - def getLowercase: Boolean = get(lowerCase) - /** * param for minimum token length * @group param @@ -78,16 +66,16 @@ class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenize def getMinTokenLength: Int = get(minTokenLength) /** - * param sets regex as matching (true) or splitting (false) + * param sets regex as matching gaps(true) or tokens (false) * @group param */ - val matching = new BooleanParam(this, "matching", "Set regex to matching or split", Some(true)) + val gaps = new BooleanParam(this, "gaps", "Set regex to match gaps or tokens", Some(false)) /** @group setParam */ - def setMatching(value: Boolean) = set(matching, value) + def setGaps(value: Boolean) = set(gaps, value) /** @group getParam */ - def getMatching: Boolean = get(matching) + def getGaps: Boolean = get(gaps) /** * param sets regex used by tokenizer @@ -101,12 +89,10 @@ class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenize /** @group getParam */ def getRegex: String = get(regex) - override protected def createTransformFunc(paramMap: ParamMap): String => Seq[String] = { x => - - val str = if (paramMap(lowerCase)) x.toLowerCase else x + override protected def createTransformFunc(paramMap: ParamMap): String => Seq[String] = { str => val re = paramMap(regex) - val tokens = if(paramMap(matching))(re.r.findAllIn(str)).toList else str.split(re).toList + val tokens = if(paramMap(gaps)) str.split(re).toList else (re.r.findAllIn(str)).toList tokens.filter(_.length >= paramMap(minTokenLength)) } From cd6642e9810135c6600cdc0bf209c5f7c264949b Mon Sep 17 00:00:00 2001 From: Augustin Borsu Date: Tue, 17 Mar 2015 16:36:33 +0100 Subject: [PATCH 07/15] Changed regex to pattern As we changed matching to gaps to be consistent with nltk's RegexTokenizer, it is also more consistent to use pattern instead of regex. --- .../scala/org/apache/spark/ml/feature/Tokenizer.scala | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index 93118f38133ed..ab22c26e41b04 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -81,17 +81,18 @@ class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenize * param sets regex used by tokenizer * @group param */ - val regex = new Param(this, "regex", "regex used for tokenizing", Some("\\p{L}+|[^\\p{L}\\s]+")) + val pattern = new Param(this, "pattern", + "regex pattern used for tokenizing", Some("\\p{L}+|[^\\p{L}\\s]+")) /** @group setParam */ - def setRegex(value: String) = set(regex, value) + def setPattern(value: String) = set(pattern, value) /** @group getParam */ - def getRegex: String = get(regex) + def getPattern: String = get(pattern) override protected def createTransformFunc(paramMap: ParamMap): String => Seq[String] = { str => - val re = paramMap(regex) + val re = paramMap(pattern) val tokens = if(paramMap(gaps)) str.split(re).toList else (re.r.findAllIn(str)).toList tokens.filter(_.length >= paramMap(minTokenLength)) From e262bacc5fc3cc780466f49200cdf2466ce8e563 Mon Sep 17 00:00:00 2001 From: Augustin Borsu Date: Wed, 18 Mar 2015 14:13:03 +0100 Subject: [PATCH 08/15] Added unit tests in scala Also changed RegexTokenizer so it extends Tokenizer Class instead of UnaryTransformer It might be interesting to create a Tokenizer trait that could be used by all tokenizers --- .../apache/spark/ml/feature/Tokenizer.scala | 11 +-- .../spark/ml/feature/TokenizerSuite.scala | 94 +++++++++++++++++++ 2 files changed, 99 insertions(+), 6 deletions(-) create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index ab22c26e41b04..fab89c42b1aca 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -40,7 +40,6 @@ class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] { override protected def outputDataType: DataType = new ArrayType(StringType, false) } - /** * :: AlphaComponent :: * A regex based tokenizer that extracts tokens either by repeatedly matching the regex(default) @@ -51,10 +50,10 @@ class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] { * lowercase = false, minTokenLength = 1 */ @AlphaComponent -class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenizer] { +class RegexTokenizer extends Tokenizer { /** - * param for minimum token length + * param for minimum token length, default is one to avoid returning empty strings * @group param */ val minTokenLength = new IntParam(this, "minLength", "minimum token length", Some(1)) @@ -66,7 +65,7 @@ class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenize def getMinTokenLength: Int = get(minTokenLength) /** - * param sets regex as matching gaps(true) or tokens (false) + * param sets regex as splitting on gaps(true) or matching tokens (false) * @group param */ val gaps = new BooleanParam(this, "gaps", "Set regex to match gaps or tokens", Some(false)) @@ -78,7 +77,7 @@ class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenize def getGaps: Boolean = get(gaps) /** - * param sets regex used by tokenizer + * param sets regex pattern used by tokenizer * @group param */ val pattern = new Param(this, "pattern", @@ -95,7 +94,7 @@ class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenize val re = paramMap(pattern) val tokens = if(paramMap(gaps)) str.split(re).toList else (re.r.findAllIn(str)).toList - tokens.filter(_.length >= paramMap(minTokenLength)) + tokens.filter(_.length >= paramMap(minTokenLength)).toSeq } override protected def validateInputType(inputType: DataType): Unit = { diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala new file mode 100644 index 0000000000000..c19ea225940ee --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + +import org.scalatest.FunSuite + +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.{DataFrame, Row, SQLContext} + + +case class TextData(rawText : String,wantedTokens: Seq[String]) +class TokenizerSuite extends FunSuite with MLlibTestSparkContext { + + @transient var sqlContext: SQLContext = _ + @transient var dataset: DataFrame = _ + + override def beforeAll(): Unit = { + super.beforeAll() + sqlContext = new SQLContext(sc) + } + + test("RegexTokenizer"){ + var myRegExTokenizer = new RegexTokenizer() + .setInputCol("rawText") + .setOutputCol("tokens") + + dataset = sqlContext.createDataFrame( + sc.parallelize(List( + TextData("Test for tokenization.",List("Test","for","tokenization",".")), + TextData("Te,st. punct",List("Te",",","st",".","punct")) + ))) + testTokenizer(myRegExTokenizer,dataset) + + dataset = sqlContext.createDataFrame( + sc.parallelize(List( + TextData("Test for tokenization.",List("Test","for","tokenization")), + TextData("Te,st. punct",List("punct")) + ))) + myRegExTokenizer.asInstanceOf[RegexTokenizer] + .setMinTokenLength(3) + testTokenizer(myRegExTokenizer,dataset) + + myRegExTokenizer.asInstanceOf[RegexTokenizer] + .setPattern("\\s") + .setGaps(true) + .setMinTokenLength(0) + dataset = sqlContext.createDataFrame( + sc.parallelize(List( + TextData("Test for tokenization.",List("Test","for","tokenization.")), + TextData("Te,st. punct",List("Te,st.","","punct")) + ))) + testTokenizer(myRegExTokenizer,dataset) + } + + test("Tokenizer"){ + val oldTokenizer = new Tokenizer() + .setInputCol("rawText") + .setOutputCol("tokens") + dataset = sqlContext.createDataFrame( + sc.parallelize(List( + TextData("Test for tokenization.",List("test","for","tokenization.")), + TextData("Te,st. punct",List("te,st.","","punct")) + ))) + testTokenizer(oldTokenizer,dataset) + } + + def testTokenizer(t: Tokenizer,dataset: DataFrame){ + t.transform(dataset) + .select("tokens","wantedTokens") + .collect().foreach{ + case Row(tokens: Seq[String], wantedTokens: Seq[String]) => + assert(tokens.length == wantedTokens.length) + tokens.zip(wantedTokens).foreach(x => assert(x._1 == x._2)) + case _ => + println() + assert(false) + } + } +} From b66313fb2346878f2d4de7259148bb4473af3932 Mon Sep 17 00:00:00 2001 From: Augustin Borsu Date: Thu, 19 Mar 2015 09:40:39 +0100 Subject: [PATCH 09/15] Modified the pattern Param so it is compiled when given to the Tokenizer pattern is set and get as a string but stored as a compiled regex this prevents having to recompile it everytime the transform function is called --- .../org/apache/spark/ml/feature/Tokenizer.scala | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index fab89c42b1aca..b793665eca2ef 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -50,7 +50,7 @@ class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] { * lowercase = false, minTokenLength = 1 */ @AlphaComponent -class RegexTokenizer extends Tokenizer { +class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenizer] { /** * param for minimum token length, default is one to avoid returning empty strings @@ -65,7 +65,7 @@ class RegexTokenizer extends Tokenizer { def getMinTokenLength: Int = get(minTokenLength) /** - * param sets regex as splitting on gaps(true) or matching tokens (false) + * param sets regex as splitting on gaps (true) or matching tokens (false) * @group param */ val gaps = new BooleanParam(this, "gaps", "Set regex to match gaps or tokens", Some(false)) @@ -81,20 +81,20 @@ class RegexTokenizer extends Tokenizer { * @group param */ val pattern = new Param(this, "pattern", - "regex pattern used for tokenizing", Some("\\p{L}+|[^\\p{L}\\s]+")) + "regex pattern used for tokenizing", Some("\\p{L}+|[^\\p{L}\\s]+".r)) /** @group setParam */ - def setPattern(value: String) = set(pattern, value) + def setPattern(value: String) = set(pattern, value.r) /** @group getParam */ - def getPattern: String = get(pattern) + def getPattern: String = get(pattern).toString override protected def createTransformFunc(paramMap: ParamMap): String => Seq[String] = { str => val re = paramMap(pattern) - val tokens = if(paramMap(gaps)) str.split(re).toList else (re.r.findAllIn(str)).toList + val tokens = if(paramMap(gaps)) re.split(str).toList else (re.findAllIn(str)).toList - tokens.filter(_.length >= paramMap(minTokenLength)).toSeq + tokens.filter(_.length >= paramMap(minTokenLength)) } override protected def validateInputType(inputType: DataType): Unit = { From 38b95a189ac034172721a7594d207e0bcecf272c Mon Sep 17 00:00:00 2001 From: Augustin Borsu Date: Thu, 19 Mar 2015 09:44:19 +0100 Subject: [PATCH 10/15] Added Java unit test for RegexTokenizer This unit tests only tests that the getters and setters work properly from java and tries one transform. Other tests have already been carried out in the TokenizerSuite.scala test. --- .../spark/ml/feature/JavaTokenizerSuite.java | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java new file mode 100644 index 0000000000000..0fea933df50d3 --- /dev/null +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature; + +import java.util.Arrays; +import java.util.List; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; + +public class JavaTokenizerSuite { + private transient JavaSparkContext jsc; + private transient SQLContext jsql; + + @Before + public void setUp() { + jsc = new JavaSparkContext("local", "JavaTokenizerSuite"); + jsql = new SQLContext(jsc); + } + + @After + public void tearDown() { + jsc.stop(); + jsc = null; + } + + @Test + public void RegexTokenizer() { + RegexTokenizer myRegExTokenizer = new RegexTokenizer() + .setInputCol("rawText") + .setOutputCol("tokens") + .setPattern("\\s") + .setGaps(true) + .setMinTokenLength(0); + + List t = Arrays.asList( + "{\"rawText\": \"Test of tok.\", \"wantedTokens\": [\"Test\", \"of\", \"tok.\"]}", + "{\"rawText\": \"Te,st. punct\", \"wantedTokens\": [\"Te,st.\",\"\",\"punct\"]}"); + + JavaRDD myRdd = jsc.parallelize(t); + DataFrame dataset = jsql.jsonRDD(myRdd); + + Row[] pairs = myRegExTokenizer.transform(dataset) + .select("tokens","wantedTokens") + .collect(); + + Assert.assertEquals(pairs[0].get(0), pairs[0].get(1)); + Assert.assertEquals(pairs[1].get(0), pairs[1].get(1)); + } +} From 6a859820513050a41b46294d741e99f57c999a56 Mon Sep 17 00:00:00 2001 From: Augustin Borsu Date: Thu, 19 Mar 2015 09:46:44 +0100 Subject: [PATCH 11/15] Style corrections --- .../spark/ml/feature/TokenizerSuite.scala | 45 +++++++++++-------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala index c19ea225940ee..970eda28ae25f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala @@ -19,15 +19,14 @@ package org.apache.spark.ml.feature import org.scalatest.FunSuite +import org.apache.spark.SparkException import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row, SQLContext} - -case class TextData(rawText : String,wantedTokens: Seq[String]) +case class TextData(rawText : String, wantedTokens: Seq[String]) class TokenizerSuite extends FunSuite with MLlibTestSparkContext { @transient var sqlContext: SQLContext = _ - @transient var dataset: DataFrame = _ override def beforeAll(): Unit = { super.beforeAll() @@ -35,16 +34,16 @@ class TokenizerSuite extends FunSuite with MLlibTestSparkContext { } test("RegexTokenizer"){ - var myRegExTokenizer = new RegexTokenizer() + val myRegExTokenizer = new RegexTokenizer() .setInputCol("rawText") .setOutputCol("tokens") - dataset = sqlContext.createDataFrame( + var dataset = sqlContext.createDataFrame( sc.parallelize(List( TextData("Test for tokenization.",List("Test","for","tokenization",".")), TextData("Te,st. punct",List("Te",",","st",".","punct")) ))) - testTokenizer(myRegExTokenizer,dataset) + testRegexTokenizer(myRegExTokenizer,dataset) dataset = sqlContext.createDataFrame( sc.parallelize(List( @@ -53,7 +52,7 @@ class TokenizerSuite extends FunSuite with MLlibTestSparkContext { ))) myRegExTokenizer.asInstanceOf[RegexTokenizer] .setMinTokenLength(3) - testTokenizer(myRegExTokenizer,dataset) + testRegexTokenizer(myRegExTokenizer,dataset) myRegExTokenizer.asInstanceOf[RegexTokenizer] .setPattern("\\s") @@ -64,14 +63,14 @@ class TokenizerSuite extends FunSuite with MLlibTestSparkContext { TextData("Test for tokenization.",List("Test","for","tokenization.")), TextData("Te,st. punct",List("Te,st.","","punct")) ))) - testTokenizer(myRegExTokenizer,dataset) + testRegexTokenizer(myRegExTokenizer,dataset) } - test("Tokenizer"){ + test("Tokenizer") { val oldTokenizer = new Tokenizer() .setInputCol("rawText") .setOutputCol("tokens") - dataset = sqlContext.createDataFrame( + var dataset = sqlContext.createDataFrame( sc.parallelize(List( TextData("Test for tokenization.",List("test","for","tokenization.")), TextData("Te,st. punct",List("te,st.","","punct")) @@ -79,16 +78,26 @@ class TokenizerSuite extends FunSuite with MLlibTestSparkContext { testTokenizer(oldTokenizer,dataset) } - def testTokenizer(t: Tokenizer,dataset: DataFrame){ - t.transform(dataset) + def testTokenizer(t: Tokenizer,dataset: DataFrame): Unit = { + t.transform(dataset) .select("tokens","wantedTokens") .collect().foreach{ - case Row(tokens: Seq[String], wantedTokens: Seq[String]) => - assert(tokens.length == wantedTokens.length) - tokens.zip(wantedTokens).foreach(x => assert(x._1 == x._2)) - case _ => - println() - assert(false) + case Row(tokens: Seq[Any], wantedTokens: Seq[Any]) => + assert(tokens === wantedTokens) + case e => + throw new SparkException(s"Row $e should contain only tokens and wantedTokens columns") } } + + def testRegexTokenizer(t: RegexTokenizer,dataset: DataFrame): Unit = { + t.transform(dataset) + .select("tokens","wantedTokens") + .collect().foreach{ + case Row(tokens: Seq[Any], wantedTokens: Seq[Any]) => + assert(tokens === wantedTokens) + case e => + throw new SparkException(s"Row $e should contain only tokens and wantedTokens columns") + } + } + } From 148126fc0af7682ea951632a29033a0f9d464811 Mon Sep 17 00:00:00 2001 From: Augustin Borsu Date: Mon, 23 Mar 2015 11:17:18 +0100 Subject: [PATCH 12/15] Added return type to public functions Plus some cosmetic changes. --- .../apache/spark/ml/feature/Tokenizer.scala | 13 ++--- .../spark/ml/feature/JavaTokenizerSuite.java | 2 +- .../spark/ml/feature/TokenizerSuite.scala | 53 +++++++++---------- 3 files changed, 33 insertions(+), 35 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index b793665eca2ef..e069f4c49c573 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -56,10 +56,10 @@ class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenize * param for minimum token length, default is one to avoid returning empty strings * @group param */ - val minTokenLength = new IntParam(this, "minLength", "minimum token length", Some(1)) + val minTokenLength: IntParam = new IntParam(this, "minLength", "minimum token length", Some(1)) /** @group setParam */ - def setMinTokenLength(value: Int) = set(minTokenLength, value) + def setMinTokenLength(value: Int): this.type = set(minTokenLength, value) /** @group getParam */ def getMinTokenLength: Int = get(minTokenLength) @@ -68,10 +68,11 @@ class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenize * param sets regex as splitting on gaps (true) or matching tokens (false) * @group param */ - val gaps = new BooleanParam(this, "gaps", "Set regex to match gaps or tokens", Some(false)) + val gaps: BooleanParam = new BooleanParam(this, "gaps", + "Set regex to match gaps or tokens", Some(false)) /** @group setParam */ - def setGaps(value: Boolean) = set(gaps, value) + def setGaps(value: Boolean): this.type = set(gaps, value) /** @group getParam */ def getGaps: Boolean = get(gaps) @@ -80,11 +81,11 @@ class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenize * param sets regex pattern used by tokenizer * @group param */ - val pattern = new Param(this, "pattern", + val pattern: Param[scala.util.matching.Regex] = new Param(this, "pattern", "regex pattern used for tokenizing", Some("\\p{L}+|[^\\p{L}\\s]+".r)) /** @group setParam */ - def setPattern(value: String) = set(pattern, value.r) + def setPattern(value: String): this.type = set(pattern, value.r) /** @group getParam */ def getPattern: String = get(pattern).toString diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java index 0fea933df50d3..41e0aba55745c 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java @@ -58,7 +58,7 @@ public void RegexTokenizer() { List t = Arrays.asList( "{\"rawText\": \"Test of tok.\", \"wantedTokens\": [\"Test\", \"of\", \"tok.\"]}", - "{\"rawText\": \"Te,st. punct\", \"wantedTokens\": [\"Te,st.\",\"\",\"punct\"]}"); + "{\"rawText\": \"Te,st. punct\", \"wantedTokens\": [\"Te,st.\", \"\", \"punct\"]}"); JavaRDD myRdd = jsc.parallelize(t); DataFrame dataset = jsql.jsonRDD(myRdd); diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala index 970eda28ae25f..ffd18de2f7d02 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala @@ -23,7 +23,8 @@ import org.apache.spark.SparkException import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row, SQLContext} -case class TextData(rawText : String, wantedTokens: Seq[String]) +case class TextData(rawText: String, wantedTokens: Seq[String]) + class TokenizerSuite extends FunSuite with MLlibTestSparkContext { @transient var sqlContext: SQLContext = _ @@ -33,37 +34,37 @@ class TokenizerSuite extends FunSuite with MLlibTestSparkContext { sqlContext = new SQLContext(sc) } - test("RegexTokenizer"){ + test("RegexTokenizer") { val myRegExTokenizer = new RegexTokenizer() .setInputCol("rawText") .setOutputCol("tokens") var dataset = sqlContext.createDataFrame( - sc.parallelize(List( - TextData("Test for tokenization.",List("Test","for","tokenization",".")), - TextData("Te,st. punct",List("Te",",","st",".","punct")) + sc.parallelize(Seq( + TextData("Test for tokenization.", Seq("Test", "for", "tokenization", ".")), + TextData("Te,st. punct", Seq("Te", ",", "st", ".", "punct")) ))) - testRegexTokenizer(myRegExTokenizer,dataset) + testRegexTokenizer(myRegExTokenizer, dataset) dataset = sqlContext.createDataFrame( - sc.parallelize(List( - TextData("Test for tokenization.",List("Test","for","tokenization")), - TextData("Te,st. punct",List("punct")) + sc.parallelize(Seq( + TextData("Test for tokenization.", Seq("Test", "for", "tokenization")), + TextData("Te,st. punct", Seq("punct")) ))) myRegExTokenizer.asInstanceOf[RegexTokenizer] .setMinTokenLength(3) - testRegexTokenizer(myRegExTokenizer,dataset) + testRegexTokenizer(myRegExTokenizer, dataset) myRegExTokenizer.asInstanceOf[RegexTokenizer] .setPattern("\\s") .setGaps(true) .setMinTokenLength(0) dataset = sqlContext.createDataFrame( - sc.parallelize(List( - TextData("Test for tokenization.",List("Test","for","tokenization.")), - TextData("Te,st. punct",List("Te,st.","","punct")) + sc.parallelize(Seq( + TextData("Test for tokenization.", Seq("Test", "for", "tokenization.")), + TextData("Te,st. punct", Seq("Te,st.", "", "punct")) ))) - testRegexTokenizer(myRegExTokenizer,dataset) + testRegexTokenizer(myRegExTokenizer, dataset) } test("Tokenizer") { @@ -71,32 +72,28 @@ class TokenizerSuite extends FunSuite with MLlibTestSparkContext { .setInputCol("rawText") .setOutputCol("tokens") var dataset = sqlContext.createDataFrame( - sc.parallelize(List( - TextData("Test for tokenization.",List("test","for","tokenization.")), - TextData("Te,st. punct",List("te,st.","","punct")) + sc.parallelize(Seq( + TextData("Test for tokenization.", Seq("test", "for", "tokenization.")), + TextData("Te,st. punct", Seq("te,st.", "", "punct")) ))) - testTokenizer(oldTokenizer,dataset) + testTokenizer(oldTokenizer, dataset) } - def testTokenizer(t: Tokenizer,dataset: DataFrame): Unit = { + def testTokenizer(t: Tokenizer, dataset: DataFrame): Unit = { t.transform(dataset) - .select("tokens","wantedTokens") - .collect().foreach{ + .select("tokens", "wantedTokens") + .collect().foreach { case Row(tokens: Seq[Any], wantedTokens: Seq[Any]) => assert(tokens === wantedTokens) - case e => - throw new SparkException(s"Row $e should contain only tokens and wantedTokens columns") } } - def testRegexTokenizer(t: RegexTokenizer,dataset: DataFrame): Unit = { + def testRegexTokenizer(t: RegexTokenizer, dataset: DataFrame): Unit = { t.transform(dataset) - .select("tokens","wantedTokens") - .collect().foreach{ + .select("tokens", "wantedTokens") + .collect().foreach { case Row(tokens: Seq[Any], wantedTokens: Seq[Any]) => assert(tokens === wantedTokens) - case e => - throw new SparkException(s"Row $e should contain only tokens and wantedTokens columns") } } From e88d7b8aa95cf5a21e8264761c13b2cf6f75e91e Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Mon, 23 Mar 2015 14:17:10 -0700 Subject: [PATCH 13/15] change pattern to a StringParameter; update tests --- .../apache/spark/ml/feature/Tokenizer.scala | 17 ++-- .../spark/ml/feature/JavaTokenizerSuite.java | 48 +++++------ .../spark/ml/feature/TokenizerSuite.scala | 83 ++++++++----------- 3 files changed, 65 insertions(+), 83 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index e069f4c49c573..16c1439e7554a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -81,21 +81,20 @@ class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenize * param sets regex pattern used by tokenizer * @group param */ - val pattern: Param[scala.util.matching.Regex] = new Param(this, "pattern", - "regex pattern used for tokenizing", Some("\\p{L}+|[^\\p{L}\\s]+".r)) + val pattern: Param[String] = new Param( + this, "pattern", "regex pattern used for tokenizing", Some("\\p{L}+|[^\\p{L}\\s]+")) /** @group setParam */ - def setPattern(value: String): this.type = set(pattern, value.r) + def setPattern(value: String): this.type = set(pattern, value) /** @group getParam */ - def getPattern: String = get(pattern).toString + def getPattern: String = get(pattern) override protected def createTransformFunc(paramMap: ParamMap): String => Seq[String] = { str => - - val re = paramMap(pattern) - val tokens = if(paramMap(gaps)) re.split(str).toList else (re.findAllIn(str)).toList - - tokens.filter(_.length >= paramMap(minTokenLength)) + val re = paramMap(pattern).r + val tokens = if (paramMap(gaps)) re.split(str).toSeq else re.findAllIn(str).toSeq + val minLength = paramMap(minTokenLength) + tokens.filter(_.length >= minLength) } override protected def validateInputType(inputType: DataType): Unit = { diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java index 41e0aba55745c..e877058564917 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java @@ -17,9 +17,7 @@ package org.apache.spark.ml.feature; -import java.util.Arrays; -import java.util.List; - +import com.google.common.collect.Lists; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -36,38 +34,38 @@ public class JavaTokenizerSuite { private transient SQLContext jsql; @Before - public void setUp() { + public void setUp() { jsc = new JavaSparkContext("local", "JavaTokenizerSuite"); - jsql = new SQLContext(jsc); - } + jsql = new SQLContext(jsc); + } - @After - public void tearDown() { - jsc.stop(); - jsc = null; + @After + public void tearDown() { + jsc.stop(); + jsc = null; } @Test - public void RegexTokenizer() { + public void regexTokenizer() { RegexTokenizer myRegExTokenizer = new RegexTokenizer() .setInputCol("rawText") - .setOutputCol("tokens") - .setPattern("\\s") - .setGaps(true) - .setMinTokenLength(0); - - List t = Arrays.asList( - "{\"rawText\": \"Test of tok.\", \"wantedTokens\": [\"Test\", \"of\", \"tok.\"]}", - "{\"rawText\": \"Te,st. punct\", \"wantedTokens\": [\"Te,st.\", \"\", \"punct\"]}"); + .setOutputCol("tokens") + .setPattern("\\s") + .setGaps(true) + .setMinTokenLength(3); - JavaRDD myRdd = jsc.parallelize(t); - DataFrame dataset = jsql.jsonRDD(myRdd); + JavaRDD rdd = jsc.parallelize(Lists.newArrayList( + new TextData("Test of tok.", new String[] {"Test", "tok."}), + new TextData("Te,st. punct", new String[] {"Te,st.", "punct"}) + )); + DataFrame dataset = jsql.createDataFrame(rdd, TextData.class); - Row[] pairs = myRegExTokenizer.transform(dataset) - .select("tokens","wantedTokens") + Row[] pairs = myRegExTokenizer.transform(dataset) + .select("tokens","wantedTokens") .collect(); - Assert.assertEquals(pairs[0].get(0), pairs[0].get(1)); - Assert.assertEquals(pairs[1].get(0), pairs[1].get(1)); + for (Row r: pairs) { + Assert.assertEquals(r.get(0), r.get(1)); + } } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala index ffd18de2f7d02..e8512b4451010 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala @@ -17,15 +17,21 @@ package org.apache.spark.ml.feature +import scala.beans.BeanInfo + import org.scalatest.FunSuite -import org.apache.spark.SparkException import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row, SQLContext} -case class TextData(rawText: String, wantedTokens: Seq[String]) +@BeanInfo +case class TextData(rawText: String, wantedTokens: Seq[String]) { + /** Constructor used in [[org.apache.spark.ml.feature.JavaTokenizerSuite]] */ + def this(rawText: String, wantedTokens: Array[String]) = this(rawText, wantedTokens.toSeq) +} -class TokenizerSuite extends FunSuite with MLlibTestSparkContext { +class RegexTokenizerSuite extends FunSuite with MLlibTestSparkContext { + import org.apache.spark.ml.feature.RegexTokenizerSuite._ @transient var sqlContext: SQLContext = _ @@ -35,66 +41,45 @@ class TokenizerSuite extends FunSuite with MLlibTestSparkContext { } test("RegexTokenizer") { - val myRegExTokenizer = new RegexTokenizer() + val tokenizer = new RegexTokenizer() .setInputCol("rawText") .setOutputCol("tokens") - var dataset = sqlContext.createDataFrame( - sc.parallelize(Seq( - TextData("Test for tokenization.", Seq("Test", "for", "tokenization", ".")), - TextData("Te,st. punct", Seq("Te", ",", "st", ".", "punct")) - ))) - testRegexTokenizer(myRegExTokenizer, dataset) + val dataset0 = sqlContext.createDataFrame(Seq( + TextData("Test for tokenization.", Seq("Test", "for", "tokenization", ".")), + TextData("Te,st. punct", Seq("Te", ",", "st", ".", "punct")) + )) + testRegexTokenizer(tokenizer, dataset0) - dataset = sqlContext.createDataFrame( - sc.parallelize(Seq( - TextData("Test for tokenization.", Seq("Test", "for", "tokenization")), - TextData("Te,st. punct", Seq("punct")) - ))) - myRegExTokenizer.asInstanceOf[RegexTokenizer] - .setMinTokenLength(3) - testRegexTokenizer(myRegExTokenizer, dataset) + val dataset1 = sqlContext.createDataFrame(Seq( + TextData("Test for tokenization.", Seq("Test", "for", "tokenization")), + TextData("Te,st. punct", Seq("punct")) + )) - myRegExTokenizer.asInstanceOf[RegexTokenizer] + tokenizer.setMinTokenLength(3) + testRegexTokenizer(tokenizer, dataset1) + + tokenizer .setPattern("\\s") .setGaps(true) .setMinTokenLength(0) - dataset = sqlContext.createDataFrame( - sc.parallelize(Seq( - TextData("Test for tokenization.", Seq("Test", "for", "tokenization.")), - TextData("Te,st. punct", Seq("Te,st.", "", "punct")) - ))) - testRegexTokenizer(myRegExTokenizer, dataset) - } - - test("Tokenizer") { - val oldTokenizer = new Tokenizer() - .setInputCol("rawText") - .setOutputCol("tokens") - var dataset = sqlContext.createDataFrame( - sc.parallelize(Seq( - TextData("Test for tokenization.", Seq("test", "for", "tokenization.")), - TextData("Te,st. punct", Seq("te,st.", "", "punct")) - ))) - testTokenizer(oldTokenizer, dataset) + val dataset2 = sqlContext.createDataFrame(Seq( + TextData("Test for tokenization.", Seq("Test", "for", "tokenization.")), + TextData("Te,st. punct", Seq("Te,st.", "", "punct")) + )) + testRegexTokenizer(tokenizer, dataset2) } +} - def testTokenizer(t: Tokenizer, dataset: DataFrame): Unit = { - t.transform(dataset) - .select("tokens", "wantedTokens") - .collect().foreach { - case Row(tokens: Seq[Any], wantedTokens: Seq[Any]) => - assert(tokens === wantedTokens) - } - } +object RegexTokenizerSuite extends FunSuite { def testRegexTokenizer(t: RegexTokenizer, dataset: DataFrame): Unit = { t.transform(dataset) .select("tokens", "wantedTokens") - .collect().foreach { - case Row(tokens: Seq[Any], wantedTokens: Seq[Any]) => + .collect() + .foreach { + case Row(tokens, wantedTokens) => assert(tokens === wantedTokens) - } + } } - } From 9651aec14b2e9d5c1539650af600a4a3f1383338 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 24 Mar 2015 10:26:34 -0700 Subject: [PATCH 14/15] update test --- .../org/apache/spark/ml/feature/Tokenizer.scala | 4 ++-- .../spark/ml/feature/JavaTokenizerSuite.java | 8 ++++---- .../apache/spark/ml/feature/TokenizerSuite.scala | 14 +++++++------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index 16c1439e7554a..68401e36950bd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -68,8 +68,8 @@ class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenize * param sets regex as splitting on gaps (true) or matching tokens (false) * @group param */ - val gaps: BooleanParam = new BooleanParam(this, "gaps", - "Set regex to match gaps or tokens", Some(false)) + val gaps: BooleanParam = new BooleanParam( + this, "gaps", "Set regex to match gaps or tokens", Some(false)) /** @group setParam */ def setGaps(value: Boolean): this.type = set(gaps, value) diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java index e877058564917..9cad4f8826a6f 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java @@ -54,11 +54,11 @@ public void regexTokenizer() { .setGaps(true) .setMinTokenLength(3); - JavaRDD rdd = jsc.parallelize(Lists.newArrayList( - new TextData("Test of tok.", new String[] {"Test", "tok."}), - new TextData("Te,st. punct", new String[] {"Te,st.", "punct"}) + JavaRDD rdd = jsc.parallelize(Lists.newArrayList( + new TokenizerTestData("Test of tok.", new String[] {"Test", "tok."}), + new TokenizerTestData("Te,st. punct", new String[] {"Te,st.", "punct"}) )); - DataFrame dataset = jsql.createDataFrame(rdd, TextData.class); + DataFrame dataset = jsql.createDataFrame(rdd, TokenizerTestData.class); Row[] pairs = myRegExTokenizer.transform(dataset) .select("tokens","wantedTokens") diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala index e8512b4451010..bf862b912d326 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala @@ -25,7 +25,7 @@ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row, SQLContext} @BeanInfo -case class TextData(rawText: String, wantedTokens: Seq[String]) { +case class TokenizerTestData(rawText: String, wantedTokens: Seq[String]) { /** Constructor used in [[org.apache.spark.ml.feature.JavaTokenizerSuite]] */ def this(rawText: String, wantedTokens: Array[String]) = this(rawText, wantedTokens.toSeq) } @@ -46,14 +46,14 @@ class RegexTokenizerSuite extends FunSuite with MLlibTestSparkContext { .setOutputCol("tokens") val dataset0 = sqlContext.createDataFrame(Seq( - TextData("Test for tokenization.", Seq("Test", "for", "tokenization", ".")), - TextData("Te,st. punct", Seq("Te", ",", "st", ".", "punct")) + TokenizerTestData("Test for tokenization.", Seq("Test", "for", "tokenization", ".")), + TokenizerTestData("Te,st. punct", Seq("Te", ",", "st", ".", "punct")) )) testRegexTokenizer(tokenizer, dataset0) val dataset1 = sqlContext.createDataFrame(Seq( - TextData("Test for tokenization.", Seq("Test", "for", "tokenization")), - TextData("Te,st. punct", Seq("punct")) + TokenizerTestData("Test for tokenization.", Seq("Test", "for", "tokenization")), + TokenizerTestData("Te,st. punct", Seq("punct")) )) tokenizer.setMinTokenLength(3) @@ -64,8 +64,8 @@ class RegexTokenizerSuite extends FunSuite with MLlibTestSparkContext { .setGaps(true) .setMinTokenLength(0) val dataset2 = sqlContext.createDataFrame(Seq( - TextData("Test for tokenization.", Seq("Test", "for", "tokenization.")), - TextData("Te,st. punct", Seq("Te,st.", "", "punct")) + TokenizerTestData("Test for tokenization.", Seq("Test", "for", "tokenization.")), + TokenizerTestData("Te,st. punct", Seq("Te,st.", "", "punct")) )) testRegexTokenizer(tokenizer, dataset2) } From a164800ef2c29f76752da8f217c1307d8d04f49a Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 24 Mar 2015 10:31:11 -0700 Subject: [PATCH 15/15] remove tabs --- .../spark/ml/feature/JavaTokenizerSuite.java | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java index 9cad4f8826a6f..3806f650025b2 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java @@ -34,38 +34,38 @@ public class JavaTokenizerSuite { private transient SQLContext jsql; @Before - public void setUp() { + public void setUp() { jsc = new JavaSparkContext("local", "JavaTokenizerSuite"); - jsql = new SQLContext(jsc); - } + jsql = new SQLContext(jsc); + } - @After - public void tearDown() { - jsc.stop(); - jsc = null; + @After + public void tearDown() { + jsc.stop(); + jsc = null; } @Test - public void regexTokenizer() { + public void regexTokenizer() { RegexTokenizer myRegExTokenizer = new RegexTokenizer() .setInputCol("rawText") - .setOutputCol("tokens") - .setPattern("\\s") - .setGaps(true) + .setOutputCol("tokens") + .setPattern("\\s") + .setGaps(true) .setMinTokenLength(3); - JavaRDD rdd = jsc.parallelize(Lists.newArrayList( - new TokenizerTestData("Test of tok.", new String[] {"Test", "tok."}), - new TokenizerTestData("Te,st. punct", new String[] {"Te,st.", "punct"}) - )); + JavaRDD rdd = jsc.parallelize(Lists.newArrayList( + new TokenizerTestData("Test of tok.", new String[] {"Test", "tok."}), + new TokenizerTestData("Te,st. punct", new String[] {"Te,st.", "punct"}) + )); DataFrame dataset = jsql.createDataFrame(rdd, TokenizerTestData.class); - Row[] pairs = myRegExTokenizer.transform(dataset) - .select("tokens","wantedTokens") + Row[] pairs = myRegExTokenizer.transform(dataset) + .select("tokens", "wantedTokens") .collect(); - for (Row r: pairs) { - Assert.assertEquals(r.get(0), r.get(1)); - } + for (Row r : pairs) { + Assert.assertEquals(r.get(0), r.get(1)); + } } }