[ML][FEATURE] SPARK-5566: RegEx Tokenizer #4504
Tokenizer.scala

@@ -19,7 +19,7 @@ package org.apache.spark.ml.feature

import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.UnaryTransformer
-import org.apache.spark.ml.param.ParamMap
+import org.apache.spark.ml.param.{ParamMap, IntParam, BooleanParam, Param}
import org.apache.spark.sql.types.{DataType, StringType, ArrayType}
@@ -39,3 +39,67 @@ class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] {

  override protected def outputDataType: DataType = new ArrayType(StringType, false)
}
/**
 * :: AlphaComponent ::
 * A regex-based tokenizer that extracts tokens either by repeatedly matching the regex
 * (the default) or by using it to split the text (set gaps to true). An optional
 * minimum token length can be used to filter out short tokens.
 * It returns an array of strings that can be empty.
 * The default parameters are pattern = "\\p{L}+|[^\\p{L}\\s]+", gaps = false,
 * minTokenLength = 1.
 */
@AlphaComponent
class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenizer] {

  /**
   * param for minimum token length, default is one to avoid returning empty strings
   * @group param
   */
  val minTokenLength = new IntParam(this, "minLength", "minimum token length", Some(1))

  /** @group setParam */
  def setMinTokenLength(value: Int) = set(minTokenLength, value)

  /** @group getParam */
  def getMinTokenLength: Int = get(minTokenLength)

  /**
   * param that sets whether the regex splits on gaps (true) or matches tokens (false)
   * @group param
   */
  val gaps = new BooleanParam(this, "gaps", "Set regex to match gaps or tokens", Some(false))

  /** @group setParam */
  def setGaps(value: Boolean) = set(gaps, value)

  /** @group getParam */
  def getGaps: Boolean = get(gaps)

  /**
   * param for the regex pattern used by the tokenizer
   * @group param
   */
  val pattern = new Param(this, "pattern",
    "regex pattern used for tokenizing", Some("\\p{L}+|[^\\p{L}\\s]+".r))

  /** @group setParam */
  def setPattern(value: String) = set(pattern, value.r)

  /** @group getParam */
  def getPattern: String = get(pattern).toString

  override protected def createTransformFunc(paramMap: ParamMap): String => Seq[String] = { str =>
    val re = paramMap(pattern)
    val tokens = if (paramMap(gaps)) re.split(str).toList else re.findAllIn(str).toList
    tokens.filter(_.length >= paramMap(minTokenLength))
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType == StringType, s"Input type must be string type but got $inputType.")
  }

  override protected def outputDataType: DataType = new ArrayType(StringType, false)
}
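As a side note for readers of the diff, the two modes can be reproduced in a plain Scala REPL without Spark; the sample string below is made up for illustration:

    import scala.util.matching.Regex

    val re: Regex = "\\p{L}+|[^\\p{L}\\s]+".r  // the default pattern above

    // matching mode (gaps = false): repeatedly match the pattern against the text
    re.findAllIn("Te,st. punct").toList
    // -> List("Te", ",", "st", ".", "punct")

    // gap mode (gaps = true): the pattern acts as a separator instead
    "\\s".r.split("Te,st. punct").toList
    // -> List("Te,st.", "punct")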
JavaTokenizerSuite.java (new file)

@@ -0,0 +1,73 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.ml.feature;

import java.util.Arrays;
import java.util.List;

import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

public class JavaTokenizerSuite {
  private transient JavaSparkContext jsc;
  private transient SQLContext jsql;

  @Before
  public void setUp() {
    jsc = new JavaSparkContext("local", "JavaTokenizerSuite");
    jsql = new SQLContext(jsc);
  }

  @After
  public void tearDown() {
    jsc.stop();
    jsc = null;
  }

  @Test
  public void RegexTokenizer() {
    RegexTokenizer myRegExTokenizer = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
      .setPattern("\\s")
      .setGaps(true)
      .setMinTokenLength(0);

    List<String> t = Arrays.asList(
      "{\"rawText\": \"Test of tok.\", \"wantedTokens\": [\"Test\", \"of\", \"tok.\"]}",
      "{\"rawText\": \"Te,st.  punct\", \"wantedTokens\": [\"Te,st.\",\"\",\"punct\"]}");

Review comment: Unit tests should be minimal. Pulling in JSON RDD is not necessary. Please check how it is done in spark/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala (line 29 in 4a17eed) and spark/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java (line 74 in 4a17eed).

Reply: That was actually what I tried to do in the first place.

    JavaRDD<String> myRdd = jsc.parallelize(t);
    DataFrame dataset = jsql.jsonRDD(myRdd);

    Row[] pairs = myRegExTokenizer.transform(dataset)
      .select("tokens", "wantedTokens")
      .collect();

    Assert.assertEquals(pairs[0].get(0), pairs[0].get(1));
    Assert.assertEquals(pairs[1].get(0), pairs[1].get(1));
  }
}
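Per the review suggestion above, a bean-backed construction might look like the following sketch. The TokenizerTestData bean is hypothetical (not part of this diff), and the sketch assumes the Spark 1.3-era SQLContext.createDataFrame(JavaRDD, Class) bean API used by the cited JavaCrossValidatorExample:

    // Hypothetical test bean; the name and field types are illustrative only.
    public static class TokenizerTestData implements java.io.Serializable {
      private String rawText;
      private String[] wantedTokens;

      public TokenizerTestData(String rawText, String[] wantedTokens) {
        this.rawText = rawText;
        this.wantedTokens = wantedTokens;
      }

      public String getRawText() { return rawText; }
      public void setRawText(String rawText) { this.rawText = rawText; }
      public String[] getWantedTokens() { return wantedTokens; }
      public void setWantedTokens(String[] wantedTokens) { this.wantedTokens = wantedTokens; }
    }

    // Usage inside the test, replacing the JSON parsing (assumes bean schema
    // inference handles String[] fields; if not, another collection type may be needed):
    JavaRDD<TokenizerTestData> rdd = jsc.parallelize(Arrays.asList(
      new TokenizerTestData("Test of tok.", new String[] {"Test", "of", "tok."}),
      new TokenizerTestData("Te,st.  punct", new String[] {"Te,st.", "", "punct"})));
    DataFrame dataset = jsql.createDataFrame(rdd, TokenizerTestData.class);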
TokenizerSuite.scala (new file)

@@ -0,0 +1,103 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.ml.feature

import org.scalatest.FunSuite

import org.apache.spark.SparkException
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

case class TextData(rawText: String, wantedTokens: Seq[String])

Review comment: insert an empty line between class declarations.

class TokenizerSuite extends FunSuite with MLlibTestSparkContext {

  @transient var sqlContext: SQLContext = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    sqlContext = new SQLContext(sc)
  }

Review comment (on the sqlContext line): initialize

  test("RegexTokenizer"){
    val myRegExTokenizer = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")

    var dataset = sqlContext.createDataFrame(
      sc.parallelize(List(
        TextData("Test for tokenization.",List("Test","for","tokenization",".")),
        TextData("Te,st.  punct",List("Te",",","st",".","punct"))
      )))
    testRegexTokenizer(myRegExTokenizer,dataset)

Review comment: space after `,`.

    dataset = sqlContext.createDataFrame(
      sc.parallelize(List(
        TextData("Test for tokenization.",List("Test","for","tokenization")),
        TextData("Te,st.  punct",List("punct"))
      )))
    myRegExTokenizer.asInstanceOf[RegexTokenizer]
      .setMinTokenLength(3)
    testRegexTokenizer(myRegExTokenizer,dataset)

    myRegExTokenizer.asInstanceOf[RegexTokenizer]
      .setPattern("\\s")
      .setGaps(true)
      .setMinTokenLength(0)
    dataset = sqlContext.createDataFrame(
      sc.parallelize(List(
        TextData("Test for tokenization.",List("Test","for","tokenization.")),
        TextData("Te,st.  punct",List("Te,st.","","punct"))
      )))
    testRegexTokenizer(myRegExTokenizer,dataset)
  }
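  // Note (illustrative, not part of this diff): with gaps = true and minTokenLength = 0,
  // consecutive separators produce empty tokens, e.g. "\\s".r.split("a  b").toList is
  // List("a", "", "b"), which is why the expected tokens above include "".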
  test("Tokenizer") {
    val oldTokenizer = new Tokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
    var dataset = sqlContext.createDataFrame(
      sc.parallelize(List(
        TextData("Test for tokenization.",List("test","for","tokenization.")),
        TextData("Te,st.  punct",List("te,st.","","punct"))
      )))
    testTokenizer(oldTokenizer,dataset)
  }

  def testTokenizer(t: Tokenizer,dataset: DataFrame): Unit = {
    t.transform(dataset)
      .select("tokens","wantedTokens")
      .collect().foreach{
        case Row(tokens: Seq[Any], wantedTokens: Seq[Any]) =>
          assert(tokens === wantedTokens)
        case e =>
          throw new SparkException(s"Row $e should contain only tokens and wantedTokens columns")
      }
  }

Review comment: space before `{`.

Review comment (suggested simplification):

    .collect()
    .foreach { case Row(actual, expected) =>
      assert(actual === expected)
    }

  def testRegexTokenizer(t: RegexTokenizer,dataset: DataFrame): Unit = {
    t.transform(dataset)
      .select("tokens","wantedTokens")
      .collect().foreach{
        case Row(tokens: Seq[Any], wantedTokens: Seq[Any]) =>
          assert(tokens === wantedTokens)
        case e =>
          throw new SparkException(s"Row $e should contain only tokens and wantedTokens columns")
      }
  }
}
Review comment: Please append `: IntParam` to `minTokenLength`. See SPARK-6428. Please also update the other method declarations.
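A minimal sketch of what the requested annotations might look like, assuming set(...) returns this.type as elsewhere in spark.ml of this era, and with scala.util.matching.Regex imported for the pattern param:

    // Sketch only: explicit types appended per SPARK-6428.
    val minTokenLength: IntParam = new IntParam(this, "minLength", "minimum token length", Some(1))

    def setMinTokenLength(value: Int): this.type = set(minTokenLength, value)

    val gaps: BooleanParam = new BooleanParam(this, "gaps", "Set regex to match gaps or tokens", Some(false))

    def setGaps(value: Boolean): this.type = set(gaps, value)

    val pattern: Param[Regex] = new Param(this, "pattern",
      "regex pattern used for tokenizing", Some("\\p{L}+|[^\\p{L}\\s]+".r))

    def setPattern(value: String): this.type = set(pattern, value.r)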