Skip to content

Commit

Permalink
Changed regex to pattern
Browse files Browse the repository at this point in the history
Since matching was changed to use `gaps` for consistency with NLTK's RegexpTokenizer,
it is also more consistent to name the parameter `pattern` instead of `regex`.
  • Loading branch information
Augustin Borsu committed Mar 17, 2015
1 parent 132b00b commit cd6642e
Showing 1 changed file with 5 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -81,17 +81,18 @@ class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenize
* Param for the regex pattern used by the tokenizer.
* @group param
*/
val regex = new Param(this, "regex", "regex used for tokenizing", Some("\\p{L}+|[^\\p{L}\\s]+"))
val pattern = new Param(this, "pattern",
"regex pattern used for tokenizing", Some("\\p{L}+|[^\\p{L}\\s]+"))

/** @group setParam */
def setRegex(value: String) = set(regex, value)
def setPattern(value: String) = set(pattern, value)

/** @group getParam */
def getRegex: String = get(regex)
def getPattern: String = get(pattern)

override protected def createTransformFunc(paramMap: ParamMap): String => Seq[String] = { str =>

val re = paramMap(regex)
val re = paramMap(pattern)
val tokens = if(paramMap(gaps)) str.split(re).toList else (re.r.findAllIn(str)).toList

tokens.filter(_.length >= paramMap(minTokenLength))
Expand Down

0 comments on commit cd6642e

Please sign in to comment.