Commit
Merge pull request #165 from go-ego/en-pr
Add: add load hmm model option and update godoc
vcaesar authored Jan 14, 2023
2 parents bfd624e + c85b6f0 commit f20a3db
Showing 5 changed files with 49 additions and 45 deletions.
10 changes: 7 additions & 3 deletions dict_util.go
@@ -38,7 +38,7 @@ const (
zhT1 = "dict/zh/t_1.txt"
)

// Init init seg config
// Init initializes the segmenter config
func (seg *Segmenter) Init() {
if seg.MinTokenFreq == 0 {
seg.MinTokenFreq = 2.0
@@ -47,6 +47,10 @@ func (seg *Segmenter) Init() {
if seg.TextFreq == "" {
seg.TextFreq = "2.0"
}

if !seg.NotLoadHMM {
seg.LoadModel()
}
}
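The NotLoadHMM option is the headline change here: Init now loads the hmm model by default, and callers can opt out. A minimal sketch of opting out, assuming LoadDict triggers Init as the updated TestLoadDictSep below relies on (the dictionary path is hypothetical):

package main

import (
	"log"

	"github.com/go-ego/gse"
)

func main() {
	var seg gse.Segmenter
	// Skip loading the default hmm model (Chinese chars);
	// useful when only dictionary segmentation is needed.
	seg.NotLoadHMM = true

	// Hypothetical dictionary path, for illustration only.
	if err := seg.LoadDict("./testdata/test_en.txt"); err != nil {
		log.Fatal(err)
	}
}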

// Dictionary returns the dictionary used by the tokenizer
@@ -66,7 +70,7 @@ func (seg *Segmenter) ToToken(text string, freq float64, pos ...string) Token {
return token
}

// AddToken add new text to token
// AddToken adds a new text as a token to the dictionary
func (seg *Segmenter) AddToken(text string, freq float64, pos ...string) error {
token := seg.ToToken(text, freq, pos...)
return seg.Dict.AddToken(token)
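As a quick sketch of the API above, continuing with a loaded Segmenter seg (the word, frequency, and pos tag are made-up values):

// Register a custom word with its frequency and an optional pos tag.
if err := seg.AddToken("城市地标", 100, "j"); err != nil {
	log.Println("AddToken:", err)
}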
@@ -364,7 +368,7 @@ func (seg *Segmenter) Reader(reader *bufio.Reader, files ...string) error {
pos = ""
}

// Add participle tokens to dictionary
// Add the segmented tokens to the dictionary
words := seg.SplitTextToWords([]byte(text))
token := Token{text: words, freq: freq, pos: pos}
seg.Dict.AddToken(token)
30 changes: 15 additions & 15 deletions dictionary.go
@@ -17,37 +17,37 @@ import (
"github.com/vcaesar/cedar"
)

// Dictionary 结构体实现了一个字串双数组树,
// 一个分词可能出现在叶子节点也有可能出现在非叶节点
// Dictionary implements a string double-array trie;
// a token may appear in a leaf node or in a non-leaf node
type Dictionary struct {
trie *cedar.Cedar // Cedar 双数组树
trie *cedar.Cedar // Cedar double array trie

maxTokenLen int // 词典中最长的分词
Tokens []Token // 词典中所有的分词,方便遍历
totalFreq float64 // 词典中所有分词的频率之和
maxTokenLen int // the longest token length in the dictionary
Tokens []Token // all tokens in the dictionary, kept for traversal
totalFreq float64 // the total frequency of all tokens in the dictionary
}

// NewDict new dictionary
// NewDict creates a new dictionary trie
func NewDict() *Dictionary {
return &Dictionary{trie: cedar.New()}
}
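A small sketch of driving the trie directly; since Token's fields are unexported, the token is built with Segmenter.ToToken (the word, frequency, and pos are made up):

var seg gse.Segmenter
dict := gse.NewDict()
// Build a token and insert it into the double-array trie.
tok := seg.ToToken("金门大桥", 38, "nz")
if err := dict.AddToken(tok); err != nil {
	log.Println("AddToken:", err)
}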

// MaxTokenLen 词典中最长的分词
// MaxTokenLen returns the longest token length in the dictionary
func (dict *Dictionary) MaxTokenLen() int {
return dict.maxTokenLen
}

// NumTokens 词典中分词数目
// NumTokens returns the number of tokens in the dictionary
func (dict *Dictionary) NumTokens() int {
return len(dict.Tokens)
}

// TotalFreq 词典中所有分词的频率之和
// TotalFreq returns the total frequency of all tokens in the dictionary
func (dict *Dictionary) TotalFreq() float64 {
return dict.totalFreq
}
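Together with MaxTokenLen and NumTokens above, this makes the dictionary's aggregate state easy to inspect; a one-line sketch, assuming a Segmenter seg with a loaded dictionary:

fmt.Println(seg.Dict.MaxTokenLen(), seg.Dict.NumTokens(), seg.Dict.TotalFreq())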

// AddToken 向词典中加入一个分词
// AddToken adds a token to the dictionary
func (dict *Dictionary) AddToken(token Token) error {
bytes := textSliceToBytes(token.text)
val, err := dict.trie.Get(bytes)
@@ -77,8 +77,8 @@ func (dict *Dictionary) RemoveToken(token Token) error {
return dict.trie.Delete(bytes)
}

// LookupTokens 在词典中查找和字元组 words 可以前缀匹配的所有分词
// 返回值为找到的分词数
// LookupTokens finds all tokens in the dictionary that prefix-match
// the word slice words, and returns the number of tokens found
func (dict *Dictionary) LookupTokens(
words []Text, tokens []*Token) (numOfTokens int) {
var (
@@ -103,7 +103,7 @@ func (dict *Dictionary) LookupTokens(
}

// Find reports whether the word exists in the dictionary
// and the word's frequency, pos
// and returns the word's frequency and pos
func (dict *Dictionary) Find(word []byte) (float64, string, bool) {
var (
id, value int
@@ -131,7 +131,7 @@ func (dict *Dictionary) Find(word []byte) (float64, string, bool) {
}
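Given the signature above, a hedged usage sketch (the word is arbitrary; seg.Dict is the Segmenter's loaded dictionary):

freq, pos, ok := seg.Dict.Find([]byte("纽约"))
if ok {
	fmt.Println("freq:", freq, "pos:", pos)
}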

// Value finds the word in the dictionary
// retrun the word's value, id
// and returns the word's value and id
func (dict *Dictionary) Value(word []byte) (val, id int, err error) {
id, err = dict.trie.Jump(word, id)
if err != nil {
12 changes: 4 additions & 8 deletions gse.go
@@ -30,21 +30,17 @@ const (
// minTokenFrequency = 2 // only read tokens with frequency >= 2 from the dictionary
)

func init() {
hmm.LoadModel()
}

// GetVersion get the gse version
// GetVersion returns the version of gse
func GetVersion() string {
return Version
}

// Prob type hmm model struct
// Prob defines the hmm model struct
type Prob struct {
B, E, M, S map[rune]float64
}

// New return new gse segmenter
// New returns a new gse segmenter
func New(files ...string) (seg Segmenter, err error) {
if len(files) > 1 && files[1] == "alpha" {
seg.AlphaNum = true
@@ -124,7 +120,7 @@ func (seg *Segmenter) CutStr(str []string, separator ...string) (r string) {
return
}

// LoadModel load the hmm model
// LoadModel loads the hmm model (the default is the Chinese char model)
//
// Use the user's model:
//
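The custom-model example in this doc comment is elided by the diff view. For the default model, a minimal sketch based on the no-argument call Init now makes:

var seg gse.Segmenter
// Load the default hmm model (Chinese chars) explicitly,
// e.g. for a Segmenter built with NotLoadHMM set.
seg.LoadModel()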
11 changes: 6 additions & 5 deletions gse_test.go
@@ -39,15 +39,15 @@ func TestLoadDictMap(t *testing.T) {
}

func TestAnalyze(t *testing.T) {
txt := `城市地标建筑: 纽约帝国大厦, 旧金山湾金门大桥, Seattle Space Needle, Toronto CN Tower, 伦敦大笨钟`
txt := `城市地标建筑: 纽约帝国大厦, 旧金山湾金门大桥, Seattle Space Needle; Toronto CN Tower, 伦敦大笨钟`

s := prodSeg.Cut(txt, true)
tt.Equal(t, 23, len(s))
tt.Equal(t, "[城市地标 建筑 : 纽约 帝国大厦 , 旧金山湾 金门大桥 , seattle space needle , toronto cn tower , 伦敦 大笨钟]", s)
tt.Equal(t, "[城市地标 建筑 : 纽约 帝国大厦 , 旧金山湾 金门大桥 , seattle space needle ; toronto cn tower , 伦敦 大笨钟]", s)

a := prodSeg.Analyze(s, "", true)
tt.Equal(t, 23, len(a))
tt.Equal(t, "[{0 4 0 0 城市地标 3 j} {4 6 1 0 建筑 14397 n} {6 8 2 0 : 0 } {8 10 3 0 纽约 1758 ns} {10 14 4 0 帝国大厦 3 nr} {14 16 5 0 , 0 } {16 20 6 0 旧金山湾 3 ns} {20 24 7 0 金门大桥 38 nz} {24 26 8 0 , 0 } {26 33 9 0 seattle 0 } {33 34 10 0 0 } {34 39 11 0 space 0 } {39 40 12 0 0 } {40 46 13 0 needle 0 } {46 48 14 0 , 0 } {48 55 15 0 toronto 0 } {55 56 16 0 0 } {56 58 17 0 cn 0 } {58 59 18 0 0 } {59 64 19 0 tower 0 } {64 66 20 0 , 0 } {66 68 21 0 伦敦 2255 ns} {68 71 22 0 大笨钟 0 }]", a)
tt.Equal(t, "[{0 4 0 0 城市地标 3 j} {4 6 1 0 建筑 14397 n} {6 8 2 0 : 0 } {8 10 3 0 纽约 1758 ns} {10 14 4 0 帝国大厦 3 nr} {14 16 5 0 , 0 } {16 20 6 0 旧金山湾 3 ns} {20 24 7 0 金门大桥 38 nz} {24 26 8 0 , 0 } {26 33 9 0 seattle 0 } {33 34 10 0 0 } {34 39 11 0 space 0 } {39 40 12 0 0 } {40 46 13 0 needle 0 } {46 48 14 0 ; 0 } {48 55 15 0 toronto 0 } {55 56 16 0 0 } {56 58 17 0 cn 0 } {58 59 18 0 0 } {59 64 19 0 tower 0 } {64 66 20 0 , 0 } {66 68 21 0 伦敦 2255 ns} {68 71 22 0 大笨钟 0 }]", a)

tt.Equal(t, 0, a[0].Start)
tt.Equal(t, 4, a[0].End)
@@ -59,11 +59,11 @@ func TestAnalyze(t *testing.T) {

s = prodSeg.CutSearch(txt, true)
tt.Equal(t, 34, len(s))
tt.Equal(t, "[城市 市地 地标 城市地标 建筑 : 纽约 帝国 国大 大厦 帝国大厦 , 金山 山湾 旧金山 旧金山湾 金门 大桥 金门大桥 , seattle space needle , toronto cn tower , 伦敦 大笨钟]", s)
tt.Equal(t, "[城市 市地 地标 城市地标 建筑 : 纽约 帝国 国大 大厦 帝国大厦 , 金山 山湾 旧金山 旧金山湾 金门 大桥 金门大桥 , seattle space needle ; toronto cn tower , 伦敦 大笨钟]", s)

a = prodSeg.Analyze(s, txt)
tt.Equal(t, 34, len(a))
tt.Equal(t, "[{0 6 0 0 城市 25084 ns} {3 9 1 0 市地 11 n} {6 12 2 0 地标 32 n} {0 12 3 0 城市地标 3 j} {12 18 4 0 建筑 14397 n} {18 20 5 0 : 0 } {20 26 6 0 纽约 1758 ns} {26 32 7 0 帝国 3655 n} {29 35 8 0 国大 114 j} {32 38 9 0 大厦 777 n} {26 38 10 0 帝国大厦 3 nr} {104 106 11 0 , 0 } {43 49 12 0 金山 291 nr} {46 52 13 0 山湾 7 ns} {40 49 14 0 旧金山 238 ns} {40 52 15 0 旧金山湾 3 ns} {52 58 16 0 金门 149 n} {58 64 17 0 大桥 3288 ns} {52 64 18 0 金门大桥 38 nz} {86 88 19 0 , 0 } {66 73 20 0 seattle 0 } {105 106 21 0 0 } {74 79 22 0 space 0 } {98 99 23 0 0 } {80 86 24 0 needle 0 } {64 66 25 0 , 0 } {88 95 26 0 toronto 0 } {95 96 27 0 0 } {96 98 28 0 cn 0 } {87 88 29 0 0 } {99 104 30 0 tower 0 } {38 40 31 0 , 0 } {106 112 32 0 伦敦 2255 ns} {112 121 33 0 大笨钟 0 }]", a)
tt.Equal(t, "[{0 6 0 0 城市 25084 ns} {3 9 1 0 市地 11 n} {6 12 2 0 地标 32 n} {0 12 3 0 城市地标 3 j} {12 18 4 0 建筑 14397 n} {18 20 5 0 : 0 } {20 26 6 0 纽约 1758 ns} {26 32 7 0 帝国 3655 n} {29 35 8 0 国大 114 j} {32 38 9 0 大厦 777 n} {26 38 10 0 帝国大厦 3 nr} {104 106 11 0 , 0 } {43 49 12 0 金山 291 nr} {46 52 13 0 山湾 7 ns} {40 49 14 0 旧金山 238 ns} {40 52 15 0 旧金山湾 3 ns} {52 58 16 0 金门 149 n} {58 64 17 0 大桥 3288 ns} {52 64 18 0 金门大桥 38 nz} {64 66 19 0 , 0 } {66 73 20 0 seattle 0 } {105 106 21 0 0 } {74 79 22 0 space 0 } {98 99 23 0 0 } {80 86 24 0 needle 0 } {86 88 25 0 ; 0 } {88 95 26 0 toronto 0 } {95 96 27 0 0 } {96 98 28 0 cn 0 } {87 88 29 0 0 } {99 104 30 0 tower 0 } {38 40 31 0 , 0 } {106 112 32 0 伦敦 2255 ns} {112 121 33 0 大笨钟 0 }]", a)
}

func TestHMM(t *testing.T) {
@@ -313,6 +313,7 @@ func TestUrl(t *testing.T) {
func TestLoadDictSep(t *testing.T) {
var seg1 Segmenter
seg1.DictSep = ","
seg1.NotLoadHMM = true
err := seg1.LoadDict("./testdata/test_en.txt")
tt.Nil(t, err)

31 changes: 17 additions & 14 deletions segmenter.go
@@ -20,12 +20,15 @@ import (
"unicode/utf8"
)

// Segmenter 分词器结构体
// Segmenter defines the segmenter structure
type Segmenter struct {
Dict *Dictionary
Load bool
DictSep string

// NotLoadHMM, if true, skips loading the default hmm model (Chinese chars) in Init
NotLoadHMM bool

// AlphaNum sets splitTextToWords to add a token
// when the words are alphanumeric,
// enabling alphanum dictionary word segmentation
@@ -60,15 +63,15 @@ type jumper struct {
token *Token
}

// Segment 对文本分词
// Segment uses the shortest path to segment the text
//
// 输入参数
// input parameter
//
// bytes UTF8 文本的字节数组
// bytes UTF8 text []byte
//
// 输出
// output
//
// []Segment 划分的分词
// []Segment the segmentation result
func (seg *Segmenter) Segment(bytes []byte) []Segment {
return seg.internalSegment(bytes, false)
}
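An end-to-end sketch of Segment; it assumes gse.New with no arguments loads the default dictionary, and Cut is the convenience wrapper used in this repository's tests:

package main

import (
	"fmt"

	"github.com/go-ego/gse"
)

func main() {
	seg, err := gse.New()
	if err != nil {
		panic(err)
	}
	// Segment returns []Segment; Cut returns the words as []string.
	fmt.Println(seg.Segment([]byte("纽约帝国大厦")))
	fmt.Println(seg.Cut("纽约帝国大厦", true)) // true enables hmm
}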
@@ -84,13 +87,13 @@ func (seg *Segmenter) ModeSegment(bytes []byte, searchMode ...bool) []Segment {
}

func (seg *Segmenter) internalSegment(bytes []byte, searchMode bool) []Segment {
// 处理特殊情况
// handle the special case
if len(bytes) == 0 {
// return []Segment{}
return nil
}

// 划分字元
// split the text into words
text := seg.SplitTextToWords(bytes)

return seg.segmentWords(text, searchMode)
@@ -182,13 +185,13 @@ func updateJumper(jumper *jumper, baseDistance float32, token *Token) {
}
}

// SplitWords 将文本划分成字元
// SplitWords splits a string into token words
func SplitWords(text Text) []Text {
var seg Segmenter
return seg.SplitTextToWords(text)
}
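A tiny sketch of the helper above; Text is gse's byte-slice text type, and the expected split (lowercased alphanumeric runs plus single CJK chars) is indicative only:

// Prints roughly: hello 世 界
for _, w := range gse.SplitWords(gse.Text("Hello世界")) {
	fmt.Printf("%s ", w)
}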

// SplitTextToWords 将文本划分成字元
// SplitTextToWords splits a string into token words
func (seg *Segmenter) SplitTextToWords(text Text) []Text {
output := make([]Text, 0, len(text)/3)
current, alphanumericStart := 0, 0
@@ -221,7 +224,7 @@ func (seg *Segmenter) SplitTextToWords(text Text) []Text {
current += size
}

// 处理最后一个字元是英文的情况
// process the case where the last segment is alphanumeric
if inAlphanumeric && !seg.AlphaNum {
if current != 0 {
output = append(output, toLow(text[alphanumericStart:current]))
@@ -239,7 +242,7 @@ func toLow(text []byte) []byte {
return text
}

// toLower 将英文词转化为小写
// toLower converts the text to lower case
func toLower(text []byte) []byte {
output := make([]byte, len(text))
for i, t := range text {
@@ -253,15 +256,15 @@ func toLower(text []byte) []byte {
return output
}

// minInt 取两整数较小值
// minInt returns the smaller of two ints
func minInt(a, b int) int {
if a > b {
return b
}
return a
}

// maxInt 取两整数较大值
// maxInt returns the larger of two ints
func maxInt(a, b int) int {
if a > b {
return a
