Commit
Merge pull request #165 from go-ego/en-pr
Add: add load hmm model option and update godoc
vcaesar authored Jan 14, 2023
2 parents bfd624e + c85b6f0 commit f20a3db
Showing 5 changed files with 49 additions and 45 deletions.
10 changes: 7 additions & 3 deletions dict_util.go
@@ -38,7 +38,7 @@ const (
zhT1 = "dict/zh/t_1.txt"
)

// Init init seg config
// Init initializes the segmenter config
func (seg *Segmenter) Init() {
if seg.MinTokenFreq == 0 {
seg.MinTokenFreq = 2.0
@@ -47,6 +47,10 @@ func (seg *Segmenter) Init() {
if seg.TextFreq == "" {
seg.TextFreq = "2.0"
}

if !seg.NotLoadHMM {
seg.LoadModel()
}
}
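The NotLoadHMM option is the headline change here: Init now loads the hmm model by default, and callers can opt out. A minimal sketch of opting out, assuming LoadDict triggers Init as the updated TestLoadDictSep below relies on (the dictionary path is hypothetical):

package main

import (
	"log"

	"github.com/go-ego/gse"
)

func main() {
	var seg gse.Segmenter
	// Skip loading the default hmm model (Chinese chars);
	// useful when only dictionary segmentation is needed.
	seg.NotLoadHMM = true

	// Hypothetical dictionary path, for illustration only.
	if err := seg.LoadDict("./testdata/test_en.txt"); err != nil {
		log.Fatal(err)
	}
}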

// Dictionary returns the dictionary used by the tokenizer
@@ -66,7 +70,7 @@ func (seg *Segmenter) ToToken(text string, freq float64, pos ...string) Token {
return token
}

// AddToken add new text to token
// AddToken adds a new text as a token to the dictionary
func (seg *Segmenter) AddToken(text string, freq float64, pos ...string) error {
token := seg.ToToken(text, freq, pos...)
return seg.Dict.AddToken(token)
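As a quick sketch of the API above, continuing with a loaded Segmenter seg (the word, frequency, and pos tag are made-up values):

// Register a custom word with its frequency and an optional pos tag.
if err := seg.AddToken("城市地标", 100, "j"); err != nil {
	log.Println("AddToken:", err)
}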
@@ -364,7 +368,7 @@ func (seg *Segmenter) Reader(reader *bufio.Reader, files ...string) error {
pos = ""
}

// Add participle tokens to dictionary
// Add the segmented tokens to the dictionary
words := seg.SplitTextToWords([]byte(text))
token := Token{text: words, freq: freq, pos: pos}
seg.Dict.AddToken(token)
30 changes: 15 additions & 15 deletions dictionary.go
@@ -17,37 +17,37 @@ import (
"github.com/vcaesar/cedar"
)

// Dictionary 结构体实现了一个字串双数组树,
// 一个分词可能出现在叶子节点也有可能出现在非叶节点
// Dictionary implements a string double-array trie;
// a token may appear in a leaf node or in a non-leaf node
type Dictionary struct {
trie *cedar.Cedar // Cedar 双数组树
trie *cedar.Cedar // Cedar double array trie

maxTokenLen int // 词典中最长的分词
Tokens []Token // 词典中所有的分词,方便遍历
totalFreq float64 // 词典中所有分词的频率之和
maxTokenLen int // the longest token length in the dictionary
Tokens []Token // all tokens in the dictionary, kept for traversal
totalFreq float64 // the total frequency of all tokens in the dictionary
}

// NewDict new dictionary
// NewDict creates a new dictionary trie
func NewDict() *Dictionary {
return &Dictionary{trie: cedar.New()}
}
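A small sketch of driving the trie directly; since Token's fields are unexported, the token is built with Segmenter.ToToken (the word, frequency, and pos are made up):

var seg gse.Segmenter
dict := gse.NewDict()
// Build a token and insert it into the double-array trie.
tok := seg.ToToken("金门大桥", 38, "nz")
if err := dict.AddToken(tok); err != nil {
	log.Println("AddToken:", err)
}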

// MaxTokenLen 词典中最长的分词
// MaxTokenLen returns the longest token length in the dictionary
func (dict *Dictionary) MaxTokenLen() int {
return dict.maxTokenLen
}

// NumTokens 词典中分词数目
// NumTokens returns the number of tokens in the dictionary
func (dict *Dictionary) NumTokens() int {
return len(dict.Tokens)
}

// TotalFreq 词典中所有分词的频率之和
// TotalFreq returns the total frequency of all tokens in the dictionary
func (dict *Dictionary) TotalFreq() float64 {
return dict.totalFreq
}
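Together with MaxTokenLen and NumTokens above, this makes the dictionary's aggregate state easy to inspect; a one-line sketch, assuming a Segmenter seg with a loaded dictionary:

fmt.Println(seg.Dict.MaxTokenLen(), seg.Dict.NumTokens(), seg.Dict.TotalFreq())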

// AddToken 向词典中加入一个分词
// AddToken adds a token to the dictionary
func (dict *Dictionary) AddToken(token Token) error {
bytes := textSliceToBytes(token.text)
val, err := dict.trie.Get(bytes)
@@ -77,8 +77,8 @@ func (dict *Dictionary) RemoveToken(token Token) error {
return dict.trie.Delete(bytes)
}

// LookupTokens 在词典中查找和字元组 words 可以前缀匹配的所有分词
// 返回值为找到的分词数
// LookupTokens finds all tokens in the dictionary that prefix-match
// the word slice words, and returns the number of tokens found
func (dict *Dictionary) LookupTokens(
words []Text, tokens []*Token) (numOfTokens int) {
var (
@@ -103,7 +103,7 @@ func (dict *Dictionary) LookupTokens(
}

// Find reports whether the word exists in the dictionary
// and the word's frequency, pos
// and returns the word's frequency and pos
func (dict *Dictionary) Find(word []byte) (float64, string, bool) {
var (
id, value int
@@ -131,7 +131,7 @@ func (dict *Dictionary) Find(word []byte) (float64, string, bool) {
}
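Given the signature above, a hedged usage sketch (the word is arbitrary; seg.Dict is the Segmenter's loaded dictionary):

freq, pos, ok := seg.Dict.Find([]byte("纽约"))
if ok {
	fmt.Println("freq:", freq, "pos:", pos)
}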

// Value finds the word in the dictionary
// retrun the word's value, id
// and returns the word's value and id
func (dict *Dictionary) Value(word []byte) (val, id int, err error) {
id, err = dict.trie.Jump(word, id)
if err != nil {
12 changes: 4 additions & 8 deletions gse.go
@@ -30,21 +30,17 @@ const (
// minTokenFrequency = 2 // only read tokens with frequency >= 2 from the dictionary
)

func init() {
hmm.LoadModel()
}

// GetVersion get the gse version
// GetVersion returns the version of gse
func GetVersion() string {
return Version
}

// Prob type hmm model struct
// Prob defines the hmm model struct
type Prob struct {
B, E, M, S map[rune]float64
}

// New return new gse segmenter
// New returns a new gse segmenter
func New(files ...string) (seg Segmenter, err error) {
if len(files) > 1 && files[1] == "alpha" {
seg.AlphaNum = true
@@ -124,7 +120,7 @@ func (seg *Segmenter) CutStr(str []string, separator ...string) (r string) {
return
}

// LoadModel load the hmm model
// LoadModel loads the hmm model (the default is the Chinese char model)
//
// Use the user's model:
//
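The custom-model example in this doc comment is elided by the diff view. For the default model, a minimal sketch based on the no-argument call Init now makes:

var seg gse.Segmenter
// Load the default hmm model (Chinese chars) explicitly,
// e.g. for a Segmenter built with NotLoadHMM set.
seg.LoadModel()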
11 changes: 6 additions & 5 deletions gse_test.go
@@ -39,15 +39,15 @@ func TestLoadDictMap(t *testing.T) {
}

func TestAnalyze(t *testing.T) {
txt := `城市地标建筑: 纽约帝国大厦, 旧金山湾金门大桥, Seattle Space Needle, Toronto CN Tower, 伦敦大笨钟`
txt := `城市地标建筑: 纽约帝国大厦, 旧金山湾金门大桥, Seattle Space Needle; Toronto CN Tower, 伦敦大笨钟`

s := prodSeg.Cut(txt, true)
tt.Equal(t, 23, len(s))
tt.Equal(t, "[城市地标 建筑 : 纽约 帝国大厦 , 旧金山湾 金门大桥 , seattle space needle , toronto cn tower , 伦敦 大笨钟]", s)
tt.Equal(t, "[城市地标 建筑 : 纽约 帝国大厦 , 旧金山湾 金门大桥 , seattle space needle ; toronto cn tower , 伦敦 大笨钟]", s)

a := prodSeg.Analyze(s, "", true)
tt.Equal(t, 23, len(a))
tt.Equal(t, "[{0 4 0 0 城市地标 3 j} {4 6 1 0 建筑 14397 n} {6 8 2 0 : 0 } {8 10 3 0 纽约 1758 ns} {10 14 4 0 帝国大厦 3 nr} {14 16 5 0 , 0 } {16 20 6 0 旧金山湾 3 ns} {20 24 7 0 金门大桥 38 nz} {24 26 8 0 , 0 } {26 33 9 0 seattle 0 } {33 34 10 0 0 } {34 39 11 0 space 0 } {39 40 12 0 0 } {40 46 13 0 needle 0 } {46 48 14 0 , 0 } {48 55 15 0 toronto 0 } {55 56 16 0 0 } {56 58 17 0 cn 0 } {58 59 18 0 0 } {59 64 19 0 tower 0 } {64 66 20 0 , 0 } {66 68 21 0 伦敦 2255 ns} {68 71 22 0 大笨钟 0 }]", a)
tt.Equal(t, "[{0 4 0 0 城市地标 3 j} {4 6 1 0 建筑 14397 n} {6 8 2 0 : 0 } {8 10 3 0 纽约 1758 ns} {10 14 4 0 帝国大厦 3 nr} {14 16 5 0 , 0 } {16 20 6 0 旧金山湾 3 ns} {20 24 7 0 金门大桥 38 nz} {24 26 8 0 , 0 } {26 33 9 0 seattle 0 } {33 34 10 0 0 } {34 39 11 0 space 0 } {39 40 12 0 0 } {40 46 13 0 needle 0 } {46 48 14 0 ; 0 } {48 55 15 0 toronto 0 } {55 56 16 0 0 } {56 58 17 0 cn 0 } {58 59 18 0 0 } {59 64 19 0 tower 0 } {64 66 20 0 , 0 } {66 68 21 0 伦敦 2255 ns} {68 71 22 0 大笨钟 0 }]", a)

tt.Equal(t, 0, a[0].Start)
tt.Equal(t, 4, a[0].End)
@@ -59,11 +59,11 @@ func TestAnalyze(t *testing.T) {

s = prodSeg.CutSearch(txt, true)
tt.Equal(t, 34, len(s))
tt.Equal(t, "[城市 市地 地标 城市地标 建筑 : 纽约 帝国 国大 大厦 帝国大厦 , 金山 山湾 旧金山 旧金山湾 金门 大桥 金门大桥 , seattle space needle , toronto cn tower , 伦敦 大笨钟]", s)
tt.Equal(t, "[城市 市地 地标 城市地标 建筑 : 纽约 帝国 国大 大厦 帝国大厦 , 金山 山湾 旧金山 旧金山湾 金门 大桥 金门大桥 , seattle space needle ; toronto cn tower , 伦敦 大笨钟]", s)

a = prodSeg.Analyze(s, txt)
tt.Equal(t, 34, len(a))
tt.Equal(t, "[{0 6 0 0 城市 25084 ns} {3 9 1 0 市地 11 n} {6 12 2 0 地标 32 n} {0 12 3 0 城市地标 3 j} {12 18 4 0 建筑 14397 n} {18 20 5 0 : 0 } {20 26 6 0 纽约 1758 ns} {26 32 7 0 帝国 3655 n} {29 35 8 0 国大 114 j} {32 38 9 0 大厦 777 n} {26 38 10 0 帝国大厦 3 nr} {104 106 11 0 , 0 } {43 49 12 0 金山 291 nr} {46 52 13 0 山湾 7 ns} {40 49 14 0 旧金山 238 ns} {40 52 15 0 旧金山湾 3 ns} {52 58 16 0 金门 149 n} {58 64 17 0 大桥 3288 ns} {52 64 18 0 金门大桥 38 nz} {86 88 19 0 , 0 } {66 73 20 0 seattle 0 } {105 106 21 0 0 } {74 79 22 0 space 0 } {98 99 23 0 0 } {80 86 24 0 needle 0 } {64 66 25 0 , 0 } {88 95 26 0 toronto 0 } {95 96 27 0 0 } {96 98 28 0 cn 0 } {87 88 29 0 0 } {99 104 30 0 tower 0 } {38 40 31 0 , 0 } {106 112 32 0 伦敦 2255 ns} {112 121 33 0 大笨钟 0 }]", a)
tt.Equal(t, "[{0 6 0 0 城市 25084 ns} {3 9 1 0 市地 11 n} {6 12 2 0 地标 32 n} {0 12 3 0 城市地标 3 j} {12 18 4 0 建筑 14397 n} {18 20 5 0 : 0 } {20 26 6 0 纽约 1758 ns} {26 32 7 0 帝国 3655 n} {29 35 8 0 国大 114 j} {32 38 9 0 大厦 777 n} {26 38 10 0 帝国大厦 3 nr} {104 106 11 0 , 0 } {43 49 12 0 金山 291 nr} {46 52 13 0 山湾 7 ns} {40 49 14 0 旧金山 238 ns} {40 52 15 0 旧金山湾 3 ns} {52 58 16 0 金门 149 n} {58 64 17 0 大桥 3288 ns} {52 64 18 0 金门大桥 38 nz} {64 66 19 0 , 0 } {66 73 20 0 seattle 0 } {105 106 21 0 0 } {74 79 22 0 space 0 } {98 99 23 0 0 } {80 86 24 0 needle 0 } {86 88 25 0 ; 0 } {88 95 26 0 toronto 0 } {95 96 27 0 0 } {96 98 28 0 cn 0 } {87 88 29 0 0 } {99 104 30 0 tower 0 } {38 40 31 0 , 0 } {106 112 32 0 伦敦 2255 ns} {112 121 33 0 大笨钟 0 }]", a)
}

func TestHMM(t *testing.T) {
@@ -313,6 +313,7 @@ func TestUrl(t *testing.T) {
func TestLoadDictSep(t *testing.T) {
var seg1 Segmenter
seg1.DictSep = ","
seg1.NotLoadHMM = true
err := seg1.LoadDict("./testdata/test_en.txt")
tt.Nil(t, err)

31 changes: 17 additions & 14 deletions segmenter.go
@@ -20,12 +20,15 @@ import (
"unicode/utf8"
)

// Segmenter 分词器结构体
// Segmenter defines the segmenter structure
type Segmenter struct {
Dict *Dictionary
Load bool
DictSep string

// NotLoadHMM, if true, skips loading the default hmm model (Chinese chars) in Init
NotLoadHMM bool

// AlphaNum sets splitTextToWords to add a token
// when the words are alphanumeric,
// enabling alphanum dictionary word segmentation
@@ -60,15 +63,15 @@ type jumper struct {
token *Token
}

// Segment 对文本分词
// Segment uses the shortest path to segment the text
//
// 输入参数
// input parameter
//
// bytes UTF8 文本的字节数组
// bytes UTF8 text []byte
//
// 输出
// output
//
// []Segment 划分的分词
// []Segment the segmentation result
func (seg *Segmenter) Segment(bytes []byte) []Segment {
return seg.internalSegment(bytes, false)
}
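An end-to-end sketch of Segment; it assumes gse.New with no arguments loads the default dictionary, and Cut is the convenience wrapper used in this repository's tests:

package main

import (
	"fmt"

	"github.com/go-ego/gse"
)

func main() {
	seg, err := gse.New()
	if err != nil {
		panic(err)
	}
	// Segment returns []Segment; Cut returns the words as []string.
	fmt.Println(seg.Segment([]byte("纽约帝国大厦")))
	fmt.Println(seg.Cut("纽约帝国大厦", true)) // true enables hmm
}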
@@ -84,13 +87,13 @@ func (seg *Segmenter) ModeSegment(bytes []byte, searchMode ...bool) []Segment {
}

func (seg *Segmenter) internalSegment(bytes []byte, searchMode bool) []Segment {
// 处理特殊情况
// handle the special case
if len(bytes) == 0 {
// return []Segment{}
return nil
}

// 划分字元
// split the text into words
text := seg.SplitTextToWords(bytes)

return seg.segmentWords(text, searchMode)
@@ -182,13 +185,13 @@ func updateJumper(jumper *jumper, baseDistance float32, token *Token) {
}
}

// SplitWords 将文本划分成字元
// SplitWords splits a string into token words
func SplitWords(text Text) []Text {
var seg Segmenter
return seg.SplitTextToWords(text)
}
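A tiny sketch of the helper above; Text is gse's byte-slice text type, and the expected split (lowercased alphanumeric runs plus single CJK chars) is indicative only:

// Prints roughly: hello 世 界
for _, w := range gse.SplitWords(gse.Text("Hello世界")) {
	fmt.Printf("%s ", w)
}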

// SplitTextToWords 将文本划分成字元
// SplitTextToWords splits a string into token words
func (seg *Segmenter) SplitTextToWords(text Text) []Text {
output := make([]Text, 0, len(text)/3)
current, alphanumericStart := 0, 0
@@ -221,7 +224,7 @@ func (seg *Segmenter) SplitTextToWords(text Text) []Text {
current += size
}

// 处理最后一个字元是英文的情况
// process the case where the last segment is alphanumeric
if inAlphanumeric && !seg.AlphaNum {
if current != 0 {
output = append(output, toLow(text[alphanumericStart:current]))
@@ -239,7 +242,7 @@ func toLow(text []byte) []byte {
return text
}

// toLower 将英文词转化为小写
// toLower converts the text to lower case
func toLower(text []byte) []byte {
output := make([]byte, len(text))
for i, t := range text {
@@ -253,15 +256,15 @@ func toLower(text []byte) []byte {
return output
}

// minInt 取两整数较小值
// minInt returns the smaller of two ints
func minInt(a, b int) int {
if a > b {
return b
}
return a
}

// maxInt 取两整数较大值
// maxInt returns the larger of two ints
func maxInt(a, b int) int {
if a > b {
return a
