Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add tfidf & bm25 in TagExtracter #1

Merged
merged 12 commits into from
Nov 16, 2023
28 changes: 28 additions & 0 deletions consts/dict_file.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// Package consts declares shared constants for dictionary loading
// and BM25 scoring.
package consts

const (
	// Dictionary file types accepted by the loaders.

	// LoadDictTypeIDF identifies an IDF dictionary file.
	LoadDictTypeIDF = iota + 1

	// LoadDictTypeTFIDF identifies a TFIDF dictionary file.
	LoadDictTypeTFIDF

	// LoadDictTypeBM25 identifies a BM25 dictionary file.
	LoadDictTypeBM25

	// LoadDictTypeWithPos identifies a dictionary file with positions.
	LoadDictTypeWithPos

	// LoadDictCorpus identifies a corpus file.
	LoadDictCorpus
)

const (
	// BM25DefaultK1 is the default k1 value for calculating bm25.
	BM25DefaultK1 = 1.25

	// BM25DefaultB is the default B value for calculating bm25.
	// (The comment previously mislabeled this constant as BM25DefaultK1.)
	BM25DefaultB = 0.75
)
5 changes: 5 additions & 0 deletions dag.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ func (seg *Segmenter) Find(str string) (float64, string, bool) {
return seg.Dict.Find([]byte(str))
}

// FindTFIDF looks a word up in the dictionary and returns its
// frequency, its inverse document frequency, and whether it exists.
func (seg *Segmenter) FindTFIDF(str string) (float64, float64, bool) {
	word := []byte(str)
	return seg.Dict.FindTFIDF(word)
}

// Value find word in dictionary return word's value
func (seg *Segmenter) Value(str string) (int, int, error) {
return seg.Dict.Value([]byte(str))
Expand Down
14 changes: 14 additions & 0 deletions data/dict/README.md
Original file line number Diff line number Diff line change
@@ -1 +1,15 @@
Some dict/zh data is from [github.com/fxsjy/jieba](https://github.com/fxsjy/jieba)

update at 2023-11-16:

add two new dict files, which are from [github.com/GuocaiL/nlp_corpus](https://github.com/GuocaiL/nlp_corpus)

generated by `nlp_corpus/open_ner_data/boson/boson.txt`, `open_ner_data/people_daily/people_daily_ner.txt`, `open_ner_data/tianchi_yiyao/train.txt`,`open_ner_data/ResumeNER/dev.txt`

1. tf_idf.txt

The first column of this file is the term, the second column is the word frequency of the term, and the third column is the inverse document frequency of the term

2. tf_idf_origin.txt

the origin corpus text
107,536 changes: 107,536 additions & 0 deletions data/dict/zh/tf_idf.txt

Large diffs are not rendered by default.

33,450 changes: 33,450 additions & 0 deletions data/dict/zh/tf_idf_origin.txt

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions dict_1.16.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ package gse

import (
"strings"

"github.com/go-ego/gse/types"
)

// //go:embed data/dict/dictionary.txt
Expand Down Expand Up @@ -128,6 +130,43 @@ func (seg *Segmenter) LoadDictStr(dict string) error {
return nil
}

// LoadTFIDFDictStr loads a TFIDF dictionary from a dictionary string.
//
// Each line must carry three columns separated by seg.DictSep + " ":
// the term, its frequency, and its inverse document frequency.
// Malformed (too-short) lines are skipped instead of panicking.
//
// NOTE(review): the content is taken from dictFile.FilePath, which —
// despite its name — appears to hold the dictionary text itself here;
// confirm against callers.
func (seg *Segmenter) LoadTFIDFDictStr(dictFile *types.LoadDictFile) error {
	if seg.Dict == nil {
		seg.Dict = NewDict()
		seg.Init()
	}

	arr := strings.Split(dictFile.FilePath, "\n")
	for i := 0; i < len(arr); i++ {
		s1 := strings.Split(arr[i], seg.DictSep+" ")
		size := len(s1)
		// strings.Split never returns an empty slice, so the previous
		// size == 0 check was dead; a valid entry needs all three
		// columns, so guard the s1[1]/s1[2] accesses below instead of
		// indexing out of range on malformed lines.
		if size < 3 {
			continue
		}

		text := strings.TrimSpace(s1[0])
		// frequency
		freqText := strings.TrimSpace(s1[1])
		freq := seg.Size(size, text, freqText)
		if freq == 0.0 {
			continue
		}

		// inverse document frequency; TrimSpace also strips the
		// trailing "\n" (and "\r" on CRLF files) the old Trim removed.
		inverseFreqText := strings.TrimSpace(s1[2])
		inverseFreq := seg.Size(size, text, inverseFreqText)
		if inverseFreq == 0.0 {
			continue
		}

		// add the words to the token
		words := seg.SplitTextToWords([]byte(text))
		token := Token{text: words, freq: freq, inverseFreq: inverseFreq}
		seg.Dict.AddToken(token)
	}

	seg.CalcToken()
	return nil
}

// LoadStopEmbed load the stop dictionary from embed file
func (seg *Segmenter) LoadStopEmbed(dict ...string) (err error) {
if len(dict) > 0 {
Expand Down
12 changes: 12 additions & 0 deletions dict_1.16_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ import (
_ "embed"
"testing"

"github.com/go-ego/gse/consts"
"github.com/go-ego/gse/types"
"github.com/vcaesar/tt"
)

Expand Down Expand Up @@ -92,3 +94,13 @@ func TestDictSep(t *testing.T) {
tt.Equal(t, "x", pos)
tt.Equal(t, 10, f)
}

// TestLoadTFIDFDictStr loads the bundled TFIDF dictionary and fails
// the test if loading reports an error.
//
// NOTE(review): the absolute path below is machine-specific
// (a devcontainer path); a repo-relative path would make this test
// portable — confirm the intended fixture location.
func TestLoadTFIDFDictStr(t *testing.T) {
	var seg Segmenter
	files := []*types.LoadDictFile{
		{
			FilePath: "/workspaces/gse/data/dict/zh/tf_idf.txt",
			FileType: consts.LoadDictTypeTFIDF,
		},
	}

	// The returned error was previously discarded, so a broken
	// dictionary would have passed silently.
	if err := seg.LoadTFIDFDict(files); err != nil {
		t.Fatalf("LoadTFIDFDict error: %v", err)
	}
}
215 changes: 213 additions & 2 deletions dict_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ import (
"strconv"
"strings"
"unicode"
"unicode/utf8"

"github.com/go-ego/gse/types"
)

var (
Expand Down Expand Up @@ -216,6 +219,47 @@ func (seg *Segmenter) LoadDict(files ...string) error {
return nil
}

// LoadTFIDFDict loads the given TFIDF dictionary files, used for
// calculating tfidf and bm25 scores.
func (seg *Segmenter) LoadTFIDFDict(files []*types.LoadDictFile) error {
	// Lazily initialize the dictionary on first load.
	if !seg.Load {
		seg.Dict = NewDict()
		seg.Load = true
		seg.Init()
	}

	dictDir := path.Join(path.Dir(seg.GetCurrentFilePath()), "data")

	for _, file := range files {
		dictFiles := DictPaths(dictDir, file.FilePath)
		if !seg.SkipLog {
			log.Println("Dict files path: ", dictFiles)
		}

		if len(dictFiles) == 0 {
			// Warn but keep going, matching the other loaders.
			log.Println("Warning: dict files is nil.")
			// return errors.New("Dict files is nil.")
		}

		// Ranging over an empty slice is a no-op, so no extra
		// len(dictFiles) > 0 guard is needed.
		for _, dictFile := range dictFiles {
			if err := seg.ReadTFIDF(dictFile); err != nil {
				return err
			}
		}
	}

	seg.CalcToken()
	if !seg.SkipLog {
		log.Println("Gse dictionary loaded finished.")
	}

	return nil
}

// GetCurrentFilePath get the current file path
func (seg *Segmenter) GetCurrentFilePath() string {
if seg.DictPath != "" {
Expand All @@ -238,6 +282,75 @@ func (seg *Segmenter) GetIdfPath(files ...string) []string {
return files
}

// LoadCorpusAverLen computes the mean of the per-file average line
// lengths over the given corpus files (plus the default corpus path).
func (seg *Segmenter) LoadCorpusAverLen(files ...string) (corpusTotal float64, err error) {
	filePaths := seg.GetCorpusPath(files...)
	for _, filePath := range filePaths {
		averLen, readErr := seg.ReadCorpus(filePath)
		if readErr != nil {
			err = readErr
			log.Printf("Could not read corpus from file path: \"%s\", %v \n", filePath, err)
			return
		}

		corpusTotal += averLen
	}

	// filePaths is never empty: GetCorpusPath always appends the
	// default dictionary path.
	corpusTotal /= float64(len(filePaths))

	return
}

// GetCorpusPath returns the given corpus paths with the default
// corpus dictionary path appended.
func (seg *Segmenter) GetCorpusPath(files ...string) []string {
	dataDir := path.Join(path.Dir(seg.GetCurrentFilePath()), "data")
	defaultPath := path.Join(dataDir, "dict/zh/tf_idf_origin.txt")

	return append(files, defaultPath)
}

// ReadCorpus reads the corpus file line by line and returns the
// average line length measured in runes.
func (seg *Segmenter) ReadCorpus(file string) (corpusAverLen float64, err error) {
	if !seg.SkipLog {
		log.Printf("Load the gse dictionary: \"%s\" ", file)
	}

	dictFile, err := os.Open(file)
	if err != nil {
		log.Printf("Could not load dictionaries: \"%s\", %v \n", file, err)
		return
	}
	defer dictFile.Close()

	var corpusNumber, corpusLength float64
	// new the Scanner to read file content
	scanner := bufio.NewScanner(dictFile)
	// read file content by line
	for scanner.Scan() {
		corpusNumber++
		corpusLength += float64(utf8.RuneCountInString(scanner.Text()))
	}

	// A scan failure was previously ignored, silently producing an
	// average over a truncated file.
	if err = scanner.Err(); err != nil {
		return
	}

	// Guard against 0/0 = NaN on an empty corpus file.
	if corpusNumber == 0 {
		return 0, nil
	}

	corpusAverLen = corpusLength / corpusNumber

	return
}

// GetTfIdfPath returns the given tfidf dictionary paths with the
// default tfidf dictionary path appended.
func (seg *Segmenter) GetTfIdfPath(files ...string) []string {
	dataDir := path.Join(path.Dir(seg.GetCurrentFilePath()), "data")
	defaultPath := path.Join(dataDir, "dict/zh/tf_idf.txt")

	return append(files, defaultPath)
}

// Read read the dict file
func (seg *Segmenter) Read(file string) error {
if !seg.SkipLog {
Expand All @@ -255,6 +368,23 @@ func (seg *Segmenter) Read(file string) error {
return seg.Reader(reader, file)
}

// ReadTFIDF opens the given TFIDF dictionary file and feeds it to
// ReaderTFIDF.
func (seg *Segmenter) ReadTFIDF(file string) error {
	if !seg.SkipLog {
		log.Printf("Load the gse dictionary: \"%s\" ", file)
	}

	dictFile, openErr := os.Open(file)
	if openErr != nil {
		log.Printf("Could not load dictionaries: \"%s\", %v \n", file, openErr)
		return openErr
	}
	defer dictFile.Close()

	return seg.ReaderTFIDF(bufio.NewReader(dictFile), file)
}

// Size frequency is calculated based on the size of the text
func (seg *Segmenter) Size(size int, text, freqText string) (freq float64) {
if size == 0 {
Expand Down Expand Up @@ -294,8 +424,7 @@ func (seg *Segmenter) Size(size int, text, freqText string) (freq float64) {
}

// ReadN read the tokens by '\n'
func (seg *Segmenter) ReadN(reader *bufio.Reader) (size int,
text, freqText, pos string, fsErr error) {
func (seg *Segmenter) ReadN(reader *bufio.Reader) (size int, text, freqText, pos string, fsErr error) {
var txt string
txt, fsErr = reader.ReadString('\n')

Expand All @@ -313,6 +442,23 @@ func (seg *Segmenter) ReadN(reader *bufio.Reader) (size int,
return
}

// ReadNTFIDF reads one '\n'-terminated record and splits it into the
// term, its frequency text, and its idf text using seg.DictSep + " ".
func (seg *Segmenter) ReadNTFIDF(reader *bufio.Reader) (size int, text, freqText, idfText string, fsErr error) {
	var line string
	line, fsErr = reader.ReadString('\n')

	cols := strings.Split(line, seg.DictSep+" ")
	size = len(cols)
	text = cols[0]

	// Frequency and idf columns exist only on well-formed lines.
	if size > 2 {
		freqText = strings.TrimSpace(cols[1])
		// TrimSpace already strips a trailing '\n', so the extra
		// Trim(..., "\n") the original chained in was redundant.
		idfText = strings.TrimSpace(cols[2])
	}

	return
}

// Reader load dictionary from io.Reader
func (seg *Segmenter) Reader(reader *bufio.Reader, files ...string) error {
var (
Expand Down Expand Up @@ -382,6 +528,71 @@ func (seg *Segmenter) Reader(reader *bufio.Reader, files ...string) error {
return nil
}

// ReaderTFIDF load tfidf dictionary from io.Reader.
//
// Each record carries a term, its frequency, and its inverse document
// frequency. When seg.DictSep is empty the three whitespace-separated
// fields are scanned with fmt.Fscanln; otherwise ReadNTFIDF splits on
// seg.DictSep + " ".
func (seg *Segmenter) ReaderTFIDF(reader *bufio.Reader, files ...string) error {
	var (
		file string
		text, freqText, idfText string
		freq float64
		inverseFreq float64
	)

	// file is only used for log messages.
	if len(files) > 0 {
		file = files[0]
	}

	// Read the word segmentation line by line
	line := 0
	for {
		line++
		var (
			size int
			fsErr error
		)
		if seg.DictSep == "" {
			size, fsErr = fmt.Fscanln(reader, &text, &freqText, &idfText)
		} else {
			size, text, freqText, idfText, fsErr = seg.ReadNTFIDF(reader)
		}

		if fsErr != nil {
			if fsErr == io.EOF {
				// End of file
				if seg.DictSep == "" {
					break
				}

				// With a custom separator the final line (which has no
				// trailing '\n') may still carry data; only stop once
				// the returned term is empty.
				if seg.DictSep != "" && text == "" {
					break
				}
			}

			if size > 0 {
				if seg.MoreLog {
					log.Printf("File '%v' line \"%v\" read error: %v, skip",
						file, line, fsErr.Error())
				}
			} else {
				log.Printf("File '%v' line \"%v\" is empty, read error: %v, skip",
					file, line, fsErr.Error())
			}
			// NOTE(review): despite "skip" in the log text, execution
			// falls through and still tries to parse this line; the
			// freq/inverseFreq zero checks below are what actually
			// discard it. Confirm this mirrors Reader's intent.
		}

		freq = seg.Size(size, text, freqText)
		inverseFreq = seg.Size(size, text, idfText)
		// Unparsable or zero-valued entries contribute nothing.
		if freq == 0.0 || inverseFreq == 0.0 {
			continue
		}

		// Add participle tokens to the dictionary
		words := seg.SplitTextToWords([]byte(text))
		token := Token{text: words, freq: freq, inverseFreq: inverseFreq}
		seg.Dict.AddToken(token)
	}

	return nil
}

// DictPaths get the dict's paths
func DictPaths(dictDir, filePath string) (files []string) {
var dictPath string
Expand Down
Loading