Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add tfidf & bm25 in TagExtracter #1

Merged
merged 12 commits into from
Nov 16, 2023
28 changes: 28 additions & 0 deletions consts/dict_file.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// Package consts declares shared constants for dictionary loading
// and BM25 scoring.
package consts

const (
	// Dictionary file types accepted by the loaders.

	// LoadDictTypeIDF identifies an IDF dictionary file.
	LoadDictTypeIDF = iota + 1

	// LoadDictTypeTFIDF identifies a TFIDF dictionary file.
	LoadDictTypeTFIDF

	// LoadDictTypeBM25 identifies a BM25 dictionary file.
	LoadDictTypeBM25

	// LoadDictTypeWithPos identifies a dictionary file with positions.
	LoadDictTypeWithPos

	// LoadDictCorpus identifies a corpus file.
	LoadDictCorpus
)

const (
	// BM25DefaultK1 is the default k1 value for calculating bm25.
	BM25DefaultK1 = 1.25

	// BM25DefaultB is the default B value for calculating bm25.
	// (The comment previously mislabeled this constant as BM25DefaultK1.)
	BM25DefaultB = 0.75
)
5 changes: 5 additions & 0 deletions dag.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ func (seg *Segmenter) Find(str string) (float64, string, bool) {
return seg.Dict.Find([]byte(str))
}

// FindTFIDF looks a word up in the dictionary and returns its
// frequency, its inverse document frequency, and whether it exists.
func (seg *Segmenter) FindTFIDF(str string) (float64, float64, bool) {
	word := []byte(str)
	return seg.Dict.FindTFIDF(word)
}

// Value find word in dictionary return word's value
func (seg *Segmenter) Value(str string) (int, int, error) {
return seg.Dict.Value([]byte(str))
Expand Down
14 changes: 14 additions & 0 deletions data/dict/README.md
Original file line number Diff line number Diff line change
@@ -1 +1,15 @@
Some dict/zh data is from [github.com/fxsjy/jieba](https://github.com/fxsjy/jieba)

update at 2023-11-16:

add two new dict files, which are from [github.com/GuocaiL/nlp_corpus](https://github.com/GuocaiL/nlp_corpus)

generated by `nlp_corpus/open_ner_data/boson/boson.txt`, `open_ner_data/people_daily/people_daily_ner.txt`, `open_ner_data/tianchi_yiyao/train.txt`,`open_ner_data/ResumeNER/dev.txt`

1. tf_idf.txt

The first column of this file is the term, the second column is the word frequency of the term, and the third column is the inverse document frequency of the term

2. tf_idf_origin.txt

the origin corpus text
107,536 changes: 107,536 additions & 0 deletions data/dict/zh/tf_idf.txt

Large diffs are not rendered by default.

33,450 changes: 33,450 additions & 0 deletions data/dict/zh/tf_idf_origin.txt

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions dict_1.16.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ package gse

import (
"strings"

"github.com/go-ego/gse/types"
)

// //go:embed data/dict/dictionary.txt
Expand Down Expand Up @@ -128,6 +130,43 @@ func (seg *Segmenter) LoadDictStr(dict string) error {
return nil
}

// LoadTFIDFDictStr loads a TFIDF dictionary from a dictionary string.
//
// Each line must carry three columns separated by seg.DictSep + " ":
// the term, its frequency, and its inverse document frequency.
// Malformed (too-short) lines are skipped instead of panicking.
//
// NOTE(review): the content is taken from dictFile.FilePath, which —
// despite its name — appears to hold the dictionary text itself here;
// confirm against callers.
func (seg *Segmenter) LoadTFIDFDictStr(dictFile *types.LoadDictFile) error {
	if seg.Dict == nil {
		seg.Dict = NewDict()
		seg.Init()
	}

	arr := strings.Split(dictFile.FilePath, "\n")
	for i := 0; i < len(arr); i++ {
		s1 := strings.Split(arr[i], seg.DictSep+" ")
		size := len(s1)
		// strings.Split never returns an empty slice, so the previous
		// size == 0 check was dead; a valid entry needs all three
		// columns, so guard the s1[1]/s1[2] accesses below instead of
		// indexing out of range on malformed lines.
		if size < 3 {
			continue
		}

		text := strings.TrimSpace(s1[0])
		// frequency
		freqText := strings.TrimSpace(s1[1])
		freq := seg.Size(size, text, freqText)
		if freq == 0.0 {
			continue
		}

		// inverse document frequency; TrimSpace also strips the
		// trailing "\n" (and "\r" on CRLF files) the old Trim removed.
		inverseFreqText := strings.TrimSpace(s1[2])
		inverseFreq := seg.Size(size, text, inverseFreqText)
		if inverseFreq == 0.0 {
			continue
		}

		// add the words to the token
		words := seg.SplitTextToWords([]byte(text))
		token := Token{text: words, freq: freq, inverseFreq: inverseFreq}
		seg.Dict.AddToken(token)
	}

	seg.CalcToken()
	return nil
}

// LoadStopEmbed load the stop dictionary from embed file
func (seg *Segmenter) LoadStopEmbed(dict ...string) (err error) {
if len(dict) > 0 {
Expand Down
12 changes: 12 additions & 0 deletions dict_1.16_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ import (
_ "embed"
"testing"

"github.com/go-ego/gse/consts"
"github.com/go-ego/gse/types"
"github.com/vcaesar/tt"
)

Expand Down Expand Up @@ -92,3 +94,13 @@ func TestDictSep(t *testing.T) {
tt.Equal(t, "x", pos)
tt.Equal(t, 10, f)
}

// TestLoadTFIDFDictStr loads the bundled TFIDF dictionary and fails
// the test if loading reports an error.
//
// NOTE(review): the absolute path below is machine-specific
// (a devcontainer path); a repo-relative path would make this test
// portable — confirm the intended fixture location.
func TestLoadTFIDFDictStr(t *testing.T) {
	var seg Segmenter
	files := []*types.LoadDictFile{
		{
			FilePath: "/workspaces/gse/data/dict/zh/tf_idf.txt",
			FileType: consts.LoadDictTypeTFIDF,
		},
	}

	// The returned error was previously discarded, so a broken
	// dictionary would have passed silently.
	if err := seg.LoadTFIDFDict(files); err != nil {
		t.Fatalf("LoadTFIDFDict error: %v", err)
	}
}
215 changes: 213 additions & 2 deletions dict_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ import (
"strconv"
"strings"
"unicode"
"unicode/utf8"

"github.com/go-ego/gse/types"
)

var (
Expand Down Expand Up @@ -216,6 +219,47 @@ func (seg *Segmenter) LoadDict(files ...string) error {
return nil
}

// LoadTFIDFDict loads the given TFIDF dictionary files, used for
// calculating tfidf and bm25 scores.
func (seg *Segmenter) LoadTFIDFDict(files []*types.LoadDictFile) error {
	// Lazily initialize the dictionary on first load.
	if !seg.Load {
		seg.Dict = NewDict()
		seg.Load = true
		seg.Init()
	}

	dictDir := path.Join(path.Dir(seg.GetCurrentFilePath()), "data")

	for _, file := range files {
		dictFiles := DictPaths(dictDir, file.FilePath)
		if !seg.SkipLog {
			log.Println("Dict files path: ", dictFiles)
		}

		if len(dictFiles) == 0 {
			// Warn but keep going, matching the other loaders.
			log.Println("Warning: dict files is nil.")
			// return errors.New("Dict files is nil.")
		}

		// Ranging over an empty slice is a no-op, so no extra
		// len(dictFiles) > 0 guard is needed.
		for _, dictFile := range dictFiles {
			if err := seg.ReadTFIDF(dictFile); err != nil {
				return err
			}
		}
	}

	seg.CalcToken()
	if !seg.SkipLog {
		log.Println("Gse dictionary loaded finished.")
	}

	return nil
}

// GetCurrentFilePath get the current file path
func (seg *Segmenter) GetCurrentFilePath() string {
if seg.DictPath != "" {
Expand All @@ -238,6 +282,75 @@ func (seg *Segmenter) GetIdfPath(files ...string) []string {
return files
}

// LoadCorpusAverLen computes the mean of the per-file average line
// lengths over the given corpus files (plus the default corpus path).
func (seg *Segmenter) LoadCorpusAverLen(files ...string) (corpusTotal float64, err error) {
	filePaths := seg.GetCorpusPath(files...)
	for _, filePath := range filePaths {
		averLen, readErr := seg.ReadCorpus(filePath)
		if readErr != nil {
			err = readErr
			log.Printf("Could not read corpus from file path: \"%s\", %v \n", filePath, err)
			return
		}

		corpusTotal += averLen
	}

	// filePaths is never empty: GetCorpusPath always appends the
	// default dictionary path.
	corpusTotal /= float64(len(filePaths))

	return
}

// GetCorpusPath returns the given corpus paths with the default
// corpus dictionary path appended.
func (seg *Segmenter) GetCorpusPath(files ...string) []string {
	dataDir := path.Join(path.Dir(seg.GetCurrentFilePath()), "data")
	defaultPath := path.Join(dataDir, "dict/zh/tf_idf_origin.txt")

	return append(files, defaultPath)
}

// ReadCorpus reads the corpus file line by line and returns the
// average line length measured in runes.
func (seg *Segmenter) ReadCorpus(file string) (corpusAverLen float64, err error) {
	if !seg.SkipLog {
		log.Printf("Load the gse dictionary: \"%s\" ", file)
	}

	dictFile, err := os.Open(file)
	if err != nil {
		log.Printf("Could not load dictionaries: \"%s\", %v \n", file, err)
		return
	}
	defer dictFile.Close()

	var corpusNumber, corpusLength float64
	// new the Scanner to read file content
	scanner := bufio.NewScanner(dictFile)
	// read file content by line
	for scanner.Scan() {
		corpusNumber++
		corpusLength += float64(utf8.RuneCountInString(scanner.Text()))
	}

	// A scan failure was previously ignored, silently producing an
	// average over a truncated file.
	if err = scanner.Err(); err != nil {
		return
	}

	// Guard against 0/0 = NaN on an empty corpus file.
	if corpusNumber == 0 {
		return 0, nil
	}

	corpusAverLen = corpusLength / corpusNumber

	return
}

// GetTfIdfPath returns the given tfidf dictionary paths with the
// default tfidf dictionary path appended.
func (seg *Segmenter) GetTfIdfPath(files ...string) []string {
	dataDir := path.Join(path.Dir(seg.GetCurrentFilePath()), "data")
	defaultPath := path.Join(dataDir, "dict/zh/tf_idf.txt")

	return append(files, defaultPath)
}

// Read read the dict file
func (seg *Segmenter) Read(file string) error {
if !seg.SkipLog {
Expand All @@ -255,6 +368,23 @@ func (seg *Segmenter) Read(file string) error {
return seg.Reader(reader, file)
}

// ReadTFIDF opens the given TFIDF dictionary file and feeds it to
// ReaderTFIDF.
func (seg *Segmenter) ReadTFIDF(file string) error {
	if !seg.SkipLog {
		log.Printf("Load the gse dictionary: \"%s\" ", file)
	}

	dictFile, openErr := os.Open(file)
	if openErr != nil {
		log.Printf("Could not load dictionaries: \"%s\", %v \n", file, openErr)
		return openErr
	}
	defer dictFile.Close()

	return seg.ReaderTFIDF(bufio.NewReader(dictFile), file)
}

// Size frequency is calculated based on the size of the text
func (seg *Segmenter) Size(size int, text, freqText string) (freq float64) {
if size == 0 {
Expand Down Expand Up @@ -294,8 +424,7 @@ func (seg *Segmenter) Size(size int, text, freqText string) (freq float64) {
}

// ReadN read the tokens by '\n'
func (seg *Segmenter) ReadN(reader *bufio.Reader) (size int,
text, freqText, pos string, fsErr error) {
func (seg *Segmenter) ReadN(reader *bufio.Reader) (size int, text, freqText, pos string, fsErr error) {
var txt string
txt, fsErr = reader.ReadString('\n')

Expand All @@ -313,6 +442,23 @@ func (seg *Segmenter) ReadN(reader *bufio.Reader) (size int,
return
}

// ReadNTFIDF reads one '\n'-terminated record and splits it into the
// term, its frequency text, and its idf text using seg.DictSep + " ".
func (seg *Segmenter) ReadNTFIDF(reader *bufio.Reader) (size int, text, freqText, idfText string, fsErr error) {
	var line string
	line, fsErr = reader.ReadString('\n')

	cols := strings.Split(line, seg.DictSep+" ")
	size = len(cols)
	text = cols[0]

	// Frequency and idf columns exist only on well-formed lines.
	if size > 2 {
		freqText = strings.TrimSpace(cols[1])
		// TrimSpace already strips a trailing '\n', so the extra
		// Trim(..., "\n") the original chained in was redundant.
		idfText = strings.TrimSpace(cols[2])
	}

	return
}

// Reader load dictionary from io.Reader
func (seg *Segmenter) Reader(reader *bufio.Reader, files ...string) error {
var (
Expand Down Expand Up @@ -382,6 +528,71 @@ func (seg *Segmenter) Reader(reader *bufio.Reader, files ...string) error {
return nil
}

// ReaderTFIDF load tfidf dictionary from io.Reader.
//
// Each record carries a term, its frequency, and its inverse document
// frequency. When seg.DictSep is empty the three whitespace-separated
// fields are scanned with fmt.Fscanln; otherwise ReadNTFIDF splits on
// seg.DictSep + " ".
func (seg *Segmenter) ReaderTFIDF(reader *bufio.Reader, files ...string) error {
	var (
		file string
		text, freqText, idfText string
		freq float64
		inverseFreq float64
	)

	// file is only used for log messages.
	if len(files) > 0 {
		file = files[0]
	}

	// Read the word segmentation line by line
	line := 0
	for {
		line++
		var (
			size int
			fsErr error
		)
		if seg.DictSep == "" {
			size, fsErr = fmt.Fscanln(reader, &text, &freqText, &idfText)
		} else {
			size, text, freqText, idfText, fsErr = seg.ReadNTFIDF(reader)
		}

		if fsErr != nil {
			if fsErr == io.EOF {
				// End of file
				if seg.DictSep == "" {
					break
				}

				// With a custom separator the final line (which has no
				// trailing '\n') may still carry data; only stop once
				// the returned term is empty.
				if seg.DictSep != "" && text == "" {
					break
				}
			}

			if size > 0 {
				if seg.MoreLog {
					log.Printf("File '%v' line \"%v\" read error: %v, skip",
						file, line, fsErr.Error())
				}
			} else {
				log.Printf("File '%v' line \"%v\" is empty, read error: %v, skip",
					file, line, fsErr.Error())
			}
			// NOTE(review): despite "skip" in the log text, execution
			// falls through and still tries to parse this line; the
			// freq/inverseFreq zero checks below are what actually
			// discard it. Confirm this mirrors Reader's intent.
		}

		freq = seg.Size(size, text, freqText)
		inverseFreq = seg.Size(size, text, idfText)
		// Unparsable or zero-valued entries contribute nothing.
		if freq == 0.0 || inverseFreq == 0.0 {
			continue
		}

		// Add participle tokens to the dictionary
		words := seg.SplitTextToWords([]byte(text))
		token := Token{text: words, freq: freq, inverseFreq: inverseFreq}
		seg.Dict.AddToken(token)
	}

	return nil
}

// DictPaths get the dict's paths
func DictPaths(dictDir, filePath string) (files []string) {
var dictPath string
Expand Down
Loading