Skip to content

Commit

Permalink
重构CustomDictionary,支持多实例 #1339
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Dec 5, 2019
1 parent 5cd35e4 commit ec9c509
Show file tree
Hide file tree
Showing 21 changed files with 210 additions and 104 deletions.
19 changes: 12 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -254,20 +254,25 @@ public class DemoCustomDictionary
{
public static void main(String[] args)
{
// 系统默认的词典
CustomDictionary dictionary = CustomDictionary.DEFAULT;
// 每个分词器都有一份词典,默认公用 CustomDictionary.DEFAULT,你可以为任何分词器指定一份不同的词典
CustomDictionary myDictionary = new CustomDictionary("data/dictionary/custom/CustomDictionary.txt", "data/dictionary/custom/机构名词典.txt");
StandardTokenizer.SEGMENT.enableCustomDictionary(myDictionary);
// 动态增加
CustomDictionary.add("攻城狮");
dictionary.add("攻城狮");
// 强行插入
CustomDictionary.insert("白富美", "nz 1024");
dictionary.insert("白富美", "nz 1024");
// 删除词语(注释掉试试)
// CustomDictionary.remove("攻城狮");
System.out.println(CustomDictionary.add("单身狗", "nz 1024 n 1"));
System.out.println(CustomDictionary.get("单身狗"));
// dictionary.remove("攻城狮");
System.out.println(dictionary.add("单身狗", "nz 1024 n 1"));
System.out.println(dictionary.get("单身狗"));

String text = "攻城狮逆袭单身狗,迎娶白富美,走上人生巅峰"; // 怎么可能噗哈哈!

// AhoCorasickDoubleArrayTrie自动机扫描文本中出现的自定义词语
final char[] charArray = text.toCharArray();
CustomDictionary.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>()
dictionary.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>()
{
@Override
public void hit(int begin, int end, CoreDictionary.Attribute value)
Expand All @@ -282,7 +287,7 @@ public class DemoCustomDictionary
}
```
- 说明
* `CustomDictionary`是一份全局的用户自定义词典,可以随时增删,影响全部分词器。另外可以在任何分词器中关闭它。通过代码动态增删不会保存到词典文件。
* `CustomDictionary.DEFAULT`是一份全局的用户自定义词典,可以随时增删,影响所有使用它的分词器(分词器默认共用这份词典,也可以为某个分词器单独指定另一份词典)。另外可以在任何分词器中关闭它。通过代码动态增删不会保存到词典文件。
* 中文分词≠词典,词典无法解决中文分词,`Segment`提供高低优先级应对不同场景,请参考[FAQ](https://github.com/hankcs/HanLP/wiki/FAQ#%E4%B8%BA%E4%BB%80%E4%B9%88%E4%BF%AE%E6%94%B9%E4%BA%86%E8%AF%8D%E5%85%B8%E8%BF%98%E6%98%AF%E6%B2%A1%E6%9C%89%E6%95%88%E6%9E%9C)
- 追加词典
* `CustomDictionary`主词典文本路径是`data/dictionary/custom/CustomDictionary.txt`,用户可以在此增加自己的词语(不推荐);也可以单独新建一个文本文件,通过配置文件`CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; 我的词典.txt;`来追加词典(推荐)。
Expand Down
107 changes: 80 additions & 27 deletions src/main/java/com/hankcs/hanlp/dictionary/CustomDictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,21 +40,70 @@ public class CustomDictionary
/**
* 用于储存用户动态插入词条的二分trie树
*/
public static BinTrie<CoreDictionary.Attribute> trie;
public static DoubleArrayTrie<CoreDictionary.Attribute> dat = new DoubleArrayTrie<CoreDictionary.Attribute>();
public BinTrie<CoreDictionary.Attribute> trie;
/**
* Double-array trie that stores the entries loaded from dictionary files.
*/
public DoubleArrayTrie<CoreDictionary.Attribute> dat;
/**
* The paths this dictionary was loaded from.
*/
public String path[];
/**
* The default instance, constructed from {@code HanLP.Config.CustomDictionaryPath}.
*/
public static CustomDictionary DEFAULT = new CustomDictionary(HanLP.Config.CustomDictionaryPath);

/**
* Creates a dictionary and loads the paths configured in
* {@code com.hankcs.hanlp.HanLP.Config#CustomDictionaryPath}.
*/
public CustomDictionary()
{
this(HanLP.Config.CustomDictionaryPath);
}

// 自动加载词典
static
/**
* Creates a dictionary backed by a newly created {@code DoubleArrayTrie} and {@code BinTrie},
* passing them together with the given paths to the three-argument constructor.
*
* @param path dictionary paths
*/
public CustomDictionary(String... path)
{
this(new DoubleArrayTrie<CoreDictionary.Attribute>(), new BinTrie<CoreDictionary.Attribute>(), path);
}

/**
 * Builds a dictionary on top of caller-supplied data structures and, when paths are given,
 * loads the dictionary files at those paths.
 *
 * @param dat  double-array trie assigned to the {@code dat} field
 * @param trie trie assigned to the {@code trie} field
 * @param path dictionary paths handed to {@code load}; {@code null} skips loading
 */
public CustomDictionary(DoubleArrayTrie<CoreDictionary.Attribute> dat, BinTrie<CoreDictionary.Attribute> trie, String[] path)
{
    this.trie = trie;
    this.dat = dat;
    // Nothing to load when the caller supplied no paths.
    if (path == null)
    {
        return;
    }
    load(path);
}

/**
 * Loads the dictionary files at the given paths into this instance.
 * <p>
 * The first path is the main dictionary and is handed to {@code loadMainDictionary(String)}.
 * The paths are recorded in the {@code path} field so that {@code reload()} can later locate
 * the main dictionary and its cache file.
 *
 * @param path dictionary paths; the first element is the main dictionary
 * @return {@code true} if the main dictionary was loaded; {@code false} if loading failed or
 *         no path was given
 */
public boolean load(String... path)
{
    // Remember where this instance came from; reload() reads this field.
    this.path = path;
    long start = System.currentTimeMillis();
    // A null or empty path list has no main dictionary, so report a failure
    // instead of throwing on path[0].
    if (path == null || path.length == 0 || !loadMainDictionary(path[0]))
    {
        logger.warning("自定义词典" + Arrays.toString(path) + "加载失败");
        return false;
    }
    logger.info("自定义词典加载成功:" + dat.size() + "个词条,耗时" + (System.currentTimeMillis() - start) + "ms");
    return true;
}

Expand Down Expand Up @@ -152,9 +201,14 @@ public static boolean loadMainDictionary(String mainPath, String path[], DoubleA
return true;
}

private static boolean loadMainDictionary(String mainPath)
/**
 * Loads the dictionary whose main file is {@code mainPath} into this instance's {@code dat},
 * using the dictionary path as the cache location.
 *
 * @param mainPath main dictionary path (appending .bin gives the cache path)
 * @return {@code true} if the dictionary was loaded, {@code false} otherwise
 */
public boolean loadMainDictionary(String mainPath)
{
    // Prefer the paths this instance was loaded from, so that a dictionary built from its own
    // paths does not pick up the globally configured ones. Fall back to the global
    // configuration when no paths have been recorded, which is what this method used before.
    String[] sources = (this.path != null) ? this.path : HanLP.Config.CustomDictionaryPath;
    return loadMainDictionary(mainPath, sources, this.dat, true);
}


Expand Down Expand Up @@ -227,7 +281,7 @@ public static boolean load(String path, Nature defaultNature, TreeMap<String, Co
* @param rewriteTable
* @return 是否更新了
*/
private static boolean updateAttributeIfExist(String key, CoreDictionary.Attribute attribute, TreeMap<String, CoreDictionary.Attribute> map, TreeMap<Integer, CoreDictionary.Attribute> rewriteTable)
private boolean updateAttributeIfExist(String key, CoreDictionary.Attribute attribute, TreeMap<String, CoreDictionary.Attribute> map, TreeMap<Integer, CoreDictionary.Attribute> rewriteTable)
{
int wordID = CoreDictionary.getWordID(key);
CoreDictionary.Attribute attributeExisted;
Expand Down Expand Up @@ -262,7 +316,7 @@ private static boolean updateAttributeIfExist(String key, CoreDictionary.Attribu
* @param natureWithFrequency 词性和其对应的频次,比如“nz 1 v 2”,null时表示“nz 1”
* @return 是否插入成功(失败的原因可能是不覆盖、natureWithFrequency有问题等,后者可以通过调试模式了解原因)
*/
public static boolean add(String word, String natureWithFrequency)
public boolean add(String word, String natureWithFrequency)
{
if (contains(word)) return false;
return insert(word, natureWithFrequency);
Expand All @@ -275,7 +329,7 @@ public static boolean add(String word, String natureWithFrequency)
* @param word 新词 如“裸婚”
* @return 是否插入成功(失败的原因可能是不覆盖等,可以通过调试模式了解原因)
*/
public static boolean add(String word)
public boolean add(String word)
{
if (HanLP.Config.Normalization) word = CharTable.convert(word);
if (contains(word)) return false;
Expand All @@ -290,7 +344,7 @@ public static boolean add(String word)
* @param natureWithFrequency 词性和其对应的频次,比如“nz 1 v 2”,null时表示“nz 1”。
* @return 是否插入成功(失败的原因可能是natureWithFrequency问题,可以通过调试模式了解原因)
*/
public static boolean insert(String word, String natureWithFrequency)
public boolean insert(String word, String natureWithFrequency)
{
if (word == null) return false;
if (HanLP.Config.Normalization) word = CharTable.convert(word);
Expand All @@ -309,7 +363,7 @@ public static boolean insert(String word, String natureWithFrequency)
* @param word
* @return
*/
public static boolean insert(String word)
public boolean insert(String word)
{
// Convenience overload: delegates to insert(word, natureWithFrequency) with a null
// attribute string, which that overload documents as meaning "nz 1".
return insert(word, null);
}
Expand Down Expand Up @@ -415,7 +469,7 @@ private static boolean isDicNeedUpdate(String mainPath, String path[])
* @param key
* @return
*/
public static CoreDictionary.Attribute get(String key)
public CoreDictionary.Attribute get(String key)
{
if (HanLP.Config.Normalization) key = CharTable.convert(key);
CoreDictionary.Attribute attribute = dat.get(key);
Expand All @@ -430,7 +484,7 @@ public static CoreDictionary.Attribute get(String key)
*
* @param key
*/
public static void remove(String key)
public void remove(String key)
{
if (HanLP.Config.Normalization) key = CharTable.convert(key);
if (trie == null) return;
Expand All @@ -443,7 +497,7 @@ public static void remove(String key)
* @param key
* @return
*/
public static LinkedList<Map.Entry<String, CoreDictionary.Attribute>> commonPrefixSearch(String key)
public LinkedList<Map.Entry<String, CoreDictionary.Attribute>> commonPrefixSearch(String key)
{
    // Prefix query over the dynamically inserted entries held in trie.
    // trie is a public field that sibling methods (contains, remove, parseText) treat as
    // nullable, so report "no matches" here instead of throwing a NullPointerException.
    if (trie == null)
    {
        return new LinkedList<Map.Entry<String, CoreDictionary.Attribute>>();
    }
    return trie.commonPrefixSearchWithValue(key);
}
Expand All @@ -455,12 +509,12 @@ public static LinkedList<Map.Entry<String, CoreDictionary.Attribute>> commonPref
* @param begin
* @return
*/
public static LinkedList<Map.Entry<String, CoreDictionary.Attribute>> commonPrefixSearch(char[] chars, int begin)
public LinkedList<Map.Entry<String, CoreDictionary.Attribute>> commonPrefixSearch(char[] chars, int begin)
{
    // Prefix query over the dynamically inserted entries held in trie, starting at index
    // begin of chars.
    // trie is a public field that sibling methods (contains, remove, parseText) treat as
    // nullable, so report "no matches" here instead of throwing a NullPointerException.
    if (trie == null)
    {
        return new LinkedList<Map.Entry<String, CoreDictionary.Attribute>>();
    }
    return trie.commonPrefixSearchWithValue(chars, begin);
}

public static BaseSearcher getSearcher(String text)
public BaseSearcher getSearcher(String text)
{
// Creates a new Searcher over the given text on every call.
// NOTE(review): Searcher appears to be a non-static inner class now, so the searcher
// would be bound to this dictionary instance — confirm against the class declaration.
return new Searcher(text);
}
Expand All @@ -478,7 +532,7 @@ public String toString()
* @param key 词语
* @return 是否包含
*/
public static boolean contains(String key)
public boolean contains(String key)
{
if (dat.exactMatchSearch(key) >= 0) return true;
return trie != null && trie.containsKey(key);
Expand All @@ -489,12 +543,12 @@ public static boolean contains(String key)
* @param charArray 文本
* @return 查询者
*/
public static BaseSearcher getSearcher(char[] charArray)
public BaseSearcher getSearcher(char[] charArray)
{
// Creates a new Searcher over the given character array on every call.
// NOTE(review): Searcher appears to be a non-static inner class now, so the searcher
// would be bound to this dictionary instance — confirm against the class declaration.
return new Searcher(charArray);
}

static class Searcher extends BaseSearcher<CoreDictionary.Attribute>
class Searcher extends BaseSearcher<CoreDictionary.Attribute>
{
/**
* 分词从何处开始,这是一个状态
Expand Down Expand Up @@ -547,7 +601,7 @@ public Map.Entry<String, CoreDictionary.Attribute> next()
* @return
* @deprecated 谨慎操作,有可能废弃此接口
*/
public static BinTrie<CoreDictionary.Attribute> getTrie()
public BinTrie<CoreDictionary.Attribute> getTrie()
{
// Returns the trie field itself rather than a copy, so changes made through the returned
// object are changes to this dictionary's dynamically inserted entries. May be null.
return trie;
}
Expand All @@ -557,7 +611,7 @@ public static BinTrie<CoreDictionary.Attribute> getTrie()
* @param text 文本
* @param processor 处理器
*/
public static void parseText(char[] text, AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute> processor)
public void parseText(char[] text, AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute> processor)
{
if (trie != null)
{
Expand All @@ -575,11 +629,11 @@ public static void parseText(char[] text, AhoCorasickDoubleArrayTrie.IHit<CoreDi
* @param text 文本
* @param processor 处理器
*/
public static void parseText(String text, AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute> processor)
public void parseText(String text, AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute> processor)
{
if (trie != null)
{
BaseSearcher searcher = CustomDictionary.getSearcher(text);
BaseSearcher searcher = this.getSearcher(text);
int offset;
Map.Entry<String, CoreDictionary.Attribute> entry;
while ((entry = searcher.next()) != null)
Expand All @@ -601,7 +655,7 @@ public static void parseText(String text, AhoCorasickDoubleArrayTrie.IHit<CoreDi
* @param text 文本
* @param processor 处理器
*/
public static void parseLongestText(String text, AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute> processor)
public void parseLongestText(String text, AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute> processor)
{
if (trie != null)
{
Expand Down Expand Up @@ -649,9 +703,8 @@ public void hit(int begin, int end, CoreDictionary.Attribute value)
* 集群环境(或其他IOAdapter)需要自行删除缓存文件(路径 = HanLP.Config.CustomDictionaryPath[0] + Predefine.BIN_EXT)
* @return 是否加载成功
*/
public static boolean reload()
public boolean reload()
{
String path[] = HanLP.Config.CustomDictionaryPath;
if (path == null || path.length == 0) return false;
IOUtil.deleteFile(path[0] + Predefine.BIN_EXT); // 删掉缓存
return loadMainDictionary(path[0]);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
*/
public abstract class CharacterBasedSegment extends Segment
{

/**
* 查询或猜测一个词语的属性,
* 先查词典,然后对字母、数字串的属性进行判断,最后猜测未登录词
Expand All @@ -40,7 +39,7 @@ public static CoreDictionary.Attribute guessAttribute(Term term)
CoreDictionary.Attribute attribute = CoreDictionary.get(term.word);
if (attribute == null)
{
attribute = CustomDictionary.get(term.word);
attribute = CustomDictionary.DEFAULT.get(term.word);
}
if (attribute == null)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ public class DoubleArrayTrieSegment extends DictionaryBasedSegment
*/
public DoubleArrayTrie<CoreDictionary.Attribute> trie;

public CustomDictionary customDictionary = CustomDictionary.DEFAULT;

/**
* 使用核心词库的trie树构造分词器
*/
Expand Down Expand Up @@ -78,10 +80,10 @@ protected List<Term> segSentence(char[] sentence)
matchLongest(sentence, wordNet, natureArray, trie);
if (config.useCustomDictionary)
{
matchLongest(sentence, wordNet, natureArray, CustomDictionary.dat);
if (CustomDictionary.trie != null)
matchLongest(sentence, wordNet, natureArray, customDictionary.dat);
if (customDictionary.trie != null)
{
CustomDictionary.trie.parseLongestText(charArray, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>()
customDictionary.trie.parseLongestText(charArray, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>()
{
@Override
public void hit(int begin, int end, CoreDictionary.Attribute value)
Expand Down
Loading

0 comments on commit ec9c509

Please sign in to comment.