diff --git a/README.md b/README.md index 9560a54fb..94794fd33 100644 --- a/README.md +++ b/README.md @@ -254,20 +254,25 @@ public class DemoCustomDictionary { public static void main(String[] args) { + // 系统默认的词典 + CustomDictionary dictionary = CustomDictionary.DEFAULT; + // 每个分词器都有一份词典,默认公用 CustomDictionary.DEFAULT,你可以为任何分词器指定一份不同的词典 + CustomDictionary myDictionary = new CustomDictionary("data/dictionary/custom/CustomDictionary.txt", "data/dictionary/custom/机构名词典.txt"); + StandardTokenizer.SEGMENT.enableCustomDictionary(myDictionary); // 动态增加 - CustomDictionary.add("攻城狮"); + dictionary.add("攻城狮"); // 强行插入 - CustomDictionary.insert("白富美", "nz 1024"); + dictionary.insert("白富美", "nz 1024"); // 删除词语(注释掉试试) -// CustomDictionary.remove("攻城狮"); - System.out.println(CustomDictionary.add("单身狗", "nz 1024 n 1")); - System.out.println(CustomDictionary.get("单身狗")); +// dictionary.remove("攻城狮"); + System.out.println(dictionary.add("单身狗", "nz 1024 n 1")); + System.out.println(dictionary.get("单身狗")); String text = "攻城狮逆袭单身狗,迎娶白富美,走上人生巅峰"; // 怎么可能噗哈哈! // AhoCorasickDoubleArrayTrie自动机扫描文本中出现的自定义词语 final char[] charArray = text.toCharArray(); - CustomDictionary.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHit() + dictionary.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHit() { @Override public void hit(int begin, int end, CoreDictionary.Attribute value) @@ -282,7 +287,7 @@ public class DemoCustomDictionary } ``` - 说明 - * `CustomDictionary`是一份全局的用户自定义词典,可以随时增删,影响全部分词器。另外可以在任何分词器中关闭它。通过代码动态增删不会保存到词典文件。 + * `CustomDictionary.DEFAULT`是一份全局的用户自定义词典,可以随时增删,影响全部分词器。另外可以在任何分词器中关闭它。通过代码动态增删不会保存到词典文件。 * 中文分词≠词典,词典无法解决中文分词,`Segment`提供高低优先级应对不同场景,请参考[FAQ](https://github.com/hankcs/HanLP/wiki/FAQ#%E4%B8%BA%E4%BB%80%E4%B9%88%E4%BF%AE%E6%94%B9%E4%BA%86%E8%AF%8D%E5%85%B8%E8%BF%98%E6%98%AF%E6%B2%A1%E6%9C%89%E6%95%88%E6%9E%9C)。 - 追加词典 * `CustomDictionary`主词典文本路径是`data/dictionary/custom/CustomDictionary.txt`,用户可以在此增加自己的词语(不推荐);也可以单独新建一个文本文件,通过配置文件`CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; 我的词典.txt;`来追加词典(推荐)。 diff --git a/src/main/java/com/hankcs/hanlp/dictionary/CustomDictionary.java b/src/main/java/com/hankcs/hanlp/dictionary/CustomDictionary.java index 2a32be950..c1eba4313 100644 --- a/src/main/java/com/hankcs/hanlp/dictionary/CustomDictionary.java +++ b/src/main/java/com/hankcs/hanlp/dictionary/CustomDictionary.java @@ -40,21 +40,70 @@ public class CustomDictionary /** * 用于储存用户动态插入词条的二分trie树 */ - public static BinTrie trie; - public static DoubleArrayTrie dat = new DoubleArrayTrie(); + public BinTrie trie; + /** + * 用于储存文件中的词条 + */ + public DoubleArrayTrie dat; + /** + * 本词典是从哪些路径加载得到的 + */ + public String path[]; + /** + * 默认实例 + */ + public static CustomDictionary DEFAULT = new CustomDictionary(HanLP.Config.CustomDictionaryPath); + + /** + * 构造一份词典对象,并加载{@code com.hankcs.hanlp.HanLP.Config#CustomDictionaryPath} + */ + public CustomDictionary() + { + this(HanLP.Config.CustomDictionaryPath); + } - // 自动加载词典 - static + /** + * 构造一份词典对象,并加载指定路径的词典 + * @param path 词典路径 + */ + public CustomDictionary(String... path) + { + this(new DoubleArrayTrie(), new BinTrie(), path); + } + + /** + * 使用高级数据结构构造词典对象,并加载指定路径的词典 + * @param dat 双数组trie树 + * @param trie trie树 + * @param path 词典路径 + */ + public CustomDictionary(DoubleArrayTrie dat, BinTrie trie, String[] path) + { + this.dat = dat; + this.trie = trie; + if (path != null) + { + load(path); + } + } + + /** + * 加载指定路径的词典 + * @param path 词典路径 + * @return 是否加载成功 + */ + public boolean load(String... path) { - String path[] = HanLP.Config.CustomDictionaryPath; long start = System.currentTimeMillis(); if (!loadMainDictionary(path[0])) { logger.warning("自定义词典" + Arrays.toString(path) + "加载失败"); + return false; } else { logger.info("自定义词典加载成功:" + dat.size() + "个词条,耗时" + (System.currentTimeMillis() - start) + "ms"); + return true; } } @@ -152,9 +201,14 @@ public static boolean loadMainDictionary(String mainPath, String path[], DoubleA return true; } - private static boolean loadMainDictionary(String mainPath) + /** + * 使用词典路径为缓存路径,加载指定词典 + * @param mainPath 词典路径(+.bin等于缓存路径) + * @return + */ + public boolean loadMainDictionary(String mainPath) { - return loadMainDictionary(mainPath, HanLP.Config.CustomDictionaryPath, CustomDictionary.dat, true); + return loadMainDictionary(mainPath, HanLP.Config.CustomDictionaryPath, this.dat, true); } @@ -227,7 +281,7 @@ public static boolean load(String path, Nature defaultNature, TreeMap map, TreeMap rewriteTable) + private boolean updateAttributeIfExist(String key, CoreDictionary.Attribute attribute, TreeMap map, TreeMap rewriteTable) { int wordID = CoreDictionary.getWordID(key); CoreDictionary.Attribute attributeExisted; @@ -262,7 +316,7 @@ private static boolean updateAttributeIfExist(String key, CoreDictionary.Attribu * @param natureWithFrequency 词性和其对应的频次,比如“nz 1 v 2”,null时表示“nz 1” * @return 是否插入成功(失败的原因可能是不覆盖、natureWithFrequency有问题等,后者可以通过调试模式了解原因) */ - public static boolean add(String word, String natureWithFrequency) + public boolean add(String word, String natureWithFrequency) { if (contains(word)) return false; return insert(word, natureWithFrequency); @@ -275,7 +329,7 @@ public static boolean add(String word, String natureWithFrequency) * @param word 新词 如“裸婚” * @return 是否插入成功(失败的原因可能是不覆盖等,可以通过调试模式了解原因) */ - public static boolean add(String word) + public boolean add(String word) { if (HanLP.Config.Normalization) word = CharTable.convert(word); if (contains(word)) return false; @@ -290,7 +344,7 @@ public static boolean add(String word) * @param natureWithFrequency 词性和其对应的频次,比如“nz 1 v 2”,null时表示“nz 1”。 * @return 是否插入成功(失败的原因可能是natureWithFrequency问题,可以通过调试模式了解原因) */ - public static boolean insert(String word, String natureWithFrequency) + public boolean insert(String word, String natureWithFrequency) { if (word == null) return false; if (HanLP.Config.Normalization) word = CharTable.convert(word); @@ -309,7 +363,7 @@ public static boolean insert(String word, String natureWithFrequency) * @param word * @return */ - public static boolean insert(String word) + public boolean insert(String word) { return insert(word, null); } @@ -415,7 +469,7 @@ private static boolean isDicNeedUpdate(String mainPath, String path[]) * @param key * @return */ - public static CoreDictionary.Attribute get(String key) + public CoreDictionary.Attribute get(String key) { if (HanLP.Config.Normalization) key = CharTable.convert(key); CoreDictionary.Attribute attribute = dat.get(key); @@ -430,7 +484,7 @@ public static CoreDictionary.Attribute get(String key) * * @param key */ - public static void remove(String key) + public void remove(String key) { if (HanLP.Config.Normalization) key = CharTable.convert(key); if (trie == null) return; @@ -443,7 +497,7 @@ public static void remove(String key) * @param key * @return */ - public static LinkedList> commonPrefixSearch(String key) + public LinkedList> commonPrefixSearch(String key) { return trie.commonPrefixSearchWithValue(key); } @@ -455,12 +509,12 @@ public static LinkedList> commonPref * @param begin * @return */ - public static LinkedList> commonPrefixSearch(char[] chars, int begin) + public LinkedList> commonPrefixSearch(char[] chars, int begin) { return trie.commonPrefixSearchWithValue(chars, begin); } - public static BaseSearcher getSearcher(String text) + public BaseSearcher getSearcher(String text) { return new Searcher(text); } @@ -478,7 +532,7 @@ public String toString() * @param key 词语 * @return 是否包含 */ - public static boolean contains(String key) + public boolean contains(String key) { if (dat.exactMatchSearch(key) >= 0) return true; return trie != null && trie.containsKey(key); @@ -489,12 +543,12 @@ public static boolean contains(String key) * @param charArray 文本 * @return 查询者 */ - public static BaseSearcher getSearcher(char[] charArray) + public BaseSearcher getSearcher(char[] charArray) { return new Searcher(charArray); } - static class Searcher extends BaseSearcher + class Searcher extends BaseSearcher { /** * 分词从何处开始,这是一个状态 @@ -547,7 +601,7 @@ public Map.Entry next() * @return * @deprecated 谨慎操作,有可能废弃此接口 */ - public static BinTrie getTrie() + public BinTrie getTrie() { return trie; } @@ -557,7 +611,7 @@ public static BinTrie getTrie() * @param text 文本 * @param processor 处理器 */ - public static void parseText(char[] text, AhoCorasickDoubleArrayTrie.IHit processor) + public void parseText(char[] text, AhoCorasickDoubleArrayTrie.IHit processor) { if (trie != null) { @@ -575,11 +629,11 @@ public static void parseText(char[] text, AhoCorasickDoubleArrayTrie.IHit processor) + public void parseText(String text, AhoCorasickDoubleArrayTrie.IHit processor) { if (trie != null) { - BaseSearcher searcher = CustomDictionary.getSearcher(text); + BaseSearcher searcher = this.getSearcher(text); int offset; Map.Entry entry; while ((entry = searcher.next()) != null) @@ -601,7 +655,7 @@ public static void parseText(String text, AhoCorasickDoubleArrayTrie.IHit processor) + public void parseLongestText(String text, AhoCorasickDoubleArrayTrie.IHit processor) { if (trie != null) { @@ -649,9 +703,8 @@ public void hit(int begin, int end, CoreDictionary.Attribute value) * 集群环境(或其他IOAdapter)需要自行删除缓存文件(路径 = HanLP.Config.CustomDictionaryPath[0] + Predefine.BIN_EXT) * @return 是否加载成功 */ - public static boolean reload() + public boolean reload() { - String path[] = HanLP.Config.CustomDictionaryPath; if (path == null || path.length == 0) return false; IOUtil.deleteFile(path[0] + Predefine.BIN_EXT); // 删掉缓存 return loadMainDictionary(path[0]); diff --git a/src/main/java/com/hankcs/hanlp/seg/CharacterBasedSegment.java b/src/main/java/com/hankcs/hanlp/seg/CharacterBasedSegment.java index a1171f171..042ef0f68 100644 --- a/src/main/java/com/hankcs/hanlp/seg/CharacterBasedSegment.java +++ b/src/main/java/com/hankcs/hanlp/seg/CharacterBasedSegment.java @@ -28,7 +28,6 @@ */ public abstract class CharacterBasedSegment extends Segment { - /** * 查询或猜测一个词语的属性, * 先查词典,然后对字母、数字串的属性进行判断,最后猜测未登录词 @@ -40,7 +39,7 @@ public static CoreDictionary.Attribute guessAttribute(Term term) CoreDictionary.Attribute attribute = CoreDictionary.get(term.word); if (attribute == null) { - attribute = CustomDictionary.get(term.word); + attribute = CustomDictionary.DEFAULT.get(term.word); } if (attribute == null) { diff --git a/src/main/java/com/hankcs/hanlp/seg/Other/DoubleArrayTrieSegment.java b/src/main/java/com/hankcs/hanlp/seg/Other/DoubleArrayTrieSegment.java index 513007f73..a3d2bc374 100644 --- a/src/main/java/com/hankcs/hanlp/seg/Other/DoubleArrayTrieSegment.java +++ b/src/main/java/com/hankcs/hanlp/seg/Other/DoubleArrayTrieSegment.java @@ -37,6 +37,8 @@ public class DoubleArrayTrieSegment extends DictionaryBasedSegment */ public DoubleArrayTrie trie; + public CustomDictionary customDictionary = CustomDictionary.DEFAULT; + /** * 使用核心词库的trie树构造分词器 */ @@ -78,10 +80,10 @@ protected List segSentence(char[] sentence) matchLongest(sentence, wordNet, natureArray, trie); if (config.useCustomDictionary) { - matchLongest(sentence, wordNet, natureArray, CustomDictionary.dat); - if (CustomDictionary.trie != null) + matchLongest(sentence, wordNet, natureArray, customDictionary.dat); + if (customDictionary.trie != null) { - CustomDictionary.trie.parseLongestText(charArray, new AhoCorasickDoubleArrayTrie.IHit() + customDictionary.trie.parseLongestText(charArray, new AhoCorasickDoubleArrayTrie.IHit() { @Override public void hit(int begin, int end, CoreDictionary.Attribute value) diff --git a/src/main/java/com/hankcs/hanlp/seg/Segment.java b/src/main/java/com/hankcs/hanlp/seg/Segment.java index 4dcc0c893..72f9f24a9 100644 --- a/src/main/java/com/hankcs/hanlp/seg/Segment.java +++ b/src/main/java/com/hankcs/hanlp/seg/Segment.java @@ -15,6 +15,7 @@ import com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie; import com.hankcs.hanlp.collection.trie.DoubleArrayTrie; import com.hankcs.hanlp.collection.trie.bintrie.BaseNode; +import com.hankcs.hanlp.collection.trie.bintrie.BinTrie; import com.hankcs.hanlp.corpus.tag.Nature; import com.hankcs.hanlp.dictionary.CoreDictionary; import com.hankcs.hanlp.dictionary.CustomDictionary; @@ -46,12 +47,39 @@ public abstract class Segment */ protected Config config; + /** + * 自定义词典,默认所有分词器共用一套定义在{@code com.hankcs.hanlp.HanLP.Config#CustomDictionaryPath}中的词典 + */ + public CustomDictionary customDictionary; + /** * 构造一个分词器 */ public Segment() { - config = new Config(); + this(new Config(), CustomDictionary.DEFAULT); + } + + /** + * 用自定义词典构造 + * + * @param customDictionary 自定义词典 + */ + public Segment(CustomDictionary customDictionary) + { + this(new Config(), customDictionary); + } + + /** + * 用配置和自定义词典构造 + * + * @param config + * @param customDictionary + */ + public Segment(Config config, CustomDictionary customDictionary) + { + this.config = config; + this.customDictionary = customDictionary; } /** @@ -196,17 +224,18 @@ protected static List quickAtomSegment(char[] charArray, int start, in */ protected static List combineByCustomDictionary(List vertexList) { - return combineByCustomDictionary(vertexList, CustomDictionary.dat); + return combineByCustomDictionary(vertexList, CustomDictionary.DEFAULT); } /** * 使用用户词典合并粗分结果 * @param vertexList 粗分结果 - * @param dat 用户自定义词典 + * @param customDictionary 用户自定义词典 * @return 合并后的结果 */ - protected static List combineByCustomDictionary(List vertexList, DoubleArrayTrie dat) + protected static List combineByCustomDictionary(List vertexList, CustomDictionary customDictionary) { + DoubleArrayTrie dat = customDictionary.dat; assert vertexList.size() >= 2 : "vertexList至少包含 始##始 和 末##末"; Vertex[] wordNet = new Vertex[vertexList.size()]; vertexList.toArray(wordNet); @@ -240,12 +269,13 @@ protected static List combineByCustomDictionary(List vertexList, } } // BinTrie合并 - if (CustomDictionary.trie != null) + BinTrie trie = customDictionary.trie; + if (trie != null) { for (int i = 1; i < length; ++i) { if (wordNet[i] == null) continue; - BaseNode state = CustomDictionary.trie.transition(wordNet[i].realWord.toCharArray(), 0); + BaseNode state = trie.transition(wordNet[i].realWord.toCharArray(), 0); if (state != null) { int to = i + 1; @@ -286,19 +316,19 @@ protected static List combineByCustomDictionary(List vertexList, */ protected static List combineByCustomDictionary(List vertexList, final WordNet wordNetAll) { - return combineByCustomDictionary(vertexList, CustomDictionary.dat, wordNetAll); + return combineByCustomDictionary(vertexList, CustomDictionary.DEFAULT, wordNetAll); } /** * 使用用户词典合并粗分结果,并将用户词语收集到全词图中 * @param vertexList 粗分结果 - * @param dat 用户自定义词典 + * @param customDictionary 用户自定义词典 * @param wordNetAll 收集用户词语到全词图中 * @return 合并后的结果 */ - protected static List combineByCustomDictionary(List vertexList, DoubleArrayTrie dat, final WordNet wordNetAll) + protected static List combineByCustomDictionary(List vertexList, CustomDictionary customDictionary, final WordNet wordNetAll) { - List outputList = combineByCustomDictionary(vertexList, dat); + List outputList = combineByCustomDictionary(vertexList, customDictionary); int line = 0; for (final Vertex vertex : outputList) { @@ -306,7 +336,7 @@ protected static List combineByCustomDictionary(List vertexList, final int currentLine = line; if (parentLength >= 3) { - CustomDictionary.parseText(vertex.realWord, new AhoCorasickDoubleArrayTrie.IHit() + customDictionary.parseText(vertex.realWord, new AhoCorasickDoubleArrayTrie.IHit() { @Override public void hit(int begin, int end, CoreDictionary.Attribute value) @@ -722,6 +752,18 @@ public Segment enableCustomDictionary(boolean enable) return this; } + /** + * 启用新的用户词典 + * + * @param customDictionary 新的自定义词典 + */ + public Segment enableCustomDictionary(CustomDictionary customDictionary) + { + config.useCustomDictionary = true; + this.customDictionary = customDictionary; + return this; + } + /** * 是否尽可能强制使用用户词典(使用户词典的优先级尽可能高)
* 警告:具体实现由各子类决定,可能会破坏分词器的统计特性(例如,如果用户词典 diff --git a/src/main/java/com/hankcs/hanlp/seg/Viterbi/ViterbiSegment.java b/src/main/java/com/hankcs/hanlp/seg/Viterbi/ViterbiSegment.java index 48529130f..66f75212f 100644 --- a/src/main/java/com/hankcs/hanlp/seg/Viterbi/ViterbiSegment.java +++ b/src/main/java/com/hankcs/hanlp/seg/Viterbi/ViterbiSegment.java @@ -40,11 +40,9 @@ */ public class ViterbiSegment extends WordBasedSegment { - private DoubleArrayTrie dat; public ViterbiSegment() { - this.dat = CustomDictionary.dat; } /** @@ -66,12 +64,12 @@ public ViterbiSegment(String customPath, boolean cache) public DoubleArrayTrie getDat() { - return dat; + return customDictionary.dat; } public void setDat(DoubleArrayTrie dat) { - this.dat = dat; + this.customDictionary.dat = dat; } @Override @@ -94,8 +92,8 @@ protected List segSentence(char[] sentence) if (config.useCustomDictionary) { if (config.indexMode > 0) - combineByCustomDictionary(vertexList, this.dat, wordNetAll); - else combineByCustomDictionary(vertexList, this.dat); + combineByCustomDictionary(vertexList, customDictionary, wordNetAll); + else combineByCustomDictionary(vertexList, customDictionary); } if (HanLP.Config.DEBUG) diff --git a/src/main/java/com/hankcs/hanlp/seg/WordBasedSegment.java b/src/main/java/com/hankcs/hanlp/seg/WordBasedSegment.java index f0250786b..c4240f1a2 100644 --- a/src/main/java/com/hankcs/hanlp/seg/WordBasedSegment.java +++ b/src/main/java/com/hankcs/hanlp/seg/WordBasedSegment.java @@ -33,6 +33,7 @@ */ public abstract class WordBasedSegment extends Segment { + public CustomDictionary customDictionary = CustomDictionary.DEFAULT; public WordBasedSegment() { @@ -396,7 +397,7 @@ protected void generateWordNet(final WordNet wordNetStorage) // 强制用户词典查询 if (config.forceCustomDictionary) { - CustomDictionary.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHit() + customDictionary.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHit() { @Override public void hit(int begin, int end, CoreDictionary.Attribute value) diff --git a/src/main/java/com/hankcs/hanlp/tokenizer/lexical/AbstractLexicalAnalyzer.java b/src/main/java/com/hankcs/hanlp/tokenizer/lexical/AbstractLexicalAnalyzer.java index 45deab736..6460e2904 100644 --- a/src/main/java/com/hankcs/hanlp/tokenizer/lexical/AbstractLexicalAnalyzer.java +++ b/src/main/java/com/hankcs/hanlp/tokenizer/lexical/AbstractLexicalAnalyzer.java @@ -111,7 +111,7 @@ protected void segment(final String sentence, final String normalized, final Lis if (attributeList != null) { final int[] offset = new int[]{0}; - CustomDictionary.parseLongestText(sentence, new AhoCorasickDoubleArrayTrie.IHit() + customDictionary.parseLongestText(sentence, new AhoCorasickDoubleArrayTrie.IHit() { @Override public void hit(int begin, int end, CoreDictionary.Attribute value) @@ -145,7 +145,7 @@ public void segment(final String sentence, final String normalized, final List() + customDictionary.parseLongestText(sentence, new AhoCorasickDoubleArrayTrie.IHit() { @Override public void hit(int begin, int end, CoreDictionary.Attribute value) @@ -620,13 +620,13 @@ private List segmentWithAttribute(String original, Str * @param vertexList 粗分结果 * @return 合并后的结果 */ - protected static List combineWithCustomDictionary(List vertexList) + protected List combineWithCustomDictionary(List vertexList) { String[] wordNet = new String[vertexList.size()]; vertexList.toArray(wordNet); CoreDictionary.Attribute[] attributeArray = new CoreDictionary.Attribute[wordNet.length]; // DAT合并 - DoubleArrayTrie dat = CustomDictionary.dat; + DoubleArrayTrie dat = customDictionary.dat; int length = wordNet.length; for (int i = 0; i < length; ++i) { @@ -656,12 +656,12 @@ protected static List combineWithCustomDictionary(List } } // BinTrie合并 - if (CustomDictionary.trie != null) + if (customDictionary.trie != null) { for (int i = 0; i < length; ++i) { if (wordNet[i] == null) continue; - BaseNode state = CustomDictionary.trie.transition(wordNet[i], 0); + BaseNode state = customDictionary.trie.transition(wordNet[i], 0); if (state != null) { int to = i + 1; diff --git a/src/main/java/com/hankcs/hanlp/utility/LexiconUtility.java b/src/main/java/com/hankcs/hanlp/utility/LexiconUtility.java index 252e7813d..37ea6a43f 100644 --- a/src/main/java/com/hankcs/hanlp/utility/LexiconUtility.java +++ b/src/main/java/com/hankcs/hanlp/utility/LexiconUtility.java @@ -36,7 +36,7 @@ public static CoreDictionary.Attribute getAttribute(String word) { CoreDictionary.Attribute attribute = CoreDictionary.get(word); if (attribute != null) return attribute; - return CustomDictionary.get(word); + return CustomDictionary.DEFAULT.get(word); } /** @@ -83,12 +83,12 @@ public static boolean setAttribute(String word, CoreDictionary.Attribute attribu if (attribute == null) return false; if (CoreDictionary.trie.set(word, attribute)) return true; - if (CustomDictionary.dat.set(word, attribute)) return true; - if (CustomDictionary.trie == null) + if (CustomDictionary.DEFAULT.dat.set(word, attribute)) return true; + if (CustomDictionary.DEFAULT.trie == null) { - CustomDictionary.add(word); + CustomDictionary.DEFAULT.add(word); } - CustomDictionary.trie.put(word, attribute); + CustomDictionary.DEFAULT.trie.put(word, attribute); return true; } diff --git a/src/test/java/com/hankcs/book/ch03/DemoCustomDictionary.java b/src/test/java/com/hankcs/book/ch03/DemoCustomDictionary.java index 277dc326c..9236e2edc 100644 --- a/src/test/java/com/hankcs/book/ch03/DemoCustomDictionary.java +++ b/src/test/java/com/hankcs/book/ch03/DemoCustomDictionary.java @@ -10,7 +10,6 @@ */ package com.hankcs.book.ch03; -import com.hankcs.hanlp.dictionary.CustomDictionary; import com.hankcs.hanlp.seg.Segment; import com.hankcs.hanlp.seg.Viterbi.ViterbiSegment; @@ -31,7 +30,7 @@ public static void main(String[] args) final String sentence = "社会摇摆简称社会摇"; segment.enableCustomDictionary(false); System.out.println("不挂载词典:" + segment.seg(sentence)); - CustomDictionary.insert("社会摇", "nz 100"); + segment.customDictionary.insert("社会摇", "nz 100"); segment.enableCustomDictionary(true); System.out.println("低优先级词典:" + segment.seg(sentence)); segment.enableCustomDictionaryForcing(true); diff --git a/src/test/java/com/hankcs/book/ch05/OnlineLearning.java b/src/test/java/com/hankcs/book/ch05/OnlineLearning.java index d29b3616e..739d98c5e 100644 --- a/src/test/java/com/hankcs/book/ch05/OnlineLearning.java +++ b/src/test/java/com/hankcs/book/ch05/OnlineLearning.java @@ -37,7 +37,7 @@ public static void main(String[] args) throws IOException String text = "与川普通电话"; System.out.println(segment.seg(text)); - CustomDictionary.insert("川普", "nrf 1"); + segment.customDictionary.insert("川普", "nrf 1"); segment.enableCustomDictionaryForcing(true); System.out.println(segment.seg(text)); diff --git a/src/test/java/com/hankcs/book/ch07/CustomPOS.java b/src/test/java/com/hankcs/book/ch07/CustomPOS.java index da9b657bb..458bbfb3d 100644 --- a/src/test/java/com/hankcs/book/ch07/CustomPOS.java +++ b/src/test/java/com/hankcs/book/ch07/CustomPOS.java @@ -28,10 +28,10 @@ public class CustomPOS { public static void main(String[] args) throws IOException { - CustomDictionary.insert("苹果", "手机品牌 1"); - CustomDictionary.insert("iPhone X", "手机型号 1"); PerceptronLexicalAnalyzer analyzer = new PerceptronLexicalAnalyzer(); analyzer.enableCustomDictionaryForcing(true); + analyzer.customDictionary.insert("苹果", "手机品牌 1"); + analyzer.customDictionary.insert("iPhone X", "手机型号 1"); System.out.println(analyzer.analyze("你们苹果iPhone X保修吗?")); System.out.println(analyzer.analyze("多吃苹果有益健康")); } diff --git a/src/test/java/com/hankcs/demo/DemoCustomDictionary.java b/src/test/java/com/hankcs/demo/DemoCustomDictionary.java index 243e1056b..43e954c48 100644 --- a/src/test/java/com/hankcs/demo/DemoCustomDictionary.java +++ b/src/test/java/com/hankcs/demo/DemoCustomDictionary.java @@ -16,6 +16,8 @@ import com.hankcs.hanlp.dictionary.BaseSearcher; import com.hankcs.hanlp.dictionary.CoreDictionary; import com.hankcs.hanlp.dictionary.CustomDictionary; +import com.hankcs.hanlp.seg.Viterbi.ViterbiSegment; +import com.hankcs.hanlp.tokenizer.StandardTokenizer; import java.util.Map; @@ -28,20 +30,25 @@ public class DemoCustomDictionary { public static void main(String[] args) { + // 系统默认的词典 + CustomDictionary dictionary = CustomDictionary.DEFAULT; + // 每个分词器都有一份词典,默认公用 CustomDictionary.DEFAULT,你可以为任何分词器指定一份不同的词典 + CustomDictionary myDictionary = new CustomDictionary("data/dictionary/custom/CustomDictionary.txt", "data/dictionary/custom/机构名词典.txt"); + StandardTokenizer.SEGMENT.enableCustomDictionary(myDictionary); // 动态增加 - CustomDictionary.add("攻城狮"); + dictionary.add("攻城狮"); // 强行插入 - CustomDictionary.insert("白富美", "nz 1024"); + dictionary.insert("白富美", "nz 1024"); // 删除词语(注释掉试试) -// CustomDictionary.remove("攻城狮"); - System.out.println(CustomDictionary.add("单身狗", "nz 1024 n 1")); - System.out.println(CustomDictionary.get("单身狗")); +// dictionary.remove("攻城狮"); + System.out.println(dictionary.add("单身狗", "nz 1024 n 1")); + System.out.println(dictionary.get("单身狗")); String text = "攻城狮逆袭单身狗,迎娶白富美,走上人生巅峰"; // 怎么可能噗哈哈! // DoubleArrayTrie分词 final char[] charArray = text.toCharArray(); - CustomDictionary.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHit() + dictionary.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHit() { @Override public void hit(int begin, int end, CoreDictionary.Attribute value) @@ -50,7 +57,7 @@ public void hit(int begin, int end, CoreDictionary.Attribute value) } }); // 首字哈希之后二分的trie树分词 - BaseSearcher searcher = CustomDictionary.getSearcher(text); + BaseSearcher searcher = dictionary.getSearcher(text); Map.Entry entry; while ((entry = searcher.next()) != null) { diff --git a/src/test/java/com/hankcs/demo/DemoCustomNature.java b/src/test/java/com/hankcs/demo/DemoCustomNature.java index 86581e7f2..ab80a32bf 100644 --- a/src/test/java/com/hankcs/demo/DemoCustomNature.java +++ b/src/test/java/com/hankcs/demo/DemoCustomNature.java @@ -53,7 +53,7 @@ public static void main(String[] args) System.out.printf("找到了 [%s] : %s\n", pcNature, term.word); } // 还可以直接插入到用户词典 - CustomDictionary.insert("阿尔法狗", "科技名词 1024"); + CustomDictionary.DEFAULT.insert("阿尔法狗", "科技名词 1024"); StandardTokenizer.SEGMENT.enablePartOfSpeechTagging(true); // 依然支持隐马词性标注 termList = HanLP.segment("苹果电脑可以运行开源阿尔法狗代码吗"); System.out.println(termList); diff --git a/src/test/java/com/hankcs/demo/DemoNormalization.java b/src/test/java/com/hankcs/demo/DemoNormalization.java index e0f55b275..0423d1f45 100644 --- a/src/test/java/com/hankcs/demo/DemoNormalization.java +++ b/src/test/java/com/hankcs/demo/DemoNormalization.java @@ -25,7 +25,7 @@ public class DemoNormalization public static void main(String[] args) { HanLP.Config.Normalization = true; - CustomDictionary.insert("爱听4G", "nz 1000"); + CustomDictionary.DEFAULT.insert("爱听4G", "nz 1000"); System.out.println(HanLP.segment("爱听4g")); System.out.println(HanLP.segment("爱听4G")); System.out.println(HanLP.segment("爱听4G")); diff --git a/src/test/java/com/hankcs/hanlp/collection/trie/DoubleArrayTrieTest.java b/src/test/java/com/hankcs/hanlp/collection/trie/DoubleArrayTrieTest.java index d7419f873..ed9131e56 100644 --- a/src/test/java/com/hankcs/hanlp/collection/trie/DoubleArrayTrieTest.java +++ b/src/test/java/com/hankcs/hanlp/collection/trie/DoubleArrayTrieTest.java @@ -56,7 +56,7 @@ public void testLongestSearcher() throws Exception public void testTransmit() throws Exception { - DoubleArrayTrie dat = CustomDictionary.dat; + DoubleArrayTrie dat = CustomDictionary.DEFAULT.dat; int index = dat.transition("龙", 1); assertNull(dat.output(index)); index = dat.transition("窝", index); diff --git a/src/test/java/com/hankcs/hanlp/dictionary/CustomDictionaryTest.java b/src/test/java/com/hankcs/hanlp/dictionary/CustomDictionaryTest.java index 884e8bbaa..14cd7b749 100644 --- a/src/test/java/com/hankcs/hanlp/dictionary/CustomDictionaryTest.java +++ b/src/test/java/com/hankcs/hanlp/dictionary/CustomDictionaryTest.java @@ -25,7 +25,7 @@ public class CustomDictionaryTest extends TestCase public void testGet() throws Exception { - assertEquals("nz 1 ", CustomDictionary.get("一个心眼儿").toString()); + assertEquals("nz 1 ", CustomDictionary.DEFAULT.get("一个心眼儿").toString()); } /** @@ -164,15 +164,15 @@ public void testCustomNature() throws Exception public void testIssue540() throws Exception { - CustomDictionary.add("123"); - CustomDictionary.add("摩根"); - CustomDictionary.remove("123"); - CustomDictionary.remove("摩根"); + CustomDictionary.DEFAULT.add("123"); + CustomDictionary.DEFAULT.add("摩根"); + CustomDictionary.DEFAULT.remove("123"); + CustomDictionary.DEFAULT.remove("摩根"); } public void testReload() { - CustomDictionary.reload(); + CustomDictionary.DEFAULT.reload(); System.out.println(HanLP.segment("自然语言处理")); } } \ No newline at end of file diff --git a/src/test/java/com/hankcs/hanlp/model/perceptron/PerceptronLexicalAnalyzerTest.java b/src/test/java/com/hankcs/hanlp/model/perceptron/PerceptronLexicalAnalyzerTest.java index 9a7e0d0e4..e84d407d9 100644 --- a/src/test/java/com/hankcs/hanlp/model/perceptron/PerceptronLexicalAnalyzerTest.java +++ b/src/test/java/com/hankcs/hanlp/model/perceptron/PerceptronLexicalAnalyzerTest.java @@ -46,7 +46,7 @@ public void testEmptyInput() throws Exception public void testCustomDictionary() throws Exception { analyzer.enableCustomDictionary(true); - assertTrue(CustomDictionary.contains("一字长蛇阵")); + assertTrue(CustomDictionary.DEFAULT.contains("一字长蛇阵")); final String text = "张飞摆出一字长蛇阵如入无人之境,孙权惊呆了"; // System.out.println(analyzer.analyze(text)); assertTrue(analyzer.analyze(text).toString().contains(" 一字长蛇阵/")); @@ -54,7 +54,7 @@ public void testCustomDictionary() throws Exception public void testCustomNature() throws Exception { - assertTrue(CustomDictionary.insert("饿了么", "ntc 1")); + assertTrue(CustomDictionary.DEFAULT.insert("饿了么", "ntc 1")); analyzer.enableCustomDictionaryForcing(true); assertEquals("美团/n 与/p 饿了么/ntc 争夺/v 外卖/v 市场/n", analyzer.analyze("美团与饿了么争夺外卖市场").toString()); } @@ -113,7 +113,7 @@ public void testWhiteSpace() throws Exception public void testCustomDictionaryForcing() throws Exception { String text = "银川普通人与川普通电话讲四川普通话"; - CustomDictionary.insert("川普", "NRF 1"); + CustomDictionary.DEFAULT.insert("川普", "NRF 1"); analyzer.enableCustomDictionaryForcing(false); System.out.println(analyzer.analyze(text)); diff --git a/src/test/java/com/hankcs/hanlp/model/perceptron/PerceptronSegmenterTest.java b/src/test/java/com/hankcs/hanlp/model/perceptron/PerceptronSegmenterTest.java index e349e939f..953c1689d 100644 --- a/src/test/java/com/hankcs/hanlp/model/perceptron/PerceptronSegmenterTest.java +++ b/src/test/java/com/hankcs/hanlp/model/perceptron/PerceptronSegmenterTest.java @@ -32,7 +32,7 @@ public void testNoCustomDictionary() throws Exception { PerceptronLexicalAnalyzer analyzer = new PerceptronLexicalAnalyzer(); analyzer.enableCustomDictionary(false); - CustomDictionary.insert("禁用用户词典"); + analyzer.customDictionary.insert("禁用用户词典"); assertEquals("[禁用/v, 用户/n, 词典/n]", analyzer.seg("禁用用户词典").toString()); } diff --git a/src/test/java/com/hankcs/hanlp/seg/SegmentTest.java b/src/test/java/com/hankcs/hanlp/seg/SegmentTest.java index ac18c9e2c..4114f44f9 100644 --- a/src/test/java/com/hankcs/hanlp/seg/SegmentTest.java +++ b/src/test/java/com/hankcs/hanlp/seg/SegmentTest.java @@ -65,7 +65,7 @@ public void testIssue880() throws Exception public void testViterbi() throws Exception { // HanLP.Config.enableDebug(true); - CustomDictionary.add("网剧"); + CustomDictionary.DEFAULT.add("网剧"); Segment seg = new DijkstraSegment(); List termList = seg.seg("优酷总裁魏明介绍了优酷2015年的内容战略,表示要以“大电影、大网剧、大综艺”为关键词"); // System.out.println(termList); @@ -152,7 +152,7 @@ public void testFactory() throws Exception public void testCustomDictionary() throws Exception { - CustomDictionary.insert("肯德基", "ns 1000"); + CustomDictionary.DEFAULT.insert("肯德基", "ns 1000"); Segment segment = new ViterbiSegment(); // System.out.println(segment.seg("肯德基")); } @@ -176,7 +176,7 @@ public void testIssue2() throws Exception // HanLP.Config.enableDebug(); String text = "BENQphone"; // System.out.println(HanLP.segment(text)); - CustomDictionary.insert("BENQ"); + CustomDictionary.DEFAULT.insert("BENQ"); // System.out.println(HanLP.segment(text)); } @@ -323,7 +323,7 @@ public void testIssue10() throws Exception public void testIssue16() throws Exception { - CustomDictionary.insert("爱听4g", "nz 1000"); + CustomDictionary.DEFAULT.insert("爱听4g", "nz 1000"); Segment segment = new ViterbiSegment(); // System.out.println(segment.seg("爱听4g")); // System.out.println(segment.seg("爱听4G")); @@ -425,8 +425,8 @@ public void testIssue290() throws Exception public void testIssue343() throws Exception { - CustomDictionary.insert("酷我"); - CustomDictionary.insert("酷我音乐"); + CustomDictionary.DEFAULT.insert("酷我"); + CustomDictionary.DEFAULT.insert("酷我音乐"); Segment segment = HanLP.newSegment().enableIndexMode(true); // System.out.println(segment.seg("1酷我音乐2酷我音乐3酷我4酷我音乐6酷7酷我音乐")); } @@ -495,7 +495,7 @@ public void testIssue623() throws Exception public void testIssue633() throws Exception { - CustomDictionary.add("钱管家"); + CustomDictionary.DEFAULT.add("钱管家"); StandardTokenizer.SEGMENT.enableCustomDictionaryForcing(true); // System.out.println(HanLP.segment("钱管家中怎么绑定网银")); } @@ -503,7 +503,7 @@ public void testIssue633() throws Exception public void testIssue784() throws Exception { String s = "苏苏中级会计什么时候更新"; - CustomDictionary.add("苏苏"); + CustomDictionary.DEFAULT.add("苏苏"); StandardTokenizer.SEGMENT.enableCustomDictionaryForcing(true); assertTrue(HanLP.segment(s).toString().contains("苏苏")); } @@ -533,7 +533,7 @@ public void testIssue932() throws Exception public void testIssue1172() { - CustomDictionary.insert("我的额度", "xyz"); + CustomDictionary.DEFAULT.insert("我的额度", "xyz"); System.out.println(HanLP.segment("我的额度不够,需要提高额度")); } } diff --git a/src/test/java/com/hankcs/hanlp/tokenizer/lexical/AbstractLexicalAnalyzerTest.java b/src/test/java/com/hankcs/hanlp/tokenizer/lexical/AbstractLexicalAnalyzerTest.java index 54280c35a..6e80070e3 100644 --- a/src/test/java/com/hankcs/hanlp/tokenizer/lexical/AbstractLexicalAnalyzerTest.java +++ b/src/test/java/com/hankcs/hanlp/tokenizer/lexical/AbstractLexicalAnalyzerTest.java @@ -33,7 +33,7 @@ public void testCustomDictionary() throws Exception LexicalAnalyzer analyzer = new PerceptronLexicalAnalyzer(); String text = "攻城狮逆袭单身狗,迎娶白富美,走上人生巅峰"; System.out.println(analyzer.segment(text)); - CustomDictionary.add("攻城狮"); + CustomDictionary.DEFAULT.add("攻城狮"); System.out.println(analyzer.segment(text)); } @@ -42,7 +42,7 @@ public void testOverwriteTag() throws IOException CRFLexicalAnalyzer analyzer = new CRFLexicalAnalyzer(); String text = "强行修改词性"; System.out.println(analyzer.seg(text)); - CustomDictionary.add("修改", "自定义词性"); + CustomDictionary.DEFAULT.add("修改", "自定义词性"); System.out.println(analyzer.seg(text)); } } \ No newline at end of file