Skip to content

Commit

Permalink
Segment 添加是否进行 Normalize 的配置方法 close #1714
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Mar 8, 2022
1 parent 4b43124 commit 69506a7
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ public class CustomDictionary
*/
public static boolean loadMainDictionary(String mainPath, String path[], DoubleArrayTrie<CoreDictionary.Attribute> dat, boolean isCache)
{
return DynamicCustomDictionary.loadMainDictionary(mainPath, path, dat, isCache);
return DynamicCustomDictionary.loadMainDictionary(mainPath, path, dat, isCache, HanLP.Config.Normalization);
}


Expand All @@ -59,7 +59,7 @@ public static boolean loadMainDictionary(String mainPath, String path[], DoubleA
*/
public static boolean load(String path, Nature defaultNature, TreeMap<String, CoreDictionary.Attribute> map, LinkedHashSet<Nature> customNatureCollector)
{
return DynamicCustomDictionary.load(path, defaultNature, map, customNatureCollector);
return DynamicCustomDictionary.load(path, defaultNature, map, customNatureCollector, HanLP.Config.Normalization);
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ public class DynamicCustomDictionary
*/
public String path[];

/**
* Whether to perform character normalization (Traditional -> Simplified Chinese, full-width -> half-width,
* upper case -> lower case). After switching this setting, the CustomDictionary.txt.bin cache must be deleted.
* Initialized from the global {@code HanLP.Config.Normalization} value.
*/
public boolean normalization = HanLP.Config.Normalization;

/**
* 构造一份词典对象,并加载{@code com.hankcs.hanlp.HanLP.Config#CustomDictionaryPath}
Expand Down Expand Up @@ -93,7 +97,7 @@ public DynamicCustomDictionary(DoubleArrayTrie<CoreDictionary.Attribute> dat, Bi
public boolean load(String... path)
{
long start = System.currentTimeMillis();
if (!loadMainDictionary(path[0]))
if (!loadMainDictionary(path[0], normalization))
{
logger.warning("自定义词典" + Arrays.toString(path) + "加载失败");
return false;
Expand All @@ -113,7 +117,7 @@ public boolean load(String... path)
* @param path 自定义词典
* @param isCache 是否缓存结果
*/
public static boolean loadMainDictionary(String mainPath, String path[], DoubleArrayTrie<CoreDictionary.Attribute> dat, boolean isCache)
public static boolean loadMainDictionary(String mainPath, String path[], DoubleArrayTrie<CoreDictionary.Attribute> dat, boolean isCache, boolean normalization)
{
logger.info("自定义词典开始加载:" + mainPath);
if (loadDat(mainPath, dat)) return true;
Expand Down Expand Up @@ -144,7 +148,7 @@ public static boolean loadMainDictionary(String mainPath, String path[], DoubleA
}
}
logger.info("以默认词性[" + defaultNature + "]加载自定义词典" + p + "中……");
boolean success = load(p, defaultNature, map, customNatureCollector);
boolean success = load(p, defaultNature, map, customNatureCollector, normalization);
if (!success) logger.warning("失败:" + p);
}
if (map.size() == 0)
Expand Down Expand Up @@ -207,9 +211,9 @@ public static boolean loadMainDictionary(String mainPath, String path[], DoubleA
* @param mainPath 词典路径(+.bin等于缓存路径)
* @return
*/
public boolean loadMainDictionary(String mainPath)
public boolean loadMainDictionary(String mainPath, boolean normalization)
{
return loadMainDictionary(mainPath, HanLP.Config.CustomDictionaryPath, this.dat, true);
return loadMainDictionary(mainPath, HanLP.Config.CustomDictionaryPath, this.dat, true, normalization);
}


Expand All @@ -221,7 +225,7 @@ public boolean loadMainDictionary(String mainPath)
* @param customNatureCollector 收集用户词性
* @return
*/
public static boolean load(String path, Nature defaultNature, TreeMap<String, CoreDictionary.Attribute> map, LinkedHashSet<Nature> customNatureCollector)
public static boolean load(String path, Nature defaultNature, TreeMap<String, CoreDictionary.Attribute> map, LinkedHashSet<Nature> customNatureCollector, boolean normalization)
{
try
{
Expand All @@ -242,7 +246,7 @@ public static boolean load(String path, Nature defaultNature, TreeMap<String, Co
}
String[] param = line.split(splitter);
if (param[0].length() == 0) continue; // 排除空行
if (HanLP.Config.Normalization) param[0] = CharTable.convert(param[0]); // 正规化
if (normalization) param[0] = CharTable.convert(param[0]); // 正规化

int natureCount = (param.length - 1) / 2;
CoreDictionary.Attribute attribute;
Expand Down Expand Up @@ -333,7 +337,7 @@ public boolean add(String word, String natureWithFrequency)
*/
public boolean add(String word)
{
if (HanLP.Config.Normalization) word = CharTable.convert(word);
if (normalization) word = CharTable.convert(word);
if (contains(word)) return false;
return insert(word, null);
}
Expand All @@ -349,7 +353,7 @@ public boolean add(String word)
public boolean insert(String word, String natureWithFrequency)
{
if (word == null) return false;
if (HanLP.Config.Normalization) word = CharTable.convert(word);
if (normalization) word = CharTable.convert(word);
CoreDictionary.Attribute att = natureWithFrequency == null ? new CoreDictionary.Attribute(Nature.nz, 1) : CoreDictionary.Attribute.create(natureWithFrequency);
if (att == null) return false;
if (dat.set(word, att)) return true;
Expand Down Expand Up @@ -474,7 +478,7 @@ public static boolean isDicNeedUpdate(String mainPath, String path[])
*/
public CoreDictionary.Attribute get(String key)
{
if (HanLP.Config.Normalization) key = CharTable.convert(key);
if (normalization) key = CharTable.convert(key);
CoreDictionary.Attribute attribute = dat.get(key);
if (attribute != null) return attribute;
if (trie == null) return null;
Expand All @@ -489,7 +493,7 @@ public CoreDictionary.Attribute get(String key)
*/
public void remove(String key)
{
if (HanLP.Config.Normalization) key = CharTable.convert(key);
if (normalization) key = CharTable.convert(key);
if (trie == null) return;
trie.remove(key);
}
Expand Down Expand Up @@ -715,6 +719,6 @@ public boolean reload()
{
if (path == null || path.length == 0) return false;
IOUtil.deleteFile(path[0] + Predefine.BIN_EXT); // 删掉缓存
return loadMainDictionary(path[0]);
return loadMainDictionary(path[0], normalization);
}
}
7 changes: 7 additions & 0 deletions src/main/java/com/hankcs/hanlp/seg/Config.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
*/
package com.hankcs.hanlp.seg;

import com.hankcs.hanlp.HanLP;

/**
* 分词器配置项
*/
Expand Down Expand Up @@ -86,4 +88,9 @@ public boolean isIndexMode()
{
return indexMode > 0;
}

/**
* Whether to perform character normalization (Traditional -> Simplified Chinese, full-width -> half-width,
* upper case -> lower case). After switching this setting, the CustomDictionary.txt.bin cache must be deleted.
* Initialized from the global {@code HanLP.Config.Normalization} value.
*/
public boolean normalization = HanLP.Config.Normalization;
}
14 changes: 12 additions & 2 deletions src/main/java/com/hankcs/hanlp/seg/Segment.java
Original file line number Diff line number Diff line change
Expand Up @@ -506,7 +506,7 @@ private static void removeFromWordNet(Vertex cur, WordNet wordNetAll, int line,
public List<Term> seg(String text)
{
char[] charArray = text.toCharArray();
if (HanLP.Config.Normalization)
if (config.normalization)
{
CharTable.normalization(charArray);
}
Expand Down Expand Up @@ -602,7 +602,7 @@ public List<Term> seg(String text)
public List<Term> seg(char[] text)
{
assert text != null;
if (HanLP.Config.Normalization)
if (config.normalization)
{
CharTable.normalization(text);
}
Expand Down Expand Up @@ -881,4 +881,14 @@ public Segment enableMultithreading(int threadNumber)
config.threadNumber = threadNumber;
return this;
}

/**
* Sets whether character normalization is performed (Traditional -> Simplified Chinese,
* full-width -> half-width, upper case -> lower case). After switching this setting, the
* CustomDictionary.txt.bin cache must be deleted.
* <p>
* NOTE(review): only {@code config.normalization} is written here; DynamicCustomDictionary keeps a
* separate {@code normalization} field -- confirm whether the two need to be kept in sync.
*
* @param normalization {@code true} to enable normalization, {@code false} to disable it
* @return this instance, so that calls can be chained
*/
public Segment enableNormalization(boolean normalization)
{

config.normalization = normalization;
return this;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.hankcs.hanlp.dictionary.CustomDictionary;
import com.hankcs.hanlp.dictionary.DynamicCustomDictionary;
import com.hankcs.hanlp.recognition.nr.JapanesePersonRecognition;
import com.hankcs.hanlp.recognition.nr.PersonRecognition;
import com.hankcs.hanlp.recognition.nr.TranslatedPersonRecognition;
Expand Down Expand Up @@ -211,7 +211,7 @@ private void loadCustomDic(String customPath, boolean isCache)
File file = new File(mainPath);
mainPath = file.getParent() + "/" + Math.abs(combinePath.toString().hashCode());
mainPath = mainPath.replace("\\", "/");
customDictionary.loadMainDictionary(mainPath, path, dat, isCache);
DynamicCustomDictionary.loadMainDictionary(mainPath, path, dat, isCache, config.normalization);
}

/**
Expand Down

0 comments on commit 69506a7

Please sign in to comment.