Skip to content

Commit

Permalink
DocVectorModel支持自定义分词器、开/关停用词过滤器 fix #1253 (comment)
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Jul 27, 2019
1 parent 86f5351 commit b0158f6
Showing 1 changed file with 55 additions and 2 deletions.
57 changes: 55 additions & 2 deletions src/main/java/com/hankcs/hanlp/mining/word2vec/DocVectorModel.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,13 @@
package com.hankcs.hanlp.mining.word2vec;


import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.NotionalTokenizer;

import java.util.*;
import java.util.List;
import java.util.Map;

/**
* 文档向量模型
Expand All @@ -24,11 +27,26 @@
public class DocVectorModel extends AbstractVectorModel<Integer>
{
/**
 * Word vector model supplied at construction time.
 */
private final WordVectorModel wordVectorModel;
/**
 * Segmenter (tokenizer)
 */
private Segment segment;
/**
 * Whether to use the CoreStopWordDictionary filter
 */
private boolean filter;

/**
 * Creates a document vector model over the given word vectors by delegating to the
 * three-argument constructor with {@code NotionalTokenizer.SEGMENT} as the segmenter
 * and the stop-word filter turned on.
 *
 * @param wordVectorModel word vector model to store in this instance
 */
public DocVectorModel(WordVectorModel wordVectorModel)
{
this(wordVectorModel, NotionalTokenizer.SEGMENT, true);
}

/**
 * Creates a document vector model with an explicit segmenter and stop-word filter setting.
 *
 * @param wordVectorModel word vector model to store in this instance
 * @param segment         segmenter to store in this instance
 * @param filter          {@code true} to enable the CoreStopWordDictionary filter,
 *                        {@code false} to disable it
 */
public DocVectorModel(WordVectorModel wordVectorModel, Segment segment, boolean filter)
{
    // The superclass no-argument constructor is invoked implicitly.
    this.filter = filter;
    this.segment = segment;
    this.wordVectorModel = wordVectorModel;
}

/**
Expand Down Expand Up @@ -68,7 +86,11 @@ public List<Map.Entry<Integer, Float>> nearest(String query)
public Vector query(String content)
{
if (content == null || content.length() == 0) return null;
List<Term> termList = NotionalTokenizer.segment(content);
List<Term> termList = segment.seg(content);
if (filter)
{
CoreStopWordDictionary.apply(termList);
}
Vector result = new Vector(dimension());
int n = 0;
for (Term term : termList)
Expand Down Expand Up @@ -97,6 +119,7 @@ public int dimension()

/**
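* Computes the similarity between two documents.
*
* @param what the first document text
* @param with the second document text
* @return the cosine similarity of the two document vectors, or -1 when a document vector is unavailable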
Expand All @@ -109,4 +132,34 @@ public float similarity(String what, String with)
if (B == null) return -1f;
return A.cosineForUnitVector(B);
}

/**
 * Returns the segmenter currently held by this model.
 *
 * @return the segmenter stored in this instance
 */
public Segment getSegment()
{
    return this.segment;
}

/**
 * Replaces the segmenter held by this model.
 *
 * @param segment the new segmenter to store
 */
public void setSegment(Segment segment)
{
this.segment = segment;
}

/**
 * Reports whether the stop-word filter is currently enabled.
 *
 * @return {@code true} if the stop-word filter is enabled, {@code false} otherwise
 */
public boolean isFilterEnabled()
{
    return this.filter;
}

/**
 * Enables or disables the stop-word filter.
 *
 * @param filter {@code true} to enable the filter, {@code false} to disable it
 */
public void enableFilter(boolean filter)
{
this.filter = filter;
}
}

0 comments on commit b0158f6

Please sign in to comment.