-
Notifications
You must be signed in to change notification settings - Fork 2
/
test_ja.py
57 lines (46 loc) · 1.91 KB
/
test_ja.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/usr/bin/env python
# coding: utf-8
from BM25F.ja import Normalizer
from BM25F.ja import PosFilter
from BM25F.ja import StemFilter
from BM25F.ja import Tokenizer
import unittest
class TestJapanese(unittest.TestCase):
    """Tests for the Japanese text helpers imported from ``BM25F.ja``.

    Exercises ``Normalizer``, ``StemFilter``, ``PosFilter`` and
    ``Tokenizer``.  The two filter objects are created once for the whole
    class in ``setUpClass`` and reused by every test that needs them.
    """

    @classmethod
    def setUpClass(cls):
        """Create the shared ``StemFilter`` and ``PosFilter`` instances."""
        cls.stem_filter = StemFilter()
        cls.pos_filter = PosFilter()

    # NOTE(review): the method name is misspelled ("normalzier").  It is kept
    # unchanged so that selecting this test by name keeps working.
    def test_normalzier(self):
        """``Normalizer.normalize`` returns the expected text per sample."""
        normalizer = Normalizer()
        # ASCII upper case is expected back in lower case.
        self.assertEqual('abc', normalizer.normalize('ABC'))
        # A three-character word ending in 'ー' is expected unchanged.
        self.assertEqual('カラー', normalizer.normalize('カラー'))
        # The final 'ー' of 'モニター' is expected to be dropped ...
        self.assertEqual('モニタ', normalizer.normalize('モニター'))
        # ... but only one of two consecutive trailing 'ー' characters.
        self.assertEqual('モニター', normalizer.normalize('モニターー'))
        # The 'ー' is also expected to be dropped when 'の' follows the word.
        self.assertEqual('モニタの', normalizer.normalize('モニターの'))
        # A word that does not end in 'ー' is expected unchanged.
        self.assertEqual('イーメール', normalizer.normalize('イーメール'))

    def test_stem_filter(self):
        """The stem filter contains 'ある' and does not contain 'テスト'."""
        # assertIn/assertNotIn report the member and container on failure,
        # unlike assertTrue(x in y) which only says "False is not true".
        self.assertIn('ある', self.stem_filter)
        self.assertNotIn('テスト', self.stem_filter)

    def test_pos_filter(self):
        """The POS filter contains a particle tag but not a noun tag."""
        self.assertIn('助詞-格助詞-一般', self.pos_filter)
        self.assertNotIn('名詞-一般', self.pos_filter)

    def test_tokenizer(self):
        """A default ``Tokenizer`` yields (surface, POS tag) pairs."""
        tokenizer = Tokenizer()
        expected = [
            ('テスト', '名詞-サ変接続'),
            ('データ', '名詞-一般'),
        ]
        self.assertEqual(expected, tokenizer.tokenize_smartly('テストデータ'))

    def test_tokenizer_with_stem_filter(self):
        """With a stem filter, the leading 'その他' is not in the result."""
        tokenizer = Tokenizer(stem_filter=self.stem_filter)
        # 'その他' is part of the input but absent from the expected tokens;
        # presumably the stem filter removes it -- TODO confirm.
        expected = [
            ('テスト', '名詞-サ変接続'),
            ('データ', '名詞-一般'),
        ]
        self.assertEqual(
            expected, tokenizer.tokenize_smartly('その他テストデータ'))

    def test_tokenizer_with_pos_filter(self):
        """With a POS filter, the particle 'の' is not in the result."""
        tokenizer = Tokenizer(pos_filter=self.pos_filter)
        # 'の' is part of the input but absent from the expected tokens;
        # presumably the POS filter removes it -- TODO confirm.
        expected = [
            ('テスト', '名詞-サ変接続'),
            ('データ', '名詞-一般'),
        ]
        self.assertEqual(
            expected, tokenizer.tokenize_smartly('テストのデータ'))
# Run the test suite when this file is executed directly as a script;
# importing the module does not start the runner.
if __name__ == '__main__':
    unittest.main()