-
Notifications
You must be signed in to change notification settings - Fork 0
/
dic_to_db_noun.py
42 lines (37 loc) · 1.42 KB
/
dic_to_db_noun.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# -*- coding: utf-8 -*-
# Build Word objects from Noun.dic and save them in the database.
# In Noun.dic, each word entry looks like the following:
# (品詞 (名詞 一般)) ((見出し語 (鉄腕 3999)) (読み テツワン) (発音 テツワン) )
# For each entry, store the hinshi (part of speech), midashigo (headword), yomi (katakana reading), and the hiragana form of the reading
import codecs
import re
import os
from jcconv import *
os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'
from games.models import *
# Patterns for the three fields read from a Noun.dic entry. Compiled once
# instead of on every line. Backslashes are written doubled so the literals
# are valid unicode strings on both Python 2 and Python 3 (the ur"" prefix
# does not exist on Python 3); the resulting patterns are unchanged.
HINSHI_RE = re.compile(u"品詞 \\((.+?)\\)\\)")
MIDASHIGO_RE = re.compile(u"見出し語 \\((\\S+)")
YOMI_RE = re.compile(u"読み (\\S+)\\)")


def _parse_entry(line):
    """Parse one decoded line of Noun.dic into rows ready to be saved.

    Returns a list of (hinshi, midashigo, katakana, hiragana) tuples, one per
    reading, or None when the line does not contain all three fields.
    hinshi, midashigo and katakana are UTF-8 encoded; hiragana is whatever
    kata2hira returns for the encoded katakana.
    """
    hinshi_match = HINSHI_RE.search(line)
    midashigo_match = MIDASHIGO_RE.search(line)
    yomi_match = YOMI_RE.search(line)
    if not (hinshi_match and midashigo_match and yomi_match):
        return None

    hinshi = hinshi_match.group(1).encode('utf_8')
    midashigo = midashigo_match.group(1).encode('utf_8')
    yomi = yomi_match.group(1)

    if yomi.startswith(u'{'):
        # The case that there are several ways to read the word, written as
        # {reading1/reading2}. Strip the braces before storing, so the saved
        # katakana matches the value the hiragana is derived from.
        readings = [reading.strip(u'{}') for reading in yomi.split(u'/')]
    else:
        readings = [yomi]

    rows = []
    for reading in readings:
        katakana = reading.encode('utf_8')
        rows.append((hinshi, midashigo, katakana, kata2hira(katakana)))
    return rows


# Parse the whole dictionary before touching the database, so that a bad line
# cannot leave the Word table wiped and only partially refilled.
rows = []
with codecs.open('dic/Noun.dic', 'r', 'utf_8') as dic_file:
    for line_no, line in enumerate(dic_file, 1):
        if not line.strip():
            # Blank lines carry no entry.
            continue
        entry_rows = _parse_entry(line)
        if entry_rows is None:
            raise ValueError(
                'dic/Noun.dic line %d: hinshi, midashigo or yomi not found'
                % line_no)
        rows.extend(entry_rows)

# delete all existing Word objects
Word.objects.all().delete()

# make the Word objects from the parsed entries
for hinshi, midashigo, katakana, hiragana in rows:
    word = Word(hinshi=hinshi, midashigo=midashigo,
                katakana=katakana, hiragana=hiragana)
    word.save()