-
Notifications
You must be signed in to change notification settings - Fork 1
/
nlp.py
54 lines (43 loc) · 1.5 KB
/
nlp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from nltk import wordnet, pos_tag
from nltk import WordNetLemmatizer
import pymorphy2
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import string
import re
# Punctuation characters stripped from token edges in cleaning().
# This is string.punctuation with '-' removed, so hyphenated words survive.
# Deriving it (instead of spelling the literal) also avoids the invalid
# '\]' escape of the original, which raises SyntaxWarning on CPython >= 3.12.
s = string.punctuation.replace('-', '')
def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Tags beginning with 'J', 'V', 'N' or 'R' map to ADJ, VERB, NOUN and
    ADV respectively; any other tag (including an empty one) defaults to
    NOUN.
    """
    wn = wordnet.wordnet
    tag_map = {'J': wn.ADJ, 'V': wn.VERB, 'N': wn.NOUN, 'R': wn.ADV}
    # Treebank tags encode the coarse class in their first character.
    return tag_map.get(treebank_tag[:1], wn.NOUN)
def cleaning(t):
    """Lowercase *t*, strip punctuation off token edges and drop Russian stopwords.

    Tokens are produced by splitting on single spaces; each token has the
    characters in the module-level ``s`` stripped from both ends, then
    tokens that are Russian stopwords are removed. Returns the surviving
    tokens re-joined with single spaces.
    """
    t = t.lower()
    tokens = [tok.strip(s) for tok in t.split(' ')]
    # Hoist the stopword lookup out of the comprehension: the original
    # called stopwords.words('russian') once PER TOKEN (re-reading the
    # corpus each time) and did O(n) list membership; a set built once
    # makes each membership test O(1).
    stop_words = set(stopwords.words('russian'))
    tokens = [tok for tok in tokens if tok not in stop_words]
    return " ".join(tokens)
def stemming(t):
    """Clean *t* (see cleaning) and reduce each token to its Russian Snowball stem."""
    cleaned = cleaning(t)
    russian_stemmer = SnowballStemmer(language='russian')
    return ' '.join(russian_stemmer.stem(token) for token in cleaned.split())
def lemming(t):
    """Clean *t*, POS-tag the tokens and lemmatize each with WordNet.

    Each token's Treebank tag is converted to a WordNet POS via
    get_wordnet_pos before lemmatization, then the lemmas are re-joined
    with single spaces.
    """
    tokens = cleaning(t).split()
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]
    return ' '.join(lemmas)
def my_lemmatizer_ru(sent):
    """Clean *sent* (see cleaning) and map each token to its pymorphy2 normal form.

    The MorphAnalyzer is built once and cached on the function object:
    pymorphy2 loads its dictionaries at construction time, which is far
    too expensive to repeat on every call. The first parse result (the
    most probable analysis) supplies the normal form, as before.
    """
    sent = cleaning(sent)
    morph = getattr(my_lemmatizer_ru, '_morph', None)
    if morph is None:
        morph = my_lemmatizer_ru._morph = pymorphy2.MorphAnalyzer()
    return ' '.join(morph.parse(word)[0].normal_form
                    for word in sent.split())