# features.py
import pandas as pd
import nltk
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import wordnet as wn
import pickle
from scipy.sparse import hstack
from scipy import sparse
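# The tokenizers, POS tagger, and WordNet lookups below rely on NLTK data
# packages; if they are missing, download them once (standard NLTK 3.x names):
# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger'); nltk.download('wordnet')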
def loadTrain(filename):
    # Expects a tab-separated file with the columns below (apparently the
    # Kaggle ASAP 'training_set_rel3.tsv' layout).
    traindf = pd.read_csv(filename, sep='\t', index_col=False)
    traindf1 = traindf[['essay_id', 'essay_set', 'essay', 'rater1_domain1', 'rater2_domain1', 'domain1_score']]
    X = traindf1['essay']
    Y = traindf1['rater1_domain1']
    return X, Y
def filterData(X, value, strategy):
    # Select a subset of the data; 'order' keeps the first `value` rows.
    # (Renamed from `filter` to avoid shadowing the Python builtin.)
    if strategy == 'order':
        return X[:value]
def generateFeatures(X):
    """ Features related to basic counters of words and sentences: """
    essay_word = [nltk.word_tokenize(raw) for raw in X]
    ## Word count
    essay_word_count = [len(words) for words in essay_word]
    avg_essay_word_count = np.mean(essay_word_count)
    ## Sentence count
    sentence_per_essay = [nltk.sent_tokenize(raw) for raw in X]
    sentence_count_per_essay = [len(sentences) for sentences in sentence_per_essay]
    avg_sentence_count = np.mean(sentence_count_per_essay)
    ## Word tokens that have more than 6 characters divided by the number of word tokens
    count_word_tokens_greater_6 = [sum(1 for w in words if len(w) > 6) / float(len(words)) for words in essay_word]
    ## Word tokens that have fewer than 4 characters divided by the number of word tokens
    count_word_tokens_less_4 = [sum(1 for w in words if len(w) < 4) / float(len(words)) for words in essay_word]
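    # Toy check of the two ratio features above: for
    # words = ['I', 'like', 'extraordinary', 'cats'] the long-word ratio is
    # 1/4 ('extraordinary' is the only token longer than 6 characters) and the
    # short-word ratio is also 1/4 ('I' is the only token shorter than 4).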
    ## Number of distinct WordNet lemmas reachable from the essay's tokens
    ## (a rough measure of vocabulary richness)
    count_lemma_set_for_essay = []
    for words in essay_word:
        lemma_set = set()
        for word in words:
            for synset in wn.synsets(word):
                for lemma in synset.lemmas():
                    lemma_set.add(lemma)
        count_lemma_set_for_essay.append(len(lemma_set))
""" Number of word tokens divided by number of sentences """
word_token_by_count_sentences = [ float(count_word)/count_sentences for count_word,count_sentences in zip(essay_word_count,sentence_count_per_essay) ]
""" Number of non-initial CAPS words divided by number of sentences """
""" Number of characters in the essay divided by number of sentences"""
character_count_by_count_sentences = [ float(len(x))/count_sentences for x,count_sentences in zip(X,sentence_count_per_essay) ]
    # Features related to nonlinear combinations of different attributes:
    fourth_root_essay_word_count = [pow(c, 0.25) for c in essay_word_count]
    # Histogram of Penn Treebank POS tags per essay (reusing the tokenization above)
    countPOSTag = [Counter(tag for word, tag in nltk.pos_tag(words)) for words in essay_word]
    # Tag-to-index map for the 40-dimensional POS count vector
    pendict = {'CC': 1, 'CD': 2, 'DT': 3, 'EX': 4, 'FW': 5, 'IN': 6, 'JJ': 7,
               'JJR': 8, 'JJS': 9, 'LS': 10, 'MD': 11, 'NN': 12, 'NNS': 13,
               'NNP': 14, 'NNPS': 15, 'PDT': 16, 'POS': 17, 'PRP': 18,
               'PRP$': 19, 'RB': 20, 'RBR': 21, 'RBS': 22, 'RP': 23, 'SYM': 24,
               'TO': 25, 'UH': 26, 'VB': 27, 'VBD': 28, 'VBG': 29, 'VBN': 30,
               'VBP': 31, 'VBZ': 32, 'WDT': 33, 'WP': 34, 'WP$': 35, 'WRB': 36,
               '.': 37, ':': 38, '-NONE-': 39, ',': 40}
    essay_POS_features = []
    for i in range(len(countPOSTag)):
        vector = [0] * 40
        for key in countPOSTag[i]:
            if key in pendict:
                vector[pendict[key] - 1] = int(countPOSTag[i][key])
        essay_POS_features.append(vector)
    # Transforming into TF-IDF feature vectors
    vectorizer = CountVectorizer(min_df=1, decode_error='ignore')
    transformer = TfidfTransformer()
    X = vectorizer.fit_transform(X)
    X = transformer.fit_transform(X)
    # Combining features into a single sparse feature matrix
    X = hstack((X, essay_POS_features))
    X = hstack((X, sparse.csr_matrix(np.array(fourth_root_essay_word_count)).transpose()))
    X = hstack((X, sparse.csr_matrix(np.array(character_count_by_count_sentences)).transpose()))
    X = hstack((X, sparse.csr_matrix(np.array(word_token_by_count_sentences)).transpose()))
    X = hstack((X, sparse.csr_matrix(np.array(count_lemma_set_for_essay)).transpose()))
    #X = hstack((X, sparse.csr_matrix(np.array(count_word_tokens_less_4)).transpose()))
    #X = hstack((X, sparse.csr_matrix(np.array(count_word_tokens_greater_6)).transpose()))
    X = hstack((X, sparse.csr_matrix(np.array(sentence_count_per_essay)).transpose()))
    X = hstack((X, sparse.csr_matrix(np.array(essay_word_count)).transpose()))
    return X
def saveFeatures(X, values):
    with open('features' + values + '.pickle', 'wb') as handle:
        pickle.dump(X, handle)
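# saveFeatures has no load counterpart in this file; a minimal sketch of
# reloading a pickled feature matrix (the name loadFeatures is an assumption):
def loadFeatures(values):
    with open('features' + values + '.pickle', 'rb') as handle:
        return pickle.load(handle)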
def classifyandscore(name, X, Y):
    X_train, X_test, y_train, y_test = train_test_split(X.toarray(), Y, test_size=0.4, random_state=0)
    if name == 'SVM':
        clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
        print(clf.score(X_test, y_test))
    elif name == 'RandomForest':
        clf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
        print(clf.score(X_test, y_test))
X, Y = loadTrain('data/training_set_rel3.tsv')
X = filterData(X, 10, 'order')
Y = filterData(Y, 10, 'order')
X = generateFeatures(X)
classifyandscore('SVM', X, Y)
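# saveFeatures is defined above but never called; persisting and reloading the
# feature matrix would look like the lines below (the '10' suffix mirrors the
# subset size used above and is an assumption, hence left commented out):
# saveFeatures(X, '10')
# X = loadFeatures('10')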