example_adult.py

import numpy as np
from scipy.stats import mode, itemfreq
from scipy import delete
import matplotlib.pylab as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC as SVM

from missing_data_imputation import Imputer


# declare csv headers
x = np.genfromtxt('data/adult-train-raw', delimiter=', ', dtype=object)

# remove redundant education-number feature
x = delete(x, (4, 14), 1)

# enumerate parameters and instantiate Imputer
imp = Imputer()
missing_data_cond = lambda x: x == '?'
cat_cols = (1, 3, 4, 5, 6, 7, 8, 12)
n_neighbors = 5

# # drop observations with missing variables
# print 'imputing with drop'
# data_drop = imp.drop(x, missing_data_cond)

# replace missing values with random existing values
print 'imputing with random replacement'
data_replace = imp.replace(x, missing_data_cond)

# replace missing values with feature summary
print 'imputing with feature summarization (mode)'
summ_func = lambda x: mode(x)[0]
data_mode = imp.summarize(x, summ_func, missing_data_cond)

# replace categorical features with one hot row
print 'imputing with one-hot'
data_onehot = imp.binarize_data(x, cat_cols)

# replace missing data with predictions using random forest
print 'imputing with predicted values from random forest'
clf = RandomForestClassifier(n_estimators=100, criterion='gini')
data_rf = imp.predict(x, cat_cols, missing_data_cond, clf)

# replace missing data with predictions using SVM
print 'imputing with predicted values usng SVM'
clf = SVM(
    penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', 
    fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, 
    random_state=None, max_iter=1000)
data_svm = imp.predict(x, cat_cols, missing_data_cond, clf)

# replace missing data with predictions using logistic regression
print 'imputing with predicted values usng logistic regression'
clf = LogisticRegression(
            penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True,
            intercept_scaling=1)
data_logistic = imp.predict(x, cat_cols, missing_data_cond, clf)

# replace missing data with values obtained after factor analysis
print 'imputing with factor analysis'
data_facanal = imp.factor_analysis(x, cat_cols, missing_data_cond)

# replace missing data with knn
print 'imputing with K-Nearest Neighbors'
data_knn = imp.knn(x, n_neighbors, np.mean, missing_data_cond, cat_cols)

def compute_histogram(data, labels):
    histogram = itemfreq(sorted(data))
    for label in labels:
        if label not in histogram[:,0]:
            histogram = np.vstack((histogram,
                                   np.array([[label, 0]], dtype=object)))
    histogram = histogram[histogram[:,0].argsort()]
    return histogram

# compute histograms
labels = np.unique(x[:,1])
freq_data = {}
freq_data['Raw data'] = compute_histogram(x[:,1], labels)
# freq_data['Drop missing'] = compute_histogram(data_drop[:,1], labels)
freq_data['Random replace'] = compute_histogram(data_replace[:,1], labels)
freq_data['Summary'] = compute_histogram(data_mode[:,1], labels)
freq_data['Random forests'] = compute_histogram(data_rf[:,1], labels)
freq_data['SVM'] = compute_histogram(data_svm[:,1], labels)
freq_data['Logistic regression'] = compute_histogram(data_logistic[:,1], labels)
freq_data['PCA'] = compute_histogram(data_facanal[:,1], labels)
freq_data['KNN'] = compute_histogram(data_knn[:,1], labels)

# plot histograms given feature with missing data
n_methods = len(freq_data.keys())
bins = np.arange(len(labels))
width = .25
fig, ax = plt.subplots(figsize=(12,8))

for i in xrange(n_methods):
    key = sorted(freq_data.keys())[i]
    offset = i*2*width/float(n_methods)
    ax.bar(bins+offset, freq_data[key][:,1].astype(int), width, label=key,
           color=plt.cm.hot(i/float(n_methods)), align='center')

ax.set_xlabel('Work class categories', size=15)
ax.set_ylabel('Count', size=15)
ax.set_title('Adult training set (N= 32,561)', size=15, fontweight='bold')
ax.set_xticks(bins + width)
ax.set_xticklabels(labels, rotation=45)
plt.legend(loc=2)
plt.tight_layout()
plt.show()