Source code for pipeline.modeling.models

import numpy as np
import pandas as pd
from .. import setup_environment
from sklearn import preprocessing, cross_validation, svm, metrics, tree, decomposition, svm 
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier 
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier, OrthogonalMatchingPursuit, RandomizedLogisticRegression 
from sklearn.neighbors.nearest_centroid import NearestCentroid 
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier 

# Project configuration loaded once at import time from the default profile.
# Model.get_data reads config_db['id_column'] to pick the identifier column.
config_db = setup_environment.get_config_file('pipeline/default_profile.yaml')


class ConfigError(Exception):
    """Raised when the pipeline configuration asks for something unsupported.

    Model.define_model raises this with a message when the requested model
    name is not one of the supported classifiers.  Deriving from Exception
    lets it carry that message and be raised/caught like any other error.
    """
class Model(object):
    """Fit one scikit-learn classifier on training data and score test data.

    Attributes:
        model_name: Key naming the classifier (see ``define_model``).
        model_params: Dict of parameters passed to ``clf.set_params``.
        label: Name of the column holding the class label.
        training_data: DataFrame used for fitting.
        testing_data: DataFrame used for prediction.
        cols_to_use: List of feature column names.
        config: Configuration object echoed back in the result dictionary.
    """

    # Number of negative rows kept per positive row when undersampling.
    _NEGATIVES_PER_POSITIVE = 9

    # Which fitted attribute carries feature weights for each supported model
    # (None means the model exposes neither).
    _IMPORTANCE_ATTRIBUTE = {
        'RandomForestClassifier': 'feature_importances',
        'ExtraTreesClassifier': 'feature_importances',
        'AdaBoostClassifier': 'feature_importances',
        'LogisticRegression': 'coef',
        'svm.SVC': 'coef',
        'GradientBoostingClassifier': 'feature_importances',
        'GaussianNB': None,
        'DecisionTreeClassifier': 'feature_importances',
        'SGDClassifier': 'coef',
        'KNeighborsClassifier': None,
        'linear.SVC': 'coef',
    }

    def __init__(self, model_name, model_params, label, training_data,
                 testing_data, cols_to_use, config):
        """Store the model specification and the data frames unchanged."""
        self.model_name = model_name
        self.model_params = model_params
        self.label = label
        self.training_data = training_data
        self.testing_data = testing_data
        self.cols_to_use = cols_to_use
        self.config = config

    def get_data(self, df, undersample=False):
        """Split ``df`` into features, labels and ids.

        Any column in ``cols_to_use`` that is missing from ``df`` is reported
        on stdout and added to ``df`` filled with 0 (this modifies the frame
        that was passed in).  Rows are then rebuilt as label==0 rows followed
        by label==1 rows, so rows with any other label value are dropped.

        NOTE(review): the original indentation of this method was lost; the
        recombination of negatives and positives is treated here as running
        whether or not ``undersample`` is set -- confirm against the
        upstream source.

        Args:
            df: DataFrame containing the label column, the id column named by
                ``config_db['id_column']`` and (ideally) the feature columns.
            undersample: When True, negatives are randomly sampled down to
                9x the number of positives, if at least that many exist.

        Returns:
            Tuple ``(x, y, ids)``: feature frame with NaN replaced by 0, the
            label column, and the id column.
        """
        # Warn about expected columns not found in the data and zero-fill them.
        for col in self.cols_to_use:
            if col not in df.columns:
                print("column not in ")
                print(col)
                df[col] = 0

        # Partition by class label; `== 0` already excludes null labels.
        positive_label_df = df[df[self.label] == 1]
        negative_label_df = df[df[self.label] == 0]

        if undersample:
            wanted = len(positive_label_df) * self._NEGATIVES_PER_POSITIVE
            # Only sample when enough negatives exist; otherwise keep them all.
            if wanted <= len(negative_label_df):
                negative_label_df = negative_label_df.sample(wanted)

        # pd.concat replaces DataFrame.append, which newer pandas removed.
        df = pd.concat([negative_label_df, positive_label_df])

        # CAUTION -- for testing: missing feature values are treated as 0.
        _x = df[self.cols_to_use].fillna(0)
        _y = df[self.label]
        _ids = df[config_db['id_column']]
        return _x, _y, _ids

    def get_training_data(self):
        """Return ``(x, y, ids)`` for the training frame, undersampled."""
        return self.get_data(self.training_data, undersample=True)

    def get_test_data(self):
        """Return ``(x, y, ids)`` for the testing frame, not undersampled."""
        return self.get_data(self.testing_data)

    def run(self):
        """Fit the configured classifier and predict on the test data.

        Returns:
            Tuple ``(result_dictionary, clf)`` where ``clf`` is the fitted
            estimator and ``result_dictionary`` holds the predictions, the
            scores, the true labels, ids, and the model specification.
        """
        training_x, training_y, training_ids = self.get_training_data()
        test_x, test_y, test_ids = self.get_test_data()

        clf = self.define_model(self.model_name, self.model_params)
        clf.fit(training_x, training_y)
        res_predict = clf.predict(test_x)

        # These models are scored with decision_function rather than
        # predict_proba; every other model reports the second column of
        # predict_proba.
        margin_sgd = (self.model_name == "SGDClassifier"
                      and (clf.loss == "hinge" or clf.loss == "perceptron"))
        if margin_sgd or self.model_name == "linear.SVC":
            res = list(clf.decision_function(test_x))
        else:
            res = list(clf.predict_proba(test_x)[:, 1])

        result_dictionary = {
            'training_ids': training_ids,
            'predictions_test_y': list(res_predict),
            'prob_prediction_test_y': res,
            'test_y': list(test_y),
            'test_ids': list(test_ids),
            'model_name': self.model_name,
            'model_params': self.model_params,
            'label': self.label,
            'feature_columns_used': self.cols_to_use,
            'config': self.config,
            'feature_importance': self.get_feature_importance(
                clf, self.model_name),
            'columned_used_for_feat_importance': list(
                training_x.columns.values),
        }
        return result_dictionary, clf

    def define_model(self, model, parameters, n_cores=0):
        """Build the classifier named ``model`` and apply ``parameters``.

        Args:
            model: One of the supported model-name keys listed below.
            parameters: Dict forwarded to ``clf.set_params``; overrides the
                defaults set here.
            n_cores: Currently unused; kept for interface compatibility.

        Returns:
            An unfitted scikit-learn estimator.

        Raises:
            ConfigError: If ``model`` is not a supported name.
        """
        # Factories rather than instances, so only the requested estimator is
        # actually constructed.
        factories = {
            'RandomForestClassifier': lambda: RandomForestClassifier(
                n_estimators=50, n_jobs=7),
            'ExtraTreesClassifier': lambda: ExtraTreesClassifier(
                n_estimators=10, n_jobs=7, criterion='entropy'),
            'AdaBoostClassifier': lambda: AdaBoostClassifier(
                DecisionTreeClassifier(max_depth=1),
                algorithm="SAMME", n_estimators=200),
            'LogisticRegression': lambda: LogisticRegression(
                penalty='l1', C=1e5),
            'svm.SVC': lambda: svm.SVC(
                kernel='linear', probability=True, random_state=0),
            'GradientBoostingClassifier': lambda: GradientBoostingClassifier(
                learning_rate=0.05, subsample=0.5, max_depth=6,
                n_estimators=10),
            'GaussianNB': lambda: GaussianNB(),
            'DecisionTreeClassifier': lambda: DecisionTreeClassifier(),
            'SGDClassifier': lambda: SGDClassifier(
                loss="hinge", penalty="l2", n_jobs=7),
            'KNeighborsClassifier': lambda: KNeighborsClassifier(
                n_neighbors=3),
            'linear.SVC': lambda: svm.LinearSVC(),
        }

        if model not in factories:
            raise ConfigError("Unsupported model {}".format(model))

        clf = factories[model]()
        clf.set_params(**parameters)
        return clf

    def get_feature_importance(self, clf, model_name):
        """Return the fitted model's feature weights as plain lists.

        Args:
            clf: Fitted estimator.
            model_name: Model-name key used to decide which attribute to read.

        Returns:
            ``list(clf.feature_importances_)`` or ``clf.coef_.tolist()``
            depending on the model, or None when the model name is unknown,
            has no such attribute registered, or the estimator does not
            expose it.
        """
        kind = self._IMPORTANCE_ATTRIBUTE.get(model_name)

        if kind == 'feature_importances':
            importances = getattr(clf, 'feature_importances_', None)
            return None if importances is None else list(importances)

        if kind == 'coef':
            # getattr guards against estimators whose coef_ is unavailable
            # for the chosen parameters instead of letting run() crash.
            coef = getattr(clf, 'coef_', None)
            return None if coef is None else coef.tolist()

        return None

    def compute_confusion_matrix(self, predicted_labels, labels):
        """Count confusion-matrix cells for binary 0/1 labels.

        Predictions and labels are paired by position, so ``labels`` may be a
        list or a pandas Series with any index.  Any pair that is not a true
        positive, false positive or false negative is counted as a true
        negative.

        Returns:
            Tuple ``(false_positive, false_negative, true_positive,
            true_negative)``.
        """
        false_positive = 0
        false_negative = 0
        true_positive = 0
        true_negative = 0

        for p_label, label in zip(predicted_labels, labels):
            if p_label == 1 and label == 1:
                true_positive += 1
            elif p_label == 1 and label == 0:
                false_positive += 1
            elif p_label == 0 and label == 1:
                false_negative += 1
            else:
                true_negative += 1

        return false_positive, false_negative, true_positive, true_negative