Source code for pipeline.preprocessing.feature_processor

from .features import class_map
from .. import setup_environment
import pandas as pd
import datetime

# Loaded once at import time from 'pipeline/default_profile.yaml'. The
# module-level helper functions below read its 'id_column' entry to tell the
# identifier column apart from the feature columns.
config_db = setup_environment.get_config_file('pipeline/default_profile.yaml')


class FeatureGrabber():
    """Load a single named feature from the database and post-process it.

    Attributes:
        end_date: Passed to ``class_map.lookup`` as ``fake_today``.
        engine: Stored on the instance; not used by the methods of this class.
        config_db: Passed to ``class_map.lookup`` as ``db_tables``.
        con: Connection handed to ``pandas.read_sql`` when running queries.
    """

    def __init__(self, end_date, engine, config_db, con):
        self.end_date = end_date
        self.engine = engine
        self.config_db = config_db
        self.con = con

    def __read_feature_from_db(self, query, drop_duplicates=True):
        """Run ``query`` against ``self.con`` and return the rows as a DataFrame.

        Args:
            query: SQL text to execute.
            drop_duplicates: Accepted for interface compatibility; it is not
                applied to the result.

        Returns:
            The DataFrame produced by ``pandas.read_sql``.
        """
        # The call form is valid on both Python 2 and Python 3, unlike the
        # bare ``print query`` statement.
        print(query)
        return pd.read_sql(query, con=self.con)

    def getFeature(self, feature_to_load):
        """Look up a feature by name, query it, and clean the result.

        Args:
            feature_to_load: Name handed to ``class_map.lookup``.

        Returns:
            A ``(results, featurenames)`` tuple as produced by the cleaning
            function selected by ``feature.type_of_features``.

        Raises:
            TypeError: If the looked-up feature's ``query`` is not a string.
        """
        kwargs = {'fake_today': self.end_date,
                  'db_tables': self.config_db}
        feature = class_map.lookup(feature_to_load, **kwargs)

        # Without a string query there is nothing to execute; fail with a
        # clear message instead of tripping over an unbound ``results`` below.
        if not isinstance(feature.query, str):
            raise TypeError(
                "feature %r has a non-string query of type %s"
                % (feature_to_load, type(feature.query).__name__)
            )

        results = self.__read_feature_from_db(feature.query,
                                              drop_duplicates=True)

        if feature.type_of_features == "categorical":
            results, featurenames = convert_categorical(results)
        elif feature.type_of_features == "numerical":
            results, featurenames = numerical_column_clean(results)
        elif feature.type_of_features == "imputation zero":
            results, featurenames = imputation_zero(results)
        else:
            results, featurenames = feature_name_grabber(results)

        return results, featurenames
def feature_name_grabber(df, id_column=None):
    """Fill missing values with 0 and list the non-identifier columns.

    Args:
        df: DataFrame holding the identifier column plus feature columns.
        id_column: Name of the identifier column. Defaults to
            ``config_db['id_column']`` when omitted.

    Returns:
        A ``(df, columns)`` tuple: the zero-filled frame and the list of its
        column names excluding ``id_column``.
    """
    if id_column is None:
        id_column = config_db['id_column']

    # fillna returns a new frame; the result must be kept or the fill is lost.
    df = df.fillna(0)
    columns = [name for name in df.columns.values.tolist()
               if name != id_column]
    return df, columns
def convert_categorical(df, id_column=None):
    """One-hot encode the second column of ``df``, aggregated per identifier.

    Values are lowercased, stripped of surrounding whitespace, and have inner
    spaces replaced by underscores. Each distinct non-blank, non-null value
    becomes a float indicator column named ``<column>_is_<value>``. Rows that
    share an identifier are collapsed with ``max``, so an indicator is 1.0 if
    any of that identifier's rows carried the value.

    Args:
        df: DataFrame whose second column (position 1) holds the string
            categories and which also contains ``id_column``.
        id_column: Name of the identifier column. Defaults to
            ``config_db['id_column']`` when omitted.

    Returns:
        A ``(encoded, featnames)`` tuple: a frame with ``id_column`` plus one
        indicator column per category, and the list of indicator names in
        sorted order.
    """
    if id_column is None:
        id_column = config_db['id_column']

    source_name = df.columns.values.tolist()[1]

    # Normalise the column the same way the labels are normalised, so the
    # equality test below can actually match values such as "New York".
    normalized = df[source_name].str.lower().str.strip().str.replace(' ', '_')

    # Nulls and blank strings are not categories. Sorting keeps the feature
    # order stable from run to run.
    present = normalized[pd.notnull(normalized)]
    labels = sorted({str(value) for value in pd.unique(present)
                     if value != ''})

    # Build on a fresh frame so the caller's DataFrame is left untouched.
    encoded = df[[id_column]].copy()
    featnames = []
    for label in labels:
        feature_name = source_name + '_is_' + label
        encoded[feature_name] = (normalized == label).astype(float)
        featnames.append(feature_name)

    # NOTE(review): the previous implementation also built a
    # "<column>_is_null" indicator, but the groupby selection discarded it.
    # It is left out here to keep the returned schema unchanged - confirm
    # whether it was meant to be a feature.

    if not featnames:
        # Nothing to aggregate: return one row per identifier.
        return encoded.drop_duplicates().reset_index(drop=True), featnames

    encoded = encoded.groupby(
        id_column, sort=False, as_index=False)[featnames].max()
    return encoded, featnames
def numerical_column_clean(df, id_column=None):
    """Fill missing numerical values with 0 and list the feature columns.

    Args:
        df: DataFrame holding the identifier column plus numerical columns.
        id_column: Name of the identifier column. Defaults to
            ``config_db['id_column']`` when omitted.

    Returns:
        A ``(df, columns)`` tuple: the zero-filled frame and the list of its
        column names excluding ``id_column``.
    """
    if id_column is None:
        id_column = config_db['id_column']

    # fillna is not in-place by default; keep the returned frame.
    df = df.fillna(0)
    columns = [name for name in df.columns.values.tolist()
               if name != id_column]
    return df, columns
def imputation_zero(df, id_column=None):
    """Zero-impute ``df`` and add a null indicator for its second column.

    A float column named ``<second column>_is_null`` is added (1.0 where the
    second column was null, else 0.0), then every remaining null in the frame
    is replaced with 0.

    Args:
        df: DataFrame whose second column (position 1) is the feature to
            impute. It is not modified.
        id_column: Name of the identifier column. Defaults to
            ``config_db['id_column']`` when omitted.

    Returns:
        A ``(result, columns)`` tuple: the imputed frame and the list of its
        column names excluding ``id_column``.
    """
    if id_column is None:
        id_column = config_db['id_column']

    value_name = df.columns.values.tolist()[1]
    indicator_name = value_name + '_is_null'

    # Work on a copy so the indicator column does not leak into the caller's
    # frame.
    result = df.copy()
    # Record nulls before they are overwritten by the fill below.
    result[indicator_name] = pd.isnull(result[value_name]).astype(float)
    result = result.fillna(0)

    columns = [name for name in result.columns.values.tolist()
               if name != id_column]
    return result, columns