Source code for mightypy.stats._feature_importance

""" 
Feature Importance Methods
-----------------------------

"""

from typing import Callable, Tuple, Optional
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt


class WOE_IV:
    """Weight of Evidence (WoE) and Information Value (IV) of one feature.

    The feature is split into buckets (quantile buckets for continuous
    features, the raw values for discrete ones), the event / non-event
    observations are aggregated per bucket, and for every bucket

    * ``woe = ln(%non_event / %event)``
    * ``iv  = (%non_event - %event) * woe``

    are computed. The information value of the feature is the sum of the
    per-bucket ``iv`` values. When either count of a bucket is zero, 0.5 is
    added to both counts of that bucket before the percentages are taken
    (the percentages are still divided by the unadjusted column totals).

    References:
        https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html

    Args:
        event (str): event name. Generally label true/1.
        non_event (str): non event name. Generally label false/0.
        target_col (str): Target column name.
        bucket_col (str): bucketing column name.
        value_col (str, optional): Value column name to aggregate (count).
            When None a helper column named ``"values"`` is created.
            Defaults to None.
        agg_func (Callable, optional): Aggregation function.
            Defaults to np.count_nonzero.
        bucket_col_type (str, optional): ``'continuous'`` to create quantile
            buckets, ``'discrete'`` to use the column values as buckets.
            Defaults to 'continuous'.
        n_buckets (int, optional): Number of quantile buckets created for a
            continuous bucket column. Defaults to 10.

    Examples:
        >>> from sklearn.datasets import load_breast_cancer
        >>> from mightypy.stats import WOE_IV
        >>> dataset = load_breast_cancer(as_frame=True)
        >>> df = dataset.frame[['mean radius', 'target']].copy()
        >>> target_map = {0: 'False', 1: 'True'}
        >>> df['label'] = df['target'].map(target_map)
        >>> obj = WOE_IV(event='True', non_event='False', target_col='label',
        ...              bucket_col='mean radius')
        >>> cal_df, iv = obj.values(df)
        >>> fig = obj.plot()
        >>> fig.tight_layout()
        >>> fig.show()

        or directly

        >>> fig = obj.plot(df)
        >>> fig.show()
    """

    def __init__(
        self,
        event: str,
        non_event: str,
        target_col: str,
        bucket_col: str,
        value_col: Optional[str] = None,
        agg_func: Callable = np.count_nonzero,
        bucket_col_type: str = "continuous",
        n_buckets: int = 10,
    ):
        self._event = event
        self._non_event = non_event
        self._target_col = target_col
        self._bucket_col = bucket_col
        self._bucket_col_name = f"buckets_{bucket_col}"
        self._value_col = value_col
        self._agg_func = agg_func
        self._bucket_col_type = bucket_col_type
        self._n_buckets = n_buckets
        self._perc_event_col_name: str = f"%_event_{event}"
        self._perc_non_event_col_name: str = f"%_non_event_{non_event}"
        # Populated by _calculate(): working frame, result table and IV.
        self._df: Optional[pd.DataFrame] = None
        self._cal_df: Optional[pd.DataFrame] = None
        self._iv: Optional[float] = None

    def _calculate(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, float]:
        """Calculate weight of evidence and information value for ``df``.

        Args:
            df (pd.DataFrame): input dataframe holding the target column, the
                bucket column and (if configured) the value column.

        Raises:
            NotImplementedError: If bucket column type is not 'continuous'
                or 'discrete'.
            KeyError: If the event or non-event label never occurs in the
                target column.

        Returns:
            Tuple[pd.DataFrame, float]: calculated dataframe and information
            value.
        """
        if self._bucket_col_type not in ("continuous", "discrete"):
            raise NotImplementedError(
                f"bucket_col_type must be 'continuous' or 'discrete', "
                f"got {self._bucket_col_type!r}"
            )

        # Keep the configured value column untouched so the instance can be
        # re-used with another dataframe; the helper column name stays local.
        source_cols = [self._target_col, self._bucket_col]
        if self._value_col is None:
            value_col = "values"
            work = df[source_cols].copy()
            work.insert(loc=0, column=value_col, value="x")
        else:
            value_col = self._value_col
            work = df[source_cols + [value_col]].copy()
        self._df = work

        if self._bucket_col_type == "continuous":
            buckets = pd.qcut(
                work[self._bucket_col].values,
                q=np.linspace(0, 1, self._n_buckets + 1),
                duplicates="raise",
                retbins=False,
            )
        else:
            buckets = work[self._bucket_col]
        work.insert(loc=0, column=self._bucket_col_name, value=buckets)

        table = pd.pivot_table(
            work,
            index=[self._bucket_col_name],
            columns=[self._target_col],
            values=value_col,
            aggfunc=self._agg_func,
        ).fillna(0)

        missing = [
            label
            for label in (self._event, self._non_event)
            if label not in table.columns
        ]
        if missing:
            raise KeyError(
                f"label(s) {missing} not found in column '{self._target_col}'"
            )

        event_counts = table[self._event].astype(float)
        non_event_counts = table[self._non_event].astype(float)

        # A zero count would make the log ratio undefined: shift both counts
        # of such a bucket by 0.5.
        shift = np.where((event_counts == 0) | (non_event_counts == 0), 0.5, 0.0)
        table["adj_event"] = event_counts + shift
        table["adj_non_event"] = non_event_counts + shift

        perc_event = table["adj_event"] / event_counts.sum()
        perc_non_event = table["adj_non_event"] / non_event_counts.sum()
        table[self._perc_event_col_name] = perc_event
        table[self._perc_non_event_col_name] = perc_non_event

        table["woe"] = np.log(perc_non_event / perc_event)
        table["iv"] = (perc_non_event - perc_event) * table["woe"]

        self._cal_df = table
        self._iv = float(table["iv"].sum())
        return self._cal_df, self._iv

    def _ensure_calculated(self, df: Optional[pd.DataFrame]) -> None:
        """Make sure results exist, recomputing whenever ``df`` is given.

        Args:
            df (Optional[pd.DataFrame]): New input dataframe, or None to
                re-use previously computed results.

        Raises:
            ValueError: If no dataframe was given now or earlier.
        """
        if df is not None:
            # An explicitly passed dataframe always wins over cached results.
            self._calculate(df)
            return
        if self._iv is not None and self._cal_df is not None:
            return
        if self._df is None:
            raise ValueError("dataframe doesn't exist. Please insert dataframe.")
        self._calculate(self._df)

    def values(self, df: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, float]:
        """Return weight of evidence table and information value.

        Args:
            df (Optional[pd.DataFrame], optional): Input dataframe. When
                given, results are (re)computed from it; when None the
                previously computed results are returned. Defaults to None.

        Raises:
            ValueError: If input dataframe does not exist either in the model
                or in method input args.

        Returns:
            Tuple[pd.DataFrame, float]: calculated dataframe and information
            value.
        """
        self._ensure_calculated(df)
        return self._cal_df, self._iv  # type: ignore

    def plot(self, df: Optional[pd.DataFrame] = None, figsize=(10, 5)) -> "plt.Figure":
        """Plot weight of evidence and event / non-event counts per bucket.

        Args:
            df (Optional[pd.DataFrame], optional): Input dataframe. When
                given, results are (re)computed from it. Defaults to None.
            figsize (tuple, optional): Figure size. Defaults to (10, 5).

        Raises:
            ValueError: If dataframe doesn't exist either in the model or in
                method args.

        Returns:
            plt.Figure: matplotlib figure.
        """
        self._ensure_calculated(df)
        cal_df = self._cal_df

        idxs = cal_df.index.astype(str)  # type: ignore
        # One y position per bucket actually present in the table; this may
        # differ from n_buckets (e.g. for discrete bucket columns).
        ranges = np.arange(len(cal_df))  # type: ignore

        fig, _ax = plt.subplots(1, 2, figsize=figsize)

        woe_ax, count_ax = _ax[0], _ax[1]
        woe_ax.set_xlim(
            left=cal_df["woe"].min() - 2,  # type: ignore
            right=cal_df["woe"].max() + 2,  # type: ignore
        )
        woe_ax.barh(y=idxs, width=cal_df["woe"], color="blue", alpha=0.6)  # type: ignore
        for container in woe_ax.containers:
            woe_ax.bar_label(container, fmt="%.3f", padding=5)
        woe_ax.grid(alpha=0.2)
        woe_ax.set_xlabel(None)
        woe_ax.set_ylabel(None)
        woe_ax.set_title("Weight Of Evidence")

        count_ax.barh(
            y=ranges - 0.2,
            width=cal_df[self._event],  # type: ignore
            color="red",
            alpha=0.6,
            label=self._event,
            height=0.4,
        )
        count_ax.barh(
            y=ranges + 0.2,
            width=cal_df[self._non_event],  # type: ignore
            color="green",
            alpha=0.6,
            label=self._non_event,
            height=0.4,
        )
        count_ax.set_yticks(ranges)
        count_ax.set_yticklabels(idxs)
        for container in count_ax.containers:
            count_ax.bar_label(container, fmt="%.0f", padding=5)
        count_ax.grid(alpha=0.2)
        count_ax.set_ylabel(None)
        count_ax.set_title("Deciles")
        count_ax.legend(bbox_to_anchor=(1.5, 1), loc="upper right")

        title = "\n".join(
            [
                "",
                f"{self._bucket_col}",
                "=======================================",
                f"Information Value : {self._iv:.3f}",
                "---------------------------------------",
                "",
            ]
        )
        fig.suptitle(title, fontsize=12)
        return fig
if __name__ == "__main__":
    # Manual demo of WOE_IV, left commented out: executing this module
    # directly currently does nothing.
    # from sklearn.datasets import load_breast_cancer
    # plt.style.use('seaborn')
    # dataset = load_breast_cancer(as_frame=True)
    # df = dataset.frame[['mean radius', 'target']]
    # target_map = {0: 'False', 1: 'True'}
    # df['label'] = df['target'].map(target_map)
    # model = WOE_IV(event='True', non_event='False',
    #                target_col='label', bucket_col='mean radius')
    # fig = model.plot(df)
    # fig.tight_layout()
    # plt.show()
    # print(model.values())
    pass