Build a credit scoring model

Read the data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
%matplotlib inline
data=pd.read_csv('./ch17_cs_training.csv')
data.head()

Observe the data

data.info()

Heuristically drop duplicate rows, then check whether the row count shrinks; if it does, duplicates existed and have now been removed.

data.drop_duplicates(inplace=True) 
data.info()

Reset the index, then look at the proportion of missing values in each column

data.reset_index(inplace=True,drop=True)   
data.isnull().mean() 

The output shows that NumberOfDependents has only a few missing values, so they can simply be filled with the column mean.

data['NumberOfDependents'].fillna(data['NumberOfDependents'].mean(),inplace=True) 

MonthlyIncome, by contrast, is missing in a large share of rows; a random forest can be used to impute those values.

def fill_missing_rf(X,y,to_fill):
    """
    Fill missing values of one feature using a random forest regressor.

    Parameters:
    X: feature matrix containing the feature to fill
    y: the label column, used here as an extra predictor
    to_fill: name of the feature to fill
    """
    # Build a new feature matrix: every other feature plus the label
    df = X.copy()
    fill = df.loc[:,to_fill]
    df = pd.concat([df.loc[:,df.columns != to_fill],pd.DataFrame(y)],axis=1)

    # Rows where the feature is known form the training set;
    # rows where it is missing are the ones to predict
    Ytrain = fill[fill.notnull()]
    Ytest = fill[fill.isnull()]
    Xtrain = df.iloc[Ytrain.index,:]
    Xtest = df.iloc[Ytest.index,:]

    # Fit a random forest regressor and predict the missing values
    regressor = RandomForestRegressor(n_estimators=100)
    regressor = regressor.fit(Xtrain, Ytrain)
    Y_predict = regressor.predict(Xtest)

    return Y_predict

X = data.iloc[:,1:]
y = data.iloc[:,0]
# Pass X, y and the feature to fill into the function defined above
y_pred = fill_missing_rf(X,y,'MonthlyIncome')
# Write the predictions back into the missing slots
data.loc[data.loc[:,'MonthlyIncome'].isnull(),'MonthlyIncome'] = y_pred
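
A quick check confirms that the two imputations left no missing values:

data.isnull().sum()  # every column should now report 0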

With the missing values filled in, the next step is to deal with outliers

data.describe([0.01,0.1,0.25,0.5,0.75,0.9,0.99]).T 

The minimum age is observed to be 0, which is not plausible for a bank customer, so drop those rows directly

data = data[data["age"] != 0] 

Moving on, three indicators look odd:

"NumberOfTime30-59DaysPastDueNotWorse"
"NumberOfTime60-89DaysPastDueNotWorse"
"NumberOfTimes90DaysLate"

These record the number of times in the past two years the borrower was 30-59 days past due (but no worse), 60-89 days past due (but no worse), and 90 or more days past due, respectively. For all three, even the 99th percentile is still 2, yet the maximum is 98, which looks highly abnormal.

data[data.loc[:,"NumberOfTimes90DaysLate"] > 90].count() 

That maximum cannot be right, so inspect those samples. There are 225 of them, and their labels are not all 1; that is, they are not all bad customers, which is clearly inconsistent with being 90+ days late more than 90 times. We can therefore judge these samples to be anomalies of some kind and delete them.
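
A one-line check on the same DataFrame backs this up:

data.loc[data['NumberOfTimes90DaysLate'] > 90, 'SeriousDlqin2yrs'].value_counts()  # both labels appear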

data = data[data.loc[:,'NumberOfTimes90DaysLate']<90]  

Check the result

data.describe([0.01,0.1,0.25,0.5,0.75,0.9,0.99]).T 

Reset the index, and look at the label distribution

data.reset_index(inplace=True,drop=True) 
X = data.iloc[:,1:] 
y = data.iloc[:,0]
# y.value_counts() 
sns.countplot(x="SeriousDlqin2yrs", data=data) 
n_1_sample = y.value_counts()[1]
n_0_sample = y.value_counts()[0] 
print('Total samples: {}; label 1: {:.2%}; label 0: {:.2%}'.format(len(y),n_1_sample/len(y),n_0_sample/len(y)))

The classes are imbalanced, so balance them

#In logistic regression the most common way to rebalance the classes is the upsampling method SMOTE

import imblearn
#imblearn is a library dedicated to imbalanced data sets and offers resampling tools that sklearn itself lacks.
#Its classes are instantiated and fitted just like sklearn estimators.
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42) #instantiate; SMOTE synthesizes new minority samples by interpolating between a minority sample and its nearest minority neighbors
X,y = sm.fit_resample(X,y) #returns the upsampled feature matrix and labels
n_sample_ = X.shape[0]
n_1_sample = pd.Series(y).value_counts()[1]
n_0_sample = pd.Series(y).value_counts()[0]
print('Total samples: {}; label 1: {:.2%}; label 0: {:.2%}'.format(n_sample_,n_1_sample/n_sample_,n_0_sample/n_sample_))

Split the data into training and validation sets

from sklearn.model_selection import train_test_split
X = pd.DataFrame(X)
y = pd.DataFrame(y)
 
X_train, X_vali, Y_train, Y_vali = train_test_split(X,y,test_size=0.3,random_state=420) 
model_data = pd.concat([Y_train, X_train], axis=1)
model_data.reset_index(drop=True,inplace=True)
model_data.columns = data.columns

Assemble the validation set and save both sets to CSV

vali_data = pd.concat([Y_vali, X_vali], axis=1)
vali_data.reset_index(drop=True,inplace=True) 
vali_data.columns = data.columns
model_data.to_csv(r'.\model_data.csv')
vali_data.to_csv(r'.\vali_data.csv')

Analyze the training set

import matplotlib.pyplot as plt
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
df=pd.read_csv(r'.\model_data.csv', index_col = 0)
df.index.name = 'ID'
states={'SeriousDlqin2yrs':'good and bad customers',
        'RevolvingUtilizationOfUnsecuredLines':'Available quota ratio',
        'age':'age',
        'NumberOfTime30-59DaysPastDueNotWorse':'Overdue 30-59 number of days',
        'DebtRatio':'debt ratio',
        'MonthlyIncome':'monthly income',
        'NumberOfOpenCreditLinesAndLoans':'amount of credit',
        'NumberOfTimes90DaysLate':'90 days overdue',
        'NumberRealEstateLoansOrLines':'Fixed Asset Loans',
        'NumberOfTime60-89DaysPastDueNotWorse':'Overdue 60-89 number of days',
        'NumberOfDependents':'Number of family members'}
df.rename(columns=states,inplace=True)
df.head()

Univariate Analysis

age_cut=pd.cut(df['age'],5)
age_cut_group=df['good and bad customers'].groupby(age_cut).count()
age_cut_grouped1=df["good and bad customers"].groupby(age_cut).sum()
df2=pd.merge(pd.DataFrame(age_cut_group),pd.DataFrame(age_cut_grouped1),left_index=True,right_index=True)
df2.rename(columns={'good and bad customers_x':'total customers','good and bad customers_y':'number of bad customers'},inplace=True)
df2.insert(2,"Number of good customers",df2["total customers"]-df2["number of bad customers"])
df2.insert(2,"percentage of bad customers",df2["number of bad customers"]/df2["total customers"])
df2

Trend graph of bad customer rate with age

ax11=df2["percentage of bad customers"].plot(figsize=(10,5))
ax11.set_xticklabels([0,20,29,38,47,55,64,72,81,89,98,107]) # manually relabel the x ticks with age values
ax11.set_ylabel("bad customer rate")
ax11.set_title("Trend of bad customer rate with age")

Multivariate analysis

import seaborn as sns
corr = df.corr()#Calculate the correlation coefficient of each variable
xticks = list(corr.index)#x-axis labels
yticks = list(corr.index)#y-axis labels
fig = plt.figure(figsize=(15,10))
ax1 = fig.add_subplot(1, 1, 1)
sns.heatmap(corr, annot=True, cmap="rainbow",ax=ax1,linewidths=.5, annot_kws={'size': 9, 'weight': 'bold', 'color': 'blue'})
ax1.set_xticklabels(xticks, rotation=35, fontsize=15)
ax1.set_yticklabels(yticks, rotation=0, fontsize=15)
plt.show()

WOE binning and WOE value calculation

cut1=pd.qcut(df["Available quota ratio"],4,labels=False)
cut2=pd.qcut(df["age"],8,labels=False)
bins3=[-1,0,1,3,5,13]
cut3=pd.cut(df["Overdue 30-59 number of days"],bins3,labels=False)
cut4=pd.qcut(df["debt ratio"],3,labels=False)
cut5=pd.qcut(df["monthly income"],4,labels=False)
cut6=pd.qcut(df["amount of credit"],4,labels=False)
bins7=[-1, 0, 1, 3,5, 20]
cut7=pd.cut(df["90 days overdue"],bins7,labels=False)
bins8=[-1, 0,1,2, 3, 33]
cut8=pd.cut(df["Fixed Asset Loans"],bins8,labels=False)
bins9=[-1, 0, 1, 3, 12]
cut9=pd.cut(df["Overdue 60-89 number of days"],bins9,labels=False)
bins10=[-1, 0, 1, 2, 3, 5, 21]
cut10=pd.cut(df["Number of family members"],bins10,labels=False)
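
For each bin i of a feature, the weight of evidence compares the bin's bad/good odds with the overall odds (label 1 = bad, label 0 = good). With rate = Bad_total / Good_total as computed below,

WOE_i = ln( (Bad_i / Good_i) / rate ) = ln( (Bad_i / Bad_total) / (Good_i / Good_total) )

so a positive WOE marks a bin that is worse than average and a negative WOE a bin that is better.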

rate=df["good and bad customers"].sum()/(df["good and bad customers"].count()-df["good and bad customers"].sum()) # overall bad/good odds
def get_woe_data(cut):
    grouped=df["good and bad customers"].groupby(cut,as_index = True).value_counts()
    # per-bin bad/good odds divided by the overall odds, then logged
    woe=np.log(grouped.unstack().iloc[:,1]/grouped.unstack().iloc[:,0]/rate)
    return woe
cut1_woe=get_woe_data(cut1)
cut2_woe=get_woe_data(cut2)
cut3_woe=get_woe_data(cut3)
cut4_woe=get_woe_data(cut4)
cut5_woe=get_woe_data(cut5)
cut6_woe=get_woe_data(cut6)
cut7_woe=get_woe_data(cut7)
cut8_woe=get_woe_data(cut8)
cut9_woe=get_woe_data(cut9)
cut10_woe=get_woe_data(cut10)

Pick a few variables and look at their WOE; uncomment a different line below to plot another variable.

# cut1_woe.plot.bar(color='b',alpha=0.3,rot=0)
# cut2_woe.plot.bar(color='b',alpha=0.3,rot=0)
cut3_woe.plot.bar(color='b',alpha=0.3,rot=0)

IV value calculation
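
The information value of a binned feature sums the gap between bad and good shares over bins, weighted by WOE:

IV = Σ_i (Bad_i / Bad_total − Good_i / Good_total) × WOE_i

By a common rule of thumb, IV below 0.02 marks a nearly useless feature, 0.02-0.1 weak, 0.1-0.3 medium, and above 0.3 strong predictive power.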

def get_IV_data(cut,cut_woe):
    grouped=df["good and bad customers"].groupby(cut,as_index = True).value_counts()
    # (bad share - good share) of each bin, weighted by the bin's WOE, then summed
    cut_IV=((grouped.unstack().iloc[:,1]/df["good and bad customers"].sum()-grouped.unstack().iloc[:,0]/(df["good and bad customers"].count()-df["good and bad customers"].sum()))*cut_woe).sum()
    return cut_IV
#Calculate the IV value of each group
cut1_IV=get_IV_data(cut1,cut1_woe)
cut2_IV=get_IV_data(cut2,cut2_woe)
cut3_IV=get_IV_data(cut3,cut3_woe)
cut4_IV=get_IV_data(cut4,cut4_woe)
cut5_IV=get_IV_data(cut5,cut5_woe)
cut6_IV=get_IV_data(cut6,cut6_woe)
cut7_IV=get_IV_data(cut7,cut7_woe)
cut8_IV=get_IV_data(cut8,cut8_woe)
cut9_IV=get_IV_data(cut9,cut9_woe)
cut10_IV=get_IV_data(cut10,cut10_woe)
IV=pd.DataFrame([cut1_IV,cut2_IV,cut3_IV,cut4_IV,cut5_IV,cut6_IV,cut7_IV,cut8_IV,cut9_IV,cut10_IV],index=['Available quota ratio','age','Overdue 30-59 number of days','debt ratio','monthly income','amount of credit','90 days overdue','Fixed Asset Loans','Overdue 60-89 number of days','Number of family members'],columns=['IV'])
iv=IV.plot.bar(color='b',alpha=0.3,rot=30,figsize=(10,5),fontsize=(10))
iv.set_title('IV value of each feature variable',fontsize=(15))
iv.set_xlabel('feature variable',fontsize=(15))
iv.set_ylabel('IV',fontsize=(15))

WOE value replacement

df_new=pd.DataFrame()   #Create a new df_new to hold the WOE-encoded data
def replace_data(cut,cut_woe):
    # map each bin label (in ascending order) to its WOE value
    a=sorted(cut.unique())
    for m in range(len(a)):
        cut.replace(a[m],cut_woe.values[m],inplace=True)
    return cut
df_new["good and bad customers"]=df["good and bad customers"]
df_new["Available quota ratio"]=replace_data(cut1,cut1_woe)
df_new["age"]=replace_data(cut2,cut2_woe)
df_new["Overdue 30-59 number of days"]=replace_data(cut3,cut3_woe)
df_new["debt ratio"]=replace_data(cut4,cut4_woe)
df_new["monthly income"]=replace_data(cut5,cut5_woe)
df_new["amount of credit"]=replace_data(cut6,cut6_woe)
df_new["90 days overdue"]=replace_data(cut7,cut7_woe)
df_new["Fixed Asset Loans"]=replace_data(cut8,cut8_woe)
df_new["Overdue 60-89 number of days"]=replace_data(cut9,cut9_woe)
df_new["Number of family members"]=replace_data(cut10,cut10_woe)
df_new.head()

Train the model

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
x=df_new.iloc[:,1:]
y=df_new.iloc[:,0]   # take a Series rather than a one-column DataFrame so sklearn does not warn
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.6,random_state=0)
model=LogisticRegression()
clf=model.fit(x_train,y_train)
print('Test score:{}'.format(clf.score(x_test,y_test)))


Test score: 0.7791803769069698

Model evaluation: ROC curve

coe=clf.coef_
# roc_curve needs a continuous score, so use the predicted probability of class 1 rather than hard 0/1 labels
y_pred=clf.predict_proba(x_test)[:,1]
from sklearn.metrics import roc_curve, auc
fpr, tpr, threshold = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, color='darkorange',label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy',  linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC_curve')
plt.legend(loc="lower right")
plt.show()
print(roc_auc)

Draw the KS curve
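
The KS statistic is the largest vertical gap between the cumulative curves of the two classes, KS = max over thresholds of (TPR − FPR); a common rule of thumb treats KS above roughly 0.3 as acceptable discrimination.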

fig, ax = plt.subplots(figsize=(10,6))
ax.plot(1 - threshold, tpr, label='tpr') # the KS plot is drawn in descending order of predicted probability, hence 1 - threshold
ax.plot(1 - threshold, fpr, label='fpr')
ax.plot(1 - threshold, tpr-fpr,label='KS')
plt.xlabel('score')
plt.title('KS Curve')
plt.ylim([0.0, 1.0])
legend = ax.legend(loc='upper left')
plt.show()
print(max(tpr-fpr))


0.5584308938611491

Convert the model output into a score
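
The two constants implement the standard scorecard scaling Score = offset + factor × ln(odds): with 20 points to double the odds (PDO = 20), factor = 20 / ln 2 ≈ 28.85, and offset = 600 − factor × ln 20 would pin a score of 600 at odds of 20:1. Note that the code below adds a flat base of 600 rather than this offset; each feature contributes round(coefficient × WOE × factor) points per bin.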

factor = 20 / np.log(2)                     # PDO = 20: 20 points double the odds
offset = 600 - 20 * np.log(20) / np.log(2)  # would pin a score of 600 at odds of 20:1
def get_score(coe,woe,factor):
    scores=[]
    for w in woe:
        score=round(coe*w*factor,0)
        scores.append(score)
    return scores
x1 = get_score(coe[0][0], cut1_woe, factor)
x2 = get_score(coe[0][1], cut2_woe, factor)
x3 = get_score(coe[0][2], cut3_woe, factor)
x4 = get_score(coe[0][3], cut4_woe, factor)
x5 = get_score(coe[0][4], cut5_woe, factor)
x6 = get_score(coe[0][5], cut6_woe, factor)
x7 = get_score(coe[0][6], cut7_woe, factor)
x8 = get_score(coe[0][7], cut8_woe, factor)
x9 = get_score(coe[0][8], cut9_woe, factor)
x10 = get_score(coe[0][9], cut10_woe, factor)
print("The score corresponding to the available quota ratio:{}".format(x1))
print("age-related scores:{}".format(x2))
print("Overdue 30-59 Score corresponding to the number of days:{}".format(x3))
print("Debt ratio corresponding score:{}".format(x4))
print("Points corresponding to monthly income:{}".format(x5))
print("Score corresponding to the number of credits:{}".format(x6))
print("Points corresponding to the number of 90 days overdue:{}".format(x7))
print("The score corresponding to the amount of fixed asset loans:{}".format(x8))
print("Overdue 60-89 Score corresponding to the number of days:{}".format(x9))
print("Score for the number of dependents:{}".format(x10))

Calculate the user's total score

cu1=pd.qcut(df["Available quota ratio"],4,labels=False,retbins=True) # retbins=True also returns the bin edges
bins1=cu1[1]
cu2=pd.qcut(df["age"],8,labels=False,retbins=True)
bins2=cu2[1]

bins3=[-1,0,1,3,5,13]
cut3=pd.cut(df["Overdue 30-59 number of days"],bins3,labels=False)
cu4=pd.qcut(df["debt ratio"],3,labels=False,retbins=True)
bins4=cu4[1]
cu5=pd.qcut(df["monthly income"],4,labels=False,retbins=True)
bins5=cu5[1]
cu6=pd.qcut(df["amount of credit"],4,labels=False,retbins=True)
bins6=cu6[1]

Assign each value its bin score

def compute_score(series,bins,score):
    """Map each value in series to the score of the bin it falls into."""
    scores = []
    for value in series:
        # walk the bin edges from the top down until the value's bin is found
        m = len(bins) - 2
        while m > 0 and value < bins[m]:
            m -= 1
        scores.append(score[m])
    return scores

Apply the scoring to the test set

# test1: the scoring test set; the original post never defines it, so load the validation set saved earlier
test1 = pd.read_csv(r'.\vali_data.csv', index_col=0)
test1['x1'] = pd.Series(compute_score(test1['RevolvingUtilizationOfUnsecuredLines'], bins1, x1))
test1['x2'] = pd.Series(compute_score(test1['age'], bins2, x2))
test1['x3'] = pd.Series(compute_score(test1['NumberOfTime30-59DaysPastDueNotWorse'], bins3, x3))
test1['x4'] = pd.Series(compute_score(test1['DebtRatio'], bins4, x4))
test1['x5'] = pd.Series(compute_score(test1['MonthlyIncome'], bins5, x5))
test1['x6'] = pd.Series(compute_score(test1['NumberOfOpenCreditLinesAndLoans'], bins6, x6))
test1['x7'] = pd.Series(compute_score(test1['NumberOfTimes90DaysLate'], bins7, x7))
test1['x8'] = pd.Series(compute_score(test1['NumberRealEstateLoansOrLines'], bins8, x8))
test1['x9'] = pd.Series(compute_score(test1['NumberOfTime60-89DaysPastDueNotWorse'], bins9, x9))
test1['x10'] = pd.Series(compute_score(test1['NumberOfDependents'], bins10, x10))
test1['Score'] = test1['x1']+test1['x2']+test1['x3']+test1['x4']+test1['x5']+test1['x6']+test1['x7']+test1['x8']+test1['x9']+test1['x10']+600
test1.to_csv(r'./ScoreData.csv', index=False)

View test set results

Score = pd.read_csv(r'.\ScoreData.csv')  # saved with index=False, so do not treat the first column as an index
Score.index.name = 'ID'
Score.iloc[:,11:23]
