Read the data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
%matplotlib inline

data = pd.read_csv('./ch17_cs_training.csv')
data.head()
Inspect the data
data.info()
Drop duplicate rows and check whether the row count shrinks: if it does, duplicates existed and have now been removed.
data.drop_duplicates(inplace=True)
data.info()
Reset the index, then look at the proportion of missing values in each column.
data.reset_index(inplace=True, drop=True)
data.isnull().mean()
NumberOfDependents has only a small share of missing values, so they can simply be filled with the mean.
data['NumberOfDependents'].fillna(data['NumberOfDependents'].mean(),inplace=True)
MonthlyIncome, by contrast, has a large share of missing values, so we impute it with a random forest regressor.
def fill_missing_rf(X, y, to_fill):
    """
    Fill the missing values of one feature using random forest regression.

    Parameters:
    X: feature matrix
    y: the labels (fully observed)
    to_fill: name of the feature to fill
    """
    # Build a new feature matrix: every other feature plus the label
    df = X.copy()
    fill = df.loc[:, to_fill]
    df = pd.concat([df.loc[:, df.columns != to_fill], pd.DataFrame(y)], axis=1)

    # Rows where the feature is observed become the training set;
    # rows where it is missing are the set we predict for
    Ytrain = fill[fill.notnull()]
    Ytest = fill[fill.isnull()]
    Xtrain = df.iloc[Ytrain.index, :]
    Xtest = df.iloc[Ytest.index, :]

    # Fit a random forest regressor and predict the missing values
    from sklearn.ensemble import RandomForestRegressor as rfr
    regressor = rfr(n_estimators=100).fit(Xtrain, Ytrain)
    Y_predict = regressor.predict(Xtest)
    return Y_predict

X = data.iloc[:, 1:]
y = data.iloc[:, 0]
# Pass X, y and the feature with missing values into the function
y_pred = fill_missing_rf(X, y, 'MonthlyIncome')
# Write the predictions back into the missing slots
data.loc[data.loc[:, 'MonthlyIncome'].isnull(), 'MonthlyIncome'] = y_pred
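As a quick sanity check (a minimal sketch), we can confirm that no missing values remain in the two imputed columns:

# Both counts should now be 0
print(data['MonthlyIncome'].isnull().sum())
print(data['NumberOfDependents'].isnull().sum())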
With the missing values filled in, the next step is to deal with outliers.
data.describe([0.01,0.1,0.25,0.5,0.75,0.9,0.99]).T
The minimum age is 0, which is impossible for a bank customer, so we drop those rows directly.
data = data[data["age"] != 0]
Moving on, three features look odd:

"NumberOfTime30-59DaysPastDueNotWorse"
"NumberOfTime60-89DaysPastDueNotWorse"
"NumberOfTimes90DaysLate"

These count, over the past two years, how many times the borrower was 30-59 days past due (but no worse), 60-89 days past due (but no worse), and 90 or more days late. For all three, the 99th percentile is still only 2, yet the maximum jumps to 98, which looks highly abnormal.
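As a quick check of that claim (a minimal sketch comparing the 99th percentile with the maximum for the three columns):

late_cols = ["NumberOfTime30-59DaysPastDueNotWorse",
             "NumberOfTime60-89DaysPastDueNotWorse",
             "NumberOfTimes90DaysLate"]
print(data[late_cols].quantile(0.99))  # 99th percentile of each column
print(data[late_cols].max())           # maximum of each column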
data[data.loc[:,"NumberOfTimes90DaysLate"] > 90].count()
There are 225 such samples. Inspecting them, their labels are not all 1, i.e. they are not all bad customers, which is clearly inconsistent with being 90+ days late dozens of times. We can therefore conclude that these samples are anomalies of some kind and should be deleted.
data = data[data.loc[:,'NumberOfTimes90DaysLate']<90]
Check the result
data.describe([0.01,0.1,0.25,0.5,0.75,0.9,0.99]).T
Reset the index, then examine the label distribution.
data.reset_index(inplace=True, drop=True)
X = data.iloc[:, 1:]
y = data.iloc[:, 0]
# y.value_counts()
sns.countplot(x="SeriousDlqin2yrs", data=data)
n_1_sample = y.value_counts()[1]
n_0_sample = y.value_counts()[0]
print('Number of samples: {}; 1 accounts for {:.2%}; 0 accounts for {:.2%}'.format(
    len(y), n_1_sample / len(y), n_0_sample / len(y)))
The classes are imbalanced, so balance them with oversampling.
# For logistic regression, the most common rebalancing method is the SMOTE oversampler
import imblearn
# imblearn is a library dedicated to imbalanced datasets; like sklearn,
# its classes are instantiated and then fitted
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)   # instantiate
X, y = sm.fit_resample(X, y)  # returns the oversampled feature matrix and labels

n_sample_ = X.shape[0]
n_1_sample = pd.Series(y).value_counts()[1]
n_0_sample = pd.Series(y).value_counts()[0]
print('Number of samples: {}; 1 accounts for {:.2%}; 0 accounts for {:.2%}'.format(
    n_sample_, n_1_sample / n_sample_, n_0_sample / n_sample_))
Split into training and validation sets
from sklearn.model_selection import train_test_split

X = pd.DataFrame(X)
y = pd.DataFrame(y)
X_train, X_vali, Y_train, Y_vali = train_test_split(X, y, test_size=0.3, random_state=420)
model_data = pd.concat([Y_train, X_train], axis=1)
model_data.reset_index(drop=True, inplace=True)
model_data.columns = data.columns
Assemble the validation set and save both splits to disk
vali_data = pd.concat([Y_vali, X_vali], axis=1)
vali_data.reset_index(drop=True, inplace=True)
vali_data.columns = data.columns
model_data.to_csv(r'.\model_data.csv')
vali_data.to_csv(r'.\vali_data.csv')
Analyze the training set
import matplotlib.pyplot as plt
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']

df = pd.read_csv(r'.\model_data.csv', index_col=0)
df.index.name = 'ID'
states = {'SeriousDlqin2yrs': 'good and bad customers',
          'RevolvingUtilizationOfUnsecuredLines': 'Available quota ratio',
          'age': 'age',
          'NumberOfTime30-59DaysPastDueNotWorse': 'Overdue 30-59 number of days',
          'DebtRatio': 'debt ratio',
          'MonthlyIncome': 'monthly income',
          'NumberOfOpenCreditLinesAndLoans': 'amount of credit',
          'NumberOfTimes90DaysLate': '90 days overdue',
          'NumberRealEstateLoansOrLines': 'Fixed Asset Loans',
          'NumberOfTime60-89DaysPastDueNotWorse': 'Overdue 60-89 number of days',
          'NumberOfDependents': 'Number of family members'}
df.rename(columns=states, inplace=True)
df.head()
Univariate Analysis
age_cut = pd.cut(df['age'], 5)
age_cut_group = df['good and bad customers'].groupby(age_cut).count()
age_cut_grouped1 = df["good and bad customers"].groupby(age_cut).sum()
df2 = pd.merge(pd.DataFrame(age_cut_group), pd.DataFrame(age_cut_grouped1),
               left_index=True, right_index=True)
df2.rename(columns={'good and bad customers_x': 'total customers',
                    'good and bad customers_y': 'number of bad customers'}, inplace=True)
df2.insert(2, "Number of good customers", df2["total customers"] - df2["number of bad customers"])
df2.insert(2, "percentage of bad customers", df2["number of bad customers"] / df2["total customers"])
df2
Trend graph of bad customer rate with age
ax11 = df2["percentage of bad customers"].plot(figsize=(10, 5))
ax11.set_xticklabels([0, 20, 29, 38, 47, 55, 64, 72, 81, 89, 98, 107])
ax11.set_ylabel("bad customer rate")
ax11.set_title("Trend graph of bad customer rate with age")
Multivariate analysis
import seaborn as sns

corr = df.corr()           # correlation coefficients between the variables
xticks = list(corr.index)  # x-axis labels
yticks = list(corr.index)  # y-axis labels
fig = plt.figure(figsize=(15, 10))
ax1 = fig.add_subplot(1, 1, 1)
sns.heatmap(corr, annot=True, cmap="rainbow", ax=ax1, linewidths=.5,
            annot_kws={'size': 9, 'weight': 'bold', 'color': 'blue'})
ax1.set_xticklabels(xticks, rotation=35, fontsize=15)
ax1.set_yticklabels(yticks, rotation=0, fontsize=15)
plt.show()
WOE binning and WOE value calculation
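For reference, the weight of evidence that get_woe_data below computes for bin i is

WOE_i = ln( (bad_i / good_i) / (bad_total / good_total) )

where bad_i and good_i are the counts of label-1 and label-0 customers in the bin, and rate in the code is the overall bad/good ratio bad_total / good_total. A positive WOE marks a bin as riskier than average, a negative WOE as safer.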
cut1 = pd.qcut(df["Available quota ratio"], 4, labels=False)
cut2 = pd.qcut(df["age"], 8, labels=False)
bins3 = [-1, 0, 1, 3, 5, 13]
cut3 = pd.cut(df["Overdue 30-59 number of days"], bins3, labels=False)
cut4 = pd.qcut(df["debt ratio"], 3, labels=False)
cut5 = pd.qcut(df["monthly income"], 4, labels=False)
cut6 = pd.qcut(df["amount of credit"], 4, labels=False)
bins7 = [-1, 0, 1, 3, 5, 20]
cut7 = pd.cut(df["90 days overdue"], bins7, labels=False)
bins8 = [-1, 0, 1, 2, 3, 33]
cut8 = pd.cut(df["Fixed Asset Loans"], bins8, labels=False)
bins9 = [-1, 0, 1, 3, 12]
cut9 = pd.cut(df["Overdue 60-89 number of days"], bins9, labels=False)
bins10 = [-1, 0, 1, 2, 3, 5, 21]
cut10 = pd.cut(df["Number of family members"], bins10, labels=False)

# Overall bad/good ratio
rate = df["good and bad customers"].sum() / (df["good and bad customers"].count()
                                             - df["good and bad customers"].sum())

def get_woe_data(cut):
    # Count good (0) and bad (1) customers per bin, then take the log-odds ratio
    grouped = df["good and bad customers"].groupby(cut, as_index=True).value_counts()
    woe = np.log(grouped.unstack().iloc[:, 1] / grouped.unstack().iloc[:, 0] / rate)
    return woe

cut1_woe = get_woe_data(cut1)
cut2_woe = get_woe_data(cut2)
cut3_woe = get_woe_data(cut3)
cut4_woe = get_woe_data(cut4)
cut5_woe = get_woe_data(cut5)
cut6_woe = get_woe_data(cut6)
cut7_woe = get_woe_data(cut7)
cut8_woe = get_woe_data(cut8)
cut9_woe = get_woe_data(cut9)
cut10_woe = get_woe_data(cut10)
Pick a few variables and look at their WOE values; change the cut*_woe variable below to view other features.
# cut1_woe.plot.bar(color='b', alpha=0.3, rot=0)
# cut2_woe.plot.bar(color='b', alpha=0.3, rot=0)
cut3_woe.plot.bar(color='b', alpha=0.3, rot=0)
IV value calculation
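For reference, the information value of a binned feature sums the difference in class shares over the bins, weighted by WOE:

IV = sum_i ( bad_i / bad_total - good_i / good_total ) * WOE_i

A common rule of thumb for IV: below 0.02 the feature is essentially useless for prediction, 0.02-0.1 is weak, 0.1-0.3 is medium, and above 0.3 is strong.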
def get_IV_data(cut, cut_woe):
    grouped = df["good and bad customers"].groupby(cut, as_index=True).value_counts()
    cut_IV = ((grouped.unstack().iloc[:, 1] / df["good and bad customers"].sum()
               - grouped.unstack().iloc[:, 0] / (df["good and bad customers"].count()
                                                 - df["good and bad customers"].sum()))
              * cut_woe).sum()
    return cut_IV

# Calculate the IV value of each feature
cut1_IV = get_IV_data(cut1, cut1_woe)
cut2_IV = get_IV_data(cut2, cut2_woe)
cut3_IV = get_IV_data(cut3, cut3_woe)
cut4_IV = get_IV_data(cut4, cut4_woe)
cut5_IV = get_IV_data(cut5, cut5_woe)
cut6_IV = get_IV_data(cut6, cut6_woe)
cut7_IV = get_IV_data(cut7, cut7_woe)
cut8_IV = get_IV_data(cut8, cut8_woe)
cut9_IV = get_IV_data(cut9, cut9_woe)
cut10_IV = get_IV_data(cut10, cut10_woe)

IV = pd.DataFrame([cut1_IV, cut2_IV, cut3_IV, cut4_IV, cut5_IV,
                   cut6_IV, cut7_IV, cut8_IV, cut9_IV, cut10_IV],
                  index=['Available quota ratio', 'age', 'Overdue 30-59 number of days',
                         'debt ratio', 'monthly income', 'amount of credit',
                         '90 days overdue', 'Fixed Asset Loans',
                         'Overdue 60-89 number of days', 'Number of family members'],
                  columns=['IV'])
iv = IV.plot.bar(color='b', alpha=0.3, rot=30, figsize=(10, 5), fontsize=(10))
iv.set_title('IV value of each feature variable', fontsize=(15))
iv.set_xlabel('feature variable', fontsize=(15))
iv.set_ylabel('IV', fontsize=(15))
WOE value replacement
df_new = pd.DataFrame()  # new frame to hold the WOE-transformed data

def replace_data(cut, cut_woe):
    # Replace each bin index in `cut` with its WOE value (mutates cut in place)
    a = []
    for i in cut.unique():
        a.append(i)
    a.sort()
    for m in range(len(a)):
        cut.replace(a[m], cut_woe.values[m], inplace=True)
    return cut

df_new["good and bad customers"] = df["good and bad customers"]
df_new["Available quota ratio"] = replace_data(cut1, cut1_woe)
df_new["age"] = replace_data(cut2, cut2_woe)
df_new["Overdue 30-59 number of days"] = replace_data(cut3, cut3_woe)
df_new["debt ratio"] = replace_data(cut4, cut4_woe)
df_new["monthly income"] = replace_data(cut5, cut5_woe)
df_new["amount of credit"] = replace_data(cut6, cut6_woe)
df_new["90 days overdue"] = replace_data(cut7, cut7_woe)
df_new["Fixed Asset Loans"] = replace_data(cut8, cut8_woe)
df_new["Overdue 60-89 number of days"] = replace_data(cut9, cut9_woe)
df_new["Number of family members"] = replace_data(cut10, cut10_woe)
df_new.head()
Train the model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

x = df_new.iloc[:, 1:]
y = df_new.iloc[:, :1]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.6, random_state=0)
model = LogisticRegression()
clf = model.fit(x_train, y_train)
print('Test score: {}'.format(clf.score(x_test, y_test)))
Test score: 0.7791803769069698
Model evaluation: ROC curve
coe = clf.coef_
# Use predicted probabilities rather than hard class labels, so the ROC curve
# has a full range of thresholds (this also makes the KS curve below meaningful)
y_pred_proba = clf.predict_proba(x_test)[:, 1]

from sklearn.metrics import roc_curve, auc
fpr, tpr, threshold = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC_curve')
plt.legend(loc="lower right")
plt.show()
print(roc_auc)
Draw the KS curve
fig, ax = plt.subplots(figsize=(10, 6))
# The KS curve is drawn against descending predicted probability,
# hence the 1 - threshold x-axis
ax.plot(1 - threshold, tpr, label='tpr')
ax.plot(1 - threshold, fpr, label='fpr')
ax.plot(1 - threshold, tpr - fpr, label='KS')
plt.xlabel('score')
plt.title('KS Curve')
plt.ylim([0.0, 1.0])
legend = ax.legend(loc='upper left')
plt.show()
print(max(tpr - fpr))
0.5584308938611491
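The KS statistic is the maximum vertical gap between the TPR and FPR curves, KS = max(TPR - FPR). As a common rule of thumb, a KS above roughly 0.3 indicates acceptable discriminating power for a scorecard, so about 0.56 here is a good result.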
Convert the model output into a score
factor = 20 / np.log(2)
offset = 600 - 20 * np.log(20) / np.log(2)

def get_score(coe, woe, factor):
    # Score for each bin of a feature: coefficient * WOE * factor, rounded
    scores = []
    for w in woe:
        score = round(coe * w * factor, 0)
        scores.append(score)
    return scores

x1 = get_score(coe[0][0], cut1_woe, factor)
x2 = get_score(coe[0][1], cut2_woe, factor)
x3 = get_score(coe[0][2], cut3_woe, factor)
x4 = get_score(coe[0][3], cut4_woe, factor)
x5 = get_score(coe[0][4], cut5_woe, factor)
x6 = get_score(coe[0][5], cut6_woe, factor)
x7 = get_score(coe[0][6], cut7_woe, factor)
x8 = get_score(coe[0][7], cut8_woe, factor)
x9 = get_score(coe[0][8], cut9_woe, factor)
x10 = get_score(coe[0][9], cut10_woe, factor)

print("Scores for available quota ratio: {}".format(x1))
print("Scores for age: {}".format(x2))
print("Scores for number of times 30-59 days overdue: {}".format(x3))
print("Scores for debt ratio: {}".format(x4))
print("Scores for monthly income: {}".format(x5))
print("Scores for amount of credit: {}".format(x6))
print("Scores for number of times 90 days overdue: {}".format(x7))
print("Scores for fixed asset loans: {}".format(x8))
print("Scores for number of times 60-89 days overdue: {}".format(x9))
print("Scores for number of family members: {}".format(x10))
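For context, the two constants come from the standard scorecard transform Score = offset + factor * ln(odds): choosing 20 points to double the odds (PDO = 20) gives factor = 20 / ln 2 ≈ 28.85, and anchoring a score of 600 at odds of 20 gives offset = 600 - factor * ln 20 ≈ 513.56. Each bin's score is then its WOE times the feature's regression coefficient times factor. Note that offset is computed here but the total score below uses a flat base of 600 instead.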
Calculate the user's total score
cu1 = pd.qcut(df["Available quota ratio"], 4, labels=False, retbins=True)
bins1 = cu1[1]
cu2 = pd.qcut(df["age"], 8, labels=False, retbins=True)
bins2 = cu2[1]
bins3 = [-1, 0, 1, 3, 5, 13]
cut3 = pd.cut(df["Overdue 30-59 number of days"], bins3, labels=False)
cu4 = pd.qcut(df["debt ratio"], 3, labels=False, retbins=True)
bins4 = cu4[1]
cu5 = pd.qcut(df["monthly income"], 4, labels=False, retbins=True)
bins5 = cu5[1]
cu6 = pd.qcut(df["amount of credit"], 4, labels=False, retbins=True)
bins6 = cu6[1]
Map each feature value to the score of its bin
def compute_score(series, bins, score):
    # For each value in `series`, find its bin in `bins` (scanning from the
    # highest bin edge down) and collect the corresponding bin score
    result = []
    i = 0
    while i < len(series):
        value = series[i]
        j = len(bins) - 2
        m = len(bins) - 2
        while j >= 0:
            if value >= bins[j]:
                j = -1  # bin found, exit the inner loop
            else:
                j -= 1
                m -= 1
        result.append(score[m])
        i += 1
    return result
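A minimal usage sketch (the input values are illustrative; bins2 and x2 are the age bins and age scores computed above):

# Hypothetical example: map two ages to the score of the age bin they fall into
ages = pd.Series([25, 63])
age_scores = compute_score(ages, bins2, x2)  # one bin score per input value
print(age_scores)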
Apply the scorecard to the test set
# test1 is assumed to be the held-out data with the original column names,
# e.g. the validation set saved earlier:
test1 = pd.read_csv(r'.\vali_data.csv', index_col=0)

test1['x1'] = pd.Series(compute_score(test1['RevolvingUtilizationOfUnsecuredLines'], bins1, x1))
test1['x2'] = pd.Series(compute_score(test1['age'], bins2, x2))
test1['x3'] = pd.Series(compute_score(test1['NumberOfTime30-59DaysPastDueNotWorse'], bins3, x3))
test1['x4'] = pd.Series(compute_score(test1['DebtRatio'], bins4, x4))
test1['x5'] = pd.Series(compute_score(test1['MonthlyIncome'], bins5, x5))
test1['x6'] = pd.Series(compute_score(test1['NumberOfOpenCreditLinesAndLoans'], bins6, x6))
test1['x7'] = pd.Series(compute_score(test1['NumberOfTimes90DaysLate'], bins7, x7))
test1['x8'] = pd.Series(compute_score(test1['NumberRealEstateLoansOrLines'], bins8, x8))
test1['x9'] = pd.Series(compute_score(test1['NumberOfTime60-89DaysPastDueNotWorse'], bins9, x9))
test1['x10'] = pd.Series(compute_score(test1['NumberOfDependents'], bins10, x10))
test1['Score'] = (test1['x1'] + test1['x2'] + test1['x3'] + test1['x4'] + test1['x5']
                  + test1['x6'] + test1['x7'] + test1['x8'] + test1['x9'] + test1['x10'] + 600)
test1.to_csv(r'./ScoreData.csv', index=False)
View test set results
Score = pd.read_csv(r'.\ScoreData.csv', index_col=0)
Score.index.name = 'ID'
Score.iloc[:, 11:23]