使用5种分类模型进行用户贷款逾期预测

Contents

一、背景

这是一份关于用户借还贷款的数据集,以用户是否逾期为预测目标。本节先进行简单的预测,并着重对各模型的评价指标进行对比。

二、代码实现

# coding=utf-8

"""1. 导包"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import*
from sklearn.linear_model import LogisticRegression
from  sklearn import svm
from  sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm  import LGBMClassifier

"""2. 读取数据"""
dataset = pd.read_csv('./data/data.csv',encoding='gbk')

"""2. 读取数据"""
dataset = pd.read_csv('./data/data.csv',encoding='gbk')

"""3. 数据处理"""
# 删除固定信息列
dataset = dataset.drop(["custid","trade_no","bank_card_no","id_name","first_transaction_time","latest_query_time","loans_latest_time","source"],axis=1)
# 对于sstudent_feature列,我们进行NAN转成0,2转为0
# 缺失值填充
dataset["student_feature"] = dataset["student_feature"].fillna(0)
# 2替换为0
dataset["student_feature"] = dataset["student_feature"].replace([2],[0])

# 针对城市列'reg_preference_for_trad',进行数据替换
dataset["reg_preference_for_trad"] = dataset["reg_preference_for_trad"].replace("一线城市", "1")
dataset["reg_preference_for_trad"] = dataset["reg_preference_for_trad"].replace("二线城市", "2")
dataset["reg_preference_for_trad"] = dataset["reg_preference_for_trad"].replace("三线城市", "3")
dataset["reg_preference_for_trad"] = dataset["reg_preference_for_trad"].replace("其他城市", "4")
dataset["reg_preference_for_trad"] = dataset["reg_preference_for_trad"].replace("境外", "0")


dataset = dataset.fillna(0) # 使用 0 替换所有 NaN 的值
col = dataset.columns.tolist()[1:]

def missing(df, columns):
    """Fill NaN in the given columns with each column's mode, then cast to float.

    Modifies ``df`` in place.

    df: DataFrame to clean.
    columns: iterable of column names to process.

    ``df[i].mode()[0]`` takes the first mode when several values tie.
    """
    for i in columns:
        # Assign back instead of chained `inplace=True` fillna: the chained
        # form operates on a possible copy and is deprecated in pandas 2.x.
        df[i] = df[i].fillna(df[i].mode()[0]).astype('float')


missing(dataset, col)

# Convert remaining object columns to numeric.
# DataFrame.convert_objects was removed from pandas (0.25+); pd.to_numeric with
# errors='coerce' reproduces convert_numeric=True (non-convertible -> NaN).
dataset = dataset.apply(pd.to_numeric, errors='coerce')

"""4. 数据划分"""
X = dataset.drop(["status"],axis=1)
Y = dataset["status"]

# 数据按正常的2、8划分
X_train, X_test, y_train, y_test = train_test_split(X, Y,test_size=0.2, random_state=666)
# not enough values to unpack (expected 4, got 2)

from sklearn.preprocessing import minmax_scale # minmax_scale归一化,缩放到0-1
X_train = minmax_scale(X_train)
X_test =  minmax_scale(X_test)
# Input contains NaN, infinity or a value too large for dtype('float64').

"""5. 数据归一化"""
from sklearn.preprocessing import minmax_scale
# 归一化,缩放到0-1
X_train = minmax_scale(X_train)
X_test =  minmax_scale(X_test)

"""6. 模型整合及预测"""
# log_reg
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
# SVM
LinearSVC = svm.SVC(kernel='linear', probability=True).fit(X_train, y_train)
# decision tree
dtree = DecisionTreeClassifier(max_depth=6)
dtree.fit(X_train, y_train)
# xgboost
xgbClassifier = XGBClassifier()
xgbClassifier.fit(X_train, y_train)
# lightgbm
lgbmClassifier = LGBMClassifier()
lgbmClassifier.fit(X_train, y_train)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt

def _print_metric(tag, metric_fn, y_train_true, y_train_scores, y_test_true, y_test_scores):
    """Print one train/test metric pair on a single line, 4-decimal format."""
    print(tag, end=' ')
    print('训练集:', '%.4f' % metric_fn(y_train_true, y_train_scores), end=' ')
    print('测试集:', '%.4f' % metric_fn(y_test_true, y_test_scores))


def model_metrics(clf, X_train, X_test, y_train, y_test):
    """Report accuracy/precision/recall/F1/AUC for a fitted binary classifier
    and plot its train/test ROC curves.

    clf: fitted estimator exposing ``predict`` and ``predict_proba``
         (column 1 of predict_proba is taken as the positive-class score).
    X_train, X_test, y_train, y_test: feature matrices and labels per split.
    """
    # Hard predictions for threshold metrics, probabilities for AUC/ROC.
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    y_train_proba = clf.predict_proba(X_train)[:, 1]
    y_test_proba = clf.predict_proba(X_test)[:, 1]

    # Threshold-based metrics (use hard predictions).
    _print_metric('[准确率]', accuracy_score, y_train, y_train_pred, y_test, y_test_pred)
    _print_metric('[精准率]', precision_score, y_train, y_train_pred, y_test, y_test_pred)
    _print_metric('[召回率]', recall_score, y_train, y_train_pred, y_test, y_test_pred)
    _print_metric('[f1-score]', f1_score, y_train, y_train_pred, y_test, y_test_pred)

    # Rank-based metric (uses probabilities, not hard predictions).
    _print_metric('[auc值]', roc_auc_score, y_train, y_train_proba, y_test, y_test_proba)

    # ROC curves for both splits (unused threshold arrays discarded).
    fpr_train, tpr_train, _ = roc_curve(y_train, y_train_proba, pos_label=1)
    fpr_test, tpr_test, _ = roc_curve(y_test, y_test_proba, pos_label=1)

    label = ["Train - AUC:{:.4f}".format(auc(fpr_train, tpr_train)),
             "Test - AUC:{:.4f}".format(auc(fpr_test, tpr_test))]
    plt.plot(fpr_train, tpr_train)
    plt.plot(fpr_test, tpr_test)
    # Chance diagonal; 'd--' = diamond markers with a dashed line.
    plt.plot([0, 1], [0, 1], 'd--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(label, loc=4)
    plt.title("ROC curve")
    plt.show()

if __name__ == "__main__":

    model_metrics(log_reg, X_train, X_test, y_train, y_test)

    model_metrics(dtree, X_train, X_test, y_train, y_test)

    model_metrics(LinearSVC, X_train, X_test, y_train, y_test)

    model_metrics(xgbClassifier, X_train, X_test, y_train, y_test)

    model_metrics(lgbmClassifier, X_train, X_test, y_train, y_test)

三、预测结果

image

四、小结

  1. 明显可以看出LightGBM存在过拟合的问题
  2. 经测试,用众数和用平均数填充,结果出现微小区别,平均值受极端值影响较大
  3. 数据需要进行更加好的处理,才能得出较好的模型


转载请注明:yezuolin的博客 » 点击阅读原文