1 数据预览及预处理

1.1数据预览

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

# Load the credit-card transaction table and preview its first rows
DATA_PATH = 'creditcard.csv'
data = pd.read_csv(DATA_PATH)
data.head()

数据预览

数据量和字段数量

由数据预览可知，该数据共有31个字段、284807条数据。其中V1到V28这28个字段是已经处理好的特征，Class字段是表示该笔交易是否为欺诈的分类标签。Amount字段的数值远大于其他字段的数值，为了保证各特征的重要程度相当，这里对Amount字段进行标准化，并删除无意义的Time字段。

1.2数据预处理：

# Standardise the Amount column so its scale is comparable to the other features
from sklearn.preprocessing import StandardScaler

# StandardScaler expects a 2-D input, hence the reshape to a single column
amount_as_column = data['Amount'].values.reshape(-1, 1)
amount_scaler = StandardScaler()
data['normAmount'] = amount_scaler.fit_transform(amount_as_column)

# Drop the Time column and the original (unscaled) Amount column
data = data.drop(['Time', 'Amount'], axis=1)
data.head()

对Class字段进行计数，统计每个类型的样本数量并做柱状图：

# Count the samples of each class, ordered by class label, and plot them.
# Series.value_counts is used instead of the top-level pd.value_counts,
# which is deprecated in recent pandas releases.
count_classes = data['Class'].value_counts(sort=True).sort_index()
print(count_classes)
count_classes.plot(kind='bar')
plt.title('Fraud class histogram')
plt.xlabel('class')
plt.ylabel('Frequency')

label值分布

由上可知，class为0的样本数量远大于class为1的样本，符合异常值数量远小于正常值数量的事实

但是，样本数量不均衡会导致模型的预测效果降低。因此需要对原始数据进行处理，使class=1的样本数量和class=0的样本数量一致，可以采用过采样（上采样）或下采样（欠采样）两种方式。

2 模型构建

2.1下采样

# Split the full dataset into the feature matrix and the label column.
# .loc is used because the .ix indexer was removed from pandas in 1.0.
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

# Count the fraud samples and collect their index labels
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)

# Index labels of the normal samples
normal_indices = data[data.Class == 0].index

# Draw, without replacement, as many normal samples as there are fraud samples.
# np.random.choice already returns an ndarray, so no extra conversion is needed.
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)

# Concatenate the fraud labels with the sampled normal labels
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])

# Select the under-sampled rows. The values are index *labels*, so .loc is the
# matching accessor (.iloc only agrees with it for a default RangeIndex).
under_sample_data = data.loc[under_sample_indices]

# Features and labels of the under-sampled data
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Class proportions after under-sampling
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))
print("Percentage of fraud transactions: ", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))

下采样后正常和异常的比值

2.2划分训练数据和测试数据

from sklearn.model_selection import train_test_split

# Both splits share the same settings: 30% held out for testing, fixed seed
split_options = {'test_size': 0.3, 'random_state': 0}

# Split the full original dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

train_count, test_count = len(X_train), len(X_test)
print("Number transactions train dataset: ", train_count)
print("Number transactions test dataset: ", test_count)
print("Total number of transactions: ", train_count + test_count)

# Split the under-sampled dataset with the same settings
(X_train_undersample, X_test_undersample,
 y_train_undersample, y_test_undersample) = train_test_split(
    X_undersample, y_undersample, **split_options)

train_count, test_count = len(X_train_undersample), len(X_test_undersample)
print("")
print("Number transactions train dataset: ", train_count)
print("Number transactions test dataset: ", test_count)
print("Total number of transactions: ", train_count + test_count)

原数据集和下采样数据集划分

由上可知，

原始数据划分后的训练集和测试集数量分别为199364和85443

下采样数据划分后的训练集和测试集数量分别为688和296

2.3模型构建

在异常检测问题中，准确率（accuracy）不适合作为衡量模型好坏的指标：由于正常样本远多于异常样本，即使把所有样本都预测为正常，准确率也会非常高。我们的目的是尽可能找出所有可能的异常值，所以用召回率（recall）衡量模型的好坏。

# 召回率Recall = TP/(TP+FN) ,异常值检测问题用recall值来评估模型效果
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold,cross_val_score
from sklearn.metrics import confusion_matrix,recall_score,classification_report

构建模型：

# Cross-validate a set of regularisation strengths and return the best one
def printing_KFold_scores(x_train_data,y_train_data):
    """Print 5-fold recall scores for several C values and return the best C.

    For every candidate ``C`` an L1-penalised logistic regression is fitted
    on each of the five training folds and scored with recall on the
    matching held-out fold. The mean recall per ``C`` is stored in a results
    table, and the ``C`` with the highest mean recall is returned.

    Args:
        x_train_data: Feature table; rows are selected with ``.iloc``.
        y_train_data: Label table; rows are selected with ``.iloc[rows, :]``
            and flattened, so a single-column DataFrame is assumed
            (TODO confirm against callers).

    Returns:
        The ``C_parameter`` value of the row with the highest mean recall.
    """
    # Materialise the folds once. KFold.split returns a one-shot generator, so
    # without the list() only the first C value would see any folds.
    folds = list(KFold(n_splits=5, shuffle=False).split(x_train_data))

    # Candidate regularisation strengths (C)
    c_param_range = [0.01, 0.1, 1, 10, 100]

    # One row per candidate C; the mean recall is filled in below
    results_table = pd.DataFrame({
        'C_parameter': c_param_range,
        'Mean recall score': np.nan,
    })

    for row, c_param in enumerate(c_param_range):
        print('-------------------------------------------')
        print('C parameter: ', c_param)
        print('-------------------------------------------')
        print('')

        recall_accs = []
        for iteration, (train_idx, test_idx) in enumerate(folds, start=1):
            # liblinear is named explicitly because the default solver in
            # recent scikit-learn releases (lbfgs) does not support L1.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')

            # Fit on the training part of this fold. Plain arrays are used for
            # both fit and predict so the two calls see the same input type.
            lr.fit(x_train_data.iloc[train_idx, :].values,
                   y_train_data.iloc[train_idx, :].values.ravel())

            # Predict on the held-out part of this fold
            y_pred_undersample = lr.predict(x_train_data.iloc[test_idx, :].values)

            # Recall on the held-out part of this fold
            recall_acc = recall_score(y_train_data.iloc[test_idx, :].values.ravel(),
                                      y_pred_undersample)
            recall_accs.append(recall_acc)
            print('Iteration',iteration,':recall score = ',recall_acc)

        # Record the mean recall once per C, after all folds have been scored
        mean_recall = np.mean(recall_accs)
        results_table.loc[row, 'Mean recall score'] = mean_recall
        print('')
        print('Mean recall score',mean_recall)
        print('')

    # Pick the C with the highest mean recall, only after every C was evaluated
    best_row = results_table['Mean recall score'].idxmax()
    best_c = results_table.loc[best_row, 'C_parameter']

    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter = ', best_c)
    print('*********************************************************************************')

    return best_c

输出：