InsuranceFraud/train.py at main · arkanandi/InsuranceFraud · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import json
import pickle
import numpy as np
import pandas as pd
import scipy.stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder


def detect_fraud(hyperparams):
    """ Insurance farud detection.

    Performs gradient-boost algorithm to train the fraud_data and give the best model to test on.

    :param : Hyperparameters to train the algorithms
    :return: The best model after training on dataset
    """

    train = pd.read_csv('fraud_data/train.csv')  # Load the fraud_data file of training

    y = train['fraud']  # Create the column with fraud status
    X = train.drop(columns=['fraud'])  # Drop the column name fraud from the training fraud_data and save it to X

    # Splitting the dataset into train and validation / test during training period
    X_train_n, X_test_n, y_train, y_test = train_test_split(X, y, train_size=0.85, random_state=1234)

    #  Drop the column with claim_number for the preprocessing
    X_train = X_train_n.drop(columns=['claim_number'])
    X_test = X_test_n.drop(columns=['claim_number'])

    X_train = X_train.apply(LabelEncoder().fit_transform)
    X_test = X_test.apply(LabelEncoder().fit_transform)

    np.random.seed(123)

    randomClassifier = RandomForestClassifier()

    param_grid = {"n_estimators": np.arange(hyperparams.n_estimators_min, hyperparams.n_estimators_max).tolist(),
                  "max_depth": np.arange(hyperparams.max_depth_min, hyperparams.max_depth_max).tolist(),
                  "max_features": np.arange(hyperparams.max_features_min, hyperparams.max_features_max).tolist()}

    hparam_tuner = GridSearchCV(randomClassifier, cv=3, scoring='roc_auc', param_grid=param_grid)
    hparam_tuner = hparam_tuner.fit(X_train, y_train)

    pd.DataFrame(
        hparam_tuner.cv_results_,
        columns=[
            'n_estimators',
            'min_samples_split',
            'min_samples_leaf',
            'max_depth',
            'max_features',
            'mean_test_score',
            'std_test_score',
            'rank_test_score',
        ],
    ).sort_values(by=['rank_test_score'])

    #  Saving best model from training
    best_model = hparam_tuner.best_estimator_

    probs = best_model.predict_proba(X_test)
    df = pd.DataFrame({'claim_number': X_test_n['claim_number'], 'fraud': probs[:, 1]})
    df.to_csv("submission.csv", index=False)

    with open('models/store_best_model.pickle', 'wb') as f:
        pickle.dump(best_model, f)

    return "Training is successful. Best model has been saved at 'models/store_best_model.pickle' "


def test():
    """ Insurance fraud detection.

    Performs testing on test dataset on gradient-boost algorithm best model.

    :return: The json file with fraud prediction
    """
    with open('models/store_best_model.pickle', 'rb') as f:
        loaded_model = pickle.load(f)

    test = pd.read_csv('fraud_data/test.csv')  # Load testing fraud_data
    X_test = test.drop(columns=['claim_number'])
    test_withoutID = X_test.fillna('na')
    test_withoutID = test_withoutID.apply(LabelEncoder().fit_transform)
    final_y = loaded_model.predict(test_withoutID)  # Predict the test fraud_data

    final_report = test
    final_report['fraud_status'] = final_y
    final_report = final_report.loc[:, ['claim_number', 'fraud_status']]
    # Replace 1-0 with Yes-No to make it interpretable
    final_report = final_report.replace(1, 'Fraud')
    final_report = final_report.replace(0, 'Not Fraud')

    final_report.to_json('output/report.json', orient='records')

    f = open('output/report.json', "r")
    result = json.loads(f.read())

    return {"fraud_database": result}