-
Notifications
You must be signed in to change notification settings - Fork 6
/
algorithm_compare.py
303 lines (245 loc) · 12.9 KB
/
algorithm_compare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
# 系统&第三方模块导入
import pandas as pd
import numpy as np
from xml.dom.minidom import parse
import pprint
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import time
### 自定义模块导入
import riskslim_in_use
import original_two_layer_model
import RiskSLIM_1_LR_2
import LR_1_RiskSLIM_2
############################################
###双层模型+RISKSLIM+逻辑回归在HELOC数据集上对比##
################CHENZIHAO###################
############################################
# AUC图像绘制
def AUC_plot(algorithmName, test_y, pred_y_prob):
    """Plot the ROC curve for one classifier and display it.

    The AUC value is computed with ``roc_auc_score`` and written into the
    plot title after ``algorithmName``; the region under the curve is shaded.

    Args:
        algorithmName: Text placed at the start of the plot title.
        test_y: True labels, forwarded to ``roc_curve`` / ``roc_auc_score``.
        pred_y_prob: Predicted scores, forwarded to the same two functions.
    """
    # The thresholds returned by roc_curve are not needed for the plot.
    false_pos_rate, true_pos_rate, _ = roc_curve(test_y, pred_y_prob)
    area = roc_auc_score(test_y, pred_y_prob)
    heading = algorithmName + " AUC=%.4f" % area

    plt.plot(false_pos_rate, true_pos_rate)
    plt.title(heading)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")

    # Shade under the curve wherever the true-positive rate is above zero.
    shaded_region = true_pos_rate > 0
    plt.fill_between(
        false_pos_rate,
        true_pos_rate,
        where=shaded_region,
        color='green',
        alpha=0.5,
    )
    plt.show()
# 输出打印算法性能
def printPerformance(algorithm_name, test_y, pred_y, pred_y_prob):
    """Print accuracy, precision, recall and F1 for a binary classifier, then plot its ROC curve.

    Labels are expected to be ``1`` (positive) and ``-1`` (negative). Pairs
    whose predicted or true label is anything else are ignored and do not
    count towards the accuracy denominator.

    A metric whose denominator is zero (for example precision when the model
    never predicts ``1``) is reported as ``0.0`` instead of raising
    ``ZeroDivisionError``.

    Args:
        algorithm_name: Name printed before the metrics and used in the plot title.
        test_y: True labels; paired positionally with ``pred_y``.
        pred_y: Predicted labels.
        pred_y_prob: Predicted scores, passed on to ``AUC_plot``.
    """
    # TP: predicted 1, actually 1      FN: predicted -1, actually 1
    # FP: predicted 1, actually -1     TN: predicted -1, actually -1
    true_pos = false_neg = false_pos = true_neg = 0
    for predicted, actual in zip(pred_y, test_y):
        if predicted == 1 and actual == 1:
            true_pos += 1
        elif predicted == -1 and actual == 1:
            false_neg += 1
        elif predicted == 1 and actual == -1:
            false_pos += 1
        elif predicted == -1 and actual == -1:
            true_neg += 1

    counted = true_pos + false_pos + true_neg + false_neg
    predicted_pos = true_pos + false_pos
    actual_pos = true_pos + false_neg

    # Guard every division: an empty denominator means the metric is undefined.
    accuracy = (true_pos + true_neg) / counted if counted else 0.0
    precision = true_pos / predicted_pos if predicted_pos else 0.0
    recall = true_pos / actual_pos if actual_pos else 0.0
    if precision + recall:
        F1_score = 2 * ((precision * recall) / (precision + recall))
    else:
        F1_score = 0.0

    print(algorithm_name, ':')
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1-SCORE:', F1_score)
    AUC_plot(algorithm_name, test_y, pred_y_prob)
    print('\n')
# 生成表头 例如var=xx split_num=3 则返回 ['xx_1', 'xx_2', 'xx_3'],在生成one-hot表时用
def generateColNames(var, split_num):
    """Build the one-hot column names for a binned variable.

    Example: ``var='xx'`` and ``split_num=3`` gives ``['xx_1', 'xx_2', 'xx_3']``.

    Args:
        var: Variable name used as the prefix of every column name.
        split_num: Number of bins; one column name is produced per bin.

    Returns:
        A list of ``split_num`` names of the form ``<var>_<n>``, with ``n``
        counting from 1. The list is empty when ``split_num`` is 0.
    """
    return [var + '_' + str(bin_no) for bin_no in range(1, split_num + 1)]
# 判断当前值处于第几个区间中,在生成one-hot表时用
def checkWhichDivision(split_list, var):
    """Return the 1-based position of the interval that contains ``var``.

    Each entry of ``split_list`` is an interval string such as
    ``'(-INF,59.0]'``, ``'(59.0,70.0]'`` or ``'(70.0,+INF)'``. The lower bound
    is exclusive and the upper bound inclusive. A side written as ``-INF`` /
    ``+INF`` is unbounded and is not compared at all, so an interval that is
    unbounded on both sides matches every value.

    Args:
        split_list: Interval strings, checked in order.
        var: The numeric value to locate.

    Returns:
        The 1-based index of the first matching interval, or ``None`` when no
        interval matches (for example when ``var`` is NaN and every interval
        has at least one finite bound).

    Raises:
        ValueError: If an interval string lacks ``(`` or ``,``, lacks ``]``
            while having a finite upper bound, or has a non-numeric bound.
    """
    for position, interval in enumerate(split_list, start=1):
        open_at = interval.index('(')
        comma_at = interval.index(',')

        # Lower bound is exclusive; skip the comparison when it is -INF.
        if '-INF' not in interval:
            lower = float(interval[open_at + 1:comma_at])
            if not var > lower:
                continue

        # Upper bound is inclusive; skip the comparison when it is +INF.
        # Only a finite upper bound requires the closing ']'.
        if '+INF' not in interval:
            close_at = interval.index(']')
            upper = float(interval[comma_at + 1:close_at])
            if not var <= upper:
                continue

        return position

    # No interval contains the value.
    return None
# 生成one-hot某一行的字典格式数据,用于追加到dataframe中,在生成one-hot表时用
def generateCurRowInDict(col_names, whichDivision):
    """Build one one-hot row as a dict mapping column name to 0 or 1.

    The column whose name ends with ``_<whichDivision>`` gets 1; every other
    column gets 0. Matching is done on the numeric suffix rather than on a
    substring, so digits inside the variable name itself (for example the
    ``2`` in ``NumTrades60Ever2DerogPubRec_1``) can no longer flag the wrong
    column.

    Args:
        col_names: Column names of the form ``<var>_<n>``, in bin order.
        whichDivision: 1-based bin number to flag. A value that matches no
            column (including ``None``) yields a row of zeros.

    Returns:
        A dict with one entry per column name; at most one value is 1.
    """
    wanted_suffix = '_' + str(whichDivision)
    row = {}
    already_flagged = False
    for col_name in col_names:
        # Only the first matching column is ever set to 1.
        is_hit = (not already_flagged) and col_name.endswith(wanted_suffix)
        row[col_name] = 1 if is_hit else 0
        already_flagged = already_flagged or is_hit
    return row
# 根据传入的subscale的变量list来生成one-hot文件
def generateOneHotByList(list, var_split_list, generatePath):
    """Write a one-hot encoded CSV for the variables of one subscale.

    Reads ``data/heloc_dataset_v2.csv``, and for every variable in ``list``
    that has an entry in ``var_split_list`` produces one 0/1 column per bin.
    Variables without binning information contribute no columns. The
    resulting table is written to ``generatePath`` without an index column.

    Rows are collected in a Python list and turned into a DataFrame once per
    variable; this avoids ``DataFrame.append``, which was removed in
    pandas 2.0 and copied the whole frame on every call.

    Args:
        list: Names of the variables (dataset columns) in the subscale.
            NOTE: the parameter name shadows the builtin; it is kept for
            backward compatibility with existing callers.
        var_split_list: Mapping of variable name to its list of interval
            strings (the binning result).
        generatePath: Destination path of the generated CSV file.

    Raises:
        Exception: If any name in ``list`` is not a column of the dataset.
    """
    data = pd.read_csv('data/heloc_dataset_v2.csv')
    try:
        # View of the dataset restricted to the variables of this subscale.
        partial_data = data[list]
    except KeyError as err:
        raise Exception('配置文件中的变量不能在原数据集中找到,检查subscale.xml中的变量') from err

    encoded_frames = []
    for var in list:
        if var not in var_split_list:
            # Only binned variables are one-hot encoded.
            continue
        cur_split_list = var_split_list[var]
        col_names = generateColNames(var, len(cur_split_list))
        # One dict per dataset row, e.g.
        # {'ExternalRiskEstimate_1': 0, 'ExternalRiskEstimate_2': 0, 'ExternalRiskEstimate_3': 1}
        rows = [
            generateCurRowInDict(col_names, checkWhichDivision(cur_split_list, value))
            for value in partial_data[var]
        ]
        encoded_frames.append(pd.DataFrame(rows, columns=col_names))

    # Place the per-variable blocks side by side.
    if encoded_frames:
        dataframe_final = pd.concat(encoded_frames, axis=1)
    else:
        dataframe_final = pd.DataFrame()

    dataframe_final.to_csv(generatePath, index=False)
    print('生成', generatePath, '成功')
if __name__ == "__main__":
    # Compare the two-layer model, RiskSLIM, logistic regression and the two
    # hybrid models on the HELOC dataset, printing metrics for each.
    p0 = time.process_time()

    ### Data preprocessing BEGIN
    # The split_list column of this file holds the binning result.
    split_file = pd.read_csv('data/features_detail.csv')
    # v2 is heloc_dataset_v1.csv with the label column renamed to "target"
    # and the labels recoded Bad -> -1, Good -> 1.
    original_data = pd.read_csv('data/heloc_dataset_v2.csv')
    # NOTE(review): this file is read before the optional generation step
    # below, which only writes one CSV per subscale name - confirm that
    # ALL_IN_ONE.csv is produced some other way.
    original_data_onehot = pd.read_csv('onehot/ALL_IN_ONE.csv')

    var_start_with = {}     # row index in split_file where each variable's bins start
    var_all = []            # every variable = var_to_be_bin + var_not_to_be_bin
    var_to_be_bin = []      # variables that have binning information
    var_not_to_be_bin = []  # variables without binning information
    var_split_list = {}     # variable name -> its interval strings from features_detail.csv
    subscales = {}          # subscale name -> member variable names, from the config file

    # Whether to (re)generate the one-hot CSVs. They only need generating
    # once; before leaving this False make sure a CSV exists per subscale.
    NEED_GENERATE_ONE_HOT_CSV = False
    RUN_RISKSLIM = True             # run RiskSLIM in this test
    RUN_TWOLAYER = True             # run the two-layer model in this test
    RUN_LOGISTICREG = True          # run logistic regression in this test
    RUN_1LAYER_RS_2LAYER_LR = True  # run layer-1 RiskSLIM + layer-2 LR
    RUN_1LAYER_LR_2LAYER_RS = True  # run layer-1 LR + layer-2 RiskSLIM

    # Walk the binning file once. A change of var_name starts a new run:
    # record where it starts and open a fresh interval list for it. Opening
    # the list at the start of a run (rather than storing it when the run
    # ends) also handles a variable whose single bin is the last row.
    previous_name = None
    for row_no, (name, interval) in enumerate(zip(split_file.var_name, split_file.split_list)):
        if row_no == 0 or name != previous_name:
            var_start_with[name] = row_no
            var_to_be_bin.append(name)
            var_split_list[name] = []
        var_split_list[name].append(interval)
        previous_name = name

    # Read the subscale grouping from the config file.
    config_path = "config/subscale.xml"
    rootNode = parse(config_path).documentElement
    if rootNode.nodeName == "root":
        for sub in rootNode.getElementsByTagName("subscale"):
            members = [node.childNodes[0].data for node in sub.getElementsByTagName("var")]
            var_all.extend(members)
            subscales[sub.getAttribute("name")] = members
    var_not_to_be_bin = list(set(var_all).difference(var_to_be_bin))
    ### Data preprocessing END

    # Print a summary.
    print("============================================================================================================")
    print('共有', len(var_all), '个变量')
    print('分箱变量有', len(var_to_be_bin), '个,不分箱的有', len(var_not_to_be_bin), '个')
    print('其中分箱变量的分箱信息:')
    pprint.pprint(var_split_list)
    print('从配置文件:', config_path, '中读取的subscale信息如下:')
    pprint.pprint(subscales)
    print("============================================================================================================")

    # Once the one-hot files exist they do not need regenerating on every run.
    if NEED_GENERATE_ONE_HOT_CSV:
        # One one-hot CSV is generated per subscale.
        for key in subscales:
            generateOneHotByList(subscales[key], var_split_list, 'onehot/'+key +'.csv')
        # Insert the label column RiskPerformance as the first column of each file.
        for key in subscales:
            csv_path = 'onehot/' + key + '.csv'
            temp_csv = pd.read_csv(csv_path)
            temp_csv.insert(0, 'RiskPerformance', original_data['target'])
            temp_csv.to_csv(csv_path, index=0)
        print('====================================所有subscale的one-hot编码文件生成成功===============================')
    else:
        print('---直接使用了onehot文件夹中以前已生成过的数据来运行算法')

    ### Algorithm runs BEGIN
    if RUN_RISKSLIM:
        # riskslim_in_use.run returns (pred_y, pred_y_prob, test_y); the other
        # model modules below return (test_y, pred_y, pred_y_prob).
        pred_y_riskslim, pred_y_prob_riskslim, test_y_riskslim = riskslim_in_use.run('onehot/ALL_IN_ONE.csv')
        printPerformance('RiskSlim', test_y_riskslim, pred_y_riskslim, pred_y_prob_riskslim)
    else:
        print('---不运行RiskSLIM')

    if RUN_LOGISTICREG:
        # Fixed random_state makes the split reproducible. The original note
        # says this keeps the train/test data identical across the compared
        # algorithms - presumably the other modules use the same split; verify.
        train_X_lr, test_X_lr, train_y_lr, test_y_lr = train_test_split(
            original_data_onehot.drop(['RiskPerformance'], axis=1),
            original_data_onehot['RiskPerformance'],
            test_size=0.3,
            random_state=666,
        )
        # Convert to an array so printPerformance pairs labels by position.
        test_y_lr = np.array(test_y_lr)
        lr = LogisticRegression(max_iter=1000000)
        lr.fit(train_X_lr, train_y_lr)
        pred_y_lr = lr.predict(test_X_lr)
        pred_y_prob_lr = lr.predict_proba(test_X_lr)[:, 1]
        printPerformance('LogisticReg', test_y_lr, pred_y_lr, pred_y_prob_lr)
    else:
        print('---不运行逻辑回归')

    if RUN_TWOLAYER:
        test_y_twolayer, pred_y_twolayer, pred_y_prob_twolayer = original_two_layer_model.run(subscales, var_split_list)
        printPerformance('Two-layer-model', np.array(test_y_twolayer), pred_y_twolayer, pred_y_prob_twolayer)
    else:
        print('---不运行双层模型')

    if RUN_1LAYER_RS_2LAYER_LR:
        test_y_riskslim_lr, pred_y_riskslim_lr, pred_y_prob_riskslim_lr = RiskSLIM_1_LR_2.run(subscales, var_split_list)
        printPerformance('1-layer RiskSLIM | 2-layer LR', test_y_riskslim_lr, pred_y_riskslim_lr, pred_y_prob_riskslim_lr)
    else:
        print('---不运行第一层RiskSLIM第二层LR的模型')

    if RUN_1LAYER_LR_2LAYER_RS:
        test_y_lr_riskslim, pred_y_lr_riskslim, pred_y_prob_lr_riskslim = LR_1_RiskSLIM_2.run(subscales, var_split_list)
        printPerformance('1-layer LR | 2-layer RiskSLIM', test_y_lr_riskslim, pred_y_lr_riskslim, pred_y_prob_lr_riskslim)
    else:
        print('---不运行第一层LR第二层RiskSLIM的模型')
    ### Algorithm runs END

    p1 = time.process_time()
    print('运行时间: %s 秒' % (p1-p0))