Better manual feature engineering#
featuretools offers automated feature engineering, but its dfs is painfully slow.
Instead, we hand-craft new domain features: ratios, products, differences, and counts, on top of the agg features built earlier. These should give the model a solid lift. Along the way we do some necessary outlier handling.
Finally, k-fold LightGBM gives a stable result, and we drop the previously recorded no_importance_features to speed up training.
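A minimal sketch of the feature patterns used throughout this section (the frame and its values are made up; the column names merely echo application_train):

import pandas as pd
import numpy as np

toy = pd.DataFrame({
    'AMT_CREDIT':   [400000.0, 250000.0],
    'AMT_ANNUITY':  [20000.0, np.nan],
    'EXT_SOURCE_1': [0.5, 0.7],
    'EXT_SOURCE_2': [0.6, np.nan],
})

# Ratio: the +1 in the denominator guards against division by zero
toy['CREDIT_TO_ANNUITY'] = toy['AMT_CREDIT'] / (toy['AMT_ANNUITY'] + 1)
# Product: a NaN in either factor propagates to the result
toy['EXT_PROD'] = toy['EXT_SOURCE_1'] * toy['EXT_SOURCE_2']
# Row-wise statistic over a family of columns (mean skips NaN by default)
toy['EXT_MEAN'] = toy[['EXT_SOURCE_1', 'EXT_SOURCE_2']].mean(axis=1)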
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import lightgbm as lgb
from lightgbm import LGBMClassifier
import warnings
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import KFold,StratifiedKFold
import re
from contextlib import contextmanager
import time
import os
from lightgbm import early_stopping, log_evaluation
gc.enable()
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.figsize'] = (8,6)
plt.rcParams['figure.dpi'] = 100
Shared helper functions#
def submit(ids, pred, name, feature_count=None):
    """
    ids: SK_ID_CURR of the test set
    pred: predicted probabilities from the model
    name: a tag for the experiment (e.g. 'lgb_v1', 'baseline')
    feature_count: optional, records how many features the model used
    """
    # 1. Build the submission DataFrame
    submit_df = pd.DataFrame({
        'SK_ID_CURR': ids,
        'TARGET': pred
    })
    # 2. Timestamp (format: 0213_1530)
    timestamp = time.strftime("%m%d_%H%M")
    # 3. Build the file name
    #    e.g. 0213_1530_lgb_v1_f542.csv
    f_str = f"_f{feature_count}" if feature_count else ""
    filename = f"{timestamp}_{name}{f_str}.csv"
    # 4. Make sure the output directory exists
    if not os.path.exists('submissions'):
        os.makedirs('submissions')
    save_path = os.path.join('submissions', filename)
    # 5. Save the file and return the frame
    submit_df.to_csv(save_path, index=False)
    return submit_df
def onehot_encoder(df, nan_as_category=True):
    """
    df: input DataFrame
    nan_as_category: whether to treat NaN as its own category when encoding
    return: the encoded df and the list of newly created columns
    """
original_columns = df.columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
df = pd.get_dummies(df, columns=categorical_cols, dummy_na=nan_as_category)
new_columns = [col for col in df.columns if col not in original_columns]
return df, new_columns
K-fold splits the data into several folds: each round, one fold is held out for validation while the rest are used for training, which yields a more stable estimate.
stratified=True means stratified sampling, which keeps the positive/negative ratio the same in every fold.
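A quick self-contained sketch of the difference, on toy data with roughly the 9:1 class imbalance of this competition:

import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold

X = np.arange(100).reshape(-1, 1)
y = np.array([0] * 90 + [1] * 10)  # 9:1 imbalance

for name, splitter in [('KFold', KFold(5, shuffle=True, random_state=0)),
                       ('StratifiedKFold', StratifiedKFold(5, shuffle=True, random_state=0))]:
    # Positive rate in each validation fold
    ratios = [y[va].mean() for _, va in splitter.split(X, y)]
    print(name, [f'{r:.2f}' for r in ratios])
# StratifiedKFold keeps the positive rate at 0.10 in every fold;
# plain KFold lets it drift from fold to fold.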
def kfold_lightgbm(df, num_folds, stratified=False, debug=True):
    def clean_names(df):
        # Replace every character that is not a letter, digit or underscore;
        # the regex [^A-Za-z0-9_] catches spaces, slashes, brackets and other
        # special characters that LightGBM refuses in feature names
        df.columns = [re.sub(r'[^A-Za-z0-9_]+', '_', col) for col in df.columns]
        # Collapse any runs of underscores (e.g. '__') and trim the ends
        df.columns = [re.sub(r'_+', '_', col).strip('_') for col in df.columns]
        return df
    df = clean_names(df)
train_df = df[df['TARGET'].notnull()]
test_df = df[df['TARGET'].isnull()]
if stratified:
fold = StratifiedKFold(n_splits=num_folds, shuffle=True)
else:
fold = KFold(n_splits=num_folds, shuffle=True)
features = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
feature_importance_df = pd.DataFrame()
out_of_fold_preds = np.zeros(train_df.shape[0])
submit_preds = np.zeros(test_df.shape[0])
for n_fold, (train_idx, valid_idx) in enumerate(fold.split(train_df[features], train_df['TARGET'])):
dtrain = lgb.Dataset(data=train_df[features].iloc[train_idx],
label = train_df['TARGET'].iloc[train_idx],
free_raw_data=False
)
dvalid = lgb.Dataset(data=train_df[features].iloc[valid_idx],
label = train_df['TARGET'].iloc[valid_idx],
free_raw_data=False
)
        params = {
            'objective': 'binary',
            'metric': 'auc',
            'max_depth': 8,
            'num_leaves': 40,
            'min_child_samples': 30,
            'learning_rate': 0.02,
            'verbosity': -1,
            # Feature and row subsampling (adds randomness, curbs overfitting)
            'feature_fraction': 0.8,  # use 80% of the features per iteration
            'bagging_fraction': 0.8,  # use 80% of the rows per iteration
            'bagging_freq': 5,        # resample every 5 rounds
            'lambda_l1': 0.1,
            'lambda_l2': 0.1
        }
        clf = lgb.train(
            params=params,
            train_set=dtrain,
            valid_sets=[dtrain, dvalid],
            num_boost_round=10000,  # up to 10,000 trees
            callbacks=[
                early_stopping(stopping_rounds=200),  # stop after 200 rounds without AUC improvement
                log_evaluation(period=100)            # report progress every 100 rounds
            ]
        )
out_of_fold_preds[valid_idx] = clf.predict(dvalid.data)
submit_preds += clf.predict(test_df[features]) / fold.n_splits
fold_importance_df = pd.DataFrame()
fold_importance_df["feature"] = features
fold_importance_df["importance"] = clf.feature_importance(importance_type='gain')
fold_importance_df["fold"] = n_fold + 1
feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(dvalid.label, out_of_fold_preds[valid_idx])))
del clf, dtrain, dvalid
gc.collect()
if not debug:
submit(test_df['SK_ID_CURR'], submit_preds, 'lgbm_folds', feature_count=len(features))
print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], out_of_fold_preds))
return feature_importance_df
def plot_importances(feature_importance_df):
cols = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:30].index
best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]
plt.figure(figsize=(8, 10))
sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
application_train/test#
def application_train_test(nrows=None, nan_as_category=True):
    """Load application_train/test, add the hand-crafted ratio and
    EXT_SOURCE features, and one-hot encode the categoricals."""
app_train = pd.read_csv('data/application_train.csv', nrows=nrows)
app_test = pd.read_csv('data/application_test.csv', nrows=nrows)
print(f'train {len(app_train)}, test {len(app_test)}')
app = pd.concat([app_train, app_test])
app = app.reset_index()
print(f'app {len(app)}')
    # 365243 is a placeholder for missing employment dates; treat it as NaN
    app['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
app['NEW_CREDIT_TO_ANNUITY_RATIO'] = app['AMT_CREDIT'] / (app['AMT_ANNUITY'] + 1)
app['NEW_CREDIT_TO_GOODS_RATIO'] = app['AMT_CREDIT'] / (app['AMT_GOODS_PRICE'] + 1)
app['NEW_EMPLOYED_TO_BIRTH_RATIO'] = app['DAYS_EMPLOYED'] / (app['DAYS_BIRTH'] + 1)
    app['NEW_ANNUITY_TO_INCOME_RATIO'] = app['AMT_ANNUITY'] / (app['AMT_INCOME_TOTAL'] + 1)
app['NEW_CREDIT_TO_INCOME_RATIO'] = app['AMT_CREDIT'] / (app['AMT_INCOME_TOTAL'] + 1)
app['NEW_EXT_SOURCE_PROD'] = app['EXT_SOURCE_1'] * app['EXT_SOURCE_2'] * app['EXT_SOURCE_3']
app['NEW_EXT_SOURCE_MEAN'] = app[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
app['NEW_EXT_SOURCE_STD'] = app[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
for bin_feature in ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CODE_GENDER']:
app[bin_feature], uniques = pd.factorize(app[bin_feature])
app, new_cat_features = onehot_encoder(app)
del app_train, app_test
gc.collect()
return app
bureau_and_balance#
def bureau_and_balance(nrows = None, nan_as_category = True):
bureau = pd.read_csv('data/bureau.csv',nrows=nrows)
balance = pd.read_csv('data/bureau_balance.csv', nrows=nrows)
print(f'bureau {bureau.shape}, balance {balance.shape}')
balance, balance_cat_cols = onehot_encoder(balance, nan_as_category)
bureau, bureau_cat_cols = onehot_encoder(bureau, nan_as_category)
# balance
balance_aggregations = {
'MONTHS_BALANCE': ['min', 'max', 'size']
}
for col in balance_cat_cols:
balance_aggregations[col] = ['mean']
balance_agg = balance.groupby('SK_ID_BUREAU').agg(balance_aggregations)
balance_agg.columns = pd.Index([col[0] + '_' + col[1].upper() for col in balance_agg.columns.tolist()])
bureau = bureau.join(balance_agg, how='left', on='SK_ID_BUREAU')
bureau = bureau.drop(columns=['SK_ID_BUREAU'])
del balance, balance_agg
gc.collect()
# bureau
num_aggregations = {
'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
'DAYS_CREDIT_UPDATE': ['mean'],
'CREDIT_DAY_OVERDUE': ['max', 'mean'],
'AMT_CREDIT_MAX_OVERDUE': ['mean'],
'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
'AMT_CREDIT_SUM_OVERDUE': ['mean'],
'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
'AMT_ANNUITY': ['max', 'mean'],
'CNT_CREDIT_PROLONG': ['sum'],
'MONTHS_BALANCE_MIN': ['min'],
'MONTHS_BALANCE_MAX': ['max'],
'MONTHS_BALANCE_SIZE': ['mean', 'sum']
}
category_aggregations = {}
for col in bureau_cat_cols:
category_aggregations[col] = ['mean']
for col in balance_cat_cols:
category_aggregations[col + '_MEAN'] = ['mean']
bureau_agg = bureau.groupby(by='SK_ID_CURR').agg({**num_aggregations, **category_aggregations})
bureau_agg.columns = pd.Index(
['BUREAU_' + col[0] + '_' + col[1].upper() for col in bureau_agg.columns.tolist()]
)
    # bureau: aggregate only the rows where the credit is still Active
    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
    cols = active_agg.columns  # keep the (column, agg) pairs for the ratio features below
active_agg.columns = pd.Index(
['ACTIVE_' + col[0] +'_' + col[1].upper() for col in active_agg.columns.tolist()]
)
bureau_agg = bureau_agg.join(active_agg, on='SK_ID_CURR', how='left')
del active_agg, active
gc.collect()
    # bureau: aggregate only the rows where the credit is Closed
closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
closed_agg.columns = pd.Index(['CLOSED_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')
del closed, closed_agg
gc.collect()
    # Active/Closed ratios, e.g. ACTIVE_AMT_CREDIT_SUM_SUM / CLOSED_AMT_CREDIT_SUM_SUM
    for col in cols:
        bureau_agg['NEW_RATIO_BUREAU_' + col[0] + "_" + col[1].upper()] = \
            bureau_agg['ACTIVE_' + col[0] + "_" + col[1].upper()] / bureau_agg['CLOSED_' + col[0] + "_" + col[1].upper()]
del bureau
gc.collect()
return bureau_agg
Active and Closed go through the same aggregation logic but carry different business meanings: Active reflects the client's current repayment burden, while Closed reflects their past credit record. They are two distinct kinds of signal, and mixing them together would blur what each one means.
This mirrors the `where` primitive in the featuretools (ft) toolkit.
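As a rough pandas analogue, this "filter, then aggregate" pattern could be wrapped in a helper (agg_where is a hypothetical name, not part of the code above):

def agg_where(df, mask, aggs, key='SK_ID_CURR', prefix=''):
    """Aggregate only the rows where `mask` holds -- the pandas
    analogue of a featuretools `where` primitive (illustrative helper)."""
    sub_agg = df[mask].groupby(key).agg(aggs)
    sub_agg.columns = pd.Index([f'{prefix}{c[0]}_{c[1].upper()}'
                                for c in sub_agg.columns])
    return sub_agg

# e.g. active_agg = agg_where(bureau, bureau['CREDIT_ACTIVE_Active'] == 1,
#                             num_aggregations, prefix='ACTIVE_')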
previous_applications#
def previous_applications(nrows = None, nan_as_category = True):
prev = pd.read_csv('data/previous_application.csv',nrows=nrows)
print(f'prev {prev.shape}')
prev, cat_cols = onehot_encoder(prev, nan_as_category)
    # 365243 is the same 'missing' sentinel seen in DAYS_EMPLOYED
    for col in ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
                'DAYS_LAST_DUE', 'DAYS_TERMINATION']:
        prev[col] = prev[col].replace(365243, np.nan)
    # Share of the requested amount relative to the amount actually granted
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
num_aggregations = {
'AMT_ANNUITY': ['min', 'max', 'mean'],
'AMT_APPLICATION': ['min', 'max', 'mean'],
'AMT_CREDIT': ['min', 'max', 'mean'],
'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
'DAYS_DECISION': ['min', 'max', 'mean'],
'CNT_PAYMENT': ['mean', 'sum'],
}
cat_aggregations = {}
for cat in cat_cols:
cat_aggregations[cat] = ['mean']
prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
# Previous Applications: Approved Applications - only numerical features
approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    cols = approved_agg.columns.tolist()  # keep the (column, agg) pairs for the ratio features below
approved_agg.columns = pd.Index(['APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')
# Previous Applications: Refused Applications - only numerical features
refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
refused_agg.columns = pd.Index(['REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
del refused, refused_agg, approved, approved_agg, prev
    # Approved/Refused ratios, e.g. APPROVED_AMT_CREDIT_MEAN / REFUSED_AMT_CREDIT_MEAN
    for e in cols:
        prev_agg['NEW_RATIO_PREV_' + e[0] + "_" + e[1].upper()] = \
            prev_agg['APPROVED_' + e[0] + "_" + e[1].upper()] / prev_agg['REFUSED_' + e[0] + "_" + e[1].upper()]
gc.collect()
return prev_agg
Likewise, approved and refused carry different meanings: APPROVED_AMT_CREDIT is the credit line the bank had enough confidence to grant, while REFUSED_AMT_CREDIT is the amount the client wanted but failed to borrow.
POS_CASH_balance#
def pos_cash(nrows = None , nan_as_category = True):
pos = pd.read_csv('data/POS_CASH_balance.csv', nrows = nrows)
pos, cat_cols = onehot_encoder(pos, nan_as_category)
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],      # days past due
        'SK_DPD_DEF': ['max', 'mean']   # days past due, with tolerance for small debts
    }
for cat in cat_cols:
aggregations[cat] = ['mean']
pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
del pos
gc.collect()
return pos_agg
installments_payments#
def installments_payments(nrows = None , nan_as_category = True):
ins = pd.read_csv('data/installments_payments.csv', nrows = nrows)
    ins, cat_cols = onehot_encoder(ins, nan_as_category)
    # Share of the instalment actually paid, and the absolute shortfall
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
    # DPD: days past due; DBD: days paid before the due date
    ins['DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    ins['DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    # Keep only the positive part (note: this apply also maps NaN to 0)
    ins['DPD'] = ins['DPD'].apply(lambda x: x if x > 0 else 0)
    ins['DBD'] = ins['DBD'].apply(lambda x: x if x > 0 else 0)
aggregations = {
'NUM_INSTALMENT_VERSION': ['nunique'],
'DPD': ['max', 'mean', 'sum'],
'DBD': ['max', 'mean', 'sum'],
'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
'AMT_INSTALMENT': ['max', 'mean', 'sum'],
'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
}
for cat in cat_cols:
aggregations[cat] = ['mean']
ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()
del ins
gc.collect()
return ins_agg
credit_card_balance#
def credit_card_balance(nrows=None, nan_as_category=True):
    cc = pd.read_csv('data/credit_card_balance.csv', nrows=nrows)
    cc, cat_cols = onehot_encoder(cc, nan_as_category)
    cc.drop(['SK_ID_PREV'], axis=1, inplace=True)
    # Aggregate every numeric column except the group key itself
    numeric_cols = [c for c in cc.columns if cc[c].dtype != 'object' and c != 'SK_ID_CURR']
    num_aggregations = {col: ['mean', 'sum', 'var'] for col in numeric_cols}
category_aggregations = {}
for col in cat_cols:
category_aggregations[col] = ['mean']
cc_agg = cc.groupby('SK_ID_CURR').agg({**num_aggregations, **category_aggregations})
cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
del cc
gc.collect()
return cc_agg
main#
@contextmanager
def timer(title):
t0 = time.time()
    yield  # run the body of the with-block
print(f'{title} done in {time.time() - t0:.0f}s')
def get_last_no_imp_features():
    """Load the feature importances saved by the previous run and return the
    features whose mean importance is zero, so they can be dropped."""
try:
features_importance_df = pd.read_feather('features_importance_df.feather')
low_importance_features = features_importance_df.groupby('feature')['importance'].mean()
low_importance_features = low_importance_features[low_importance_features == 0].index.tolist()
return low_importance_features
except FileNotFoundError:
return []
def main(debug = False):
nrows = 10000 if debug else None
df = application_train_test(nrows)
with timer('Process bureau and bureau balance'):
bureau = bureau_and_balance(nrows)
print(f'Bureau shape :{bureau.shape}')
df = df.join(bureau, on='SK_ID_CURR', how='left')
del bureau
gc.collect()
with timer("Process previous_applications"):
prev = previous_applications(nrows)
print("Previous applications df shape:", prev.shape)
df = df.join(prev, how='left', on='SK_ID_CURR')
del prev
gc.collect()
with timer("Process POS-CASH balance"):
pos = pos_cash(nrows)
print("Pos-cash balance df shape:", pos.shape)
df = df.join(pos, how='left', on='SK_ID_CURR')
del pos
gc.collect()
with timer("Process installments payments"):
ins = installments_payments(nrows)
print("Installments payments df shape:", ins.shape)
df = df.join(ins, how='left', on='SK_ID_CURR')
del ins
gc.collect()
with timer("Process credit card balance"):
cc = credit_card_balance(nrows)
print("Credit card balance df shape:", cc.shape)
df = df.join(cc, how='left', on='SK_ID_CURR')
del cc
gc.collect()
with timer('Run lgbm with kfold'):
no_imp_features = get_last_no_imp_features()
features_to_drop = list(set(no_imp_features) & set(df.columns))
df = df.drop(columns=features_to_drop)
print(f'final DF :{df.shape}')
features_importance_df = kfold_lightgbm(df, num_folds=5, debug=debug)
features_importance_df.to_feather('features_importance_df.feather')
with timer('full run'):
main()
train 307511, test 48744
app 356255
bureau (1716428, 17), balance (27299925, 3)
Bureau shape :(305811, 143)
Process bureau and bureau balance done in 17s
prev (1670214, 37)
Previous applications df shape: (338857, 279)
Process previous_applications done in 21s
Pos-cash balance df shape: (337252, 18)
Process POS-CASH balance done in 12s
Installments payments df shape: (339587, 26)
Process installments payments done in 23s
Credit card balance df shape: (103558, 72)
Process credit card balance done in 14s
final DF :(356255, 720)
Training until validation scores don't improve for 200 rounds
[100] training's auc: 0.78238 valid_1's auc: 0.765774
[200] training's auc: 0.8061 valid_1's auc: 0.778186
[300] training's auc: 0.822628 valid_1's auc: 0.784157
[400] training's auc: 0.836671 valid_1's auc: 0.787432
[500] training's auc: 0.848619 valid_1's auc: 0.789125
[600] training's auc: 0.859644 valid_1's auc: 0.790174
[700] training's auc: 0.869208 valid_1's auc: 0.79104
[800] training's auc: 0.878829 valid_1's auc: 0.791669
[900] training's auc: 0.88704 valid_1's auc: 0.792334
[1000] training's auc: 0.894494 valid_1's auc: 0.792548
[1100] training's auc: 0.901095 valid_1's auc: 0.79254
Early stopping, best iteration is:
[993] training's auc: 0.893959 valid_1's auc: 0.792591
Fold 1 AUC : 0.792591
Training until validation scores don't improve for 200 rounds
[100] training's auc: 0.782763 valid_1's auc: 0.759153
[200] training's auc: 0.807392 valid_1's auc: 0.772789
[300] training's auc: 0.823934 valid_1's auc: 0.779224
[400] training's auc: 0.837874 valid_1's auc: 0.783119
[500] training's auc: 0.849965 valid_1's auc: 0.785261
[600] training's auc: 0.860856 valid_1's auc: 0.786782
[700] training's auc: 0.870219 valid_1's auc: 0.78738
[800] training's auc: 0.879139 valid_1's auc: 0.787911
[900] training's auc: 0.887327 valid_1's auc: 0.788308
[1000] training's auc: 0.895203 valid_1's auc: 0.788714
[1100] training's auc: 0.902198 valid_1's auc: 0.788899
[1200] training's auc: 0.908633 valid_1's auc: 0.789061
[1300] training's auc: 0.914877 valid_1's auc: 0.789003
[1400] training's auc: 0.920659 valid_1's auc: 0.789187
[1500] training's auc: 0.925779 valid_1's auc: 0.789444
[1600] training's auc: 0.930208 valid_1's auc: 0.789508
[1700] training's auc: 0.934451 valid_1's auc: 0.789575
[1800] training's auc: 0.93854 valid_1's auc: 0.789671
[1900] training's auc: 0.942259 valid_1's auc: 0.789874
[2000] training's auc: 0.945858 valid_1's auc: 0.789743
Early stopping, best iteration is:
[1885] training's auc: 0.941636 valid_1's auc: 0.789935
Fold 2 AUC : 0.789935
Training until validation scores don't improve for 200 rounds
[100] training's auc: 0.782443 valid_1's auc: 0.760835
[200] training's auc: 0.806232 valid_1's auc: 0.774092
[300] training's auc: 0.822842 valid_1's auc: 0.780568
[400] training's auc: 0.836422 valid_1's auc: 0.783767
[500] training's auc: 0.848625 valid_1's auc: 0.786248
[600] training's auc: 0.859569 valid_1's auc: 0.787689
[700] training's auc: 0.869923 valid_1's auc: 0.789037
[800] training's auc: 0.879094 valid_1's auc: 0.789732
[900] training's auc: 0.887098 valid_1's auc: 0.790292
[1000] training's auc: 0.894593 valid_1's auc: 0.790652
[1100] training's auc: 0.901143 valid_1's auc: 0.79099
[1200] training's auc: 0.907911 valid_1's auc: 0.791169
[1300] training's auc: 0.913411 valid_1's auc: 0.791089
[1400] training's auc: 0.918854 valid_1's auc: 0.791133
Early stopping, best iteration is:
[1201] training's auc: 0.907944 valid_1's auc: 0.791183
Fold 3 AUC : 0.791183
Training until validation scores don't improve for 200 rounds
[100] training's auc: 0.783452 valid_1's auc: 0.759775
[200] training's auc: 0.80759 valid_1's auc: 0.772697
[300] training's auc: 0.823811 valid_1's auc: 0.778664
[400] training's auc: 0.83728 valid_1's auc: 0.781591
[500] training's auc: 0.849105 valid_1's auc: 0.783588
[600] training's auc: 0.860079 valid_1's auc: 0.7851
[700] training's auc: 0.869841 valid_1's auc: 0.786359
[800] training's auc: 0.879002 valid_1's auc: 0.787141
[900] training's auc: 0.887656 valid_1's auc: 0.787323
[1000] training's auc: 0.895257 valid_1's auc: 0.787467
[1100] training's auc: 0.902494 valid_1's auc: 0.787827
[1200] training's auc: 0.908634 valid_1's auc: 0.788189
[1300] training's auc: 0.914872 valid_1's auc: 0.788379
[1400] training's auc: 0.920066 valid_1's auc: 0.788461
[1500] training's auc: 0.92536 valid_1's auc: 0.788412
[1600] training's auc: 0.930068 valid_1's auc: 0.788697
[1700] training's auc: 0.934425 valid_1's auc: 0.78877
[1800] training's auc: 0.938771 valid_1's auc: 0.788662
[1900] training's auc: 0.94271 valid_1's auc: 0.788981
[2000] training's auc: 0.946599 valid_1's auc: 0.788999
[2100] training's auc: 0.949967 valid_1's auc: 0.788967
Early stopping, best iteration is:
[1927] training's auc: 0.944003 valid_1's auc: 0.789081
Fold 4 AUC : 0.789081
Training until validation scores don't improve for 200 rounds
[100] training's auc: 0.782267 valid_1's auc: 0.760801
[200] training's auc: 0.805775 valid_1's auc: 0.774578
[300] training's auc: 0.822456 valid_1's auc: 0.781059
[400] training's auc: 0.837006 valid_1's auc: 0.784653
[500] training's auc: 0.848933 valid_1's auc: 0.786817
[600] training's auc: 0.860147 valid_1's auc: 0.788454
[700] training's auc: 0.869955 valid_1's auc: 0.789242
[800] training's auc: 0.879022 valid_1's auc: 0.789719
[900] training's auc: 0.887043 valid_1's auc: 0.790111
[1000] training's auc: 0.894709 valid_1's auc: 0.790376
[1100] training's auc: 0.901814 valid_1's auc: 0.790596
[1200] training's auc: 0.90801 valid_1's auc: 0.790917
[1300] training's auc: 0.914572 valid_1's auc: 0.791167
[1400] training's auc: 0.920002 valid_1's auc: 0.791147
[1500] training's auc: 0.92504 valid_1's auc: 0.791326
[1600] training's auc: 0.929904 valid_1's auc: 0.79151
[1700] training's auc: 0.934097 valid_1's auc: 0.791493
[1800] training's auc: 0.938231 valid_1's auc: 0.791423
Early stopping, best iteration is:
[1649] training's auc: 0.931961 valid_1's auc: 0.791637
Fold 5 AUC : 0.791637
Full AUC score 0.790754
Run lgbm with kfold done in 905s
full run done in 996s
Score: 78.9.
plot_importances(features_importance_df)