Better Manual Feature Engineering#

featuretools gives us automated feature engineering, but its dfs is painfully slow.

Instead we build new domain features by hand, which should give the model a real lift: ratios, products, differences, and counts, on top of the aggregation approach used before. Along the way we do some necessary outlier handling.

Finally, a KFold LightGBM run gives a stable result, and we drop the previously recorded no_importance_features to speed up training. A quick preview of the ratio-feature pattern follows.
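A minimal, self-contained sketch of that pattern (the toy DataFrame is hypothetical; the +1 in the denominator is the same division-by-zero guard used in the real features below):

import pandas as pd

# Hypothetical two-row stand-in for application_train
toy = pd.DataFrame({
    'AMT_CREDIT': [406597.5, 1293502.5],
    'AMT_ANNUITY': [24700.5, 35698.5],
    'AMT_INCOME_TOTAL': [202500.0, 270000.0],
})
# Ratio features: +1 guards against a zero denominator
toy['NEW_CREDIT_TO_INCOME_RATIO'] = toy['AMT_CREDIT'] / (toy['AMT_INCOME_TOTAL'] + 1)
toy['NEW_CREDIT_TO_ANNUITY_RATIO'] = toy['AMT_CREDIT'] / (toy['AMT_ANNUITY'] + 1)
print(toy.filter(like='NEW_'))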

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import lightgbm as lgb
from lightgbm import LGBMClassifier
import warnings
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import KFold,StratifiedKFold
import re
from contextlib import contextmanager
import time
import os
from lightgbm import early_stopping, log_evaluation

gc.enable()
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can render CJK plot labels
plt.rcParams['axes.unicode_minus'] = False    # render minus signs correctly with this font
plt.rcParams['figure.figsize'] = (8,6) 
plt.rcParams['figure.dpi'] = 100

Utility Functions#

def submit(ids, pred, name, feature_count=None):
    """
    ids: SK_ID_CURR of the test set
    pred: predicted probabilities from the model
    name: a note for the experiment (e.g. 'lgb_v1', 'baseline')
    feature_count: optional, records how many features the model used
    """
    # 1. Build the submission DataFrame
    submit_df = pd.DataFrame({
        'SK_ID_CURR': ids,
        'TARGET': pred
    })

    # 2. Generate a timestamp (format: 0213_1530)
    timestamp = time.strftime("%m%d_%H%M")
    
    # 3. Build the filename
    # Format: 0213_1530_lgb_v1_f542.csv
    f_str = f"_f{feature_count}" if feature_count else ""
    filename = f"{timestamp}_{name}{f_str}.csv"
    
    # 4. Make sure the output directory exists
    os.makedirs('submissions', exist_ok=True)
    
    save_path = os.path.join('submissions', filename)
    
    # 5. Save the submission
    submit_df.to_csv(save_path, index=False)
    
    return submit_df
def onehot_encoder(df, nan_as_category=True):
    """
    df: input DataFrame
    nan_as_category: whether to treat NaN as its own category when encoding

    return: (encoded DataFrame, list of newly created columns)
    """
    original_columns = df.columns.tolist()
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    df = pd.get_dummies(df, columns=categorical_cols, dummy_na=nan_as_category)
    new_columns = [col for col in df.columns if col not in original_columns]
    return df, new_columns

KFold splits the data into several parts; each round trains on all but one part and validates on the held-out part, which gives a more stable estimate.

  • stratified=True means stratified sampling: every fold keeps the same positive/negative ratio as the full dataset (see the sketch below)
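A minimal sketch of what stratification buys us, with hypothetical toy labels at a 10% positive rate mimicking the imbalanced TARGET:

import numpy as np
from sklearn.model_selection import StratifiedKFold

y = np.array([1] * 10 + [0] * 90)   # 10% positives
X = np.zeros((100, 1))

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for i, (_, valid_idx) in enumerate(skf.split(X, y)):
    # every validation fold keeps the overall 10% positive rate
    print(f'fold {i + 1}: positive rate = {y[valid_idx].mean():.2f}')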

def kfold_lightgbm(df, num_folds, stratified=False, debug=True):
    def clean_names(df):
        # Replace every non-alphanumeric character with an underscore;
        # the regex [^A-Za-z0-9_] catches spaces, slashes, parentheses, etc.
        # (LightGBM rejects feature names containing special JSON characters)
        df.columns = [re.sub(r'[^A-Za-z0-9_]+', '_', col) for col in df.columns]
        # Also collapse runs of underscores such as __
        df.columns = [re.sub(r'_+', '_', col).strip('_') for col in df.columns]
        return df
    df = clean_names(df)
    
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    if stratified:
        fold = StratifiedKFold(n_splits=num_folds, shuffle=True)
    else:
        fold = KFold(n_splits=num_folds, shuffle=True)

    features = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    feature_importance_df = pd.DataFrame()
    out_of_fold_preds = np.zeros(train_df.shape[0])
    submit_preds = np.zeros(test_df.shape[0])
    for n_fold, (train_idx, valid_idx) in enumerate(fold.split(train_df[features], train_df['TARGET'])):
        dtrain = lgb.Dataset(data=train_df[features].iloc[train_idx],
            label = train_df['TARGET'].iloc[train_idx],
            free_raw_data=False
        )
        dvalid = lgb.Dataset(data=train_df[features].iloc[valid_idx],
            label = train_df['TARGET'].iloc[valid_idx],
            free_raw_data=False
        )
        params = {
            'objective': 'binary',
            'metric': 'auc',
            'max_depth': 8,
            'num_leaves': 40,
            'min_child_samples': 30,
            'learning_rate': 0.02,
            'verbosity': -1,    

            # Feature and row subsampling (adds randomness, curbs overfitting)
            'feature_fraction': 0.8,   # use 80% of the features per iteration
            'bagging_fraction': 0.8,   # use 80% of the rows per iteration
            'bagging_freq': 5,         # resample every 5 iterations
            
            'lambda_l1': 0.1,
            'lambda_l2': 0.1  
        }
        clf = lgb.train(
            params = params,
            train_set=dtrain,
            valid_sets=[dtrain, dvalid],
            num_boost_round=10000, # up to 10000 trees; early stopping decides the actual count
            callbacks=[
                early_stopping(stopping_rounds=200),  # stop if valid AUC hasn't improved for 200 rounds
                log_evaluation(period=100) # report progress every 100 rounds
            ]
            ]
        )
        out_of_fold_preds[valid_idx] = clf.predict(dvalid.data)
        submit_preds += clf.predict(test_df[features]) / fold.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = features
        fold_importance_df["importance"] = clf.feature_importance(importance_type='gain')
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(dvalid.label, out_of_fold_preds[valid_idx])))
        del clf, dtrain, dvalid
        gc.collect()


    if not debug:
        submit(test_df['SK_ID_CURR'], submit_preds, 'lgbm_folds', feature_count=len(features))

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], out_of_fold_preds))
    return feature_importance_df
def plot_importances(feature_importance_df):
    cols = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:30].index
    best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()

application_train/test#

def application_train_test(nrows = None, nan_as_category = True):
    """
    Load application_train/test, stack them, and add hand-crafted ratio features.
    """
    app_train = pd.read_csv('data/application_train.csv', nrows=nrows)
    app_test = pd.read_csv('data/application_test.csv', nrows=nrows)
    print(f'train {len(app_train)}, test {len(app_test)}')
    app = pd.concat([app_train, app_test])
    app = app.reset_index()
    print(f'app {len(app)}')

    # 365243 is a sentinel for missing employment days; treat it as NaN
    app['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)

    app['NEW_CREDIT_TO_ANNUITY_RATIO'] = app['AMT_CREDIT'] / (app['AMT_ANNUITY'] + 1)
    app['NEW_CREDIT_TO_GOODS_RATIO'] = app['AMT_CREDIT'] / (app['AMT_GOODS_PRICE'] + 1)
    app['NEW_EMPLOYED_TO_BIRTH_RATIO'] = app['DAYS_EMPLOYED'] / (app['DAYS_BIRTH'] + 1)
    app['NEW_ANNUITY_TO_INCOME_RATIO'] = app['AMT_ANNUITY'] / (app['AMT_INCOME_TOTAL'] + 1)
    app['NEW_CREDIT_TO_INCOME_RATIO'] = app['AMT_CREDIT'] / (app['AMT_INCOME_TOTAL'] + 1)
    app['NEW_EXT_SOURCE_PROD'] = app['EXT_SOURCE_1'] * app['EXT_SOURCE_2'] * app['EXT_SOURCE_3']
    app['NEW_EXT_SOURCE_MEAN'] = app[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
    app['NEW_EXT_SOURCE_STD'] = app[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)

    for bin_feature in ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CODE_GENDER']:
        app[bin_feature], uniques = pd.factorize(app[bin_feature])
    app, new_cat_features = onehot_encoder(app)
    
    del app_train, app_test
    gc.collect()
    return app
    

bureau_and_balance#

def bureau_and_balance(nrows = None, nan_as_category = True):
    bureau = pd.read_csv('data/bureau.csv',nrows=nrows)
    balance = pd.read_csv('data/bureau_balance.csv', nrows=nrows)
    print(f'bureau {bureau.shape}, balance {balance.shape}')
    balance, balance_cat_cols = onehot_encoder(balance, nan_as_category)
    bureau, bureau_cat_cols = onehot_encoder(bureau, nan_as_category)

    # balance
    balance_aggregations = {
        'MONTHS_BALANCE': ['min', 'max', 'size']
    }
    for col in balance_cat_cols:
        balance_aggregations[col] = ['mean']
    balance_agg = balance.groupby('SK_ID_BUREAU').agg(balance_aggregations)
    balance_agg.columns = pd.Index([col[0] + '_' + col[1].upper() for col in balance_agg.columns.tolist()])
    bureau = bureau.join(balance_agg, how='left', on='SK_ID_BUREAU')
    bureau = bureau.drop(columns=['SK_ID_BUREAU'])
    del balance, balance_agg
    gc.collect()

    # bureau
    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }
    category_aggregations = {}
    for col in bureau_cat_cols:
        category_aggregations[col] = ['mean']
    for col in balance_cat_cols:
        category_aggregations[col + '_MEAN'] = ['mean']
    bureau_agg = bureau.groupby(by='SK_ID_CURR').agg({**num_aggregations, **category_aggregations})
    bureau_agg.columns = pd.Index(
        ['BUREAU_' + col[0] + '_' + col[1].upper() for col in bureau_agg.columns.tolist()]
    )

    # bureau: aggregations restricted to active credits (the 'where' split)
    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
    cols =  active_agg.columns
    active_agg.columns = pd.Index(
        ['ACTIVE_' + col[0] +'_' + col[1].upper() for col in active_agg.columns.tolist()]
    )
    bureau_agg = bureau_agg.join(active_agg, on='SK_ID_CURR', how='left')
    del active_agg, active
    gc.collect()

    # bureau: aggregations restricted to closed credits
    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
    closed_agg.columns = pd.Index(['CLOSED_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')
    del closed, closed_agg
    gc.collect()

    for col in cols:
        bureau_agg['NEW_RATIO_BUREAU_' + col[0] + "_" + col[1].upper()] = bureau_agg['ACTIVE_' + col[0] + "_" + col[1].upper()] / bureau_agg['CLOSED_' + col[0] + "_" + col[1].upper()]
    
    del bureau
    gc.collect()
    
    return bureau_agg

active/closed use the same aggregation logic but carry different business meanings: active reflects current repayment pressure, closed reflects past credit history. They are two distinct classes of signal, and lumping them together would blur the meaning.

This mirrors the `where` mechanism in featuretools; a plain-pandas sketch of the same pattern follows.
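For illustration, the where pattern in miniature (toy data with hypothetical values):

import pandas as pd

# One row per credit, keyed by client id
toy = pd.DataFrame({
    'SK_ID_CURR': [1, 1, 1, 2, 2],
    'CREDIT_ACTIVE': ['Active', 'Closed', 'Closed', 'Active', 'Active'],
    'AMT_CREDIT_SUM': [100.0, 50.0, 70.0, 200.0, 30.0],
})

# 'where'-style aggregates over subsets instead of one unconditional aggregate
active = toy[toy['CREDIT_ACTIVE'] == 'Active'].groupby('SK_ID_CURR')['AMT_CREDIT_SUM'].mean()
closed = toy[toy['CREDIT_ACTIVE'] == 'Closed'].groupby('SK_ID_CURR')['AMT_CREDIT_SUM'].mean()
# The NEW_RATIO_* idea: current pressure relative to past history;
# client 2 has no closed credits, so the ratio is NaN, as in the real pipeline
print(active / closed)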

previous_applications#

def previous_applications(nrows = None, nan_as_category = True):
    prev  = pd.read_csv('data/previous_application.csv',nrows=nrows)
    print(f'prev {prev.shape}')
    prev, cat_cols = onehot_encoder(prev, nan_as_category)

    # 365243 is the same missing-value sentinel as DAYS_EMPLOYED above
    for col in ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
                'DAYS_LAST_DUE', 'DAYS_TERMINATION']:
        prev[col] = prev[col].replace(365243, np.nan)
    
    # Ratio of the amount applied for to the amount finally credited
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']

    num_aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'mean'],
        'AMT_APPLICATION': ['min', 'max', 'mean'],
        'AMT_CREDIT': ['min', 'max', 'mean'],
        'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }

    cat_aggregations = {}
    for cat in cat_cols:
        cat_aggregations[cat] = ['mean']

    prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])

    # Previous Applications: Approved Applications - only numerical features
    approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    cols = approved_agg.columns.tolist()
    approved_agg.columns = pd.Index(['APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
    prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')
    
    # Previous Applications: Refused Applications - only numerical features
    refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
    refused_agg.columns = pd.Index(['REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
    prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
    del refused, refused_agg, approved, approved_agg, prev

    for e in cols:
        prev_agg['NEW_RATIO_PREV_' + e[0] + "_" + e[1].upper()] = prev_agg['APPROVED_' + e[0] + "_" + e[1].upper()] / prev_agg['REFUSED_' + e[0] + "_" + e[1].upper()]
    
    gc.collect()
    return prev_agg

Likewise, approved/refused carry different meanings: APPROVED_AMT_CREDIT is the credit the bank was confident extending to the client, while REFUSED_AMT_CREDIT is the amount the client wanted but did not get.
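One mechanical detail worth spelling out: groupby().agg() with a dict of lists yields MultiIndex columns, and the e[0]/e[1] indexing above flattens them into prefixed names. A minimal sketch with hypothetical data:

import pandas as pd

toy = pd.DataFrame({'SK_ID_CURR': [1, 1, 2], 'AMT_CREDIT': [10.0, 20.0, 30.0]})
agg = toy.groupby('SK_ID_CURR').agg({'AMT_CREDIT': ['min', 'max']})
# agg.columns is a MultiIndex of (column, stat) tuples:
# [('AMT_CREDIT', 'min'), ('AMT_CREDIT', 'max')]
agg.columns = pd.Index(['PREV_' + e[0] + '_' + e[1].upper() for e in agg.columns.tolist()])
print(agg.columns.tolist())  # ['PREV_AMT_CREDIT_MIN', 'PREV_AMT_CREDIT_MAX']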

POS_CASH_balance#

def pos_cash(nrows = None , nan_as_category = True):
    pos = pd.read_csv('data/POS_CASH_balance.csv', nrows = nrows)
    pos, cat_cols = onehot_encoder(pos, nan_as_category)
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
    pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])

    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
    
    del pos
    gc.collect()

    return pos_agg

installments_payments#

def installments_payments(nrows = None , nan_as_category = True):
    ins = pd.read_csv('data/installments_payments.csv', nrows = nrows)
    ins, cat_cols = onehot_encoder(ins, nan_as_category= True)

    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']

    # DPD: days past due (paid late); DBD: days before due (paid early)
    ins['DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    ins['DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = ins['DPD'].apply(lambda x: x if x > 0 else 0)
    ins['DBD'] = ins['DBD'].apply(lambda x: x if x > 0 else 0)

    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum'],
        'DBD': ['max', 'mean', 'sum'],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])

    ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()
    
    del ins
    gc.collect()
    return ins_agg

credit_card_balance#

def credit_card_balance(nrows = None, nan_as_category = True):
    cc = pd.read_csv('data/credit_card_balance.csv', nrows = nrows)
    cc, cat_cols = onehot_encoder(cc, nan_as_category= True)

    cc.drop(['SK_ID_PREV'], axis= 1, inplace = True)

    # Aggregate all numeric columns, excluding the groupby key itself
    numeric_cols = [c for c in cc.columns if cc[c].dtype != 'object' and c != 'SK_ID_CURR']
    num_aggregations = {col: ['mean', 'sum', 'var'] for col in numeric_cols}

    category_aggregations = {}
    for col in cat_cols:
        category_aggregations[col] = ['mean']
    
    cc_agg = cc.groupby('SK_ID_CURR').agg({**num_aggregations, **category_aggregations})
    cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])

    cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
    
    del cc
    gc.collect()
    return cc_agg

main#

@contextmanager
def timer(title):
    t0 = time.time()
    yield  # run the body of the with block
    print(f'{title} done in {time.time() - t0:.0f}s')  
def get_last_no_imp_features():
    """ 获取那些不重要的特征,以供剔除
    """
    try:
        features_importance_df = pd.read_feather('features_importance_df.feather')
        low_importance_features = features_importance_df.groupby('feature')['importance'].mean()
        low_importance_features = low_importance_features[low_importance_features == 0].index.tolist()
        return low_importance_features
    except FileNotFoundError:
        return []
def main(debug = False):
    nrows = 10000 if debug else None
    df = application_train_test(nrows)
    with timer('Process bureau and bureau balance'):
        bureau = bureau_and_balance(nrows)
        print(f'Bureau shape :{bureau.shape}')
        df = df.join(bureau, on='SK_ID_CURR', how='left')
        del bureau
        gc.collect()
    with timer("Process previous_applications"):
        prev = previous_applications(nrows)
        print("Previous applications df shape:", prev.shape)
        df = df.join(prev, how='left', on='SK_ID_CURR')
        del prev
        gc.collect()
    with timer("Process POS-CASH balance"):
        pos = pos_cash(nrows)
        print("Pos-cash balance df shape:", pos.shape)
        df = df.join(pos, how='left', on='SK_ID_CURR')
        del pos
        gc.collect()
    with timer("Process installments payments"):
        ins = installments_payments(nrows)
        print("Installments payments df shape:", ins.shape)
        df = df.join(ins, how='left', on='SK_ID_CURR')
        del ins
        gc.collect()
    with timer("Process credit card balance"):
        cc = credit_card_balance(nrows)
        print("Credit card balance df shape:", cc.shape)
        df = df.join(cc, how='left', on='SK_ID_CURR')
        del cc
        gc.collect()
    with timer('Run lgbm with kfold'):
        no_imp_features = get_last_no_imp_features()
        features_to_drop = list(set(no_imp_features) & set(df.columns))
        df = df.drop(columns=features_to_drop)
        print(f'final DF :{df.shape}')
        features_importance_df = kfold_lightgbm(df, num_folds=5, debug=debug)
        features_importance_df.to_feather('features_importance_df.feather')
with timer('full run'):
    main()
train 307511, test 48744
app 356255
bureau (1716428, 17), balance (27299925, 3)
Bureau shape :(305811, 143)
Process bureau and bureau balance done in 17s
prev (1670214, 37)
Previous applications df shape: (338857, 279)
Process previous_applications done in 21s
Pos-cash balance df shape: (337252, 18)
Process POS-CASH balance done in 12s
Installments payments df shape: (339587, 26)
Process installments payments done in 23s
Credit card balance df shape: (103558, 72)
Process credit card balance done in 14s
final DF :(356255, 720)
Training until validation scores don't improve for 200 rounds
[100]	training's auc: 0.78238	valid_1's auc: 0.765774
[200]	training's auc: 0.8061	valid_1's auc: 0.778186
[300]	training's auc: 0.822628	valid_1's auc: 0.784157
[400]	training's auc: 0.836671	valid_1's auc: 0.787432
[500]	training's auc: 0.848619	valid_1's auc: 0.789125
[600]	training's auc: 0.859644	valid_1's auc: 0.790174
[700]	training's auc: 0.869208	valid_1's auc: 0.79104
[800]	training's auc: 0.878829	valid_1's auc: 0.791669
[900]	training's auc: 0.88704	valid_1's auc: 0.792334
[1000]	training's auc: 0.894494	valid_1's auc: 0.792548
[1100]	training's auc: 0.901095	valid_1's auc: 0.79254
Early stopping, best iteration is:
[993]	training's auc: 0.893959	valid_1's auc: 0.792591
Fold  1 AUC : 0.792591
Training until validation scores don't improve for 200 rounds
[100]	training's auc: 0.782763	valid_1's auc: 0.759153
[200]	training's auc: 0.807392	valid_1's auc: 0.772789
[300]	training's auc: 0.823934	valid_1's auc: 0.779224
[400]	training's auc: 0.837874	valid_1's auc: 0.783119
[500]	training's auc: 0.849965	valid_1's auc: 0.785261
[600]	training's auc: 0.860856	valid_1's auc: 0.786782
[700]	training's auc: 0.870219	valid_1's auc: 0.78738
[800]	training's auc: 0.879139	valid_1's auc: 0.787911
[900]	training's auc: 0.887327	valid_1's auc: 0.788308
[1000]	training's auc: 0.895203	valid_1's auc: 0.788714
[1100]	training's auc: 0.902198	valid_1's auc: 0.788899
[1200]	training's auc: 0.908633	valid_1's auc: 0.789061
[1300]	training's auc: 0.914877	valid_1's auc: 0.789003
[1400]	training's auc: 0.920659	valid_1's auc: 0.789187
[1500]	training's auc: 0.925779	valid_1's auc: 0.789444
[1600]	training's auc: 0.930208	valid_1's auc: 0.789508
[1700]	training's auc: 0.934451	valid_1's auc: 0.789575
[1800]	training's auc: 0.93854	valid_1's auc: 0.789671
[1900]	training's auc: 0.942259	valid_1's auc: 0.789874
[2000]	training's auc: 0.945858	valid_1's auc: 0.789743
Early stopping, best iteration is:
[1885]	training's auc: 0.941636	valid_1's auc: 0.789935
Fold  2 AUC : 0.789935
Training until validation scores don't improve for 200 rounds
[100]	training's auc: 0.782443	valid_1's auc: 0.760835
[200]	training's auc: 0.806232	valid_1's auc: 0.774092
[300]	training's auc: 0.822842	valid_1's auc: 0.780568
[400]	training's auc: 0.836422	valid_1's auc: 0.783767
[500]	training's auc: 0.848625	valid_1's auc: 0.786248
[600]	training's auc: 0.859569	valid_1's auc: 0.787689
[700]	training's auc: 0.869923	valid_1's auc: 0.789037
[800]	training's auc: 0.879094	valid_1's auc: 0.789732
[900]	training's auc: 0.887098	valid_1's auc: 0.790292
[1000]	training's auc: 0.894593	valid_1's auc: 0.790652
[1100]	training's auc: 0.901143	valid_1's auc: 0.79099
[1200]	training's auc: 0.907911	valid_1's auc: 0.791169
[1300]	training's auc: 0.913411	valid_1's auc: 0.791089
[1400]	training's auc: 0.918854	valid_1's auc: 0.791133
Early stopping, best iteration is:
[1201]	training's auc: 0.907944	valid_1's auc: 0.791183
Fold  3 AUC : 0.791183
Training until validation scores don't improve for 200 rounds
[100]	training's auc: 0.783452	valid_1's auc: 0.759775
[200]	training's auc: 0.80759	valid_1's auc: 0.772697
[300]	training's auc: 0.823811	valid_1's auc: 0.778664
[400]	training's auc: 0.83728	valid_1's auc: 0.781591
[500]	training's auc: 0.849105	valid_1's auc: 0.783588
[600]	training's auc: 0.860079	valid_1's auc: 0.7851
[700]	training's auc: 0.869841	valid_1's auc: 0.786359
[800]	training's auc: 0.879002	valid_1's auc: 0.787141
[900]	training's auc: 0.887656	valid_1's auc: 0.787323
[1000]	training's auc: 0.895257	valid_1's auc: 0.787467
[1100]	training's auc: 0.902494	valid_1's auc: 0.787827
[1200]	training's auc: 0.908634	valid_1's auc: 0.788189
[1300]	training's auc: 0.914872	valid_1's auc: 0.788379
[1400]	training's auc: 0.920066	valid_1's auc: 0.788461
[1500]	training's auc: 0.92536	valid_1's auc: 0.788412
[1600]	training's auc: 0.930068	valid_1's auc: 0.788697
[1700]	training's auc: 0.934425	valid_1's auc: 0.78877
[1800]	training's auc: 0.938771	valid_1's auc: 0.788662
[1900]	training's auc: 0.94271	valid_1's auc: 0.788981
[2000]	training's auc: 0.946599	valid_1's auc: 0.788999
[2100]	training's auc: 0.949967	valid_1's auc: 0.788967
Early stopping, best iteration is:
[1927]	training's auc: 0.944003	valid_1's auc: 0.789081
Fold  4 AUC : 0.789081
Training until validation scores don't improve for 200 rounds
[100]	training's auc: 0.782267	valid_1's auc: 0.760801
[200]	training's auc: 0.805775	valid_1's auc: 0.774578
[300]	training's auc: 0.822456	valid_1's auc: 0.781059
[400]	training's auc: 0.837006	valid_1's auc: 0.784653
[500]	training's auc: 0.848933	valid_1's auc: 0.786817
[600]	training's auc: 0.860147	valid_1's auc: 0.788454
[700]	training's auc: 0.869955	valid_1's auc: 0.789242
[800]	training's auc: 0.879022	valid_1's auc: 0.789719
[900]	training's auc: 0.887043	valid_1's auc: 0.790111
[1000]	training's auc: 0.894709	valid_1's auc: 0.790376
[1100]	training's auc: 0.901814	valid_1's auc: 0.790596
[1200]	training's auc: 0.90801	valid_1's auc: 0.790917
[1300]	training's auc: 0.914572	valid_1's auc: 0.791167
[1400]	training's auc: 0.920002	valid_1's auc: 0.791147
[1500]	training's auc: 0.92504	valid_1's auc: 0.791326
[1600]	training's auc: 0.929904	valid_1's auc: 0.79151
[1700]	training's auc: 0.934097	valid_1's auc: 0.791493
[1800]	training's auc: 0.938231	valid_1's auc: 0.791423
Early stopping, best iteration is:
[1649]	training's auc: 0.931961	valid_1's auc: 0.791637
Fold  5 AUC : 0.791637
Full AUC score 0.790754
Run lgbm with kfold done in 905s
full run done in 996s

A score of roughly 78.9 (full out-of-fold AUC 0.7908).

plot_importances(features_importance_df)