null importance

介绍了一种识别无效特征的方法

  • 有效特征面对假标签表现得应该很差

  • 无效特征面对真假标签表现得差不多

我们可以将标签反复随机打乱,每次用 LightGBM 重新计算特征重要性;与真实标签下的重要性对比,就可以识别出无效特征了

# Load the Home Credit application data and split off ids / labels.
application_train = pd.read_csv('data/application_train.csv')
application_test = pd.read_csv('data/application_test.csv')
train_labels = application_train['TARGET']
train_ids = application_train['SK_ID_CURR']
test_ids = application_test['SK_ID_CURR']
train_features = application_train.drop(columns=['TARGET', 'SK_ID_CURR'])
test_features = application_test.drop(columns=['SK_ID_CURR'])

# Cast object columns to 'category' so LightGBM can consume them natively.
# Bug fix: the original cast train and test independently, which can assign
# different category<->code mappings (or miss levels that appear in only one
# split) and silently corrupts test-set predictions. Build one shared level
# set from the union of both splits so the encodings are consistent.
# (Train and test have the same feature columns here — test only lacks TARGET.)
for col in train_features.select_dtypes(include=['object']).columns:
    shared_categories = pd.api.types.union_categoricals(
        [train_features[col].astype('category'),
         test_features[col].astype('category')]
    ).categories
    train_features[col] = pd.Categorical(train_features[col], categories=shared_categories)
    test_features[col] = pd.Categorical(test_features[col], categories=shared_categories)
def get_features_importance(train_features, train_labels):
    """Fit a LightGBM classifier and return per-feature importances.

    Parameters
    ----------
    train_features : pd.DataFrame
        Feature matrix (categorical columns already cast to 'category').
    train_labels : pd.Series
        Binary target, aligned positionally with ``train_features``.

    Returns
    -------
    pd.DataFrame
        One row per feature with 'gain', 'split' and 'feature' columns.
    """
    model = LGBMClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=8,
        random_state=42,
    )
    model.fit(train_features, train_labels)
    booster = model.booster_
    return pd.DataFrame({
        'gain': booster.feature_importance(importance_type='gain'),
        'split': booster.feature_importance(importance_type='split'),
        'feature': model.feature_name_,
    })
# Importances under the REAL labels — the baseline the null runs compare against.
actual_fi_df = get_features_importance(train_features, train_labels)

多次shuffle

  • pandas 没有内置的 shuffle 函数, 使用 .sample(frac=1) 做全量采样即可实现打乱

    • 这是对标签做均匀随机的全排列(permutation)采样,并非正态分布采样

# Build the null-importance distribution: refit `runs` times on randomly
# permuted labels and stack the resulting importance tables.
runs = 50
fi_list = []
for i in range(runs):
    # Fix: seed each permutation so the experiment is reproducible, and drop
    # the shuffled index so nothing downstream can accidentally re-align the
    # labels back to their original order by index.
    shuffle_train_labels = (
        train_labels.sample(frac=1, random_state=i).reset_index(drop=True)
    )
    fi_df = get_features_importance(train_features, shuffle_train_labels)
    fi_df['run'] = i + 1  # 1-based run id, kept for per-run inspection
    fi_list.append(fi_df)
fi_dfs = pd.concat(fi_list, axis=0, ignore_index=True)

我们可以绘制这些重要性的分布:

  • 如果真实重要性与随机重要性相近,那就是假特征

# Notebook display cell: shows a gain-sorted copy; actual_fi_df is not modified.
actual_fi_df.sort_values(by='gain')
def plot_importance_distribution_of_feature(actual_fi_df, df, feature_name):
    """Plot one feature's null-importance distributions (gain and split),
    marking the real-label importance with a dashed vertical line.

    actual_fi_df: importances computed with the real labels.
    df: stacked importances from the shuffled-label runs.
    feature_name: the feature to plot.
    """
    null_rows = df[df['feature'] == feature_name]
    actual_row = actual_fi_df.loc[actual_fi_df['feature'] == feature_name]
    actual_gain = actual_row['gain'].iloc[0]
    actual_split = actual_row['split'].iloc[0]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

    # Left panel: gain importance under shuffled labels vs. the real value.
    sns.histplot(null_rows['gain'], alpha=0.3, label='Null Gain', ax=ax1, kde=True)
    ax1.axvline(x=actual_gain, linestyle='--', label=f'Actual: {actual_gain:.1f}')
    ax1.set_title(f'Gain Importance: {feature_name}')
    ax1.legend()

    # Right panel: same comparison for split importance.
    sns.histplot(null_rows['split'], alpha=0.3, label='Null Split', ax=ax2, kde=True)
    ax2.axvline(x=actual_split, linestyle='--', label=f'Actual: {actual_split}')
    ax2.set_title(f'Split Importance: {feature_name}')
    ax2.legend()

    plt.tight_layout()
# Sanity-check the plots on one likely-noise feature and one known-strong feature.
plot_importance_distribution_of_feature(actual_fi_df, fi_dfs, 'FONDKAPREMONT_MODE')
plot_importance_distribution_of_feature(actual_fi_df, fi_dfs, 'EXT_SOURCE_3')

我们以零分布(null distribution)的75分位数作为基准值,去与真实重要性对比。

  • 如果大于这个值,表示真实特征表现优于75%的噪声

    • 即 score > 0。我们对比值取 log,以 0 作为分界线,可以更好地剔除无效特征(score < 0 即剔除)

def get_feature_score(actual_importance, shuffle_importances, feature_name):
    """Score one feature against its null-importance distribution.

    The score is log(actual / (1 + 75th percentile of null importances)).
    score > 0 means the real-label importance beats at least 75% of the
    shuffled-label runs; score < 0 flags a candidate noise feature.

    Bug fix: the original divided by the raw percentile, which is 0 for
    features the model never uses on shuffled labels, yielding inf/NaN
    scores from the division by zero. Adding 1 to the denominator (the
    standard null-importance recipe) keeps every score finite.

    Parameters
    ----------
    actual_importance : pd.DataFrame
        Importances from the real labels ('feature', 'gain', 'split').
    shuffle_importances : pd.DataFrame
        Stacked importances from the shuffled-label runs.
    feature_name : str
        Feature to score.

    Returns
    -------
    pd.DataFrame
        Single-row frame with 'feature', 'gain_score' and 'split_score'.
    """
    actual_rows = actual_importance[actual_importance['feature'] == feature_name]
    null_rows = shuffle_importances[shuffle_importances['feature'] == feature_name]
    gain_baseline = 1 + np.percentile(null_rows['gain'], 75)
    split_baseline = 1 + np.percentile(null_rows['split'], 75)
    return pd.DataFrame({
        'feature': feature_name,
        'gain_score': np.log(1e-10 + actual_rows['gain'] / gain_baseline),
        'split_score': np.log(1e-10 + actual_rows['split'] / split_baseline),
    })
# Spot-check the scorer on a single feature before scoring everything.
get_feature_score(actual_fi_df, fi_dfs, 'CODE_GENDER')
# Score every feature against its null distribution and stack into one frame.
features_score = pd.concat(
    [get_feature_score(actual_fi_df, fi_dfs, feature)
     for feature in actual_fi_df['feature'].unique()],
    ignore_index=True,
)
# Rank features by each score; the full sorted copies are kept (not just the
# top 70) so they remain available for later inspection.
plot_gain_features_score = features_score.sort_values(by='gain_score', ascending=False)
plot_split_features_score = features_score.sort_values(by='split_score', ascending=False)
fig, axes = plt.subplots(1, 2, figsize=(20, 10))

# Left: top 70 features by gain score; right: top 70 by split score.
sns.barplot(data=plot_gain_features_score.head(70), x='gain_score', y='feature', ax=axes[0])
axes[0].set_title('Top 70 Features by Gain')

sns.barplot(data=plot_split_features_score.head(70), x='split_score', y='feature', ax=axes[1])
axes[1].set_title('Top 70 Features by Split')

plt.tight_layout()
# Notebook display cell: shows the first scored rows.
features_score.head()
# NOTE(review): `threshold` is never used in this section — removal below is
# driven purely by the sign of the log scores. Confirm whether an unseen later
# cell needs it before deleting.
threshold = 30
# Drop a feature only when BOTH scores are negative, i.e. it fails to beat
# the 75th percentile of the noise runs on gain AND on split importance.
features_removed_by_gain_score = features_score[features_score['gain_score'] < 0]
features_removed_by_split_score = features_score[features_score['split_score'] < 0]
features_removed = set(features_removed_by_gain_score['feature']).intersection(
    features_removed_by_split_score['feature']
)
# Notebook display cell: shows the set of features being removed.
features_removed
train_features_clean = train_features.drop(columns=features_removed)
test_features_clean = test_features.drop(columns=features_removed)
# Refit on the cleaned feature set with the same hyperparameters as before.
lgbm_model = LGBMClassifier(
    n_estimators=100,      
    learning_rate=0.1, 
    max_depth=8,      
    random_state=42,   
)
lgbm_model.fit(train_features_clean, train_labels)
# predict_proba returns (n_samples, 2); column 1 is P(TARGET == 1).
train_pred = lgbm_model.predict_proba(train_features_clean)
# NOTE(review): this is AUC on the TRAINING data, so it is optimistic and
# cannot show whether removing features improved generalization — evaluate
# on a holdout split or with cross-validation instead.
roc_auc_score(train_labels, train_pred[:, 1])