# Null Importance
介绍了一种识别无效特征的方法:
有效特征在假(打乱的)标签上表现应该很差;
无效特征在真假标签上表现得差不多。
我们可以不断打乱标签,每次用 LightGBM 计算特征重要性,将真实重要性与噪声分布对比,就可以识别出无效特征了。
# Load the Home Credit application tables and split IDs / target / features.
application_train = pd.read_csv('data/application_train.csv')
application_test = pd.read_csv('data/application_test.csv')

train_labels = application_train['TARGET']
train_ids = application_train['SK_ID_CURR']
test_ids = application_test['SK_ID_CURR']

train_features = application_train.drop(columns=['TARGET', 'SK_ID_CURR'])
test_features = application_test.drop(columns=['SK_ID_CURR'])

# LightGBM handles pandas 'category' dtype natively; cast every object
# column in both frames the same way.
for frame in (train_features, test_features):
    for col in frame.select_dtypes(include=['object']).columns:
        frame[col] = frame[col].astype('category')
def get_features_importance(train_features, train_labels, random_state=42):
    """Fit a LightGBM classifier and return its per-feature importances.

    Parameters
    ----------
    train_features : pd.DataFrame
        Training features (object columns already cast to 'category').
    train_labels : pd.Series
        Binary target aligned with ``train_features``.
    random_state : int, default 42
        Model seed; exposed as a parameter so repeated null-importance
        runs can be seeded individually while the default keeps the
        original behaviour.

    Returns
    -------
    pd.DataFrame
        One row per feature with 'gain', 'split' and 'feature' columns.
    """
    lgbm_model = LGBMClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=8,
        random_state=random_state,
    )
    lgbm_model.fit(train_features, train_labels)
    # Pull both importance flavours straight from the underlying booster.
    return pd.DataFrame(
        {
            'gain': lgbm_model.booster_.feature_importance(importance_type='gain'),
            'split': lgbm_model.booster_.feature_importance(importance_type='split'),
            'feature': lgbm_model.feature_name_,
        }
    )
# Baseline importances computed with the real (unshuffled) labels.
actual_fi_df = get_features_importance(train_features, train_labels)
多次 shuffle:
pandas 没有专门的 shuffle 函数,
可以用 .sample(frac=1) 做全量无放回采样,相当于对标签做一次均匀随机排列。
# Build the null-importance distribution: re-fit the model `runs` times on
# permuted labels and stack the resulting importance tables.
runs = 50
fi_list = []
for i in range(runs):
    # Seed each permutation so the null distribution is reproducible,
    # consistent with the fixed random_state used for the model itself.
    shuffle_train_labels = train_labels.sample(frac=1, random_state=i)
    fi_df = get_features_importance(train_features, shuffle_train_labels)
    fi_df['run'] = i + 1  # 1-based run identifier for later grouping/plots
    fi_list.append(fi_df)
fi_dfs = pd.concat(fi_list, axis=0, ignore_index=True)
我们可以绘制这些重要性的分布:
如果真实重要性与随机(打乱标签)重要性相近,那该特征就是无效特征。
# Notebook display cell: sort_values returns a new, sorted frame for viewing;
# nothing is assigned, so in a plain script this result is discarded.
actual_fi_df.sort_values(by='gain')
def plot_importance_distribution_of_feature(actual_fi_df, df, feature_name):
    """Plot a feature's null-importance distributions vs. its real importance.

    Left panel: gain importance; right panel: split importance. The histogram
    shows the shuffled-label ("null") runs; the dashed vertical line marks the
    importance obtained with the true labels.

    Parameters
    ----------
    actual_fi_df : pd.DataFrame
        Importances from the real labels ('feature', 'gain', 'split').
    df : pd.DataFrame
        Stacked importances from the shuffled-label runs.
    feature_name : str
        Feature to plot.

    Raises
    ------
    ValueError
        If ``feature_name`` is not present in ``actual_fi_df`` (previously
        this failed with an opaque IndexError from ``.iloc[0]``).
    """
    data = df[df['feature'] == feature_name]
    actual_row = actual_fi_df.loc[actual_fi_df['feature'] == feature_name]
    if actual_row.empty:
        raise ValueError(f'unknown feature: {feature_name!r}')
    actual_gain = actual_row['gain'].iloc[0]
    actual_split = actual_row['split'].iloc[0]
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    sns.histplot(data['gain'], alpha=0.3, label='Null Gain', ax=ax1, kde=True)
    ax1.axvline(x=actual_gain, linestyle='--', label=f'Actual: {actual_gain:.1f}')
    ax1.set_title(f'Gain Importance: {feature_name}')
    ax1.legend()
    sns.histplot(data['split'], alpha=0.3, label='Null Split', ax=ax2, kde=True)
    ax2.axvline(x=actual_split, linestyle='--', label=f'Actual: {actual_split}')
    ax2.set_title(f'Split Importance: {feature_name}')
    ax2.legend()
    plt.tight_layout()
# Visual spot-check on two features — presumably FONDKAPREMONT_MODE is a
# noise-like feature and EXT_SOURCE_3 a strong one; verify from the plots.
plot_importance_distribution_of_feature(actual_fi_df, fi_dfs, 'FONDKAPREMONT_MODE')
plot_importance_distribution_of_feature(actual_fi_df, fi_dfs, 'EXT_SOURCE_3')
我们以噪声分布的 75 分位数作为基准值,与真实重要性对比。
如果真实值大于这个基准,表示真实特征表现优于 75% 的噪声,
即 score > 0。我们对比值取 log,以 0 为分界,可以更方便地剔除无效特征。
def get_feature_score(actual_importance, shuffle_importances, feature_name):
    """Score one feature's real importance against its null distribution.

    The score is ``log(1e-10 + actual / p75(null))`` for both gain and split
    importance: a score > 0 means the real importance beats at least 75% of
    the shuffled-label ("null") runs.

    Parameters
    ----------
    actual_importance : pd.DataFrame
        Importances from the true labels ('feature', 'gain', 'split').
    shuffle_importances : pd.DataFrame
        Stacked importances from the shuffled-label runs, same columns.
    feature_name : str
        Feature to score.

    Returns
    -------
    pd.DataFrame
        Columns 'feature', 'gain_score', 'split_score'.
    """
    feature_actual_gain = actual_importance[actual_importance['feature'] == feature_name]['gain']
    feature_actual_split = actual_importance[actual_importance['feature'] == feature_name]['split']
    feature_shuffle_gains = shuffle_importances[shuffle_importances['feature'] == feature_name]['gain']
    feature_shuffle_splits = shuffle_importances[shuffle_importances['feature'] == feature_name]['split']
    # Guard the denominators: a feature that never splits in the null runs has
    # a 75th percentile of 0, which previously divided by zero (inf scores).
    eps = 1e-10
    gain_denom = max(np.percentile(feature_shuffle_gains, 75), eps)
    split_denom = max(np.percentile(feature_shuffle_splits, 75), eps)
    return pd.DataFrame({
        'feature': feature_name,
        'gain_score': np.log(1e-10 + feature_actual_gain / gain_denom),
        'split_score': np.log(1e-10 + feature_actual_split / split_denom),
    })
# Notebook display cell: spot-check the scoring helper on a single feature.
get_feature_score(actual_fi_df, fi_dfs, 'CODE_GENDER')
# Score every feature against its null distribution and collect the results.
# A comprehension avoids the append-loop and the list->DataFrame rebinding of
# the same `features_score` name.
features_score = pd.concat(
    [get_feature_score(actual_fi_df, fi_dfs, f) for f in actual_fi_df['feature'].unique()],
    ignore_index=True,
)
plot_gain_features_score = features_score.sort_values(by='gain_score', ascending=False)
plot_split_features_score = features_score.sort_values(by='split_score', ascending=False)
# Side-by-side bar charts of the 70 best-scoring features by gain and split.
fig, (ax_gain, ax_split) = plt.subplots(1, 2, figsize=(20, 10))
sns.barplot(data=plot_gain_features_score.head(70), x='gain_score', y='feature', ax=ax_gain)
ax_gain.set_title('Top 70 Features by Gain')
sns.barplot(data=plot_split_features_score.head(70), x='split_score', y='feature', ax=ax_split)
ax_split.set_title('Top 70 Features by Split')
plt.tight_layout()
# Notebook display cell: preview the first scored rows.
features_score.head()
# NOTE(review): `threshold` is never read in this file — the filters below use
# 0 (the log cut-off) — TODO confirm it is unused elsewhere before deleting.
threshold = 30
# A feature is removed only when BOTH scores are negative, i.e. its real
# importance fails to beat the 75th percentile of the noise runs for gain
# as well as split importance.
features_removed_by_gain_score = features_score.loc[features_score['gain_score'] < 0]
features_removed_by_split_score = features_score.loc[features_score['split_score'] < 0]
features_removed = set(features_removed_by_gain_score['feature']).intersection(
    features_removed_by_split_score['feature']
)
# Notebook display cell: show which features will be dropped.
features_removed
train_features_clean = train_features.drop(columns=features_removed)
test_features_clean = test_features.drop(columns=features_removed)
# Refit on the cleaned feature set with the same hyper-parameters as before.
lgbm_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=8,
    random_state=42,
)
lgbm_model = LGBMClassifier(**lgbm_params)
lgbm_model.fit(train_features_clean, train_labels)
# Probability of the positive class (column 1 of predict_proba).
train_pred = lgbm_model.predict_proba(train_features_clean)
# NOTE(review): this AUC is computed on the training data itself, so it is
# optimistic — use cross-validation or a hold-out split for a fair comparison.
roc_auc_score(train_labels, train_pred[:, 1])