初步#

  • 特征工程

  • 回归模型:线性,非线性

导入#

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import time
import os

from scipy import stats
from sklearn.compose import ColumnTransformer,make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder,LabelEncoder, FunctionTransformer,TargetEncoder,PolynomialFeatures
from sklearn.impute import SimpleImputer

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso,LassoCV, Ridge,RidgeCV,ElasticNetCV, ElasticNet, LinearRegression,BayesianRidge
from sklearn.ensemble import RandomForestRegressor, IsolationForest
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.compose import TransformedTargetRegressor
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.metrics import root_mean_squared_error
import gc
import warnings
from contextlib import contextmanager

gc.enable()
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = ['SimHei'] 
plt.rcParams['axes.unicode_minus'] = False 
plt.rcParams['figure.figsize'] = (8,6) 
plt.rcParams['figure.dpi'] = 100

sns.set_theme(style="whitegrid", palette="Set1")

print(f'pd: {pd.__version__}')
pd: 2.3.3
traindata = pd.read_csv('data/train.csv',index_col='Id')
testdata = pd.read_csv('data/test.csv',index_col='Id')
traindata.shape
(1460, 80)
traindata.head()
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
Id
1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub Inside ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub FR2 ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub Inside ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub Corner ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub FR2 ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000

5 rows × 80 columns

traindata.info()
<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuilt      1460 non-null   int64  
 19  YearRemodAdd   1460 non-null   int64  
 20  RoofStyle      1460 non-null   object 
 21  RoofMatl       1460 non-null   object 
 22  Exterior1st    1460 non-null   object 
 23  Exterior2nd    1460 non-null   object 
 24  MasVnrType     588 non-null    object 
 25  MasVnrArea     1452 non-null   float64
 26  ExterQual      1460 non-null   object 
 27  ExterCond      1460 non-null   object 
 28  Foundation     1460 non-null   object 
 29  BsmtQual       1423 non-null   object 
 30  BsmtCond       1423 non-null   object 
 31  BsmtExposure   1422 non-null   object 
 32  BsmtFinType1   1423 non-null   object 
 33  BsmtFinSF1     1460 non-null   int64  
 34  BsmtFinType2   1422 non-null   object 
 35  BsmtFinSF2     1460 non-null   int64  
 36  BsmtUnfSF      1460 non-null   int64  
 37  TotalBsmtSF    1460 non-null   int64  
 38  Heating        1460 non-null   object 
 39  HeatingQC      1460 non-null   object 
 40  CentralAir     1460 non-null   object 
 41  Electrical     1459 non-null   object 
 42  1stFlrSF       1460 non-null   int64  
 43  2ndFlrSF       1460 non-null   int64  
 44  LowQualFinSF   1460 non-null   int64  
 45  GrLivArea      1460 non-null   int64  
 46  BsmtFullBath   1460 non-null   int64  
 47  BsmtHalfBath   1460 non-null   int64  
 48  FullBath       1460 non-null   int64  
 49  HalfBath       1460 non-null   int64  
 50  BedroomAbvGr   1460 non-null   int64  
 51  KitchenAbvGr   1460 non-null   int64  
 52  KitchenQual    1460 non-null   object 
 53  TotRmsAbvGrd   1460 non-null   int64  
 54  Functional     1460 non-null   object 
 55  Fireplaces     1460 non-null   int64  
 56  FireplaceQu    770 non-null    object 
 57  GarageType     1379 non-null   object 
 58  GarageYrBlt    1379 non-null   float64
 59  GarageFinish   1379 non-null   object 
 60  GarageCars     1460 non-null   int64  
 61  GarageArea     1460 non-null   int64  
 62  GarageQual     1379 non-null   object 
 63  GarageCond     1379 non-null   object 
 64  PavedDrive     1460 non-null   object 
 65  WoodDeckSF     1460 non-null   int64  
 66  OpenPorchSF    1460 non-null   int64  
 67  EnclosedPorch  1460 non-null   int64  
 68  3SsnPorch      1460 non-null   int64  
 69  ScreenPorch    1460 non-null   int64  
 70  PoolArea       1460 non-null   int64  
 71  PoolQC         7 non-null      object 
 72  Fence          281 non-null    object 
 73  MiscFeature    54 non-null     object 
 74  MiscVal        1460 non-null   int64  
 75  MoSold         1460 non-null   int64  
 76  YrSold         1460 non-null   int64  
 77  SaleType       1460 non-null   object 
 78  SaleCondition  1460 non-null   object 
 79  SalePrice      1460 non-null   int64  
dtypes: float64(3), int64(34), object(43)
memory usage: 923.9+ KB
traindata.describe(include='all')
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
count 1460.000000 1460 1201.000000 1460.000000 1460 91 1460 1460 1460 1460 ... 1460.000000 7 281 54 1460.000000 1460.000000 1460.000000 1460 1460 1460.000000
unique NaN 5 NaN NaN 2 2 4 4 2 5 ... NaN 3 4 4 NaN NaN NaN 9 6 NaN
top NaN RL NaN NaN Pave Grvl Reg Lvl AllPub Inside ... NaN Gd MnPrv Shed NaN NaN NaN WD Normal NaN
freq NaN 1151 NaN NaN 1454 50 925 1311 1459 1052 ... NaN 3 157 49 NaN NaN NaN 1267 1198 NaN
mean 56.897260 NaN 70.049958 10516.828082 NaN NaN NaN NaN NaN NaN ... 2.758904 NaN NaN NaN 43.489041 6.321918 2007.815753 NaN NaN 180921.195890
std 42.300571 NaN 24.284752 9981.264932 NaN NaN NaN NaN NaN NaN ... 40.177307 NaN NaN NaN 496.123024 2.703626 1.328095 NaN NaN 79442.502883
min 20.000000 NaN 21.000000 1300.000000 NaN NaN NaN NaN NaN NaN ... 0.000000 NaN NaN NaN 0.000000 1.000000 2006.000000 NaN NaN 34900.000000
25% 20.000000 NaN 59.000000 7553.500000 NaN NaN NaN NaN NaN NaN ... 0.000000 NaN NaN NaN 0.000000 5.000000 2007.000000 NaN NaN 129975.000000
50% 50.000000 NaN 69.000000 9478.500000 NaN NaN NaN NaN NaN NaN ... 0.000000 NaN NaN NaN 0.000000 6.000000 2008.000000 NaN NaN 163000.000000
75% 70.000000 NaN 80.000000 11601.500000 NaN NaN NaN NaN NaN NaN ... 0.000000 NaN NaN NaN 0.000000 8.000000 2009.000000 NaN NaN 214000.000000
max 190.000000 NaN 313.000000 215245.000000 NaN NaN NaN NaN NaN NaN ... 738.000000 NaN NaN NaN 15500.000000 12.000000 2010.000000 NaN NaN 755000.000000

11 rows × 80 columns

traindata.dtypes
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 80, dtype: object

submit#

def submit(ids, pred, name, feature_count=None):
    """
    ids: 测试集的id
    pred: 模型预测概率
    name: 你的实验备注 (如 'lgb_v1', 'baseline')
    feature_count: 可选,记录模型使用了多少个特征
    """
    # 1. 创建提交 DataFrame
    submit_df = pd.DataFrame({
        'ID': ids,
        'SalePrice': pred
    })

    # 2. 生成时间戳 (格式: 0213_1530)
    timestamp = time.strftime("%m%d_%H%M")
    
    # 3. 构造文件名
    # 格式: 0213_1530_lgb_v1_f542.csv
    f_str = f"_f{feature_count}" if feature_count else ""
    filename = f"{timestamp}_{name}{f_str}.csv"
    
    # 4. 确保保存目录存在 (可选)
    if not os.path.exists('submissions'):
        os.makedirs('submissions')
    
    save_path = os.path.join('submissions', filename)
    
    # 5. 保存并打印提示
    submit_df.to_csv(save_path, index=False)
    
    return submit_df

字段说明#

来自AI

🏠 房屋基本属性与位置

这些字段决定了房子的“骨架”和地段:

  • MSSubClass: 房屋类型(如:1层现代风格、2层复古风格等)。

  • MSZoning: 土地分区(如:住宅区、商业区、农业区)。

  • LotFrontage: 房屋连接街道的距离(英尺)。

  • LotArea: 地块面积(平方英尺)。

  • Street / Alley: 道路/巷道的铺设类型(如:碎石、柏油)。

  • LotShape: 地块形状(是否规整)。

  • LandContour: 地形平坦度(如:平坦、有坡度、凹陷)。

  • Neighborhood: 艾姆斯市内的具体社区位置。

  • BldgType: 住宅类型(独栋、联排等)。

  • HouseStyle: 住宅风格(一层、二层、半层等)。

🌟 整体品质与年代

  • OverallQual: 整体材料和装饰评分(1-10分)。

  • OverallCond: 整体状态评分(1-10分)。

  • YearBuilt: 建造年份。

  • YearRemodAdd: 改建年份。

🧱 外部与结构细节

  • RoofStyle / RoofMatl: 屋顶类型和材料。

  • Exterior1st / Exterior2nd: 房屋外墙覆盖物。

  • MasVnrType / MasVnrArea: 砖石饰面类型及面积。

  • ExterQual / ExterCond: 外部材料的质量和现状评分。

  • Foundation: 地基类型(如:混凝土、砖、石头)。

🕳️ 地下室情况

  • BsmtQual / BsmtCond: 地下室的高度评价和质量评分。

  • BsmtExposure: 花园式地下室的采光/出口情况。

  • BsmtFinType1 / BsmtFinSF1: 第一类型地下室完工面积及类型。

  • BsmtFinType2 / BsmtFinSF2: 第二类型地下室完工面积。

  • BsmtUnfSF: 未完工地下室面积。

  • TotalBsmtSF: 地下室总面积。

🌡️ 公用设施与系统

  • Heating / HeatingQC: 供暖类型和质量评分。

  • CentralAir: 是否有中央空调(Y/N)。

  • Electrical: 电气系统类型。

📐 居住面积与房间

  • 1stFlrSF / 2ndFlrSF: 一层/二层面积。

  • LowQualFinSF: 低质量完工面积(所有楼层)。

  • GrLivArea: 地面以上居住面积总和。

  • BsmtFullBath / BsmtHalfBath: 地下室全卫/半卫数量。

  • FullBath / HalfBath: 地面上全卫/半卫数量。

  • BedroomAbvGr: 地面上卧室数量。

  • KitchenAbvGr / KitchenQual: 厨房数量及质量评分。

  • TotRmsAbvGrd: 地面上房间总数(不含卫浴)。

  • Fireplaces / FireplaceQu: 壁炉数量及质量。

🚗 车库与附属设施

  • GarageType / GarageYrBlt: 车库位置、建造年份。

  • GarageFinish / GarageCars / GarageArea: 车库装修情况、车位数、面积。

  • GarageQual / GarageCond: 车库质量和现状。

  • PavedDrive: 车道铺设情况。

  • WoodDeckSF / OpenPorchSF / EnclosedPorch / 3SsnPorch / ScreenPorch: 露台、走廊等各类户外空间的面积。

  • PoolArea / PoolQC: 泳池面积及质量。

  • Fence: 围栏质量。

  • MiscFeature / MiscVal: 其他杂项特征(如:网球场、棚屋)及其价值。

💰 销售信息

  • MoSold / YrSold: 售出月份和年份。

  • SaleType: 销售类型(如:现金、贷款、法拍)。

  • SaleCondition: 销售条件(如:正常交易、家庭内部转让)。

  • SalePrice: 目标变量,房屋售价(美元)。

EDA#

描述了数据特征探索的全部过程。

EDA过程进行了一些修改:缺失值删除列或者行,异常值删除样本行,还有转变dtype,类型编码,log变换等,

从EDA到pipeline的时候,注意的:

  1. 删除样本行的操作,应该在pipeline之前,在train上进行

  2. 填充缺失值、标准化、类型编码,在pipeline中,train上fit,test上transform

  3. 缺失值删除列、log变换,在pipeline中或者之前

eda_data = traindata.copy() # 方便探索,不修改traindata

traindata会进行样本行删除

metric#

Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sales price.

修正dtypes#

date_cols = ['GarageYrBlt','YearRemodAdd', 'YearBuilt', 'YrSold', 'MoSold']
eda_data[date_cols] = eda_data[date_cols].astype('datetime64[us]')
eda_data['GarageYrBlt'] = eda_data['GarageYrBlt'].dt.year
eda_data['YearRemodAdd'] = eda_data['YearRemodAdd'].dt.year
eda_data['YearBuilt'] = eda_data['YearBuilt'].dt.year
eda_data['YrSold'] = eda_data['YrSold'].dt.year
eda_data['MoSold'] = eda_data['MoSold'].dt.month

数字表示类别,不具备数值意义:

MSSubClass是类别

eda_data['MSSubClass'] = eda_data['MSSubClass'].astype('category')

定序特征转换为数值类型:

eda_data['ExterQual'] = eda_data['ExterQual'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0})
eda_data['BsmtCond'] = eda_data['BsmtCond'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0})
eda_data['BsmtFinType1'] = eda_data['BsmtFinType1'].map({'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA':0})
eda_data['BsmtFinType2'] = eda_data['BsmtFinType2'].map({'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA':0})
eda_data['BsmtQual'] = eda_data['BsmtQual'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0})
eda_data['KitchenQual'] = eda_data['KitchenQual'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0})
eda_data['GarageQual'] = eda_data['GarageQual'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0})
eda_data['GarageCond'] = eda_data['GarageCond'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0})
eda_data['HeatingQC'] = eda_data['HeatingQC'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0})
eda_data['FireplaceQu'] = eda_data['FireplaceQu'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0})
eda_data['PoolQC'] = eda_data['PoolQC'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0})
eda_data['Fence'] = eda_data['Fence'].map({'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1, 'NA': 0})

目标SalePrice#

ax = sns.histplot(data=eda_data, x='SalePrice',kde=True)
ax.set_title('SalePrice distribution')
Text(0.5, 1.0, 'SalePrice distribution')
../../_images/a38cd9dd4e11cf22a2764756aae47624bf45b1db81e726081ec88b0e0f066349.png

明显是左偏分布、尖峰长尾

skew = eda_data['SalePrice'].skew()
kurt = eda_data['SalePrice'].kurt()
print(f'SalePrice: skew {skew}, kurt {kurt}')
SalePrice: skew 1.8828757597682129, kurt 6.536281860064529

数值特征与SalePrice相关性#

所有数值包括 定序特征

eda_data.select_dtypes(include=['number']).columns
Index(['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'ExterQual', 'BsmtQual', 'BsmtCond',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'HeatingQC', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Fireplaces', 'FireplaceQu', 'GarageYrBlt', 'GarageCars', 'GarageArea',
       'GarageQual', 'GarageCond', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')
plt.figure(figsize=(15, 12))
corrs = eda_data.corr(numeric_only=True)
sns.heatmap(data=corrs)
<Axes: >
../../_images/3686ea5395ab48f4efe7187821542b99e26cf051edc310c16d20f1661e7a1181.png

明显地,黄色方块

  • 有几个高度共线特征

  • SalePrice的几个相关特征:GrLivArea,OverallQual..

着重看下与SalePrice几个高相关特征

high10_cols = corrs.nlargest(10, 'SalePrice').index.tolist() 
high10_corrs = eda_data[high10_cols].corr()
plt.figure(figsize=(8,6))
sns.heatmap(data=high10_corrs, annot=True)
plt.show()
../../_images/14c34c4b9610626555f758bb83268ee74d060d2e6b6448f43af90bfeae159979.png

理解下这几个特征:

  • OverallQual 房屋装潢很重要!

  • GrLivArea 地上楼层活动面积,

  • GarageCars 车位数,GarageArea 车库面积 。 这两个是共线强相关的~

  • TotalBsmtSF 地下室面积,1stFlrSF 一层面积。 这两个是共线强相关的~

  • FullBath 地上楼层带有洗澡全设施的数量!!

  • ExterQual

  • KitchenQual

  • BsmtQual

cols = ['SalePrice','OverallQual','GrLivArea','ExterQual','KitchenQual','TotalBsmtSF','GarageCars']
ax = sns.pairplot(data=eda_data[cols])
../../_images/32f21ae41318574b5719faa781d0d86387e7b69541b5393c1e09d0bc2977dfad.png

发现:

有一些很像边界的线: 比如TotalBsmtSF 总是小于GrLivArea这是合理的,地上楼层活动面积大于地下室面积

随着建筑年限靠近,价格指数级上升

plt.figure(figsize=(10,6))
ax = sns.boxplot(data=eda_data, x= 'YearBuilt', y='SalePrice')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
../../_images/29dbd693acba1e9fe618fbf7d8712395956d2fb6c6b0f1e913e9c7e826af5273.png

分类特征与目标ANOVA#

如何在这么多特征找到那些关键特征??

我们通过ANOVA(方差分析),F统计量检验来找一些关键特征

eda_data.select_dtypes(include=['object', 'category']).columns
Index(['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterCond', 'Foundation',
       'BsmtExposure', 'Heating', 'CentralAir', 'Electrical', 'Functional',
       'GarageType', 'GarageFinish', 'PavedDrive', 'MiscFeature', 'SaleType',
       'SaleCondition'],
      dtype='object')
def stat_category_features(df, n_high = 10):
    cols = df.select_dtypes(include=['object', 'category']).columns
    le = LabelEncoder()
    y = df[['SalePrice']]
    X = pd.DataFrame()
    for col in cols:
        X[col] = le.fit_transform(df[col])
    f_stats, p_values = f_regression(X = X, y=y)
    result = pd.DataFrame(index=X.columns)
    result['f_score'] = f_stats
    result['p_value'] = p_values
    return result
result = stat_category_features(eda_data)
no_imp_category_features = result[result['p_value'] > 0.05].index
no_imp_category_features
Index(['Street', 'LandContour', 'Utilities', 'LandSlope', 'Condition2'], dtype='object')

对于$p>0.05$的特征,我们认为F统计量是不可信的随机的。即分类对价格影响是随机的,可以删除

fig, axes = plt.subplots(2,3, figsize=(15, 10))
axes = axes.flatten()
for i, col in enumerate(no_imp_category_features):
    sns.boxplot(data=eda_data, x=col, y='SalePrice', ax = axes[i])
    sns.stripplot(data=eda_data, x=col, y='SalePrice', ax=axes[i], alpha=0.5)
plt.tight_layout()
plt.show()
../../_images/e3460acbcd66efecbb28b0c55449d25f4127724365791e8d2c6c6728716c57b8.png

确实地,可以看到,这些特征几乎都只用了一个类别!

我们来看下找到的前几个关键特征

result[result['p_value'] < 0.05].sort_values(by='f_score',ascending=False).head(9)
f_score p_value
GarageFinish 629.844106 7.922769e-116
GarageType 303.848427 5.866699e-62
Foundation 249.840256 4.579866e-52
BsmtExposure 153.953618 1.112827e-33
MasVnrType 125.530544 5.231103e-28
LotShape 101.893942 3.320712e-23
CentralAir 98.305344 1.809506e-22
Electrical 85.006587 1.008341e-19
PavedDrive 82.454424 3.418340e-19
imp_category_features = result[result['p_value'] < 0.05].sort_values(by='f_score',ascending=False).head(9).index
imp_category_features
Index(['GarageFinish', 'GarageType', 'Foundation', 'BsmtExposure',
       'MasVnrType', 'LotShape', 'CentralAir', 'Electrical', 'PavedDrive'],
      dtype='object')
fig, axes = plt.subplots(3,3, figsize=(15, 10), sharey=True)
axes = axes.flatten()
for i, col in enumerate(imp_category_features):
    sns.boxplot(data=eda_data, x=col, y='SalePrice', ax=axes[i])
    sns.stripplot(data=eda_data, x=col, y='SalePrice', ax=axes[i], alpha=0.5)
plt.tight_layout()
plt.show()
../../_images/ff5765ae2264cb793f1b67c388fbbce0cf8c06c5b5733956fe65bda6092f4a5e.png

确实的,都有着不错的效果~😊

一些特别的(后续训练出现的)

result[result.index == 'MSZoning']
f_score p_value
MSZoning 41.762896 1.401300e-10
sns.boxplot(data=eda_data, x='MSZoning', y='SalePrice')
<Axes: xlabel='MSZoning', ylabel='SalePrice'>
../../_images/dce7f2090863475b17f76d2401998060b6cbe342e4133b25c986fc621a2bc864.png

missing#

先处理缺失值

  • 缺失的程度

  • 缺失是否有规律?还是人为异常?

total = eda_data.isnull().sum()
percent = eda_data.isnull().sum() / eda_data.shape[0]
missing_data = pd.concat([total, percent], axis=1, keys=['total', 'percent'])
missing_data.sort_values(by='percent', ascending=False).head()
total percent
PoolQC 1453 0.995205
MiscFeature 1406 0.963014
Alley 1369 0.937671
Fence 1179 0.807534
MasVnrType 872 0.597260
missing_data[missing_data['percent'] > 0.15]
total percent
LotFrontage 259 0.177397
Alley 1369 0.937671
MasVnrType 872 0.597260
FireplaceQu 690 0.472603
PoolQC 1453 0.995205
Fence 1179 0.807534
MiscFeature 1406 0.963014

缺失比例大于15%,我们考虑删除特征:

主要都是质量材料相关的缺失,应该不重要,可以删除

missing_data[(missing_data['percent'] <= 0.15) & (missing_data['percent'] >0)]
total percent
MasVnrArea 8 0.005479
BsmtQual 37 0.025342
BsmtCond 37 0.025342
BsmtExposure 38 0.026027
BsmtFinType1 37 0.025342
BsmtFinType2 38 0.026027
Electrical 1 0.000685
GarageType 81 0.055479
GarageYrBlt 81 0.055479
GarageFinish 81 0.055479
GarageQual 81 0.055479
GarageCond 81 0.055479

对于缺失小于15%的,

  • Bsmt 有相关的 TotalBsmtSF 表示。 删除特征

  • Garage 有相关的 GarageCars。 删除特征

  • Electrical删除行就可

eda_data[eda_data['Electrical'].isnull()].index
Index([1380], dtype='int64', name='Id')
eda_data = eda_data.drop(index=eda_data[eda_data['Electrical'].isnull()].index,axis=0)
traindata = traindata.drop(index=eda_data[eda_data['Electrical'].isnull()].index,axis=0)
missing_remove_num_cols = {'MasVnrArea'}
missing_remove_cat_cols = {'Alley', 'MasVnrType', 'Fence', 'MiscFeature','GarageType', 'GarageFinish'}
missing_remove_ord_cols = {'PoolQC', 'FireplaceQu','BsmtQual','BsmtCond','BsmtExposure', 'BsmtFinType1','BsmtFinType2', 'GarageQual', 'GarageCond'}
eda_data['GarageYrBlt'] = eda_data['GarageYrBlt'].fillna(eda_data['YearBuilt'])
eda_data.isnull().sum().sort_values(ascending=False)
PoolQC         1452
MiscFeature    1405
Alley          1368
Fence          1178
MasVnrType      871
               ... 
ExterCond         0
ExterQual         0
Exterior2nd       0
Exterior1st       0
SalePrice         0
Length: 80, dtype: int64

先进性粗略的填充使用median,none

异常值#

  • 删除行:

应该聚焦于那些关键的特征!

imp_numeric_cols =[
   'OverallQual','TotalBsmtSF', 'GrLivArea','1stFlrSF'
]
imp_category_cols = [
   'Foundation','LotShape', 'CentralAir','Electrical', 'PavedDrive'
    ]

关于离群异常点的定义,不能使用简单的IQR,很多数值特征都是长尾的,所需需要log变换

目标异常#

eda_data['SalePrice_log'] = np.log1p(eda_data['SalePrice'])
fig, (ax1,ax2) = plt.subplots(1,2, figsize=(10,4))
sns.histplot(data=eda_data, x='SalePrice_log', kde=True, ax=ax1)
sns.boxplot(data=eda_data, x='SalePrice_log', ax=ax2)
plt.tight_layout()
plt.show()
../../_images/289f764ec3293d9658ac9b9db43f658f49de802b41957985741d0dbc0f1501c4.png

双变量异常#

散点异常:删除点

eda_data[imp_numeric_cols]
OverallQual TotalBsmtSF GrLivArea 1stFlrSF
Id
1 7 856 1710 856
2 6 1262 1262 1262
3 7 920 1786 920
4 7 756 1717 961
5 8 1145 2198 1145
... ... ... ... ...
1456 6 953 1647 953
1457 6 1542 2073 2073
1458 7 1152 2340 1188
1459 5 1078 1078 1078
1460 5 1256 1256 1256

1459 rows × 4 columns

fig,axes = plt.subplots(1,4, figsize=(15,4))
axes = axes.flatten()
for i,col in enumerate(imp_numeric_cols):
    sns.scatterplot(data=eda_data, x=col, y='SalePrice', ax = axes[i])
plt.tight_layout()
plt.show()
../../_images/0ff51987e5f75be4443853f86d5095f60572f015ac848f4af2021744e46ed506.png

有一些离群点

eda_data[(eda_data['GrLivArea']>4000) & (eda_data['SalePrice']<200000)]
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice SalePrice_log
Id
524 60 RL 130.0 40094 Pave NaN IR1 Bnk AllPub Inside ... NaN NaN NaN 0 1 1970 New Partial 184750 12.126764
1299 60 RL 313.0 63887 Pave NaN IR3 Bnk AllPub Corner ... 4.0 NaN NaN 0 1 1970 New Partial 160000 11.982935

2 rows × 81 columns

eda_data[eda_data['TotalBsmtSF']>5000]
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice SalePrice_log
Id
1299 60 RL 313.0 63887 Pave NaN IR3 Bnk AllPub Corner ... 4.0 NaN NaN 0 1 1970 New Partial 160000 11.982935

1 rows × 81 columns

fig, axes = plt.subplots(len(imp_category_cols), 2, figsize=(16, 4 * len(imp_category_cols)))

for i, col in enumerate(imp_category_cols):
    sns.boxplot(data=eda_data, x='SalePrice_log', y=col, ax=axes[i, 0])
    sns.stripplot(data=eda_data, x='SalePrice_log', y=col, ax=axes[i, 0], alpha=0.4)

    sns.boxplot(data=eda_data, x='SalePrice', y=col, ax=axes[i, 1])
    sns.stripplot(data=eda_data, x='SalePrice', y=col, ax=axes[i, 1], alpha=0.4)
plt.tight_layout()
plt.show()
../../_images/bcec146c46b364bb738f7f51e7d355e9abcc480c3daee6fa5694a3c1424d3c01.png
eda_data[(eda_data['Foundation'] == 'PConc') & (eda_data['SalePrice'] > 700000)]
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice SalePrice_log
Id
692 60 RL 104.0 21535 Pave NaN IR1 Lvl AllPub Corner ... NaN NaN NaN 0 1 1970 WD Normal 755000 13.534474
1183 60 RL 160.0 15623 Pave NaN IR1 Lvl AllPub Corner ... 5.0 3.0 NaN 0 1 1970 WD Abnorml 745000 13.521141

2 rows × 81 columns

eda_data[(eda_data['Foundation'] == 'BrkTil') & (eda_data['SalePrice'] > 400000)]
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice SalePrice_log
Id
186 75 RM 90.0 22950 Pave NaN IR2 Lvl AllPub Inside ... NaN 4.0 NaN 0 1 1970 WD Normal 475000 13.071072

1 rows × 81 columns

eda_data[(eda_data['LotShape'] == 'IR1') & (eda_data['SalePrice'] > 700000)]
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice SalePrice_log
Id
692 60 RL 104.0 21535 Pave NaN IR1 Lvl AllPub Corner ... NaN NaN NaN 0 1 1970 WD Normal 755000 13.534474
1183 60 RL 160.0 15623 Pave NaN IR1 Lvl AllPub Corner ... 5.0 3.0 NaN 0 1 1970 WD Abnorml 745000 13.521141

2 rows × 81 columns

剔除异常行#

dropids = [524, 1299, 692, 1183,186]
# 来自model训练后的离散点分析,如需复现这几个ids,请注释掉,然后查看model分析
dropids.append(441) 
dropids.extend([971, 89, 689]) 
eda_data = eda_data.drop(dropids, axis=0)
traindata = traindata.drop(dropids, axis=0)

log变换#

log 减小了远端大值误差。 增大了近端小值误差。 结果就是:远端不离群了,小端更加离群了

sns.histplot(eda_data, x='TotalBsmtSF',kde=True)
<Axes: xlabel='TotalBsmtSF', ylabel='Count'>
../../_images/9b1f106fb973147eecd045532cc16fc2a45d29d55a5b38a01a2432ea02267571.png
  • TotalBsmtSF = 0 值太多。 这代表没有地下室。 我们不能对0做log变换

  • 远端有离群点

为此,我们需要创建一个特征,为了0值。只对非0值做转换

eda_data['HasBsmt'] = 1
eda_data['HasBsmt'][eda_data[eda_data['TotalBsmtSF'] == 0].index] = 0
eda_data.loc[eda_data['HasBsmt']==1, 'TotalBsmtSF'] = np.log(eda_data.loc[eda_data['HasBsmt']==1, 'TotalBsmtSF'])
sns.histplot(eda_data[eda_data['HasBsmt'] == 1], x='TotalBsmtSF',kde=True)
<Axes: xlabel='TotalBsmtSF', ylabel='Count'>
../../_images/b12758d624f84e5cdf5074aa112c6b5d09b87259ae2d562b266598016bdb50f4.png

Gauss-Markon假设#

误差同方差#

这里是不严谨的,应该是n维超线性空间的误差。

但是如果误差方差依赖了某个特征,那么就会波动,不恒定。

ax = sns.regplot(eda_data[eda_data['HasBsmt'] == 1], x='TotalBsmtSF', y='SalePrice')
../../_images/4eb523978e75883e0a28796fd77f6686fc35c1ab81e6c7f2753092fcf166d101.png

TotalBsmtSF特征而言,每个x值,y分布沿着回归线一致,即方差不变

误差正态假设#

为了后续显著性检验

误差,y,$\beta$ 是正态的

fig, (ax1, ax2) = plt.subplots(1,2, figsize=(10,4))
sns.histplot(data=eda_data, x='SalePrice',kde=True, ax= ax1)
res = stats.probplot(eda_data['SalePrice'], plot=ax2)
plt.tight_layout()
plt.show()
../../_images/5d885c4ab0246c39d93f35312a2b4b9c1de09045dd477144c0f288bd3450081a.png

数据向上弯表示右偏,红线为正态分布

通过log变换让他更像正态分布

eda_data['SalePrice_log'] = np.log(eda_data['SalePrice'])
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(10,4))
sns.histplot(data=eda_data, x='SalePrice_log',kde=True, ax= ax1)
res = stats.probplot(eda_data['SalePrice_log'], plot=ax2)
plt.tight_layout()
plt.show()
../../_images/a01c90a95c54d63e854931bb4578786e58403630c774a89ba7fa7e17815306fc.png

Warning

下面train和test同步

特征选择#

对列进行增加,删除,构建新特征

应该阐述我构造的思想~

有些特征与目标也不是线性关系,也得非线性转换

对于线性构造方法,旨在创造一些更具有解释意义的特征,但必须之后删除一些特征,保证满秩。

对于非线性构造方法,旨在捕捉隐藏特征,提高模型上限,

date数据类型可以转为pd.datetime

1. 年份#

  • 年份本身可以是变量,能够表明某些年份房价波动

  • 年份构造的age抵消了时间背景,表示age对房价也有波动

探索一下

eda_data['HouseAge'] = eda_data['YrSold'] -eda_data['YearBuilt']
fig, axes = plt.subplots(1, 3, figsize=(15,4))
sns.regplot(data=eda_data, x='YearBuilt', y = 'SalePrice', ax=axes[0])
sns.regplot(data=eda_data, x='HouseAge', y = 'SalePrice', ax=axes[1])
sns.regplot(data=eda_data, x='YrSold', y = 'SalePrice', ax=axes[2])
plt.tight_layout()
plt.show()
../../_images/056ca6275558c6adceb6fef2aee936af0915d1c302aec21411d39f4b7b67e1be.png
eda_data[['YearBuilt', 'HouseAge']].corr()
YearBuilt HouseAge
YearBuilt NaN NaN
HouseAge NaN NaN

图中表明,YrSold 对于构造HouseAge影响不大。也因此,相关性大。

结论:不构造HouseAge, 删除YrSold, 保留YearBuilt

2. 房屋面积#

eda_data['TotalSF'] = eda_data['1stFlrSF'] + eda_data['2ndFlrSF'] + eda_data['TotalBsmtSF']
fig, axes = plt.subplots(1, 4, figsize=(15,4))
axes = axes.flatten()
sns.regplot(data=eda_data, x='TotalSF', y = 'SalePrice', ax=axes[0])
sns.regplot(data=eda_data, x='1stFlrSF', y = 'SalePrice', ax=axes[1])
sns.regplot(data=eda_data, x='2ndFlrSF', y = 'SalePrice', ax=axes[2])
sns.regplot(data=eda_data, x='TotalBsmtSF', y = 'SalePrice', ax=axes[3])
plt.tight_layout()
plt.show()
../../_images/67a7a857214cc65f37dcfbe16bfcc2bc1fd3e3ca896f8217fa255cc03bf7484c.png
eda_data[['TotalSF', '1stFlrSF', '2ndFlrSF', 'TotalBsmtSF']].corr()
TotalSF 1stFlrSF 2ndFlrSF TotalBsmtSF
TotalSF 1.000000 0.527859 0.687724 0.185774
1stFlrSF 0.527859 1.000000 -0.253567 0.268795
2ndFlrSF 0.687724 -0.253567 1.000000 -0.020634
TotalBsmtSF 0.185774 0.268795 -0.020634 1.000000

结论:删除1stFlrSF, 保留TotalSF

3. 房屋面积*质量评分#

eda_data['HouseSF_PRODUCT_QUAL'] = eda_data['TotalSF'] * eda_data['OverallQual']
fig, axes = plt.subplots(1, 3, figsize=(12,4))
axes = axes.flatten()
sns.regplot(data=eda_data, x='TotalSF', y = 'SalePrice', ax=axes[0])
sns.regplot(data=eda_data, x='OverallQual', y = 'SalePrice', ax=axes[1])
sns.regplot(data=eda_data, x='HouseSF_PRODUCT_QUAL', y = 'SalePrice', ax=axes[2])
plt.tight_layout()
plt.show()
../../_images/3ad83a6fda911721e29ee25959e5c0b754be3e1fd0a773692e69a786bfabfd39.png
eda_data[['HouseSF_PRODUCT_QUAL', 'OverallQual', 'TotalSF', 'SalePrice']].corr()
HouseSF_PRODUCT_QUAL OverallQual TotalSF SalePrice
HouseSF_PRODUCT_QUAL 1.000000 0.829884 0.921501 0.869864
OverallQual 0.829884 1.000000 0.588959 0.798928
TotalSF 0.921501 0.588959 1.000000 0.730945
SalePrice 0.869864 0.798928 0.730945 1.000000

可以看到,我们构造的特征对房价有更高的相关性

结论:保留HouseSF_PRODUCT_QUAL

pipeline#

ft pipeline#

结合了eda和特征选择两部分。对列进行修改

def ft_features(df):
    """ 
    """
    df['MSSubClass'] = df['MSSubClass'].astype('category')

    df['HasBsmt'] = 1
    df['HasBsmt'][df[df['TotalBsmtSF'] == 0].index] = 0

    df['TotalSF'] = df['1stFlrSF'] + df['2ndFlrSF'] + df['TotalBsmtSF']
    df['HouseSF_PRODUCT_QUAL'] = df['TotalSF'] * df['OverallQual']

    df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['YearBuilt'])
    date_cols = ['GarageYrBlt','YearRemodAdd', 'YearBuilt', 'MoSold']
    df[date_cols] = df[date_cols].astype('datetime64[us]')
    df['GarageYrBlt'] = df['GarageYrBlt'].dt.year
    df['YearRemodAdd'] = df['YearRemodAdd'].dt.year
    df['YearBuilt'] = df['YearBuilt'].dt.year
    df['MoSold'] = df['MoSold'].dt.month

    cols_to_remove = [
        '1stFlrSF', 'YrSold',
        *missing_remove_num_cols,
        *missing_remove_cat_cols,
        *missing_remove_ord_cols,
        *no_imp_category_features,
    ]
    remaining_features = [f for f in df.columns if f not in cols_to_remove]

    df = df[remaining_features]

    print(f'feature_features transformer done. ')
    return df
def ft_feature_names_out(transformer, input_features):
    
    cols_to_remove = [
        '1stFlrSF', 'YrSold',
        *missing_remove_num_cols,
        *missing_remove_cat_cols,
        *missing_remove_ord_cols,
        *no_imp_category_features,
    ]
    remaining_features = [f for f in input_features if f not in cols_to_remove]
    
    new_cols = ['HasBsmt', 'TotalSF', 'HouseSF_PRODUCT_QUAL']
    for col in new_cols:
        if col not in remaining_features:
            remaining_features.append(col)
    return remaining_features
ft_pipeline = Pipeline([
    ('ft_features', FunctionTransformer(func=ft_features, feature_names_out=ft_feature_names_out))
])

date pipeline#

def to_int_type(df):
    return df.astype(int)
date_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])
date_pipeline.set_output(transform="pandas") # 配置每步输出df,而不是np arr
Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

numeric pipeline#

numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # 填充缺失值
    ('log_transform', FunctionTransformer(np.log1p, validate=False, feature_names_out="one-to-one")),  # 取对数
    ('std_scaler', StandardScaler()) # 标准化
])
numeric_pipeline.set_output(transform="pandas") # 配置每步输出df,而不是np arr
Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('log_transform',
                 FunctionTransformer(feature_names_out='one-to-one',
                                     func=<ufunc 'log1p'>)),
                ('std_scaler', StandardScaler())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

category pipeline#

categorical_pipeline  = Pipeline(steps=[
    ('onehot_encoder', OneHotEncoder(handle_unknown='ignore',sparse_output=False))  # 使用 OneHotEncoder
])
categorical_pipeline.set_output(transform="pandas") # 配置每步输出df,而不是np arr
Pipeline(steps=[('onehot_encoder',
                 OneHotEncoder(handle_unknown='ignore', sparse_output=False))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

ordinal pipeline#

def fix_map_encoder(df):
    df['OverallQual'] = df['OverallQual'].fillna(0)
    df['OverallCond'] = df['OverallCond'].fillna(0)

    df['ExterCond'] = df['ExterCond'].fillna('NA').map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}).fillna(0)
    df['ExterQual'] = df['ExterQual'].fillna('NA').map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}).fillna(0)
    df['KitchenQual'] = df['KitchenQual'].fillna('NA').map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}).fillna(0)
    df['HeatingQC'] = df['HeatingQC'].fillna('NA').map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}).fillna(0)
    print('fix_map_encoder transformer done.')
    return df.astype(int)
# 对有序类别进行编码
ordinal_pipeline = Pipeline(steps=[
    ('fix_map_encoder', FunctionTransformer(fix_map_encoder, feature_names_out='one-to-one')),    
])
ordinal_pipeline.set_output(transform="pandas") # 配置每步输出df,而不是np arr
Pipeline(steps=[('fix_map_encoder',
                 FunctionTransformer(feature_names_out='one-to-one',
                                     func=<function fix_map_encoder at 0x00000249C6686440>))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

baseline#

训练#

ft_pipeline完成后会确定所有特征

numeric_cols = ['LotFrontage','LotArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF',
      '2ndFlrSF','LowQualFinSF','GrLivArea','GarageArea',
     'WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea',
     'MiscVal','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr',
     'KitchenAbvGr','TotRmsAbvGrd','Fireplaces','GarageCars','HasBsmt', 'TotalSF', 'HouseSF_PRODUCT_QUAL']

categorical_cols = ['MSSubClass','CentralAir','MSZoning','LotShape','LotConfig',
      'Neighborhood','Condition1','BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st',
      'Exterior2nd','Foundation','Heating','Electrical','Functional',
      'PavedDrive','SaleType','SaleCondition'
]
date_cols = ['GarageYrBlt','YearRemodAdd','YearBuilt','MoSold' ]
ordinal_cols = ['OverallQual','OverallCond','ExterQual', 'ExterCond', 
    'HeatingQC', 'KitchenQual']
len(numeric_cols) + len(date_cols) + len(categorical_cols) + len(ordinal_cols)
59
sub_preprocessor = ColumnTransformer(
    transformers = [
        ('date',date_pipeline, date_cols),
        ('numeric', numeric_pipeline, numeric_cols),            # 数值型处理
        ('ordianl', ordinal_pipeline, ordinal_cols),            # 有序类别处理
        ('categoric', categorical_pipeline, categorical_cols)     # 无序类别处理
    ],
    remainder='drop'  # 其余列删除掉
)

preprocessor = Pipeline(
    steps=[
        ('ft',ft_pipeline),
        ('sub_preprocessor',sub_preprocessor)
    ]
)
@contextmanager
def timer(title):
    t0 = time.time()
    yield
    now = time.time()
    print(f'{title} {(now - t0) : .2f}s')
X_train = traindata.drop(columns = ['SalePrice']).copy()
y_train = traindata[['SalePrice']].copy()
X_train, X_valid, y_train, y_valid  = train_test_split(X_train, y_train, test_size=0.33,)
X_train.columns
Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal',
       'MoSold', 'YrSold', 'SaleType', 'SaleCondition'],
      dtype='object')
model_lasso = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', TransformedTargetRegressor(
        regressor = LassoCV(cv=20, max_iter=5000),
        func = np.log1p, # y转换
        inverse_func = np.expm1, 
    ))
])
with timer('model_lasso'):
    model_lasso.fit(X_train, y_train)
    
feature_features transformer done. 
fix_map_encoder transformer done.
model_lasso  1.69s
y_train_pred = model_lasso.predict(X_train)
feature_features transformer done. 
fix_map_encoder transformer done.
root_mean_squared_error(np.log(y_train_pred), np.log(y_train))
0.09871707791314763
y_valid_pred = model_lasso.predict(X_valid)
root_mean_squared_error(np.log(y_valid_pred), np.log(y_valid))
feature_features transformer done. 
fix_map_encoder transformer done.
0.11064641975430012

可以看到,在训练集和验证集上,效果都不太好

模型解释#

features_names_out = model_lasso[0].get_feature_names_out()
model = model_lasso.named_steps['regressor']
lasso = model.regressor_ # 内部模型
lasso.alpha_
0.0004540549206624422
coefs = lasso.coef_
feature_coef_df = pd.DataFrame({
    'feature': features_names_out,
    'coef': coefs,
    'coef_abs': np.abs(coefs)
})
feature_coef_df.shape
(198, 3)
(feature_coef_df['coef_abs'] == 0).sum() /  len(feature_coef_df)
0.5656565656565656

有64%的特征系数为0

high20_coef_features= feature_coef_df[feature_coef_df['coef_abs'] != 0].sort_values(by='coef_abs', ascending=False).head(20)
# 条形图
sns.barplot(data=high20_coef_features, x='coef', y='feature')
<Axes: xlabel='coef', ylabel='feature'>
../../_images/27b2c9a56d25c1a164219a36af37bec897f7b1479fb3d7a31deef65730fc3142.png
feature_coef_df[feature_coef_df['feature'] == 'categoric__SaleCondition_Abnorml']
feature coef coef_abs
192 categoric__SaleCondition_Abnorml -0.100477 0.100477
feature_coef_df['feature'].to_list()
['date__GarageYrBlt',
 'date__YearRemodAdd',
 'date__YearBuilt',
 'date__MoSold',
 'numeric__LotFrontage',
 'numeric__LotArea',
 'numeric__BsmtFinSF1',
 'numeric__BsmtFinSF2',
 'numeric__BsmtUnfSF',
 'numeric__TotalBsmtSF',
 'numeric__2ndFlrSF',
 'numeric__LowQualFinSF',
 'numeric__GrLivArea',
 'numeric__GarageArea',
 'numeric__WoodDeckSF',
 'numeric__OpenPorchSF',
 'numeric__EnclosedPorch',
 'numeric__3SsnPorch',
 'numeric__ScreenPorch',
 'numeric__PoolArea',
 'numeric__MiscVal',
 'numeric__BsmtFullBath',
 'numeric__BsmtHalfBath',
 'numeric__FullBath',
 'numeric__HalfBath',
 'numeric__BedroomAbvGr',
 'numeric__KitchenAbvGr',
 'numeric__TotRmsAbvGrd',
 'numeric__Fireplaces',
 'numeric__GarageCars',
 'numeric__HasBsmt',
 'numeric__TotalSF',
 'numeric__HouseSF_PRODUCT_QUAL',
 'ordianl__OverallQual',
 'ordianl__OverallCond',
 'ordianl__ExterQual',
 'ordianl__ExterCond',
 'ordianl__HeatingQC',
 'ordianl__KitchenQual',
 'categoric__MSSubClass_20',
 'categoric__MSSubClass_30',
 'categoric__MSSubClass_40',
 'categoric__MSSubClass_45',
 'categoric__MSSubClass_50',
 'categoric__MSSubClass_60',
 'categoric__MSSubClass_70',
 'categoric__MSSubClass_75',
 'categoric__MSSubClass_80',
 'categoric__MSSubClass_85',
 'categoric__MSSubClass_90',
 'categoric__MSSubClass_120',
 'categoric__MSSubClass_160',
 'categoric__MSSubClass_180',
 'categoric__MSSubClass_190',
 'categoric__CentralAir_N',
 'categoric__CentralAir_Y',
 'categoric__MSZoning_C (all)',
 'categoric__MSZoning_FV',
 'categoric__MSZoning_RH',
 'categoric__MSZoning_RL',
 'categoric__MSZoning_RM',
 'categoric__LotShape_IR1',
 'categoric__LotShape_IR2',
 'categoric__LotShape_IR3',
 'categoric__LotShape_Reg',
 'categoric__LotConfig_Corner',
 'categoric__LotConfig_CulDSac',
 'categoric__LotConfig_FR2',
 'categoric__LotConfig_FR3',
 'categoric__LotConfig_Inside',
 'categoric__Neighborhood_Blmngtn',
 'categoric__Neighborhood_BrDale',
 'categoric__Neighborhood_BrkSide',
 'categoric__Neighborhood_ClearCr',
 'categoric__Neighborhood_CollgCr',
 'categoric__Neighborhood_Crawfor',
 'categoric__Neighborhood_Edwards',
 'categoric__Neighborhood_Gilbert',
 'categoric__Neighborhood_IDOTRR',
 'categoric__Neighborhood_MeadowV',
 'categoric__Neighborhood_Mitchel',
 'categoric__Neighborhood_NAmes',
 'categoric__Neighborhood_NPkVill',
 'categoric__Neighborhood_NWAmes',
 'categoric__Neighborhood_NoRidge',
 'categoric__Neighborhood_NridgHt',
 'categoric__Neighborhood_OldTown',
 'categoric__Neighborhood_SWISU',
 'categoric__Neighborhood_Sawyer',
 'categoric__Neighborhood_SawyerW',
 'categoric__Neighborhood_Somerst',
 'categoric__Neighborhood_StoneBr',
 'categoric__Neighborhood_Timber',
 'categoric__Neighborhood_Veenker',
 'categoric__Condition1_Artery',
 'categoric__Condition1_Feedr',
 'categoric__Condition1_Norm',
 'categoric__Condition1_PosA',
 'categoric__Condition1_PosN',
 'categoric__Condition1_RRAe',
 'categoric__Condition1_RRAn',
 'categoric__Condition1_RRNe',
 'categoric__Condition1_RRNn',
 'categoric__BldgType_1Fam',
 'categoric__BldgType_2fmCon',
 'categoric__BldgType_Duplex',
 'categoric__BldgType_Twnhs',
 'categoric__BldgType_TwnhsE',
 'categoric__HouseStyle_1.5Fin',
 'categoric__HouseStyle_1.5Unf',
 'categoric__HouseStyle_1Story',
 'categoric__HouseStyle_2.5Fin',
 'categoric__HouseStyle_2.5Unf',
 'categoric__HouseStyle_2Story',
 'categoric__HouseStyle_SFoyer',
 'categoric__HouseStyle_SLvl',
 'categoric__RoofStyle_Flat',
 'categoric__RoofStyle_Gable',
 'categoric__RoofStyle_Gambrel',
 'categoric__RoofStyle_Hip',
 'categoric__RoofStyle_Mansard',
 'categoric__RoofStyle_Shed',
 'categoric__RoofMatl_CompShg',
 'categoric__RoofMatl_Metal',
 'categoric__RoofMatl_Roll',
 'categoric__RoofMatl_Tar&Grv',
 'categoric__RoofMatl_WdShake',
 'categoric__RoofMatl_WdShngl',
 'categoric__Exterior1st_AsbShng',
 'categoric__Exterior1st_AsphShn',
 'categoric__Exterior1st_BrkComm',
 'categoric__Exterior1st_BrkFace',
 'categoric__Exterior1st_CemntBd',
 'categoric__Exterior1st_HdBoard',
 'categoric__Exterior1st_ImStucc',
 'categoric__Exterior1st_MetalSd',
 'categoric__Exterior1st_Plywood',
 'categoric__Exterior1st_Stucco',
 'categoric__Exterior1st_VinylSd',
 'categoric__Exterior1st_Wd Sdng',
 'categoric__Exterior1st_WdShing',
 'categoric__Exterior2nd_AsbShng',
 'categoric__Exterior2nd_AsphShn',
 'categoric__Exterior2nd_Brk Cmn',
 'categoric__Exterior2nd_BrkFace',
 'categoric__Exterior2nd_CmentBd',
 'categoric__Exterior2nd_HdBoard',
 'categoric__Exterior2nd_ImStucc',
 'categoric__Exterior2nd_MetalSd',
 'categoric__Exterior2nd_Other',
 'categoric__Exterior2nd_Plywood',
 'categoric__Exterior2nd_Stone',
 'categoric__Exterior2nd_Stucco',
 'categoric__Exterior2nd_VinylSd',
 'categoric__Exterior2nd_Wd Sdng',
 'categoric__Exterior2nd_Wd Shng',
 'categoric__Foundation_BrkTil',
 'categoric__Foundation_CBlock',
 'categoric__Foundation_PConc',
 'categoric__Foundation_Slab',
 'categoric__Foundation_Stone',
 'categoric__Heating_Floor',
 'categoric__Heating_GasA',
 'categoric__Heating_GasW',
 'categoric__Heating_Grav',
 'categoric__Heating_OthW',
 'categoric__Heating_Wall',
 'categoric__Electrical_FuseA',
 'categoric__Electrical_FuseF',
 'categoric__Electrical_FuseP',
 'categoric__Electrical_Mix',
 'categoric__Electrical_SBrkr',
 'categoric__Electrical_nan',
 'categoric__Functional_Maj1',
 'categoric__Functional_Maj2',
 'categoric__Functional_Min1',
 'categoric__Functional_Min2',
 'categoric__Functional_Mod',
 'categoric__Functional_Sev',
 'categoric__Functional_Typ',
 'categoric__PavedDrive_N',
 'categoric__PavedDrive_P',
 'categoric__PavedDrive_Y',
 'categoric__SaleType_COD',
 'categoric__SaleType_CWD',
 'categoric__SaleType_Con',
 'categoric__SaleType_ConLD',
 'categoric__SaleType_ConLI',
 'categoric__SaleType_ConLw',
 'categoric__SaleType_New',
 'categoric__SaleType_Oth',
 'categoric__SaleType_WD',
 'categoric__SaleCondition_Abnorml',
 'categoric__SaleCondition_AdjLand',
 'categoric__SaleCondition_Alloca',
 'categoric__SaleCondition_Family',
 'categoric__SaleCondition_Normal',
 'categoric__SaleCondition_Partial']

效果是合理的。

解释:

  1. MSZoning_C表示在商业区的房子,对房价负面影响大,价格就会低

假设验证#

residuals = np.log(y_valid_pred) - np.log(y_valid)
residuals = residuals.rename(columns={'SalePrice': 'residual'})
residuals['price'] = y_valid
residuals['pred'] = y_valid_pred
residuals['residual'].skew()
residuals['residual'].kurt()
7.476599229884512
residuals.head()
residual price pred
Id
300 0.081613 158500 171978.234917
1108 -0.092221 274725 250522.735459
801 -0.020707 200000 195901.273403
595 -0.025818 110000 107196.339264
1117 0.095717 184100 202592.345412
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(10,4))
sns.histplot(data=residuals, x='residual',kde=True, ax= ax1)
res = stats.probplot(residuals['residual'], plot=ax2)
plt.tight_layout()
plt.show()
../../_images/2c708ace67287d0726bc609d86767c6215f18d52b2919ef11b9e117fc38d6f39.png
residuals[(residuals['residual'] < 0.2) & (residuals['residual'] >-0.2)].shape[0] / len(residuals)
0.9394572025052192
residuals['residual'].std()
0.11066593180150201

误差正态假设基本是成立的,在数据集中区域尤为明显。

残差在$[-0.2, 0.2]$的数据占比93%。 样本标准差为0.1,参数估计的话, 符合$2\sigma$内分布95%数据

那些离群点需要研究下,

离群#

找出这些离群点的共同特点,

Warning

由于没有version控制,notebook不能够反映思路:通过训练再去修改eda过程

根据probplot划分左右

与 #剔除异常行 配合

左侧#

都是预测值小于真实值!

residuals[residuals['residual'] < -0.3]
residual price pred
Id
804 -0.346820 582933 412094.131402
682 -0.460682 159434 100579.438356
329 -0.333022 214500 153743.842153

3个离散点,直接删除即可

右侧#

着重看下系数大于0的特征。neighborhood, totalsf

feature_coef_df[feature_coef_df['coef'] > 0].sort_values(by='coef', ascending=False).head(10)
feature coef coef_abs
31 numeric__TotalSF 0.135101 0.135101
91 categoric__Neighborhood_StoneBr 0.088154 0.088154
75 categoric__Neighborhood_Crawfor 0.085541 0.085541
33 ordianl__OverallQual 0.076441 0.076441
179 categoric__Functional_Typ 0.076041 0.076041
73 categoric__Neighborhood_ClearCr 0.070117 0.070117
85 categoric__Neighborhood_NridgHt 0.058988 0.058988
84 categoric__Neighborhood_NoRidge 0.046653 0.046653
5 numeric__LotArea 0.046062 0.046062
163 categoric__Heating_GasW 0.045141 0.045141
hard_mask = (residuals['residual'] > 0.2) 
hard_ids = residuals[hard_mask].index
hard_df = eda_data.loc[hard_ids, :]
hard_df.head()
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... MoSold YrSold SaleType SaleCondition SalePrice SalePrice_log HasBsmt HouseAge TotalSF HouseSF_PRODUCT_QUAL
Id
706 190 RM 70.0 5600 Pave NaN Reg Lvl AllPub Inside ... 1 1970 WD Normal 55000 10.915088 0 0 1092.000000 4368.000000
1388 50 RM 60.0 8520 Pave Grvl Reg Lvl AllPub Inside ... 1 1970 CWD Family 136000 11.820410 1 0 2532.570883 15195.425298
1217 90 RM 68.0 8930 Pave NaN Reg Lvl AllPub Inside ... 1 1970 WD Normal 112000 11.626254 0 0 1902.000000 11412.000000
790 60 RL NaN 12205 Pave NaN IR1 Low AllPub Inside ... 1 1970 WD Normal 187500 12.141534 1 0 2093.723832 12562.342995
561 20 RL NaN 11341 Pave NaN IR1 Lvl AllPub Inside ... 1 1970 WD Normal 121500 11.707670 1 0 1399.238497 6996.192484

5 rows × 85 columns

totalsf

sns.scatterplot(data=hard_df, x='TotalSF', y='SalePrice')
<Axes: xlabel='TotalSF', ylabel='SalePrice'>
../../_images/4aa90482364816d67f80da7919c2b7114655ff60cdb965cc5aa1c2a5eeb7a46c.png
hard_df[hard_df['TotalSF'] > 2250]
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... MoSold YrSold SaleType SaleCondition SalePrice SalePrice_log HasBsmt HouseAge TotalSF HouseSF_PRODUCT_QUAL
Id
1388 50 RM 60.0 8520 Pave Grvl Reg Lvl AllPub Inside ... 1 1970 CWD Family 136000 11.82041 1 0 2532.570883 15195.425298

1 rows × 85 columns

saletype

hard_df['SaleType'].value_counts()
SaleType
WD     13
New     2
CWD     1
Name: count, dtype: int64

WD是最普通的方式,没什么问题

sns.boxplot(data=hard_df, x='SaleType', y='SalePrice')
sns.stripplot(data=hard_df, x='SaleType', y='SalePrice')
<Axes: xlabel='SaleType', ylabel='SalePrice'>
../../_images/03cba12b1dc9e6998bf08548519faaa7dc89fe738cb4a46dc47d850054bf2bf1.png

Neighborhood:StoneBr,NridgHt,Crawfor,NoRidge

plt.figure(figsize=(10,4))
sns.boxplot(data=hard_df, x='Neighborhood', y='SalePrice')
sns.stripplot(data=hard_df, x='Neighborhood', y='SalePrice')
<Axes: xlabel='Neighborhood', ylabel='SalePrice'>
../../_images/3df5864fbc7b1c51edd25fd0aa10b835e15569cc164ac598a6c584c769ae888e.png
plt.figure(figsize=(20,4))
sns.boxplot(data=eda_data, x='Neighborhood', y='SalePrice')
sns.stripplot(data=eda_data, x='Neighborhood', y='SalePrice')
<Axes: xlabel='Neighborhood', ylabel='SalePrice'>
../../_images/d8b275bd86998f08fc9472881183650da7c1a5874b4828984239690f1e71071c.png

Functional_Typ

plt.figure(figsize=(10,4))
sns.boxplot(data=hard_df, x='Functional', y='SalePrice')
sns.stripplot(data=hard_df, x='Functional', y='SalePrice')
<Axes: xlabel='Functional', ylabel='SalePrice'>
../../_images/25c1051a43e11e47c04e6cc03cb10364cb2dc5814aef9fcbe070aa5fdd88a0be.png

OverallQual

plt.figure(figsize=(10,4))
sns.boxplot(data=hard_df, x='OverallQual', y='SalePrice')
sns.stripplot(data=hard_df, x='OverallQual', y='SalePrice')
<Axes: xlabel='OverallQual', ylabel='SalePrice'>
../../_images/331c7984e4634705b4b46a82f6e570702a2ff7ee531ef1ab473d8896e67ec358.png

Exterior1st_BrkFace

plt.figure(figsize=(10,4))
sns.boxplot(data=hard_df, x='Exterior1st', y='SalePrice')
sns.stripplot(data=hard_df, x='Exterior1st', y='SalePrice')
<Axes: xlabel='Exterior1st', ylabel='SalePrice'>
../../_images/6afa73170f4fa16d8f6660343eefc1ef94a52faf7a54c494f48955cbf86a4589.png

GrLivArea

sns.scatterplot(data=hard_df, x='GrLivArea', y='SalePrice')
<Axes: xlabel='GrLivArea', ylabel='SalePrice'>
../../_images/e5074168573ef46e8622d5a9eafbec07721cd58601d887a562dcbe10bfa0a886.png

都没什么大问题,删除那个异常点试试

submit#

testdata.head()
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
Id
1461 20 RH 80.0 11622 Pave NaN Reg Lvl AllPub Inside ... 120 0 NaN MnPrv NaN 0 6 2010 WD Normal
1462 20 RL 81.0 14267 Pave NaN IR1 Lvl AllPub Corner ... 0 0 NaN NaN Gar2 12500 6 2010 WD Normal
1463 60 RL 74.0 13830 Pave NaN IR1 Lvl AllPub Inside ... 0 0 NaN MnPrv NaN 0 3 2010 WD Normal
1464 60 RL 78.0 9978 Pave NaN IR1 Lvl AllPub Inside ... 0 0 NaN NaN NaN 0 6 2010 WD Normal
1465 120 RL 43.0 5005 Pave NaN IR1 HLS AllPub Inside ... 144 0 NaN NaN NaN 0 1 2010 WD Normal

5 rows × 79 columns

testdata.index
Index([1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, 1469, 1470,
       ...
       2910, 2911, 2912, 2913, 2914, 2915, 2916, 2917, 2918, 2919],
      dtype='int64', name='Id', length=1459)
preds = model_lasso.predict(testdata)
feature_features transformer done. 
fix_map_encoder transformer done.
submit(testdata.index, preds.flatten(), name='lasso_baseline', feature_count= len(feature_coef_df))
ID SalePrice
0 1461 118348.668143
1 1462 170322.125056
2 1463 182666.123600
3 1464 205888.913258
4 1465 205860.298380
... ... ...
1454 2915 86879.146192
1455 2916 80897.026129
1456 2917 173171.990091
1457 2918 110314.381332
1458 2919 222879.322699

1459 rows × 2 columns

得分0.13

至此,完成了最基本的训练、特征工程。得分与社区接近

提升得分的方式

  1. 引入非线性模型如xgboost,比如$0.7linear + 0.3xgboost$

  2. 试图找到更好的特征

一些问答#

  1. 最小化残差平方和RSS估计\beta等价于 误差在正态分布假设下对参数进行极大似然估计。

  1. 多重共线性下(特征高度相关), 对\beta的影响?

X_MATH = preprocessor.fit_transform(X_train)
feature_features transformer done. 
fix_map_encoder transformer done.
X_MATH.shape
(972, 198)
np.linalg.matrix_rank(X_MATH)
172

可以看到在预处理下不是满秩的,这不符合基本线性模型假设。

这是因为one-hot时候,生成的0,1,会满足加和=1.

两种解决策略:

  1. onehot drop-first

  2. 引入正则项(推荐)

lasso能做是因为加了一个L2正则项$\lambda$

l = lasso.alpha_
np.linalg.matrix_rank(X_MATH @ X_MATH.T + np.eye(len(X_MATH)) * l)
172

这是浮点计算的问题,l比较小,很多就会截断, 实际理论也是满秩的