pandas#

import numpy as np
import pandas as pd
pd.__version__
'2.3.1'

nlargest

data[[‘far_price’, ‘near_price’]] = data[[‘far_price’, ‘near_price’]].fillna(data[‘reference_price’])

dropna, drop行

.all(axis=1)

.any()

quickstart#

df = pd.DataFrame({
    'height':[1,2,3],
    'score':[0,1,2]
})
df.sort_values(by='score', ascending=False)
height score
2 3 2
1 2 1
0 1 0

对象创建#

通过一个list创建 series,默认索引为RangeIndex

s = pd.Series([1,2, np.nan, 4])
s
0    1.0
1    2.0
2    NaN
3    4.0
dtype: float64

通过date_range创建df, 带有时间索引

dates = pd.date_range('20201010', periods=6)
dates
DatetimeIndex(['2020-10-10', '2020-10-11', '2020-10-12', '2020-10-13',
               '2020-10-14', '2020-10-15'],
              dtype='datetime64[us]', freq='D')
df = pd.DataFrame(
    np.random.randn(6, 4),
    index = dates,
    columns = list('ABCD')
)
df
A B C D
2020-10-10 -0.987835 -0.300875 1.914156 -0.911640
2020-10-11 -1.806821 0.835906 -0.331015 0.094304
2020-10-12 0.506742 -0.020602 0.216988 -2.628958
2020-10-13 0.967278 1.758684 0.758026 -0.418628
2020-10-14 -0.259991 -0.165694 0.313832 0.581593
2020-10-15 -0.324487 -1.197632 1.761982 -0.595702

通过字典创建 df

df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20201010'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3] * 4, dtype='int32'),
        'E': pd.Categorical(['test','train', 'test', 'train']),
        'F': 'FOO'
    }
)
df2
A B C D E F
0 1.0 2020-10-10 1.0 3 test FOO
1 1.0 2020-10-10 1.0 3 train FOO
2 1.0 2020-10-10 1.0 3 test FOO
3 1.0 2020-10-10 1.0 3 train FOO
df2.dtypes
A           float64
B    datetime64[us]
C           float32
D             int32
E          category
F               str
dtype: object

dtypes#

  • pandas 会有逻辑类型和存储类型之分

  • select_dtypes的是逻辑类型

  • dtypes是存储类型

  • 常见的,

    • number逻辑类型包含int64, float32等等;

    • object包含object

    • str

    • category: 常常应该把Object类型转换为这个,其他如lgbm才会处理

    • bool

    • datetime

df = pd.DataFrame(
    {
        "A": np.random.rand(3),
        "B": 1,
        "C": "foo",
        "D": pd.Timestamp("20010102"),
        "E": pd.Series([1.0] * 3).astype("float32"),
        "F": False,
        "G": pd.Series([1] * 3, dtype="int8"),
        "H": pd.Series([1] * 3, dtype="int8"),
    }
)
df
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[1], line 1
----> 1 df = pd.DataFrame(
      2     {
      3         "A": np.random.rand(3),
      4         "B": 1,
      5         "C": "foo",
      6         "D": pd.Timestamp("20010102"),
      7         "E": pd.Series([1.0] * 3).astype("float32"),
      8         "F": False,
      9         "G": pd.Series([1] * 3, dtype="int8"),
     10         "H": pd.Series([1] * 3, dtype="int8"),
     11     }
     12 )

NameError: name 'pd' is not defined
 traindata.select_dtypes(include=['datetime']).columns
df.dtypes
A           float64
B             int64
C               str
D    datetime64[us]
E           float32
F              bool
G              int8
dtype: object
df.select_dtypes(include=['str'])
C
0 foo
1 foo
2 foo
df.select_dtypes(include=['number'])
A B E G
0 0.910442 1 1.0 1
1 0.504557 1 1.0 1
2 0.071116 1 1.0 1

view data#

df.head()
A B C D
2020-10-10 -0.987835 -0.300875 1.914156 -0.911640
2020-10-11 -1.806821 0.835906 -0.331015 0.094304
2020-10-12 0.506742 -0.020602 0.216988 -2.628958
2020-10-13 0.967278 1.758684 0.758026 -0.418628
2020-10-14 -0.259991 -0.165694 0.313832 0.581593
df.tail()
A B C D
2020-10-11 -1.806821 0.835906 -0.331015 0.094304
2020-10-12 0.506742 -0.020602 0.216988 -2.628958
2020-10-13 0.967278 1.758684 0.758026 -0.418628
2020-10-14 -0.259991 -0.165694 0.313832 0.581593
2020-10-15 -0.324487 -1.197632 1.761982 -0.595702
df.index
DatetimeIndex(['2020-10-10', '2020-10-11', '2020-10-12', '2020-10-13',
               '2020-10-14', '2020-10-15'],
              dtype='datetime64[us]', freq='D')
df.columns
Index(['A', 'B', 'C', 'D'], dtype='str')
df.to_numpy()
array([[-0.98783525, -0.30087463,  1.91415639, -0.91164047],
       [-1.80682121,  0.83590589, -0.33101498,  0.0943043 ],
       [ 0.50674201, -0.02060168,  0.21698811, -2.62895756],
       [ 0.96727811,  1.75868444,  0.7580259 , -0.41862832],
       [-0.25999123, -0.16569436,  0.31383164,  0.5815927 ],
       [-0.32448715, -1.1976322 ,  1.76198225, -0.59570173]])
df2.to_numpy()
array([[1.0, Timestamp('2020-10-10 00:00:00'), 1.0, 3, 'test', 'FOO'],
       [1.0, Timestamp('2020-10-10 00:00:00'), 1.0, 3, 'train', 'FOO'],
       [1.0, Timestamp('2020-10-10 00:00:00'), 1.0, 3, 'test', 'FOO'],
       [1.0, Timestamp('2020-10-10 00:00:00'), 1.0, 3, 'train', 'FOO']],
      dtype=object)
df.describe()
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean -0.317519 0.151631 0.772328 -0.646505
std 1.000081 1.020440 0.896591 1.105617
min -1.806821 -1.197632 -0.331015 -2.628958
25% -0.821998 -0.267080 0.241199 -0.832656
50% -0.292239 -0.093148 0.535929 -0.507165
75% 0.315059 0.621779 1.510993 -0.033929
max 0.967278 1.758684 1.914156 0.581593
df.T
2020-10-10 2020-10-11 2020-10-12 2020-10-13 2020-10-14 2020-10-15
A -0.987835 -1.806821 0.506742 0.967278 -0.259991 -0.324487
B -0.300875 0.835906 -0.020602 1.758684 -0.165694 -1.197632
C 1.914156 -0.331015 0.216988 0.758026 0.313832 1.761982
D -0.911640 0.094304 -2.628958 -0.418628 0.581593 -0.595702
df.sort_index(axis=1, ascending=False) # 索引(列)排序
D C B A
2020-10-10 -0.911640 1.914156 -0.300875 -0.987835
2020-10-11 0.094304 -0.331015 0.835906 -1.806821
2020-10-12 -2.628958 0.216988 -0.020602 0.506742
2020-10-13 -0.418628 0.758026 1.758684 0.967278
2020-10-14 0.581593 0.313832 -0.165694 -0.259991
2020-10-15 -0.595702 1.761982 -1.197632 -0.324487
df.sort_values(by='B') # 行排序
A B C D
2020-10-15 -0.324487 -1.197632 1.761982 -0.595702
2020-10-10 -0.987835 -0.300875 1.914156 -0.911640
2020-10-14 -0.259991 -0.165694 0.313832 0.581593
2020-10-12 0.506742 -0.020602 0.216988 -2.628958
2020-10-11 -1.806821 0.835906 -0.331015 0.094304
2020-10-13 0.967278 1.758684 0.758026 -0.418628
df.info()

selection#

[]#

df['A']
2020-10-10   -0.987835
2020-10-11   -1.806821
2020-10-12    0.506742
2020-10-13    0.967278
2020-10-14   -0.259991
2020-10-15   -0.324487
Freq: D, Name: A, dtype: float64

如果列名只包括字母、数、下户线,就可以这样获取

df.A
2020-10-10   -0.987835
2020-10-11   -1.806821
2020-10-12    0.506742
2020-10-13    0.967278
2020-10-14   -0.259991
2020-10-15   -0.324487
Freq: D, Name: A, dtype: float64
df[['A', 'B']]
A B
2020-10-10 -0.987835 -0.300875
2020-10-11 -1.806821 0.835906
2020-10-12 0.506742 -0.020602
2020-10-13 0.967278 1.758684
2020-10-14 -0.259991 -0.165694
2020-10-15 -0.324487 -1.197632

获取行

df[0:3] 
A B C D
2020-10-10 -0.987835 -0.300875 1.914156 -0.911640
2020-10-11 -1.806821 0.835906 -0.331015 0.094304
2020-10-12 0.506742 -0.020602 0.216988 -2.628958

loc#

dates[0]
Timestamp('2020-10-10 00:00:00')
df.loc[dates[0]]
A   -0.987835
B   -0.300875
C    1.914156
D   -0.911640
Name: 2020-10-10 00:00:00, dtype: float64
df.loc[:, ['A', 'B']]
A B
2020-10-10 -0.987835 -0.300875
2020-10-11 -1.806821 0.835906
2020-10-12 0.506742 -0.020602
2020-10-13 0.967278 1.758684
2020-10-14 -0.259991 -0.165694
2020-10-15 -0.324487 -1.197632

注意第一个参数,是两边闭

df.loc['2020-10-10':'2020-10-13', ['A', 'B']]
A B
2020-10-10 -0.987835 -0.300875
2020-10-11 -1.806821 0.835906
2020-10-12 0.506742 -0.020602
2020-10-13 0.967278 1.758684
df.loc[dates[0], 'A']
np.float64(-0.9878352544417939)
df.at[dates[0], 'A']
np.float64(-0.9878352544417939)

at获取一个位置数值,和loc 等价的

iloc#

通过位置选择行

df.iloc[3]
A    0.967278
B    1.758684
C    0.758026
D   -0.418628
Name: 2020-10-13 00:00:00, dtype: float64
df.iloc[:2, 1:]
B C D
2020-10-10 -0.300875 1.914156 -0.911640
2020-10-11 0.835906 -0.331015 0.094304
df.iloc[1,1]
np.float64(0.8359058858792715)
df.iat[1,1]
np.float64(0.8359058858792715)

bool indexing#

  • 选择行,使列满足什么条件

df[df['A']>0]
A B C D
2020-10-12 0.506742 -0.020602 0.216988 -2.628958
2020-10-13 0.967278 1.758684 0.758026 -0.418628
df[df>0]
A B C D
2020-10-10 NaN NaN 1.914156 NaN
2020-10-11 NaN 0.835906 NaN 0.094304
2020-10-12 0.506742 NaN 0.216988 NaN
2020-10-13 0.967278 1.758684 0.758026 NaN
2020-10-14 NaN NaN 0.313832 0.581593
2020-10-15 NaN NaN 1.761982 NaN
df2 = df.copy()
df2["E"] = ["one", "one", "two", "three", "four", "three"]
df2
A B C D E
2020-10-10 -0.987835 -0.300875 1.914156 -0.911640 one
2020-10-11 -1.806821 0.835906 -0.331015 0.094304 one
2020-10-12 0.506742 -0.020602 0.216988 -2.628958 two
2020-10-13 0.967278 1.758684 0.758026 -0.418628 three
2020-10-14 -0.259991 -0.165694 0.313832 0.581593 four
2020-10-15 -0.324487 -1.197632 1.761982 -0.595702 three
df[df2['E'].isin(['two','four'])]
A B C D
2020-10-12 0.506742 -0.020602 0.216988 -2.628958
2020-10-14 -0.259991 -0.165694 0.313832 0.581593

set value#

s1 = pd.Series(
    [1,2,3,4,5,6],
    index = pd.date_range('20201010', periods=6)
)
s1
2020-10-10    1
2020-10-11    2
2020-10-12    3
2020-10-13    4
2020-10-14    5
2020-10-15    6
Freq: D, dtype: int64

为df设置一个新列

df['F'] = s1
df
A B C D F
2020-10-10 -0.987835 -0.300875 1.914156 -0.911640 1
2020-10-11 -1.806821 0.835906 -0.331015 0.094304 2
2020-10-12 0.506742 -0.020602 0.216988 -2.628958 3
2020-10-13 0.967278 1.758684 0.758026 -0.418628 4
2020-10-14 -0.259991 -0.165694 0.313832 0.581593 5
2020-10-15 -0.324487 -1.197632 1.761982 -0.595702 6
df.at[dates[0], 'A'] = 0
df
A B C D F
2020-10-10 0.000000 -0.300875 1.914156 -0.911640 1
2020-10-11 -1.806821 0.835906 -0.331015 0.094304 2
2020-10-12 0.506742 -0.020602 0.216988 -2.628958 3
2020-10-13 0.967278 1.758684 0.758026 -0.418628 4
2020-10-14 -0.259991 -0.165694 0.313832 0.581593 5
2020-10-15 -0.324487 -1.197632 1.761982 -0.595702 6
df.loc[:, 'D'] = np.array([5] * len(df))
df
A B C D F
2020-10-10 0.000000 -0.300875 1.914156 5.0 1
2020-10-11 -1.806821 0.835906 -0.331015 5.0 2
2020-10-12 0.506742 -0.020602 0.216988 5.0 3
2020-10-13 0.967278 1.758684 0.758026 5.0 4
2020-10-14 -0.259991 -0.165694 0.313832 5.0 5
2020-10-15 -0.324487 -1.197632 1.761982 5.0 6

我们也可以使用bool条件设置

df2 = df.copy()
df2[df2 > 0] = -df2
df2
0 1 2 3 target
0 -0.964237 -0.573508 -1.109884 -1.506903 -0.348507
1 -1.361796 -1.311485 -0.361705 -1.636924 -0.449459
2 -1.122803 -2.252449 -1.261629 -0.575619 -0.157774

例子:

df = pd.DataFrame(np.random.randn(3,4))
df
0 1 2 3
0 -1.098862 0.859736 0.448311 -0.563639
1 0.359723 0.063172 1.273104 0.212454
2 -0.162782 0.734354 -0.368384 -0.874321
df.index
RangeIndex(start=0, stop=3, step=1)
df.columns
RangeIndex(start=0, stop=4, step=1)
df[1]
0    0.859736
1    0.063172
2    0.734354
Name: 1, dtype: float64

missing data#

  • np.nan 表示缺失值

df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1
A B C D F E
2020-10-10 0.000000 -0.300875 1.914156 5.0 1 NaN
2020-10-11 -1.806821 0.835906 -0.331015 5.0 2 NaN
2020-10-12 0.506742 -0.020602 0.216988 5.0 3 NaN
2020-10-13 0.967278 1.758684 0.758026 5.0 4 NaN

reindex 支持我们动态在指定轴上 修改索引

df1.loc[dates[0]:dates[1], 'E'] = 1
df1
A B C D F E
2020-10-10 0.000000 -0.300875 1.914156 5.0 1 1.0
2020-10-11 -1.806821 0.835906 -0.331015 5.0 2 1.0
2020-10-12 0.506742 -0.020602 0.216988 5.0 3 NaN
2020-10-13 0.967278 1.758684 0.758026 5.0 4 NaN

dropna丢弃行,

df1.dropna(how='any')
A B C D F E
2020-10-10 0.000000 -0.300875 1.914156 5.0 1 1.0
2020-10-11 -1.806821 0.835906 -0.331015 5.0 2 1.0
df1.fillna(value=2)
A B C D F E
2020-10-10 0.000000 -0.300875 1.914156 5.0 1 1.0
2020-10-11 -1.806821 0.835906 -0.331015 5.0 2 1.0
2020-10-12 0.506742 -0.020602 0.216988 5.0 3 2.0
2020-10-13 0.967278 1.758684 0.758026 5.0 4 2.0
df.isna()
A B C D F
2020-10-10 False False False False False
2020-10-11 False False False False False
2020-10-12 False False False False False
2020-10-13 False False False False False
2020-10-14 False False False False False
2020-10-15 False False False False False

operations#

stats#

df
A B C D F
2020-10-10 0.000000 -0.300875 1.914156 5.0 1
2020-10-11 -1.806821 0.835906 -0.331015 5.0 2
2020-10-12 0.506742 -0.020602 0.216988 5.0 3
2020-10-13 0.967278 1.758684 0.758026 5.0 4
2020-10-14 -0.259991 -0.165694 0.313832 5.0 5
2020-10-15 -0.324487 -1.197632 1.761982 5.0 6
df.mean()
A   -0.152880
B    0.151631
C    0.772328
D    5.000000
F    3.500000
dtype: float64
df.mean(axis=1) # 行
2020-10-10    1.522656
2020-10-11    1.139614
2020-10-12    1.740626
2020-10-13    2.496798
2020-10-14    1.977629
2020-10-15    2.247973
Freq: D, dtype: float64

数据和索引不对齐,索引会自动延申

dates
DatetimeIndex(['2020-10-10', '2020-10-11', '2020-10-12', '2020-10-13',
               '2020-10-14', '2020-10-15'],
              dtype='datetime64[us]', freq='D')
s = pd.Series([1,2,3, np.nan, 7, 9], index=dates)
s
2020-10-10    1.0
2020-10-11    2.0
2020-10-12    3.0
2020-10-13    NaN
2020-10-14    7.0
2020-10-15    9.0
Freq: D, dtype: float64

沿着index做差

df.sub(s, axis='index')
A B C D F
2020-10-10 -1.000000 -1.300875 0.914156 4.0 0.0
2020-10-11 -3.806821 -1.164094 -2.331015 3.0 0.0
2020-10-12 -2.493258 -3.020602 -2.783012 2.0 0.0
2020-10-13 NaN NaN NaN NaN NaN
2020-10-14 -7.259991 -7.165694 -6.686168 -2.0 -2.0
2020-10-15 -9.324487 -10.197632 -7.238018 -4.0 -3.0

user defined functions#

  • agg简化结果,进行了聚合 。 自定义函数参数为列,返回为列结果

  • transform 对每一个元素进行转换。 自定义函数参数为元素额,

df
A B C D F
2020-10-10 0.000000 -0.300875 1.914156 5.0 1
2020-10-11 -1.806821 0.835906 -0.331015 5.0 2
2020-10-12 0.506742 -0.020602 0.216988 5.0 3
2020-10-13 0.967278 1.758684 0.758026 5.0 4
2020-10-14 -0.259991 -0.165694 0.313832 5.0 5
2020-10-15 -0.324487 -1.197632 1.761982 5.0 6
df.agg(lambda x: np.mean(x) * 2) # 列转换为了 均值的平方
A    -0.305760
B     0.303262
C     1.544656
D    10.000000
F     7.000000
dtype: float64
df.transform(lambda x: x * 101.2) # 对每个元素做了统一变化
A B C D F
2020-10-10 0.000000 -30.448513 193.712626 506.0 101.2
2020-10-11 -182.850307 84.593676 -33.498716 506.0 202.4
2020-10-12 51.282292 -2.084890 21.959197 506.0 303.6
2020-10-13 97.888545 177.978865 76.712221 506.0 404.8
2020-10-14 -26.311112 -16.768269 31.759762 506.0 506.0
2020-10-15 -32.838099 -121.200379 178.312604 506.0 607.2

value_counts#

s = pd.Series(np.random.randint(0,7, size=10))
s.value_counts()
4    3
3    2
1    2
2    1
0    1
6    1
Name: count, dtype: int64

统计百分比

s.value_counts(normalize=True)

string method#

  • Series配备了.str方法, 更好进行变换

s = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"])
s
0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: str
s.str.lower()
0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: str

merge#

concat#

df = pd.DataFrame(np.random.randn(10,4))
df
0 1 2 3
0 -0.602090 0.550267 -1.223086 -0.444570
1 -0.633155 -0.346444 -0.660296 -1.540638
2 0.300601 1.815010 -0.168324 -1.224505
3 -1.720432 1.502249 0.178592 0.913841
4 0.151822 0.039237 -0.553416 0.958658
5 -0.305660 -0.387923 -1.518987 -0.189006
6 1.026909 0.803875 0.926005 0.364731
7 -1.869858 0.775665 -0.554072 0.058960
8 -0.744252 -0.026305 -0.445575 -0.075493
9 -0.391757 0.587830 0.190065 0.122111
pieces = [df[:3], df[3:5], df[5:]]
pd.concat(pieces)
0 1 2 3
0 -0.602090 0.550267 -1.223086 -0.444570
1 -0.633155 -0.346444 -0.660296 -1.540638
2 0.300601 1.815010 -0.168324 -1.224505
3 -1.720432 1.502249 0.178592 0.913841
4 0.151822 0.039237 -0.553416 0.958658
5 -0.305660 -0.387923 -1.518987 -0.189006
6 1.026909 0.803875 0.926005 0.364731
7 -1.869858 0.775665 -0.554072 0.058960
8 -0.744252 -0.026305 -0.445575 -0.075493
9 -0.391757 0.587830 0.190065 0.122111

join#

  • how参数:表明保留哪些行。

    • inner: 保留共有字段

    • outer: 并集

    • left: 左表所有键

left = pd.DataFrame({
    'key':['foo','bar', 'bob'],
    'lval': [1,2, 3]
})
right = pd.DataFrame({
    'key':['foo','bar'],
    'rval': [1,2]
})
pd.merge(left, right, on='key', how='left')
key lval rval
0 foo 1 1.0
1 bar 2 2.0
2 bob 3 NaN
pd.merge(left, right, on='key', how='inner')
key lval rval
0 foo 1 1
1 bar 2 2
pd.merge(left, right, on='key', how='outer')
key lval rval
0 bar 2 2.0
1 bob 3 NaN
2 foo 1 1.0

group分组#

  • 指定by;

  • 选择一些列;

  • apply function

  • 聚合

df = pd.DataFrame({
    'A': ['one', 'two', 'three'],
    'B': ['foo', 'bob', 'alice'],
    'C': np.random.randn(3),
    'D': np.random.randn(3)
})
df
A B C D
0 one foo -0.407880 0.037157
1 two bob 1.395543 1.246526
2 three alice -1.279270 1.019978
df.groupby(by='A')[['B', 'C']].sum()
B C
A
one foo -0.407880
three alice -1.279270
two bob 1.395543
df.groupby(by='A')['B'].sum() # 只选择B列
A
one        foo
three    alice
two        bob
Name: B, dtype: str

多级行索引

df.groupby(by=['A','B']).sum()
C D
A B
one foo -0.407880 0.037157
three alice -1.279270 1.019978
two bob 1.395543 1.246526

reshaping#

stack#

arrays = [
    ['foo', 'bar', 'alice'],
    ['one', 'two', 'three']
]
index = pd.MultiIndex.from_arrays(
    arrays,
)
df = pd.DataFrame(
    np.random.randn(3, 2),
    index = index,
    columns = ['A', 'B']
)
df
A B
foo one 0.501621 0.045875
bar two 0.967446 -1.766177
alice three -1.287660 -0.256039
df.stack()
foo    one    A    0.501621
              B    0.045875
bar    two    A    0.967446
              B   -1.766177
alice  three  A   -1.287660
              B   -0.256039
dtype: float64
df.unstack()
A B
one three two one three two
alice NaN -1.28766 NaN NaN -0.256039 NaN
bar NaN NaN 0.967446 NaN NaN -1.766177
foo 0.501621 NaN NaN 0.045875 NaN NaN

pivot tables#

time series#

rng = pd.date_range(start='20201010', periods=10)
rng
DatetimeIndex(['2020-10-10', '2020-10-11', '2020-10-12', '2020-10-13',
               '2020-10-14', '2020-10-15', '2020-10-16', '2020-10-17',
               '2020-10-18', '2020-10-19'],
              dtype='datetime64[us]', freq='D')
ts = pd.Series(
    np.random.randint(0, 10, len(rng)),
    index = rng
)

categoricals#

df = pd.DataFrame({
    'id': [1,2,3,4],
    'raw_grade': ['a', 'b', 'c', 'd']
})
df['raw_grade']
0    a
1    b
2    c
3    d
Name: raw_grade, dtype: str

我们可以将其转化为categorical类型

df['grade'] = df['raw_grade'].astype('category')
df['grade']
0    a
1    b
2    c
3    d
Name: grade, dtype: category
Categories (4, str): ['a', 'b', 'c', 'd']
df
id raw_grade grade
0 1 a a
1 2 b b
2 3 c c
3 4 d d

categorical应该具有更好的名称

df['grade'] = df['grade'].cat.rename_categories(['very good', 'good', 'very bad', 'worst'])

排序是按照类别的顺序,不是词汇顺序

df.sort_values(by='grade')
id raw_grade grade
0 1 a very good
1 2 b good
2 3 c very bad
3 4 d worst

plotting#

import matplotlib.pyplot as plt
plt.close('all')
ts = pd.Series(
    np.random.randn(1000),
    index = pd.date_range('20201010', periods=1000)
)
ts
2020-10-10   -0.855467
2020-10-11    1.356744
2020-10-12    1.353603
2020-10-13    0.987404
2020-10-14   -0.424806
                ...   
2023-07-02    1.719214
2023-07-03   -0.172318
2023-07-04   -3.198644
2023-07-05   -0.207917
2023-07-06    1.494452
Freq: D, Length: 1000, dtype: float64
ts = ts.cumsum() # 累积求和
ts.plot()
<Axes: >
../_images/d39dadaa1492ee3529ed7ce1eac049adbf02432c7e22f23906785a99e2991909.png
df = pd.DataFrame(
    np.random.randn(1000, 4),
    index=ts.index,
    columns = ['A', 'B', 'C', 'D']
)
df = df.cumsum()
df.plot()
plt.legend()
<matplotlib.legend.Legend at 0x2429e12a5d0>
../_images/a7ea47cd6de64fe9ca295c3aeb031b845150a264f9104effea50ffbba44da3a9.png

copy#

  • 防止连带修改, 很多时候, 我们不小心会修改到原数据,不希望的

  • app_train_domain = app_train.copy() 另一份拷贝

数据结构#

增强版的 NumPy 结构化数组

print(res.shape) # 返回 (N,) 表示 Series,返回 (N, M) 表示 DataFrame

3.2.1 Series 对象#

初识#

  1. Series对象 带有索引的 一维数组。

  2. 通过 .index 和 .values 属性访问

    • values返回类np数组

    • index返回pd.index对象

data = pd.Series([0.25, 0.5, 0.1, 0.2])
values = data.values
index = data.index
data[1]
data[1:3]
1    0.5
2    0.1
dtype: float64

Series灵活的特性 : 数组和字典。#

  1. 与np数组最大不同:Series对象是显示的索引,索引可以是任意类型。

  2. Series对象是特殊的字典。特殊:一类数据映射到另一类数据。

  • 就像np数组比py列表高效一样, Series对象比py字典会高效。

  • 因此可以从py字典创建一个Series对象

因为具有数组和字典双重特性 所以可以对字典进行切片操作!!

  1. 创建Series对象 pd.Series(data, index = index)

data = pd.Series([0.25, 0.5, 0.1, 0.2], 
                index = ['a', 'b', 'c', 'd'], name='score') # name在合并df时候标记为列名
data
a    0.25
b    0.50
c    0.10
d    0.20
Name: score, dtype: float64
popular_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135
}
population = pd.Series(popular_dict)
population
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64
population['California' : 'Florida']
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
dtype: int64
pd.Series([2, 4, 6]) # 列表或者np数组
pd.Series(5, index = [1,2,3]) # 标量  , 填充到每个索引上
pd.Series({1 : 'A', 2 : 'B'})
1    A
2    B
dtype: object

3.2.2 DataFrame对象#

  • 与Series对象一样,数组和字典双重特性。

  • DataFrame对象既有行索引又有列索引。是具有同样索引Series对象的集合

  • DataFrame 的基本索引是列索引, 直接[]访问的是列

数组角度#

  • 如何获取一行数据?不能直接通过[]这样索引,而是loc。

    • 因为这个每一列是一个元素(series),[]是访问列的

# 创建一个只有列名没有数据的
pd.DataFrame(columns=['col1', 'col2'])
col1 col2
# 创建一个只有列名没有数据,但指定类型
pd.DataFrame({
 'id':pd.Series(dtype=int),
    'name':pd.Series(dtype=str)
})
id name
import pandas as pd
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
    'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
states = pd.DataFrame({'population' : population, 'area' : area})
states
population area
California 38332521 423967
Texas 26448193 695662
New York 19651127 141297
Florida 19552860 170312
Illinois 12882135 149995
states.columns
states.index
Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
states.values # 返回对应的数组
---------------------------------------------------------------------------

NameError                                 Traceback (most recent call last)

Cell In[2], line 1

----> 1 states.values # 返回对应的数组



NameError: name 'states' is not defined
# 改列名
df.rename(columns={'旧列名': '新列名'}, inplace=True)

# 改行索引名
df.rename(index={'旧行名': '新行名'}, inplace=True)

字典角度#

  1. 注意不能直接通过行名获取,因为字典看来,字段是列名!

states['area']
#states['Texas'] # !!
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

创建DataFram对象#

pd.DataFrame(population, columns=['population']) # 通过一个series对象创建
pd.DataFrame([ {'a' : i, 'b' : 2 * i} for i in range(4)]) # 元素是字典的列表
pd.DataFrame([{'a':1, 'b':2}, {'c':3, 'd':4}]) 
pd.DataFrame({'population' : population, 'area' : area}) # 具有同样索引多个series对象
pd.DataFrame(np.random.rand(3,2), columns = ['foo', 'bar'], index = ['a', 'b', 'c']) # np二维数组,指定行索引index,列索引columns
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')]) 
pd.DataFrame(A) # 从np结构化数组构建
A B
0 0 0.0
1 0 0.0
2 0 0.0

3.2.3 Index对象#

  • 可以看作是 不可变数组 或 有序集合。

  • 默认索引是RangeIndex,

ind = pd.Index([2, 3, 5, 7])
ind
Index([2, 3, 5, 7], dtype='int64')
ind.to_list()
[2, 3, 5, 7]

从不可变数组角度:类似np数组#

  • 值不可修改!

ind[1]
ind[::2]
print(ind.shape, ind.size, ind.ndim, ind.dtype)
(4,) 4 1 int64
# ind[1] = 0

从有序集合角度:#

  • 可以使用集合操作。 但必须size相同!

indA = pd.Index([1, 3, 2, 5])
indB = pd.Index([1, 2, 4, 6])
indA | indB
Index([1, 3, 6, 7], dtype='int64')
  • so, 可以for

for i in ind:
    print(i)
2
3
5
7

基本功能#

  • pandas 一般都不会原地修改

函数应用#

  • 将自己或者其他库函数应到pd对象

统计特征函数#

  • 都会保留索引的,值为df

  • sum ; quantile(); mean()

  • map

  • skew,偏度,分布是否对称, 偏左<0。

  • kurt 峰度。以正太分布基准,数据集中程度。尖峰>0

rng = np.random.RandomState(42)
s = pd.Series(rng.randint(0, 10, 4))
s
0    6
1    3
2    7
3    4
dtype: int32
s = pd.Series([2.1, 3.2,3,4])
s.quantile(0.25) # 直接调用接口
np.float64(2.775)
df = pd.DataFrame(rng.randint(0,10, (3,4)), columns=['A', 'B', 'C', 'D'])
df
np.exp(s)
np.sin(df)
---------------------------------------------------------------------------

NameError                                 Traceback (most recent call last)

Cell In[53], line 1

----> 1 np.exp(s)

      2 np.sin(df)



NameError: name 's' is not defined

Matching / broadcasting behavior#

  • 没有对齐的数据缺失值用NAN填充,或者设置参数fill_value

# 索引不同的两个Series对象
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
    'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
    'New York': 19651127}, name='population')
population / area
population.divide(area, fill_value = 0)
Alaska         0.000000
California    90.413926
New York            inf
Texas         38.018740
dtype: float64
df1 = pd.DataFrame(rng.randint(0,10, (3,4)), columns=['A', 'B', 'C', 'D'])
df2 = pd.DataFrame(rng.randint(0,10, (4,2)), columns=['A', 'F'])
print(df1, '\n', df2)
df1.add(df2, fill_value=0)

使用均值填充! stack将二维数组压成一维

fill = df2.stack().mean()
df1.add(df2, fill_value = fill)
A = rng.randint(10, size=(3, 4))
A
df = pd.DataFrame(A, columns=list('QRST'))
df
df - df.loc[0]
df.subtract(df['R'], axis=0)

IO#

Feather#

  • csv存储是为了用户的,但要在不同阶段传递,比如特征工程各阶段。那么csv会损坏数据类型,且速度慢

  • feather提供了df的序列化和反序列化,几乎是内存保存

import pytz
df = pd.DataFrame(
  {
      "a": list("abc"),
      "b": list(range(1, 4)),
      "c": np.arange(3, 6).astype("u1"),
      "d": np.arange(4.0, 7.0, dtype="float64"),
      "e": [True, False, True],
      "f": pd.Categorical(list("abc")),
      "g": pd.date_range("20130101", periods=3),
      "h": pd.date_range("20130101", periods=3, tz=pytz.timezone("US/Eastern")),
      "i": pd.date_range("20130101", periods=3, freq="ns"),
  }) 
df.dtypes
a                           str
b                         int64
c                         uint8
d                       float64
e                          bool
f                      category
g                datetime64[us]
h    datetime64[us, US/Eastern]
i                datetime64[ns]
dtype: object
df.to_feather('example.feather')
result = pd.read_feather('example.feather')
result.dtypes
a                           str
b                         int64
c                         uint8
d                       float64
e                          bool
f                      category
g                datetime64[us]
h    datetime64[us, US/Eastern]
i                datetime64[ns]
dtype: object

align 对齐#

  • join

    • inner, outer, left

s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
print(s)
s1 = s[:3]
s2 = s[1:]
print(s1)
print(s2)
a   -0.440754
b   -0.967044
c   -0.343204
d    1.479043
e   -0.125159
dtype: float64
a   -0.440754
b   -0.967044
c   -0.343204
dtype: float64
b   -0.967044
c   -0.343204
d    1.479043
e   -0.125159
dtype: float64
# 交集
s1, s2 = s1.align(s2, join='inner')
print(s1, s2)
b   -0.967044
c   -0.343204
dtype: float64 b   -0.967044
c   -0.343204
dtype: float64

read_csv

# 读取网页内table标签内容,返回table列表,可能多个table
# 需要lxml底层库 解析网页
pd.read_html()
# pd.read_csv('.csv', sep='|', index_col='')
# pd.read_csv('URL', sep='|', index_col='')
# pd.read_csv('URL', sep=r'\s+')
#df.head(10)
#users.tail(10)
#users.describe(include='all')
# train_data[['col1', 'col2', 'col3']].describe()

include 不能直接通过列名

Indexing and selecting data#

索引方式#

  • loc

  • iloc

  • []

Boolean index#

  • | or & and ~ not

  • 必须括号分组

  • 这与numpy相同

s = pd.Series(range(-3, 4))
s[s>0]
4    1
5    2
6    3
dtype: int64
s[(s<1) | (s>2)]
0   -3
1   -2
2   -1
3    0
6    3
dtype: int64
dates = pd.date_range('1/1/2020', periods=8)
df = pd.DataFrame(
    np.random.randn(8,4),
    index = dates,
    columns = ['a', 'b', 'c', 'd']
)
df[df['a'] > 0]
a b c d
2020-01-03 0.180091 0.673043 -0.416968 -1.834319
2020-01-05 0.679155 -0.557629 0.388789 0.699663
2020-01-06 1.341901 1.036003 -0.096680 0.157641
2020-01-08 0.678832 -1.018873 0.447808 1.362020
df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
    'b': ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
              'c': np.random.randn(7)})
    
df2
a b c
0 one x 0.235107
1 one y 0.455529
2 two y 1.187247
3 three x -0.776355
4 two y -0.572664
5 one x 1.399288
6 six x 0.555768

例子:各种筛选#

import pandas as pd
data = {
    'AGE': [25, 45, 30],
    'LOAN': [1000, 5000, 2000],
    'OVERDUE': [0, 1, 0]
}
df = pd.DataFrame(data, index=['User_A', 'User_B', 'User_C'])
df
AGE LOAN OVERDUE
User_A 25 1000 0
User_B 45 5000 1
User_C 30 2000 0
  • 筛选行: 符合条件的行

df[df['LOAN'] > 2000]
AGE LOAN OVERDUE
User_B 45 5000 1
  • 筛选列名: 比如corr矩阵

df = pd.DataFrame(np.random.randn(4,4), index=list('abcd'), columns=list('abcd'))
corr = df.corr()
corr = corr.abs()
corr
a b c d
a 1.000000 0.558091 0.349060 0.767784
b 0.558091 1.000000 0.488331 0.326215
c 0.349060 0.488331 1.000000 0.650923
d 0.767784 0.326215 0.650923 1.000000
corr.columns[corr['c'] > 0.6] #
Index(['c', 'd'], dtype='str')
  • 筛选行名:比如想知道符合条件的id

corr.index[corr['c'] > 0.6] # 找到了和c 相关系数大于0.6的特征
Index(['c', 'd'], dtype='str')

astype(bool)#

  • 非0即True

  • NaN是True

df = pd.DataFrame(
    {
        'A': np.arange(4),
        'b': np.random.randn(4),
        'c': [np.nan for i in range(4)]
    }
)
df
A b c
0 0 -1.004799 NaN
1 1 0.562132 NaN
2 2 0.890005 NaN
3 3 0.559909 NaN
df.astype(bool)
A b c
0 False True True
1 True True True
2 True True True
3 True True True

where#

  • 快速where update

df = pd.DataFrame(
    np.random.randn(6,4),
)
df
0 1 2 3
0 -0.616614 -0.657717 -0.025098 -0.480370
1 -0.368138 -0.341348 -2.326300 0.454195
2 1.261347 2.296435 1.386900 -0.867184
3 -0.364546 -1.276686 -0.822064 -1.099228
4 1.197151 0.670332 -0.009173 -0.483234
5 0.429132 -0.108743 1.044581 -0.361712
df.where(df<0.3, 88)
0 1 2 3
0 -0.616614 -0.657717 -0.025098 -0.480370
1 -0.368138 -0.341348 -2.326300 88.000000
2 88.000000 88.000000 88.000000 -0.867184
3 -0.364546 -1.276686 -0.822064 -1.099228
4 88.000000 88.000000 -0.009173 -0.483234
5 88.000000 -0.108743 88.000000 -0.361712
df.where(df<0.3) # 默认变为NaN
0 1 2 3
0 -0.616614 -0.657717 -0.025098 -0.480370
1 -0.368138 -0.341348 -2.326300 NaN
2 NaN NaN NaN -0.867184
3 -0.364546 -1.276686 -0.822064 -1.099228
4 NaN NaN -0.009173 -0.483234
5 NaN -0.108743 NaN -0.361712

reset索引#

df = pd.DataFrame(
    data = np.random.randn(6,2),
    index = pd.date_range('20201010', periods=6),
    columns = ['area', 'pop']
)
df
area pop
2020-10-10 1.183677 1.191695
2020-10-11 -0.410622 0.811568
2020-10-12 -0.237822 -1.234384
2020-10-13 -0.619160 1.917208
2020-10-14 0.030715 0.243250
2020-10-15 3.830778 -0.931668

reset_index 将索引重置为简单的整数索引, 原index转为列

df.reset_index()
index area pop
0 2020-10-10 1.183677 1.191695
1 2020-10-11 -0.410622 0.811568
2 2020-10-12 -0.237822 -1.234384
3 2020-10-13 -0.619160 1.917208
4 2020-10-14 0.030715 0.243250
5 2020-10-15 3.830778 -0.931668

3.3 数据取值与选择#

3.3.1 Series对象#

1. 作为字典#

data = pd.Series([0.25, 0.5, 0.75, 1.0],index=['a', 'b', 'c', 'd'])
data
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[2], line 1
----> 1 data = pd.Series([0.25, 0.5, 0.75, 1.0],index=['a', 'b', 'c', 'd'])
      2 data

NameError: name 'pd' is not defined
'a' in data
data.keys()
list(data.items())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 1
----> 1 'a' in data
      2 data.keys()
      3 list(data.items())

NameError: name 'data' is not defined
data['e'] = 1.2 #添加索引和数据

2. 作为数组#

data['a':'c']
a    0.25
b    0.50
c    0.75
dtype: float64
data[0:2] # 隐式整数索引
a    0.25
b    0.50
dtype: float64
data > 0.5
a    False
b    False
c     True
d     True
e     True
dtype: bool
data[data > 0.5]
c    0.75
d    1.00
e    1.20
dtype: float64
data[['a', 'e']] # 花哨索引
a    0.25
e    1.20
dtype: float64

3. 索引器 loc, ilod, ix#

data.loc['a'] # 显示索引
data.iloc[1]
np.float64(0.5)

3.3.2 DataFrame数据选择方法#

  • for 返回的是列名

插入一列

#train_dataset.insert(0, "WAGE", y_train)

1. DataFrame作为字典, value是Series对象#

import pandas as pd
area = pd.Series({'California': 423967, 'Texas': 695662,
    'New York': 141297, 'Florida': 170312,
    'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
    'New York': 19651127, 'Florida': 19552860,
    'Illinois': 12882135})
data = pd.DataFrame({'area' : area, 'pop' : pop})
data
area pop
California 423967 38332521
Texas 695662 26448193
New York 141297 19651127
Florida 170312 19552860
Illinois 149995 12882135
data['area'].unique()
array([423967, 695662, 141297, 170312, 149995])
data['area']
data.area
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64
data['density'] = data['pop'] / data['area']
data
area pop density
California 423967 38332521 90.413926
Texas 695662 26448193 38.018740
New York 141297 19651127 139.076746
Florida 170312 19552860 114.806121
Illinois 149995 12882135 85.883763

2. 看作二维数组#

得到的结果也是np数组 通过loc,iloc索引访问 Index对象!

  • 不支持[]数组切片. 只能通过iloc位置或者loc列名

# data[:, 1:] # error
data.values
array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])
data.T
California Texas New York Florida Illinois
area 4.239670e+05 6.956620e+05 1.412970e+05 1.703120e+05 1.499950e+05
pop 3.833252e+07 2.644819e+07 1.965113e+07 1.955286e+07 1.288214e+07
density 9.041393e+01 3.801874e+01 1.390767e+02 1.148061e+02 8.588376e+01
data.values[0]
array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01])
data.iloc[:3, :2] # 按位置索引。  
area pop
California 423967 38332521
Texas 695662 26448193
New York 141297 19651127
data.index
Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

iloc依赖整数索引,如果索引被修改过,可能iloc失效

data.loc[['Texas', 'Florida'], ['area']]
area
Texas 695662
Florida 170312
data.loc[:'Texas', 'area':] # 按照名字索引
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[1], line 1
----> 1 data.loc[:'Texas', 'area':] # 按照名字索引

NameError: name 'data' is not defined

切片是基于行索引切片的, 最好通过loc显式切片

data[:1] # 表示第一行 而非第一列。  
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[2], line 1
----> 1 data[:1] # 表示第一行 而非第一列。  

NameError: name 'data' is not defined

3. 删除行列#

import pandas as pd
crime = pd.DataFrame({
      'id':[1,2,3],
    'country':['CH','US','EU'],
    'total':[1,2,3],
    'dead':[4,6,7]
})
crime.set_index('id',inplace=True)
  1. del原地删除列。

  • 不能用于删除行

del crime['total']
crime
country dead
id
1 CH 4
2 US 6
3 EU 7
  1. drop

  • inplace原地修改

  • 必须axis指定行列方向

# 删除列
crime.drop(columns=['dead'],axis=1)
country
id
1 CH
2 US
3 EU
# 删除行 axis=0. 
crime.drop(index=[1], axis=0)
country dead
id
2 US 6
3 EU 7

4. 通过dtypes选取列#

df = pd.DataFrame({
    'age':[1,2],
    'name':['aa','bb'],
    'married':[False, True]
})
df.dtypes
age         int64
name       object
married      bool
dtype: object
df.select_dtypes(include=['int64'])
age
0 1
1 2

Working with text data#

names.str.capitalize()
names.str.len()
names.str.match('[A-Za-z]+')
names.str[0:3]
names.str.split().get(-1)

Working with missing data#

3.5 处理缺失值#

? 缺失值有3种形式;null, NaN, NA

pd的缺失值表示:NaN,None对象

3.5.2 Pandas的缺失值#

  1. None:Python对象类型的缺失值

    只有在object数组类型用到

np.array([1, None, 3 ,4], dtype = object)
  1. NaN(全称Not a Number,不是一个数字): 数值类型缺失

  • 是一个特殊的浮点数!!!

  • 无论和NaN 进行何种操作,最终结果都是NaN。不会抛出异常

  • np有一些特殊函数,忽略nan聚合影响

x = np.array([1, np.nan, 3, 4])
x + 2
np.nanmax(x)
  1. Pandas中NaN与None的差异

基本上可以看作等价交换的

pd.Series([1, np.nan, 3, None])

3.5.3 处理缺失值#

发现、剔除、替换缺失值

  • isnull()

  • notnull()

  • dropna()

  • fillna()

  1. 发现

data = pd.Series([1, np.nan, 'hello', None])
data
data.isnull() # 返回掩码数组
data[data.notnull()] # 通过掩码筛选,索引去除
  1. 剔除

data.dropna() # 快速剔除!
df = pd.DataFrame([[1, np.nan, 2],
    [2, 3, 5],
    [np.nan, 4, 6]])
df
df.dropna() # 默认剔除行
df.dropna(axis='columns') # 剔除列

剔除行列对数据影响太大,

  • 通过how属性选择策略

  • thresh设置阈值:非缺失值最小数量

df.dropna(how = 'any') # 只要有缺失就剔除
df.dropna(axis = 1)
df.dropna(axis='columns', how='all') # 只有一列全部缺失才剔除
df.dropna(axis='rows', thresh= 3)
  1. 填充缺失值

data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data
data.fillna(0) # 0填充
data.fillna(method='ffill') # 通过前面的值填充
data.fillna(method = 'bfill')
df
df.fillna(method = 'ffill', axis=0)

MultiIndex / advanced indexing#

层级索引#

分组后agg也会生成层级列索引

df = pd.DataFrame({
    'ID': pd.RangeIndex(6),
    'A': np.random.randn(6),
    'B': np.random.randn(6)
}
)
df_agg = df.groupby(by='ID').agg(
    ['min', 'max', 'sum']
)
df_agg
A B
min max sum min max sum
ID
0 0.106374 0.106374 0.106374 -0.514180 -0.514180 -0.514180
1 1.610102 1.610102 1.610102 -0.389459 -0.389459 -0.389459
2 0.687068 0.687068 0.687068 -0.214000 -0.214000 -0.214000
3 0.235996 0.235996 0.235996 -0.120299 -0.120299 -0.120299
4 -0.136370 -0.136370 -0.136370 -1.748903 -1.748903 -1.748903
5 -0.809654 -0.809654 -0.809654 -1.679501 -1.679501 -1.679501

可以将层级列索引平铺

df_agg.columns = ['_'.join(col).strip() for col in df_agg.columns.values]
df_agg
A_min A_max A_sum B_min B_max B_sum
ID
0 0.106374 0.106374 0.106374 -0.514180 -0.514180 -0.514180
1 1.610102 1.610102 1.610102 -0.389459 -0.389459 -0.389459
2 0.687068 0.687068 0.687068 -0.214000 -0.214000 -0.214000
3 0.235996 0.235996 0.235996 -0.120299 -0.120299 -0.120299
4 -0.136370 -0.136370 -0.136370 -1.748903 -1.748903 -1.748903
5 -0.809654 -0.809654 -0.809654 -1.679501 -1.679501 -1.679501
index = [('California', 2000), ('California', 2010),
    ('New York', 2000), ('New York', 2010),
    ('Texas', 2000), ('Texas', 2010)]
index
[('California', 2000),
 ('California', 2010),
 ('New York', 2000),
 ('New York', 2010),
 ('Texas', 2000),
 ('Texas', 2010)]
index = pd.MultiIndex.from_tuples(index)
index
MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )
populations = [33871648, 37253956,
    18976457, 19378102,
    20851820, 25145561]
pop = pd.Series(populations, index=index)
pop
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

一级索引缺失忽略就是多级索引的表现形式!。

pop[:,2010] # 二级索引获取2010年数据
California    37253956
New York      19378102
Texas         25145561
dtype: int64

!多级索引如二级索引 可以与 dataframe互换!

pop_df = pop.unstack()
pop_df
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[6], line 1
----> 1 pop_df = pop.unstack()
      2 pop_df

NameError: name 'pop' is not defined
pop_df.stack()

三级索引!

pop_df = pd.DataFrame({'total' : pop, 'under18' : [9267089, 9284094,4687374, 4318033,5906301, 6879014]})
pop_df

3.6.2 多级索引的创建方法#

  1. 最直接的就是,index设置为多维数组 (隐式)

df = pd.DataFrame(np.random.rand(4, 2),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=['data1', 'data2'])
df
data1 data2
a 1 0.133549 0.117980
2 0.328639 0.054611
b 1 0.846874 0.911447
2 0.772955 0.942654
  1. 元组为键的字典(隐式)

data = {('California', 2000): 33871648,
    ('California', 2010): 37253956,
    ('Texas', 2000): 20851820,
    ('Texas', 2010): 25145561,
    ('New York', 2000): 18976457,
    ('New York', 2010): 19378102}
pd.Series(data)
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64
  1. MultiIndex显示创建

pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]]) # 规格的多维数组
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
pd.MultiIndex.from_tuples([('a', 1),('a', 2),('b', 1),('b', 2)]) # 直接用元组
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
pd.MultiIndex.from_product([['a', 'b'], [1, 2]]) # 笛卡尔积
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
  1. 为多级索引添加名字

pop
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
pop.index.names = ['state', 'years']
pop
state       years
California  2000     33871648
            2010     37253956
New York    2000     18976457
            2010     19378102
Texas       2000     20851820
            2010     25145561
dtype: int64
  1. DF的多级列索引

# 两个多级索引
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
    names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
    names=['subject', 'type'])
data = np.round(np.random.randn(4,6), 1)
health_data = pd.DataFrame(data, index = index, columns = columns)
health_data
subject Bob Guido Sue
type HR Temp HR Temp HR Temp
year visit
2013 1 -0.7 0.3 0.1 0.7 -1.3 -0.4
2 -1.6 0.9 0.6 -0.6 0.7 0.8
2014 1 -0.8 -1.6 -0.1 0.2 0.3 0.4
2 -0.9 2.2 -0.7 -1.6 0.4 -1.8

这是一个四维数据

health_data['Sue'] # 列的一级索引直接访问
type HR Temp
year visit
2013 1 -1.3 -0.4
2 0.7 0.8
2014 1 0.3 0.4
2 0.4 -1.8
health_data.loc[2013] # 行一级索引
subject Bob Guido Sue
type HR Temp HR Temp HR Temp
visit
1 -0.7 0.3 0.1 0.7 -1.3 -0.4
2 -1.6 0.9 0.6 -0.6 0.7 0.8

3.6.3 多级索引的取值与切片#

  1. Series多级索引取值

pop
pop['California',2000]
pop['California']
pop[:, 2000]
pop[pop > 100]
pop[['California', 'Texas']]
  1. DataFrame多级索引

health_data
health_data['Bob', 'HR'] # 列的二级索引访问
health_data.iloc[:2, :2] # 看作二维数组

Indeslice来快速处理多级索引取值: 比如要访问行索引的第二级

idx = pd.IndexSlice
health_data.loc[idx[:, 1], idx[:, 'HR']]

3.6.4 多级索引行列转换#

  1. 有序的索引和无序的索引

    • MultiIndex如果不是有序索引,那么切片就会失败!因此必须保证索引有序

    • pd提供了方法来排序索引

index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index = index)
data.index.names = ['char', 'int']
data
char  int
a     1      0.667677
      2      0.790632
c     1      0.783999
      2      0.035292
b     1      0.991972
      2      0.442809
dtype: float64
# data['a' : 'b'] # 切片失败
data.sort_index()
char  int
a     1      0.667677
      2      0.790632
b     1      0.991972
      2      0.442809
c     1      0.783999
      2      0.035292
dtype: float64
  1. 索引stack与unstack

pop
state       years
California  2000     33871648
            2010     37253956
New York    2000     18976457
            2010     19378102
Texas       2000     20851820
            2010     25145561
dtype: int64

展开索引, 比如多级,要展1级

pop.unstack() 
years 2000 2010
state
California 33871648 37253956
New York 18976457 19378102
Texas 20851820 25145561
pop.unstack(level = 0) # 设置第一级索引去列
state California New York Texas
years
2000 33871648 18976457 20851820
2010 37253956 19378102 25145561
pop
state       years
California  2000     33871648
            2010     37253956
New York    2000     18976457
            2010     19378102
Texas       2000     20851820
            2010     25145561
dtype: int64

层级索引:平铺#

平铺列索引

df = pd.DataFrame({
    'ID': pd.RangeIndex(6),
    'A': np.random.randn(6),
    'B': np.random.randn(6)
}
)
df_agg = df.groupby(by='ID').agg(
    ['min', 'max', 'sum']
)
df_agg
A B
min max sum min max sum
ID
0 -1.531047 -1.531047 -1.531047 -0.253353 -0.253353 -0.253353
1 0.342627 0.342627 0.342627 -1.600890 -1.600890 -1.600890
2 0.641067 0.641067 0.641067 -0.434768 -0.434768 -0.434768
3 0.201696 0.201696 0.201696 -0.899129 -0.899129 -0.899129
4 1.652071 1.652071 1.652071 0.409722 0.409722 0.409722
5 0.430262 0.430262 0.430262 0.371764 0.371764 0.371764
df_agg.reset_index()
ID A B
min max sum min max sum
0 0 -1.531047 -1.531047 -1.531047 -0.253353 -0.253353 -0.253353
1 1 0.342627 0.342627 0.342627 -1.600890 -1.600890 -1.600890
2 2 0.641067 0.641067 0.641067 -0.434768 -0.434768 -0.434768
3 3 0.201696 0.201696 0.201696 -0.899129 -0.899129 -0.899129
4 4 1.652071 1.652071 1.652071 0.409722 0.409722 0.409722
5 5 0.430262 0.430262 0.430262 0.371764 0.371764 0.371764
  1. 索引的设置与重置

  • 重置索引,则会生成一个DataFrame。

  • 重置时候,需要补充数据列名

  • 将这样数据转化为MultiIndex可以设置索引!

pop_flat = pop.reset_index(name = 'population')
pop_flat
state years population
0 California 2000 33871648
1 California 2010 37253956
2 New York 2000 18976457
3 New York 2010 19378102
4 Texas 2000 20851820
5 Texas 2010 25145561

set_index()没有inplace必须手动赋值

pop_flat.set_index(['state','years']) # 设置某列为索引
population
state years
California 2000 33871648
2010 37253956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
pop_flat.reset_index()
index state years population
0 0 California 2000 33871648
1 1 California 2010 37253956
2 2 New York 2000 18976457
3 3 New York 2010 19378102
4 4 Texas 2000 20851820
5 5 Texas 2010 25145561

3.6.5 多级索引的数据累计方法#

health_data
  1. 行计算:

health_data.groupby(level = 'year').mean()
health_data.groupby(level = 'visit').mean()
year_mean = health_data.groupby(level = 'year').mean()
year_mean.groupby(axis = 1, level = 'type').mean()
  1. 列计算

health_data.groupby(axis = 1, level ='type').mean()

Copy-on-Write (CoW)#

Duplicate Labels#

Merge, join, concatenate and compare#

3.7 合并数据concat#

df之间、series之间、df与series之间

  • concat axis指定行列

3.7.1 concat连接#

import pandas as pd
import numpy as np
def make_df(cols, ind):
    data = {c : [str(c) + str(i) for i in ind] for c in cols}
    return pd.DataFrame(data, index= ind)
make_df("ABC", range(3))
A B C
0 A0 B0 C0
1 A1 B1 C1
2 A2 B2 C2
pd.concat([make_df("ABC", range(3)), make_df("ABC", range(4))], ignore_index=True)
A B C
0 A0 B0 C0
1 A1 B1 C1
2 A2 B2 C2
3 A0 B0 C0
4 A1 B1 C1
5 A2 B2 C2
6 A3 B3 C3
s1 = pd.Series(np.random.randint(1,10,10))
s2 = pd.Series(np.random.randint(1,10,10))
pd.concat([s1,s2], axis=1, keys=['s1','s2']) # keys重新命名列
s1 s2
0 7 1
1 7 1
2 3 1
3 5 9
4 4 3
5 6 1
6 3 2
7 3 9
8 1 9
9 5 8
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])
1    A
2    B
3    C
4    D
5    E
6    F
dtype: object
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
pd.concat([df1, df2])
A B
1 A1 B1
2 A2 B2
3 A3 B3
4 A4 B4
pd.concat([df1, df2], axis = 1)
A B A B
1 A1 B1 NaN NaN
2 A2 B2 NaN NaN
3 NaN NaN A3 B3
4 NaN NaN A4 B4
pd.concat([df1, df2], axis = 1)
A B A B
1 A1 B1 NaN NaN
2 A2 B2 NaN NaN
3 NaN NaN A3 B3
4 NaN NaN A4 B4

索引重复时候如何连接?#

  • pd默认是允许的,结果会有重复索引

  • 但我们想要结果肯定是唯一索引

  1. 捕获重复索引错误。

  2. 忽略索引

  3. 增加多级索引

s1=pd.Series(np.random.randint(1,4,10))
s2=pd.Series(np.random.randint(1,3,10))
s3=pd.Series(np.random.randint(10000,30000,10))
d= pd.concat([s1,s2,s3], axis=0)
d.index
Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3,
       4, 5, 6, 7, 8, 9],
      dtype='int64')

重复索引合理的不会自动合并

pd.concat([s1,s2,s3], axis=0, ignore_index=True) 
0          1
1          3
2          3
3          3
4          2
       ...  
295    13715
296    29372
297    23352
298    28118
299    24099
Length: 300, dtype: int32

注意这样合并,还是series,通过to_frame()

x = make_df('AB', [0, 1])
y = make_df('AB', [0, 3])
pd.concat([x ,y])
A B
0 A0 B0
1 A1 B1
0 A0 B0
3 A3 B3
try:
    pd.concat([x,y], verify_integrity = True)
except ValueError as e:
    print("ValueError:", e)
pd.concat([x,y], ignore_index=True) #默认行为忽略
pd.concat([x, y], keys = ['x', 'y']) # 连接时候加上层级索引

合并:数据缺失NaN#

  • concat连接时候会出现NaN, 数据缺失。

  • 设置join属性,决定合并连接策略

df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])
print(df5)
print(df6)
    A   B   C
1  A1  B1  C1
2  A2  B2  C2
    B   C   D
3  B3  C3  D3
4  B4  C4  D4
pd.concat([df5, df6])
A B C D
1 A1 B1 C1 NaN
2 A2 B2 C2 NaN
3 NaN B3 C3 D3
4 NaN B4 C4 D4
 pd.concat([df5, df6], join ='outer') # 默认行为并集
pd.concat([df5, df6], join = 'inner') # 交集
# pd.concat([df5, df6], join_axies = [df5.columns])

3.8 合并数据集df:merge#

  • 类似数据库,实现不同数据源的连接:一对一、一对多、多对多

  • pd.merge和 Serier、DataFrame的join

contact vs merge

  • contact只是按照行列进行拼接, 重复索引和值缺失

  • merge是基于列进行连接匹配,
    merge更加高级,比如内外也可做出contact效果

  1. 一对一连接

df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
    'hire_date': [2004, 2008, 2012, 2014]})
print(df1); print(df2)
  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
  employee  hire_date
0     Lisa       2004
1      Bob       2008
2     Jake       2012
3      Sue       2014
df3 = pd.merge(df1, df2)
df3

两个df有一个共同列,merge就会自动根据这个列进行合并!且行数据不冲突!

  1. 多对一连接

  • 合并两个列时候,有一个列有重复值。

  • merge会保留重复值。并填充缺失

df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
    'supervisor': ['Carly', 'Guido', 'Steve']})
print(df3);print(df4)
pd.merge(df3, df4)

(2,supervisor) 是缺失的,自动填充了!

  1. 多对多连接

  • 顾名思义:合并的两个列内部都有重复值

df5 = pd.DataFrame({'group': ['Accounting', 'Accounting',
    'Engineering', 'Engineering', 'HR', 'HR'],
    'skills': ['math', 'spreadsheets', 'coding', 'linux',
    'spreadsheets', 'organization']})
print(df1, '\n', df5)

场景:显示人员的一种或者多种能力

pd.merge(df1, df5)

3.8.3 设置数据合并的键#

  • 很多情况下,目的合并的列是不同名的。merge不会自动合并。需要设置哪两个列合并

1. 参数on

用在列名相同(merge默认行为)

print(df1, '\n', df2)
pd.merge(df1, df2, on = 'employee')

2. left_on与right_on参数

  • !! 结果中并没有直接去掉列,需要drop方法手动去掉

df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'salary': [70000, 80000, 120000, 90000]})
print(df1, '\n', df3)
  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR 
    name  salary
0   Bob   70000
1  Jake   80000
2  Lisa  120000
3   Sue   90000
pd.merge(df1, df3, left_on = 'employee', right_on = 'name')
employee group name salary
0 Bob Accounting Bob 70000
1 Jake Engineering Jake 80000
2 Lisa Engineering Lisa 120000
3 Sue HR Sue 90000
pd.merge(df1, df3, left_on = 'employee', right_on = 'name').drop('name', axis = 1)
employee group salary
0 Bob Accounting 70000
1 Jake Engineering 80000
2 Lisa Engineering 120000
3 Sue HR 90000

丢弃一列额外方式:直接取出来赋值

#price_table= price_table[['item_name','item_price']]
  1. left_index与right_index参数

  • 之前都是合并列。 这里合并索引

  • 索引和列同时合并,需要设置_index 和 _on

print(df1, '\n', df2)
df1a = df1.set_index('employee') # 将employee列设置为索引
df2a = df2.set_index('employee')
print(df1a, '\n', df2a)
pd.merge(df1a, df2a, left_index = True, right_index = True)
print(df1a, '\n', df3)
pd.merge(df1a, df3, left_index = True, right_on = 'name') # 保留df1a索引, df3合并保留name列

3.8.4 设置数据连接的集合操作规则#

  • 合并两个列时候, 如果一个值出现没有在另外一个列。即需要内连接 外连接规则

df6 = pd.DataFrame({'name': ['Peter', 'Paul', 'Mary'],
    'food': ['fish', 'beans', 'bread']},
    columns=['name', 'food'])
df7 = pd.DataFrame({'name': ['Mary', 'Joseph'],
    'drink': ['wine', 'beer']},
    columns=['name', 'drink'])
print(df6); print(df7);
pd.merge(df6, df7) # 默认内连接,即等值连接
pd.merge(df6, df7, how = 'outer') # 外连接全连接
pd.merge(df6, df7, how = 'left')

3.8.5 重复列名:suffixes参数#

  • 列名重复,且不是目的合并列。 需要重命名或者添加后缀

df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'rank': [1, 2, 3, 4]})
df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'rank': [3, 1, 4, 2]})
print(df8, '\n', df9)
pd.merge(df8, df9, on = 'name') # 指定了name列是目标合并列,merge会为其他重复列自动后缀
pd.merge(df8, df9, on = 'name', suffixes = ["_L", "_R"])  

3.8.6 案例:美国各州的统计数据#

任务:计算各州人口密度排名

  1. 合并pop与缩写表,显示全称

  2. 将面积表合并

pop = pd.read_csv('state-population.csv')
areas = pd.read_csv('state-areas.csv')
abbrevs = pd.read_csv('state-abbrevs.csv')
print(pop.head(), '\n', areas.head(), '\n', abbrevs.head())
data = pd.merge(pop, abbrevs, left_on = 'state/region', right_on = 'abbreviation').drop('abbreviation', axis = 1)
data
data.isnull().any() # 检查字段是否有缺失
data = pd.merge(data, areas, how = 'left')
data.isnull().any()
data
data2010 = data.query("year == 2010 & ages == 'total'")
data2010.set_index('state', inplace=True) # inplace表示原地修改
data2010
density = data2010['population'] / data2010['area (sq. mi)']
density

排序
.sort_values(by=’quantity’, ascending=False)

density.sort_values(ascending = False, inplace = True)
density.head()
density.tail()

Group by: split-apply-combine#

agg#

  • aggregate

  • 在指定轴上进行一个或者多个运算 进行聚合

  • 行汇总

df = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6], [7, 8, 9], [np.nan, np.nan, np.nan]],
    columns=['A', 'B', 'C']
)
df
A B C
0 1.0 2.0 3.0
1 4.0 5.0 6.0
2 7.0 8.0 9.0
3 NaN NaN NaN
df.agg(['sum', 'min'])
A B C
sum 12.0 15.0 18.0
min 1.0 2.0 3.0

每列可以有不同的聚合方式

df.agg({"A": ["sum", "min"], "B": ["min", "max"]})
A B
sum 12.0 NaN
min 1.0 2.0
max NaN 8.0

3.9 累计与分组#

  • 统计

  • 分组groupby

3.9.1 行星数据为例#

  • 内容为2014年以来发现的外行星绕太阳系运动观测数据

  • describe方法快速描述整体统计. series、df、groupby都可以使用

  • grouby局部分析

  • GroupBy对象

  • series直接操作:value_counts()

import seaborn as sns
planets = sns.load_dataset('planets') #
planets
method number orbital_period mass distance year
0 Radial Velocity 1 269.300000 7.10 77.40 2006
1 Radial Velocity 1 874.774000 2.21 56.95 2008
2 Radial Velocity 1 763.000000 2.60 19.84 2011
3 Radial Velocity 1 326.030000 19.40 110.62 2007
4 Radial Velocity 1 516.220000 10.50 119.47 2009
... ... ... ... ... ... ...
1030 Transit 1 3.941507 NaN 172.00 2006
1031 Transit 1 2.615864 NaN 148.00 2007
1032 Transit 1 3.191524 NaN 174.00 2007
1033 Transit 1 4.125083 NaN 293.00 2008
1034 Transit 1 4.187757 NaN 260.00 2008

1035 rows × 6 columns

print(planets.info())   # 查看数据基本信息
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1035 entries, 0 to 1034
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   method          1035 non-null   object 
 1   number          1035 non-null   int64  
 2   orbital_period  992 non-null    float64
 3   mass            513 non-null    float64
 4   distance        808 non-null    float64
 5   year            1035 non-null   int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 48.6+ KB
None
print(planets.describe())  # 统计摘要
            number  orbital_period        mass     distance         year
count  1035.000000      992.000000  513.000000   808.000000  1035.000000
mean      1.785507     2002.917596    2.638161   264.069282  2009.070531
std       1.240976    26014.728304    3.818617   733.116493     3.972567
min       1.000000        0.090706    0.003600     1.350000  1989.000000
25%       1.000000        5.442540    0.229000    32.560000  2007.000000
50%       1.000000       39.979500    1.260000    55.250000  2010.000000
75%       2.000000      526.005000    3.040000   178.500000  2012.000000
max       7.000000   730000.000000   25.000000  8500.000000  2014.000000
print(planets.isnull().sum())  # 查看缺失值
method              0
number              0
orbital_period     43
mass              522
distance          227
year                0
dtype: int64
planets.dropna().describe()
number orbital_period mass distance year
count 498.00000 498.000000 498.000000 498.000000 498.000000
mean 1.73494 835.778671 2.509320 52.068213 2007.377510
std 1.17572 1469.128259 3.636274 46.596041 4.167284
min 1.00000 1.328300 0.003600 1.350000 1989.000000
25% 1.00000 38.272250 0.212500 24.497500 2005.000000
50% 1.00000 357.000000 1.245000 39.940000 2009.000000
75% 2.00000 999.600000 2.867500 59.332500 2011.000000
max 6.00000 17337.500000 25.000000 354.000000 2014.000000

groupby常分为分布:分割、应用累计、组合 image.png

import pandas as pd
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
    'data': range(6)}, columns=['key', 'data'])
df
key data
0 A 0
1 B 1
2 C 2
3 A 3
4 B 4
5 C 5
df.groupby('key') # 返回的是GroupBy对象!
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000231752DFDA0>

GroupBy对象

  • 魔力:可以看作是特殊形式DF,但是只有在累计函数计算时候,才会计算、显示。(延迟计算

df.groupby('key').sum()
data
key
A 3
B 5
C 7
  1. 基本操作:按列取值,类似df

  • 可以看作是DF集合,可以按照列取DF

  • 比如说从在a列中找出b特征最多的个数

planets.groupby('method')
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000231752DF590>
planets.groupby('method')['orbital_period']
<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000231752DD010>
median1 = planets.groupby('method')['orbital_period'].median()
median1
method
Astrometry                         631.180000
Eclipse Timing Variations         4343.500000
Imaging                          27500.000000
Microlensing                      3300.000000
Orbital Brightness Modulation        0.342887
Pulsar Timing                       66.541900
Pulsation Timing Variations       1170.000000
Radial Velocity                    360.200000
Transit                              5.714932
Transit Timing Variations           57.011000
Name: orbital_period, dtype: float64
median1.index
Index(['Astrometry', 'Eclipse Timing Variations', 'Imaging', 'Microlensing',
       'Orbital Brightness Modulation', 'Pulsar Timing',
       'Pulsation Timing Variations', 'Radial Velocity', 'Transit',
       'Transit Timing Variations'],
      dtype='object', name='method')

可以看到分组后index是groupby, 如果要默认整数索引,通过.reset_index.

  • 方便后续通过列名进行操作 sort等

  • name参数为前面操作函数 重新设置列名

planets.groupby('method')['orbital_period'].median().reset_index(name = 'orbital_period_median')
method orbital_period_median
0 Astrometry 631.180000
1 Eclipse Timing Variations 4343.500000
2 Imaging 27500.000000
3 Microlensing 3300.000000
4 Orbital Brightness Modulation 0.342887
5 Pulsar Timing 66.541900
6 Pulsation Timing Variations 1170.000000
7 Radial Velocity 360.200000
8 Transit 5.714932
9 Transit Timing Variations 57.011000
planets.groupby('method')['orbital_period'].median().reset_index()
method orbital_period
0 Astrometry 631.180000
1 Eclipse Timing Variations 4343.500000
2 Imaging 27500.000000
3 Microlensing 3300.000000
4 Orbital Brightness Modulation 0.342887
5 Pulsar Timing 66.541900
6 Pulsation Timing Variations 1170.000000
7 Radial Velocity 360.200000
8 Transit 5.714932
9 Transit Timing Variations 57.011000
planets.groupby('method').size().reset_index(name='size')
method size
0 Astrometry 2
1 Eclipse Timing Variations 9
2 Imaging 38
3 Microlensing 23
4 Orbital Brightness Modulation 3
5 Pulsar Timing 5
6 Pulsation Timing Variations 1
7 Radial Velocity 553
8 Transit 397
9 Transit Timing Variations 4

一起分组多个,比如item_name和对应价格经常对应。

# chipo.groupby(['item_name', 'item_price']).size()

2. 按组迭代

planets.groupby('method')
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000023177A84980>
# 查看某个分组数据
planets.groupby('method').get_group('Astrometry')
method number orbital_period mass distance year
113 Astrometry 1 246.36 NaN 20.77 2013
537 Astrometry 1 1016.00 NaN 14.98 2010
for (method, group) in planets.groupby('method'):
    print("{0:30s} shape={1}".format(method, group.shape))
Astrometry                     shape=(2, 6)
Eclipse Timing Variations      shape=(9, 6)
Imaging                        shape=(38, 6)
Microlensing                   shape=(23, 6)
Orbital Brightness Modulation  shape=(3, 6)
Pulsar Timing                  shape=(5, 6)
Pulsation Timing Variations    shape=(1, 6)
Radial Velocity                shape=(553, 6)
Transit                        shape=(397, 6)
Transit Timing Variations      shape=(4, 6)
planets.groupby('method')['year'].describe()
count mean std min 25% 50% 75% max
method
Astrometry 2.0 2011.500000 2.121320 2010.0 2010.75 2011.5 2012.25 2013.0
Eclipse Timing Variations 9.0 2010.000000 1.414214 2008.0 2009.00 2010.0 2011.00 2012.0
Imaging 38.0 2009.131579 2.781901 2004.0 2008.00 2009.0 2011.00 2013.0
Microlensing 23.0 2009.782609 2.859697 2004.0 2008.00 2010.0 2012.00 2013.0
Orbital Brightness Modulation 3.0 2011.666667 1.154701 2011.0 2011.00 2011.0 2012.00 2013.0
Pulsar Timing 5.0 1998.400000 8.384510 1992.0 1992.00 1994.0 2003.00 2011.0
Pulsation Timing Variations 1.0 2007.000000 NaN 2007.0 2007.00 2007.0 2007.00 2007.0
Radial Velocity 553.0 2007.518987 4.249052 1989.0 2005.00 2009.0 2011.00 2014.0
Transit 397.0 2011.236776 2.077867 2002.0 2010.00 2012.0 2013.00 2014.0
Transit Timing Variations 4.0 2012.500000 1.290994 2011.0 2011.75 2012.5 2013.25 2014.0

3. 高级操作:aggregate()、filter()、transform() 和apply()

  • 累计:一次性多个统计操作

  • 过滤

  • 转换
    apply() 对一行或者一列的每个元素操作
    transform() 对整个df或者列操作,确保形状不变!

import pandas as pd
import numpy as np
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
    'data1': range(6),
    'data2': rng.randint(0, 10, 6)},
    columns = ['key', 'data1', 'data2'])
df
key data1 data2
0 A 0 5
1 B 1 0
2 C 2 3
3 A 3 3
4 B 4 7
5 C 5 9
df.aggregate(['min', 'max']) # 直接使用
key data1 data2
min A 0 0
max C 5 9
df.groupby('key').aggregate(['min', 'median', 'max']) # 可以自定义一次统计的操作
data1 data2
min median max min median max
key
A 0 1.5 3 3 4.0 5
B 1 2.5 4 0 3.5 7
C 2 3.5 5 3 6.0 9
df.groupby('key').std()
data1 data2
key
A 2.12132 1.414214
B 2.12132 4.949747
C 2.12132 4.242641
def filter_func(x):
    return x['data2'].std() > 4
df.groupby('key').filter(filter_func)
key data1 data2
1 B 1 0
2 C 2 3
4 B 4 7
5 C 5 9
df.groupby('key').transform(lambda x:x - x.mean()) # 标准化每个元素减去每组的平均值
data1 data2
0 -1.5 1.0
1 -1.5 -3.5
2 -1.5 -3.0
3 1.5 -1.0
4 1.5 3.5
5 1.5 3.0
df.groupby('key').apply(lambda x:x - x.mean()) # 标准化每个元素减去每组的平均值
data1 data2
key
A 0 -1.5 1.0
3 1.5 -1.0
B 1 -1.5 -3.5
4 1.5 3.5
C 2 -1.5 -3.0
5 1.5 3.0
  1. 键分割

下面例子中,decade并不是planets中的一列, 但也可以传入进行分组

decade =  (planets['year'] // 10) * 10
decade = decade.astype(str) + 's'
decade.name = 'decade'
planets.groupby(['method', decade])['number'].sum().unstack().fillna(0)
decade 1980s 1990s 2000s 2010s
method
Astrometry 0.0 0.0 0.0 2.0
Eclipse Timing Variations 0.0 0.0 5.0 10.0
Imaging 0.0 0.0 29.0 21.0
Microlensing 0.0 0.0 12.0 15.0
Orbital Brightness Modulation 0.0 0.0 0.0 5.0
Pulsar Timing 0.0 9.0 1.0 1.0
Pulsation Timing Variations 0.0 0.0 1.0 0.0
Radial Velocity 1.0 52.0 475.0 424.0
Transit 0.0 0.0 64.0 712.0
Transit Timing Variations 0.0 0.0 0.0 9.0

series直接操作:value_counts()

planets['method'].value_counts().head(1) # 统计一列
method
Radial Velocity    553
Name: count, dtype: int64

Reshaping and pivot tables#

cut#

  • 生成区间分组, 分类

ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60])
pd.cut(ages, bins=5)
[(9.95, 20.0], (9.95, 20.0], (9.95, 20.0], (9.95, 20.0], (20.0, 30.0], (20.0, 30.0], (20.0, 30.0], (50.0, 60.0], (50.0, 60.0]]
Categories (5, interval[float64, right]): [(9.95, 20.0] < (20.0, 30.0] < (30.0, 40.0] < (40.0, 50.0] < (50.0, 60.0]]
pd.cut(ages, bins=[0, 18, 35, 70])
[(0, 18], (0, 18], (0, 18], (0, 18], (18, 35], (18, 35], (18, 35], (35, 70], (35, 70]]
Categories (3, interval[int64, right]): [(0, 18] < (18, 35] < (35, 70]]
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']
names = pd.Series(data)
full_monte = pd.DataFrame({'name': names,
    'info': ['B|C|D', 'B|D', 'A|C', 'B|D', 
    'B|C|D']})
full_monte['info'].str.get_dummies('|')
A B C D
0 0 1 1 1
1 0 1 0 1
2 1 0 1 0
3 0 1 0 1
4 0 1 1 1

stack 和 unstack#

  • stack 把列索引变成行索引

melt#

宽表人类看,长表分析好

df = pd.DataFrame({
    'name':['ming'],
    'math':[90],
    'english':[30]
})
df
name math english
0 ming 90 30
df.melt(
    id_vars = ['name'], # 那几列不变
    value_vars = ['math', 'english'], # 这几个类名合成一列
    var_name = 'subject', # 合成列的名字
    value_name = 'score' # 剩余数值列名字
)
name subject score
0 ming math 90
1 ming english 30

get_dummies()#

  • 默认识别 object(str)或者categorical 类型

  • 我们也可以指定这两种类型的列

df = pd.DataFrame({
    'key': list('abcdef'),
    'data1': range(6)
})
df.dtypes
key        str
data1    int64
dtype: object
df
key data1
0 a 0
1 b 1
2 c 2
3 d 3
4 e 4
5 f 5
pd.get_dummies(df['key'])
a b c d e f
0 True False False False False False
1 False True False False False False
2 False False True False False False
3 False False False True False False
4 False False False False True False
5 False False False False False True
dummies = pd.get_dummies(df['key'], prefix='key')
df[['data1']].join(dummies)
data1 key_a key_b key_c key_d key_e key_f
0 0 True False False False False False
1 1 False True False False False False
2 2 False False True False False False
3 3 False False False True False False
4 4 False False False False True False
5 5 False False False False False True

3.10 数据透视表#

  • 数据透视表是多维的GroupBy操作,比如二维分割 很方便

  • titantic 示例

3.10.1 示例:titantic#

import seaborn as sns
titanic = sns.load_dataset('titanic')
titanic.head()
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True

3.10.2 手工制作数据透视表#

比如:现在要不同性别乘客生存率, 不同性别、船舱等级的生存率

titanic.groupby('sex')['survived'].mean()
titanic.groupby(['sex', 'class'])['survived'].aggregate('mean').unstack()

3.10.3 数据透视表语法#

pivot_table来实现上面等价效果

titanic.pivot_table('survived', index = 'sex', columns = 'class')

结果表明:一等舱的女性几乎全部生还,而三等舱的男性只有10之1生还。

1. 多级数据透视表

  • 层级索引效果

制定了 三层索引,索引侧[sex, age] ,列名侧class

age = pd.cut(titanic['age'], [0, 18, 80]) # 分段
# print(age)
titanic.pivot_table('survived', ['sex', age], 'class')
  1. 其他数据透视表选项

titanic.pivot_table('survived', index = 'sex', columns = 'class', margins=True)  # margins计数分组

3.10.4 示例:美国人的生日#

births = pd.read_csv('births.csv')
births.head()

🗼1. 增加一列,看各年代出生男女比例

births['decade'] = 10 * (births['year'] // 10)
births.pivot_table('births', index = 'decade', columns = 'gender', aggfunc='sum') # aggfunc设置累计函数,默认是均值
import matplotlib.pyplot as plt
sns.set() # 使用Seaborn风格
births.pivot_table('births', index = 'year', columns = 'gender', aggfunc='sum').plot()
plt.ylabel('total births this year')

🗼2. 不同年代不同日期的日均出生数

births[births['day'] > 30]

发现,day中有NaN

births.dropna(inplace = True) #原地修改
births.isnull().any()
births['day'].dtype
births['day'] = births['day'].astype(int)
births['day'].dtype

剔除异常日期,如0199 ???

births = births.query('1 <= day <= 31')
births
quartiles = np.percentile(births['births'], [25, 50, 75])
mu = quartiles[1] # 均值
sig = 0.74 * (quartiles[2] - quartiles[0]) # 标准差估计值
births = births.query('(births > @mu - 5 * @sig) & (births < @mu + 5 * @sig)')

创建一个日期索引

births.index = pd.to_datetime(10000 * births.year +100 * births.month +births.day, format="%Y%m%d")
births['dayofweek'] = births.index.dayofweek
births
births.pivot_table('births', index = 'dayofweek', columns= 'decade', aggfunc = 'mean')
births.pivot_table('births', index = 'dayofweek', columns= 'decade', aggfunc = 'mean').plot()
plt.ylabel('mean births by day')

🗼3. 各年份平均每天出生人数,按照月和日两个维度分组.时间索引

births
births_by_date = births.pivot_table('births', index = [births.index.month, births.index.day])
births_by_date
births_by_date.index =[ pd.to_datetime(f"2012{month:02}{day:02}") for (month, day) in births_by_date.index]
births_by_date.plot()

Time series / date functionality#

3.12 处理时间序列#

三种时间:时间戳,时间间隔(天),持续时间(s)

Time deltas#

Categorical data#

3.12.1 Python的日期与时间工具#

  1. 原生Python的日期与时间工具:datetime(标准库)与dateutil(三方)

from datetime import datetime
datetime(2015, 7, 4)
datetime.datetime(2015, 7, 4, 0, 0)
from dateutil import parser
date = parser.parse("4th of July, 2015")
date.strftime('%A') # 星期
'Saturday'
  1. 时间类型数组:NumPy的datetime64类型

  • 是64位整数

date = np.array('2015-07-04', dtype=np.datetime64)
date
date + np.arange(12)
  1. Pandas的日期与时间工具:理想与现实的最佳解决方案

  • 自己的TimeStamp对象实现

  • 可以使用一组TimeStamp对象创建时间索引

import pandas as pd
pd.to_datetime("4th of July, 2015")
Timestamp('2015-07-04 00:00:00')
几列合并一列
# data['date'] = pd.to_datetime(data[['Yr', 'Mo', 'Dy']])

3.12.2 Pandas时间序列:用时间作索引#

index = pd.DatetimeIndex(['2014-07-04', '2014-08-04',
    '2015-07-04', '2015-08-04'])
data = pd.Series([0, 1, 2, 3], index=index)
data
2014-07-04    0
2014-08-04    1
2015-07-04    2
2015-08-04    3
dtype: int64
data['2015'] # 可以直接获取一年的
2015-07-04    2
2015-08-04    3
dtype: int64

3.12.3 Pandas时间序列数据结构#

TimeStamp, Period, Timedelta 时间结构即系列索引 DateTimeIndex,PeriodIndex,TimedeltaIndex

df['日期'].dt.year # dt保证把序列对象转为datetime处理

to_datetime都是时间格式

d = pd.to_datetime('2020',format='%Y')
d
Timestamp('2020-01-01 00:00:00')
plotdf['date'] = pd.to_datetime({
    'year': plotdf['year'],
    'month': plotdf['month'],
    'day': 1  # 用每月第一天作为默认
})
d.year
2020

支持replace修改

d.replace(year=d.year - 100)
Timestamp('1920-01-01 00:00:00')
from datetime import datetime
dates = pd.to_datetime([datetime(2015, 7, 3), '4th of July, 2015','2015-Jul-6', '07-07-2015', '20150708'])
dates
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[2], line 1
----> 1 dates = pd.to_datetime([datetime(2015, 7, 3), '4th of July, 2015','2015-Jul-6', '07-07-2015', '20150708'])
      2 dates

NameError: name 'pd' is not defined
dates.to_period('D')
PeriodIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
             '2015-07-08'],
            dtype='period[D]')
dates - dates[0]

创建有规律时间序列

pd.date_range('20120101','20130101')
DatetimeIndex(['2012-01-01', '2012-01-02', '2012-01-03', '2012-01-04',
               '2012-01-05', '2012-01-06', '2012-01-07', '2012-01-08',
               '2012-01-09', '2012-01-10',
               ...
               '2012-12-23', '2012-12-24', '2012-12-25', '2012-12-26',
               '2012-12-27', '2012-12-28', '2012-12-29', '2012-12-30',
               '2012-12-31', '2013-01-01'],
              dtype='datetime64[ns]', length=367, freq='D')
pd.date_range('20140202', periods=8, freq='h')
pd.period_range('201701', periods=8, freq='M')
PeriodIndex(['2017-01', '2017-02', '2017-03', '2017-04', '2017-05', '2017-06',
             '2017-07', '2017-08'],
            dtype='period[M]')

!常常要实现类似POWERBI钻取,按照年月季度聚合。 .dt类型检查了,不加也行

df['year'] = df['date'].dt.year         # 提取年份
df['month'] = df['date'].dt.month       # 提取月份
df['day'] = df['date'].dt.day           # 提取日
df['quarter'] = df['date'].dt.quarter   # 提取季度
df['weekday'] = df['date'].dt.weekday   # 提取星期(0=周一)
df['week'] = df['date'].dt.isocalendar().week  # 提取周(注意版本要求)

3.12.4 时间频率与偏移量#

pd.timedelta_range(0, periods=9, freq='2h30t') #间隔2消失的偏移量

3.12.5 重新取样、迁移和窗口#

from pandas_datareader import data
goog = data.DataReader('GOOGL', start='2019-1-1',data_source='stooq')
goog.head()
goog['Close']
goog['Close'].plot()

resample以数据累计基础 asfreq以数据取样基础

goog['Close'].plot()
goog['Close'].resample('BA').mean().plot(style = ':') # 上一年的均值
goog['Close'].asfreq('BA').plot(style = '--') # 年末最后工作日值
goog_close = goog['Close']
data = goog_close.iloc[:10]
data.asfreq('D').plot(style = '-o') # 生成时间序列,包含休息天
data.asfreq('D', method='bfill').plot(style = '--') # 向后填充

时间迁移:

data = goog_close.asfreq('D', method='bfill')
data.plot()
data.shift(120).plot() # 向后迁移

# 设置图例与标签
local_max = pd.to_datetime('2007-11-05')
offset = pd.Timedelta(120, 'D')
  1. 移动时间窗口

rolling = goog_close.rolling(365, center=True)
data = pd.DataFrame({'input': goog_close,
    'one-year rolling_mean': rolling.mean(),
    'one-year rolling_std': rolling.std()})
ax = data.plot(style=['-', '--', ':'])
ax.lines[0].set_alpha(0.3)

3.12.7 案例:美国西雅图自行车统计数据的可视化#

弗莱蒙特桥每小时通行的自行车数量

import pandas as pd
data = pd.read_csv('Fremont_Bridge_Bicycle_Counter.csv', index_col='Date', parse_dates=True)
data.head()
C:\Users\63517\AppData\Local\Temp\ipykernel_15928\361130387.py:2: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  data = pd.read_csv('Fremont_Bridge_Bicycle_Counter.csv', index_col='Date', parse_dates=True)
Fremont Bridge Sidewalks, south of N 34th St Fremont Bridge Sidewalks, south of N 34th St Cyclist West Sidewalk Fremont Bridge Sidewalks, south of N 34th St Cyclist East Sidewalk
Date
2012-10-02 13:00:00 55.0 7.0 48.0
2012-10-02 14:00:00 130.0 55.0 75.0
2012-10-02 15:00:00 152.0 81.0 71.0
2012-10-02 16:00:00 278.0 167.0 111.0
2012-10-02 17:00:00 563.0 393.0 170.0
data.columns = ['total','West', 'East']
data.dropna().describe()
data.plot() # 每hour自行车数量

resample()用于对时间序列数据(即索引DatetimeIndex)重新采样,可以指定时间间隔聚合数据。

import pandas as pd
import numpy as np

# 创建示例数据
dates = pd.date_range(start='2000-01-01', periods=1000, freq='D')
data = pd.DataFrame({'value': np.random.randint(1, 100, size=1000)}, index=dates)

# 1. 按月求和
data.resample('ME').sum()

# 2. 按季度求均值
data.resample('QE').mean()

# 3. 按 10 年聚合
data.resample('10YS').sum()

# 4. 重新采样,并前向填充缺失值
data.resample('D').ffill()
value
2000-01-01 90
2000-01-02 31
2000-01-03 51
2000-01-04 31
2000-01-05 70
... ...
2002-09-22 66
2002-09-23 29
2002-09-24 40
2002-09-25 94
2002-09-26 27

1000 rows × 1 columns


weekly = data.resample('W').sum()
weekly.plot(style = ['-','--',':'])
<Axes: xlabel='Date'>
../_images/144356ad9b30d6f4a13ba9873eb8856e48096fb6ab42f89f3b32ba39a26d7600.png
daily = data.resample('D').sum() #日
daily.rolling(30, center=True).mean().plot(style=[':', '--', '-']) # 30日移动平均

平滑

daily.rolling(50, win_type='gaussian').sum(std=10).plot(style=[':', '--', '-']);

Enhancing performance#

3.13 高性能Pandas:eval()与query()#

直接运行c语言

3.13.1 query()与eval()的设计动机:复合代数式#

import numpy as np
rng = np.random.RandomState(42)
x = rng.rand(100000)
y = rng.rand(100000)
%timeit x+y # %timeit单行代码

但处理复合式子很慢, 这样情景对于大数组应该采用更高效的方式

mask = (x > 0.5) & (y < 0.5)

3.13.2 用pandas.eval()实现高性能运算#

用字符串的代数式 代替了 运算表达式

import numpy as np
import pandas as pd
nrows, ncols = 100000, 100 
rng = np.random.RandomState(42) 
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols)) 
                     for i in range(4))

普通方法计算和

%timeit df1+df2+df3+df4
46.3 ms ± 5.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

eval字符串代数式计算

%timeit pd.eval('df1+df2+df3+df4')
16.6 ms ± 595 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

!明显eval速度快一倍

pd.eval()支持的运算

df1, df2, df3, df4, df5 = (pd.DataFrame(rng.randint(0, 1000, (100, 3))) for i in range(5))
  1. 算数运算

result1 = -df1 * df2 / (df3 + df4) - df5 
result2 = pd.eval('-df1 * df2 / (df3 + df4) - df5') 
np.allclose(result1, result2) # 检查是否近似相等
True
  1. 比较运算

result1 = (df1 < df2) & (df2 <= df3) & (df3 != df4)
 
result2 = pd.eval('df1 < df2 <= df3 != df4') 
np.allclose(result1, result2) 
True
  1. 位运算

result1 = (df1 < 0.5) & (df2 < 0.5) | (df3 < df4) 
result2 = pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)') 
np.allclose(result1, result2) 
True
result3 = pd.eval('(df1 < 0.5) and (df2 < 0.5) or (df3 < df4)') 
np.allclose(result1, result3) 
True
  1. 对象属性与索引

result1 = df2.T[0] + df3.iloc[1] 
print(result1)
result2 = pd.eval('df2.T[0] + df3.iloc[1]') 
np.allclose(result1, result2) 
0    841
1    729
2    937
dtype: int32
True

3.13.3 用DataFrame.eval()实现列间运算#

eval可以直接用列名计算!

df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C']) 
df.head() 
A B C
0 0.375506 0.406939 0.069938
1 0.069087 0.235615 0.154374
2 0.677945 0.433839 0.652324
3 0.264038 0.808055 0.347197
4 0.589161 0.252418 0.557789

正常的eval写法, 通过.引用列(属性)

result1 = (df['A'] + df['B']) / (df['C'] - 1) 
result2 = pd.eval("(df.A + df.B) / (df.C - 1)") 
np.allclose(result1, result2) 
True

更加简洁的,通过df调用eval

result3 = df.eval('(A + B) / (C - 1)') 
np.allclose(result1, result3) 
True

其他常见使用:

  1. 使用df.eval增加列

df.eval('D = (A + B) / C', inplace= True)
df.head()
A B C D
0 0.375506 0.406939 0.069938 11.187620
1 0.069087 0.235615 0.154374 1.973796
2 0.677945 0.433839 0.652324 1.704344
3 0.264038 0.808055 0.347197 3.087857
4 0.589161 0.252418 0.557789 1.508776
  1. df.eval通过@使用局部变量, 而不仅是上面的列名

column_mean = df.mean(1) 
result1 = df['A'] + column_mean 
result2 = df.eval('A + @column_mean') 
np.allclose(result1, result2) 
True

3.13.4 DataFrame.query()方法#

更加简化之间的eval查询过滤运算

eval过滤写法

result1 = df[(df.A < 0.5) & (df.B < 0.5)] 
result2 = pd.eval('df[(df.A < 0.5) & (df.B < 0.5)]') 
np.allclose(result1, result2) 
True
df.A.query('<0.5')
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[52], line 1
----> 1 df.A.query('<0.5')

File d:\miniconda3\Lib\site-packages\pandas\core\generic.py:6299, in NDFrame.__getattr__(self, name)
   6292 if (
   6293     name not in self._internal_names_set
   6294     and name not in self._metadata
   6295     and name not in self._accessors
   6296     and self._info_axis._can_hold_identifiers_and_holds_name(name)
   6297 ):
   6298     return self[name]
-> 6299 return object.__getattribute__(self, name)

AttributeError: 'Series' object has no attribute 'query'

query写法

Cmean = df['C'].mean() 
result1 = df[(df.A < Cmean) & (df.B < Cmean)] 
result2 = df.query('A < @Cmean and B < @Cmean') 
np.allclose(result1, result2) 
True
result2 = df.query('A < 0.5 and B < 0.5') 
np.allclose(result1, result2) 
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[49], line 2
      1 result2 = df.query('A < 0.5 and B < 0.5') 
----> 2 np.allclose(result1, result2) 

File d:\miniconda3\Lib\site-packages\numpy\_core\numeric.py:2329, in allclose(a, b, rtol, atol, equal_nan)
   2243 @array_function_dispatch(_allclose_dispatcher)
   2244 def allclose(a, b, rtol=1.e-5, atol=1.e-8, equal_nan=False):
   2245     """
   2246     Returns True if two arrays are element-wise equal within a tolerance.
   2247 
   (...)
   2327 
   2328     """
-> 2329     res = all(isclose(a, b, rtol=rtol, atol=atol, equal_nan=equal_nan))
   2330     return builtins.bool(res)

File d:\miniconda3\Lib\site-packages\numpy\_core\numeric.py:2447, in isclose(a, b, rtol, atol, equal_nan)
   2444     y = float(y)
   2446 with errstate(invalid='ignore'):
-> 2447     result = (less_equal(abs(x-y), atol + rtol * abs(y))
   2448               & isfinite(y)
   2449               | (x == y))
   2450     if equal_nan:
   2451         result |= isnan(x) & isnan(y)

ValueError: operands could not be broadcast together with shapes (235,4) (231,4) 
df[]
  Cell In[50], line 1
    df[]
       ^
SyntaxError: invalid syntax

iloc 和loc#

  • loc操作的行列名字。 loc[行名或者过滤行条件,列名]

    • 过滤条件就是一个series bool值

  • iloc操作的是位置

import pandas as pd
import numpy as np

data = {
    'SK_ID': [101, 102, 103, 104],
    'TARGET': [0, 1, 0, 1],
    'NAME': ['Alice', 'Bob', 'Charlie', 'David'],
    'AMT_INCOME': [200000, 150000, 300000, 120000],
    'DAYS_BIRTH': [-15000, -10000, -20000, -9000]
}

df = pd.DataFrame(data)
# 为了演示 loc 的行名功能,我们把 SK_ID 设为索引
df.set_index('SK_ID', inplace=True)
df
TARGET NAME AMT_INCOME DAYS_BIRTH
SK_ID
101 0 Alice 200000 -15000
102 1 Bob 150000 -10000
103 0 Charlie 300000 -20000
104 1 David 120000 -9000
df.loc[101, :]
TARGET             0
NAME           Alice
AMT_INCOME    200000
DAYS_BIRTH    -15000
Name: 101, dtype: object
df.loc[df['TARGET']==0, 'NAME']
SK_ID
101      Alice
103    Charlie
Name: NAME, dtype: str
df.loc[(df['TARGET']==0 )& (df['AMT_INCOME'] > 180000), 'NAME']
SK_ID
101      Alice
103    Charlie
Name: NAME, dtype: str

replace#

  • SeriesDF 对象都支持

  • 一对一

  • 多个值统一

import pandas as pd
import numpy as np
s = pd.Series(np.array([1,2,3, None]))
print(s)
# 
0       1
1       2
2       3
3    None
dtype: object
0     NaN
1       2
2       3
3    None
dtype: object
0       1
1     NaN
2     NaN
3    None
dtype: object
0       a
1       4
2     NaN
3    None
dtype: object
s1 = s.replace(1, np.nan)
print(s1)
0     NaN
1       2
2       3
3    None
dtype: object
# 多对多,映射替换
s3 = s.replace({1: 'a', 2: 4, 3:np.nan})
print(s3)
0       a
1       4
2     NaN
3    None
dtype: object
# 多换一
s2 = s.replace([2, 3], np.nan)
print(s2)
0       1
1     NaN
2     NaN
3    None
dtype: object
area_dict = {'California': 3, 'Texas': 3, 'New York': 1,
    'Florida': -1, 'Illinois': 2}
area = pd.Series(area_dict)
populations_dict = {'California': 2, 'Texas': 2, 'New York': 3,
    'Florida': -1, 'Illinois': 4}
populations = pd.Series(populations_dict)
states = pd.DataFrame({'population': populations, 'area': area})
print(states)
            population  area
California           2     3
Texas                2     3
New York             3     1
Florida              6     5
Illinois             4     2
# 全局替换
states.replace(-1, np.nan)
population area
California 2 3
Texas 2 3
New York 3 1
Florida 6 5
Illinois 4 2
# 对不同列 不同值 
states.replace({
    'area':{
        3: 33,
        1: 11
    },
    'population':{
        2:22,
        6:66
    }
})
population area
California 22 33
Texas 22 33
New York 3 11
Florida 66 5
Illinois 4 2
apply 计算长度
#data['文本长度'] = data['新闻'].apply(len)
import pandas as pd
names =pd.DataFrame(
    {
        'name': ['GOGO','papa','jkjk'],
        'count':[122,3,4]
    }
)
names.set_index('name')
count
name
GOGO 122
papa 3
jkjk 4
names['count'].idxmax() # 不好用! 只能返回一个索引
0

Sparse data structures#

Chart visualization#

3.14 绘图接口#

pandas 的绘图功能通过 plot() 方法实现,底层依赖 matplotlib,适用于 Series 和 DataFrame。

  • Series:单列数据,索引自动作 x 轴,值作 y 轴。

  • DataFrame:多列数据,可指定 x 和 y 列,或默认用索引作 x 轴。

通过 kind 参数指定:

  • line:折线图(默认)

  • bar:柱状图

  • pie:饼图

  • scatter:散点图(DataFrame 更常用)

  • area:面积图

  • box:箱线图

通过参数设置样式:

  • title:图表标题

  • color:颜色

  • figsize:大小

df['col'].nunique() # 返回唯一值个数

skew()衡量偏态分布程度, 如果是1,对称分布

Table Visualization#

cookbook#

computation#

相关性

df = pd.DataFrame(np.random.randn(3,4))
df
0 1 2 3
0 0.964237 0.573508 -1.109884 -1.506903
1 -1.361796 1.311485 -0.361705 1.636924
2 -1.122803 -2.252449 1.261629 -0.575619
corr = df.corr()
corr
0 1 2 3
0 1.000000 0.230549 -0.677053 -0.789277
1 0.230549 1.000000 -0.872202 0.415529
2 -0.677053 -0.872202 1.000000 0.082491
3 -0.789277 0.415529 0.082491 1.000000
df['target'] = np.random.randn(3)
df
0 1 2 3 target
0 0.964237 0.573508 -1.109884 -1.506903 0.348507
1 -1.361796 1.311485 -0.361705 1.636924 0.449459
2 -1.122803 -2.252449 1.261629 -0.575619 -0.157774
df['target'].corr(df[1])
np.float64(0.9991318029867298)

案例:食谱数据库#

数据清理

try:
    recipes = pd.read_json('recipeitems-latest.json')
except ValueError as e:
    print("ValueError:",e) # 数据断行
---------------------------------------------------------------------------

FileNotFoundError                         Traceback (most recent call last)

File c:\Users\63517\miniconda3\envs\data-analysis\Lib\site-packages\pandas\io\json\_json.py:948, in JsonReader._get_data_from_filepath(self, filepath_or_buffer)

    947 try:

--> 948     self.handles = get_handle(

    949         filepath_or_buffer,

    950         "r",

    951         encoding=self.encoding,

    952         compression=self.compression,

    953         storage_options=self.storage_options,

    954         errors=self.encoding_errors,

    955     )

    956 except OSError as err:



File c:\Users\63517\miniconda3\envs\data-analysis\Lib\site-packages\pandas\io\common.py:926, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)

    924 if ioargs.encoding and "b" not in ioargs.mode:

    925     # Encoding

--> 926     handle = open(

    927         handle,

    928         ioargs.mode,

    929         encoding=ioargs.encoding,

    930         errors=errors,

    931         newline="",

    932     )

    933 else:

    934     # Binary mode



FileNotFoundError: [Errno 2] No such file or directory: 'recipeitems-latest.json'



The above exception was the direct cause of the following exception:



FileNotFoundError                         Traceback (most recent call last)

Cell In[80], line 2

      1 try:

----> 2     recipes = pd.read_json('recipeitems-latest.json')

      3 except ValueError as e:

      4     print("ValueError:",e) # 数据断行



File c:\Users\63517\miniconda3\envs\data-analysis\Lib\site-packages\pandas\io\json\_json.py:829, in read_json(path_or_buf, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, precise_float, date_unit, encoding, encoding_errors, lines, chunksize, compression, nrows, storage_options, dtype_backend, engine)

    826 if convert_axes is None and orient != "table":

    827     convert_axes = True

--> 829 json_reader = JsonReader(

    830     path_or_buf,

    831     orient=orient,

    832     typ=typ,

    833     dtype=dtype,

    834     convert_axes=convert_axes,

    835     convert_dates=convert_dates,

    836     keep_default_dates=keep_default_dates,

    837     precise_float=precise_float,

    838     date_unit=date_unit,

    839     encoding=encoding,

    840     lines=lines,

    841     chunksize=chunksize,

    842     compression=compression,

    843     nrows=nrows,

    844     storage_options=storage_options,

    845     encoding_errors=encoding_errors,

    846     dtype_backend=dtype_backend,

    847     engine=engine,

    848 )

    850 if chunksize:

    851     return json_reader



File c:\Users\63517\miniconda3\envs\data-analysis\Lib\site-packages\pandas\io\json\_json.py:931, in JsonReader.__init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, precise_float, date_unit, encoding, lines, chunksize, compression, nrows, storage_options, encoding_errors, dtype_backend, engine)

    929     self.data = filepath_or_buffer

    930 elif self.engine == "ujson":

--> 931     data = self._get_data_from_filepath(filepath_or_buffer)

    932     # If self.chunksize, we prepare the data for the `__next__` method.

    933     # Otherwise, we read it into memory for the `read` method.

    934     if not (self.chunksize or self.nrows):



File c:\Users\63517\miniconda3\envs\data-analysis\Lib\site-packages\pandas\io\json\_json.py:957, in JsonReader._get_data_from_filepath(self, filepath_or_buffer)

    948     self.handles = get_handle(

    949         filepath_or_buffer,

    950         "r",

   (...)    954         errors=self.encoding_errors,

    955     )

    956 except OSError as err:

--> 957     raise FileNotFoundError(

    958         f"File {filepath_or_buffer} does not exist"

    959     ) from err

    960 filepath_or_buffer = self.handles.handle

    961 return filepath_or_buffer



FileNotFoundError: File recipeitems-latest.json does not exist
with open('recipeitems-latest.json') as f:
    line = f.readline()
pd.read_json(line).shape

每一行都是json对象,[]组成json格式。 解析到dataframe

with open('recipeitems-latest.json', 'r', encoding='utf-8') as f:
    data = (line.strip() for line in f)
    data_json = "[{0}]".format(','.join(data))
recipes = pd.read_json(data_json)
recipes.iloc[0]
recipes['ingredients'].str.len().describe()

!这个字段最长居然有9000多个字符, 看看是哪个

recipes.iloc[np.argmax(recipes.ingredients.str.len())]
recipes['description'].str.contains('[Bb]reakfast').sum() # 一共有3524个早餐

美食推荐系统 根据用户有的食材取搜索

spice_list = ['salt', 'pepper', 'oregano', 'sage', 'parsley',
    'rosemary', 'tarragon', 'thyme', 'paprika', 'cumin']
import re
spice_df = pd.DataFrame(dict( (spice, recipes['ingredients'].str.contains(spice, re.IGNORECASE))for spice in spice_list)) # 构建bool数组
spice_df
selection = spice_df.query('parsley & paprika & sage')
selection.index
recipes['name'][selection.index]