数据框

Reads: 841 Edit

DataFrame 是 Pandas 的另一个重要对象数据类型,其属于二维数组的形式,类似于Excel的一张表。可以同时存储数据分析中的多个变量,是进行统计分析和机器学习的基本数据结构。

1 创建DataFrame

创建空DataFrame:

>>> import pandas as pd
>>> import numpy  as np			# 后面的示例会用到np.array,因此同时导入numpy库(pandas本身也是基于numpy开发的)

>>> df = pd.DataFrame()
>>> print(df)
Empty DataFrame
Columns: []
Index: []

列表创建DataFrame:

>>> list1=[24,37,18,27,33,40,16]
>>> df1=pd.DataFrame(list1)				# 按列创建DataFrame
>>> df1
    0
0  24
1  37
2  18
3  27
4  33
5  40
6  16
>>> list1_1=[[24,37,18,27,33,40,16]]
>>> df1_1=pd.DataFrame(list1_1)			# 按行创建DataFrame
>>> df1_1
    0   1   2   3   4   5   6
0  24  37  18  27  33  40  16


>>> pd.DataFrame(list1,index=['a','b','c','d','e','f','g'],columns=['age'])
   age
a   24
b   37
c   18
d   27
e   33
f   40
g   16

>>> list2=[[24,'M'],[37,'F'],[18,'F'],[27,'M'],[33,'F'],[40,'M']]
>>> pd.DataFrame(list2,columns=['age','sex'])
   age sex
0   24   M
1   37   F
2   18   F
3   27   M
4   33   F
5   40   M
>>> pd.DataFrame(list(map(list,zip(*list2))),index=['age','sex'])   # 先将list2转置后再创建DataFrame,从而实现按照列生成DataFrame。
      0   1   2   3   4   5
age  24  37  18  27  33  40
sex   M   F   F   M   F   M

数组(numpy array)创建DataFrame:

>>> array1=np.array([24,37,18,27,33,40,16])
>>> pd.DataFrame(array1,index=['a','b','c','d','e','f','g'],columns=['age'])
   age
a   24
b   37
c   18
d   27
e   33
f   40
g   16

>>> array2=np.array([[24,'M'],[37,'F'],[18,'F'],[27,'M'],[33,'F'],[40,'M']])	# 注意:numpy数组的元素类型必须一致,数字和字符串混合时会全部转换为字符串,所以下面的age列实际上是字符串
>>> pd.DataFrame(array2,columns=['age','sex'])
  age sex
0  24   M
1  37   F
2  18   F
3  27   M
4  33   F
5  40   M
>>> pd.DataFrame(array2.T,index=['age','sex'])		# 可以通过array转置的方式按照列创建DataFrame
      0   1   2   3   4   5
age  24  37  18  27  33  40
sex   M   F   F   M   F   M

字典创建DataFrame:

>>> dict1 = {'age': [24,37,18,27,33,40], 'sex': ['M','F','F','M','F','M',]}
>>> pd.DataFrame(dict1)
   age sex
0   24   M
1   37   F
2   18   F
3   27   M
4   33   F
5   40   M

Series创建DataFrame:

>>> s1=pd.Series([24,37,18,27,33,40])
>>> s2=pd.Series(['M','F','F','M','F','M'])
>>> pd.DataFrame([s1,s2],index=['age','sex'])			# 默认是按照行来创建DataFrame!
      0   1   2   3   4   5
age  24  37  18  27  33  40
sex   M   F   F   M   F   M


>>> pd.DataFrame({'age':s1,'sex':s2})					# 可以将Series添加到字典,从而按照列来创建DataFrame!
   age sex
0   24   M
1   37   F
2   18   F
3   27   M
4   33   F
5   40   M

index的修改与重置:

>>> df1_1=df1_2=pd.DataFrame({'age': [24,37,18,27,33,40], 'sex': ['M','F','F','M','F','M',]})	# 注意:df1_1和df1_2指向同一个DataFrame对象,对其中一个做原地修改,另一个也会随之改变
>>> df1_1
   age sex
0   24   M
1   37   F
2   18   F
3   27   M
4   33   F
5   40   M
>>> df1_1.index=df1_1['age']
>>> df1_1
     age sex
age         
24    24   M
37    37   F
18    18   F
27    27   M
33    33   F
40    40   M
>>> df1_2.set_index('age', drop= True, inplace=True)
>>> df1_2
    sex
age    
24    M
37    F
18    F
27    M
33    F
40    M
>>> df1_1.reset_index(drop=True)
  sex
0   M
1   F
2   F
3   M
4   F
5   M
>>> df1_1.reset_index(drop=False)
   age sex
0   24   M
1   37   F
2   18   F
3   27   M
4   33   F
5   40   M

2 DataFrame数据的索引、修改和删除

索引和修改:

>>> df2=pd.DataFrame({'age': [24,37,18,27,33,40], 'sex': ['M','F','F','M','F','M',]})
>>> df2['age']
0    24
1    37
2    18
3    27
4    33
5    40
Name: age, dtype: int64
>>> df2[['age','sex']]
   age sex
0   24   M
1   37   F
2   18   F
3   27   M
4   33   F
5   40   M

>>> df2.loc[1,:]  		# 按照行的index进行索引
age    37
sex     F
Name: 1, dtype: object

>>> df2.iloc[1,:]		# 按照行的位置次序进行索引,注意行的index可以变,但是行的位置次序不会变。
age    37
sex     F
Name: 1, dtype: object


>>> df2.loc[1:3,'age':'sex']	# 根据行和列的index同时索引,注意loc的切片包含结束位置(这里包含index为3的行和'sex'列)
   age sex
1   37   F
2   18   F
3   27   M

>>> df2.iloc[1:4,0:2]			# 根据行和列的位置次序同时索引,注意iloc的切片不包含结束位置(这里取第1~3行、第0~1列)
   age sex
1   37   F
2   18   F
3   27   M

>>> df2.loc[4,'sex']			
'F'

>>> df2.iloc[4,1]
'F'


>>> df2.loc[1,'age']=38
>>> df2
   age sex
0   24   M
1   38   F
2   18   F
3   27   M
4   33   F
5   40   M

>>> df2.iloc[1,0]=37
>>> df2
   age sex
0   24   M
1   37   F
2   18   F
3   27   M
4   33   F
5   40   M

添加:

>>> df2=pd.DataFrame({'age': [24,37,18,27,33,40], 'sex': ['M','F','F','M','F','M',]})
>>> df2['age']
0    24
1    37
2    18
3    27
4    33
5    40
Name: age, dtype: int64

>>> df2.insert(1,column='name',value=['denise','maggie','eileen','cyril','julian','colin'])
>>> df2
   age    name sex
0   24  denise   M
1   37  maggie   F
2   18  eileen   F
3   27   cyril   M
4   33  julian   F
5   40   colin   M

>>> df2.loc[len(df2.index)] = [20, 'jack', 'M']			# 在末尾追加一行
>>> df2
   age    name sex
0   24  denise   M
1   37  maggie   F
2   18  eileen   F
3   27   cyril   M
4   33  julian   F
5   40   colin   M
6   20    jack   M

删除:

接着上面的df2操作

>>> df2.drop('sex',axis=1)		# 删除1列;默认返回一个新的DataFrame,df2本身不变,如果需要直接修改df2,需要加参数`inplace=True`(布尔值,不能写成字符串'True')。
   age    name
0   24  denise
1   37  maggie
2   18  eileen
3   27   cyril
4   33  julian
5   40   colin
6   20    jack
>>> df2.drop(['age','sex'],axis=1)		# 删除多列,如果需要直接修改df2,需要加参数`inplace=True`。
     name
0  denise
1  maggie
2  eileen
3   cyril
4  julian
5   colin
6    jack

>>> df2.drop(6,axis=0)   # 删除1行,如果需要直接修改df2,需要加参数`inplace=True`。
   age    name sex
0   24  denise   M
1   37  maggie   F
2   18  eileen   F
3   27   cyril   M
4   33  julian   F
5   40   colin   M
>>> df2.drop(range(2,4),axis=0)		# 删除多行,如果需要直接修改df2,需要加参数`inplace=True`。
   age    name sex
0   24  denise   M
1   37  maggie   F
4   33  julian   F
5   40   colin   M
6   20    jack   M

3 获取DataFrame的属性

>>> df3=pd.DataFrame({'age': [24,37,18,27,33,40], 'sex': ['M','F','F','M','F','M',],'name':['denise','maggie','eileen','cyril','julian','colin']})
>>> df3.T
           0       1       2      3       4      5
age       24      37      18     27      33     40
sex        M       F       F      M       F      M
name  denise  maggie  eileen  cyril  julian  colin
>>> df3.axes
[RangeIndex(start=0, stop=6, step=1), Index(['age', 'sex', 'name'], dtype='object')]
>>> df3.ndim
2
>>> df3.shape
(6, 3)
>>> df3.values
array([[24, 'M', 'denise'],
       [37, 'F', 'maggie'],
       [18, 'F', 'eileen'],
       [27, 'M', 'cyril'],
       [33, 'F', 'julian'],
       [40, 'M', 'colin']], dtype=object)
>>> df3.head(3)
   age sex    name
0   24   M  denise
1   37   F  maggie
2   18   F  eileen
>>> df3.tail(2)
   age sex    name
4   33   F  julian
5   40   M   colin

4 DataFrame中的函数操作

>>> df3.sort_values(by='age',axis=0)
   age sex    name
2   18   F  eileen
0   24   M  denise
3   27   M   cyril
4   33   F  julian
1   37   F  maggie
5   40   M   colin

>>> df3.sort_index()
   age sex    name
0   24   M  denise
1   37   F  maggie
2   18   F  eileen
3   27   M   cyril
4   33   F  julian
5   40   M   colin

>>> df3.replace('M','F')
   age sex    name
0   24   F  denise
1   37   F  maggie
2   18   F  eileen
3   27   F   cyril
4   33   F  julian
5   40   F   colin

>>> df3.drop_duplicates('sex')
   age sex    name
0   24   M  denise
1   37   F  maggie

>>> df4=pd.DataFrame({'age': [24,37,18,27,33,40], 'sex': ['M','F','F','M','F','M',],'weight':[76,56,49,84,61,66]})
>>> df4
   age sex  weight
0   24   M      76
1   37   F      56
2   18   F      49
3   27   M      84
4   33   F      61
5   40   M      66

>>> df4.mean(axis=0,numeric_only=True)		# 按列求均值;sex列是字符串,pandas 2.0及以上版本必须指定numeric_only=True,否则会报TypeError
age       29.833333
weight    65.333333
dtype: float64
>>> df4.mean(axis=1,numeric_only=True)		# 按行求均值,同样只对数值列(age和weight)计算
0    50.0
1    46.5
2    33.5
3    55.5
4    47.0
5    53.0
dtype: float64

>>> df4.describe()
             age     weight
count   6.000000   6.000000
mean   29.833333  65.333333
std     8.328665  12.925427
min    18.000000  49.000000
25%    24.750000  57.250000
50%    30.000000  63.500000
75%    36.000000  73.500000
max    40.000000  84.000000
>>> df4.describe(include='object')
       sex
count    6
unique   2
top      M
freq     3

5 合并DataFrame对象

按照索引合并:

>>> df5_1=pd.DataFrame({'age': [24,37,18,27,33,40], 'sex': ['M','F','F','M','F','M',]})
>>> df5_2=pd.DataFrame({'name': ['james','denise','eileen','kobe','julian','colin'],'weight':[76,56,49,84,61,66]},index=[2,3,4,5,6,7])
>>> df5_3=pd.DataFrame({'name': ['denise','maggie','eileen','cyril','julian','colin'],'height':[164,178,152,160,171,166]})
>>> pd.concat([df5_1,df5_2],axis=1)
    age  sex    name  weight
0  24.0    M     NaN     NaN
1  37.0    F     NaN     NaN
2  18.0    F   james    76.0
3  27.0    M  denise    56.0
4  33.0    F  eileen    49.0
5  40.0    M    kobe    84.0
6   NaN  NaN  julian    61.0
7   NaN  NaN   colin    66.0
>>> pd.concat([df5_1,df5_2],axis=1,join='inner')
   age sex    name  weight
2   18   F   james      76
3   27   M  denise      56
4   33   F  eileen      49
5   40   M    kobe      84

>>> pd.concat([df5_2,df5_3])
     name  weight  height
2   james    76.0     NaN
3  denise    56.0     NaN
4  eileen    49.0     NaN
5    kobe    84.0     NaN
6  julian    61.0     NaN
7   colin    66.0     NaN
0  denise     NaN   164.0
1  maggie     NaN   178.0
2  eileen     NaN   152.0
3   cyril     NaN   160.0
4  julian     NaN   171.0
5   colin     NaN   166.0
>>> pd.concat([df5_2,df5_3],join='inner')
     name
2   james
3  denise
4  eileen
5    kobe
6  julian
7   colin
0  denise
1  maggie
2  eileen
3   cyril
4  julian
5   colin
>>> pd.concat([df5_2,df5_3],join='inner',ignore_index=True)
      name
0    james
1   denise
2   eileen
3     kobe
4   julian
5    colin
6   denise
7   maggie
8   eileen
9    cyril
10  julian
11   colin

按照DataFrame的列进行合并:

在实际数据分析时,往往需要从多个表中导入数据,并根据共同的键(key,即两个表中都存在的列,例如下面的name)进行合并,这时可以使用merge来进行合并:

>>> df5_2=pd.DataFrame({'name': ['james','denise','eileen','kobe','julian','colin'],'weight':[76,56,49,84,61,66]},index=[2,3,4,5,6,7])
>>> df5_3=pd.DataFrame({'name': ['denise','maggie','eileen','cyril','julian','colin'],'height':[164,178,152,160,171,166]})
>>> df5_2.merge(df5_3,on='name')
     name  weight  height
0  denise      56     164
1  eileen      49     152
2  julian      61     171
3   colin      66     166

Comments

Make a comment