数据分组聚合
import numpy as np
import pandas as pd
from pandas import DataFrame
替换操作
- 替换操作可以同步作用于Series和DataFrame
-
单值替换
- 普通替换: 替换所有符合要求的元素:to_replace=15,value='e'
- 按列指定单值替换: to_replace={列标签:替换值} value='value'
-
多值替换
- 列表替换: to_replace=[] value=[]
- 字典替换(推荐) to_replace={to_replace:value,to_replace:value}
# Demo frame: 5 rows x 6 columns of random integers drawn from [0, 100).
df = DataFrame(data=np.random.randint(low=0, high=100, size=(5, 6)))
df
|
0 |
1 |
2 |
3 |
4 |
5 |
0 |
65 |
85 |
66 |
36 |
72 |
2 |
1 |
85 |
51 |
15 |
67 |
57 |
56 |
2 |
1 |
48 |
9 |
39 |
88 |
52 |
3 |
48 |
34 |
89 |
50 |
5 |
63 |
4 |
17 |
75 |
5 |
72 |
44 |
53 |
# Global single-value replacement: every cell equal to 2 becomes 'Two'.
df.replace(2, 'Two')
|
0 |
1 |
2 |
3 |
4 |
5 |
0 |
65 |
85 |
66 |
36 |
72 |
Two |
1 |
85 |
51 |
15 |
67 |
57 |
56 |
2 |
1 |
48 |
9 |
39 |
88 |
52 |
3 |
48 |
34 |
89 |
50 |
5 |
63 |
4 |
17 |
75 |
5 |
72 |
44 |
53 |
# Dict replacement (old value -> new value): every 1 becomes 'one'.
df.replace({1: 'one'})
|
0 |
1 |
2 |
3 |
4 |
5 |
0 |
65 |
85 |
66 |
36 |
72 |
2 |
1 |
85 |
51 |
15 |
67 |
57 |
56 |
2 |
one |
48 |
9 |
39 |
88 |
52 |
3 |
48 |
34 |
89 |
50 |
5 |
63 |
4 |
17 |
75 |
5 |
72 |
44 |
53 |
# Column-targeted replacement: to_replace={column_label: value_to_replace},
# so only the value 5 found in column 4 becomes 'five'.
df.replace({4: 5}, 'five')
|
0 |
1 |
2 |
3 |
4 |
5 |
0 |
65 |
85 |
66 |
36 |
72 |
2 |
1 |
85 |
51 |
15 |
67 |
57 |
56 |
2 |
1 |
48 |
9 |
39 |
88 |
52 |
3 |
48 |
34 |
89 |
50 |
five |
63 |
4 |
17 |
75 |
5 |
72 |
44 |
53 |
映射操作
# Demo payroll table: the duplicated name shares the same salary.
dic = {
    'name': ['张三', '李四', '张三'],
    'salary': [15000, 20000, 15000],
}
df = DataFrame(data=dic)
df
|
name |
salary |
0 |
张三 |
15000 |
1 |
李四 |
20000 |
2 |
张三 |
15000 |
# Mapping table: Chinese name -> English name.
dic = {'张三': 'tom', '李四': 'jack'}
# map() looks each name up in the dict and stores the result in a new column.
df['e_name'] = df['name'].map(dic)
df
- map是Series的方法,只能被Series调用
运算工具
- 超过3000部分的钱缴纳50%的税,计算每个人的税后薪资
# Custom rule: the portion of the salary above 3000 is taxed at 50%.
def after_sal(s):
    """Return the after-tax salary for a pre-tax salary ``s``.

    Only the amount above 3000 is taxed at 50%.  Salaries at or below
    3000 are returned unchanged — the original formula
    ``s - (s - 3000) * 0.5`` would wrongly *increase* them.
    """
    if s <= 3000:
        return s
    return s - (s - 3000) * 0.5
# map() feeds every salary in df['salary'] to after_sal, one element at a time.
df['after_sal'] = df['salary'].map(after_sal)
df
排序实现的随机抽样
- take()
- np.random.permutation()
# 100 rows of random integers in [0, 100) across columns A, B and C.
df = DataFrame(
    data=np.random.randint(0, 100, size=(100, 3)),
    columns=['A', 'B', 'C'],
)
df
# permutation(n) yields the integers 0..n-1 in a random order.
np.random.permutation(10)
array([2, 1, 7, 3, 5, 0, 4, 9, 6, 8])
# Shuffle the data: take() reorders by position — columns with axis=1,
# rows with axis=0 — then slice off a 50-row random sample.
df.take([2, 0, 1], axis=1)
df.take(np.random.permutation(3), axis=1)
df.take(np.random.permutation(3), axis=1).take(np.random.permutation(100), axis=0)[:50]
数据的分类处理
-
数据分类处理的核心:
- groupby()函数
- groups属性查看分组情况
# Fruit fact table used by the groupby examples below.
df = DataFrame({
    'item': ['Apple', 'Banana', 'Orange', 'Banana', 'Orange', 'Apple'],
    'price': [4, 3, 3, 2.5, 4, 2],
    'color': ['red', 'yellow', 'yellow', 'green', 'green', 'green'],
    'weight': [12, 20, 50, 30, 20, 44],
})
df
|
item |
price |
color |
weight |
0 |
Apple |
4.0 |
red |
12 |
1 |
Banana |
3.0 |
yellow |
20 |
2 |
Orange |
3.0 |
yellow |
50 |
3 |
Banana |
2.5 |
green |
30 |
4 |
Orange |
4.0 |
green |
20 |
5 |
Apple |
2.0 |
green |
44 |
# Group the rows by fruit type for per-type analysis.
df.groupby('item')
<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x1152f1908>
# .groups maps each group key to the row index labels that fell into it.
df.groupby('item').groups
{'Apple': Int64Index([0, 5], dtype='int64'),
'Banana': Int64Index([1, 3], dtype='int64'),
'Orange': Int64Index([2, 4], dtype='int64')}
# Average price per fruit type.
df.groupby('item')['price'].mean()
item
Apple 3.00
Banana 2.75
Orange 3.50
Name: price, dtype: float64
# Average weight per fruit color.
df.groupby('color')['weight'].mean()
color
green 31.333333
red 12.000000
yellow 35.000000
Name: weight, dtype: float64
# Per-color mean weight as a plain dict, then written back onto the
# source rows via map() so every row carries its group's mean.
dic = df.groupby('color')['weight'].mean().to_dict()
df['mean_w'] = df['color'].map(dic)
df
高级数据聚合
- 使用groupby分组后,也可以使用transform和apply提供自定义函数实现更多的运算
- df.groupby('item')['price'].sum() <==> df.groupby('item')['price'].apply(sum)
- transform和apply都会进行运算,在transform或者apply中传入函数即可
- transform和apply也可以传入一个lambda表达式
def my_mean(s):
    """Return the arithmetic mean of the sized iterable ``s``.

    Uses the builtin ``sum`` instead of a manual accumulation loop.
    Raises ZeroDivisionError for an empty ``s``, matching the original
    accumulate-and-divide implementation.
    """
    return sum(s) / len(s)
# transform broadcasts each group's mean back onto the original row positions.
df.groupby('item')['price'].transform(my_mean)
0 3.00
1 2.75
2 3.50
3 2.75
4 3.50
5 3.00
Name: price, dtype: float64
# apply collapses each group to a single value, indexed by the group key.
df.groupby('item')['price'].apply(my_mean)
item
Apple 3.00
Banana 2.75
Orange 3.50
Name: price, dtype: float64
数据加载
# Default read: no sep given, so each whole line is one value and the
# first line becomes the (single) column header.
df = pd.read_csv('./data/type-.txt')
df
|
你好-我好-他也好 |
0 |
也许-大概-有可能 |
1 |
然而-未必-不见得 |
- 将文件中每一个词作为元素存放在DataFrame中
# header=None: no header row; sep='-': split each line into separate words.
pd.read_csv('./data/type-.txt',header=None,sep='-')
|
0 |
1 |
2 |
0 |
你好 |
我好 |
他也好 |
1 |
也许 |
大概 |
有可能 |
2 |
然而 |
未必 |
不见得 |
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
# Connection to the local MySQL test database via the pymysql driver.
engine = create_engine(
'mysql+pymysql://root:root123@127.0.0.1:3306/testdb?charset=utf8')
# Full order-detail table loaded into a DataFrame.
detail = pd.read_sql_table('meal_order_detail1',con = engine)
# Per-order grouping over item counts and amounts.
detailGroup = detail[['order_id','counts',
'amounts']].groupby(by = 'order_id')
print('分组后的订单详情表为:',detailGroup)
# Built-in group aggregations: mean, standard deviation, group sizes.
print('订单详情表分组后前5组每组的均值为:\n',
detailGroup.mean().head())
print('订单详情表分组后前5组每组的标准差为:\n',
detailGroup.std().head())
print('订单详情表分组后前5组每组的大小为:','\n',
detailGroup.size().head())
# agg with a list applies every function to every selected column;
# agg with a dict applies a (list of) function(s) per named column.
print('订单详情表的菜品销量与售价的和与均值为:\n',
detail[['counts','amounts']].agg([np.sum,np.mean]))
print('订单详情表的菜品销量总和与售价的均值为:\n',
detail.agg({'counts':np.sum,'amounts':np.mean}))
print('菜品订单详情表的菜品销量总和与售价的总和与均值为:\n',
detail.agg({'counts':np.sum,'amounts':[np.mean,np.sum]}))
## Custom aggregation: twice the sum.
def DoubleSum(data):
    """Return twice the sum of *data* (uses data's own ``sum`` method)."""
    return data.sum() * 2
# Dict form of agg restricted to the counts column; axis=0 works column-wise.
print('菜品订单详情表的菜品销量两倍总和为:','\n',
detail.agg({'counts':DoubleSum},axis = 0))
## Custom aggregation: twice the sum, via np.sum so plain sequences work too.
def DoubleSum1(data):
    """Return twice the NumPy sum of *data*."""
    return 2 * np.sum(data)
# DoubleSum1 applied per column: first the counts column only, then both
# counts and amounts at once.
print('订单详情表的菜品销量两倍总和为:\n',
detail.agg({'counts':DoubleSum1},axis = 0).head())
print('订单详情表的菜品销量与售价的和的两倍为:\n',
detail[['counts','amounts']].agg(DoubleSum1))
# Grouped agg: the same function applied to every group.
print('订单详情表分组后前3组每组的均值为:\n',
detailGroup.agg(np.mean).head(3))
print('订单详情表分组后前3组每组的标准差为:\n',
detailGroup.agg(np.std).head(3))
# Dict form per group: sum the counts, average the amounts.
print('订单详情分组前3组每组菜品总数和售价均值为:\n',
detailGroup.agg({'counts':np.sum,
'amounts':np.mean}).head(3))
# apply: once per column on a plain frame, once per group on a GroupBy.
print('订单详情表的菜品销量与售价的均值为:\n',
detail[['counts','amounts']].apply(np.mean))
print('订单详情表分组后前3组每组的均值为:','\n', detailGroup.apply(np.mean).head(3))
print('订单详情表分组后前3组每组的标准差为:','\n', detailGroup.apply(np.std).head(3))
# transform keeps the input shape: one output value per input row.
print('订单详情表的菜品销量与售价的两倍为:\n',
detail[['counts','amounts']].transform(
lambda x:x*2).head(4))
# NOTE(review): the numerator is x.mean() - x.min(), not x - x.min(), so
# every row in a group gets the same value — the position of the group
# mean inside [0, 1], not a per-row min-max scaling as the printed label
# ("离差标准化") suggests. Confirm whether this is intended.
print('订单详情表分组后实现组内离差标准化后前五行为:\n',
detailGroup.transform(lambda x:(x.mean()
-x.min())/(x.max()-x.min())).head())
# Parse order timestamps and derive a calendar-date column as the group key.
detail['place_order_time'] = pd.to_datetime(
detail['place_order_time'])
detail['date'] = [i.date() for i in detail['place_order_time']]
detailGroup = detail[['date','counts','amounts']].groupby(by='date')
print('订单详情表前5组每组的数目为:\n',detailGroup.size().head())
# Daily mean and median sale price.
dayMean = detailGroup.agg({'amounts':np.mean})
print('订单详情表前五组每日菜品均价为:\n',dayMean.head())
dayMedian = detailGroup.agg({'amounts':np.median})
print('订单详情表前五组每日菜品售价中位数为:\n',dayMedian.head())
# Total number of items sold per day.
daySaleSum = detailGroup.apply(np.sum)['counts']
print('订单详情表前五组每日菜品售出数目为:\n',daySaleSum.head())