반응형
In [289]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pylab as plt
import seaborn as sns
import numpy as np
import math
1. 원 데이터 백업¶
In [293]:
df=sns.load_dataset("tips")
df
df.to_csv('C:\\Users\\tipsdata.csv')
file_path='C:\\Users\\tipsdata.csv'
df=pd.read_csv(file_path)
df
databackup=df.copy()
df.to_csv('C:\\Users\\tipsdatabackup.csv')
In [294]:
file_path='C:\\Users\\tipsdata.csv'
databackup=pd.read_csv(file_path)
databackup
Out[294]:
Unnamed: 0 | total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
239 | 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
240 | 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
241 | 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
242 | 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
243 | 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 8 columns
데이터 개수 확인¶
In [304]:
df.count()
Out[304]:
Unnamed: 0 244
total_bill 244
tip 244
sex 244
smoker 244
day 244
time 244
size 244
dtype: int64
결측치 확인¶
In [250]:
df2.isnull().sum()
Out[250]:
total_bill 0
tip 0
sex 0
smoker 0
day 0
time 0
size 0
dtype: int64
결측치 평균값으로 대체¶
In [276]:
df2=df.fillna(df.mean())
df2
Out[276]:
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
... | ... | ... | ... | ... | ... | ... | ... |
239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
원 데이터와의 비교¶
In [265]:
df
Out[265]:
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
... | ... | ... | ... | ... | ... | ... | ... |
239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
이 데이터에는 결측 데이터가 없습니다.¶
In [253]:
import matplotlib as mpl
import matplotlib.pylab as plt
plt.boxplot(df2['size'])
plt.show()
In [254]:
df2[df2["size"]>100]
Out[254]:
total_bill | tip | sex | smoker | day | time | size |
---|
In [255]:
plt.boxplot(df['tip'])
plt.show()
In [256]:
df2[df2["tip"]>100]
Out[256]:
total_bill | tip | sex | smoker | day | time | size |
---|
이 데이터에는 이상데이터가 없습니다.¶
In [270]:
pd.set_option('display.max_rows', None)
In [273]:
pd.reset_option('^display.', silent=True)
In [277]:
df2.duplicated()
Out[277]:
0 False
1 False
2 False
3 False
4 False
...
239 False
240 False
241 False
242 False
243 False
Length: 244, dtype: bool
완전히 같은 행 제거 처리¶
In [278]:
df2=df2.drop_duplicates()
df2
Out[278]:
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
... | ... | ... | ... | ... | ... | ... | ... |
239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
243 rows × 7 columns
두 열의 값이 같은 행 제거 처리¶
In [227]:
df2=df2.drop_duplicates(subset=['tip','total_bill'])
df2
Out[227]:
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
... | ... | ... | ... | ... | ... | ... | ... |
239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
242 rows × 7 columns
이 데이터에는 중복 데이터가 없습니다.¶
데이터 유형 확인¶
In [238]:
df2.dtypes
Out[238]:
total_bill float64
tip float64
sex category
smoker category
day category
time category
size int64
dtype: object
데이터 기본 정보 출력¶
In [239]:
df2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 243 entries, 0 to 243
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 total_bill 243 non-null float64
1 tip 243 non-null float64
2 sex 243 non-null category
3 smoker 243 non-null category
4 day 243 non-null category
5 time 243 non-null category
6 size 243 non-null int64
dtypes: category(4), float64(2), int64(1)
memory usage: 9.0 KB
데이터 행 인덱스 확인¶
In [315]:
df2.iloc[0]
Out[315]:
total_bill 16.99
tip 1.01
sex Female
smoker No
day Sun
time Dinner
size 2
total 18
Name: 0, dtype: object
데이터 상위 데이터 추출¶
In [296]:
df2.head()
Out[296]:
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
데이터 하위 데이터 추출¶
In [297]:
df2.tail()
Out[297]:
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
데이터 프레임 인덱스 보기¶
In [298]:
df2.index
Out[298]:
Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
...
234, 235, 236, 237, 238, 239, 240, 241, 242, 243],
dtype='int64', length=243)
행, 열 구조 보기¶
In [299]:
df2.values
Out[299]:
array([[16.99, 1.01, 'Female', ..., 'Sun', 'Dinner', 2],
[10.34, 1.66, 'Male', ..., 'Sun', 'Dinner', 3],
[21.01, 3.5, 'Male', ..., 'Sun', 'Dinner', 3],
...,
[22.67, 2.0, 'Male', ..., 'Sat', 'Dinner', 2],
[17.82, 1.75, 'Male', ..., 'Sat', 'Dinner', 2],
[18.78, 3.0, 'Female', ..., 'Thur', 'Dinner', 2]], dtype=object)
~기준 오름차순 정렬¶
In [301]:
df2.sort_values(by=['total_bill'], axis=0)
Out[301]:
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
67 | 3.07 | 1.00 | Female | Yes | Sat | Dinner | 1 |
92 | 5.75 | 1.00 | Female | Yes | Fri | Dinner | 2 |
111 | 7.25 | 1.00 | Female | No | Sat | Dinner | 1 |
172 | 7.25 | 5.15 | Male | Yes | Sun | Dinner | 2 |
149 | 7.51 | 2.00 | Male | No | Thur | Lunch | 2 |
... | ... | ... | ... | ... | ... | ... | ... |
182 | 45.35 | 3.50 | Male | Yes | Sun | Dinner | 3 |
156 | 48.17 | 5.00 | Male | No | Sun | Dinner | 6 |
59 | 48.27 | 6.73 | Male | No | Sat | Dinner | 4 |
212 | 48.33 | 9.00 | Male | No | Sat | Dinner | 4 |
170 | 50.81 | 10.00 | Male | Yes | Sat | Dinner | 3 |
243 rows × 7 columns
~기준 내림차순 정렬¶
In [302]:
df2.sort_values(by=['total_bill'], axis=0, ascending=False)
Out[302]:
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
170 | 50.81 | 10.00 | Male | Yes | Sat | Dinner | 3 |
212 | 48.33 | 9.00 | Male | No | Sat | Dinner | 4 |
59 | 48.27 | 6.73 | Male | No | Sat | Dinner | 4 |
156 | 48.17 | 5.00 | Male | No | Sun | Dinner | 6 |
182 | 45.35 | 3.50 | Male | Yes | Sun | Dinner | 3 |
... | ... | ... | ... | ... | ... | ... | ... |
149 | 7.51 | 2.00 | Male | No | Thur | Lunch | 2 |
172 | 7.25 | 5.15 | Male | Yes | Sun | Dinner | 2 |
111 | 7.25 | 1.00 | Female | No | Sat | Dinner | 1 |
92 | 5.75 | 1.00 | Female | Yes | Fri | Dinner | 2 |
67 | 3.07 | 1.00 | Female | Yes | Sat | Dinner | 1 |
243 rows × 7 columns
단순열 합계¶
In [303]:
df2.sum(axis=1)
Out[303]:
0 20.00
1 15.00
2 27.51
3 28.99
4 32.20
...
239 37.95
240 31.18
241 26.67
242 21.57
243 23.78
Length: 243, dtype: float64
In [279]:
df2
Out[279]:
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
... | ... | ... | ... | ... | ... | ... | ... |
239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
243 rows × 7 columns
In [241]:
df2['sex'].value_counts()
Out[241]:
Male 157
Female 86
Name: sex, dtype: int64
In [242]:
df2['day'].value_counts()
Out[242]:
Sat 87
Sun 76
Thur 61
Fri 19
Name: day, dtype: int64
In [243]:
df2['smoker'].value_counts()
Out[243]:
No 151
Yes 92
Name: smoker, dtype: int64
In [244]:
df2['time'].value_counts()
Out[244]:
Dinner 176
Lunch 67
Name: time, dtype: int64
In [180]:
pd.crosstab(df2['sex'],df2['day'])
Out[180]:
day | Thur | Fri | Sat | Sun |
---|---|---|---|---|
sex | ||||
Male | 30 | 10 | 58 | 57 |
Female | 30 | 9 | 28 | 18 |
여백 또는 누적값 확인¶
In [280]:
pd.crosstab(df2['sex'],df2['day'], margins=True)
Out[280]:
day | Thur | Fri | Sat | Sun | All |
---|---|---|---|---|---|
sex | |||||
Male | 30 | 10 | 59 | 58 | 157 |
Female | 31 | 9 | 28 | 18 | 86 |
All | 61 | 19 | 87 | 76 | 243 |
전체 빈도 비율 확인¶
In [281]:
pd.crosstab(df2['sex'],df2['day']).apply(lambda r: r/len(df2),axis=1)
Out[281]:
day | Thur | Fri | Sat | Sun |
---|---|---|---|---|
sex | ||||
Male | 0.123457 | 0.041152 | 0.242798 | 0.238683 |
Female | 0.127572 | 0.037037 | 0.115226 | 0.074074 |
3) 연속형 변수¶
데이터 중심 확인 [평균값과 중위값의 차이를 보고 데이터 클린징이 잘 수행되었는지 확인]
데이터 퍼짐 정도 확인
In [312]:
df2.describe()
Out[312]:
total_bill | tip | size | total | |
---|---|---|---|---|
count | 243.000000 | 243.000000 | 243.000000 | 243.000000 |
mean | 19.813868 | 3.002387 | 2.572016 | 22.816255 |
std | 8.910071 | 1.385002 | 0.952356 | 9.897837 |
min | 3.070000 | 1.000000 | 1.000000 | 4.070000 |
25% | 13.380000 | 2.000000 | 2.000000 | 15.640000 |
50% | 17.810000 | 2.920000 | 2.000000 | 20.620000 |
75% | 24.175000 | 3.575000 | 3.000000 | 27.785000 |
max | 50.810000 | 10.000000 | 6.000000 | 60.810000 |
4) 파생변수(유도변수) 만들기¶
In [417]:
df2['total']=df2['total_bill']+df2['tip']
<ipython-input-417-b78c44a1fef5>:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df2['total']=df2['total_bill']+df2['tip']
In [367]:
df3=df2.groupby(['day'])
df3
Out[367]:
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000118FA4F34F0>
In [369]:
df3.get_group('Thur')
Out[369]:
total_bill | tip | sex | smoker | day | time | size | total | |
---|---|---|---|---|---|---|---|---|
77 | 27.20 | 4.00 | Male | No | Thur | Lunch | 4 | 31.20 |
78 | 22.76 | 3.00 | Male | No | Thur | Lunch | 2 | 25.76 |
79 | 17.29 | 2.71 | Male | No | Thur | Lunch | 2 | 20.00 |
80 | 19.44 | 3.00 | Male | Yes | Thur | Lunch | 2 | 22.44 |
81 | 16.66 | 3.40 | Male | No | Thur | Lunch | 2 | 20.06 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
201 | 12.74 | 2.01 | Female | Yes | Thur | Lunch | 2 | 14.75 |
203 | 16.40 | 2.50 | Female | Yes | Thur | Lunch | 2 | 18.90 |
204 | 20.53 | 4.00 | Male | Yes | Thur | Lunch | 4 | 24.53 |
205 | 16.47 | 3.23 | Female | Yes | Thur | Lunch | 3 | 19.70 |
243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 | 21.78 |
61 rows × 8 columns
In [373]:
df2.groupby('day').mean()
Out[373]:
total_bill | tip | size | total | |
---|---|---|---|---|
day | ||||
Thur | 17.759508 | 2.784098 | 2.459016 | 20.543607 |
Fri | 17.151579 | 2.734737 | 2.105263 | 19.886316 |
Sat | 20.441379 | 2.993103 | 2.517241 | 23.434483 |
Sun | 21.410000 | 3.255132 | 2.842105 | 24.665132 |
2) 그룹 분석 활용 [차별화 분석 포인트]¶
그룹 분석 통계¶
여러 통계 함수 적용¶
In [374]:
df2.groupby(["day","sex"]).aggregate([np.mean, np.sum, 'count'])
Out[374]:
total_bill | tip | size | total | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
mean | sum | count | mean | sum | count | mean | sum | count | mean | sum | count | ||
day | sex | ||||||||||||
Thur | Male | 18.714667 | 561.44 | 30 | 2.980333 | 89.41 | 30 | 2.433333 | 73 | 30 | 21.695000 | 650.85 | 30 |
Female | 16.835161 | 521.89 | 31 | 2.594194 | 80.42 | 31 | 2.483871 | 77 | 31 | 19.429355 | 602.31 | 31 | |
Fri | Male | 19.857000 | 198.57 | 10 | 2.693000 | 26.93 | 10 | 2.100000 | 21 | 10 | 22.550000 | 225.50 | 10 |
Female | 14.145556 | 127.31 | 9 | 2.781111 | 25.03 | 9 | 2.111111 | 19 | 9 | 16.926667 | 152.34 | 9 | |
Sat | Male | 20.802542 | 1227.35 | 59 | 3.083898 | 181.95 | 59 | 2.644068 | 156 | 59 | 23.886441 | 1409.30 | 59 |
Female | 19.680357 | 551.05 | 28 | 2.801786 | 78.45 | 28 | 2.250000 | 63 | 28 | 22.482143 | 629.50 | 28 | |
Sun | Male | 21.887241 | 1269.46 | 58 | 3.220345 | 186.78 | 58 | 2.810345 | 163 | 58 | 25.107586 | 1456.24 | 58 |
Female | 19.872222 | 357.70 | 18 | 3.367222 | 60.61 | 18 | 2.944444 | 53 | 18 | 23.239444 | 418.31 | 18 |
In [ ]:
def total_cat(total):
if math.isnan(total):
return -1
return (total//5)*10
In [ ]:
df2["total"]=df2['total'].apply(total_cat)
In [421]:
total_result=df2.groupby(['day','sex']).aggregate([np.mean,'count'])['total']
요일별 남자, 여자 평균 지출량¶
In [422]:
total_result
Out[422]:
mean | count | ||
---|---|---|---|
day | sex | ||
Thur | Male | 21.695000 | 30 |
Female | 19.429355 | 31 | |
Fri | Male | 22.550000 | 10 |
Female | 16.926667 | 9 | |
Sat | Male | 23.886441 | 59 |
Female | 22.482143 | 28 | |
Sun | Male | 25.107586 | 58 |
Female | 23.239444 | 18 |
In [423]:
total_result['mean'].sort_values(ascending=False).plot.bar()
Out[423]:
<AxesSubplot:xlabel='day,sex'>
In [ ]:
반응형
'Computer Engineering > Big Data Analytics Using Python' 카테고리의 다른 글
[빅데이터분석 실습] 스몰데이터 통계 Part.1 (0) | 2021.05.11 |
---|---|
[빅데이터 분석 프로젝트] 마크다운으로 타이타닉 탑승객의 생존률 보고서 작성하기 -Part 2 (0) | 2021.04.19 |
[빅데이터 분석] 10minutes pandas (1) (0) | 2021.04.17 |
[빅데이터 분석] 6장. 데이터 탐색 (0) | 2021.04.13 |
[빅데이터 분석 프로젝트] 마크다운으로 따릉이 데이터 분석 보고서 작성하기 -Part 1 (0) | 2021.04.06 |