반응형
데이터 분석처리¶
In [178]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pylab as plt
import seaborn as sns
import numpy as np
import math
1. 원 데이터 백업¶
In [179]:
file_path='C:\\Users\\titanic.csv'
df=pd.read_csv(file_path)
df
databackup=df.copy()
df.to_csv('C:\\Users\\titanicdatabackup.csv')
In [180]:
file_path='C:\\Usersn\\titanic.csv'
databackup=pd.read_csv(file_path)
databackup
Out[180]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
데이터 개수 확인¶
In [181]:
df.count()
Out[181]:
PassengerId 891
Survived 891
Pclass 891
Name 891
Sex 891
Age 714
SibSp 891
Parch 891
Ticket 891
Fare 891
Cabin 204
Embarked 889
dtype: int64
결측치 확인¶
In [182]:
df.isnull().sum()
Out[182]:
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
결측치 평균값으로 대체¶
In [183]:
df2=df.fillna(df.mean())
df2
Out[183]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.000000 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.000000 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.000000 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.000000 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.000000 | 0 | 0 | 373450 | 8.0500 | NaN | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.000000 | 0 | 0 | 211536 | 13.0000 | NaN | S |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.000000 | 0 | 0 | 112053 | 30.0000 | B42 | S |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 29.699118 | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.000000 | 0 | 0 | 111369 | 30.0000 | C148 | C |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.000000 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
원 데이터와의 비교¶
In [184]:
df
Out[184]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
In [185]:
df2.isnull().sum()
Out[185]:
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 0
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
In [186]:
import matplotlib as mpl
import matplotlib.pylab as plt
plt.boxplot(df2['Age'])
plt.show()
In [187]:
df2[df2["Age"]>100]
Out[187]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
In [188]:
plt.boxplot(df2['Fare'])
plt.show()
In [238]:
df2[df2["Fare"]>500]
Out[238]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age_class | Age_class2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
679 | 680 | 1 | 1 | Cardeza, Mr. Thomas Drake Martinez | male | 30.0 | 0 | 1 | PC 17755 | 512.3292 | B51 B53 B55 | C | 성년 | 노년 |
737 | 738 | 1 | 1 | Lesurer, Mr. Gustave J | male | 30.0 | 0 | 0 | PC 17755 | 512.3292 | B101 | C | 성년 | 노년 |
이상 데이터 제거¶
In [239]:
df2=df2.drop(행n)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-239-2a325d4d54d5> in <module>
----> 1 df2=df2.drop(행n)
NameError: name '행n' is not defined
In [190]:
pd.set_option('display.max_rows', None)
In [191]:
pd.reset_option('^display.', silent=True)
In [192]:
df2.duplicated()
Out[192]:
0 False
1 False
2 False
3 False
4 False
...
886 False
887 False
888 False
889 False
890 False
Length: 891, dtype: bool
완전히 같은 행 제거 처리¶
In [193]:
df2=df2.drop_duplicates()
df2
Out[193]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.000000 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.000000 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.000000 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.000000 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.000000 | 0 | 0 | 373450 | 8.0500 | NaN | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.000000 | 0 | 0 | 211536 | 13.0000 | NaN | S |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.000000 | 0 | 0 | 112053 | 30.0000 | B42 | S |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 29.699118 | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.000000 | 0 | 0 | 111369 | 30.0000 | C148 | C |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.000000 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
두 열의 값이 같은 행 제거 처리¶
In [194]:
df2=df2.drop_duplicates(subset=['Name','Age'])
df2
Out[194]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.000000 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.000000 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.000000 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.000000 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.000000 | 0 | 0 | 373450 | 8.0500 | NaN | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.000000 | 0 | 0 | 211536 | 13.0000 | NaN | S |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.000000 | 0 | 0 | 112053 | 30.0000 | B42 | S |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 29.699118 | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.000000 | 0 | 0 | 111369 | 30.0000 | C148 | C |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.000000 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
데이터 유형 확인¶
In [195]:
df2.dtypes
Out[195]:
PassengerId int64
Survived int64
Pclass int64
Name object
Sex object
Age float64
SibSp int64
Parch int64
Ticket object
Fare float64
Cabin object
Embarked object
dtype: object
데이터 기본 정보 출력¶
In [196]:
df2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 891 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB
데이터 행 인덱스 확인¶
In [197]:
df2.iloc[0]
Out[197]:
PassengerId 1
Survived 0
Pclass 3
Name Braund, Mr. Owen Harris
Sex male
Age 22
SibSp 1
Parch 0
Ticket A/5 21171
Fare 7.25
Cabin NaN
Embarked S
Name: 0, dtype: object
데이터 상위 데이터 추출¶
In [198]:
df2.head()
Out[198]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
데이터 하위 데이터 추출¶
In [199]:
df2.tail()
Out[199]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.000000 | 0 | 0 | 211536 | 13.00 | NaN | S |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.000000 | 0 | 0 | 112053 | 30.00 | B42 | S |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 29.699118 | 1 | 2 | W./C. 6607 | 23.45 | NaN | S |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.000000 | 0 | 0 | 111369 | 30.00 | C148 | C |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.000000 | 0 | 0 | 370376 | 7.75 | NaN | Q |
데이터 프레임 인덱스 보기¶
In [200]:
df2.index
Out[200]:
Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
...
881, 882, 883, 884, 885, 886, 887, 888, 889, 890],
dtype='int64', length=891)
행, 열 구조 보기¶
In [201]:
df2.values
Out[201]:
array([[1, 0, 3, ..., 7.25, nan, 'S'],
[2, 1, 1, ..., 71.2833, 'C85', 'C'],
[3, 1, 3, ..., 7.925, nan, 'S'],
...,
[889, 0, 3, ..., 23.45, nan, 'S'],
[890, 1, 1, ..., 30.0, 'C148', 'C'],
[891, 0, 3, ..., 7.75, nan, 'Q']], dtype=object)
요금기준 오름차순 정렬¶
In [202]:
df2.sort_values(by=['Fare'], axis=0)
Out[202]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
271 | 272 | 1 | 3 | Tornquist, Mr. William Henry | male | 25.000000 | 0 | 0 | LINE | 0.0000 | NaN | S |
597 | 598 | 0 | 3 | Johnson, Mr. Alfred | male | 49.000000 | 0 | 0 | LINE | 0.0000 | NaN | S |
302 | 303 | 0 | 3 | Johnson, Mr. William Cahoone Jr | male | 19.000000 | 0 | 0 | LINE | 0.0000 | NaN | S |
633 | 634 | 0 | 1 | Parr, Mr. William Henry Marsh | male | 29.699118 | 0 | 0 | 112052 | 0.0000 | NaN | S |
277 | 278 | 0 | 2 | Parkes, Mr. Francis "Frank" | male | 29.699118 | 0 | 0 | 239853 | 0.0000 | NaN | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
438 | 439 | 0 | 1 | Fortune, Mr. Mark | male | 64.000000 | 1 | 4 | 19950 | 263.0000 | C23 C25 C27 | S |
341 | 342 | 1 | 1 | Fortune, Miss. Alice Elizabeth | female | 24.000000 | 3 | 2 | 19950 | 263.0000 | C23 C25 C27 | S |
737 | 738 | 1 | 1 | Lesurer, Mr. Gustave J | male | 35.000000 | 0 | 0 | PC 17755 | 512.3292 | B101 | C |
258 | 259 | 1 | 1 | Ward, Miss. Anna | female | 35.000000 | 0 | 0 | PC 17755 | 512.3292 | NaN | C |
679 | 680 | 1 | 1 | Cardeza, Mr. Thomas Drake Martinez | male | 36.000000 | 0 | 1 | PC 17755 | 512.3292 | B51 B53 B55 | C |
891 rows × 12 columns
요금기준 내림차순 정렬¶
In [203]:
df2.sort_values(by=['Fare'], axis=0, ascending=False)
Out[203]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
258 | 259 | 1 | 1 | Ward, Miss. Anna | female | 35.000000 | 0 | 0 | PC 17755 | 512.3292 | NaN | C |
737 | 738 | 1 | 1 | Lesurer, Mr. Gustave J | male | 35.000000 | 0 | 0 | PC 17755 | 512.3292 | B101 | C |
679 | 680 | 1 | 1 | Cardeza, Mr. Thomas Drake Martinez | male | 36.000000 | 0 | 1 | PC 17755 | 512.3292 | B51 B53 B55 | C |
88 | 89 | 1 | 1 | Fortune, Miss. Mabel Helen | female | 23.000000 | 3 | 2 | 19950 | 263.0000 | C23 C25 C27 | S |
27 | 28 | 0 | 1 | Fortune, Mr. Charles Alexander | male | 19.000000 | 3 | 2 | 19950 | 263.0000 | C23 C25 C27 | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
633 | 634 | 0 | 1 | Parr, Mr. William Henry Marsh | male | 29.699118 | 0 | 0 | 112052 | 0.0000 | NaN | S |
413 | 414 | 0 | 2 | Cunningham, Mr. Alfred Fleming | male | 29.699118 | 0 | 0 | 239853 | 0.0000 | NaN | S |
822 | 823 | 0 | 1 | Reuchlin, Jonkheer. John George | male | 38.000000 | 0 | 0 | 19972 | 0.0000 | NaN | S |
732 | 733 | 0 | 2 | Knight, Mr. Robert J | male | 29.699118 | 0 | 0 | 239855 | 0.0000 | NaN | S |
674 | 675 | 0 | 2 | Watson, Mr. Ennis Hastings | male | 29.699118 | 0 | 0 | 239856 | 0.0000 | NaN | S |
891 rows × 12 columns
단순열 합계¶
In [204]:
df2.sum(axis=1)
Out[204]:
0 34.250000
1 114.283300
2 40.925000
3 95.100000
4 51.050000
...
886 929.000000
887 939.000000
888 948.149118
889 948.000000
890 933.750000
Length: 891, dtype: float64
In [205]:
df2
Out[205]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.000000 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.000000 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.000000 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.000000 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.000000 | 0 | 0 | 373450 | 8.0500 | NaN | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.000000 | 0 | 0 | 211536 | 13.0000 | NaN | S |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.000000 | 0 | 0 | 112053 | 30.0000 | B42 | S |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 29.699118 | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.000000 | 0 | 0 | 111369 | 30.0000 | C148 | C |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.000000 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
In [206]:
df2['Sex'].value_counts()
Out[206]:
male 577
female 314
Name: Sex, dtype: int64
In [207]:
df2['Pclass'].value_counts()
Out[207]:
3 491
1 216
2 184
Name: Pclass, dtype: int64
In [208]:
df2['Survived'].value_counts()
Out[208]:
0 549
1 342
Name: Survived, dtype: int64
In [209]:
pd.crosstab(df2['Sex'],df2['Pclass'])
Out[209]:
Pclass | 1 | 2 | 3 |
---|---|---|---|
Sex | |||
female | 94 | 76 | 144 |
male | 122 | 108 | 347 |
여백 또는 누적값 확인¶
In [210]:
pd.crosstab(df2['Sex'],df2['Pclass'], margins=True)
Out[210]:
Pclass | 1 | 2 | 3 | All |
---|---|---|---|---|
Sex | ||||
female | 94 | 76 | 144 | 314 |
male | 122 | 108 | 347 | 577 |
All | 216 | 184 | 491 | 891 |
전체 빈도 비율 확인¶
In [211]:
pd.crosstab(df2['Sex'],df2['Pclass']).apply(lambda r: r/len(df2),axis=1)
Out[211]:
Pclass | 1 | 2 | 3 |
---|---|---|---|
Sex | |||
female | 0.105499 | 0.085297 | 0.161616 |
male | 0.136925 | 0.121212 | 0.389450 |
3) 연속형 변수¶
데이터 중심 확인 [평균값과 중위값의 차이를 보고 데이터 클린징이 잘 수행되었는지 확인]
데이터 퍼짐 정도 확인
In [212]:
df2.describe()
Out[212]:
PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 257.353842 | 0.486592 | 0.836071 | 13.002015 | 1.102743 | 0.806057 | 49.693429 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 223.500000 | 0.000000 | 2.000000 | 22.000000 | 0.000000 | 0.000000 | 7.910400 |
50% | 446.000000 | 0.000000 | 3.000000 | 29.699118 | 0.000000 | 0.000000 | 14.454200 |
75% | 668.500000 | 1.000000 | 3.000000 | 35.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
4) 파생변수(유도변수) 만들기¶
In [90]:
df2['total']=df2['total_bill']+df2['tip']
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
C:\python\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2894 try:
-> 2895 return self._engine.get_loc(casted_key)
2896 except KeyError as err:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'total_bill'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-90-b78c44a1fef5> in <module>
----> 1 df2['total']=df2['total_bill']+df2['tip']
C:\python\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2900 if self.columns.nlevels > 1:
2901 return self._getitem_multilevel(key)
-> 2902 indexer = self.columns.get_loc(key)
2903 if is_integer(indexer):
2904 indexer = [indexer]
C:\python\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2895 return self._engine.get_loc(casted_key)
2896 except KeyError as err:
-> 2897 raise KeyError(key) from err
2898
2899 if tolerance is not None:
KeyError: 'total_bill'
4. 데이터 탐색¶
In [213]:
bins=[1,20,60,100]
df2_Age=pd.cut(df2["Age"],bins,labels=["미성년", "성년","노년"])
df2["Age_class"]=df2_Age
df2
Out[213]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age_class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.000000 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 성년 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.000000 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 성년 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.000000 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 성년 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.000000 | 1 | 0 | 113803 | 53.1000 | C123 | S | 성년 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.000000 | 0 | 0 | 373450 | 8.0500 | NaN | S | 성년 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.000000 | 0 | 0 | 211536 | 13.0000 | NaN | S | 성년 |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.000000 | 0 | 0 | 112053 | 30.0000 | B42 | S | 미성년 |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 29.699118 | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S | 성년 |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.000000 | 0 | 0 | 111369 | 30.0000 | C148 | C | 성년 |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.000000 | 0 | 0 | 370376 | 7.7500 | NaN | Q | 성년 |
891 rows × 13 columns
나이를 3개 범주로 구분하기¶
In [215]:
df2_Age2=pd.qcut(df2["Age"], q=3, labels=["미성년","성년","노년"])
df2["Age_class2"]=df2_Age2
df2
Out[215]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age_class | Age_class2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.000000 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 성년 | 미성년 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.000000 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 성년 | 노년 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.000000 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 성년 | 성년 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.000000 | 1 | 0 | 113803 | 53.1000 | C123 | S | 성년 | 노년 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.000000 | 0 | 0 | 373450 | 8.0500 | NaN | S | 성년 | 노년 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.000000 | 0 | 0 | 211536 | 13.0000 | NaN | S | 성년 | 성년 |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.000000 | 0 | 0 | 112053 | 30.0000 | B42 | S | 미성년 | 미성년 |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 29.699118 | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S | 성년 | 성년 |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.000000 | 0 | 0 | 111369 | 30.0000 | C148 | C | 성년 | 성년 |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.000000 | 0 | 0 | 370376 | 7.7500 | NaN | Q | 성년 | 노년 |
891 rows × 14 columns
범주형 컬럼 처리¶
In [216]:
df2['Sex']=df2['Sex'].astype('category')
In [217]:
df2['Pclass']=df2['Pclass'].astype('category')
In [218]:
df2['Embarked']=df2['Embarked'].astype('category')
In [219]:
df2
Out[219]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age_class | Age_class2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.000000 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 성년 | 미성년 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.000000 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 성년 | 노년 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.000000 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 성년 | 성년 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.000000 | 1 | 0 | 113803 | 53.1000 | C123 | S | 성년 | 노년 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.000000 | 0 | 0 | 373450 | 8.0500 | NaN | S | 성년 | 노년 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.000000 | 0 | 0 | 211536 | 13.0000 | NaN | S | 성년 | 성년 |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.000000 | 0 | 0 | 112053 | 30.0000 | B42 | S | 미성년 | 미성년 |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 29.699118 | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S | 성년 | 성년 |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.000000 | 0 | 0 | 111369 | 30.0000 | C148 | C | 성년 | 성년 |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.000000 | 0 | 0 | 370376 | 7.7500 | NaN | Q | 성년 | 노년 |
891 rows × 14 columns
성별, 선실 , 나이 그룹에 의한 생존율 계산¶
In [220]:
df4=df2.groupby(['Sex','Age_class','Pclass'])["Survived"].mean()
df5=df4.unstack("Pclass")
df5
Out[220]:
Pclass | 1 | 2 | 3 | |
---|---|---|---|---|
Sex | Age_class | |||
female | 미성년 | 0.928571 | 1.000000 | 0.465116 |
성년 | 0.974359 | 0.900000 | 0.489583 | |
노년 | 1.000000 | NaN | 1.000000 | |
male | 미성년 | 0.500000 | 0.357143 | 0.180556 |
성년 | 0.388350 | 0.069767 | 0.119850 | |
노년 | 0.083333 | 0.333333 | 0.000000 |
성별 및 선실에 의한 생존율을 피봇 데이터로 표현¶
In [231]:
df2.pivot_table(['Survived'],index=['Sex','Pclass'], aggfunc='mean')
Out[231]:
Survived | ||
---|---|---|
Sex | Pclass | |
female | 1 | 0.968085 |
2 | 0.921053 | |
3 | 0.500000 | |
male | 1 | 0.368852 |
2 | 0.157407 | |
3 | 0.135447 |
2) 그룹 함수로 분석하기¶
In [221]:
df3=df2.groupby(['Sex'])
df3
Out[221]:
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002367CA772E0>
In [222]:
df3.get_group('female')
Out[222]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age_class | Age_class2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.000000 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 성년 | 노년 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.000000 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 성년 | 성년 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.000000 | 1 | 0 | 113803 | 53.1000 | C123 | S | 성년 | 노년 |
8 | 9 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27.000000 | 0 | 2 | 347742 | 11.1333 | NaN | S | 성년 | 성년 |
9 | 10 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.000000 | 1 | 0 | 237736 | 30.0708 | NaN | C | 미성년 | 미성년 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
880 | 881 | 1 | 2 | Shelley, Mrs. William (Imanita Parrish Hall) | female | 25.000000 | 0 | 1 | 230433 | 26.0000 | NaN | S | 성년 | 미성년 |
882 | 883 | 0 | 3 | Dahlberg, Miss. Gerda Ulrika | female | 22.000000 | 0 | 0 | 7552 | 10.5167 | NaN | S | 성년 | 미성년 |
885 | 886 | 0 | 3 | Rice, Mrs. William (Margaret Norton) | female | 39.000000 | 0 | 5 | 382652 | 29.1250 | NaN | Q | 성년 | 노년 |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.000000 | 0 | 0 | 112053 | 30.0000 | B42 | S | 미성년 | 미성년 |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 29.699118 | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S | 성년 | 성년 |
314 rows × 14 columns
In [223]:
df2.groupby('Pclass').mean()
Out[223]:
PassengerId | Survived | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|
Pclass | ||||||
1 | 461.597222 | 0.629630 | 37.048118 | 0.416667 | 0.356481 | 84.154687 |
2 | 445.956522 | 0.472826 | 29.866958 | 0.402174 | 0.380435 | 20.662183 |
3 | 439.154786 | 0.242363 | 26.403259 | 0.615071 | 0.393075 | 13.675550 |
성별과 좌석등급으로 구분한 연속형 데이터 평균 [생존율, 평균나이, 평균 비용]¶
In [224]:
df2.groupby(['Pclass','Sex']).mean()
Out[224]:
PassengerId | Survived | Age | SibSp | Parch | Fare | ||
---|---|---|---|---|---|---|---|
Pclass | Sex | ||||||
1 | female | 469.212766 | 0.968085 | 34.141405 | 0.553191 | 0.457447 | 106.125798 |
male | 455.729508 | 0.368852 | 39.287717 | 0.311475 | 0.278689 | 67.226127 | |
2 | female | 443.105263 | 0.921053 | 28.748661 | 0.486842 | 0.605263 | 21.970121 |
male | 447.962963 | 0.157407 | 30.653908 | 0.342593 | 0.222222 | 19.741782 | |
3 | female | 399.729167 | 0.500000 | 24.068493 | 0.895833 | 0.798611 | 16.118810 |
male | 455.515850 | 0.135447 | 27.372153 | 0.498559 | 0.224784 | 12.661633 |
3) 그룹 분석 활용 [차별화 분석 포인트]¶
(1)그룹 분석 통계¶
여러 통계 함수 적용¶
In [225]:
df2.groupby(["Sex","Pclass"]).aggregate([np.mean, np.sum, 'count'])
Out[225]:
PassengerId | Survived | Age | SibSp | Parch | Fare | ||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
mean | sum | count | mean | sum | count | mean | sum | count | mean | sum | count | mean | sum | count | mean | sum | count | ||
Sex | Pclass | ||||||||||||||||||
female | 1 | 469.212766 | 44106 | 94 | 0.968085 | 91 | 94 | 34.141405 | 3209.292059 | 94 | 0.553191 | 52 | 94 | 0.457447 | 43 | 94 | 106.125798 | 9975.8250 | 94 |
2 | 443.105263 | 33676 | 76 | 0.921053 | 70 | 76 | 28.748661 | 2184.898235 | 76 | 0.486842 | 37 | 76 | 0.605263 | 46 | 76 | 21.970121 | 1669.7292 | 76 | |
3 | 399.729167 | 57561 | 144 | 0.500000 | 72 | 144 | 24.068493 | 3465.862941 | 144 | 0.895833 | 129 | 144 | 0.798611 | 115 | 144 | 16.118810 | 2321.1086 | 144 | |
male | 1 | 455.729508 | 55599 | 122 | 0.368852 | 45 | 122 | 39.287717 | 4793.101471 | 122 | 0.311475 | 38 | 122 | 0.278689 | 34 | 122 | 67.226127 | 8201.5875 | 122 |
2 | 447.962963 | 48380 | 108 | 0.157407 | 17 | 108 | 30.653908 | 3310.622059 | 108 | 0.342593 | 37 | 108 | 0.222222 | 24 | 108 | 19.741782 | 2132.1125 | 108 | |
3 | 455.515850 | 158064 | 347 | 0.135447 | 47 | 347 | 27.372153 | 9498.137059 | 347 | 0.498559 | 173 | 347 | 0.224784 | 78 | 347 | 12.661633 | 4393.5865 | 347 |
In [226]:
def age_cat(Age):
if math.isnan(Age):
return -1
return (Age//10)*10
In [227]:
df2["Age"]=df2['Age'].apply(age_cat)
In [228]:
age_result=df2.groupby(['Age','Pclass']).aggregate([np.mean,'count'])['Survived']
좌석등급별 남자, 여자 생존자수¶
In [229]:
age_result
Out[229]:
mean | count | ||
---|---|---|---|
Age | Pclass | ||
0.0 | 1 | 0.666667 | 3 |
2 | 1.000000 | 17 | |
3 | 0.452381 | 42 | |
10.0 | 1 | 0.833333 | 18 |
2 | 0.500000 | 18 | |
3 | 0.257576 | 66 | |
20.0 | 1 | 0.593750 | 64 |
2 | 0.406250 | 64 | |
3 | 0.241636 | 269 | |
30.0 | 1 | 0.780000 | 50 |
2 | 0.395833 | 48 | |
3 | 0.217391 | 69 | |
40.0 | 1 | 0.594595 | 37 |
2 | 0.555556 | 18 | |
3 | 0.058824 | 34 | |
50.0 | 1 | 0.555556 | 27 |
2 | 0.333333 | 15 | |
3 | 0.000000 | 6 | |
60.0 | 1 | 0.307692 | 13 |
2 | 0.333333 | 3 | |
3 | 0.333333 | 3 | |
70.0 | 1 | 0.000000 | 3 |
2 | 0.000000 | 1 | |
3 | 0.000000 | 2 | |
80.0 | 1 | 1.000000 | 1 |
2 | NaN | 0 | |
3 | NaN | 0 |
In [230]:
age_result['mean'].sort_values(ascending=False).plot.bar()
Out[230]:
<AxesSubplot:xlabel='Age,Pclass'>
반응형
'Computer Engineering > Big Data Analytics Using Python' 카테고리의 다른 글
[빅데이터 분석] 스몰데이터 통계 Part.1 (0) | 2021.05.11 |
---|---|
[빅데이터분석 실습] 스몰데이터 통계 Part.1 (0) | 2021.05.11 |
[빅데이터 분석 프로젝트] 마크다운으로 팁 데이터 분석 보고서 작성하기 -Part 2 (0) | 2021.04.19 |
[빅데이터 분석] 10minutes pandas (1) (0) | 2021.04.17 |
[빅데이터 분석] 6장. 데이터 탐색 (0) | 2021.04.13 |