import pandas as pd
# Data source: https://www.kaggle.com/hesh97/titanicdataset-traincsv/data
train_data = pd.read_csv('./train.csv')
train_data.head()  # preview the first rows of the loaded table
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Display the whole DataFrame (the notebook truncates the middle rows).
train_data
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
# Print each column's dtype and non-null count to spot columns with missing values.
train_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 891 non-null int64 1 Survived 891 non-null int64 2 Pclass 891 non-null int64 3 Name 891 non-null object 4 Sex 891 non-null object 5 Age 714 non-null float64 6 SibSp 891 non-null int64 7 Parch 891 non-null int64 8 Ticket 891 non-null object 9 Fare 891 non-null float64 10 Cabin 204 non-null object 11 Embarked 889 non-null object dtypes: float64(2), int64(5), object(5) memory usage: 83.7+ KB
train_data.isna()
# Flag missing values: a cell is True where the value is NA, False otherwise.
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | False | False | False | False | False | False | False | False | False | False | True | False |
1 | False | False | False | False | False | False | False | False | False | False | False | False |
2 | False | False | False | False | False | False | False | False | False | False | True | False |
3 | False | False | False | False | False | False | False | False | False | False | False | False |
4 | False | False | False | False | False | False | False | False | False | False | True | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | False | False | False | False | False | False | False | False | False | False | True | False |
887 | False | False | False | False | False | False | False | False | False | False | False | False |
888 | False | False | False | False | False | True | False | False | False | False | True | False |
889 | False | False | False | False | False | False | False | False | False | False | False | False |
890 | False | False | False | False | False | False | False | False | False | False | True | False |
891 rows × 12 columns
train_data.dropna(subset=['Age', 'Cabin'])
# train_data.dropna(): drops every row that contains at least one NA value.
# train_data.dropna(subset=['Age', 'Cabin']): drops only the rows where Age or Cabin is NA.
# The result is only displayed here; train_data itself is not reassigned.
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
6 | 7 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.0 | 0 | 0 | 17463 | 51.8625 | E46 | S |
10 | 11 | 1 | 3 | Sandstrom, Miss. Marguerite Rut | female | 4.0 | 1 | 1 | PP 9549 | 16.7000 | G6 | S |
11 | 12 | 1 | 1 | Bonnell, Miss. Elizabeth | female | 58.0 | 0 | 0 | 113783 | 26.5500 | C103 | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
871 | 872 | 1 | 1 | Beckwith, Mrs. Richard Leonard (Sallie Monypeny) | female | 47.0 | 1 | 1 | 11751 | 52.5542 | D35 | S |
872 | 873 | 0 | 1 | Carlsson, Mr. Frans Olof | male | 33.0 | 0 | 0 | 695 | 5.0000 | B51 B53 B55 | S |
879 | 880 | 1 | 1 | Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) | female | 56.0 | 0 | 1 | 11767 | 83.1583 | C50 | C |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
196 rows × 12 columns
train_data.dropna(axis=1)
# axis=1 drops by column instead of by row.
# Every column that contains an NA value is removed.
PassengerId | Survived | Pclass | Name | Sex | SibSp | Parch | Ticket | Fare | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 1 | 0 | A/5 21171 | 7.2500 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 1 | 0 | PC 17599 | 71.2833 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 0 | 0 | STON/O2. 3101282 | 7.9250 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 1 | 0 | 113803 | 53.1000 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 0 | 0 | 373450 | 8.0500 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 0 | 0 | 211536 | 13.0000 |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 0 | 0 | 112053 | 30.0000 |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 1 | 2 | W./C. 6607 | 23.4500 |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 0 | 0 | 111369 | 30.0000 |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 0 | 0 | 370376 | 7.7500 |
891 rows × 9 columns
train_data['Age'].fillna(train_data['Age'].mean())
# fillna replaces NA entries with the given value -- here the mean of the Age column.
# The filled Series is only displayed; it is not assigned back to train_data.
# Question: is the mean really the right representative value for age?
# The mean becomes unreliable when outliers are present, so the mode is often used instead.
0 22.000000 1 38.000000 2 26.000000 3 35.000000 4 35.000000 ... 886 27.000000 887 19.000000 888 29.699118 889 26.000000 890 32.000000 Name: Age, Length: 891, dtype: float64
# Select the rows of passengers who survived (Survived == 1).
# NOTE: despite its name, mean1 holds a DataFrame of rows here, not a mean;
# it is rebound to the survivors' mean age further down.
mean1 = train_data[train_data['Survived'] == 1]
mean1
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
8 | 9 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27.0 | 0 | 2 | 347742 | 11.1333 | NaN | S |
9 | 10 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.0 | 1 | 0 | 237736 | 30.0708 | NaN | C |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
875 | 876 | 1 | 3 | Najib, Miss. Adele Kiamie "Jane" | female | 15.0 | 0 | 0 | 2667 | 7.2250 | NaN | C |
879 | 880 | 1 | 1 | Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) | female | 56.0 | 0 | 1 | 11767 | 83.1583 | C50 | C |
880 | 881 | 1 | 2 | Shelley, Mrs. William (Imanita Parrish Hall) | female | 25.0 | 0 | 1 | 230433 | 26.0000 | NaN | S |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
342 rows × 12 columns
# Split the Age column by survival outcome, then average each group.
survivor_ages = train_data[train_data['Survived'] == 1]['Age']
victim_ages = train_data[train_data['Survived'] == 0]['Age']

mean1 = survivor_ages.mean()  # mean age of the survivors
mean2 = victim_ages.mean()  # mean age of those who did not survive
print(f'{mean1}, {mean2}')
28.343689655172415, 30.62617924528302
# Both expressions below only build new Series; nothing is written back to train_data.
train_data[train_data['Survived'] == 1]['Age'] # Age column of the survivors
train_data[train_data['Survived'] == 1]['Age'].fillna(mean1) # survivors' Age column with NA replaced by mean1
1 38.0 2 26.0 3 35.0 8 27.0 9 14.0 ... 875 15.0 879 56.0 880 25.0 887 19.0 889 26.0 Name: Age, Length: 342, dtype: float64
train_data.loc[train_data['Survived'] == 1, 'Age']
# With .loc the comma separates the row selector from the column selector:
# the boolean mask before it picks the rows, and 'Age' after it picks the column.
1 38.0 2 26.0 3 35.0 8 27.0 9 14.0 ... 875 15.0 879 56.0 880 25.0 887 19.0 889 26.0 Name: Age, Length: 342, dtype: float64
# Fill the survivors' missing ages with mean1 and write the result back
# through .loc so that train_data itself is updated.
is_survivor = train_data['Survived'] == 1
survivor_ages = train_data[is_survivor]['Age']
train_data.loc[is_survivor, 'Age'] = survivor_ages.fillna(mean1)
# Mean age of the survivors, recomputed from the current contents of train_data.
survivor_rows = train_data[train_data['Survived'] == 1]
mean1 = survivor_rows['Age'].mean()
mean1
28.343689655172412
# Survivors' mean age again, this time selecting rows and column in one .loc call.
train_data.loc[train_data['Survived'] == 1, 'Age'].mean()
28.343689655172412
# Show the rows whose Age equals the value used to fill the survivors' missing ages.
# The literal is the survivors' mean as printed before the fill; the mean
# recomputed afterwards differs in the last digit (28.343689655172412), so an
# exact == against a float is fragile. Compare with a small tolerance instead.
filled_age = 28.343689655172415
train_data[(train_data['Age'] - filled_age).abs() < 1e-9]
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
17 | 18 | 1 | 2 | Williams, Mr. Charles Eugene | male | 28.34369 | 0 | 0 | 244373 | 13.0000 | NaN | S |
19 | 20 | 1 | 3 | Masselmani, Mrs. Fatima | female | 28.34369 | 0 | 0 | 2649 | 7.2250 | NaN | C |
28 | 29 | 1 | 3 | O'Dwyer, Miss. Ellen "Nellie" | female | 28.34369 | 0 | 0 | 330959 | 7.8792 | NaN | Q |
31 | 32 | 1 | 1 | Spencer, Mrs. William Augustus (Marie Eugenie) | female | 28.34369 | 1 | 0 | PC 17569 | 146.5208 | B78 | C |
32 | 33 | 1 | 3 | Glynn, Miss. Mary Agatha | female | 28.34369 | 0 | 0 | 335677 | 7.7500 | NaN | Q |
36 | 37 | 1 | 3 | Mamee, Mr. Hanna | male | 28.34369 | 0 | 0 | 2677 | 7.2292 | NaN | C |
47 | 48 | 1 | 3 | O'Driscoll, Miss. Bridget | female | 28.34369 | 0 | 0 | 14311 | 7.7500 | NaN | Q |
55 | 56 | 1 | 1 | Woolner, Mr. Hugh | male | 28.34369 | 0 | 0 | 19947 | 35.5000 | C52 | S |
65 | 66 | 1 | 3 | Moubarek, Master. Gerios | male | 28.34369 | 1 | 1 | 2661 | 15.2458 | NaN | C |
82 | 83 | 1 | 3 | McDermott, Miss. Brigdet Delia | female | 28.34369 | 0 | 0 | 330932 | 7.7875 | NaN | Q |
107 | 108 | 1 | 3 | Moss, Mr. Albert Johan | male | 28.34369 | 0 | 0 | 312991 | 7.7750 | NaN | S |
109 | 110 | 1 | 3 | Moran, Miss. Bertha | female | 28.34369 | 1 | 0 | 371110 | 24.1500 | NaN | Q |
128 | 129 | 1 | 3 | Peter, Miss. Anna | female | 28.34369 | 1 | 1 | 2668 | 22.3583 | F E69 | C |
166 | 167 | 1 | 1 | Chibnall, Mrs. (Edith Martha Bowerman) | female | 28.34369 | 0 | 1 | 113505 | 55.0000 | E33 | S |
186 | 187 | 1 | 3 | O'Brien, Mrs. Thomas (Johanna "Hannah" Godfrey) | female | 28.34369 | 1 | 0 | 370365 | 15.5000 | NaN | Q |
198 | 199 | 1 | 3 | Madigan, Miss. Margaret "Maggie" | female | 28.34369 | 0 | 0 | 370370 | 7.7500 | NaN | Q |
241 | 242 | 1 | 3 | Murphy, Miss. Katherine "Kate" | female | 28.34369 | 1 | 0 | 367230 | 15.5000 | NaN | Q |
256 | 257 | 1 | 1 | Thorne, Mrs. Gertrude Maybelle | female | 28.34369 | 0 | 0 | PC 17585 | 79.2000 | NaN | C |
274 | 275 | 1 | 3 | Healy, Miss. Hanora "Nora" | female | 28.34369 | 0 | 0 | 370375 | 7.7500 | NaN | Q |
298 | 299 | 1 | 1 | Saalfeld, Mr. Adolphe | male | 28.34369 | 0 | 0 | 19988 | 30.5000 | C106 | S |
300 | 301 | 1 | 3 | Kelly, Miss. Anna Katherine "Annie Kate" | female | 28.34369 | 0 | 0 | 9234 | 7.7500 | NaN | Q |
301 | 302 | 1 | 3 | McCoy, Mr. Bernard | male | 28.34369 | 2 | 0 | 367226 | 23.2500 | NaN | Q |
303 | 304 | 1 | 2 | Keane, Miss. Nora A | female | 28.34369 | 0 | 0 | 226593 | 12.3500 | E101 | Q |
306 | 307 | 1 | 1 | Fleming, Miss. Margaret | female | 28.34369 | 0 | 0 | 17421 | 110.8833 | NaN | C |
330 | 331 | 1 | 3 | McCoy, Miss. Agnes | female | 28.34369 | 2 | 0 | 367226 | 23.2500 | NaN | Q |
334 | 335 | 1 | 1 | Frauenthal, Mrs. Henry William (Clara Heinshei... | female | 28.34369 | 1 | 0 | PC 17611 | 133.6500 | NaN | S |
347 | 348 | 1 | 3 | Davison, Mrs. Thomas Henry (Mary E Finck) | female | 28.34369 | 1 | 0 | 386525 | 16.1000 | NaN | S |
358 | 359 | 1 | 3 | McGovern, Miss. Mary | female | 28.34369 | 0 | 0 | 330931 | 7.8792 | NaN | Q |
359 | 360 | 1 | 3 | Mockler, Miss. Helen Mary "Ellie" | female | 28.34369 | 0 | 0 | 330980 | 7.8792 | NaN | Q |
367 | 368 | 1 | 3 | Moussa, Mrs. (Mantoura Boulos) | female | 28.34369 | 0 | 0 | 2626 | 7.2292 | NaN | C |
368 | 369 | 1 | 3 | Jermyn, Miss. Annie | female | 28.34369 | 0 | 0 | 14313 | 7.7500 | NaN | Q |
375 | 376 | 1 | 1 | Meyer, Mrs. Edgar Joseph (Leila Saks) | female | 28.34369 | 1 | 0 | PC 17604 | 82.1708 | NaN | C |
431 | 432 | 1 | 3 | Thorneycroft, Mrs. Percival (Florence Kate White) | female | 28.34369 | 1 | 0 | 376564 | 16.1000 | NaN | S |
444 | 445 | 1 | 3 | Johannesen-Bratthammer, Mr. Bernt | male | 28.34369 | 0 | 0 | 65306 | 8.1125 | NaN | S |
457 | 458 | 1 | 1 | Kenyon, Mrs. Frederick R (Marion) | female | 28.34369 | 1 | 0 | 17464 | 51.8625 | D21 | S |
507 | 508 | 1 | 1 | Bradley, Mr. George ("George Arthur Brayton") | male | 28.34369 | 0 | 0 | 111427 | 26.5500 | NaN | S |
533 | 534 | 1 | 3 | Peter, Mrs. Catherine (Catherine Rizk) | female | 28.34369 | 0 | 2 | 2668 | 22.3583 | NaN | C |
547 | 548 | 1 | 2 | Padro y Manent, Mr. Julian | male | 28.34369 | 0 | 0 | SC/PARIS 2146 | 13.8625 | NaN | C |
573 | 574 | 1 | 3 | Kelly, Miss. Mary | female | 28.34369 | 0 | 0 | 14312 | 7.7500 | NaN | Q |
596 | 597 | 1 | 2 | Leitch, Miss. Jessie Wills | female | 28.34369 | 0 | 0 | 248727 | 33.0000 | NaN | S |
612 | 613 | 1 | 3 | Murphy, Miss. Margaret Jane | female | 28.34369 | 1 | 0 | 367230 | 15.5000 | NaN | Q |
643 | 644 | 1 | 3 | Foo, Mr. Choong | male | 28.34369 | 0 | 0 | 1601 | 56.4958 | NaN | S |
653 | 654 | 1 | 3 | O'Leary, Miss. Hanora "Norah" | female | 28.34369 | 0 | 0 | 330919 | 7.8292 | NaN | Q |
669 | 670 | 1 | 1 | Taylor, Mrs. Elmer Zebley (Juliet Cummins Wright) | female | 28.34369 | 1 | 0 | 19996 | 52.0000 | C126 | S |
692 | 693 | 1 | 3 | Lam, Mr. Ali | male | 28.34369 | 0 | 0 | 1601 | 56.4958 | NaN | S |
697 | 698 | 1 | 3 | Mullens, Miss. Katherine "Katie" | female | 28.34369 | 0 | 0 | 35852 | 7.7333 | NaN | Q |
709 | 710 | 1 | 3 | Moubarek, Master. Halim Gonios ("William George") | male | 28.34369 | 1 | 1 | 2661 | 15.2458 | NaN | C |
727 | 728 | 1 | 3 | Mannion, Miss. Margareth | female | 28.34369 | 0 | 0 | 36866 | 7.7375 | NaN | Q |
740 | 741 | 1 | 1 | Hawksford, Mr. Walter James | male | 28.34369 | 0 | 0 | 16988 | 30.0000 | D45 | S |
828 | 829 | 1 | 3 | McCormack, Mr. Thomas Joseph | male | 28.34369 | 0 | 0 | 367228 | 7.7500 | NaN | Q |
839 | 840 | 1 | 1 | Marechal, Mr. Pierre | male | 28.34369 | 0 | 0 | 11774 | 29.7000 | C47 | C |
849 | 850 | 1 | 1 | Goldenberg, Mrs. Samuel L (Edwiga Grabowska) | female | 28.34369 | 1 | 0 | 17453 | 89.1042 | C92 | C |