학습목표

  1. 숫자 데이터의 범주형 데이터화
In [1]:
import pandas as pd
In [10]:
# Data source: https://www.kaggle.com/hesh97/titanicdataset-traincsv/data
# Load train.csv (path is relative to the current working directory) into a
# DataFrame that the following cells reuse under the name train_data.
train_data = pd.read_csv('./train.csv')
# Preview the first rows to see which columns are available.
print(train_data.head())
# Print the raw Age column; it is float-typed and contains NaN entries.
print(train_data['Age'])
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

Pclass 변수 변환하기

  • astype 사용하여 간단히 타입만 변환
In [3]:
# Cast Pclass to str in place so the class number is held as a label
# instead of a numeric value; astype only changes the type of each value.
train_data['Pclass'] = train_data['Pclass'].astype(str)

Age 변수 변환하기

  • 변환 로직을 함수로 만든 후, apply 함수로 적용
In [11]:
import math
def age_categorize(age):
    """Map an age to the lower bound of its ten-year bucket.

    A missing age (NaN) is mapped to the sentinel -1; any other value is
    rounded down to a multiple of 10, e.g. 38.0 -> 30 and 9.5 -> 0.
    """
    # Only the selected branch is evaluated, so math.floor never sees NaN.
    return -1 if math.isnan(age) else 10 * math.floor(age / 10)
In [13]:
train_data['Age'].apply(age_categorize)
# apply is provided by pandas.
# apply calls the given function on each element, one at a time; without it
# the same work has to be written as an explicit for loop.
# The result is a new Series that is only displayed here; it is not assigned
# back, so train_data['Age'] itself is left unchanged.
Out[13]:
0      20
1      30
2      20
3      30
4      30
       ..
886    20
887    10
888    -1
889    20
890    30
Name: Age, Length: 891, dtype: int64
In [15]:
# Display the Age column again: it still holds the original float values,
# because the earlier apply call returned a new Series without assigning it.
train_data['Age']
Out[15]:
0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64
In [18]:
# Without apply, the same bucketing has to be written as an explicit for loop.
# NOTE(review): unlike age_categorize, this loop skips missing ages instead of
# emitting -1, so it prints fewer values than the column has rows.
for age in train_data['Age']:
    if math.isnan(age):
        # Missing age: nothing is printed for this row.
        continue
    # Round the age down to the start of its decade (e.g. 38.0 -> 30).
    tmp = math.floor(age / 10) * 10
    print(tmp, end=', ')
20, 30, 20, 30, 30, 50, 0, 20, 10, 0, 50, 20, 30, 10, 50, 0, 30, 30, 30, 10, 20, 0, 30, 10, 40, 60, 20, 40, 20, 10, 10, 40, 20, 0, 10, 10, 0, 20, 40, 20, 60, 20, 20, 0, 10, 20, 30, 40, 0, 20, 10, 10, 20, 30, 10, 20, 20, 30, 20, 0, 30, 20, 20, 20, 10, 30, 10, 20, 20, 20, 20, 40, 20, 50, 70, 20, 30, 30, 20, 20, 30, 30, 20, 20, 30, 40, 10, 20, 20, 10, 20, 70, 20, 20, 0, 20, 30, 30, 50, 10, 20, 40, 30, 20, 40, 20, 20, 20, 10, 30, 10, 20, 20, 20, 10, 10, 10, 20, 0, 30, 40, 50, 20, 50, 40, 50, 10, 30, 40, 40, 20, 10, 0, 0, 40, 20, 60, 0, 0, 20, 50, 10, 50, 30, 30, 0, 0, 0, 40, 40, 30, 30, 10, 10, 0, 40, 50, 40, 20, 20, 30, 40, 10, 0, 30, 20, 10, 40, 20, 30, 20, 30, 30, 20, 40, 30, 30, 10, 20, 50, 30, 20, 10, 20, 10, 30, 20, 50, 0, 20, 40, 0, 10, 30, 20, 20, 30, 40, 20, 20, 30, 50, 20, 60, 30, 40, 20, 30, 30, 50, 0, 50, 40, 30, 10, 20, 50, 30, 20, 40, 30, 60, 40, 0, 30, 60, 20, 10, 10, 30, 30, 20, 40, 20, 20, 10, 30, 20, 20, 20, 0, 50, 10, 0, 10, 30, 30, 20, 10, 20, 20, 40, 20, 20, 50, 30, 40, 20, 20, 30, 20, 30, 60, 30, 30, 10, 40, 30, 10, 20, 40, 40, 40, 0, 20, 20, 20, 30, 20, 40, 0, 40, 20, 10, 20, 20, 20, 30, 40, 20, 40, 30, 30, 60, 20, 20, 10, 10, 20, 0, 20, 20, 20, 10, 40, 0, 30, 30, 10, 0, 30, 10, 30, 20, 20, 20, 20, 20, 30, 40, 20, 20, 30, 20, 20, 20, 20, 30, 50, 0, 20, 30, 40, 30, 10, 30, 10, 20, 20, 20, 10, 20, 10, 30, 20, 40, 10, 50, 10, 20, 20, 60, 30, 40, 20, 20, 20, 0, 10, 30, 0, 50, 30, 30, 40, 20, 60, 50, 40, 30, 40, 40, 30, 50, 0, 30, 30, 20, 20, 30, 20, 20, 0, 0, 50, 60, 20, 30, 50, 30, 0, 20, 50, 70, 20, 50, 20, 20, 10, 20, 30, 10, 10, 30, 20, 20, 20, 30, 50, 20, 40, 30, 30, 30, 30, 20, 40, 40, 50, 30, 20, 0, 10, 30, 0, 40, 30, 20, 30, 0, 10, 30, 50, 60, 10, 30, 0, 10, 20, 20, 20, 60, 40, 30, 30, 40, 20, 20, 10, 20, 30, 60, 50, 30, 10, 10, 30, 30, 30, 20, 30, 50, 30, 10, 40, 60, 20, 30, 50, 40, 30, 30, 40, 40, 20, 40, 30, 30, 30, 20, 20, 40, 30, 30, 20, 30, 20, 0, 20, 20, 40, 20, 20, 20, 60, 50, 20, 20, 80, 50, 30, 0, 20, 30, 30, 40, 20, 20, 0, 0, 40, 
10, 50, 20, 10, 20, 10, 20, 30, 20, 50, 50, 40, 40, 30, 20, 30, 20, 40, 40, 30, 70, 30, 10, 20, 10, 40, 30, 20, 20, 10, 60, 20, 10, 10, 10, 10, 30, 0, 20, 60, 50, 40, 40, 40, 10, 30, 10, 20, 20, 30, 40, 40, 20, 20, 40, 20, 50, 10, 30, 20, 30, 0, 10, 30, 50, 20, 20, 30, 20, 20, 20, 10, 20, 20, 20, 40, 30, 30, 20, 20, 30, 70, 10, 30, 10, 30, 0, 0, 30, 20, 40, 0, 20, 10, 30, 30, 40, 20, 30, 10, 50, 30, 30, 20, 40, 50, 50, 10, 0, 40, 10, 10, 20, 20, 20, 10, 0, 0, 40, 10, 20, 30, 40, 30, 30, 30, 30, 30, 10, 0, 20, 30, 30, 10, 30, 30, 20, 30, 30, 0, 30, 20, 30, 40, 10, 50, 20, 30, 20, 0, 0, 60, 10, 0, 20, 10, 30, 20, 30, 20, 10, 30, 30, 10, 40, 30, 20, 0, 70, 0, 10, 40, 10, 40, 50, 20, 40, 20, 40, 20, 40, 20, 30, 0, 20, 40, 30, 40, 20, 10, 20, 10, 50, 20, 30, 20, 20, 20, 30, 20, 10, 20, 30,