본문 바로가기
IT,SW,Data,Cloud,코딩/Python

20230617 빅분기 4회 기출유형 제2유형

by 착실하게 2023. 6. 17.
반응형
import pandas as pd 
# Show every column when a wide frame is printed.
pd.set_option('display.max_columns', None)

test = pd.read_csv('data/4th/test.csv')
train = pd.read_csv('data/4th/train.csv')

# Shapes recorded by the author: test (2154, 10), train (6665, 11).
# print(test.shape, train.shape)
# print(test.head())
print(train.head())  # 'Segmentation' is the target column

# Set the ID column aside: the test IDs are needed again for the submission
# file, while the train IDs are simply discarded.
test_id = test.pop('ID')  # pop() returns the removed column as a Series
# print(type(test_id))  # <class 'pandas.core.series.Series'>
train = train.drop(columns='ID')

# Separate the target from the training features.
y_train = train.pop('Segmentation')


# print(test.info())
# print(train.info())

# Names of the object-dtype columns in train (used only by the checks below;
# `cols` is reassigned before encoding).
cols = train.select_dtypes(include='object').columns
# print(cols)

# print(train[cols].nunique())
# print(test[cols].nunique())

# Object-typed columns (unique-value counts as noted by the author):
# Gender(2), Ever_Married(2), Graduated(2), Profession(9), Spending_Score(3), Var_1(7)

# Compare the category values of train and test, column by column:
# for col in cols:
#   print(col)
#   a = set(train[col].unique())
#   b = set(test[col].unique())
#   print(a-b)
#   print(b-a) 
# Check completed (author's note).

#################################### Encoding: start
# Label encoding  : Profession(9), Var_1(7)
# One-hot encoding: Gender(2), Ever_Married(2), Graduated(2), Spending_Score(3)

# print(train.head())
# print(test.head())

from sklearn.preprocessing import LabelEncoder

# Label-encode: fit on train only, then apply the same mapping to test.
# fit_transform() refits the encoder for each column, so one instance is reused.
encoder = LabelEncoder()
cols = ['Profession', 'Var_1']
for col in cols:
    train[col] = encoder.fit_transform(train[col])
    test[col] = encoder.transform(test[col])

# One-hot encode the remaining categorical columns.
cols = ['Gender', 'Ever_Married', 'Graduated', 'Spending_Score']
train = pd.get_dummies(train, columns=cols)
test = pd.get_dummies(test, columns=cols)

# get_dummies() derives its columns from the values present in each frame, so
# train and test can end up with different columns (or a different order) when
# a category is missing on one side. Align test to train's columns so the model
# sees the same features at predict time; dummies absent from test become 0.
test = test.reindex(columns=train.columns, fill_value=0)

# print(train.head())
# print("===============")
# print(test.head())
########################### Encoding: done

# Scaling experiment (currently disabled)
# from sklearn.preprocessing import (MinMaxScaler,StandardScaler,RobustScaler)

# cols = ['Age']
# scaler = MinMaxScaler()
# train[cols] = scaler.fit_transform(train[cols])
# test[cols]=scaler.transform(test[cols])

# print(train.head())





#####################
# Hold out 20% of the training data as a validation set.
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(
    train,
    y_train,
    test_size=0.2,
    random_state=2023,
)

# Multi-class classification problem; the evaluation metric is macro F1.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Best hyperparameters found so far (see the tuning notes below).
rf_params = dict(
    random_state=2023,
    max_depth=9,
    n_estimators=1000,
    max_features=0.3,
)
model = RandomForestClassifier(**rf_params)
model.fit(X_tr, y_tr)

pred = model.predict(X_val)
print(f1_score(y_val, pred, average='macro'))

# Tuning notes (author's):
# max_depth: 3-12; 3, 5, 7 recommended.
# n_estimators: 100, 200, 400, 800, up to 1000.
# max_features: maximum number of features selected. Author's note: the lower
#   it is, the deeper and more varied the trees become.

# Validation macro F1 recorded for each setting:
# baseline: 0.47182631646663775
# max_depth=5: 0.4831811813874177
# max_depth=5,n_estimators=400: 0.4922063922842186
# max_depth=7,n_estimators=400: 0.5124680167386901
# max_depth=9,n_estimators=400: 0.518881638377439
# max_depth=9,n_estimators=800: 0.5213206729527654
# max_depth=9,n_estimators=1000: 0.5240952849451492
# max_depth=9,n_estimators=1000,max_features=0.3: 0.5263648466889252

# With max_depth=9, n_estimators=1000, max_features=0.3 fixed:
# RobustScaler applied to Age : 0.5271273502440313
# MinMaxScaler applied to Age : 0.5280972750676134



# Predict a segment for every test row with the fitted model.
pred = model.predict(test)

# Pair each saved test ID with its prediction (columns: ID, Segmentation).
submit = pd.DataFrame({'ID': test_id}).assign(Segmentation=pred)

submission_path = 'data/4th/submission.csv'
submit.to_csv(submission_path, index=False)

# Read the file back to check what was actually written.
mj = pd.read_csv(submission_path)
print(mj.head())

 

 

빠뜨린 것
>> 결측치 확인: train.isnull().sum() (다행히 결측치는 없었음. 있었다면 머신러닝 학습 때 에러가 났을 것)
>> target(Segmentation)의 클래스 불균형 정도 확인: y_train.value_counts()

>> describe()를 찍어서 수치형 컬럼의 분포 확인하기

 

 

반응형

댓글