반응형
import pandas as pd

pd.set_option('display.max_columns', None)

# Load the data. train carries the target column 'Segmentation'; test does not.
# Shapes observed: test (2154, 10), train (6665, 11).
test = pd.read_csv('data/4th/test.csv')
train = pd.read_csv('data/4th/train.csv')
print(train.head())  # 'Segmentation' is the target

# Keep the test IDs for the submission file, then drop ID from both frames.
test_id = test.pop('ID')  # pop returns a Series
train = train.drop('ID', axis=1)
y_train = train.pop('Segmentation')

# Object-dtype columns and their cardinalities (checked with nunique()):
# Gender(2), Ever_Married(2), Graduated(2), Profession(9), Spending_Score(3), Var_1(7)
# Also verified that train and test share identical category sets per column.

#################################### encoding
# Label-encode the higher-cardinality columns: Profession(9), Var_1(7).
from sklearn.preprocessing import LabelEncoder

for col in ['Profession', 'Var_1']:
    # Fresh encoder per column so no fitted state leaks between columns.
    encoder = LabelEncoder()
    train[col] = encoder.fit_transform(train[col])
    # transform() raises on categories unseen during fit; safe here because the
    # category sets were verified to match between train and test.
    test[col] = encoder.transform(test[col])

# One-hot encode the low-cardinality columns:
# Gender(2), Ever_Married(2), Graduated(2), Spending_Score(3).
cols = ['Gender', 'Ever_Married', 'Graduated', 'Spending_Score']
train = pd.get_dummies(train, columns=cols)
test = pd.get_dummies(test, columns=cols)
# FIX: get_dummies on train and test independently can yield mismatched or
# reordered dummy columns whenever the category sets differ. Align test to
# train's column layout; missing dummies are filled with 0. This is a no-op
# when the frames already agree, so results are unchanged here.
test = test.reindex(columns=train.columns, fill_value=0)
########################### encoding done

# Scaling experiments on 'Age' (results in the tuning log below):
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# train[['Age']] = scaler.fit_transform(train[['Age']])
# test[['Age']] = scaler.transform(test[['Age']])

# Hold out a validation split. Multi-class problem; metric is macro F1.
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(train, y_train, test_size=0.2, random_state=2023)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

model = RandomForestClassifier(random_state=2023, max_depth=9, n_estimators=1000, max_features=0.3)
model.fit(X_tr, y_tr)
pred = model.predict(X_val)
print(f1_score(y_val, pred, average='macro'))

# Tuning log (validation macro F1). Typical search ranges tried:
# max_depth 3-12, n_estimators 100-1000, max_features lowered to decorrelate trees.
# baseline:                                          0.47182631646663775
# max_depth=5:                                       0.4831811813874177
# max_depth=5,  n_estimators=400:                    0.4922063922842186
# max_depth=7,  n_estimators=400:                    0.5124680167386901
# max_depth=9,  n_estimators=400:                    0.518881638377439
# max_depth=9,  n_estimators=800:                    0.5213206729527654
# max_depth=9,  n_estimators=1000:                   0.5240952849451492
# max_depth=9,  n_estimators=1000, max_features=0.3: 0.5263648466889252
# + RobustScaler on Age:                             0.5271273502440313
# + MinMaxScaler on Age:                             0.5280972750676134

# Predict on the test set and write the submission file.
pred = model.predict(test)
submit = pd.DataFrame({
    'ID': test_id,
    'Segmentation': pred
})
submit.to_csv('data/4th/submission.csv', index=False)

# Sanity-check the written file.
mj = pd.read_csv('data/4th/submission.csv')
print(mj.head())
빠뜨린 것
>> 결측치 확인 (다행히 결측치 없었음. 있었으면 머신러닝 학습때 에러났겠지) train.isnull().sum()
>> target 피쳐의 불균형정도 확인. value_counts()
>> describe() 찍어보기
반응형
'IT,SW,Data,Cloud,코딩 > Python' 카테고리의 다른 글
20230621 캐글 연습문제 풀기 (0) | 2023.06.22 |
---|---|
20230620 넘파이 ceil, floor, trunc (0) | 2023.06.21 |
20230619 lightgbm (0) | 2023.06.19 |
20230619 빅분기 공부 5회 기출 유형 (0) | 2023.06.19 |
20230617 빅분기 3유형 가설검정 t-test, one-way ANOVA (1) | 2023.06.17 |
20230616 빅분기공부 (0) | 2023.06.17 |
20230615 머신러닝 공부, 시험환경 꿀팁 등 (0) | 2023.06.15 |
20230614 kaggle 따라치기 - 타이타닉 튜토리얼 (0) | 2023.06.14 |
댓글