본문 바로가기
IT,SW,Data,Cloud,코딩/Python

20230619 빅분기 공부 5회 기출 유형

by 착실하게 2023. 6. 19.
반응형
import pandas as pd

# Task: print the mean '2l가격' (2 L price) for rows whose bag type is
# '규격봉투' and whose bag use is '음식물쓰레기', excluding rows where the
# price is 0. The result is rounded and printed as an integer.
df = pd.read_csv("5-1price.csv")

# Combine the three row conditions into a single boolean mask.
keep = (
    (df['종량제봉투종류'] == '규격봉투')
    & (df['종량제봉투용도'] == '음식물쓰레기')
    & (df['2l가격'] != 0)
)
ans = df.loc[keep, '2l가격'].mean()
print(int(round(ans, 0)))
# Author's note: calling round() without the ndigits argument also yields an
# integer, so the explicit int(...) wrapper above is optional.
print(round(ans))


# Task: compute BMI, then print the absolute difference between the number of
# people in the normal range and the number in the overweight/at-risk range
# (as an integer).
# BMI = weight (kg) / height (m) squared
# Normal weight: 18.5 <= BMI < 23
# Overweight or at-risk weight: 23 <= BMI < 25
df = pd.read_csv("5-2bmi.csv")

# Height is divided by 100 before squaring
# (assumes the 'Height' column is in centimetres — confirm against the data).
df['bmi'] = df['Weight'] / (df['Height'] / 100) ** 2

# Each comparison must be parenthesised: `&` binds tighter than `<=` / `<`,
# so the unparenthesised form is not parsed as two comparisons joined by `&`
# and fails instead of producing a boolean mask.
normal = (df['bmi'] >= 18.5) & (df['bmi'] < 23)
danger = (df['bmi'] >= 23) & (df['bmi'] < 25)

# Summing a boolean mask counts the rows where it is True.
print(abs(int(normal.sum()) - int(danger.sum())))

# Task: print, as an integer, the total student count of the school with the
# largest net transfer-in (net transfer-in = students transferred in minus
# students transferred out).
df = pd.read_csv("5-3student.csv")
df['순전입'] = df['전입학생수(계)'] - df['전출학생수(계)']

# Sort so the school with the largest net transfer-in is the first row.
df = df.sort_values('순전입', ascending=False)
# The task asks for printed output; a bare expression only displays in an
# interactive session and emits nothing when run as a script.
print(int(df.iloc[0]['전체학생수(계)']))

 

5회 실기 2유형 3유형 연습 

 

# Car price prediction
# Target (y): price
# Metric: RMSE (Root Mean Squared Error)
# Data: train, test
# Submission: result.csv with a single column 'pred'

import pandas as pd

train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/5th/train.csv', 'data/5th/test.csv')
)

# Remove the target column from the training frame and keep it separately.
y_train = train.pop('price')

# print(train.shape,test.shape) # (3759, 8) (1617, 8)
# print("============")
# print(train.head())
# print(test.head())
# print("============")
# print(train.info())
# print(test.info())
# print("============")
# print(train.describe())
# print(test.describe())
# print("============")
# print(train.isnull().sum()) #결측치 없음 
# print(test.isnull().sum()) #결측치 없음  

# Scaling targets: every column whose dtype is not object.
from sklearn.preprocessing import RobustScaler

cols = train.select_dtypes(exclude=object).columns
# print('columns to scale', cols)

# Fit on the training frame only, then apply the same fitted scaler to both
# the training and the test frame.
scaler = RobustScaler().fit(train[cols])
train[cols] = scaler.transform(train[cols])
test[cols] = scaler.transform(test[cols])

# print(train.head())
# print(test.head())

# Encoding targets: model, transmission, fuelType
# Author's note (from a manual nunique() check on train and test):
# model 19/19, transmission 3/3, fuelType 3/3 — the counts match between
# train and test.
from sklearn.preprocessing import LabelEncoder

cols = ['model', 'transmission', 'fuelType']
encoder = LabelEncoder()
for col in cols:
    # The single encoder is re-fitted on each training column, then used to
    # transform that same column in both frames before moving on.
    encoder.fit(train[col])
    train[col] = encoder.transform(train[col])
    test[col] = encoder.transform(test[col])

# print(train.head())
# print(test.head())

from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation, with a fixed seed so the
# split is reproducible.
X_tr, X_val, y_tr, y_val = train_test_split(
    train,
    y_train,
    test_size=0.2,
    random_state=2023,
)

from sklearn.metrics import mean_squared_error 

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor 
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import numpy as np 

def calc_rmse(y_val, pred):
    """Return the root mean squared error between targets and predictions.

    Args:
        y_val: True target values.
        pred: Predicted values, passed to ``mean_squared_error`` together
            with ``y_val``.

    Returns:
        The square root of ``mean_squared_error(y_val, pred)``.
    """
    return np.sqrt(mean_squared_error(y_val, pred))


# model = LinearRegression()
# model.fit(X_tr,y_tr)
# pred=model.predict(X_val)
# rmse = calc_rmse(y_val,pred)
# print('LinearRegression ', rmse) 

#2266

# model = DecisionTreeRegressor(random_state=2023)
# model.fit(X_tr,y_tr)
# pred=model.predict(X_val)
# rmse = calc_rmse(y_val,pred)
# print('DecisionTreeRegressor ', rmse) 

#1738

# Fit each candidate on the training split and print its validation RMSE.
# XGBRegressor is listed last on purpose: after the loop, `model` still refers
# to the fitted XGBRegressor, and `pred` / `rmse` hold its validation results.
#
# Author's recorded validation RMSE per setting:
#   RandomForestRegressor (defaults): 1234
#     max_depth=5: 1797
#     max_depth=7: 1436
#     max_depth=9: 1263
#     n_estimators=200: 1241
#     max_depth=7, n_estimators=800: 1424
#   XGBRegressor (defaults): 1202
#     max_depth=5: 1218
#     max_depth=7: 1797
#     max_depth=9: 1271
#     n_estimators=200: 1227
#     max_depth=5, n_estimators=400: 1287
#     max_depth=7, n_estimators=800: 1306
candidates = (
    ('RandomForestRegressor ', RandomForestRegressor(random_state=2023)),
    ('XGBRegressor ', XGBRegressor(random_state=2023)),
)
for label, model in candidates:
    model.fit(X_tr, y_tr)
    pred = model.predict(X_val)
    rmse = calc_rmse(y_val, pred)
    print(label, rmse)

# model = LGBMRegressor(random_state=2023)
# model.fit(X_tr,y_tr)
# pred=model.predict(X_val)
# rmse = calc_rmse(y_val,pred)
# print('LGBMRegressor ', rmse)

# Predict on the test frame with whatever estimator `model` currently holds.
pred = model.predict(test)

# pred = pred.astype(int)

# Write the predictions as a one-column ('pred') CSV without the index.
submit_path = 'data/5th/20230619_01.csv'
submit = pd.DataFrame({'pred': pred})
submit.to_csv(submit_path, index=False)

# Read the file back and show the first rows as a sanity check.
mj = pd.read_csv(submit_path)
print(mj.head())

# Score the test predictions against the values stored in y.csv.
y = pd.read_csv('data/5th/y.csv')
rmse = calc_rmse(y, pred)
print('real : ', rmse)

 

반응형

댓글