728x90

import pandas as pd
import numpy as np
# Load the training features, training labels and test features.
# The CSV files are CP949-encoded.
x_train = pd.read_csv('data/X_train.csv', encoding='cp949')
y_train = pd.read_csv('data/y_train.csv', encoding='cp949')
x_test = pd.read_csv('data/X_test.csv', encoding='cp949')

# Keep the test-set customer ids (first column, 'cust_id') for the
# submission file written at the end of the script.
x_test_id = x_test.iloc[:, 0]

# Drop the leading cust_id column from the feature frames and keep only the
# second column of y_train as the target. The feature frames are copied
# because they are modified below; mutating a slice of the loaded frame
# would otherwise trigger chained-assignment problems.
x_train = x_train.iloc[:, 1:].copy()
y_train = y_train.iloc[:, 1]
x_test = x_test.iloc[:, 1:].copy()

# Fill missing values in the '환불금액' column with 0.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form is deprecated and does not update the
# parent frame when pandas copy-on-write is active.
x_train['환불금액'] = x_train['환불금액'].fillna(0)
x_test['환불금액'] = x_test['환불금액'].fillna(0)

print(x_train.describe())

# Replace negative values of '총구매액' with 0 in the training data.
# Only that column is touched; a bare boolean-mask assignment
# (x_train[mask] = 0) would overwrite every column of the matching rows,
# including the categorical ones.
x_train.loc[x_train['총구매액'] < 0, '총구매액'] = 0
print(x_train.describe())

import sklearn.preprocessing

# Numeric columns that are rescaled to [-1, 1] by their maximum absolute value.
SCALED_COLUMNS = ['총구매액', '최대구매액', '환불금액', '내점일수']

# Fit the scaler on the training data only and reuse the same per-column
# factors for the test data. Scaling each set by its own maximum would map
# identical raw values to different scaled values in train and test.
_maxabs_scaler = sklearn.preprocessing.MaxAbsScaler()
_scaled_train = _maxabs_scaler.fit_transform(x_train[SCALED_COLUMNS])
_scaled_test = _maxabs_scaler.transform(x_test[SCALED_COLUMNS])

# Write the scaled values back column by column so each column is replaced
# as a whole (the original integer columns become float columns).
for _idx, _col in enumerate(SCALED_COLUMNS):
    x_train[_col] = _scaled_train[:, _idx]
    x_test[_col] = _scaled_test[:, _idx]

# One-hot encode the categorical columns of both feature frames.
x_train_enc = pd.get_dummies(x_train)
x_test_enc = pd.get_dummies(x_test)

# Dummy columns present in only one of the two frames (kept for inspection).
lack_cols = set(x_train_enc.columns) - set(x_test_enc.columns)
remain_cols = set(x_test_enc.columns) - set(x_train_enc.columns)

# Align the test frame to the training frame in a single step:
#   * columns missing from the test frame are created and filled with 0,
#   * columns that exist only in the test frame are dropped,
#   * the column order is made identical to the training frame.
# The previous loop called DataFrame.drop without using its return value, so
# the extra test columns were never removed, and newly added columns were
# appended at the end, leaving the two frames in a different column order.
x_test_enc = x_test_enc.reindex(columns=x_train_enc.columns, fill_value=0)

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis  import LinearDiscriminantAnalysis
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Hold out 20% of the labelled data for a final check of the chosen model.
train_x, test_x, train_y, test_y = train_test_split(
    x_train_enc, y_train, test_size=0.2, random_state=1
)


def _standardized(step_name, estimator):
    """Return a (name, pipeline) pair that standardizes features before *estimator*."""
    pipeline = Pipeline([('scaler', StandardScaler()), (step_name, estimator)])
    return (step_name, pipeline)


# Candidate models, each evaluated with the same cross-validation splits.
# NOTE(review): LinearRegression is a regressor scored with 'roc_auc' here;
# confirm the installed scikit-learn version supports that combination.
models = [
    _standardized('clf', LogisticRegression(solver='liblinear')),
    _standardized('lr', LinearRegression()),
    _standardized('lda', LinearDiscriminantAnalysis()),
    _standardized('knn', KNeighborsClassifier()),
    _standardized('gnb', GaussianNB()),
    _standardized('svm', SVC(gamma='auto')),
    # AdaBoost is evaluated without the standardization step.
    ('ada', Pipeline([('ada', AdaBoostClassifier())])),
]

# Stratified 5-fold cross-validation (a single repeat) scored by ROC AUC.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
for name, model in models:
    cv_results = cross_val_score(model, train_x, train_y, cv=cv, scoring='roc_auc')
    # Label mean and std separately; concatenating the two numbers without a
    # separator made the output impossible to read.
    print("{}: mean={} std={}".format(name, cv_results.mean(), cv_results.std()))

from sklearn.metrics import roc_auc_score

# Selected model: standardized features followed by logistic regression.
best_model = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(solver='liblinear')),
])

# Hold-out check: fit on the training split and report ROC AUC on the
# held-out split, using the probability of the second class (column 1).
best_predict = best_model.fit(train_x, train_y).predict_proba(test_x)
best_score = roc_auc_score(test_y, best_predict[:, 1])
print(best_score)

# Refit on all labelled data, then predict class probabilities for the
# test set.
best_model.fit(x_train_enc, y_train)
submit_predict = best_model.predict_proba(x_test_enc)
print(submit_predict)

# Submission file: one row per test customer with the predicted probability
# of the second class in the 'gender' column.
final = pd.DataFrame({
    'cust_id': x_test_id,
    'gender': submit_predict[:, 1],
})
final.to_csv('1234.csv', index=False)

# Reference: https://deepcell.kr/bbs/board.php?bo_table=bigbungi&wr_id=23

728x90

+ Recent posts