Lidar Car Detection
Solution for submission 155379
A detailed solution for submission 155379 to the Lidar Car Detection challenge.
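In outline: each lidar point cloud is flattened into a tabular row, constant columns are dropped, the features are compressed with TruncatedSVD, weak features are pruned by permutation importance and by greedy backward elimination, a train-vs-test classifier supplies covariate-shift sample weights, and three LightGBM regressors tuned jointly with skopt's gp_minimize are averaged across seeds and folds to produce the car-count predictions.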
In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy
import gc
import os
from glob import glob
import pickle
import random
import shutil
import seaborn as sns
from collections import Counter
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold, cross_val_score, train_test_split
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.metrics import roc_auc_score, f1_score, log_loss, accuracy_score, matthews_corrcoef
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
#import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args
from skopt import gp_minimize
from sklearn.metrics import classification_report
from sklearn.linear_model import LinearRegression, LogisticRegression
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)
warnings.simplefilter(action = 'ignore', category = DeprecationWarning)
warnings.simplefilter(action = 'ignore', category = UserWarning)
warnings.simplefilter(action = 'ignore', category = RuntimeWarning)
warnings.filterwarnings("ignore", message = "numpy.dtype size changed")
warnings.filterwarnings("ignore", message = "numpy.ufunc size changed")
pd.options.mode.chained_assignment = None
In [2]:
train_data = np.load("./data/train.npz", allow_pickle=True)
train_data = train_data['train']
train_data.shape
Out[2]:
In [3]:
# NOTE: only the first 100 training scenes are used; each point cloud is flattened into one feature vector
X = train_data[:100, 0]
dtrain = [i.flatten() for i in X]
target = train_data[:100, 1]
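Flattening throws away the point cloud's spatial structure: every coordinate becomes an independent tabular column, so the dimensionality-reduction and feature-selection stages below do most of the heavy lifting.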
In [4]:
test_data = np.load("./data/test.npz", allow_pickle=True)
test_data = test_data['test']
test_data.shape
Out[4]:
In [5]:
dtest = [i.flatten() for i in test_data]
In [6]:
dtrain = np.array(dtrain)
dtest = np.array(dtest)
In [7]:
use = ['x_' + str(f) for f in range(dtrain.shape[1])]
dtrain = pd.DataFrame(dtrain, columns = use)
dtrain['target'] = target
dtest = pd.DataFrame(dtest, columns = use)
In [8]:
dtrain.shape, dtest.shape
Out[8]:
In [9]:
dtrain.head()
Out[9]:
In [10]:
# remove constant columns
remove = []
for c in dtrain.columns:
    if dtrain[c].std() == 0:
        remove.append(c)
print('remove constant columns', remove)
dtrain.drop(remove, axis = 1, inplace = True)
dtest.drop(remove, axis = 1, inplace = True)
use = list(dtest.columns)
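For reference, the same zero-variance filter can be written with scikit-learn's VarianceThreshold. A minimal sketch (not what the notebook ran), applied to the feature frame:

from sklearn.feature_selection import VarianceThreshold

# threshold = 0.0 drops exactly the columns whose training variance is zero,
# equivalent to the std() == 0 loop above
feats = [c for c in dtrain.columns if c != 'target']
vt = VarianceThreshold(threshold = 0.0)
vt.fit(dtrain[feats])
kept = [c for c, keep in zip(feats, vt.get_support()) if keep]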
In [11]:
print(dtrain.shape, dtest.shape)
In [12]:
# drop features flagged by an earlier permutation-importance run (saved as imp_df-0.csv)
temp = pd.read_csv('imp_df-0.csv')
remove = temp['feat'][temp['diff'] >= 0]
for u in remove:
    if u in use:
        use.remove(u)
len(use)
Out[12]:
In [13]:
# same filter from a second earlier run (imp_df-1.csv)
temp = pd.read_csv('imp_df-1.csv')
remove = temp['feat'][temp['diff'] >= 0]
for u in remove:
    if u in use:
        use.remove(u)
len(use)
Out[13]:
In [14]:
target = dtrain['target']
# project the surviving features onto 45 SVD components
svd = TruncatedSVD(n_components = 45, n_iter = 15, random_state = 42)
svd.fit(dtrain[use].values)
print(svd.explained_variance_ratio_.sum())
xtrain = svd.transform(dtrain[use].values)
xtest = svd.transform(dtest[use].values)
dtrain = pd.DataFrame(xtrain)
dtest = pd.DataFrame(xtest)
xuse = ['x_' + str(f) for f in range(dtrain.shape[1])]
dtrain.columns = xuse
dtrain['target'] = target
dtest.columns = xuse
use = xuse
dtrain.head()
Out[14]:
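45 components is a fixed choice; the printed sum of explained_variance_ratio_ shows how much signal survives. A hypothetical helper for picking the component count from a variance target instead (the threshold and cap below are assumptions, not values from the notebook):

def pick_n_components(X, threshold = 0.99, max_components = 200):
    # fit a generously sized SVD once, then find the smallest k reaching the target
    svd_full = TruncatedSVD(n_components = min(max_components, X.shape[1] - 1), random_state = 42)
    svd_full.fit(X)
    ratios = np.cumsum(svd_full.explained_variance_ratio_)
    return int(min(np.searchsorted(ratios, threshold) + 1, len(ratios)))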
In [15]:
################### permutation importance
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
target = 'target'
dtrain.reset_index(drop = True, inplace = True)

def pim(xtrain, xuse, target):
    models = []
    Loss = []
    seeds = 3
    n_splits = 5
    imp_df = pd.DataFrame(columns = ['feat', 'imp', 'imp_permuted'])
    # 1) fit one model per seed/fold and record the baseline validation MSE
    for seed in range(seeds):
        skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
        for train_index, test_index in skf.split(xtrain, xtrain[target]):
            X_train, X_val = xtrain[xuse].loc[train_index], xtrain[xuse].loc[test_index]
            y_train, y_val = xtrain[target].loc[train_index], xtrain[target].loc[test_index]
            model = LGBMRegressor(
                max_depth = 3 + seed * 2,
                random_state = seed,
                n_estimators = 1000,
                #device = 'gpu',
                subsample = 0.98,
                subsample_freq = 5,
                colsample_bytree = 0.98,
                reg_alpha = 0.01,
                reg_lambda = 0.1
            )
            model.fit(
                X_train,
                y_train.values,
                eval_set = (X_val, y_val),
                early_stopping_rounds = 100,
                verbose = False,
            )
            temp = model.predict(X_val)
            sc = mean_squared_error(y_val, temp)
            Loss.append(sc)
            models.append(model)
    sc = np.mean(Loss)
    # 2) shuffle one feature at a time on the validation folds and re-score
    for u in xuse:
        N = 0
        Loss_permuted = []
        for seed in range(seeds):
            skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
            for train_index, test_index in skf.split(xtrain, xtrain[target]):
                X_train, X_val = xtrain[xuse].loc[train_index], xtrain[xuse].loc[test_index]
                y_train, y_val = xtrain[target].loc[train_index], xtrain[target].loc[test_index]
                X_val_permuted = X_val.copy()
                temp = X_val_permuted[u].values
                np.random.shuffle(temp)
                X_val_permuted[u] = temp
                temp = models[N].predict(X_val_permuted.values)
                sc_permuted = mean_squared_error(y_val, temp)
                Loss_permuted.append(sc_permuted)
                N += 1
        sc_permuted = np.mean(Loss_permuted)
        # if permuting the feature does not hurt the score, it carries no signal
        if sc_permuted <= sc:
            print(u, sc, sc_permuted, 'need delete')
        else:
            print(u, sc, sc_permuted)
        if u in imp_df['feat'].unique():
            imp_df.loc[imp_df['feat'] == u, 'imp'] += sc
            imp_df.loc[imp_df['feat'] == u, 'imp_permuted'] += sc_permuted
        else:
            L = imp_df.shape[0]
            imp_df.loc[L, 'feat'] = u
            imp_df.loc[imp_df['feat'] == u, 'imp'] = sc
            imp_df.loc[imp_df['feat'] == u, 'imp_permuted'] = sc_permuted
        # checkpoint the running importance table after every feature
        imp_df['diff'] = imp_df['imp'] - imp_df['imp_permuted']
        imp_df.sort_values(by = ['diff'], inplace = True)
        imp_df.to_csv('imp_df.csv', index = False)
    imp_df[['imp', 'imp_permuted']] = imp_df[['imp', 'imp_permuted']] / (seeds * n_splits)
    ################### permutation importance
    imp_df['diff'] = imp_df['imp'] - imp_df['imp_permuted']
    imp_df.sort_values(by = ['diff'], inplace = True)
    remove = imp_df['feat'][imp_df['diff'] >= 0].values
    print(remove)
    for u in remove:
        if u in xuse:
            xuse.remove(u)
    return xuse

target = 'target'
dtrain.reset_index(drop = True, inplace = True)
dtrain[target] = dtrain[target].astype(int)
use = pim(dtrain, use, target)
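The hand-rolled loop above keeps the per-fold models so each feature is scored on genuinely held-out rows. scikit-learn ships an equivalent utility; a sketch for one fitted model and one validation fold (model, X_val, y_val as produced inside pim):

from sklearn.inspection import permutation_importance

result = permutation_importance(
    model, X_val, y_val,
    scoring = 'neg_mean_squared_error',
    n_repeats = 10, random_state = 0,
)
# mean importance <= 0 means shuffling the feature did not hurt validation MSE
drop = [f for f, imp in zip(X_val.columns, result.importances_mean) if imp <= 0]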
In [16]:
use
Out[16]:
In [17]:
target = dtrain['target'].values
dtrain = dtrain[use]
dtest = dtest[use]
dtrain['target'] = target
dtrain['target'] = dtrain['target'].astype(int)
gc.collect()
Out[17]:
In [18]:
dtrain.head()
Out[18]:
In [19]:
# covariate-shift adaptation: learn weights that make train rows look like test rows
if 'is_train' in dtrain.columns:
    del dtrain['is_train']
if 'is_train' in dtest.columns:
    del dtest['is_train']
if 'is_train' in use:
    use.remove('is_train')
len_X = dtrain.shape[0]
dtrain['is_train'] = 1
dtest['is_train'] = 0
use.append('is_train')
df = pd.concat([dtrain[use], dtest[use]], axis = 0, ignore_index = True, sort = False)
target = df['is_train']
del dtrain['is_train'], dtest['is_train']
df['pred'] = 0
AUC = []
use.remove('is_train')
seeds = 3
n_splits = 5
for seed in range(seeds):
    skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
    model = LGBMClassifier(
        max_depth = 3 + seed * 2,
        random_state = seed,
        n_estimators = 1000,
        #device = 'gpu',
        subsample = 0.98,
        subsample_freq = 5,
        colsample_bytree = 0.98,
        reg_alpha = 0.01,
        reg_lambda = 0.1
    )
    for train_index, test_index in skf.split(df, df['is_train']):
        X_train, X_test = df.loc[train_index], df.loc[test_index]
        y_train, y_test = df['is_train'].loc[train_index], df['is_train'].loc[test_index]
        model.fit(
            X_train[use],
            y_train,
            eval_set = (X_test[use], y_test),
            verbose = False,
            early_stopping_rounds = 100,
        )
        # column 0 is P(is_train == 0), i.e. how test-like the row looks
        temp = model.predict_proba(X_test[use])[:, 0]
        df.loc[test_index, 'pred'] += temp
        temp = model.predict_proba(X_test[use])[:, 1]
        sc = roc_auc_score(y_test, temp)
        print('roc_auc_score', sc)
        AUC.append(sc)
# the first len_X rows of df are the training rows; use their averaged
# test-likeness as sample weights, normalized to mean 1
weights = df['pred'][:len_X]
weights /= np.mean(weights)
print(weights.min(), weights.max())
dtrain['weights'] = weights
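The weights above use the classifier's test-probability directly. The textbook covariate-shift weight is the density ratio w(x) = p_test(x) / p_train(x), which a probabilistic classifier gives as P(test|x) / P(train|x) up to a constant. A minimal sketch of that variant, assuming df, len_X, and the accumulated pred column from above (the epsilon guard is an assumption, not something the notebook uses):

# df['pred'] holds P(test | x) summed over `seeds` passes; rescale to [0, 1]
p_test = df['pred'][:len_X].values / seeds
eps = 1e-6  # hypothetical guard against division by zero
ratio = p_test / np.clip(1.0 - p_test, eps, None)  # P(test|x) / P(train|x)
ratio /= ratio.mean()  # normalize to mean 1, as in the notebook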
In [20]:
def get_params(train, target, use, n_splits, n_calls = 5, verbose = False):
    # objective for gp_minimize: mean CV MSE of a blend of three LightGBM
    # regressors, tuned jointly over 7 hyperparameters each (21 dimensions)
    def score(params_temp):
        Loss = []
        n_estimators = N_estimators
        params0 = {
            'subsample': params_temp[0],
            'colsample_bytree': params_temp[1],
            'reg_alpha': params_temp[2],
            'reg_lambda': params_temp[3],
            'learning_rate': params_temp[4],
            'num_leaves': params_temp[5],
            'max_depth': params_temp[6],
            'n_estimators': n_estimators,
            #'device': 'gpu',
        }
        params1 = {
            'subsample': params_temp[7],
            'colsample_bytree': params_temp[8],
            'reg_alpha': params_temp[9],
            'reg_lambda': params_temp[10],
            'learning_rate': params_temp[11],
            'num_leaves': params_temp[12],
            'max_depth': params_temp[13],
            'n_estimators': n_estimators,
            #'device': 'gpu',
        }
        params2 = {
            'subsample': params_temp[14],
            'colsample_bytree': params_temp[15],
            'reg_alpha': params_temp[16],
            'reg_lambda': params_temp[17],
            'learning_rate': params_temp[18],
            'num_leaves': params_temp[19],
            'max_depth': params_temp[20],
            'n_estimators': n_estimators,
            #'device': 'gpu',
        }
        for seed in range(seeds):
            params0['random_state'] = seed
            clf0 = LGBMRegressor(**params0)
            params1['random_state'] = seed
            clf1 = LGBMRegressor(**params1)
            params2['random_state'] = seed
            clf2 = LGBMRegressor(**params2)
            esr = 100
            skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
            for train_index, test_index in skf.split(train, train[target]):
                X_trg, X_val = train[use].loc[train_index], train[use].loc[test_index]
                y_trg, y_val = train[target].loc[train_index], train[target].loc[test_index]
                clf0.fit(
                    X_trg,
                    y_trg,
                    verbose = False,
                    eval_set = (X_val, y_val),
                    early_stopping_rounds = esr,
                    sample_weight = dtrain['weights'].loc[train_index]
                )
                temp0 = clf0.predict(X_val)
                clf1.fit(
                    X_trg,
                    y_trg,
                    verbose = False,
                    eval_set = (X_val, y_val),
                    early_stopping_rounds = esr,
                    sample_weight = dtrain['weights'].loc[train_index]
                )
                temp1 = clf1.predict(X_val)
                clf2.fit(
                    X_trg,
                    y_trg,
                    verbose = False,
                    eval_set = (X_val, y_val),
                    early_stopping_rounds = esr,
                    sample_weight = dtrain['weights'].loc[train_index]
                )
                temp2 = clf2.predict(X_val)
                temp = (temp0 + temp1 + temp2) / 3
                sc = mean_squared_error(y_val, temp)
                Loss.append(sc)
        Loss = np.mean(Loss)
        L = df_res.shape[0] + 1
        df_res.loc[L, 'rmse'] = Loss  # note: the stored value is MSE despite the column name
        best = df_res['rmse'].min()
        print('rmse...', Loss, 'Best...', best, 'Iter ', L)
        return Loss

    df_res = pd.DataFrame()
    values = [
        # search space, model 0
        Real(0.90, 1, "log-uniform", name = 'subsample'),
        Real(0.90, 1, "log-uniform", name = 'colsample_bytree'),
        Real(1e-14, 0.2, "log-uniform", name = 'reg_alpha'),
        Real(1e-14, 0.2, "log-uniform", name = 'reg_lambda'),
        Real(0.03, 0.3, "log-uniform", name = 'learning_rate'),
        Integer(5, 50, name = 'num_leaves'),
        Integer(3, 50, name = 'max_depth'),
        # search space, model 1 (wider tree ranges for blend diversity)
        Real(0.90, 1, "log-uniform", name = 'subsample'),
        Real(0.90, 1, "log-uniform", name = 'colsample_bytree'),
        Real(1e-14, 0.2, "log-uniform", name = 'reg_alpha'),
        Real(1e-14, 0.2, "log-uniform", name = 'reg_lambda'),
        Real(0.03, 0.3, "log-uniform", name = 'learning_rate'),
        Integer(5, 75, name = 'num_leaves'),
        Integer(3, 75, name = 'max_depth'),
        # search space, model 2
        Real(0.90, 1, "log-uniform", name = 'subsample'),
        Real(0.90, 1, "log-uniform", name = 'colsample_bytree'),
        Real(1e-14, 0.2, "log-uniform", name = 'reg_alpha'),
        Real(1e-14, 0.2, "log-uniform", name = 'reg_lambda'),
        Real(0.03, 0.3, "log-uniform", name = 'learning_rate'),
        Integer(5, 75, name = 'num_leaves'),
        Integer(3, 75, name = 'max_depth'),
    ]
    res_gp = gp_minimize(score, values, n_calls = n_calls, random_state = 142, n_random_starts = 3)
    n_estimators = N_estimators
    params0 = {
        'subsample': res_gp.x[0],
        'colsample_bytree': res_gp.x[1],
        'reg_alpha': res_gp.x[2],
        'reg_lambda': res_gp.x[3],
        'learning_rate': res_gp.x[4],
        'num_leaves': res_gp.x[5],
        'max_depth': res_gp.x[6],
        'n_estimators': n_estimators,
        #'device': 'gpu',
    }
    params1 = {
        'subsample': res_gp.x[7],
        'colsample_bytree': res_gp.x[8],
        'reg_alpha': res_gp.x[9],
        'reg_lambda': res_gp.x[10],
        'learning_rate': res_gp.x[11],
        'num_leaves': res_gp.x[12],
        'max_depth': res_gp.x[13],
        'n_estimators': n_estimators,
        #'device': 'gpu',
    }
    params2 = {
        'subsample': res_gp.x[14],
        'colsample_bytree': res_gp.x[15],
        'reg_alpha': res_gp.x[16],
        'reg_lambda': res_gp.x[17],
        'learning_rate': res_gp.x[18],
        'num_leaves': res_gp.x[19],
        'max_depth': res_gp.x[20],
        'n_estimators': n_estimators,
        #'device': 'gpu',
    }
    print('\n', 'Best score', res_gp.fun, '\n')
    print(params0, '\n', params1, '\n', params2)
    return params0, params1, params2
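The notebook imports skopt's use_named_args at the top but never uses it; with it, the objective can take named hyperparameters instead of positional indexing into params_temp. A sketch for a single-model variant (the space and setup below are illustrative, not the notebook's):

from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args

space = [
    Real(0.03, 0.3, "log-uniform", name = 'learning_rate'),
    Integer(5, 50, name = 'num_leaves'),
]

@use_named_args(space)
def objective(learning_rate, num_leaves):
    # return the CV MSE of one LightGBM regressor for these hyperparameters
    model = LGBMRegressor(learning_rate = learning_rate, num_leaves = num_leaves)
    return -cross_val_score(model, dtrain[use], dtrain['target'],
                            scoring = 'neg_mean_squared_error', cv = 3).mean()

res = gp_minimize(objective, space, n_calls = 10, random_state = 0)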
In [21]:
seeds = 5
n_splits = 7
N_estimators = 1000
target = 'target'
params0, params1, params2 = get_params(dtrain, target, use, n_splits, 10)
In [22]:
use
Out[22]:
In [23]:
Loss = []
n_estimators = 10000  # note: unused; the tuned params dicts already fix n_estimators
dtest[target] = 0
for seed in range(seeds):
    params0['random_state'] = seed
    clf0 = LGBMRegressor(**params0)
    params1['random_state'] = seed
    clf1 = LGBMRegressor(**params1)
    params2['random_state'] = seed
    clf2 = LGBMRegressor(**params2)
    esr = 100
    skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
    for train_index, test_index in skf.split(dtrain, dtrain[target]):
        X_trg, X_val = dtrain[use].loc[train_index], dtrain[use].loc[test_index]
        y_trg, y_val = dtrain[target].loc[train_index], dtrain[target].loc[test_index]
        clf0.fit(
            X_trg,
            y_trg,
            verbose = False,
            eval_set = (X_val, y_val),
            early_stopping_rounds = esr,
            sample_weight = dtrain['weights'].loc[train_index]
        )
        temp0 = clf0.predict(X_val)
        pred0 = clf0.predict(dtest[use])
        clf1.fit(
            X_trg,
            y_trg,
            verbose = False,
            eval_set = (X_val, y_val),
            early_stopping_rounds = esr,
            sample_weight = dtrain['weights'].loc[train_index]
        )
        temp1 = clf1.predict(X_val)
        pred1 = clf1.predict(dtest[use])
        clf2.fit(
            X_trg,
            y_trg,
            verbose = False,
            eval_set = (X_val, y_val),
            early_stopping_rounds = esr,
            sample_weight = dtrain['weights'].loc[train_index]
        )
        temp2 = clf2.predict(X_val)
        pred2 = clf2.predict(dtest[use])
        pred = (pred0 + pred1 + pred2) / 3
        dtest[target] += pred
        temp = (temp0 + temp1 + temp2) / 3
        sc = mean_squared_error(y_val, temp)
        Loss.append(sc)
    print(np.mean(Loss))
print(np.mean(Loss))
# average the accumulated test predictions over all seed x fold models
dtest[target] = dtest[target] / (seeds * n_splits)
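Each of the seeds × n_splits folds contributes the mean of three model predictions, so with seeds = 5 and n_splits = 7 the test estimate averages 5 × 7 × 3 = 105 fitted models.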
In [24]:
################### RFC: greedy backward feature elimination
np.random.shuffle(use)  # elimination order is random, so each pass can differ
I = 0
remove = []
for u in use:
    Loss = []
    dtest[target] = 0
    for seed in range(seeds):
        if I == 0:
            # first pass scores the full feature set to establish the baseline
            usen = use.copy()
        else:
            usen = use.copy()
            usen.remove(u)
        for f in remove:
            if f in usen:
                usen.remove(f)
        params0['random_state'] = seed
        clf0 = LGBMRegressor(**params0)
        params1['random_state'] = seed
        clf1 = LGBMRegressor(**params1)
        params2['random_state'] = seed
        clf2 = LGBMRegressor(**params2)
        esr = 100
        skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
        for train_index, test_index in skf.split(dtrain, dtrain[target]):
            X_trg, X_val = dtrain[usen].loc[train_index], dtrain[usen].loc[test_index]
            y_trg, y_val = dtrain[target].loc[train_index], dtrain[target].loc[test_index]
            clf0.fit(
                X_trg,
                y_trg,
                verbose = False,
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )
            temp0 = clf0.predict(X_val)
            pred0 = clf0.predict(dtest[usen])
            clf1.fit(
                X_trg,
                y_trg,
                verbose = False,
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )
            temp1 = clf1.predict(X_val)
            pred1 = clf1.predict(dtest[usen])
            clf2.fit(
                X_trg,
                y_trg,
                verbose = False,
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )
            temp2 = clf2.predict(X_val)
            pred2 = clf2.predict(dtest[usen])
            pred = (pred0 + pred1 + pred2) / 3
            dtest[target] += pred
            temp = (temp0 + temp1 + temp2) / 3
            sc = mean_squared_error(y_val, temp)
            Loss.append(sc)
    if I == 0:
        best = np.mean(Loss)
        print('start score', best)
    else:
        # keep the removal only if dropping u improved the mean CV MSE
        if best > np.mean(Loss):
            print(u, best, np.mean(Loss))
            best = np.mean(Loss)
            remove.append(u)
            print(remove)
        else:
            print(u, ' ', best, np.mean(Loss))
    del Loss
    I += 1
################### RFC
print(remove)
for u in remove:
    if u in use:
        use.remove(u)
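This greedy pass is order-dependent (hence the shuffle), so the notebook repeats the identical elimination cell twice more below on the surviving features; each pass re-establishes its baseline and can prune further features.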
In [25]:
################### RFC: second elimination pass (same logic as the cell above)
np.random.shuffle(use)
I = 0
remove = []
for u in use:
    Loss = []
    dtest[target] = 0
    for seed in range(seeds):
        if I == 0:
            usen = use.copy()
        else:
            usen = use.copy()
            usen.remove(u)
        for f in remove:
            if f in usen:
                usen.remove(f)
        params0['random_state'] = seed
        clf0 = LGBMRegressor(**params0)
        params1['random_state'] = seed
        clf1 = LGBMRegressor(**params1)
        params2['random_state'] = seed
        clf2 = LGBMRegressor(**params2)
        esr = 100
        skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
        for train_index, test_index in skf.split(dtrain, dtrain[target]):
            X_trg, X_val = dtrain[usen].loc[train_index], dtrain[usen].loc[test_index]
            y_trg, y_val = dtrain[target].loc[train_index], dtrain[target].loc[test_index]
            clf0.fit(
                X_trg,
                y_trg,
                verbose = False,
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )
            temp0 = clf0.predict(X_val)
            pred0 = clf0.predict(dtest[usen])
            clf1.fit(
                X_trg,
                y_trg,
                verbose = False,
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )
            temp1 = clf1.predict(X_val)
            pred1 = clf1.predict(dtest[usen])
            clf2.fit(
                X_trg,
                y_trg,
                verbose = False,
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )
            temp2 = clf2.predict(X_val)
            pred2 = clf2.predict(dtest[usen])
            pred = (pred0 + pred1 + pred2) / 3
            dtest[target] += pred
            temp = (temp0 + temp1 + temp2) / 3
            sc = mean_squared_error(y_val, temp)
            Loss.append(sc)
    if I == 0:
        best = np.mean(Loss)
        print('start score', best)
    else:
        if best > np.mean(Loss):
            print(u, best, np.mean(Loss))
            best = np.mean(Loss)
            remove.append(u)
            print(remove)
        else:
            print(u, ' ', best, np.mean(Loss))
    del Loss
    I += 1
################### RFC
print(remove)
for u in remove:
    if u in use:
        use.remove(u)
In [26]:
################### RFC: third elimination pass (same logic as the cells above)
np.random.shuffle(use)
I = 0
remove = []
for u in use:
    Loss = []
    dtest[target] = 0
    for seed in range(seeds):
        if I == 0:
            usen = use.copy()
        else:
            usen = use.copy()
            usen.remove(u)
        for f in remove:
            if f in usen:
                usen.remove(f)
        params0['random_state'] = seed
        clf0 = LGBMRegressor(**params0)
        params1['random_state'] = seed
        clf1 = LGBMRegressor(**params1)
        params2['random_state'] = seed
        clf2 = LGBMRegressor(**params2)
        esr = 100
        skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
        for train_index, test_index in skf.split(dtrain, dtrain[target]):
            X_trg, X_val = dtrain[usen].loc[train_index], dtrain[usen].loc[test_index]
            y_trg, y_val = dtrain[target].loc[train_index], dtrain[target].loc[test_index]
            clf0.fit(
                X_trg,
                y_trg,
                verbose = False,
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )
            temp0 = clf0.predict(X_val)
            pred0 = clf0.predict(dtest[usen])
            clf1.fit(
                X_trg,
                y_trg,
                verbose = False,
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )
            temp1 = clf1.predict(X_val)
            pred1 = clf1.predict(dtest[usen])
            clf2.fit(
                X_trg,
                y_trg,
                verbose = False,
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )
            temp2 = clf2.predict(X_val)
            pred2 = clf2.predict(dtest[usen])
            pred = (pred0 + pred1 + pred2) / 3
            dtest[target] += pred
            temp = (temp0 + temp1 + temp2) / 3
            sc = mean_squared_error(y_val, temp)
            Loss.append(sc)
    if I == 0:
        best = np.mean(Loss)
        print('start score', best)
    else:
        if best > np.mean(Loss):
            print(u, best, np.mean(Loss))
            best = np.mean(Loss)
            remove.append(u)
            print(remove)
        else:
            print(u, ' ', best, np.mean(Loss))
    del Loss
    I += 1
################### RFC
print(remove)
for u in remove:
    if u in use:
        use.remove(u)
In [27]:
# re-tune on the reduced feature set (fewer seeds/folds than the first search)
seeds = 3
n_splits = 5
N_estimators = 1000
target = 'target'
params0, params1, params2 = get_params(dtrain, target, use, n_splits, 10)
In [28]:
Loss = []
n_estimators = 10000  # note: unused; the tuned params dicts already fix n_estimators
dtest[target] = 0
for seed in range(seeds):
    params0['random_state'] = seed
    clf0 = LGBMRegressor(**params0)
    params1['random_state'] = seed
    clf1 = LGBMRegressor(**params1)
    params2['random_state'] = seed
    clf2 = LGBMRegressor(**params2)
    esr = 100
    skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
    for train_index, test_index in skf.split(dtrain, dtrain[target]):
        X_trg, X_val = dtrain[use].loc[train_index], dtrain[use].loc[test_index]
        y_trg, y_val = dtrain[target].loc[train_index], dtrain[target].loc[test_index]
        clf0.fit(
            X_trg,
            y_trg,
            verbose = False,
            eval_set = (X_val, y_val),
            early_stopping_rounds = esr,
            sample_weight = dtrain['weights'].loc[train_index]
        )
        temp0 = clf0.predict(X_val)
        pred0 = clf0.predict(dtest[use])
        clf1.fit(
            X_trg,
            y_trg,
            verbose = False,
            eval_set = (X_val, y_val),
            early_stopping_rounds = esr,
            sample_weight = dtrain['weights'].loc[train_index]
        )
        temp1 = clf1.predict(X_val)
        pred1 = clf1.predict(dtest[use])
        clf2.fit(
            X_trg,
            y_trg,
            verbose = False,
            eval_set = (X_val, y_val),
            early_stopping_rounds = esr,
            sample_weight = dtrain['weights'].loc[train_index]
        )
        temp2 = clf2.predict(X_val)
        pred2 = clf2.predict(dtest[use])
        pred = (pred0 + pred1 + pred2) / 3
        dtest[target] += pred
        temp = (temp0 + temp1 + temp2) / 3
        sc = mean_squared_error(y_val, temp)
        Loss.append(sc)
    print(np.mean(Loss))
print(np.mean(Loss))
dtest[target] = dtest[target] / (seeds * n_splits)
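As before, the accumulated test column is divided by seeds × n_splits (here 3 × 5 = 15 fold-averages of three models each); the next cell clips negative counts before writing the submission.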
In [42]:
predictions = dtest[target].values
submission = pd.DataFrame({"label": predictions})
# car counts cannot be negative: clip negative regression outputs to zero
submission.loc[submission['label'] < 0, 'label'] = 0
#submission['label'] = submission['label'] - predictions.min()
submission
Out[42]:
In [43]:
submission['label'].min()
Out[43]:
In [44]:
dtrain.target.hist()
Out[44]:
In [45]:
submission.label.hist()
Out[45]:
In [46]:
!rm -rf assets
!mkdir assets
submission.to_csv(os.path.join("assets", "submission.csv"))
In [47]:
%load_ext aicrowd.magic
%aicrowd login