Lidar Car Detection
Solution for submission 155379
A detailed solution for submission 155379 to the Lidar Car Detection challenge.
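In outline: each lidar point cloud is flattened into a tabular row, constant columns are dropped, the features are compressed with TruncatedSVD, weak features are pruned by permutation importance and by greedy backward elimination, a train-vs-test classifier supplies covariate-shift sample weights, and three LightGBM regressors tuned jointly with skopt's gp_minimize are averaged across seeds and folds to produce the car-count predictions.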
In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy
import gc
import os
from glob import glob
import pickle
import random
import shutil
import seaborn as sns
from collections import Counter
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold, cross_val_score, train_test_split
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.metrics import roc_auc_score, f1_score, log_loss, accuracy_score, matthews_corrcoef
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
#import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args
from skopt import gp_minimize
from sklearn.metrics import classification_report
from sklearn.linear_model import LinearRegression, LogisticRegression
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)
warnings.simplefilter(action = 'ignore', category = DeprecationWarning)
warnings.simplefilter(action = 'ignore', category = UserWarning)
warnings.simplefilter(action = 'ignore', category = RuntimeWarning)
warnings.filterwarnings("ignore", message = "numpy.dtype size changed")
warnings.filterwarnings("ignore", message = "numpy.ufunc size changed")
pd.options.mode.chained_assignment = None
In [2]:
train_data = np.load("./data/train.npz", allow_pickle=True)
train_data = train_data['train']
train_data.shape
Out[2]:
In [3]:
# NOTE: only the first 100 training scenes are used; each point cloud is flattened into one feature vector
X = train_data[:100, 0]
dtrain = [i.flatten() for i in X]
target = train_data[:100, 1]
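Flattening throws away the point cloud's spatial structure: every coordinate becomes an independent tabular column, so the dimensionality-reduction and feature-selection stages below do most of the heavy lifting.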
In [4]:
test_data = np.load("./data/test.npz", allow_pickle=True)
test_data = test_data['test']
test_data.shape
Out[4]:
In [5]:
dtest = [i.flatten() for i in test_data]
In [6]:
dtrain = np.array(dtrain)
dtest = np.array(dtest)
In [7]:
use = ['x_' + str(f) for f in range(dtrain.shape[1])]
dtrain = pd.DataFrame(dtrain, columns = use)
dtrain['target'] = target
dtest = pd.DataFrame(dtest, columns = use)
In [8]:
dtrain.shape, dtest.shape
Out[8]:
In [9]:
dtrain.head()
Out[9]:
In [10]:
# remove constant columns
remove = []
for c in dtrain.columns:
    if dtrain[c].std() == 0:
        remove.append(c)
print('remove constant columns', remove)
dtrain.drop(remove, axis = 1, inplace = True)
dtest.drop(remove, axis = 1, inplace = True)
use = list(dtest.columns)
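For reference, the same zero-variance filter can be written with scikit-learn's VarianceThreshold. A minimal sketch (not what the notebook ran), applied to the feature frame:

from sklearn.feature_selection import VarianceThreshold

# threshold = 0.0 drops exactly the columns whose training variance is zero,
# equivalent to the std() == 0 loop above
feats = [c for c in dtrain.columns if c != 'target']
vt = VarianceThreshold(threshold = 0.0)
vt.fit(dtrain[feats])
kept = [c for c, keep in zip(feats, vt.get_support()) if keep]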
In [11]:
print(dtrain.shape, dtest.shape)
In [12]:
# drop features flagged by an earlier permutation-importance run (saved as imp_df-0.csv)
temp = pd.read_csv('imp_df-0.csv')
remove = temp['feat'][temp['diff'] >= 0]
for u in remove:
    if u in use:
        use.remove(u)
len(use)
Out[12]:
In [13]:
# same filter from a second earlier run (imp_df-1.csv)
temp = pd.read_csv('imp_df-1.csv')
remove = temp['feat'][temp['diff'] >= 0]
for u in remove:
    if u in use:
        use.remove(u)
len(use)
Out[13]:
In [14]:
target = dtrain['target']
# project the surviving features onto 45 SVD components
svd = TruncatedSVD(n_components = 45, n_iter = 15, random_state = 42)
svd.fit(dtrain[use].values)
print(svd.explained_variance_ratio_.sum())
xtrain = svd.transform(dtrain[use].values)
xtest = svd.transform(dtest[use].values)
dtrain = pd.DataFrame(xtrain)
dtest = pd.DataFrame(xtest)
xuse = ['x_' + str(f) for f in range(dtrain.shape[1])]
dtrain.columns = xuse
dtrain['target'] = target
dtest.columns = xuse
use = xuse
dtrain.head()
Out[14]:
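45 components is a fixed choice; the printed sum of explained_variance_ratio_ shows how much signal survives. A hypothetical helper for picking the component count from a variance target instead (the threshold and cap below are assumptions, not values from the notebook):

def pick_n_components(X, threshold = 0.99, max_components = 200):
    # fit a generously sized SVD once, then find the smallest k reaching the target
    svd_full = TruncatedSVD(n_components = min(max_components, X.shape[1] - 1), random_state = 42)
    svd_full.fit(X)
    ratios = np.cumsum(svd_full.explained_variance_ratio_)
    return int(min(np.searchsorted(ratios, threshold) + 1, len(ratios)))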
In [15]:
################### permutation importance
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
target = 'target'
dtrain.reset_index(drop = True, inplace = True)

def pim(xtrain, xuse, target):
    models = []
    Loss = []
    seeds = 3
    n_splits = 5
    imp_df = pd.DataFrame(columns = ['feat', 'imp', 'imp_permuted'])
    # 1) fit one model per seed/fold and record the baseline validation MSE
    for seed in range(seeds):
        skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
        for train_index, test_index in skf.split(xtrain, xtrain[target]):
            X_train, X_val = xtrain[xuse].loc[train_index], xtrain[xuse].loc[test_index]
            y_train, y_val = xtrain[target].loc[train_index], xtrain[target].loc[test_index]
            model = LGBMRegressor(
                max_depth = 3 + seed * 2,
                random_state = seed,
                n_estimators = 1000,
                #device = 'gpu',
                subsample = 0.98,
                subsample_freq = 5,
                colsample_bytree = 0.98,
                reg_alpha = 0.01,
                reg_lambda = 0.1
            )
            model.fit(
                X_train,
                y_train.values,
                eval_set = (X_val, y_val),
                early_stopping_rounds = 100,
                verbose = False,
            )
            temp = model.predict(X_val)
            sc = mean_squared_error(y_val, temp)
            Loss.append(sc)
            models.append(model)
    sc = np.mean(Loss)
    # 2) shuffle one feature at a time on the validation folds and re-score
    for u in xuse:
        N = 0
        Loss_permuted = []
        for seed in range(seeds):
            skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
            for train_index, test_index in skf.split(xtrain, xtrain[target]):
                X_train, X_val = xtrain[xuse].loc[train_index], xtrain[xuse].loc[test_index]
                y_train, y_val = xtrain[target].loc[train_index], xtrain[target].loc[test_index]
                X_val_permuted = X_val.copy()
                temp = X_val_permuted[u].values
                np.random.shuffle(temp)
                X_val_permuted[u] = temp
                temp = models[N].predict(X_val_permuted.values)
                sc_permuted = mean_squared_error(y_val, temp)
                Loss_permuted.append(sc_permuted)
                N += 1
        sc_permuted = np.mean(Loss_permuted)
        # if permuting the feature does not hurt the score, it carries no signal
        if sc_permuted <= sc:
            print(u, sc, sc_permuted, 'need delete')
        else:
            print(u, sc, sc_permuted)
        if u in imp_df['feat'].unique():
            imp_df.loc[imp_df['feat'] == u, 'imp'] += sc
            imp_df.loc[imp_df['feat'] == u, 'imp_permuted'] += sc_permuted
        else:
            L = imp_df.shape[0]
            imp_df.loc[L, 'feat'] = u
            imp_df.loc[imp_df['feat'] == u, 'imp'] = sc
            imp_df.loc[imp_df['feat'] == u, 'imp_permuted'] = sc_permuted
        # checkpoint the running importance table after every feature
        imp_df['diff'] = imp_df['imp'] - imp_df['imp_permuted']
        imp_df.sort_values(by = ['diff'], inplace = True)
        imp_df.to_csv('imp_df.csv', index = False)
    imp_df[['imp', 'imp_permuted']] = imp_df[['imp', 'imp_permuted']] / (seeds * n_splits)
    ################### permutation importance
    imp_df['diff'] = imp_df['imp'] - imp_df['imp_permuted']
    imp_df.sort_values(by = ['diff'], inplace = True)
    remove = imp_df['feat'][imp_df['diff'] >= 0].values
    print(remove)
    for u in remove:
        if u in xuse:
            xuse.remove(u)
    return xuse

target = 'target'
dtrain.reset_index(drop = True, inplace = True)
dtrain[target] = dtrain[target].astype(int)
use = pim(dtrain, use, target)
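The hand-rolled loop above keeps the per-fold models so each feature is scored on genuinely held-out rows. scikit-learn ships an equivalent utility; a sketch for one fitted model and one validation fold (model, X_val, y_val as produced inside pim):

from sklearn.inspection import permutation_importance

result = permutation_importance(
    model, X_val, y_val,
    scoring = 'neg_mean_squared_error',
    n_repeats = 10, random_state = 0,
)
# mean importance <= 0 means shuffling the feature did not hurt validation MSE
drop = [f for f, imp in zip(X_val.columns, result.importances_mean) if imp <= 0]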
In [16]:
use
Out[16]:
In [17]:
target = dtrain['target'].values
dtrain = dtrain[use]
dtest = dtest[use]
dtrain['target'] = target
dtrain['target'] = dtrain['target'].astype(int)
gc.collect()
Out[17]:
In [18]:
dtrain.head()
Out[18]:
In [19]:
# covariate-shift adaptation: learn weights that make train rows look like test rows
if 'is_train' in dtrain.columns:
    del dtrain['is_train']
if 'is_train' in dtest.columns:
    del dtest['is_train']
if 'is_train' in use:
    use.remove('is_train')
len_X = dtrain.shape[0]
dtrain['is_train'] = 1
dtest['is_train'] = 0
use.append('is_train')
df = pd.concat([dtrain[use], dtest[use]], axis = 0, ignore_index = True, sort = False)
target = df['is_train']
del dtrain['is_train'], dtest['is_train']
df['pred'] = 0
AUC = []
use.remove('is_train')
seeds = 3
n_splits = 5
for seed in range(seeds):
    skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
    model = LGBMClassifier(
        max_depth = 3 + seed * 2,
        random_state = seed,
        n_estimators = 1000,
        #device = 'gpu',
        subsample = 0.98,
        subsample_freq = 5,
        colsample_bytree = 0.98,
        reg_alpha = 0.01,
        reg_lambda = 0.1
    )
    for train_index, test_index in skf.split(df, df['is_train']):
        X_train, X_test = df.loc[train_index], df.loc[test_index]
        y_train, y_test = df['is_train'].loc[train_index], df['is_train'].loc[test_index]
        model.fit(
            X_train[use],
            y_train,
            eval_set = (X_test[use], y_test),
            verbose = False,
            early_stopping_rounds = 100,
        )
        # column 0 is P(is_train == 0), i.e. how test-like the row looks
        temp = model.predict_proba(X_test[use])[:, 0]
        df.loc[test_index, 'pred'] += temp
        temp = model.predict_proba(X_test[use])[:, 1]
        sc = roc_auc_score(y_test, temp)
        print('roc_auc_score', sc)
        AUC.append(sc)
# the first len_X rows of df are the training rows; use their averaged
# test-likeness as sample weights, normalized to mean 1
weights = df['pred'][:len_X]
weights /= np.mean(weights)
print(weights.min(), weights.max())
dtrain['weights'] = weights
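The weights above use the classifier's test-probability directly. The textbook covariate-shift weight is the density ratio w(x) = p_test(x) / p_train(x), which a probabilistic classifier gives as P(test|x) / P(train|x) up to a constant. A minimal sketch of that variant, assuming df, len_X, and the accumulated pred column from above (the epsilon guard is an assumption, not something the notebook uses):

# df['pred'] holds P(test | x) summed over `seeds` passes; rescale to [0, 1]
p_test = df['pred'][:len_X].values / seeds
eps = 1e-6  # hypothetical guard against division by zero
ratio = p_test / np.clip(1.0 - p_test, eps, None)  # P(test|x) / P(train|x)
ratio /= ratio.mean()  # normalize to mean 1, as in the notebook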
In [20]:
def get_params(train, target, use, n_splits, n_calls = 5, verbose = False):
    # objective for gp_minimize: mean CV MSE of a blend of three LightGBM
    # regressors, tuned jointly over 7 hyperparameters each (21 dimensions)
    def score(params_temp):
        Loss = []
        n_estimators = N_estimators
        params0 = {
            'subsample': params_temp[0],
            'colsample_bytree': params_temp[1],
            'reg_alpha': params_temp[2],
            'reg_lambda': params_temp[3],
            'learning_rate': params_temp[4],
            'num_leaves': params_temp[5],
            'max_depth': params_temp[6],
            'n_estimators': n_estimators,
            #'device': 'gpu',
        }
        params1 = {
            'subsample': params_temp[7],
            'colsample_bytree': params_temp[8],
            'reg_alpha': params_temp[9],
            'reg_lambda': params_temp[10],
            'learning_rate': params_temp[11],
            'num_leaves': params_temp[12],
            'max_depth': params_temp[13],
            'n_estimators': n_estimators,
            #'device': 'gpu',
        }
        params2 = {
            'subsample': params_temp[14],
            'colsample_bytree': params_temp[15],
            'reg_alpha': params_temp[16],
            'reg_lambda': params_temp[17],
            'learning_rate': params_temp[18],
            'num_leaves': params_temp[19],
            'max_depth': params_temp[20],
            'n_estimators': n_estimators,
            #'device': 'gpu',
        }
        for seed in range(seeds):
            params0['random_state'] = seed
            clf0 = LGBMRegressor(**params0)
            params1['random_state'] = seed
            clf1 = LGBMRegressor(**params1)
            params2['random_state'] = seed
            clf2 = LGBMRegressor(**params2)
            esr = 100
            skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
            for train_index, test_index in skf.split(train, train[target]):
                X_trg, X_val = train[use].loc[train_index], train[use].loc[test_index]
                y_trg, y_val = train[target].loc[train_index], train[target].loc[test_index]
                clf0.fit(
                    X_trg,
                    y_trg,
                    verbose = False,
                    eval_set = (X_val, y_val),
                    early_stopping_rounds = esr,
                    sample_weight = dtrain['weights'].loc[train_index]
                )
                temp0 = clf0.predict(X_val)
                clf1.fit(
                    X_trg,
                    y_trg,
                    verbose = False,
                    eval_set = (X_val, y_val),
                    early_stopping_rounds = esr,
                    sample_weight = dtrain['weights'].loc[train_index]
                )
                temp1 = clf1.predict(X_val)
                clf2.fit(
                    X_trg,
                    y_trg,
                    verbose = False,
                    eval_set = (X_val, y_val),
                    early_stopping_rounds = esr,
                    sample_weight = dtrain['weights'].loc[train_index]
                )
                temp2 = clf2.predict(X_val)
                temp = (temp0 + temp1 + temp2) / 3
                sc = mean_squared_error(y_val, temp)
                Loss.append(sc)
        Loss = np.mean(Loss)
        L = df_res.shape[0] + 1
        df_res.loc[L, 'rmse'] = Loss  # note: the stored value is MSE despite the column name
        best = df_res['rmse'].min()
        print('rmse...', Loss, 'Best...', best, 'Iter ', L)
        return Loss

    df_res = pd.DataFrame()
    values = [
        # search space, model 0
        Real(0.90, 1, "log-uniform", name = 'subsample'),
        Real(0.90, 1, "log-uniform", name = 'colsample_bytree'),
        Real(1e-14, 0.2, "log-uniform", name = 'reg_alpha'),
        Real(1e-14, 0.2, "log-uniform", name = 'reg_lambda'),
        Real(0.03, 0.3, "log-uniform", name = 'learning_rate'),
        Integer(5, 50, name = 'num_leaves'),
        Integer(3, 50, name = 'max_depth'),
        # search space, model 1 (wider tree ranges for blend diversity)
        Real(0.90, 1, "log-uniform", name = 'subsample'),
        Real(0.90, 1, "log-uniform", name = 'colsample_bytree'),
        Real(1e-14, 0.2, "log-uniform", name = 'reg_alpha'),
        Real(1e-14, 0.2, "log-uniform", name = 'reg_lambda'),
        Real(0.03, 0.3, "log-uniform", name = 'learning_rate'),
        Integer(5, 75, name = 'num_leaves'),
        Integer(3, 75, name = 'max_depth'),
        # search space, model 2
        Real(0.90, 1, "log-uniform", name = 'subsample'),
        Real(0.90, 1, "log-uniform", name = 'colsample_bytree'),
        Real(1e-14, 0.2, "log-uniform", name = 'reg_alpha'),
        Real(1e-14, 0.2, "log-uniform", name = 'reg_lambda'),
        Real(0.03, 0.3, "log-uniform", name = 'learning_rate'),
        Integer(5, 75, name = 'num_leaves'),
        Integer(3, 75, name = 'max_depth'),
    ]
    res_gp = gp_minimize(score, values, n_calls = n_calls, random_state = 142, n_random_starts = 3)
    n_estimators = N_estimators
    params0 = {
        'subsample': res_gp.x[0],
        'colsample_bytree': res_gp.x[1],
        'reg_alpha': res_gp.x[2],
        'reg_lambda': res_gp.x[3],
        'learning_rate': res_gp.x[4],
        'num_leaves': res_gp.x[5],
        'max_depth': res_gp.x[6],
        'n_estimators': n_estimators,
        #'device': 'gpu',
    }
    params1 = {
        'subsample': res_gp.x[7],
        'colsample_bytree': res_gp.x[8],
        'reg_alpha': res_gp.x[9],
        'reg_lambda': res_gp.x[10],
        'learning_rate': res_gp.x[11],
        'num_leaves': res_gp.x[12],
        'max_depth': res_gp.x[13],
        'n_estimators': n_estimators,
        #'device': 'gpu',
    }
    params2 = {
        'subsample': res_gp.x[14],
        'colsample_bytree': res_gp.x[15],
        'reg_alpha': res_gp.x[16],
        'reg_lambda': res_gp.x[17],
        'learning_rate': res_gp.x[18],
        'num_leaves': res_gp.x[19],
        'max_depth': res_gp.x[20],
        'n_estimators': n_estimators,
        #'device': 'gpu',
    }
    print('\n', 'Best score', res_gp.fun, '\n')
    print(params0, '\n', params1, '\n', params2)
    return params0, params1, params2
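The notebook imports skopt's use_named_args at the top but never uses it; with it, the objective can take named hyperparameters instead of positional indexing into params_temp. A sketch for a single-model variant (the space and setup below are illustrative, not the notebook's):

from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args

space = [
    Real(0.03, 0.3, "log-uniform", name = 'learning_rate'),
    Integer(5, 50, name = 'num_leaves'),
]

@use_named_args(space)
def objective(learning_rate, num_leaves):
    # return the CV MSE of one LightGBM regressor for these hyperparameters
    model = LGBMRegressor(learning_rate = learning_rate, num_leaves = num_leaves)
    return -cross_val_score(model, dtrain[use], dtrain['target'],
                            scoring = 'neg_mean_squared_error', cv = 3).mean()

res = gp_minimize(objective, space, n_calls = 10, random_state = 0)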
In [21]:
seeds = 5
n_splits = 7
N_estimators = 1000
target = 'target'
params0, params1, params2 = get_params(dtrain, target, use, n_splits, 10)
In [22]:
use
Out[22]:
In [23]:
Loss = []
n_estimators = 10000  # note: unused; the tuned params dicts already fix n_estimators
dtest[target] = 0
for seed in range(seeds):
    params0['random_state'] = seed
    clf0 = LGBMRegressor(**params0)
    params1['random_state'] = seed
    clf1 = LGBMRegressor(**params1)
    params2['random_state'] = seed
    clf2 = LGBMRegressor(**params2)
    esr = 100
    skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
    for train_index, test_index in skf.split(dtrain, dtrain[target]):
        X_trg, X_val = dtrain[use].loc[train_index], dtrain[use].loc[test_index]
        y_trg, y_val = dtrain[target].loc[train_index], dtrain[target].loc[test_index]
        clf0.fit(
            X_trg,
            y_trg,
            verbose = False,
            eval_set = (X_val, y_val),
            early_stopping_rounds = esr,
            sample_weight = dtrain['weights'].loc[train_index]
        )
        temp0 = clf0.predict(X_val)
        pred0 = clf0.predict(dtest[use])
        clf1.fit(
            X_trg,
            y_trg,
            verbose = False,
            eval_set = (X_val, y_val),
            early_stopping_rounds = esr,
            sample_weight = dtrain['weights'].loc[train_index]
        )
        temp1 = clf1.predict(X_val)
        pred1 = clf1.predict(dtest[use])
        clf2.fit(
            X_trg,
            y_trg,
            verbose = False,
            eval_set = (X_val, y_val),
            early_stopping_rounds = esr,
            sample_weight = dtrain['weights'].loc[train_index]
        )
        temp2 = clf2.predict(X_val)
        pred2 = clf2.predict(dtest[use])
        pred = (pred0 + pred1 + pred2) / 3
        dtest[target] += pred
        temp = (temp0 + temp1 + temp2) / 3
        sc = mean_squared_error(y_val, temp)
        Loss.append(sc)
    print(np.mean(Loss))
print(np.mean(Loss))
# average the accumulated test predictions over all seed x fold models
dtest[target] = dtest[target] / (seeds * n_splits)
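Each of the seeds × n_splits folds contributes the mean of three model predictions, so with seeds = 5 and n_splits = 7 the test estimate averages 5 × 7 × 3 = 105 fitted models.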
In [24]:
################### RFC: greedy backward feature elimination
np.random.shuffle(use)  # elimination order is random, so each pass can differ
I = 0
remove = []
for u in use:
    Loss = []
    dtest[target] = 0
    for seed in range(seeds):
        if I == 0:
            # first pass scores the full feature set to establish the baseline
            usen = use.copy()
        else:
            usen = use.copy()
            usen.remove(u)
        for f in remove:
            if f in usen:
                usen.remove(f)
        params0['random_state'] = seed
        clf0 = LGBMRegressor(**params0)
        params1['random_state'] = seed
        clf1 = LGBMRegressor(**params1)
        params2['random_state'] = seed
        clf2 = LGBMRegressor(**params2)
        esr = 100
        skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
        for train_index, test_index in skf.split(dtrain, dtrain[target]):
            X_trg, X_val = dtrain[usen].loc[train_index], dtrain[usen].loc[test_index]
            y_trg, y_val = dtrain[target].loc[train_index], dtrain[target].loc[test_index]
            clf0.fit(
                X_trg,
                y_trg,
                verbose = False,
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )
            temp0 = clf0.predict(X_val)
            pred0 = clf0.predict(dtest[usen])
            clf1.fit(
                X_trg,
                y_trg,
                verbose = False,
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )
            temp1 = clf1.predict(X_val)
            pred1 = clf1.predict(dtest[usen])
            clf2.fit(
                X_trg,
                y_trg,
                verbose = False,
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )
            temp2 = clf2.predict(X_val)
            pred2 = clf2.predict(dtest[usen])
            pred = (pred0 + pred1 + pred2) / 3
            dtest[target] += pred
            temp = (temp0 + temp1 + temp2) / 3
            sc = mean_squared_error(y_val, temp)
            Loss.append(sc)
    if I == 0:
        best = np.mean(Loss)
        print('start score', best)
    else:
        # keep the removal only if dropping u improved the mean CV MSE
        if best > np.mean(Loss):
            print(u, best, np.mean(Loss))
            best = np.mean(Loss)
            remove.append(u)
            print(remove)
        else:
            print(u, ' ', best, np.mean(Loss))
    del Loss
    I += 1
################### RFC
print(remove)
for u in remove:
    if u in use:
        use.remove(u)
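This greedy pass is order-dependent (hence the shuffle), so the notebook repeats the identical elimination cell twice more below on the surviving features; each pass re-establishes its baseline and can prune further features.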
In [25]:
################### RFC: second elimination pass (same logic as the cell above)
np.random.shuffle(use)
I = 0
remove = []
for u in use:
    Loss = []
    dtest[target] = 0
    for seed in range(seeds):
        if I == 0:
            usen = use.copy()
        else:
            usen = use.copy()
            usen.remove(u)
        for f in remove:
            if f in usen:
                usen.remove(f)
        params0['random_state'] = seed
        clf0 = LGBMRegressor(**params0)
        params1['random_state'] = seed
        clf1 = LGBMRegressor(**params1)
        params2['random_state'] = seed
        clf2 = LGBMRegressor(**params2)
        esr = 100
        skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
        for train_index, test_index in skf.split(dtrain, dtrain[target]):
            X_trg, X_val = dtrain[usen].loc[train_index], dtrain[usen].loc[test_index]
            y_trg, y_val = dtrain[target].loc[train_index], dtrain[target].loc[test_index]
            clf0.fit(
                X_trg,
                y_trg,
                verbose = False,
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )
            temp0 = clf0.predict(X_val)
            pred0 = clf0.predict(dtest[usen])
            clf1.fit(
                X_trg,
                y_trg,
                verbose = False,
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )
            temp1 = clf1.predict(X_val)
            pred1 = clf1.predict(dtest[usen])
            clf2.fit(
                X_trg,
                y_trg,
                verbose = False,
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )
            temp2 = clf2.predict(X_val)
            pred2 = clf2.predict(dtest[usen])
            pred = (pred0 + pred1 + pred2) / 3
            dtest[target] += pred
            temp = (temp0 + temp1 + temp2) / 3
            sc = mean_squared_error(y_val, temp)
            Loss.append(sc)
    if I == 0:
        best = np.mean(Loss)
        print('start score', best)
    else:
        if best > np.mean(Loss):
            print(u, best, np.mean(Loss))
            best = np.mean(Loss)
            remove.append(u)
            print(remove)
        else:
            print(u, ' ', best, np.mean(Loss))
    del Loss
    I += 1
################### RFC
print(remove)
for u in remove:
    if u in use:
        use.remove(u)
In [26]:
################### RFC: third elimination pass (same logic as the cells above)
np.random.shuffle(use)
I = 0
remove = []
for u in use:
    Loss = []
    dtest[target] = 0
    for seed in range(seeds):
        if I == 0:
            usen = use.copy()
        else:
            usen = use.copy()
            usen.remove(u)
        for f in remove:
            if f in usen:
                usen.remove(f)
        params0['random_state'] = seed
        clf0 = LGBMRegressor(**params0)
        params1['random_state'] = seed
        clf1 = LGBMRegressor(**params1)
        params2['random_state'] = seed
        clf2 = LGBMRegressor(**params2)
        esr = 100
        skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
        for train_index, test_index in skf.split(dtrain, dtrain[target]):
            X_trg, X_val = dtrain[usen].loc[train_index], dtrain[usen].loc[test_index]
            y_trg, y_val = dtrain[target].loc[train_index], dtrain[target].loc[test_index]
            clf0.fit(
                X_trg,
                y_trg,
                verbose = False,
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )
            temp0 = clf0.predict(X_val)
            pred0 = clf0.predict(dtest[usen])
            clf1.fit(
                X_trg,
                y_trg,
                verbose = False,
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )
            temp1 = clf1.predict(X_val)
            pred1 = clf1.predict(dtest[usen])
            clf2.fit(
                X_trg,
                y_trg,
                verbose = False,
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )
            temp2 = clf2.predict(X_val)
            pred2 = clf2.predict(dtest[usen])
            pred = (pred0 + pred1 + pred2) / 3
            dtest[target] += pred
            temp = (temp0 + temp1 + temp2) / 3
            sc = mean_squared_error(y_val, temp)
            Loss.append(sc)
    if I == 0:
        best = np.mean(Loss)
        print('start score', best)
    else:
        if best > np.mean(Loss):
            print(u, best, np.mean(Loss))
            best = np.mean(Loss)
            remove.append(u)
            print(remove)
        else:
            print(u, ' ', best, np.mean(Loss))
    del Loss
    I += 1
################### RFC
print(remove)
for u in remove:
    if u in use:
        use.remove(u)
In [27]:
# re-tune on the reduced feature set (fewer seeds/folds than the first search)
seeds = 3
n_splits = 5
N_estimators = 1000
target = 'target'
params0, params1, params2 = get_params(dtrain, target, use, n_splits, 10)
In [28]:
Loss = []
n_estimators = 10000  # note: unused; the tuned params dicts already fix n_estimators
dtest[target] = 0
for seed in range(seeds):
    params0['random_state'] = seed
    clf0 = LGBMRegressor(**params0)
    params1['random_state'] = seed
    clf1 = LGBMRegressor(**params1)
    params2['random_state'] = seed
    clf2 = LGBMRegressor(**params2)
    esr = 100
    skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
    for train_index, test_index in skf.split(dtrain, dtrain[target]):
        X_trg, X_val = dtrain[use].loc[train_index], dtrain[use].loc[test_index]
        y_trg, y_val = dtrain[target].loc[train_index], dtrain[target].loc[test_index]
        clf0.fit(
            X_trg,
            y_trg,
            verbose = False,
            eval_set = (X_val, y_val),
            early_stopping_rounds = esr,
            sample_weight = dtrain['weights'].loc[train_index]
        )
        temp0 = clf0.predict(X_val)
        pred0 = clf0.predict(dtest[use])
        clf1.fit(
            X_trg,
            y_trg,
            verbose = False,
            eval_set = (X_val, y_val),
            early_stopping_rounds = esr,
            sample_weight = dtrain['weights'].loc[train_index]
        )
        temp1 = clf1.predict(X_val)
        pred1 = clf1.predict(dtest[use])
        clf2.fit(
            X_trg,
            y_trg,
            verbose = False,
            eval_set = (X_val, y_val),
            early_stopping_rounds = esr,
            sample_weight = dtrain['weights'].loc[train_index]
        )
        temp2 = clf2.predict(X_val)
        pred2 = clf2.predict(dtest[use])
        pred = (pred0 + pred1 + pred2) / 3
        dtest[target] += pred
        temp = (temp0 + temp1 + temp2) / 3
        sc = mean_squared_error(y_val, temp)
        Loss.append(sc)
    print(np.mean(Loss))
print(np.mean(Loss))
dtest[target] = dtest[target] / (seeds * n_splits)
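As before, the accumulated test column is divided by seeds × n_splits (here 3 × 5 = 15 fold-averages of three models each); the next cell clips negative counts before writing the submission.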
In [42]:
predictions = dtest[target].values
submission = pd.DataFrame({"label": predictions})
# car counts cannot be negative: clip negative regression outputs to zero
submission.loc[submission['label'] < 0, 'label'] = 0
#submission['label'] = submission['label'] - predictions.min()
submission
Out[42]:
In [43]:
submission['label'].min()
Out[43]:
In [44]:
dtrain.target.hist()
Out[44]:
In [45]:
submission.label.hist()
Out[45]:
In [46]:
!rm -rf assets
!mkdir assets
submission.to_csv(os.path.join("assets", "submission.csv"))
In [47]:
%load_ext aicrowd.magic
%aicrowd login