ADDI Alzheimers Detection Challenge
Detailed Data Analysis & Simple CatBoost - 0.640 on LB
Description of features and the entire dataset, selection of categorical features by logic, CatBoost
I tried to make detailed graphs of the features and their dependencies on the diagnosis. I also made basic summaries for the entire dataset and trained the model. The analysis of the dataset follows the organizers' PDF, so you can easily find the description of each feature there. The LB scores are 0.640 and 0.447.
Setup AIcrowd Utilities 🛠¶
In [ ]:
# Install the aicrowd-cli package, upgrading it if already present (-U), with quiet pip output (-q).
!pip install -q -U aicrowd-cli
In [ ]:
# Load the `aicrowd.magic` IPython extension into this notebook session.
%load_ext aicrowd.magic
AIcrowd Runtime Configuration 🧷¶
In [ ]:
import os

# Dataset and output locations. Each path is read from the named environment
# variable when it is set; otherwise the literal default below is used.
# Prefer absolute paths for the datasets. A path relative to the working
# directory also works, e.g.
# `os.path.join(os.getcwd(), "test_data/validation.csv")`.
AICROWD_TRAIN_DATASET_PATH = os.environ.get("TRAIN_DATASET_PATH", "/ds_shared_drive/train.csv")
AICROWD_DATASET_PATH = os.environ.get("DATASET_PATH", "/ds_shared_drive/validation.csv")
AICROWD_PREDICTIONS_PATH = os.environ.get("PREDICTIONS_PATH", "predictions.csv")
AICROWD_ASSETS_DIR = "assets"
Install packages 🗃¶
In [ ]:
!pip install numpy pandas catboost sklearn
Define preprocessing code¶
Import common packages¶
In [ ]:
import numpy as np
import os
import random
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
Training phase¶
Load training data¶
In [ ]:
# Read the training CSV from the configured path and preview its first rows.
train_data = pd.read_csv(AICROWD_TRAIN_DATASET_PATH)
train_data.head()
Out[ ]:
Features exploration¶
In [ ]:
# Feature-name lists that the exploration cells below append to.
# NOTE(review): regr_features presumably holds the numeric (non-categorical)
# features - confirm against the model-training code.
regr_features = []
# Names of the features treated as categorical.
cat_features = []
Functions¶
In [ ]:
def get_corr(feature):
    """Return the correlation between ``feature`` and the diagnosis.

    Reads the module-level ``train_data`` frame. ``diagnosis`` is converted to
    integer codes with ``pd.factorize``; the codes follow order of first
    appearance, so the sign and magnitude of the result depend on that
    ordering. A non-numeric feature column is integer-encoded the same way
    (missing values become ``-1``) so that ``DataFrame.corr`` can use it.

    Args:
        feature: Name of a column in ``train_data``.

    Returns:
        The off-diagonal entry of the 2x2 correlation matrix of the feature
        and the encoded diagnosis.
    """
    df_corr = train_data[[feature, 'diagnosis']].copy()
    df_corr['diagnosis'] = pd.factorize(df_corr['diagnosis'])[0]
    # Encode by dtype rather than by a single hard-coded column name, so every
    # text feature is handled and not only 'intersection_pos_rel_centre'.
    if not pd.api.types.is_numeric_dtype(df_corr[feature]):
        df_corr[feature] = pd.factorize(df_corr[feature])[0]
    return df_corr.corr().values[0, 1]
In [ ]:
# ANSI escape sequences used by feature_describe to format printed labels:
# `bold` switches bold text on, `ordinary` resets the formatting.
bold = '\033[1m'
ordinary = '\033[0m'
def feature_describe(feature):
    """Print a short text summary of one ``train_data`` column.

    Printed in order: the dtype, the share of missing values as a percentage,
    the correlation with the diagnosis (via ``get_corr``), min/mean/max for
    non-object columns, the number of distinct values reported by
    ``Series.nunique`` (which ignores missing values), and up to five example
    values from ``Series.unique`` (numeric examples rounded to two decimals).

    Args:
        feature: Name of a column in ``train_data``.
    """
    column = train_data[feature]
    is_text = column.dtype == object

    print(bold+'Data type:'+ordinary, column.dtype)
    missing_pct = round(100 * column.isnull().sum() / train_data.shape[0], 2)
    print(bold+'Number of missing values: '+ordinary + str(missing_pct) + '%')
    print(bold+'Correlation with the diagnosis:'+ordinary, round(get_corr(feature), 2))
    if not is_text:
        print(bold+'Min:'+ordinary, round(column.min(), 2))
        print(bold+'Mean:'+ordinary, round(column.mean(), 2))
        print(bold+'Max:'+ordinary, round(column.max(), 2))
    print(bold+'Number of unique values:'+ordinary, column.nunique())

    # Join the examples once instead of special-casing the last element in an
    # index loop; this also ends the line when the column has no values.
    examples = column.unique()[:5]
    if not is_text:
        examples = [np.round(value, 2) for value in examples]
    print(bold+'Example of unique values:'+ordinary,
          ', '.join(str(value) for value in examples))
# Colour names for plotting.
# NOTE(review): not referenced by any function visible in this section.
colors = ['orange', 'green', 'purple', 'deeppink', 'blue']
def show_distribution(feature):
    """Plot per-diagnosis kernel density estimates of one ``train_data`` column.

    Object-dtype columns are integer-encoded with ``pd.factorize`` first and
    the label-to-code mapping is printed (missing values get code ``-1``).
    One filled KDE curve per diagnosis class is drawn on a single figure.

    Args:
        feature: Name of a column in ``train_data``.
    """
    diagnosis_colors = (
        ('normal', 'green'),
        ('pre_alzheimer', 'orange'),
        ('post_alzheimer', 'blue'),
    )
    column = train_data[feature]

    plt.figure(figsize=(8, 4), dpi=80)
    if column.dtype == object:
        # Factorize once: code i belongs to labels[i]. Pairing elements of
        # two unordered sets would print labels next to the wrong codes.
        codes, labels = pd.factorize(column)
        mapping = [f'{label} - {code}' for code, label in enumerate(labels)]
        if (codes == -1).any():
            mapping.append('nan - -1')
        print('Mapping values:', ', '.join(mapping))
        # Keep the frame's index so the boolean masks below align by label
        # and do not rely on a default 0..n-1 index.
        values = pd.Series(codes, index=column.index)
    else:
        values = column

    for diagnosis, color in diagnosis_colors:
        # `fill=` is the current name of the deprecated `shade=` argument.
        sns.kdeplot(values[train_data.diagnosis == diagnosis],
                    label=diagnosis, linewidth=3, fill=True, color=color, alpha=.5)
    plt.xlabel('Value')
    plt.legend()
    plt.title(feature)
    plt.show()
def show_distribution_hist(feature):
    """Plot one probability histogram of ``feature`` per diagnosis class.

    Three side-by-side axes are drawn (normal, pre_alzheimer, post_alzheimer),
    each with 10 bins. The text column ``intersection_pos_rel_centre`` is
    integer-encoded with ``pd.factorize`` first and its label-to-code mapping
    is printed (missing values get code ``-1``).

    Args:
        feature: Name of a column in ``train_data``.
    """
    diagnosis_colors = (
        ('normal', 'green'),
        ('pre_alzheimer', 'orange'),
        ('post_alzheimer', 'blue'),
    )
    # Only this one column is needed, so the full frame is not copied.
    values = train_data[feature]
    if feature == 'intersection_pos_rel_centre':
        # Factorize once: code i belongs to labels[i]. Pairing elements of
        # two unordered sets would print labels next to the wrong codes.
        codes, labels = pd.factorize(values)
        mapping = [f'{label} - {code}' for code, label in enumerate(labels)]
        if (codes == -1).any():
            mapping.append('nan - -1')
        print('Mapping values:', ', '.join(mapping))
        values = pd.Series(codes, index=values.index, name=feature)

    _, axes = plt.subplots(1, 3, figsize=(16, 4), dpi=80)
    for axis, (diagnosis, color) in zip(axes, diagnosis_colors):
        sns.histplot(values[train_data.diagnosis == diagnosis],
                     ax=axis, label=diagnosis, color=color, stat='probability', bins=10)
        axis.legend()
    plt.show()
In [ ]:
def show_corr(features):
    """Draw a heatmap of correlations among ``features`` and the diagnosis.

    Reads the module-level ``train_data`` frame. ``diagnosis`` and any
    non-numeric feature column are integer-encoded with ``pd.factorize``
    before ``DataFrame.corr`` is computed. The upper triangle, including the
    diagonal, is masked out of the heatmap.

    Args:
        features: Iterable of column names in ``train_data``; it is not
            modified.
    """
    columns = [*features, 'diagnosis']
    df_corr = train_data[columns].copy()
    df_corr['diagnosis'] = pd.factorize(df_corr['diagnosis'])[0]
    # Encode by dtype rather than by a single hard-coded column name, so every
    # text feature is handled and not only 'intersection_pos_rel_centre'.
    for column in columns[:-1]:
        if not pd.api.types.is_numeric_dtype(df_corr[column]):
            df_corr[column] = pd.factorize(df_corr[column])[0]

    corr = df_corr.corr()
    mask = np.triu(np.ones_like(corr, dtype=bool))
    plt.figure(figsize=(20, 8))
    sns.heatmap(corr,
                mask=mask,
                cmap=sns.color_palette('dark:salmon_r', as_cmap=True),
                annot=True,
                center=0,
                linewidths=.5, cbar_kws={'shrink': .5})
    plt.show()
Clock and Digit Features¶
In [ ]:
# Names of the clock and digit features, appended to by the cells below.
clock_features = []
Final Rotation Angle¶
In [ ]:
# Summarize and plot final_rotation_angle, then record it in clock_features
# and regr_features.
feature = 'final_rotation_angle'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
regr_features.append(feature)
Number of Digits¶
In [ ]:
# Summarize and plot number_of_digits, then record it in clock_features and
# regr_features.
feature = 'number_of_digits'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
regr_features.append(feature)
Missing Digit Dummy Variables¶
In [ ]:
# Only the first dummy column is summarized and plotted; the other eleven
# (missing_digit_2 ... missing_digit_12) follow the same naming scheme.
feature = 'missing_digit_1'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)

# Record all twelve dummy columns as categorical clock features.
for i in range(1, 13):
    feature = f'missing_digit_{i}'
    cat_features.append(feature)
    clock_features.append(feature)
Deviation of Axis Digits (3, 6, 9 and 12) from Mid Axes¶
In [ ]:
# Summarize and plot deviation_dist_from_mid_axis, then record it in
# clock_features and regr_features.
feature = 'deviation_dist_from_mid_axis'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
regr_features.append(feature)
Between Axis Digits Angle Metrics¶
In [ ]:
# Summarize and plot between_axis_digits_angle_sum, then record it in
# clock_features and regr_features.
feature = 'between_axis_digits_angle_sum'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
regr_features.append(feature)
In [ ]:
# Summarize and plot between_axis_digits_angle_var, then record it in
# clock_features and regr_features.
feature = 'between_axis_digits_angle_var'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
regr_features.append(feature)
Between Digits Angle Metrics¶
In [ ]:
# Summarize and plot between_digits_angle_cw_sum, then record it in
# clock_features and regr_features.
feature = 'between_digits_angle_cw_sum'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
regr_features.append(feature)
In [ ]:
# Summarize and plot between_digits_angle_cw_var, then record it in
# clock_features and regr_features.
feature = 'between_digits_angle_cw_var'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
regr_features.append(feature)
In [ ]:
# Summarize and plot between_digits_angle_ccw_sum, then record it in
# clock_features and regr_features.
feature = 'between_digits_angle_ccw_sum'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
regr_features.append(feature)
In [ ]:
# Summarize and plot between_digits_angle_ccw_var, then record it in
# clock_features and regr_features.
feature = 'between_digits_angle_ccw_var'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
regr_features.append(feature)
Sequence Flag Clock Wise and Counter Clock Wise¶
In [ ]:
# Summarize and plot sequence_flag_cw, then record it in clock_features and
# in cat_features (it is treated as categorical).
feature = 'sequence_flag_cw'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
cat_features.append(feature)
In [ ]:
# Summarize and plot sequence_flag_ccw, then record it in clock_features and
# in cat_features (it is treated as categorical).
feature = 'sequence_flag_ccw'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
cat_features.append(feature)
Correlation of clock and digits features¶
In [ ]:
# Heatmap of the correlations among the collected clock features and the diagnosis.
show_corr(clock_features)
Hand Features¶
In [ ]:
# Names of the hand features, appended to by the cells below.
hand_features = []
Number of Hands¶
In [ ]:
# Summarize and plot number_of_hands, then record it in hand_features and
# regr_features.
feature = 'number_of_hands'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
hand_features.append(feature)
regr_features.append(feature)
In [ ]:
# Summarize and plot hand_count_dummy, then record it in hand_features and in
# cat_features (it is treated as categorical).
feature = 'hand_count_dummy'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
hand_features.append(feature)
cat_features.append(feature)
Hand Length¶
In [ ]:
# Summarize and plot hour_hand_length, then record it in hand_features and
# regr_features.
feature = 'hour_hand_length'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
hand_features.append(feature)
regr_features.append(feature)