ADDI Alzheimers Detection Challenge
Detailed Data Analysis & Simple CatBoost - 0.640 on LB
Description of features and the entire dataset, selection of categorical features by logic, CatBoost
I plotted detailed graphs of the features and how they relate to the diagnosis, made basic summaries for the entire dataset, and trained a simple model. The analysis follows the organizers' PDF, so you can easily look up the description of any feature there. The LB scores are 0.640 and 0.447.
Setup AIcrowd Utilities 🛠¶
In [ ]:
!pip install -q -U aicrowd-cli
In [ ]:
%load_ext aicrowd.magic
AIcrowd Runtime Configuration 🧷¶
In [ ]:
import os
# Please use the absolute path for the location of the dataset.
# Or you can use a relative path, e.g. `os.getcwd() + "/test_data/validation.csv"`
AICROWD_TRAIN_DATASET_PATH = os.getenv("TRAIN_DATASET_PATH", "/ds_shared_drive/train.csv")
AICROWD_DATASET_PATH = os.getenv("DATASET_PATH", "/ds_shared_drive/validation.csv")
AICROWD_PREDICTIONS_PATH = os.getenv("PREDICTIONS_PATH", "predictions.csv")
AICROWD_ASSETS_DIR = "assets"
Install packages 🗃¶
In [ ]:
!pip install numpy pandas catboost scikit-learn
Define preprocessing code¶
Import common packages¶
In [ ]:
import numpy as np
import os
import random
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
Training phase¶
Load training data¶
In [ ]:
train_data = pd.read_csv(AICROWD_TRAIN_DATASET_PATH)
train_data.head()
Out[ ]:
Features exploration¶
In [ ]:
regr_features = []
cat_features = []
Functions¶
In [ ]:
def get_corr(feature):
    features_corr = [feature]
    features_corr.append('diagnosis')
    df_corr = train_data[features_corr].copy()
    df_corr['diagnosis'] = pd.factorize(df_corr['diagnosis'])[0]
    if 'intersection_pos_rel_centre' in features_corr:
        df_corr['intersection_pos_rel_centre'] = pd.factorize(df_corr['intersection_pos_rel_centre'])[0]
    return df_corr.corr().values[0, 1]
In [ ]:
bold = '\033[1m'
ordinary = '\033[0m'

def feature_describe(feature):
    print(bold + 'Data type:' + ordinary, train_data[feature].dtype)
    print(bold + 'Share of missing values: ' + ordinary
          + str(round(100 * train_data[feature].isnull().sum() / train_data.shape[0], 2)) + '%')
    print(bold + 'Correlation with the diagnosis:' + ordinary, round(get_corr(feature), 2))
    if train_data[feature].dtype != object:
        print(bold + 'Min:' + ordinary, round(train_data[feature].min(), 2))
        print(bold + 'Mean:' + ordinary, round(train_data[feature].mean(), 2))
        print(bold + 'Max:' + ordinary, round(train_data[feature].max(), 2))
    unique_number = train_data[feature].nunique()
    uniques = train_data[feature].unique()
    print(bold + 'Number of unique values:' + ordinary, unique_number)
    print(bold + 'Example of unique values:' + ordinary, end=' ')
    for i in range(len(uniques[:5])):
        if i != len(uniques[:5]) - 1:
            if train_data[feature].dtype == object:
                print(uniques[i], end=', ')
            else:
                print(np.round(uniques[i], 2), end=', ')
        else:
            if train_data[feature].dtype == object:
                print(uniques[i])
            else:
                print(np.round(uniques[i], 2))
colors = ['orange', 'green', 'purple', 'deeppink', 'blue']

def show_distribution(feature):
    plt.figure(figsize=(8, 4), dpi=80)
    if train_data[feature].dtype == object:
        print('Mapping values:', end=' ')
        for i in range(len(set(pd.factorize(train_data[feature])[0]))):
            if list(set(pd.factorize(train_data[feature])[0]))[i] == -1:
                print('nan - -1', end=', ')
            else:
                print(list(set(pd.factorize(train_data[feature])[1]))[i] + ' - '
                      + str(list(set(pd.factorize(train_data[feature])[0]))[i]), end=', ')
        sns.kdeplot(pd.factorize(train_data[feature])[0][train_data[train_data.diagnosis == 'normal'].index],
                    label='normal', linewidth=3, shade=True, color='green', alpha=.5)
        sns.kdeplot(pd.factorize(train_data[feature])[0][train_data[train_data.diagnosis == 'pre_alzheimer'].index],
                    label='pre_alzheimer', linewidth=3, shade=True, color='orange', alpha=.5)
        sns.kdeplot(pd.factorize(train_data[feature])[0][train_data[train_data.diagnosis == 'post_alzheimer'].index],
                    label='post_alzheimer', linewidth=3, shade=True, color='blue', alpha=.5)
    else:
        sns.kdeplot(train_data.loc[train_data.diagnosis == 'normal', feature],
                    label='normal', linewidth=3, shade=True, color='green', alpha=.5)
        sns.kdeplot(train_data.loc[train_data.diagnosis == 'pre_alzheimer', feature],
                    label='pre_alzheimer', linewidth=3, shade=True, color='orange', alpha=.5)
        sns.kdeplot(train_data.loc[train_data.diagnosis == 'post_alzheimer', feature],
                    label='post_alzheimer', linewidth=3, shade=True, color='blue', alpha=.5)
    plt.xlabel('Value')
    plt.legend()
    plt.title(feature)
    plt.show()
def show_distribution_hist(feature):
    df = train_data.copy()
    if feature == 'intersection_pos_rel_centre':
        print('Mapping values:', end=' ')
        for i in range(len(set(pd.factorize(df[feature])[0]))):
            if list(set(pd.factorize(df[feature])[0]))[i] == -1:
                print('nan - -1', end=', ')
            else:
                print(list(set(pd.factorize(train_data[feature])[1]))[i] + ' - '
                      + str(list(set(pd.factorize(train_data[feature])[0]))[i]), end=', ')
        df['intersection_pos_rel_centre'] = pd.factorize(df['intersection_pos_rel_centre'])[0]
    _, ax = plt.subplots(1, 3, figsize=(16, 4), dpi=80)
    sns.histplot(df.loc[df.diagnosis == 'normal', feature],
                 ax=ax[0], label='normal', color='green', stat='probability', bins=10)
    sns.histplot(df.loc[df.diagnosis == 'pre_alzheimer', feature],
                 ax=ax[1], label='pre_alzheimer', color='orange', stat='probability', bins=10)
    sns.histplot(df.loc[df.diagnosis == 'post_alzheimer', feature],
                 ax=ax[2], label='post_alzheimer', color='blue', stat='probability', bins=10)
    ax[0].legend()
    ax[1].legend()
    ax[2].legend()
    plt.show()
In [ ]:
def show_corr(features):
    features_corr = features.copy()
    features_corr.append('diagnosis')
    df_corr = train_data[features_corr].copy()
    df_corr['diagnosis'] = pd.factorize(df_corr['diagnosis'])[0]
    if 'intersection_pos_rel_centre' in features_corr:
        df_corr['intersection_pos_rel_centre'] = pd.factorize(df_corr['intersection_pos_rel_centre'])[0]
    corr = df_corr.corr()
    mask = np.triu(np.ones_like(corr, dtype=bool))
    plt.figure(figsize=(20, 8))
    sns.heatmap(corr,
                mask=mask,
                cmap=sns.color_palette('dark:salmon_r', as_cmap=True),
                annot=True,
                center=0,
                linewidths=.5, cbar_kws={'shrink': .5})
    plt.show()
    del df_corr
    del features_corr
    del corr
    del mask
Clock and Digit Features¶
In [ ]:
clock_features = []
Final Rotation Angle¶
In [ ]:
feature = 'final_rotation_angle'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
regr_features.append(feature)
Number of Digits¶
In [ ]:
feature = 'number_of_digits'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
regr_features.append(feature)
Missing Digit Dummy Variables¶
In [ ]:
feature = 'missing_digit_1'
# similarly, there are 11 more dummy variables, one for each digit (missing_digit_2, missing_digit_3, etc.)
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
for i in range(1, 13):
    feature = 'missing_digit_{}'.format(i)
    cat_features.append(feature)
    clock_features.append(feature)
Deviation of Axis Digits (3, 6, 9 and 12) from Mid Axes¶
In [ ]:
feature = 'deviation_dist_from_mid_axis'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
regr_features.append(feature)
Between Axis Digits Angle Metrics¶
In [ ]:
feature = 'between_axis_digits_angle_sum'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
regr_features.append(feature)
In [ ]:
feature = 'between_axis_digits_angle_var'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
regr_features.append(feature)
Between Digits Angle Metrics¶
In [ ]:
feature = 'between_digits_angle_cw_sum'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
regr_features.append(feature)
In [ ]:
feature = 'between_digits_angle_cw_var'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
regr_features.append(feature)
In [ ]:
feature = 'between_digits_angle_ccw_sum'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
regr_features.append(feature)
In [ ]:
feature = 'between_digits_angle_ccw_var'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
regr_features.append(feature)
Sequence Flag Clock Wise and Counter Clock Wise¶
In [ ]:
feature = 'sequence_flag_cw'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
cat_features.append(feature)
In [ ]:
feature = 'sequence_flag_ccw'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
clock_features.append(feature)
cat_features.append(feature)
Correlation of clock and digits features¶
In [ ]:
show_corr(clock_features)
Hand Features¶
In [ ]:
hand_features = []
Number of Hands¶
In [ ]:
feature = 'number_of_hands'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
hand_features.append(feature)
regr_features.append(feature)
In [ ]:
feature = 'hand_count_dummy'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
hand_features.append(feature)
cat_features.append(feature)
Hand Length¶
In [ ]:
feature = 'hour_hand_length'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
hand_features.append(feature)
regr_features.append(feature)
In [ ]:
feature = 'minute_hand_length'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
hand_features.append(feature)
regr_features.append(feature)
In [ ]:
feature = 'single_hand_length'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
hand_features.append(feature)
regr_features.append(feature)
In [ ]:
feature = 'clockhand_ratio'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
hand_features.append(feature)
regr_features.append(feature)
In [ ]:
feature = 'clockhand_diff'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
hand_features.append(feature)
regr_features.append(feature)
Angle Between Hands¶
In [ ]:
feature = 'angle_between_hands'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
hand_features.append(feature)
regr_features.append(feature)
Deviation of Intersection Point of Hands from Geometric Centre¶
In [ ]:
feature = 'deviation_from_centre'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
hand_features.append(feature)
regr_features.append(feature)
In [ ]:
feature = 'intersection_pos_rel_centre'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
hand_features.append(feature)
cat_features.append(feature)
The Proximity of Hour and Minute from 11 and 2 Respectively¶
In [ ]:
feature = 'hour_proximity_from_11'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
hand_features.append(feature)
regr_features.append(feature)
In [ ]:
feature = 'minute_proximity_from_2'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
hand_features.append(feature)
regr_features.append(feature)
Digit Pointed by Hour and Minute Hand¶
In [ ]:
feature = 'hour_pointing_digit'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
hand_features.append(feature)
cat_features.append(feature)
In [ ]:
feature = 'minute_pointing_digit'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
hand_features.append(feature)
cat_features.append(feature)
Clock Hand Errors¶
In [ ]:
feature = 'eleven_ten_error'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
hand_features.append(feature)
cat_features.append(feature)
In [ ]:
feature = 'other_error'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
hand_features.append(feature)
cat_features.append(feature)
Correlation of hand features¶
In [ ]:
show_corr(hand_features)
Circle Features¶
In [ ]:
circle_features = []
Ellipse to Circle Ratio¶
In [ ]:
feature = 'ellipse_circle_ratio'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
circle_features.append(feature)
regr_features.append(feature)
Predicted Tremor and the Number of Defects¶
In [ ]:
feature = 'count_defects'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
circle_features.append(feature)
regr_features.append(feature)
In [ ]:
feature = 'pred_tremor'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
circle_features.append(feature)
cat_features.append(feature)
Percentage of Digits inside the Clock Face¶
In [ ]:
feature = 'percentage_inside_ellipse'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
circle_features.append(feature)
regr_features.append(feature)
The Length of the Major and Minor Axis of the Fitted Ellipse¶
In [ ]:
feature = 'double_major'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
circle_features.append(feature)
regr_features.append(feature)
In [ ]:
feature = 'double_minor'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
circle_features.append(feature)
regr_features.append(feature)
Area of the Top, Bottom, Left and Right Hemisphere of the Circle¶
In [ ]:
feature = 'top_area_perc'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
circle_features.append(feature)
regr_features.append(feature)
In [ ]:
feature = 'bottom_area_perc'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
circle_features.append(feature)
regr_features.append(feature)
In [ ]:
feature = 'left_area_perc'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
circle_features.append(feature)
regr_features.append(feature)
In [ ]:
feature = 'right_area_perc'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
circle_features.append(feature)
regr_features.append(feature)
Horizontal and vertical distances¶
In [ ]:
feature = 'horizontal_dist'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
circle_features.append(feature)
regr_features.append(feature)
In [ ]:
feature = 'vertical_dist'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
circle_features.append(feature)
regr_features.append(feature)
Euclidean Distance from Digits¶
In [ ]:
feature = 'euc_dist_digit_1'
# similarly, there are 11 more variables, one for each digit (euc_dist_digit_2, euc_dist_digit_3, etc.)
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
for i in range(1, 13):
    feature = 'euc_dist_digit_{}'.format(i)
    regr_features.append(feature)
    circle_features.append(feature)
Distance of Digits from clock center¶
In [ ]:
feature = '1 dist from cen'
# similarly, there are 11 more variables, one for each digit ('2 dist from cen', '3 dist from cen', etc.)
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
for i in range(1, 13):
    feature = '{} dist from cen'.format(i)
    regr_features.append(feature)
    circle_features.append(feature)
Area, Height, Width of Digit Bounding Boxes Metrics¶
In [ ]:
feature = 'area_digit_1'
# similarly, there are 11 more variables, one for each digit (area_digit_2, area_digit_3, etc.)
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
for i in range(1, 13):
    feature = 'area_digit_{}'.format(i)
    regr_features.append(feature)
    circle_features.append(feature)
In [ ]:
feature = 'height_digit_1'
# similarly, there are 11 more variables, one for each digit (height_digit_2, height_digit_3, etc.)
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
for i in range(1, 13):
    feature = 'height_digit_{}'.format(i)
    regr_features.append(feature)
    circle_features.append(feature)
In [ ]:
feature = 'width_digit_1'
# similarly, there are 11 more variables, one for each digit (width_digit_2, width_digit_3, etc.)
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
for i in range(1, 13):
    feature = 'width_digit_{}'.format(i)
    regr_features.append(feature)
    circle_features.append(feature)
In [ ]:
feature = 'variance_width'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
regr_features.append(feature)
circle_features.append(feature)
In [ ]:
feature = 'variance_height'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
regr_features.append(feature)
circle_features.append(feature)
In [ ]:
feature = 'variance_area'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
regr_features.append(feature)
circle_features.append(feature)
Time Features¶
In [ ]:
feature = 'time_diff'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
regr_features.append(feature)
circle_features.append(feature)
Centre Dot Detection¶
In [ ]:
feature = 'centre_dot_detect'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
cat_features.append(feature)
circle_features.append(feature)
Horizontal and vertical count¶
In [ ]:
feature = 'hor_count'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
circle_features.append(feature)
regr_features.append(feature)
In [ ]:
feature = 'vert_count'
feature_describe(feature)
show_distribution(feature)
show_distribution_hist(feature)
circle_features.append(feature)
regr_features.append(feature)
Correlation of circle features¶
In [ ]:
show_corr(circle_features[:20])
In [ ]:
show_corr(circle_features[20:40])
In [ ]:
show_corr(circle_features[40:60])
In [ ]:
show_corr(circle_features[60:])
Overview of all data¶
Functions¶
In [ ]:
def show_miss():
    df_miss = 100 * train_data.isnull().sum().sort_values(ascending=False) / train_data.shape[0]
    for i in [1, 5, 10, 15, 20, 30, 50]:
        print('{} columns have more than {}% missing values'.format(len(df_miss[df_miss > i]), i))
    plt.figure(figsize=(10, 10))
    sns.barplot(x=df_miss.head(26).values, y=df_miss.head(26).index)
    plt.title('Top 26 columns by percentage of missing values')

def show_corr_all():
    df_corr = train_data.copy()
    df_corr['diagnosis'] = pd.factorize(df_corr['diagnosis'])[0]
    df_corr['intersection_pos_rel_centre'] = pd.factorize(df_corr['intersection_pos_rel_centre'])[0]
    df_corr = df_corr.corr()['diagnosis'].sort_values(ascending=False).iloc[1:-2]
    df_corr = pd.concat([df_corr.head(20), df_corr.tail(10)])
    plt.figure(figsize=(10, 10))
    sns.barplot(x=df_corr.values, y=df_corr.index)
    plt.title('Top 20 positive and top 10 negative correlations with the diagnosis')
def dist_diagnosis():
    # value_counts keeps labels and counts aligned (unique() + groupby can return them in different orders)
    counts = train_data['diagnosis'].value_counts()
    plt.figure(figsize=(10, 5))
    sns.barplot(x=counts.index, y=counts.values, palette='rocket')
    plt.ylabel('Count')
    plt.show()
Missing values¶
In [ ]:
show_miss()
Categorical and regression features¶
I split the features into categorical and numerical (regression) groups based on my own judgment; this is not the only reasonable split.
In [ ]:
print(bold + 'All categorical features as a list:' + ordinary)
print(cat_features)
In [ ]:
train_data[cat_features].head()
Out[ ]:
In [ ]:
train_data[regr_features].head()
Out[ ]:
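As a quick sanity check, the two lists should cover every remaining column exactly once. The sketch below only assumes the `row_id` and `diagnosis` columns used elsewhere in this notebook; any column it reports as unassigned or duplicated simply wasn't classified above.
In [ ]:
feature_cols = [c for c in train_data.columns if c not in ('row_id', 'diagnosis')]
# Columns that ended up in neither group, and columns that ended up in both
unassigned = [c for c in feature_cols if c not in cat_features and c not in regr_features]
overlap = sorted(set(cat_features) & set(regr_features))
print('Unassigned columns:', unassigned)
print('Columns in both groups:', overlap)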
Top correlations¶
In [ ]:
show_corr_all()
Diagnosis distribution¶
In [ ]:
dist_diagnosis()
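The bar plot suggests the classes are far from balanced. A one-line sketch (using only `train_data` from above) prints the exact class shares; this imbalance is the reason for the `auto_class_weights` setting in the model below.
In [ ]:
# Share of each diagnosis class in the training data
print(train_data['diagnosis'].value_counts(normalize=True).round(3))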
Training¶
In [ ]:
train_data.dtypes[train_data.dtypes == object]
Out[ ]:
In [ ]:
# CatBoost does not accept NaN in categorical features, so fill them with a sentinel value
train_data[cat_features] = train_data[cat_features].fillna(999)
int_cat_features = [feature for feature in cat_features if feature != 'intersection_pos_rel_centre']
train_data[int_cat_features] = train_data[int_cat_features].astype(int)
In [ ]:
X_train, X_test, y_train, y_test = train_test_split(train_data.drop(['row_id', 'diagnosis'], axis=1),
                                                    train_data['diagnosis'],
                                                    test_size=0.15, stratify=train_data['diagnosis'], random_state=17)
model = CatBoostClassifier(loss_function='MultiClass',
                           auto_class_weights='SqrtBalanced')
model.fit(X_train, y_train, eval_set=(X_test, y_test), cat_features=cat_features, verbose=100)
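Before saving the model, it can be useful to score the 15% hold-out split locally. This is only a rough sketch: it uses multi-class log loss from scikit-learn as a proxy, which may not match exactly how the leaderboard computes its score.
In [ ]:
from sklearn.metrics import log_loss

# Probabilities on the hold-out split; columns follow model.classes_
val_proba = model.predict_proba(X_test)
print('Hold-out multi-class log loss:', round(log_loss(y_test, val_proba, labels=model.classes_), 4))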
In [ ]:
model.save_model(AICROWD_ASSETS_DIR + '/model_123')
np.save(AICROWD_ASSETS_DIR + '/cat', cat_features)
Prediction phase 🔎¶
In [ ]:
from catboost import CatBoostClassifier
model = CatBoostClassifier()
model.load_model(AICROWD_ASSETS_DIR + '/model_123')
Out[ ]:
Load test data¶
In [ ]:
test_data = pd.read_csv(AICROWD_DATASET_PATH)
cat_features = np.load(AICROWD_ASSETS_DIR + '/cat.npy', allow_pickle=True)
In [ ]:
# Apply the same categorical preprocessing as for the training data
test_data[cat_features] = test_data[cat_features].fillna(999)
int_cat_features = [feature for feature in cat_features if feature != 'intersection_pos_rel_centre']
test_data[int_cat_features] = test_data[int_cat_features].astype(int)
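A small defensive check (a sketch that assumes the validation file shares the training schema) is to confirm that every feature the model was trained on is present in the test data before predicting:
In [ ]:
# CatBoost stores the training feature names in model.feature_names_
missing_cols = [c for c in model.feature_names_ if c not in test_data.columns]
assert not missing_cols, 'Test data is missing columns: {}'.format(missing_cols)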
Generate predictions¶
In [ ]:
preds = model.predict_proba(test_data.drop(['row_id'], axis=1))
In [ ]:
# predict_proba columns follow model.classes_; the mapping below assumes the
# order (normal, post_alzheimer, pre_alzheimer)
predictions = {
    "row_id": test_data["row_id"].values,
    "normal_diagnosis_probability": preds[:, 0],
    "post_alzheimer_diagnosis_probability": preds[:, 1],
    "pre_alzheimer_diagnosis_probability": preds[:, 2],
}
predictions_df = pd.DataFrame.from_dict(predictions)
In [ ]:
# Renormalize so the three class probabilities sum to exactly 1 in every row
pred_sum = (predictions_df['normal_diagnosis_probability']
            + predictions_df['post_alzheimer_diagnosis_probability']
            + predictions_df['pre_alzheimer_diagnosis_probability'])
predictions_df['normal_diagnosis_probability'] /= pred_sum
predictions_df['post_alzheimer_diagnosis_probability'] /= pred_sum
predictions_df['pre_alzheimer_diagnosis_probability'] /= pred_sum
# Sanity check: every row should now sum to 1
predictions_df['normal_diagnosis_probability'] + predictions_df['post_alzheimer_diagnosis_probability'] + predictions_df['pre_alzheimer_diagnosis_probability']
Out[ ]:
In [ ]:
predictions_df.to_csv(AICROWD_PREDICTIONS_PATH, index=False)
Submit to AIcrowd 🚀¶
In [ ]:
!DATASET_PATH=$AICROWD_DATASET_PATH \
aicrowd notebook submit \
--assets-dir $AICROWD_ASSETS_DIR \
--challenge addi-alzheimers-detection-challenge