Loading

Insurance pricing game

Insurance Pricing Game EDA in R

Showing some R love.

lolatu2

Quick data explorations in Google Colab using R

AIcrowd

Prepare the notebook 🛠

In [ ]:
# Download the AIcrowd setup script, run it with bash, and echo whatever it
# printed (one captured line per output line).
# NOTE(review): this executes a remotely hosted script without inspecting it;
# only run it if you trust the source.
cat(system('curl -sL https://gitlab.aicrowd.com/jyotish/pricing-game-notebook-scripts/raw/r-functions/r/setup.sh > setup.sh && bash setup.sh', intern = TRUE), sep = "\n")
# Presumably provides download_aicrowd_dataset(); the file is not shown here.
source("aicrowd_helpers.R")

# File name of the training data read later in the notebook.
TRAINING_DATA_PATH <- "training.csv"
# You can get the key from https://aicrowd.com/participants/me
AICROWD_API_KEY <- ""
download_aicrowd_dataset(AICROWD_API_KEY)
In [ ]:
# Session-wide display settings: print up to 130 characters per line and
# silence warnings (a negative `warn` level makes R ignore all of them).
options(width = 130, warn = -1)

Packages 🗃

Install and require here all the packages you need to define your model.

Note: Installing packages the first time might take some time.

In [ ]:
# Install every package this notebook relies on, one at a time and in a
# fixed order. Takes no arguments; called only for its side effect.
install_packages <- function() {
  for (pkg in c("skimr", "corrr", "tidyverse")) {
    install.packages(pkg)
  }
}
install_packages()
In [ ]:
# Attach the packages used throughout the notebook. `library()` attaches to
# the search path, so calling it from inside a function still makes the
# packages available to every later cell.
global_imports <- function() {
  for (pkg in c("skimr", "corrr", "tidyverse")) {
    # character.only = TRUE: `pkg` holds the package name as a string.
    library(pkg, character.only = TRUE)
  }
}
global_imports()

Loading the data 📲

In [ ]:
# Load the dataset from the path configured in the setup cell.
train_data <- read_csv(TRAINING_DATA_PATH)

What does the data look like? 🔍

In [ ]:
# Per-column summary of the training data, split by column type: missing
# counts and completeness for every column, plus distinct-value counts for
# character columns and mean/sd/quantiles/histogram for numeric columns.
skim(train_data)
── Data Summary ────────────────────────
                           Values    
Name                       train_data
Number of rows             228216    
Number of columns          26        
_______________________              
Column type frequency:               
  character                11        
  numeric                  15        
________________________             
Group variables            None      

── Variable type: character ──────────────────────────────────────────────────────────────────────────────────────────────────────
   skim_variable n_missing complete_rate   min   max empty n_unique whitespace
 1 id_policy             0             1     8     8     0    57054          0
 2 pol_coverage          0             1     3     4     0        4          0
 3 pol_pay_freq          0             1     6     9     0        4          0
 4 pol_payd              0             1     2     3     0        2          0
 5 pol_usage             0             1     7    12     0        4          0
 6 drv_sex1              0             1     1     1     0        2          0
 7 drv_drv2              0             1     2     3     0        2          0
 8 drv_sex2              0             1     1     1     0        3          0
 9 vh_make_model         0             1    16    16     0      975          0
10 vh_fuel               0             1     6     8     0        3          0
11 vh_type               0             1     7    10     0        2          0

── Variable type: numeric ────────────────────────────────────────────────────────────────────────────────────────────────────────
   skim_variable          n_missing complete_rate       mean        sd     p0      p25     p50      p75    p100 hist 
 1 year                           0         1         2.5        1.12     1       1.75     2.5     3.25      4  ▇▇▁▇▇
 2 pol_no_claims_discount         0         1         0.0444     0.118    0       0        0       0         1  ▇▁▁▁▁
 3 pol_duration                   0         1        12.6        8.64     1       5       11      18        44  ▇▅▂▂▁
 4 pol_sit_duration               0         1         4.25       2.62     1       3        4       5        26  ▇▁▁▁▁
 5 drv_age1                       0         1        56.3       15.0     19      45       56      67       104  ▂▇▇▃▁
 6 drv_age_lic1                   0         1        34.1       13.9      1      24       34      44        80  ▂▇▇▃▁
 7 drv_age2                  152896         0.330    48.6       16.3     18      36       47      61       102  ▅▇▆▃▁
 8 drv_age_lic2              152896         0.330    26.7       14.9      1      15       25      38        83  ▇▇▆▂▁
 9 vh_age                         4         1.00     11.1        7.14     1       6       10      15        64  ▇▃▁▁▁
10 vh_speed                    2552         0.989   171.        25.6     95     155      174     183       251  ▁▃▇▃▁
11 vh_value                    2552         0.989 17700.     10536.    1113   11490    16321   22067    101525  ▇▃▁▁▁
12 vh_weight                   2552         0.989  1099.       398.       0     950     1145    1315      2554  ▁▃▇▁▁
13 population                     0         1       571.       673.       0     170      320     610      2550  ▇▂▁▁▁
14 town_surface_area              0         1       205.       163.      18.5    82.1    155.    288.      745. ▇▃▂▁▁
15 claim_amount                   0         1       113.       582.       0       0        0       0     50000  ▇▁▁▁▁
In [35]:
# Narrow the console to 100 characters, then list every column on its own
# line together with its type and first few values.
options(width = 100)
train_data %>% glimpse()
Rows: 228,216
Columns: 26
$ id_policy              <chr> "PL000000", "PL042495", "PL042496", "PL042497", "PL042498", "PL042…
$ year                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ pol_no_claims_discount <dbl> 0.332, 0.000, 0.196, 0.000, 0.000, 0.000, 0.018, 0.000, 0.087, 0.5…
$ pol_coverage           <chr> "Med2", "Med2", "Med1", "Med2", "Med1", "Med2", "Max", "Max", "Med…
$ pol_duration           <dbl> 5, 6, 2, 8, 2, 8, 1, 4, 1, 6, 29, 6, 2, 14, 5, 27, 22, 11, 10, 3, …
$ pol_sit_duration       <dbl> 1, 1, 1, 5, 2, 2, 1, 2, 1, 3, 1, 3, 2, 1, 2, 3, 7, 3, 1, 1, 4, 1, …
$ pol_pay_freq           <chr> "Monthly", "Monthly", "Yearly", "Yearly", "Yearly", "Yearly", "Yea…
$ pol_payd               <chr> "No", "No", "Yes", "No", "No", "No", "No", "No", "No", "No", "No",…
$ pol_usage              <chr> "WorkPrivate", "WorkPrivate", "Retired", "WorkPrivate", "Retired",…
$ drv_sex1               <chr> "M", "M", "M", "F", "F", "F", "M", "F", "M", "M", "M", "F", "M", "…
$ drv_age1               <dbl> 35, 60, 55, 54, 65, 68, 41, 51, 44, 53, 55, 52, 47, 43, 47, 84, 64…
$ drv_age_lic1           <dbl> 16, 41, 35, 31, 38, 46, 20, 7, 22, 34, 34, 32, 25, 21, 22, 60, 45,…
$ drv_drv2               <chr> "Yes", "No", "Yes", "No", "No", "No", "No", "No", "No", "No", "No"…
$ drv_sex2               <chr> "F", "0", "F", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "…
$ drv_age2               <dbl> 26, NA, 57, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 42, NA, NA, NA…
$ drv_age_lic2           <dbl> 1, NA, 38, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 24, NA, NA, NA,…
$ vh_make_model          <chr> "aparvvfowrjncdhp", "aparvvfowrjncdhp", "iwhqpdfuhrsxyqxe", "kvcdd…
$ vh_age                 <dbl> 8, 10, 8, 4, 13, 16, 1, 28, 12, 14, 15, 12, 15, 12, 5, 1, 17, 19, …
$ vh_fuel                <chr> "Gasoline", "Diesel", "Diesel", "Gasoline", "Gasoline", "Gasoline"…
$ vh_type                <chr> "Tourism", "Tourism", "Commercial", "Tourism", "Tourism", "Tourism…
$ vh_speed               <dbl> 174, 174, 150, 149, 200, 196, 160, 173, 149, 189, 188, 200, 159, 1…
$ vh_value               <dbl> 11040, 11040, 14159, 17233, 19422, 24750, 15245, 13952, 17233, 316…
$ vh_weight              <dbl> 1143, 1143, 1193, 1012, 1315, 1200, 1019, 1112, 1012, 1312, 1305, …
$ population             <dbl> 1270, 1290, 1020, 180, 30, 210, 550, 1760, 140, 810, 120, 50, 870,…
$ town_surface_area      <dbl> 33.1, 51.3, 262.8, 219.7, 70.3, 366.5, 74.0, 103.4, 397.2, 460.7, …
$ claim_amount           <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, …

Let's look at some charts!

In [ ]:
# Build the frame used for plotting: drop the `id_policy` column, then turn
# every remaining character column into a factor.
train_clean <- mutate(
  select(train_data, -id_policy),
  across(where(is.character), as.factor)
)

Categorical Variables

In [ ]:
# One bar chart per factor column, each facet with its own axes.
train_clean %>%
  select(where(is.factor)) %>%
  # Reshape to long form (one row per column/value pair). Values are coerced
  # to character before stacking so factors with different level sets
  # combine cleanly and bars stay in alphabetical order within each facet.
  pivot_longer(
    cols = everything(),
    names_to = "key",
    values_to = "value",
    values_transform = list(value = as.character)
  ) %>%
  ggplot() +
  geom_bar(mapping = aes(x = value, fill = key), color = "black") +
  facet_wrap(~ key, scales = "free") +
  # "none" is the documented way to hide the legend; the fill colour only
  # distinguishes facets, so a legend would be redundant.
  theme(
    legend.position = "none",
    plot.title.position = "plot"
  ) +
  labs(title = "Categorical Variable Distributions")

Numeric Variables

In [ ]:
# One histogram per numeric column, each facet with its own axes.
train_clean %>%
  select(where(is.numeric)) %>%
  # Reshape to long form: one row per (column name, value) pair.
  pivot_longer(
    cols = everything(),
    names_to = "key",
    values_to = "value"
  ) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = value, fill = key), color = "black") +
  facet_wrap(~ key, scales = "free") +
  # Ask for only about two x-axis breaks so labels do not overlap in the
  # small facets.
  scale_x_continuous(n.breaks = 2) +
  # "none" is the documented way to hide the legend; the fill colour only
  # distinguishes facets, so a legend would be redundant.
  theme(
    legend.position = "none",
    plot.title.position = "plot"
  ) +
  labs(title = "Numeric Variable Distributions")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Correlations

In [ ]:
# Correlate all numeric columns and draw the result as a network graph.
# `min_cor = 0.2` is passed to network_plot() as the minimum correlation
# to display.
corrr::network_plot(
  corrr::correlate(select(train_clean, where(is.numeric))),
  min_cor = 0.2
)
Correlation method: 'pearson'
Missing treated using: 'pairwise.complete.obs'


In [ ]:
# Correlation table for the numeric columns: reorder the variables with
# rearrange(), blank one triangle with shave(), then format the values for
# printing with fashion().
corrr::correlate(select(train_clean, where(is.numeric))) %>%
  corrr::rearrange() %>%
  corrr::shave() %>%
  corrr::fashion()
Correlation method: 'pearson'
Missing treated using: 'pairwise.complete.obs'


A data.frame: 15 × 16
term drv_age1 drv_age_lic1 drv_age2 drv_age_lic2 pol_duration pol_sit_duration year vh_age town_surface_area population claim_amount vh_speed vh_weight vh_value pol_no_claims_discount
<noquote> <noquote> <noquote> <noquote> <noquote> <noquote> <noquote> <noquote> <noquote> <noquote> <noquote> <noquote> <noquote> <noquote> <noquote> <noquote>
drv_age1
drv_age_lic1 .92
drv_age2 .55 .49
drv_age_lic2 .50 .46 .94
pol_duration .38 .37 .12 .11
pol_sit_duration .25 .23 .18 .16 .31
year .07 .08 .07 .07 .13 .43
vh_age .07 .05 -.02 -.03 .03 .32 .16
town_surface_area .03 .03 .02 .03 .03 .01 -.00 .02
population .02 .00 -.01 -.01 .01 .01 -.00 -.03 .10
claim_amount -.00 -.00 -.00 -.01 -.01 -.03 -.01 -.08 -.01 .01
vh_speed -.02 -.02 .05 .04 -.00 -.09 -.00 -.18 -.03 .03 .04
vh_weight -.03 -.00 .04 .03 -.01 -.08 -.00 -.13 -.02 -.01 .04 .57
vh_value -.04 -.01 .04 .03 -.07 -.09 -.00 -.09 -.03 -.00 .04 .49 .56
pol_no_claims_discount -.34 -.36 -.12 -.13 -.22 -.13 -.06 -.04 -.01 .03 .02 .01 -.02 -.03

Comments

You must login before you can post a comment.

Execute