Programming Language Classification
RoBERTa PyTorch with W&B Integration [INFERENCE]
An end-to-end inference notebook to get started with PyTorch and state-of-the-art Transformers.
Imports¶
In [ ]:
import os
import gc
import random
# For data manipulation
import numpy as np
import pandas as pd
# Pytorch Imports
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
# For Transformer Models
from transformers import AutoTokenizer, AutoModel
# Utils
from tqdm import tqdm
# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
Configuration¶
In [ ]:
CONFIG = dict(
seed = 42,
model_name = '../input/roberta-base',
test_batch_size = 64,
max_length = 128,
num_classes = 15,
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
)
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])
In [ ]:
# Make sure you attach the weight files from the training notebook as an input dataset
MODEL_PATHS = [
'../input/blitz-lang-class-10-epochs/Loss-Fold-0.bin',
'../input/blitz-lang-class-10-epochs/Loss-Fold-1.bin',
'../input/blitz-lang-class-10-epochs/Loss-Fold-2.bin',
'../input/blitz-lang-class-10-epochs/Loss-Fold-3.bin',
'../input/blitz-lang-class-10-epochs/Loss-Fold-4.bin'
]
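A small defensive check (a hypothetical addition, assuming the paths above) that all five fold checkpoints are actually present before any model loading starts:
In [ ]:
# Hypothetical check: fail fast if any fold checkpoint is missing
missing = [p for p in MODEL_PATHS if not os.path.exists(p)]
assert not missing, f"Missing checkpoints: {missing}"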
In [ ]:
def set_seed(seed = 42):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
set_seed(CONFIG['seed'])
In [ ]:
df = pd.read_csv('../input/blitz-lang-class-test/test.csv')
df.head(3)
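The Dataset below indexes a `code` column, so a quick defensive check (hypothetical, not in the original pipeline) that the test file actually has it:
In [ ]:
# Hypothetical sanity check: the Blitz dataset expects a 'code' column
assert 'code' in df.columns, "test.csv should contain a 'code' column"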
Dataset¶
In [ ]:
class Blitz(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_length
        self.code = df['code'].values
    def __len__(self):
        return len(self.df)
    def __getitem__(self, index):
        code = self.code[index]
        inputs_code = self.tokenizer.encode_plus(
            code,
            truncation = True,
            add_special_tokens = True,
            max_length = self.max_len,
            padding = 'max_length'
        )
        code_ids = inputs_code['input_ids']
        code_mask = inputs_code['attention_mask']
        return {
            'code_ids': torch.tensor(code_ids, dtype=torch.long),
            'code_mask': torch.tensor(code_mask, dtype=torch.long)
        }
In [ ]:
test_dataset = Blitz(df, CONFIG['tokenizer'], max_length=CONFIG['max_length'])
test_loader = DataLoader(test_dataset, batch_size=CONFIG['test_batch_size'],
num_workers=2, shuffle=False, pin_memory=True)
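To verify the wiring, a quick (hypothetical) peek at one item and one batch; each item tensor should have length `max_length`, and a full batch should be `(test_batch_size, max_length)`:
In [ ]:
# Hypothetical shape check: one dataset item and one dataloader batch
item = test_dataset[0]
print(item['code_ids'].shape, item['code_mask'].shape)  # torch.Size([128]) each
batch = next(iter(test_loader))
print(batch['code_ids'].shape)  # torch.Size([64, 128]) for a full batch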
Building the model¶
In [ ]:
class BlitzModel(nn.Module):
    def __init__(self, model_name):
        super(BlitzModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        self.fc = nn.Linear(768, CONFIG['num_classes'])
    def forward(self, ids, mask):
        out = self.model(input_ids=ids, attention_mask=mask,
                         output_hidden_states=False)
        # pooler_output is the pooled [CLS] representation (same as out[1])
        out = self.drop(out.pooler_output)
        outputs = self.fc(out)
        return outputs
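Before running the full ensemble, a hypothetical dry run of one batch through a freshly constructed model (pretrained backbone, randomly initialised head) confirms the output shape is `(batch_size, num_classes)`:
In [ ]:
# Hypothetical dry run with an untrained head, just to confirm output shapes
_model = BlitzModel(CONFIG['model_name']).to(CONFIG['device'])
_model.eval()
batch = next(iter(test_loader))
with torch.no_grad():
    logits = _model(batch['code_ids'].to(CONFIG['device']),
                    batch['code_mask'].to(CONFIG['device']))
print(logits.shape)  # expected: torch.Size([64, 15]) for a full batch
del _model; gc.collect()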
In [ ]:
@torch.no_grad()
def valid_fn(model, dataloader, device):
    model.eval()
    PREDS = []
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        ids = data['code_ids'].to(device, dtype = torch.long)
        mask = data['code_mask'].to(device, dtype = torch.long)
        outputs = model(ids, mask)
        # Keep the full per-class probabilities so the folds can be
        # soft-voted (averaged) before taking the argmax
        PREDS.append(outputs.softmax(dim=1).detach().cpu().numpy())
    PREDS = np.concatenate(PREDS, axis=0)
    gc.collect()
    return PREDS
Inference time¶
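Each fold's model returns a `(num_samples, num_classes)` probability matrix; the ensemble below averages those matrices across the five folds (soft voting) and takes the argmax afterwards, which is more meaningful than averaging hard class indices, since class indices have no ordering.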
In [ ]:
def inference(model_paths, dataloader, device):
    final_preds = []
    for i, path in enumerate(model_paths):
        model = BlitzModel(CONFIG['model_name'])
        model.to(device)
        # map_location lets the checkpoints load even if they were saved on a different device
        model.load_state_dict(torch.load(path, map_location=device))
        print(f"Getting predictions for model {i+1}")
        preds = valid_fn(model, dataloader, device)
        final_preds.append(preds)
    # Average the per-fold probability matrices (soft voting)
    final_preds = np.mean(np.array(final_preds), axis=0)
    return final_preds
preds = inference(MODEL_PATHS, test_loader, CONFIG['device'])
In [ ]:
# The ensemble output is a probability matrix; the predicted class is its argmax
df['target'] = preds.argmax(axis=1)
In [ ]:
train_df = pd.read_csv('../input/train-lang-class/blitz_lang_class.csv')
In [ ]:
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder().fit(train_df.language)
train_df['target'] = LE.transform(train_df.language)
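To see which integer index corresponds to which language (a hypothetical peek, assuming the training CSV's `language` column), print the fitted encoder's classes:
In [ ]:
# Hypothetical: show the index -> language mapping the LabelEncoder learnt
print(dict(enumerate(LE.classes_)))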
In [ ]:
df["prediction"] = LE.inverse_transform(df.target)
In [ ]:
df = df.sample(frac=1)  # shuffle rows so the previews below show a mix of languages
df.head()
In [ ]:
df.sample(10)
In [ ]:
!rm -rf assets
!mkdir assets
df.to_csv(os.path.join("assets", "submission.csv"))
In [ ]:
!pip install aicrowd-cli
%load_ext aicrowd.magic
In [ ]:
%aicrowd login
In [ ]:
%aicrowd notebook submit -c programming-language-classification -a assets --no-verify
Thanks for reading my notebook and hope you've learnt something new :3¶
Do let me know if you have any doubts/issues :3