Programming Language Classification
RoBERTa + PyTorch with W&B Integration [Training Only]
An end-to-end training notebook to get started with state-of-the-art PyTorch Transformers.
Imports and installs¶
In [ ]:
!pip install --upgrade wandb &> /dev/null
!pip install transformers &> /dev/null
In [ ]:
import os
import gc
import copy
import time
import random
import string
# For data manipulation
import numpy as np
import pandas as pd
# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
# Utils
from tqdm import tqdm
from collections import defaultdict
# Sklearn Imports
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold
# For Transformer Models
from transformers import AutoTokenizer, AutoModel
from torch.optim import AdamW  # transformers' bundled AdamW is deprecated/removed in newer releases
# Suppress warnings
import warnings
warnings.filterwarnings("ignore")
# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import matplotlib.pyplot as plt
import seaborn as sns
In [ ]:
import wandb
wandb.login(key = 'enter your key here')
In [ ]:
def id_generator(size=12, chars=string.ascii_lowercase + string.digits):
    '''Generates a random 12-character id from lowercase letters and digits.'''
return ''.join(random.SystemRandom().choice(chars) for _ in range(size))
HASH_NAME = id_generator(size=12)
print(HASH_NAME)
Configs¶
In [ ]:
CONFIG = {"seed": 2021,
"epochs": 10,
"criterion": nn.CrossEntropyLoss(),
"model_name": "roberta-base",
"train_batch_size": 32,
"valid_batch_size": 64,
"max_length": 128,
"learning_rate": 1e-4,
"scheduler": 'CosineAnnealingLR',
"min_lr": 1e-6,
"T_max": 500,
"weight_decay": 1e-6,
"n_fold": 5,
"n_accumulate": 1,
"num_classes": 15,
"device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
"hash_name": HASH_NAME
}
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])
CONFIG['group'] = f'{HASH_NAME}-Baseline'
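As a quick sanity check, you can run the tokenizer on a toy snippet with the same truncation/padding settings the Dataset below will use. This cell is purely illustrative and the sample string is arbitrary.
In [ ]:
# Illustrative only: tokenize an arbitrary code snippet with the notebook's settings
sample_code = "def add(a, b):\n    return a + b"
enc = CONFIG['tokenizer'](sample_code,
                          truncation=True,
                          add_special_tokens=True,
                          max_length=CONFIG['max_length'],
                          padding='max_length')
print(len(enc['input_ids']), sum(enc['attention_mask']))  # padded length vs. non-pad tokens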
In [ ]:
def set_seed(seed=42):
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
os.environ['PYTHONHASHSEED'] = str(seed)
set_seed(CONFIG['seed'])
Data (you can either download it manually or use the AIcrowd CLI)¶
In [ ]:
train_df = pd.read_csv('../input/train-lang-class/blitz_lang_class.csv')
train_df.head(2)
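The heading above mentions the AIcrowd CLI as an alternative to downloading the CSV manually. The commented sketch below shows the rough shape of that route; the challenge slug is a placeholder and the exact flags should be checked against aicrowd --help.
In [ ]:
# Sketch only -- fill in your own API key and challenge slug before uncommenting
# !pip install aicrowd-cli &> /dev/null
# !aicrowd login --api-key "YOUR_AICROWD_API_KEY"
# !aicrowd dataset download --challenge <challenge-slug>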
In [ ]:
plt.figure(figsize = (14, 7))
sns.countplot(x=train_df.language)
Creating folds¶
In [ ]:
skf = StratifiedKFold(n_splits=CONFIG['n_fold'], shuffle=True, random_state=CONFIG['seed'])
for fold, (train, val) in enumerate(skf.split(X = train_df, y = train_df.language)):
train_df.loc[val, 'kfold'] = int(fold)
train_df.sample(4)
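Because the split is stratified on language, every fold should contain roughly the same class mix; a quick check:
In [ ]:
# Rough check that each fold has a similar language distribution
train_df.groupby('kfold').language.value_counts()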
In [ ]:
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder().fit(train_df.language)
train_df['target'] = LE.transform(train_df.language)
train_df.sample(3)
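It is worth keeping the id-to-language mapping around so predictions can be decoded back to names later (e.g. in the inference notebook); a small sketch:
In [ ]:
# Map integer targets back to language names; LE.inverse_transform does the reverse lookup
id2lang = dict(enumerate(LE.classes_))
print(id2lang)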
Dataset¶
In [ ]:
class Blitz(Dataset):
def __init__(self, df, tokenizer, max_length):
self.df = df
self.tokenizer = tokenizer
self.max_len = max_length
self.code = df['code'].values
self.target = df['target'].values
def __len__(self):
return len(self.df)
def __getitem__(self,index):
code = self.code[index]
target = self.target[index]
inputs_code = self.tokenizer.encode_plus(
code,
truncation = True,
add_special_tokens = True,
max_length = self.max_len,
padding = 'max_length'
)
code_ids = inputs_code['input_ids']
code_mask = inputs_code['attention_mask']
return {
'code_ids': torch.tensor(code_ids, dtype=torch.long),
'code_mask': torch.tensor(code_mask, dtype=torch.long),
'target': torch.tensor(target, dtype=torch.long)
}
Testing¶
In [ ]:
sample_ds = Blitz(train_df, tokenizer = CONFIG['tokenizer'], max_length = CONFIG['max_length'])
print(len(sample_ds))
In [ ]:
next(iter(sample_ds))
Creating model¶
In [ ]:
class BlitzModel(nn.Module):
def __init__(self, model_name):
super(BlitzModel, self).__init__()
self.model = AutoModel.from_pretrained(model_name)
self.drop = nn.Dropout(p = 0.2)
self.fc = nn.Linear(768, CONFIG['num_classes'])
def forward(self, ids, mask):
out = self.model(input_ids = ids,attention_mask = mask,
output_hidden_states = False)
out = self.drop(out[1])
outputs = self.fc(out)
return outputs
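Before launching full training, an optional smoke test can confirm that the forward pass returns logits of shape (batch_size, num_classes); out[1] in the forward above is RoBERTa's pooled output. The cell below is illustrative only and reuses sample_ds from the dataset test.
In [ ]:
# Illustrative smoke test: one small CPU batch through an untrained head
_model = BlitzModel(CONFIG['model_name'])
_batch = next(iter(DataLoader(sample_ds, batch_size=4)))
with torch.no_grad():
    _logits = _model(_batch['code_ids'], _batch['code_mask'])
print(_logits.shape)  # expected: torch.Size([4, CONFIG['num_classes']])
del _model, _batch, _logits
_ = gc.collect()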
Training¶
In [ ]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
model.train()
dataset_size = 0
running_loss = 0.0
bar = tqdm(enumerate(dataloader), total=len(dataloader))
for step, data in bar:
code_ids = data['code_ids'].to(device, dtype = torch.long)
code_mask = data['code_mask'].to(device, dtype = torch.long)
targets = data['target'].to(device, dtype=torch.long)
batch_size = code_ids.size(0)
code_outputs = model(code_ids, code_mask)
crit = CONFIG['criterion']
loss = crit(code_outputs, targets)
loss = loss / CONFIG['n_accumulate']
loss.backward()
if (step + 1) % CONFIG['n_accumulate'] == 0:
optimizer.step()
optimizer.zero_grad()
if scheduler is not None:
scheduler.step()
running_loss += (loss.item() * batch_size)
dataset_size += batch_size
epoch_loss = running_loss / dataset_size
bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
LR=optimizer.param_groups[0]['lr'])
gc.collect()
return epoch_loss
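Note that the loss is divided by n_accumulate and the optimizer only steps every n_accumulate batches, so the effective batch size is train_batch_size * n_accumulate:
In [ ]:
# With the defaults above this is 32 * 1 = 32
print(CONFIG['train_batch_size'] * CONFIG['n_accumulate'])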
Validation¶
In [ ]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
model.eval()
dataset_size = 0
running_loss = 0.0
bar = tqdm(enumerate(dataloader), total=len(dataloader))
for step, data in bar:
code_ids = data['code_ids'].to(device, dtype = torch.long)
code_mask = data['code_mask'].to(device, dtype = torch.long)
targets = data['target'].to(device, dtype=torch.long)
batch_size = code_ids.size(0)
code_outputs = model(code_ids, code_mask)
crit = CONFIG['criterion']
loss = crit(code_outputs, targets)
running_loss += (loss.item() * batch_size)
dataset_size += batch_size
epoch_loss = running_loss / dataset_size
        # optimizer is not passed to this function, so report only the loss here
        bar.set_postfix(Epoch=epoch, Valid_Loss=epoch_loss)
gc.collect()
return epoch_loss
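The loops above only track cross-entropy loss. If you also want a plain accuracy number on the validation fold, a minimal sketch (not wired into run_training) could look like this:
In [ ]:
@torch.no_grad()
def compute_accuracy(model, dataloader, device):
    '''Fraction of rows whose argmax logit matches the target (sketch only).'''
    model.eval()
    correct, total = 0, 0
    for data in dataloader:
        ids = data['code_ids'].to(device, dtype=torch.long)
        mask = data['code_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)
        preds = model(ids, mask).argmax(dim=1)
        correct += (preds == targets).sum().item()
        total += targets.size(0)
    return correct / total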
Run training¶
In [ ]:
def run_training(model, optimizer, scheduler, device, num_epochs, fold):
wandb.watch(model, log_freq=100)
if torch.cuda.is_available():
print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))
start = time.time()
best_model_wts = copy.deepcopy(model.state_dict())
best_epoch_loss = np.inf
history = defaultdict(list)
for epoch in range(1, num_epochs + 1):
gc.collect()
train_epoch_loss = train_one_epoch(model, optimizer, scheduler,
dataloader=train_loader,
device=CONFIG['device'], epoch=epoch)
val_epoch_loss = valid_one_epoch(model, valid_loader, device=CONFIG['device'],
epoch=epoch)
history['Train Loss'].append(train_epoch_loss)
history['Valid Loss'].append(val_epoch_loss)
# Log the metrics
wandb.log({"Train Loss": train_epoch_loss})
wandb.log({"Valid Loss": val_epoch_loss})
# deep copy the model
if val_epoch_loss <= best_epoch_loss:
print(f"Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})")
best_epoch_loss = val_epoch_loss
run.summary["Best Loss"] = best_epoch_loss
best_model_wts = copy.deepcopy(model.state_dict())
PATH = f"Loss-Fold-{fold}.bin"
torch.save(model.state_dict(), PATH)
# Save a model file from the current directory
print(f"Model Saved")
print()
end = time.time()
time_elapsed = end - start
print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
print("Best Loss: {:.4f}".format(best_epoch_loss))
# load best model weights
model.load_state_dict(best_model_wts)
return model, history
In [ ]:
def prepare_loaders(fold):
df_train = train_df[train_df.kfold != fold].reset_index(drop=True)
df_valid = train_df[train_df.kfold == fold].reset_index(drop=True)
train_dataset = Blitz(df_train, tokenizer=CONFIG['tokenizer'], max_length=CONFIG['max_length'])
valid_dataset = Blitz(df_valid, tokenizer=CONFIG['tokenizer'], max_length=CONFIG['max_length'])
train_loader = DataLoader(train_dataset, batch_size=CONFIG['train_batch_size'],
num_workers=2, shuffle=True, pin_memory=True, drop_last=True)
valid_loader = DataLoader(valid_dataset, batch_size=CONFIG['valid_batch_size'],
num_workers=2, shuffle=False, pin_memory=True)
return train_loader, valid_loader
In [ ]:
def fetch_scheduler(optimizer):
if CONFIG['scheduler'] == 'CosineAnnealingLR':
scheduler = lr_scheduler.CosineAnnealingLR(optimizer,T_max=CONFIG['T_max'],
eta_min=CONFIG['min_lr'])
elif CONFIG['scheduler'] == 'CosineAnnealingWarmRestarts':
        # Note: this branch expects a 'T_0' entry in CONFIG, which is not defined above
        scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=CONFIG['T_0'],
                                                             eta_min=CONFIG['min_lr'])
elif CONFIG['scheduler'] == None:
return None
return scheduler
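To see what the cosine schedule actually does to the learning rate, you can step a throwaway optimizer a few times; this cell is purely illustrative.
In [ ]:
# Illustrative: the LR decays from learning_rate towards min_lr over T_max steps
_opt = optim.SGD([torch.zeros(1, requires_grad=True)], lr=CONFIG['learning_rate'])
_sched = fetch_scheduler(_opt)
_lrs = []
for _ in range(5):
    _opt.step()
    _sched.step()
    _lrs.append(_opt.param_groups[0]['lr'])
print(_lrs)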
In [ ]:
for fold in range(0, CONFIG['n_fold']):
print(f"====== Fold: {fold} ======")
run = wandb.init(project = 'Blitz_lang_class',
config=CONFIG,
job_type='Train',
group=CONFIG['group'],
                     tags=['roberta-base', f'{HASH_NAME}', 'cross-entropy-loss'],
name=f'{HASH_NAME}-fold-{fold}',
anonymous='must')
# Create Dataloaders
train_loader, valid_loader = prepare_loaders(fold=fold)
model = BlitzModel(CONFIG['model_name'])
model.to(CONFIG['device'])
# Define Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=CONFIG['learning_rate'], weight_decay=CONFIG['weight_decay'])
scheduler = fetch_scheduler(optimizer)
model, history = run_training(model, optimizer, scheduler,
device=CONFIG['device'],
num_epochs=CONFIG['epochs'],
fold=fold)
run.finish()
del model, history, train_loader, valid_loader
_ = gc.collect()
print()
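The best weights for each fold are saved above as Loss-Fold-{fold}.bin; they can be reloaded later (e.g. in the inference notebook) roughly like this:
In [ ]:
# Sketch: reload the fold-0 checkpoint for evaluation / inference
inf_model = BlitzModel(CONFIG['model_name'])
inf_model.load_state_dict(torch.load('Loss-Fold-0.bin', map_location=CONFIG['device']))
inf_model.to(CONFIG['device'])
inf_model.eval()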
Do check out the inference notebook too ^^