The baseline provided is a little bit difficult to follow. I decided to publish my first working version using simpletransformers.
I haven't used all the data or fine-tuned the model, so the score is not as good as the provided baseline, but it is much simpler to understand the data and the task.
In [1]:
# Install simpletransformers, then clear the long pip log from the notebook output.
from IPython.display import clear_output
!pip install simpletransformers
clear_output()
print("DONE")
In [2]:
# Copy the task-3 (substitute identification) files from the competition
# dataset into a local ./data directory so later cells can use short paths.
!mkdir -p data
original_input="../input/esci-challenge-data-20220416-080611"
!cp $original_input/product_catalogue-v0.2.csv/data/processed/public/task_3_product_substitute_identification/product_catalogue-v0.2.csv data
!cp $original_input/train-v0.2.csv/data/processed/public/task_3_product_substitute_identification/train-v0.2.csv data
!cp $original_input/test_public-v0.2.csv/data/processed/public/task_3_product_substitute_identification/test_public-v0.2.csv data
!ls -alh data
In [3]:
# Load the training split and the product catalogue, then attach each
# product's title to its (query, product) training row.
import numpy as np
import pandas as pd

input_dir = "data"

full_train_df = pd.read_csv(f"{input_dir}/train-v0.2.csv")
print(full_train_df.shape)
full_train_df.head(3)

product_df = pd.read_csv(f"{input_dir}/product_catalogue-v0.2.csv")
print(product_df.shape)
product_df.head(3)

# Join on product id AND locale: the same product_id can appear in several
# locales, so the title must come from the query's locale.
title_cols = product_df[["product_id", "product_locale", "product_title"]]
full_train_df2 = full_train_df.merge(
    title_cols,
    left_on=["product_id", "query_locale"],
    right_on=["product_id", "product_locale"],
)
print(full_train_df2.shape)
full_train_df2.head()
Out[3]:
In [4]:
# Binarise the label: 1 for "substitute", 0 for everything else, then keep
# only the columns the classifier needs.
full_train_df2["substitute_label"].value_counts()
full_train_df2["substitute_label"] = full_train_df2["substitute_label"].eq("substitute").astype(int)
full_train_df2 = full_train_df2[["query", "product_title", "substitute_label"]]

from sklearn.model_selection import train_test_split

# Stratified 50/50 split keeps the label distribution identical in each half.
train_df, eval_df = train_test_split(
    full_train_df2,
    test_size=0.5,
    random_state=0,
    stratify=full_train_df2[["substitute_label"]],
)
print(train_df.shape, eval_df.shape)
train_df.head()
Out[4]:
In [5]:
# Drop the large intermediate frames now that train_df/eval_df exist,
# and force a collection pass to return the memory.
import gc

del product_df, full_train_df, full_train_df2
gc.collect()
Out[5]:
In [6]:
# Keep handles to the full splits; train_df/eval_df get replaced below by
# balanced subsamples.
original_train_df, original_eval_df = train_df, eval_df
original_train_df["substitute_label"].value_counts()
Out[6]:
In [7]:
def get_balance(df, random_state=None):
    """Return a class-balanced, shuffled copy of *df* with a fresh index.

    Keeps min(#positives, #negatives) rows of each class of the binary
    ``substitute_label`` column. The original implementation sampled the
    negatives down to the positive count and raised ``ValueError`` whenever
    positives outnumbered negatives; downsampling both classes to the
    smaller count handles either imbalance direction.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``substitute_label`` column with 0/1 values.
    random_state : int or None, optional
        Seed forwarded to every sampling step so the subsample is
        reproducible. ``None`` (the default) preserves the original
        non-deterministic behavior.
    """
    df_pos = df[df["substitute_label"] == 1]
    df_neg = df[df["substitute_label"] == 0]
    # Downsample the majority class to the minority class size.
    n = min(len(df_pos), len(df_neg))
    df_pos = df_pos.sample(n=n, random_state=random_state)
    df_neg = df_neg.sample(n=n, random_state=random_state)
    # Shuffle so the classes are interleaved, then reset the index so
    # callers can slice with .head() positionally.
    balanced = pd.concat([df_pos, df_neg]).sample(frac=1, random_state=random_state)
    return balanced.reset_index(drop=True)
# Balance each split, then cap at 200k rows to keep training time manageable.
nb_rows = 200_000
train_df = get_balance(original_train_df).head(nb_rows)
eval_df = get_balance(original_eval_df).head(nb_rows)
print(train_df["substitute_label"].value_counts())
train_df.tail()
Out[7]:
In [8]:
# Rename columns to the headers simpletransformers expects for
# sentence-pair classification: text_a (query), text_b (title), labels.
train_df.columns = ["text_a", "text_b", "labels"]
eval_df.columns = ["text_a", "text_b", "labels"]
from simpletransformers.classification import (
ClassificationModel, ClassificationArgs
)
import pandas as pd
import logging
# Show library progress, but silence the very chatty transformers logger.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
model_args = ClassificationArgs(
num_train_epochs=2,
train_batch_size = 16*10,
eval_batch_size = 16*10
)
#model = ClassificationModel("roberta", "xlm-roberta-base")
# Multilingual BERT, since queries/titles come in multiple locales.
model = ClassificationModel("bert", "bert-base-multilingual-uncased",
args=model_args)
# Remove checkpoints from any previous run; train_model writes to ./outputs.
!rm -rf outputs
model.train_model(train_df)
Out[8]:
In [9]:
# Evaluate on the held-out balanced split. `result` is a dict of metrics —
# it includes tp/fp/fn confusion counts, used below to compute F1.
result, model_outputs, wrong_predictions = model.eval_model(eval_df)
result
Out[9]:
In [10]:
# F1 score from the confusion counts: TP / (TP + (FP + FN) / 2).
tp, fp, fn = result["tp"], result["fp"], result["fn"]
tp / (tp + 0.5 * (fp + fn))
Out[10]:
In [11]:
# Release the model and force a GC pass, then check GPU memory with
# nvidia-smi to confirm it was actually freed.
del model
gc.collect()
!nvidia-smi
Content
Comments
You must login before you can post a comment.