
Sentiment Classification

Solution for submission 174846

A detailed solution for submission 174846 submitted for challenge Sentiment Classification

paulina_knut1

Sentiment classification: sklearn models

In [125]:
%load_ext aicrowd.magic
The aicrowd.magic extension is already loaded. To reload it, use:
  %reload_ext aicrowd.magic
In [126]:
%aicrowd login
Please login here: https://api.aicrowd.com/auth/4K4R3TM3Wilf3Wb0YYutWzKRpuB4g9ikjR0pSKwpG34
API Key valid
Gitlab access token valid
Saved details successfully!
In [127]:
# %aicrowd ds dl -c sentiment-classification -o data

Imports

In [128]:
import os

from ast import literal_eval
from collections import Counter
import pandas as pd
import neptune.new as neptune
import numpy as np
from scipy.stats import uniform
import optuna
from tqdm import tqdm

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
In [129]:
NEPTUNE_PROJECT = "deepsense-ai/AIcrowd"
NEPTUNE_API = "eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI0NWE5MTZhNi0yMDE3LTQ3N2EtOGMwOS1lZGY1YjRiOWJlYmUifQ=="

Data

In [130]:
train_df = pd.read_csv("data/train.csv")
val_df = pd.read_csv("data/val.csv")
test_df = pd.read_csv("data/test.csv")
In [131]:
train_df.head()
Out[131]:
embeddings label
0 [0.3206779360771179, 0.988215982913971, 1.0441... positive
1 [0.05074610561132431, 1.0742985010147095, 0.60... negative
2 [0.41962647438049316, 0.4505457878112793, 1.39... negative
3 [0.4361684024333954, 0.19191382825374603, 0.83... positive
4 [0.6382085084915161, 0.8352395296096802, 0.393... neutral
In [132]:
train_df.label.value_counts()
Out[132]:
neutral     1694
positive    1684
negative    1622
Name: label, dtype: int64
In [133]:
X_train = [literal_eval(embedding) for embedding in train_df['embeddings'].values]
y_train = train_df['label'].values

X_valid = [literal_eval(embedding) for embedding in val_df['embeddings'].values]
y_valid = val_df['label'].values

X_test = [literal_eval(embedding) for embedding in test_df['embeddings'].values]
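
A note on parsing: literal_eval is convenient but slow on thousands of rows. Since each stringified embedding is a plain JSON array of floats, json.loads is a faster drop-in; a minimal sketch (not part of the original run), assuming the same column format:

In [ ]:
import json

# Faster parsing sketch: the "embeddings" strings are valid JSON arrays of floats
X_train = [json.loads(e) for e in train_df["embeddings"].values]
X_valid = [json.loads(e) for e in val_df["embeddings"].values]
X_test = [json.loads(e) for e in test_df["embeddings"].values]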

Standard Scaler

In [134]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)
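
StandardScaler is fit on the training embeddings only and then applied to the validation and test sets, so their statistics do not leak into the fit. A quick sanity check (a sketch, not part of the original run): the scaled training features should have roughly zero mean and unit variance.

In [ ]:
# Sketch: scaled train features should have mean ~0 and std ~1
print(np.round(X_train_scaled.mean(), 3), np.round(X_train_scaled.std(), 3))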

Submission

In [135]:
def make_submission(y_test_pred):
    submission = pd.DataFrame(
        {
            "embeddings": X_test,
            "label": y_test_pred,
        }
    )
    submission.to_csv(os.path.join("assets", "submission.csv"))
    %aicrowd notebook submit -c sentiment-classification -a assets --no-verify

Sklearn models

In [136]:
def objective(trial):
    global best_score
    
    # optional PCA
    X_train_scaled_reduced = X_train_scaled
    X_valid_scaled_reduced = X_valid_scaled
    X_test_scaled_reduced = X_test_scaled
    num_dim = 512
    reduce_dim = trial.suggest_categorical("reduce_dim", [False, True])
    if reduce_dim:
        num_dim = trial.suggest_int("num_dim", 32, 512)
        pca = PCA(n_components=num_dim)
        X_train_scaled_reduced = pca.fit_transform(X_train_scaled)
        X_valid_scaled_reduced = pca.transform(X_valid_scaled)
        X_test_scaled_reduced = pca.transform(X_test_scaled)
        
    
    classifier_name = trial.suggest_categorical('classifier', ['mlp', 'svc'])  # 'knn'
    if classifier_name == 'svc':
        svc_c = trial.suggest_int('svc_c', 1, int(1e8))  # suggest_int expects integer bounds
        svc_degree = trial.suggest_int('svc_degree', 2, 11)
        svc_gamma = trial.suggest_float('svc_gamma', 1e-10, 1e1)
        params = {
            "svc_c": svc_c,
            "svc_degree": svc_degree,
            "svc_gamma": svc_gamma,
        }
        classifier = SVC(
            C=svc_c,
            degree=svc_degree,
            gamma=svc_gamma,  # 'auto', 
            random_state=42,
        )
    elif classifier_name == 'knn':
        knn_neighbors = trial.suggest_int('knn_neighbors', 1, 21)
        params = {
            "knn_neighbors": knn_neighbors,
        }
        classifier = KNeighborsClassifier(
            n_neighbors=knn_neighbors,
        )
    elif classifier_name == 'mlp':
        mlp_alpha = trial.suggest_float('mlp_alpha', 1e-10, 10)
        mlp_hidden_layer_sizes = trial.suggest_int('mlp_hidden_layer_sizes', 128, 1024)
        mlp_validation_fraction = trial.suggest_float('mlp_validation_fraction', 0.01, 0.2)
        params = {
            "mlp_alpha": mlp_alpha,
            "mlp_hidden_layer_sizes": mlp_hidden_layer_sizes,
            "mlp_validation_fraction": mlp_validation_fraction,
        }
        classifier = MLPClassifier(
            alpha = mlp_alpha,
            hidden_layer_sizes = mlp_hidden_layer_sizes,
            early_stopping = True,
            n_iter_no_change = 100,
            max_iter = 1000,
            validation_fraction = mlp_validation_fraction,
            random_state=42,
        )
    else:
        raise Exception("Wrong classifier name") 
    
    classifier = classifier.fit(X_train_scaled_reduced, y_train)
    valid_accuracy = classifier.score(X_valid_scaled_reduced, y_valid)
    if valid_accuracy > best_score:
        print("SUBMISION, valid/acc:", valid_accuracy)
        best_score = valid_accuracy
        run = neptune.init(
            project=NEPTUNE_PROJECT,
            api_token=NEPTUNE_API,
            tags=["sentiment_classification", "sklearn", "optuna"]
        )
        run["model"] = classifier_name
        run["parameters"] = params
        run["reduce_dim"] = reduce_dim
        run["num_dim"] = num_dim
        run["train/acc"] = classifier.score(X_train_scaled_reduced, y_train)
        run["valid/acc"] = valid_accuracy
        run.stop()
        
        y_test_pred = classifier.predict(X_test_scaled)
        make_submission(y_test_pred)
        
    return valid_accuracy
In [137]:
best_score = 0.795
In [63]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)
[I 2022-02-19 11:17:50,888] A new study created in memory with name: no-name-f36c719d-a90e-48c5-b46e-480ac04b6a2e
[I 2022-02-19 11:19:33,077] Trial 0 finished with value: 0.776 and parameters: {'reduce_dim': True, 'num_dim': 421, 'classifier': 'mlp', 'mlp_alpha': 2.5019245021107923, 'mlp_hidden_layer_sizes': 900, 'mlp_validation_fraction': 0.16645067204494246}. Best is trial 0 with value: 0.776.
[I 2022-02-19 11:20:43,648] Trial 1 finished with value: 0.752 and parameters: {'reduce_dim': True, 'num_dim': 83, 'classifier': 'mlp', 'mlp_alpha': 6.61428838340386, 'mlp_hidden_layer_sizes': 911, 'mlp_validation_fraction': 0.12969498331407583}. Best is trial 0 with value: 0.776.
[I 2022-02-19 11:23:11,791] Trial 2 finished with value: 0.764 and parameters: {'reduce_dim': True, 'num_dim': 420, 'classifier': 'mlp', 'mlp_alpha': 3.22735189916378, 'mlp_hidden_layer_sizes': 762, 'mlp_validation_fraction': 0.13358083106439148}. Best is trial 0 with value: 0.776.
[I 2022-02-19 11:24:51,657] Trial 3 finished with value: 0.7915 and parameters: {'reduce_dim': False, 'classifier': 'mlp', 'mlp_alpha': 2.806686364718306, 'mlp_hidden_layer_sizes': 680, 'mlp_validation_fraction': 0.02731453594179662}. Best is trial 3 with value: 0.7915.
[I 2022-02-19 11:25:54,032] Trial 4 finished with value: 0.7565 and parameters: {'reduce_dim': True, 'num_dim': 219, 'classifier': 'mlp', 'mlp_alpha': 8.3003630349897, 'mlp_hidden_layer_sizes': 634, 'mlp_validation_fraction': 0.07883168740309936}. Best is trial 3 with value: 0.7915.
[I 2022-02-19 11:26:57,497] Trial 5 finished with value: 0.762 and parameters: {'reduce_dim': True, 'num_dim': 118, 'classifier': 'mlp', 'mlp_alpha': 0.6587505900932472, 'mlp_hidden_layer_sizes': 964, 'mlp_validation_fraction': 0.17286630248839324}. Best is trial 3 with value: 0.7915.
[I 2022-02-19 11:28:50,642] Trial 6 finished with value: 0.7735 and parameters: {'reduce_dim': True, 'num_dim': 464, 'classifier': 'mlp', 'mlp_alpha': 1.8956272623811063, 'mlp_hidden_layer_sizes': 727, 'mlp_validation_fraction': 0.12169320461888987}. Best is trial 3 with value: 0.7915.
[I 2022-02-19 11:30:36,028] Trial 7 finished with value: 0.767 and parameters: {'reduce_dim': True, 'num_dim': 284, 'classifier': 'mlp', 'mlp_alpha': 1.2483569133543568, 'mlp_hidden_layer_sizes': 651, 'mlp_validation_fraction': 0.13209002911101916}. Best is trial 3 with value: 0.7915.
[I 2022-02-19 11:31:56,142] Trial 8 finished with value: 0.771 and parameters: {'reduce_dim': True, 'num_dim': 228, 'classifier': 'mlp', 'mlp_alpha': 9.87459311041519, 'mlp_hidden_layer_sizes': 739, 'mlp_validation_fraction': 0.13672341731660148}. Best is trial 3 with value: 0.7915.
[I 2022-02-19 11:33:47,838] Trial 9 finished with value: 0.787 and parameters: {'reduce_dim': False, 'classifier': 'mlp', 'mlp_alpha': 0.8192048861845439, 'mlp_hidden_layer_sizes': 648, 'mlp_validation_fraction': 0.19921545567033808}. Best is trial 3 with value: 0.7915.
In [14]:
study.best_params
Out[14]:
{'classifier': 'mlp',
 'mlp_alpha': 0.2017750141364524,
 'mlp_hidden_layer_sizes': 455,
 'mlp_validation_fraction': 0.0291592386774798}
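
For reference, the best MLP trial can be refit outside Optuna with the parameters reported above; a minimal sketch (not part of the original run), assuming the scaled splits from the earlier cells are still in memory:

In [ ]:
# Sketch: refit the best trial's MLP and check validation accuracy
best_mlp = MLPClassifier(
    alpha=0.2017750141364524,
    hidden_layer_sizes=455,
    validation_fraction=0.0291592386774798,
    early_stopping=True,
    n_iter_no_change=100,
    max_iter=1000,
    random_state=42,
).fit(X_train_scaled, y_train)
print("valid/acc:", best_mlp.score(X_valid_scaled, y_valid))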

Sklearn models with cross-validation

In [138]:
train_valid_df = pd.concat([train_df, val_df])  # concatenate the train and validation sets; k-fold CV below replaces the fixed split
X_train_valid = [literal_eval(embedding) for embedding in train_valid_df['embeddings'].values]
y_train_valid = train_valid_df['label'].values
In [139]:
X_train_valid_scaled = scaler.transform(X_train_valid)
In [140]:
Fold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
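
StratifiedKFold keeps the label distribution roughly equal across the five folds. A quick sanity check (a sketch, not part of the original run) prints the per-fold class counts.

In [ ]:
# Sketch: per-fold class counts to confirm stratification
for i, (_, val_idx) in enumerate(Fold.split(X_train_valid_scaled, y_train_valid)):
    print(i, Counter(y_train_valid[val_idx]))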
In [141]:
def objective_cv(trial):
    global best_score
    params = {
        "alpha": trial.suggest_float('alpha', 1e-10, 10),
        "hidden_layer_sizes": trial.suggest_int('hidden_layer_sizes', 128, 1024),
        "validation_fraction": trial.suggest_float('mlp_validation_fraction', 0.01, 0.2),
    }
    
    # optional PCA
    X_train_valid_scaled_reduced = X_train_valid_scaled
    X_test_scaled_reduced = X_test_scaled
    num_dim = 512
    reduce_dim = trial.suggest_categorical("reduce_dim", [False, True])
    if reduce_dim:
        num_dim = trial.suggest_int("num_dim", 32, 512)
        pca = PCA(n_components=num_dim)
        X_train_valid_scaled_reduced = pca.fit_transform(X_train_valid_scaled)
        X_test_scaled_reduced = pca.transform(X_test_scaled)
        
    # kfold
    f1_scores = []
    models = []
    for n, (trn_, val_) in tqdm(enumerate(Fold.split(X_train_valid_scaled_reduced, y_train_valid))):
        
        fold_train_data = X_train_valid_scaled_reduced[trn_]
        fold_valid_data = X_train_valid_scaled_reduced[val_]
        
        fold_train_labels = y_train_valid[trn_]
        fold_valid_labels = y_train_valid[val_]
        model = MLPClassifier(**params)
        model.fit(fold_train_data, fold_train_labels)
        models.append(model)
        
        valid_pred = model.predict(fold_valid_data)
        f1 = f1_score(fold_valid_labels, valid_pred, average ='weighted')
        f1_scores.append(f1)

    mean_valid_f1 = np.mean(f1_scores)
    
    # neptune and submission
    if mean_valid_f1 > best_score:
        print("SUBMISION, mean_valid_f1:", mean_valid_f1)
#         best_score = mean_valid_f1
        run = neptune.init(
            project=NEPTUNE_PROJECT,
            api_token=NEPTUNE_API,
            tags=["sentiment_classification", "mlp", "optuna", "crossval"]
        )
        run["model"] = "mlp"
        run["parameters"] = params
        run["reduce_dim"] = reduce_dim
        run["num_dim"] = num_dim
        run["mean_valid_f1"] = mean_valid_f1
        run.stop()
        
        predictions = []
        for model in models:
            predictions.append(model.predict(X_test_scaled_reduced))
        
        y_test_pred = [
            Counter([pred[i] for pred in predictions]).most_common(1)[0][0]
            for i in range(len(X_test_scaled_reduced))
        ]
            
        make_submission(y_test_pred)

    return mean_valid_f1
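
The test labels are produced by hard voting over the five fold models: for each row, the label predicted most often wins. A toy illustration of the Counter-based vote used above (sketch only):

In [ ]:
# Toy illustration of the majority vote over per-fold predictions
fold_preds = [["positive", "neutral"], ["positive", "negative"], ["neutral", "neutral"]]
voted = [Counter([p[i] for p in fold_preds]).most_common(1)[0][0] for i in range(2)]
print(voted)  # ['positive', 'neutral']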
In [142]:
best_score = 0.795
In [ ]:
study_cv = optuna.create_study(direction='maximize')
study_cv.optimize(objective_cv, n_trials=50)
[I 2022-02-21 10:26:27,069] A new study created in memory with name: no-name-c6379eb7-37d3-42d1-b645-cd605a0f3c5e
5it [01:11, 14.34s/it]
[I 2022-02-21 10:27:38,776] Trial 0 finished with value: 0.7875312104530361 and parameters: {'alpha': 1.245653185407123, 'hidden_layer_sizes': 219, 'mlp_validation_fraction': 0.04071282192517211, 'reduce_dim': False}. Best is trial 0 with value: 0.7875312104530361.
5it [00:51, 10.23s/it]
[I 2022-02-21 10:28:29,951] Trial 1 finished with value: 0.7818984547115877 and parameters: {'alpha': 8.622312541470208, 'hidden_layer_sizes': 139, 'mlp_validation_fraction': 0.18180375199538013, 'reduce_dim': False}. Best is trial 0 with value: 0.7875312104530361.
5it [02:39, 31.86s/it]
[I 2022-02-21 10:31:09,257] Trial 2 finished with value: 0.7864756068363976 and parameters: {'alpha': 6.396572349825316, 'hidden_layer_sizes': 518, 'mlp_validation_fraction': 0.011601681069209327, 'reduce_dim': False}. Best is trial 0 with value: 0.7875312104530361.
5it [03:08, 37.69s/it]
[I 2022-02-21 10:34:19,880] Trial 3 finished with value: 0.7933739684325545 and parameters: {'alpha': 6.772810444429572, 'hidden_layer_sizes': 467, 'mlp_validation_fraction': 0.020575657886802663, 'reduce_dim': True, 'num_dim': 386}. Best is trial 3 with value: 0.7933739684325545.
5it [01:53, 22.68s/it]
[I 2022-02-21 10:36:15,064] Trial 4 finished with value: 0.7888932571933469 and parameters: {'alpha': 6.932725379936173, 'hidden_layer_sizes': 281, 'mlp_validation_fraction': 0.1164946446439621, 'reduce_dim': True, 'num_dim': 334}. Best is trial 3 with value: 0.7933739684325545.
5it [06:02, 72.56s/it] 
[I 2022-02-21 10:42:17,885] Trial 5 finished with value: 0.7867979619531119 and parameters: {'alpha': 8.107487394823828, 'hidden_layer_sizes': 965, 'mlp_validation_fraction': 0.11369600672654658, 'reduce_dim': False}. Best is trial 3 with value: 0.7933739684325545.
5it [02:20, 28.09s/it]
[I 2022-02-21 10:44:38,330] Trial 6 finished with value: 0.794517633955735 and parameters: {'alpha': 0.6486536948903225, 'hidden_layer_sizes': 888, 'mlp_validation_fraction': 0.13384747041946632, 'reduce_dim': False}. Best is trial 6 with value: 0.794517633955735.
5it [01:51, 22.38s/it]
[I 2022-02-21 10:46:33,639] Trial 7 finished with value: 0.784923815170919 and parameters: {'alpha': 9.883509887010216, 'hidden_layer_sizes': 262, 'mlp_validation_fraction': 0.05819481404877714, 'reduce_dim': True, 'num_dim': 362}. Best is trial 6 with value: 0.794517633955735.
5it [03:13, 38.75s/it]
[I 2022-02-21 10:49:47,403] Trial 8 finished with value: 0.785379369141468 and parameters: {'alpha': 8.715701255210016, 'hidden_layer_sizes': 586, 'mlp_validation_fraction': 0.03488152305215696, 'reduce_dim': False}. Best is trial 6 with value: 0.794517633955735.
5it [01:21, 16.33s/it]
[I 2022-02-21 10:51:11,036] Trial 9 finished with value: 0.7797774070153658 and parameters: {'alpha': 3.888012297431358, 'hidden_layer_sizes': 327, 'mlp_validation_fraction': 0.0459071556154458, 'reduce_dim': True, 'num_dim': 174}. Best is trial 6 with value: 0.794517633955735.
5it [02:26, 29.22s/it]
SUBMISSION, mean_valid_f1: 0.7995572669773718
https://app.neptune.ai/deepsense-ai/AIcrowd/e/AIC-210
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 43 operations to synchronize with Neptune. Do not kill this process.
All 43 operations synced, thanks for waiting!
Using notebook: sklearn_models.ipynb for submission...
Removing existing files from submission directory...
Scrubbing API keys from the notebook...
Collecting notebook...
In [ ]:

In [ ]:

In [336]:
classifier = SVC(
    C=20,
    degree=5,
    max_iter=-1,
#     break_ties=True,
#     tol=0.00001,
#     probability=True,
    gamma=0.0004,
    random_state=42,
).fit(X_train_scaled, y_train)
In [337]:
classifier.score(X_train_scaled, y_train)
Out[337]:
0.972
In [338]:
classifier.score(X_valid_scaled, y_valid)
Out[338]:
0.8065

Random Forest

In [105]:
classifier = RandomForestClassifier(
    n_estimators = 200,
    max_depth = 50,
    min_samples_leaf = 20,
).fit(X_train_scaled, y_train)
In [106]:
classifier.score(X_train_scaled, y_train)
Out[106]:
0.8712
In [107]:
classifier.score(X_valid_scaled, y_valid)
Out[107]:
0.6815
In [117]:
# quick check of the majority-vote aggregation, reusing the same classifier three times
predictions = []
for model in [classifier, classifier, classifier]:
    predictions.append(model.predict(X_valid_scaled))

# note: despite the name, these are voted predictions on the validation set
y_test_pred = [
    Counter([pred[i] for pred in predictions]).most_common(1)[0][0]
    for i in range(len(X_valid_scaled))
]
In [120]:
# classifier.score(X_valid_scaled, y_test_pred)

Neural Network

In [19]:
X_train_valid_scaled = np.concatenate([X_train_scaled, X_valid_scaled])
y_train_valid = np.concatenate([y_train, y_valid])
In [23]:
classifier = MLPClassifier(
    alpha = 0.5,
    hidden_layer_sizes = 455,
    early_stopping = True,
    n_iter_no_change = 100,
    max_iter = 1000,
    validation_fraction = 0.02,
    random_state=42,
#     verbose =  True,
# ).fit(X_train_scaled, y_train)
).fit(X_train_valid_scaled, y_train_valid)
/mnt/ml-team/homes/paulina.knut/aicrowd/venv_pascal02/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py:699: UserWarning: Training interrupted by user.
  warnings.warn("Training interrupted by user.")
In [21]:
classifier.score(X_train_scaled, y_train)
Out[21]:
0.986
In [22]:
classifier.score(X_valid_scaled, y_valid)
Out[22]:
0.992

Validation

In [25]:
y_valid_pred = classifier.predict(X_valid_scaled)
print(classification_report(y_valid, y_valid_pred))
              precision    recall  f1-score   support

    negative       0.99      0.99      0.99       640
     neutral       0.99      0.99      0.99       633
    positive       1.00      1.00      1.00       727

    accuracy                           0.99      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       0.99      0.99      0.99      2000
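
Because X_valid was concatenated into the training data in the Neural Network section above, these near-perfect numbers are optimistic rather than a held-out estimate. For completeness, the confusion_matrix imported earlier can also be inspected; a minimal sketch reusing y_valid_pred from the cell above:

In [ ]:
# Sketch: confusion matrix on the (in-training) validation split
print(confusion_matrix(y_valid, y_valid_pred, labels=["negative", "neutral", "positive"]))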

In [26]:
y_test_pred = classifier.predict(X_test_scaled)
In [27]:
submission = pd.DataFrame(
    {
        "embeddings": X_test_scaled.tolist(),
        "label": y_test_pred,
    }
)

Submission

In [28]:
submission.to_csv(os.path.join("assets", "submission.csv"))
In [29]:
%aicrowd notebook submit -c sentiment-classification -a assets --no-verify
Using notebook: sklearn_models.ipynb for submission...
Removing existing files from submission directory...
Scrubbing API keys from the notebook...
Collecting notebook...

╭─────────────────────────╮
│ Successfully submitted! │
╰─────────────────────────╯
Important links
┌──────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│  This submission │ https://www.aicrowd.com/challenges/ai-blitz-xiii/problems/sentiment-classification/submissions/174152              │
│                  │                                                                                                                    │
│  All submissions │ https://www.aicrowd.com/challenges/ai-blitz-xiii/problems/sentiment-classification/submissions?my_submissions=true │
│                  │                                                                                                                    │
│      Leaderboard │ https://www.aicrowd.com/challenges/ai-blitz-xiii/problems/sentiment-classification/leaderboards                    │
│                  │                                                                                                                    │
│ Discussion forum │ https://discourse.aicrowd.com/c/ai-blitz-xiii                                                                      │
│                  │                                                                                                                    │
│   Challenge page │ https://www.aicrowd.com/challenges/ai-blitz-xiii/problems/sentiment-classification                                 │
└──────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
In [ ]:

