From 0b8c06eb2cc2d4fa695d6c743ce96db9f4e70b09 Mon Sep 17 00:00:00 2001 From: kasparvonbeelen Date: Thu, 14 Jan 2021 10:54:32 +0000 Subject: [PATCH 01/10] initial commit --- tasks/wsd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/wsd.py b/tasks/wsd.py index 74657b3..566c63f 100644 --- a/tasks/wsd.py +++ b/tasks/wsd.py @@ -106,7 +106,7 @@ def svm_wemb_baseline(df_train,df_test,wemb_model): return y_pred ### --------------------------------------------------- -# BERT CENTROID METHODS +#  BERT CENTROID METHODS ### --------------------------------------------------- # binary centroid vectors From 0e0fd9b3f1a0b754894a1dd9d8127d747020c53f Mon Sep 17 00:00:00 2001 From: kasparvonbeelen Date: Thu, 14 Jan 2021 16:35:39 +0000 Subject: [PATCH 02/10] add model fine-tuning approach --- 114.1 - review notebook - glossbert.ipynb | 378 ++++++++++++++++++++++ 1 file changed, 378 insertions(+) create mode 100644 114.1 - review notebook - glossbert.ipynb diff --git a/114.1 - review notebook - glossbert.ipynb b/114.1 - review notebook - glossbert.ipynb new file mode 100644 index 0000000..c7e679f --- /dev/null +++ b/114.1 - review notebook - glossbert.ipynb @@ -0,0 +1,378 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.5-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python3", + "display_name": "Python 3", + "language": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import pickle\n", + "import pandas as pd\n", + "from tasks import wsd\n", + "from pathlib import Path\n", + "from tasks import wsd\n", + "from utils import nlp_tools\n", + "from tqdm.auto import tqdm\n", + "import numpy as np\n", + "import json\n", + "from sklearn.metrics import classification_report\n", + "from flair.embeddings import TransformerWordEmbeddings\n", + "from utils.dataset_download import harvest_data_from_extended_senses\n", + "from utils.classificaton_utils import binarize, vectorize_target_expressions,cosine_similiarity,eval_lemma" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "lemma = 'machine'\n", + "pos = 'NN'\n", + "senses = {'machine_nn01-38474140'} # machine_nn01-38475772 machine_nn01-38475923 machine_nn01-38475835 machine_nn01-38474140\n", + "relations = ['seed','synonym'] # ,'descendant','sibling'\n", + "eval_mode = \"lemma_etal\" # lemma or lemma_etal\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "# senses before filtering by date = 517\n", + "# senses after filtering by date = 433\n", + "\n", + "\n", + "# of seed senses 26 \n", + "# of synonyms 383 \n", + "# of branch senses 0\n", + "\n", + "\n", + "# of seeds selected 1 \n", + "# of synonyms selected 44 \n", + "# of branches selected 0\n", + "[LOG] #rows before removing None vector (1947, 21)\n", + "[LOG] #rows after removing None vector (1911, 21)\n" + ] + } + ], + "source": [ + "df_train, df_val, df_test = 
binarize(lemma,\n", + " pos,\n", + " senses, \n", + " relations,\n", + " strict_filter=True,\n", + " start=1700,\n", + " end=2000,\n", + " eval_mode=eval_mode)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " sense_id lemma_definition \\\n", + "0 body_nn01-17170653 The complete physical form of a person or anim... \n", + "1 man_nn01-110482153 An adult male human being. Without explicit co... \n", + "2 body_nn01-17169813 The complete physical form of a person or anim... \n", + "\n", + " definition word_id lemma \\\n", + "0 Particular technical uses. The part of a vehic... body_nn01 body \n", + "1 As vocative or as int., introducing a remark o... man_nn01 man \n", + "2 Contrasted with the soul. Cf. soul body n. at ... body_nn01 body \n", + "\n", + " quotation_id source \\\n", + "0 body_nn01-132916428 {'title': 'Material Handling Engin.', 'author'... \n", + "1 man_nn01-110482440 {'title': 'Shaela', 'author': 'R. Bulter', 'ge... \n", + "2 body_nn01-17169857 {'title': 'Ess. Man', 'author': 'A. Pope', 'ge... \n", + "\n", + " text year \\\n", + "0 {'keyword': 'bodies', 'full_text': 'After car ... 1990.0 \n", + "1 {'keyword': 'Min', 'full_text': 'Min A'm vexed... 1976.0 \n", + "2 {'keyword': 'Body', 'full_text': 'All are but ... 1733.0 \n", + "\n", + " full_text ... keyword_offset \\\n", + "0 After car bodies are painted, they are moved i... ... 10.0 \n", + "1 Min A'm vexed ta hear yun. ... 0.0 \n", + "2 All are but parts of one stupendous Whole, Who... ... 49.0 \n", + "\n", + " vector_bert_base_-1,-2,-3,-4_mean \\\n", + "0 [1.2747291, 0.25178745, 0.69486666, 0.42832682... \n", + "1 [-0.10557328, 0.24347349, 0.731555, -0.4305202... \n", + "2 [0.8197431, 0.04237363, 0.6312159, -0.2658673,... \n", + "\n", + " vector_blert_-1,-2,-3,-4_mean label id daterange \\\n", + "0 [1.5054287, 1.1386966, 1.3405375, 0.8012274, -... 0 NaN NaN \n", + "1 [-0.49209523, 0.7658461, 0.07512934, 0.0148925... 0 NaN NaN \n", + "2 [0.60478234, 0.58020014, 0.053836707, -0.06571... 0 NaN NaN \n", + "\n", + " provenance provenance_type relation_to_core_senses relation_to_seed_senses \n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "\n", + "[3 rows x 21 columns]" + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "df_train.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Index(['sense_id', 'lemma_definition', 'definition', 'word_id', 'lemma',\n", + " 'quotation_id', 'source', 'text', 'year', 'full_text', 'keyword',\n", + " 'keyword_offset', 'vector_bert_base_-1,-2,-3,-4_mean',\n", + " 'vector_blert_-1,-2,-3,-4_mean', 'label', 'id', 'daterange',\n", + " 'provenance', 'provenance_type', 'relation_to_core_senses',\n", + " 'relation_to_seed_senses'],\n", + " dtype='object')" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ], + "source": [ + "df_train.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "def enclose_keyword(row,enclose_token='[TARGET]'):\n", + " \"\"\"enclose keyword with specific token to point\n", + " learner towards to word it has to focus on\n", + " \"\"\"\n", + " sentence = ''\n", + " for i,c in enumerate(row.full_text):\n", + " if i == int(row.keyword_offset):\n", + " sentence+=enclose_token + ' '\n", + " elif i ==int(row.keyword_offset + len(row.keyword)):\n", + " sentence+= ' ' + enclose_token\n", + " sentence+=c\n", + " return sentence\n", + "\n", + "def merge_quotation_gloss(row):\n", + " out_string = '[GLOSS] '\n", + " if row.definition:\n", + " out_string+=row.definition\n", + " out_string+=' [QUOT] ' \n", + " if row.enclosed_quotation:\n", + " out_string+=row.enclosed_quotation\n", + " return out_string\n", + "\n", + "def merge_quotation_keyword(row):\n", + " out_string = '[TARGET] '\n", + " if row.keyword:\n", + " out_string+=row.keyword\n", + " out_string+=' [QUOT] ' \n", + " if row.enclosed_quotation:\n", + " out_string+=row.enclosed_quotation\n", + " return out_string\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "path = Path('./data/training_data')\n", + "path.mkdir(exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "csv_out_path = path / f\"{lemma}_{'_'.join(senses)}\"\n", + "csv_out_path.mkdir(exist_ok=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "df_train['enclosed_quotation'] = df_train.apply(enclose_keyword, axis=1)\n", + "df_train['train_text'] = df_train.apply(merge_quotation_keyword, axis=1)\n", + "df_train[['train_text','label']].to_csv(csv_out_path / \"train.csv\",index = False, sep='\\t') \n", + "df_val['enclosed_quotation'] = df_val.apply(enclose_keyword, axis=1)\n", + "df_val['train_text'] = df_val.apply(merge_quotation_keyword, axis=1)\n", + "df_val[['train_text','label']].to_csv(csv_out_path / \"dev.csv\",index = False, sep='\\t') \n", + "df_test['enclosed_quotation'] = df_test.apply(enclose_keyword, axis=1)\n", + "df_test['train_text'] = df_test.apply(merge_quotation_keyword, axis=1)\n", + "df_test[['train_text','label']].to_csv(csv_out_path / \"test.csv\",index = False, sep='\\t') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2021-01-14 16:01:53,769 Reading data from 
data/training_data/machine_machine_nn01-38474140\n", + "2021-01-14 16:01:53,770 Train: data/training_data/machine_machine_nn01-38474140/train.csv\n", + "2021-01-14 16:01:53,770 Dev: data/training_data/machine_machine_nn01-38474140/dev.csv\n", + "2021-01-14 16:01:53,770 Test: data/training_data/machine_machine_nn01-38474140/test.csv\n" + ] + } + ], + "source": [ + "from flair.data import Corpus\n", + "from flair.datasets import CSVClassificationCorpus\n", + "\n", + "# this is the folder in which train, test and dev files reside\n", + "data_folder = csv_out_path\n", + "\n", + "# column format indicating which columns hold the text and label(s)\n", + "column_name_map = {0: \"text\", 1: \"label\"}\n", + "\n", + "# load corpus containing training, test and dev data and if CSV has a header, you can skip it\n", + "corpus: Corpus = CSVClassificationCorpus(data_folder,\n", + " column_name_map,\n", + " skip_header=True,\n", + " delimiter='\\t', # tab-separated files\n", + ") " + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2021-01-14 16:01:56,791 Computing label dictionary. Progress:\n", + "100%|██████████| 1604/1604 [00:00<00:00, 2103.07it/s]2021-01-14 16:01:57,857 [b'0', b'1']\n", + "Dictionary with 2 tags: 0, 1\n", + "\n" + ] + } + ], + "source": [ + "# 2. create the label dictionary\n", + "label_dict = corpus.make_label_dictionary()\n", + "print(label_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.optim.adam import Adam\n", + "from flair.data import Corpus\n", + "from flair.datasets import TREC_6\n", + "from flair.embeddings import TransformerDocumentEmbeddings\n", + "from flair.models import TextClassifier\n", + "from flair.trainers import ModelTrainer\n", + "\n", + "\n", + "\n", + "\n", + "# 3. initialize transformer document embeddings (many models are available)\n", + "document_embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased', fine_tune=True)\n", + "\n", + "# 4. create the text classifier\n", + "classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)\n", + "\n", + "# 5. initialize the text classifier trainer with Adam optimizer\n", + "trainer = ModelTrainer(classifier, corpus, optimizer=Adam)\n", + "\n", + "# 6. 
start the training\n", + "trainer.train('models/taggers/trec',\n", + " learning_rate=1e-5, # use very small learning rate\n", + " mini_batch_size=16,\n", + " mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine\n", + " max_epochs=5, # terminate after 5 epochs\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ] +} \ No newline at end of file From acc2cb960a845ddb5d51a926bcd75df00e1601de Mon Sep 17 00:00:00 2001 From: kasparvonbeelen Date: Thu, 14 Jan 2021 17:53:03 +0000 Subject: [PATCH 03/10] add glossbert method as draft --- 114.1 - review notebook - glossbert.ipynb | 914 ++++++++++++++++++++-- 1 file changed, 852 insertions(+), 62 deletions(-) diff --git a/114.1 - review notebook - glossbert.ipynb b/114.1 - review notebook - glossbert.ipynb index c7e679f..60288ba 100644 --- a/114.1 - review notebook - glossbert.ipynb +++ b/114.1 - review notebook - glossbert.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -51,12 +51,19 @@ "from sklearn.metrics import classification_report\n", "from flair.embeddings import TransformerWordEmbeddings\n", "from utils.dataset_download import harvest_data_from_extended_senses\n", - "from utils.classificaton_utils import binarize, vectorize_target_expressions,cosine_similiarity,eval_lemma" + "from utils.classificaton_utils import binarize, vectorize_target_expressions,cosine_similiarity,eval_lemma\n", + "from torch.optim.adam import Adam\n", + "from flair.datasets import CSVClassificationCorpus\n", + "from flair.data import Corpus\n", + "from flair.datasets import TREC_6\n", + "from flair.embeddings import TransformerDocumentEmbeddings\n", + "from flair.models import TextClassifier\n", + "from flair.trainers import ModelTrainer" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -70,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 71, "metadata": {}, "outputs": [ { @@ -107,7 +114,353 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " sense_id \\\n", + "238 machine_nn01-38474140 \n", + "305 machine_nn01-38474140 \n", + "713 machine_nn01-38474140 \n", + "938 machine_nn01-38474140 \n", + "1042 machine_nn01-38474140 \n", + "1056 machine_nn01-38474140 \n", + "\n", + " lemma_definition \\\n", + "238 A complex device, consisting of a number of in... \n", + "305 A complex device, consisting of a number of in... \n", + "713 A complex device, consisting of a number of in... \n", + "938 A complex device, consisting of a number of in... \n", + "1042 A complex device, consisting of a number of in... \n", + "1056 A complex device, consisting of a number of in... \n", + "\n", + " definition word_id \\\n", + "238 A living body, esp. the human body considered ... machine_nn01 \n", + "305 A living body, esp. the human body considered ... machine_nn01 \n", + "713 A living body, esp. the human body considered ... machine_nn01 \n", + "938 A living body, esp. the human body considered ... machine_nn01 \n", + "1042 A living body, esp. the human body considered ... machine_nn01 \n", + "1056 A living body, esp. the human body considered ... 
machine_nn01 \n", + "\n", + " lemma quotation_id \\\n", + "238 machine machine_nn01-38474169 \n", + "305 machine machine_nn01-38474177 \n", + "713 machine machine_nn01-38474195 \n", + "938 machine machine_nn01-38474223 \n", + "1042 machine machine_nn01-38474203 \n", + "1056 machine machine_nn01-38474212 \n", + "\n", + " source \\\n", + "238 {'title': 'Death's Vision', 'author': 'J. Reyn... \n", + "305 {'title': 'Spectator', 'author': 'J. Addison',... \n", + "713 {'title': 'Med. & Physical Jrnl.', 'author': N... \n", + "938 {'title': 'Of Human Bondage', 'author': 'W. S.... \n", + "1042 {'title': 'Poems', 'author': 'W. Wordsworth', ... \n", + "1056 {'title': 'Telegraphy', 'author': 'W. H. Preec... \n", + "\n", + " text year \\\n", + "238 {'keyword': 'Machins', 'full_text': 'What Nobl... 1709.0 \n", + "305 {'keyword': 'Machine', 'full_text': 'Cheerfuln... 1712.0 \n", + "713 {'keyword': 'machine', 'full_text': 'When a pr... 1805.0 \n", + "938 {'keyword': 'machine', 'full_text': 'He wonder... 1915.0 \n", + "1042 {'keyword': 'machine', 'full_text': 'And now I... 1807.0 \n", + "1056 {'keyword': 'machine', 'full_text': 'The human... 1876.0 \n", + "\n", + " full_text ... keyword_offset \\\n", + "238 What Nobler Souls the Nobler Machins Wear. ... 29.0 \n", + "305 Cheerfulness is..the best Promoter of Health. ... ... 70.0 \n", + "713 When a product of diseased action has been eff... ... 82.0 \n", + "938 He wondered whether at the very end, now that ... ... 50.0 \n", + "1042 And now I see with eye serene The very pulse o... ... 52.0 \n", + "1056 The human machine tires, and as a consequence ... ... 10.0 \n", + "\n", + " vector_bert_base_-1,-2,-3,-4_mean \\\n", + "238 [0.5628562, -0.04788875, 0.074935675, -0.22630... \n", + "305 [0.0052292813, 0.12355395, 0.023108626, 0.2251... \n", + "713 [0.25928053, 0.049638785, 0.022315167, 0.34901... \n", + "938 [0.38040048, 0.38440758, 0.45397452, 0.1211486... \n", + "1042 [-0.46428305, 0.013232344, -0.595714, 0.049642... \n", + "1056 [0.6930934, 0.09074756, -0.13974331, 0.1105655... \n", + "\n", + " vector_blert_-1,-2,-3,-4_mean label \\\n", + "238 [-0.15516208, 0.289941, -0.15124893, -0.206332... 1 \n", + "305 [-0.04755735, 0.20182909, 0.33001357, -0.04851... 1 \n", + "713 [-0.16033216, -0.16846322, 0.5062964, 0.102019... 1 \n", + "938 [-0.059219074, 0.23112743, 0.42189148, 0.02944... 1 \n", + "1042 [0.021248298, 0.28699854, 0.24638082, -0.01793... 1 \n", + "1056 [0.11798739, -0.0029160888, 0.29418808, -0.076... 1 \n", + "\n", + " id \\\n", + "238 machine_nn01-38474140 \n", + "305 machine_nn01-38474140 \n", + "713 machine_nn01-38474140 \n", + "938 machine_nn01-38474140 \n", + "1042 machine_nn01-38474140 \n", + "1056 machine_nn01-38474140 \n", + "\n", + " daterange \\\n", + "238 {'end': None, 'start': 1604, 'obsolete': False... \n", + "305 {'end': None, 'start': 1604, 'obsolete': False... \n", + "713 {'end': None, 'start': 1604, 'obsolete': False... \n", + "938 {'end': None, 'start': 1604, 'obsolete': False... \n", + "1042 {'end': None, 'start': 1604, 'obsolete': False... \n", + "1056 {'end': None, 'start': 1604, 'obsolete': False... 
\n", + "\n", + " provenance provenance_type \\\n", + "238 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", + "305 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", + "713 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", + "938 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", + "1042 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", + "1056 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", + "\n", + " relation_to_core_senses relation_to_seed_senses \n", + "238 {machine_nn01-38474140} {machine_nn01-38474140} \n", + "305 {machine_nn01-38474140} {machine_nn01-38474140} \n", + "713 {machine_nn01-38474140} {machine_nn01-38474140} \n", + "938 {machine_nn01-38474140} {machine_nn01-38474140} \n", + "1042 {machine_nn01-38474140} {machine_nn01-38474140} \n", + "1056 {machine_nn01-38474140} {machine_nn01-38474140} \n", + "\n", + "[6 rows x 21 columns]" + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 72 + } + ], + "source": [ + "df_train[df_train.sense_id=='machine_nn01-38474140']" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array(['body_nn01', 'man_nn01', 'machine_nn01', 'carcass_nn01',\n", + " 'person_nn01', 'case_nn02', 'personage_nn01', 'corporeity_nn01',\n", + " 'structure_nn01', 'dust_nn01', 'case_nn01', 'automaton_nn01',\n", + " 'earth_nn01', 'soma_nn02', 'bulk_nn01', 'microcosm_nn01',\n", + " 'personality_nn01', 'tabernacle_nn01', 'vessel_nn01',\n", + " 'corpse_nn01', 'case_nn04', 'clay_nn01', 'clod_nn01',\n", + " 'skinful_nn01', 'carrion_nn01', 'embodiment_nn01', 'corpus_nn01',\n", + " 'flesh_nn01', 'soma_nn01', 'bloodbulk_nn01', 'earth_nn02',\n", + " 'soulcase_nn02', 'corporation_nn01', 'chassis_nn01', 'bulk_nn03',\n", + " 'bouk_nn01', 'outwall_nn01', 'case_nn03', 'incarnation_nn01',\n", + " 'bonehouse_nn01', 'man_nn04', 'bulk_nn02', 'soulcase_nn01',\n", + " 'godsimage_nn01', 'quarrons_nn01'], dtype=object)" + ] + }, + "metadata": {}, + "execution_count": 73 + } + ], + "source": [ + "df_train.word_id.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " sense_id \\\n", + "0 body_nn01-17170653 \n", + "1 man_nn01-110482153 \n", + "2 body_nn01-17169813 \n", + "3 machine_nn01-38474877 \n", + "4 carcass_nn01-10177258 \n", + "... ... \n", + "1215 man_nn01-110479060 \n", + "1216 person_nn01-30950985 \n", + "1217 clay_nn01-9320873 \n", + "1218 case_nn02-10018131 \n", + "1220 man_nn01-110487579 \n", + "\n", + " lemma_definition \\\n", + "0 The complete physical form of a person or anim... \n", + "1 An adult male human being. Without explicit co... \n", + "2 The complete physical form of a person or anim... \n", + "3 A complex device, consisting of a number of in... \n", + "4 The dead body of a person or animal; but no lo... \n", + "... ... \n", + "1215 An adult male human being. Without explicit co... \n", + "1216 An individual human being; a man, woman, or ch... \n", + "1217 A stiff viscous earth found, in many varieties... \n", + "1218 A box, bag, or other receptacle, designed to c... \n", + "1220 An adult male human being. Without explicit co... \n", + "\n", + " definition word_id \\\n", + "0 Particular technical uses. The part of a vehic... body_nn01 \n", + "1 As vocative or as int., introducing a remark o... man_nn01 \n", + "2 Contrasted with the soul. Cf. soul body n. at ... body_nn01 \n", + "3 A bicycle or tricycle; a motorcycle. Formerly ... machine_nn01 \n", + "4 The naked framework or ‘shell’ of a building b... carcass_nn01 \n", + "... ... ... \n", + "1215 A husband. Now chiefly English regional (north... man_nn01 \n", + "1216 Law. An individual (natural person n.) or corp... person_nn01 \n", + "1217 Short for clay-pipe n. at compounds 2 (colloq... clay_nn01 \n", + "1218 slang. A house, esp. one used as a brothel. Cf... case_nn02 \n", + "1220 In Cumbria: a cairn marking a summit or promin... man_nn01 \n", + "\n", + " lemma quotation_id \\\n", + "0 body body_nn01-132916428 \n", + "1 man man_nn01-110482440 \n", + "2 body body_nn01-17169857 \n", + "3 machine machine_nn01-38474966 \n", + "4 carcass carcass_nn01-10177295 \n", + "... ... ... 
\n", + "1215 man man_nn01-110479206 \n", + "1216 person person_nn01-30951076 \n", + "1217 clay clay_nn01-9320896 \n", + "1218 case case_nn02-10018191 \n", + "1220 man man_nn01-110487624 \n", + "\n", + " source \\\n", + "0 {'title': 'Material Handling Engin.', 'author'... \n", + "1 {'title': 'Shaela', 'author': 'R. Bulter', 'ge... \n", + "2 {'title': 'Ess. Man', 'author': 'A. Pope', 'ge... \n", + "3 {'title': 'National Trust Mag.', 'author': Non... \n", + "4 {'title': 'New Pract. Builder', 'author': 'P. ... \n", + "... ... \n", + "1215 {'title': 'Four Years S. Afr.', 'author': 'C. ... \n", + "1216 {'title': 'Daily News', 'author': None, 'gende... \n", + "1217 {'title': 'Held in Bondage', 'author': '‘Ouida... \n", + "1218 {'title': 'Mop Fair', 'author': 'A. M. Binstea... \n", + "1220 {'title': 'Northern Affair', 'author': 'D. K. ... \n", + "\n", + " text year \\\n", + "0 {'keyword': 'bodies', 'full_text': 'After car ... 1990.0 \n", + "1 {'keyword': 'Min', 'full_text': 'Min A'm vexed... 1976.0 \n", + "2 {'keyword': 'Body', 'full_text': 'All are but ... 1733.0 \n", + "3 {'keyword': 'machines', 'full_text': 'The cycl... 1992.0 \n", + "4 {'keyword': 'Carcase', 'full_text': 'Carcase o... 1823.0 \n", + "... ... ... \n", + "1215 {'keyword': 'man', 'full_text': 'The wife brok... 1829.0 \n", + "1216 {'keyword': 'persons', 'full_text': 'A Bill..e... 1900.0 \n", + "1217 {'keyword': 'clays', 'full_text': 'Filthy bird... 1863.0 \n", + "1218 {'keyword': 'case', 'full_text': 'They arrange... 1905.0 \n", + "1220 {'keyword': 'man', 'full_text': 'Over the elep... 1964.0 \n", + "\n", + " full_text ... keyword_offset \\\n", + "0 After car bodies are painted, they are moved i... ... 10.0 \n", + "1 Min A'm vexed ta hear yun. ... 0.0 \n", + "2 All are but parts of one stupendous Whole, Who... ... 49.0 \n", + "3 The cyclists..took on the circular 21- or 42-m... ... 92.0 \n", + "4 Carcase of a Building, the naked walls, and th... ... 0.0 \n", + "... ... ... ... \n", + "1215 The wife broke out, ‘You lament a brother, and... ... 79.0 \n", + "1216 A Bill..extending to juridical persons, that i... ... 31.0 \n", + "1217 Filthy bird's-eye, smoked in clays. ... 29.0 \n", + "1218 They arranges to stop ‘private’ in Brighton, a... ... 57.0 \n", + "1220 Over the elephant rocks and under the lee of t... ... 55.0 \n", + "\n", + " vector_bert_base_-1,-2,-3,-4_mean \\\n", + "0 [1.2747291, 0.25178745, 0.69486666, 0.42832682... \n", + "1 [-0.10557328, 0.24347349, 0.731555, -0.4305202... \n", + "2 [0.8197431, 0.04237363, 0.6312159, -0.2658673,... \n", + "3 [-0.18150243, -0.24230756, -0.3336587, 0.34879... \n", + "4 [0.6567496, -0.050804906, 0.31024605, 0.059706... \n", + "... ... \n", + "1215 [-0.07307064, -0.31692728, 0.38834277, -0.2980... \n", + "1216 [0.030711764, 0.28706473, 0.6596842, -0.132111... \n", + "1217 [-0.016634814, 0.6912965, -0.18498293, -0.2104... \n", + "1218 [0.16278893, -0.17927478, 0.34916735, -0.34717... \n", + "1220 [0.12908892, 0.1654679, -0.077464886, -0.44454... \n", + "\n", + " vector_blert_-1,-2,-3,-4_mean label \\\n", + "0 [1.5054287, 1.1386966, 1.3405375, 0.8012274, -... 0 \n", + "1 [-0.49209523, 0.7658461, 0.07512934, 0.0148925... 0 \n", + "2 [0.60478234, 0.58020014, 0.053836707, -0.06571... 0 \n", + "3 [-0.14852196, 0.69629294, 0.30973893, 0.598406... 0 \n", + "4 [0.41240987, 0.10217035, 0.48574266, 0.8627304... 0 \n", + "... ... ... \n", + "1215 [-0.20098017, 0.47577783, 0.013388823, -0.2808... 0 \n", + "1216 [-0.42745396, 0.4621299, 0.34301567, 0.2193956... 
0 \n", + "1217 [-0.2833503, 0.80949837, -0.5981247, 0.4331013... 0 \n", + "1218 [0.3253876, 0.12327082, -0.077930324, 0.450299... 0 \n", + "1220 [-0.4877532, 0.62317544, -0.4543179, -0.167910... 0 \n", + "\n", + " id \\\n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 machine_nn01-38474877 \n", + "4 NaN \n", + "... ... \n", + "1215 NaN \n", + "1216 NaN \n", + "1217 NaN \n", + "1218 NaN \n", + "1220 NaN \n", + "\n", + " daterange \\\n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 {'end': None, 'start': 1823, 'obsolete': False... \n", + "4 NaN \n", + "... ... \n", + "1215 NaN \n", + "1216 NaN \n", + "1217 NaN \n", + "1218 NaN \n", + "1220 NaN \n", + "\n", + " provenance provenance_type \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 [[machine_nn01-38474877, seed, machine_nn01]] seed \n", + "4 NaN NaN \n", + "... ... ... \n", + "1215 NaN NaN \n", + "1216 NaN NaN \n", + "1217 NaN NaN \n", + "1218 NaN NaN \n", + "1220 NaN NaN \n", + "\n", + " relation_to_core_senses relation_to_seed_senses \n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 {machine_nn01-38474877} {machine_nn01-38474877} \n", + "4 NaN NaN \n", + "... ... ... \n", + "1215 NaN NaN \n", + "1216 NaN NaN \n", + "1217 NaN NaN \n", + "1218 NaN NaN \n", + "1220 NaN NaN \n", + "\n", + "[1135 rows x 21 columns]" + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 74 + } + ], + "source": [ + "df_train[df_train.label==\"0\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 75, "metadata": {}, "outputs": [ { @@ -159,7 +512,7 @@ "text/html": "
" }, "metadata": {}, - "execution_count": 6 + "execution_count": 75 } ], "source": [ @@ -168,7 +521,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 76, "metadata": {}, "outputs": [ { @@ -185,7 +538,7 @@ ] }, "metadata": {}, - "execution_count": 7 + "execution_count": 76 } ], "source": [ @@ -194,11 +547,11 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 100, "metadata": {}, "outputs": [], "source": [ - "def enclose_keyword(row,enclose_token='[TARGET]'):\n", + "def enclose_keyword(row,enclose_token='\"'):\n", " \"\"\"enclose keyword with specific token to point\n", " learner towards to word it has to focus on\n", " \"\"\"\n", @@ -211,30 +564,115 @@ " sentence+=c\n", " return sentence\n", "\n", - "def merge_quotation_gloss(row):\n", - " out_string = '[GLOSS] '\n", - " if row.definition:\n", - " out_string+=row.definition\n", - " out_string+=' [QUOT] ' \n", - " if row.enclosed_quotation:\n", - " out_string+=row.enclosed_quotation\n", - " return out_string\n", + "#def merge_quotation_gloss(row):\n", + "# out_string = '[GLOSS] '\n", + "# if row.definition:\n", + "# out_string+=row.definition\n", + "# out_string+=' [QUOT] ' \n", + "# if row.enclosed_quotation:\n", + "# out_string+=row.enclosed_quotation\n", + "# return out_string\n", + "\n", + "#def prep_train_text(row):\n", + "# out_string='[TAGET] '+row.keyword+' [TAGET] : '\n", + "# if row.definition:\n", + "# out_string+=row.definition\n", + "# out_string+=' [SEP] ' \n", + "# if row.enclosed_quotation:\n", + "# out_string+=row.enclosed_quotation\n", + "# return out_string\n", "\n", - "def merge_quotation_keyword(row):\n", - " out_string = '[TARGET] '\n", - " if row.keyword:\n", - " out_string+=row.keyword\n", - " out_string+=' [QUOT] ' \n", - " if row.enclosed_quotation:\n", - " out_string+=row.enclosed_quotation\n", - " return out_string\n", + "#def prep_test_text(row):\n", + "# out_string='[TAGET] '+row.keyword+' [TAGET] : '\n", + "# if row.enclosed_quotation:\n", + "# out_string+=row.enclosed_quotation\n", + "# return out_string\n", "\n", + "#def merge_quotation_keyword(row):\n", + "# out_string = '[TARGET] '\n", + "# if row.keyword:\n", + "# out_string+=row.keyword\n", + "# out_string+=' [QUOT] ' \n", + "# if row.enclosed_quotation:\n", + "# out_string+=row.enclosed_quotation\n", + "# return out_string\n", "\n" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 101, + "metadata": {}, + "outputs": [], + "source": [ + "def to_glossbert_format(df):\n", + " def gloss_string(row, definition):\n", + " out_string=''\n", + " if row.enclosed_quotation:\n", + " out_string+=row.enclosed_quotation\n", + " out_string+=' [SEP] ' \n", + " out_string+=row.keyword+': '\n", + " if row.definition:\n", + " out_string+=definition\n", + " return out_string\n", + "\n", + " df['enclosed_quotations'] = df.apply(enclose_keyword, axis=1)\n", + " \n", + " rows = [] \n", + " for i,row in df.iterrows():\n", + " rows.append([gloss_string(row, row.definition), 1])\n", + " definitions = df[df.lemma==row.lemma].definition.unique()\n", + " for d in definitions:\n", + " if d != row.definition:\n", + " rows.append([gloss_string(row,d), 0])\n", + " \n", + " return rows\n", + "\n", + "df_gloss_train = to_glossbert_format(df_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the 
production schedule with the number of bodies painted a specific color. [SEP] bodies: Particular technical uses. The part of a vehicle fitted to receive the load; spec. the part of a motor car in which driver and passengers sit, or the fuselage of an aeroplane. Cf. cart-body n. at cart n. compounds 2, wide-body n.',\n", + " 1],\n", + " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: Contrasted with the soul. Cf. soul body n. at soul n. compounds 4.',\n", + " 0],\n", + " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: Particular technical uses. The main part of a musical instrument, which in the case of traditional stringed instruments forms a resonating chamber.',\n", + " 0],\n", + " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: The complete physical form of a person or animal; the assemblage of parts, organs, and tissues that constitutes the whole material organism.',\n", + " 0],\n", + " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: A comprehensive and systematic collection of information, or of the details of any subject, esp. law; a textbook, a pandect. Usually with of.',\n", + " 0],\n", + " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: A corpse.',\n", + " 0],\n", + " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: The physical or mortal nature, state, or aspect of man. Frequently in in (the) body, out of (the) body and variants, sometimes contrasted with in spirit.',\n", + " 0],\n", + " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: More widely: a material thing, an object; something that has physical existence and extension in space.',\n", + " 0],\n", + " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: Cell Biology. Any of various normal or abnormal structures found within the cytoplasm or nucleus of a cell. Frequently with distinguishing word.',\n", + " 0],\n", + " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: Originally: †size or bulk; quantity (obsolete). 
In later use: a quantity, mass, or area of something.',\n", + " 0]]" + ] + }, + "metadata": {}, + "execution_count": 102 + } + ], + "source": [ + "df_gloss_train[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ @@ -245,7 +683,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ @@ -255,47 +693,90 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ "df_train['enclosed_quotation'] = df_train.apply(enclose_keyword, axis=1)\n", - "df_train['train_text'] = df_train.apply(merge_quotation_keyword, axis=1)\n", - "df_train[['train_text','label']].to_csv(csv_out_path / \"train.csv\",index = False, sep='\\t') \n", + "df_train['text'] = df_train.apply(prep_train_text, axis=1)\n", + "df_train[['text','label']].to_csv(csv_out_path / \"train.csv\",index = False, sep='\\t') \n", "df_val['enclosed_quotation'] = df_val.apply(enclose_keyword, axis=1)\n", - "df_val['train_text'] = df_val.apply(merge_quotation_keyword, axis=1)\n", - "df_val[['train_text','label']].to_csv(csv_out_path / \"dev.csv\",index = False, sep='\\t') \n", + "df_val['text'] = df_val.apply(prep_test_text, axis=1)\n", + "df_val[['text','label']].to_csv(csv_out_path / \"dev.csv\",index = False, sep='\\t') \n", "df_test['enclosed_quotation'] = df_test.apply(enclose_keyword, axis=1)\n", - "df_test['train_text'] = df_test.apply(merge_quotation_keyword, axis=1)\n", - "df_test[['train_text','label']].to_csv(csv_out_path / \"test.csv\",index = False, sep='\\t') " + "df_test['text'] = df_test.apply(prep_test_text, axis=1)\n", + "df_test[['text','label']].to_csv(csv_out_path / \"test.csv\",index = False, sep='\\t') " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 81, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "387 [TAGET] Men [TAGET] : He canton'd out the Coun...\n", + "1491 [TAGET] earths [TAGET] : Ley-grounds cannot be...\n", + "1841 [TAGET] earth [TAGET] : It is well to see the ...\n", + "1244 [TAGET] person [TAGET] : The administrator..ha...\n", + "1809 [TAGET] earth [TAGET] : While I drove by in my...\n", + " ... \n", + "736 [TAGET] machines [TAGET] : ‘Anyone,’ declared,...\n", + "610 [TAGET] machine [TAGET] : To each mortal perad...\n", + "1612 [TAGET] body [TAGET] : The coffee, we know, st...\n", + "1128 [TAGET] Personalities [TAGET] : Wisdom, Learni...\n", + "1281 [TAGET] person [TAGET] : I'm a people [TARGET]...\n", + "Name: text, Length: 383, dtype: object" + ] + }, + "metadata": {}, + "execution_count": 81 + } + ], + "source": [ + "df_test.text" + ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'[TAGET] bodies [TAGET] : Particular technical uses. The part of a vehicle fitted to receive the load; spec. the part of a motor car in which driver and passengers sit, or the fuselage of an aeroplane. Cf. cart-body n. at cart n. compounds 2, wide-body n. 
[SEP] After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color.'" + ] + }, + "metadata": {}, + "execution_count": 82 + } + ], + "source": [ + "df_train.iloc[0].text" + ] + }, + { + "cell_type": "code", + "execution_count": 83, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ - "2021-01-14 16:01:53,769 Reading data from data/training_data/machine_machine_nn01-38474140\n", - "2021-01-14 16:01:53,770 Train: data/training_data/machine_machine_nn01-38474140/train.csv\n", - "2021-01-14 16:01:53,770 Dev: data/training_data/machine_machine_nn01-38474140/dev.csv\n", - "2021-01-14 16:01:53,770 Test: data/training_data/machine_machine_nn01-38474140/test.csv\n" + "2021-01-14 17:14:29,725 Reading data from data/training_data/machine_machine_nn01-38474140\n", + "2021-01-14 17:14:29,726 Train: data/training_data/machine_machine_nn01-38474140/train.csv\n", + "2021-01-14 17:14:29,727 Dev: data/training_data/machine_machine_nn01-38474140/dev.csv\n", + "2021-01-14 17:14:29,727 Test: data/training_data/machine_machine_nn01-38474140/test.csv\n" ] } ], "source": [ - "from flair.data import Corpus\n", - "from flair.datasets import CSVClassificationCorpus\n", + "\n", "\n", "# this is the folder in which train, test and dev files reside\n", "data_folder = csv_out_path\n", @@ -313,15 +794,15 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 84, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ - "2021-01-14 16:01:56,791 Computing label dictionary. Progress:\n", - "100%|██████████| 1604/1604 [00:00<00:00, 2103.07it/s]2021-01-14 16:01:57,857 [b'0', b'1']\n", + "2021-01-14 17:14:29,745 Computing label dictionary. 
Progress:\n", + "100%|██████████| 1604/1604 [00:01<00:00, 1060.11it/s]2021-01-14 17:14:31,699 [b'0', b'1']\n", "Dictionary with 2 tags: 0, 1\n", "\n" ] @@ -335,25 +816,320 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 85, + "metadata": { + "tags": [ + "outputPrepend" + ] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "eps=1e-12, elementwise_affine=True)\n", + " (ffn): FFN(\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", + " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", + " )\n", + " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " )\n", + " (4): TransformerBlock(\n", + " (attention): MultiHeadSelfAttention(\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", + " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", + " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n", + " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", + " )\n", + " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (ffn): FFN(\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", + " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", + " )\n", + " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " )\n", + " (5): TransformerBlock(\n", + " (attention): MultiHeadSelfAttention(\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", + " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", + " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n", + " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", + " )\n", + " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (ffn): FFN(\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", + " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", + " )\n", + " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (decoder): Linear(in_features=768, out_features=2, bias=True)\n", + " (loss_function): CrossEntropyLoss()\n", + " (beta): 1.0\n", + " (weights): {b'1': 10, b'0': 1}\n", + " (weight_tensor) tensor([1., 1.], device='cuda:0')\n", + ")\"\n", + "2021-01-14 17:14:33,683 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:14:33,684 Corpus: \"Corpus: 1221 train + 306 dev + 383 test sentences\"\n", + "2021-01-14 17:14:33,684 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:14:33,685 Parameters:\n", + "2021-01-14 17:14:33,685 - learning_rate: \"1e-05\"\n", + "2021-01-14 17:14:33,686 - mini_batch_size: \"16\"\n", + "2021-01-14 17:14:33,686 - patience: \"3\"\n", + "2021-01-14 17:14:33,687 - anneal_factor: \"0.5\"\n", + "2021-01-14 17:14:33,687 - max_epochs: \"10\"\n", + "2021-01-14 17:14:33,688 - shuffle: \"True\"\n", + "2021-01-14 17:14:33,688 - train_with_dev: \"False\"\n", + "2021-01-14 17:14:33,689 - batch_growth_annealing: \"False\"\n", + "2021-01-14 17:14:33,690 
----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:14:33,690 Model training base path: \"models/taggers/trec\"\n", + "2021-01-14 17:14:33,691 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:14:33,691 Device: cuda:0\n", + "2021-01-14 17:14:33,692 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:14:33,692 Embeddings storage mode: cpu\n", + "2021-01-14 17:14:33,693 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:14:38,343 epoch 1 - iter 7/77 - loss 0.48898176 - samples/sec: 26.27 - lr: 0.000010\n", + "2021-01-14 17:14:42,454 epoch 1 - iter 14/77 - loss 0.42082447 - samples/sec: 27.53 - lr: 0.000010\n", + "2021-01-14 17:14:46,658 epoch 1 - iter 21/77 - loss 0.33773888 - samples/sec: 26.85 - lr: 0.000010\n", + "2021-01-14 17:14:50,844 epoch 1 - iter 28/77 - loss 0.31560597 - samples/sec: 27.01 - lr: 0.000010\n", + "2021-01-14 17:14:54,998 epoch 1 - iter 35/77 - loss 0.25972683 - samples/sec: 27.14 - lr: 0.000010\n", + "2021-01-14 17:14:59,209 epoch 1 - iter 42/77 - loss 0.23569006 - samples/sec: 26.75 - lr: 0.000010\n", + "2021-01-14 17:15:03,408 epoch 1 - iter 49/77 - loss 0.24985709 - samples/sec: 26.85 - lr: 0.000010\n", + "2021-01-14 17:15:07,633 epoch 1 - iter 56/77 - loss 0.23229837 - samples/sec: 26.77 - lr: 0.000010\n", + "2021-01-14 17:15:11,797 epoch 1 - iter 63/77 - loss 0.23326370 - samples/sec: 27.06 - lr: 0.000010\n", + "2021-01-14 17:15:16,012 epoch 1 - iter 70/77 - loss 0.21914055 - samples/sec: 26.79 - lr: 0.000010\n", + "2021-01-14 17:15:19,739 epoch 1 - iter 77/77 - loss 0.20128365 - samples/sec: 30.18 - lr: 0.000010\n", + "2021-01-14 17:15:19,814 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:15:19,814 EPOCH 1 done: loss 0.2013 - lr 0.0000100\n", + "2021-01-14 17:15:23,961 DEV : loss 0.33609411120414734 - score 0.9281\n", + "2021-01-14 17:15:24,224 BAD EPOCHS (no improvement): 0\n", + "saving best model\n", + "2021-01-14 17:15:25,155 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:15:29,794 epoch 2 - iter 7/77 - loss 0.00740182 - samples/sec: 26.44 - lr: 0.000010\n", + "2021-01-14 17:15:34,024 epoch 2 - iter 14/77 - loss 0.03390250 - samples/sec: 26.74 - lr: 0.000010\n", + "2021-01-14 17:15:38,226 epoch 2 - iter 21/77 - loss 0.03571354 - samples/sec: 26.91 - lr: 0.000010\n", + "2021-01-14 17:15:42,369 epoch 2 - iter 28/77 - loss 0.03136397 - samples/sec: 27.22 - lr: 0.000010\n", + "2021-01-14 17:15:46,510 epoch 2 - iter 35/77 - loss 0.03331735 - samples/sec: 27.22 - lr: 0.000010\n", + "2021-01-14 17:15:50,650 epoch 2 - iter 42/77 - loss 0.07917234 - samples/sec: 27.26 - lr: 0.000010\n", + "2021-01-14 17:15:54,832 epoch 2 - iter 49/77 - loss 0.07227532 - samples/sec: 26.94 - lr: 0.000010\n", + "2021-01-14 17:15:59,093 epoch 2 - iter 56/77 - loss 0.06382573 - samples/sec: 26.50 - lr: 0.000010\n", + "2021-01-14 17:16:03,303 epoch 2 - iter 63/77 - loss 0.08917253 - samples/sec: 26.79 - lr: 0.000010\n", + "2021-01-14 17:16:07,583 epoch 2 - iter 70/77 - loss 0.08041374 - samples/sec: 26.33 - lr: 0.000010\n", + "2021-01-14 17:16:11,374 epoch 2 - iter 77/77 - loss 0.08118116 - samples/sec: 29.72 - lr: 0.000010\n", + "2021-01-14 17:16:11,437 
----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:16:11,438 EPOCH 2 done: loss 0.0812 - lr 0.0000100\n", + "2021-01-14 17:16:15,649 DEV : loss 0.5065702795982361 - score 0.9248\n", + "2021-01-14 17:16:15,909 BAD EPOCHS (no improvement): 1\n", + "2021-01-14 17:16:15,910 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:16:20,491 epoch 3 - iter 7/77 - loss 0.00894024 - samples/sec: 26.72 - lr: 0.000010\n", + "2021-01-14 17:16:24,744 epoch 3 - iter 14/77 - loss 0.04298888 - samples/sec: 26.62 - lr: 0.000010\n", + "2021-01-14 17:16:29,099 epoch 3 - iter 21/77 - loss 0.05707598 - samples/sec: 25.96 - lr: 0.000010\n", + "2021-01-14 17:16:33,430 epoch 3 - iter 28/77 - loss 0.04700577 - samples/sec: 26.10 - lr: 0.000010\n", + "2021-01-14 17:16:37,615 epoch 3 - iter 35/77 - loss 0.03774460 - samples/sec: 26.92 - lr: 0.000010\n", + "2021-01-14 17:16:41,848 epoch 3 - iter 42/77 - loss 0.03161711 - samples/sec: 26.63 - lr: 0.000010\n", + "2021-01-14 17:16:46,019 epoch 3 - iter 49/77 - loss 0.02749447 - samples/sec: 27.00 - lr: 0.000010\n", + "2021-01-14 17:16:50,152 epoch 3 - iter 56/77 - loss 0.02414880 - samples/sec: 27.36 - lr: 0.000010\n", + "2021-01-14 17:16:54,291 epoch 3 - iter 63/77 - loss 0.02319205 - samples/sec: 27.22 - lr: 0.000010\n", + "2021-01-14 17:16:58,450 epoch 3 - iter 70/77 - loss 0.02129739 - samples/sec: 27.15 - lr: 0.000010\n", + "2021-01-14 17:17:02,224 epoch 3 - iter 77/77 - loss 0.01944040 - samples/sec: 29.79 - lr: 0.000010\n", + "2021-01-14 17:17:02,272 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:17:02,273 EPOCH 3 done: loss 0.0194 - lr 0.0000100\n", + "2021-01-14 17:17:06,524 DEV : loss 0.49905064702033997 - score 0.9216\n", + "2021-01-14 17:17:06,786 BAD EPOCHS (no improvement): 2\n", + "2021-01-14 17:17:06,787 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:17:11,358 epoch 4 - iter 7/77 - loss 0.00185599 - samples/sec: 26.75 - lr: 0.000010\n", + "2021-01-14 17:17:15,541 epoch 4 - iter 14/77 - loss 0.00136404 - samples/sec: 27.04 - lr: 0.000010\n", + "2021-01-14 17:17:19,712 epoch 4 - iter 21/77 - loss 0.00128906 - samples/sec: 27.07 - lr: 0.000010\n", + "2021-01-14 17:17:23,840 epoch 4 - iter 28/77 - loss 0.00118582 - samples/sec: 27.40 - lr: 0.000010\n", + "2021-01-14 17:17:28,020 epoch 4 - iter 35/77 - loss 0.00106619 - samples/sec: 26.98 - lr: 0.000010\n", + "2021-01-14 17:17:32,295 epoch 4 - iter 42/77 - loss 0.00094899 - samples/sec: 26.41 - lr: 0.000010\n", + "2021-01-14 17:17:36,522 epoch 4 - iter 49/77 - loss 0.00087282 - samples/sec: 26.66 - lr: 0.000010\n", + "2021-01-14 17:17:40,684 epoch 4 - iter 56/77 - loss 0.00097088 - samples/sec: 27.12 - lr: 0.000010\n", + "2021-01-14 17:17:44,912 epoch 4 - iter 63/77 - loss 0.00087886 - samples/sec: 26.69 - lr: 0.000010\n", + "2021-01-14 17:17:49,112 epoch 4 - iter 70/77 - loss 0.00110282 - samples/sec: 26.91 - lr: 0.000010\n", + "2021-01-14 17:17:52,950 epoch 4 - iter 77/77 - loss 0.00101001 - samples/sec: 29.33 - lr: 0.000010\n", + "2021-01-14 17:17:53,013 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:17:53,014 EPOCH 4 done: loss 0.0010 - lr 0.0000100\n", + "2021-01-14 17:17:57,333 DEV : loss 0.5981439352035522 - score 0.9216\n", + "2021-01-14 
17:17:57,595 BAD EPOCHS (no improvement): 3\n", + "2021-01-14 17:17:57,596 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:18:02,196 epoch 5 - iter 7/77 - loss 0.00060182 - samples/sec: 26.72 - lr: 0.000010\n", + "2021-01-14 17:18:06,460 epoch 5 - iter 14/77 - loss 0.00037103 - samples/sec: 26.48 - lr: 0.000010\n", + "2021-01-14 17:18:10,664 epoch 5 - iter 21/77 - loss 0.00028840 - samples/sec: 26.89 - lr: 0.000010\n", + "2021-01-14 17:18:14,811 epoch 5 - iter 28/77 - loss 0.00025692 - samples/sec: 27.23 - lr: 0.000010\n", + "2021-01-14 17:18:18,940 epoch 5 - iter 35/77 - loss 0.00024187 - samples/sec: 27.30 - lr: 0.000010\n", + "2021-01-14 17:18:23,111 epoch 5 - iter 42/77 - loss 0.00021889 - samples/sec: 27.08 - lr: 0.000010\n", + "2021-01-14 17:18:27,299 epoch 5 - iter 49/77 - loss 0.00035217 - samples/sec: 26.95 - lr: 0.000010\n", + "2021-01-14 17:18:31,425 epoch 5 - iter 56/77 - loss 0.00032686 - samples/sec: 27.30 - lr: 0.000010\n", + "2021-01-14 17:18:35,586 epoch 5 - iter 63/77 - loss 0.00029595 - samples/sec: 27.13 - lr: 0.000010\n", + "2021-01-14 17:18:39,774 epoch 5 - iter 70/77 - loss 0.00027235 - samples/sec: 26.91 - lr: 0.000010\n", + "2021-01-14 17:18:43,540 epoch 5 - iter 77/77 - loss 0.00028631 - samples/sec: 29.89 - lr: 0.000010\n", + "2021-01-14 17:18:43,601 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:18:43,602 EPOCH 5 done: loss 0.0003 - lr 0.0000100\n", + "2021-01-14 17:18:46,910 DEV : loss 0.6656511425971985 - score 0.9248\n", + "Epoch 5: reducing learning rate of group 0 to 5.0000e-06.\n", + "2021-01-14 17:18:47,170 BAD EPOCHS (no improvement): 4\n", + "2021-01-14 17:18:47,171 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:18:53,782 epoch 6 - iter 7/77 - loss 0.00022932 - samples/sec: 25.95 - lr: 0.000005\n", + "2021-01-14 17:18:57,949 epoch 6 - iter 14/77 - loss 0.00016531 - samples/sec: 27.13 - lr: 0.000005\n", + "2021-01-14 17:19:02,280 epoch 6 - iter 21/77 - loss 0.00022724 - samples/sec: 25.99 - lr: 0.000005\n", + "2021-01-14 17:19:06,568 epoch 6 - iter 28/77 - loss 0.00026307 - samples/sec: 26.27 - lr: 0.000005\n", + "2021-01-14 17:19:10,654 epoch 6 - iter 35/77 - loss 0.00023027 - samples/sec: 27.57 - lr: 0.000005\n", + "2021-01-14 17:19:14,851 epoch 6 - iter 42/77 - loss 0.00021121 - samples/sec: 26.89 - lr: 0.000005\n", + "2021-01-14 17:19:19,029 epoch 6 - iter 49/77 - loss 0.00019923 - samples/sec: 26.94 - lr: 0.000005\n", + "2021-01-14 17:19:23,229 epoch 6 - iter 56/77 - loss 0.00018661 - samples/sec: 26.87 - lr: 0.000005\n", + "2021-01-14 17:19:27,356 epoch 6 - iter 63/77 - loss 0.00017462 - samples/sec: 27.26 - lr: 0.000005\n", + "2021-01-14 17:19:31,517 epoch 6 - iter 70/77 - loss 0.00016146 - samples/sec: 27.04 - lr: 0.000005\n", + "2021-01-14 17:19:35,455 epoch 6 - iter 77/77 - loss 0.00015018 - samples/sec: 28.60 - lr: 0.000005\n", + "2021-01-14 17:19:35,526 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:19:35,527 EPOCH 6 done: loss 0.0002 - lr 0.0000050\n", + "2021-01-14 17:19:39,223 DEV : loss 0.6893778443336487 - score 0.9248\n", + "2021-01-14 17:19:39,484 BAD EPOCHS (no improvement): 1\n", + "2021-01-14 17:19:39,486 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:19:44,844 
epoch 7 - iter 7/77 - loss 0.00019211 - samples/sec: 27.26 - lr: 0.000005\n", + "2021-01-14 17:19:49,028 epoch 7 - iter 14/77 - loss 0.00066268 - samples/sec: 26.98 - lr: 0.000005\n", + "2021-01-14 17:19:53,253 epoch 7 - iter 21/77 - loss 0.00053614 - samples/sec: 26.67 - lr: 0.000005\n", + "2021-01-14 17:19:57,430 epoch 7 - iter 28/77 - loss 0.00043514 - samples/sec: 27.03 - lr: 0.000005\n", + "2021-01-14 17:20:01,594 epoch 7 - iter 35/77 - loss 0.00036258 - samples/sec: 27.03 - lr: 0.000005\n", + "2021-01-14 17:20:05,829 epoch 7 - iter 42/77 - loss 0.00031573 - samples/sec: 26.60 - lr: 0.000005\n", + "2021-01-14 17:20:10,013 epoch 7 - iter 49/77 - loss 0.00028645 - samples/sec: 26.94 - lr: 0.000005\n", + "2021-01-14 17:20:14,238 epoch 7 - iter 56/77 - loss 0.00025793 - samples/sec: 26.67 - lr: 0.000005\n", + "2021-01-14 17:20:18,381 epoch 7 - iter 63/77 - loss 0.00023890 - samples/sec: 27.16 - lr: 0.000005\n", + "2021-01-14 17:20:22,569 epoch 7 - iter 70/77 - loss 0.00021868 - samples/sec: 26.92 - lr: 0.000005\n", + "2021-01-14 17:20:26,365 epoch 7 - iter 77/77 - loss 0.00020789 - samples/sec: 29.64 - lr: 0.000005\n", + "2021-01-14 17:20:26,443 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:20:26,443 EPOCH 7 done: loss 0.0002 - lr 0.0000050\n", + "2021-01-14 17:20:29,751 DEV : loss 0.6999250054359436 - score 0.9248\n", + "2021-01-14 17:20:30,015 BAD EPOCHS (no improvement): 2\n", + "2021-01-14 17:20:30,016 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:20:34,509 epoch 8 - iter 7/77 - loss 0.00007416 - samples/sec: 27.22 - lr: 0.000005\n", + "2021-01-14 17:20:39,619 epoch 8 - iter 14/77 - loss 0.00005522 - samples/sec: 26.82 - lr: 0.000005\n", + "2021-01-14 17:20:43,900 epoch 8 - iter 21/77 - loss 0.00006253 - samples/sec: 26.35 - lr: 0.000005\n", + "2021-01-14 17:20:48,020 epoch 8 - iter 28/77 - loss 0.00015550 - samples/sec: 27.36 - lr: 0.000005\n", + "2021-01-14 17:20:52,243 epoch 8 - iter 35/77 - loss 0.00013324 - samples/sec: 26.66 - lr: 0.000005\n", + "2021-01-14 17:20:56,397 epoch 8 - iter 42/77 - loss 0.00012891 - samples/sec: 27.08 - lr: 0.000005\n", + "2021-01-14 17:21:00,493 epoch 8 - iter 49/77 - loss 0.00012485 - samples/sec: 27.50 - lr: 0.000005\n", + "2021-01-14 17:21:04,731 epoch 8 - iter 56/77 - loss 0.00014117 - samples/sec: 26.59 - lr: 0.000005\n", + "2021-01-14 17:21:08,909 epoch 8 - iter 63/77 - loss 0.00013634 - samples/sec: 26.95 - lr: 0.000005\n", + "2021-01-14 17:21:13,121 epoch 8 - iter 70/77 - loss 0.00012635 - samples/sec: 26.74 - lr: 0.000005\n", + "2021-01-14 17:21:16,967 epoch 8 - iter 77/77 - loss 0.00012047 - samples/sec: 29.24 - lr: 0.000005\n", + "2021-01-14 17:21:17,026 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:21:17,027 EPOCH 8 done: loss 0.0001 - lr 0.0000050\n", + "2021-01-14 17:21:20,598 DEV : loss 0.7107362747192383 - score 0.9248\n", + "2021-01-14 17:21:20,864 BAD EPOCHS (no improvement): 3\n", + "2021-01-14 17:21:20,865 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:21:26,269 epoch 9 - iter 7/77 - loss 0.00002916 - samples/sec: 26.76 - lr: 0.000005\n", + "2021-01-14 17:21:30,543 epoch 9 - iter 14/77 - loss 0.00006615 - samples/sec: 26.39 - lr: 0.000005\n", + "2021-01-14 17:21:34,808 epoch 9 - iter 21/77 - loss 0.00006364 - samples/sec: 26.48 - lr: 
0.000005\n", + "2021-01-14 17:21:38,963 epoch 9 - iter 28/77 - loss 0.00013939 - samples/sec: 27.14 - lr: 0.000005\n", + "2021-01-14 17:21:43,165 epoch 9 - iter 35/77 - loss 0.00012621 - samples/sec: 26.86 - lr: 0.000005\n", + "2021-01-14 17:21:47,317 epoch 9 - iter 42/77 - loss 0.00012101 - samples/sec: 27.09 - lr: 0.000005\n", + "2021-01-14 17:21:51,532 epoch 9 - iter 49/77 - loss 0.00018356 - samples/sec: 26.78 - lr: 0.000005\n", + "2021-01-14 17:21:56,114 epoch 9 - iter 56/77 - loss 0.00016344 - samples/sec: 26.01 - lr: 0.000005\n", + "2021-01-14 17:22:00,349 epoch 9 - iter 63/77 - loss 0.00015143 - samples/sec: 26.58 - lr: 0.000005\n", + "2021-01-14 17:22:04,591 epoch 9 - iter 70/77 - loss 0.00013992 - samples/sec: 26.55 - lr: 0.000005\n", + "2021-01-14 17:22:08,380 epoch 9 - iter 77/77 - loss 0.00012958 - samples/sec: 29.74 - lr: 0.000005\n", + "2021-01-14 17:22:08,443 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:22:08,444 EPOCH 9 done: loss 0.0001 - lr 0.0000050\n", + "2021-01-14 17:22:11,739 DEV : loss 0.7207703590393066 - score 0.9248\n", + "Epoch 9: reducing learning rate of group 0 to 2.5000e-06.\n", + "2021-01-14 17:22:12,003 BAD EPOCHS (no improvement): 4\n", + "2021-01-14 17:22:12,004 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:22:16,599 epoch 10 - iter 7/77 - loss 0.00004176 - samples/sec: 26.48 - lr: 0.000003\n", + "2021-01-14 17:22:21,730 epoch 10 - iter 14/77 - loss 0.00004900 - samples/sec: 26.15 - lr: 0.000003\n", + "2021-01-14 17:22:25,925 epoch 10 - iter 21/77 - loss 0.00005921 - samples/sec: 26.94 - lr: 0.000003\n", + "2021-01-14 17:22:30,112 epoch 10 - iter 28/77 - loss 0.00005456 - samples/sec: 26.94 - lr: 0.000003\n", + "2021-01-14 17:22:34,389 epoch 10 - iter 35/77 - loss 0.00004909 - samples/sec: 26.34 - lr: 0.000003\n", + "2021-01-14 17:22:38,546 epoch 10 - iter 42/77 - loss 0.00004503 - samples/sec: 27.15 - lr: 0.000003\n", + "2021-01-14 17:22:42,757 epoch 10 - iter 49/77 - loss 0.00004776 - samples/sec: 26.71 - lr: 0.000003\n", + "2021-01-14 17:22:46,803 epoch 10 - iter 56/77 - loss 0.00004461 - samples/sec: 27.81 - lr: 0.000003\n", + "2021-01-14 17:22:51,063 epoch 10 - iter 63/77 - loss 0.00004382 - samples/sec: 26.47 - lr: 0.000003\n", + "2021-01-14 17:22:55,173 epoch 10 - iter 70/77 - loss 0.00005728 - samples/sec: 27.40 - lr: 0.000003\n", + "2021-01-14 17:22:58,941 epoch 10 - iter 77/77 - loss 0.00005446 - samples/sec: 29.83 - lr: 0.000003\n", + "2021-01-14 17:22:58,988 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:22:58,989 EPOCH 10 done: loss 0.0001 - lr 0.0000025\n", + "2021-01-14 17:23:02,304 DEV : loss 0.7250329852104187 - score 0.9248\n", + "2021-01-14 17:23:02,567 BAD EPOCHS (no improvement): 1\n", + "2021-01-14 17:23:04,650 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:23:04,651 Testing using best model ...\n", + "2021-01-14 17:23:04,653 loading file models/taggers/trec/best-model.pt\n", + "2021-01-14 17:23:09,692 \t0.9295\n", + "2021-01-14 17:23:09,693 \n", + "Results:\n", + "- F-score (micro) 0.9295\n", + "- F-score (macro) 0.4817\n", + "- Accuracy 0.9295\n", + "\n", + "By class:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.9295 1.0000 0.9635 356\n", + " 1 0.0000 0.0000 0.0000 27\n", + "\n", + " micro avg 0.9295 0.9295 0.9295 
383\n",
+ "   macro avg     0.4648    0.5000    0.4817       383\n",
+ "weighted avg     0.8640    0.9295    0.8955       383\n",
+ " samples avg     0.9295    0.9295    0.9295       383\n",
+ "\n",
+ "2021-01-14 17:23:09,694 ----------------------------------------------------------------------------------------------------\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "{'test_score': 0.9295,\n",
+ " 'dev_score_history': [0.9281,\n",
+ "  0.9248,\n",
+ "  0.9216,\n",
+ "  0.9216,\n",
+ "  0.9248,\n",
+ "  0.9248,\n",
+ "  0.9248,\n",
+ "  0.9248,\n",
+ "  0.9248,\n",
+ "  0.9248],\n",
+ " 'train_loss_history': [0.20128365170646023,\n",
+ "  0.08118115926717782,\n",
+ "  0.019440397426679537,\n",
+ "  0.0010100115429271352,\n",
+ "  0.00028631362048062414,\n",
+ "  0.00015017583772733613,\n",
+ "  0.0002078855192506468,\n",
+ "  0.00012047414655809278,\n",
+ "  0.00012958204591429078,\n",
+ "  5.446238951249556e-05],\n",
+ " 'dev_loss_history': [0.33609411120414734,\n",
+ "  0.5065702795982361,\n",
+ "  0.49905064702033997,\n",
+ "  0.5981439352035522,\n",
+ "  0.6656511425971985,\n",
+ "  0.6893778443336487,\n",
+ "  0.6999250054359436,\n",
+ "  0.7107362747192383,\n",
+ "  0.7207703590393066,\n",
+ "  0.7250329852104187]}"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 85
+ }
+ ],
 "source": [
- "from torch.optim.adam import Adam\n",
- "from flair.data import Corpus\n",
- "from flair.datasets import TREC_6\n",
- "from flair.embeddings import TransformerDocumentEmbeddings\n",
- "from flair.models import TextClassifier\n",
- "from flair.trainers import ModelTrainer\n",
- "\n",
- "\n",
- "\n",
 "\n",
 "# 3. initialize transformer document embeddings (many models are available)\n",
 "document_embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased', fine_tune=True)\n",
 "\n",
 "# 4. create the text classifier\n",
- "classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)\n",
+ "classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, loss_weights={b\"1\":10, b\"0\":1}) # loss_weights={\"1\":10, \"0\":1}\n",
 "\n",
 "# 5. initialize the text classifier trainer with Adam optimizer\n",
 "trainer = ModelTrainer(classifier, corpus, optimizer=Adam)\n",
@@ -363,10 +1139,24 @@
 " learning_rate=1e-5, # use very small learning rate\n",
 " mini_batch_size=16,\n",
 " mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine\n",
- " max_epochs=5, # terminate after 5 epochs\n",
+ " max_epochs=10, # terminate after 10 epochs\n",
 " )"
 ]
 },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
 {
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [

From ab0783ba0f886dc3e4a86bcb8733b37e596e1ecc Mon Sep 17 00:00:00 2001
From: kasparvonbeelen
Date: Fri, 15 Jan 2021 10:46:55 +0000
Subject: [PATCH 04/10] implement glossbert

Recast the task as GlossBERT-style sentence-pair classification: each
quotation is paired with a candidate sense gloss ("quotation [SEP]
target: gloss") and labelled by whether the gloss matches the sense
used in context.

---
 114.1 - review notebook - glossbert.ipynb | 1109 +--------------------
 tasks/wsd_gloss.py                        |  106 ++
 2 files changed, 125 insertions(+), 1090 deletions(-)
 create mode 100644 tasks/wsd_gloss.py

diff --git a/114.1 - review notebook - glossbert.ipynb b/114.1 - review notebook - glossbert.ipynb
index 60288ba..cb3c924 100644
--- a/114.1 - review notebook - glossbert.ipynb
+++ b/114.1 - review notebook - glossbert.ipynb
@@ -34,1129 +34,58 @@
 },
 {
 "cell_type": "code",
- "execution_count": 22,
+ "execution_count": 3,
 "metadata": {},
 "outputs": [],
 "source": [
- "%matplotlib inline\n",
- "import pickle\n",
- "import pandas as pd\n",
- "from tasks import wsd\n",
- "from pathlib import Path\n",
- "from tasks import wsd\n",
- "from utils import nlp_tools\n",
- "from tqdm.auto import tqdm\n",
- "import numpy as np\n",
- "import json\n",
- "from sklearn.metrics import classification_report\n",
- "from flair.embeddings import TransformerWordEmbeddings\n",
- "from utils.dataset_download import harvest_data_from_extended_senses\n",
- "from utils.classificaton_utils import binarize, vectorize_target_expressions,cosine_similiarity,eval_lemma\n",
- "from torch.optim.adam import Adam\n",
- "from flair.datasets import CSVClassificationCorpus\n",
- "from flair.data import Corpus\n",
- "from flair.datasets import TREC_6\n",
- "from flair.embeddings import TransformerDocumentEmbeddings\n",
- "from flair.models import TextClassifier\n",
- "from flair.trainers import ModelTrainer"
+ "from tasks.wsd_gloss import create_glossbert_data, train_glossbert"
 ]
 },
 {
 "cell_type": "code",
- "execution_count": 70,
+ "execution_count": 4,
 "metadata": {},
 "outputs": [],
 "source": [
 "lemma = 'machine'\n",
 "pos = 'NN'\n",
- "senses = {'machine_nn01-38474140'} # machine_nn01-38475772 machine_nn01-38475923 machine_nn01-38475835 machine_nn01-38474140\n",
- "relations = ['seed','synonym'] # ,'descendant','sibling'\n",
- "eval_mode = \"lemma_etal\" # lemma or lemma_etal\n",
 " "
 ]
 },
 {
 "cell_type": "code",
- "execution_count": 71,
+ "execution_count": 6,
 "metadata": {},
 "outputs": [
 {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "# senses before filtering by date = 517\n",
- "# senses after filtering by date = 433\n",
- "\n",
- "\n",
- "# of seed senses 26 \n",
- "# of synonyms 383 \n",
- "# of branch senses 0\n",
- "\n",
- "\n",
- "# of seeds selected 1 \n",
- "# of synonyms selected 44 \n",
- "# of branches selected 0\n",
- "[LOG] #rows before removing None vector (1947, 21)\n",
- "[LOG] #rows after removing None vector (1911, 21)\n"
+ "output_type": "error",
+ "ename": "AttributeError",
+ "evalue": "'Series' object has no attribute 'enclosed_quotation'",
+ "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcreate_glossbert_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlemma\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mpos\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/deezy_datadrive/kaspar-playground/dictionary_expansion/HistoricalDictionaryExpansion/tasks/wsd_gloss.py\u001b[0m in \u001b[0;36mcreate_glossbert_data\u001b[0;34m(lemma, pos)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0mdf_quotations\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_quotations\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m~\u001b[0m\u001b[0mdf_quotations\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeyword_offset\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mdf_quotations\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_quotations\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m~\u001b[0m\u001b[0mdf_quotations\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdefinition\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreset_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 62\u001b[0;31m \u001b[0mdf_glossbert\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mto_glossbert_format\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_quotations\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfrac\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1.0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreset_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 63\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain_test_split\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_glossbert\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m42\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mshuffle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstratify\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_glossbert\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/deezy_datadrive/kaspar-playground/dictionary_expansion/HistoricalDictionaryExpansion/tasks/wsd_gloss.py\u001b[0m in \u001b[0;36mto_glossbert_format\u001b[0;34m(df)\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0mrows\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m 
\u001b[0;34m,\u001b[0m\u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miterrows\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 47\u001b[0;31m \u001b[0mrows\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mgloss_string\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdefinition\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Yes\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msense_id\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 48\u001b[0m \u001b[0mdefinitions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlemma\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlemma\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdefinition\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munique\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0md\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdefinitions\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/deezy_datadrive/kaspar-playground/dictionary_expansion/HistoricalDictionaryExpansion/tasks/wsd_gloss.py\u001b[0m in \u001b[0;36mgloss_string\u001b[0;34m(row, definition)\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0mout_string\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m''\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 35\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menclosed_quotation\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 36\u001b[0m \u001b[0mout_string\u001b[0m\u001b[0;34m+=\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menclosed_quotation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0mout_string\u001b[0m\u001b[0;34m+=\u001b[0m\u001b[0;34m' [SEP] '\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 5272\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5273\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5274\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 
5275\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5276\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'Series' object has no attribute 'enclosed_quotation'" ] } ], "source": [ - "df_train, df_val, df_test = binarize(lemma,\n", - " pos,\n", - " senses, \n", - " relations,\n", - " strict_filter=True,\n", - " start=1700,\n", - " end=2000,\n", - " eval_mode=eval_mode)" + "data_path = create_glossbert_data(lemma,pos)" ] }, { "cell_type": "code", - "execution_count": 72, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " sense_id \\\n", - "238 machine_nn01-38474140 \n", - "305 machine_nn01-38474140 \n", - "713 machine_nn01-38474140 \n", - "938 machine_nn01-38474140 \n", - "1042 machine_nn01-38474140 \n", - "1056 machine_nn01-38474140 \n", - "\n", - " lemma_definition \\\n", - "238 A complex device, consisting of a number of in... \n", - "305 A complex device, consisting of a number of in... \n", - "713 A complex device, consisting of a number of in... \n", - "938 A complex device, consisting of a number of in... \n", - "1042 A complex device, consisting of a number of in... \n", - "1056 A complex device, consisting of a number of in... \n", - "\n", - " definition word_id \\\n", - "238 A living body, esp. the human body considered ... machine_nn01 \n", - "305 A living body, esp. the human body considered ... machine_nn01 \n", - "713 A living body, esp. the human body considered ... machine_nn01 \n", - "938 A living body, esp. the human body considered ... machine_nn01 \n", - "1042 A living body, esp. the human body considered ... machine_nn01 \n", - "1056 A living body, esp. the human body considered ... machine_nn01 \n", - "\n", - " lemma quotation_id \\\n", - "238 machine machine_nn01-38474169 \n", - "305 machine machine_nn01-38474177 \n", - "713 machine machine_nn01-38474195 \n", - "938 machine machine_nn01-38474223 \n", - "1042 machine machine_nn01-38474203 \n", - "1056 machine machine_nn01-38474212 \n", - "\n", - " source \\\n", - "238 {'title': 'Death's Vision', 'author': 'J. Reyn... \n", - "305 {'title': 'Spectator', 'author': 'J. Addison',... \n", - "713 {'title': 'Med. & Physical Jrnl.', 'author': N... \n", - "938 {'title': 'Of Human Bondage', 'author': 'W. S.... \n", - "1042 {'title': 'Poems', 'author': 'W. Wordsworth', ... \n", - "1056 {'title': 'Telegraphy', 'author': 'W. H. Preec... \n", - "\n", - " text year \\\n", - "238 {'keyword': 'Machins', 'full_text': 'What Nobl... 1709.0 \n", - "305 {'keyword': 'Machine', 'full_text': 'Cheerfuln... 1712.0 \n", - "713 {'keyword': 'machine', 'full_text': 'When a pr... 1805.0 \n", - "938 {'keyword': 'machine', 'full_text': 'He wonder... 1915.0 \n", - "1042 {'keyword': 'machine', 'full_text': 'And now I... 1807.0 \n", - "1056 {'keyword': 'machine', 'full_text': 'The human... 1876.0 \n", - "\n", - " full_text ... keyword_offset \\\n", - "238 What Nobler Souls the Nobler Machins Wear. ... 29.0 \n", - "305 Cheerfulness is..the best Promoter of Health. ... ... 70.0 \n", - "713 When a product of diseased action has been eff... ... 82.0 \n", - "938 He wondered whether at the very end, now that ... ... 
50.0 \n", - "1042 And now I see with eye serene The very pulse o... ... 52.0 \n", - "1056 The human machine tires, and as a consequence ... ... 10.0 \n", - "\n", - " vector_bert_base_-1,-2,-3,-4_mean \\\n", - "238 [0.5628562, -0.04788875, 0.074935675, -0.22630... \n", - "305 [0.0052292813, 0.12355395, 0.023108626, 0.2251... \n", - "713 [0.25928053, 0.049638785, 0.022315167, 0.34901... \n", - "938 [0.38040048, 0.38440758, 0.45397452, 0.1211486... \n", - "1042 [-0.46428305, 0.013232344, -0.595714, 0.049642... \n", - "1056 [0.6930934, 0.09074756, -0.13974331, 0.1105655... \n", - "\n", - " vector_blert_-1,-2,-3,-4_mean label \\\n", - "238 [-0.15516208, 0.289941, -0.15124893, -0.206332... 1 \n", - "305 [-0.04755735, 0.20182909, 0.33001357, -0.04851... 1 \n", - "713 [-0.16033216, -0.16846322, 0.5062964, 0.102019... 1 \n", - "938 [-0.059219074, 0.23112743, 0.42189148, 0.02944... 1 \n", - "1042 [0.021248298, 0.28699854, 0.24638082, -0.01793... 1 \n", - "1056 [0.11798739, -0.0029160888, 0.29418808, -0.076... 1 \n", - "\n", - " id \\\n", - "238 machine_nn01-38474140 \n", - "305 machine_nn01-38474140 \n", - "713 machine_nn01-38474140 \n", - "938 machine_nn01-38474140 \n", - "1042 machine_nn01-38474140 \n", - "1056 machine_nn01-38474140 \n", - "\n", - " daterange \\\n", - "238 {'end': None, 'start': 1604, 'obsolete': False... \n", - "305 {'end': None, 'start': 1604, 'obsolete': False... \n", - "713 {'end': None, 'start': 1604, 'obsolete': False... \n", - "938 {'end': None, 'start': 1604, 'obsolete': False... \n", - "1042 {'end': None, 'start': 1604, 'obsolete': False... \n", - "1056 {'end': None, 'start': 1604, 'obsolete': False... \n", - "\n", - " provenance provenance_type \\\n", - "238 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", - "305 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", - "713 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", - "938 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", - "1042 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", - "1056 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", - "\n", - " relation_to_core_senses relation_to_seed_senses \n", - "238 {machine_nn01-38474140} {machine_nn01-38474140} \n", - "305 {machine_nn01-38474140} {machine_nn01-38474140} \n", - "713 {machine_nn01-38474140} {machine_nn01-38474140} \n", - "938 {machine_nn01-38474140} {machine_nn01-38474140} \n", - "1042 {machine_nn01-38474140} {machine_nn01-38474140} \n", - "1056 {machine_nn01-38474140} {machine_nn01-38474140} \n", - "\n", - "[6 rows x 21 columns]" - ], - "text/html": "
[text/html output elided: HTML rendering of the same dataframe as the text/plain output above, 6 rows × 21 columns]
" - }, - "metadata": {}, - "execution_count": 72 - } - ], - "source": [ - "df_train[df_train.sense_id=='machine_nn01-38474140']" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "array(['body_nn01', 'man_nn01', 'machine_nn01', 'carcass_nn01',\n", - " 'person_nn01', 'case_nn02', 'personage_nn01', 'corporeity_nn01',\n", - " 'structure_nn01', 'dust_nn01', 'case_nn01', 'automaton_nn01',\n", - " 'earth_nn01', 'soma_nn02', 'bulk_nn01', 'microcosm_nn01',\n", - " 'personality_nn01', 'tabernacle_nn01', 'vessel_nn01',\n", - " 'corpse_nn01', 'case_nn04', 'clay_nn01', 'clod_nn01',\n", - " 'skinful_nn01', 'carrion_nn01', 'embodiment_nn01', 'corpus_nn01',\n", - " 'flesh_nn01', 'soma_nn01', 'bloodbulk_nn01', 'earth_nn02',\n", - " 'soulcase_nn02', 'corporation_nn01', 'chassis_nn01', 'bulk_nn03',\n", - " 'bouk_nn01', 'outwall_nn01', 'case_nn03', 'incarnation_nn01',\n", - " 'bonehouse_nn01', 'man_nn04', 'bulk_nn02', 'soulcase_nn01',\n", - " 'godsimage_nn01', 'quarrons_nn01'], dtype=object)" - ] - }, - "metadata": {}, - "execution_count": 73 - } - ], - "source": [ - "df_train.word_id.unique()" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " sense_id \\\n", - "0 body_nn01-17170653 \n", - "1 man_nn01-110482153 \n", - "2 body_nn01-17169813 \n", - "3 machine_nn01-38474877 \n", - "4 carcass_nn01-10177258 \n", - "... ... \n", - "1215 man_nn01-110479060 \n", - "1216 person_nn01-30950985 \n", - "1217 clay_nn01-9320873 \n", - "1218 case_nn02-10018131 \n", - "1220 man_nn01-110487579 \n", - "\n", - " lemma_definition \\\n", - "0 The complete physical form of a person or anim... \n", - "1 An adult male human being. Without explicit co... \n", - "2 The complete physical form of a person or anim... \n", - "3 A complex device, consisting of a number of in... \n", - "4 The dead body of a person or animal; but no lo... \n", - "... ... \n", - "1215 An adult male human being. Without explicit co... \n", - "1216 An individual human being; a man, woman, or ch... \n", - "1217 A stiff viscous earth found, in many varieties... \n", - "1218 A box, bag, or other receptacle, designed to c... \n", - "1220 An adult male human being. Without explicit co... \n", - "\n", - " definition word_id \\\n", - "0 Particular technical uses. The part of a vehic... body_nn01 \n", - "1 As vocative or as int., introducing a remark o... man_nn01 \n", - "2 Contrasted with the soul. Cf. soul body n. at ... body_nn01 \n", - "3 A bicycle or tricycle; a motorcycle. Formerly ... machine_nn01 \n", - "4 The naked framework or ‘shell’ of a building b... carcass_nn01 \n", - "... ... ... \n", - "1215 A husband. Now chiefly English regional (north... man_nn01 \n", - "1216 Law. An individual (natural person n.) or corp... person_nn01 \n", - "1217 Short for clay-pipe n. at compounds 2 (colloq... clay_nn01 \n", - "1218 slang. A house, esp. one used as a brothel. Cf... case_nn02 \n", - "1220 In Cumbria: a cairn marking a summit or promin... man_nn01 \n", - "\n", - " lemma quotation_id \\\n", - "0 body body_nn01-132916428 \n", - "1 man man_nn01-110482440 \n", - "2 body body_nn01-17169857 \n", - "3 machine machine_nn01-38474966 \n", - "4 carcass carcass_nn01-10177295 \n", - "... ... ... 
\n", - "1215 man man_nn01-110479206 \n", - "1216 person person_nn01-30951076 \n", - "1217 clay clay_nn01-9320896 \n", - "1218 case case_nn02-10018191 \n", - "1220 man man_nn01-110487624 \n", - "\n", - " source \\\n", - "0 {'title': 'Material Handling Engin.', 'author'... \n", - "1 {'title': 'Shaela', 'author': 'R. Bulter', 'ge... \n", - "2 {'title': 'Ess. Man', 'author': 'A. Pope', 'ge... \n", - "3 {'title': 'National Trust Mag.', 'author': Non... \n", - "4 {'title': 'New Pract. Builder', 'author': 'P. ... \n", - "... ... \n", - "1215 {'title': 'Four Years S. Afr.', 'author': 'C. ... \n", - "1216 {'title': 'Daily News', 'author': None, 'gende... \n", - "1217 {'title': 'Held in Bondage', 'author': '‘Ouida... \n", - "1218 {'title': 'Mop Fair', 'author': 'A. M. Binstea... \n", - "1220 {'title': 'Northern Affair', 'author': 'D. K. ... \n", - "\n", - " text year \\\n", - "0 {'keyword': 'bodies', 'full_text': 'After car ... 1990.0 \n", - "1 {'keyword': 'Min', 'full_text': 'Min A'm vexed... 1976.0 \n", - "2 {'keyword': 'Body', 'full_text': 'All are but ... 1733.0 \n", - "3 {'keyword': 'machines', 'full_text': 'The cycl... 1992.0 \n", - "4 {'keyword': 'Carcase', 'full_text': 'Carcase o... 1823.0 \n", - "... ... ... \n", - "1215 {'keyword': 'man', 'full_text': 'The wife brok... 1829.0 \n", - "1216 {'keyword': 'persons', 'full_text': 'A Bill..e... 1900.0 \n", - "1217 {'keyword': 'clays', 'full_text': 'Filthy bird... 1863.0 \n", - "1218 {'keyword': 'case', 'full_text': 'They arrange... 1905.0 \n", - "1220 {'keyword': 'man', 'full_text': 'Over the elep... 1964.0 \n", - "\n", - " full_text ... keyword_offset \\\n", - "0 After car bodies are painted, they are moved i... ... 10.0 \n", - "1 Min A'm vexed ta hear yun. ... 0.0 \n", - "2 All are but parts of one stupendous Whole, Who... ... 49.0 \n", - "3 The cyclists..took on the circular 21- or 42-m... ... 92.0 \n", - "4 Carcase of a Building, the naked walls, and th... ... 0.0 \n", - "... ... ... ... \n", - "1215 The wife broke out, ‘You lament a brother, and... ... 79.0 \n", - "1216 A Bill..extending to juridical persons, that i... ... 31.0 \n", - "1217 Filthy bird's-eye, smoked in clays. ... 29.0 \n", - "1218 They arranges to stop ‘private’ in Brighton, a... ... 57.0 \n", - "1220 Over the elephant rocks and under the lee of t... ... 55.0 \n", - "\n", - " vector_bert_base_-1,-2,-3,-4_mean \\\n", - "0 [1.2747291, 0.25178745, 0.69486666, 0.42832682... \n", - "1 [-0.10557328, 0.24347349, 0.731555, -0.4305202... \n", - "2 [0.8197431, 0.04237363, 0.6312159, -0.2658673,... \n", - "3 [-0.18150243, -0.24230756, -0.3336587, 0.34879... \n", - "4 [0.6567496, -0.050804906, 0.31024605, 0.059706... \n", - "... ... \n", - "1215 [-0.07307064, -0.31692728, 0.38834277, -0.2980... \n", - "1216 [0.030711764, 0.28706473, 0.6596842, -0.132111... \n", - "1217 [-0.016634814, 0.6912965, -0.18498293, -0.2104... \n", - "1218 [0.16278893, -0.17927478, 0.34916735, -0.34717... \n", - "1220 [0.12908892, 0.1654679, -0.077464886, -0.44454... \n", - "\n", - " vector_blert_-1,-2,-3,-4_mean label \\\n", - "0 [1.5054287, 1.1386966, 1.3405375, 0.8012274, -... 0 \n", - "1 [-0.49209523, 0.7658461, 0.07512934, 0.0148925... 0 \n", - "2 [0.60478234, 0.58020014, 0.053836707, -0.06571... 0 \n", - "3 [-0.14852196, 0.69629294, 0.30973893, 0.598406... 0 \n", - "4 [0.41240987, 0.10217035, 0.48574266, 0.8627304... 0 \n", - "... ... ... \n", - "1215 [-0.20098017, 0.47577783, 0.013388823, -0.2808... 0 \n", - "1216 [-0.42745396, 0.4621299, 0.34301567, 0.2193956... 
0 \n", - "1217 [-0.2833503, 0.80949837, -0.5981247, 0.4331013... 0 \n", - "1218 [0.3253876, 0.12327082, -0.077930324, 0.450299... 0 \n", - "1220 [-0.4877532, 0.62317544, -0.4543179, -0.167910... 0 \n", - "\n", - " id \\\n", - "0 NaN \n", - "1 NaN \n", - "2 NaN \n", - "3 machine_nn01-38474877 \n", - "4 NaN \n", - "... ... \n", - "1215 NaN \n", - "1216 NaN \n", - "1217 NaN \n", - "1218 NaN \n", - "1220 NaN \n", - "\n", - " daterange \\\n", - "0 NaN \n", - "1 NaN \n", - "2 NaN \n", - "3 {'end': None, 'start': 1823, 'obsolete': False... \n", - "4 NaN \n", - "... ... \n", - "1215 NaN \n", - "1216 NaN \n", - "1217 NaN \n", - "1218 NaN \n", - "1220 NaN \n", - "\n", - " provenance provenance_type \\\n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 [[machine_nn01-38474877, seed, machine_nn01]] seed \n", - "4 NaN NaN \n", - "... ... ... \n", - "1215 NaN NaN \n", - "1216 NaN NaN \n", - "1217 NaN NaN \n", - "1218 NaN NaN \n", - "1220 NaN NaN \n", - "\n", - " relation_to_core_senses relation_to_seed_senses \n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 {machine_nn01-38474877} {machine_nn01-38474877} \n", - "4 NaN NaN \n", - "... ... ... \n", - "1215 NaN NaN \n", - "1216 NaN NaN \n", - "1217 NaN NaN \n", - "1218 NaN NaN \n", - "1220 NaN NaN \n", - "\n", - "[1135 rows x 21 columns]" - ], - "text/html": "
[text/html output elided: HTML rendering of the same dataframe as the text/plain output above, 1135 rows × 21 columns]
" - }, - "metadata": {}, - "execution_count": 74 - } - ], - "source": [ - "df_train[df_train.label==\"0\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " sense_id lemma_definition \\\n", - "0 body_nn01-17170653 The complete physical form of a person or anim... \n", - "1 man_nn01-110482153 An adult male human being. Without explicit co... \n", - "2 body_nn01-17169813 The complete physical form of a person or anim... \n", - "\n", - " definition word_id lemma \\\n", - "0 Particular technical uses. The part of a vehic... body_nn01 body \n", - "1 As vocative or as int., introducing a remark o... man_nn01 man \n", - "2 Contrasted with the soul. Cf. soul body n. at ... body_nn01 body \n", - "\n", - " quotation_id source \\\n", - "0 body_nn01-132916428 {'title': 'Material Handling Engin.', 'author'... \n", - "1 man_nn01-110482440 {'title': 'Shaela', 'author': 'R. Bulter', 'ge... \n", - "2 body_nn01-17169857 {'title': 'Ess. Man', 'author': 'A. Pope', 'ge... \n", - "\n", - " text year \\\n", - "0 {'keyword': 'bodies', 'full_text': 'After car ... 1990.0 \n", - "1 {'keyword': 'Min', 'full_text': 'Min A'm vexed... 1976.0 \n", - "2 {'keyword': 'Body', 'full_text': 'All are but ... 1733.0 \n", - "\n", - " full_text ... keyword_offset \\\n", - "0 After car bodies are painted, they are moved i... ... 10.0 \n", - "1 Min A'm vexed ta hear yun. ... 0.0 \n", - "2 All are but parts of one stupendous Whole, Who... ... 49.0 \n", - "\n", - " vector_bert_base_-1,-2,-3,-4_mean \\\n", - "0 [1.2747291, 0.25178745, 0.69486666, 0.42832682... \n", - "1 [-0.10557328, 0.24347349, 0.731555, -0.4305202... \n", - "2 [0.8197431, 0.04237363, 0.6312159, -0.2658673,... \n", - "\n", - " vector_blert_-1,-2,-3,-4_mean label id daterange \\\n", - "0 [1.5054287, 1.1386966, 1.3405375, 0.8012274, -... 0 NaN NaN \n", - "1 [-0.49209523, 0.7658461, 0.07512934, 0.0148925... 0 NaN NaN \n", - "2 [0.60478234, 0.58020014, 0.053836707, -0.06571... 0 NaN NaN \n", - "\n", - " provenance provenance_type relation_to_core_senses relation_to_seed_senses \n", - "0 NaN NaN NaN NaN \n", - "1 NaN NaN NaN NaN \n", - "2 NaN NaN NaN NaN \n", - "\n", - "[3 rows x 21 columns]" - ], - "text/html": "
[text/html output elided: HTML rendering of the same dataframe as the text/plain output above, 3 rows × 21 columns]
" - }, - "metadata": {}, - "execution_count": 75 - } - ], - "source": [ - "df_train.head(3)" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Index(['sense_id', 'lemma_definition', 'definition', 'word_id', 'lemma',\n", - " 'quotation_id', 'source', 'text', 'year', 'full_text', 'keyword',\n", - " 'keyword_offset', 'vector_bert_base_-1,-2,-3,-4_mean',\n", - " 'vector_blert_-1,-2,-3,-4_mean', 'label', 'id', 'daterange',\n", - " 'provenance', 'provenance_type', 'relation_to_core_senses',\n", - " 'relation_to_seed_senses'],\n", - " dtype='object')" - ] - }, - "metadata": {}, - "execution_count": 76 - } - ], - "source": [ - "df_train.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 100, - "metadata": {}, - "outputs": [], - "source": [ - "def enclose_keyword(row,enclose_token='\"'):\n", - " \"\"\"enclose keyword with specific token to point\n", - " learner towards to word it has to focus on\n", - " \"\"\"\n", - " sentence = ''\n", - " for i,c in enumerate(row.full_text):\n", - " if i == int(row.keyword_offset):\n", - " sentence+=enclose_token + ' '\n", - " elif i ==int(row.keyword_offset + len(row.keyword)):\n", - " sentence+= ' ' + enclose_token\n", - " sentence+=c\n", - " return sentence\n", - "\n", - "#def merge_quotation_gloss(row):\n", - "# out_string = '[GLOSS] '\n", - "# if row.definition:\n", - "# out_string+=row.definition\n", - "# out_string+=' [QUOT] ' \n", - "# if row.enclosed_quotation:\n", - "# out_string+=row.enclosed_quotation\n", - "# return out_string\n", - "\n", - "#def prep_train_text(row):\n", - "# out_string='[TAGET] '+row.keyword+' [TAGET] : '\n", - "# if row.definition:\n", - "# out_string+=row.definition\n", - "# out_string+=' [SEP] ' \n", - "# if row.enclosed_quotation:\n", - "# out_string+=row.enclosed_quotation\n", - "# return out_string\n", - "\n", - "#def prep_test_text(row):\n", - "# out_string='[TAGET] '+row.keyword+' [TAGET] : '\n", - "# if row.enclosed_quotation:\n", - "# out_string+=row.enclosed_quotation\n", - "# return out_string\n", - "\n", - "#def merge_quotation_keyword(row):\n", - "# out_string = '[TARGET] '\n", - "# if row.keyword:\n", - "# out_string+=row.keyword\n", - "# out_string+=' [QUOT] ' \n", - "# if row.enclosed_quotation:\n", - "# out_string+=row.enclosed_quotation\n", - "# return out_string\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 101, - "metadata": {}, - "outputs": [], - "source": [ - "def to_glossbert_format(df):\n", - " def gloss_string(row, definition):\n", - " out_string=''\n", - " if row.enclosed_quotation:\n", - " out_string+=row.enclosed_quotation\n", - " out_string+=' [SEP] ' \n", - " out_string+=row.keyword+': '\n", - " if row.definition:\n", - " out_string+=definition\n", - " return out_string\n", - "\n", - " df['enclosed_quotations'] = df.apply(enclose_keyword, axis=1)\n", - " \n", - " rows = [] \n", - " for i,row in df.iterrows():\n", - " rows.append([gloss_string(row, row.definition), 1])\n", - " definitions = df[df.lemma==row.lemma].definition.unique()\n", - " for d in definitions:\n", - " if d != row.definition:\n", - " rows.append([gloss_string(row,d), 0])\n", - " \n", - " return rows\n", - "\n", - "df_gloss_train = to_glossbert_format(df_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 102, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "[['After car [TARGET] bodies [TARGET] are painted, they are 
moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: Particular technical uses. The part of a vehicle fitted to receive the load; spec. the part of a motor car in which driver and passengers sit, or the fuselage of an aeroplane. Cf. cart-body n. at cart n. compounds 2, wide-body n.',\n", - " 1],\n", - " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: Contrasted with the soul. Cf. soul body n. at soul n. compounds 4.',\n", - " 0],\n", - " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: Particular technical uses. The main part of a musical instrument, which in the case of traditional stringed instruments forms a resonating chamber.',\n", - " 0],\n", - " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: The complete physical form of a person or animal; the assemblage of parts, organs, and tissues that constitutes the whole material organism.',\n", - " 0],\n", - " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: A comprehensive and systematic collection of information, or of the details of any subject, esp. law; a textbook, a pandect. Usually with of.',\n", - " 0],\n", - " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: A corpse.',\n", - " 0],\n", - " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: The physical or mortal nature, state, or aspect of man. Frequently in in (the) body, out of (the) body and variants, sometimes contrasted with in spirit.',\n", - " 0],\n", - " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: More widely: a material thing, an object; something that has physical existence and extension in space.',\n", - " 0],\n", - " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: Cell Biology. Any of various normal or abnormal structures found within the cytoplasm or nucleus of a cell. Frequently with distinguishing word.',\n", - " 0],\n", - " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: Originally: †size or bulk; quantity (obsolete). 
In later use: a quantity, mass, or area of something.',\n", - " 0]]" - ] - }, - "metadata": {}, - "execution_count": 102 - } - ], - "source": [ - "df_gloss_train[:10]" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "path = Path('./data/training_data')\n", - "path.mkdir(exist_ok=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": {}, - "outputs": [], - "source": [ - "csv_out_path = path / f\"{lemma}_{'_'.join(senses)}\"\n", - "csv_out_path.mkdir(exist_ok=True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 80, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "df_train['enclosed_quotation'] = df_train.apply(enclose_keyword, axis=1)\n", - "df_train['text'] = df_train.apply(prep_train_text, axis=1)\n", - "df_train[['text','label']].to_csv(csv_out_path / \"train.csv\",index = False, sep='\\t') \n", - "df_val['enclosed_quotation'] = df_val.apply(enclose_keyword, axis=1)\n", - "df_val['text'] = df_val.apply(prep_test_text, axis=1)\n", - "df_val[['text','label']].to_csv(csv_out_path / \"dev.csv\",index = False, sep='\\t') \n", - "df_test['enclosed_quotation'] = df_test.apply(enclose_keyword, axis=1)\n", - "df_test['text'] = df_test.apply(prep_test_text, axis=1)\n", - "df_test[['text','label']].to_csv(csv_out_path / \"test.csv\",index = False, sep='\\t') " - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "387 [TAGET] Men [TAGET] : He canton'd out the Coun...\n", - "1491 [TAGET] earths [TAGET] : Ley-grounds cannot be...\n", - "1841 [TAGET] earth [TAGET] : It is well to see the ...\n", - "1244 [TAGET] person [TAGET] : The administrator..ha...\n", - "1809 [TAGET] earth [TAGET] : While I drove by in my...\n", - " ... \n", - "736 [TAGET] machines [TAGET] : ‘Anyone,’ declared,...\n", - "610 [TAGET] machine [TAGET] : To each mortal perad...\n", - "1612 [TAGET] body [TAGET] : The coffee, we know, st...\n", - "1128 [TAGET] Personalities [TAGET] : Wisdom, Learni...\n", - "1281 [TAGET] person [TAGET] : I'm a people [TARGET]...\n", - "Name: text, Length: 383, dtype: object" - ] - }, - "metadata": {}, - "execution_count": 81 - } - ], - "source": [ - "df_test.text" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "'[TAGET] bodies [TAGET] : Particular technical uses. The part of a vehicle fitted to receive the load; spec. the part of a motor car in which driver and passengers sit, or the fuselage of an aeroplane. Cf. cart-body n. at cart n. compounds 2, wide-body n. 
[SEP] After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color.'" - ] - }, - "metadata": {}, - "execution_count": 82 - } - ], - "source": [ - "df_train.iloc[0].text" - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "2021-01-14 17:14:29,725 Reading data from data/training_data/machine_machine_nn01-38474140\n", - "2021-01-14 17:14:29,726 Train: data/training_data/machine_machine_nn01-38474140/train.csv\n", - "2021-01-14 17:14:29,727 Dev: data/training_data/machine_machine_nn01-38474140/dev.csv\n", - "2021-01-14 17:14:29,727 Test: data/training_data/machine_machine_nn01-38474140/test.csv\n" - ] - } - ], - "source": [ - "\n", - "\n", - "# this is the folder in which train, test and dev files reside\n", - "data_folder = csv_out_path\n", - "\n", - "# column format indicating which columns hold the text and label(s)\n", - "column_name_map = {0: \"text\", 1: \"label\"}\n", - "\n", - "# load corpus containing training, test and dev data and if CSV has a header, you can skip it\n", - "corpus: Corpus = CSVClassificationCorpus(data_folder,\n", - " column_name_map,\n", - " skip_header=True,\n", - " delimiter='\\t', # tab-separated files\n", - ") " - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "2021-01-14 17:14:29,745 Computing label dictionary. Progress:\n", - "100%|██████████| 1604/1604 [00:01<00:00, 1060.11it/s]2021-01-14 17:14:31,699 [b'0', b'1']\n", - "Dictionary with 2 tags: 0, 1\n", - "\n" - ] - } - ], - "source": [ - "# 2. create the label dictionary\n", - "label_dict = corpus.make_label_dictionary()\n", - "print(label_dict)" + "train_glossbert(data_path,downsample=True)" ] }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": { - "tags": [ - "outputPrepend" - ] - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "eps=1e-12, elementwise_affine=True)\n", - " (ffn): FFN(\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", - " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", - " )\n", - " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " )\n", - " (4): TransformerBlock(\n", - " (attention): MultiHeadSelfAttention(\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", - " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", - " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n", - " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", - " )\n", - " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " (ffn): FFN(\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", - " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", - " )\n", - " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " )\n", - " (5): TransformerBlock(\n", - " (attention): MultiHeadSelfAttention(\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", - " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", - " (v_lin): 
Linear(in_features=768, out_features=768, bias=True)\n", - " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", - " )\n", - " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " (ffn): FFN(\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", - " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", - " )\n", - " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " )\n", - " )\n", - " )\n", - " )\n", - " )\n", - " (decoder): Linear(in_features=768, out_features=2, bias=True)\n", - " (loss_function): CrossEntropyLoss()\n", - " (beta): 1.0\n", - " (weights): {b'1': 10, b'0': 1}\n", - " (weight_tensor) tensor([1., 1.], device='cuda:0')\n", - ")\"\n", - "2021-01-14 17:14:33,683 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:14:33,684 Corpus: \"Corpus: 1221 train + 306 dev + 383 test sentences\"\n", - "2021-01-14 17:14:33,684 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:14:33,685 Parameters:\n", - "2021-01-14 17:14:33,685 - learning_rate: \"1e-05\"\n", - "2021-01-14 17:14:33,686 - mini_batch_size: \"16\"\n", - "2021-01-14 17:14:33,686 - patience: \"3\"\n", - "2021-01-14 17:14:33,687 - anneal_factor: \"0.5\"\n", - "2021-01-14 17:14:33,687 - max_epochs: \"10\"\n", - "2021-01-14 17:14:33,688 - shuffle: \"True\"\n", - "2021-01-14 17:14:33,688 - train_with_dev: \"False\"\n", - "2021-01-14 17:14:33,689 - batch_growth_annealing: \"False\"\n", - "2021-01-14 17:14:33,690 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:14:33,690 Model training base path: \"models/taggers/trec\"\n", - "2021-01-14 17:14:33,691 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:14:33,691 Device: cuda:0\n", - "2021-01-14 17:14:33,692 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:14:33,692 Embeddings storage mode: cpu\n", - "2021-01-14 17:14:33,693 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:14:38,343 epoch 1 - iter 7/77 - loss 0.48898176 - samples/sec: 26.27 - lr: 0.000010\n", - "2021-01-14 17:14:42,454 epoch 1 - iter 14/77 - loss 0.42082447 - samples/sec: 27.53 - lr: 0.000010\n", - "2021-01-14 17:14:46,658 epoch 1 - iter 21/77 - loss 0.33773888 - samples/sec: 26.85 - lr: 0.000010\n", - "2021-01-14 17:14:50,844 epoch 1 - iter 28/77 - loss 0.31560597 - samples/sec: 27.01 - lr: 0.000010\n", - "2021-01-14 17:14:54,998 epoch 1 - iter 35/77 - loss 0.25972683 - samples/sec: 27.14 - lr: 0.000010\n", - "2021-01-14 17:14:59,209 epoch 1 - iter 42/77 - loss 0.23569006 - samples/sec: 26.75 - lr: 0.000010\n", - "2021-01-14 17:15:03,408 epoch 1 - iter 49/77 - loss 0.24985709 - samples/sec: 26.85 - lr: 0.000010\n", - "2021-01-14 17:15:07,633 epoch 1 - iter 56/77 - loss 0.23229837 - samples/sec: 26.77 - lr: 0.000010\n", - "2021-01-14 17:15:11,797 epoch 1 - iter 63/77 - loss 0.23326370 - samples/sec: 27.06 - lr: 0.000010\n", - "2021-01-14 17:15:16,012 epoch 1 - iter 70/77 - loss 0.21914055 - samples/sec: 26.79 - lr: 0.000010\n", - "2021-01-14 17:15:19,739 epoch 1 - iter 77/77 - loss 0.20128365 - samples/sec: 30.18 - lr: 0.000010\n", - "2021-01-14 
17:15:19,814 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:15:19,814 EPOCH 1 done: loss 0.2013 - lr 0.0000100\n", - "2021-01-14 17:15:23,961 DEV : loss 0.33609411120414734 - score 0.9281\n", - "2021-01-14 17:15:24,224 BAD EPOCHS (no improvement): 0\n", - "saving best model\n", - "2021-01-14 17:15:25,155 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:15:29,794 epoch 2 - iter 7/77 - loss 0.00740182 - samples/sec: 26.44 - lr: 0.000010\n", - "2021-01-14 17:15:34,024 epoch 2 - iter 14/77 - loss 0.03390250 - samples/sec: 26.74 - lr: 0.000010\n", - "2021-01-14 17:15:38,226 epoch 2 - iter 21/77 - loss 0.03571354 - samples/sec: 26.91 - lr: 0.000010\n", - "2021-01-14 17:15:42,369 epoch 2 - iter 28/77 - loss 0.03136397 - samples/sec: 27.22 - lr: 0.000010\n", - "2021-01-14 17:15:46,510 epoch 2 - iter 35/77 - loss 0.03331735 - samples/sec: 27.22 - lr: 0.000010\n", - "2021-01-14 17:15:50,650 epoch 2 - iter 42/77 - loss 0.07917234 - samples/sec: 27.26 - lr: 0.000010\n", - "2021-01-14 17:15:54,832 epoch 2 - iter 49/77 - loss 0.07227532 - samples/sec: 26.94 - lr: 0.000010\n", - "2021-01-14 17:15:59,093 epoch 2 - iter 56/77 - loss 0.06382573 - samples/sec: 26.50 - lr: 0.000010\n", - "2021-01-14 17:16:03,303 epoch 2 - iter 63/77 - loss 0.08917253 - samples/sec: 26.79 - lr: 0.000010\n", - "2021-01-14 17:16:07,583 epoch 2 - iter 70/77 - loss 0.08041374 - samples/sec: 26.33 - lr: 0.000010\n", - "2021-01-14 17:16:11,374 epoch 2 - iter 77/77 - loss 0.08118116 - samples/sec: 29.72 - lr: 0.000010\n", - "2021-01-14 17:16:11,437 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:16:11,438 EPOCH 2 done: loss 0.0812 - lr 0.0000100\n", - "2021-01-14 17:16:15,649 DEV : loss 0.5065702795982361 - score 0.9248\n", - "2021-01-14 17:16:15,909 BAD EPOCHS (no improvement): 1\n", - "2021-01-14 17:16:15,910 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:16:20,491 epoch 3 - iter 7/77 - loss 0.00894024 - samples/sec: 26.72 - lr: 0.000010\n", - "2021-01-14 17:16:24,744 epoch 3 - iter 14/77 - loss 0.04298888 - samples/sec: 26.62 - lr: 0.000010\n", - "2021-01-14 17:16:29,099 epoch 3 - iter 21/77 - loss 0.05707598 - samples/sec: 25.96 - lr: 0.000010\n", - "2021-01-14 17:16:33,430 epoch 3 - iter 28/77 - loss 0.04700577 - samples/sec: 26.10 - lr: 0.000010\n", - "2021-01-14 17:16:37,615 epoch 3 - iter 35/77 - loss 0.03774460 - samples/sec: 26.92 - lr: 0.000010\n", - "2021-01-14 17:16:41,848 epoch 3 - iter 42/77 - loss 0.03161711 - samples/sec: 26.63 - lr: 0.000010\n", - "2021-01-14 17:16:46,019 epoch 3 - iter 49/77 - loss 0.02749447 - samples/sec: 27.00 - lr: 0.000010\n", - "2021-01-14 17:16:50,152 epoch 3 - iter 56/77 - loss 0.02414880 - samples/sec: 27.36 - lr: 0.000010\n", - "2021-01-14 17:16:54,291 epoch 3 - iter 63/77 - loss 0.02319205 - samples/sec: 27.22 - lr: 0.000010\n", - "2021-01-14 17:16:58,450 epoch 3 - iter 70/77 - loss 0.02129739 - samples/sec: 27.15 - lr: 0.000010\n", - "2021-01-14 17:17:02,224 epoch 3 - iter 77/77 - loss 0.01944040 - samples/sec: 29.79 - lr: 0.000010\n", - "2021-01-14 17:17:02,272 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:17:02,273 EPOCH 3 done: loss 0.0194 - lr 0.0000100\n", - "2021-01-14 17:17:06,524 DEV : loss 0.49905064702033997 - 
score 0.9216\n", - "2021-01-14 17:17:06,786 BAD EPOCHS (no improvement): 2\n", - "2021-01-14 17:17:06,787 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:17:11,358 epoch 4 - iter 7/77 - loss 0.00185599 - samples/sec: 26.75 - lr: 0.000010\n", - "2021-01-14 17:17:15,541 epoch 4 - iter 14/77 - loss 0.00136404 - samples/sec: 27.04 - lr: 0.000010\n", - "2021-01-14 17:17:19,712 epoch 4 - iter 21/77 - loss 0.00128906 - samples/sec: 27.07 - lr: 0.000010\n", - "2021-01-14 17:17:23,840 epoch 4 - iter 28/77 - loss 0.00118582 - samples/sec: 27.40 - lr: 0.000010\n", - "2021-01-14 17:17:28,020 epoch 4 - iter 35/77 - loss 0.00106619 - samples/sec: 26.98 - lr: 0.000010\n", - "2021-01-14 17:17:32,295 epoch 4 - iter 42/77 - loss 0.00094899 - samples/sec: 26.41 - lr: 0.000010\n", - "2021-01-14 17:17:36,522 epoch 4 - iter 49/77 - loss 0.00087282 - samples/sec: 26.66 - lr: 0.000010\n", - "2021-01-14 17:17:40,684 epoch 4 - iter 56/77 - loss 0.00097088 - samples/sec: 27.12 - lr: 0.000010\n", - "2021-01-14 17:17:44,912 epoch 4 - iter 63/77 - loss 0.00087886 - samples/sec: 26.69 - lr: 0.000010\n", - "2021-01-14 17:17:49,112 epoch 4 - iter 70/77 - loss 0.00110282 - samples/sec: 26.91 - lr: 0.000010\n", - "2021-01-14 17:17:52,950 epoch 4 - iter 77/77 - loss 0.00101001 - samples/sec: 29.33 - lr: 0.000010\n", - "2021-01-14 17:17:53,013 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:17:53,014 EPOCH 4 done: loss 0.0010 - lr 0.0000100\n", - "2021-01-14 17:17:57,333 DEV : loss 0.5981439352035522 - score 0.9216\n", - "2021-01-14 17:17:57,595 BAD EPOCHS (no improvement): 3\n", - "2021-01-14 17:17:57,596 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:18:02,196 epoch 5 - iter 7/77 - loss 0.00060182 - samples/sec: 26.72 - lr: 0.000010\n", - "2021-01-14 17:18:06,460 epoch 5 - iter 14/77 - loss 0.00037103 - samples/sec: 26.48 - lr: 0.000010\n", - "2021-01-14 17:18:10,664 epoch 5 - iter 21/77 - loss 0.00028840 - samples/sec: 26.89 - lr: 0.000010\n", - "2021-01-14 17:18:14,811 epoch 5 - iter 28/77 - loss 0.00025692 - samples/sec: 27.23 - lr: 0.000010\n", - "2021-01-14 17:18:18,940 epoch 5 - iter 35/77 - loss 0.00024187 - samples/sec: 27.30 - lr: 0.000010\n", - "2021-01-14 17:18:23,111 epoch 5 - iter 42/77 - loss 0.00021889 - samples/sec: 27.08 - lr: 0.000010\n", - "2021-01-14 17:18:27,299 epoch 5 - iter 49/77 - loss 0.00035217 - samples/sec: 26.95 - lr: 0.000010\n", - "2021-01-14 17:18:31,425 epoch 5 - iter 56/77 - loss 0.00032686 - samples/sec: 27.30 - lr: 0.000010\n", - "2021-01-14 17:18:35,586 epoch 5 - iter 63/77 - loss 0.00029595 - samples/sec: 27.13 - lr: 0.000010\n", - "2021-01-14 17:18:39,774 epoch 5 - iter 70/77 - loss 0.00027235 - samples/sec: 26.91 - lr: 0.000010\n", - "2021-01-14 17:18:43,540 epoch 5 - iter 77/77 - loss 0.00028631 - samples/sec: 29.89 - lr: 0.000010\n", - "2021-01-14 17:18:43,601 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:18:43,602 EPOCH 5 done: loss 0.0003 - lr 0.0000100\n", - "2021-01-14 17:18:46,910 DEV : loss 0.6656511425971985 - score 0.9248\n", - "Epoch 5: reducing learning rate of group 0 to 5.0000e-06.\n", - "2021-01-14 17:18:47,170 BAD EPOCHS (no improvement): 4\n", - "2021-01-14 17:18:47,171 ----------------------------------------------------------------------------------------------------\n", 
- "2021-01-14 17:18:53,782 epoch 6 - iter 7/77 - loss 0.00022932 - samples/sec: 25.95 - lr: 0.000005\n", - "2021-01-14 17:18:57,949 epoch 6 - iter 14/77 - loss 0.00016531 - samples/sec: 27.13 - lr: 0.000005\n", - "2021-01-14 17:19:02,280 epoch 6 - iter 21/77 - loss 0.00022724 - samples/sec: 25.99 - lr: 0.000005\n", - "2021-01-14 17:19:06,568 epoch 6 - iter 28/77 - loss 0.00026307 - samples/sec: 26.27 - lr: 0.000005\n", - "2021-01-14 17:19:10,654 epoch 6 - iter 35/77 - loss 0.00023027 - samples/sec: 27.57 - lr: 0.000005\n", - "2021-01-14 17:19:14,851 epoch 6 - iter 42/77 - loss 0.00021121 - samples/sec: 26.89 - lr: 0.000005\n", - "2021-01-14 17:19:19,029 epoch 6 - iter 49/77 - loss 0.00019923 - samples/sec: 26.94 - lr: 0.000005\n", - "2021-01-14 17:19:23,229 epoch 6 - iter 56/77 - loss 0.00018661 - samples/sec: 26.87 - lr: 0.000005\n", - "2021-01-14 17:19:27,356 epoch 6 - iter 63/77 - loss 0.00017462 - samples/sec: 27.26 - lr: 0.000005\n", - "2021-01-14 17:19:31,517 epoch 6 - iter 70/77 - loss 0.00016146 - samples/sec: 27.04 - lr: 0.000005\n", - "2021-01-14 17:19:35,455 epoch 6 - iter 77/77 - loss 0.00015018 - samples/sec: 28.60 - lr: 0.000005\n", - "2021-01-14 17:19:35,526 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:19:35,527 EPOCH 6 done: loss 0.0002 - lr 0.0000050\n", - "2021-01-14 17:19:39,223 DEV : loss 0.6893778443336487 - score 0.9248\n", - "2021-01-14 17:19:39,484 BAD EPOCHS (no improvement): 1\n", - "2021-01-14 17:19:39,486 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:19:44,844 epoch 7 - iter 7/77 - loss 0.00019211 - samples/sec: 27.26 - lr: 0.000005\n", - "2021-01-14 17:19:49,028 epoch 7 - iter 14/77 - loss 0.00066268 - samples/sec: 26.98 - lr: 0.000005\n", - "2021-01-14 17:19:53,253 epoch 7 - iter 21/77 - loss 0.00053614 - samples/sec: 26.67 - lr: 0.000005\n", - "2021-01-14 17:19:57,430 epoch 7 - iter 28/77 - loss 0.00043514 - samples/sec: 27.03 - lr: 0.000005\n", - "2021-01-14 17:20:01,594 epoch 7 - iter 35/77 - loss 0.00036258 - samples/sec: 27.03 - lr: 0.000005\n", - "2021-01-14 17:20:05,829 epoch 7 - iter 42/77 - loss 0.00031573 - samples/sec: 26.60 - lr: 0.000005\n", - "2021-01-14 17:20:10,013 epoch 7 - iter 49/77 - loss 0.00028645 - samples/sec: 26.94 - lr: 0.000005\n", - "2021-01-14 17:20:14,238 epoch 7 - iter 56/77 - loss 0.00025793 - samples/sec: 26.67 - lr: 0.000005\n", - "2021-01-14 17:20:18,381 epoch 7 - iter 63/77 - loss 0.00023890 - samples/sec: 27.16 - lr: 0.000005\n", - "2021-01-14 17:20:22,569 epoch 7 - iter 70/77 - loss 0.00021868 - samples/sec: 26.92 - lr: 0.000005\n", - "2021-01-14 17:20:26,365 epoch 7 - iter 77/77 - loss 0.00020789 - samples/sec: 29.64 - lr: 0.000005\n", - "2021-01-14 17:20:26,443 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:20:26,443 EPOCH 7 done: loss 0.0002 - lr 0.0000050\n", - "2021-01-14 17:20:29,751 DEV : loss 0.6999250054359436 - score 0.9248\n", - "2021-01-14 17:20:30,015 BAD EPOCHS (no improvement): 2\n", - "2021-01-14 17:20:30,016 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:20:34,509 epoch 8 - iter 7/77 - loss 0.00007416 - samples/sec: 27.22 - lr: 0.000005\n", - "2021-01-14 17:20:39,619 epoch 8 - iter 14/77 - loss 0.00005522 - samples/sec: 26.82 - lr: 0.000005\n", - "2021-01-14 17:20:43,900 epoch 8 - iter 21/77 - loss 0.00006253 
- samples/sec: 26.35 - lr: 0.000005\n", - "2021-01-14 17:20:48,020 epoch 8 - iter 28/77 - loss 0.00015550 - samples/sec: 27.36 - lr: 0.000005\n", - "2021-01-14 17:20:52,243 epoch 8 - iter 35/77 - loss 0.00013324 - samples/sec: 26.66 - lr: 0.000005\n", - "2021-01-14 17:20:56,397 epoch 8 - iter 42/77 - loss 0.00012891 - samples/sec: 27.08 - lr: 0.000005\n", - "2021-01-14 17:21:00,493 epoch 8 - iter 49/77 - loss 0.00012485 - samples/sec: 27.50 - lr: 0.000005\n", - "2021-01-14 17:21:04,731 epoch 8 - iter 56/77 - loss 0.00014117 - samples/sec: 26.59 - lr: 0.000005\n", - "2021-01-14 17:21:08,909 epoch 8 - iter 63/77 - loss 0.00013634 - samples/sec: 26.95 - lr: 0.000005\n", - "2021-01-14 17:21:13,121 epoch 8 - iter 70/77 - loss 0.00012635 - samples/sec: 26.74 - lr: 0.000005\n", - "2021-01-14 17:21:16,967 epoch 8 - iter 77/77 - loss 0.00012047 - samples/sec: 29.24 - lr: 0.000005\n", - "2021-01-14 17:21:17,026 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:21:17,027 EPOCH 8 done: loss 0.0001 - lr 0.0000050\n", - "2021-01-14 17:21:20,598 DEV : loss 0.7107362747192383 - score 0.9248\n", - "2021-01-14 17:21:20,864 BAD EPOCHS (no improvement): 3\n", - "2021-01-14 17:21:20,865 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:21:26,269 epoch 9 - iter 7/77 - loss 0.00002916 - samples/sec: 26.76 - lr: 0.000005\n", - "2021-01-14 17:21:30,543 epoch 9 - iter 14/77 - loss 0.00006615 - samples/sec: 26.39 - lr: 0.000005\n", - "2021-01-14 17:21:34,808 epoch 9 - iter 21/77 - loss 0.00006364 - samples/sec: 26.48 - lr: 0.000005\n", - "2021-01-14 17:21:38,963 epoch 9 - iter 28/77 - loss 0.00013939 - samples/sec: 27.14 - lr: 0.000005\n", - "2021-01-14 17:21:43,165 epoch 9 - iter 35/77 - loss 0.00012621 - samples/sec: 26.86 - lr: 0.000005\n", - "2021-01-14 17:21:47,317 epoch 9 - iter 42/77 - loss 0.00012101 - samples/sec: 27.09 - lr: 0.000005\n", - "2021-01-14 17:21:51,532 epoch 9 - iter 49/77 - loss 0.00018356 - samples/sec: 26.78 - lr: 0.000005\n", - "2021-01-14 17:21:56,114 epoch 9 - iter 56/77 - loss 0.00016344 - samples/sec: 26.01 - lr: 0.000005\n", - "2021-01-14 17:22:00,349 epoch 9 - iter 63/77 - loss 0.00015143 - samples/sec: 26.58 - lr: 0.000005\n", - "2021-01-14 17:22:04,591 epoch 9 - iter 70/77 - loss 0.00013992 - samples/sec: 26.55 - lr: 0.000005\n", - "2021-01-14 17:22:08,380 epoch 9 - iter 77/77 - loss 0.00012958 - samples/sec: 29.74 - lr: 0.000005\n", - "2021-01-14 17:22:08,443 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:22:08,444 EPOCH 9 done: loss 0.0001 - lr 0.0000050\n", - "2021-01-14 17:22:11,739 DEV : loss 0.7207703590393066 - score 0.9248\n", - "Epoch 9: reducing learning rate of group 0 to 2.5000e-06.\n", - "2021-01-14 17:22:12,003 BAD EPOCHS (no improvement): 4\n", - "2021-01-14 17:22:12,004 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:22:16,599 epoch 10 - iter 7/77 - loss 0.00004176 - samples/sec: 26.48 - lr: 0.000003\n", - "2021-01-14 17:22:21,730 epoch 10 - iter 14/77 - loss 0.00004900 - samples/sec: 26.15 - lr: 0.000003\n", - "2021-01-14 17:22:25,925 epoch 10 - iter 21/77 - loss 0.00005921 - samples/sec: 26.94 - lr: 0.000003\n", - "2021-01-14 17:22:30,112 epoch 10 - iter 28/77 - loss 0.00005456 - samples/sec: 26.94 - lr: 0.000003\n", - "2021-01-14 17:22:34,389 epoch 10 - iter 35/77 - loss 
0.00004909 - samples/sec: 26.34 - lr: 0.000003\n", - "2021-01-14 17:22:38,546 epoch 10 - iter 42/77 - loss 0.00004503 - samples/sec: 27.15 - lr: 0.000003\n", - "2021-01-14 17:22:42,757 epoch 10 - iter 49/77 - loss 0.00004776 - samples/sec: 26.71 - lr: 0.000003\n", - "2021-01-14 17:22:46,803 epoch 10 - iter 56/77 - loss 0.00004461 - samples/sec: 27.81 - lr: 0.000003\n", - "2021-01-14 17:22:51,063 epoch 10 - iter 63/77 - loss 0.00004382 - samples/sec: 26.47 - lr: 0.000003\n", - "2021-01-14 17:22:55,173 epoch 10 - iter 70/77 - loss 0.00005728 - samples/sec: 27.40 - lr: 0.000003\n", - "2021-01-14 17:22:58,941 epoch 10 - iter 77/77 - loss 0.00005446 - samples/sec: 29.83 - lr: 0.000003\n", - "2021-01-14 17:22:58,988 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:22:58,989 EPOCH 10 done: loss 0.0001 - lr 0.0000025\n", - "2021-01-14 17:23:02,304 DEV : loss 0.7250329852104187 - score 0.9248\n", - "2021-01-14 17:23:02,567 BAD EPOCHS (no improvement): 1\n", - "2021-01-14 17:23:04,650 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:23:04,651 Testing using best model ...\n", - "2021-01-14 17:23:04,653 loading file models/taggers/trec/best-model.pt\n", - "2021-01-14 17:23:09,692 \t0.9295\n", - "2021-01-14 17:23:09,693 \n", - "Results:\n", - "- F-score (micro) 0.9295\n", - "- F-score (macro) 0.4817\n", - "- Accuracy 0.9295\n", - "\n", - "By class:\n", - " precision recall f1-score support\n", - "\n", - " 0 0.9295 1.0000 0.9635 356\n", - " 1 0.0000 0.0000 0.0000 27\n", - "\n", - " micro avg 0.9295 0.9295 0.9295 383\n", - " macro avg 0.4648 0.5000 0.4817 383\n", - "weighted avg 0.8640 0.9295 0.8955 383\n", - " samples avg 0.9295 0.9295 0.9295 383\n", - "\n", - "2021-01-14 17:23:09,694 ----------------------------------------------------------------------------------------------------\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "{'test_score': 0.9295,\n", - " 'dev_score_history': [0.9281,\n", - " 0.9248,\n", - " 0.9216,\n", - " 0.9216,\n", - " 0.9248,\n", - " 0.9248,\n", - " 0.9248,\n", - " 0.9248,\n", - " 0.9248,\n", - " 0.9248],\n", - " 'train_loss_history': [0.20128365170646023,\n", - " 0.08118115926717782,\n", - " 0.019440397426679537,\n", - " 0.0010100115429271352,\n", - " 0.00028631362048062414,\n", - " 0.00015017583772733613,\n", - " 0.0002078855192506468,\n", - " 0.00012047414655809278,\n", - " 0.00012958204591429078,\n", - " 5.446238951249556e-05],\n", - " 'dev_loss_history': [0.33609411120414734,\n", - " 0.5065702795982361,\n", - " 0.49905064702033997,\n", - " 0.5981439352035522,\n", - " 0.6656511425971985,\n", - " 0.6893778443336487,\n", - " 0.6999250054359436,\n", - " 0.7107362747192383,\n", - " 0.7207703590393066,\n", - " 0.7250329852104187]}" - ] - }, - "metadata": {}, - "execution_count": 85 - } - ], - "source": [ - "\n", - "# 3. initialize transformer document embeddings (many models are available)\n", - "document_embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased', fine_tune=True)\n", - "\n", - "# 4. create the text classifier\n", - "classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, loss_weights={b\"1\":10, b\"0\":1}) # loss_weights={\"1\":10, \"0\":1}\n", - "\n", - "# 5. initialize the text classifier trainer with Adam optimizer\n", - "trainer = ModelTrainer(classifier, corpus, optimizer=Adam)\n", - "\n", - "# 6. 
start the training\n",
-    "trainer.train('models/taggers/trec',\n",
-    "              learning_rate=1e-5, # use very small learning rate\n",
-    "              mini_batch_size=16,\n",
-    "              mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine\n",
-    "              max_epochs=10, # terminate after 5 epochs\n",
-    "              )"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
  {
   "cell_type": "code",
   "execution_count": null,

diff --git a/tasks/wsd_gloss.py b/tasks/wsd_gloss.py
new file mode 100644
index 0000000..efd90ae
--- /dev/null
+++ b/tasks/wsd_gloss.py
@@ -0,0 +1,106 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from pathlib import Path
+from torch.optim.adam import Adam
+from flair.datasets import CSVClassificationCorpus
+from flair.data import Corpus
+from flair.embeddings import TransformerDocumentEmbeddings
+from flair.models import TextClassifier
+from flair.trainers import ModelTrainer
+
+def enclose_keyword(row, enclose_token='"'):
+    """enclose the keyword with a specific token to point
+    the learner towards the word it has to focus on
+    """
+    sentence = ''
+    for i, c in enumerate(row.full_text):
+        if i == int(row.keyword_offset):
+            sentence += enclose_token + ' '
+        elif i == int(row.keyword_offset + len(row.keyword)):
+            sentence += ' ' + enclose_token
+        sentence += c
+    return sentence
+
+def to_glossbert_format(df):
+    """convert rows in a dataframe to the GlossBERT format: one positive
+    ("Yes") context-gloss pair per row, plus a "No" pair for every other gloss
+    """
+
+    def gloss_string(row, definition):
+        """combine a gloss with the quotation and the keyword
+        """
+        out_string = ''
+        if row.enclosed_quotation:
+            out_string += row.enclosed_quotation
+        out_string += ' [SEP] '
+        out_string += row.keyword + ': '
+        if row.definition:
+            out_string += definition
+        return out_string
+
+    df['enclosed_quotation'] = df.apply(enclose_keyword, axis=1)
+
+    rows = []
+    for _, row in df.iterrows():
+        rows.append([gloss_string(row, row.definition), "Yes", row.sense_id])
+        definitions = df[df.lemma == row.lemma].definition.unique()
+        for d in definitions:
+            if d != row.definition:
+                rows.append([gloss_string(row, d), "No", row.sense_id])
+
+    return pd.DataFrame(rows, columns=['text', 'label', 'sense_id'])
+
+
+def create_glossbert_data(lemma, pos):
+    """create GlossBERT training data from the quotations dataframe
+    """
+
+    df_quotations = pd.read_pickle(f'./data/sfrel_quotations_{lemma}_{pos}.pickle')
+    df_quotations = df_quotations[~df_quotations.keyword_offset.isnull()]
+    df_quotations = df_quotations[~df_quotations.definition.isnull()]#.reset_index(drop=True)
+    df_glossbert = to_glossbert_format(df_quotations).sample(frac=1.0).reset_index(drop=True)
+    print(df_glossbert.shape)
+    # NOTE: not sure whether this random split is correct; it should probably
+    # be grouped by positive example sentence, so that rows derived from the
+    # same quotation never straddle train and test (see the sketch just below).
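+    # --- Editor's sketch (hedged; not part of the original commit) ----------
+    # A grouped split keeps every row derived from the same positive quotation
+    # on one side of the boundary. Assuming the quotation is the part of
+    # `text` before ' [SEP] ' (as built by gloss_string above), something like:
+    #
+    #   from sklearn.model_selection import GroupShuffleSplit
+    #   groups = [t.split(' [SEP] ')[0] for t in df_glossbert.text]
+    #   gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
+    #   train_idx, test_idx = next(gss.split(df_glossbert, groups=groups))
+    #   df_train, df_test = df_glossbert.iloc[train_idx], df_glossbert.iloc[test_idx]
+    # -------------------------------------------------------------------------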
+    df_train, df_test = train_test_split(df_glossbert, test_size=0.2, random_state=42, shuffle=True)  # , stratify=df_glossbert[['label']]
+    df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=42, shuffle=True)
+
+    train_data_path = Path("./data/training_data")
+    train_data_path.mkdir(exist_ok=True)
+    df_out_path = train_data_path / f'{lemma}_{pos}'
+    df_out_path.mkdir(exist_ok=True)
+
+    df_train.to_csv(df_out_path / 'train.csv', index=False, sep='\t')
+    df_val.to_csv(df_out_path / 'dev.csv', index=False, sep='\t')
+    df_test.to_csv(df_out_path / 'test.csv', index=False, sep='\t')
+
+    return df_out_path
+
+def train_glossbert(data_folder, downsample=False):
+    # column format: which columns hold the text and the label
+    column_name_map = {0: "text", 1: "label"}
+
+    corpus = CSVClassificationCorpus(data_folder,
+                                     column_name_map,
+                                     skip_header=True,
+                                     delimiter='\t',    # tab-separated files
+                                     )
+
+    if downsample:
+        print('Downsampling.')
+        corpus = corpus.downsample(0.1)
+
+    label_dict = corpus.make_label_dictionary()
+    print(label_dict)
+
+    document_embeddings = TransformerDocumentEmbeddings('bert-base-uncased', fine_tune=True)
+
+    # upweight the rare "Yes" class to counter the heavy class imbalance
+    classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, loss_weights={b"Yes": 10, b"No": 1})
+
+    trainer = ModelTrainer(classifier, corpus, optimizer=Adam)
+
+    trainer.train('models/classifier/glossbert',
+                  learning_rate=1e-3,  # NB: much higher than the 1e-5 used for fine-tuning in the notebook run
+                  mini_batch_size=16,
+                  embeddings_storage_mode='gpu',
+                  mini_batch_chunk_size=4,  # optionally set this if the transformer is too much for your machine
+                  max_epochs=50,  # terminate after at most 50 epochs
+                  )

From 542dfbf6a935a4c44d50e387114bac720edc6965 Mon Sep 17 00:00:00 2001
From: kasparvonbeelen
Date: Fri, 15 Jan 2021 10:55:03 +0000
Subject: [PATCH 05/10] add simple command line glossbert

---
 run_glossbert.py | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 run_glossbert.py

diff --git a/run_glossbert.py b/run_glossbert.py
new file mode 100644
index 0000000..76dd994
--- /dev/null
+++ b/run_glossbert.py
@@ -0,0 +1,10 @@
+from tasks.wsd_gloss import create_glossbert_data, train_glossbert
+import sys
+
+def run(lemma, pos):
+    data_path = create_glossbert_data(lemma, pos)
+    train_glossbert(data_path)
+
+if __name__ == "__main__":
+    lemma, pos = sys.argv[1], sys.argv[2]
+    run(lemma, pos)
\ No newline at end of file

From 8a6b916f3a39727d7e0aec12d6c85b4bb6d12614 Mon Sep 17 00:00:00 2001
From: kasparvonbeelen
Date: Fri, 15 Jan 2021 12:16:55 +0000
Subject: [PATCH 06/10] stop notebook

---
 114.1 - review notebook - glossbert.ipynb | 389 ++++++++++++++++++++--
 1 file changed, 369 insertions(+), 20 deletions(-)

diff --git a/114.1 - review notebook - glossbert.ipynb b/114.1 - review notebook - glossbert.ipynb
index cb3c924..e5fad22 100644
--- a/114.1 - review notebook - glossbert.ipynb
+++ b/114.1 - review notebook - glossbert.ipynb
@@ -24,9 +24,17 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 9,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "The autoreload extension is already loaded. 
To reload it, use:\n %reload_ext autoreload\n" + ] + } + ], "source": [ "%load_ext autoreload\n", "%autoreload 2" @@ -34,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -43,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -54,22 +62,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 14, "metadata": {}, "outputs": [ { - "output_type": "error", - "ename": "AttributeError", - "evalue": "'Series' object has no attribute 'enclosed_quotation'", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcreate_glossbert_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlemma\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mpos\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/deezy_datadrive/kaspar-playground/dictionary_expansion/HistoricalDictionaryExpansion/tasks/wsd_gloss.py\u001b[0m in \u001b[0;36mcreate_glossbert_data\u001b[0;34m(lemma, pos)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0mdf_quotations\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_quotations\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m~\u001b[0m\u001b[0mdf_quotations\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeyword_offset\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mdf_quotations\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_quotations\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m~\u001b[0m\u001b[0mdf_quotations\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdefinition\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreset_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 62\u001b[0;31m \u001b[0mdf_glossbert\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mto_glossbert_format\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_quotations\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfrac\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1.0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreset_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 63\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain_test_split\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_glossbert\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m42\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mshuffle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mstratify\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_glossbert\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/deezy_datadrive/kaspar-playground/dictionary_expansion/HistoricalDictionaryExpansion/tasks/wsd_gloss.py\u001b[0m in \u001b[0;36mto_glossbert_format\u001b[0;34m(df)\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0mrows\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[0;34m,\u001b[0m\u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miterrows\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 47\u001b[0;31m \u001b[0mrows\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mgloss_string\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdefinition\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Yes\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msense_id\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 48\u001b[0m \u001b[0mdefinitions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlemma\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlemma\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdefinition\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munique\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0md\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdefinitions\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/deezy_datadrive/kaspar-playground/dictionary_expansion/HistoricalDictionaryExpansion/tasks/wsd_gloss.py\u001b[0m in \u001b[0;36mgloss_string\u001b[0;34m(row, definition)\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0mout_string\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m''\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 35\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menclosed_quotation\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 36\u001b[0m \u001b[0mout_string\u001b[0m\u001b[0;34m+=\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menclosed_quotation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0mout_string\u001b[0m\u001b[0;34m+=\u001b[0m\u001b[0;34m' [SEP] '\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 5272\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5273\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5274\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5275\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5276\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mAttributeError\u001b[0m: 'Series' object has no attribute 'enclosed_quotation'" + "output_type": "stream", + "name": "stdout", + "text": [ + "(676946, 3)\n" ] } ], @@ -79,11 +79,360 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2021-01-15 10:42:39,739 Reading data from data/training_data/machine_NN\n", + "2021-01-15 10:42:39,739 Train: data/training_data/machine_NN/train.csv\n", + "2021-01-15 10:42:39,740 Dev: data/training_data/machine_NN/dev.csv\n", + "2021-01-15 10:42:39,740 Test: data/training_data/machine_NN/test.csv\n", + "2021-01-15 10:42:44,075 Computing label dictionary. 
Progress:\n", + "100%|██████████| 56863/56863 [00:25<00:00, 2210.56it/s]2021-01-15 10:43:10,130 [b'No', b'Yes']\n", + "\n", + "Dictionary with 2 tags: No, Yes\n", + "2021-01-15 10:43:13,389 ----------------------------------------------------------------------------------------------------\n", + "2021-01-15 10:43:13,394 Model: \"TextClassifier(\n", + " (document_embeddings): TransformerDocumentEmbeddings(\n", + " (model): BertModel(\n", + " (embeddings): BertEmbeddings(\n", + " (word_embeddings): Embedding(30522, 768, padding_idx=0)\n", + " (position_embeddings): Embedding(512, 768)\n", + " (token_type_embeddings): Embedding(2, 768)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (encoder): BertEncoder(\n", + " (layer): ModuleList(\n", + " (0): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (1): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (2): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): 
BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (3): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (4): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (5): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (6): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): 
BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (7): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (8): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (9): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (10): 
BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (11): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (pooler): BertPooler(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (activation): Tanh()\n", + " )\n", + " )\n", + " )\n", + " (decoder): Linear(in_features=768, out_features=2, bias=True)\n", + " (loss_function): CrossEntropyLoss()\n", + " (beta): 1.0\n", + " (weights): {b'Yes': 10, b'No': 1}\n", + " (weight_tensor) tensor([1., 1.], device='cuda:0')\n", + ")\"\n", + "2021-01-15 10:43:13,395 ----------------------------------------------------------------------------------------------------\n", + "2021-01-15 10:43:13,396 Corpus: \"Corpus: 43324 train + 10831 dev + 13539 test sentences\"\n", + "2021-01-15 10:43:13,396 ----------------------------------------------------------------------------------------------------\n", + "2021-01-15 10:43:13,397 Parameters:\n", + "2021-01-15 10:43:13,398 - learning_rate: \"0.001\"\n", + "2021-01-15 10:43:13,398 - mini_batch_size: \"16\"\n", + "2021-01-15 10:43:13,399 - patience: \"3\"\n", + "2021-01-15 10:43:13,400 - anneal_factor: \"0.5\"\n", + "2021-01-15 10:43:13,400 - max_epochs: \"50\"\n", + "2021-01-15 10:43:13,401 - shuffle: \"True\"\n", + "2021-01-15 10:43:13,402 - train_with_dev: \"False\"\n", + "2021-01-15 10:43:13,403 - batch_growth_annealing: \"False\"\n", + "2021-01-15 10:43:13,403 ----------------------------------------------------------------------------------------------------\n", + "2021-01-15 10:43:13,404 Model training base path: \"models/classifier/glossbert\"\n", + "2021-01-15 10:43:13,405 
----------------------------------------------------------------------------------------------------\n", + "2021-01-15 10:43:13,406 Device: cuda:0\n", + "2021-01-15 10:43:13,407 ----------------------------------------------------------------------------------------------------\n", + "2021-01-15 10:43:13,408 Embeddings storage mode: cpu\n" + ] + }, + { + "output_type": "error", + "ename": "TypeError", + "evalue": "__init__() got an unexpected keyword argument 'embedding_storage_mode'", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtrain_glossbert\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownsample\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/deezy_datadrive/kaspar-playground/dictionary_expansion/HistoricalDictionaryExpansion/tasks/wsd_gloss.py\u001b[0m in \u001b[0;36mtrain_glossbert\u001b[0;34m(data_folder, downsample)\u001b[0m\n\u001b[1;32m 102\u001b[0m \u001b[0membedding_storage_mode\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'gpu'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[0mmini_batch_chunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# optionally set this if transformer is too much for your machine\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 104\u001b[0;31m \u001b[0mmax_epochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m50\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# terminate after 5 epochs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 105\u001b[0m )\n", + "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/flair/trainers/trainer.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, base_path, learning_rate, mini_batch_size, mini_batch_chunk_size, max_epochs, scheduler, cycle_momentum, anneal_factor, patience, initial_extra_patience, min_learning_rate, train_with_dev, monitor_train, monitor_test, embeddings_storage_mode, checkpoint, save_final_model, anneal_with_restarts, anneal_with_prestarts, batch_growth_annealing, shuffle, param_selection_mode, write_weights, num_workers, sampler, use_amp, amp_opt_level, eval_on_train_fraction, eval_on_train_shuffle, save_model_at_each_epoch, **kwargs)\u001b[0m\n\u001b[1;32m 224\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 225\u001b[0m optimizer: torch.optim.Optimizer = self.optimizer(\n\u001b[0;32m--> 226\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparameters\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlearning_rate\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 227\u001b[0m )\n\u001b[1;32m 228\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: __init__() got an unexpected keyword argument 'embedding_storage_mode'" + ] + } + ], "source": [ - "train_glossbert(data_path,downsample=True)" + "train_glossbert(data_path, downsample=True)" ] }, { From 
3ce710a274fe1819f124d5e1ba87118991ebb8a3 Mon Sep 17 00:00:00 2001 From: kasparvonbeelen Date: Mon, 18 Jan 2021 09:24:09 +0000 Subject: [PATCH 07/10] add review notebook for multi dataset learning --- 114.2 - review notebook - multidataset.ipynb | 195 ++++++++++++++++++ data/grouped_senses.md | 0 ...words_for_evaluation_selection_criteria.md | 0 run_glossbert.py | 4 +- 4 files changed, 197 insertions(+), 2 deletions(-) create mode 100644 114.2 - review notebook - multidataset.ipynb mode change 100644 => 100755 data/grouped_senses.md mode change 100644 => 100755 data/words_for_evaluation_selection_criteria.md diff --git a/114.2 - review notebook - multidataset.ipynb b/114.2 - review notebook - multidataset.ipynb new file mode 100644 index 0000000..19fd620 --- /dev/null +++ b/114.2 - review notebook - multidataset.ipynb @@ -0,0 +1,195 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.5-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python3", + "display_name": "Python 3", + "language": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from pathlib import Path\n", + "from torch.optim.adam import Adam\n", + "from flair.datasets import CSVClassificationCorpus\n", + "from flair.data import Corpus\n", + "from flair.embeddings import TransformerDocumentEmbeddings\n", + "from flair.models import TextClassifier\n", + "from flair.trainers import ModelTrainer\n", + "from tasks.wsd_gloss import enclose_keyword\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def gloss_dfs(df):\n", + " \"\"\"convert rows in dataframe to GlossBERT format\n", + " \"\"\"\n", + " df['enclosed_quotation'] = df.apply(enclose_keyword, axis=1)\n", + " df_gl = df[['enclosed_quotation','definition','label']]\n", + " return df_gl[['enclosed_quotation','label']],df_gl[['definition','label']].drop_duplicates()\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "lemma,pos='machine','NN'\n", + "df_quotations = pd.read_pickle(f'./data/sfrel_quotations_{lemma}_{pos}.pickle')\n", + "df_quotations = df_quotations[~df_quotations.keyword_offset.isnull()]\n", + "df_quotations = df_quotations[~df_quotations.definition.isnull()].reset_index(drop=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(29752, 14)\n(29321, 14)\n" + ] + } + ], + "source": [ + "print(df_quotations.shape) \n", + "quotation_valuecounts = df_quotations.sense_id.value_counts()\n", + "df_quotations = df_quotations[df_quotations.sense_id.isin(quotation_valuecounts[quotation_valuecounts>1].index)]\n", + "print(df_quotations.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "output_type": "error", + "ename": "KeyError", + "evalue": "\"['label'] not in index\"", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent 
call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcontext_df\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mgloss_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgloss_dfs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_quotations\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36mgloss_dfs\u001b[0;34m(df)\u001b[0m\n\u001b[1;32m 3\u001b[0m \"\"\"\n\u001b[1;32m 4\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'enclosed_quotation'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0menclose_keyword\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mdf_gl\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'enclosed_quotation'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'definition'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdf_gl\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'enclosed_quotation'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mdf_gl\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'definition'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop_duplicates\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2804\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_iterator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2805\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2806\u001b[0;31m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_listlike_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mraise_missing\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2807\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2808\u001b[0m \u001b[0;31m# take() does not accept boolean indexers\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_get_listlike_indexer\u001b[0;34m(self, key, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1552\u001b[0m self._validate_read_indexer(\n\u001b[0;32m-> 1553\u001b[0;31m \u001b[0mkeyarr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis_number\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mraise_missing\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mraise_missing\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1554\u001b[0m )\n\u001b[1;32m 1555\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mkeyarr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_validate_read_indexer\u001b[0;34m(self, key, indexer, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1644\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"loc\"\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mraise_missing\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1645\u001b[0m \u001b[0mnot_found\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0max\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1646\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"{not_found} not in index\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1647\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1648\u001b[0m \u001b[0;31m# we skip the warning on Categorical/Interval\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: \"['label'] not in index\"" + ] + } + ], + "source": [ + "#context_df,gloss_df = gloss_dfs((df_quotations))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(context_df.shape,gloss_df.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_data_path = Path(\"./data/training_data_ms\")\n", + "train_data_path.mkdir(exist_ok=True)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "output_type": "error", + "ename": "ValueError", + "evalue": "The least populated class in y has only 1 member, which is too few. 
The minimum number of groups for any class cannot be less than 2.", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m42\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mshuffle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m stratify=df[['sense_id']]) # 1st\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m df_train, df_val = train_test_split(df_train, \n", + "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/sklearn/model_selection/_split.py\u001b[0m in \u001b[0;36mtrain_test_split\u001b[0;34m(*arrays, **options)\u001b[0m\n\u001b[1;32m 2150\u001b[0m random_state=random_state)\n\u001b[1;32m 2151\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2152\u001b[0;31m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marrays\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstratify\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2153\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2154\u001b[0m return list(chain.from_iterable((_safe_indexing(a, train),\n", + "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/sklearn/model_selection/_split.py\u001b[0m in \u001b[0;36msplit\u001b[0;34m(self, X, y, groups)\u001b[0m\n\u001b[1;32m 1339\u001b[0m \"\"\"\n\u001b[1;32m 1340\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgroups\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mindexable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgroups\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1341\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_iter_indices\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgroups\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1342\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1343\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/sklearn/model_selection/_split.py\u001b[0m in \u001b[0;36m_iter_indices\u001b[0;34m(self, X, y, groups)\u001b[0m\n\u001b[1;32m 1666\u001b[0m \u001b[0mclass_counts\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbincount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_indices\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1667\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclass_counts\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1668\u001b[0;31m raise ValueError(\"The least populated class in y has only 1\"\n\u001b[0m\u001b[1;32m 1669\u001b[0m \u001b[0;34m\" member, which is too few. The minimum\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1670\u001b[0m \u001b[0;34m\" number of groups for any class cannot\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2." + ] + } + ], + "source": [ + "for n, df in [('context',context_df),('gloss',gloss_df)]:\n", + "\n", + " df_out_path = train_data_path / f'{lemma}_{pos}_{n}'\n", + " df_out_path.mkdir(exist_ok=True)\n", + "\n", + " df_train, df_test = train_test_split(df, \n", + " test_size=0.2, \n", + " random_state=42,\n", + " shuffle=True,\n", + " stratify=df[['label']]\n", + " ) # 1st\n", + " \n", + " df_train, df_val = train_test_split(df_train, \n", + " test_size=0.1, \n", + " random_state=42,\n", + " shuffle=True,\n", + " stratify=df_train[['label']] # bug here, try to do the stratification better\n", + " ) # 2nd\n", + " \n", + " df_train.to_csv(df_out_path / 'train.csv', index = False, sep='\\t') \n", + " df_val.to_csv(df_out_path / 'dev.csv', index = False, sep='\\t') \n", + " df_test.to_csv(df_out_path / 'test.csv', index = False, sep='\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ] +} \ No newline at end of file diff --git a/data/grouped_senses.md b/data/grouped_senses.md old mode 100644 new mode 100755 diff --git a/data/words_for_evaluation_selection_criteria.md b/data/words_for_evaluation_selection_criteria.md old mode 100644 new mode 100755 diff --git a/run_glossbert.py b/run_glossbert.py index 76dd994..339eb55 100644 --- a/run_glossbert.py +++ b/run_glossbert.py @@ -3,8 +3,8 @@ def run(lemma,pos): data_path = create_glossbert_data(lemma,pos) - train_glossbert(data_path) + train_glossbert(data_path,downsample=True) if __name__=="__main__": lemma,pos = sys.argv[1],sys.argv[2] - run(lemma,pos) \ No newline at end of file + run(lemma,pos) From 269a33e13997883c4d0397ea34f4b702f276e9e6 Mon Sep 17 00:00:00 2001 From: kasparvonbeelen Date: Mon, 18 Jan 2021 12:56:28 +0000 Subject: [PATCH 08/10] add multidataset training --- 114.2 - review notebook - multidataset.ipynb | 152 +++++-------------- 1 file changed, 34 insertions(+), 118 deletions(-) diff --git a/114.2 - review notebook - multidataset.ipynb b/114.2 - review notebook - multidataset.ipynb index 19fd620..07cbd17 100644 --- a/114.2 - review notebook - multidataset.ipynb +++ b/114.2 - review notebook - multidataset.ipynb @@ -24,92 +24,71 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "import pandas as pd\n", - "from 
sklearn.model_selection import train_test_split\n", - "from pathlib import Path\n", - "from torch.optim.adam import Adam\n", - "from flair.datasets import CSVClassificationCorpus\n", - "from flair.data import Corpus\n", - "from flair.embeddings import TransformerDocumentEmbeddings\n", - "from flair.models import TextClassifier\n", - "from flair.trainers import ModelTrainer\n", - "from tasks.wsd_gloss import enclose_keyword\n" + "%load_ext autoreload\n", + "%autoreload 2" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "def gloss_dfs(df):\n", - " \"\"\"convert rows in dataframe to GlossBERT format\n", - " \"\"\"\n", - " df['enclosed_quotation'] = df.apply(enclose_keyword, axis=1)\n", - " df_gl = df[['enclosed_quotation','definition','label']]\n", - " return df_gl[['enclosed_quotation','label']],df_gl[['definition','label']].drop_duplicates()\n", - " \n" + "from tasks.wsd_gloss import create_md_training_data, train_gloss_and_context" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 27, "metadata": {}, "outputs": [], - "source": [ - "lemma,pos='machine','NN'\n", - "df_quotations = pd.read_pickle(f'./data/sfrel_quotations_{lemma}_{pos}.pickle')\n", - "df_quotations = df_quotations[~df_quotations.keyword_offset.isnull()]\n", - "df_quotations = df_quotations[~df_quotations.definition.isnull()].reset_index(drop=True)\n" - ] + "source": [] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "(29752, 14)\n(29321, 14)\n" - ] - } - ], + "outputs": [], "source": [ - "print(df_quotations.shape) \n", - "quotation_valuecounts = df_quotations.sense_id.value_counts()\n", - "df_quotations = df_quotations[df_quotations.sense_id.isin(quotation_valuecounts[quotation_valuecounts>1].index)]\n", - "print(df_quotations.shape)" + "lemma, pos = 'machine','NN'\n", + "senses = {'machine_nn01-38474140'} # machine_nn01-38475772 machine_nn01-38475923 machine_nn01-38475835 machine_nn01-38474140\n", + "relations = ['seed','synonym'] # ,'descendant','sibling'\n", + "eval_mode = \"lemma_etal\" # lemma or lemma_etal\n", + "experiment_id = 0" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": {}, "outputs": [ { - "output_type": "error", - "ename": "KeyError", - "evalue": "\"['label'] not in index\"", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcontext_df\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mgloss_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgloss_dfs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_quotations\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m\u001b[0m in \u001b[0;36mgloss_dfs\u001b[0;34m(df)\u001b[0m\n\u001b[1;32m 3\u001b[0m \"\"\"\n\u001b[1;32m 4\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'enclosed_quotation'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0menclose_keyword\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mdf_gl\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'enclosed_quotation'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'definition'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdf_gl\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'enclosed_quotation'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mdf_gl\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'definition'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop_duplicates\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2804\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_iterator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2805\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2806\u001b[0;31m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_listlike_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mraise_missing\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2807\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2808\u001b[0m \u001b[0;31m# take() does not accept boolean indexers\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_get_listlike_indexer\u001b[0;34m(self, key, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1552\u001b[0m self._validate_read_indexer(\n\u001b[0;32m-> 1553\u001b[0;31m \u001b[0mkeyarr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis_number\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mraise_missing\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mraise_missing\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1554\u001b[0m )\n\u001b[1;32m 1555\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mkeyarr\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mindexer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_validate_read_indexer\u001b[0;34m(self, key, indexer, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1644\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"loc\"\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mraise_missing\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1645\u001b[0m \u001b[0mnot_found\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0max\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1646\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"{not_found} not in index\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1647\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1648\u001b[0m \u001b[0;31m# we skip the warning on Categorical/Interval\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyError\u001b[0m: \"['label'] not in index\"" + "output_type": "stream", + "name": "stdout", + "text": [ + "# senses before filtering by date = 517\n", + "# senses after filtering by date = 433\n", + "\n", + "\n", + "# of seed senses 26 \n", + "# of synonyms 383 \n", + "# of branch senses 0\n", + "\n", + "\n", + "# of seeds selected 1 \n", + "# of synonyms selected 44 \n", + "# of branches selected 0\n", + "[LOG] #rows before removing None vector (1947, 21)\n", + "[LOG] #rows after removing None vector (1911, 21)\n" ] } ], "source": [ - "#context_df,gloss_df = gloss_dfs((df_quotations))" + "create_md_training_data(lemma,pos,senses,relations,experiment_id=experiment_id)" ] }, { @@ -118,72 +97,9 @@ "metadata": {}, "outputs": [], "source": [ - "print(context_df.shape,gloss_df.shape)" + "train_gloss_and_context(lemma,pos,experiment_id=experiment_id)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_data_path = Path(\"./data/training_data_ms\")\n", - "train_data_path.mkdir(exist_ok=True)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "output_type": "error", - "ename": "ValueError", - "evalue": "The least populated class in y has only 1 member, which is too few. 
The minimum number of groups for any class cannot be less than 2.", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m42\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mshuffle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m stratify=df[['sense_id']]) # 1st\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m df_train, df_val = train_test_split(df_train, \n", - "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/sklearn/model_selection/_split.py\u001b[0m in \u001b[0;36mtrain_test_split\u001b[0;34m(*arrays, **options)\u001b[0m\n\u001b[1;32m 2150\u001b[0m random_state=random_state)\n\u001b[1;32m 2151\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2152\u001b[0;31m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marrays\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstratify\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2153\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2154\u001b[0m return list(chain.from_iterable((_safe_indexing(a, train),\n", - "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/sklearn/model_selection/_split.py\u001b[0m in \u001b[0;36msplit\u001b[0;34m(self, X, y, groups)\u001b[0m\n\u001b[1;32m 1339\u001b[0m \"\"\"\n\u001b[1;32m 1340\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgroups\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mindexable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgroups\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1341\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_iter_indices\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgroups\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1342\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1343\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/sklearn/model_selection/_split.py\u001b[0m in \u001b[0;36m_iter_indices\u001b[0;34m(self, X, y, groups)\u001b[0m\n\u001b[1;32m 1666\u001b[0m \u001b[0mclass_counts\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbincount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_indices\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1667\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclass_counts\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1668\u001b[0;31m raise ValueError(\"The least populated class in y has only 1\"\n\u001b[0m\u001b[1;32m 1669\u001b[0m \u001b[0;34m\" member, which is too few. The minimum\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1670\u001b[0m \u001b[0;34m\" number of groups for any class cannot\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mValueError\u001b[0m: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2." - ] - } - ], - "source": [ - "for n, df in [('context',context_df),('gloss',gloss_df)]:\n", - "\n", - " df_out_path = train_data_path / f'{lemma}_{pos}_{n}'\n", - " df_out_path.mkdir(exist_ok=True)\n", - "\n", - " df_train, df_test = train_test_split(df, \n", - " test_size=0.2, \n", - " random_state=42,\n", - " shuffle=True,\n", - " stratify=df[['label']]\n", - " ) # 1st\n", - " \n", - " df_train, df_val = train_test_split(df_train, \n", - " test_size=0.1, \n", - " random_state=42,\n", - " shuffle=True,\n", - " stratify=df_train[['label']] # bug here, try to do the stratification better\n", - " ) # 2nd\n", - " \n", - " df_train.to_csv(df_out_path / 'train.csv', index = False, sep='\\t') \n", - " df_val.to_csv(df_out_path / 'dev.csv', index = False, sep='\\t') \n", - " df_test.to_csv(df_out_path / 'test.csv', index = False, sep='\\t')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, From 39d736c5ed6672ba8693e099cbe76ea62562ac4f Mon Sep 17 00:00:00 2001 From: kasparvonbeelen Date: Mon, 18 Jan 2021 13:03:17 +0000 Subject: [PATCH 09/10] add typing to functions --- tasks/wsd_gloss.py | 127 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 118 insertions(+), 9 deletions(-) diff --git a/tasks/wsd_gloss.py b/tasks/wsd_gloss.py index efd90ae..c736c48 100644 --- a/tasks/wsd_gloss.py +++ b/tasks/wsd_gloss.py @@ -1,14 +1,19 @@ import pandas as pd from sklearn.model_selection import train_test_split -from pathlib import Path +from pathlib import Path, PosixPath +from utils.classificaton_utils import binarize from torch.optim.adam import Adam from flair.datasets import CSVClassificationCorpus -from flair.data import Corpus +from flair.data import MultiCorpus from flair.embeddings import TransformerDocumentEmbeddings from flair.models import TextClassifier from flair.trainers import ModelTrainer -def enclose_keyword(row,enclose_token='"'): +# --------------------------------------- +# glossbert method ---------------------- + +def enclose_keyword(row:pd.Series, + enclose_token:str='"'): """enclose keyword with specific token to point learner towards to word it has to focus on """ @@ -21,11 +26,11 @@ def enclose_keyword(row,enclose_token='"'): sentence+=c return sentence -def to_glossbert_format(df): +def to_glossbert_format(df:pd.DataFrame): """convert rows in dataframe to GlossBERT format """ - def gloss_string(row, definition): + 
def gloss_string(row:pd.Series, definition:str): """combine gloss with quoations and keyword """ @@ -34,8 +39,8 @@ def gloss_string(row, definition): out_string+=row.enclosed_quotation out_string+=' [SEP] ' out_string+=row.keyword+': ' - if row.definition: - out_string+=definition + #if row.definition: + out_string+=definition return out_string df['enclosed_quotation'] = df.apply(enclose_keyword, axis=1) @@ -51,7 +56,8 @@ def gloss_string(row, definition): return pd.DataFrame(rows, columns=['text','label','sense_id']) -def create_glossbert_data(lemma,pos): +def create_glossbert_data(lemma:str, + pos:str): """create glossbert data from quotations dataframe """ @@ -75,7 +81,9 @@ def create_glossbert_data(lemma,pos): return df_out_path -def train_glossbert(data_folder,downsample=False): +def train_glossbert(data_folder:PosixPath, + downsample:bool=False): + column_name_map = {0: "text", 1: "label"} corpus = CSVClassificationCorpus(data_folder, @@ -104,3 +112,104 @@ def train_glossbert(data_folder,downsample=False): mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine max_epochs=50, # terminate after 5 epochs ) + + +# --------------------------------------- +# multidataset training ----------------- + + +def context_gloss_dfs(df:pd.DataFrame): + """convert rows in dataframe to GlossBERT format + """ + df = df[~df.keyword_offset.isnull()] + df = df[~df.definition.isnull()].reset_index(drop=True) + df['enclosed_quotation'] = df.apply(enclose_keyword, axis=1) + df_gl = df[['enclosed_quotation','definition','label']] + return df_gl[['enclosed_quotation','label']],df_gl[['definition','label']].drop_duplicates() + +def create_md_training_data(lemma:str, + pos:str, + senses:set, + relations:list, + experiment_id:int=0, + eval_mode:str='lemma_etal'): + """create data for multidataset training + """ + df_train, df_val, df_test = binarize(lemma, + pos, + senses, + relations, + strict_filter=True, + start=1700, + end=2000, + eval_mode=eval_mode) + + data = list(map(context_gloss_dfs,[df_train, df_val, df_test])) + + train_data_path = Path("./data/training_data_md") + train_data_path.mkdir(exist_ok=True) + + for context, gloss in data: + for n, df in [('context',context),('gloss',gloss)]: + + df_out_path= train_data_path / f'{lemma}_{pos}_{experiment_id}_{n}' + df_out_path.mkdir(exist_ok=True) + + df_train, df_test = train_test_split(df, + test_size=0.2, + random_state=42, + shuffle=True, + stratify=df[['label']] + ) # 1st + + df_train, df_val = train_test_split(df_train, + test_size=0.1, + random_state=42, + shuffle=True, + stratify=df_train[['label']] # bug here, try to do the stratification better + ) # 2nd + + df_train.to_csv(df_out_path / 'train.csv', index = False, sep='\t') + df_val.to_csv(df_out_path / 'dev.csv', index = False, sep='\t') + df_test.to_csv(df_out_path / 'test.csv', index = False, sep='\t') + +def train_gloss_and_context(lemma:str, + pos:str, + experiment_id:int=0, + data_folder:PosixPath=Path("./data/training_data_md"), + downsample:bool=False): + column_name_map = {0: "text", 1: "label"} + + context_corpus = CSVClassificationCorpus(data_folder / f"{lemma}_{pos}_{experiment_id}_context", + column_name_map, + skip_header=True, + delimiter='\t', # tab-separated files + ) + gloss_corpus = CSVClassificationCorpus(data_folder / f"{lemma}_{pos}_{experiment_id}_gloss", + column_name_map, + skip_header=True, + delimiter='\t', # tab-separated files + ) + + corpus = MultiCorpus([context_corpus, gloss_corpus]) + + if downsample: + print('Downsampling...') 
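+        # downsample(0.1) keeps a random 10% of each split, handy for quick test runs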
+            corpus = corpus.downsample(0.1)
+
+    label_dict = corpus.make_label_dictionary()
+    print(label_dict)
+
+    document_embeddings = TransformerDocumentEmbeddings('bert-base-uncased', fine_tune=True)
+
+    classifier = TextClassifier(document_embeddings, label_dictionary=label_dict) # loss_weights={"1":10, "0":1}
+
+    trainer = ModelTrainer(classifier, corpus, optimizer=Adam)
+
+    trainer.train('models/classifier/glossbert',
+                  learning_rate=1e-3, # use very small learning rate
+                  mini_batch_size=16,
+                  embeddings_storage_mode='gpu',
+                  mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine
+                  max_epochs=50, # terminate after 50 epochs
+                  )
\ No newline at end of file

From d547400f55536c35d9ce3dea5fcce519704a9aa0 Mon Sep 17 00:00:00 2001
From: kasparvonbeelen
Date: Mon, 18 Jan 2021 15:33:29 +0000
Subject: [PATCH 10/10] update documentation for gloss function

---
 tasks/wsd_gloss.py           | 113 +++++++++++++++---
 utils/classificaton_utils.py | 224 +++++++++++++++++------------------
 2 files changed, 207 insertions(+), 130 deletions(-)

diff --git a/tasks/wsd_gloss.py b/tasks/wsd_gloss.py
index c736c48..cc32b98 100644
--- a/tasks/wsd_gloss.py
+++ b/tasks/wsd_gloss.py
@@ -13,9 +13,19 @@
 # glossbert method ----------------------
 
 def enclose_keyword(row:pd.Series,
-                    enclose_token:str='"'):
+                    enclose_token:str='"') -> str:
     """enclose keyword with specific token to point
-    learner towards to word it has to focus on
+    the learner towards the word it has to focus on. This
+    is part of the weak supervision when learning
+    from context/quotations.
+
+    Arguments:
+        row (pd.Series): row of quotations dataframe
+        enclose_token (str): token used to mark the target expression;
+                    effectively this serves as begin and end token
+
+    Returns:
+        quotation with the target expression marked by `enclose_token`
     """
     sentence = ''
     for i,c in enumerate(row.full_text):
@@ -26,12 +36,26 @@
         sentence+=c
     return sentence
 
-def to_glossbert_format(df:pd.DataFrame):
+def to_glossbert_format(df:pd.DataFrame) -> pd.DataFrame:
     """convert rows in dataframe to GlossBERT format
+
+    Argument:
+        df (pd.DataFrame): quotations dataframe
+
+    Returns:
+        pd.DataFrame with format conforming to the
+        GlossBERT template
     """
 
-    def gloss_string(row:pd.Series, definition:str):
-        """combine gloss with quoations and keyword
+    def gloss_string(row:pd.Series, definition:str) -> str:
+        """combine gloss with quotations and keyword
+
+        Arguments:
+            row (pd.Series): row of dataframe
+            definition (str): definition to use as gloss
+
+        Returns:
+            out_string that combines a quotation/context
+            with a gloss separated by [SEP]
         """
 
         out_string=''
@@ -46,19 +70,28 @@
 
     df['enclosed_quotation'] = df.apply(enclose_keyword, axis=1)
     rows = []
+
+    # create labelled observations: 1 if the context matches the definition,
+    # 0 for the other cases (this method uses weak supervision)
     for _ ,row in df.iterrows():
-        rows.append([gloss_string(row, row.definition), "Yes", row.sense_id])
+        rows.append([gloss_string(row, row.definition), "1", row.sense_id])
        definitions = df[df.lemma==row.lemma].definition.unique()
        for d in definitions:
            if d != row.definition:
-                rows.append([gloss_string(row,d), "No",row.sense_id])
+                rows.append([gloss_string(row,d), "0",row.sense_id])
 
     return pd.DataFrame(rows, columns=['text','label','sense_id'])
 
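+# A hypothetical illustration of the GlossBERT pairs built above (quotation and
+# glosses invented for this sketch): a single quotation for "machine" yields
+#   'the "machine" roared into life [SEP] machine: a mechanical apparatus' -> "1"
+#   'the "machine" roared into life [SEP] machine: a scheme or plot'      -> "0"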
 def create_glossbert_data(lemma:str,
-                          pos:str):
+                          pos:str) -> PosixPath:
+    """Create glossbert data from quotations dataframe
+
+    Arguments:
+        lemma (str): lemma
+        pos (str): part-of-speech
+
+    Return:
+        path as PosixPath to location where data is stored
+    """
 
     df_quotations = pd.read_pickle(f'./data/sfrel_quotations_{lemma}_{pos}.pickle')
@@ -82,7 +115,18 @@
 
     return df_out_path
 
 def train_glossbert(data_folder:PosixPath,
-                    downsample:bool=False):
+                    downsample:bool=False) -> bool:
+    """train a GlossBERT model
+
+    Arguments:
+        data_folder (PosixPath): folder where train/dev and
+                    test set are stored as csv files
+        downsample (bool): if True we use only ten per cent
+                    of the data for training and testing,
+                    primarily used for demo purposes
+
+    Return:
+        return True after training
+    """
 
     column_name_map = {0: "text", 1: "label"}
 
@@ -112,20 +156,29 @@
                   mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine
                   max_epochs=50, # terminate after 50 epochs
                   )
+
+    return True
 
 
 # ---------------------------------------
 # multidataset training -----------------
 
 
-def context_gloss_dfs(df:pd.DataFrame):
-    """convert rows in dataframe to GlossBERT format
+def context_gloss_dfs(df:pd.DataFrame) -> tuple:
+    """split the quotations dataframe into a context/quotation
+    dataframe and a gloss dataframe.
+
+    Arguments:
+        df (pd.DataFrame): quotations dataframe
+
+    Returns:
+        a tuple in the format (context_df, gloss_df)
     """
     df = df[~df.keyword_offset.isnull()]
     df = df[~df.definition.isnull()].reset_index(drop=True)
     df['enclosed_quotation'] = df.apply(enclose_keyword, axis=1)
     df_gl = df[['enclosed_quotation','definition','label']]
-    return df_gl[['enclosed_quotation','label']],df_gl[['definition','label']].drop_duplicates()
+    return (df_gl[['enclosed_quotation','label']],
+            df_gl[['definition','label']].drop_duplicates())
 
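+# Hypothetical illustration of the split (values invented): the context df holds
+# rows such as ('... the "machine" roared into life ...', 1), while the gloss df
+# holds the deduplicated ('a mechanical apparatus', 1) definition/label rows.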
 def create_md_training_data(lemma:str,
                             pos:str,
                             senses:set,
                             relations:list,
                             experiment_id:int=0,
                             eval_mode:str='lemma_etal'):
-    """create data for multidataset training
+    """create data for multidataset training in which
+    we train a model simultaneously on quotations and glosses.
+
+    Arguments:
+        lemma (str): lemma
+        pos (str): part-of-speech
+        senses (set): senses that define the positive class
+        relations (list): relations used for expanding the senses
+        experiment_id (int): integer used to identify the experiment
+        eval_mode (str): evaluation mode (lemma or lemma_etal)
+
     """
     df_train, df_val, df_test = binarize(lemma,
                                         pos,
                                         senses,
                                         relations,
                                         strict_filter=True,
                                         start=1700,
                                         end=2000,
                                         eval_mode=eval_mode)
@@ -166,7 +229,7 @@
                                         test_size=0.1,
                                         random_state=42,
                                         shuffle=True,
-                                        stratify=df_train[['label']] # bug here, try to do the stratification better
+                                        stratify=df_train[['label']]
                                         ) # 2nd
 
            df_train.to_csv(df_out_path / 'train.csv', index = False, sep='\t')
            df_val.to_csv(df_out_path / 'dev.csv', index = False, sep='\t')
            df_test.to_csv(df_out_path / 'test.csv', index = False, sep='\t')
 
 def train_gloss_and_context(lemma:str,
                             pos:str,
                             experiment_id:int=0,
                             data_folder:PosixPath=Path("./data/training_data_md"),
-                            downsample:bool=False):
+                            downsample:bool=False) -> bool:
+    """fine-tune a transformer model on both the context and the gloss
+
+    Arguments:
+        lemma (str): lemma
+        pos (str): part-of-speech
+        experiment_id (int): integer used to identify experiment
+        data_folder (PosixPath): main folder for storing the
+                    context and gloss folders
+        downsample (bool): if True we use only 10% of the data
+                    for training and testing
+
+    Returns:
+        returns True after model has finished training
+    """
+
     column_name_map = {0: "text", 1: "label"}
@@ -212,4 +290,5 @@
                   embeddings_storage_mode='gpu',
                   mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine
                   max_epochs=50, # terminate after 50 epochs
-                  )
\ No newline at end of file
+                  )
+    return True
\ No newline at end of file
diff --git a/utils/classificaton_utils.py b/utils/classificaton_utils.py
index 3eb8861..91f8112 100644
--- a/utils/classificaton_utils.py
+++ b/utils/classificaton_utils.py
@@ -10,9 +10,7 @@
 from typing import Union
 from utils.dataset_download import *
 from sklearn.model_selection import train_test_split
-from tasks import wsd
-from utils import nlp_tools
-#import swifter
+
 
 cosine_similiarity = lambda x, target : 1 - cosine(x,target)
 
@@ -310,116 +308,116 @@ def merge_definitions(row):
 
 # Deprecated code
 # To be removed before release
-def eval_lemma(lemma,
-               pos,
-               idx,
-               embedding_methods,
-               start=1760,
-               end=1920,
-               vector_type='vector_bert_base_-1,-2,-3,-4_mean',
-               skip_vectorize=False,
-               train_on_dev=True):
-
-    quotations_path = f"./data/sfrel_quotations_{lemma}_{pos}.pickle"
-
-    if not skip_vectorize:
-        vectorize_target_expressions(quotations_path,embedding_methods)
-
-    lemma_senses = pd.read_pickle(f'./data/lemma_senses_{lemma}_{pos}.pickle')
-    senses = set(lemma_senses[lemma_senses.word_id==f'{lemma}_{pos.lower()}{idx}'].id)
-
-    relations = ['seed','synonym'] # ,'descendant','sibling'
-    eval_mode = "lemma_etal" # lemma or lemma_etal
-
-    wemb_model = Word2Vec.load("/deezy_datadrive/kaspar-playground/dictionary_expansion/HistoricalDictionaryExpansion/models/w2v_004/w2v_words.model")
-    y_true,y_pred_bin_centr, y_pred_ts_bin_centr,y_pred_sense_centr,y_pred_ts_sense_centr, rand, token_overlap,w2v_lesk = [], [],[],[], [], [],[], []
-
-
-    tqdm.pandas()
-
-    for sense in senses:
-
-        print(sense)
-        df_train, df_val, df_test = binarize(lemma,
-                                        pos,
-                                        {sense},
-                                        relations,
-                                        strict_filter=True,
-                                        start=start,
-                                        end=end,
-                                        eval_mode=eval_mode)
-        # no quotations for sense and timeframe
-        if df_train is None: continue
-
-
-
-        y_true.extend(df_test.label.to_list())
-
-
-        if train_on_dev:
-            df_train = pd.concat([df_train, 
df_val], axis=0) - - df_train["nlp_full_text"] = df_train.apply(lambda row: nlp_tools.preprocess(row["full_text"]), axis=1) - - df_val["nlp_full_text"] = df_val.apply(lambda row: nlp_tools.preprocess(row["full_text"]), axis=1) - - df_test["nlp_full_text"] = df_test.apply(lambda row: nlp_tools.preprocess(row["full_text"]), axis=1) - - # random - df_test["random"] = df_test.progress_apply(lambda row: wsd.random_predict(), axis=1) - rand.extend(df_test["random"].to_list()) - - # token overlap - df_selected_senses = generate_definition_df(df_train,lemma,eval_mode=eval_mode) - df_selected_senses["nlp_definition"] = df_selected_senses.apply (lambda row: nlp_tools.preprocess(row["definition"]), axis=1) - df_test["def_tok_overlap_ranking"] = df_test.progress_apply (lambda row: wsd.tok_overlap_ranking(row["nlp_full_text"], df_selected_senses), axis=1) - token_overlap.extend(df_test["def_tok_overlap_ranking"].to_list()) - - #w2v lesk - # Warning: I use a Word2vec model trained on all 19thC BL corpus that is locally stored. - #df_test["w2v_lesk_ranking"] = df_test.progress_apply (lambda row: wsd.w2v_lesk_ranking(row["nlp_full_text"], df_selected_senses, wemb_model), axis=1) - #w2v_lesk.extend(df_test['w2v_lesk_ranking'].to_list()) - - # binary centroid - centroid_vectors = df_train.groupby('label')[vector_type].apply(np.mean,axis=0) - df_test[f"bert_centroid_binary_{vector_type}"] = df_test[vector_type].progress_apply(wsd.bert_binary_centroid_vector, - centroid_vectors = centroid_vectors, - ) - y_pred_bin_centr.extend(df_test[f"bert_centroid_binary_{vector_type}"].to_list()) - #results[f"bert_centroid_binary_{vector_type}_{sense}"] = (wsd.eval(f"bert_centroid_binary_{vector_type}",df_test),len(df_test)) - - # binary centroid time sensitive - df_test[f"bert_ts_centroid_binary_{vector_type}"] = df_test.progress_apply(wsd.bert_ts_binary_centroid_vector, df_train=df_train, axis=1) - y_pred_ts_bin_centr.extend(df_test[f"bert_ts_centroid_binary_{vector_type}"].to_list()) - - #results[f"bert_ts_centroid_binary_{vector_type}_{sense}"] = (wsd.eval(f"bert_ts_centroid_binary_{vector_type}",df_test),len(df_test)) - - # sense level centroid - senseid2label = dict(df_test[['sense_id','label']].values) - df_test[f"bert_centroid_sense_{vector_type}"] = df_test.progress_apply(wsd.bert_sense_centroid_vector, - senseid2label= senseid2label, - vector_col=vector_type, - df_train = df_train, axis=1) - - y_pred_sense_centr.extend(df_test[f"bert_centroid_sense_{vector_type}"].to_list()) - - df_test[f"bert_ts_centroid_sense_{vector_type}"] = df_test.progress_apply(wsd.bert_ts_sense_centroid_vector, - senseid2label= senseid2label, - vector_col=vector_type, - df_train = df_train, axis=1) - - y_pred_ts_sense_centr.extend(df_test[f"bert_centroid_sense_{vector_type}"].to_list()) - #results[f"bert_centroid_sense_{vector_type}_{sense}"] = (wsd.eval(f"bert_centroid_sense_{vector_type}",df_test),len(df_test)) - # semaxis - #centroid_vectors = df_train.groupby('label')[vector_type].apply(np.mean,axis=0) - #sem_axis = centroid_vectors[1] - centroid_vectors[0] - #df_test[f"bert_semaxis_{vector_type}"] = df_test[vector_type].progress_apply(wsd.bert_semaxis_vector, sem_axis=sem_axis, return_label=True, threshold=.0) - #y_pred_semaxis.extend(df_test[f"bert_semaxis_{vector_type}"].to_list()) - #results[f"bert_semaxis_{vector_type}_{sense}"] = (wsd.eval(f"bert_semaxis_{vector_type}",df_test),len(df_test)) - - #df_test.to_pickle(f'./data/results/{lemma}_{pos}_{sense}.results') - - return y_true, 
y_pred_bin_centr,y_pred_ts_bin_centr,y_pred_sense_centr,y_pred_ts_sense_centr, rand, token_overlap, w2v_lesk +#def eval_lemma(lemma, +# pos, +# idx, +# embedding_methods, +# start=1760, +# end=1920, +# vector_type='vector_bert_base_-1,-2,-3,-4_mean', +# skip_vectorize=False, +# train_on_dev=True): +# +# quotations_path = f"./data/sfrel_quotations_{lemma}_{pos}.pickle" +# +# if not skip_vectorize: +# vectorize_target_expressions(quotations_path,embedding_methods) +# +# lemma_senses = pd.read_pickle(f'./data/lemma_senses_{lemma}_{pos}.pickle') +# senses = set(lemma_senses[lemma_senses.word_id==f'{lemma}_{pos.lower()}{idx}'].id) +# +# relations = ['seed','synonym'] # ,'descendant','sibling' +# eval_mode = "lemma_etal" # lemma or lemma_etal +# +# wemb_model = Word2Vec.load("/deezy_datadrive/kaspar-playground/dictionary_expansion/HistoricalDictionaryExpansion/models/w2v_004/w2v_words.model") +# y_true,y_pred_bin_centr, y_pred_ts_bin_centr,y_pred_sense_centr,y_pred_ts_sense_centr, rand, token_overlap,w2v_lesk = [], [],[],[], [], [],[], [] +# +# +# tqdm.pandas() +# +# for sense in senses: +# +# print(sense) +# df_train, df_val, df_test = binarize(lemma, +# pos, +# {sense}, +# relations, +# strict_filter=True, +# start=start, +# end=end, +# eval_mode=eval_mode) +# # no quotations for sense and timeframe +# if df_train is None: continue +# +# +# +# y_true.extend(df_test.label.to_list()) +# +# +# if train_on_dev: +# df_train = pd.concat([df_train, df_val], axis=0) +# +# df_train["nlp_full_text"] = df_train.apply(lambda row: nlp_tools.preprocess(row["full_text"]), axis=1) +# +# df_val["nlp_full_text"] = df_val.apply(lambda row: nlp_tools.preprocess(row["full_text"]), axis=1) +# +# df_test["nlp_full_text"] = df_test.apply(lambda row: nlp_tools.preprocess(row["full_text"]), axis=1) +# +# # random +# df_test["random"] = df_test.progress_apply(lambda row: wsd.random_predict(), axis=1) +# rand.extend(df_test["random"].to_list()) +# +# # token overlap +# df_selected_senses = generate_definition_df(df_train,lemma,eval_mode=eval_mode) +# df_selected_senses["nlp_definition"] = df_selected_senses.apply (lambda row: nlp_tools.preprocess(row["definition"]), axis=1) +# df_test["def_tok_overlap_ranking"] = df_test.progress_apply (lambda row: wsd.tok_overlap_ranking(row["nlp_full_text"], df_selected_senses), axis=1) +# token_overlap.extend(df_test["def_tok_overlap_ranking"].to_list()) +# +# #w2v lesk +# # Warning: I use a Word2vec model trained on all 19thC BL corpus that is locally stored. 
+# #df_test["w2v_lesk_ranking"] = df_test.progress_apply (lambda row: wsd.w2v_lesk_ranking(row["nlp_full_text"], df_selected_senses, wemb_model), axis=1) +# #w2v_lesk.extend(df_test['w2v_lesk_ranking'].to_list()) +# +# # binary centroid +# centroid_vectors = df_train.groupby('label')[vector_type].apply(np.mean,axis=0) +# df_test[f"bert_centroid_binary_{vector_type}"] = df_test[vector_type].progress_apply(wsd.bert_binary_centroid_vector, +# centroid_vectors = centroid_vectors, +# ) +# y_pred_bin_centr.extend(df_test[f"bert_centroid_binary_{vector_type}"].to_list()) +# #results[f"bert_centroid_binary_{vector_type}_{sense}"] = (wsd.eval(f"bert_centroid_binary_{vector_type}",df_test),len(df_test)) +# +# # binary centroid time sensitive +# df_test[f"bert_ts_centroid_binary_{vector_type}"] = df_test.progress_apply(wsd.bert_ts_binary_centroid_vector, df_train=df_train, axis=1) +# y_pred_ts_bin_centr.extend(df_test[f"bert_ts_centroid_binary_{vector_type}"].to_list()) +# +# #results[f"bert_ts_centroid_binary_{vector_type}_{sense}"] = (wsd.eval(f"bert_ts_centroid_binary_{vector_type}",df_test),len(df_test)) +# +# # sense level centroid +# senseid2label = dict(df_test[['sense_id','label']].values) +# df_test[f"bert_centroid_sense_{vector_type}"] = df_test.progress_apply(wsd.bert_sense_centroid_vector, +# senseid2label= senseid2label, +# vector_col=vector_type, +# df_train = df_train, axis=1) +# +# y_pred_sense_centr.extend(df_test[f"bert_centroid_sense_{vector_type}"].to_list()) +# +# df_test[f"bert_ts_centroid_sense_{vector_type}"] = df_test.progress_apply(wsd.bert_ts_sense_centroid_vector, +# senseid2label= senseid2label, +# vector_col=vector_type, +# df_train = df_train, axis=1) +# +# y_pred_ts_sense_centr.extend(df_test[f"bert_centroid_sense_{vector_type}"].to_list()) +# #results[f"bert_centroid_sense_{vector_type}_{sense}"] = (wsd.eval(f"bert_centroid_sense_{vector_type}",df_test),len(df_test)) +# # semaxis +# #centroid_vectors = df_train.groupby('label')[vector_type].apply(np.mean,axis=0) +# #sem_axis = centroid_vectors[1] - centroid_vectors[0] +# #df_test[f"bert_semaxis_{vector_type}"] = df_test[vector_type].progress_apply(wsd.bert_semaxis_vector, sem_axis=sem_axis, return_label=True, threshold=.0) +# #y_pred_semaxis.extend(df_test[f"bert_semaxis_{vector_type}"].to_list()) +# #results[f"bert_semaxis_{vector_type}_{sense}"] = (wsd.eval(f"bert_semaxis_{vector_type}",df_test),len(df_test)) +# +# #df_test.to_pickle(f'./data/results/{lemma}_{pos}_{sense}.results') +# +# return y_true, y_pred_bin_centr,y_pred_ts_bin_centr,y_pred_sense_centr,y_pred_ts_sense_centr, rand, token_overlap, w2v_lesk def bert_avg_quot_nn_wsd(query_vector: np.array,