From 0b8c06eb2cc2d4fa695d6c743ce96db9f4e70b09 Mon Sep 17 00:00:00 2001 From: kasparvonbeelen Date: Thu, 14 Jan 2021 10:54:32 +0000 Subject: [PATCH 01/10] initial commit --- tasks/wsd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/wsd.py b/tasks/wsd.py index 74657b3..566c63f 100644 --- a/tasks/wsd.py +++ b/tasks/wsd.py @@ -106,7 +106,7 @@ def svm_wemb_baseline(df_train,df_test,wemb_model): return y_pred ### --------------------------------------------------- -# BERT CENTROID METHODS +#  BERT CENTROID METHODS ### --------------------------------------------------- # binary centroid vectors From 0e0fd9b3f1a0b754894a1dd9d8127d747020c53f Mon Sep 17 00:00:00 2001 From: kasparvonbeelen Date: Thu, 14 Jan 2021 16:35:39 +0000 Subject: [PATCH 02/10] add model fine-tuning approach --- 114.1 - review notebook - glossbert.ipynb | 378 ++++++++++++++++++++++ 1 file changed, 378 insertions(+) create mode 100644 114.1 - review notebook - glossbert.ipynb diff --git a/114.1 - review notebook - glossbert.ipynb b/114.1 - review notebook - glossbert.ipynb new file mode 100644 index 0000000..c7e679f --- /dev/null +++ b/114.1 - review notebook - glossbert.ipynb @@ -0,0 +1,378 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.5-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python3", + "display_name": "Python 3", + "language": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import pickle\n", + "import pandas as pd\n", + "from tasks import wsd\n", + "from pathlib import Path\n", + "from tasks import wsd\n", + "from utils import nlp_tools\n", + "from tqdm.auto import tqdm\n", + "import numpy as np\n", + "import json\n", + "from sklearn.metrics import classification_report\n", + "from flair.embeddings import TransformerWordEmbeddings\n", + "from utils.dataset_download import harvest_data_from_extended_senses\n", + "from utils.classificaton_utils import binarize, vectorize_target_expressions,cosine_similiarity,eval_lemma" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "lemma = 'machine'\n", + "pos = 'NN'\n", + "senses = {'machine_nn01-38474140'} # machine_nn01-38475772 machine_nn01-38475923 machine_nn01-38475835 machine_nn01-38474140\n", + "relations = ['seed','synonym'] # ,'descendant','sibling'\n", + "eval_mode = \"lemma_etal\" # lemma or lemma_etal\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "# senses before filtering by date = 517\n", + "# senses after filtering by date = 433\n", + "\n", + "\n", + "# of seed senses 26 \n", + "# of synonyms 383 \n", + "# of branch senses 0\n", + "\n", + "\n", + "# of seeds selected 1 \n", + "# of synonyms selected 44 \n", + "# of branches selected 0\n", + "[LOG] #rows before removing None vector (1947, 21)\n", + "[LOG] #rows after removing None vector (1911, 21)\n" + ] + } + ], + "source": [ + "df_train, df_val, df_test = 
binarize(lemma,\n", + " pos,\n", + " senses, \n", + " relations,\n", + " strict_filter=True,\n", + " start=1700,\n", + " end=2000,\n", + " eval_mode=eval_mode)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " sense_id lemma_definition \\\n", + "0 body_nn01-17170653 The complete physical form of a person or anim... \n", + "1 man_nn01-110482153 An adult male human being. Without explicit co... \n", + "2 body_nn01-17169813 The complete physical form of a person or anim... \n", + "\n", + " definition word_id lemma \\\n", + "0 Particular technical uses. The part of a vehic... body_nn01 body \n", + "1 As vocative or as int., introducing a remark o... man_nn01 man \n", + "2 Contrasted with the soul. Cf. soul body n. at ... body_nn01 body \n", + "\n", + " quotation_id source \\\n", + "0 body_nn01-132916428 {'title': 'Material Handling Engin.', 'author'... \n", + "1 man_nn01-110482440 {'title': 'Shaela', 'author': 'R. Bulter', 'ge... \n", + "2 body_nn01-17169857 {'title': 'Ess. Man', 'author': 'A. Pope', 'ge... \n", + "\n", + " text year \\\n", + "0 {'keyword': 'bodies', 'full_text': 'After car ... 1990.0 \n", + "1 {'keyword': 'Min', 'full_text': 'Min A'm vexed... 1976.0 \n", + "2 {'keyword': 'Body', 'full_text': 'All are but ... 1733.0 \n", + "\n", + " full_text ... keyword_offset \\\n", + "0 After car bodies are painted, they are moved i... ... 10.0 \n", + "1 Min A'm vexed ta hear yun. ... 0.0 \n", + "2 All are but parts of one stupendous Whole, Who... ... 49.0 \n", + "\n", + " vector_bert_base_-1,-2,-3,-4_mean \\\n", + "0 [1.2747291, 0.25178745, 0.69486666, 0.42832682... \n", + "1 [-0.10557328, 0.24347349, 0.731555, -0.4305202... \n", + "2 [0.8197431, 0.04237363, 0.6312159, -0.2658673,... \n", + "\n", + " vector_blert_-1,-2,-3,-4_mean label id daterange \\\n", + "0 [1.5054287, 1.1386966, 1.3405375, 0.8012274, -... 0 NaN NaN \n", + "1 [-0.49209523, 0.7658461, 0.07512934, 0.0148925... 0 NaN NaN \n", + "2 [0.60478234, 0.58020014, 0.053836707, -0.06571... 0 NaN NaN \n", + "\n", + " provenance provenance_type relation_to_core_senses relation_to_seed_senses \n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "\n", + "[3 rows x 21 columns]" + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "df_train.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Index(['sense_id', 'lemma_definition', 'definition', 'word_id', 'lemma',\n", + " 'quotation_id', 'source', 'text', 'year', 'full_text', 'keyword',\n", + " 'keyword_offset', 'vector_bert_base_-1,-2,-3,-4_mean',\n", + " 'vector_blert_-1,-2,-3,-4_mean', 'label', 'id', 'daterange',\n", + " 'provenance', 'provenance_type', 'relation_to_core_senses',\n", + " 'relation_to_seed_senses'],\n", + " dtype='object')" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ], + "source": [ + "df_train.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "def enclose_keyword(row,enclose_token='[TARGET]'):\n", + " \"\"\"enclose keyword with specific token to point\n", + " learner towards to word it has to focus on\n", + " \"\"\"\n", + " sentence = ''\n", + " for i,c in enumerate(row.full_text):\n", + " if i == int(row.keyword_offset):\n", + " sentence+=enclose_token + ' '\n", + " elif i ==int(row.keyword_offset + len(row.keyword)):\n", + " sentence+= ' ' + enclose_token\n", + " sentence+=c\n", + " return sentence\n", + "\n", + "def merge_quotation_gloss(row):\n", + " out_string = '[GLOSS] '\n", + " if row.definition:\n", + " out_string+=row.definition\n", + " out_string+=' [QUOT] ' \n", + " if row.enclosed_quotation:\n", + " out_string+=row.enclosed_quotation\n", + " return out_string\n", + "\n", + "def merge_quotation_keyword(row):\n", + " out_string = '[TARGET] '\n", + " if row.keyword:\n", + " out_string+=row.keyword\n", + " out_string+=' [QUOT] ' \n", + " if row.enclosed_quotation:\n", + " out_string+=row.enclosed_quotation\n", + " return out_string\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "path = Path('./data/training_data')\n", + "path.mkdir(exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "csv_out_path = path / f\"{lemma}_{'_'.join(senses)}\"\n", + "csv_out_path.mkdir(exist_ok=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "df_train['enclosed_quotation'] = df_train.apply(enclose_keyword, axis=1)\n", + "df_train['train_text'] = df_train.apply(merge_quotation_keyword, axis=1)\n", + "df_train[['train_text','label']].to_csv(csv_out_path / \"train.csv\",index = False, sep='\\t') \n", + "df_val['enclosed_quotation'] = df_val.apply(enclose_keyword, axis=1)\n", + "df_val['train_text'] = df_val.apply(merge_quotation_keyword, axis=1)\n", + "df_val[['train_text','label']].to_csv(csv_out_path / \"dev.csv\",index = False, sep='\\t') \n", + "df_test['enclosed_quotation'] = df_test.apply(enclose_keyword, axis=1)\n", + "df_test['train_text'] = df_test.apply(merge_quotation_keyword, axis=1)\n", + "df_test[['train_text','label']].to_csv(csv_out_path / \"test.csv\",index = False, sep='\\t') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2021-01-14 16:01:53,769 Reading data from 
data/training_data/machine_machine_nn01-38474140\n", + "2021-01-14 16:01:53,770 Train: data/training_data/machine_machine_nn01-38474140/train.csv\n", + "2021-01-14 16:01:53,770 Dev: data/training_data/machine_machine_nn01-38474140/dev.csv\n", + "2021-01-14 16:01:53,770 Test: data/training_data/machine_machine_nn01-38474140/test.csv\n" + ] + } + ], + "source": [ + "from flair.data import Corpus\n", + "from flair.datasets import CSVClassificationCorpus\n", + "\n", + "# this is the folder in which train, test and dev files reside\n", + "data_folder = csv_out_path\n", + "\n", + "# column format indicating which columns hold the text and label(s)\n", + "column_name_map = {0: \"text\", 1: \"label\"}\n", + "\n", + "# load corpus containing training, test and dev data and if CSV has a header, you can skip it\n", + "corpus: Corpus = CSVClassificationCorpus(data_folder,\n", + " column_name_map,\n", + " skip_header=True,\n", + " delimiter='\\t', # tab-separated files\n", + ") " + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2021-01-14 16:01:56,791 Computing label dictionary. Progress:\n", + "100%|██████████| 1604/1604 [00:00<00:00, 2103.07it/s]2021-01-14 16:01:57,857 [b'0', b'1']\n", + "Dictionary with 2 tags: 0, 1\n", + "\n" + ] + } + ], + "source": [ + "# 2. create the label dictionary\n", + "label_dict = corpus.make_label_dictionary()\n", + "print(label_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.optim.adam import Adam\n", + "from flair.data import Corpus\n", + "from flair.datasets import TREC_6\n", + "from flair.embeddings import TransformerDocumentEmbeddings\n", + "from flair.models import TextClassifier\n", + "from flair.trainers import ModelTrainer\n", + "\n", + "\n", + "\n", + "\n", + "# 3. initialize transformer document embeddings (many models are available)\n", + "document_embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased', fine_tune=True)\n", + "\n", + "# 4. create the text classifier\n", + "classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)\n", + "\n", + "# 5. initialize the text classifier trainer with Adam optimizer\n", + "trainer = ModelTrainer(classifier, corpus, optimizer=Adam)\n", + "\n", + "# 6. 
start the training\n", + "trainer.train('models/taggers/trec',\n", + " learning_rate=1e-5, # use very small learning rate\n", + " mini_batch_size=16,\n", + " mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine\n", + " max_epochs=5, # terminate after 5 epochs\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ] +} \ No newline at end of file From acc2cb960a845ddb5d51a926bcd75df00e1601de Mon Sep 17 00:00:00 2001 From: kasparvonbeelen Date: Thu, 14 Jan 2021 17:53:03 +0000 Subject: [PATCH 03/10] add glossbert method as draft --- 114.1 - review notebook - glossbert.ipynb | 914 ++++++++++++++++++++-- 1 file changed, 852 insertions(+), 62 deletions(-) diff --git a/114.1 - review notebook - glossbert.ipynb b/114.1 - review notebook - glossbert.ipynb index c7e679f..60288ba 100644 --- a/114.1 - review notebook - glossbert.ipynb +++ b/114.1 - review notebook - glossbert.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -51,12 +51,19 @@ "from sklearn.metrics import classification_report\n", "from flair.embeddings import TransformerWordEmbeddings\n", "from utils.dataset_download import harvest_data_from_extended_senses\n", - "from utils.classificaton_utils import binarize, vectorize_target_expressions,cosine_similiarity,eval_lemma" + "from utils.classificaton_utils import binarize, vectorize_target_expressions,cosine_similiarity,eval_lemma\n", + "from torch.optim.adam import Adam\n", + "from flair.datasets import CSVClassificationCorpus\n", + "from flair.data import Corpus\n", + "from flair.datasets import TREC_6\n", + "from flair.embeddings import TransformerDocumentEmbeddings\n", + "from flair.models import TextClassifier\n", + "from flair.trainers import ModelTrainer" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -70,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 71, "metadata": {}, "outputs": [ { @@ -107,7 +114,353 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " sense_id \\\n", + "238 machine_nn01-38474140 \n", + "305 machine_nn01-38474140 \n", + "713 machine_nn01-38474140 \n", + "938 machine_nn01-38474140 \n", + "1042 machine_nn01-38474140 \n", + "1056 machine_nn01-38474140 \n", + "\n", + " lemma_definition \\\n", + "238 A complex device, consisting of a number of in... \n", + "305 A complex device, consisting of a number of in... \n", + "713 A complex device, consisting of a number of in... \n", + "938 A complex device, consisting of a number of in... \n", + "1042 A complex device, consisting of a number of in... \n", + "1056 A complex device, consisting of a number of in... \n", + "\n", + " definition word_id \\\n", + "238 A living body, esp. the human body considered ... machine_nn01 \n", + "305 A living body, esp. the human body considered ... machine_nn01 \n", + "713 A living body, esp. the human body considered ... machine_nn01 \n", + "938 A living body, esp. the human body considered ... machine_nn01 \n", + "1042 A living body, esp. the human body considered ... machine_nn01 \n", + "1056 A living body, esp. the human body considered ... 
machine_nn01 \n", + "\n", + " lemma quotation_id \\\n", + "238 machine machine_nn01-38474169 \n", + "305 machine machine_nn01-38474177 \n", + "713 machine machine_nn01-38474195 \n", + "938 machine machine_nn01-38474223 \n", + "1042 machine machine_nn01-38474203 \n", + "1056 machine machine_nn01-38474212 \n", + "\n", + " source \\\n", + "238 {'title': 'Death's Vision', 'author': 'J. Reyn... \n", + "305 {'title': 'Spectator', 'author': 'J. Addison',... \n", + "713 {'title': 'Med. & Physical Jrnl.', 'author': N... \n", + "938 {'title': 'Of Human Bondage', 'author': 'W. S.... \n", + "1042 {'title': 'Poems', 'author': 'W. Wordsworth', ... \n", + "1056 {'title': 'Telegraphy', 'author': 'W. H. Preec... \n", + "\n", + " text year \\\n", + "238 {'keyword': 'Machins', 'full_text': 'What Nobl... 1709.0 \n", + "305 {'keyword': 'Machine', 'full_text': 'Cheerfuln... 1712.0 \n", + "713 {'keyword': 'machine', 'full_text': 'When a pr... 1805.0 \n", + "938 {'keyword': 'machine', 'full_text': 'He wonder... 1915.0 \n", + "1042 {'keyword': 'machine', 'full_text': 'And now I... 1807.0 \n", + "1056 {'keyword': 'machine', 'full_text': 'The human... 1876.0 \n", + "\n", + " full_text ... keyword_offset \\\n", + "238 What Nobler Souls the Nobler Machins Wear. ... 29.0 \n", + "305 Cheerfulness is..the best Promoter of Health. ... ... 70.0 \n", + "713 When a product of diseased action has been eff... ... 82.0 \n", + "938 He wondered whether at the very end, now that ... ... 50.0 \n", + "1042 And now I see with eye serene The very pulse o... ... 52.0 \n", + "1056 The human machine tires, and as a consequence ... ... 10.0 \n", + "\n", + " vector_bert_base_-1,-2,-3,-4_mean \\\n", + "238 [0.5628562, -0.04788875, 0.074935675, -0.22630... \n", + "305 [0.0052292813, 0.12355395, 0.023108626, 0.2251... \n", + "713 [0.25928053, 0.049638785, 0.022315167, 0.34901... \n", + "938 [0.38040048, 0.38440758, 0.45397452, 0.1211486... \n", + "1042 [-0.46428305, 0.013232344, -0.595714, 0.049642... \n", + "1056 [0.6930934, 0.09074756, -0.13974331, 0.1105655... \n", + "\n", + " vector_blert_-1,-2,-3,-4_mean label \\\n", + "238 [-0.15516208, 0.289941, -0.15124893, -0.206332... 1 \n", + "305 [-0.04755735, 0.20182909, 0.33001357, -0.04851... 1 \n", + "713 [-0.16033216, -0.16846322, 0.5062964, 0.102019... 1 \n", + "938 [-0.059219074, 0.23112743, 0.42189148, 0.02944... 1 \n", + "1042 [0.021248298, 0.28699854, 0.24638082, -0.01793... 1 \n", + "1056 [0.11798739, -0.0029160888, 0.29418808, -0.076... 1 \n", + "\n", + " id \\\n", + "238 machine_nn01-38474140 \n", + "305 machine_nn01-38474140 \n", + "713 machine_nn01-38474140 \n", + "938 machine_nn01-38474140 \n", + "1042 machine_nn01-38474140 \n", + "1056 machine_nn01-38474140 \n", + "\n", + " daterange \\\n", + "238 {'end': None, 'start': 1604, 'obsolete': False... \n", + "305 {'end': None, 'start': 1604, 'obsolete': False... \n", + "713 {'end': None, 'start': 1604, 'obsolete': False... \n", + "938 {'end': None, 'start': 1604, 'obsolete': False... \n", + "1042 {'end': None, 'start': 1604, 'obsolete': False... \n", + "1056 {'end': None, 'start': 1604, 'obsolete': False... 
\n", + "\n", + " provenance provenance_type \\\n", + "238 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", + "305 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", + "713 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", + "938 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", + "1042 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", + "1056 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", + "\n", + " relation_to_core_senses relation_to_seed_senses \n", + "238 {machine_nn01-38474140} {machine_nn01-38474140} \n", + "305 {machine_nn01-38474140} {machine_nn01-38474140} \n", + "713 {machine_nn01-38474140} {machine_nn01-38474140} \n", + "938 {machine_nn01-38474140} {machine_nn01-38474140} \n", + "1042 {machine_nn01-38474140} {machine_nn01-38474140} \n", + "1056 {machine_nn01-38474140} {machine_nn01-38474140} \n", + "\n", + "[6 rows x 21 columns]" + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 72 + } + ], + "source": [ + "df_train[df_train.sense_id=='machine_nn01-38474140']" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array(['body_nn01', 'man_nn01', 'machine_nn01', 'carcass_nn01',\n", + " 'person_nn01', 'case_nn02', 'personage_nn01', 'corporeity_nn01',\n", + " 'structure_nn01', 'dust_nn01', 'case_nn01', 'automaton_nn01',\n", + " 'earth_nn01', 'soma_nn02', 'bulk_nn01', 'microcosm_nn01',\n", + " 'personality_nn01', 'tabernacle_nn01', 'vessel_nn01',\n", + " 'corpse_nn01', 'case_nn04', 'clay_nn01', 'clod_nn01',\n", + " 'skinful_nn01', 'carrion_nn01', 'embodiment_nn01', 'corpus_nn01',\n", + " 'flesh_nn01', 'soma_nn01', 'bloodbulk_nn01', 'earth_nn02',\n", + " 'soulcase_nn02', 'corporation_nn01', 'chassis_nn01', 'bulk_nn03',\n", + " 'bouk_nn01', 'outwall_nn01', 'case_nn03', 'incarnation_nn01',\n", + " 'bonehouse_nn01', 'man_nn04', 'bulk_nn02', 'soulcase_nn01',\n", + " 'godsimage_nn01', 'quarrons_nn01'], dtype=object)" + ] + }, + "metadata": {}, + "execution_count": 73 + } + ], + "source": [ + "df_train.word_id.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " sense_id \\\n", + "0 body_nn01-17170653 \n", + "1 man_nn01-110482153 \n", + "2 body_nn01-17169813 \n", + "3 machine_nn01-38474877 \n", + "4 carcass_nn01-10177258 \n", + "... ... \n", + "1215 man_nn01-110479060 \n", + "1216 person_nn01-30950985 \n", + "1217 clay_nn01-9320873 \n", + "1218 case_nn02-10018131 \n", + "1220 man_nn01-110487579 \n", + "\n", + " lemma_definition \\\n", + "0 The complete physical form of a person or anim... \n", + "1 An adult male human being. Without explicit co... \n", + "2 The complete physical form of a person or anim... \n", + "3 A complex device, consisting of a number of in... \n", + "4 The dead body of a person or animal; but no lo... \n", + "... ... \n", + "1215 An adult male human being. Without explicit co... \n", + "1216 An individual human being; a man, woman, or ch... \n", + "1217 A stiff viscous earth found, in many varieties... \n", + "1218 A box, bag, or other receptacle, designed to c... \n", + "1220 An adult male human being. Without explicit co... \n", + "\n", + " definition word_id \\\n", + "0 Particular technical uses. The part of a vehic... body_nn01 \n", + "1 As vocative or as int., introducing a remark o... man_nn01 \n", + "2 Contrasted with the soul. Cf. soul body n. at ... body_nn01 \n", + "3 A bicycle or tricycle; a motorcycle. Formerly ... machine_nn01 \n", + "4 The naked framework or ‘shell’ of a building b... carcass_nn01 \n", + "... ... ... \n", + "1215 A husband. Now chiefly English regional (north... man_nn01 \n", + "1216 Law. An individual (natural person n.) or corp... person_nn01 \n", + "1217 Short for clay-pipe n. at compounds 2 (colloq... clay_nn01 \n", + "1218 slang. A house, esp. one used as a brothel. Cf... case_nn02 \n", + "1220 In Cumbria: a cairn marking a summit or promin... man_nn01 \n", + "\n", + " lemma quotation_id \\\n", + "0 body body_nn01-132916428 \n", + "1 man man_nn01-110482440 \n", + "2 body body_nn01-17169857 \n", + "3 machine machine_nn01-38474966 \n", + "4 carcass carcass_nn01-10177295 \n", + "... ... ... 
\n", + "1215 man man_nn01-110479206 \n", + "1216 person person_nn01-30951076 \n", + "1217 clay clay_nn01-9320896 \n", + "1218 case case_nn02-10018191 \n", + "1220 man man_nn01-110487624 \n", + "\n", + " source \\\n", + "0 {'title': 'Material Handling Engin.', 'author'... \n", + "1 {'title': 'Shaela', 'author': 'R. Bulter', 'ge... \n", + "2 {'title': 'Ess. Man', 'author': 'A. Pope', 'ge... \n", + "3 {'title': 'National Trust Mag.', 'author': Non... \n", + "4 {'title': 'New Pract. Builder', 'author': 'P. ... \n", + "... ... \n", + "1215 {'title': 'Four Years S. Afr.', 'author': 'C. ... \n", + "1216 {'title': 'Daily News', 'author': None, 'gende... \n", + "1217 {'title': 'Held in Bondage', 'author': '‘Ouida... \n", + "1218 {'title': 'Mop Fair', 'author': 'A. M. Binstea... \n", + "1220 {'title': 'Northern Affair', 'author': 'D. K. ... \n", + "\n", + " text year \\\n", + "0 {'keyword': 'bodies', 'full_text': 'After car ... 1990.0 \n", + "1 {'keyword': 'Min', 'full_text': 'Min A'm vexed... 1976.0 \n", + "2 {'keyword': 'Body', 'full_text': 'All are but ... 1733.0 \n", + "3 {'keyword': 'machines', 'full_text': 'The cycl... 1992.0 \n", + "4 {'keyword': 'Carcase', 'full_text': 'Carcase o... 1823.0 \n", + "... ... ... \n", + "1215 {'keyword': 'man', 'full_text': 'The wife brok... 1829.0 \n", + "1216 {'keyword': 'persons', 'full_text': 'A Bill..e... 1900.0 \n", + "1217 {'keyword': 'clays', 'full_text': 'Filthy bird... 1863.0 \n", + "1218 {'keyword': 'case', 'full_text': 'They arrange... 1905.0 \n", + "1220 {'keyword': 'man', 'full_text': 'Over the elep... 1964.0 \n", + "\n", + " full_text ... keyword_offset \\\n", + "0 After car bodies are painted, they are moved i... ... 10.0 \n", + "1 Min A'm vexed ta hear yun. ... 0.0 \n", + "2 All are but parts of one stupendous Whole, Who... ... 49.0 \n", + "3 The cyclists..took on the circular 21- or 42-m... ... 92.0 \n", + "4 Carcase of a Building, the naked walls, and th... ... 0.0 \n", + "... ... ... ... \n", + "1215 The wife broke out, ‘You lament a brother, and... ... 79.0 \n", + "1216 A Bill..extending to juridical persons, that i... ... 31.0 \n", + "1217 Filthy bird's-eye, smoked in clays. ... 29.0 \n", + "1218 They arranges to stop ‘private’ in Brighton, a... ... 57.0 \n", + "1220 Over the elephant rocks and under the lee of t... ... 55.0 \n", + "\n", + " vector_bert_base_-1,-2,-3,-4_mean \\\n", + "0 [1.2747291, 0.25178745, 0.69486666, 0.42832682... \n", + "1 [-0.10557328, 0.24347349, 0.731555, -0.4305202... \n", + "2 [0.8197431, 0.04237363, 0.6312159, -0.2658673,... \n", + "3 [-0.18150243, -0.24230756, -0.3336587, 0.34879... \n", + "4 [0.6567496, -0.050804906, 0.31024605, 0.059706... \n", + "... ... \n", + "1215 [-0.07307064, -0.31692728, 0.38834277, -0.2980... \n", + "1216 [0.030711764, 0.28706473, 0.6596842, -0.132111... \n", + "1217 [-0.016634814, 0.6912965, -0.18498293, -0.2104... \n", + "1218 [0.16278893, -0.17927478, 0.34916735, -0.34717... \n", + "1220 [0.12908892, 0.1654679, -0.077464886, -0.44454... \n", + "\n", + " vector_blert_-1,-2,-3,-4_mean label \\\n", + "0 [1.5054287, 1.1386966, 1.3405375, 0.8012274, -... 0 \n", + "1 [-0.49209523, 0.7658461, 0.07512934, 0.0148925... 0 \n", + "2 [0.60478234, 0.58020014, 0.053836707, -0.06571... 0 \n", + "3 [-0.14852196, 0.69629294, 0.30973893, 0.598406... 0 \n", + "4 [0.41240987, 0.10217035, 0.48574266, 0.8627304... 0 \n", + "... ... ... \n", + "1215 [-0.20098017, 0.47577783, 0.013388823, -0.2808... 0 \n", + "1216 [-0.42745396, 0.4621299, 0.34301567, 0.2193956... 
0 \n", + "1217 [-0.2833503, 0.80949837, -0.5981247, 0.4331013... 0 \n", + "1218 [0.3253876, 0.12327082, -0.077930324, 0.450299... 0 \n", + "1220 [-0.4877532, 0.62317544, -0.4543179, -0.167910... 0 \n", + "\n", + " id \\\n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 machine_nn01-38474877 \n", + "4 NaN \n", + "... ... \n", + "1215 NaN \n", + "1216 NaN \n", + "1217 NaN \n", + "1218 NaN \n", + "1220 NaN \n", + "\n", + " daterange \\\n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 {'end': None, 'start': 1823, 'obsolete': False... \n", + "4 NaN \n", + "... ... \n", + "1215 NaN \n", + "1216 NaN \n", + "1217 NaN \n", + "1218 NaN \n", + "1220 NaN \n", + "\n", + " provenance provenance_type \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 [[machine_nn01-38474877, seed, machine_nn01]] seed \n", + "4 NaN NaN \n", + "... ... ... \n", + "1215 NaN NaN \n", + "1216 NaN NaN \n", + "1217 NaN NaN \n", + "1218 NaN NaN \n", + "1220 NaN NaN \n", + "\n", + " relation_to_core_senses relation_to_seed_senses \n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 {machine_nn01-38474877} {machine_nn01-38474877} \n", + "4 NaN NaN \n", + "... ... ... \n", + "1215 NaN NaN \n", + "1216 NaN NaN \n", + "1217 NaN NaN \n", + "1218 NaN NaN \n", + "1220 NaN NaN \n", + "\n", + "[1135 rows x 21 columns]" + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 74 + } + ], + "source": [ + "df_train[df_train.label==\"0\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 75, "metadata": {}, "outputs": [ { @@ -159,7 +512,7 @@ "text/html": "
" }, "metadata": {}, - "execution_count": 6 + "execution_count": 75 } ], "source": [ @@ -168,7 +521,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 76, "metadata": {}, "outputs": [ { @@ -185,7 +538,7 @@ ] }, "metadata": {}, - "execution_count": 7 + "execution_count": 76 } ], "source": [ @@ -194,11 +547,11 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 100, "metadata": {}, "outputs": [], "source": [ - "def enclose_keyword(row,enclose_token='[TARGET]'):\n", + "def enclose_keyword(row,enclose_token='\"'):\n", " \"\"\"enclose keyword with specific token to point\n", " learner towards to word it has to focus on\n", " \"\"\"\n", @@ -211,30 +564,115 @@ " sentence+=c\n", " return sentence\n", "\n", - "def merge_quotation_gloss(row):\n", - " out_string = '[GLOSS] '\n", - " if row.definition:\n", - " out_string+=row.definition\n", - " out_string+=' [QUOT] ' \n", - " if row.enclosed_quotation:\n", - " out_string+=row.enclosed_quotation\n", - " return out_string\n", + "#def merge_quotation_gloss(row):\n", + "# out_string = '[GLOSS] '\n", + "# if row.definition:\n", + "# out_string+=row.definition\n", + "# out_string+=' [QUOT] ' \n", + "# if row.enclosed_quotation:\n", + "# out_string+=row.enclosed_quotation\n", + "# return out_string\n", + "\n", + "#def prep_train_text(row):\n", + "# out_string='[TAGET] '+row.keyword+' [TAGET] : '\n", + "# if row.definition:\n", + "# out_string+=row.definition\n", + "# out_string+=' [SEP] ' \n", + "# if row.enclosed_quotation:\n", + "# out_string+=row.enclosed_quotation\n", + "# return out_string\n", "\n", - "def merge_quotation_keyword(row):\n", - " out_string = '[TARGET] '\n", - " if row.keyword:\n", - " out_string+=row.keyword\n", - " out_string+=' [QUOT] ' \n", - " if row.enclosed_quotation:\n", - " out_string+=row.enclosed_quotation\n", - " return out_string\n", + "#def prep_test_text(row):\n", + "# out_string='[TAGET] '+row.keyword+' [TAGET] : '\n", + "# if row.enclosed_quotation:\n", + "# out_string+=row.enclosed_quotation\n", + "# return out_string\n", "\n", + "#def merge_quotation_keyword(row):\n", + "# out_string = '[TARGET] '\n", + "# if row.keyword:\n", + "# out_string+=row.keyword\n", + "# out_string+=' [QUOT] ' \n", + "# if row.enclosed_quotation:\n", + "# out_string+=row.enclosed_quotation\n", + "# return out_string\n", "\n" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 101, + "metadata": {}, + "outputs": [], + "source": [ + "def to_glossbert_format(df):\n", + " def gloss_string(row, definition):\n", + " out_string=''\n", + " if row.enclosed_quotation:\n", + " out_string+=row.enclosed_quotation\n", + " out_string+=' [SEP] ' \n", + " out_string+=row.keyword+': '\n", + " if row.definition:\n", + " out_string+=definition\n", + " return out_string\n", + "\n", + " df['enclosed_quotations'] = df.apply(enclose_keyword, axis=1)\n", + " \n", + " rows = [] \n", + " for i,row in df.iterrows():\n", + " rows.append([gloss_string(row, row.definition), 1])\n", + " definitions = df[df.lemma==row.lemma].definition.unique()\n", + " for d in definitions:\n", + " if d != row.definition:\n", + " rows.append([gloss_string(row,d), 0])\n", + " \n", + " return rows\n", + "\n", + "df_gloss_train = to_glossbert_format(df_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the 
production schedule with the number of bodies painted a specific color. [SEP] bodies: Particular technical uses. The part of a vehicle fitted to receive the load; spec. the part of a motor car in which driver and passengers sit, or the fuselage of an aeroplane. Cf. cart-body n. at cart n. compounds 2, wide-body n.',\n", + " 1],\n", + " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: Contrasted with the soul. Cf. soul body n. at soul n. compounds 4.',\n", + " 0],\n", + " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: Particular technical uses. The main part of a musical instrument, which in the case of traditional stringed instruments forms a resonating chamber.',\n", + " 0],\n", + " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: The complete physical form of a person or animal; the assemblage of parts, organs, and tissues that constitutes the whole material organism.',\n", + " 0],\n", + " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: A comprehensive and systematic collection of information, or of the details of any subject, esp. law; a textbook, a pandect. Usually with of.',\n", + " 0],\n", + " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: A corpse.',\n", + " 0],\n", + " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: The physical or mortal nature, state, or aspect of man. Frequently in in (the) body, out of (the) body and variants, sometimes contrasted with in spirit.',\n", + " 0],\n", + " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: More widely: a material thing, an object; something that has physical existence and extension in space.',\n", + " 0],\n", + " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: Cell Biology. Any of various normal or abnormal structures found within the cytoplasm or nucleus of a cell. Frequently with distinguishing word.',\n", + " 0],\n", + " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: Originally: †size or bulk; quantity (obsolete). 
In later use: a quantity, mass, or area of something.',\n", + " 0]]" + ] + }, + "metadata": {}, + "execution_count": 102 + } + ], + "source": [ + "df_gloss_train[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ @@ -245,7 +683,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ @@ -255,47 +693,90 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ "df_train['enclosed_quotation'] = df_train.apply(enclose_keyword, axis=1)\n", - "df_train['train_text'] = df_train.apply(merge_quotation_keyword, axis=1)\n", - "df_train[['train_text','label']].to_csv(csv_out_path / \"train.csv\",index = False, sep='\\t') \n", + "df_train['text'] = df_train.apply(prep_train_text, axis=1)\n", + "df_train[['text','label']].to_csv(csv_out_path / \"train.csv\",index = False, sep='\\t') \n", "df_val['enclosed_quotation'] = df_val.apply(enclose_keyword, axis=1)\n", - "df_val['train_text'] = df_val.apply(merge_quotation_keyword, axis=1)\n", - "df_val[['train_text','label']].to_csv(csv_out_path / \"dev.csv\",index = False, sep='\\t') \n", + "df_val['text'] = df_val.apply(prep_test_text, axis=1)\n", + "df_val[['text','label']].to_csv(csv_out_path / \"dev.csv\",index = False, sep='\\t') \n", "df_test['enclosed_quotation'] = df_test.apply(enclose_keyword, axis=1)\n", - "df_test['train_text'] = df_test.apply(merge_quotation_keyword, axis=1)\n", - "df_test[['train_text','label']].to_csv(csv_out_path / \"test.csv\",index = False, sep='\\t') " + "df_test['text'] = df_test.apply(prep_test_text, axis=1)\n", + "df_test[['text','label']].to_csv(csv_out_path / \"test.csv\",index = False, sep='\\t') " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 81, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "387 [TAGET] Men [TAGET] : He canton'd out the Coun...\n", + "1491 [TAGET] earths [TAGET] : Ley-grounds cannot be...\n", + "1841 [TAGET] earth [TAGET] : It is well to see the ...\n", + "1244 [TAGET] person [TAGET] : The administrator..ha...\n", + "1809 [TAGET] earth [TAGET] : While I drove by in my...\n", + " ... \n", + "736 [TAGET] machines [TAGET] : ‘Anyone,’ declared,...\n", + "610 [TAGET] machine [TAGET] : To each mortal perad...\n", + "1612 [TAGET] body [TAGET] : The coffee, we know, st...\n", + "1128 [TAGET] Personalities [TAGET] : Wisdom, Learni...\n", + "1281 [TAGET] person [TAGET] : I'm a people [TARGET]...\n", + "Name: text, Length: 383, dtype: object" + ] + }, + "metadata": {}, + "execution_count": 81 + } + ], + "source": [ + "df_test.text" + ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'[TAGET] bodies [TAGET] : Particular technical uses. The part of a vehicle fitted to receive the load; spec. the part of a motor car in which driver and passengers sit, or the fuselage of an aeroplane. Cf. cart-body n. at cart n. compounds 2, wide-body n. 
[SEP] After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color.'" + ] + }, + "metadata": {}, + "execution_count": 82 + } + ], + "source": [ + "df_train.iloc[0].text" + ] + }, + { + "cell_type": "code", + "execution_count": 83, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ - "2021-01-14 16:01:53,769 Reading data from data/training_data/machine_machine_nn01-38474140\n", - "2021-01-14 16:01:53,770 Train: data/training_data/machine_machine_nn01-38474140/train.csv\n", - "2021-01-14 16:01:53,770 Dev: data/training_data/machine_machine_nn01-38474140/dev.csv\n", - "2021-01-14 16:01:53,770 Test: data/training_data/machine_machine_nn01-38474140/test.csv\n" + "2021-01-14 17:14:29,725 Reading data from data/training_data/machine_machine_nn01-38474140\n", + "2021-01-14 17:14:29,726 Train: data/training_data/machine_machine_nn01-38474140/train.csv\n", + "2021-01-14 17:14:29,727 Dev: data/training_data/machine_machine_nn01-38474140/dev.csv\n", + "2021-01-14 17:14:29,727 Test: data/training_data/machine_machine_nn01-38474140/test.csv\n" ] } ], "source": [ - "from flair.data import Corpus\n", - "from flair.datasets import CSVClassificationCorpus\n", + "\n", "\n", "# this is the folder in which train, test and dev files reside\n", "data_folder = csv_out_path\n", @@ -313,15 +794,15 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 84, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ - "2021-01-14 16:01:56,791 Computing label dictionary. Progress:\n", - "100%|██████████| 1604/1604 [00:00<00:00, 2103.07it/s]2021-01-14 16:01:57,857 [b'0', b'1']\n", + "2021-01-14 17:14:29,745 Computing label dictionary. 
Progress:\n", + "100%|██████████| 1604/1604 [00:01<00:00, 1060.11it/s]2021-01-14 17:14:31,699 [b'0', b'1']\n", "Dictionary with 2 tags: 0, 1\n", "\n" ] @@ -335,25 +816,320 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 85, + "metadata": { + "tags": [ + "outputPrepend" + ] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "eps=1e-12, elementwise_affine=True)\n", + " (ffn): FFN(\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", + " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", + " )\n", + " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " )\n", + " (4): TransformerBlock(\n", + " (attention): MultiHeadSelfAttention(\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", + " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", + " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n", + " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", + " )\n", + " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (ffn): FFN(\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", + " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", + " )\n", + " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " )\n", + " (5): TransformerBlock(\n", + " (attention): MultiHeadSelfAttention(\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", + " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", + " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n", + " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", + " )\n", + " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (ffn): FFN(\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", + " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", + " )\n", + " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (decoder): Linear(in_features=768, out_features=2, bias=True)\n", + " (loss_function): CrossEntropyLoss()\n", + " (beta): 1.0\n", + " (weights): {b'1': 10, b'0': 1}\n", + " (weight_tensor) tensor([1., 1.], device='cuda:0')\n", + ")\"\n", + "2021-01-14 17:14:33,683 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:14:33,684 Corpus: \"Corpus: 1221 train + 306 dev + 383 test sentences\"\n", + "2021-01-14 17:14:33,684 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:14:33,685 Parameters:\n", + "2021-01-14 17:14:33,685 - learning_rate: \"1e-05\"\n", + "2021-01-14 17:14:33,686 - mini_batch_size: \"16\"\n", + "2021-01-14 17:14:33,686 - patience: \"3\"\n", + "2021-01-14 17:14:33,687 - anneal_factor: \"0.5\"\n", + "2021-01-14 17:14:33,687 - max_epochs: \"10\"\n", + "2021-01-14 17:14:33,688 - shuffle: \"True\"\n", + "2021-01-14 17:14:33,688 - train_with_dev: \"False\"\n", + "2021-01-14 17:14:33,689 - batch_growth_annealing: \"False\"\n", + "2021-01-14 17:14:33,690 
----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:14:33,690 Model training base path: \"models/taggers/trec\"\n", + "2021-01-14 17:14:33,691 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:14:33,691 Device: cuda:0\n", + "2021-01-14 17:14:33,692 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:14:33,692 Embeddings storage mode: cpu\n", + "2021-01-14 17:14:33,693 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:14:38,343 epoch 1 - iter 7/77 - loss 0.48898176 - samples/sec: 26.27 - lr: 0.000010\n", + "2021-01-14 17:14:42,454 epoch 1 - iter 14/77 - loss 0.42082447 - samples/sec: 27.53 - lr: 0.000010\n", + "2021-01-14 17:14:46,658 epoch 1 - iter 21/77 - loss 0.33773888 - samples/sec: 26.85 - lr: 0.000010\n", + "2021-01-14 17:14:50,844 epoch 1 - iter 28/77 - loss 0.31560597 - samples/sec: 27.01 - lr: 0.000010\n", + "2021-01-14 17:14:54,998 epoch 1 - iter 35/77 - loss 0.25972683 - samples/sec: 27.14 - lr: 0.000010\n", + "2021-01-14 17:14:59,209 epoch 1 - iter 42/77 - loss 0.23569006 - samples/sec: 26.75 - lr: 0.000010\n", + "2021-01-14 17:15:03,408 epoch 1 - iter 49/77 - loss 0.24985709 - samples/sec: 26.85 - lr: 0.000010\n", + "2021-01-14 17:15:07,633 epoch 1 - iter 56/77 - loss 0.23229837 - samples/sec: 26.77 - lr: 0.000010\n", + "2021-01-14 17:15:11,797 epoch 1 - iter 63/77 - loss 0.23326370 - samples/sec: 27.06 - lr: 0.000010\n", + "2021-01-14 17:15:16,012 epoch 1 - iter 70/77 - loss 0.21914055 - samples/sec: 26.79 - lr: 0.000010\n", + "2021-01-14 17:15:19,739 epoch 1 - iter 77/77 - loss 0.20128365 - samples/sec: 30.18 - lr: 0.000010\n", + "2021-01-14 17:15:19,814 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:15:19,814 EPOCH 1 done: loss 0.2013 - lr 0.0000100\n", + "2021-01-14 17:15:23,961 DEV : loss 0.33609411120414734 - score 0.9281\n", + "2021-01-14 17:15:24,224 BAD EPOCHS (no improvement): 0\n", + "saving best model\n", + "2021-01-14 17:15:25,155 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:15:29,794 epoch 2 - iter 7/77 - loss 0.00740182 - samples/sec: 26.44 - lr: 0.000010\n", + "2021-01-14 17:15:34,024 epoch 2 - iter 14/77 - loss 0.03390250 - samples/sec: 26.74 - lr: 0.000010\n", + "2021-01-14 17:15:38,226 epoch 2 - iter 21/77 - loss 0.03571354 - samples/sec: 26.91 - lr: 0.000010\n", + "2021-01-14 17:15:42,369 epoch 2 - iter 28/77 - loss 0.03136397 - samples/sec: 27.22 - lr: 0.000010\n", + "2021-01-14 17:15:46,510 epoch 2 - iter 35/77 - loss 0.03331735 - samples/sec: 27.22 - lr: 0.000010\n", + "2021-01-14 17:15:50,650 epoch 2 - iter 42/77 - loss 0.07917234 - samples/sec: 27.26 - lr: 0.000010\n", + "2021-01-14 17:15:54,832 epoch 2 - iter 49/77 - loss 0.07227532 - samples/sec: 26.94 - lr: 0.000010\n", + "2021-01-14 17:15:59,093 epoch 2 - iter 56/77 - loss 0.06382573 - samples/sec: 26.50 - lr: 0.000010\n", + "2021-01-14 17:16:03,303 epoch 2 - iter 63/77 - loss 0.08917253 - samples/sec: 26.79 - lr: 0.000010\n", + "2021-01-14 17:16:07,583 epoch 2 - iter 70/77 - loss 0.08041374 - samples/sec: 26.33 - lr: 0.000010\n", + "2021-01-14 17:16:11,374 epoch 2 - iter 77/77 - loss 0.08118116 - samples/sec: 29.72 - lr: 0.000010\n", + "2021-01-14 17:16:11,437 
----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:16:11,438 EPOCH 2 done: loss 0.0812 - lr 0.0000100\n", + "2021-01-14 17:16:15,649 DEV : loss 0.5065702795982361 - score 0.9248\n", + "2021-01-14 17:16:15,909 BAD EPOCHS (no improvement): 1\n", + "2021-01-14 17:16:15,910 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:16:20,491 epoch 3 - iter 7/77 - loss 0.00894024 - samples/sec: 26.72 - lr: 0.000010\n", + "2021-01-14 17:16:24,744 epoch 3 - iter 14/77 - loss 0.04298888 - samples/sec: 26.62 - lr: 0.000010\n", + "2021-01-14 17:16:29,099 epoch 3 - iter 21/77 - loss 0.05707598 - samples/sec: 25.96 - lr: 0.000010\n", + "2021-01-14 17:16:33,430 epoch 3 - iter 28/77 - loss 0.04700577 - samples/sec: 26.10 - lr: 0.000010\n", + "2021-01-14 17:16:37,615 epoch 3 - iter 35/77 - loss 0.03774460 - samples/sec: 26.92 - lr: 0.000010\n", + "2021-01-14 17:16:41,848 epoch 3 - iter 42/77 - loss 0.03161711 - samples/sec: 26.63 - lr: 0.000010\n", + "2021-01-14 17:16:46,019 epoch 3 - iter 49/77 - loss 0.02749447 - samples/sec: 27.00 - lr: 0.000010\n", + "2021-01-14 17:16:50,152 epoch 3 - iter 56/77 - loss 0.02414880 - samples/sec: 27.36 - lr: 0.000010\n", + "2021-01-14 17:16:54,291 epoch 3 - iter 63/77 - loss 0.02319205 - samples/sec: 27.22 - lr: 0.000010\n", + "2021-01-14 17:16:58,450 epoch 3 - iter 70/77 - loss 0.02129739 - samples/sec: 27.15 - lr: 0.000010\n", + "2021-01-14 17:17:02,224 epoch 3 - iter 77/77 - loss 0.01944040 - samples/sec: 29.79 - lr: 0.000010\n", + "2021-01-14 17:17:02,272 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:17:02,273 EPOCH 3 done: loss 0.0194 - lr 0.0000100\n", + "2021-01-14 17:17:06,524 DEV : loss 0.49905064702033997 - score 0.9216\n", + "2021-01-14 17:17:06,786 BAD EPOCHS (no improvement): 2\n", + "2021-01-14 17:17:06,787 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:17:11,358 epoch 4 - iter 7/77 - loss 0.00185599 - samples/sec: 26.75 - lr: 0.000010\n", + "2021-01-14 17:17:15,541 epoch 4 - iter 14/77 - loss 0.00136404 - samples/sec: 27.04 - lr: 0.000010\n", + "2021-01-14 17:17:19,712 epoch 4 - iter 21/77 - loss 0.00128906 - samples/sec: 27.07 - lr: 0.000010\n", + "2021-01-14 17:17:23,840 epoch 4 - iter 28/77 - loss 0.00118582 - samples/sec: 27.40 - lr: 0.000010\n", + "2021-01-14 17:17:28,020 epoch 4 - iter 35/77 - loss 0.00106619 - samples/sec: 26.98 - lr: 0.000010\n", + "2021-01-14 17:17:32,295 epoch 4 - iter 42/77 - loss 0.00094899 - samples/sec: 26.41 - lr: 0.000010\n", + "2021-01-14 17:17:36,522 epoch 4 - iter 49/77 - loss 0.00087282 - samples/sec: 26.66 - lr: 0.000010\n", + "2021-01-14 17:17:40,684 epoch 4 - iter 56/77 - loss 0.00097088 - samples/sec: 27.12 - lr: 0.000010\n", + "2021-01-14 17:17:44,912 epoch 4 - iter 63/77 - loss 0.00087886 - samples/sec: 26.69 - lr: 0.000010\n", + "2021-01-14 17:17:49,112 epoch 4 - iter 70/77 - loss 0.00110282 - samples/sec: 26.91 - lr: 0.000010\n", + "2021-01-14 17:17:52,950 epoch 4 - iter 77/77 - loss 0.00101001 - samples/sec: 29.33 - lr: 0.000010\n", + "2021-01-14 17:17:53,013 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:17:53,014 EPOCH 4 done: loss 0.0010 - lr 0.0000100\n", + "2021-01-14 17:17:57,333 DEV : loss 0.5981439352035522 - score 0.9216\n", + "2021-01-14 
17:17:57,595 BAD EPOCHS (no improvement): 3\n", + "2021-01-14 17:17:57,596 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:18:02,196 epoch 5 - iter 7/77 - loss 0.00060182 - samples/sec: 26.72 - lr: 0.000010\n", + "2021-01-14 17:18:06,460 epoch 5 - iter 14/77 - loss 0.00037103 - samples/sec: 26.48 - lr: 0.000010\n", + "2021-01-14 17:18:10,664 epoch 5 - iter 21/77 - loss 0.00028840 - samples/sec: 26.89 - lr: 0.000010\n", + "2021-01-14 17:18:14,811 epoch 5 - iter 28/77 - loss 0.00025692 - samples/sec: 27.23 - lr: 0.000010\n", + "2021-01-14 17:18:18,940 epoch 5 - iter 35/77 - loss 0.00024187 - samples/sec: 27.30 - lr: 0.000010\n", + "2021-01-14 17:18:23,111 epoch 5 - iter 42/77 - loss 0.00021889 - samples/sec: 27.08 - lr: 0.000010\n", + "2021-01-14 17:18:27,299 epoch 5 - iter 49/77 - loss 0.00035217 - samples/sec: 26.95 - lr: 0.000010\n", + "2021-01-14 17:18:31,425 epoch 5 - iter 56/77 - loss 0.00032686 - samples/sec: 27.30 - lr: 0.000010\n", + "2021-01-14 17:18:35,586 epoch 5 - iter 63/77 - loss 0.00029595 - samples/sec: 27.13 - lr: 0.000010\n", + "2021-01-14 17:18:39,774 epoch 5 - iter 70/77 - loss 0.00027235 - samples/sec: 26.91 - lr: 0.000010\n", + "2021-01-14 17:18:43,540 epoch 5 - iter 77/77 - loss 0.00028631 - samples/sec: 29.89 - lr: 0.000010\n", + "2021-01-14 17:18:43,601 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:18:43,602 EPOCH 5 done: loss 0.0003 - lr 0.0000100\n", + "2021-01-14 17:18:46,910 DEV : loss 0.6656511425971985 - score 0.9248\n", + "Epoch 5: reducing learning rate of group 0 to 5.0000e-06.\n", + "2021-01-14 17:18:47,170 BAD EPOCHS (no improvement): 4\n", + "2021-01-14 17:18:47,171 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:18:53,782 epoch 6 - iter 7/77 - loss 0.00022932 - samples/sec: 25.95 - lr: 0.000005\n", + "2021-01-14 17:18:57,949 epoch 6 - iter 14/77 - loss 0.00016531 - samples/sec: 27.13 - lr: 0.000005\n", + "2021-01-14 17:19:02,280 epoch 6 - iter 21/77 - loss 0.00022724 - samples/sec: 25.99 - lr: 0.000005\n", + "2021-01-14 17:19:06,568 epoch 6 - iter 28/77 - loss 0.00026307 - samples/sec: 26.27 - lr: 0.000005\n", + "2021-01-14 17:19:10,654 epoch 6 - iter 35/77 - loss 0.00023027 - samples/sec: 27.57 - lr: 0.000005\n", + "2021-01-14 17:19:14,851 epoch 6 - iter 42/77 - loss 0.00021121 - samples/sec: 26.89 - lr: 0.000005\n", + "2021-01-14 17:19:19,029 epoch 6 - iter 49/77 - loss 0.00019923 - samples/sec: 26.94 - lr: 0.000005\n", + "2021-01-14 17:19:23,229 epoch 6 - iter 56/77 - loss 0.00018661 - samples/sec: 26.87 - lr: 0.000005\n", + "2021-01-14 17:19:27,356 epoch 6 - iter 63/77 - loss 0.00017462 - samples/sec: 27.26 - lr: 0.000005\n", + "2021-01-14 17:19:31,517 epoch 6 - iter 70/77 - loss 0.00016146 - samples/sec: 27.04 - lr: 0.000005\n", + "2021-01-14 17:19:35,455 epoch 6 - iter 77/77 - loss 0.00015018 - samples/sec: 28.60 - lr: 0.000005\n", + "2021-01-14 17:19:35,526 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:19:35,527 EPOCH 6 done: loss 0.0002 - lr 0.0000050\n", + "2021-01-14 17:19:39,223 DEV : loss 0.6893778443336487 - score 0.9248\n", + "2021-01-14 17:19:39,484 BAD EPOCHS (no improvement): 1\n", + "2021-01-14 17:19:39,486 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:19:44,844 
epoch 7 - iter 7/77 - loss 0.00019211 - samples/sec: 27.26 - lr: 0.000005\n", + "2021-01-14 17:19:49,028 epoch 7 - iter 14/77 - loss 0.00066268 - samples/sec: 26.98 - lr: 0.000005\n", + "2021-01-14 17:19:53,253 epoch 7 - iter 21/77 - loss 0.00053614 - samples/sec: 26.67 - lr: 0.000005\n", + "2021-01-14 17:19:57,430 epoch 7 - iter 28/77 - loss 0.00043514 - samples/sec: 27.03 - lr: 0.000005\n", + "2021-01-14 17:20:01,594 epoch 7 - iter 35/77 - loss 0.00036258 - samples/sec: 27.03 - lr: 0.000005\n", + "2021-01-14 17:20:05,829 epoch 7 - iter 42/77 - loss 0.00031573 - samples/sec: 26.60 - lr: 0.000005\n", + "2021-01-14 17:20:10,013 epoch 7 - iter 49/77 - loss 0.00028645 - samples/sec: 26.94 - lr: 0.000005\n", + "2021-01-14 17:20:14,238 epoch 7 - iter 56/77 - loss 0.00025793 - samples/sec: 26.67 - lr: 0.000005\n", + "2021-01-14 17:20:18,381 epoch 7 - iter 63/77 - loss 0.00023890 - samples/sec: 27.16 - lr: 0.000005\n", + "2021-01-14 17:20:22,569 epoch 7 - iter 70/77 - loss 0.00021868 - samples/sec: 26.92 - lr: 0.000005\n", + "2021-01-14 17:20:26,365 epoch 7 - iter 77/77 - loss 0.00020789 - samples/sec: 29.64 - lr: 0.000005\n", + "2021-01-14 17:20:26,443 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:20:26,443 EPOCH 7 done: loss 0.0002 - lr 0.0000050\n", + "2021-01-14 17:20:29,751 DEV : loss 0.6999250054359436 - score 0.9248\n", + "2021-01-14 17:20:30,015 BAD EPOCHS (no improvement): 2\n", + "2021-01-14 17:20:30,016 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:20:34,509 epoch 8 - iter 7/77 - loss 0.00007416 - samples/sec: 27.22 - lr: 0.000005\n", + "2021-01-14 17:20:39,619 epoch 8 - iter 14/77 - loss 0.00005522 - samples/sec: 26.82 - lr: 0.000005\n", + "2021-01-14 17:20:43,900 epoch 8 - iter 21/77 - loss 0.00006253 - samples/sec: 26.35 - lr: 0.000005\n", + "2021-01-14 17:20:48,020 epoch 8 - iter 28/77 - loss 0.00015550 - samples/sec: 27.36 - lr: 0.000005\n", + "2021-01-14 17:20:52,243 epoch 8 - iter 35/77 - loss 0.00013324 - samples/sec: 26.66 - lr: 0.000005\n", + "2021-01-14 17:20:56,397 epoch 8 - iter 42/77 - loss 0.00012891 - samples/sec: 27.08 - lr: 0.000005\n", + "2021-01-14 17:21:00,493 epoch 8 - iter 49/77 - loss 0.00012485 - samples/sec: 27.50 - lr: 0.000005\n", + "2021-01-14 17:21:04,731 epoch 8 - iter 56/77 - loss 0.00014117 - samples/sec: 26.59 - lr: 0.000005\n", + "2021-01-14 17:21:08,909 epoch 8 - iter 63/77 - loss 0.00013634 - samples/sec: 26.95 - lr: 0.000005\n", + "2021-01-14 17:21:13,121 epoch 8 - iter 70/77 - loss 0.00012635 - samples/sec: 26.74 - lr: 0.000005\n", + "2021-01-14 17:21:16,967 epoch 8 - iter 77/77 - loss 0.00012047 - samples/sec: 29.24 - lr: 0.000005\n", + "2021-01-14 17:21:17,026 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:21:17,027 EPOCH 8 done: loss 0.0001 - lr 0.0000050\n", + "2021-01-14 17:21:20,598 DEV : loss 0.7107362747192383 - score 0.9248\n", + "2021-01-14 17:21:20,864 BAD EPOCHS (no improvement): 3\n", + "2021-01-14 17:21:20,865 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:21:26,269 epoch 9 - iter 7/77 - loss 0.00002916 - samples/sec: 26.76 - lr: 0.000005\n", + "2021-01-14 17:21:30,543 epoch 9 - iter 14/77 - loss 0.00006615 - samples/sec: 26.39 - lr: 0.000005\n", + "2021-01-14 17:21:34,808 epoch 9 - iter 21/77 - loss 0.00006364 - samples/sec: 26.48 - lr: 
0.000005\n", + "2021-01-14 17:21:38,963 epoch 9 - iter 28/77 - loss 0.00013939 - samples/sec: 27.14 - lr: 0.000005\n", + "2021-01-14 17:21:43,165 epoch 9 - iter 35/77 - loss 0.00012621 - samples/sec: 26.86 - lr: 0.000005\n", + "2021-01-14 17:21:47,317 epoch 9 - iter 42/77 - loss 0.00012101 - samples/sec: 27.09 - lr: 0.000005\n", + "2021-01-14 17:21:51,532 epoch 9 - iter 49/77 - loss 0.00018356 - samples/sec: 26.78 - lr: 0.000005\n", + "2021-01-14 17:21:56,114 epoch 9 - iter 56/77 - loss 0.00016344 - samples/sec: 26.01 - lr: 0.000005\n", + "2021-01-14 17:22:00,349 epoch 9 - iter 63/77 - loss 0.00015143 - samples/sec: 26.58 - lr: 0.000005\n", + "2021-01-14 17:22:04,591 epoch 9 - iter 70/77 - loss 0.00013992 - samples/sec: 26.55 - lr: 0.000005\n", + "2021-01-14 17:22:08,380 epoch 9 - iter 77/77 - loss 0.00012958 - samples/sec: 29.74 - lr: 0.000005\n", + "2021-01-14 17:22:08,443 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:22:08,444 EPOCH 9 done: loss 0.0001 - lr 0.0000050\n", + "2021-01-14 17:22:11,739 DEV : loss 0.7207703590393066 - score 0.9248\n", + "Epoch 9: reducing learning rate of group 0 to 2.5000e-06.\n", + "2021-01-14 17:22:12,003 BAD EPOCHS (no improvement): 4\n", + "2021-01-14 17:22:12,004 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:22:16,599 epoch 10 - iter 7/77 - loss 0.00004176 - samples/sec: 26.48 - lr: 0.000003\n", + "2021-01-14 17:22:21,730 epoch 10 - iter 14/77 - loss 0.00004900 - samples/sec: 26.15 - lr: 0.000003\n", + "2021-01-14 17:22:25,925 epoch 10 - iter 21/77 - loss 0.00005921 - samples/sec: 26.94 - lr: 0.000003\n", + "2021-01-14 17:22:30,112 epoch 10 - iter 28/77 - loss 0.00005456 - samples/sec: 26.94 - lr: 0.000003\n", + "2021-01-14 17:22:34,389 epoch 10 - iter 35/77 - loss 0.00004909 - samples/sec: 26.34 - lr: 0.000003\n", + "2021-01-14 17:22:38,546 epoch 10 - iter 42/77 - loss 0.00004503 - samples/sec: 27.15 - lr: 0.000003\n", + "2021-01-14 17:22:42,757 epoch 10 - iter 49/77 - loss 0.00004776 - samples/sec: 26.71 - lr: 0.000003\n", + "2021-01-14 17:22:46,803 epoch 10 - iter 56/77 - loss 0.00004461 - samples/sec: 27.81 - lr: 0.000003\n", + "2021-01-14 17:22:51,063 epoch 10 - iter 63/77 - loss 0.00004382 - samples/sec: 26.47 - lr: 0.000003\n", + "2021-01-14 17:22:55,173 epoch 10 - iter 70/77 - loss 0.00005728 - samples/sec: 27.40 - lr: 0.000003\n", + "2021-01-14 17:22:58,941 epoch 10 - iter 77/77 - loss 0.00005446 - samples/sec: 29.83 - lr: 0.000003\n", + "2021-01-14 17:22:58,988 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:22:58,989 EPOCH 10 done: loss 0.0001 - lr 0.0000025\n", + "2021-01-14 17:23:02,304 DEV : loss 0.7250329852104187 - score 0.9248\n", + "2021-01-14 17:23:02,567 BAD EPOCHS (no improvement): 1\n", + "2021-01-14 17:23:04,650 ----------------------------------------------------------------------------------------------------\n", + "2021-01-14 17:23:04,651 Testing using best model ...\n", + "2021-01-14 17:23:04,653 loading file models/taggers/trec/best-model.pt\n", + "2021-01-14 17:23:09,692 \t0.9295\n", + "2021-01-14 17:23:09,693 \n", + "Results:\n", + "- F-score (micro) 0.9295\n", + "- F-score (macro) 0.4817\n", + "- Accuracy 0.9295\n", + "\n", + "By class:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.9295 1.0000 0.9635 356\n", + " 1 0.0000 0.0000 0.0000 27\n", + "\n", + " micro avg 0.9295 0.9295 0.9295 
383\n",
+ "   macro avg     0.4648    0.5000    0.4817       383\n",
+ "weighted avg     0.8640    0.9295    0.8955       383\n",
+ " samples avg     0.9295    0.9295    0.9295       383\n",
+ "\n",
+ "2021-01-14 17:23:09,694 ----------------------------------------------------------------------------------------------------\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "{'test_score': 0.9295,\n",
+ " 'dev_score_history': [0.9281,\n",
+ "  0.9248,\n",
+ "  0.9216,\n",
+ "  0.9216,\n",
+ "  0.9248,\n",
+ "  0.9248,\n",
+ "  0.9248,\n",
+ "  0.9248,\n",
+ "  0.9248,\n",
+ "  0.9248],\n",
+ " 'train_loss_history': [0.20128365170646023,\n",
+ "  0.08118115926717782,\n",
+ "  0.019440397426679537,\n",
+ "  0.0010100115429271352,\n",
+ "  0.00028631362048062414,\n",
+ "  0.00015017583772733613,\n",
+ "  0.0002078855192506468,\n",
+ "  0.00012047414655809278,\n",
+ "  0.00012958204591429078,\n",
+ "  5.446238951249556e-05],\n",
+ " 'dev_loss_history': [0.33609411120414734,\n",
+ "  0.5065702795982361,\n",
+ "  0.49905064702033997,\n",
+ "  0.5981439352035522,\n",
+ "  0.6656511425971985,\n",
+ "  0.6893778443336487,\n",
+ "  0.6999250054359436,\n",
+ "  0.7107362747192383,\n",
+ "  0.7207703590393066,\n",
+ "  0.7250329852104187]}"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 85
+ }
+ ],
 "source": [
- "from torch.optim.adam import Adam\n",
- "from flair.data import Corpus\n",
- "from flair.datasets import TREC_6\n",
- "from flair.embeddings import TransformerDocumentEmbeddings\n",
- "from flair.models import TextClassifier\n",
- "from flair.trainers import ModelTrainer\n",
- "\n",
- "\n",
- "\n",
 "\n",
 "# 3. initialize transformer document embeddings (many models are available)\n",
 "document_embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased', fine_tune=True)\n",
 "\n",
 "# 4. create the text classifier\n",
- "classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)\n",
+ "classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, loss_weights={b\"1\":10, b\"0\":1}) # loss_weights={\"1\":10, \"0\":1}\n",
 "\n",
 "# 5. initialize the text classifier trainer with Adam optimizer\n",
 "trainer = ModelTrainer(classifier, corpus, optimizer=Adam)\n",
@@ -363,10 +1139,24 @@
 " learning_rate=1e-5, # use very small learning rate\n",
 " mini_batch_size=16,\n",
 " mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine\n",
- " max_epochs=5, # terminate after 5 epochs\n",
+ " max_epochs=10, # terminate after 10 epochs\n",
 " )"
 ]
 },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
 {
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [

From ab0783ba0f886dc3e4a86bcb8733b37e596e1ecc Mon Sep 17 00:00:00 2001
From: kasparvonbeelen
Date: Fri, 15 Jan 2021 10:46:55 +0000
Subject: [PATCH 04/10] implement glossbert

Recast the task as GlossBERT-style sentence-pair classification: each
quotation is paired with a candidate sense gloss ("quotation [SEP]
target: gloss") and labelled by whether the gloss matches the sense
used in context.

---
 114.1 - review notebook - glossbert.ipynb | 1109 +--------------------
 tasks/wsd_gloss.py                        |  106 ++
 2 files changed, 125 insertions(+), 1090 deletions(-)
 create mode 100644 tasks/wsd_gloss.py

diff --git a/114.1 - review notebook - glossbert.ipynb b/114.1 - review notebook - glossbert.ipynb
index 60288ba..cb3c924 100644
--- a/114.1 - review notebook - glossbert.ipynb
+++ b/114.1 - review notebook - glossbert.ipynb
@@ -34,1129 +34,58 @@
 },
 {
 "cell_type": "code",
- "execution_count": 22,
+ "execution_count": 3,
 "metadata": {},
 "outputs": [],
 "source": [
- "%matplotlib inline\n",
- "import pickle\n",
- "import pandas as pd\n",
- "from tasks import wsd\n",
- "from pathlib import Path\n",
- "from tasks import wsd\n",
- "from utils import nlp_tools\n",
- "from tqdm.auto import tqdm\n",
- "import numpy as np\n",
- "import json\n",
- "from sklearn.metrics import classification_report\n",
- "from flair.embeddings import TransformerWordEmbeddings\n",
- "from utils.dataset_download import harvest_data_from_extended_senses\n",
- "from utils.classificaton_utils import binarize, vectorize_target_expressions,cosine_similiarity,eval_lemma\n",
- "from torch.optim.adam import Adam\n",
- "from flair.datasets import CSVClassificationCorpus\n",
- "from flair.data import Corpus\n",
- "from flair.datasets import TREC_6\n",
- "from flair.embeddings import TransformerDocumentEmbeddings\n",
- "from flair.models import TextClassifier\n",
- "from flair.trainers import ModelTrainer"
+ "from tasks.wsd_gloss import create_glossbert_data, train_glossbert"
 ]
 },
 {
 "cell_type": "code",
- "execution_count": 70,
+ "execution_count": 4,
 "metadata": {},
 "outputs": [],
 "source": [
 "lemma = 'machine'\n",
 "pos = 'NN'\n",
- "senses = {'machine_nn01-38474140'} # machine_nn01-38475772 machine_nn01-38475923 machine_nn01-38475835 machine_nn01-38474140\n",
- "relations = ['seed','synonym'] # ,'descendant','sibling'\n",
- "eval_mode = \"lemma_etal\" # lemma or lemma_etal\n",
 " "
 ]
 },
 {
 "cell_type": "code",
- "execution_count": 71,
+ "execution_count": 6,
 "metadata": {},
 "outputs": [
 {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "# senses before filtering by date = 517\n",
- "# senses after filtering by date = 433\n",
- "\n",
- "\n",
- "# of seed senses 26 \n",
- "# of synonyms 383 \n",
- "# of branch senses 0\n",
- "\n",
- "\n",
- "# of seeds selected 1 \n",
- "# of synonyms selected 44 \n",
- "# of branches selected 0\n",
- "[LOG] #rows before removing None vector (1947, 21)\n",
- "[LOG] #rows after removing None vector (1911, 21)\n"
+ "output_type": "error",
+ "ename": "AttributeError",
+ "evalue": "'Series' object has no attribute 'enclosed_quotation'",
+ "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcreate_glossbert_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlemma\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mpos\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/deezy_datadrive/kaspar-playground/dictionary_expansion/HistoricalDictionaryExpansion/tasks/wsd_gloss.py\u001b[0m in \u001b[0;36mcreate_glossbert_data\u001b[0;34m(lemma, pos)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0mdf_quotations\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_quotations\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m~\u001b[0m\u001b[0mdf_quotations\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeyword_offset\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mdf_quotations\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_quotations\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m~\u001b[0m\u001b[0mdf_quotations\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdefinition\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreset_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 62\u001b[0;31m \u001b[0mdf_glossbert\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mto_glossbert_format\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_quotations\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfrac\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1.0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreset_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 63\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain_test_split\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_glossbert\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m42\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mshuffle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstratify\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_glossbert\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/deezy_datadrive/kaspar-playground/dictionary_expansion/HistoricalDictionaryExpansion/tasks/wsd_gloss.py\u001b[0m in \u001b[0;36mto_glossbert_format\u001b[0;34m(df)\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0mrows\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m 
\u001b[0;34m,\u001b[0m\u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miterrows\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 47\u001b[0;31m \u001b[0mrows\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mgloss_string\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdefinition\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Yes\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msense_id\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 48\u001b[0m \u001b[0mdefinitions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlemma\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlemma\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdefinition\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munique\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0md\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdefinitions\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/deezy_datadrive/kaspar-playground/dictionary_expansion/HistoricalDictionaryExpansion/tasks/wsd_gloss.py\u001b[0m in \u001b[0;36mgloss_string\u001b[0;34m(row, definition)\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0mout_string\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m''\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 35\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menclosed_quotation\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 36\u001b[0m \u001b[0mout_string\u001b[0m\u001b[0;34m+=\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menclosed_quotation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0mout_string\u001b[0m\u001b[0;34m+=\u001b[0m\u001b[0;34m' [SEP] '\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 5272\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5273\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5274\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 
5275\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5276\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'Series' object has no attribute 'enclosed_quotation'" ] } ], "source": [ - "df_train, df_val, df_test = binarize(lemma,\n", - " pos,\n", - " senses, \n", - " relations,\n", - " strict_filter=True,\n", - " start=1700,\n", - " end=2000,\n", - " eval_mode=eval_mode)" + "data_path = create_glossbert_data(lemma,pos)" ] }, { "cell_type": "code", - "execution_count": 72, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " sense_id \\\n", - "238 machine_nn01-38474140 \n", - "305 machine_nn01-38474140 \n", - "713 machine_nn01-38474140 \n", - "938 machine_nn01-38474140 \n", - "1042 machine_nn01-38474140 \n", - "1056 machine_nn01-38474140 \n", - "\n", - " lemma_definition \\\n", - "238 A complex device, consisting of a number of in... \n", - "305 A complex device, consisting of a number of in... \n", - "713 A complex device, consisting of a number of in... \n", - "938 A complex device, consisting of a number of in... \n", - "1042 A complex device, consisting of a number of in... \n", - "1056 A complex device, consisting of a number of in... \n", - "\n", - " definition word_id \\\n", - "238 A living body, esp. the human body considered ... machine_nn01 \n", - "305 A living body, esp. the human body considered ... machine_nn01 \n", - "713 A living body, esp. the human body considered ... machine_nn01 \n", - "938 A living body, esp. the human body considered ... machine_nn01 \n", - "1042 A living body, esp. the human body considered ... machine_nn01 \n", - "1056 A living body, esp. the human body considered ... machine_nn01 \n", - "\n", - " lemma quotation_id \\\n", - "238 machine machine_nn01-38474169 \n", - "305 machine machine_nn01-38474177 \n", - "713 machine machine_nn01-38474195 \n", - "938 machine machine_nn01-38474223 \n", - "1042 machine machine_nn01-38474203 \n", - "1056 machine machine_nn01-38474212 \n", - "\n", - " source \\\n", - "238 {'title': 'Death's Vision', 'author': 'J. Reyn... \n", - "305 {'title': 'Spectator', 'author': 'J. Addison',... \n", - "713 {'title': 'Med. & Physical Jrnl.', 'author': N... \n", - "938 {'title': 'Of Human Bondage', 'author': 'W. S.... \n", - "1042 {'title': 'Poems', 'author': 'W. Wordsworth', ... \n", - "1056 {'title': 'Telegraphy', 'author': 'W. H. Preec... \n", - "\n", - " text year \\\n", - "238 {'keyword': 'Machins', 'full_text': 'What Nobl... 1709.0 \n", - "305 {'keyword': 'Machine', 'full_text': 'Cheerfuln... 1712.0 \n", - "713 {'keyword': 'machine', 'full_text': 'When a pr... 1805.0 \n", - "938 {'keyword': 'machine', 'full_text': 'He wonder... 1915.0 \n", - "1042 {'keyword': 'machine', 'full_text': 'And now I... 1807.0 \n", - "1056 {'keyword': 'machine', 'full_text': 'The human... 1876.0 \n", - "\n", - " full_text ... keyword_offset \\\n", - "238 What Nobler Souls the Nobler Machins Wear. ... 29.0 \n", - "305 Cheerfulness is..the best Promoter of Health. ... ... 70.0 \n", - "713 When a product of diseased action has been eff... ... 82.0 \n", - "938 He wondered whether at the very end, now that ... ... 
50.0 \n", - "1042 And now I see with eye serene The very pulse o... ... 52.0 \n", - "1056 The human machine tires, and as a consequence ... ... 10.0 \n", - "\n", - " vector_bert_base_-1,-2,-3,-4_mean \\\n", - "238 [0.5628562, -0.04788875, 0.074935675, -0.22630... \n", - "305 [0.0052292813, 0.12355395, 0.023108626, 0.2251... \n", - "713 [0.25928053, 0.049638785, 0.022315167, 0.34901... \n", - "938 [0.38040048, 0.38440758, 0.45397452, 0.1211486... \n", - "1042 [-0.46428305, 0.013232344, -0.595714, 0.049642... \n", - "1056 [0.6930934, 0.09074756, -0.13974331, 0.1105655... \n", - "\n", - " vector_blert_-1,-2,-3,-4_mean label \\\n", - "238 [-0.15516208, 0.289941, -0.15124893, -0.206332... 1 \n", - "305 [-0.04755735, 0.20182909, 0.33001357, -0.04851... 1 \n", - "713 [-0.16033216, -0.16846322, 0.5062964, 0.102019... 1 \n", - "938 [-0.059219074, 0.23112743, 0.42189148, 0.02944... 1 \n", - "1042 [0.021248298, 0.28699854, 0.24638082, -0.01793... 1 \n", - "1056 [0.11798739, -0.0029160888, 0.29418808, -0.076... 1 \n", - "\n", - " id \\\n", - "238 machine_nn01-38474140 \n", - "305 machine_nn01-38474140 \n", - "713 machine_nn01-38474140 \n", - "938 machine_nn01-38474140 \n", - "1042 machine_nn01-38474140 \n", - "1056 machine_nn01-38474140 \n", - "\n", - " daterange \\\n", - "238 {'end': None, 'start': 1604, 'obsolete': False... \n", - "305 {'end': None, 'start': 1604, 'obsolete': False... \n", - "713 {'end': None, 'start': 1604, 'obsolete': False... \n", - "938 {'end': None, 'start': 1604, 'obsolete': False... \n", - "1042 {'end': None, 'start': 1604, 'obsolete': False... \n", - "1056 {'end': None, 'start': 1604, 'obsolete': False... \n", - "\n", - " provenance provenance_type \\\n", - "238 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", - "305 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", - "713 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", - "938 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", - "1042 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", - "1056 [[machine_nn01-38474140, seed, machine_nn01]] seed \n", - "\n", - " relation_to_core_senses relation_to_seed_senses \n", - "238 {machine_nn01-38474140} {machine_nn01-38474140} \n", - "305 {machine_nn01-38474140} {machine_nn01-38474140} \n", - "713 {machine_nn01-38474140} {machine_nn01-38474140} \n", - "938 {machine_nn01-38474140} {machine_nn01-38474140} \n", - "1042 {machine_nn01-38474140} {machine_nn01-38474140} \n", - "1056 {machine_nn01-38474140} {machine_nn01-38474140} \n", - "\n", - "[6 rows x 21 columns]" - ], - "text/html": "
[text/html output elided: HTML rendering of the same dataframe as the text/plain output above, 6 rows × 21 columns]
" - }, - "metadata": {}, - "execution_count": 72 - } - ], - "source": [ - "df_train[df_train.sense_id=='machine_nn01-38474140']" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "array(['body_nn01', 'man_nn01', 'machine_nn01', 'carcass_nn01',\n", - " 'person_nn01', 'case_nn02', 'personage_nn01', 'corporeity_nn01',\n", - " 'structure_nn01', 'dust_nn01', 'case_nn01', 'automaton_nn01',\n", - " 'earth_nn01', 'soma_nn02', 'bulk_nn01', 'microcosm_nn01',\n", - " 'personality_nn01', 'tabernacle_nn01', 'vessel_nn01',\n", - " 'corpse_nn01', 'case_nn04', 'clay_nn01', 'clod_nn01',\n", - " 'skinful_nn01', 'carrion_nn01', 'embodiment_nn01', 'corpus_nn01',\n", - " 'flesh_nn01', 'soma_nn01', 'bloodbulk_nn01', 'earth_nn02',\n", - " 'soulcase_nn02', 'corporation_nn01', 'chassis_nn01', 'bulk_nn03',\n", - " 'bouk_nn01', 'outwall_nn01', 'case_nn03', 'incarnation_nn01',\n", - " 'bonehouse_nn01', 'man_nn04', 'bulk_nn02', 'soulcase_nn01',\n", - " 'godsimage_nn01', 'quarrons_nn01'], dtype=object)" - ] - }, - "metadata": {}, - "execution_count": 73 - } - ], - "source": [ - "df_train.word_id.unique()" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " sense_id \\\n", - "0 body_nn01-17170653 \n", - "1 man_nn01-110482153 \n", - "2 body_nn01-17169813 \n", - "3 machine_nn01-38474877 \n", - "4 carcass_nn01-10177258 \n", - "... ... \n", - "1215 man_nn01-110479060 \n", - "1216 person_nn01-30950985 \n", - "1217 clay_nn01-9320873 \n", - "1218 case_nn02-10018131 \n", - "1220 man_nn01-110487579 \n", - "\n", - " lemma_definition \\\n", - "0 The complete physical form of a person or anim... \n", - "1 An adult male human being. Without explicit co... \n", - "2 The complete physical form of a person or anim... \n", - "3 A complex device, consisting of a number of in... \n", - "4 The dead body of a person or animal; but no lo... \n", - "... ... \n", - "1215 An adult male human being. Without explicit co... \n", - "1216 An individual human being; a man, woman, or ch... \n", - "1217 A stiff viscous earth found, in many varieties... \n", - "1218 A box, bag, or other receptacle, designed to c... \n", - "1220 An adult male human being. Without explicit co... \n", - "\n", - " definition word_id \\\n", - "0 Particular technical uses. The part of a vehic... body_nn01 \n", - "1 As vocative or as int., introducing a remark o... man_nn01 \n", - "2 Contrasted with the soul. Cf. soul body n. at ... body_nn01 \n", - "3 A bicycle or tricycle; a motorcycle. Formerly ... machine_nn01 \n", - "4 The naked framework or ‘shell’ of a building b... carcass_nn01 \n", - "... ... ... \n", - "1215 A husband. Now chiefly English regional (north... man_nn01 \n", - "1216 Law. An individual (natural person n.) or corp... person_nn01 \n", - "1217 Short for clay-pipe n. at compounds 2 (colloq... clay_nn01 \n", - "1218 slang. A house, esp. one used as a brothel. Cf... case_nn02 \n", - "1220 In Cumbria: a cairn marking a summit or promin... man_nn01 \n", - "\n", - " lemma quotation_id \\\n", - "0 body body_nn01-132916428 \n", - "1 man man_nn01-110482440 \n", - "2 body body_nn01-17169857 \n", - "3 machine machine_nn01-38474966 \n", - "4 carcass carcass_nn01-10177295 \n", - "... ... ... 
\n", - "1215 man man_nn01-110479206 \n", - "1216 person person_nn01-30951076 \n", - "1217 clay clay_nn01-9320896 \n", - "1218 case case_nn02-10018191 \n", - "1220 man man_nn01-110487624 \n", - "\n", - " source \\\n", - "0 {'title': 'Material Handling Engin.', 'author'... \n", - "1 {'title': 'Shaela', 'author': 'R. Bulter', 'ge... \n", - "2 {'title': 'Ess. Man', 'author': 'A. Pope', 'ge... \n", - "3 {'title': 'National Trust Mag.', 'author': Non... \n", - "4 {'title': 'New Pract. Builder', 'author': 'P. ... \n", - "... ... \n", - "1215 {'title': 'Four Years S. Afr.', 'author': 'C. ... \n", - "1216 {'title': 'Daily News', 'author': None, 'gende... \n", - "1217 {'title': 'Held in Bondage', 'author': '‘Ouida... \n", - "1218 {'title': 'Mop Fair', 'author': 'A. M. Binstea... \n", - "1220 {'title': 'Northern Affair', 'author': 'D. K. ... \n", - "\n", - " text year \\\n", - "0 {'keyword': 'bodies', 'full_text': 'After car ... 1990.0 \n", - "1 {'keyword': 'Min', 'full_text': 'Min A'm vexed... 1976.0 \n", - "2 {'keyword': 'Body', 'full_text': 'All are but ... 1733.0 \n", - "3 {'keyword': 'machines', 'full_text': 'The cycl... 1992.0 \n", - "4 {'keyword': 'Carcase', 'full_text': 'Carcase o... 1823.0 \n", - "... ... ... \n", - "1215 {'keyword': 'man', 'full_text': 'The wife brok... 1829.0 \n", - "1216 {'keyword': 'persons', 'full_text': 'A Bill..e... 1900.0 \n", - "1217 {'keyword': 'clays', 'full_text': 'Filthy bird... 1863.0 \n", - "1218 {'keyword': 'case', 'full_text': 'They arrange... 1905.0 \n", - "1220 {'keyword': 'man', 'full_text': 'Over the elep... 1964.0 \n", - "\n", - " full_text ... keyword_offset \\\n", - "0 After car bodies are painted, they are moved i... ... 10.0 \n", - "1 Min A'm vexed ta hear yun. ... 0.0 \n", - "2 All are but parts of one stupendous Whole, Who... ... 49.0 \n", - "3 The cyclists..took on the circular 21- or 42-m... ... 92.0 \n", - "4 Carcase of a Building, the naked walls, and th... ... 0.0 \n", - "... ... ... ... \n", - "1215 The wife broke out, ‘You lament a brother, and... ... 79.0 \n", - "1216 A Bill..extending to juridical persons, that i... ... 31.0 \n", - "1217 Filthy bird's-eye, smoked in clays. ... 29.0 \n", - "1218 They arranges to stop ‘private’ in Brighton, a... ... 57.0 \n", - "1220 Over the elephant rocks and under the lee of t... ... 55.0 \n", - "\n", - " vector_bert_base_-1,-2,-3,-4_mean \\\n", - "0 [1.2747291, 0.25178745, 0.69486666, 0.42832682... \n", - "1 [-0.10557328, 0.24347349, 0.731555, -0.4305202... \n", - "2 [0.8197431, 0.04237363, 0.6312159, -0.2658673,... \n", - "3 [-0.18150243, -0.24230756, -0.3336587, 0.34879... \n", - "4 [0.6567496, -0.050804906, 0.31024605, 0.059706... \n", - "... ... \n", - "1215 [-0.07307064, -0.31692728, 0.38834277, -0.2980... \n", - "1216 [0.030711764, 0.28706473, 0.6596842, -0.132111... \n", - "1217 [-0.016634814, 0.6912965, -0.18498293, -0.2104... \n", - "1218 [0.16278893, -0.17927478, 0.34916735, -0.34717... \n", - "1220 [0.12908892, 0.1654679, -0.077464886, -0.44454... \n", - "\n", - " vector_blert_-1,-2,-3,-4_mean label \\\n", - "0 [1.5054287, 1.1386966, 1.3405375, 0.8012274, -... 0 \n", - "1 [-0.49209523, 0.7658461, 0.07512934, 0.0148925... 0 \n", - "2 [0.60478234, 0.58020014, 0.053836707, -0.06571... 0 \n", - "3 [-0.14852196, 0.69629294, 0.30973893, 0.598406... 0 \n", - "4 [0.41240987, 0.10217035, 0.48574266, 0.8627304... 0 \n", - "... ... ... \n", - "1215 [-0.20098017, 0.47577783, 0.013388823, -0.2808... 0 \n", - "1216 [-0.42745396, 0.4621299, 0.34301567, 0.2193956... 
0 \n", - "1217 [-0.2833503, 0.80949837, -0.5981247, 0.4331013... 0 \n", - "1218 [0.3253876, 0.12327082, -0.077930324, 0.450299... 0 \n", - "1220 [-0.4877532, 0.62317544, -0.4543179, -0.167910... 0 \n", - "\n", - " id \\\n", - "0 NaN \n", - "1 NaN \n", - "2 NaN \n", - "3 machine_nn01-38474877 \n", - "4 NaN \n", - "... ... \n", - "1215 NaN \n", - "1216 NaN \n", - "1217 NaN \n", - "1218 NaN \n", - "1220 NaN \n", - "\n", - " daterange \\\n", - "0 NaN \n", - "1 NaN \n", - "2 NaN \n", - "3 {'end': None, 'start': 1823, 'obsolete': False... \n", - "4 NaN \n", - "... ... \n", - "1215 NaN \n", - "1216 NaN \n", - "1217 NaN \n", - "1218 NaN \n", - "1220 NaN \n", - "\n", - " provenance provenance_type \\\n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 [[machine_nn01-38474877, seed, machine_nn01]] seed \n", - "4 NaN NaN \n", - "... ... ... \n", - "1215 NaN NaN \n", - "1216 NaN NaN \n", - "1217 NaN NaN \n", - "1218 NaN NaN \n", - "1220 NaN NaN \n", - "\n", - " relation_to_core_senses relation_to_seed_senses \n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 {machine_nn01-38474877} {machine_nn01-38474877} \n", - "4 NaN NaN \n", - "... ... ... \n", - "1215 NaN NaN \n", - "1216 NaN NaN \n", - "1217 NaN NaN \n", - "1218 NaN NaN \n", - "1220 NaN NaN \n", - "\n", - "[1135 rows x 21 columns]" - ], - "text/html": "
[text/html output elided: HTML rendering of the same dataframe as the text/plain output above, 1135 rows × 21 columns]
" - }, - "metadata": {}, - "execution_count": 74 - } - ], - "source": [ - "df_train[df_train.label==\"0\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " sense_id lemma_definition \\\n", - "0 body_nn01-17170653 The complete physical form of a person or anim... \n", - "1 man_nn01-110482153 An adult male human being. Without explicit co... \n", - "2 body_nn01-17169813 The complete physical form of a person or anim... \n", - "\n", - " definition word_id lemma \\\n", - "0 Particular technical uses. The part of a vehic... body_nn01 body \n", - "1 As vocative or as int., introducing a remark o... man_nn01 man \n", - "2 Contrasted with the soul. Cf. soul body n. at ... body_nn01 body \n", - "\n", - " quotation_id source \\\n", - "0 body_nn01-132916428 {'title': 'Material Handling Engin.', 'author'... \n", - "1 man_nn01-110482440 {'title': 'Shaela', 'author': 'R. Bulter', 'ge... \n", - "2 body_nn01-17169857 {'title': 'Ess. Man', 'author': 'A. Pope', 'ge... \n", - "\n", - " text year \\\n", - "0 {'keyword': 'bodies', 'full_text': 'After car ... 1990.0 \n", - "1 {'keyword': 'Min', 'full_text': 'Min A'm vexed... 1976.0 \n", - "2 {'keyword': 'Body', 'full_text': 'All are but ... 1733.0 \n", - "\n", - " full_text ... keyword_offset \\\n", - "0 After car bodies are painted, they are moved i... ... 10.0 \n", - "1 Min A'm vexed ta hear yun. ... 0.0 \n", - "2 All are but parts of one stupendous Whole, Who... ... 49.0 \n", - "\n", - " vector_bert_base_-1,-2,-3,-4_mean \\\n", - "0 [1.2747291, 0.25178745, 0.69486666, 0.42832682... \n", - "1 [-0.10557328, 0.24347349, 0.731555, -0.4305202... \n", - "2 [0.8197431, 0.04237363, 0.6312159, -0.2658673,... \n", - "\n", - " vector_blert_-1,-2,-3,-4_mean label id daterange \\\n", - "0 [1.5054287, 1.1386966, 1.3405375, 0.8012274, -... 0 NaN NaN \n", - "1 [-0.49209523, 0.7658461, 0.07512934, 0.0148925... 0 NaN NaN \n", - "2 [0.60478234, 0.58020014, 0.053836707, -0.06571... 0 NaN NaN \n", - "\n", - " provenance provenance_type relation_to_core_senses relation_to_seed_senses \n", - "0 NaN NaN NaN NaN \n", - "1 NaN NaN NaN NaN \n", - "2 NaN NaN NaN NaN \n", - "\n", - "[3 rows x 21 columns]" - ], - "text/html": "
[text/html output elided: HTML rendering of the same dataframe as the text/plain output above, 3 rows × 21 columns]
" - }, - "metadata": {}, - "execution_count": 75 - } - ], - "source": [ - "df_train.head(3)" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Index(['sense_id', 'lemma_definition', 'definition', 'word_id', 'lemma',\n", - " 'quotation_id', 'source', 'text', 'year', 'full_text', 'keyword',\n", - " 'keyword_offset', 'vector_bert_base_-1,-2,-3,-4_mean',\n", - " 'vector_blert_-1,-2,-3,-4_mean', 'label', 'id', 'daterange',\n", - " 'provenance', 'provenance_type', 'relation_to_core_senses',\n", - " 'relation_to_seed_senses'],\n", - " dtype='object')" - ] - }, - "metadata": {}, - "execution_count": 76 - } - ], - "source": [ - "df_train.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 100, - "metadata": {}, - "outputs": [], - "source": [ - "def enclose_keyword(row,enclose_token='\"'):\n", - " \"\"\"enclose keyword with specific token to point\n", - " learner towards to word it has to focus on\n", - " \"\"\"\n", - " sentence = ''\n", - " for i,c in enumerate(row.full_text):\n", - " if i == int(row.keyword_offset):\n", - " sentence+=enclose_token + ' '\n", - " elif i ==int(row.keyword_offset + len(row.keyword)):\n", - " sentence+= ' ' + enclose_token\n", - " sentence+=c\n", - " return sentence\n", - "\n", - "#def merge_quotation_gloss(row):\n", - "# out_string = '[GLOSS] '\n", - "# if row.definition:\n", - "# out_string+=row.definition\n", - "# out_string+=' [QUOT] ' \n", - "# if row.enclosed_quotation:\n", - "# out_string+=row.enclosed_quotation\n", - "# return out_string\n", - "\n", - "#def prep_train_text(row):\n", - "# out_string='[TAGET] '+row.keyword+' [TAGET] : '\n", - "# if row.definition:\n", - "# out_string+=row.definition\n", - "# out_string+=' [SEP] ' \n", - "# if row.enclosed_quotation:\n", - "# out_string+=row.enclosed_quotation\n", - "# return out_string\n", - "\n", - "#def prep_test_text(row):\n", - "# out_string='[TAGET] '+row.keyword+' [TAGET] : '\n", - "# if row.enclosed_quotation:\n", - "# out_string+=row.enclosed_quotation\n", - "# return out_string\n", - "\n", - "#def merge_quotation_keyword(row):\n", - "# out_string = '[TARGET] '\n", - "# if row.keyword:\n", - "# out_string+=row.keyword\n", - "# out_string+=' [QUOT] ' \n", - "# if row.enclosed_quotation:\n", - "# out_string+=row.enclosed_quotation\n", - "# return out_string\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 101, - "metadata": {}, - "outputs": [], - "source": [ - "def to_glossbert_format(df):\n", - " def gloss_string(row, definition):\n", - " out_string=''\n", - " if row.enclosed_quotation:\n", - " out_string+=row.enclosed_quotation\n", - " out_string+=' [SEP] ' \n", - " out_string+=row.keyword+': '\n", - " if row.definition:\n", - " out_string+=definition\n", - " return out_string\n", - "\n", - " df['enclosed_quotations'] = df.apply(enclose_keyword, axis=1)\n", - " \n", - " rows = [] \n", - " for i,row in df.iterrows():\n", - " rows.append([gloss_string(row, row.definition), 1])\n", - " definitions = df[df.lemma==row.lemma].definition.unique()\n", - " for d in definitions:\n", - " if d != row.definition:\n", - " rows.append([gloss_string(row,d), 0])\n", - " \n", - " return rows\n", - "\n", - "df_gloss_train = to_glossbert_format(df_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 102, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "[['After car [TARGET] bodies [TARGET] are painted, they are 
moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: Particular technical uses. The part of a vehicle fitted to receive the load; spec. the part of a motor car in which driver and passengers sit, or the fuselage of an aeroplane. Cf. cart-body n. at cart n. compounds 2, wide-body n.',\n", - " 1],\n", - " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: Contrasted with the soul. Cf. soul body n. at soul n. compounds 4.',\n", - " 0],\n", - " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: Particular technical uses. The main part of a musical instrument, which in the case of traditional stringed instruments forms a resonating chamber.',\n", - " 0],\n", - " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: The complete physical form of a person or animal; the assemblage of parts, organs, and tissues that constitutes the whole material organism.',\n", - " 0],\n", - " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: A comprehensive and systematic collection of information, or of the details of any subject, esp. law; a textbook, a pandect. Usually with of.',\n", - " 0],\n", - " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: A corpse.',\n", - " 0],\n", - " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: The physical or mortal nature, state, or aspect of man. Frequently in in (the) body, out of (the) body and variants, sometimes contrasted with in spirit.',\n", - " 0],\n", - " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: More widely: a material thing, an object; something that has physical existence and extension in space.',\n", - " 0],\n", - " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: Cell Biology. Any of various normal or abnormal structures found within the cytoplasm or nucleus of a cell. Frequently with distinguishing word.',\n", - " 0],\n", - " ['After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color. [SEP] bodies: Originally: †size or bulk; quantity (obsolete). 
In later use: a quantity, mass, or area of something.',\n", - " 0]]" - ] - }, - "metadata": {}, - "execution_count": 102 - } - ], - "source": [ - "df_gloss_train[:10]" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "path = Path('./data/training_data')\n", - "path.mkdir(exist_ok=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": {}, - "outputs": [], - "source": [ - "csv_out_path = path / f\"{lemma}_{'_'.join(senses)}\"\n", - "csv_out_path.mkdir(exist_ok=True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 80, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "df_train['enclosed_quotation'] = df_train.apply(enclose_keyword, axis=1)\n", - "df_train['text'] = df_train.apply(prep_train_text, axis=1)\n", - "df_train[['text','label']].to_csv(csv_out_path / \"train.csv\",index = False, sep='\\t') \n", - "df_val['enclosed_quotation'] = df_val.apply(enclose_keyword, axis=1)\n", - "df_val['text'] = df_val.apply(prep_test_text, axis=1)\n", - "df_val[['text','label']].to_csv(csv_out_path / \"dev.csv\",index = False, sep='\\t') \n", - "df_test['enclosed_quotation'] = df_test.apply(enclose_keyword, axis=1)\n", - "df_test['text'] = df_test.apply(prep_test_text, axis=1)\n", - "df_test[['text','label']].to_csv(csv_out_path / \"test.csv\",index = False, sep='\\t') " - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "387 [TAGET] Men [TAGET] : He canton'd out the Coun...\n", - "1491 [TAGET] earths [TAGET] : Ley-grounds cannot be...\n", - "1841 [TAGET] earth [TAGET] : It is well to see the ...\n", - "1244 [TAGET] person [TAGET] : The administrator..ha...\n", - "1809 [TAGET] earth [TAGET] : While I drove by in my...\n", - " ... \n", - "736 [TAGET] machines [TAGET] : ‘Anyone,’ declared,...\n", - "610 [TAGET] machine [TAGET] : To each mortal perad...\n", - "1612 [TAGET] body [TAGET] : The coffee, we know, st...\n", - "1128 [TAGET] Personalities [TAGET] : Wisdom, Learni...\n", - "1281 [TAGET] person [TAGET] : I'm a people [TARGET]...\n", - "Name: text, Length: 383, dtype: object" - ] - }, - "metadata": {}, - "execution_count": 81 - } - ], - "source": [ - "df_test.text" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "'[TAGET] bodies [TAGET] : Particular technical uses. The part of a vehicle fitted to receive the load; spec. the part of a motor car in which driver and passengers sit, or the fuselage of an aeroplane. Cf. cart-body n. at cart n. compounds 2, wide-body n. 
[SEP] After car [TARGET] bodies [TARGET] are painted, they are moved into storage to coordinate the production schedule with the number of bodies painted a specific color.'" - ] - }, - "metadata": {}, - "execution_count": 82 - } - ], - "source": [ - "df_train.iloc[0].text" - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "2021-01-14 17:14:29,725 Reading data from data/training_data/machine_machine_nn01-38474140\n", - "2021-01-14 17:14:29,726 Train: data/training_data/machine_machine_nn01-38474140/train.csv\n", - "2021-01-14 17:14:29,727 Dev: data/training_data/machine_machine_nn01-38474140/dev.csv\n", - "2021-01-14 17:14:29,727 Test: data/training_data/machine_machine_nn01-38474140/test.csv\n" - ] - } - ], - "source": [ - "\n", - "\n", - "# this is the folder in which train, test and dev files reside\n", - "data_folder = csv_out_path\n", - "\n", - "# column format indicating which columns hold the text and label(s)\n", - "column_name_map = {0: \"text\", 1: \"label\"}\n", - "\n", - "# load corpus containing training, test and dev data and if CSV has a header, you can skip it\n", - "corpus: Corpus = CSVClassificationCorpus(data_folder,\n", - " column_name_map,\n", - " skip_header=True,\n", - " delimiter='\\t', # tab-separated files\n", - ") " - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "2021-01-14 17:14:29,745 Computing label dictionary. Progress:\n", - "100%|██████████| 1604/1604 [00:01<00:00, 1060.11it/s]2021-01-14 17:14:31,699 [b'0', b'1']\n", - "Dictionary with 2 tags: 0, 1\n", - "\n" - ] - } - ], - "source": [ - "# 2. create the label dictionary\n", - "label_dict = corpus.make_label_dictionary()\n", - "print(label_dict)" + "train_glossbert(data_path,downsample=True)" ] }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": { - "tags": [ - "outputPrepend" - ] - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "eps=1e-12, elementwise_affine=True)\n", - " (ffn): FFN(\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", - " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", - " )\n", - " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " )\n", - " (4): TransformerBlock(\n", - " (attention): MultiHeadSelfAttention(\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", - " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", - " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n", - " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", - " )\n", - " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " (ffn): FFN(\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", - " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", - " )\n", - " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " )\n", - " (5): TransformerBlock(\n", - " (attention): MultiHeadSelfAttention(\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", - " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", - " (v_lin): 
Linear(in_features=768, out_features=768, bias=True)\n", - " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", - " )\n", - " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " (ffn): FFN(\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", - " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", - " )\n", - " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " )\n", - " )\n", - " )\n", - " )\n", - " )\n", - " (decoder): Linear(in_features=768, out_features=2, bias=True)\n", - " (loss_function): CrossEntropyLoss()\n", - " (beta): 1.0\n", - " (weights): {b'1': 10, b'0': 1}\n", - " (weight_tensor) tensor([1., 1.], device='cuda:0')\n", - ")\"\n", - "2021-01-14 17:14:33,683 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:14:33,684 Corpus: \"Corpus: 1221 train + 306 dev + 383 test sentences\"\n", - "2021-01-14 17:14:33,684 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:14:33,685 Parameters:\n", - "2021-01-14 17:14:33,685 - learning_rate: \"1e-05\"\n", - "2021-01-14 17:14:33,686 - mini_batch_size: \"16\"\n", - "2021-01-14 17:14:33,686 - patience: \"3\"\n", - "2021-01-14 17:14:33,687 - anneal_factor: \"0.5\"\n", - "2021-01-14 17:14:33,687 - max_epochs: \"10\"\n", - "2021-01-14 17:14:33,688 - shuffle: \"True\"\n", - "2021-01-14 17:14:33,688 - train_with_dev: \"False\"\n", - "2021-01-14 17:14:33,689 - batch_growth_annealing: \"False\"\n", - "2021-01-14 17:14:33,690 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:14:33,690 Model training base path: \"models/taggers/trec\"\n", - "2021-01-14 17:14:33,691 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:14:33,691 Device: cuda:0\n", - "2021-01-14 17:14:33,692 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:14:33,692 Embeddings storage mode: cpu\n", - "2021-01-14 17:14:33,693 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:14:38,343 epoch 1 - iter 7/77 - loss 0.48898176 - samples/sec: 26.27 - lr: 0.000010\n", - "2021-01-14 17:14:42,454 epoch 1 - iter 14/77 - loss 0.42082447 - samples/sec: 27.53 - lr: 0.000010\n", - "2021-01-14 17:14:46,658 epoch 1 - iter 21/77 - loss 0.33773888 - samples/sec: 26.85 - lr: 0.000010\n", - "2021-01-14 17:14:50,844 epoch 1 - iter 28/77 - loss 0.31560597 - samples/sec: 27.01 - lr: 0.000010\n", - "2021-01-14 17:14:54,998 epoch 1 - iter 35/77 - loss 0.25972683 - samples/sec: 27.14 - lr: 0.000010\n", - "2021-01-14 17:14:59,209 epoch 1 - iter 42/77 - loss 0.23569006 - samples/sec: 26.75 - lr: 0.000010\n", - "2021-01-14 17:15:03,408 epoch 1 - iter 49/77 - loss 0.24985709 - samples/sec: 26.85 - lr: 0.000010\n", - "2021-01-14 17:15:07,633 epoch 1 - iter 56/77 - loss 0.23229837 - samples/sec: 26.77 - lr: 0.000010\n", - "2021-01-14 17:15:11,797 epoch 1 - iter 63/77 - loss 0.23326370 - samples/sec: 27.06 - lr: 0.000010\n", - "2021-01-14 17:15:16,012 epoch 1 - iter 70/77 - loss 0.21914055 - samples/sec: 26.79 - lr: 0.000010\n", - "2021-01-14 17:15:19,739 epoch 1 - iter 77/77 - loss 0.20128365 - samples/sec: 30.18 - lr: 0.000010\n", - "2021-01-14 
17:15:19,814 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:15:19,814 EPOCH 1 done: loss 0.2013 - lr 0.0000100\n", - "2021-01-14 17:15:23,961 DEV : loss 0.33609411120414734 - score 0.9281\n", - "2021-01-14 17:15:24,224 BAD EPOCHS (no improvement): 0\n", - "saving best model\n", - "2021-01-14 17:15:25,155 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:15:29,794 epoch 2 - iter 7/77 - loss 0.00740182 - samples/sec: 26.44 - lr: 0.000010\n", - "2021-01-14 17:15:34,024 epoch 2 - iter 14/77 - loss 0.03390250 - samples/sec: 26.74 - lr: 0.000010\n", - "2021-01-14 17:15:38,226 epoch 2 - iter 21/77 - loss 0.03571354 - samples/sec: 26.91 - lr: 0.000010\n", - "2021-01-14 17:15:42,369 epoch 2 - iter 28/77 - loss 0.03136397 - samples/sec: 27.22 - lr: 0.000010\n", - "2021-01-14 17:15:46,510 epoch 2 - iter 35/77 - loss 0.03331735 - samples/sec: 27.22 - lr: 0.000010\n", - "2021-01-14 17:15:50,650 epoch 2 - iter 42/77 - loss 0.07917234 - samples/sec: 27.26 - lr: 0.000010\n", - "2021-01-14 17:15:54,832 epoch 2 - iter 49/77 - loss 0.07227532 - samples/sec: 26.94 - lr: 0.000010\n", - "2021-01-14 17:15:59,093 epoch 2 - iter 56/77 - loss 0.06382573 - samples/sec: 26.50 - lr: 0.000010\n", - "2021-01-14 17:16:03,303 epoch 2 - iter 63/77 - loss 0.08917253 - samples/sec: 26.79 - lr: 0.000010\n", - "2021-01-14 17:16:07,583 epoch 2 - iter 70/77 - loss 0.08041374 - samples/sec: 26.33 - lr: 0.000010\n", - "2021-01-14 17:16:11,374 epoch 2 - iter 77/77 - loss 0.08118116 - samples/sec: 29.72 - lr: 0.000010\n", - "2021-01-14 17:16:11,437 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:16:11,438 EPOCH 2 done: loss 0.0812 - lr 0.0000100\n", - "2021-01-14 17:16:15,649 DEV : loss 0.5065702795982361 - score 0.9248\n", - "2021-01-14 17:16:15,909 BAD EPOCHS (no improvement): 1\n", - "2021-01-14 17:16:15,910 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:16:20,491 epoch 3 - iter 7/77 - loss 0.00894024 - samples/sec: 26.72 - lr: 0.000010\n", - "2021-01-14 17:16:24,744 epoch 3 - iter 14/77 - loss 0.04298888 - samples/sec: 26.62 - lr: 0.000010\n", - "2021-01-14 17:16:29,099 epoch 3 - iter 21/77 - loss 0.05707598 - samples/sec: 25.96 - lr: 0.000010\n", - "2021-01-14 17:16:33,430 epoch 3 - iter 28/77 - loss 0.04700577 - samples/sec: 26.10 - lr: 0.000010\n", - "2021-01-14 17:16:37,615 epoch 3 - iter 35/77 - loss 0.03774460 - samples/sec: 26.92 - lr: 0.000010\n", - "2021-01-14 17:16:41,848 epoch 3 - iter 42/77 - loss 0.03161711 - samples/sec: 26.63 - lr: 0.000010\n", - "2021-01-14 17:16:46,019 epoch 3 - iter 49/77 - loss 0.02749447 - samples/sec: 27.00 - lr: 0.000010\n", - "2021-01-14 17:16:50,152 epoch 3 - iter 56/77 - loss 0.02414880 - samples/sec: 27.36 - lr: 0.000010\n", - "2021-01-14 17:16:54,291 epoch 3 - iter 63/77 - loss 0.02319205 - samples/sec: 27.22 - lr: 0.000010\n", - "2021-01-14 17:16:58,450 epoch 3 - iter 70/77 - loss 0.02129739 - samples/sec: 27.15 - lr: 0.000010\n", - "2021-01-14 17:17:02,224 epoch 3 - iter 77/77 - loss 0.01944040 - samples/sec: 29.79 - lr: 0.000010\n", - "2021-01-14 17:17:02,272 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:17:02,273 EPOCH 3 done: loss 0.0194 - lr 0.0000100\n", - "2021-01-14 17:17:06,524 DEV : loss 0.49905064702033997 - 
score 0.9216\n", - "2021-01-14 17:17:06,786 BAD EPOCHS (no improvement): 2\n", - "2021-01-14 17:17:06,787 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:17:11,358 epoch 4 - iter 7/77 - loss 0.00185599 - samples/sec: 26.75 - lr: 0.000010\n", - "2021-01-14 17:17:15,541 epoch 4 - iter 14/77 - loss 0.00136404 - samples/sec: 27.04 - lr: 0.000010\n", - "2021-01-14 17:17:19,712 epoch 4 - iter 21/77 - loss 0.00128906 - samples/sec: 27.07 - lr: 0.000010\n", - "2021-01-14 17:17:23,840 epoch 4 - iter 28/77 - loss 0.00118582 - samples/sec: 27.40 - lr: 0.000010\n", - "2021-01-14 17:17:28,020 epoch 4 - iter 35/77 - loss 0.00106619 - samples/sec: 26.98 - lr: 0.000010\n", - "2021-01-14 17:17:32,295 epoch 4 - iter 42/77 - loss 0.00094899 - samples/sec: 26.41 - lr: 0.000010\n", - "2021-01-14 17:17:36,522 epoch 4 - iter 49/77 - loss 0.00087282 - samples/sec: 26.66 - lr: 0.000010\n", - "2021-01-14 17:17:40,684 epoch 4 - iter 56/77 - loss 0.00097088 - samples/sec: 27.12 - lr: 0.000010\n", - "2021-01-14 17:17:44,912 epoch 4 - iter 63/77 - loss 0.00087886 - samples/sec: 26.69 - lr: 0.000010\n", - "2021-01-14 17:17:49,112 epoch 4 - iter 70/77 - loss 0.00110282 - samples/sec: 26.91 - lr: 0.000010\n", - "2021-01-14 17:17:52,950 epoch 4 - iter 77/77 - loss 0.00101001 - samples/sec: 29.33 - lr: 0.000010\n", - "2021-01-14 17:17:53,013 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:17:53,014 EPOCH 4 done: loss 0.0010 - lr 0.0000100\n", - "2021-01-14 17:17:57,333 DEV : loss 0.5981439352035522 - score 0.9216\n", - "2021-01-14 17:17:57,595 BAD EPOCHS (no improvement): 3\n", - "2021-01-14 17:17:57,596 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:18:02,196 epoch 5 - iter 7/77 - loss 0.00060182 - samples/sec: 26.72 - lr: 0.000010\n", - "2021-01-14 17:18:06,460 epoch 5 - iter 14/77 - loss 0.00037103 - samples/sec: 26.48 - lr: 0.000010\n", - "2021-01-14 17:18:10,664 epoch 5 - iter 21/77 - loss 0.00028840 - samples/sec: 26.89 - lr: 0.000010\n", - "2021-01-14 17:18:14,811 epoch 5 - iter 28/77 - loss 0.00025692 - samples/sec: 27.23 - lr: 0.000010\n", - "2021-01-14 17:18:18,940 epoch 5 - iter 35/77 - loss 0.00024187 - samples/sec: 27.30 - lr: 0.000010\n", - "2021-01-14 17:18:23,111 epoch 5 - iter 42/77 - loss 0.00021889 - samples/sec: 27.08 - lr: 0.000010\n", - "2021-01-14 17:18:27,299 epoch 5 - iter 49/77 - loss 0.00035217 - samples/sec: 26.95 - lr: 0.000010\n", - "2021-01-14 17:18:31,425 epoch 5 - iter 56/77 - loss 0.00032686 - samples/sec: 27.30 - lr: 0.000010\n", - "2021-01-14 17:18:35,586 epoch 5 - iter 63/77 - loss 0.00029595 - samples/sec: 27.13 - lr: 0.000010\n", - "2021-01-14 17:18:39,774 epoch 5 - iter 70/77 - loss 0.00027235 - samples/sec: 26.91 - lr: 0.000010\n", - "2021-01-14 17:18:43,540 epoch 5 - iter 77/77 - loss 0.00028631 - samples/sec: 29.89 - lr: 0.000010\n", - "2021-01-14 17:18:43,601 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:18:43,602 EPOCH 5 done: loss 0.0003 - lr 0.0000100\n", - "2021-01-14 17:18:46,910 DEV : loss 0.6656511425971985 - score 0.9248\n", - "Epoch 5: reducing learning rate of group 0 to 5.0000e-06.\n", - "2021-01-14 17:18:47,170 BAD EPOCHS (no improvement): 4\n", - "2021-01-14 17:18:47,171 ----------------------------------------------------------------------------------------------------\n", 
- "2021-01-14 17:18:53,782 epoch 6 - iter 7/77 - loss 0.00022932 - samples/sec: 25.95 - lr: 0.000005\n", - "2021-01-14 17:18:57,949 epoch 6 - iter 14/77 - loss 0.00016531 - samples/sec: 27.13 - lr: 0.000005\n", - "2021-01-14 17:19:02,280 epoch 6 - iter 21/77 - loss 0.00022724 - samples/sec: 25.99 - lr: 0.000005\n", - "2021-01-14 17:19:06,568 epoch 6 - iter 28/77 - loss 0.00026307 - samples/sec: 26.27 - lr: 0.000005\n", - "2021-01-14 17:19:10,654 epoch 6 - iter 35/77 - loss 0.00023027 - samples/sec: 27.57 - lr: 0.000005\n", - "2021-01-14 17:19:14,851 epoch 6 - iter 42/77 - loss 0.00021121 - samples/sec: 26.89 - lr: 0.000005\n", - "2021-01-14 17:19:19,029 epoch 6 - iter 49/77 - loss 0.00019923 - samples/sec: 26.94 - lr: 0.000005\n", - "2021-01-14 17:19:23,229 epoch 6 - iter 56/77 - loss 0.00018661 - samples/sec: 26.87 - lr: 0.000005\n", - "2021-01-14 17:19:27,356 epoch 6 - iter 63/77 - loss 0.00017462 - samples/sec: 27.26 - lr: 0.000005\n", - "2021-01-14 17:19:31,517 epoch 6 - iter 70/77 - loss 0.00016146 - samples/sec: 27.04 - lr: 0.000005\n", - "2021-01-14 17:19:35,455 epoch 6 - iter 77/77 - loss 0.00015018 - samples/sec: 28.60 - lr: 0.000005\n", - "2021-01-14 17:19:35,526 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:19:35,527 EPOCH 6 done: loss 0.0002 - lr 0.0000050\n", - "2021-01-14 17:19:39,223 DEV : loss 0.6893778443336487 - score 0.9248\n", - "2021-01-14 17:19:39,484 BAD EPOCHS (no improvement): 1\n", - "2021-01-14 17:19:39,486 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:19:44,844 epoch 7 - iter 7/77 - loss 0.00019211 - samples/sec: 27.26 - lr: 0.000005\n", - "2021-01-14 17:19:49,028 epoch 7 - iter 14/77 - loss 0.00066268 - samples/sec: 26.98 - lr: 0.000005\n", - "2021-01-14 17:19:53,253 epoch 7 - iter 21/77 - loss 0.00053614 - samples/sec: 26.67 - lr: 0.000005\n", - "2021-01-14 17:19:57,430 epoch 7 - iter 28/77 - loss 0.00043514 - samples/sec: 27.03 - lr: 0.000005\n", - "2021-01-14 17:20:01,594 epoch 7 - iter 35/77 - loss 0.00036258 - samples/sec: 27.03 - lr: 0.000005\n", - "2021-01-14 17:20:05,829 epoch 7 - iter 42/77 - loss 0.00031573 - samples/sec: 26.60 - lr: 0.000005\n", - "2021-01-14 17:20:10,013 epoch 7 - iter 49/77 - loss 0.00028645 - samples/sec: 26.94 - lr: 0.000005\n", - "2021-01-14 17:20:14,238 epoch 7 - iter 56/77 - loss 0.00025793 - samples/sec: 26.67 - lr: 0.000005\n", - "2021-01-14 17:20:18,381 epoch 7 - iter 63/77 - loss 0.00023890 - samples/sec: 27.16 - lr: 0.000005\n", - "2021-01-14 17:20:22,569 epoch 7 - iter 70/77 - loss 0.00021868 - samples/sec: 26.92 - lr: 0.000005\n", - "2021-01-14 17:20:26,365 epoch 7 - iter 77/77 - loss 0.00020789 - samples/sec: 29.64 - lr: 0.000005\n", - "2021-01-14 17:20:26,443 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:20:26,443 EPOCH 7 done: loss 0.0002 - lr 0.0000050\n", - "2021-01-14 17:20:29,751 DEV : loss 0.6999250054359436 - score 0.9248\n", - "2021-01-14 17:20:30,015 BAD EPOCHS (no improvement): 2\n", - "2021-01-14 17:20:30,016 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:20:34,509 epoch 8 - iter 7/77 - loss 0.00007416 - samples/sec: 27.22 - lr: 0.000005\n", - "2021-01-14 17:20:39,619 epoch 8 - iter 14/77 - loss 0.00005522 - samples/sec: 26.82 - lr: 0.000005\n", - "2021-01-14 17:20:43,900 epoch 8 - iter 21/77 - loss 0.00006253 
- samples/sec: 26.35 - lr: 0.000005\n", - "2021-01-14 17:20:48,020 epoch 8 - iter 28/77 - loss 0.00015550 - samples/sec: 27.36 - lr: 0.000005\n", - "2021-01-14 17:20:52,243 epoch 8 - iter 35/77 - loss 0.00013324 - samples/sec: 26.66 - lr: 0.000005\n", - "2021-01-14 17:20:56,397 epoch 8 - iter 42/77 - loss 0.00012891 - samples/sec: 27.08 - lr: 0.000005\n", - "2021-01-14 17:21:00,493 epoch 8 - iter 49/77 - loss 0.00012485 - samples/sec: 27.50 - lr: 0.000005\n", - "2021-01-14 17:21:04,731 epoch 8 - iter 56/77 - loss 0.00014117 - samples/sec: 26.59 - lr: 0.000005\n", - "2021-01-14 17:21:08,909 epoch 8 - iter 63/77 - loss 0.00013634 - samples/sec: 26.95 - lr: 0.000005\n", - "2021-01-14 17:21:13,121 epoch 8 - iter 70/77 - loss 0.00012635 - samples/sec: 26.74 - lr: 0.000005\n", - "2021-01-14 17:21:16,967 epoch 8 - iter 77/77 - loss 0.00012047 - samples/sec: 29.24 - lr: 0.000005\n", - "2021-01-14 17:21:17,026 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:21:17,027 EPOCH 8 done: loss 0.0001 - lr 0.0000050\n", - "2021-01-14 17:21:20,598 DEV : loss 0.7107362747192383 - score 0.9248\n", - "2021-01-14 17:21:20,864 BAD EPOCHS (no improvement): 3\n", - "2021-01-14 17:21:20,865 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:21:26,269 epoch 9 - iter 7/77 - loss 0.00002916 - samples/sec: 26.76 - lr: 0.000005\n", - "2021-01-14 17:21:30,543 epoch 9 - iter 14/77 - loss 0.00006615 - samples/sec: 26.39 - lr: 0.000005\n", - "2021-01-14 17:21:34,808 epoch 9 - iter 21/77 - loss 0.00006364 - samples/sec: 26.48 - lr: 0.000005\n", - "2021-01-14 17:21:38,963 epoch 9 - iter 28/77 - loss 0.00013939 - samples/sec: 27.14 - lr: 0.000005\n", - "2021-01-14 17:21:43,165 epoch 9 - iter 35/77 - loss 0.00012621 - samples/sec: 26.86 - lr: 0.000005\n", - "2021-01-14 17:21:47,317 epoch 9 - iter 42/77 - loss 0.00012101 - samples/sec: 27.09 - lr: 0.000005\n", - "2021-01-14 17:21:51,532 epoch 9 - iter 49/77 - loss 0.00018356 - samples/sec: 26.78 - lr: 0.000005\n", - "2021-01-14 17:21:56,114 epoch 9 - iter 56/77 - loss 0.00016344 - samples/sec: 26.01 - lr: 0.000005\n", - "2021-01-14 17:22:00,349 epoch 9 - iter 63/77 - loss 0.00015143 - samples/sec: 26.58 - lr: 0.000005\n", - "2021-01-14 17:22:04,591 epoch 9 - iter 70/77 - loss 0.00013992 - samples/sec: 26.55 - lr: 0.000005\n", - "2021-01-14 17:22:08,380 epoch 9 - iter 77/77 - loss 0.00012958 - samples/sec: 29.74 - lr: 0.000005\n", - "2021-01-14 17:22:08,443 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:22:08,444 EPOCH 9 done: loss 0.0001 - lr 0.0000050\n", - "2021-01-14 17:22:11,739 DEV : loss 0.7207703590393066 - score 0.9248\n", - "Epoch 9: reducing learning rate of group 0 to 2.5000e-06.\n", - "2021-01-14 17:22:12,003 BAD EPOCHS (no improvement): 4\n", - "2021-01-14 17:22:12,004 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:22:16,599 epoch 10 - iter 7/77 - loss 0.00004176 - samples/sec: 26.48 - lr: 0.000003\n", - "2021-01-14 17:22:21,730 epoch 10 - iter 14/77 - loss 0.00004900 - samples/sec: 26.15 - lr: 0.000003\n", - "2021-01-14 17:22:25,925 epoch 10 - iter 21/77 - loss 0.00005921 - samples/sec: 26.94 - lr: 0.000003\n", - "2021-01-14 17:22:30,112 epoch 10 - iter 28/77 - loss 0.00005456 - samples/sec: 26.94 - lr: 0.000003\n", - "2021-01-14 17:22:34,389 epoch 10 - iter 35/77 - loss 
0.00004909 - samples/sec: 26.34 - lr: 0.000003\n", - "2021-01-14 17:22:38,546 epoch 10 - iter 42/77 - loss 0.00004503 - samples/sec: 27.15 - lr: 0.000003\n", - "2021-01-14 17:22:42,757 epoch 10 - iter 49/77 - loss 0.00004776 - samples/sec: 26.71 - lr: 0.000003\n", - "2021-01-14 17:22:46,803 epoch 10 - iter 56/77 - loss 0.00004461 - samples/sec: 27.81 - lr: 0.000003\n", - "2021-01-14 17:22:51,063 epoch 10 - iter 63/77 - loss 0.00004382 - samples/sec: 26.47 - lr: 0.000003\n", - "2021-01-14 17:22:55,173 epoch 10 - iter 70/77 - loss 0.00005728 - samples/sec: 27.40 - lr: 0.000003\n", - "2021-01-14 17:22:58,941 epoch 10 - iter 77/77 - loss 0.00005446 - samples/sec: 29.83 - lr: 0.000003\n", - "2021-01-14 17:22:58,988 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:22:58,989 EPOCH 10 done: loss 0.0001 - lr 0.0000025\n", - "2021-01-14 17:23:02,304 DEV : loss 0.7250329852104187 - score 0.9248\n", - "2021-01-14 17:23:02,567 BAD EPOCHS (no improvement): 1\n", - "2021-01-14 17:23:04,650 ----------------------------------------------------------------------------------------------------\n", - "2021-01-14 17:23:04,651 Testing using best model ...\n", - "2021-01-14 17:23:04,653 loading file models/taggers/trec/best-model.pt\n", - "2021-01-14 17:23:09,692 \t0.9295\n", - "2021-01-14 17:23:09,693 \n", - "Results:\n", - "- F-score (micro) 0.9295\n", - "- F-score (macro) 0.4817\n", - "- Accuracy 0.9295\n", - "\n", - "By class:\n", - " precision recall f1-score support\n", - "\n", - " 0 0.9295 1.0000 0.9635 356\n", - " 1 0.0000 0.0000 0.0000 27\n", - "\n", - " micro avg 0.9295 0.9295 0.9295 383\n", - " macro avg 0.4648 0.5000 0.4817 383\n", - "weighted avg 0.8640 0.9295 0.8955 383\n", - " samples avg 0.9295 0.9295 0.9295 383\n", - "\n", - "2021-01-14 17:23:09,694 ----------------------------------------------------------------------------------------------------\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "{'test_score': 0.9295,\n", - " 'dev_score_history': [0.9281,\n", - " 0.9248,\n", - " 0.9216,\n", - " 0.9216,\n", - " 0.9248,\n", - " 0.9248,\n", - " 0.9248,\n", - " 0.9248,\n", - " 0.9248,\n", - " 0.9248],\n", - " 'train_loss_history': [0.20128365170646023,\n", - " 0.08118115926717782,\n", - " 0.019440397426679537,\n", - " 0.0010100115429271352,\n", - " 0.00028631362048062414,\n", - " 0.00015017583772733613,\n", - " 0.0002078855192506468,\n", - " 0.00012047414655809278,\n", - " 0.00012958204591429078,\n", - " 5.446238951249556e-05],\n", - " 'dev_loss_history': [0.33609411120414734,\n", - " 0.5065702795982361,\n", - " 0.49905064702033997,\n", - " 0.5981439352035522,\n", - " 0.6656511425971985,\n", - " 0.6893778443336487,\n", - " 0.6999250054359436,\n", - " 0.7107362747192383,\n", - " 0.7207703590393066,\n", - " 0.7250329852104187]}" - ] - }, - "metadata": {}, - "execution_count": 85 - } - ], - "source": [ - "\n", - "# 3. initialize transformer document embeddings (many models are available)\n", - "document_embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased', fine_tune=True)\n", - "\n", - "# 4. create the text classifier\n", - "classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, loss_weights={b\"1\":10, b\"0\":1}) # loss_weights={\"1\":10, \"0\":1}\n", - "\n", - "# 5. initialize the text classifier trainer with Adam optimizer\n", - "trainer = ModelTrainer(classifier, corpus, optimizer=Adam)\n", - "\n", - "# 6. 
start the training\n",
-    "trainer.train('models/taggers/trec',\n",
-    "              learning_rate=1e-5, # use very small learning rate\n",
-    "              mini_batch_size=16,\n",
-    "              mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine\n",
-    "              max_epochs=10, # terminate after 5 epochs\n",
-    "              )"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
  {
   "cell_type": "code",
   "execution_count": null,

diff --git a/tasks/wsd_gloss.py b/tasks/wsd_gloss.py
new file mode 100644
index 0000000..efd90ae
--- /dev/null
+++ b/tasks/wsd_gloss.py
@@ -0,0 +1,106 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from pathlib import Path
+from torch.optim.adam import Adam
+from flair.datasets import CSVClassificationCorpus
+from flair.data import Corpus
+from flair.embeddings import TransformerDocumentEmbeddings
+from flair.models import TextClassifier
+from flair.trainers import ModelTrainer
+
+def enclose_keyword(row, enclose_token='"'):
+    """enclose the keyword with a specific token to point
+    the learner towards the word it has to focus on
+    """
+    sentence = ''
+    for i, c in enumerate(row.full_text):
+        if i == int(row.keyword_offset):
+            sentence += enclose_token + ' '
+        elif i == int(row.keyword_offset + len(row.keyword)):
+            sentence += ' ' + enclose_token
+        sentence += c
+    return sentence
+
+def to_glossbert_format(df):
+    """convert rows in a dataframe to the GlossBERT format: one positive
+    ("Yes") context-gloss pair per row, plus a "No" pair for every other gloss
+    """
+
+    def gloss_string(row, definition):
+        """combine a gloss with the quotation and the keyword
+        """
+        out_string = ''
+        if row.enclosed_quotation:
+            out_string += row.enclosed_quotation
+        out_string += ' [SEP] '
+        out_string += row.keyword + ': '
+        if row.definition:
+            out_string += definition
+        return out_string
+
+    df['enclosed_quotation'] = df.apply(enclose_keyword, axis=1)
+
+    rows = []
+    for _, row in df.iterrows():
+        rows.append([gloss_string(row, row.definition), "Yes", row.sense_id])
+        definitions = df[df.lemma == row.lemma].definition.unique()
+        for d in definitions:
+            if d != row.definition:
+                rows.append([gloss_string(row, d), "No", row.sense_id])
+
+    return pd.DataFrame(rows, columns=['text', 'label', 'sense_id'])
+
+
+def create_glossbert_data(lemma, pos):
+    """create GlossBERT training data from the quotations dataframe
+    """
+
+    df_quotations = pd.read_pickle(f'./data/sfrel_quotations_{lemma}_{pos}.pickle')
+    df_quotations = df_quotations[~df_quotations.keyword_offset.isnull()]
+    df_quotations = df_quotations[~df_quotations.definition.isnull()]#.reset_index(drop=True)
+    df_glossbert = to_glossbert_format(df_quotations).sample(frac=1.0).reset_index(drop=True)
+    print(df_glossbert.shape)
+    # NOTE: not sure whether this random split is correct; it should probably
+    # be grouped by positive example sentence, so that rows derived from the
+    # same quotation never straddle train and test (see the sketch just below).
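+    # --- Editor's sketch (hedged; not part of the original commit) ----------
+    # A grouped split keeps every row derived from the same positive quotation
+    # on one side of the boundary. Assuming the quotation is the part of
+    # `text` before ' [SEP] ' (as built by gloss_string above), something like:
+    #
+    #   from sklearn.model_selection import GroupShuffleSplit
+    #   groups = [t.split(' [SEP] ')[0] for t in df_glossbert.text]
+    #   gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
+    #   train_idx, test_idx = next(gss.split(df_glossbert, groups=groups))
+    #   df_train, df_test = df_glossbert.iloc[train_idx], df_glossbert.iloc[test_idx]
+    # -------------------------------------------------------------------------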
+    df_train, df_test = train_test_split(df_glossbert, test_size=0.2, random_state=42, shuffle=True)  # , stratify=df_glossbert[['label']]
+    df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=42, shuffle=True)
+
+    train_data_path = Path("./data/training_data")
+    train_data_path.mkdir(exist_ok=True)
+    df_out_path = train_data_path / f'{lemma}_{pos}'
+    df_out_path.mkdir(exist_ok=True)
+
+    df_train.to_csv(df_out_path / 'train.csv', index=False, sep='\t')
+    df_val.to_csv(df_out_path / 'dev.csv', index=False, sep='\t')
+    df_test.to_csv(df_out_path / 'test.csv', index=False, sep='\t')
+
+    return df_out_path
+
+def train_glossbert(data_folder, downsample=False):
+    # column format: which columns hold the text and the label
+    column_name_map = {0: "text", 1: "label"}
+
+    corpus = CSVClassificationCorpus(data_folder,
+                                     column_name_map,
+                                     skip_header=True,
+                                     delimiter='\t',    # tab-separated files
+                                     )
+
+    if downsample:
+        print('Downsampling.')
+        corpus = corpus.downsample(0.1)
+
+    label_dict = corpus.make_label_dictionary()
+    print(label_dict)
+
+    document_embeddings = TransformerDocumentEmbeddings('bert-base-uncased', fine_tune=True)
+
+    # upweight the rare "Yes" class to counter the heavy class imbalance
+    classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, loss_weights={b"Yes": 10, b"No": 1})
+
+    trainer = ModelTrainer(classifier, corpus, optimizer=Adam)
+
+    trainer.train('models/classifier/glossbert',
+                  learning_rate=1e-3,  # NB: much higher than the 1e-5 used for fine-tuning in the notebook run
+                  mini_batch_size=16,
+                  embeddings_storage_mode='gpu',
+                  mini_batch_chunk_size=4,  # optionally set this if the transformer is too much for your machine
+                  max_epochs=50,  # terminate after at most 50 epochs
+                  )

From 542dfbf6a935a4c44d50e387114bac720edc6965 Mon Sep 17 00:00:00 2001
From: kasparvonbeelen
Date: Fri, 15 Jan 2021 10:55:03 +0000
Subject: [PATCH 05/10] add simple command line glossbert

---
 run_glossbert.py | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 run_glossbert.py

diff --git a/run_glossbert.py b/run_glossbert.py
new file mode 100644
index 0000000..76dd994
--- /dev/null
+++ b/run_glossbert.py
@@ -0,0 +1,10 @@
+from tasks.wsd_gloss import create_glossbert_data, train_glossbert
+import sys
+
+def run(lemma, pos):
+    data_path = create_glossbert_data(lemma, pos)
+    train_glossbert(data_path)
+
+if __name__ == "__main__":
+    lemma, pos = sys.argv[1], sys.argv[2]
+    run(lemma, pos)
\ No newline at end of file

From 8a6b916f3a39727d7e0aec12d6c85b4bb6d12614 Mon Sep 17 00:00:00 2001
From: kasparvonbeelen
Date: Fri, 15 Jan 2021 12:16:55 +0000
Subject: [PATCH 06/10] stop notebook

---
 114.1 - review notebook - glossbert.ipynb | 389 ++++++++++++++++++++--
 1 file changed, 369 insertions(+), 20 deletions(-)

diff --git a/114.1 - review notebook - glossbert.ipynb b/114.1 - review notebook - glossbert.ipynb
index cb3c924..e5fad22 100644
--- a/114.1 - review notebook - glossbert.ipynb
+++ b/114.1 - review notebook - glossbert.ipynb
@@ -24,9 +24,17 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 9,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "The autoreload extension is already loaded. 
To reload it, use:\n %reload_ext autoreload\n" + ] + } + ], "source": [ "%load_ext autoreload\n", "%autoreload 2" @@ -34,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -43,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -54,22 +62,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 14, "metadata": {}, "outputs": [ { - "output_type": "error", - "ename": "AttributeError", - "evalue": "'Series' object has no attribute 'enclosed_quotation'", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcreate_glossbert_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlemma\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mpos\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/deezy_datadrive/kaspar-playground/dictionary_expansion/HistoricalDictionaryExpansion/tasks/wsd_gloss.py\u001b[0m in \u001b[0;36mcreate_glossbert_data\u001b[0;34m(lemma, pos)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0mdf_quotations\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_quotations\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m~\u001b[0m\u001b[0mdf_quotations\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeyword_offset\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mdf_quotations\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_quotations\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m~\u001b[0m\u001b[0mdf_quotations\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdefinition\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreset_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 62\u001b[0;31m \u001b[0mdf_glossbert\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mto_glossbert_format\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_quotations\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfrac\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1.0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreset_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 63\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain_test_split\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_glossbert\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m42\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mshuffle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mstratify\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_glossbert\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/deezy_datadrive/kaspar-playground/dictionary_expansion/HistoricalDictionaryExpansion/tasks/wsd_gloss.py\u001b[0m in \u001b[0;36mto_glossbert_format\u001b[0;34m(df)\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0mrows\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[0;34m,\u001b[0m\u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miterrows\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 47\u001b[0;31m \u001b[0mrows\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mgloss_string\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdefinition\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Yes\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msense_id\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 48\u001b[0m \u001b[0mdefinitions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlemma\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlemma\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdefinition\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munique\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0md\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdefinitions\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/deezy_datadrive/kaspar-playground/dictionary_expansion/HistoricalDictionaryExpansion/tasks/wsd_gloss.py\u001b[0m in \u001b[0;36mgloss_string\u001b[0;34m(row, definition)\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0mout_string\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m''\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 35\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menclosed_quotation\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 36\u001b[0m \u001b[0mout_string\u001b[0m\u001b[0;34m+=\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menclosed_quotation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0mout_string\u001b[0m\u001b[0;34m+=\u001b[0m\u001b[0;34m' [SEP] '\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 5272\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5273\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5274\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5275\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5276\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mAttributeError\u001b[0m: 'Series' object has no attribute 'enclosed_quotation'" + "output_type": "stream", + "name": "stdout", + "text": [ + "(676946, 3)\n" ] } ], @@ -79,11 +79,360 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2021-01-15 10:42:39,739 Reading data from data/training_data/machine_NN\n", + "2021-01-15 10:42:39,739 Train: data/training_data/machine_NN/train.csv\n", + "2021-01-15 10:42:39,740 Dev: data/training_data/machine_NN/dev.csv\n", + "2021-01-15 10:42:39,740 Test: data/training_data/machine_NN/test.csv\n", + "2021-01-15 10:42:44,075 Computing label dictionary. 
Progress:\n", + "100%|██████████| 56863/56863 [00:25<00:00, 2210.56it/s]2021-01-15 10:43:10,130 [b'No', b'Yes']\n", + "\n", + "Dictionary with 2 tags: No, Yes\n", + "2021-01-15 10:43:13,389 ----------------------------------------------------------------------------------------------------\n", + "2021-01-15 10:43:13,394 Model: \"TextClassifier(\n", + " (document_embeddings): TransformerDocumentEmbeddings(\n", + " (model): BertModel(\n", + " (embeddings): BertEmbeddings(\n", + " (word_embeddings): Embedding(30522, 768, padding_idx=0)\n", + " (position_embeddings): Embedding(512, 768)\n", + " (token_type_embeddings): Embedding(2, 768)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (encoder): BertEncoder(\n", + " (layer): ModuleList(\n", + " (0): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (1): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (2): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): 
BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (3): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (4): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (5): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (6): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): 
BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (7): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (8): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (9): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (10): 
BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (11): BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (pooler): BertPooler(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (activation): Tanh()\n", + " )\n", + " )\n", + " )\n", + " (decoder): Linear(in_features=768, out_features=2, bias=True)\n", + " (loss_function): CrossEntropyLoss()\n", + " (beta): 1.0\n", + " (weights): {b'Yes': 10, b'No': 1}\n", + " (weight_tensor) tensor([1., 1.], device='cuda:0')\n", + ")\"\n", + "2021-01-15 10:43:13,395 ----------------------------------------------------------------------------------------------------\n", + "2021-01-15 10:43:13,396 Corpus: \"Corpus: 43324 train + 10831 dev + 13539 test sentences\"\n", + "2021-01-15 10:43:13,396 ----------------------------------------------------------------------------------------------------\n", + "2021-01-15 10:43:13,397 Parameters:\n", + "2021-01-15 10:43:13,398 - learning_rate: \"0.001\"\n", + "2021-01-15 10:43:13,398 - mini_batch_size: \"16\"\n", + "2021-01-15 10:43:13,399 - patience: \"3\"\n", + "2021-01-15 10:43:13,400 - anneal_factor: \"0.5\"\n", + "2021-01-15 10:43:13,400 - max_epochs: \"50\"\n", + "2021-01-15 10:43:13,401 - shuffle: \"True\"\n", + "2021-01-15 10:43:13,402 - train_with_dev: \"False\"\n", + "2021-01-15 10:43:13,403 - batch_growth_annealing: \"False\"\n", + "2021-01-15 10:43:13,403 ----------------------------------------------------------------------------------------------------\n", + "2021-01-15 10:43:13,404 Model training base path: \"models/classifier/glossbert\"\n", + "2021-01-15 10:43:13,405 
----------------------------------------------------------------------------------------------------\n", + "2021-01-15 10:43:13,406 Device: cuda:0\n", + "2021-01-15 10:43:13,407 ----------------------------------------------------------------------------------------------------\n", + "2021-01-15 10:43:13,408 Embeddings storage mode: cpu\n" + ] + }, + { + "output_type": "error", + "ename": "TypeError", + "evalue": "__init__() got an unexpected keyword argument 'embedding_storage_mode'", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtrain_glossbert\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownsample\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/deezy_datadrive/kaspar-playground/dictionary_expansion/HistoricalDictionaryExpansion/tasks/wsd_gloss.py\u001b[0m in \u001b[0;36mtrain_glossbert\u001b[0;34m(data_folder, downsample)\u001b[0m\n\u001b[1;32m 102\u001b[0m \u001b[0membedding_storage_mode\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'gpu'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[0mmini_batch_chunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# optionally set this if transformer is too much for your machine\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 104\u001b[0;31m \u001b[0mmax_epochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m50\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# terminate after 5 epochs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 105\u001b[0m )\n", + "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/flair/trainers/trainer.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, base_path, learning_rate, mini_batch_size, mini_batch_chunk_size, max_epochs, scheduler, cycle_momentum, anneal_factor, patience, initial_extra_patience, min_learning_rate, train_with_dev, monitor_train, monitor_test, embeddings_storage_mode, checkpoint, save_final_model, anneal_with_restarts, anneal_with_prestarts, batch_growth_annealing, shuffle, param_selection_mode, write_weights, num_workers, sampler, use_amp, amp_opt_level, eval_on_train_fraction, eval_on_train_shuffle, save_model_at_each_epoch, **kwargs)\u001b[0m\n\u001b[1;32m 224\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 225\u001b[0m optimizer: torch.optim.Optimizer = self.optimizer(\n\u001b[0;32m--> 226\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparameters\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlearning_rate\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 227\u001b[0m )\n\u001b[1;32m 228\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: __init__() got an unexpected keyword argument 'embedding_storage_mode'" + ] + } + ], "source": [ - "train_glossbert(data_path,downsample=True)" + "train_glossbert(data_path, downsample=True)" ] }, { From 
3ce710a274fe1819f124d5e1ba87118991ebb8a3 Mon Sep 17 00:00:00 2001 From: kasparvonbeelen Date: Mon, 18 Jan 2021 09:24:09 +0000 Subject: [PATCH 07/10] add review notebook for multi dataset learning --- 114.2 - review notebook - multidataset.ipynb | 195 ++++++++++++++++++ data/grouped_senses.md | 0 ...words_for_evaluation_selection_criteria.md | 0 run_glossbert.py | 4 +- 4 files changed, 197 insertions(+), 2 deletions(-) create mode 100644 114.2 - review notebook - multidataset.ipynb mode change 100644 => 100755 data/grouped_senses.md mode change 100644 => 100755 data/words_for_evaluation_selection_criteria.md diff --git a/114.2 - review notebook - multidataset.ipynb b/114.2 - review notebook - multidataset.ipynb new file mode 100644 index 0000000..19fd620 --- /dev/null +++ b/114.2 - review notebook - multidataset.ipynb @@ -0,0 +1,195 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.5-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python3", + "display_name": "Python 3", + "language": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from pathlib import Path\n", + "from torch.optim.adam import Adam\n", + "from flair.datasets import CSVClassificationCorpus\n", + "from flair.data import Corpus\n", + "from flair.embeddings import TransformerDocumentEmbeddings\n", + "from flair.models import TextClassifier\n", + "from flair.trainers import ModelTrainer\n", + "from tasks.wsd_gloss import enclose_keyword\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def gloss_dfs(df):\n", + " \"\"\"convert rows in dataframe to GlossBERT format\n", + " \"\"\"\n", + " df['enclosed_quotation'] = df.apply(enclose_keyword, axis=1)\n", + " df_gl = df[['enclosed_quotation','definition','label']]\n", + " return df_gl[['enclosed_quotation','label']],df_gl[['definition','label']].drop_duplicates()\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "lemma,pos='machine','NN'\n", + "df_quotations = pd.read_pickle(f'./data/sfrel_quotations_{lemma}_{pos}.pickle')\n", + "df_quotations = df_quotations[~df_quotations.keyword_offset.isnull()]\n", + "df_quotations = df_quotations[~df_quotations.definition.isnull()].reset_index(drop=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(29752, 14)\n(29321, 14)\n" + ] + } + ], + "source": [ + "print(df_quotations.shape) \n", + "quotation_valuecounts = df_quotations.sense_id.value_counts()\n", + "df_quotations = df_quotations[df_quotations.sense_id.isin(quotation_valuecounts[quotation_valuecounts>1].index)]\n", + "print(df_quotations.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "output_type": "error", + "ename": "KeyError", + "evalue": "\"['label'] not in index\"", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent 
call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcontext_df\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mgloss_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgloss_dfs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_quotations\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36mgloss_dfs\u001b[0;34m(df)\u001b[0m\n\u001b[1;32m 3\u001b[0m \"\"\"\n\u001b[1;32m 4\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'enclosed_quotation'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0menclose_keyword\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mdf_gl\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'enclosed_quotation'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'definition'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdf_gl\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'enclosed_quotation'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mdf_gl\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'definition'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop_duplicates\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2804\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_iterator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2805\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2806\u001b[0;31m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_listlike_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mraise_missing\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2807\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2808\u001b[0m \u001b[0;31m# take() does not accept boolean indexers\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_get_listlike_indexer\u001b[0;34m(self, key, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1552\u001b[0m self._validate_read_indexer(\n\u001b[0;32m-> 1553\u001b[0;31m \u001b[0mkeyarr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis_number\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mraise_missing\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mraise_missing\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1554\u001b[0m )\n\u001b[1;32m 1555\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mkeyarr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_validate_read_indexer\u001b[0;34m(self, key, indexer, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1644\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"loc\"\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mraise_missing\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1645\u001b[0m \u001b[0mnot_found\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0max\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1646\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"{not_found} not in index\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1647\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1648\u001b[0m \u001b[0;31m# we skip the warning on Categorical/Interval\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: \"['label'] not in index\"" + ] + } + ], + "source": [ + "#context_df,gloss_df = gloss_dfs((df_quotations))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(context_df.shape,gloss_df.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_data_path = Path(\"./data/training_data_ms\")\n", + "train_data_path.mkdir(exist_ok=True)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "output_type": "error", + "ename": "ValueError", + "evalue": "The least populated class in y has only 1 member, which is too few. 
The minimum number of groups for any class cannot be less than 2.", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m42\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mshuffle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m stratify=df[['sense_id']]) # 1st\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m df_train, df_val = train_test_split(df_train, \n", + "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/sklearn/model_selection/_split.py\u001b[0m in \u001b[0;36mtrain_test_split\u001b[0;34m(*arrays, **options)\u001b[0m\n\u001b[1;32m 2150\u001b[0m random_state=random_state)\n\u001b[1;32m 2151\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2152\u001b[0;31m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marrays\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstratify\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2153\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2154\u001b[0m return list(chain.from_iterable((_safe_indexing(a, train),\n", + "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/sklearn/model_selection/_split.py\u001b[0m in \u001b[0;36msplit\u001b[0;34m(self, X, y, groups)\u001b[0m\n\u001b[1;32m 1339\u001b[0m \"\"\"\n\u001b[1;32m 1340\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgroups\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mindexable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgroups\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1341\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_iter_indices\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgroups\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1342\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1343\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/sklearn/model_selection/_split.py\u001b[0m in \u001b[0;36m_iter_indices\u001b[0;34m(self, X, y, groups)\u001b[0m\n\u001b[1;32m 1666\u001b[0m \u001b[0mclass_counts\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbincount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_indices\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1667\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclass_counts\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1668\u001b[0;31m raise ValueError(\"The least populated class in y has only 1\"\n\u001b[0m\u001b[1;32m 1669\u001b[0m \u001b[0;34m\" member, which is too few. The minimum\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1670\u001b[0m \u001b[0;34m\" number of groups for any class cannot\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2." + ] + } + ], + "source": [ + "for n, df in [('context',context_df),('gloss',gloss_df)]:\n", + "\n", + " df_out_path = train_data_path / f'{lemma}_{pos}_{n}'\n", + " df_out_path.mkdir(exist_ok=True)\n", + "\n", + " df_train, df_test = train_test_split(df, \n", + " test_size=0.2, \n", + " random_state=42,\n", + " shuffle=True,\n", + " stratify=df[['label']]\n", + " ) # 1st\n", + " \n", + " df_train, df_val = train_test_split(df_train, \n", + " test_size=0.1, \n", + " random_state=42,\n", + " shuffle=True,\n", + " stratify=df_train[['label']] # bug here, try to do the stratification better\n", + " ) # 2nd\n", + " \n", + " df_train.to_csv(df_out_path / 'train.csv', index = False, sep='\\t') \n", + " df_val.to_csv(df_out_path / 'dev.csv', index = False, sep='\\t') \n", + " df_test.to_csv(df_out_path / 'test.csv', index = False, sep='\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ] +} \ No newline at end of file diff --git a/data/grouped_senses.md b/data/grouped_senses.md old mode 100644 new mode 100755 diff --git a/data/words_for_evaluation_selection_criteria.md b/data/words_for_evaluation_selection_criteria.md old mode 100644 new mode 100755 diff --git a/run_glossbert.py b/run_glossbert.py index 76dd994..339eb55 100644 --- a/run_glossbert.py +++ b/run_glossbert.py @@ -3,8 +3,8 @@ def run(lemma,pos): data_path = create_glossbert_data(lemma,pos) - train_glossbert(data_path) + train_glossbert(data_path,downsample=True) if __name__=="__main__": lemma,pos = sys.argv[1],sys.argv[2] - run(lemma,pos) \ No newline at end of file + run(lemma,pos) From 269a33e13997883c4d0397ea34f4b702f276e9e6 Mon Sep 17 00:00:00 2001 From: kasparvonbeelen Date: Mon, 18 Jan 2021 12:56:28 +0000 Subject: [PATCH 08/10] add multidataset training --- 114.2 - review notebook - multidataset.ipynb | 152 +++++-------------- 1 file changed, 34 insertions(+), 118 deletions(-) diff --git a/114.2 - review notebook - multidataset.ipynb b/114.2 - review notebook - multidataset.ipynb index 19fd620..07cbd17 100644 --- a/114.2 - review notebook - multidataset.ipynb +++ b/114.2 - review notebook - multidataset.ipynb @@ -24,92 +24,71 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "import pandas as pd\n", - "from 
sklearn.model_selection import train_test_split\n", - "from pathlib import Path\n", - "from torch.optim.adam import Adam\n", - "from flair.datasets import CSVClassificationCorpus\n", - "from flair.data import Corpus\n", - "from flair.embeddings import TransformerDocumentEmbeddings\n", - "from flair.models import TextClassifier\n", - "from flair.trainers import ModelTrainer\n", - "from tasks.wsd_gloss import enclose_keyword\n" + "%load_ext autoreload\n", + "%autoreload 2" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "def gloss_dfs(df):\n", - " \"\"\"convert rows in dataframe to GlossBERT format\n", - " \"\"\"\n", - " df['enclosed_quotation'] = df.apply(enclose_keyword, axis=1)\n", - " df_gl = df[['enclosed_quotation','definition','label']]\n", - " return df_gl[['enclosed_quotation','label']],df_gl[['definition','label']].drop_duplicates()\n", - " \n" + "from tasks.wsd_gloss import create_md_training_data, train_gloss_and_context" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 27, "metadata": {}, "outputs": [], - "source": [ - "lemma,pos='machine','NN'\n", - "df_quotations = pd.read_pickle(f'./data/sfrel_quotations_{lemma}_{pos}.pickle')\n", - "df_quotations = df_quotations[~df_quotations.keyword_offset.isnull()]\n", - "df_quotations = df_quotations[~df_quotations.definition.isnull()].reset_index(drop=True)\n" - ] + "source": [] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "(29752, 14)\n(29321, 14)\n" - ] - } - ], + "outputs": [], "source": [ - "print(df_quotations.shape) \n", - "quotation_valuecounts = df_quotations.sense_id.value_counts()\n", - "df_quotations = df_quotations[df_quotations.sense_id.isin(quotation_valuecounts[quotation_valuecounts>1].index)]\n", - "print(df_quotations.shape)" + "lemma, pos = 'machine','NN'\n", + "senses = {'machine_nn01-38474140'} # machine_nn01-38475772 machine_nn01-38475923 machine_nn01-38475835 machine_nn01-38474140\n", + "relations = ['seed','synonym'] # ,'descendant','sibling'\n", + "eval_mode = \"lemma_etal\" # lemma or lemma_etal\n", + "experiment_id = 0" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": {}, "outputs": [ { - "output_type": "error", - "ename": "KeyError", - "evalue": "\"['label'] not in index\"", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcontext_df\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mgloss_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgloss_dfs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_quotations\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m\u001b[0m in \u001b[0;36mgloss_dfs\u001b[0;34m(df)\u001b[0m\n\u001b[1;32m 3\u001b[0m \"\"\"\n\u001b[1;32m 4\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'enclosed_quotation'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0menclose_keyword\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mdf_gl\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'enclosed_quotation'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'definition'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdf_gl\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'enclosed_quotation'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mdf_gl\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'definition'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop_duplicates\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2804\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_iterator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2805\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2806\u001b[0;31m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_listlike_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mraise_missing\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2807\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2808\u001b[0m \u001b[0;31m# take() does not accept boolean indexers\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_get_listlike_indexer\u001b[0;34m(self, key, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1552\u001b[0m self._validate_read_indexer(\n\u001b[0;32m-> 1553\u001b[0;31m \u001b[0mkeyarr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis_number\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mraise_missing\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mraise_missing\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1554\u001b[0m )\n\u001b[1;32m 1555\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mkeyarr\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mindexer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_validate_read_indexer\u001b[0;34m(self, key, indexer, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1644\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"loc\"\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mraise_missing\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1645\u001b[0m \u001b[0mnot_found\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0max\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1646\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"{not_found} not in index\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1647\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1648\u001b[0m \u001b[0;31m# we skip the warning on Categorical/Interval\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyError\u001b[0m: \"['label'] not in index\"" + "output_type": "stream", + "name": "stdout", + "text": [ + "# senses before filtering by date = 517\n", + "# senses after filtering by date = 433\n", + "\n", + "\n", + "# of seed senses 26 \n", + "# of synonyms 383 \n", + "# of branch senses 0\n", + "\n", + "\n", + "# of seeds selected 1 \n", + "# of synonyms selected 44 \n", + "# of branches selected 0\n", + "[LOG] #rows before removing None vector (1947, 21)\n", + "[LOG] #rows after removing None vector (1911, 21)\n" ] } ], "source": [ - "#context_df,gloss_df = gloss_dfs((df_quotations))" + "create_md_training_data(lemma,pos,senses,relations,experiment_id=experiment_id)" ] }, { @@ -118,72 +97,9 @@ "metadata": {}, "outputs": [], "source": [ - "print(context_df.shape,gloss_df.shape)" + "train_gloss_and_context(lemma,pos,experiment_id=experiment_id)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_data_path = Path(\"./data/training_data_ms\")\n", - "train_data_path.mkdir(exist_ok=True)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "output_type": "error", - "ename": "ValueError", - "evalue": "The least populated class in y has only 1 member, which is too few. 
The minimum number of groups for any class cannot be less than 2.", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m42\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mshuffle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m stratify=df[['sense_id']]) # 1st\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m df_train, df_val = train_test_split(df_train, \n", - "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/sklearn/model_selection/_split.py\u001b[0m in \u001b[0;36mtrain_test_split\u001b[0;34m(*arrays, **options)\u001b[0m\n\u001b[1;32m 2150\u001b[0m random_state=random_state)\n\u001b[1;32m 2151\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2152\u001b[0;31m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marrays\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstratify\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2153\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2154\u001b[0m return list(chain.from_iterable((_safe_indexing(a, train),\n", - "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/sklearn/model_selection/_split.py\u001b[0m in \u001b[0;36msplit\u001b[0;34m(self, X, y, groups)\u001b[0m\n\u001b[1;32m 1339\u001b[0m \"\"\"\n\u001b[1;32m 1340\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgroups\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mindexable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgroups\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1341\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_iter_indices\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgroups\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1342\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1343\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/data/anaconda/envs/py37torch/lib/python3.7/site-packages/sklearn/model_selection/_split.py\u001b[0m in \u001b[0;36m_iter_indices\u001b[0;34m(self, X, y, groups)\u001b[0m\n\u001b[1;32m 1666\u001b[0m \u001b[0mclass_counts\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbincount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_indices\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1667\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclass_counts\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1668\u001b[0;31m raise ValueError(\"The least populated class in y has only 1\"\n\u001b[0m\u001b[1;32m 1669\u001b[0m \u001b[0;34m\" member, which is too few. The minimum\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1670\u001b[0m \u001b[0;34m\" number of groups for any class cannot\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mValueError\u001b[0m: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2." - ] - } - ], - "source": [ - "for n, df in [('context',context_df),('gloss',gloss_df)]:\n", - "\n", - " df_out_path = train_data_path / f'{lemma}_{pos}_{n}'\n", - " df_out_path.mkdir(exist_ok=True)\n", - "\n", - " df_train, df_test = train_test_split(df, \n", - " test_size=0.2, \n", - " random_state=42,\n", - " shuffle=True,\n", - " stratify=df[['label']]\n", - " ) # 1st\n", - " \n", - " df_train, df_val = train_test_split(df_train, \n", - " test_size=0.1, \n", - " random_state=42,\n", - " shuffle=True,\n", - " stratify=df_train[['label']] # bug here, try to do the stratification better\n", - " ) # 2nd\n", - " \n", - " df_train.to_csv(df_out_path / 'train.csv', index = False, sep='\\t') \n", - " df_val.to_csv(df_out_path / 'dev.csv', index = False, sep='\\t') \n", - " df_test.to_csv(df_out_path / 'test.csv', index = False, sep='\\t')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, From 39d736c5ed6672ba8693e099cbe76ea62562ac4f Mon Sep 17 00:00:00 2001 From: kasparvonbeelen Date: Mon, 18 Jan 2021 13:03:17 +0000 Subject: [PATCH 09/10] add typing to functions --- tasks/wsd_gloss.py | 127 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 118 insertions(+), 9 deletions(-) diff --git a/tasks/wsd_gloss.py b/tasks/wsd_gloss.py index efd90ae..c736c48 100644 --- a/tasks/wsd_gloss.py +++ b/tasks/wsd_gloss.py @@ -1,14 +1,19 @@ import pandas as pd from sklearn.model_selection import train_test_split -from pathlib import Path +from pathlib import Path, PosixPath +from utils.classificaton_utils import binarize from torch.optim.adam import Adam from flair.datasets import CSVClassificationCorpus -from flair.data import Corpus +from flair.data import MultiCorpus from flair.embeddings import TransformerDocumentEmbeddings from flair.models import TextClassifier from flair.trainers import ModelTrainer -def enclose_keyword(row,enclose_token='"'): +# --------------------------------------- +# glossbert method ---------------------- + +def enclose_keyword(row:pd.Series, + enclose_token:str='"'): """enclose keyword with specific token to point learner towards to word it has to focus on """ @@ -21,11 +26,11 @@ def enclose_keyword(row,enclose_token='"'): sentence+=c return sentence -def to_glossbert_format(df): +def to_glossbert_format(df:pd.DataFrame): """convert rows in dataframe to GlossBERT format """ - def gloss_string(row, definition): + 
def gloss_string(row:pd.Series, definition:str): """combine gloss with quoations and keyword """ @@ -34,8 +39,8 @@ def gloss_string(row, definition): out_string+=row.enclosed_quotation out_string+=' [SEP] ' out_string+=row.keyword+': ' - if row.definition: - out_string+=definition + #if row.definition: + out_string+=definition return out_string df['enclosed_quotation'] = df.apply(enclose_keyword, axis=1) @@ -51,7 +56,8 @@ def gloss_string(row, definition): return pd.DataFrame(rows, columns=['text','label','sense_id']) -def create_glossbert_data(lemma,pos): +def create_glossbert_data(lemma:str, + pos:str): """create glossbert data from quotations dataframe """ @@ -75,7 +81,9 @@ def create_glossbert_data(lemma,pos): return df_out_path -def train_glossbert(data_folder,downsample=False): +def train_glossbert(data_folder:PosixPath, + downsample:bool=False): + column_name_map = {0: "text", 1: "label"} corpus = CSVClassificationCorpus(data_folder, @@ -104,3 +112,104 @@ def train_glossbert(data_folder,downsample=False): mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine max_epochs=50, # terminate after 5 epochs ) + + +# --------------------------------------- +# multidataset training ----------------- + + +def context_gloss_dfs(df:pd.DataFrame): + """convert rows in dataframe to GlossBERT format + """ + df = df[~df.keyword_offset.isnull()] + df = df[~df.definition.isnull()].reset_index(drop=True) + df['enclosed_quotation'] = df.apply(enclose_keyword, axis=1) + df_gl = df[['enclosed_quotation','definition','label']] + return df_gl[['enclosed_quotation','label']],df_gl[['definition','label']].drop_duplicates() + +def create_md_training_data(lemma:str, + pos:str, + senses:set, + relations:list, + experiment_id:int=0, + eval_mode:str='lemma_etal'): + """create data for multidataset training + """ + df_train, df_val, df_test = binarize(lemma, + pos, + senses, + relations, + strict_filter=True, + start=1700, + end=2000, + eval_mode=eval_mode) + + data = list(map(context_gloss_dfs,[df_train, df_val, df_test])) + + train_data_path = Path("./data/training_data_md") + train_data_path.mkdir(exist_ok=True) + + for context, gloss in data: + for n, df in [('context',context),('gloss',gloss)]: + + df_out_path= train_data_path / f'{lemma}_{pos}_{experiment_id}_{n}' + df_out_path.mkdir(exist_ok=True) + + df_train, df_test = train_test_split(df, + test_size=0.2, + random_state=42, + shuffle=True, + stratify=df[['label']] + ) # 1st + + df_train, df_val = train_test_split(df_train, + test_size=0.1, + random_state=42, + shuffle=True, + stratify=df_train[['label']] # bug here, try to do the stratification better + ) # 2nd + + df_train.to_csv(df_out_path / 'train.csv', index = False, sep='\t') + df_val.to_csv(df_out_path / 'dev.csv', index = False, sep='\t') + df_test.to_csv(df_out_path / 'test.csv', index = False, sep='\t') + +def train_gloss_and_context(lemma:str, + pos:str, + experiment_id:int=0, + data_folder:PosixPath=Path("./data/training_data_md"), + downsample:bool=False): + column_name_map = {0: "text", 1: "label"} + + context_corpus = CSVClassificationCorpus(data_folder / f"{lemma}_{pos}_{experiment_id}_context", + column_name_map, + skip_header=True, + delimiter='\t', # tab-separated files + ) + gloss_corpus = CSVClassificationCorpus(data_folder / f"{lemma}_{pos}_{experiment_id}_gloss", + column_name_map, + skip_header=True, + delimiter='\t', # tab-separated files + ) + + corpus = MultiCorpus([context_corpus, gloss_corpus]) + + if downsample: + print('Downsampling...') 
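+        # downsample(0.1) keeps a random 10% of each split, handy for quick test runs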
+            corpus = corpus.downsample(0.1)
+
+    label_dict = corpus.make_label_dictionary()
+    print(label_dict)
+
+    document_embeddings = TransformerDocumentEmbeddings('bert-base-uncased', fine_tune=True)
+
+    classifier = TextClassifier(document_embeddings, label_dictionary=label_dict) # loss_weights={"1":10, "0":1}
+
+    trainer = ModelTrainer(classifier, corpus, optimizer=Adam)
+
+    trainer.train('models/classifier/glossbert',
+                  learning_rate=1e-3, # use very small learning rate
+                  mini_batch_size=16,
+                  embeddings_storage_mode='gpu',
+                  mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine
+                  max_epochs=50, # terminate after 50 epochs
+                  )
\ No newline at end of file

From d547400f55536c35d9ce3dea5fcce519704a9aa0 Mon Sep 17 00:00:00 2001
From: kasparvonbeelen
Date: Mon, 18 Jan 2021 15:33:29 +0000
Subject: [PATCH 10/10] update documentation for gloss function

---
 tasks/wsd_gloss.py           | 113 +++++++++++++++---
 utils/classificaton_utils.py | 224 +++++++++++++++++------------------
 2 files changed, 207 insertions(+), 130 deletions(-)

diff --git a/tasks/wsd_gloss.py b/tasks/wsd_gloss.py
index c736c48..cc32b98 100644
--- a/tasks/wsd_gloss.py
+++ b/tasks/wsd_gloss.py
@@ -13,9 +13,19 @@
 # glossbert method ----------------------
 
 def enclose_keyword(row:pd.Series,
-                    enclose_token:str='"'):
+                    enclose_token:str='"') -> str:
     """enclose keyword with specific token to point
-    learner towards to word it has to focus on
+    the learner towards the word it has to focus on. This
+    is part of the weak supervision when learning
+    from context/quotations.
+
+    Arguments:
+        row (pd.Series): row of quotations dataframe
+        enclose_token (str): token used to mark the target expression;
+                    effectively this serves as begin and end token
+
+    Returns:
+        quotation with the target expression marked by `enclose_token`
     """
     sentence = ''
     for i,c in enumerate(row.full_text):
@@ -26,12 +36,26 @@
         sentence+=c
     return sentence
 
-def to_glossbert_format(df:pd.DataFrame):
+def to_glossbert_format(df:pd.DataFrame) -> pd.DataFrame:
     """convert rows in dataframe to GlossBERT format
+
+    Argument:
+        df (pd.DataFrame): quotations dataframe
+
+    Returns:
+        pd.DataFrame with format conforming to the
+        GlossBERT template
     """
 
-    def gloss_string(row:pd.Series, definition:str):
-        """combine gloss with quoations and keyword
+    def gloss_string(row:pd.Series, definition:str) -> str:
+        """combine gloss with quotations and keyword
+
+        Arguments:
+            row (pd.Series): row of dataframe
+            definition (str): definition to use as gloss
+
+        Returns:
+            out_string that combines a quotation/context
+            with a gloss separated by [SEP]
         """
 
         out_string=''
@@ -46,19 +70,28 @@
 
     df['enclosed_quotation'] = df.apply(enclose_keyword, axis=1)
     rows = []
+
+    # create labelled observations: 1 if the context matches the definition,
+    # 0 for the other cases (this method uses weak supervision)
     for _ ,row in df.iterrows():
-        rows.append([gloss_string(row, row.definition), "Yes", row.sense_id])
+        rows.append([gloss_string(row, row.definition), "1", row.sense_id])
        definitions = df[df.lemma==row.lemma].definition.unique()
        for d in definitions:
            if d != row.definition:
-                rows.append([gloss_string(row,d), "No",row.sense_id])
+                rows.append([gloss_string(row,d), "0",row.sense_id])
 
     return pd.DataFrame(rows, columns=['text','label','sense_id'])
 
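+# A hypothetical illustration of the GlossBERT pairs built above (quotation and
+# glosses invented for this sketch): a single quotation for "machine" yields
+#   'the "machine" roared into life [SEP] machine: a mechanical apparatus' -> "1"
+#   'the "machine" roared into life [SEP] machine: a scheme or plot'      -> "0"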
 def create_glossbert_data(lemma:str,
-                          pos:str):
+                          pos:str) -> PosixPath:
+    """Create glossbert data from quotations dataframe
+
+    Arguments:
+        lemma (str): lemma
+        pos (str): part-of-speech
+
+    Return:
+        path as PosixPath to location where data is stored
+    """
 
     df_quotations = pd.read_pickle(f'./data/sfrel_quotations_{lemma}_{pos}.pickle')
@@ -82,7 +115,18 @@
 
     return df_out_path
 
 def train_glossbert(data_folder:PosixPath,
-                    downsample:bool=False):
+                    downsample:bool=False) -> bool:
+    """train a GlossBERT model
+
+    Arguments:
+        data_folder (PosixPath): folder where train/dev and
+                    test set are stored as csv files
+        downsample (bool): if True we use only ten per cent
+                    of the data for training and testing,
+                    primarily used for demo purposes
+
+    Return:
+        return True after training
+    """
 
     column_name_map = {0: "text", 1: "label"}
 
@@ -112,20 +156,29 @@
                   mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine
                   max_epochs=50, # terminate after 50 epochs
                   )
+
+    return True
 
 
 # ---------------------------------------
 # multidataset training -----------------
 
 
-def context_gloss_dfs(df:pd.DataFrame):
-    """convert rows in dataframe to GlossBERT format
+def context_gloss_dfs(df:pd.DataFrame) -> tuple:
+    """split the quotations dataframe into a context/quotation
+    dataframe and a gloss dataframe.
+
+    Arguments:
+        df (pd.DataFrame): quotations dataframe
+
+    Returns:
+        a tuple in the format (context_df, gloss_df)
     """
     df = df[~df.keyword_offset.isnull()]
     df = df[~df.definition.isnull()].reset_index(drop=True)
     df['enclosed_quotation'] = df.apply(enclose_keyword, axis=1)
     df_gl = df[['enclosed_quotation','definition','label']]
-    return df_gl[['enclosed_quotation','label']],df_gl[['definition','label']].drop_duplicates()
+    return (df_gl[['enclosed_quotation','label']],
+            df_gl[['definition','label']].drop_duplicates())
 
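+# Hypothetical illustration of the split (values invented): the context df holds
+# rows such as ('... the "machine" roared into life ...', 1), while the gloss df
+# holds the deduplicated ('a mechanical apparatus', 1) definition/label rows.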
 def create_md_training_data(lemma:str,
                             pos:str,
                             senses:set,
                             relations:list,
                             experiment_id:int=0,
                             eval_mode:str='lemma_etal'):
-    """create data for multidataset training
+    """create data for multidataset training in which
+    we train a model simultaneously on quotations and glosses.
+
+    Arguments:
+        lemma (str): lemma
+        pos (str): part-of-speech
+        senses (set): senses that define the positive class
+        relations (list): relations used for expanding the senses
+        experiment_id (int): integer used to identify the experiment
+        eval_mode (str): evaluation mode (lemma or lemma_etal)
+
     """
     df_train, df_val, df_test = binarize(lemma,
                                         pos,
                                         senses,
                                         relations,
                                         strict_filter=True,
                                         start=1700,
                                         end=2000,
                                         eval_mode=eval_mode)
@@ -166,7 +229,7 @@
                                         test_size=0.1,
                                         random_state=42,
                                         shuffle=True,
-                                        stratify=df_train[['label']] # bug here, try to do the stratification better
+                                        stratify=df_train[['label']]
                                         ) # 2nd
 
            df_train.to_csv(df_out_path / 'train.csv', index = False, sep='\t')
            df_val.to_csv(df_out_path / 'dev.csv', index = False, sep='\t')
            df_test.to_csv(df_out_path / 'test.csv', index = False, sep='\t')
 
 def train_gloss_and_context(lemma:str,
                             pos:str,
                             experiment_id:int=0,
                             data_folder:PosixPath=Path("./data/training_data_md"),
-                            downsample:bool=False):
+                            downsample:bool=False) -> bool:
+    """fine-tune a transformer model on both the context and the gloss
+
+    Arguments:
+        lemma (str): lemma
+        pos (str): part-of-speech
+        experiment_id (int): integer used to identify experiment
+        data_folder (PosixPath): main folder for storing the
+                    context and gloss folders
+        downsample (bool): if True we use only 10% of the data
+                    for training and testing
+
+    Returns:
+        returns True after model has finished training
+    """
+
     column_name_map = {0: "text", 1: "label"}
@@ -212,4 +290,5 @@
                   embeddings_storage_mode='gpu',
                   mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine
                   max_epochs=50, # terminate after 50 epochs
-                  )
\ No newline at end of file
+                  )
+    return True
\ No newline at end of file
diff --git a/utils/classificaton_utils.py b/utils/classificaton_utils.py
index 3eb8861..91f8112 100644
--- a/utils/classificaton_utils.py
+++ b/utils/classificaton_utils.py
@@ -10,9 +10,7 @@
 from typing import Union
 from utils.dataset_download import *
 from sklearn.model_selection import train_test_split
-from tasks import wsd
-from utils import nlp_tools
-#import swifter
+
 
 cosine_similiarity = lambda x, target : 1 - cosine(x,target)
 
@@ -310,116 +308,116 @@ def merge_definitions(row):
 
 # Deprecated code
 # To be removed before release
-def eval_lemma(lemma,
-               pos,
-               idx,
-               embedding_methods,
-               start=1760,
-               end=1920,
-               vector_type='vector_bert_base_-1,-2,-3,-4_mean',
-               skip_vectorize=False,
-               train_on_dev=True):
-
-    quotations_path = f"./data/sfrel_quotations_{lemma}_{pos}.pickle"
-
-    if not skip_vectorize:
-        vectorize_target_expressions(quotations_path,embedding_methods)
-
-    lemma_senses = pd.read_pickle(f'./data/lemma_senses_{lemma}_{pos}.pickle')
-    senses = set(lemma_senses[lemma_senses.word_id==f'{lemma}_{pos.lower()}{idx}'].id)
-
-    relations = ['seed','synonym'] # ,'descendant','sibling'
-    eval_mode = "lemma_etal" # lemma or lemma_etal
-
-    wemb_model = Word2Vec.load("/deezy_datadrive/kaspar-playground/dictionary_expansion/HistoricalDictionaryExpansion/models/w2v_004/w2v_words.model")
-    y_true,y_pred_bin_centr, y_pred_ts_bin_centr,y_pred_sense_centr,y_pred_ts_sense_centr, rand, token_overlap,w2v_lesk = [], [],[],[], [], [],[], []
-
-
-    tqdm.pandas()
-
-    for sense in senses:
-
-        print(sense)
-        df_train, df_val, df_test = binarize(lemma,
-                                        pos,
-                                        {sense},
-                                        relations,
-                                        strict_filter=True,
-                                        start=start,
-                                        end=end,
-                                        eval_mode=eval_mode)
-        # no quotations for sense and timeframe
-        if df_train is None: continue
-
-
-
-        y_true.extend(df_test.label.to_list())
-
-
-        if train_on_dev:
-            df_train = pd.concat([df_train, 
df_val], axis=0) - - df_train["nlp_full_text"] = df_train.apply(lambda row: nlp_tools.preprocess(row["full_text"]), axis=1) - - df_val["nlp_full_text"] = df_val.apply(lambda row: nlp_tools.preprocess(row["full_text"]), axis=1) - - df_test["nlp_full_text"] = df_test.apply(lambda row: nlp_tools.preprocess(row["full_text"]), axis=1) - - # random - df_test["random"] = df_test.progress_apply(lambda row: wsd.random_predict(), axis=1) - rand.extend(df_test["random"].to_list()) - - # token overlap - df_selected_senses = generate_definition_df(df_train,lemma,eval_mode=eval_mode) - df_selected_senses["nlp_definition"] = df_selected_senses.apply (lambda row: nlp_tools.preprocess(row["definition"]), axis=1) - df_test["def_tok_overlap_ranking"] = df_test.progress_apply (lambda row: wsd.tok_overlap_ranking(row["nlp_full_text"], df_selected_senses), axis=1) - token_overlap.extend(df_test["def_tok_overlap_ranking"].to_list()) - - #w2v lesk - # Warning: I use a Word2vec model trained on all 19thC BL corpus that is locally stored. - #df_test["w2v_lesk_ranking"] = df_test.progress_apply (lambda row: wsd.w2v_lesk_ranking(row["nlp_full_text"], df_selected_senses, wemb_model), axis=1) - #w2v_lesk.extend(df_test['w2v_lesk_ranking'].to_list()) - - # binary centroid - centroid_vectors = df_train.groupby('label')[vector_type].apply(np.mean,axis=0) - df_test[f"bert_centroid_binary_{vector_type}"] = df_test[vector_type].progress_apply(wsd.bert_binary_centroid_vector, - centroid_vectors = centroid_vectors, - ) - y_pred_bin_centr.extend(df_test[f"bert_centroid_binary_{vector_type}"].to_list()) - #results[f"bert_centroid_binary_{vector_type}_{sense}"] = (wsd.eval(f"bert_centroid_binary_{vector_type}",df_test),len(df_test)) - - # binary centroid time sensitive - df_test[f"bert_ts_centroid_binary_{vector_type}"] = df_test.progress_apply(wsd.bert_ts_binary_centroid_vector, df_train=df_train, axis=1) - y_pred_ts_bin_centr.extend(df_test[f"bert_ts_centroid_binary_{vector_type}"].to_list()) - - #results[f"bert_ts_centroid_binary_{vector_type}_{sense}"] = (wsd.eval(f"bert_ts_centroid_binary_{vector_type}",df_test),len(df_test)) - - # sense level centroid - senseid2label = dict(df_test[['sense_id','label']].values) - df_test[f"bert_centroid_sense_{vector_type}"] = df_test.progress_apply(wsd.bert_sense_centroid_vector, - senseid2label= senseid2label, - vector_col=vector_type, - df_train = df_train, axis=1) - - y_pred_sense_centr.extend(df_test[f"bert_centroid_sense_{vector_type}"].to_list()) - - df_test[f"bert_ts_centroid_sense_{vector_type}"] = df_test.progress_apply(wsd.bert_ts_sense_centroid_vector, - senseid2label= senseid2label, - vector_col=vector_type, - df_train = df_train, axis=1) - - y_pred_ts_sense_centr.extend(df_test[f"bert_centroid_sense_{vector_type}"].to_list()) - #results[f"bert_centroid_sense_{vector_type}_{sense}"] = (wsd.eval(f"bert_centroid_sense_{vector_type}",df_test),len(df_test)) - # semaxis - #centroid_vectors = df_train.groupby('label')[vector_type].apply(np.mean,axis=0) - #sem_axis = centroid_vectors[1] - centroid_vectors[0] - #df_test[f"bert_semaxis_{vector_type}"] = df_test[vector_type].progress_apply(wsd.bert_semaxis_vector, sem_axis=sem_axis, return_label=True, threshold=.0) - #y_pred_semaxis.extend(df_test[f"bert_semaxis_{vector_type}"].to_list()) - #results[f"bert_semaxis_{vector_type}_{sense}"] = (wsd.eval(f"bert_semaxis_{vector_type}",df_test),len(df_test)) - - #df_test.to_pickle(f'./data/results/{lemma}_{pos}_{sense}.results') - - return y_true, 
y_pred_bin_centr,y_pred_ts_bin_centr,y_pred_sense_centr,y_pred_ts_sense_centr, rand, token_overlap, w2v_lesk +#def eval_lemma(lemma, +# pos, +# idx, +# embedding_methods, +# start=1760, +# end=1920, +# vector_type='vector_bert_base_-1,-2,-3,-4_mean', +# skip_vectorize=False, +# train_on_dev=True): +# +# quotations_path = f"./data/sfrel_quotations_{lemma}_{pos}.pickle" +# +# if not skip_vectorize: +# vectorize_target_expressions(quotations_path,embedding_methods) +# +# lemma_senses = pd.read_pickle(f'./data/lemma_senses_{lemma}_{pos}.pickle') +# senses = set(lemma_senses[lemma_senses.word_id==f'{lemma}_{pos.lower()}{idx}'].id) +# +# relations = ['seed','synonym'] # ,'descendant','sibling' +# eval_mode = "lemma_etal" # lemma or lemma_etal +# +# wemb_model = Word2Vec.load("/deezy_datadrive/kaspar-playground/dictionary_expansion/HistoricalDictionaryExpansion/models/w2v_004/w2v_words.model") +# y_true,y_pred_bin_centr, y_pred_ts_bin_centr,y_pred_sense_centr,y_pred_ts_sense_centr, rand, token_overlap,w2v_lesk = [], [],[],[], [], [],[], [] +# +# +# tqdm.pandas() +# +# for sense in senses: +# +# print(sense) +# df_train, df_val, df_test = binarize(lemma, +# pos, +# {sense}, +# relations, +# strict_filter=True, +# start=start, +# end=end, +# eval_mode=eval_mode) +# # no quotations for sense and timeframe +# if df_train is None: continue +# +# +# +# y_true.extend(df_test.label.to_list()) +# +# +# if train_on_dev: +# df_train = pd.concat([df_train, df_val], axis=0) +# +# df_train["nlp_full_text"] = df_train.apply(lambda row: nlp_tools.preprocess(row["full_text"]), axis=1) +# +# df_val["nlp_full_text"] = df_val.apply(lambda row: nlp_tools.preprocess(row["full_text"]), axis=1) +# +# df_test["nlp_full_text"] = df_test.apply(lambda row: nlp_tools.preprocess(row["full_text"]), axis=1) +# +# # random +# df_test["random"] = df_test.progress_apply(lambda row: wsd.random_predict(), axis=1) +# rand.extend(df_test["random"].to_list()) +# +# # token overlap +# df_selected_senses = generate_definition_df(df_train,lemma,eval_mode=eval_mode) +# df_selected_senses["nlp_definition"] = df_selected_senses.apply (lambda row: nlp_tools.preprocess(row["definition"]), axis=1) +# df_test["def_tok_overlap_ranking"] = df_test.progress_apply (lambda row: wsd.tok_overlap_ranking(row["nlp_full_text"], df_selected_senses), axis=1) +# token_overlap.extend(df_test["def_tok_overlap_ranking"].to_list()) +# +# #w2v lesk +# # Warning: I use a Word2vec model trained on all 19thC BL corpus that is locally stored. 
+# #df_test["w2v_lesk_ranking"] = df_test.progress_apply (lambda row: wsd.w2v_lesk_ranking(row["nlp_full_text"], df_selected_senses, wemb_model), axis=1) +# #w2v_lesk.extend(df_test['w2v_lesk_ranking'].to_list()) +# +# # binary centroid +# centroid_vectors = df_train.groupby('label')[vector_type].apply(np.mean,axis=0) +# df_test[f"bert_centroid_binary_{vector_type}"] = df_test[vector_type].progress_apply(wsd.bert_binary_centroid_vector, +# centroid_vectors = centroid_vectors, +# ) +# y_pred_bin_centr.extend(df_test[f"bert_centroid_binary_{vector_type}"].to_list()) +# #results[f"bert_centroid_binary_{vector_type}_{sense}"] = (wsd.eval(f"bert_centroid_binary_{vector_type}",df_test),len(df_test)) +# +# # binary centroid time sensitive +# df_test[f"bert_ts_centroid_binary_{vector_type}"] = df_test.progress_apply(wsd.bert_ts_binary_centroid_vector, df_train=df_train, axis=1) +# y_pred_ts_bin_centr.extend(df_test[f"bert_ts_centroid_binary_{vector_type}"].to_list()) +# +# #results[f"bert_ts_centroid_binary_{vector_type}_{sense}"] = (wsd.eval(f"bert_ts_centroid_binary_{vector_type}",df_test),len(df_test)) +# +# # sense level centroid +# senseid2label = dict(df_test[['sense_id','label']].values) +# df_test[f"bert_centroid_sense_{vector_type}"] = df_test.progress_apply(wsd.bert_sense_centroid_vector, +# senseid2label= senseid2label, +# vector_col=vector_type, +# df_train = df_train, axis=1) +# +# y_pred_sense_centr.extend(df_test[f"bert_centroid_sense_{vector_type}"].to_list()) +# +# df_test[f"bert_ts_centroid_sense_{vector_type}"] = df_test.progress_apply(wsd.bert_ts_sense_centroid_vector, +# senseid2label= senseid2label, +# vector_col=vector_type, +# df_train = df_train, axis=1) +# +# y_pred_ts_sense_centr.extend(df_test[f"bert_centroid_sense_{vector_type}"].to_list()) +# #results[f"bert_centroid_sense_{vector_type}_{sense}"] = (wsd.eval(f"bert_centroid_sense_{vector_type}",df_test),len(df_test)) +# # semaxis +# #centroid_vectors = df_train.groupby('label')[vector_type].apply(np.mean,axis=0) +# #sem_axis = centroid_vectors[1] - centroid_vectors[0] +# #df_test[f"bert_semaxis_{vector_type}"] = df_test[vector_type].progress_apply(wsd.bert_semaxis_vector, sem_axis=sem_axis, return_label=True, threshold=.0) +# #y_pred_semaxis.extend(df_test[f"bert_semaxis_{vector_type}"].to_list()) +# #results[f"bert_semaxis_{vector_type}_{sense}"] = (wsd.eval(f"bert_semaxis_{vector_type}",df_test),len(df_test)) +# +# #df_test.to_pickle(f'./data/results/{lemma}_{pos}_{sense}.results') +# +# return y_true, y_pred_bin_centr,y_pred_ts_bin_centr,y_pred_sense_centr,y_pred_ts_sense_centr, rand, token_overlap, w2v_lesk def bert_avg_quot_nn_wsd(query_vector: np.array,