diff --git a/homeworks_advanced/assignment1_04_conv_cvae/asssignment1_04_convolutional_cvae.ipynb b/homeworks_advanced/assignment1_04_conv_cvae/assignment1_04_convolutional_cvae.ipynb similarity index 100% rename from homeworks_advanced/assignment1_04_conv_cvae/asssignment1_04_convolutional_cvae.ipynb rename to homeworks_advanced/assignment1_04_conv_cvae/assignment1_04_convolutional_cvae.ipynb diff --git a/homeworks_advanced/extra_Lab_QA/LICENSE b/homeworks_advanced/extra_Lab_QA/LICENSE new file mode 100644 index 000000000..e1b9ab003 --- /dev/null +++ b/homeworks_advanced/extra_Lab_QA/LICENSE @@ -0,0 +1,21 @@ +The MIT License + +Copyright (c) 2019 Christopher Chute http://chrischute.com + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/homeworks_advanced/extra_Lab_QA/README.md b/homeworks_advanced/extra_Lab_QA/README.md new file mode 100644 index 000000000..8a16c1741 --- /dev/null +++ b/homeworks_advanced/extra_Lab_QA/README.md @@ -0,0 +1,34 @@ +#### Extra Lab: QA system + +In this homework your goal is to build a QA system for the Russian language using the [SberQuAD dataset](https://arxiv.org/pdf/1912.09723.pdf). The preprocessing code and baseline solution (BiDAF) are a slightly adapted version of the [Stanford CS224n Starter code](https://github.com/chrischute/squad). + +The starting point of this assignment is the `SberQuAD_preprocessing_and_problem_statement.ipynb` notebook. +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/ml-mipt/blob/advanced_f20/homeworks_advanced/extra_Lab_QA/SberQuAD_preprocessing_and_problem_statement.ipynb) + + +Next come the original instructions from the https://github.com/chrischute/squad repository. + +P.s. Downgrading PyTorch is not required, starter code works fine on PyTorch 1.4 +P.p.s. If you are running in Colab, mount your Google Drive and store the checkpoints/word vectors there. [Official instruction](https://colab.research.google.com/notebooks/io.ipynb), [Habr post](https://habr.com/ru/post/348058/). Restarting the kernel after you have finished the preprocessing (and saved the data to your disk) might be a good idea to release the memory. + +#### Setup + +1. Make sure you have [Miniconda](https://docs.conda.io/en/latest/miniconda.html) installed + 1. Conda is a package manager that sandboxes your project’s dependencies in a virtual environment + 2. Miniconda contains Conda and its dependencies with no extra packages by default (as opposed to Anaconda, which installs some extra packages) + +2. cd into src, run `conda env create -f environment.yml` + 1. This creates a Conda environment called `squad` + +3. Run `source activate squad` + 1. 
This activates the `squad` environment + 2. Do this each time you want to write/test your code + +4. Run `python setup.py` + 1. This downloads SQuAD 2.0 training and dev sets, as well as the GloVe 300-dimensional word vectors (840B) + 2. This also pre-processes the dataset for efficient data loading + 3. For a MacBook Pro on the Stanford network, `setup.py` takes around 30 minutes total + +5. Browse the code in `train.py` + 1. The `train.py` script is the entry point for training a model. It reads command-line arguments, loads the SQuAD dataset, and trains a model. + 2. You may find it helpful to browse the arguments provided by the starter code. Either look directly at the `parser.add_argument` lines in the source code, or run `python train.py -h`. diff --git a/homeworks_advanced/extra_Lab_QA/SberQuAD_preprocessing_and_problem_statement.ipynb b/homeworks_advanced/extra_Lab_QA/SberQuAD_preprocessing_and_problem_statement.ipynb new file mode 100644 index 000000000..21beac000 --- /dev/null +++ b/homeworks_advanced/extra_Lab_QA/SberQuAD_preprocessing_and_problem_statement.ipynb @@ -0,0 +1,360 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Credits: the provided initial code is an adaptation of the [Starter code for Stanford CS224n default final project on SQuAD 2.0](https://github.com/chrischute/squad) which is shared under the MIT License. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook does initial preprocessing for the SberQuAD dataset and will give you the starting point in this assignment. If it looks too complex and/or time/resource-expensive, you may stick to homework05 as well." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Preprocessing\n", + "This code is a slightly modified version of the code from `setup.py`. If you want to work with the SQuAD dataset, stick to the original instructions from the https://github.com/chrischute/squad repository." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If running on Colab, uncomment the following lines \n", + "\n", + "# !wget https://raw.githubusercontent.com/neychev/made_nlp_course/master/homeworks/homework04/args.py -nc\n", + "# !wget https://raw.githubusercontent.com/neychev/made_nlp_course/master/homeworks/homework04/layers.py -nc\n", + "# !wget https://raw.githubusercontent.com/neychev/made_nlp_course/master/homeworks/homework04/models.py -nc\n", + "# !wget https://raw.githubusercontent.com/neychev/made_nlp_course/master/homeworks/homework04/setup.py -nc\n", + "# !wget https://raw.githubusercontent.com/neychev/made_nlp_course/master/homeworks/homework04/test.py -nc\n", + "# !wget https://raw.githubusercontent.com/neychev/made_nlp_course/master/homeworks/homework04/train.py -nc\n", + "# !wget https://raw.githubusercontent.com/neychev/made_nlp_course/master/homeworks/homework04/util.py -nc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If running on Colab, uncomment the following lines \n", + "\n", + "# !pip install ujson\n", + "# !pip install tensorboardX\n", + "# !pip install pymorphy2==0.8" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Train a model on SQuAD.\n", + "\n", + "Author:\n", + " Chris Chute (chute@stanford.edu)\n", + "\"\"\"\n", + "\n", + "import numpy as np\n", + "import random\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "import torch.optim as optim\n", + "import torch.optim.lr_scheduler as sched\n", + "import torch.utils.data as data\n", + "import util\n", + "\n", + "from args import get_train_args\n", + "from collections import OrderedDict\n", + "from json import dumps\n", + "from models import BiDAF\n", + "from tensorboardX import SummaryWriter\n", + "from tqdm import tqdm\n", + "from 
ujson import load as json_load\n", + "from util import collate_fn, SQuAD" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "Path(\"./data\").mkdir(parents=True, exist_ok=True)\n", + "Path(\"./save\").mkdir(parents=True, exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Downloading the SberQuAD data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!wget http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz -nc -O ./data/sber_squad_clean-v1.1.tar.gz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! tar -xzvf ./data/sber_squad_clean-v1.1.tar.gz\n", + "! mv train-v1.1.json data\n", + "! mv dev-v1.1.json data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Downloading the word vectors (this may take a while)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! 
wget http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.vec -nc -O ./data/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.vec" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And finally the preprocessing for the SberQuAD dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_file = './data/train-v1.1.json'\n", + "dev_file = './data/dev-v1.1.json'\n", + "glove_file = './data/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.vec'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from setup import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Uncomment this cell if needed\n", + "# !pip install pymorphy2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nlp = spacy.blank(\"ru\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following cell may take a while (usually 10 minutes or less)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Process training set and use it to decide on the word/character vocabularies\n", + "word_counter, char_counter = Counter(), Counter()\n", + "train_examples, train_eval = process_file(train_file, \"train\", word_counter, char_counter, nlp)\n", + "word_emb_mat, word2idx_dict = get_embedding(\n", + " word_counter, 'word', emb_file=glove_file, vec_size=300, num_vectors=1560132)\n", + "char_emb_mat, char2idx_dict = get_embedding(\n", + " char_counter, 'char', emb_file=None, vec_size=64)\n", + "\n", + "\n", + "dev_examples, dev_eval = process_file(dev_file, \"dev\", word_counter, char_counter, nlp)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we have the preprocessed data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_record_file = './data/train.npz'\n", + "dev_record_file = './data/dev.npz'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from args import add_common_args, get_setup_args" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Retreiving the default arguments for the preprocessing script\n", + "_args = get_setup_args(bypass=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "_args" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "build_features(_args, train_examples, \"train\", train_record_file, word2idx_dict, char2idx_dict)\n", + "dev_meta = build_features(_args, dev_examples, \"dev\", dev_record_file, word2idx_dict, char2idx_dict)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"save(_args.word_emb_file, word_emb_mat, message=\"word embedding\")\n", + "save(_args.char_emb_file, char_emb_mat, message=\"char embedding\")\n", + "save(_args.train_eval_file, train_eval, message=\"train eval\")\n", + "save(_args.dev_eval_file, dev_eval, message=\"dev eval\")\n", + "save(_args.word2idx_file, word2idx_dict, message=\"word dictionary\")\n", + "save(_args.char2idx_file, char2idx_dict, message=\"char dictionary\")\n", + "save(_args.dev_meta_file, dev_meta, message=\"dev meta\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. The experiment\n", + "\n", + "Now you are almost ready to go. You may follow these steps to begin (or just start your experiments here).\n", + "\n", + "1. Try running the `train.py` script from the console (or via `!`) (default command-line arguments are OK for a start). It will run the BiDAF model on the preprocessed data. Set the `--use_squad_v2` flag to False (SberQuAD is similar to SQuAD v1.1).\n", + "\n", + "Example code (be careful with the path and the names of the variables):\n", + "```\n", + "python train.py --name first_run_on_sberquad --use_squad_v2 False\n", + "```\n", + "\n", + "2. 
After it finishes (this might take 1-3 hours depending on the hardware), evaluate your model on the `dev` set and measure the quality.\n", + "Example code (be careful with the path and the names of the variables):\n", + "```\n", + " python test.py --split dev --load_path ./save/train/first_run_on_sberquad-02/best.pth.tar --name best_evaluation_experiment\n", + "```\n", + "The result should be similar to the following:\n", + "```\n", + ">>> Dev NLL: 02.47, F1: 75.62, EM: 55.73, AvNA: 99.42\n", + "```\n", + "\n", + "The [DeepPavlov's RuBERT](http://docs.deeppavlov.ai/en/master/features/models/squad.html) achieves $F1 = 84.60\\pm0.11$ and $EM = 66.30\\pm0.24$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Here comes your quest: try to improve the quality of this QA system. \n", + "\n", + "This is a very creative assignment. It is all about experimenting, trying different approaches (and a lot of computations). But if you wish to stick to some numbers, try to increase F1 by at least $5$ points.\n", + "\n", + "Here are some ideas that might help you on your way:\n", + "* Try adapting the optimization hyperparameters/network structure to the Russian language (the baseline is designed for the English SQuAD dataset).\n", + "* Incorporating the additional information about the data (like PoS tags) might be a good idea.\n", + "* __Distilling the knowledge from a pre-trained RuBERT__ (e.g. try to use the predictions of the model we've discussed in `week10` as soft targets).\n", + "* Or anything else.\n", + "\n", + "\n", + "And, first of all, read the initial code carefully.\n", + "\n", + "\n", + "Good luck! 
Feel free to share your results :)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py3_research env", + "language": "python", + "name": "py3_research" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.1" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/homeworks_advanced/extra_Lab_QA/args.py b/homeworks_advanced/extra_Lab_QA/args.py new file mode 100644 index 000000000..47a59d157 --- /dev/null +++ b/homeworks_advanced/extra_Lab_QA/args.py @@ -0,0 +1,247 @@ +"""Command-line arguments for setup.py, train.py, test.py. + +Author: + Chris Chute (chute@stanford.edu) +""" + +import argparse + + +def get_setup_args(bypass=False): + """Get arguments needed in setup.py.""" + parser = argparse.ArgumentParser('Download and pre-process SQuAD') + + add_common_args(parser) + + parser.add_argument('--train_url', + type=str, + default='https://github.com/chrischute/squad/data/train-v2.0.json') + parser.add_argument('--dev_url', + type=str, + default='https://github.com/chrischute/squad/data/dev-v2.0.json') + parser.add_argument('--test_url', + type=str, + default='https://github.com/chrischute/squad/data/test-v2.0.json') + parser.add_argument('--glove_url', + type=str, + default='http://nlp.stanford.edu/data/glove.840B.300d.zip') + parser.add_argument('--dev_meta_file', + type=str, + default='./data/dev_meta.json') + parser.add_argument('--test_meta_file', + type=str, + default='./data/test_meta.json') + parser.add_argument('--word2idx_file', + type=str, + default='./data/word2idx.json') + parser.add_argument('--char2idx_file', + type=str, + default='./data/char2idx.json') + parser.add_argument('--answer_file', + type=str, + default='./data/answer.json') + parser.add_argument('--para_limit', + type=int, + default=400, + help='Max number of words in a 
paragraph') + parser.add_argument('--ques_limit', + type=int, + default=50, + help='Max number of words to keep from a question') + parser.add_argument('--test_para_limit', + type=int, + default=1000, + help='Max number of words in a paragraph at test time') + parser.add_argument('--test_ques_limit', + type=int, + default=100, + help='Max number of words in a question at test time') + parser.add_argument('--char_dim', + type=int, + default=64, + help='Size of char vectors (char-level embeddings)') + parser.add_argument('--glove_dim', + type=int, + default=300, + help='Size of GloVe word vectors to use') + parser.add_argument('--glove_num_vecs', + type=int, + default=2196017, + help='Number of GloVe vectors') + parser.add_argument('--ans_limit', + type=int, + default=30, + help='Max number of words in a training example answer') + parser.add_argument('--char_limit', + type=int, + default=16, + help='Max number of chars to keep from a word') + parser.add_argument('--include_test_examples', + type=lambda s: s.lower().startswith('t'), + default=True, + help='Process examples from the test set') + + if bypass: + args = parser.parse_args('') + else: + args = parser.parse_args() + + return args + + +def get_train_args(): + """Get arguments needed in train.py.""" + parser = argparse.ArgumentParser('Train a model on SQuAD') + + add_common_args(parser) + add_train_test_args(parser) + + parser.add_argument('--eval_steps', + type=int, + default=50000, + help='Number of steps between successive evaluations.') + parser.add_argument('--lr', + type=float, + default=0.5, + help='Learning rate.') + parser.add_argument('--l2_wd', + type=float, + default=0, + help='L2 weight decay.') + parser.add_argument('--num_epochs', + type=int, + default=30, + help='Number of epochs for which to train. 
Negative means forever.') + parser.add_argument('--drop_prob', + type=float, + default=0.2, + help='Probability of zeroing an activation in dropout layers.') + parser.add_argument('--metric_name', + type=str, + default='F1', + choices=('NLL', 'EM', 'F1'), + help='Name of dev metric to determine best checkpoint.') + parser.add_argument('--max_checkpoints', + type=int, + default=5, + help='Maximum number of checkpoints to keep on disk.') + parser.add_argument('--max_grad_norm', + type=float, + default=5.0, + help='Maximum gradient norm for gradient clipping.') + parser.add_argument('--seed', + type=int, + default=224, + help='Random seed for reproducibility.') + parser.add_argument('--ema_decay', + type=float, + default=0.999, + help='Decay rate for exponential moving average of parameters.') + + args = parser.parse_args() + + if args.metric_name == 'NLL': + # Best checkpoint is the one that minimizes negative log-likelihood + args.maximize_metric = False + elif args.metric_name in ('EM', 'F1'): + # Best checkpoint is the one that maximizes EM or F1 + args.maximize_metric = True + else: + raise ValueError(f'Unrecognized metric name: "{args.metric_name}"') + + return args + + +def get_test_args(): + """Get arguments needed in test.py.""" + parser = argparse.ArgumentParser('Test a trained model on SQuAD') + + add_common_args(parser) + add_train_test_args(parser) + + parser.add_argument('--split', + type=str, + default='dev', + choices=('train', 'dev', 'test'), + help='Split to use for testing.') + parser.add_argument('--sub_file', + type=str, + default='submission.csv', + help='Name for submission file.') + + # Require load_path for test.py + args = parser.parse_args() + if not args.load_path: + raise argparse.ArgumentError('Missing required argument --load_path') + + return args + + +def add_common_args(parser): + """Add arguments common to all 3 scripts: setup.py, train.py, test.py""" + parser.add_argument('--train_record_file', + type=str, + 
default='./data/train.npz') + parser.add_argument('--dev_record_file', + type=str, + default='./data/dev.npz') + parser.add_argument('--test_record_file', + type=str, + default='./data/test.npz') + parser.add_argument('--word_emb_file', + type=str, + default='./data/word_emb.json') + parser.add_argument('--char_emb_file', + type=str, + default='./data/char_emb.json') + parser.add_argument('--train_eval_file', + type=str, + default='./data/train_eval.json') + parser.add_argument('--dev_eval_file', + type=str, + default='./data/dev_eval.json') + parser.add_argument('--test_eval_file', + type=str, + default='./data/test_eval.json') + + +def add_train_test_args(parser): + """Add arguments common to train.py and test.py""" + parser.add_argument('--name', + '-n', + type=str, + required=True, + help='Name to identify training or test run.') + parser.add_argument('--max_ans_len', + type=int, + default=15, + help='Maximum length of a predicted answer.') + parser.add_argument('--num_workers', + type=int, + default=4, + help='Number of sub-processes to use per data loader.') + parser.add_argument('--save_dir', + type=str, + default='./save/', + help='Base directory for saving information.') + parser.add_argument('--batch_size', + type=int, + default=64, + help='Batch size per GPU. 
Scales automatically when \ + multiple GPUs are available.') + parser.add_argument('--use_squad_v2', + type=lambda s: s.lower().startswith('t'), + default=True, + help='Whether to use SQuAD 2.0 (unanswerable) questions.') + parser.add_argument('--hidden_size', + type=int, + default=100, + help='Number of features in encoder hidden layers.') + parser.add_argument('--num_visuals', + type=int, + default=10, + help='Number of examples to visualize in TensorBoard.') + parser.add_argument('--load_path', + type=str, + default=None, + help='Path to load as a model checkpoint.') diff --git a/homeworks_advanced/extra_Lab_QA/layers.py b/homeworks_advanced/extra_Lab_QA/layers.py new file mode 100644 index 000000000..6859e4d39 --- /dev/null +++ b/homeworks_advanced/extra_Lab_QA/layers.py @@ -0,0 +1,222 @@ +"""Assortment of layers for use in models.py. + +Author: + Chris Chute (chute@stanford.edu) +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence +from util import masked_softmax + + +class Embedding(nn.Module): + """Embedding layer used by BiDAF, without the character-level component. + + Word-level embeddings are further refined using a 2-layer Highway Encoder + (see `HighwayEncoder` class for details). + + Args: + word_vectors (torch.Tensor): Pre-trained word vectors. + hidden_size (int): Size of hidden activations. 
+ drop_prob (float): Probability of zero-ing out activations + """ + def __init__(self, word_vectors, hidden_size, drop_prob): + super(Embedding, self).__init__() + self.drop_prob = drop_prob + self.embed = nn.Embedding.from_pretrained(word_vectors) + self.proj = nn.Linear(word_vectors.size(1), hidden_size, bias=False) + self.hwy = HighwayEncoder(2, hidden_size) + + def forward(self, x): + emb = self.embed(x) # (batch_size, seq_len, embed_size) + emb = F.dropout(emb, self.drop_prob, self.training) + emb = self.proj(emb) # (batch_size, seq_len, hidden_size) + emb = self.hwy(emb) # (batch_size, seq_len, hidden_size) + + return emb + + +class HighwayEncoder(nn.Module): + """Encode an input sequence using a highway network. + + Based on the paper: + "Highway Networks" + by Rupesh Kumar Srivastava, Klaus Greff, Jürgen Schmidhuber + (https://arxiv.org/abs/1505.00387). + + Args: + num_layers (int): Number of layers in the highway encoder. + hidden_size (int): Size of hidden activations. + """ + def __init__(self, num_layers, hidden_size): + super(HighwayEncoder, self).__init__() + self.transforms = nn.ModuleList([nn.Linear(hidden_size, hidden_size) + for _ in range(num_layers)]) + self.gates = nn.ModuleList([nn.Linear(hidden_size, hidden_size) + for _ in range(num_layers)]) + + def forward(self, x): + for gate, transform in zip(self.gates, self.transforms): + # Shapes of g, t, and x are all (batch_size, seq_len, hidden_size) + g = torch.sigmoid(gate(x)) + t = F.relu(transform(x)) + x = g * t + (1 - g) * x + + return x + + +class RNNEncoder(nn.Module): + """General-purpose layer for encoding a sequence using a bidirectional RNN. + + Encoded output is the RNN's hidden state at each position, which + has shape `(batch_size, seq_len, hidden_size * 2)`. + + Args: + input_size (int): Size of a single timestep in the input. + hidden_size (int): Size of the RNN hidden state. + num_layers (int): Number of layers of RNN cells to use. 
+ drop_prob (float): Probability of zero-ing out activations. + """ + def __init__(self, + input_size, + hidden_size, + num_layers, + drop_prob=0.): + super(RNNEncoder, self).__init__() + self.drop_prob = drop_prob + self.rnn = nn.LSTM(input_size, hidden_size, num_layers, + batch_first=True, + bidirectional=True, + dropout=drop_prob if num_layers > 1 else 0.) + + def forward(self, x, lengths): + # Save original padded length for use by pad_packed_sequence + orig_len = x.size(1) + + # Sort by length and pack sequence for RNN + lengths, sort_idx = lengths.sort(0, descending=True) + x = x[sort_idx] # (batch_size, seq_len, input_size) + x = pack_padded_sequence(x, lengths, batch_first=True) + + # Apply RNN + x, _ = self.rnn(x) # (batch_size, seq_len, 2 * hidden_size) + + # Unpack and reverse sort + x, _ = pad_packed_sequence(x, batch_first=True, total_length=orig_len) + _, unsort_idx = sort_idx.sort(0) + x = x[unsort_idx] # (batch_size, seq_len, 2 * hidden_size) + + # Apply dropout (RNN applies dropout after all but the last layer) + x = F.dropout(x, self.drop_prob, self.training) + + return x + + +class BiDAFAttention(nn.Module): + """Bidirectional attention originally used by BiDAF. + + Bidirectional attention computes attention in two directions: + The context attends to the query and the query attends to the context. + The output of this layer is the concatenation of [context, c2q_attention, + context * c2q_attention, context * q2c_attention]. This concatenation allows + the attention vector at each timestep, along with the embeddings from + previous layers, to flow through the attention layer to the modeling layer. + The output has shape (batch_size, context_len, 8 * hidden_size). + + Args: + hidden_size (int): Size of hidden activations. + drop_prob (float): Probability of zero-ing out activations. 
+ """ + def __init__(self, hidden_size, drop_prob=0.1): + super(BiDAFAttention, self).__init__() + self.drop_prob = drop_prob + self.c_weight = nn.Parameter(torch.zeros(hidden_size, 1)) + self.q_weight = nn.Parameter(torch.zeros(hidden_size, 1)) + self.cq_weight = nn.Parameter(torch.zeros(1, 1, hidden_size)) + for weight in (self.c_weight, self.q_weight, self.cq_weight): + nn.init.xavier_uniform_(weight) + self.bias = nn.Parameter(torch.zeros(1)) + + def forward(self, c, q, c_mask, q_mask): + batch_size, c_len, _ = c.size() + q_len = q.size(1) + s = self.get_similarity_matrix(c, q) # (batch_size, c_len, q_len) + c_mask = c_mask.view(batch_size, c_len, 1) # (batch_size, c_len, 1) + q_mask = q_mask.view(batch_size, 1, q_len) # (batch_size, 1, q_len) + s1 = masked_softmax(s, q_mask, dim=2) # (batch_size, c_len, q_len) + s2 = masked_softmax(s, c_mask, dim=1) # (batch_size, c_len, q_len) + + # (bs, c_len, q_len) x (bs, q_len, hid_size) => (bs, c_len, hid_size) + a = torch.bmm(s1, q) + # (bs, c_len, c_len) x (bs, c_len, hid_size) => (bs, c_len, hid_size) + b = torch.bmm(torch.bmm(s1, s2.transpose(1, 2)), c) + + x = torch.cat([c, a, c * a, c * b], dim=2) # (bs, c_len, 4 * hid_size) + + return x + + def get_similarity_matrix(self, c, q): + """Get the "similarity matrix" between context and query (using the + terminology of the BiDAF paper). + + A naive implementation as described in BiDAF would concatenate the + three vectors then project the result with a single weight matrix. This + method is a more memory-efficient implementation of the same operation. 
+ + See Also: + Equation 1 in https://arxiv.org/abs/1611.01603 + """ + c_len, q_len = c.size(1), q.size(1) + c = F.dropout(c, self.drop_prob, self.training) # (bs, c_len, hid_size) + q = F.dropout(q, self.drop_prob, self.training) # (bs, q_len, hid_size) + + # Shapes: (batch_size, c_len, q_len) + s0 = torch.matmul(c, self.c_weight).expand([-1, -1, q_len]) + s1 = torch.matmul(q, self.q_weight).transpose(1, 2)\ + .expand([-1, c_len, -1]) + s2 = torch.matmul(c * self.cq_weight, q.transpose(1, 2)) + s = s0 + s1 + s2 + self.bias + + return s + + +class BiDAFOutput(nn.Module): + """Output layer used by BiDAF for question answering. + + Computes a linear transformation of the attention and modeling + outputs, then takes the softmax of the result to get the start pointer. + A bidirectional LSTM is then applied the modeling output to produce `mod_2`. + A second linear+softmax of the attention output and `mod_2` is used + to get the end pointer. + + Args: + hidden_size (int): Hidden size used in the BiDAF model. + drop_prob (float): Probability of zero-ing out activations. 
+ """ + def __init__(self, hidden_size, drop_prob): + super(BiDAFOutput, self).__init__() + self.att_linear_1 = nn.Linear(8 * hidden_size, 1) + self.mod_linear_1 = nn.Linear(2 * hidden_size, 1) + + self.rnn = RNNEncoder(input_size=2 * hidden_size, + hidden_size=hidden_size, + num_layers=1, + drop_prob=drop_prob) + + self.att_linear_2 = nn.Linear(8 * hidden_size, 1) + self.mod_linear_2 = nn.Linear(2 * hidden_size, 1) + + def forward(self, att, mod, mask): + # Shapes: (batch_size, seq_len, 1) + logits_1 = self.att_linear_1(att) + self.mod_linear_1(mod) + mod_2 = self.rnn(mod, mask.sum(-1)) + logits_2 = self.att_linear_2(att) + self.mod_linear_2(mod_2) + + # Shapes: (batch_size, seq_len) + log_p1 = masked_softmax(logits_1.squeeze(), mask, log_softmax=True) + log_p2 = masked_softmax(logits_2.squeeze(), mask, log_softmax=True) + + return log_p1, log_p2 diff --git a/homeworks_advanced/extra_Lab_QA/models.py b/homeworks_advanced/extra_Lab_QA/models.py new file mode 100644 index 000000000..3487ea20e --- /dev/null +++ b/homeworks_advanced/extra_Lab_QA/models.py @@ -0,0 +1,72 @@ +"""Top-level model classes. + +Author: + Chris Chute (chute@stanford.edu) +""" + +import layers +import torch +import torch.nn as nn + + +class BiDAF(nn.Module): + """Baseline BiDAF model for SQuAD. + + Based on the paper: + "Bidirectional Attention Flow for Machine Comprehension" + by Minjoon Seo, Aniruddha Kembhavi, Ali Farhadi, Hannaneh Hajishirzi + (https://arxiv.org/abs/1611.01603). + + Follows a high-level structure commonly found in SQuAD models: + - Embedding layer: Embed word indices to get word vectors. + - Encoder layer: Encode the embedded sequence. + - Attention layer: Apply an attention mechanism to the encoded sequence. + - Model encoder layer: Encode the sequence again. + - Output layer: Simple layer (e.g., fc + softmax) to get final outputs. + + Args: + word_vectors (torch.Tensor): Pre-trained word vectors. + hidden_size (int): Number of features in the hidden state at each layer. 
+ drop_prob (float): Dropout probability. + """ + def __init__(self, word_vectors, hidden_size, drop_prob=0.): + super(BiDAF, self).__init__() + self.emb = layers.Embedding(word_vectors=word_vectors, + hidden_size=hidden_size, + drop_prob=drop_prob) + + self.enc = layers.RNNEncoder(input_size=hidden_size, + hidden_size=hidden_size, + num_layers=1, + drop_prob=drop_prob) + + self.att = layers.BiDAFAttention(hidden_size=2 * hidden_size, + drop_prob=drop_prob) + + self.mod = layers.RNNEncoder(input_size=8 * hidden_size, + hidden_size=hidden_size, + num_layers=2, + drop_prob=drop_prob) + + self.out = layers.BiDAFOutput(hidden_size=hidden_size, + drop_prob=drop_prob) + + def forward(self, cw_idxs, qw_idxs): + c_mask = torch.zeros_like(cw_idxs) != cw_idxs + q_mask = torch.zeros_like(qw_idxs) != qw_idxs + c_len, q_len = c_mask.sum(-1), q_mask.sum(-1) + + c_emb = self.emb(cw_idxs) # (batch_size, c_len, hidden_size) + q_emb = self.emb(qw_idxs) # (batch_size, q_len, hidden_size) + + c_enc = self.enc(c_emb, c_len) # (batch_size, c_len, 2 * hidden_size) + q_enc = self.enc(q_emb, q_len) # (batch_size, q_len, 2 * hidden_size) + + att = self.att(c_enc, q_enc, + c_mask, q_mask) # (batch_size, c_len, 8 * hidden_size) + + mod = self.mod(att, c_len) # (batch_size, c_len, 2 * hidden_size) + + out = self.out(att, mod, c_mask) # 2 tensors, each (batch_size, c_len) + + return out diff --git a/homeworks_advanced/extra_Lab_QA/setup.py b/homeworks_advanced/extra_Lab_QA/setup.py new file mode 100644 index 000000000..c270cdf00 --- /dev/null +++ b/homeworks_advanced/extra_Lab_QA/setup.py @@ -0,0 +1,396 @@ +"""Download and pre-process SQuAD and GloVe. 
def download(args):
    """Download (and unzip) the resources required for pre-processing.

    Each entry of the internal `downloads` list is fetched to the path given
    by `url_to_data_path` unless that file already exists. Archives ending in
    `.zip` are extracted into a sibling directory named after the archive
    (without the `.zip` suffix) unless that directory already exists. Finally
    the spaCy `en` model download is triggered in a subprocess.

    Args:
        args: Parsed setup arguments; only `args.glove_url` is read here.
    """
    import sys  # local import: only needed to locate the running interpreter

    downloads = [
        # Can add other downloads here (e.g., other word vectors)
        ('GloVe word vectors', args.glove_url),
    ]

    for name, url in downloads:
        output_path = url_to_data_path(url)
        if not os.path.exists(output_path):
            # urlretrieve does not create missing parent directories.
            parent_dir = os.path.dirname(output_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            print(f'Downloading {name}...')
            download_url(url, output_path)

        if os.path.exists(output_path) and output_path.endswith('.zip'):
            extracted_path = output_path.replace('.zip', '')
            if not os.path.exists(extracted_path):
                print(f'Unzipping {name}...')
                with ZipFile(output_path, 'r') as zip_fh:
                    zip_fh.extractall(extracted_path)

    print('Downloading spacy language model...')
    # Use the interpreter that is running this script so the model lands in
    # the active environment rather than whatever `python` is first on PATH.
    run([sys.executable, '-m', 'spacy', 'download', 'en'])
def process_file(filename, data_type, word_counter, char_counter, nlp):
    """Tokenize a SQuAD-style JSON file and build training/eval examples.

    The file is expected to hold `data -> paragraphs -> (context, qas)`, where
    each `qa` has `question`, `id` and a list of `answers` with `text` and
    `answer_start`.

    Args:
        filename (str): Path to the JSON dataset file.
        data_type (str): Label used only in the progress message.
        word_counter (collections.Counter): Updated in place with token
            counts (context tokens are weighted by the number of questions
            asked about the paragraph).
        char_counter (collections.Counter): Updated in place with character
            counts, weighted the same way.
        nlp: Tokenizer object forwarded to `word_tokenize`.

    Returns:
        tuple: `(examples, eval_examples)`. `examples` is a list of dicts with
        tokens, characters and answer token spans (`y1s`/`y2s`);
        `eval_examples` maps the running example id (as a string) to the raw
        context, question, character spans, answer texts and original uuid.

    Raises:
        IndexError: If an answer does not overlap any context token span.
    """
    print(f"Pre-processing {data_type} examples...")
    examples = []
    eval_examples = {}
    total = 0

    # Decode explicitly as UTF-8 (as `get_embedding` does) so non-ASCII text
    # is read the same way regardless of the platform's default locale.
    with open(filename, "r", encoding="utf-8") as fh:
        source = json.load(fh)

    for article in tqdm(source["data"]):
        for para in article["paragraphs"]:
            context = para["context"].replace(
                "''", '" ').replace("``", '" ')
            context_tokens = word_tokenize(context, nlp)
            context_chars = [list(token) for token in context_tokens]
            spans = convert_idx(context, context_tokens)

            # Every question re-uses the paragraph, so context tokens count
            # once per question.
            num_questions = len(para["qas"])
            for token in context_tokens:
                word_counter[token] += num_questions
                for char in token:
                    char_counter[char] += num_questions

            for qa in para["qas"]:
                total += 1
                ques = qa["question"].replace(
                    "''", '" ').replace("``", '" ')
                ques_tokens = word_tokenize(ques, nlp)
                ques_chars = [list(token) for token in ques_tokens]
                for token in ques_tokens:
                    word_counter[token] += 1
                    for char in token:
                        char_counter[char] += 1

                y1s, y2s = [], []
                answer_texts = []
                for answer in qa["answers"]:
                    answer_text = answer["text"]
                    answer_start = answer['answer_start']
                    answer_end = answer_start + len(answer_text)
                    answer_texts.append(answer_text)
                    # Indices of all tokens whose character span overlaps
                    # the answer's character range.
                    answer_span = [
                        idx for idx, span in enumerate(spans)
                        if not (answer_end <= span[0] or answer_start >= span[1])
                    ]
                    y1s.append(answer_span[0])
                    y2s.append(answer_span[-1])

                examples.append({"context_tokens": context_tokens,
                                 "context_chars": context_chars,
                                 "ques_tokens": ques_tokens,
                                 "ques_chars": ques_chars,
                                 "y1s": y1s,
                                 "y2s": y2s,
                                 "id": total})
                eval_examples[str(total)] = {"context": context,
                                             "question": ques,
                                             "spans": spans,
                                             "answers": answer_texts,
                                             "uuid": qa["id"]}

    print(f"{len(examples)} questions in total")
    return examples, eval_examples
def convert_to_features(args, data, word2idx_dict, char2idx_dict, is_test, nlp=None):
    """Convert one (context, question) pair into padded index arrays.

    Args:
        args: Parsed arguments; reads `para_limit`, `ques_limit`,
            `test_para_limit`, `test_ques_limit` and `char_limit`.
        data (tuple): `(context, question)` raw strings.
        word2idx_dict (dict): Token -> index mapping.
        char2idx_dict (dict): Character -> index mapping.
        is_test (bool): Selects the test-time length limits.
        nlp: Tokenizer forwarded to `word_tokenize`. When omitted, a blank
            English spaCy pipeline is created (same as the script entry
            point); pass one explicitly to avoid rebuilding it per call.

    Returns:
        tuple: `(context_idxs, context_char_idxs, ques_idxs, ques_char_idxs)`
        as zero-padded `np.int32` arrays. Unknown tokens/characters map to 1.

    Raises:
        ValueError: If the context or question has more tokens than allowed.
    """
    if nlp is None:
        # `word_tokenize` requires a tokenizer; previously none was passed,
        # which raised TypeError on every call.
        nlp = spacy.blank("en")

    context, question = data
    context = context.replace("''", '" ').replace("``", '" ')
    question = question.replace("''", '" ').replace("``", '" ')

    example = {
        'context_tokens': word_tokenize(context, nlp),
        'ques_tokens': word_tokenize(question, nlp),
    }
    example['context_chars'] = [list(token) for token in example['context_tokens']]
    example['ques_chars'] = [list(token) for token in example['ques_tokens']]

    para_limit = args.test_para_limit if is_test else args.para_limit
    ques_limit = args.test_ques_limit if is_test else args.ques_limit
    char_limit = args.char_limit

    if len(example["context_tokens"]) > para_limit or \
            len(example["ques_tokens"]) > ques_limit:
        raise ValueError("Context/Questions lengths are over the limit")

    context_idxs = np.zeros([para_limit], dtype=np.int32)
    context_char_idxs = np.zeros([para_limit, char_limit], dtype=np.int32)
    ques_idxs = np.zeros([ques_limit], dtype=np.int32)
    ques_char_idxs = np.zeros([ques_limit, char_limit], dtype=np.int32)

    def _get_word(word):
        # Try progressively looser casings before falling back to index 1.
        for each in (word, word.lower(), word.capitalize(), word.upper()):
            if each in word2idx_dict:
                return word2idx_dict[each]
        return 1

    def _get_char(char):
        return char2idx_dict.get(char, 1)

    for i, token in enumerate(example["context_tokens"]):
        context_idxs[i] = _get_word(token)

    for i, token in enumerate(example["ques_tokens"]):
        ques_idxs[i] = _get_word(token)

    # Characters beyond `char_limit` are dropped.
    for i, token in enumerate(example["context_chars"]):
        for j, char in enumerate(token[:char_limit]):
            context_char_idxs[i, j] = _get_char(char)

    for i, token in enumerate(example["ques_chars"]):
        for j, char in enumerate(token[:char_limit]):
            ques_char_idxs[i, j] = _get_char(char)

    return context_idxs, context_char_idxs, ques_idxs, ques_char_idxs
def pre_process(args, nlp=None):
    """Run the full pre-processing pipeline and write all artifacts to disk.

    Tokenizes the train/dev (and optionally test) files, builds the word and
    character vocabularies from the training set, converts the examples to
    index arrays and saves embeddings, eval dictionaries, index dictionaries
    and meta information to the paths given in `args`.

    Args:
        args: Parsed setup arguments holding input/output paths, GloVe
            settings, `char_dim` and `include_test_examples`.
        nlp: Tokenizer forwarded to `process_file`. When omitted, a blank
            English spaCy pipeline is created, matching what the script
            entry point builds.
    """
    if nlp is None:
        # `process_file` requires a tokenizer; previously none was passed,
        # which raised TypeError on the first call.
        nlp = spacy.blank("en")

    # Process training set and use it to decide on the word/character vocabularies
    word_counter, char_counter = Counter(), Counter()
    train_examples, train_eval = process_file(
        args.train_file, "train", word_counter, char_counter, nlp)
    word_emb_mat, word2idx_dict = get_embedding(
        word_counter, 'word', emb_file=args.glove_file, vec_size=args.glove_dim,
        num_vectors=args.glove_num_vecs)
    char_emb_mat, char2idx_dict = get_embedding(
        char_counter, 'char', emb_file=None, vec_size=args.char_dim)

    # Process dev and test sets
    dev_examples, dev_eval = process_file(
        args.dev_file, "dev", word_counter, char_counter, nlp)
    build_features(args, train_examples, "train", args.train_record_file,
                   word2idx_dict, char2idx_dict)
    dev_meta = build_features(args, dev_examples, "dev", args.dev_record_file,
                              word2idx_dict, char2idx_dict)
    if args.include_test_examples:
        test_examples, test_eval = process_file(
            args.test_file, "test", word_counter, char_counter, nlp)
        save(args.test_eval_file, test_eval, message="test eval")
        test_meta = build_features(args, test_examples, "test",
                                   args.test_record_file, word2idx_dict,
                                   char2idx_dict, is_test=True)
        save(args.test_meta_file, test_meta, message="test meta")

    save(args.word_emb_file, word_emb_mat, message="word embedding")
    save(args.char_emb_file, char_emb_mat, message="char embedding")
    save(args.train_eval_file, train_eval, message="train eval")
    save(args.dev_eval_file, dev_eval, message="dev eval")
    save(args.word2idx_file, word2idx_dict, message="word dictionary")
    save(args.char2idx_file, char2idx_dict, message="char dictionary")
    save(args.dev_meta_file, dev_meta, message="dev meta")
def main(args):
    """Evaluate a saved checkpoint on one split and write a submission CSV.

    Loads the word embeddings and the checkpoint at `args.load_path`, runs
    the model over the `args.split` dataset, logs NLL/F1/EM (and AvNA when
    `args.use_squad_v2` is set) for every split except `test`, writes example
    predictions to TensorBoard, and finally writes an `Id,Predicted` CSV.

    Args:
        args: Parsed test arguments (see `get_test_args`).
    """
    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=args.hidden_size)
    model = nn.DataParallel(model, gpu_ids)
    log.info(f'Loading checkpoint from {args.load_path}...')
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    log.info('Building dataset...')
    record_file = vars(args)[f'{args.split}_record_file']
    dataset = SQuAD(record_file, args.use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Evaluate
    log.info(f'Evaluating on {args.split} split...')
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}   # Predictions for submission
    eval_file = vars(args)[f'{args.split}_eval_file']
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != 'test':
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != 'test':
        results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results_list = [('NLL', nll_meter.avg),
                        ('F1', results['F1']),
                        ('EM', results['EM'])]
        if args.use_squad_v2:
            results_list.append(('AvNA', results['AvNA']))
        results = OrderedDict(results_list)

        # Log to console
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
        log.info(f'{args.split.title()} {results_str}')

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        try:
            util.visualize(tbx,
                           pred_dict=pred_dict,
                           eval_path=eval_file,
                           step=0,
                           split=args.split,
                           num_visuals=args.num_visuals)
        finally:
            # Close the writer so buffered events are flushed to disk before
            # the process exits.
            tbx.close()

    # Write submission file
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info(f'Writing submission file to {sub_path}...')
    with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])
checkpoint from {args.load_path}...') + model, step = util.load_model(model, args.load_path, args.gpu_ids) + else: + step = 0 + model = model.to(device) + model.train() + ema = util.EMA(model, args.ema_decay) + + # Get saver + saver = util.CheckpointSaver(args.save_dir, + max_checkpoints=args.max_checkpoints, + metric_name=args.metric_name, + maximize_metric=args.maximize_metric, + log=log) + + # Get optimizer and scheduler + optimizer = optim.Adadelta(model.parameters(), args.lr, + weight_decay=args.l2_wd) + scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR + + # Get data loader + log.info('Building dataset...') + train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) + train_loader = data.DataLoader(train_dataset, + batch_size=args.batch_size, + shuffle=True, + num_workers=args.num_workers, + collate_fn=collate_fn) + dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) + dev_loader = data.DataLoader(dev_dataset, + batch_size=args.batch_size, + shuffle=False, + num_workers=args.num_workers, + collate_fn=collate_fn) + + # Train + log.info('Training...') + steps_till_eval = args.eval_steps + epoch = step // len(train_dataset) + while epoch != args.num_epochs: + epoch += 1 + log.info(f'Starting epoch {epoch}...') + with torch.enable_grad(), \ + tqdm(total=len(train_loader.dataset)) as progress_bar: + for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: + # Setup for forward + cw_idxs = cw_idxs.to(device) + qw_idxs = qw_idxs.to(device) + batch_size = cw_idxs.size(0) + optimizer.zero_grad() + + # Forward + log_p1, log_p2 = model(cw_idxs, qw_idxs) + y1, y2 = y1.to(device), y2.to(device) + loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) + loss_val = loss.item() + + # Backward + loss.backward() + nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) + optimizer.step() + scheduler.step(step // batch_size) + ema(model, step // batch_size) + + # Log info + step += batch_size + progress_bar.update(batch_size) 
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2):
    """Score `model` on every batch of `data_loader`.

    The model is switched to eval mode for the pass and back to train mode
    afterwards.

    Args:
        model: Model called as `model(cw_idxs, qw_idxs)`, returning start/end
            log-probabilities.
        data_loader: Loader yielding 7-tuples of batch tensors.
        device: Device the batch tensors are moved to.
        eval_file (str): Path to the JSON file with the gold answers.
        max_len (int): Maximum answer length passed to `util.discretize`.
        use_squad_v2 (bool): Forwarded to the util helpers; also adds the
            `AvNA` metric to the results.

    Returns:
        tuple: `(results, pred_dict)`, an ordered dict of metrics (NLL, F1,
        EM and optionally AvNA) and the predictions gathered from
        `util.convert_tokens`.
    """
    meter = util.AverageMeter()

    model.eval()
    predictions = {}
    with open(eval_file, 'r') as eval_fh:
        gold = json_load(eval_fh)

    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as bar:
        for batch in data_loader:
            # The character-level tensors are part of each batch but unused.
            cw_idxs, _cc_idxs, qw_idxs, _qc_idxs, y1, y2, ids = batch
            cw_idxs, qw_idxs = cw_idxs.to(device), qw_idxs.to(device)
            num_examples = cw_idxs.size(0)

            # Forward pass and loss against the gold start/end positions
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1 = y1.to(device)
            y2 = y2.to(device)
            batch_nll = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            meter.update(batch_nll.item(), num_examples)

            # Turn the probabilities into discrete start/end predictions
            starts, ends = util.discretize(log_p1.exp(), log_p2.exp(),
                                           max_len, use_squad_v2)

            bar.update(num_examples)
            bar.set_postfix(NLL=meter.avg)

            batch_preds, _ = util.convert_tokens(gold,
                                                 ids.tolist(),
                                                 starts.tolist(),
                                                 ends.tolist(),
                                                 use_squad_v2)
            predictions.update(batch_preds)

    model.train()

    scores = util.eval_dicts(gold, predictions, use_squad_v2)
    metrics = OrderedDict()
    metrics['NLL'] = meter.avg
    metrics['F1'] = scores['F1']
    metrics['EM'] = scores['EM']
    if use_squad_v2:
        metrics['AvNA'] = scores['AvNA']

    return metrics, predictions
def collate_fn(examples):
    """Batch individual `SQuAD.__getitem__` examples into padded tensors.

    Each sequence is trimmed to its count of non-pad entries and the batch is
    padded (with zeros) to the largest such count.

    Args:
        examples (list): Tuples of the form (context_idxs, context_char_idxs,
            question_idxs, question_char_idxs, y1s, y2s, ids).

    Returns:
        tuple: Batched tensors in the same order as the input fields, each
        with a leading batch dimension.

    Adapted from:
        https://github.com/yunjey/seq2seq-dataloader
    """
    def _stack_scalars(values, dtype=torch.int64):
        return torch.tensor(values, dtype=dtype)

    def _pad_vectors(vectors, dtype=torch.int64, pad_value=0):
        # Number of non-pad entries per vector decides how much is copied.
        used = [(vec != pad_value).sum() for vec in vectors]
        batch = torch.zeros(len(vectors), max(used), dtype=dtype)
        for row, (vec, count) in enumerate(zip(vectors, used)):
            batch[row, :count] = vec[:count]
        return batch

    def _pad_matrices(grids, dtype=torch.int64, pad_value=0):
        # A row/column counts as used when its sum differs from the pad value.
        used_rows = [(grid.sum(1) != pad_value).sum() for grid in grids]
        used_cols = [(grid.sum(0) != pad_value).sum() for grid in grids]
        batch = torch.zeros(len(grids), max(used_rows), max(used_cols), dtype=dtype)
        for row, (grid, n_rows, n_cols) in enumerate(zip(grids, used_rows, used_cols)):
            batch[row, :n_rows, :n_cols] = grid[:n_rows, :n_cols]
        return batch

    # Transpose the list of examples into one tuple per field.
    (ctx_words, ctx_chars,
     ques_words, ques_chars,
     starts, ends, example_ids) = zip(*examples)

    return (_pad_vectors(ctx_words), _pad_matrices(ctx_chars),
            _pad_vectors(ques_words), _pad_matrices(ques_chars),
            _stack_scalars(starts), _stack_scalars(ends),
            _stack_scalars(example_ids))
class EMA:
    """Track an exponential moving average of a model's trainable parameters.

    Args:
        model (torch.nn.Module): Model whose trainable parameters are averaged.
        decay (float): Upper bound on the decay rate of the moving average.
    """
    def __init__(self, model, decay):
        self.decay = decay
        self.original = {}
        # Start every running average at the parameter's current value.
        self.shadow = {name: param.data.clone()
                       for name, param in model.named_parameters()
                       if param.requires_grad}

    def __call__(self, model, num_updates):
        """Fold the model's current parameter values into the averages.

        Args:
            model (torch.nn.Module): Model providing the new values.
            num_updates (int): Update counter; small values lower the
                effective decay so early averages follow the model closely.
        """
        warmup_decay = (1.0 + num_updates) / (10.0 + num_updates)
        decay = min(self.decay, warmup_decay)
        for name, param in model.named_parameters():
            if not param.requires_grad:
                continue
            assert name in self.shadow
            blended = (1.0 - decay) * param.data + decay * self.shadow[name]
            self.shadow[name] = blended.clone()

    def assign(self, model):
        """Swap the averaged values into the model, remembering the originals.

        Args:
            model (torch.nn.Module): Model to receive the averaged values.
        """
        for name, param in model.named_parameters():
            if not param.requires_grad:
                continue
            assert name in self.shadow
            # Keep a copy so `resume` can put the live values back.
            self.original[name] = param.data.clone()
            param.data = self.shadow[name]

    def resume(self, model):
        """Put back the parameter values saved by the last `assign` call.

        Args:
            model (torch.nn.Module): Model whose parameters are restored.
        """
        for name, param in model.named_parameters():
            if not param.requires_grad:
                continue
            assert name in self.shadow
            param.data = self.original[name]
+ + Args: + step (int): Total number of examples seen during training so far. + model (torch.nn.DataParallel): Model to save. + metric_val (float): Determines whether checkpoint is best so far. + device (torch.device): Device where model resides. + """ + ckpt_dict = { + 'model_name': model.__class__.__name__, + 'model_state': model.cpu().state_dict(), + 'step': step + } + model.to(device) + + checkpoint_path = os.path.join(self.save_dir, + f'step_{step}.pth.tar') + torch.save(ckpt_dict, checkpoint_path) + self._print(f'Saved checkpoint: {checkpoint_path}') + + if self.is_best(metric_val): + # Save the best model + self.best_val = metric_val + best_path = os.path.join(self.save_dir, 'best.pth.tar') + shutil.copy(checkpoint_path, best_path) + self._print(f'New best checkpoint at step {step}...') + + # Add checkpoint path to priority queue (lowest priority removed first) + if self.maximize_metric: + priority_order = metric_val + else: + priority_order = -metric_val + + self.ckpt_paths.put((priority_order, checkpoint_path)) + + # Remove a checkpoint if more than max_checkpoints have been saved + if self.ckpt_paths.qsize() > self.max_checkpoints: + _, worst_ckpt = self.ckpt_paths.get() + try: + os.remove(worst_ckpt) + self._print(f'Removed checkpoint: {worst_ckpt}') + except OSError: + # Avoid crashing if checkpoint has been removed or protected + pass + + +def load_model(model, checkpoint_path, gpu_ids, return_step=True): + """Load model parameters from disk. + + Args: + model (torch.nn.DataParallel): Load parameters into this model. + checkpoint_path (str): Path to checkpoint to load. + gpu_ids (list): GPU IDs for DataParallel. + return_step (bool): Also return the step at which checkpoint was saved. + + Returns: + model (torch.nn.DataParallel): Model loaded from checkpoint. + step (int): Step at which checkpoint was saved. Only if `return_step`. 
+ """ + device = f"cuda:{gpu_ids[0] if gpu_ids else 'cpu'}" + ckpt_dict = torch.load(checkpoint_path, map_location=device) + + # Build model, load parameters + model.load_state_dict(ckpt_dict['model_state']) + + if return_step: + step = ckpt_dict['step'] + return model, step + + return model + + +def get_available_devices(): + """Get IDs of all available GPUs. + + Returns: + device (torch.device): Main device (GPU 0 or CPU). + gpu_ids (list): List of IDs of all GPUs that are available. + """ + gpu_ids = [] + if torch.cuda.is_available(): + gpu_ids += [gpu_id for gpu_id in range(torch.cuda.device_count())] + device = torch.device(f'cuda:{gpu_ids[0]}') + torch.cuda.set_device(device) + else: + device = torch.device('cpu') + + return device, gpu_ids + + +def masked_softmax(logits, mask, dim=-1, log_softmax=False): + """Take the softmax of `logits` over given dimension, and set + entries to 0 wherever `mask` is 0. + + Args: + logits (torch.Tensor): Inputs to the softmax function. + mask (torch.Tensor): Same shape as `logits`, with 0 indicating + positions that should be assigned 0 probability in the output. + dim (int): Dimension over which to take softmax. + log_softmax (bool): Take log-softmax rather than regular softmax. + E.g., some PyTorch functions such as `F.nll_loss` expect log-softmax. + + Returns: + probs (torch.Tensor): Result of taking masked softmax over the logits. + """ + mask = mask.type(torch.float32) + masked_logits = mask * logits + (1 - mask) * -1e30 + softmax_fn = F.log_softmax if log_softmax else F.softmax + probs = softmax_fn(masked_logits, dim) + + return probs + + +def visualize(tbx, pred_dict, eval_path, step, split, num_visuals): + """Visualize text examples to TensorBoard. + + Args: + tbx (tensorboardX.SummaryWriter): Summary writer. + pred_dict (dict): dict of predictions of the form id -> pred. + eval_path (str): Path to eval JSON file. + step (int): Number of examples seen so far during training. 
+ split (str): Name of data split being visualized. + num_visuals (int): Number of visuals to select at random from preds. + """ + if num_visuals <= 0: + return + if num_visuals > len(pred_dict): + num_visuals = len(pred_dict) + + visual_ids = np.random.choice(list(pred_dict), size=num_visuals, replace=False) + + with open(eval_path, 'r') as eval_file: + eval_dict = json.load(eval_file) + for i, id_ in enumerate(visual_ids): + pred = pred_dict[id_] or 'N/A' + example = eval_dict[str(id_)] + question = example['question'] + context = example['context'] + answers = example['answers'] + + gold = answers[0] if answers else 'N/A' + tbl_fmt = (f'- **Question:** {question}\n' + + f'- **Context:** {context}\n' + + f'- **Answer:** {gold}\n' + + f'- **Prediction:** {pred}') + tbx.add_text(tag=f'{split}/{i+1}_of_{num_visuals}', + text_string=tbl_fmt, + global_step=step) + + +def save_preds(preds, save_dir, file_name='predictions.csv'): + """Save predictions `preds` to a CSV file named `file_name` in `save_dir`. + + Args: + preds (list): List of predictions each of the form (id, start, end), + where id is an example ID, and start/end are indices in the context. + save_dir (str): Directory in which to save the predictions file. + file_name (str): File name for the CSV file. + + Returns: + save_path (str): Path where CSV file was saved. 
+ """ + # Validate format + if (not isinstance(preds, list) + or any(not isinstance(p, tuple) or len(p) != 3 for p in preds)): + raise ValueError('preds must be a list of tuples (id, start, end)') + + # Make sure predictions are sorted by ID + preds = sorted(preds, key=lambda p: p[0]) + + # Save to a CSV file + save_path = os.path.join(save_dir, file_name) + np.savetxt(save_path, np.array(preds), delimiter=',', fmt='%d') + + return save_path + + +def get_save_dir(base_dir, name, training, id_max=100): + """Get a unique save directory by appending the smallest positive integer + `id < id_max` that is not already taken (i.e., no dir exists with that id). + + Args: + base_dir (str): Base directory in which to make save directories. + name (str): Name to identify this training run. Need not be unique. + training (bool): Save dir. is for training (determines subdirectory). + id_max (int): Maximum ID number before raising an exception. + + Returns: + save_dir (str): Path to a new directory with a unique name. + """ + for uid in range(1, id_max): + subdir = 'train' if training else 'test' + save_dir = os.path.join(base_dir, subdir, f'{name}-{uid:02d}') + if not os.path.exists(save_dir): + os.makedirs(save_dir) + return save_dir + + raise RuntimeError('Too many save directories created with the same name. \ + Delete old save directories or use another name.') + + +def get_logger(log_dir, name): + """Get a `logging.Logger` instance that prints to the console + and an auxiliary file. + + Args: + log_dir (str): Directory in which to create the log file. + name (str): Name to identify the logs. + + Returns: + logger (logging.Logger): Logger instance for logging events. + """ + class StreamHandlerWithTQDM(logging.Handler): + """Let `logging` print without breaking `tqdm` progress bars. 
+ + See Also: + > https://stackoverflow.com/questions/38543506 + """ + def emit(self, record): + try: + msg = self.format(record) + tqdm.tqdm.write(msg) + self.flush() + except (KeyboardInterrupt, SystemExit): + raise + except: + self.handleError(record) + + # Create logger + logger = logging.getLogger(name) + logger.setLevel(logging.DEBUG) + + # Log everything (i.e., DEBUG level and above) to a file + log_path = os.path.join(log_dir, 'log.txt') + file_handler = logging.FileHandler(log_path) + file_handler.setLevel(logging.DEBUG) + + # Log everything except DEBUG level (i.e., INFO level and above) to console + console_handler = StreamHandlerWithTQDM() + console_handler.setLevel(logging.INFO) + + # Create format for the logs + file_formatter = logging.Formatter('[%(asctime)s] %(message)s', + datefmt='%m.%d.%y %H:%M:%S') + file_handler.setFormatter(file_formatter) + console_formatter = logging.Formatter('[%(asctime)s] %(message)s', + datefmt='%m.%d.%y %H:%M:%S') + console_handler.setFormatter(console_formatter) + + # add the handlers to the logger + logger.addHandler(file_handler) + logger.addHandler(console_handler) + + return logger + + +def torch_from_json(path, dtype=torch.float32): + """Load a PyTorch Tensor from a JSON file. + + Args: + path (str): Path to the JSON file to load. + dtype (torch.dtype): Data type of loaded array. + + Returns: + tensor (torch.Tensor): Tensor loaded from JSON file. + """ + with open(path, 'r') as fh: + array = np.array(json.load(fh)) + + tensor = torch.from_numpy(array).type(dtype) + + return tensor + + +def discretize(p_start, p_end, max_len=15, no_answer=False): + """Discretize soft predictions to get start and end indices. + + Choose the pair `(i, j)` of indices that maximizes `p1[i] * p2[j]` + subject to `i <= j` and `j - i + 1 <= max_len`. + + Args: + p_start (torch.Tensor): Soft predictions for start index. + Shape (batch_size, context_len). + p_end (torch.Tensor): Soft predictions for end index. 
+ Shape (batch_size, context_len). + max_len (int): Maximum length of the discretized prediction. + I.e., enforce that `preds[i, 1] - preds[i, 0] + 1 <= max_len`. + no_answer (bool): Treat 0-index as the no-answer prediction. Consider + a prediction no-answer if `preds[0, 0] * preds[0, 1]` is greater + than the probability assigned to the max-probability span. + + Returns: + start_idxs (torch.Tensor): Hard predictions for start index. + Shape (batch_size,) + end_idxs (torch.Tensor): Hard predictions for end index. + Shape (batch_size,) + """ + if p_start.min() < 0 or p_start.max() > 1 \ + or p_end.min() < 0 or p_end.max() > 1: + raise ValueError('Expected p_start and p_end to have values in [0, 1]') + + # Compute pairwise probabilities + p_start = p_start.unsqueeze(dim=2) + p_end = p_end.unsqueeze(dim=1) + p_joint = torch.matmul(p_start, p_end) # (batch_size, c_len, c_len) + + # Restrict to pairs (i, j) such that i <= j <= i + max_len - 1 + c_len, device = p_start.size(1), p_start.device + is_legal_pair = torch.triu(torch.ones((c_len, c_len), device=device)) + is_legal_pair -= torch.triu(torch.ones((c_len, c_len), device=device), + diagonal=max_len) + if no_answer: + # Index 0 is no-answer + p_no_answer = p_joint[:, 0, 0].clone() + is_legal_pair[0, :] = 0 + is_legal_pair[:, 0] = 0 + else: + p_no_answer = None + p_joint *= is_legal_pair + + # Take pair (i, j) that maximizes p_joint + max_in_row, _ = torch.max(p_joint, dim=2) + max_in_col, _ = torch.max(p_joint, dim=1) + start_idxs = torch.argmax(max_in_row, dim=-1) + end_idxs = torch.argmax(max_in_col, dim=-1) + + if no_answer: + # Predict no-answer whenever p_no_answer > max_prob + max_prob, _ = torch.max(max_in_col, dim=-1) + start_idxs[p_no_answer > max_prob] = 0 + end_idxs[p_no_answer > max_prob] = 0 + + return start_idxs, end_idxs + + +def convert_tokens(eval_dict, qa_id, y_start_list, y_end_list, no_answer): + """Convert predictions to tokens from the context. 
+ + Args: + eval_dict (dict): Dictionary with eval info for the dataset. This is + used to perform the mapping from IDs and indices to actual text. + qa_id (int): List of QA example IDs. + y_start_list (list): List of start predictions. + y_end_list (list): List of end predictions. + no_answer (bool): Questions can have no answer. E.g., SQuAD 2.0. + + Returns: + pred_dict (dict): Dictionary index IDs -> predicted answer text. + sub_dict (dict): Dictionary UUIDs -> predicted answer text (submission). + """ + pred_dict = {} + sub_dict = {} + for qid, y_start, y_end in zip(qa_id, y_start_list, y_end_list): + context = eval_dict[str(qid)]["context"] + spans = eval_dict[str(qid)]["spans"] + uuid = eval_dict[str(qid)]["uuid"] + if no_answer and (y_start == 0 or y_end == 0): + pred_dict[str(qid)] = '' + sub_dict[uuid] = '' + else: + if no_answer: + y_start, y_end = y_start - 1, y_end - 1 + start_idx = spans[y_start][0] + end_idx = spans[y_end][1] + pred_dict[str(qid)] = context[start_idx: end_idx] + sub_dict[uuid] = context[start_idx: end_idx] + return pred_dict, sub_dict + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + if not ground_truths: + return metric_fn(prediction, '') + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def eval_dicts(gold_dict, pred_dict, no_answer): + avna = f1 = em = total = 0 + for key, value in pred_dict.items(): + total += 1 + ground_truths = gold_dict[key]['answers'] + prediction = value + em += metric_max_over_ground_truths(compute_em, prediction, ground_truths) + f1 += metric_max_over_ground_truths(compute_f1, prediction, ground_truths) + if no_answer: + avna += compute_avna(prediction, ground_truths) + + eval_dict = {'EM': 100. * em / total, + 'F1': 100. * f1 / total} + + if no_answer: + eval_dict['AvNA'] = 100. 
* avna / total + + return eval_dict + + +def compute_avna(prediction, ground_truths): + """Compute answer vs. no-answer accuracy.""" + return float(bool(prediction) == bool(ground_truths)) + + +# All methods below this line are from the official SQuAD 2.0 eval script +# https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/ +def normalize_answer(s): + """Convert to lowercase and remove punctuation, articles and extra whitespace.""" + + def remove_articles(text): + regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) + return re.sub(regex, ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def get_tokens(s): + if not s: + return [] + return normalize_answer(s).split() + + +def compute_em(a_gold, a_pred): + return int(normalize_answer(a_gold) == normalize_answer(a_pred)) + + +def compute_f1(a_gold, a_pred): + gold_toks = get_tokens(a_gold) + pred_toks = get_tokens(a_pred) + common = Counter(gold_toks) & Counter(pred_toks) + num_same = sum(common.values()) + if len(gold_toks) == 0 or len(pred_toks) == 0: + # If either is no-answer, then F1 is 1 if they agree, 0 otherwise + return int(gold_toks == pred_toks) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(pred_toks) + recall = 1.0 * num_same / len(gold_toks) + f1 = (2 * precision * recall) / (precision + recall) + return f1