From 9e7e2773302d1a70101f0e0a39175300fe2a452a Mon Sep 17 00:00:00 2001
From: Max Ploner
Date: Wed, 3 Apr 2024 18:07:09 +0200
Subject: [PATCH] Added initial version

---
 index.html                            |  282 +++-
 media/accuracy_by_model_size_bear.svg | 1774 +++++++++++++++++++++++++
 media/bear_evaluation_final.svg       |  758 +++++++++++
 style.css                             |  104 ++
 4 files changed, 2909 insertions(+), 9 deletions(-)
 create mode 100644 media/accuracy_by_model_size_bear.svg
 create mode 100644 media/bear_evaluation_final.svg

diff --git a/index.html b/index.html
index c3d9b42..1240339 100644
--- a/index.html
+++ b/index.html
@@ -1,12 +1,276 @@
 [<head> markup: page titled "LM Pub Quiz"]
+<body>
+<header>
+  <h1>LM Pub Quiz</h1>
+  <h2 class="subtitle">Evaluating language models using multiple choice items</h2>
+</header>
+<figure>
+  <img src="media/bear_evaluation_final.svg" alt="Illustration of how LM Pub Quiz evaluates LMs.">
+  <figcaption>Illustration of how LM Pub Quiz evaluates LMs: Answers are ranked by the (pseudo) log-likelihoods of the textual statements derived from all of the answer options.</figcaption>
+</figure>
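The scoring step illustrated above can be sketched in a few lines of plain Python. This is a minimal illustration using Hugging Face transformers, not the lm-pub-quiz implementation; the model choice ("gpt2") and the example statements are placeholders.

# Rank answer statements by the log-likelihood a causal LM assigns to them.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.eval()

def log_likelihood(statement: str) -> float:
    """Sum of the log-probabilities of the statement's tokens."""
    ids = tokenizer(statement, return_tensors="pt").input_ids
    with torch.no_grad():
        logits = model(ids).logits
    # Token t is predicted from positions < t: shift logits and targets.
    log_probs = torch.log_softmax(logits[:, :-1], dim=-1)
    targets = ids[:, 1:].unsqueeze(-1)
    return log_probs.gather(2, targets).sum().item()

# Hypothetical answer options derived from one relational fact; the
# highest-scoring statement counts as the model's answer.
statements = [
    "The capital of France is Paris.",
    "The capital of France is Rome.",
    "The capital of France is Berlin.",
]
prediction = max(statements, key=log_likelihood)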
+<section class="shadow-box">
+  <span class="badge acl">Accepted at NAACL 2024</span>
+  <h2>BEAR: A Unified Framework for Evaluating Relational Knowledge in Causal and Masked Language Models</h2>
+  <h3>Abstract</h3>
+  <p>
+    Knowledge probing assesses to which degree a language model (LM) has successfully learned relational knowledge during pre-training. Probing is an inexpensive way to compare LMs of different sizes and training configurations. However, previous approaches rely on the objective function used in pre-training LMs and are thus applicable only to masked or causal LMs. As a result, comparing different types of LMs becomes impossible. To address this, we propose an approach that uses an LM's inherent ability to estimate the log-likelihood of any given textual statement. We carefully design an evaluation dataset of 7,731 instances (40,916 in a larger variant) from which we produce alternative statements for each relational fact, one of which is correct. We then evaluate whether an LM correctly assigns the highest log-likelihood to the correct statement. Our experimental evaluation of 22 common LMs shows that our proposed framework, BEAR, can effectively probe for knowledge across different LM types. We release the BEAR datasets and an open-source framework that implements the probing approach to the research community to facilitate the evaluation and development of LMs.
+  </p>
+  <figure>
+    <img src="media/accuracy_by_model_size_bear.svg" alt="Accuracy of various models on the BEAR dataset.">
+    <figcaption>Accuracy of various models on the BEAR dataset.</figcaption>
+  </figure>
+  <h3>Model Results</h3>
+  <p>
+    We evaluated 22 language models (of various sizes, trained using different pre-training objectives, and of both causal and masked LM types) on the BEAR dataset.
+  </p>
+  <table class="dataframe">
+    <thead>
+      <tr><th>Model</th><th>Type</th><th>Num Params</th><th>BEAR</th><th>BEAR 1:1</th><th>BEAR N:1</th></tr>
+    </thead>
+    <tbody>
+      <tr><td>Llama-2-13b-hf</td><td>CLM</td><td>13b</td><td>66.9% ± 1.0%</td><td>66.5% ± 1.6%</td><td>67.0% ± 1.1%</td></tr>
+      <tr><td>Mistral-7B-v0.1</td><td>CLM</td><td>7.0b</td><td>65.4% ± 1.1%</td><td>64.5% ± 1.2%</td><td>65.5% ± 1.1%</td></tr>
+      <tr><td>gemma-7b</td><td>CLM</td><td>7.0b</td><td>63.7% ± 1.3%</td><td>63.5% ± 0.7%</td><td>63.8% ± 1.4%</td></tr>
+      <tr><td>Llama-2-7b-hf</td><td>CLM</td><td>7.0b</td><td>62.4% ± 1.3%</td><td>62.2% ± 1.1%</td><td>62.4% ± 1.3%</td></tr>
+      <tr><td>gemma-2b</td><td>CLM</td><td>2.0b</td><td>51.5% ± 1.0%</td><td>53.1% ± 1.3%</td><td>51.3% ± 1.0%</td></tr>
+      <tr><td>opt-30b</td><td>CLM</td><td>30b</td><td>47.9% ± 0.5%</td><td>45.8% ± 1.0%</td><td>48.2% ± 0.6%</td></tr>
+      <tr><td>opt-13b</td><td>CLM</td><td>13b</td><td>45.4% ± 0.8%</td><td>43.5% ± 2.1%</td><td>45.7% ± 0.6%</td></tr>
+      <tr><td>opt-6.7b</td><td>CLM</td><td>6.7b</td><td>43.8% ± 1.1%</td><td>42.5% ± 1.0%</td><td>43.9% ± 1.2%</td></tr>
+      <tr><td>opt-2.7b</td><td>CLM</td><td>2.7b</td><td>37.3% ± 0.9%</td><td>35.6% ± 0.7%</td><td>37.5% ± 1.0%</td></tr>
+      <tr><td>opt-1.3b</td><td>CLM</td><td>1.3b</td><td>31.5% ± 0.8%</td><td>31.3% ± 0.6%</td><td>31.5% ± 0.9%</td></tr>
+      <tr><td>gpt2-xl</td><td>CLM</td><td>1.6b</td><td>26.2% ± 0.7%</td><td>24.1% ± 1.6%</td><td>26.5% ± 0.6%</td></tr>
+      <tr><td>gpt2-large</td><td>CLM</td><td>812M</td><td>22.2% ± 0.6%</td><td>20.1% ± 1.8%</td><td>22.5% ± 0.5%</td></tr>
+      <tr><td>roberta-large</td><td>MLM</td><td>355M</td><td>21.5% ± 0.8%</td><td>22.0% ± 1.1%</td><td>21.5% ± 0.8%</td></tr>
+      <tr><td>bert-large-cased</td><td>MLM</td><td>335M</td><td>19.9% ± 0.5%</td><td>16.6% ± 1.0%</td><td>20.3% ± 0.5%</td></tr>
+      <tr><td>opt-350m</td><td>CLM</td><td>350M</td><td>19.6% ± 0.6%</td><td>18.6% ± 1.2%</td><td>19.7% ± 0.6%</td></tr>
+      <tr><td>gpt2-medium</td><td>CLM</td><td>355M</td><td>19.0% ± 0.8%</td><td>16.0% ± 2.6%</td><td>19.4% ± 0.6%</td></tr>
+      <tr><td>bert-base-cased</td><td>MLM</td><td>109M</td><td>18.4% ± 0.4%</td><td>15.0% ± 1.1%</td><td>18.8% ± 0.4%</td></tr>
+      <tr><td>roberta-base</td><td>MLM</td><td>125M</td><td>16.4% ± 0.7%</td><td>15.8% ± 1.8%</td><td>16.5% ± 0.8%</td></tr>
+      <tr><td>opt-125m</td><td>CLM</td><td>125M</td><td>16.4% ± 0.5%</td><td>14.0% ± 1.3%</td><td>16.7% ± 0.4%</td></tr>
+      <tr><td>xlm-roberta-large</td><td>MLM</td><td>561M</td><td>14.3% ± 0.3%</td><td>14.9% ± 1.7%</td><td>14.3% ± 0.5%</td></tr>
+      <tr><td>gpt2</td><td>CLM</td><td>137M</td><td>13.5% ± 0.8%</td><td>9.4% ± 2.1%</td><td>14.0% ± 0.7%</td></tr>
+      <tr><td>xlm-roberta-base</td><td>MLM</td><td>279M</td><td>11.4% ± 0.2%</td><td>11.4% ± 1.1%</td><td>11.4% ± 0.2%</td></tr>
+      <tr><td>Random Baseline</td><td>-</td><td>-</td><td>4.7%</td><td>1.7%</td><td>5.1%</td></tr>
+    </tbody>
+  </table>
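A run producing one row of this table might look roughly as follows with the lm-pub-quiz package. The interface shown here (Dataset.from_name, Evaluator.from_model, evaluate_dataset) is an assumption for illustration; the package documentation is authoritative.

# Assumed lm-pub-quiz interface -- names are illustrative, check the docs.
from lm_pub_quiz import Dataset, Evaluator

bear = Dataset.from_name("BEAR")                    # assumed BEAR loader
evaluator = Evaluator.from_model("gpt2",            # any of the models above
                                 model_type="CLM")  # "CLM" or "MLM"
results = evaluator.evaluate_dataset(bear)          # assumed entry point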
+  <h3>Citation</h3>
+  <p>When using the dataset or library, please cite the following paper:</p>
+  <pre>
+@inproceedings{wiland-ploner-akbik-2024-bear,
+    title = "BEAR: A Unified Framework for Evaluating Relational Knowledge in Causal and Masked Language Models",
+    author = "Wiland, Jacek and Ploner, Max and Akbik, Alan",
+    booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
+    year = "2024",
+    publisher = "Association for Computational Linguistics",
+}
+  </pre>
+</section>
+</body>
+</html>
diff --git a/media/accuracy_by_model_size_bear.svg b/media/accuracy_by_model_size_bear.svg
new file mode 100644
index 0000000..26173d3
--- /dev/null
+++ b/media/accuracy_by_model_size_bear.svg
@@ -0,0 +1,1774 @@
+[SVG image data (1774 lines): plot of model accuracy by model size on BEAR; created 2024-04-03T17:11:41 with Matplotlib v3.7.4, https://matplotlib.org/]

diff --git a/media/bear_evaluation_final.svg b/media/bear_evaluation_final.svg
new file mode 100644
index 0000000..e401250
--- /dev/null
+++ b/media/bear_evaluation_final.svg
@@ -0,0 +1,758 @@
+[SVG image data (758 lines): illustration of the BEAR evaluation procedure]

diff --git a/style.css b/style.css
index e69de29..042164d 100644
--- a/style.css
+++ b/style.css
@@ -0,0 +1,104 @@
+:root {
+    --primary-color: #003366;
+    --nc-lk-1: var(--primary-color);
+    --nc-lk-2: #0055AA;
+}
+
+@media (prefers-color-scheme: dark) {
+    /* Disable dark mode for now. */
+    :root {
+        --nc-tx-1: #000000;
+        --nc-tx-2: #1A1A1A;
+        --nc-bg-1: #FFFFFF;
+        --nc-bg-2: #F6F8FA;
+        --nc-bg-3: #E5E7EB;
+        --nc-lk-1: #0070F3;
+        --nc-lk-2: #0366D6;
+        --nc-lk-tx: #FFFFFF;
+        --nc-ac-1: #79FFE1;
+        --nc-ac-tx: #0C4047;
+    }
+}
+
+header {
+    --nc-tx-1: #ffffff;
+    --nc-tx-2: #eeeeee;
+    --nc-lk-1: #3291FF;
+    --nc-lk-2: #0070F3;
+    --nc-lk-tx: #FFFFFF;
+    background-color: var(--primary-color);
+}
+
+h1, h2, h3 {
+    margin-bottom: 1rem;
+}
+
+h2 {
+    font-size: 1.6rem;
+}
+
+h3 {
+    font-size: 1.4rem;
+}
+
+h4 {
+    font-size: 1.2rem;
+}
+
+body {
+    background-color: var(--nc-bg-2);
+    max-width: 850px;
+}
+
+section {
+    background-color: var(--nc-bg-1);
+}
+
+header nav {
+    color: var(--nc-tx-2);
+}
+
+header h2.subtitle {
+    font-weight: normal;
+    font-size: 1.3em;
+    padding-bottom: .8em;
+    padding-top: .5rem;
+}
+
+figure {
+    text-align: center;
+}
+
+figcaption {
+    font-size: .9rem;
+    border-top: 1px solid var(--nc-bg-3);
+    padding-top: 0.5rem;
+    padding-left: 0.8rem;
+    padding-right: 0.8rem;
+    text-align: right;
+}
+
+.shadow-box {
+    box-shadow: 0 4px 8px 0 rgba(0,0,0,0.2);
+    padding: 1.25rem;
+    margin-top: 3rem;
+}
+
+.badge {
+    background-color: var(--primary-color);
+    color: white;
+    padding: 3px 8px;
+    text-align: center;
+    border-radius: 20px;
+    font-size: 0.7rem;
+}
+
+.badge.acl {
+    background-color: #ed1c24;
+    margin: .5rem;
+}
+
+.dataframe {
+    font-size: .8rem;
+}