From 573f9e43d4348d731fb57a976655874632e03713 Mon Sep 17 00:00:00 2001
From: Kazu Version Bump
Date: Tue, 17 Dec 2024 16:10:43 +0000
Subject: [PATCH] =?UTF-8?q?Bump=20version:=202.2.1=20=E2=86=92=202.3.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 CHANGELOG.md                                      | 14 ++++++++++++++
 docs/_changelog.d/+scaling.feature.rst            |  1 -
 docs/_changelog.d/+training.feature.rst           |  1 -
 docs/_changelog.d/multilabel_bert.feature.rst     |  2 --
 docs/_changelog.d/pytorch_memory_issue.bugfix.rst |  1 -
 kazu/__init__.py                                  |  2 +-
 6 files changed, 15 insertions(+), 6 deletions(-)
 delete mode 100644 docs/_changelog.d/+scaling.feature.rst
 delete mode 100644 docs/_changelog.d/+training.feature.rst
 delete mode 100644 docs/_changelog.d/multilabel_bert.feature.rst
 delete mode 100644 docs/_changelog.d/pytorch_memory_issue.bugfix.rst

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 15f68a5e..d6ed560a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,20 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
 
 
+## 2.3.0 - 2024-12-17
+
+### Features
+
+- Release a new multilabel biomedBERT model trained on synthetic NER data generated by an LLM (Gemini). The model was trained on over 7,000 LLM-annotated documents comprising 295,822 samples.
+  After 21 epochs of training, it achieved an F1 score of 95.6% on a held-out test set. (multilabel_bert)
+- Added a multilabel NER training example and config.
+- Added documentation and an example for scaling Kazu with Ray.
+
+### Bugfixes
+
+- Fix an issue in TransformersModelForTokenClassificationNerStep when processing large numbers of documents. The fix offloads tensors to the CPU before performing the torch.cat operation, which previously produced a zero tensor. (pytorch_memory_issue)
+
+
 ## 2.2.1 - 2024-10-21
 
 ### Features
 
diff --git a/docs/_changelog.d/+scaling.feature.rst b/docs/_changelog.d/+scaling.feature.rst
deleted file mode 100644
index 20602a00..00000000
--- a/docs/_changelog.d/+scaling.feature.rst
+++ /dev/null
@@ -1 +0,0 @@
-added scaling kazu with Ray docs and example.
diff --git a/docs/_changelog.d/+training.feature.rst b/docs/_changelog.d/+training.feature.rst
deleted file mode 100644
index e8862b2e..00000000
--- a/docs/_changelog.d/+training.feature.rst
+++ /dev/null
@@ -1 +0,0 @@
-added multilabel NER training example and config.
diff --git a/docs/_changelog.d/multilabel_bert.feature.rst b/docs/_changelog.d/multilabel_bert.feature.rst
deleted file mode 100644
index f471e1cc..00000000
--- a/docs/_changelog.d/multilabel_bert.feature.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Release new multilabel biomedBERT model trained on LLM (Gemini) synthetically generated NER data. The model was trained on over 7000 LLM annoted documents with a total of 295822 samples.
-The model was trained for 21 epochs and achieved an F1 score of 95.6% on a held out test set.
diff --git a/docs/_changelog.d/pytorch_memory_issue.bugfix.rst b/docs/_changelog.d/pytorch_memory_issue.bugfix.rst
deleted file mode 100644
index b47fec63..00000000
--- a/docs/_changelog.d/pytorch_memory_issue.bugfix.rst
+++ /dev/null
@@ -1 +0,0 @@
-Fix issue with TransformersModelForTokenClassificationNerStep when processing large amounts of documents. The fix offloads tensors onto cpu before performin the torch.cat operation which lead to a zero tensor before.
diff --git a/kazu/__init__.py b/kazu/__init__.py
index b19ee4b7..55e47090 100644
--- a/kazu/__init__.py
+++ b/kazu/__init__.py
@@ -1 +1 @@
-__version__ = "2.2.1"
+__version__ = "2.3.0"
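
For context on the pytorch_memory_issue entry above: the fix amounts to moving each batch's output tensors onto the CPU as soon as they are produced, so the final torch.cat allocates host memory rather than accumulating everything on the GPU. The following is a minimal sketch of that pattern, assuming a HuggingFace-style model whose outputs expose .logits; the function name, batch loop, and tensor handling are illustrative and not the actual TransformersModelForTokenClassificationNerStep implementation.

import torch


def collect_logits(model, batches, device="cuda"):
    """Concatenate per-batch logits without holding them all on the GPU.

    Illustrative sketch only: each batch's logits are offloaded to the CPU
    immediately, so the final torch.cat allocates host memory.
    """
    per_batch = []
    with torch.no_grad():
        for batch in batches:
            logits = model(batch.to(device)).logits
            # Offload right away; keeping every batch's logits on the GPU and
            # concatenating there is what previously exhausted device memory.
            per_batch.append(logits.cpu())
    # The concatenation now happens entirely on the CPU.
    return torch.cat(per_batch, dim=0)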