From 352e1d786f01abd51c6e5497d513dc379a36e407 Mon Sep 17 00:00:00 2001 From: Shubhanshu Mishra Date: Mon, 11 Apr 2022 16:20:32 -0500 Subject: [PATCH 1/4] Fixes #59 - Add CEI dataset - Initial commit to add CEI --- biodatasets/cei/cei.py | 277 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 277 insertions(+) create mode 100644 biodatasets/cei/cei.py diff --git a/biodatasets/cei/cei.py b/biodatasets/cei/cei.py new file mode 100644 index 00000000..801b5bfc --- /dev/null +++ b/biodatasets/cei/cei.py @@ -0,0 +1,277 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This template serves as a starting point for contributing a dataset to the BigScience Biomedical repo. + +When modifying it for your dataset, look for TODO items that offer specific instructions. + +Full documentation on writing dataset loading scripts can be found here: +https://huggingface.co/docs/datasets/add_dataset.html + +To create a dataset loading script you will create a class and implement 3 methods: + * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object. + * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. + * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. + +TODO: Before submitting your script, delete this doc string and replace it with a description of your dataset. + +[bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) +""" + +import os +from typing import List, Tuple, Dict + +import datasets +from utils import schemas +from utils.configs import BigBioConfig +from utils.constants import Tasks + + +# TODO: Add BibTeX citation +_CITATION = """\ +@article{, + author = {}, + title = {}, + journal = {}, + volume = {}, + year = {}, + url = {}, + doi = {}, + biburl = {}, + bibsource = {} +} +""" + +# TODO: create a module level variable with your dataset name (should match script name) +# E.g. Hallmarks of Cancer: [dataset_name] --> hallmarks_of_cancer +_DATASETNAME = "[dataset_name]" + +# TODO: Add description of the dataset here +# You can copy an official description +_DESCRIPTION = """\ +This dataset is designed for XXX NLP task. +""" + +# TODO: Add a link to an official homepage for the dataset here (if possible) +_HOMEPAGE = "" + +# TODO: Add the licence for the dataset here (if possible) +# Note that this doesn't have to be a common open source license. +# Some datasets have custom licenses. In this case, simply put the full license terms +# into `_LICENSE` +_LICENSE = "" + +# TODO: Add links to the urls needed to download your dataset files. +# For local datasets, this variable can be an empty dictionary. + +# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators. +# In most cases the URLs will be the same for the source and bigbio config. 
+# However, if you need to access different files for each config you can have multiple entries in this dict. +# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method) +_URLS = { + _DATASETNAME: "url or list of urls or ... ", +} + +# TODO: add supported task by dataset. One dataset may support multiple tasks +_SUPPORTED_TASKS = [ + Tasks.NAMED_ENTITY_RECOGNITION, + Tasks.RELATION_EXTRACTION, +] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] + +# TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0" +# This version doesn't have to be consistent with semantic versioning. Anything that is +# provided by the original dataset as a version goes. +_SOURCE_VERSION = "" + +_BIGBIO_VERSION = "1.0.0" + + +# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case +# Append "Dataset" to the class name: BioASQ --> BioasqDataset +class CieDataset(datasets.GeneratorBasedBuilder): + """TODO: Short description of my dataset.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + # You will be able to load the "source" or "bigbio" configurations with + # ds_source = datasets.load_dataset('my_dataset', name='source') + # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio') + + # For local datasets you can make use of the `data_dir` and `data_files` kwargs + # https://huggingface.co/docs/datasets/add_dataset.html#downloading-data-files-and-organizing-splits + # ds_source = datasets.load_dataset('my_dataset', name='source', data_dir="/path/to/data/files") + # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio', data_dir="/path/to/data/files") + + # TODO: For each dataset, implement Config for Source and BigBio; + # If dataset contains more than one subset (see examples/bioasq.py) implement for EACH of them. + # Each of them should contain: + # - name: should be unique for each dataset config eg. bioasq10b_(source|bigbio)_[bigbio_schema_name] + # - version: option = (SOURCE_VERSION|BIGBIO_VERSION) + # - description: one line description for the dataset + # - schema: options = (source|bigbio_[bigbio_schema_name]) + # - subset_id: subset id is the canonical name for the dataset (eg. bioasq10b) + # where [bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) + + BUILDER_CONFIGS = [ + BigBioConfig( + name="[dataset_name]_source", + version=SOURCE_VERSION, + description="[dataset_name] source schema", + schema="source", + subset_id="[dataset_name]", + ), + BigBioConfig( + name="[dataset_name]_bigbio_[bigbio_schema_name]", + version=BIGBIO_VERSION, + description="[dataset_name] BigBio schema", + schema="bigbio_[bigbio_schema_name]", + subset_id="[dataset_name]", + ), + ] + + DEFAULT_CONFIG_NAME = "[dataset_name]_source" + + def _info(self) -> datasets.DatasetInfo: + + # Create the source schema; this schema will keep all keys/information/labels as close to the original dataset as possible. + + # You can arbitrarily nest lists and dictionaries. 
+ # For iterables, use lists over tuples or `datasets.Sequence` + + if self.config.schema == "source": + # TODO: Create your source schema here + raise NotImplementedError() + + # EX: Arbitrary NER type dataset + # features = datasets.Features( + # { + # "doc_id": datasets.Value("string"), + # "text": datasets.Value("string"), + # "entities": [ + # { + # "offsets": [datasets.Value("int64")], + # "text": datasets.Value("string"), + # "type": datasets.Value("string"), + # "entity_id": datasets.Value("string"), + # } + # ], + # } + # ) + + # Choose the appropriate bigbio schema for your task and copy it here. You can find information on the schemas in the CONTRIBUTING guide. + + # In rare cases you may get a dataset that supports multiple tasks requiring multiple schemas. In that case you can define multiple bigbio configs with a bigbio_[bigbio_schema_name] format. + + # For example bigbio_kb, bigbio_t2t + elif self.config.schema == "bigbio_[bigbio_schema_name]": + # e.g. features = schemas.kb_features + # TODO: Choose your big-bio schema here + raise NotImplementedError() + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration + + # If you need to access the "source" or "bigbio" config choice, that will be in self.config.name + + # LOCAL DATASETS: You do not need the dl_manager; you can ignore this argument. Make sure `gen_kwargs` in the return gets passed the right filepath + + # PUBLIC DATASETS: Assign your data-dir based on the dl_manager. + + # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs; many examples use the download_and_extract method; see the DownloadManager docs here: https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager + + # dl_manager can accept any type of nested list/dict and will give back the same structure with the url replaced with the path to local files. + + # TODO: KEEP if your dataset is PUBLIC; remove if not + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + # TODO: KEEP if your dataset is LOCAL; remove if NOT + if self.config.data_dir is None: + raise ValueError( + "This is a local dataset. Please pass the data_dir kwarg to load_dataset." + ) + else: + data_dir = self.config.data_dir + + # Not all datasets have predefined canonical train/val/test splits. + # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data. + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join(data_dir, "train.jsonl"), + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": os.path.join(data_dir, "test.jsonl"), + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": os.path.join(data_dir, "dev.jsonl"), + "split": "dev", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + + # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs. 
+ + def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. + + # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example. + + # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files + + if self.config.schema == "source": + # TODO: yield (key, example) tuples in the original dataset schema + for key, example in thing: + yield key, example + + elif self.config.schema == "bigbio_[bigbio_schema_name]": + # TODO: yield (key, example) tuples in the bigbio schema + for key, example in thing: + yield key, example + + +# This template is based on the following template from the datasets package: +# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py + + +# This allows you to run your dataloader with `python [dataset_name].py` during development +# TODO: Remove this before making your PR +if __name__ == "__main__": + datasets.load_dataset(__file__) From 99507a1ec87fa71449dd317560a4a1c1cf1f09d7 Mon Sep 17 00:00:00 2001 From: Shubhanshu Mishra Date: Tue, 12 Apr 2022 11:04:00 -0500 Subject: [PATCH 2/4] Added info. Need to figure our data parsing. --- biodatasets/cei/cei.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/biodatasets/cei/cei.py b/biodatasets/cei/cei.py index 801b5bfc..64fa72c9 100644 --- a/biodatasets/cei/cei.py +++ b/biodatasets/cei/cei.py @@ -43,21 +43,21 @@ # TODO: Add BibTeX citation _CITATION = """\ @article{, - author = {}, - title = {}, - journal = {}, - volume = {}, - year = {}, - url = {}, - doi = {}, - biburl = {}, - bibsource = {} + author = {Larsson, Kristin and Baker, Simon and Silins, Ilona and Guo, Yufan and Stenius, Ulla and Korhonen, Anna and Berglund, Marika}, + title = {Text mining for improved exposure assessment}, + journal = {PloS one}, + volume = {12}, + year = {2017}, + url = {https://doi.org/10.1371/journal.pone.0173132}, + doi = {10.1371/journal.pone.0173132}, + biburl = {https://journals.plos.org/plosone/article/citation/bibtex?id=10.1371/journal.pone.0173132}, + bibsource = {PloS one} } """ # TODO: create a module level variable with your dataset name (should match script name) # E.g. Hallmarks of Cancer: [dataset_name] --> hallmarks_of_cancer -_DATASETNAME = "[dataset_name]" +_DATASETNAME = "cei" # TODO: Add description of the dataset here # You can copy an official description @@ -66,13 +66,13 @@ """ # TODO: Add a link to an official homepage for the dataset here (if possible) -_HOMEPAGE = "" +_HOMEPAGE = "https://github.com/sb895/chemical-exposure-information-corpus" # TODO: Add the licence for the dataset here (if possible) # Note that this doesn't have to be a common open source license. # Some datasets have custom licenses. In this case, simply put the full license terms # into `_LICENSE` -_LICENSE = "" +_LICENSE = "GPL-3.0 License" # TODO: Add links to the urls needed to download your dataset files. # For local datasets, this variable can be an empty dictionary. From 105774ef5ad8ba62f649172132708db4592506da Mon Sep 17 00:00:00 2001 From: Shubhanshu Mishra Date: Sat, 30 Apr 2022 00:47:43 -0500 Subject: [PATCH 3/4] Added working code. 
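
A quick sketch of the label parsing this commit adds, assuming a file under
"labels/" holds taxonomy entries such as "Bexposure route - Bdermal route"
(the example file contents are hypothetical; only the regex and the
strip(" -") logic come from the cei.py diff below):

```
# Hypothetical label file contents; the regex and strip(" -") call mirror cei.py.
import re

LABEL_REGEX = re.compile(r"[BE][a-z\-\ ]+")

label_text = "Bexposure route - Bdermal route"  # stand-in for labels/<pmid>.txt
labels = [m.strip(" -") for m in LABEL_REGEX.findall(label_text)]
print(labels)  # ['Bexposure route', 'Bdermal route']
```
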
--- biodatasets/cei/cei.py | 236 +++++++++++++---------------------------- 1 file changed, 73 insertions(+), 163 deletions(-) diff --git a/biodatasets/cei/cei.py b/biodatasets/cei/cei.py index 64fa72c9..fbf51a9c 100644 --- a/biodatasets/cei/cei.py +++ b/biodatasets/cei/cei.py @@ -14,24 +14,14 @@ # limitations under the License. """ -This template serves as a starting point for contributing a dataset to the BigScience Biomedical repo. - -When modifying it for your dataset, look for TODO items that offer specific instructions. - -Full documentation on writing dataset loading scripts can be found here: -https://huggingface.co/docs/datasets/add_dataset.html - -To create a dataset loading script you will create a class and implement 3 methods: - * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object. - * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. - * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. - -TODO: Before submitting your script, delete this doc string and replace it with a description of your dataset. - -[bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) +The Chemical Exposure Information (CEI) Corpus consists of 3661 PubMed publication abstracts manually annotated by experts according to a taxonomy. +The taxonomy consists of 32 classes in a hierarchy. Zero or more class labels are assigned to each sentence in the corpus. +The labels are found under the "labels" directory, while the tokenized text can be found under "text" directory. +The filenames are the corresponding PubMed IDs (PMID). """ -import os +from pathlib import Path +import re from typing import List, Tuple, Dict import datasets @@ -40,7 +30,6 @@ from utils.constants import Tasks -# TODO: Add BibTeX citation _CITATION = """\ @article{, author = {Larsson, Kristin and Baker, Simon and Silins, Ilona and Guo, Yufan and Stenius, Ulla and Korhonen, Anna and Berglund, Marika}, @@ -55,132 +44,74 @@ } """ -# TODO: create a module level variable with your dataset name (should match script name) -# E.g. Hallmarks of Cancer: [dataset_name] --> hallmarks_of_cancer _DATASETNAME = "cei" -# TODO: Add description of the dataset here -# You can copy an official description _DESCRIPTION = """\ -This dataset is designed for XXX NLP task. +The Chemical Exposure Information (CEI) Corpus consists of 3661 PubMed publication abstracts manually annotated by experts according to a taxonomy. +The taxonomy consists of 32 classes in a hierarchy. Zero or more class labels are assigned to each sentence in the corpus. +The labels are found under the "labels" directory, while the tokenized text can be found under "text" directory. +The filenames are the corresponding PubMed IDs (PMID). """ -# TODO: Add a link to an official homepage for the dataset here (if possible) _HOMEPAGE = "https://github.com/sb895/chemical-exposure-information-corpus" -# TODO: Add the licence for the dataset here (if possible) -# Note that this doesn't have to be a common open source license. -# Some datasets have custom licenses. In this case, simply put the full license terms -# into `_LICENSE` _LICENSE = "GPL-3.0 License" -# TODO: Add links to the urls needed to download your dataset files. -# For local datasets, this variable can be an empty dictionary. - -# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators. 
-# In most cases the URLs will be the same for the source and bigbio config. -# However, if you need to access different files for each config you can have multiple entries in this dict. -# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method) _URLS = { - _DATASETNAME: "url or list of urls or ... ", + _DATASETNAME: "https://github.com/sb895/chemical-exposure-information-corpus/archive/refs/heads/master.zip", } -# TODO: add supported task by dataset. One dataset may support multiple tasks -_SUPPORTED_TASKS = [ - Tasks.NAMED_ENTITY_RECOGNITION, - Tasks.RELATION_EXTRACTION, -] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] +_SUPPORTED_TASKS = [Tasks.TEXT_CLASSIFICATION] -# TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0" -# This version doesn't have to be consistent with semantic versioning. Anything that is -# provided by the original dataset as a version goes. -_SOURCE_VERSION = "" +_SOURCE_VERSION = "1.0.0" _BIGBIO_VERSION = "1.0.0" +LABEL_REGEX = re.compile(r"[BE][a-z\-\ ]+") + -# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case -# Append "Dataset" to the class name: BioASQ --> BioasqDataset class CieDataset(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" + """The Chemical Exposure Information (CEI) Corpus consists of 3661 PubMed publication abstracts manually annotated by experts according to a taxonomy. + The taxonomy consists of 32 classes in a hierarchy. Zero or more class labels are assigned to each sentence in the corpus. + The labels are found under the "labels" directory, while the tokenized text can be found under "text" directory. + The filenames are the corresponding PubMed IDs (PMID).""" SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) - # You will be able to load the "source" or "bigbio" configurations with - # ds_source = datasets.load_dataset('my_dataset', name='source') - # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio') - - # For local datasets you can make use of the `data_dir` and `data_files` kwargs - # https://huggingface.co/docs/datasets/add_dataset.html#downloading-data-files-and-organizing-splits - # ds_source = datasets.load_dataset('my_dataset', name='source', data_dir="/path/to/data/files") - # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio', data_dir="/path/to/data/files") - - # TODO: For each dataset, implement Config for Source and BigBio; - # If dataset contains more than one subset (see examples/bioasq.py) implement for EACH of them. - # Each of them should contain: - # - name: should be unique for each dataset config eg. bioasq10b_(source|bigbio)_[bigbio_schema_name] - # - version: option = (SOURCE_VERSION|BIGBIO_VERSION) - # - description: one line description for the dataset - # - schema: options = (source|bigbio_[bigbio_schema_name]) - # - subset_id: subset id is the canonical name for the dataset (eg. 
bioasq10b) - # where [bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) - BUILDER_CONFIGS = [ BigBioConfig( - name="[dataset_name]_source", + name=f"{_DATASETNAME}_source", version=SOURCE_VERSION, - description="[dataset_name] source schema", + description=f"{_DATASETNAME} source schema", schema="source", - subset_id="[dataset_name]", + subset_id=f"{_DATASETNAME}", ), BigBioConfig( - name="[dataset_name]_bigbio_[bigbio_schema_name]", + name=f"{_DATASETNAME}_bigbio_text", version=BIGBIO_VERSION, - description="[dataset_name] BigBio schema", - schema="bigbio_[bigbio_schema_name]", - subset_id="[dataset_name]", + description=f"{_DATASETNAME} BigBio schema", + schema="bigbio_text", + subset_id=f"{_DATASETNAME}", ), ] - DEFAULT_CONFIG_NAME = "[dataset_name]_source" + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" def _info(self) -> datasets.DatasetInfo: - # Create the source schema; this schema will keep all keys/information/labels as close to the original dataset as possible. - - # You can arbitrarily nest lists and dictionaries. - # For iterables, use lists over tuples or `datasets.Sequence` - if self.config.schema == "source": - # TODO: Create your source schema here - raise NotImplementedError() - - # EX: Arbitrary NER type dataset - # features = datasets.Features( - # { - # "doc_id": datasets.Value("string"), - # "text": datasets.Value("string"), - # "entities": [ - # { - # "offsets": [datasets.Value("int64")], - # "text": datasets.Value("string"), - # "type": datasets.Value("string"), - # "entity_id": datasets.Value("string"), - # } - # ], - # } - # ) - - # Choose the appropriate bigbio schema for your task and copy it here. You can find information on the schemas in the CONTRIBUTING guide. - - # In rare cases you may get a dataset that supports multiple tasks requiring multiple schemas. In that case you can define multiple bigbio configs with a bigbio_[bigbio_schema_name] format. - - # For example bigbio_kb, bigbio_t2t - elif self.config.schema == "bigbio_[bigbio_schema_name]": - # e.g. features = schemas.kb_features - # TODO: Choose your big-bio schema here - raise NotImplementedError() + features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text": datasets.Value("string"), + "label_text": datasets.Value("string"), + "labels": datasets.Sequence(datasets.Value("string")), + } + ) + elif self.config.schema == "bigbio_text": + features = schemas.text_features return datasets.DatasetInfo( description=_DESCRIPTION, @@ -192,86 +123,65 @@ def _info(self) -> datasets.DatasetInfo: def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: """Returns SplitGenerators.""" - # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - - # If you need to access the "source" or "bigbio" config choice, that will be in self.config.name - - # LOCAL DATASETS: You do not need the dl_manager; you can ignore this argument. Make sure `gen_kwargs` in the return gets passed the right filepath - - # PUBLIC DATASETS: Assign your data-dir based on the dl_manager. 
- - # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs; many examples use the download_and_extract method; see the DownloadManager docs here: https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager - - # dl_manager can accept any type of nested list/dict and will give back the same structure with the url replaced with the path to local files. - - # TODO: KEEP if your dataset is PUBLIC; remove if not urls = _URLS[_DATASETNAME] data_dir = dl_manager.download_and_extract(urls) - # TODO: KEEP if your dataset is LOCAL; remove if NOT - if self.config.data_dir is None: - raise ValueError( - "This is a local dataset. Please pass the data_dir kwarg to load_dataset." - ) - else: - data_dir = self.config.data_dir - - # Not all datasets have predefined canonical train/val/test splits. - # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data. - return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, # Whatever you put in gen_kwargs will be passed to _generate_examples gen_kwargs={ - "filepath": os.path.join(data_dir, "train.jsonl"), + "base_dir": ( + Path(data_dir) / "chemical-exposure-information-corpus-master" + ), "split": "train", }, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={ - "filepath": os.path.join(data_dir, "test.jsonl"), - "split": "test", - }, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={ - "filepath": os.path.join(data_dir, "dev.jsonl"), - "split": "dev", - }, - ), + ) ] - # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` - - # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs. - - def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: + def _generate_examples(self, base_dir: Path, split: str) -> Tuple[int, Dict]: """Yields examples as (key, example) tuples.""" - # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. - # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example. 
- - # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files + text_files = list(base_dir.glob("./text/*.txt")) if self.config.schema == "source": # TODO: yield (key, example) tuples in the original dataset schema - for key, example in thing: + for text_file in text_files: + key, example = self._read_example_from_file(text_file) yield key, example elif self.config.schema == "bigbio_[bigbio_schema_name]": # TODO: yield (key, example) tuples in the bigbio schema - for key, example in thing: + for text_file in text_files: + key, example = self._read_example_from_file_in_kb_schema(text_file) yield key, example - -# This template is based on the following template from the datasets package: -# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py + def _read_example_from_file(self, text_file: Path) -> Tuple[str, Dict]: + label_file = text_file.parent.parent / "labels" / text_file.name + with open(text_file, encoding="utf-8") as fp: + text = fp.read().rstrip() + with open(label_file, encoding="utf-8") as fp: + label_text = fp.read() + labels = [l.strip(" -") for l in LABEL_REGEX.findall(label_text)] + key = text_file.name.rsplit(".", 1)[0] + example = { + "id": key, + "document_id": key, + "text": text, + "label_text": label_text, + "labels": labels, + } + return key, example + + def _read_example_from_file_in_kb_schema(self, text_file: Path) -> Tuple[str, Dict]: + key, example = self._read_example_from_file(text_file) + example = { + k: v + for k, v in example.items() + if k in {"id", "document_id", "text", "labels"} + } + return key, example -# This allows you to run your dataloader with `python [dataset_name].py` during development -# TODO: Remove this before making your PR if __name__ == "__main__": datasets.load_dataset(__file__) From 52c208d074cf894ff94de447c04ff4c1ed58308d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20S=C3=A4nger?= Date: Mon, 28 Oct 2024 11:43:56 +0100 Subject: [PATCH 4/4] refactor: Revise implementation of CEI to hub-style integration --- bigbio/hub/hub_repos/cei/README.md | 44 ++ bigbio/hub/hub_repos/cei/bigbiohub.py | 590 ++++++++++++++++++ .../hub/hub_repos}/cei/cei.py | 77 +-- 3 files changed, 666 insertions(+), 45 deletions(-) create mode 100644 bigbio/hub/hub_repos/cei/README.md create mode 100644 bigbio/hub/hub_repos/cei/bigbiohub.py rename {biodatasets => bigbio/hub/hub_repos}/cei/cei.py (69%) diff --git a/bigbio/hub/hub_repos/cei/README.md b/bigbio/hub/hub_repos/cei/README.md new file mode 100644 index 00000000..82267544 --- /dev/null +++ b/bigbio/hub/hub_repos/cei/README.md @@ -0,0 +1,44 @@ +--- +language: + - en +bigbio_language: + - English +multilinguality: monolingual +pretty_name: CEI +homepage: https://github.com/sb895/chemical-exposure-information-corpus +bigbio_pubmed: true +bigbio_public: true +bigbio_tasks: + - TXTCLASS +--- + + +# Dataset Card for Chemical Exposure Information (CEI) Corpus + +## Dataset Description + +- **Homepage:** https://github.com/sb895/chemical-exposure-information-corpus +- **Pubmed:** True +- **Public:** True +- **Tasks:** TXTCLASS + +The Chemical Exposure Information (CEI) Corpus consists of 3661 PubMed publication abstracts manually annotated by +experts according to a taxonomy. The taxonomy consists of 32 classes in a hierarchy. Zero or more class labels are +assigned to each sentence in the corpus. 
+ +## Citation Information + +``` +@article{, + author = {Larsson, Kristin and Baker, Simon and Silins, Ilona and Guo, Yufan and Stenius, Ulla and Korhonen, \ + Anna and Berglund, Marika}, + title = {Text mining for improved exposure assessment}, + journal = {PloS one}, + volume = {12}, + year = {2017}, + url = {https://doi.org/10.1371/journal.pone.0173132}, + doi = {10.1371/journal.pone.0173132}, + biburl = {https://journals.plos.org/plosone/article/citation/bibtex?id=10.1371/journal.pone.0173132}, + bibsource = {PloS one} +} +``` diff --git a/bigbio/hub/hub_repos/cei/bigbiohub.py b/bigbio/hub/hub_repos/cei/bigbiohub.py new file mode 100644 index 00000000..2af473d3 --- /dev/null +++ b/bigbio/hub/hub_repos/cei/bigbiohub.py @@ -0,0 +1,590 @@ +from collections import defaultdict +from dataclasses import dataclass +from enum import Enum +import logging +from pathlib import Path +from types import SimpleNamespace +from typing import TYPE_CHECKING, Dict, Iterable, List, Tuple + +import datasets + +if TYPE_CHECKING: + import bioc + +logger = logging.getLogger(__name__) + + +BigBioValues = SimpleNamespace(NULL="") + + +@dataclass +class BigBioConfig(datasets.BuilderConfig): + """BuilderConfig for BigBio.""" + + name: str = None + version: datasets.Version = None + description: str = None + schema: str = None + subset_id: str = None + + +class Tasks(Enum): + NAMED_ENTITY_RECOGNITION = "NER" + NAMED_ENTITY_DISAMBIGUATION = "NED" + EVENT_EXTRACTION = "EE" + RELATION_EXTRACTION = "RE" + COREFERENCE_RESOLUTION = "COREF" + QUESTION_ANSWERING = "QA" + TEXTUAL_ENTAILMENT = "TE" + SEMANTIC_SIMILARITY = "STS" + TEXT_PAIRS_CLASSIFICATION = "TXT2CLASS" + PARAPHRASING = "PARA" + TRANSLATION = "TRANSL" + SUMMARIZATION = "SUM" + TEXT_CLASSIFICATION = "TXTCLASS" + + +entailment_features = datasets.Features( + { + "id": datasets.Value("string"), + "premise": datasets.Value("string"), + "hypothesis": datasets.Value("string"), + "label": datasets.Value("string"), + } +) + +pairs_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text_1": datasets.Value("string"), + "text_2": datasets.Value("string"), + "label": datasets.Value("string"), + } +) + +qa_features = datasets.Features( + { + "id": datasets.Value("string"), + "question_id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "question": datasets.Value("string"), + "type": datasets.Value("string"), + "choices": [datasets.Value("string")], + "context": datasets.Value("string"), + "answer": datasets.Sequence(datasets.Value("string")), + } +) + +text_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text": datasets.Value("string"), + "labels": [datasets.Value("string")], + } +) + +text2text_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text_1": datasets.Value("string"), + "text_2": datasets.Value("string"), + "text_1_name": datasets.Value("string"), + "text_2_name": datasets.Value("string"), + } +) + +kb_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "passages": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + } + ], + "entities": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": 
datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + "normalized": [ + { + "db_name": datasets.Value("string"), + "db_id": datasets.Value("string"), + } + ], + } + ], + "events": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + # refers to the text_bound_annotation of the trigger + "trigger": { + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + }, + "arguments": [ + { + "role": datasets.Value("string"), + "ref_id": datasets.Value("string"), + } + ], + } + ], + "coreferences": [ + { + "id": datasets.Value("string"), + "entity_ids": datasets.Sequence(datasets.Value("string")), + } + ], + "relations": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "arg1_id": datasets.Value("string"), + "arg2_id": datasets.Value("string"), + "normalized": [ + { + "db_name": datasets.Value("string"), + "db_id": datasets.Value("string"), + } + ], + } + ], + } +) + + +TASK_TO_SCHEMA = { + Tasks.NAMED_ENTITY_RECOGNITION.name: "KB", + Tasks.NAMED_ENTITY_DISAMBIGUATION.name: "KB", + Tasks.EVENT_EXTRACTION.name: "KB", + Tasks.RELATION_EXTRACTION.name: "KB", + Tasks.COREFERENCE_RESOLUTION.name: "KB", + Tasks.QUESTION_ANSWERING.name: "QA", + Tasks.TEXTUAL_ENTAILMENT.name: "TE", + Tasks.SEMANTIC_SIMILARITY.name: "PAIRS", + Tasks.TEXT_PAIRS_CLASSIFICATION.name: "PAIRS", + Tasks.PARAPHRASING.name: "T2T", + Tasks.TRANSLATION.name: "T2T", + Tasks.SUMMARIZATION.name: "T2T", + Tasks.TEXT_CLASSIFICATION.name: "TEXT", +} + +SCHEMA_TO_TASKS = defaultdict(set) +for task, schema in TASK_TO_SCHEMA.items(): + SCHEMA_TO_TASKS[schema].add(task) +SCHEMA_TO_TASKS = dict(SCHEMA_TO_TASKS) + +VALID_TASKS = set(TASK_TO_SCHEMA.keys()) +VALID_SCHEMAS = set(TASK_TO_SCHEMA.values()) + +SCHEMA_TO_FEATURES = { + "KB": kb_features, + "QA": qa_features, + "TE": entailment_features, + "T2T": text2text_features, + "TEXT": text_features, + "PAIRS": pairs_features, +} + + +def get_texts_and_offsets_from_bioc_ann(ann: "bioc.BioCAnnotation") -> Tuple: + + offsets = [(loc.offset, loc.offset + loc.length) for loc in ann.locations] + + text = ann.text + + if len(offsets) > 1: + i = 0 + texts = [] + for start, end in offsets: + chunk_len = end - start + texts.append(text[i : chunk_len + i]) + i += chunk_len + while i < len(text) and text[i] == " ": + i += 1 + else: + texts = [text] + + return offsets, texts + + +def remove_prefix(a: str, prefix: str) -> str: + if a.startswith(prefix): + a = a[len(prefix) :] + return a + + +def parse_brat_file( + txt_file: Path, + annotation_file_suffixes: List[str] = None, + parse_notes: bool = False, +) -> Dict: + """ + Parse a brat file into the schema defined below. + `txt_file` should be the path to the brat '.txt' file you want to parse, e.g. 'data/1234.txt' + Assumes that the annotations are contained in one or more of the corresponding '.a1', '.a2' or '.ann' files, + e.g. 'data/1234.ann' or 'data/1234.a1' and 'data/1234.a2'. + Will include annotator notes, when `parse_notes == True`. + brat_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text": datasets.Value("string"), + "text_bound_annotations": [ # T line in brat, e.g. 
type or event trigger + { + "offsets": datasets.Sequence([datasets.Value("int32")]), + "text": datasets.Sequence(datasets.Value("string")), + "type": datasets.Value("string"), + "id": datasets.Value("string"), + } + ], + "events": [ # E line in brat + { + "trigger": datasets.Value( + "string" + ), # refers to the text_bound_annotation of the trigger, + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "arguments": datasets.Sequence( + { + "role": datasets.Value("string"), + "ref_id": datasets.Value("string"), + } + ), + } + ], + "relations": [ # R line in brat + { + "id": datasets.Value("string"), + "head": { + "ref_id": datasets.Value("string"), + "role": datasets.Value("string"), + }, + "tail": { + "ref_id": datasets.Value("string"), + "role": datasets.Value("string"), + }, + "type": datasets.Value("string"), + } + ], + "equivalences": [ # Equiv line in brat + { + "id": datasets.Value("string"), + "ref_ids": datasets.Sequence(datasets.Value("string")), + } + ], + "attributes": [ # M or A lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "value": datasets.Value("string"), + } + ], + "normalizations": [ # N lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "resource_name": datasets.Value( + "string" + ), # Name of the resource, e.g. "Wikipedia" + "cuid": datasets.Value( + "string" + ), # ID in the resource, e.g. 534366 + "text": datasets.Value( + "string" + ), # Human readable description/name of the entity, e.g. "Barack Obama" + } + ], + ### OPTIONAL: Only included when `parse_notes == True` + "notes": [ # # lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "text": datasets.Value("string"), + } + ], + }, + ) + """ + + example = {} + example["document_id"] = txt_file.with_suffix("").name + with txt_file.open() as f: + example["text"] = f.read() + + # If no specific suffixes of the to-be-read annotation files are given - take standard suffixes + # for event extraction + if annotation_file_suffixes is None: + annotation_file_suffixes = [".a1", ".a2", ".ann"] + + if len(annotation_file_suffixes) == 0: + raise AssertionError( + "At least one suffix for the to-be-read annotation files should be given!" 
+ ) + + ann_lines = [] + for suffix in annotation_file_suffixes: + annotation_file = txt_file.with_suffix(suffix) + if annotation_file.exists(): + with annotation_file.open() as f: + ann_lines.extend(f.readlines()) + + example["text_bound_annotations"] = [] + example["events"] = [] + example["relations"] = [] + example["equivalences"] = [] + example["attributes"] = [] + example["normalizations"] = [] + + if parse_notes: + example["notes"] = [] + + for line in ann_lines: + line = line.strip() + if not line: + continue + + if line.startswith("T"): # Text bound + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["type"] = fields[1].split()[0] + ann["offsets"] = [] + span_str = remove_prefix(fields[1], (ann["type"] + " ")) + text = fields[2] + for span in span_str.split(";"): + start, end = span.split() + ann["offsets"].append([int(start), int(end)]) + + # Heuristically split text of discontiguous entities into chunks + ann["text"] = [] + if len(ann["offsets"]) > 1: + i = 0 + for start, end in ann["offsets"]: + chunk_len = end - start + ann["text"].append(text[i : chunk_len + i]) + i += chunk_len + while i < len(text) and text[i] == " ": + i += 1 + else: + ann["text"] = [text] + + example["text_bound_annotations"].append(ann) + + elif line.startswith("E"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + + ann["type"], ann["trigger"] = fields[1].split()[0].split(":") + + ann["arguments"] = [] + for role_ref_id in fields[1].split()[1:]: + argument = { + "role": (role_ref_id.split(":"))[0], + "ref_id": (role_ref_id.split(":"))[1], + } + ann["arguments"].append(argument) + + example["events"].append(ann) + + elif line.startswith("R"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["type"] = fields[1].split()[0] + + ann["head"] = { + "role": fields[1].split()[1].split(":")[0], + "ref_id": fields[1].split()[1].split(":")[1], + } + ann["tail"] = { + "role": fields[1].split()[2].split(":")[0], + "ref_id": fields[1].split()[2].split(":")[1], + } + + example["relations"].append(ann) + + # '*' seems to be the legacy way to mark equivalences, + # but I couldn't find any info on the current way + # this might have to be adapted dependent on the brat version + # of the annotation + elif line.startswith("*"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["ref_ids"] = fields[1].split()[1:] + + example["equivalences"].append(ann) + + elif line.startswith("A") or line.startswith("M"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + + info = fields[1].split() + ann["type"] = info[0] + ann["ref_id"] = info[1] + + if len(info) > 2: + ann["value"] = info[2] + else: + ann["value"] = "" + + example["attributes"].append(ann) + + elif line.startswith("N"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["text"] = fields[2] + + info = fields[1].split() + + ann["type"] = info[0] + ann["ref_id"] = info[1] + ann["resource_name"] = info[2].split(":")[0] + ann["cuid"] = info[2].split(":")[1] + example["normalizations"].append(ann) + + elif parse_notes and line.startswith("#"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["text"] = fields[2] if len(fields) == 3 else BigBioValues.NULL + + info = fields[1].split() + + ann["type"] = info[0] + ann["ref_id"] = info[1] + example["notes"].append(ann) + + return example + + +def brat_parse_to_bigbio_kb(brat_parse: Dict) -> Dict: + """ + Transform a brat parse (conforming to the standard brat schema) obtained with + 
`parse_brat_file` into a dictionary conforming to the `bigbio-kb` schema (as defined in ../schemas/kb.py) + :param brat_parse: + """ + + unified_example = {} + + # Prefix all ids with document id to ensure global uniqueness, + # because brat ids are only unique within their document + id_prefix = brat_parse["document_id"] + "_" + + # identical + unified_example["document_id"] = brat_parse["document_id"] + unified_example["passages"] = [ + { + "id": id_prefix + "_text", + "type": "abstract", + "text": [brat_parse["text"]], + "offsets": [[0, len(brat_parse["text"])]], + } + ] + + # get normalizations + ref_id_to_normalizations = defaultdict(list) + for normalization in brat_parse["normalizations"]: + ref_id_to_normalizations[normalization["ref_id"]].append( + { + "db_name": normalization["resource_name"], + "db_id": normalization["cuid"], + } + ) + + # separate entities and event triggers + unified_example["events"] = [] + non_event_ann = brat_parse["text_bound_annotations"].copy() + for event in brat_parse["events"]: + event = event.copy() + event["id"] = id_prefix + event["id"] + trigger = next( + tr + for tr in brat_parse["text_bound_annotations"] + if tr["id"] == event["trigger"] + ) + if trigger in non_event_ann: + non_event_ann.remove(trigger) + event["trigger"] = { + "text": trigger["text"].copy(), + "offsets": trigger["offsets"].copy(), + } + for argument in event["arguments"]: + argument["ref_id"] = id_prefix + argument["ref_id"] + + unified_example["events"].append(event) + + unified_example["entities"] = [] + anno_ids = [ref_id["id"] for ref_id in non_event_ann] + for ann in non_event_ann: + entity_ann = ann.copy() + entity_ann["id"] = id_prefix + entity_ann["id"] + entity_ann["normalized"] = ref_id_to_normalizations[ann["id"]] + unified_example["entities"].append(entity_ann) + + # massage relations + unified_example["relations"] = [] + skipped_relations = set() + for ann in brat_parse["relations"]: + if ( + ann["head"]["ref_id"] not in anno_ids + or ann["tail"]["ref_id"] not in anno_ids + ): + skipped_relations.add(ann["id"]) + continue + unified_example["relations"].append( + { + "arg1_id": id_prefix + ann["head"]["ref_id"], + "arg2_id": id_prefix + ann["tail"]["ref_id"], + "id": id_prefix + ann["id"], + "type": ann["type"], + "normalized": [], + } + ) + if len(skipped_relations) > 0: + example_id = brat_parse["document_id"] + logger.info( + f"Example:{example_id}: The `bigbio_kb` schema allows `relations` only between entities." + f" Skip (for now): " + f"{list(skipped_relations)}" + ) + + # get coreferences + unified_example["coreferences"] = [] + for i, ann in enumerate(brat_parse["equivalences"], start=1): + is_entity_cluster = True + for ref_id in ann["ref_ids"]: + if not ref_id.startswith("T"): # not textbound -> no entity + is_entity_cluster = False + elif ref_id not in anno_ids: # event trigger -> no entity + is_entity_cluster = False + if is_entity_cluster: + entity_ids = [id_prefix + i for i in ann["ref_ids"]] + unified_example["coreferences"].append( + {"id": id_prefix + str(i), "entity_ids": entity_ids} + ) + return unified_example \ No newline at end of file diff --git a/biodatasets/cei/cei.py b/bigbio/hub/hub_repos/cei/cei.py similarity index 69% rename from biodatasets/cei/cei.py rename to bigbio/hub/hub_repos/cei/cei.py index fbf51a9c..ad110433 100644 --- a/biodatasets/cei/cei.py +++ b/bigbio/hub/hub_repos/cei/cei.py @@ -13,26 +13,25 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-""" -The Chemical Exposure Information (CEI) Corpus consists of 3661 PubMed publication abstracts manually annotated by experts according to a taxonomy. -The taxonomy consists of 32 classes in a hierarchy. Zero or more class labels are assigned to each sentence in the corpus. -The labels are found under the "labels" directory, while the tokenized text can be found under "text" directory. -The filenames are the corresponding PubMed IDs (PMID). -""" - -from pathlib import Path import re -from typing import List, Tuple, Dict +from pathlib import Path +from typing import Dict, List, Tuple import datasets -from utils import schemas -from utils.configs import BigBioConfig -from utils.constants import Tasks +from .bigbiohub import BigBioConfig, Tasks, text_features + +_DATASETNAME = "cei" +_DISPLAYNAME = "CEI" + +_LANGUAGES = ["English"] +_LOCAL = False +_PUBMED = True _CITATION = """\ @article{, - author = {Larsson, Kristin and Baker, Simon and Silins, Ilona and Guo, Yufan and Stenius, Ulla and Korhonen, Anna and Berglund, Marika}, + author = {Larsson, Kristin and Baker, Simon and Silins, Ilona and Guo, Yufan and Stenius, Ulla and Korhonen, \ + Anna and Berglund, Marika}, title = {Text mining for improved exposure assessment}, journal = {PloS one}, volume = {12}, @@ -44,18 +43,14 @@ } """ -_DATASETNAME = "cei" - _DESCRIPTION = """\ -The Chemical Exposure Information (CEI) Corpus consists of 3661 PubMed publication abstracts manually annotated by experts according to a taxonomy. -The taxonomy consists of 32 classes in a hierarchy. Zero or more class labels are assigned to each sentence in the corpus. -The labels are found under the "labels" directory, while the tokenized text can be found under "text" directory. -The filenames are the corresponding PubMed IDs (PMID). +The Chemical Exposure Information (CEI) Corpus consists of 3661 PubMed publication abstracts manually annotated by \ +experts according to a taxonomy. The taxonomy consists of 32 classes in a hierarchy. Zero or more class labels are \ +assigned to each sentence in the corpus. """ _HOMEPAGE = "https://github.com/sb895/chemical-exposure-information-corpus" - -_LICENSE = "GPL-3.0 License" +_LICENSE = "GPL_3p0" _URLS = { _DATASETNAME: "https://github.com/sb895/chemical-exposure-information-corpus/archive/refs/heads/master.zip", @@ -64,17 +59,16 @@ _SUPPORTED_TASKS = [Tasks.TEXT_CLASSIFICATION] _SOURCE_VERSION = "1.0.0" - _BIGBIO_VERSION = "1.0.0" -LABEL_REGEX = re.compile(r"[BE][a-z\-\ ]+") +LABEL_REGEX = re.compile(r"[Be][a-z\-\ ]+") class CieDataset(datasets.GeneratorBasedBuilder): - """The Chemical Exposure Information (CEI) Corpus consists of 3661 PubMed publication abstracts manually annotated by experts according to a taxonomy. - The taxonomy consists of 32 classes in a hierarchy. Zero or more class labels are assigned to each sentence in the corpus. - The labels are found under the "labels" directory, while the tokenized text can be found under "text" directory. - The filenames are the corresponding PubMed IDs (PMID).""" + """The Chemical Exposure Information (CEI) Corpus consists of 3661 PubMed publication abstracts manually annotated + by experts according to a taxonomy. The taxonomy consists of 32 classes in a hierarchy. Zero or more class labels + are assigned to each sentence in the corpus. The labels are found under the "labels" directory, while the tokenized + text can be found under "text" directory. 
The filenames are the corresponding PubMed IDs (PMID).""" SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) @@ -111,7 +105,9 @@ def _info(self) -> datasets.DatasetInfo: } ) elif self.config.schema == "bigbio_text": - features = schemas.text_features + features = text_features + else: + raise NotImplementedError(f"Schema {self.config.schema} not supported") return datasets.DatasetInfo( description=_DESCRIPTION, @@ -129,11 +125,8 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, - # Whatever you put in gen_kwargs will be passed to _generate_examples gen_kwargs={ - "base_dir": ( - Path(data_dir) / "chemical-exposure-information-corpus-master" - ), + "base_dir": (Path(data_dir) / "chemical-exposure-information-corpus-master"), "split": "train", }, ) @@ -142,7 +135,7 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: def _generate_examples(self, base_dir: Path, split: str) -> Tuple[int, Dict]: """Yields examples as (key, example) tuples.""" - text_files = list(base_dir.glob("./text/*.txt")) + text_files = sorted(list(base_dir.glob("./text/*.txt"))) if self.config.schema == "source": # TODO: yield (key, example) tuples in the original dataset schema @@ -150,19 +143,21 @@ def _generate_examples(self, base_dir: Path, split: str) -> Tuple[int, Dict]: key, example = self._read_example_from_file(text_file) yield key, example - elif self.config.schema == "bigbio_[bigbio_schema_name]": + elif self.config.schema == "bigbio_text": # TODO: yield (key, example) tuples in the bigbio schema for text_file in text_files: key, example = self._read_example_from_file_in_kb_schema(text_file) yield key, example def _read_example_from_file(self, text_file: Path) -> Tuple[str, Dict]: - label_file = text_file.parent.parent / "labels" / text_file.name with open(text_file, encoding="utf-8") as fp: text = fp.read().rstrip() + + label_file = text_file.parent.parent / "labels" / text_file.name with open(label_file, encoding="utf-8") as fp: label_text = fp.read() - labels = [l.strip(" -") for l in LABEL_REGEX.findall(label_text)] + + labels = [line.strip(" -") for line in LABEL_REGEX.findall(label_text)] key = text_file.name.rsplit(".", 1)[0] example = { "id": key, @@ -175,13 +170,5 @@ def _read_example_from_file(self, text_file: Path) -> Tuple[str, Dict]: def _read_example_from_file_in_kb_schema(self, text_file: Path) -> Tuple[str, Dict]: key, example = self._read_example_from_file(text_file) - example = { - k: v - for k, v in example.items() - if k in {"id", "document_id", "text", "labels"} - } + example = {k: v for k, v in example.items() if k in {"id", "document_id", "text", "labels"}} return key, example - - -if __name__ == "__main__": - datasets.load_dataset(__file__)
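
With the loader moved into the hub-repos layout, a minimal usage sketch follows. The `bigbio/cei` hub id and the need for `trust_remote_code` are assumptions; the config names (`cei_source`, `cei_bigbio_text`) and the single train split come from the script itself:

```
# Assumed hub id "bigbio/cei"; config names are taken from cei.py.
from datasets import load_dataset

# Source schema keeps the raw label_text next to the parsed labels.
ds_source = load_dataset("bigbio/cei", name="cei_source", trust_remote_code=True)

# BigBio text schema exposes only id, document_id, text and labels.
ds_bigbio = load_dataset("bigbio/cei", name="cei_bigbio_text", trust_remote_code=True)

print(ds_bigbio["train"][0]["labels"])  # the loader defines a single train split
```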