
Getting error when running the notebook #1

Open
PrashantSaikia opened this issue Jul 13, 2023 · 0 comments

Here's the stacktrace:

Downloading and preparing dataset slither-audited-smart-contracts/big-multilabel (download: 1.63 GiB, generated: 4.39 GiB, post-processed: Unknown size, total: 6.01 GiB) to C:/Users/username/.cache/huggingface/datasets/mwritescode___slither-audited-smart-contracts/big-multilabel/1.1.0/4cf503b59ce9d3157914e47f6253de773b7ab828f46642685d4b470b88ca1f13...
Downloading data files: 100%|███████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 498.91it/s]
Extracting data files: 100%|████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 248.87it/s]
                                                                                                                       
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
File ~\anaconda3\lib\site-packages\datasets\builder.py:1597, in GeneratorBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, split_info, check_duplicate_keys, job_id)
   1596 try:
-> 1597     writer = writer_class(
   1598         features=self.info.features,
   1599         path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
   1600         writer_batch_size=self._writer_batch_size,
   1601         hash_salt=split_info.name,
   1602         check_duplicates=check_duplicate_keys,
   1603         storage_options=self._fs.storage_options,
   1604         embed_local_files=embed_local_files,
   1605     )
   1606     try:

File ~\anaconda3\lib\site-packages\datasets\arrow_writer.py:334, in ArrowWriter.__init__(self, schema, features, path, stream, fingerprint, writer_batch_size, hash_salt, check_duplicates, disable_nullable, update_features, with_metadata, unit, embed_local_files, storage_options)
    329 self._path = (
    330     fs_token_paths[2][0]
    331     if not is_remote_filesystem(self._fs)
    332     else self._fs.unstrip_protocol(fs_token_paths[2][0])
    333 )
--> 334 self.stream = self._fs.open(fs_token_paths[2][0], "wb")
    335 self._closable_stream = True

File ~\anaconda3\lib\site-packages\fsspec\spec.py:1199, in AbstractFileSystem.open(self, path, mode, block_size, cache_options, compression, **kwargs)
   1198 ac = kwargs.pop("autocommit", not self._intrans)
-> 1199 f = self._open(
   1200     path,
   1201     mode=mode,
   1202     block_size=block_size,
   1203     autocommit=ac,
   1204     cache_options=cache_options,
   1205     **kwargs,
   1206 )
   1207 if compression is not None:

File ~\anaconda3\lib\site-packages\fsspec\implementations\local.py:183, in LocalFileSystem._open(self, path, mode, block_size, **kwargs)
    182     self.makedirs(self._parent(path), exist_ok=True)
--> 183 return LocalFileOpener(path, mode, fs=self, **kwargs)

File ~\anaconda3\lib\site-packages\fsspec\implementations\local.py:314, in LocalFileOpener.__init__(self, path, mode, autocommit, fs, compression, **kwargs)
    313 self.blocksize = io.DEFAULT_BUFFER_SIZE
--> 314 self._open()

File ~\anaconda3\lib\site-packages\fsspec\implementations\local.py:319, in LocalFileOpener._open(self)
    318 if self.autocommit or "w" not in self.mode:
--> 319     self.f = open(self.path, mode=self.mode)
    320     if self.compression:

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/username/.cache/huggingface/datasets/mwritescode___slither-audited-smart-contracts/big-multilabel/1.1.0/4cf503b59ce9d3157914e47f6253de773b7ab828f46642685d4b470b88ca1f13.incomplete/slither-audited-smart-contracts-validation-00000-00000-of-NNNNN.arrow'

The above exception was the direct cause of the following exception:

DatasetGenerationError                    Traceback (most recent call last)
Cell In[2], line 11
      6 from datasets import load_dataset
      8 # Due to a bug in the HuggingFace dataset, at the moment two file checksums do not correspond to what
      9 # is in the dataset metadata, thus we have to load the data splits with the flag ignore_verification
     10 # set to true
---> 11 train_set = load_dataset("mwritescode/slither-audited-smart-contracts", 'big-multilabel', split='train', ignore_verifications=True)
     12 test_set = load_dataset("mwritescode/slither-audited-smart-contracts", 'big-multilabel', split='test', ignore_verifications=True)
     13 val_set = load_dataset("mwritescode/slither-audited-smart-contracts", 'big-multilabel', split='validation', ignore_verifications=True)

File ~\anaconda3\lib\site-packages\datasets\load.py:1797, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs)
   1794 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
   1796 # Download and prepare data
-> 1797 builder_instance.download_and_prepare(
   1798     download_config=download_config,
   1799     download_mode=download_mode,
   1800     verification_mode=verification_mode,
   1801     try_from_hf_gcs=try_from_hf_gcs,
   1802     num_proc=num_proc,
   1803     storage_options=storage_options,
   1804 )
   1806 # Build dataset for splits
   1807 keep_in_memory = (
   1808     keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
   1809 )

File ~\anaconda3\lib\site-packages\datasets\builder.py:890, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
    888     if num_proc is not None:
    889         prepare_split_kwargs["num_proc"] = num_proc
--> 890     self._download_and_prepare(
    891         dl_manager=dl_manager,
    892         verification_mode=verification_mode,
    893         **prepare_split_kwargs,
    894         **download_and_prepare_kwargs,
    895     )
    896 # Sync info
    897 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())

File ~\anaconda3\lib\site-packages\datasets\builder.py:1649, in GeneratorBasedBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_kwargs)
   1648 def _download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_kwargs):
-> 1649     super()._download_and_prepare(
   1650         dl_manager,
   1651         verification_mode,
   1652         check_duplicate_keys=verification_mode == VerificationMode.BASIC_CHECKS
   1653         or verification_mode == VerificationMode.ALL_CHECKS,
   1654         **prepare_splits_kwargs,
   1655     )

File ~\anaconda3\lib\site-packages\datasets\builder.py:985, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
    981 split_dict.add(split_generator.split_info)
    983 try:
    984     # Prepare split will record examples associated to the split
--> 985     self._prepare_split(split_generator, **prepare_split_kwargs)
    986 except OSError as e:
    987     raise OSError(
    988         "Cannot find data file. "
    989         + (self.manual_download_instructions or "")
    990         + "\nOriginal error:\n"
    991         + str(e)
    992     ) from None

File ~\anaconda3\lib\site-packages\datasets\builder.py:1487, in GeneratorBasedBuilder._prepare_split(self, split_generator, check_duplicate_keys, file_format, num_proc, max_shard_size)
   1485 job_id = 0
   1486 with pbar:
-> 1487     for job_id, done, content in self._prepare_split_single(
   1488         gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
   1489     ):
   1490         if done:
   1491             result = content

File ~\anaconda3\lib\site-packages\datasets\builder.py:1644, in GeneratorBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, split_info, check_duplicate_keys, job_id)
   1642     if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
   1643         e = e.__context__
-> 1644     raise DatasetGenerationError("An error occurred while generating the dataset") from e
   1646 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

DatasetGenerationError: An error occurred while generating the dataset

Is there an updated process to generate the dataset?
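In case it helps to reproduce, the failing cell boils down to the snippet below. The dataset and config names come straight from the notebook; the verification_mode variant is only a guess based on the load_dataset signature visible in the trace (newer datasets releases deprecate ignore_verifications), not something the notebook itself uses.

# Minimal reproduction of the failing cell, assuming datasets 2.x.
from datasets import load_dataset

DATASET = "mwritescode/slither-audited-smart-contracts"
CONFIG = "big-multilabel"

# Original call from the notebook (fails with DatasetGenerationError as above):
train_set = load_dataset(DATASET, CONFIG, split="train", ignore_verifications=True)

# Possible alternative on newer datasets versions where ignore_verifications is
# deprecated in favor of verification_mode (untested assumption):
# train_set = load_dataset(DATASET, CONFIG, split="train", verification_mode="no_checks")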
