
Getting error when running the notebook #1

Open
PrashantSaikia opened this issue Jul 13, 2023 · 0 comments

Here's the stacktrace:

Downloading and preparing dataset slither-audited-smart-contracts/big-multilabel (download: 1.63 GiB, generated: 4.39 GiB, post-processed: Unknown size, total: 6.01 GiB) to C:/Users/username/.cache/huggingface/datasets/mwritescode___slither-audited-smart-contracts/big-multilabel/1.1.0/4cf503b59ce9d3157914e47f6253de773b7ab828f46642685d4b470b88ca1f13...
Downloading data files: 100%|███████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 498.91it/s]
Extracting data files: 100%|████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 248.87it/s]
                                                                                                                       
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
File ~\anaconda3\lib\site-packages\datasets\builder.py:1597, in GeneratorBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, split_info, check_duplicate_keys, job_id)
   1596 try:
-> 1597     writer = writer_class(
   1598         features=self.info.features,
   1599         path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
   1600         writer_batch_size=self._writer_batch_size,
   1601         hash_salt=split_info.name,
   1602         check_duplicates=check_duplicate_keys,
   1603         storage_options=self._fs.storage_options,
   1604         embed_local_files=embed_local_files,
   1605     )
   1606     try:

File ~\anaconda3\lib\site-packages\datasets\arrow_writer.py:334, in ArrowWriter.__init__(self, schema, features, path, stream, fingerprint, writer_batch_size, hash_salt, check_duplicates, disable_nullable, update_features, with_metadata, unit, embed_local_files, storage_options)
    329 self._path = (
    330     fs_token_paths[2][0]
    331     if not is_remote_filesystem(self._fs)
    332     else self._fs.unstrip_protocol(fs_token_paths[2][0])
    333 )
--> 334 self.stream = self._fs.open(fs_token_paths[2][0], "wb")
    335 self._closable_stream = True

File ~\anaconda3\lib\site-packages\fsspec\spec.py:1199, in AbstractFileSystem.open(self, path, mode, block_size, cache_options, compression, **kwargs)
   1198 ac = kwargs.pop("autocommit", not self._intrans)
-> 1199 f = self._open(
   1200     path,
   1201     mode=mode,
   1202     block_size=block_size,
   1203     autocommit=ac,
   1204     cache_options=cache_options,
   1205     **kwargs,
   1206 )
   1207 if compression is not None:

File ~\anaconda3\lib\site-packages\fsspec\implementations\local.py:183, in LocalFileSystem._open(self, path, mode, block_size, **kwargs)
    182     self.makedirs(self._parent(path), exist_ok=True)
--> 183 return LocalFileOpener(path, mode, fs=self, **kwargs)

File ~\anaconda3\lib\site-packages\fsspec\implementations\local.py:314, in LocalFileOpener.__init__(self, path, mode, autocommit, fs, compression, **kwargs)
    313 self.blocksize = io.DEFAULT_BUFFER_SIZE
--> 314 self._open()

File ~\anaconda3\lib\site-packages\fsspec\implementations\local.py:319, in LocalFileOpener._open(self)
    318 if self.autocommit or "w" not in self.mode:
--> 319     self.f = open(self.path, mode=self.mode)
    320     if self.compression:

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/username/.cache/huggingface/datasets/mwritescode___slither-audited-smart-contracts/big-multilabel/1.1.0/4cf503b59ce9d3157914e47f6253de773b7ab828f46642685d4b470b88ca1f13.incomplete/slither-audited-smart-contracts-validation-00000-00000-of-NNNNN.arrow'

The above exception was the direct cause of the following exception:

DatasetGenerationError                    Traceback (most recent call last)
Cell In[2], line 11
      6 from datasets import load_dataset
      8 # Due to a bug in the HuggingFace dataset, at the moment two file checksums do not correspond to what
      9 # is in the dataset metadata, thus we have to load the data splits with the flag ignore_verification
     10 # set to true
---> 11 train_set = load_dataset("mwritescode/slither-audited-smart-contracts", 'big-multilabel', split='train', ignore_verifications=True)
     12 test_set = load_dataset("mwritescode/slither-audited-smart-contracts", 'big-multilabel', split='test', ignore_verifications=True)
     13 val_set = load_dataset("mwritescode/slither-audited-smart-contracts", 'big-multilabel', split='validation', ignore_verifications=True)

File ~\anaconda3\lib\site-packages\datasets\load.py:1797, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs)
   1794 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
   1796 # Download and prepare data
-> 1797 builder_instance.download_and_prepare(
   1798     download_config=download_config,
   1799     download_mode=download_mode,
   1800     verification_mode=verification_mode,
   1801     try_from_hf_gcs=try_from_hf_gcs,
   1802     num_proc=num_proc,
   1803     storage_options=storage_options,
   1804 )
   1806 # Build dataset for splits
   1807 keep_in_memory = (
   1808     keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
   1809 )

File ~\anaconda3\lib\site-packages\datasets\builder.py:890, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
    888     if num_proc is not None:
    889         prepare_split_kwargs["num_proc"] = num_proc
--> 890     self._download_and_prepare(
    891         dl_manager=dl_manager,
    892         verification_mode=verification_mode,
    893         **prepare_split_kwargs,
    894         **download_and_prepare_kwargs,
    895     )
    896 # Sync info
    897 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())

File ~\anaconda3\lib\site-packages\datasets\builder.py:1649, in GeneratorBasedBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_kwargs)
   1648 def _download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_kwargs):
-> 1649     super()._download_and_prepare(
   1650         dl_manager,
   1651         verification_mode,
   1652         check_duplicate_keys=verification_mode == VerificationMode.BASIC_CHECKS
   1653         or verification_mode == VerificationMode.ALL_CHECKS,
   1654         **prepare_splits_kwargs,
   1655     )

File ~\anaconda3\lib\site-packages\datasets\builder.py:985, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
    981 split_dict.add(split_generator.split_info)
    983 try:
    984     # Prepare split will record examples associated to the split
--> 985     self._prepare_split(split_generator, **prepare_split_kwargs)
    986 except OSError as e:
    987     raise OSError(
    988         "Cannot find data file. "
    989         + (self.manual_download_instructions or "")
    990         + "\nOriginal error:\n"
    991         + str(e)
    992     ) from None

File ~\anaconda3\lib\site-packages\datasets\builder.py:1487, in GeneratorBasedBuilder._prepare_split(self, split_generator, check_duplicate_keys, file_format, num_proc, max_shard_size)
   1485 job_id = 0
   1486 with pbar:
-> 1487     for job_id, done, content in self._prepare_split_single(
   1488         gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
   1489     ):
   1490         if done:
   1491             result = content

File ~\anaconda3\lib\site-packages\datasets\builder.py:1644, in GeneratorBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, split_info, check_duplicate_keys, job_id)
   1642     if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
   1643         e = e.__context__
-> 1644     raise DatasetGenerationError("An error occurred while generating the dataset") from e
   1646 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

DatasetGenerationError: An error occurred while generating the dataset

Is there an updated process to generate the dataset?
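In case it helps to reproduce, the failing cell boils down to the snippet below. The dataset and config names come straight from the notebook; the verification_mode variant is only a guess based on the load_dataset signature visible in the trace (newer datasets releases deprecate ignore_verifications), not something the notebook itself uses.

# Minimal reproduction of the failing cell, assuming datasets 2.x.
from datasets import load_dataset

DATASET = "mwritescode/slither-audited-smart-contracts"
CONFIG = "big-multilabel"

# Original call from the notebook (fails with DatasetGenerationError as above):
train_set = load_dataset(DATASET, CONFIG, split="train", ignore_verifications=True)

# Possible alternative on newer datasets versions where ignore_verifications is
# deprecated in favor of verification_mode (untested assumption):
# train_set = load_dataset(DATASET, CONFIG, split="train", verification_mode="no_checks")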
