Skip to content

Commit

Permalink
bring hashfile from dvc-objects
Browse files Browse the repository at this point in the history
  • Loading branch information
efiop committed Jun 14, 2022
1 parent 625823b commit 5565291
Show file tree
Hide file tree
Showing 37 changed files with 1,203 additions and 34 deletions.
3 changes: 3 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ repos:
rev: v2.1.0
hooks:
- id: codespell
args:
- --ignore-words-list
- fo
- repo: https://github.com/asottile/pyupgrade
rev: v2.31.0
hooks:
Expand Down
3 changes: 1 addition & 2 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

nox.options.reuse_existing_virtualenvs = True
nox.options.sessions = "lint", "tests"
locations = "src", "tests"


@nox.session(python=["3.8", "3.9", "3.10"])
Expand All @@ -29,7 +28,7 @@ def lint(session: nox.Session) -> None:
args = *(session.posargs or ("--show-diff-on-failure",)), "--all-files"
session.run("pre-commit", "run", *args)
session.run("python", "-m", "mypy")
session.run("python", "-m", "pylint", *locations)
session.run("python", "-m", "pylint", "src")


@nox.session
Expand Down
10 changes: 9 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ install_requires=
dictdiffer>=0.8.1
pygtrie>=2.3.2
shortuuid>=0.5.0
dvc-objects==0.0.5
dvc-objects==0.0.6
diskcache>=5.2.1
nanotime>=0.5.2

[options.extras_require]
tests =
Expand All @@ -37,6 +39,12 @@ tests =
pytest-mock==3.7.0
pylint==2.14.1
mypy==0.961
moto[server]==3.1.13
requests==2.28.0
types-requests==2.27.30
universal-pathlib==0.0.18
s3fs[boto3]>=2022.02.0
aiobotocore[boto3]>2
dev =
%(tests)s

Expand Down
3 changes: 2 additions & 1 deletion src/dvc_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
if TYPE_CHECKING:
from dvc_objects.db import ObjectDB
from dvc_objects.file import HashFile
from dvc_objects.hashfile.hash_info import HashInfo

from .hashfile.hash_info import HashInfo

logger = logging.getLogger(__name__)

Expand Down
2 changes: 1 addition & 1 deletion src/dvc_data/db/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@

def get_odb(fs, path, **config):
from dvc_objects.fs import Schemes
from dvc_objects.hashfile.db import HashFileDB

from ..hashfile.db import HashFileDB
from .local import LocalHashFileDB

if fs.protocol == Schemes.LOCAL:
Expand Down
7 changes: 4 additions & 3 deletions src/dvc_data/db/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ def __init__(
): # pylint: disable=super-init-not-called
from dvc_objects.fs import LocalFileSystem
from dvc_objects.fs.utils import makedirs
from dvc_objects.hashfile.cache import Cache, Index

from ..hashfile.cache import Cache, Index

self.index_dir = os.path.join(tmp_dir, self.INDEX_DIR, name)
makedirs(self.index_dir, exist_ok=True)
Expand All @@ -110,7 +111,7 @@ def dir_hashes(self):

def clear(self):
"""Clear this index (to force re-indexing later)."""
from dvc_objects.hashfile.cache import Timeout
from ..hashfile.cache import Timeout

try:
self.index.clear()
Expand All @@ -119,7 +120,7 @@ def clear(self):

def update(self, dir_hashes: Iterable[str], file_hashes: Iterable[str]):
"""Update this index, adding the specified hashes."""
from dvc_objects.hashfile.cache import Timeout
from ..hashfile.cache import Timeout

try:
with self.index.transact():
Expand Down
3 changes: 2 additions & 1 deletion src/dvc_data/db/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
from dvc_objects.errors import ObjectDBError, ObjectFormatError
from dvc_objects.fs.system import umask
from dvc_objects.fs.utils import copyfile, relpath, remove
from dvc_objects.hashfile.db import HashFileDB
from funcy import cached_property
from shortuuid import uuid

from ..hashfile.db import HashFileDB

logger = logging.getLogger(__name__)


Expand Down
22 changes: 17 additions & 5 deletions src/dvc_data/db/reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@

from dvc_objects.db import ObjectDB
from dvc_objects.errors import ObjectFormatError
from dvc_objects.hashfile.db import HashFileDB, HashInfo
from dvc_objects.hashfile.hash import hash_file
from dvc_objects.hashfile.obj import HashFile

from ..hashfile.db import HashFileDB, HashInfo
from ..hashfile.hash import hash_file
from ..hashfile.obj import HashFile
from ..objects.reference import ReferenceObject

if TYPE_CHECKING:
from dvc_objects.fs.base import AnyFSPath, FileSystem
from dvc_objects.fs.callbacks import Callback

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -61,18 +62,29 @@ def add(
path: "AnyFSPath",
fs: "FileSystem",
oid: str,
hardlink: bool = False,
callback: "Callback" = None,
**kwargs,
): # pylint: disable=arguments-differ
hash_info = HashInfo(self.hash_name, oid)
if hash_info.isdir:
return self.raw.add(path, fs, oid, **kwargs)
return self.raw.add(
path, fs, oid, hardlink=hardlink, callback=callback, **kwargs
)

obj = ReferenceObject.from_path(
path, fs, hash_info, fs_cache=self._fs_cache
)
self._obj_cache[hash_info] = self._deref(obj)

return self.raw.add(obj.path, obj.fs, oid, **kwargs)
return self.raw.add(
obj.path,
obj.fs,
oid,
hardlink=hardlink,
callback=callback,
**kwargs,
)

def check(
self,
Expand Down
3 changes: 2 additions & 1 deletion src/dvc_data/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@

if TYPE_CHECKING:
from dvc_objects.file import HashFile
from dvc_objects.hashfile.hash_info import HashInfo

from .hashfile.hash_info import HashInfo

ADD = "add"
MODIFY = "modify"
Expand Down
2 changes: 1 addition & 1 deletion src/dvc_data/gc.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def gc(odb, used, jobs=None, cache_odb=None, shallow=True):
)

def _is_dir_hash(_hash):
from dvc_objects.hashfile.hash_info import HASH_DIR_SUFFIX
from .hashfile.hash_info import HASH_DIR_SUFFIX

return _hash.endswith(HASH_DIR_SUFFIX)

Expand Down
Binary file added src/dvc_data/hashfile/.state.py.swp
Binary file not shown.
Empty file.
15 changes: 15 additions & 0 deletions src/dvc_data/hashfile/_ignore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from typing import TYPE_CHECKING, Iterator

from typing_extensions import Protocol

if TYPE_CHECKING:
from .fs.base import AnyFSPath, FileSystem

# pylint: disable=unused-argument


class Ignore(Protocol):
def find(
self, fs: "FileSystem", path: "AnyFSPath"
) -> Iterator["AnyFSPath"]:
...
65 changes: 65 additions & 0 deletions src/dvc_data/hashfile/cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import os
import pickle
from functools import wraps
from typing import Any

import diskcache
from diskcache import Disk as disk
from diskcache import Index # noqa: F401, pylint: disable=unused-import
from diskcache import Timeout # noqa: F401, pylint: disable=unused-import

# pylint: disable=redefined-builtin


class DiskError(Exception):
def __init__(self, directory: str, type: str) -> None:
self.directory = directory
self.type = type
super().__init__(f"Could not open disk '{type}' in {directory}")


def translate_pickle_error(fn):
@wraps(fn)
def wrapped(self, *args, **kwargs):
try:
return fn(self, *args, **kwargs)
except (pickle.PickleError, ValueError) as e:
if isinstance(e, ValueError) and "pickle protocol" not in str(e):
raise
# pylint: disable=protected-access
raise DiskError(self._directory, type=self._type) from e

return wrapped


class Disk(disk):
"""Reraise pickle-related errors as DiskError."""

# we need type to differentiate cache for better error messages
_type: str

put = translate_pickle_error(disk.put)
get = translate_pickle_error(disk.get)
store = translate_pickle_error(disk.store)
fetch = translate_pickle_error(disk.fetch)


class Cache(diskcache.Cache):
"""Extended to handle pickle errors and use a constant pickle protocol."""

def __init__(
self,
directory: str = None,
timeout: int = 60,
disk: disk = Disk, # pylint: disable=redefined-outer-name
type: str = None,
**settings: Any,
) -> None:
settings.setdefault("disk_pickle_protocol", 4)
super().__init__(
directory=directory, timeout=timeout, disk=disk, **settings
)
self.disk._type = self._type = type or os.path.basename(self.directory)

def __getstate__(self):
return (*super().__getstate__(), self._type)
Loading

0 comments on commit 5565291

Please sign in to comment.