Skip to content

Commit

Permalink
compute seed file hash incrementally
Browse files Browse the repository at this point in the history
  • Loading branch information
noppaz committed Mar 6, 2023
1 parent b681908 commit 880b035
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 2 deletions.
6 changes: 6 additions & 0 deletions .changes/unreleased/Under the Hood-20230305-093644.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
kind: Under the Hood
body: Compute seed file hash incrementally
time: 2023-03-05T09:36:44.023758357Z
custom:
Author: noppaz
Issue: "7124"
14 changes: 14 additions & 0 deletions core/dbt/contracts/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from mashumaro.types import SerializableType
from typing import List, Optional, Union, Dict, Any

from dbt.clients.system import convert_path
from dbt.constants import MAXIMUM_SEED_SIZE
from dbt.dataclass_schema import dbtClassMixin, StrEnum

Expand Down Expand Up @@ -109,6 +110,19 @@ def from_contents(cls, contents: str, name="sha256") -> "FileHash":
checksum = hashlib.new(name, data).hexdigest()
return cls(name=name, checksum=checksum)

@classmethod
def from_path(cls, path: str, name="sha256") -> "FileHash":
    """Build a file hash by digesting the file at the given path.

    The file is opened in binary mode and fed to the hash object one
    fixed-size chunk at a time, rather than being read in a single call.

    :param path: location of the file to hash; it is passed through
        ``convert_path`` before being opened.
    :param name: algorithm name handed to ``hashlib.new``.
    :return: an instance carrying ``name`` and the hex digest.
    """
    target = convert_path(path)
    bytes_per_read = 1 * 1024 * 1024
    digest = hashlib.new(name)
    with open(target, "rb") as stream:
        # read() returns b"" at end of file, which is the sentinel that
        # stops the iteration.
        for block in iter(lambda: stream.read(bytes_per_read), b""):
            digest.update(block)
    return cls(name=name, checksum=digest.hexdigest())


@dataclass
class RemoteFile(dbtClassMixin):
Expand Down
3 changes: 1 addition & 2 deletions core/dbt/parser/read_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,7 @@ def load_seed_source_file(match: FilePath, project_name) -> SourceFile:
# We don't want to calculate a hash of this file. Use the path.
source_file = SourceFile.big_seed(match)
else:
file_contents = load_file_contents(match.absolute_path, strip=False)
checksum = FileHash.from_contents(file_contents)
checksum = FileHash.from_path(match.absolute_path)
source_file = SourceFile(path=match, checksum=checksum)
source_file.contents = ""
source_file.parse_file_type = ParseFileType.Seed
Expand Down

0 comments on commit 880b035

Please sign in to comment.