Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make MAXIMUM_SEED_SIZE_MIB configurable #11177

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changes/unreleased/Features-20230307-134838.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
kind: Features
body: Make MAXIMUM_SEED_SIZE_MIB configurable
time: 2023-03-07T13:48:38.792321024Z
custom:
Author: noppaz acurtis-evi
Issue: 7117 7124
22 changes: 22 additions & 0 deletions core/dbt/artifacts/resources/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import List, Optional

from dbt.artifacts.resources.types import NodeType
from dbt_common.clients.system import convert_path
from dbt_common.dataclass_schema import dbtClassMixin


Expand Down Expand Up @@ -60,6 +61,27 @@
checksum = hashlib.new(name, data).hexdigest()
return cls(name=name, checksum=checksum)

@classmethod
def from_path(cls, path: str, name="sha256") -> "FileHash":
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Non-breaking change in the dbt/artifacts/ directory, so I will add the artifact_minor_upgrade label to this PR

"""Create a file hash from the file at given path. The hash is always the
utf-8 encoding of the contents which is stripped to give similar hashes
as `FileHash.from_contents`.
"""
path = convert_path(path)
chunk_size = 1 * 1024 * 1024
file_hash = hashlib.new(name)
with open(path, "r") as handle:

Check warning on line 73 in core/dbt/artifacts/resources/base.py

View check run for this annotation

Codecov / codecov/patch

core/dbt/artifacts/resources/base.py#L70-L73

Added lines #L70 - L73 were not covered by tests
# Left and rightstrip start and end of contents to give identical
# results as the seed hashing implementation with from_contents
chunk = handle.read(chunk_size).lstrip()
while chunk:
next_chunk = handle.read(chunk_size)
if not next_chunk:
chunk = chunk.rstrip()
file_hash.update(chunk.encode("utf-8"))
chunk = next_chunk
return cls(name=name, checksum=file_hash.hexdigest())

Check warning on line 83 in core/dbt/artifacts/resources/base.py

View check run for this annotation

Codecov / codecov/patch

core/dbt/artifacts/resources/base.py#L76-L83

Added lines #L76 - L83 were not covered by tests


@dataclass
class Docs(dbtClassMixin):
Expand Down
1 change: 1 addition & 0 deletions core/dbt/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ def global_flags(func):
@p.log_level_file
@p.log_path
@p.macro_debugging
@p.maximum_seed_size_mib
@p.partial_parse
@p.partial_parse_file_path
@p.partial_parse_file_diff
Expand Down
8 changes: 8 additions & 0 deletions core/dbt/cli/params.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,14 @@
default="eager",
)

maximum_seed_size_mib = click.option(
"--maximum-seed-size-mib",
envvar="DBT_MAXIMUM_SEED_SIZE_MIB",
help="Specify max size (MiB) for seed files that will be hashed for state comparison.",
type=click.INT,
default=1,
)

lock = click.option(
"--lock",
envvar=None,
Expand Down
3 changes: 0 additions & 3 deletions core/dbt/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@

SECRET_PLACEHOLDER = "$$$DBT_SECRET_START$$${}$$$DBT_SECRET_END$$$"

MAXIMUM_SEED_SIZE = 1 * 1024 * 1024
MAXIMUM_SEED_SIZE_NAME = "1MB"

PIN_PACKAGE_URL = (
"https://docs.getdbt.com/docs/package-management#section-specifying-package-versions"
)
Expand Down
6 changes: 2 additions & 4 deletions core/dbt/contracts/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from mashumaro.types import SerializableType

from dbt.artifacts.resources.base import FileHash
from dbt.constants import MAXIMUM_SEED_SIZE
from dbt_common.dataclass_schema import StrEnum, dbtClassMixin

from .util import SourceKey
Expand Down Expand Up @@ -65,9 +64,8 @@
def original_file_path(self) -> str:
return os.path.join(self.searched_path, self.relative_path)

def seed_too_large(self) -> bool:
"""Return whether the file this represents is over the seed size limit"""
return os.stat(self.full_path).st_size > MAXIMUM_SEED_SIZE
def file_size(self) -> int:
return os.stat(self.full_path).st_size

Check warning on line 68 in core/dbt/contracts/files.py

View check run for this annotation

Codecov / codecov/patch

core/dbt/contracts/files.py#L68

Added line #L68 was not covered by tests


@dataclass
Expand Down
1 change: 1 addition & 0 deletions core/dbt/contracts/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,7 @@ class ProjectFlags(ExtensibleDbtClassMixin):
log_format_file: Optional[str] = None
log_level: Optional[str] = None
log_level_file: Optional[str] = None
maximum_seed_size_mib: Optional[int] = None
partial_parse: Optional[bool] = None
populate_cache: Optional[bool] = None
printer_width: Optional[int] = None
Expand Down
17 changes: 11 additions & 6 deletions core/dbt/events/types.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import json

from dbt.constants import MAXIMUM_SEED_SIZE_NAME, PIN_PACKAGE_URL
from dbt.constants import PIN_PACKAGE_URL
from dbt.events.base_types import (
DebugLevel,
DynamicLevel,
ErrorLevel,
InfoLevel,
WarnLevel,
)
from dbt.flags import get_flags
from dbt_common.events.base_types import EventLevel
from dbt_common.events.format import (
format_fancy_output_line,
Expand Down Expand Up @@ -675,10 +676,11 @@ def code(self) -> str:
return "I052"

def message(self) -> str:
maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB"
msg = (
f"Found a seed ({self.package_name}.{self.name}) "
f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file was "
f"<={MAXIMUM_SEED_SIZE_NAME}, so it has changed"
f">{maximum_seed_size_name} in size. The previous file was "
f"<={maximum_seed_size_name}, so it has changed"
)
return msg

Expand All @@ -688,9 +690,10 @@ def code(self) -> str:
return "I053"

def message(self) -> str:
maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB"
msg = (
f"Found a seed ({self.package_name}.{self.name}) "
f">{MAXIMUM_SEED_SIZE_NAME} in size at the same path, dbt "
f">{maximum_seed_size_name} in size at the same path, dbt "
f"cannot tell if it has changed: assuming they are the same"
)
return msg
Expand All @@ -701,9 +704,10 @@ def code(self) -> str:
return "I054"

def message(self) -> str:
maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB"
msg = (
f"Found a seed ({self.package_name}.{self.name}) "
f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file was in "
f">{maximum_seed_size_name} in size. The previous file was in "
f"a different location, assuming it has changed"
)
return msg
Expand All @@ -714,9 +718,10 @@ def code(self) -> str:
return "I055"

def message(self) -> str:
maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB"
msg = (
f"Found a seed ({self.package_name}.{self.name}) "
f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file had a "
f">{maximum_seed_size_name} in size. The previous file had a "
f"checksum type of {self.checksum_name}, so it has changed"
)
return msg
Expand Down
1 change: 1 addition & 0 deletions core/dbt/flags.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def get_flag_dict():
"log_path",
"invocation_command",
"empty",
"maximum_seed_size_mib",
}
return {key: getattr(GLOBAL_FLAGS, key.upper(), None) for key in flag_attr}

Expand Down
9 changes: 6 additions & 3 deletions core/dbt/parser/read_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
)
from dbt.events.types import InputFileDiffError
from dbt.exceptions import ParsingError
from dbt.flags import get_flags
from dbt.parser.common import schema_file_keys
from dbt.parser.schemas import yaml_from_file
from dbt.parser.search import filesystem_search
Expand Down Expand Up @@ -123,12 +124,14 @@

# Special processing for big seed files
def load_seed_source_file(match: FilePath, project_name) -> SourceFile:
if match.seed_too_large():
# Users can configure the maximum seed size (MiB) that will be hashed for state comparison
maximum_seed_size = get_flags().MAXIMUM_SEED_SIZE_MIB * 1024 * 1024

Check warning on line 128 in core/dbt/parser/read_files.py

View check run for this annotation

Codecov / codecov/patch

core/dbt/parser/read_files.py#L128

Added line #L128 was not covered by tests
# maximum_seed_size = 0 means no limit
if match.file_size() > maximum_seed_size and maximum_seed_size != 0:

Check warning on line 130 in core/dbt/parser/read_files.py

View check run for this annotation

Codecov / codecov/patch

core/dbt/parser/read_files.py#L130

Added line #L130 was not covered by tests
# We don't want to calculate a hash of this file. Use the path.
source_file = SourceFile.big_seed(match)
else:
file_contents = load_file_contents(match.absolute_path, strip=True)
checksum = FileHash.from_contents(file_contents)
checksum = FileHash.from_path(match.absolute_path)

Check warning on line 134 in core/dbt/parser/read_files.py

View check run for this annotation

Codecov / codecov/patch

core/dbt/parser/read_files.py#L134

Added line #L134 was not covered by tests
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have confirmed that this is not a "breaking" change, insofar as the same seed produces the same checksum before and after this change.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Update: The failing test on Windows seems to indicate that the seed checksum does indeed change, as a result of this PR, but only on Windows. The contributor mentioned this code comment as indication that we expect actually different checksums on Windows versus MacOS / Linux.

In that test, the checksum for 'seed.test.seed':

  • on MacOS + Linux (before/after): '381eb2f...'
  • on Windows (before): '54a28a3...'
  • on Windows (after): '381eb2f...'

I think our options are:

  1. Restore the behavior to return actually different checksums on different systems. That might be easiest, but I agree it's more desirable to return the same checksum regardless of OS.
  2. Place this behind a behavior-change flag, which we very quickly flip to "True" by default (in the next minor version of dbt-core), with targeted advisory for users on Windows: "When this change rolls out, all your seeds will be detected as modified, if comparing against a manifest produced before this code change."
  3. Roll this out in the next minor version of dbt-core, without a behavior-change flag. It won't affect the vast majority of users (who are not running on Windows). We mention it in the upgrade guide.

I lean toward option (2) for thoroughness.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder weather we want to just do a logic of:

  • if we find the hash is different on Windows(maybe also if the manifest was computed with a previous version of dbt), compute with the other method to see if it changed, if not, we return not changed but persist the new hash in artifact.

This should provide a seamless upgrade experience and we don't need to have a flag at all.

source_file = SourceFile(path=match, checksum=checksum)
source_file.contents = ""
source_file.parse_file_type = ParseFileType.Seed
Expand Down
2 changes: 1 addition & 1 deletion tests/functional/defer_state/test_modified_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ def test_changed_seed_contents_state(self, project):
"./state",
]
)
assert ">1MB" in str(exc.value)
assert ">1MiB" in str(exc.value)

# now check if unmodified returns none
results = run_dbt(
Expand Down
8 changes: 4 additions & 4 deletions tests/unit/graph/test_selector_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -780,7 +780,7 @@ def test_select_state_changed_seed_checksum_path_to_path(manifest, previous_stat
event = warn_or_error_patch.call_args[0][0]
assert type(event).__name__ == "SeedExceedsLimitSamePath"
msg = event.message()
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")
with mock.patch("dbt.contracts.graph.nodes.warn_or_error") as warn_or_error_patch:
assert not search_manifest_using_method(manifest, method, "new")
warn_or_error_patch.assert_not_called()
Expand All @@ -793,7 +793,7 @@ def test_select_state_changed_seed_checksum_path_to_path(manifest, previous_stat
event = warn_or_error_patch.call_args[0][0]
assert type(event).__name__ == "SeedExceedsLimitSamePath"
msg = event.message()
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")


def test_select_state_changed_seed_checksum_sha_to_path(manifest, previous_state, seed):
Expand All @@ -807,7 +807,7 @@ def test_select_state_changed_seed_checksum_sha_to_path(manifest, previous_state
event = warn_or_error_patch.call_args[0][0]
assert type(event).__name__ == "SeedIncreased"
msg = event.message()
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")
with mock.patch("dbt.contracts.graph.nodes.warn_or_error") as warn_or_error_patch:
assert not search_manifest_using_method(manifest, method, "new")
warn_or_error_patch.assert_not_called()
Expand All @@ -820,7 +820,7 @@ def test_select_state_changed_seed_checksum_sha_to_path(manifest, previous_state
event = warn_or_error_patch.call_args[0][0]
assert type(event).__name__ == "SeedIncreased"
msg = event.message()
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")


def test_select_state_changed_seed_checksum_path_to_sha(manifest, previous_state, seed):
Expand Down
Loading