Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Build race condition #727

Merged
merged 39 commits into from
Feb 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
ffe4f00
add notes
Jan 10, 2024
1277263
start towards prototype solution
Jan 12, 2024
5f35168
track build hashes on MultiBuildTracker
Jan 17, 2024
353b2cc
restore lost changes
Jan 17, 2024
d9c8571
fix mistake
Jan 17, 2024
f44a8dd
implement hash->lock map
Jan 19, 2024
a4e3fa7
fix lingering misnamed function call
Jan 19, 2024
992d526
remove stray code
hwikle-lanl Jan 26, 2024
113b552
cache build hash on creation
hwikle-lanl Jan 26, 2024
f72d24b
attempt to resolve style test failure
hwikle-lanl Jan 26, 2024
e3e036c
second attempt at style test
hwikle-lanl Jan 26, 2024
ba686f2
add "fuzzy" lockfile implementation
hwikle-lanl Jan 26, 2024
f6d69f6
fix minor style issue
hwikle-lanl Jan 26, 2024
0af8ad1
reimplement timeout on build locks
hwikle-lanl Jan 26, 2024
7d96487
update build locking test
hwikle-lanl Jan 26, 2024
6acbf4e
ignore vscode workspace
hwikle-lanl Jan 29, 2024
cb33723
attempt to fix timeout
hwikle-lanl Jan 29, 2024
99d60c2
fix timed lock context management
hwikle-lanl Jan 29, 2024
fad32cf
fix style issues; tweak lock context
hwikle-lanl Jan 29, 2024
51d3003
fix mistake; more style fixes
hwikle-lanl Jan 29, 2024
3b8c3e4
fix changed method name
hwikle-lanl Jan 29, 2024
1487d4b
another style fix
hwikle-lanl Jan 29, 2024
8641ae8
even more style fixes
hwikle-lanl Jan 29, 2024
6a4716b
(attempt to) fix broken unit test
hwikle-lanl Jan 31, 2024
04abb19
fix mistake with protected class
hwikle-lanl Jan 31, 2024
f14423f
fix accidentally deleted import
hwikle-lanl Jan 31, 2024
2550e1b
restore removed import
hwikle-lanl Jan 31, 2024
d29b0d1
attempt to fix broken docstring
hwikle-lanl Jan 31, 2024
0623307
restore (janky) docstring to pass unit test
hwikle-lanl Jan 31, 2024
e9d0d42
fix unit test
hwikle-lanl Jan 31, 2024
9e85e90
fix unit test
hwikle-lanl Jan 31, 2024
d7f824f
fix unit test
hwikle-lanl Jan 31, 2024
709541f
fix unit test
hwikle-lanl Jan 31, 2024
6411448
Merge branch 'fuzzy-lock' into build-race-condition
hwikle-lanl Jan 31, 2024
a987d76
tweak status file check
hwikle-lanl Jan 31, 2024
afc4c18
style fixes
hwikle-lanl Jan 31, 2024
f2a9b9d
fix build lock unit test
hwikle-lanl Feb 2, 2024
9524805
remove fuzzy lock -- for inclusion later
hwikle-lanl Feb 5, 2024
08a0afa
Merge remote-tracking branch 'origin/build-race-condition' into build…
hwikle-lanl Feb 5, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
.deps_installed
.env
*.pyc
PAV/scripts/site
.idea
*.swp
*.swo

# Python Artifacts
__pycache__/
*.pyc

# IntelliJ Artifacts (Why?)
.idea

# VSCode Artifacts
.vscode
*.code-workspace

pav.pstats
.local/
__pycache__/
.vscode
examples/working_dir
36 changes: 31 additions & 5 deletions lib/pavilion/build_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
import datetime
import threading
from collections import defaultdict
from typing import List
from typing import List, ContextManager
from contextlib import contextmanager

from pavilion.status_file import STATES

Expand All @@ -19,25 +20,50 @@ def __init__(self):
# A map of build tokens to build names
self.messages = {}
self.status = {}
self.status_files = {}
self.status_files = {} # type: Dict[TestBuilder, TestStatusFile]
self.trackers = {}
self.lock = threading.Lock()
self._build_locks = {} # type: Dict[str, threading.Lock]

def register(self, test) -> "BuildTracker":
"""Register a builder, and get your own build tracker.

:param test: The builder object to track.
:return: A build tracker instance that can be used by builds directly."""
:param test: The TestRun object to track.
:return: A build tracker instance that can be used by builds directly."""

tracker = BuildTracker(test, self)
hash = test.builder.build_hash

with self.lock:
# Test may actually be a TestRun object rather than a TestBuilder object,
# which has no builder attribute
self.status_files[test.builder] = test.status
self.status[test.builder] = None
self.messages[test.builder] = []
self.trackers[test.builder] = tracker

if hash not in self._build_locks:
self._build_locks[hash] = threading.Lock()

return tracker

@contextmanager
def make_lock_context(self, hash: str, timeout: float = -1) -> ContextManager[bool]:
"""Return a context manager to manage the build-specific lock.

:param str hash: The hash identifying the specific build.
:return: A context manager to manage the (optionally) timed lock
associated with the build."""

lock = self._build_locks[hash]

try:
result = lock.acquire(timeout=timeout)
yield result
finally:
if result:
lock.release()

def update(self, builder, note, state=None):
"""Add a message for the given builder without changes the status.

Expand Down Expand Up @@ -85,7 +111,7 @@ def failures(self) -> List['BuildTracker']:
class BuildTracker:
"""Tracks the status updates for a single build."""

def __init__(self, test, tracker):
def __init__(self, test: 'TestRun', tracker: MultiBuildTracker):
self.test = test
if test is None:
self.builder = None
Expand Down
97 changes: 55 additions & 42 deletions lib/pavilion/builder.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""Contains the object for tracking multi-threaded builds, along with
the TestBuilder class itself."""

# pylint: disable=too-many-lines

import glob
import hashlib
import io
Expand All @@ -24,7 +26,6 @@
from pavilion.test_config import parse_timeout
from pavilion.test_config.spack import SpackEnvConfig


class TestBuilder:
"""Manages a test build and their organization.

Expand Down Expand Up @@ -62,8 +63,7 @@ def __init__(self, pav_cfg: pavilion.config.PavConfig, working_dir: Path, config
:param script: Path to the build script
:param templates: Paths to template files and their destinations.
:param spack_config: Give a spack config to enable spack builds.
:param build_name: The build name, if this is a build that
already exists.
:param build_name: The build name, if this is a build that already exists.
:raises TestBuilderError: When the builder can't be initialized.
"""

Expand All @@ -73,6 +73,7 @@ def __init__(self, pav_cfg: pavilion.config.PavConfig, working_dir: Path, config
self._script_path = script
self._download_dest = download_dest
self._templates: Dict[Path, Path] = templates or {}
self._build_hash = None

try:
self._timeout = parse_timeout(config.get('timeout'))
Expand Down Expand Up @@ -166,7 +167,16 @@ def log_updated(self) -> Union[float, None]:
except OSError:
return None

def create_build_hash(self) -> str:
@property
def build_hash(self) -> str:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cache this to a self._build_hash.

"""Get the cached build hash, if it exists. Otherwise,
create it and cache it."""
if self._build_hash is None:
self._build_hash = self._create_build_hash()

return self._build_hash

def _create_build_hash(self) -> str:
"""Turn the build config, and everything the build needs, into a hash.
This includes the build config itself, the source tarball, and all
extra files."""
Expand Down Expand Up @@ -237,13 +247,13 @@ def create_build_hash(self) -> str:

hash_obj.update(self._config.get('specificity', '').encode())

return hash_obj.hexdigest()
return hash_obj.hexdigest()[:self.BUILD_HASH_BYTES*2]

def name_build(self) -> str:
"""Search for the first non-deprecated version of this build (whether
or not it exists) and name the build for it."""

base_hash = self.create_build_hash()[:self.BUILD_HASH_BYTES*2]
base_hash = self.build_hash

builds_dir = self._pav_cfg.working_dir/'builds'
name = base_hash
Expand Down Expand Up @@ -385,45 +395,45 @@ def build(self, test_id: str, tracker: BuildTracker,
:return: True if these steps completed successfully.
"""

# Only try to do the build if it doesn't already exist and is finished.
mb_tracker = tracker.tracker

if not self.finished_path.exists():
# Make sure another test doesn't try to do the build at
# the same time.
# Note cleanup of failed builds HAS to occur under this lock to
# avoid a race condition, even though it would be way simpler to
# do it in .build()

tracker.update(
state=STATES.BUILD_WAIT,
note="Waiting on lock for build {}.".format(self.name))
lock_path = self.path.with_suffix('.lock')
with lockfile.LockFile(lock_path, group=self._pav_cfg.shared_group) as lock:

timed_lock = mb_tracker.make_lock_context(self.build_hash)
with timed_lock as acquired:
# Make sure the build wasn't created while we waited for
# the lock.
if not self.finished_path.exists():
tracker.update(
state=STATES.BUILDING,
note="Starting build {}.".format(self.name))

# If the build directory exists, we're assuming there was
# an incomplete build at this point.
if self.path.exists():
tracker.warn(
"Build lock acquired, but build exists that was "
"not marked as finished. Deleting...")
try:
shutil.rmtree(self.path)
except OSError as err:
tracker.error(
"Could not remove unfinished build.\n{}"
.format(err))
return False

with lockfile.LockFilePoker(lock):
# Attempt to perform the actual build, this shouldn't
# raise an exception unless something goes terribly
# wrong.
# This will also set the test status for
# non-catastrophic cases.
if acquired:
if not self.finished_path.exists():
tracker.update(
state=STATES.BUILDING,
note="Starting build {}.".format(self.name))

# If the build directory exists, we're assuming there was
# an incomplete build at this point.

if self.path.exists():
tracker.warn(
"Build lock acquired, but build exists that was "
"not marked as finished. Deleting...")
try:
shutil.rmtree(self.path)
except OSError as err:
tracker.error(
"Could not remove unfinished build.\n{}"
.format(err))
return False

if not self._build(self.path, cancel_event, test_id, tracker):

try:
Expand All @@ -446,16 +456,19 @@ def build(self, test_id: str, tracker: BuildTracker,

return False

try:
self.finished_path.touch()
except OSError:
tracker.warn("Could not touch '<build>.finished' file.")
try:
self.finished_path.touch()
except OSError:
tracker.warn("Could not touch '<build>.finished' file.")

else:
tracker.update(
state=STATES.BUILD_REUSED,
note="Build {s.name} created while waiting for build "
"lock.".format(s=self))
else:
tracker.update(
state=STATES.BUILD_REUSED,
note="Build {s.name} created while waiting for build "
"lock.".format(s=self))
tracker.warn("Timed out when attempting to acquire lock")
return False
else:
tracker.update(
note=("Build {s.name} is being reused.".format(s=self)),
Expand Down Expand Up @@ -502,7 +515,7 @@ def create_spack_env(self, build_dir):
with open(spack_env_config.as_posix(), "w+") as spack_env_file:
SpackEnvConfig().dump(spack_env_file, values=config,)

def _build(self, build_dir, cancel_event, test_id, tracker: BuildTracker):
def _build(self, build_dir, cancel_event, test_id, tracker: BuildTracker) -> bool:
"""Perform the build. This assumes there actually is a build to perform.
:param Path build_dir: The directory in which to perform the build.
:param threading.Event cancel_event: Event to signal that the build
Expand Down
3 changes: 3 additions & 0 deletions lib/pavilion/lockfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,9 @@ def _create_lockfile(self):
file_note = ",".join([os.uname()[1], utils.get_login(), str(self.expire_period),
self._id])
file_note = file_note.encode('utf8')

# This will not work on certain version combinations of Linux/NFS
# Proposal: replace with link/fstat pattern
file_num = os.open(path, os.O_EXCL | os.O_CREAT | os.O_RDWR)
os.write(file_num, file_note)
os.close(file_num)
Expand Down
3 changes: 1 addition & 2 deletions lib/pavilion/test_run/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,8 +585,7 @@ def build(self, cancel_event=None, tracker: BuildTracker = None):
"""

if tracker is None:
mb_tracker = MultiBuildTracker()
tracker = mb_tracker.register(self)
tracker = MultiBuildTracker().register(self)

if not self.saved:
raise RuntimeError("The .save() method must be called before you "
Expand Down
4 changes: 2 additions & 2 deletions lib/pavilion/unittest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import time
from hashlib import sha1
from pathlib import Path
from typing import List
from typing import List, Dict, Any

import pavilion.schedulers
from pavilion import arguments
Expand Down Expand Up @@ -256,7 +256,7 @@ def get_hash(filename):
'schedule': {},
}

def _quick_test_cfg(self):
def _quick_test_cfg(self) -> Dict[str, Any]:
"""Return a pre-populated test config to use with
``self._quick_test``. This can be used as is, or modified for
desired effect.
Expand Down
60 changes: 24 additions & 36 deletions test/tests/builder_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@
from pathlib import Path

from pavilion import builder
from pavilion import lockfile
from pavilion import wget
from pavilion.build_tracker import DummyTracker
from pavilion.build_tracker import MultiBuildTracker, DummyTracker
from pavilion.errors import TestRunError
from pavilion.status_file import STATES
from pavilion.unittest import PavTestCase
Expand All @@ -25,40 +24,29 @@ def test_build_locking(self):

# Make a unique build.
test_cfg = self._quick_test_cfg()
test_cfg['build']['cmds'] = ["echo {}".format(uuid.uuid4().hex)]
test = self._quick_test(test_cfg, build=False, finalize=False)

bldr = builder.TestBuilder(self.pav_cfg, self.pav_cfg.working_dir, test.config['build'],
test.build_script_path, test.status, Path('/tmp'))

# Prelock the build.
lock = lockfile.LockFile(bldr.path.with_suffix('.lock'), expires_after=2)
lock._create_lockfile()

start = time.time()
bldr.build(test.full_id, DummyTracker())
# Make sure the builder actually waited for the lock to expire.
# Note: This is inelegant, but I can't really think of a better (single-threaded) way.
self.assertGreater(time.time() - start, 2.0)

# Now start a build that takes some time and has to update its lockfile.
test_cfg = self._quick_test_cfg()
test_cfg['build']['cmds'] = ["echo {}".format(uuid.uuid4().hex), 'sleep 5']
test = self._quick_test(test_cfg, build=False, finalize=False)
bldr = builder.TestBuilder(self.pav_cfg, self.pav_cfg.working_dir, test.config['build'],
test.build_script_path, test.status, Path('/tmp'))
bldr_lock_path = bldr.path.with_suffix('.lock')

# And now I'm using threading anyway...
thread = threading.Thread(target=bldr.build, args=(test.full_id, DummyTracker()))
thread.start()
# We must wait a moment for the builder to actually create the lockfile.
while not bldr_lock_path.exists():
time.sleep(0.1)

start = time.time()
with lockfile.LockFile(bldr_lock_path):
self.assertGreater(time.time() - start, 5)
test_cfg['build']['cmds'] = ['sleep 0.2']

num_tests = 3

mb_tracker = MultiBuildTracker()
tests = []
threads = []

for _ in range(num_tests):
tests.append(self._quick_test(test_cfg, build=False, finalize=False))

for test in tests:
tracker = mb_tracker.register(test)
thread = threading.Thread(target=test.builder.build, args=(test.full_id, tracker))
threads.append(thread)
thread.run()

map(lambda x: x.join(), threads)

states = [test.status.current().state for test in tests]
reused = list(filter(lambda x: x == STATES.BUILD_REUSED, states))
self.assertEqual(len(reused), num_tests - 1)
self.assertIn(STATES.BUILD_SUCCESS, states)

def test_setup_build_dir(self):
"""Make sure we can correctly handle all of the various archive
Expand Down
Loading