Skip to content

Commit

Permalink
plugin: keep jobs in PRIORITY after reprioritize
Browse files Browse the repository at this point in the history
Problem: the priority plugin will raise an exception on a job if it is
held in SCHED state while the plugin is reloaded (or Flux is restarted)
and jobs are reprioritized without first loading flux-accounting data to
this plugin. This behavior is not graceful and we should instead
continue to hold a job in PRIORITY while the plugin waits to receive
flux-accounting data.

Add a check of the plugin's internal map to see if we are still waiting
on flux-accounting data to be loaded in; if so, continue to hold the job
while we wait for data.

Add a sharness test that reproduces the issue raised in #406 and ensures
that jobs continue to be held after a reprioritization when no
flux-accounting data has been loaded into the priority plugin.
  • Loading branch information
cmoussa1 committed Jan 8, 2024
1 parent 1a84215 commit aa3d928
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 1 deletion.
4 changes: 3 additions & 1 deletion src/plugins/mf_priority.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -666,7 +666,9 @@ static int priority_cb (flux_plugin_t *p,
if (b->max_run_jobs == BANK_INFO_MISSING) {
// try to look up user again
it = users.find (userid);
if (it == users.end ()) {
if (it == users.end () || check_map_for_dne_only () == true) {
// the plugin could still be waiting on flux-accounting data
// to be loaded in; keep the job in PRIORITY state
return flux_jobtap_priority_unavail (p, args);
} else {
// make sure user belongs to bank they specified; if no bank was
Expand Down
1 change: 1 addition & 0 deletions t/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ TESTSCRIPTS = \
t1028-mf-priority-issue385.t \
t1029-mf-priority-default-bank.t \
t1030-mf-priority-update-queue.t \
t1031-mf-priority-issue406.t \
t5000-valgrind.t \
python/t1000-example.py

Expand Down
109 changes: 109 additions & 0 deletions t/t1031-mf-priority-issue406.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#!/bin/bash
#
# Sharness regression test for issue #406: after the mf_priority plugin is
# reloaded and jobs are reprioritized without sending it flux-accounting
# data again, the already-submitted jobs must not be failed by the plugin.

# Description string for this test script; assigned before sharness.sh is
# sourced below.
test_description='ensure jobs are still held in PRIORITY after reprioritization if plugin has no data'

# Source the sharness harness that lives in the same directory as this script.
. `dirname $0`/sharness.sh
# Paths used by the tests below: the priority plugin shared object in the
# build tree, the helper script used to submit a job under a given userid,
# and the flux-accounting database file in the current directory.
MULTI_FACTOR_PRIORITY=${FLUX_BUILD_DIR}/src/plugins/.libs/mf_priority.so
SUBMIT_AS=${SHARNESS_TEST_SRCDIR}/scripts/submit_as.py
DB_PATH=$(pwd)/FluxAccountingTest.db

# NOTE(review): these look like settings consumed by test_under_flux (no real
# job execution; simple scheduler limited to one job) -- confirm against the
# sharness helper that defines test_under_flux.
export TEST_UNDER_FLUX_NO_JOB_EXEC=y
export TEST_UNDER_FLUX_SCHED_SIMPLE_MODE="limited=1"
# presumably launches a size-1 Flux test instance using the "job"
# personality -- TODO confirm
test_under_flux 1 job

# Set the broker attribute log-stderr-level to 1.
flux setattr log-stderr-level 1

# Setup: create the accounting database, start the accounting service, load
# the priority plugin, populate banks/users, and push that data to the plugin.
# Every step addresses the database through ${DB_PATH} so that the file that
# is created, served, and exported is guaranteed to be the same one.
test_expect_success 'create flux-accounting DB' '
	flux account -p ${DB_PATH} create-db
'

test_expect_success 'start flux-accounting service' '
	flux account-service -p ${DB_PATH} -t
'

# NOTE(review): "-r .priority-default" presumably removes the built-in default
# priority plugin before loading mf_priority -- confirm with flux-jobtap(1).
test_expect_success 'load multi-factor priority plugin' '
	flux jobtap load -r .priority-default ${MULTI_FACTOR_PRIORITY} &&
	flux jobtap list | grep mf_priority
'

# Bank hierarchy: root -> A.
test_expect_success 'add some banks to the DB' '
	flux account add-bank root 1 &&
	flux account add-bank --parent-bank=root A 1
'

# Three users (userids 5001-5003), all belonging to bank A.
test_expect_success 'add some users to the DB' '
	flux account add-user --username=user1 --userid=5001 --bank=A &&
	flux account add-user --username=user2 --userid=5002 --bank=A &&
	flux account add-user --username=user3 --userid=5003 --bank=A
'

test_expect_success 'send flux-accounting DB information to the plugin' '
	flux account-priority-update -p ${DB_PATH}
'

# The queue is stopped first, then one job is submitted per user.
test_expect_success 'stop the queue' '
	flux queue stop
'

# The job IDs are kept in job1..job3 for use by the tests that follow.
test_expect_success 'submit jobs as three different users' '
	job1=$(flux python ${SUBMIT_AS} 5001 hostname) &&
	job2=$(flux python ${SUBMIT_AS} 5002 hostname) &&
	job3=$(flux python ${SUBMIT_AS} 5003 hostname)
'

# Each job must post a "priority" event within 5 seconds; the first job that
# does not fails the whole test.
test_expect_success 'check that the jobs successfully received their priority' '
	for jobid in "$job1" "$job2" "$job3"; do
		flux job wait-event -vt 5 "$jobid" priority || return 1
	done
'

# Remove the priority plugin while the three jobs are still pending.
test_expect_success 'unload plugin' '
flux jobtap remove mf_priority.so
'

# Load the plugin again. From this point on the script never sends
# flux-accounting data to it, so the reloaded plugin has no user/bank data.
test_expect_success 'reload multi-factor priority plugin' '
flux jobtap load ${MULTI_FACTOR_PRIORITY} &&
flux jobtap list | grep mf_priority
'

# Ask the plugin to reprioritize jobs via its "reprioritize" RPC, issued from
# a small generated Python script.
# NOTE(review): the RPC is sent without waiting for a response (no .get()
# call) -- confirm the request is guaranteed to be handled before the checks
# that follow run.
# The heredoc uses <<-EOF, which strips leading tabs only; keep the body and
# terminator free of leading spaces.
test_expect_success 'reprioritize jobs' '
cat <<-EOF >reprioritize.py
import flux
flux.Flux().rpc("job-manager.mf_priority.reprioritize")
EOF
flux python reprioritize.py
'

# depend_event_logged JOBID
#
# Wait up to 10 seconds for JOBID to post a "depend" event, dump the job's
# eventlog to eventlog.out (and to the test output), and succeed only if the
# eventlog contains "depend".
# NOTE(review): this confirms the event is present in the eventlog; it does
# not query the job's current state -- confirm this is a sufficient check
# for "still in PRIORITY".
depend_event_logged() {
	flux job wait-event -vt 10 "$1" depend &&
	flux job info "$1" eventlog > eventlog.out &&
	cat eventlog.out &&
	grep "depend" eventlog.out
}

test_expect_success 'make sure job 1 is still in PRIORITY state' '
	depend_event_logged $job1
'

test_expect_success 'make sure job 2 is still in PRIORITY state' '
	depend_event_logged $job2
'

test_expect_success 'make sure job 3 is still in PRIORITY state' '
	depend_event_logged $job3
'

# Cleanup: cancel the three pending jobs one at a time, stopping at the first
# cancellation that fails.
test_expect_success 'cancel jobs' '
	for jobid in "$job1" "$job2" "$job3"; do
		flux job cancel "$jobid" || return 1
	done
'

# Send the accounting service its shutdown RPC and wait for the response.
test_expect_success 'shut down flux-accounting service' '
	flux python -c "import flux; flux.Flux().rpc(\"accounting.shutdown_service\").get()"
'

test_done

0 comments on commit aa3d928

Please sign in to comment.