Merge pull request #48 from roehrich-hpe/release-v0.0.2
Release v0.0.2
roehrich-hpe authored Sep 28, 2023
2 parents bb72f13 + f2dece2 commit 2284bb5
Showing 15 changed files with 187 additions and 190 deletions.
8 changes: 3 additions & 5 deletions .gitmodules
@@ -1,11 +1,9 @@
[submodule "testsuite/submodules/dws"]
path = testsuite/submodules/dws
url = https://github.com/HewlettPackard/dws.git
branch = releases/v0
[submodule "testsuite/submodules/slurm-docker-cluster"]
path = testsuite/submodules/slurm-docker-cluster
url = [email protected]:DataWorkflowServices/slurm-docker-cluster.git
[submodule "testsuite/submodules/dws-test-driver"]
path = testsuite/submodules/dws-test-driver
url = [email protected]:DataWorkflowServices/dws-test-driver.git
branch = releases/v0
[submodule "testsuite/submodules/dws"]
path = testsuite/submodules/dws
url = [email protected]:DataWorkflowServices/dws.git
6 changes: 6 additions & 0 deletions src/burst_buffer/burst_buffer.conf
@@ -3,3 +3,9 @@
# See https://slurm.schedmd.com/burst_buffer.conf.html
Directive=DW

# If set, then teardown a burst buffer after file staging error. Otherwise
# preserve the burst buffer for analysis and manual teardown.
# See https://slurm.schedmd.com/burst_buffer.conf.html
# and https://slurm.schedmd.com/burst_buffer.html#states
Flags=TeardownFailure

78 changes: 67 additions & 11 deletions src/burst_buffer/burst_buffer.lua
Expand Up @@ -35,7 +35,7 @@ DEFAULT_LABEL_KEY = "origin"
DEFAULT_LABEL_VAL = lua_script_name

-- The fully-qualified name of the DWS Workflow CRD.
local WORKFLOW_CRD = "workflows.dws.cray.hpe.com"
local WORKFLOW_CRD = "workflows.dataworkflowservices.github.io"

KUBECTL_CACHE_DIR = "/tmp/burst_buffer_kubectl_cache"

@@ -118,7 +118,7 @@ end
-- resource with keywords that must be replaced by the caller.
function DWS:template()
return [[
apiVersion: dws.cray.hpe.com/v1alpha2
apiVersion: dataworkflowservices.github.io/v1alpha2
kind: Workflow
metadata:
name: WF_NAME
@@ -280,9 +280,16 @@ end

-- DWS:get_driver_errors will collect driver errors from the Workflow resource
-- with respect to the given state.
function DWS:get_driver_errors(state)
local error_list = {}
local jsonpath = [[{range .status.drivers[?(@.watchState=="]].. state ..[[")]}==={"\n"}{@.status}{"\n"}{@.driverID}{"\n"}{@.error}{"\n"}{end}]]
-- If all_errors=true then collect all errors from all states in all drivers.
-- On success this returns true and a string with all of the errors.
-- On failure this returns false, an empty string for the errors, and a string
-- explaining why it couldn't collect the errors.
function DWS:get_driver_errors(state, all_errors)
local driver_index = [[?(@.watchState=="]].. state ..[[")]]
if all_errors == true then
driver_index = "*"
end
local jsonpath = [[{range .status.drivers[]] .. driver_index .. [[]}==={"\n"}{@.status}{"\n"}{@.driverID}{"\n"}{@.error}{"\n"}{end}]]
local ret, output = self:get_jsonpath(jsonpath)
if ret == false then
return ret, "", "could not get driver errors: " .. output
@@ -442,6 +449,18 @@ function DWS:kubectl(cmd)
return self:io_popen(kcmd)
end

-- DWS:scancel will run the Slurm scancel command and collect its output.
-- On success this returns true and the output of the command.
-- On failure this returns false and the output of the command.
function DWS:scancel(jobId, hurry)
local hurry_opt = ""
if hurry == true then
hurry_opt = "--hurry "
end
local scmd = "scancel " .. hurry_opt .. jobId
return self:io_popen(scmd)
end

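As a usage sketch (hypothetical job id), the wrapper simply shells out to Slurm's scancel command, where --hurry skips burst-buffer stage-out:

-- A sketch, not part of the diff: both forms of DWS:scancel().
local ok, output = workflow:scancel("1234", true)   -- runs: scancel --hurry 1234
local ok, output = workflow:scancel("1234", false)  -- runs: scancel 1234
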
-- DWS:io_popen will run the given command and collect its output.
-- On success this returns true and the output of the command.
-- On failure this returns false and the output of the command.
@@ -627,24 +646,51 @@ function slurm_bb_job_teardown(job_id, job_script, hurry)
hurry_flag = true
end
local workflow = DWS(make_workflow_name(job_id))
local done, err = workflow:set_workflow_state_and_wait("Teardown", hurry_flag)

local ret = slurm.SUCCESS
-- Does the workflow have a fatal error in it?
-- If so, we'll call scancel as well.
local done, state_errors, err = workflow:get_driver_errors("", true)
if done == false then
if string.find(err, [["]] .. workflow.name .. [[" not found]]) then
-- It's already gone, and that's what we wanted anyway.
return slurm.SUCCESS
else
slurm.log_error("%s: slurm_bb_job_teardown(), workflow=%s: %s", lua_script_name, workflow.name, err)
return slurm.ERROR, err
slurm.log_error("%s: slurm_bb_job_teardown(), workflow=%s: unable to check driver errors: %s", lua_script_name, workflow.name, err)
ret = slurm.ERROR
-- fall-through, let the Workflow delete happen.
end
end

done, err = workflow:set_workflow_state_and_wait("Teardown", hurry_flag)
if done == false then
slurm.log_error("%s: slurm_bb_job_teardown(), workflow=%s: %s", lua_script_name, workflow.name, err)
ret = slurm.ERROR
-- fall-through, let the Workflow delete happen.
end

done, err = workflow:delete()
if done == false then
slurm.log_error("%s: slurm_bb_job_teardown(), workflow=%s, delete: %s", lua_script_name, workflow.name, err)
return slurm.ERROR, err
ret = slurm.ERROR
-- fall-through, let any necessary scancel happen.
end

if state_errors ~= "" then
-- Now do the scancel. This will terminate this Lua script and will
-- trigger slurm to call our teardown again, but that'll be a no-op
-- when it comes back here.
slurm.log_info("%s: slurm_bb_job_teardown(), workflow=%s: executing scancel --hurry %s, found driver errors: %s", lua_script_name, workflow.name, job_id, state_errors)
_, err = workflow:scancel(job_id, true)
if err == "" then
err = "(no output)"
end
end

return slurm.SUCCESS
if ret == slurm.SUCCESS then
err = ""
end
return ret, err
end

--[[
@@ -844,10 +890,20 @@ function slurm_bb_get_status(...)
local args = {...}
args.n = select("#", ...)

local found_jid = false
local jid = 0
if args.n == 2 and args[1] == "workflow" then
-- Slurm 22.05
jid = args[2]
found_jid = true
elseif args.n == 4 and args[3] == "workflow" then
-- Slurm 23.02
jid = args[4]
found_jid = true
end
if found_jid == true then
local done = false
local status = {}
local jid = args[2]
if string.find(jid, "^%d+$") == nil then
msg = "A job ID must contain only digits."
else
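
For context, a minimal runnable sketch of the version dispatch in the hunk above (the function name is an assumption; per the comments in the diff, Slurm 23.02 prepends two arguments to the 22.05 call shape):

-- A sketch, not part of the diff: how the vararg handling selects the
-- job ID for both supported Slurm versions.
local function find_workflow_jid(...)
    local args = {...}
    args.n = select("#", ...)
    if args.n == 2 and args[1] == "workflow" then
        return args[2]                -- Slurm 22.05: ("workflow", jid)
    elseif args.n == 4 and args[3] == "workflow" then
        return args[4]                -- Slurm 23.02: (uid, gid, "workflow", jid)
    end
    return nil                        -- unrecognized argument shape
end

print(find_workflow_jid("workflow", "42"))             --> 42
print(find_workflow_jid(1000, 1000, "workflow", "42")) --> 42
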
18 changes: 9 additions & 9 deletions testsuite/integration/Makefile
@@ -39,20 +39,20 @@ setup-dws:
@{\
set -e ; \
cd ../submodules/dws ; \
docker buildx build -t local/dws-operator:test --load . ; \
IMAGE_TAG_BASE=local/dws-operator VERSION=test KIND_CLUSTER=dws make kind-push deploy ; \
kubectl wait deployment --timeout=120s -n dws-operator-system dws-operator-controller-manager --for condition=Available=True ; \
kubectl wait deployment --timeout=120s -n dws-operator-system dws-operator-webhook --for condition=Available=True ; \
docker buildx build -t local/dws:test --load . ; \
IMAGE_TAG_BASE=local/dws VERSION=test KIND_CLUSTER=dws make kind-push deploy ; \
kubectl wait deployment --timeout=120s -n dws-system dws-controller-manager --for condition=Available=True ; \
kubectl wait deployment --timeout=120s -n dws-system dws-webhook --for condition=Available=True ; \
}

.PHONY: setup-dws-test-driver
setup-dws-test-driver:
@{\
set -e ; \
cd ../submodules/dws-test-driver ; \
docker buildx build -t local/dws-test-driver-operator:test --load . ; \
IMAGE_TAG_BASE=local/dws-test-driver-operator VERSION=test KIND_CLUSTER=dws make kind-push deploy ; \
kubectl wait deployment --timeout=60s -n dws-test-operator-system dws-test-driver-controller-manager --for condition=Available=True ; \
docker buildx build -t local/dws-test-driver:test --load . ; \
IMAGE_TAG_BASE=local/dws-test-driver VERSION=test KIND_CLUSTER=dws make kind-push deploy ; \
kubectl wait deployment --timeout=60s -n dws-test-system dws-test-driver-controller-manager --for condition=Available=True ; \
}

.PHONY: setup
@@ -75,10 +75,10 @@ debug:
kubectl describe node dws-control-plane dws-worker
echo
echo "***** DWS DEPLOYMENT *****"
kubectl describe deployment -n dws-operator-system dws-operator-controller-manager
kubectl describe deployment -n dws-system dws-controller-manager
echo
echo "***** DWS LOGS *****"
kubectl logs -n dws-operator-system deployment/dws-operator-controller-manager
kubectl logs -n dws-system deployment/dws-controller-manager

.PHONY: reports
reports:
67 changes: 27 additions & 40 deletions testsuite/integration/src/features/test_dws_states.feature
@@ -22,6 +22,7 @@ Feature: Data Workflow Services State Progression
Verify that the DWS-Slurm Burst Buffer Plugin progresses through Data
Workflow Services states

@happy_one
Scenario: The DWS-BB Plugin progresses through DWS states
Given a job script:
#!/bin/bash
@@ -44,13 +45,15 @@ And the Workflow and job progress to the PostRun state
And the Workflow and job progress to the PostRun state
And the Workflow and job progress to the DataOut state
And the Workflow and job progress to the Teardown state
And the job state is COMPLETED
And the job has eventually been COMPLETED

# DWS does not allow spaces in key/value pairs in directives. To skirt around this
# constraint, the dws-test-driver replaces underscores ("_") in the message value with
# spaces. This ensures that the dws-slurm-plugin can handle whitespace in error messages
# It also makes it easier to check that the error is included in scontrol output.
Scenario Outline: The DWS-BB Plugin can handle fatal driver errors before being canceled
# This scenario assumes that "Flags=TeardownFailure" is set in burst_buffer.conf.
@fatal_one
Scenario Outline: Report fatal errors from Proposal, Setup, DataIn, PreRun
Given a job script:
#!/bin/bash

@@ -59,12 +62,13 @@
/bin/hostname

When the job is run
Then a Workflow has been created for the job
And the Workflow and job report fatal errors at the <workflowState> state
And the job is canceled
And the Workflow and job progress to the Teardown state
And the job's final system comment contains the following:
And some Workflow has been created for the job
And the Workflow reports fatal errors at the <workflowState> state
Then the job's system comment eventually contains the following:
TEST FATAL ERROR
And the Workflow and job progress to the Teardown state
And the Workflow has eventually been deleted
And the job has eventually been CANCELLED

Examples:
# *** HEADER ***
Expand All @@ -73,14 +77,15 @@ Feature: Data Workflow Services State Progression
| Proposal |
| Setup |
| DataIn |
| PostRun |
| DataOut |
| PreRun |

# With the exception of PreRun, states will need to be canceled with the
# "--hurry" flag to transition to the Teardown state. If
# "Flags=TeardownFailure" is set in burst_buffer.conf, then all states will
# transition to Teardown without needing to be canceled
Scenario Outline: The DWS-BB Plugin can handle fatal driver errors for PreRun
# DWS does not allow spaces in key/value pairs in directives. To skirt around this
# constraint, the dws-test-driver replaces underscores ("_") in the message value with
# spaces. This ensures that the dws-slurm-plugin can handle whitespace in error messages
# It also makes it easier to check that the error is included in scontrol output.
# This scenario assumes that "Flags=TeardownFailure" is set in burst_buffer.conf.
@fatal_two
Scenario Outline: Report fatal errors from PostRun and DataOut
Given a job script:
#!/bin/bash

@@ -89,35 +94,17 @@
/bin/hostname

When the job is run
Then a Workflow has been created for the job
And the Workflow reports a fatal error in the <workflowState> state
And the Workflow and job progress to the Teardown state
# Slurm moved it from PreRun/Error to Teardown without canceling
# the job. So the driver (this test) must cancel it.
And the job is canceled
And the job's final system comment contains the following:
And some Workflow has been created for the job
And the Workflow reports fatal errors at the <workflowState> state
Then the job's system comment eventually contains the following:
TEST FATAL ERROR
And the Workflow and job progress to the Teardown state
And the Workflow has eventually been deleted
And the job has eventually been COMPLETED

Examples:
# *** HEADER ***
| workflowState |
# *** VALUES ***
| PreRun |

Scenario: The DWS-BB Plugin can handle fatal driver errors during Teardown
Given a job script:
#!/bin/bash

#DW Teardown action=error message=TEST_FATAL_ERROR severity=Fatal
/bin/hostname

When the job is run
Then a Workflow has been created for the job
And the Workflow reports a fatal error in the Teardown state
And the job's intermediate system comment contains the following:
TEST FATAL ERROR
# Eventually the driver (this test) must work through the Teardown
# issues and complete that step. Slurm has already marked the job
# as completed and is now looping over slurm_bb_job_teardown() in
# burst_buffer.lua.
And the Workflow error is cleared from the Teardown state
| PostRun |
| DataOut |
@@ -36,7 +36,7 @@ Feature: Integration test environment
srun -l /bin/hostname
srun -l /bin/pwd
When the job is run
Then the job state is COMPLETED
Then the job has eventually been COMPLETED

Scenario: Kubernetes and slurm are connected
Given the kubernetes cluster kube-system UID
4 changes: 3 additions & 1 deletion testsuite/integration/src/pytest.ini
@@ -22,4 +22,6 @@ bdd_features_base_dir = features
markers =
environment
dws_states

happy_one
fatal_one
fatal_two
18 changes: 4 additions & 14 deletions testsuite/integration/src/tests/conftest.py
@@ -72,17 +72,7 @@ def _(slurmctld, script_path):
# remove the slurm output from the jobs folder
slurmctld.remove_job_output(jobId, outputFilePath, errorFilePath)

@then(parsers.parse('the job state is {expectedJobState}'))
def _(slurmctld, jobId, expectedJobState):
"""the job state is <expectedJobState>"""
jobState, out = slurmctld.get_final_job_state(jobId)

if expectedJobState == "COMPLETED" and jobState == "FAILED":
warnings.warn(ResourceWarning((f"Job {jobId} failed unexpectedly.\n") + \
"This may happen if Slurm doesn't have enough resources to schedule the job.\n" + \
"This is not considered a test failure, in this context, since DWS isn't\n" + \
"dependent on the job's failure or success."
))
return

assert jobState == expectedJobState, "Unexpected Job State: " + jobState + "\n" + out
@then(parsers.parse('the job has eventually been {job_state:l}'))
def _(slurmctld, jobId, job_state):
"""the job has eventually been <job_state>"""
slurmctld.wait_until_job_has_been_x(jobId, job_state)
(Remaining changed files not shown.)
