Skip to content

Commit

Permalink
[criu checkpoint] added a timeout to exit when the resume fails
Browse files Browse the repository at this point in the history
  • Loading branch information
bzizou authored and npf committed Dec 8, 2021
1 parent 662a6b0 commit 86ac8f3
Showing 1 changed file with 14 additions and 2 deletions.
16 changes: 14 additions & 2 deletions misc/checkpointing/criu/test.script.checkpoint.oar
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
#OAR --checkpoint 240
#OAR --notify mail:[email protected]

# Resume timeout, in seconds (approximate); adjust to your job: 600 is a good value for bigger jobs
RESUME_TIMEOUT=90

# Handler for checkpointing signal sent by OAR
handler() { echo "Caught checkpoint signal at: `date`"
echo "Checkpointing..."
Expand All @@ -22,12 +25,21 @@ source /applis/site/nix.sh
if [ -e checkpoint_ok ]
then
rm -f checkpoint/pidfile
sleep 30
echo -e "$(pwd)" > /var/lib/checkpoints/$OAR_JOB_ID.resume
# Wait for the restore (for pidfile to be created)
while [ \! -e checkpoint/pidfile ]
declare -i c=1
while [ \! -e checkpoint/pidfile -a $c -le $RESUME_TIMEOUT ]
do
sleep 5
sleep 1
let c++
done
# Abort if the restore never produced the pidfile.
# On timeout the wait loop above exits with c equal to RESUME_TIMEOUT+1, so
# comparing c for equality with RESUME_TIMEOUT would miss the timeout (and
# could fire falsely when the pidfile shows up on the last iteration).
# Testing for the pidfile itself is exact in both cases.
if [ ! -e checkpoint/pidfile ]
then
echo "ERROR: Timeout on resume!" >&2
exit 3
fi
sleep 5
PROG_PID=$(cat checkpoint/pidfile)

# No checkpoint, starting the program
Expand Down

0 comments on commit 86ac8f3

Please sign in to comment.