Skip to content

Commit

Permalink
lsf criu scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
xunpan committed Nov 3, 2021
1 parent 9fe8a4d commit 082d1e6
Show file tree
Hide file tree
Showing 4 changed files with 209 additions and 0 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,6 @@ A setuid tool to submit LSF jobs on behalf of other users.

- [LSF chatops](https://github.com/IBMSpectrumComputing/lsf-utils/tree/master/chatops/errbot)
An `Errbot` plugin that lets you talk to your `LSF` cluster through `Slack` from anywhere.

- [LSF checkpoint with CRIU](https://github.com/IBMSpectrumComputing/lsf-utils/tree/master/criu)
Scripts that use `CRIU` to checkpoint and restart `LSF` jobs.
41 changes: 41 additions & 0 deletions criu/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# LSF checkpoint with CRIU

## Usage

To prepare the checkpoint environment for [CRIU](https://criu.org/Main_Page),
download the `echkpnt.criu` and `erestart.criu` scripts and copy them into your `LSF` environment.
```
$ cp echkpnt.criu erestart.criu $LSF_SERVERDIR
```

To use `CRIU` checkpoint method:
1. submit a job with method `criu`
```
$ bsub -k '/usr/local/work/chkpnt/ method=criu' ./counter
Job <477> is submitted to default queue <normal>.
```

2. checkpoint the job and kill it
```
$ bchkpnt -k 477
Job <477> is being checkpointed
```

3. restart the checkpointed job
```
$ brestart /usr/local/work/chkpnt/ 477
Job <478> is submitted to queue <normal>.
```
## Debug
By default, the `CRIU` checkpoint scripts write debug information to a log file in the `/tmp` directory.
```
$ cat /tmp/lsf-job-477.cr.log
2021-11-02 19:09:27,035 lsf-criu-checkpoint[2670944]: start checkpointing ......
2021-11-02 19:09:27,035 lsf-criu-checkpoint[2670944]: ['-c', '-k', '-d', '477.tmp', '2669404']
2021-11-02 19:09:27,044 lsf-criu-checkpoint[2670944]: job process id: 2669408
2021-11-02 19:09:27,044 lsf-criu-checkpoint[2670944]: ['criu', 'dump', '-D', '/usr/local/work/chkpnt//477', '-t', '2669408', '--shell-job']
2021-11-02 19:09:27,864 lsf-criu-checkpoint[2670944]: b''
2021-11-02 19:09:27,864 lsf-criu-checkpoint[2670944]: b'Warn (compel/arch/x86/src/lib/infect.c:340): Will restore 2669410 with interrupted system call\n'
2021-11-02 19:09:27,865 lsf-criu-checkpoint[2670944]: leave checkpointing ...
```

76 changes: 76 additions & 0 deletions criu/echkpnt.criu
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/usr/bin/python3

# Copyright International Business Machines Corp, 2021
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import os
import psutil
import logging
import subprocess

# Module-level logger used by log_debug(); it stays None (nothing is logged)
# until set_log() replaces it with a configured logger.
log = None

def set_log(logfile):
    """Point the module-level ``log`` at a DEBUG logger that writes to *logfile*.

    Every record is formatted as ``<time> <logger name>[<pid>]: <message>``.
    """
    global log

    # Build the file handler first so a bad path fails before ``log`` changes.
    handler = logging.FileHandler(logfile)
    handler.setFormatter(
        logging.Formatter("%(asctime)s %(name)s[%(process)d]: %(message)s")
    )

    logger = logging.getLogger('lsf-criu-checkpoint')
    logger.setLevel(logging.DEBUG)
    logger.addHandler(handler)
    log = logger

def log_debug(message):
    """Log *message* at DEBUG level; do nothing while no logger is configured."""
    if log is None:
        return
    log.debug(message)

def run_it(cmd):
    """Run *cmd* and copy whatever it printed into the debug log.

    Args:
        cmd: Argument list (program followed by its arguments). It is
            executed directly, without a shell.

    Returns:
        The exit status of the command.
    """
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() drains both pipes and waits for the process to finish,
    # so no separate wait() call is needed afterwards.
    out, err = proc.communicate()
    for stream in (out, err):
        # The pipes yield bytes. Comparing bytes with '' is never equal, which
        # used to put empty b'' entries in the log; decode and test the text.
        text = stream.decode(errors="replace").strip() if stream else ""
        if text:
            log_debug(text)
    if proc.returncode != 0:
        log_debug("command exited with status " + str(proc.returncode))
    return proc.returncode

def main(argv):
    """Checkpoint the running LSF job with ``criu dump``.

    Reads ``LSB_JOBID`` (used in the log file name) and ``LSB_CHKPNT_DIR``
    (the directory passed to ``criu dump -D``) from the environment.

    Args:
        argv: Arguments passed to the script. The last item is the ID of the
            process whose first child is dumped. Unless ``-k`` is present,
            ``--leave-running`` is added so the job keeps running.

    Raises:
        SystemExit: always; status 1 when there is no child process to dump
            or the dump reported a failure, otherwise 0.
    """
    # For debug purpose only. Comment it out if you don't want debug messages
    set_log("/tmp/lsf-job-" + os.environ['LSB_JOBID'] + ".cr.log")

    log_debug("start checkpointing ......")
    log_debug(argv)

    # get process id for running job to checkpoint
    children = psutil.Process(int(argv[-1])).children()
    if not children:
        # Fail with a logged reason instead of an unexplained IndexError.
        log_debug("process " + str(argv[-1]) + " has no child process to checkpoint")
        sys.exit(1)
    jobpid = str(children[0].pid)
    log_debug('job process id: ' + jobpid)

    # make checkpoint command line
    cmd = ["criu", "dump", "-D", os.environ['LSB_CHKPNT_DIR'], "-t", jobpid, "--shell-job"]
    if '-k' not in argv:
        cmd.append('--leave-running')
    log_debug(cmd)

    # checkpoint
    status = run_it(cmd)

    log_debug("leave checkpointing ...")
    # Report failure when run_it() hands back a non-zero exit status. When no
    # status is available (None) the script exits 0, as it always did.
    sys.exit(1 if status else 0)


if __name__ == "__main__":
    # Run as a script: hand everything after the program name to main().
    cli_args = sys.argv[1:]
    main(cli_args)

89 changes: 89 additions & 0 deletions criu/erestart.criu
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#!/usr/bin/python3

# Copyright International Business Machines Corp, 2021
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import os
import re
import logging
import subprocess

# Module-level logger used by log_debug(); it stays None (nothing is logged)
# until set_log() replaces it with a configured logger.
log = None

def set_log(logfile):
    """Configure the module-level ``log`` to write DEBUG records to *logfile*.

    Records are formatted as ``<time> <logger name>[<pid>]: <message>``.
    """
    global log

    record_format = "%(asctime)s %(name)s[%(process)d]: %(message)s"
    file_handler = logging.FileHandler(logfile)
    file_handler.setFormatter(logging.Formatter(record_format))

    restart_logger = logging.getLogger('lsf-criu-restart')
    restart_logger.setLevel(logging.DEBUG)
    restart_logger.addHandler(file_handler)
    log = restart_logger

def log_debug(message):
    """Send *message* to the configured logger at DEBUG level, if there is one."""
    if log is None:
        # set_log() has not been called, so logging is switched off.
        return
    log.debug(message)

def run_it(cmd):
    """Run *cmd* and copy whatever it printed into the debug log.

    Args:
        cmd: Argument list (program followed by its arguments). It is
            executed directly, without a shell.

    Returns:
        The exit status of the command.
    """
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() drains both pipes and waits for the process to finish,
    # so no separate wait() call is needed afterwards.
    out, err = proc.communicate()
    for stream in (out, err):
        # The pipes yield bytes. Comparing bytes with '' is never equal, which
        # used to put empty b'' entries in the log; decode and test the text.
        text = stream.decode(errors="replace").strip() if stream else ""
        if text:
            log_debug(text)
    if proc.returncode != 0:
        log_debug("command exited with status " + str(proc.returncode))
    return proc.returncode

def main(argv):
    """Write the ``criu restore`` command used to restart the job.

    The command is stored as an ``LSB_RESTART_CMD=...`` line in
    ``$LSB_CHKPNT_DIR/.restart_cmd``. If a directory named
    ``job.<LSB_JOBID>.*`` exists under
    ``/sys/fs/cgroup/memory/lsf/<LSF_CGROUP_TOPDIR_KEY>``, ``--cgroup-root``
    options for the memory, freezer and cpu,cpuacct controllers are added.
    When the key is unset or the directory cannot be listed, the command is
    written without cgroup options.

    Args:
        argv: Arguments passed to the script; they are only logged.

    Raises:
        SystemExit: always, with status 0, once the file is written.
    """
    jobid = os.environ['LSB_JOBID']

    # for debug purpose only
    set_log("/tmp/lsf-job-" + jobid + ".cr.log")

    log_debug("start restarting ......")
    log_debug(argv)

    # get job cgroup from memory sub group
    topdir_key = os.environ.get('LSF_CGROUP_TOPDIR_KEY')
    jobcgroup = ""
    if topdir_key is not None:
        cgtop = "/sys/fs/cgroup/memory/lsf/" + topdir_key
        # Raw strings keep the regex escapes valid; the job ID is escaped so
        # it is always matched literally.
        jobcgre = re.compile(r"^job\." + re.escape(jobid) + r"\..*")
        try:
            entries = os.listdir(cgtop)
        except OSError as exc:
            # No such hierarchy on this host: continue without cgroup options.
            log_debug("cannot list " + cgtop + ": " + str(exc))
            entries = []
        jobcgroup = next((name for name in entries if jobcgre.match(name)), "")

    log_debug("job cgroup is <" + jobcgroup + ">")

    cgrp_options = ""
    if jobcgroup != "":
        cgrp_path = "/lsf/" + topdir_key + "/" + jobcgroup + "/"
        cgrp_options = "".join(
            " --cgroup-root " + controller + ":" + cgrp_path
            for controller in ("memory", "freezer", "cpu,cpuacct")
        )

    chkpnt_dir = os.environ['LSB_CHKPNT_DIR']
    cmd = "LSB_RESTART_CMD=criu restore -D " + chkpnt_dir + cgrp_options + " --shell-job \n"
    log_debug(cmd)

    # The with-block closes the file even if the write fails.
    with open(chkpnt_dir + "/.restart_cmd", "w") as restart_file:
        restart_file.write(cmd)

    log_debug("leaving ...")
    sys.exit(0)


if __name__ == "__main__":
    # Run as a script: hand everything after the program name to main().
    script_args = sys.argv[1:]
    main(script_args)

0 comments on commit 082d1e6

Please sign in to comment.