Skip to content

Commit

Permalink
Add archive (#287)
Browse files Browse the repository at this point in the history
Enable usage of session rotation for lossless online trace consumption.

---------

Co-authored-by: Thomas Applencourt <[email protected]>
  • Loading branch information
2 people authored and Brice Videau committed Sep 19, 2024
1 parent dcd4aed commit 481dd72
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 14 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/presubmit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ jobs:
run: tar -xvf thapi.tar
- run: sudo apt update; sudo apt install -y $APT_PACKAGE bats coreutils libpocl2 clinfo
- run: sudo gem install babeltrace2 opencl_ruby_ffi
- name: Load Babeltrace2
- name: Load Efficios Dependencies
run: |
echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH
echo "LD_LIBRARY_PATH=$HOME/efficios_dep/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV
Expand All @@ -214,7 +214,7 @@ jobs:
key: ${{ runner.os }}-build-${{ env.cache-name }}
- run: sudo apt update; sudo apt install -y $APT_PACKAGE
- run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
- name: Load Babeltrace2
- name: Load Efficios Dependencies
run: |
echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH
echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV
Expand Down Expand Up @@ -252,7 +252,7 @@ jobs:
key: ${{ runner.os }}-build-${{ env.cache-name }}
- run: sudo apt update; sudo apt install -y $APT_PACKAGE
- run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
- name: Load Babeltrace2
- name: Load Efficios Dependencies
run: |
echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH
echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV
Expand Down Expand Up @@ -284,7 +284,7 @@ jobs:
key: ${{ runner.os }}-build-${{ env.cache-name }}
- run: sudo apt update; sudo apt install -y $APT_PACKAGE
- run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
- name: Load Babeltrace2
- name: Load Efficios Dependencies
run: |
echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH
echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV
Expand Down
4 changes: 4 additions & 0 deletions integration_tests/general.bats
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ teardown_file() {
rm out.pftrace
}

# Smoke test for the archive mode: runs the test binary ($THAPI_TEST_BIN)
# under $IPROF with the `--archive` flag.
@test "archive_summary" {
$IPROF --archive $THAPI_TEST_BIN
}

@test "replay_summary" {
$IPROF $THAPI_TEST_BIN
$IPROF -r
Expand Down
9 changes: 9 additions & 0 deletions utils/babeltrace_thapi.in
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ def get_components(names)
components_classes = {
'source.ctf.fs' => BT2::BTPlugin.find('ctf').get_source_component_class_by_name('fs'),
'source.ctf.lttng_live' => BT2::BTPlugin.find('ctf').get_source_component_class_by_name('lttng-live'),
'source.ctf.lttng_archive' => BT2::BTPlugin.find('ctf').get_source_component_class_by_name('lttng-archive'),
'filter.utils.muxer' => BT2::BTPlugin.find('utils').get_filter_component_class_by_name('muxer'),
'sink.text.pretty' => BT2::BTPlugin.find('text').get_sink_component_class_by_name('pretty'),
'sink.ctf.fs' => BT2::BTPlugin.find('ctf').get_sink_component_class_by_name('fs'),
Expand Down Expand Up @@ -200,6 +201,10 @@ def get_and_add_components(graph, names, l_inputs)
graph.add(comp, 'source_live',
params: { 'inputs' => $options[:inputs],
'session-not-found-action' => 'end' })
when 'source.ctf.lttng_archive'
graph.add(comp, 'source_archive',
params: { 'session-name' => $options[:archive],
'session-found-file-path' => $options[:'archive-session-found-file-path'] })
when 'source.ctf.fs'
s = Find.find(*l_inputs)
.reject { |path| FileTest.directory?(path) }
Expand Down Expand Up @@ -281,6 +286,8 @@ def bt_graphs(inputs)
@bt_graphs[inputs] ||= begin
g_comps = [if $options[:live]
'source.ctf.lttng_live'
elsif $options[:archive]
'source.ctf.lttng_archive'
else
'source.ctf.fs'
end]
Expand Down Expand Up @@ -354,6 +361,8 @@ class BabeltraceParserThapi < OptionParserWithDefaultAndValidation
'Format: backend_name[:backend_level],...',
default: ['mpi:3', 'omp:2', 'cl:1', 'ze:1', 'cuda:1', 'hip:1'])
on('--debug', default: false)
on('--archive SESSION-NAME')
on('--archive-session-found-file-path PATH')
on('--[no-]muxer')
on('-v', '--version', 'Print the version string') do
puts File.read(File.join(DATADIR, 'version'))
Expand Down
79 changes: 69 additions & 10 deletions xprof/xprof.rb.in
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ PREFIX = '@prefix@'
DATAROOTDIR = File.join(PREFIX, 'share')
DATADIR = DATAROOTDIR

# Rotation thresholds handed to `lttng enable-rotation` (`--size` and `--timer`)
# when the archive option is enabled.
LTTNG_ARCHIVE_SIZE = '50M'
LTTNG_ARCHIVE_TIMER = '60s'
# Second positional argument passed to `dirwatch.py` (after the session name).
LTTNG_DIRWATCH_SIZE = '500' # In MiB
# Delay given to `sleep` between two checks for the babeltrace "ready" file.
LTTNG_DIRWATCH_LOCK_RETRY_DELAY = 0.1

$LOAD_PATH.unshift(DATADIR) if File.directory?(DATADIR)
require 'open3'
require 'fileutils'
Expand Down Expand Up @@ -311,6 +316,7 @@ class Sync_daemon
raise
ensure
return unless f

f.global_barrier
f.finalize
end
Expand Down Expand Up @@ -565,16 +571,24 @@ def lm_setup_lttng(backends)
end

end
# This is required to force the creation of a trace,
# so that dirwatch doesn't complain about an empty trace
if OPTIONS[:archive]
exec("lttng enable-rotation --session=#{lttng_session_uuid} --size=#{LTTNG_ARCHIVE_SIZE} --timer=#{LTTNG_ARCHIVE_TIMER}")
end
exec("lttng start #{lttng_session_uuid}")
end

# Tear down the LTTng session. Must only be called by the local master
# (raises otherwise).
# In archive mode, an `lttng rotate` is issued before the session is destroyed.
def lm_lttng_teardown_session
  raise unless mpi_local_master?

  lttng_actions = []
  lttng_actions << 'rotate' if OPTIONS[:archive]
  lttng_actions << 'destroy'
  lttng_actions.each do |action|
    exec("lttng #{action} #{lttng_session_uuid}")
  end
end

def lm_lttng_kill_sessiond
raise unless mpi_local_master?

# Need to kill the sessiond Daemon. It's safe because each job has their own
#
# In theory, opening the lttng-sessiond.pid file is racy.
Expand All @@ -601,15 +615,35 @@ def lm_babeltrace(backends)
opts << "--output #{thapi_trace_dir_tmp}"
opts << "--backends #{backends.join(',')}"
opts << '--no-discard-metadata' if type == 'aggreg' && OPTIONS.include?(:'kernel-verbose')
exec("#{BINDIR}/babeltrace_thapi #{opts.join(' ')} -- #{lttng_trace_dir_tmp}")

if OPTIONS[:archive]
read_file = File.join(lttng_trace_dir_tmp, 'bt_archive_ready')
opts << "--archive #{lttng_session_uuid} --archive-session-found-file-path=#{read_file}"
cmd = "#{BINDIR}/babeltrace_thapi #{opts.join(' ')} -- #{lttng_trace_dir_tmp}"
LOGGER.debug(cmd)
pid_bt = spawn(cmd)

cmd = "dirwatch.py --log-level=CRITICAL #{lttng_session_uuid} #{LTTNG_DIRWATCH_SIZE}"
LOGGER.debug(cmd)
pid_dirwatch = spawn(cmd)

until File.exist?(read_file)
# Ensure that dirwatch.py didn't crash, and deadlock
Process.wait(pid_dirwatch, Process::WNOHANG)
sleep(LTTNG_DIRWATCH_LOCK_RETRY_DELAY)
end
[pid_bt, pid_dirwatch]
else
exec("#{BINDIR}/babeltrace_thapi #{opts.join(' ')} -- #{lttng_trace_dir_tmp}")
end
end

# _
# |_) ._ _ _ _ _ _ o ._ _
# | | (_) (_ (/_ _> _> | | | (_|
# _|

# Some naming convension
# Some naming convention
# lm == function executed only local_master
# gm == function executed only global_master

Expand All @@ -619,6 +653,11 @@ def lm_move_to_shared
if OPTIONS.include?(:trace) || !OPTIONS[:analysis]
# The Apps finished, lttng finished, need to move to the shared tmp folder
FileUtils.mkdir_p(File.dirname(thapi_trace_dir_tmp))
# NOTE: I don't understand `mv`
# File.mv(a, b) will put a into b (aka a/b)
# FileUtils.rename(a,b) will move a as b, but may
# raise Invalid cross-device error.
# So we use `exec(mv -T a b)`, this has the added benefit of logging
exec("mv #{lttng_trace_dir_tmp} #{thapi_trace_dir_tmp}")
else
# `lm_babeltrace` finished, can remove `tmp` folder
Expand All @@ -634,7 +673,7 @@ def gm_rename_folder
# Replace it with a better name, and update the root metadata.

thapi_trace_dir_tmp_root = File.dirname(thapi_trace_dir_tmp)
# Because of `traced-rank`, `mpi_master` may not have any trace avalaible,
# Because of `traced-rank`, `mpi_master` may not have any trace available,
# so find the first hostname that has a metadata file
FileUtils.cp(Dir.glob("#{thapi_trace_dir_tmp_root}/*/thapi_metadata.yaml").first,
File.join(thapi_trace_dir_tmp_root, 'thapi_metadata.yaml'))
Expand All @@ -658,26 +697,45 @@ def trace_and_on_node_processing(usr_argv)
# All ranks need to set the LTTNG_HOME env
# so they can have access to the daemon
ENV['LTTNG_HOME'] = lttng_home_dir
# Only local master spawn LTTNG daemon and start session
lm_setup_lttng(backends) if mpi_local_master?
LOGGER.debug("LTTNG_HOME = #{ENV.fetch('LTTNG_HOME', nil)}")

# Only the local master spawns the daemons (lttng and babeltrace)
# and then starts the lttng session
pids = if mpi_local_master?
lm_setup_lttng(backends)
lm_babeltrace(backends) if OPTIONS[:archive]
end
# Other local node cannot start before lttng and the daemon
syncd.local_barrier('waiting_for_lttng_setup')
# Launch User Command
launch_usr_bin(h, usr_argv)

# We need to be sure that all the local ranks are finished
# We need to ensure that all the local ranks have finished
# running the user application
# before the local master stops the lttng session
syncd.local_barrier('waiting_for_application_ending')

# Everything from now on, is some local-master processing
# The `Sync_daemon` context will handle the call to the global barrier
# for the early exiting ranks
return unless mpi_local_master?

# Stop Lttng session
# Stop Lttng session and babeltrace daemons
lm_lttng_teardown_session
# Lttng session is finished,
if OPTIONS[:archive]
LOGGER.debug("Waiting for babeltrace_thapi and dirwatch (#{pids}) to finish")
pids.each do |pid|
Process.wait(pid)
raise "#{pid} failed" unless $?.success?
end
end
# we can kill the session daemon
lm_lttng_kill_sessiond
# Preprocess trace
lm_babeltrace(backends)
lm_babeltrace(backends) unless OPTIONS[:archive]
lm_move_to_shared
end
# Global master rename the unique trace folder to a more
# human friendly name
gm_rename_folder if mpi_master?
end

Expand Down Expand Up @@ -772,6 +830,7 @@ if __FILE__ == $PROGRAM_NAME
parser.on('-b', '--backends BACKENDS', Array, "Select which and how backends' need to handled.",
'Format: backend_name[:backend_level],...',
default: ['mpi:3', 'omp:2', 'cl:1', 'ze:1', 'cuda:1', 'hip:1'])
# Boolean switch, off by default; its help text is shown to the user.
parser.on('--[no-]archive', 'Trigger for archive support', default: false)

# Analysis
parser.on('-r', '--replay [PATH]', 'Replay traces for post-mortem analysis.',
Expand Down

0 comments on commit 481dd72

Please sign in to comment.