Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add archive #287

Merged
merged 5 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/presubmit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ jobs:
run: tar -xvf thapi.tar
- run: sudo apt update; sudo apt install -y $APT_PACKAGE bats coreutils libpocl2 clinfo
- run: sudo gem install babeltrace2 opencl_ruby_ffi
- name: Load Babeltrace2
- name: Load Efficios Dependencies
run: |
echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH
echo "LD_LIBRARY_PATH=$HOME/efficios_dep/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV
Expand All @@ -214,7 +214,7 @@ jobs:
key: ${{ runner.os }}-build-${{ env.cache-name }}
- run: sudo apt update; sudo apt install -y $APT_PACKAGE
- run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
- name: Load Babeltrace2
- name: Load Efficios Dependencies
run: |
echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH
echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV
Expand Down Expand Up @@ -252,7 +252,7 @@ jobs:
key: ${{ runner.os }}-build-${{ env.cache-name }}
- run: sudo apt update; sudo apt install -y $APT_PACKAGE
- run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
- name: Load Babeltrace2
- name: Load Efficios Dependencies
run: |
echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH
echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV
Expand Down Expand Up @@ -284,7 +284,7 @@ jobs:
key: ${{ runner.os }}-build-${{ env.cache-name }}
- run: sudo apt update; sudo apt install -y $APT_PACKAGE
- run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
- name: Load Babeltrace2
- name: Load Efficios Dependencies
run: |
echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH
echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV
Expand Down
4 changes: 4 additions & 0 deletions integration_tests/general.bats
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ teardown_file() {
rm out.pftrace
}

# Runs $IPROF with the --archive flag on the $THAPI_TEST_BIN test binary.
@test "archive_summary" {
$IPROF --archive $THAPI_TEST_BIN
}

@test "replay_summary" {
$IPROF $THAPI_TEST_BIN
$IPROF -r
Expand Down
9 changes: 9 additions & 0 deletions utils/babeltrace_thapi.in
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ def get_components(names)
components_classes = {
'source.ctf.fs' => BT2::BTPlugin.find('ctf').get_source_component_class_by_name('fs'),
'source.ctf.lttng_live' => BT2::BTPlugin.find('ctf').get_source_component_class_by_name('lttng-live'),
'source.ctf.lttng_archive' => BT2::BTPlugin.find('ctf').get_source_component_class_by_name('lttng-archive'),
'filter.utils.muxer' => BT2::BTPlugin.find('utils').get_filter_component_class_by_name('muxer'),
'sink.text.pretty' => BT2::BTPlugin.find('text').get_sink_component_class_by_name('pretty'),
'sink.ctf.fs' => BT2::BTPlugin.find('ctf').get_sink_component_class_by_name('fs'),
Expand Down Expand Up @@ -200,6 +201,10 @@ def get_and_add_components(graph, names, l_inputs)
graph.add(comp, 'source_live',
params: { 'inputs' => $options[:inputs],
'session-not-found-action' => 'end' })
when 'source.ctf.lttng_archive'
graph.add(comp, 'source_archive',
params: { 'session-name' => $options[:archive],
'session-found-file-path' => $options[:'archive-session-found-file-path'] })
when 'source.ctf.fs'
s = Find.find(*l_inputs)
.reject { |path| FileTest.directory?(path) }
Expand Down Expand Up @@ -281,6 +286,8 @@ def bt_graphs(inputs)
@bt_graphs[inputs] ||= begin
g_comps = [if $options[:live]
'source.ctf.lttng_live'
elsif $options[:archive]
'source.ctf.lttng_archive'
else
'source.ctf.fs'
end]
Expand Down Expand Up @@ -354,6 +361,8 @@ class BabeltraceParserThapi < OptionParserWithDefaultAndValidation
'Format: backend_name[:backend_level],...',
default: ['mpi:3', 'omp:2', 'cl:1', 'ze:1', 'cuda:1', 'hip:1'])
on('--debug', default: false)
on('--archive SESSION-NAME')
on('--archive-session-found-file-path PATH')
on('--[no-]muxer')
on('-v', '--version', 'Print the version string') do
puts File.read(File.join(DATADIR, 'version'))
Expand Down
79 changes: 69 additions & 10 deletions xprof/xprof.rb.in
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ PREFIX = '@prefix@'
DATAROOTDIR = File.join(PREFIX, 'share')
DATADIR = DATAROOTDIR

LTTNG_ARCHIVE_SIZE = '50M'
LTTNG_ARCHIVE_TIMER = '60s'
LTTNG_DIRWATCH_SIZE = '500' # In MiB
LTTNG_DIRWATCH_LOCK_RETRY_DELAY = 0.1

$LOAD_PATH.unshift(DATADIR) if File.directory?(DATADIR)
require 'open3'
require 'fileutils'
Expand Down Expand Up @@ -311,6 +316,7 @@ class Sync_daemon
raise
ensure
return unless f

f.global_barrier
f.finalize
end
Expand Down Expand Up @@ -560,16 +566,24 @@ def lm_setup_lttng(backends)
end

end
# This is required to force the creation of a trace,
# so that dirwatch doesn't complain about an empty trace
if OPTIONS[:archive]
exec("lttng enable-rotation --session=#{lttng_session_uuid} --size=#{LTTNG_ARCHIVE_SIZE} --timer=#{LTTNG_ARCHIVE_TIMER}")
end
exec("lttng start #{lttng_session_uuid}")
end

# Tear down the LTTng session identified by `lttng_session_uuid`.
#
# Must only be called by the local master; any other rank hits the bare `raise`.
# When the `:archive` option is set, the session is rotated before being destroyed.
def lm_lttng_teardown_session
  raise unless mpi_local_master?

  if OPTIONS[:archive]
    exec("lttng rotate #{lttng_session_uuid}")
  end
  exec("lttng destroy #{lttng_session_uuid}")
end

def lm_lttng_kill_sessiond
raise unless mpi_local_master?

# Need to kill the sessiond Daemon. It's safe because each job has their own
#
# In theory, opening the lttng-sessiond.pid file is racy.
Expand All @@ -596,15 +610,35 @@ def lm_babeltrace(backends)
opts << "--output #{thapi_trace_dir_tmp}"
opts << "--backends #{backends.join(',')}"
opts << '--no-discard-metadata' if type == 'aggreg' && OPTIONS.include?(:'kernel-verbose')
exec("#{BINDIR}/babeltrace_thapi #{opts.join(' ')} -- #{lttng_trace_dir_tmp}")

if OPTIONS[:archive]
read_file = File.join(lttng_trace_dir_tmp, 'bt_archive_ready')
opts << "--archive #{lttng_session_uuid} --archive-session-found-file-path=#{read_file}"
cmd = "#{BINDIR}/babeltrace_thapi #{opts.join(' ')} -- #{lttng_trace_dir_tmp}"
LOGGER.debug(cmd)
pid_bt = spawn(cmd)

cmd = "dirwatch.py --log-level=CRITICAL #{lttng_session_uuid} #{LTTNG_DIRWATCH_SIZE}"
LOGGER.debug(cmd)
pid_dirwatch = spawn(cmd)

until File.exist?(read_file)
# Ensure that dirwatch.py didn't crash, and deadlock
Process.wait(pid_dirwatch, Process::WNOHANG)
sleep(LTTNG_DIRWATCH_LOCK_RETRY_DELAY)
end
[pid_bt, pid_dirwatch]
else
exec("#{BINDIR}/babeltrace_thapi #{opts.join(' ')} -- #{lttng_trace_dir_tmp}")
end
end

# _
# |_) ._ _ _ _ _ _ o ._ _
# | | (_) (_ (/_ _> _> | | | (_|
# _|

# Some naming convension
# Some naming convention
# lm == function executed only local_master
# gm == function executed only global_master

Expand All @@ -614,6 +648,11 @@ def lm_move_to_shared
if OPTIONS.include?(:trace) || !OPTIONS[:analysis]
# The Apps finished, lttng finished, need to move to the shared tmp folder
FileUtils.mkdir_p(File.dirname(thapi_trace_dir_tmp))
# NOTE: I don't understand `mv`
# FileUtils.mv(a, b) will put a into b (aka b/a) when b is an existing directory
# File.rename(a, b) will move a as b, but may
# raise an invalid cross-device link error.
# So we use `exec(mv -T a b)`, which has the added benefit of logging
exec("mv #{lttng_trace_dir_tmp} #{thapi_trace_dir_tmp}")
else
# `lm_babeltrace` finished, can remove `tmp` folder
Expand All @@ -629,7 +668,7 @@ def gm_rename_folder
# Replace it with a better name, and update the root metadata.

thapi_trace_dir_tmp_root = File.dirname(thapi_trace_dir_tmp)
# Because of `traced-rank`, `mpi_master` may not have any trace avalaible,
# Because of `traced-rank`, `mpi_master` may not have any trace available,
# so find the first hostname who have a metadata
FileUtils.cp(Dir.glob("#{thapi_trace_dir_tmp_root}/*/thapi_metadata.yaml").first,
File.join(thapi_trace_dir_tmp_root, 'thapi_metadata.yaml'))
Expand All @@ -653,26 +692,45 @@ def trace_and_on_node_processing(usr_argv)
# All ranks need to set the LTTNG_HOME env
# so they can have access to the daemon
ENV['LTTNG_HOME'] = lttng_home_dir
# Only local master spawn LTTNG daemon and start session
lm_setup_lttng(backends) if mpi_local_master?
LOGGER.debug("LTTNG_HOME = #{ENV.fetch('LTTNG_HOME', nil)}")

# Only the local master spawns the daemons (lttng, and babeltrace)
# and then starts the lttng-session
pids = if mpi_local_master?
lm_setup_lttng(backends)
lm_babeltrace(backends) if OPTIONS[:archive]
end
# Other local node cannot start before lttng and the daemon
syncd.local_barrier('waiting_for_lttng_setup')
# Launch User Command
launch_usr_bin(h, usr_argv)

# We need to be sure that all the local ranks are finished
# We need to ensure that all the local ranks have finished
# running the user application
# before the local master stops the lttng session
syncd.local_barrier('waiting_for_application_ending')

# Everything from now on, is some local-master processing
# The `Sync_daemon` context will handle the call to the global barrier
# for the early exiting ranks
return unless mpi_local_master?

# Stop Lttng session
# Stop Lttng session and babeltrace daemons
lm_lttng_teardown_session
# Lttng session is finished,
if OPTIONS[:archive]
LOGGER.debug("Waiting for babeltrace_thapi and dirwatch (#{pids}) to finish")
pids.each do |pid|
Process.wait(pid)
raise "#{pid} failed" unless $?.success?
end
end
# we can kill the session daemon
lm_lttng_kill_sessiond
# Preprocess trace
lm_babeltrace(backends)
lm_babeltrace(backends) unless OPTIONS[:archive]
lm_move_to_shared
end
# Global master rename the unique trace folder to a more
# human friendly name
gm_rename_folder if mpi_master?
end

Expand Down Expand Up @@ -767,6 +825,7 @@ if __FILE__ == $PROGRAM_NAME
parser.on('-b', '--backends BACKENDS', Array, "Select which and how backends' need to handled.",
'Format: backend_name[:backend_level],...',
default: ['mpi:3', 'omp:2', 'cl:1', 'ze:1', 'cuda:1', 'hip:1'])
parser.on('--[no-]archive', 'Trigger for archive support', default: false)

# Analysis
parser.on('-r', '--replay [PATH]', 'Replay traces for post-mortem analysis.',
Expand Down