forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprofiler_kineto.h
218 lines (190 loc) · 7.83 KB
/
profiler_kineto.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#pragma once
#include <string>
#include <vector>
#include <torch/csrc/profiler/api.h>
#include <torch/csrc/profiler/events.h>
#include <torch/csrc/profiler/stubs/base.h>
#include <torch/csrc/profiler/util.h>
namespace torch {
namespace profiler::impl {
struct Result;
namespace kineto {
struct ActivityTraceWrapper;
} // namespace kineto
} // namespace profiler::impl
namespace autograd::profiler {
using experimental_event_t = std::shared_ptr<torch::profiler::impl::Result>;
using extra_meta_t = std::unordered_map<std::string, std::string>;
struct TORCH_API KinetoEvent {
KinetoEvent(
const std::shared_ptr<const torch::profiler::impl::Result>&,
const bool verbose);
uint64_t startThreadId() const;
uint64_t endThreadId() const;
uint8_t activityType() const;
uint64_t fwdThreadId() const;
bool hasShapes() const;
const c10::ArrayRef<std::vector<int64_t>> shapes() const;
bool hasTypes() const;
const c10::ArrayRef<std::string> dtypes() const;
bool hasConcreteInputs() const;
const c10::ArrayRef<c10::IValue> concreteInputs() const;
bool hasKwinputs() const;
const std::unordered_map<std::string, c10::IValue> kwinputs() const;
uint64_t flops() const;
int64_t sequenceNr() const;
bool hasStack() const;
const c10::ArrayRef<std::string> stack() const;
uint8_t scope() const;
bool hasModuleHierarchy() const;
const c10::ArrayRef<std::string> moduleHierarchy() const;
int64_t debugHandle() const;
std::string name() const;
c10::DeviceType deviceType() const;
int deviceIndex() const;
int64_t nBytes() const;
uint64_t startNs() const;
uint64_t endNs() const;
uint64_t durationNs() const;
bool isAsync() const;
uint64_t correlationId() const;
uint64_t linkedCorrelationId() const;
int64_t deviceResourceId() const;
std::string backend() const;
bool isPythonFunction() const;
int64_t cudaElapsedUs() const;
int64_t privateuse1ElapsedUs() const;
void getPerfEventCounters(torch::profiler::perf_counters_t&) const;
extra_meta_t extraMeta() const;
private:
torch::profiler::impl::ProfilerVoidEventStub fallbackStart() const;
torch::profiler::impl::ProfilerVoidEventStub fallbackEnd() const;
std::shared_ptr<const torch::profiler::impl::Result> result_;
std::vector<std::string> python_stack_;
// Copy fields from result so we can return ArrayRefs.
std::vector<std::vector<int64_t>> shapes_;
std::vector<std::string> dtypes_;
std::vector<c10::IValue> concrete_inputs_;
std::unordered_map<std::string, c10::IValue> kwinputs_;
};
// Consolidating events returned directly from Kineto
// with events manually created by us (e.g. start/stop marks,
// memory allocation events)
struct TORCH_API ProfilerResult {
ProfilerResult();
ProfilerResult(
uint64_t start_time,
std::vector<KinetoEvent> events,
std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>&&
trace,
std::vector<experimental_event_t>&& event_tree);
~ProfilerResult();
uint64_t trace_start_ns() const {
return trace_start_ns_;
}
const std::vector<KinetoEvent>& events() const {
return events_;
}
const std::vector<experimental_event_t>& event_tree() const {
return event_tree_;
}
void save(const std::string& path);
private:
uint64_t trace_start_ns_ = 0;
std::vector<KinetoEvent> events_;
std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper> trace_;
std::vector<experimental_event_t> event_tree_;
};
/*
* This API is used by backends to record latency of events that
* happened in the backend but were not visible to pytorch runtime.
* For example, if part of the model is lowered to a dsp backend, then
* the execution of that part of the model is delegated to the backend.
* When backend finishes execution it has an option to provide profiling
* information (latency only at the moment) corresponding to different operators
* that were executed in the backend.
* When such events are recorded by backend using this API, the event
* records will be collected by active kineto profiler. If no kineto profiler
* is active then the event is ignored.
* This provides us with a way to generate all the profiling information
* for a model regardless of where model (or part of it) executed.
* @param start_time_us: start time in us of the event
* @param end_time_us: end time in us of the event
* @param debug_handle: debug handle to correlate this event/op with
* model level module/source information
* @param scope: scope of the event, e.g. LITE_INTERPRETER, RECORD_FN etc.
* @param event_name: name of the event, e.g. op name
* @param backend_name: name of the backend where the event took place.
*/
TORCH_API void reportBackendEventToActiveKinetoProfiler(
const int64_t start_time_us,
const int64_t end_time_us,
const int64_t debug_handle,
const at::RecordScope scope,
const std::string& event_name,
const std::string& backend_name);
TORCH_API void enableProfiler(
const torch::profiler::impl::ProfilerConfig& config,
const std::set<torch::profiler::impl::ActivityType>& activities,
const std::unordered_set<at::RecordScope>& scopes = {});
/*
* Same as enableProfiler but with callback to do post-processing of
* KinetoEvents.
* enableProfilerWithEventPostProcess enables profiler to capture
* specified activities, with specified RecordFunction scope, if any.
* Additionally, it takes a functor that does in-place post processing of
* events, e.g. populate stack trace or module hierarchy information lazily
* using debug_handle.
* Example usage is with lite interpreter that has recording scope of
* LITE_INTERPRETER. In this case lite interpreter runtime, records debug
* handles in RecordFunction, along with other information. Debug handles are
* eventually passed down to KinetoEvent and recorded as part of the event.
* KinetoEdgeCPUProfiler, in torch/csrc/jit/mobile/profiler_edge.cpp, enables
* profiler using post-processing callback, via
* enableProfilerWithEventPostProcess, that takes these debug handles and
* generates stack trace and module hierarchy information, once profiling is
* done.
*/
using post_process_t = std::function<void(
/*debug_handle */ int64_t,
/*jit_stack */ std::vector<std::string>&,
/*jit_modules */ std::vector<std::string>&)>;
TORCH_API void enableProfilerWithEventPostProcess(
const torch::profiler::impl::ProfilerConfig& config,
const std::set<torch::profiler::impl::ActivityType>& activities,
post_process_t&& cb,
const std::unordered_set<at::RecordScope>& scopes = {});
TORCH_API std::unique_ptr<ProfilerResult> disableProfiler();
TORCH_API void prepareProfiler(
const torch::profiler::impl::ProfilerConfig& config,
const std::set<torch::profiler::impl::ActivityType>& activities);
TORCH_API void toggleCollectionDynamic(
const bool enable,
const std::set<torch::profiler::impl::ActivityType>& activities);
/**
* When a C++ thread really has no control over how the profiler was enabled,
* for example, by some unreachable Python code, it can call these functions
* to test/join/unjoin itself into the collection set of a profiler, if any.
* Without calling these functions, the symptom may be "not seeing GPU events
* from some child C++ threads". This is an example on how to use them,
*
* using namespace torch::autograd::profiler;
* bool enabled = isProfilerEnabledInMainThread();
* if (enabled != saved_enabled_state) {
* if (enabled) {
* enableProfilerInChildThread();
* } else {
* disableProfilerInChildThread();
* }
* saved_enabled_state = enabled;
* }
*/
TORCH_API bool isProfilerEnabledInMainThread();
TORCH_API void enableProfilerInChildThread();
TORCH_API void disableProfilerInChildThread();
} // namespace autograd::profiler
namespace profiler::impl {
// Experimental.
TORCH_API void _reportVulkanEventToProfiler(vulkan_id_t id);
} // namespace profiler::impl
} // namespace torch