@@ -135,8 +135,9 @@ void kernel_runner::prepare_kernel_args(const kernel_selector::KernelsData& kern
135
135
if (zero_points_exist) {
136
136
const auto & zero_point_params =
137
137
static_cast <const kernel_selector::weight_bias_zero_point_params&>(weights_bias_params);
138
- if (weight_zero_point_buffers.empty ()) {
139
- for (auto & weight_zero_point : zero_point_params.weights_zero_points ) {
138
+ if (!zero_point_params.weights_zero_points .empty ()) {
139
+ if (weight_zero_point_buffers.empty ()) {
140
+ auto & weight_zero_point = zero_point_params.weights_zero_points [0 ];
140
141
auto num_of_elements = static_cast <int >(weight_zero_point.PhysicalSize ());
141
142
weight_zero_point_buffers.push_back (
142
143
engine->allocate_memory ({
@@ -145,28 +146,33 @@ void kernel_runner::prepare_kernel_args(const kernel_selector::KernelsData& kern
145
146
tensor (1 , num_of_elements, 1 , 1 ) },
146
147
0 ));
147
148
}
149
+ args.weights_zero_points = weight_zero_point_buffers[0 ];
148
150
}
149
- if (activation_zero_point_buffers.empty ()) {
150
- for (auto & activation_zero_point : zero_point_params.activations_zero_points ) {
151
+ if (!zero_point_params.activations_zero_points .empty ()) {
152
+ if (activation_zero_point_buffers.empty ()) {
153
+ auto & activation_zero_point = zero_point_params.activations_zero_points [0 ];
151
154
auto num_of_elements = static_cast <int >(activation_zero_point.PhysicalSize ());
152
- weight_zero_point_buffers .push_back (
155
+ activation_zero_point_buffers .push_back (
153
156
engine->allocate_memory ({
154
157
from_data_type (activation_zero_point.GetDType ()),
155
158
format::bfyx,
156
159
tensor (1 , num_of_elements, 1 , 1 ) },
157
160
0 ));
158
161
}
162
+ args.activations_zero_points = activation_zero_point_buffers[0 ];
159
163
}
160
- if (compensation_buffers.empty ()) {
161
- for (auto & compensation : zero_point_params.compensation ) {
164
+ if (!zero_point_params.compensation .empty ()) {
165
+ if (compensation_buffers.empty ()) {
166
+ auto & compensation = zero_point_params.compensation [0 ];
162
167
auto num_of_elements = static_cast <int >(compensation.PhysicalSize ());
163
- weight_zero_point_buffers .push_back (
168
+ compensation_buffers .push_back (
164
169
engine->allocate_memory ({
165
170
from_data_type (compensation.GetDType ()),
166
171
format::bfyx,
167
172
tensor (1 , num_of_elements, 1 , 1 ) },
168
173
0 ));
169
174
}
175
+ args.compensation = compensation_buffers[0 ];
170
176
}
171
177
}
172
178
}
@@ -202,27 +208,32 @@ std::vector<std::chrono::nanoseconds> kernel_runner::run_kernels(const kernel_se
202
208
int i = 0 ;
203
209
for (auto it = batch_start; it < batch_end; it++) {
204
210
std::vector<event_impl::ptr> events;
205
- auto kernel_run_time = std::chrono::nanoseconds::zero ();
211
+ auto kernel_run_time = std::chrono::nanoseconds::max ();
206
212
int num_of_runs = 0 ;
207
213
208
214
for (int iteration = 0 ; iteration < runs_per_kernel; iteration++) {
209
215
event_impl::ptr event;
210
216
try {
211
217
event = kernels[i].run (0 , it->kernels [0 ], {}, args);
218
+ } catch (std::exception& e) {
219
+ std::cout << " [clDNN] Could not run kernel for auto-tune: " << it->kernelName
220
+ << " with auto-tune index " << it->autoTuneIndex << std::endl
221
+ << " , error message:" << e.what ();
212
222
} catch (...) {
213
223
// Could not run this kernel. Push back NULL event (will be ignored later).
224
+ std::cout << " [clDNN] Could not run kernel for auto-tune: " << it->kernelName
225
+ << " with auto-tune index " << it->autoTuneIndex << std::endl;
214
226
}
215
227
events.push_back (event);
216
228
}
217
-
218
229
context->queue (0 ).finish ();
219
230
220
231
for (auto & event : events) {
221
232
if (event.get () != NULL ) {
222
233
auto profiling_intervals = event->get_profiling_info ();
223
234
for (auto const & profiling_interval : profiling_intervals) {
224
235
if (profiling_interval.name == " executing" ) {
225
- kernel_run_time += profiling_interval.value ->value ();
236
+ kernel_run_time = std::min ( profiling_interval.value ->value (), kernel_run_time );
226
237
num_of_runs++;
227
238
break ;
228
239
}
@@ -231,7 +242,7 @@ std::vector<std::chrono::nanoseconds> kernel_runner::run_kernels(const kernel_se
231
242
}
232
243
233
244
if (num_of_runs > 0 ) {
234
- run_times.push_back (kernel_run_time / num_of_runs );
245
+ run_times.push_back (kernel_run_time);
235
246
num_of_kernels_run += 1 ;
236
247
} else {
237
248
run_times.push_back (std::chrono::nanoseconds::max ());
0 commit comments