@@ -6138,7 +6138,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
6138
6138
6139
6139
// multi-thread
6140
6140
6141
- for (uint32_t n_threads = 1 ; n_threads <= std::thread::hardware_concurrency (); n_threads ++) {
6141
+ for (uint32_t k = 1 ; k <= n_threads; k ++) {
6142
6142
char * src = (char *) malloc (size);
6143
6143
char * dst = (char *) malloc (size);
6144
6144
@@ -6149,8 +6149,8 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
6149
6149
double tsum = 0.0 ;
6150
6150
6151
6151
auto helper = [&](int th) {
6152
- const int64_t i0 = (th + 0 )*size/n_threads ;
6153
- const int64_t i1 = (th + 1 )*size/n_threads ;
6152
+ const int64_t i0 = (th + 0 )*size/k ;
6153
+ const int64_t i1 = (th + 1 )*size/k ;
6154
6154
6155
6155
for (size_t i = 0 ; i < n; i++) {
6156
6156
memcpy (dst + i0, src + i0, i1 - i0);
@@ -6161,22 +6161,22 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
6161
6161
6162
6162
const int64_t t0 = ggml_time_us ();
6163
6163
6164
- std::vector<std::thread> threads (n_threads - 1 );
6165
- for (uint32_t th = 0 ; th < n_threads - 1 ; ++th) {
6164
+ std::vector<std::thread> threads (k - 1 );
6165
+ for (uint32_t th = 0 ; th < k - 1 ; ++th) {
6166
6166
threads[th] = std::thread (helper, th);
6167
6167
}
6168
6168
6169
- helper (n_threads - 1 );
6169
+ helper (k - 1 );
6170
6170
6171
- for (uint32_t th = 0 ; th < n_threads - 1 ; ++th) {
6171
+ for (uint32_t th = 0 ; th < k - 1 ; ++th) {
6172
6172
threads[th].join ();
6173
6173
}
6174
6174
6175
6175
const int64_t t1 = ggml_time_us ();
6176
6176
6177
6177
tsum += (t1 - t0)*1e-6 ;
6178
6178
6179
- snprintf (strbuf, sizeof (strbuf), " memcpy: %7.2f GB/s (%2d thread)\n " , (double ) (n*size)/(tsum*1e9 ), n_threads );
6179
+ snprintf (strbuf, sizeof (strbuf), " memcpy: %7.2f GB/s (%2d thread)\n " , (double ) (n*size)/(tsum*1e9 ), k );
6180
6180
s += strbuf;
6181
6181
6182
6182
// needed to prevent the compiler from optimizing the memcpy away
0 commit comments