@@ -384,7 +384,6 @@ struct vk_context {
 };
 
 struct ggml_tensor_extra_gpu {
-    ggml_backend_vk_context * backend_ctx;
     size_t ctx_idx;
 
     vk_buffer_ref buffer_gpu;
@@ -2746,9 +2745,6 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
         ggml_vk_ensure_sync_staging_buffer(src->device, size);
         ggml_vk_ensure_sync_staging_buffer(dst->device, size);
 
-        std::lock_guard<std::mutex> src_lock(src->device->mutex);
-        std::lock_guard<std::mutex> dst_lock(dst->device->mutex);
-
         // Copy to src staging buffer
         ggml_vk_buffer_copy(src->device->sync_staging, 0, src, src_offset, size);
         // memcpy to dst staging buffer
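
Aside, not part of the patch: the two std::lock_guard lines removed above take src->device->mutex and then dst->device->mutex in argument order, a pattern that can deadlock if another thread copies between the same two devices in the opposite direction. Should per-device locking ever be reintroduced on this path, std::scoped_lock acquires multiple mutexes with a deadlock-avoidance algorithm. A minimal sketch, with a hypothetical vk_device_stub standing in for the real vk_device:

```cpp
#include <mutex>

// Hypothetical stand-in for the locking-relevant part of ggml's vk_device.
struct vk_device_stub {
    std::mutex mutex;
};

// Locks both device mutexes atomically (deadlock-free even if another thread
// locks them in the opposite order). Assumes src and dst are distinct devices,
// as on the cross-device branch above; scoped_lock must not be handed the
// same mutex twice.
static void copy_between_devices(vk_device_stub & src, vk_device_stub & dst) {
    std::scoped_lock lock(src.mutex, dst.mutex);
    // ... staged device-to-device copy would go here ...
}
```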
@@ -3228,18 +3224,30 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
         stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
     }
 
+    const uint32_t max_groups_x = ctx->device->properties.limits.maxComputeWorkGroupCount[0];
+
+    uint32_t groups_x = ne01;
+    uint32_t groups_z = 1;
+
+    if (ne01 > max_groups_x) {
+        groups_z = 64;
+        groups_x /= groups_z;
+    }
+
     // compute
     const vk_mat_vec_push_constants pc = {
         (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
         stride_batch_x, stride_batch_y, (uint32_t)(ne20*ne21),
         (uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
     };
     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} }, sizeof(vk_mat_vec_push_constants), &pc, { (uint32_t)ne01, (uint32_t)(ne12 * ne13), 1 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
+        { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} },
+        sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
 }
 
 static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32(" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
     std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
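
The block added in this hunk keeps the dispatch within the device's maxComputeWorkGroupCount[0] limit by folding rows from the X dimension into the Z dimension with a fixed factor of 64. As a standalone illustration of the same technique (not the patch's code; the function name is hypothetical), a ceiling-divide variant that sizes the split from the limit itself would look like this:

```cpp
#include <cstdint>

// Split a 1-D dispatch of `rows` workgroups across X and Z so that the X
// dimension never exceeds the device limit. Ceiling division guarantees
// groups_x * groups_z >= rows, so a consumer only needs a bounds check on
// the reconstructed row index.
static void split_dispatch_1d(uint32_t rows, uint32_t max_groups_x,
                              uint32_t & groups_x, uint32_t & groups_z) {
    groups_x = rows;
    groups_z = 1;
    if (rows > max_groups_x) {
        groups_z = (rows + max_groups_x - 1) / max_groups_x;  // ceil(rows / max_groups_x)
        groups_x = (rows + groups_z - 1) / groups_z;          // ceil(rows / groups_z)
    }
}
```

A shader consuming such a dispatch would rebuild the row index from both coordinates, e.g. gl_WorkGroupID.z * gl_NumWorkGroups.x + gl_WorkGroupID.x, and skip indices at or beyond the true row count.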
@@ -3740,6 +3748,16 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
         stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
     }
 
+    const uint32_t max_groups_x = ctx->device->properties.limits.maxComputeWorkGroupCount[0];
+
+    uint32_t groups_x = ne01;
+    uint32_t groups_z = 1;
+
+    if (ne01 > max_groups_x) {
+        groups_z = 64;
+        groups_x /= groups_z;
+    }
+
     // compute
     const vk_mat_vec_id_push_constants pc = {
         (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
@@ -3749,7 +3767,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
         { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23}, { d_ids, ids_buf_offset, ids_sz } },
-        sizeof(vk_mat_vec_id_push_constants), &pc, { (uint32_t)ne01, (uint32_t)nei0, 1 });
+        sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z });
 }
 
 static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
@@ -5606,7 +5624,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     }
 
     extra->ctx_idx = ctx->compute_ctx->idx;
-    extra->backend_ctx = ctx;
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
     // Force context reset on each node so that each tensor ends up in its own context