@@ -2102,6 +2102,7 @@ struct ggml_compute_params {
 
     // work buffer for all threads
     size_t wsize;
+    size_t qsize;
     void * wdata;
 
     struct ggml_compute_state_shared * shared;
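
Note: the new `qsize` field carves a dedicated region out of the tail of the shared work buffer, which the hunks below use to cache quantized activations across matrix multiplications that reuse the same `src1`. A minimal sketch of the implied layout, with illustrative helper names that are not part of the commit:

    #include <stddef.h>

    #define GGML_MAX_NAME 64  // as in ggml.h

    // Tail of the work buffer as this patch lays it out:
    // [ generic scratch ... | name tag (GGML_MAX_NAME bytes) | quantized src1 rows ]
    //                       ^ wdata + wsize - qsize
    static inline char * q_region(void * wdata, size_t wsize, size_t qsize) {
        return (char *)wdata + wsize - qsize;                 // start of the name tag
    }

    static inline char * q_rows(void * wdata, size_t wsize, size_t qsize) {
        return q_region(wdata, wsize, qsize) + GGML_MAX_NAME; // quantized data
    }
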
@@ -13421,7 +13422,12 @@ UseGgmlGemm1:;
 #endif
 
     if (src1->type != vec_dot_type) {
-        char * wdata = params->wdata;
+        char * wdata = (char *)params->wdata + params->wsize - params->qsize;
+
+        if (strncmp(src1->name, wdata, GGML_MAX_NAME) == 0) {
+            goto AlreadyQuantized;
+        }
+        wdata += GGML_MAX_NAME;
 
 #if IK_PRINT_TIMING
         int64_t t1 = ggml_time_us();
@@ -13431,7 +13437,7 @@ UseGgmlGemm1:;
         const size_t nbw2 = nbw1*ne11;
         const size_t nbw3 = nbw2*ne12;
 
-        assert(params->wsize >= ne13*nbw3);
+        assert(params->qsize >= ne13*nbw3);
         GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
         for (int64_t i13 = 0; i13 < ne13; ++i13) {
@@ -13459,14 +13465,18 @@ UseGgmlGemm1:;
 #endif
 
         if (ith == 0) {
+            wdata -= GGML_MAX_NAME;
+            memcpy(wdata, src1->name, GGML_MAX_NAME);
             // Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
             atomic_store(&params->shared->current_chunk, nth);
         }
 
+        AlreadyQuantized:;
         ggml_barrier(params->shared);
     }
 
-    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const void * wdata = (src1->type == vec_dot_type) ? src1->data
+        : (const void *)((const char *)params->wdata + params->wsize - params->qsize + GGML_MAX_NAME);
 
 #if GGML_USE_IQK_MULMAT
     if (src1->type != vec_dot_type && dst->type == GGML_TYPE_F32) {
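
Note: the three hunks above implement a single-entry cache. The first `GGML_MAX_NAME` bytes of the q region hold the name of the tensor whose quantized rows currently live there; when the tag matches `src1->name` the whole quantization loop is skipped, and the `AlreadyQuantized` label sits before `ggml_barrier` so threads taking the shortcut still synchronize with the ones that did the work. A condensed sketch of the pattern, where `quantize_rows` is a hypothetical stand-in for the `from_float` loop:

    #include <string.h>

    #define GGML_MAX_NAME 64  // as in ggml.h

    void quantize_rows(char * dst, const void * src); // hypothetical helper

    // Return quantized rows for `src`, reusing the cached copy when the name
    // tag at `q` already matches; thread 0 writes the tag on a miss.
    const void * cached_quantize(char * q, const char * name, const void * src, int ith) {
        if (strncmp(name, q, GGML_MAX_NAME) != 0) { // cache miss: (re)quantize
            quantize_rows(q + GGML_MAX_NAME, src);
            if (ith == 0) {
                memcpy(q, name, GGML_MAX_NAME);     // tag the cache
            }
        }
        return q + GGML_MAX_NAME;                   // cached rows
    }

The cache keys purely on the tensor name, so it relies on a matching tag implying that the cached rows are still valid.
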
@@ -13631,9 +13641,10 @@ static void ggml_compute_forward_mul_mat_id(
     const int n_ids = ids->ne[0]; // n_expert_used
     const int n_as  = ne02;       // n_expert
 
-    char * wdata_src1_end = (src1->type == vec_dot_type) ?
-            (char *) params->wdata :
-            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
+    char * qdata = (char *)params->wdata + params->wsize - params->qsize;
+
+    char * wdata_src1_end = (src1->type == vec_dot_type) ? qdata :
+            qdata + GGML_PAD(GGML_MAX_NAME + ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
 
     struct mmid_row_mapping {
         int32_t i1;
@@ -13643,14 +13654,19 @@ static void ggml_compute_forward_mul_mat_id(
     int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
     struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
 
+    bool store_name = false;
     if (src1->type != vec_dot_type) {
-        char * wdata = params->wdata;
+        if (strncmp(src1->name, qdata, GGML_MAX_NAME) == 0) {
+            goto QuantizationAlreadyDone;
+        }
+        store_name = true;
+        char * wdata = qdata + GGML_MAX_NAME;
 
         const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
         const size_t nbw2 = nbw1*ne11;
         const size_t nbw3 = nbw2*ne12;
 
-        assert(params->wsize >= ne13*nbw3);
+        assert(params->qsize >= ne13*nbw3);
         GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
         for (int64_t i13 = 0; i13 < ne13; ++i13) {
@@ -13666,7 +13682,12 @@ static void ggml_compute_forward_mul_mat_id(
 
 #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
 
+    QuantizationAlreadyDone:;
     if (ith == 0) {
+        if (store_name) {
+            memcpy(qdata, src1->name, GGML_MAX_NAME);
+        }
+
         // initialize matrix_row_counts
         memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
 
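
Note: the MoE path reuses the same name tag at `qdata`, but a hit may only skip the quantization itself: the `matrix_row_counts`/`matrix_rows` bookkeeping depends on `ids` and is rebuilt on every call, and `store_name` defers writing the tag to thread 0. The q-region layout assumed here (when `src1` already has `vec_dot_type`, the bookkeeping starts right at `qdata`):

    // qdata -> [ name tag: GGML_MAX_NAME bytes                   ]
    //          [ quantized src1 rows                             ]
    //          [ ...padding to a sizeof(int64_t) boundary...     ]
    //          [ int64_t matrix_row_counts[n_as]                 ]
    //          [ struct mmid_row_mapping matrix_rows[n_as][ne11] ]
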
@@ -13695,7 +13716,7 @@ static void ggml_compute_forward_mul_mat_id(
 
         const char * src0_cur = (const char *) src0->data + cur_a*nb02;
 
-        const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+        const void * wdata = (src1->type == vec_dot_type) ? src1->data : qdata + GGML_MAX_NAME;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
         const int64_t nr0 = ne01; // src0 rows
@@ -20148,6 +20169,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
     }
 
     size_t work_size = 0;
+    size_t q_size = 0;
 
     struct ggml_cplan cplan;
     memset(&cplan, 0, sizeof(struct ggml_cplan));
@@ -20163,6 +20185,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
         max_tasks = MAX(max_tasks, n_tasks);
 
         size_t cur = 0;
+        size_t cur_q = 0;
 
         switch (node->op) {
             case GGML_OP_CPY:
@@ -20193,7 +20216,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
 
                     if (node->src[1]->type != vec_dot_type) {
-                        cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
+                        cur_q = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
                     }
                 } break;
             case GGML_OP_MUL_MAT_ID:
@@ -20203,12 +20226,12 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     const struct ggml_tensor * src1 = node->src[1];
                     const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
                     if (src1->type != vec_dot_type) {
-                        cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
+                        cur_q += ggml_row_size(vec_dot_type, ggml_nelements(src1));
                     }
                     const int n_as = src0->ne[2];
-                    cur += GGML_PAD(cur, sizeof(int64_t));       // align
-                    cur += n_as * sizeof(int64_t);               // matrix_row_counts
-                    cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
+                    cur_q += GGML_PAD(cur, sizeof(int64_t));       // align
+                    cur_q += n_as * sizeof(int64_t);               // matrix_row_counts
+                    cur_q += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
                 } break;
             case GGML_OP_OUT_PROD:
                 {
@@ -20297,14 +20320,20 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
         }
 
         work_size = MAX(work_size, cur);
+        q_size = MAX(q_size, cur_q);
     }
 
     if (work_size > 0) {
         work_size += CACHE_LINE_SIZE*(n_threads - 1);
     }
+    if (q_size > 0) {
+        q_size += GGML_MAX_NAME;
+    }
+    work_size += q_size;
 
     cplan.n_threads = MIN(max_tasks, n_threads);
     cplan.work_size = work_size;
+    cplan.q_size    = q_size;
     cplan.work_data = NULL;
 
     return cplan;
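
Note: the planner sizes the cache separately: `cur_q` accumulates each node's quantized-activation bytes, `q_size` keeps the maximum, a single `GGML_MAX_NAME` header is added on top, and the region is appended after the scratch area, which is what lets the compute side recover its start as `wdata + wsize - qsize`. Roughly, with illustrative numbers:

    // work_size  = max scratch over nodes + CACHE_LINE_SIZE*(n_threads - 1)
    // q_size     = max quantized-activation bytes over nodes + GGML_MAX_NAME
    // work_size += q_size  // the cache occupies the tail of the one allocation
    //
    // e.g. 1 MiB of scratch, 256 KiB of quantized activations, 8 threads:
    //   work_size = 1 MiB + 64*7 + (256 KiB + 64)  // CACHE_LINE_SIZE == 64
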
@@ -20322,6 +20351,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             /*.ith   =*/ state->ith,
             /*.nth   =*/ state->shared->n_threads,
             /*.wsize =*/ cplan->work_size,
+            /*.qsize =*/ cplan->q_size,
             /*.wdata =*/ cplan->work_data,
             /*.shared=*/ state->shared,
         };