Skip to content

Commit 0bf4d99

Browse files
ikawrakow and Kawrakow authored
Do not quantize activations if not necessary (ikawrakow#79)
* Do not quantize activations if not necessary * Do not quantize activations if not necessary also for MoE models --------- Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent ba39280 commit 0bf4d99

File tree

2 files changed

+45
-14
lines changed

2 files changed

+45
-14
lines changed

ggml/include/ggml.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -654,6 +654,7 @@ extern "C" {
654654
// since https://github.com/ggerganov/ggml/issues/287
655655
struct ggml_cplan {
656656
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
657+
size_t q_size;
657658
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
658659

659660
int n_threads;

ggml/src/ggml.c

Lines changed: 44 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2102,6 +2102,7 @@ struct ggml_compute_params {
21022102

21032103
// work buffer for all threads
21042104
size_t wsize;
2105+
size_t qsize;
21052106
void * wdata;
21062107

21072108
struct ggml_compute_state_shared * shared;
@@ -13421,7 +13422,12 @@ UseGgmlGemm1:;
1342113422
#endif
1342213423

1342313424
if (src1->type != vec_dot_type) {
13424-
char * wdata = params->wdata;
13425+
char * wdata = (char *)params->wdata + params->wsize - params->qsize;
13426+
13427+
if (strncmp(src1->name, wdata - GGML_MAX_NAME, GGML_MAX_NAME) == 0) {
13428+
goto AlreadyQunatized;
13429+
}
13430+
wdata += GGML_MAX_NAME;
1342513431

1342613432
#if IK_PRINT_TIMING
1342713433
int64_t t1 = ggml_time_us();
@@ -13431,7 +13437,7 @@ UseGgmlGemm1:;
1343113437
const size_t nbw2 = nbw1*ne11;
1343213438
const size_t nbw3 = nbw2*ne12;
1343313439

13434-
assert(params->wsize >= ne13*nbw3);
13440+
assert(params->qsize >= ne13*nbw3);
1343513441
GGML_ASSERT(src1->type == GGML_TYPE_F32);
1343613442

1343713443
for (int64_t i13 = 0; i13 < ne13; ++i13) {
@@ -13459,14 +13465,18 @@ UseGgmlGemm1:;
1345913465
#endif
1346013466

1346113467
if (ith == 0) {
13468+
wdata -= GGML_MAX_NAME;
13469+
memcpy(wdata, src1->name, GGML_MAX_NAME);
1346213470
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
1346313471
atomic_store(&params->shared->current_chunk, nth);
1346413472
}
1346513473

13474+
AlreadyQunatized:;
1346613475
ggml_barrier(params->shared);
1346713476
}
1346813477

13469-
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
13478+
const void * wdata = (src1->type == vec_dot_type) ? src1->data
13479+
: (const void *)((const char *)params->wdata + params->wsize - params->qsize + GGML_MAX_NAME);
1347013480

1347113481
#if GGML_USE_IQK_MULMAT
1347213482
if (src1->type != vec_dot_type && dst->type == GGML_TYPE_F32) {
@@ -13631,9 +13641,10 @@ static void ggml_compute_forward_mul_mat_id(
1363113641
const int n_ids = ids->ne[0]; // n_expert_used
1363213642
const int n_as = ne02; // n_expert
1363313643

13634-
char * wdata_src1_end = (src1->type == vec_dot_type) ?
13635-
(char *) params->wdata :
13636-
(char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
13644+
char * qdata = (char *)params->wdata + params->wsize - params->qsize;
13645+
13646+
char * wdata_src1_end = (src1->type == vec_dot_type) ? qdata :
13647+
qdata + GGML_PAD(GGML_MAX_NAME + ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
1363713648

1363813649
struct mmid_row_mapping {
1363913650
int32_t i1;
@@ -13643,14 +13654,19 @@ static void ggml_compute_forward_mul_mat_id(
1364313654
int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
1364413655
struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
1364513656

13657+
bool store_name = false;
1364613658
if (src1->type != vec_dot_type) {
13647-
char * wdata = params->wdata;
13659+
if (strncmp(src1->name, qdata, GGML_MAX_NAME) == 0) {
13660+
goto QuantizationAlreadyDone;
13661+
}
13662+
store_name = true;
13663+
char * wdata = qdata + GGML_MAX_NAME;
1364813664

1364913665
const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
1365013666
const size_t nbw2 = nbw1*ne11;
1365113667
const size_t nbw3 = nbw2*ne12;
1365213668

13653-
assert(params->wsize >= ne13*nbw3);
13669+
assert(params->qsize >= ne13*nbw3);
1365413670
GGML_ASSERT(src1->type == GGML_TYPE_F32);
1365513671

1365613672
for (int64_t i13 = 0; i13 < ne13; ++i13) {
@@ -13666,7 +13682,12 @@ static void ggml_compute_forward_mul_mat_id(
1366613682

1366713683
#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
1366813684

13685+
QuantizationAlreadyDone:;
1366913686
if (ith == 0) {
13687+
if (store_name) {
13688+
memcpy(qdata, src1->name, GGML_MAX_NAME);
13689+
}
13690+
1367013691
// initialize matrix_row_counts
1367113692
memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
1367213693

@@ -13695,7 +13716,7 @@ static void ggml_compute_forward_mul_mat_id(
1369513716

1369613717
const char * src0_cur = (const char *) src0->data + cur_a*nb02;
1369713718

13698-
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
13719+
const void * wdata = (src1->type == vec_dot_type) ? src1->data : qdata + GGML_MAX_NAME;
1369913720
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
1370013721

1370113722
const int64_t nr0 = ne01; // src0 rows
@@ -20148,6 +20169,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
2014820169
}
2014920170

2015020171
size_t work_size = 0;
20172+
size_t q_size = 0;
2015120173

2015220174
struct ggml_cplan cplan;
2015320175
memset(&cplan, 0, sizeof(struct ggml_cplan));
@@ -20163,6 +20185,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
2016320185
max_tasks = MAX(max_tasks, n_tasks);
2016420186

2016520187
size_t cur = 0;
20188+
size_t cur_q = 0;
2016620189

2016720190
switch (node->op) {
2016820191
case GGML_OP_CPY:
@@ -20193,7 +20216,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
2019320216
const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
2019420217

2019520218
if (node->src[1]->type != vec_dot_type) {
20196-
cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
20219+
cur_q = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
2019720220
}
2019820221
} break;
2019920222
case GGML_OP_MUL_MAT_ID:
@@ -20203,12 +20226,12 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
2020320226
const struct ggml_tensor * src1 = node->src[1];
2020420227
const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
2020520228
if (src1->type != vec_dot_type) {
20206-
cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
20229+
cur_q += ggml_row_size(vec_dot_type, ggml_nelements(src1));
2020720230
}
2020820231
const int n_as = src0->ne[2];
20209-
cur += GGML_PAD(cur, sizeof(int64_t)); // align
20210-
cur += n_as * sizeof(int64_t); // matrix_row_counts
20211-
cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
20232+
cur_q += GGML_PAD(cur, sizeof(int64_t)); // align
20233+
cur_q += n_as * sizeof(int64_t); // matrix_row_counts
20234+
cur_q += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
2021220235
} break;
2021320236
case GGML_OP_OUT_PROD:
2021420237
{
@@ -20297,14 +20320,20 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
2029720320
}
2029820321

2029920322
work_size = MAX(work_size, cur);
20323+
q_size = MAX(q_size, cur_q);
2030020324
}
2030120325

2030220326
if (work_size > 0) {
2030320327
work_size += CACHE_LINE_SIZE*(n_threads - 1);
2030420328
}
20329+
if (q_size > 0) {
20330+
q_size += GGML_MAX_NAME;
20331+
}
20332+
work_size += q_size;
2030520333

2030620334
cplan.n_threads = MIN(max_tasks, n_threads);
2030720335
cplan.work_size = work_size;
20336+
cplan.q_size = q_size;
2030820337
cplan.work_data = NULL;
2030920338

2031020339
return cplan;
@@ -20322,6 +20351,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
2032220351
/*.ith =*/ state->ith,
2032320352
/*.nth =*/ state->shared->n_threads,
2032420353
/*.wsize =*/ cplan->work_size,
20354+
/*.qsize =*/ cplan->q_size,
2032520355
/*.wdata =*/ cplan->work_data,
2032620356
/*.shared=*/ state->shared,
2032720357
};

0 commit comments

Comments (0)