Skip to content

Commit 1b2f685

Browse files
committed
https://github.com/ggerganov/llama.cpp/pull/11427
metal : use residency sets
1 parent 9e2634d commit 1b2f685

File tree

1 file changed

+117
-18
lines changed

1 file changed

+117
-18
lines changed

ggml/src/ggml-metal/ggml-metal.m

Lines changed: 117 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,10 @@
1919
// max number of MTLCommandBuffer used to submit a graph for processing
2020
#define GGML_METAL_MAX_COMMAND_BUFFERS 8
2121

22-
#define UNUSED(x) (void)(x)
22+
// create residency sets only on macOS >= 15.0
23+
#if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000
24+
#define GGML_METAL_HAS_RESIDENCY_SETS 1
25+
#endif
2326

2427
// globals
2528

@@ -39,6 +42,7 @@
3942

4043
bool has_simdgroup_reduction;
4144
bool has_simdgroup_mm;
45+
bool has_residency_sets;
4246
bool has_bfloat;
4347
bool use_bfloat;
4448

@@ -48,6 +52,7 @@
4852
/*.mtl_device_ref_count =*/ 0,
4953
/*.has_simdgroup_reduction =*/ false,
5054
/*.has_simdgroup_mm =*/ false,
55+
/*.has_residency_sets =*/ false,
5156
/*.has_bfloat =*/ false,
5257
/*.use_bfloat =*/ false,
5358
/*.name =*/ "",
@@ -64,7 +69,9 @@
6469
ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
6570

6671
ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
67-
72+
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
73+
ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == NULL;
74+
#endif
6875
ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
6976
ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
7077

@@ -483,6 +490,10 @@ @implementation GGMLMetalClass
483490
GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
484491

485492
ctx->queue = [device newCommandQueue];
493+
if (ctx->queue == nil) {
494+
GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
495+
return NULL;
496+
}
486497
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
487498

488499
id<MTLLibrary> metal_library;
@@ -649,6 +660,7 @@ @implementation GGMLMetalClass
649660

650661
GGML_LOG_INFO("%s: simdgroup reduction = %s\n", __func__, ctx_dev->has_simdgroup_reduction ? "true" : "false");
651662
GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm ? "true" : "false");
663+
GGML_LOG_INFO("%s: has residency sets = %s\n", __func__, ctx_dev->has_residency_sets ? "true" : "false");
652664
GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, ctx_dev->has_bfloat ? "true" : "false");
653665
GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, ctx_dev->use_bfloat ? "true" : "false");
654666
GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false");
@@ -1035,8 +1047,70 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
10351047
// multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
10361048
int n_buffers;
10371049
struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
1050+
1051+
// optional MTLResidencySet
1052+
id rset;
10381053
};
10391054

1055+
// rset init
//
// Creates the optional residency set for a buffer context and adds every Metal buffer of the
// context to it.
//
//   ctx     - buffer context; ctx->rset is reset to nil on entry and holds the new set on success
//   ctx_dev - device context; nothing is created when ctx_dev->has_residency_sets is false
//   device  - Metal device used to create the residency set
//
// Returns true when no residency set is needed/available or when it was created successfully,
// and false when the residency set could not be created (ctx->rset is nil in that case).
static bool ggml_backend_metal_buffer_rset_init(
        struct ggml_backend_metal_buffer_context * ctx,
        struct ggml_backend_metal_device_context * ctx_dev,
        id<MTLDevice> device) {
    ctx->rset = nil;

    if (!ctx_dev->has_residency_sets) {
        return true;
    }

#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    if (@available(macOS 15.0, *)) {
        MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
        desc.label = @"ggml_backend_metal";
        desc.initialCapacity = ctx->n_buffers;

        // this code uses manual retain/release, so the local is not zero-initialized for us;
        // start from nil so it is never read while indeterminate
        NSError * error = nil;
        ctx->rset = [device newResidencySetWithDescriptor:desc error:&error];

        // the descriptor is no longer needed on either path
        [desc release];

        // failure is reported through the nil return value; the error object only describes it
        if (ctx->rset == nil) {
            GGML_LOG_ERROR("%s: error: %s\n", __func__, error ? [[error description] UTF8String] : "unknown");
            return false;
        }

        for (int i = 0; i < ctx->n_buffers; i++) {
            [ctx->rset addAllocation:ctx->buffers[i].metal];
        }

        // apply the pending additions before requesting residency
        [ctx->rset commit];
        [ctx->rset requestResidency];

        return true;
    }
#else
    GGML_UNUSED(ctx_dev);
    GGML_UNUSED(device);
#endif

    return true;
}
1098+
1099+
// rset free
//
// Tears down the residency set stored in ctx->rset, if there is one: ends residency, removes all
// allocations from the set and releases it. Does nothing when ctx->rset is nil or when the build
// has no residency set support.
static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) {
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    // no residency set was created for this buffer context
    if (!ctx->rset) {
        return;
    }

    if (@available(macOS 15.0, *)) {
        id residency_set = ctx->rset;

        [residency_set endResidency];
        [residency_set removeAllAllocations];
        [residency_set release];
    }
#else
    GGML_UNUSED(ctx);
#endif
}
1113+
10401114
// finds the Metal buffer that contains the tensor data on the GPU device
10411115
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
10421116
// Metal buffer based on the host memory pointer
@@ -4164,6 +4238,8 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
41644238
for (int i = 0; i < ctx->n_buffers; i++) {
41654239
[ctx->buffers[i].metal release];
41664240
}
4241+
4242+
ggml_backend_metal_buffer_rset_free(ctx);
41674243
ggml_backend_metal_device_rel(buffer->buft->device->context);
41684244

41694245
if (ctx->owned) {
@@ -4186,19 +4262,19 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
41864262
static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
41874263
memset((char *)tensor->data + offset, value, size);
41884264

4189-
UNUSED(buffer);
4265+
GGML_UNUSED(buffer);
41904266
}
41914267

41924268
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
41934269
memcpy((char *)tensor->data + offset, data, size);
41944270

4195-
UNUSED(buffer);
4271+
GGML_UNUSED(buffer);
41964272
}
41974273

41984274
static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
41994275
memcpy(data, (const char *)tensor->data + offset, size);
42004276

4201-
UNUSED(buffer);
4277+
GGML_UNUSED(buffer);
42024278
}
42034279

42044280
static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
@@ -4208,7 +4284,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
42084284
}
42094285
return false;
42104286

4211-
UNUSED(buffer);
4287+
GGML_UNUSED(buffer);
42124288
}
42134289

42144290
static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -4234,7 +4310,7 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_
42344310
static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
42354311
return "Metal";
42364312

4237-
UNUSED(buft);
4313+
GGML_UNUSED(buft);
42384314
}
42394315

42404316
static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
@@ -4258,8 +4334,8 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
42584334
}
42594335
#endif
42604336
#endif
4261-
UNUSED(device);
4262-
UNUSED(size_aligned);
4337+
GGML_UNUSED(device);
4338+
GGML_UNUSED(size_aligned);
42634339
}
42644340

42654341
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@@ -4272,7 +4348,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
42724348
size_aligned += (size_page - (size_aligned % size_page));
42734349
}
42744350

4275-
id<MTLDevice> device = ggml_backend_metal_device_acq(buft->device->context);
4351+
struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context;
4352+
id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
42764353

42774354
ctx->all_data = ggml_metal_host_malloc(size_aligned);
42784355
ctx->all_size = size_aligned;
@@ -4295,7 +4372,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
42954372
if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
42964373
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
42974374
free(ctx);
4298-
ggml_backend_metal_device_rel(buft->device->context);
4375+
ggml_backend_metal_device_rel(ctx_dev);
4376+
return NULL;
4377+
}
4378+
4379+
if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
4380+
GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
4381+
free(ctx);
4382+
ggml_backend_metal_device_rel(ctx_dev);
42994383
return NULL;
43004384
}
43014385

@@ -4306,7 +4390,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
43064390

43074391
static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
43084392
return 32;
4309-
UNUSED(buft);
4393+
GGML_UNUSED(buft);
43104394
}
43114395

43124396
static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
@@ -4316,13 +4400,13 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty
43164400

43174401
return max_size;
43184402

4319-
UNUSED(buft);
4403+
GGML_UNUSED(buft);
43204404
}
43214405

43224406
static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
43234407
return true;
43244408

4325-
UNUSED(buft);
4409+
GGML_UNUSED(buft);
43264410
}
43274411

43284412
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
@@ -4345,7 +4429,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
43454429
static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
43464430
return "Metal_Mapped";
43474431

4348-
UNUSED(buft);
4432+
GGML_UNUSED(buft);
43494433
}
43504434

43514435
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {
@@ -4388,7 +4472,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
43884472
size_aligned += (size_page - (size_aligned % size_page));
43894473
}
43904474

4391-
id<MTLDevice> device = ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main);
4475+
struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
4476+
id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
43924477

43934478
// the buffer fits into the max buffer size allowed by the device
43944479
if (size_aligned <= device.maxBufferLength) {
@@ -4441,6 +4526,13 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
44414526
}
44424527
}
44434528

4529+
if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
4530+
GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
4531+
free(ctx);
4532+
ggml_backend_metal_device_rel(ctx_dev);
4533+
return NULL;
4534+
}
4535+
44444536
return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
44454537
}
44464538

@@ -4449,7 +4541,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
44494541
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
44504542
return "Metal";
44514543

4452-
UNUSED(backend);
4544+
GGML_UNUSED(backend);
44534545
}
44544546

44554547
static void ggml_backend_metal_free(ggml_backend_t backend) {
@@ -4754,6 +4846,13 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
47544846
}
47554847
}
47564848

4849+
if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
4850+
GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
4851+
free(ctx);
4852+
ggml_backend_metal_device_rel(ctx_dev);
4853+
return NULL;
4854+
}
4855+
47574856
return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
47584857
}
47594858

@@ -4767,7 +4866,7 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml
47674866
return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name ||
47684867
buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;
47694868

4770-
UNUSED(dev);
4869+
GGML_UNUSED(dev);
47714870
}
47724871

47734872
static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {

0 commit comments

Comments
 (0)