19
19
// max number of MTLCommandBuffer used to submit a graph for processing
20
20
#define GGML_METAL_MAX_COMMAND_BUFFERS 8
21
21
22
- #define UNUSED (x ) (void )(x)
22
+ // create residency sets only on macOS >= 15.0
23
+ #if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000
24
+ #define GGML_METAL_HAS_RESIDENCY_SETS 1
25
+ #endif
23
26
24
27
// globals
25
28
39
42
40
43
bool has_simdgroup_reduction;
41
44
bool has_simdgroup_mm;
45
+ bool has_residency_sets;
42
46
bool has_bfloat;
43
47
bool use_bfloat;
44
48
48
52
/* .mtl_device_ref_count =*/ 0 ,
49
53
/* .has_simdgroup_reduction =*/ false ,
50
54
/* .has_simdgroup_mm =*/ false ,
55
+ /* .has_residency_sets =*/ false ,
51
56
/* .has_bfloat =*/ false ,
52
57
/* .use_bfloat =*/ false ,
53
58
/* .name =*/ " " ,
64
69
ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily: MTLGPUFamilyMetal3_GGML];
65
70
66
71
ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily: MTLGPUFamilyApple7];
67
-
72
+ #if defined(GGML_METAL_HAS_RESIDENCY_SETS)
73
+ ctx->has_residency_sets = getenv (" GGML_METAL_NO_RESIDENCY" ) == NULL ;
74
+ #endif
68
75
ctx->has_bfloat = [ctx->mtl_device supportsFamily: MTLGPUFamilyMetal3_GGML];
69
76
ctx->has_bfloat |= [ctx->mtl_device supportsFamily: MTLGPUFamilyApple6];
70
77
@@ -483,6 +490,10 @@ @implementation GGMLMetalClass
483
490
GGML_LOG_INFO (" %s : picking default device: %s \n " , __func__, [[device name ] UTF8String ]);
484
491
485
492
ctx->queue = [device newCommandQueue ];
493
+ if (ctx->queue == nil ) {
494
+ GGML_LOG_ERROR (" %s : error: failed to create command queue\n " , __func__);
495
+ return NULL ;
496
+ }
486
497
ctx->d_queue = dispatch_queue_create (" ggml-metal" , DISPATCH_QUEUE_CONCURRENT);
487
498
488
499
id <MTLLibrary > metal_library;
@@ -649,6 +660,7 @@ @implementation GGMLMetalClass
649
660
650
661
GGML_LOG_INFO (" %s : simdgroup reduction = %s \n " , __func__, ctx_dev->has_simdgroup_reduction ? " true" : " false" );
651
662
GGML_LOG_INFO (" %s : simdgroup matrix mul. = %s \n " , __func__, ctx_dev->has_simdgroup_mm ? " true" : " false" );
663
+ GGML_LOG_INFO (" %s : has residency sets = %s \n " , __func__, ctx_dev->has_residency_sets ? " true" : " false" );
652
664
GGML_LOG_INFO (" %s : has bfloat = %s \n " , __func__, ctx_dev->has_bfloat ? " true" : " false" );
653
665
GGML_LOG_INFO (" %s : use bfloat = %s \n " , __func__, ctx_dev->use_bfloat ? " true" : " false" );
654
666
GGML_LOG_INFO (" %s : hasUnifiedMemory = %s \n " , __func__, ctx_dev->mtl_device .hasUnifiedMemory ? " true" : " false" );
@@ -1035,8 +1047,70 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
1035
1047
// multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
1036
1048
int n_buffers;
1037
1049
struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
1050
+
1051
+ // optional MTLResidencySet
1052
+ id rset;
1038
1053
};
1039
1054
1055
// rset init
//
// Prepares the optional MTLResidencySet of a buffer context.
//
//   ctx     - buffer context; ctx->rset is always reset to nil first. ctx->n_buffers and
//             ctx->buffers[i].metal are read, so they must be populated before the call.
//   ctx_dev - device context; nothing is created unless ctx_dev->has_residency_sets is set
//   device  - Metal device used to create the residency set
//
// On success every Metal buffer of the context is added to the new set, the set is
// committed and residency is requested.
//
// Returns false only when the residency set could not be created. Returns true on success
// and whenever residency sets are disabled, compiled out, or not available at runtime
// (in those cases ctx->rset stays nil).
static bool ggml_backend_metal_buffer_rset_init(
        struct ggml_backend_metal_buffer_context * ctx,
        struct ggml_backend_metal_device_context * ctx_dev,
        id<MTLDevice> device) {
    ctx->rset = nil;

    if (!ctx_dev->has_residency_sets) {
        return true;
    }

#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    if (@available(macOS 15.0, *)) {
        MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
        desc.label = @" ggml_backend_metal";
        desc.initialCapacity = ctx->n_buffers;

        // must start as nil: the out-parameter is only guaranteed to be written on failure,
        // and an uninitialized local holds an indeterminate pointer
        NSError * error = nil;
        ctx->rset = [device newResidencySetWithDescriptor:desc error:&error];

        // the descriptor is not needed anymore, whatever the outcome
        [desc release];

        // failure is signalled by the return value, not by the error pointer
        if (ctx->rset == nil) {
            GGML_LOG_ERROR(" %s : error: %s \n ", __func__, error ? [[error description] UTF8String] : "unknown error");
            return false;
        }

        for (int i = 0; i < ctx->n_buffers; i++) {
            [ctx->rset addAllocation:ctx->buffers[i].metal];
        }

        [ctx->rset commit];
        [ctx->rset requestResidency];

        return true;
    }
#else
    GGML_UNUSED(ctx_dev);
    GGML_UNUSED(device);
#endif

    return true;
}
1098
+
1099
// rset free
//
// Tears down the residency set held by ctx, if there is one: ends residency, removes all
// allocations from the set and releases it. ctx->rset is reset to nil afterwards so that
// the context never keeps a pointer to a released object and a repeated call is a no-op.
//
// Does nothing when residency sets are compiled out or unavailable at runtime.
static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) {
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    if (@available(macOS 15.0, *)) {
        if (ctx->rset) {
            [ctx->rset endResidency];
            [ctx->rset removeAllAllocations];
            [ctx->rset release];

            // avoid a dangling pointer / double release
            ctx->rset = nil;
        }
    }
#else
    GGML_UNUSED(ctx);
#endif
}
1113
+
1040
1114
// finds the Metal buffer that contains the tensor data on the GPU device
1041
1115
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
1042
1116
// Metal buffer based on the host memory pointer
@@ -4164,6 +4238,8 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
4164
4238
for (int i = 0 ; i < ctx->n_buffers ; i++) {
4165
4239
[ctx->buffers[i].metal release ];
4166
4240
}
4241
+
4242
+ ggml_backend_metal_buffer_rset_free (ctx);
4167
4243
ggml_backend_metal_device_rel (buffer->buft ->device ->context );
4168
4244
4169
4245
if (ctx->owned ) {
@@ -4186,19 +4262,19 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
4186
4262
// Fills `size` bytes of the tensor's data with `value`, starting `offset` bytes into
// tensor->data. The write goes straight through the host pointer; `buffer` is not used.
static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    char * dst = (char *) tensor->data + offset;
    memset(dst, value, size);
}
4191
4267
4192
4268
// Copies `size` bytes from `data` into the tensor's data, starting `offset` bytes into
// tensor->data. The copy goes straight through the host pointer; `buffer` is not used.
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    char * dst = (char *) tensor->data + offset;
    memcpy(dst, data, size);
}
4197
4273
4198
4274
// Copies `size` bytes out of the tensor's data, starting `offset` bytes into tensor->data,
// into `data`. The copy goes straight through the host pointer; `buffer` is not used.
static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    const char * src = (const char *) tensor->data + offset;
    memcpy(data, src, size);
}
4203
4279
4204
4280
static bool ggml_backend_metal_buffer_cpy_tensor (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
@@ -4208,7 +4284,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
4208
4284
}
4209
4285
return false ;
4210
4286
4211
- UNUSED (buffer);
4287
+ GGML_UNUSED (buffer);
4212
4288
}
4213
4289
4214
4290
static void ggml_backend_metal_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value) {
@@ -4234,7 +4310,7 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_
4234
4310
// Returns the fixed name of this buffer type; the `buft` argument is not consulted.
static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    const char * name = " Metal";
    return name;
}
4239
4315
4240
4316
static void ggml_backend_metal_log_allocated_size (id <MTLDevice > device, size_t size_aligned) {
@@ -4258,8 +4334,8 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
4258
4334
}
4259
4335
#endif
4260
4336
#endif
4261
- UNUSED (device);
4262
- UNUSED (size_aligned);
4337
+ GGML_UNUSED (device);
4338
+ GGML_UNUSED (size_aligned);
4263
4339
}
4264
4340
4265
4341
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size) {
@@ -4272,7 +4348,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
4272
4348
size_aligned += (size_page - (size_aligned % size_page));
4273
4349
}
4274
4350
4275
- id <MTLDevice > device = ggml_backend_metal_device_acq (buft->device ->context );
4351
+ struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device ->context ;
4352
+ id <MTLDevice > device = ggml_backend_metal_device_acq (ctx_dev);
4276
4353
4277
4354
ctx->all_data = ggml_metal_host_malloc (size_aligned);
4278
4355
ctx->all_size = size_aligned;
@@ -4295,7 +4372,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
4295
4372
if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers [0 ].metal == nil )) {
4296
4373
GGML_LOG_ERROR (" %s : error: failed to allocate buffer, size = %8.2f MiB\n " , __func__, size_aligned / 1024.0 / 1024.0 );
4297
4374
free (ctx);
4298
- ggml_backend_metal_device_rel (buft->device ->context );
4375
+ ggml_backend_metal_device_rel (ctx_dev);
4376
+ return NULL ;
4377
+ }
4378
+
4379
+ if (!ggml_backend_metal_buffer_rset_init (ctx, ctx_dev, device)) {
4380
+ GGML_LOG_ERROR (" %s : error: failed to initialize residency set\n " , __func__);
4381
+ free (ctx);
4382
+ ggml_backend_metal_device_rel (ctx_dev);
4299
4383
return NULL ;
4300
4384
}
4301
4385
@@ -4306,7 +4390,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
4306
4390
4307
4391
// Returns the fixed alignment value (32) reported for this buffer type; the `buft`
// argument is not consulted.
static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    const size_t alignment = 32;
    return alignment;
}
4311
4395
4312
4396
static size_t ggml_backend_metal_buffer_type_get_max_size (ggml_backend_buffer_type_t buft) {
@@ -4316,13 +4400,13 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty
4316
4400
4317
4401
return max_size;
4318
4402
4319
- UNUSED (buft);
4403
+ GGML_UNUSED (buft);
4320
4404
}
4321
4405
4322
4406
// Always reports true for this buffer type; the `buft` argument is not consulted.
static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    const bool is_host = true;
    return is_host;
}
4327
4411
4328
4412
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type (void ) {
@@ -4345,7 +4429,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
4345
4429
// Returns the fixed name of the "from pointer" buffer type; the `buft` argument is not
// consulted.
static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    const char * name = " Metal_Mapped";
    return name;
}
4350
4434
4351
4435
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type (void ) {
@@ -4388,7 +4472,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
4388
4472
size_aligned += (size_page - (size_aligned % size_page));
4389
4473
}
4390
4474
4391
- id <MTLDevice > device = ggml_backend_metal_device_acq (&g_ggml_ctx_dev_main);
4475
+ struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
4476
+ id <MTLDevice > device = ggml_backend_metal_device_acq (ctx_dev);
4392
4477
4393
4478
// the buffer fits into the max buffer size allowed by the device
4394
4479
if (size_aligned <= device.maxBufferLength ) {
@@ -4441,6 +4526,13 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
4441
4526
}
4442
4527
}
4443
4528
4529
+ if (!ggml_backend_metal_buffer_rset_init (ctx, ctx_dev, device)) {
4530
+ GGML_LOG_ERROR (" %s : error: failed to initialize residency set\n " , __func__);
4531
+ free (ctx);
4532
+ ggml_backend_metal_device_rel (ctx_dev);
4533
+ return NULL ;
4534
+ }
4535
+
4444
4536
return ggml_backend_buffer_init (ggml_backend_metal_buffer_from_ptr_type (), ggml_backend_metal_buffer_i, ctx, size);
4445
4537
}
4446
4538
@@ -4449,7 +4541,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
4449
4541
// Returns the fixed name of the backend; the `backend` argument is not consulted.
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
    GGML_UNUSED(backend);

    const char * name = " Metal";
    return name;
}
4454
4546
4455
4547
static void ggml_backend_metal_free (ggml_backend_t backend) {
@@ -4754,6 +4846,13 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
4754
4846
}
4755
4847
}
4756
4848
4849
+ if (!ggml_backend_metal_buffer_rset_init (ctx, ctx_dev, device)) {
4850
+ GGML_LOG_ERROR (" %s : error: failed to initialize residency set\n " , __func__);
4851
+ free (ctx);
4852
+ ggml_backend_metal_device_rel (ctx_dev);
4853
+ return NULL ;
4854
+ }
4855
+
4757
4856
return ggml_backend_buffer_init (ggml_backend_metal_buffer_from_ptr_type (), ggml_backend_metal_buffer_i, ctx, size);
4758
4857
}
4759
4858
@@ -4767,7 +4866,7 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml
4767
4866
return buft->iface .get_name == ggml_backend_metal_buffer_type_get_name ||
4768
4867
buft->iface .get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;
4769
4868
4770
- UNUSED (dev);
4869
+ GGML_UNUSED (dev);
4771
4870
}
4772
4871
4773
4872
static bool ggml_backend_metal_device_offload_op (ggml_backend_dev_t dev, const struct ggml_tensor * op) {
0 commit comments