19
19
// max number of MTLCommandBuffer used to submit a graph for processing
20
20
#define GGML_METAL_MAX_COMMAND_BUFFERS 8
21
21
22
+ #define GGML_METAL_MAX_RESIDENCY_SETS 128
23
+
22
24
#define UNUSED (x ) (void )(x)
23
25
24
26
// globals
37
39
id <MTLDevice > mtl_device;
38
40
int mtl_device_ref_count;
39
41
42
+ id <MTLResidencySet> mtl_residency_set[GGML_METAL_MAX_RESIDENCY_SETS];
43
+ int mtl_residency_set_n;
44
+
40
45
bool has_simdgroup_reduction;
41
46
bool has_simdgroup_mm;
42
47
bool has_bfloat;
46
51
} g_ggml_ctx_dev_main = {
47
52
/* .mtl_device =*/ nil ,
48
53
/* .mtl_device_ref_count =*/ 0 ,
54
+ /* .mtl_residency_set =*/ { nil },
55
+ /* .mtl_residency_set_n =*/ 0 ,
49
56
/* .has_simdgroup_reduction =*/ false ,
50
57
/* .has_simdgroup_mm =*/ false ,
51
58
/* .has_bfloat =*/ false ,
@@ -95,6 +102,41 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
95
102
}
96
103
}
97
104
105
// Add a residency set to the list tracked by the device context.
//
// The set is appended to ctx->mtl_residency_set and ctx->mtl_residency_set_n is
// incremented. The pointer is stored as-is; nothing else is done with the set here.
//
//   ctx           - device context that owns the tracking array, must not be NULL
//   residency_set - the residency set to track, must not be nil
//
// Returns true when the set was stored, false when the array already holds
// GGML_METAL_MAX_RESIDENCY_SETS entries (an error is logged in that case).
static bool ggml_backend_metal_device_add_residency_set(struct ggml_backend_metal_device_context * ctx, id<MTLResidencySet> residency_set) {
    assert(ctx != NULL);
    // validate the argument that is actually stored below
    // (there is no `queue` in scope in this function)
    assert(residency_set != nil);

    // the tracking array has a fixed capacity - refuse to write past its end
    if (ctx->mtl_residency_set_n >= GGML_METAL_MAX_RESIDENCY_SETS) {
        GGML_LOG_ERROR("%s: warning: maximum number of residency sets reached\n", __func__);
        return false;
    }

    ctx->mtl_residency_set[ctx->mtl_residency_set_n++] = residency_set;

    return true;
}
119
+
120
// Stop tracking a residency set in the device context.
//
// Searches ctx->mtl_residency_set for the first slot holding residency_set,
// moves every later entry one slot towards the front to close the gap, and
// decrements ctx->mtl_residency_set_n.
//
//   ctx           - device context that owns the tracking array, must not be NULL
//   residency_set - the residency set to forget, must not be nil
//
// Returns true when the set was found and removed, false when it is not tracked.
static bool ggml_backend_metal_device_remove_residency_set(struct ggml_backend_metal_device_context * ctx, id<MTLResidencySet> residency_set) {
    assert(ctx != NULL);
    assert(residency_set != nil);

    // find the position of the first matching entry
    int idx = 0;
    while (idx < ctx->mtl_residency_set_n && ctx->mtl_residency_set[idx] != residency_set) {
        idx++;
    }

    // ran off the end of the tracked entries - nothing to remove
    if (idx >= ctx->mtl_residency_set_n) {
        return false;
    }

    // compact the array: shift the tail down over the removed slot
    const int n_remaining = ctx->mtl_residency_set_n - 1;
    for (int k = idx + 1; k <= n_remaining; ++k) {
        ctx->mtl_residency_set[k - 1] = ctx->mtl_residency_set[k];
    }

    ctx->mtl_residency_set_n = n_remaining;

    return true;
}
139
+
98
140
// kernels
99
141
100
142
struct ggml_metal_kernel {
@@ -483,6 +525,11 @@ @implementation GGMLMetalClass
483
525
GGML_LOG_INFO (" %s : picking default device: %s \n " , __func__, [[device name ] UTF8String ]);
484
526
485
527
ctx->queue = [device newCommandQueue ];
528
+ if (ctx->queue == nil ) {
529
+ GGML_LOG_ERROR (" %s : error: failed to create command queue\n " , __func__);
530
+ return NULL ;
531
+ }
532
+
486
533
ctx->d_queue = dispatch_queue_create (" ggml-metal" , DISPATCH_QUEUE_CONCURRENT);
487
534
488
535
id <MTLLibrary > metal_library;
@@ -1035,6 +1082,8 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
1035
1082
// multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
1036
1083
int n_buffers;
1037
1084
struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
1085
+
1086
+ id <MTLResidencySet> residency_set;
1038
1087
};
1039
1088
1040
1089
// finds the Metal buffer that contains the tensor data on the GPU device
@@ -4039,6 +4088,23 @@ static enum ggml_status ggml_metal_graph_compute(
4039
4088
struct ggml_backend_metal_context * ctx = backend->context ;
4040
4089
struct ggml_backend_metal_device_context * ctx_dev = backend->device ->context ;
4041
4090
4091
+ // attach the residency sets to the queue on the first run
4092
+ // also tested attaching them on each run, but it does not make a difference
4093
+ static bool is_first = true ;
4094
+ if (is_first) {
4095
+ is_first = false ;
4096
+ GGML_LOG_INFO (" %s : adding %d residency sets\n " , __func__, ctx_dev->mtl_residency_set_n );
4097
+ [ctx->queue addResidencySets: ctx_dev->mtl_residency_set count: ctx_dev->mtl_residency_set_n];
4098
+ }
4099
+
4100
+ // this does not make a difference
4101
+ // for (int i = 0; i < ctx_dev->mtl_residency_set_n; ++i) {
4102
+ // GGML_LOG_INFO("%s: residency set %d allocations size = %zu\n", __func__, i, [ctx_dev->mtl_residency_set[i] allocatedSize]);
4103
+ // [ctx_dev->mtl_residency_set[i] requestResidency];
4104
+ // }
4105
+
4106
+ int64_t t_start_us = ggml_time_us ();
4107
+
4042
4108
// number of nodes encoded by the main thread (empirically determined)
4043
4109
const int n_main = 128 ;
4044
4110
@@ -4086,19 +4152,25 @@ static enum ggml_status ggml_metal_graph_compute(
4086
4152
// the main thread commits the first few commands immediately
4087
4153
// command_buffer[n_cb]
4088
4154
{
4089
- id <MTLCommandBuffer > command_buffer = [ctx->queue commandBufferWithUnretainedReferences ];
4155
+ id <MTLCommandBuffer > command_buffer = [ctx->queue commandBuffer ];
4090
4156
ctx->command_buffers [n_cb] = command_buffer;
4091
4157
4158
+ // does not make a difference
4159
+ [command_buffer useResidencySets: ctx_dev->mtl_residency_set count: ctx_dev->mtl_residency_set_n];
4160
+
4092
4161
[command_buffer enqueue ];
4093
4162
ctx->encode_async (n_cb);
4094
4163
}
4095
4164
4096
4165
// prepare the rest of the command buffers asynchronously
4097
4166
// command_buffer[0.. n_cb)
4098
4167
for (int cb_idx = 0 ; cb_idx < n_cb; ++cb_idx) {
4099
- id <MTLCommandBuffer > command_buffer = [ctx->queue commandBufferWithUnretainedReferences ];
4168
+ id <MTLCommandBuffer > command_buffer = [ctx->queue commandBuffer ];
4100
4169
ctx->command_buffers [cb_idx] = command_buffer;
4101
4170
4171
+ // does not make a difference
4172
+ [command_buffer useResidencySets: ctx_dev->mtl_residency_set count: ctx_dev->mtl_residency_set_n];
4173
+
4102
4174
// always enqueue the first two command buffers
4103
4175
// enqueue all of the command buffers if we don't need to abort
4104
4176
if (cb_idx < 2 || ctx->abort_callback == NULL ) {
@@ -4163,6 +4235,10 @@ static enum ggml_status ggml_metal_graph_compute(
4163
4235
}
4164
4236
}
4165
4237
4238
+ int64_t t_end_us = ggml_time_us ();
4239
+
4240
+ GGML_LOG_DEBUG (" %s : compute graph took %8.2f ms\n " , __func__, (t_end_us - t_start_us) / 1000.0 );
4241
+
4166
4242
return GGML_STATUS_SUCCESS;
4167
4243
}
4168
4244
@@ -4176,6 +4252,13 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
4176
4252
for (int i = 0 ; i < ctx->n_buffers ; i++) {
4177
4253
[ctx->buffers[i].metal release ];
4178
4254
}
4255
+
4256
+ ggml_backend_metal_device_remove_residency_set (buffer->buft ->device ->context , ctx->residency_set );
4257
+
4258
+ [ctx->residency_set endResidency ];
4259
+ [ctx->residency_set removeAllAllocations ];
4260
+ [ctx->residency_set release ];
4261
+
4179
4262
ggml_backend_metal_device_rel (buffer->buft ->device ->context );
4180
4263
4181
4264
if (ctx->owned ) {
@@ -4284,7 +4367,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
4284
4367
size_aligned += (size_page - (size_aligned % size_page));
4285
4368
}
4286
4369
4287
- id <MTLDevice > device = ggml_backend_metal_device_acq (buft->device ->context );
4370
+ struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device ->context ;
4371
+ id <MTLDevice > device = ggml_backend_metal_device_acq (ctx_dev);
4288
4372
4289
4373
ctx->all_data = ggml_metal_host_malloc (size_aligned);
4290
4374
ctx->all_size = size_aligned;
@@ -4307,10 +4391,34 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
4307
4391
if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers [0 ].metal == nil )) {
4308
4392
GGML_LOG_ERROR (" %s : error: failed to allocate buffer, size = %8.2f MiB\n " , __func__, size_aligned / 1024.0 / 1024.0 );
4309
4393
free (ctx);
4310
- ggml_backend_metal_device_rel (buft-> device -> context );
4394
+ ggml_backend_metal_device_rel (ctx_dev );
4311
4395
return NULL ;
4312
4396
}
4313
4397
4398
+ {
4399
+ MTLResidencySetDescriptor * desc;
4400
+ desc = [[MTLResidencySetDescriptor alloc ] init ];
4401
+ desc.label = @" Primary residency set" ;
4402
+ desc.initialCapacity = ctx->n_buffers ;
4403
+
4404
+ NSError *error;
4405
+ ctx->residency_set = [device newResidencySetWithDescriptor: desc error: &error];
4406
+ if (error) {
4407
+ GGML_LOG_ERROR (" %s : error: %s \n " , __func__, [[error description ] UTF8String ]);
4408
+ return NULL ;
4409
+ }
4410
+
4411
+ for (int i = 0 ; i < ctx->n_buffers ; i++) {
4412
+ [ctx->residency_set addAllocation: ctx->buffers[i].metal];
4413
+ }
4414
+
4415
+ [ctx->residency_set commit ];
4416
+ [ctx->residency_set requestResidency ];
4417
+
4418
+ // track the residency set in the device context
4419
+ ggml_backend_metal_device_add_residency_set (ctx_dev, ctx->residency_set );
4420
+ }
4421
+
4314
4422
// ggml_backend_metal_log_allocated_size(device, size_aligned);
4315
4423
4316
4424
return ggml_backend_buffer_init (buft, ggml_backend_metal_buffer_i, ctx, size);
@@ -4400,7 +4508,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
4400
4508
size_aligned += (size_page - (size_aligned % size_page));
4401
4509
}
4402
4510
4403
- id <MTLDevice > device = ggml_backend_metal_device_acq (&g_ggml_ctx_dev_main);
4511
+ struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
4512
+ id <MTLDevice > device = ggml_backend_metal_device_acq (ctx_dev);
4404
4513
4405
4514
// the buffer fits into the max buffer size allowed by the device
4406
4515
if (size_aligned <= device.maxBufferLength ) {
@@ -4453,6 +4562,30 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
4453
4562
}
4454
4563
}
4455
4564
4565
+ {
4566
+ MTLResidencySetDescriptor * desc;
4567
+ desc = [[MTLResidencySetDescriptor alloc ] init ];
4568
+ desc.label = @" Primary residency set" ;
4569
+ desc.initialCapacity = ctx->n_buffers ;
4570
+
4571
+ NSError *error;
4572
+ ctx->residency_set = [device newResidencySetWithDescriptor: desc error: &error];
4573
+ if (error) {
4574
+ GGML_LOG_ERROR (" %s : error: %s \n " , __func__, [[error description ] UTF8String ]);
4575
+ return NULL ;
4576
+ }
4577
+
4578
+ for (int i = 0 ; i < ctx->n_buffers ; i++) {
4579
+ [ctx->residency_set addAllocation: ctx->buffers[i].metal];
4580
+ }
4581
+
4582
+ [ctx->residency_set commit ];
4583
+ [ctx->residency_set requestResidency ];
4584
+
4585
+ // track the residency set in the device context
4586
+ ggml_backend_metal_device_add_residency_set (ctx_dev, ctx->residency_set );
4587
+ }
4588
+
4456
4589
return ggml_backend_buffer_init (ggml_backend_metal_buffer_from_ptr_type (), ggml_backend_metal_buffer_i, ctx, size);
4457
4590
}
4458
4591
@@ -4766,6 +4899,30 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
4766
4899
}
4767
4900
}
4768
4901
4902
+ {
4903
+ MTLResidencySetDescriptor * desc;
4904
+ desc = [[MTLResidencySetDescriptor alloc ] init ];
4905
+ desc.label = @" Primary residency set" ;
4906
+ desc.initialCapacity = ctx->n_buffers ;
4907
+
4908
+ NSError *error;
4909
+ ctx->residency_set = [device newResidencySetWithDescriptor: desc error: &error];
4910
+ if (error) {
4911
+ GGML_LOG_ERROR (" %s : error: %s \n " , __func__, [[error description ] UTF8String ]);
4912
+ return NULL ;
4913
+ }
4914
+
4915
+ for (int i = 0 ; i < ctx->n_buffers ; i++) {
4916
+ [ctx->residency_set addAllocation: ctx->buffers[i].metal];
4917
+ }
4918
+
4919
+ [ctx->residency_set commit ];
4920
+ [ctx->residency_set requestResidency ];
4921
+
4922
+ // track the residency set in the device context
4923
+ ggml_backend_metal_device_add_residency_set (ctx_dev, ctx->residency_set );
4924
+ }
4925
+
4769
4926
return ggml_backend_buffer_init (ggml_backend_metal_buffer_from_ptr_type (), ggml_backend_metal_buffer_i, ctx, size);
4770
4927
}
4771
4928
0 commit comments