4 files changed, 8 additions and 4 deletions.

```diff
@@ -586,7 +586,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
     lparams.n_batch      = params.n_batch;
     lparams.n_gpu_layers = params.n_gpu_layers;
     lparams.main_gpu     = params.main_gpu;
-    memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
+    lparams.tensor_split = params.tensor_split;
     lparams.low_vram     = params.low_vram;
     lparams.seed         = params.seed;
     lparams.f16_kv       = params.memory_f16;
```
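With `tensor_split` now a pointer rather than a fixed-size array, this helper no longer copies the values into the context params; it only stores the caller's pointer, so the pointed-to array has to stay valid until the values are actually consumed (e.g. during model load). A minimal sketch of that distinction; the struct names below are illustrative stand-ins, only `tensor_split` itself comes from the diff:

```cpp
// Illustrative stand-ins for the structs touched by this diff.
struct gpt_params_sketch { float tensor_split[16] = {0}; };
struct ctx_params_sketch { const float * tensor_split = nullptr; };

ctx_params_sketch params_from_gpt(const gpt_params_sketch & params) {
    ctx_params_sketch lparams;
    // Before: memcpy(lparams.tensor_split, params.tensor_split, ...) copied the values.
    // After: only the pointer is stored, so `params` must outlive any use of
    // lparams.tensor_split.
    lparams.tensor_split = params.tensor_split;
    return lparams;
}
```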
```diff
@@ -2512,6 +2512,9 @@ void ggml_init_cublas() {
 }
 
 void ggml_cuda_set_tensor_split(const float * tensor_split) {
+    if (tensor_split == nullptr) {
+        return;
+    }
     bool all_zero = true;
     for (int i = 0; i < g_device_count; ++i) {
         if (tensor_split[i] != 0.0f) {
```
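The new guard makes a null `tensor_split` mean "keep the default split" instead of dereferencing an invalid pointer. A simplified sketch of that pattern, assuming a small global split table sized by the device count; the real function does more work after the `all_zero` check (computing and normalizing a split), which is omitted here:

```cpp
static const int g_device_count_sketch = 2;           // assumed device count
static float     g_tensor_split_sketch[16] = {0.0f};  // assumed global split table

void set_tensor_split_sketch(const float * tensor_split) {
    if (tensor_split == nullptr) {
        return; // no user-provided split: leave the existing/default split in place
    }
    bool all_zero = true;
    for (int i = 0; i < g_device_count_sketch; ++i) {
        if (tensor_split[i] != 0.0f) {
            all_zero = false;
            break;
        }
    }
    if (all_zero) {
        return; // all-zero input: fall back to a default split (details differ in the real code)
    }
    for (int i = 0; i < g_device_count_sketch; ++i) {
        g_tensor_split_sketch[i] = tensor_split[i]; // copy the user's proportions
    }
}
```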
```diff
@@ -849,7 +849,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_batch           =*/ 512,
         /*.gpu_layers        =*/ 0,
         /*.main_gpu          =*/ 0,
-        /*.tensor_split      =*/ {0},
+        /*.tensor_split      =*/ nullptr,
         /*.rope_freq_base    =*/ 10000.0f,
         /*.rope_freq_scale   =*/ 1.0f,
         /*.progress_callback =*/ nullptr,
```
```diff
@@ -1289,7 +1289,7 @@ static bool llama_model_load(
         int n_batch,
         int n_gpu_layers,
         int main_gpu,
-        float * tensor_split,
+        const float * tensor_split,
         float rope_freq_base,
         float rope_freq_scale,
         bool low_vram,
```
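Making the loader take `const float *` follows directly from the header change below: once `llama_context_params::tensor_split` is itself `const float *`, passing that field to a non-const parameter would no longer compile without a cast. A small illustration of the constraint, using hypothetical names rather than the actual call site:

```cpp
// Hypothetical stand-ins to illustrate the const-correctness requirement.
struct params_sketch { const float * tensor_split = nullptr; };

static void load_old(float * tensor_split)       { (void) tensor_split; } // pre-change style
static void load_new(const float * tensor_split) { (void) tensor_split; } // post-change style

static void call_site(const params_sketch & p) {
    // load_old(p.tensor_split); // would not compile: const float* -> float*
    load_new(p.tensor_split);    // ok: parameter is const-correct
}
```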
```diff
@@ -88,7 +88,8 @@ extern "C" {
         int32_t n_batch;      // prompt processing batch size
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t main_gpu;     // the GPU that is used for scratch and small tensors
-        float   tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+
+        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base;  // RoPE base frequency
```
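For API users, the field is now a pointer the library reads rather than an array the struct owns: a caller that wants a custom split keeps its own `LLAMA_MAX_DEVICES`-sized array and points the params at it, while leaving the field `nullptr` (the new default) requests the default split. A hedged usage sketch; `llama_context_default_params` and `LLAMA_MAX_DEVICES` come from llama.h, the split values are made up, and the actual model/context creation call is left out:

```cpp
#include "llama.h"

int main() {
    struct llama_context_params params = llama_context_default_params();

    // Hypothetical split across two GPUs; LLAMA_MAX_DEVICES > 1 assumes a CUDA build.
    static float tensor_split[LLAMA_MAX_DEVICES] = {0};
    tensor_split[0] = 0.6f;
#if LLAMA_MAX_DEVICES > 1
    tensor_split[1] = 0.4f;
#endif

    params.tensor_split = tensor_split; // array must stay valid while params are in use
    // Leaving params.tensor_split as nullptr would request the default split instead.

    // ... pass `params` to this version's model/context creation call ...
    return 0;
}
```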