@@ -29,13 +29,14 @@ var Aliases map[string]string = map[string]string{
 const (
 	LlamaGGML = "llama-ggml"
 
-	LLamaCPP = "llama-cpp"
+	LLamaCPP = "llama-cpp"
 
 	LLamaCPPCUDA12 = "llama-cpp-cuda12"
 	LLamaCPPAVX2 = "llama-cpp-avx2"
 	LLamaCPPAVX = "llama-cpp-avx"
 	LLamaCPPFallback = "llama-cpp-fallback"
 	LLamaCPPCUDA = "llama-cpp-cuda"
+	LLamaCPPGRPC = "llama-cpp-grpc"
 
 	Gpt4AllLlamaBackend = "gpt4all-llama"
 	Gpt4AllMptBackend = "gpt4all-mpt"
@@ -81,7 +82,8 @@
 		}
 	}
 
-	foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback := false, false, false
+	// if we find the llama.cpp variants, show them as a single backend (llama-cpp)
+	foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC := false, false, false, false
 	if _, ok := backends[LLamaCPP]; !ok {
 		for _, e := range entry {
 			if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 {
@@ -96,16 +98,23 @@ ENTRY:
 				backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPFallback)
 				foundLCPPFallback = true
 			}
+			if strings.Contains(e.Name(), LLamaCPPGRPC) && !foundLCPPGRPC {
+				backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPGRPC)
+				foundLCPPGRPC = true
+			}
 		}
 	}
 
 	// order backends from the asset directory.
 	// as we scan for backends, we want to keep some order which backends are tried of.
 	// for example, llama.cpp should be tried first, and we want to keep the huggingface backend at the last.
-	// sets a priority list
-	// First has more priority
+
+	// sets a priority list - the first entry has the highest priority
 	priorityList := []string{
-		// First llama.cpp and llama-ggml
+
+		// First the llama.cpp variants, with llama-ggml to follow.
+		// We keep the fallback in the list so that, if the llama.cpp variants
+		// that depend on shared libraries break, there is still a safety net.
 		LLamaCPP, LlamaGGML, Gpt4All, LLamaCPPFallback,
 	}
 
@@ -142,6 +151,50 @@ ENTRY:
 	return orderedBackends, nil
 }
 
+// selectGRPCProcess selects the GRPC process to start based on system capabilities
+func selectGRPCProcess(backend, assetDir string) string {
+	foundCUDA := false
+	var grpcProcess string
+
+	// for now we only select a variant for the llama.cpp backend
+	if backend != LLamaCPP {
+		return ""
+	}
+
+	// Note: this environment variable is read by LocalAI's llama.cpp grpc-server
+	if os.Getenv("LLAMACPP_GRPC_SERVERS") != "" {
+		return backendPath(assetDir, LLamaCPPGRPC)
+	}
+
+	gpus, err := xsysinfo.GPUs()
+	if err == nil {
+		for _, gpu := range gpus {
+			if strings.Contains(gpu.String(), "nvidia") {
+				log.Info().Msgf("[%s] attempting to load with CUDA variant", backend)
+				grpcProcess = backendPath(assetDir, LLamaCPPCUDA)
+				if _, err := os.Stat(grpcProcess); err == nil {
+					foundCUDA = true
+				}
+			}
+		}
+	}
+
+	if !foundCUDA {
+		if cpu.X86.HasAVX2 {
+			log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
+			grpcProcess = backendPath(assetDir, LLamaCPPAVX2)
+		} else if cpu.X86.HasAVX {
+			log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
+			grpcProcess = backendPath(assetDir, LLamaCPPAVX)
+		} else {
+			log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
+			grpcProcess = backendPath(assetDir, LLamaCPPFallback)
+		}
+	}
+
+	return grpcProcess
+}
+
 // starts the grpcModelProcess for the backend, and returns a grpc client
 // It also loads the model
 func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string) (ModelAddress, error) {
@@ -192,33 +245,10 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
 		} else {
 			grpcProcess := backendPath(o.assetDir, backend)
 
-			foundCUDA := false
-			// for llama-cpp, check CPU capabilities and load the appropriate variant
-			if backend == LLamaCPP {
-				gpus, err := xsysinfo.GPUs()
-				if err == nil {
-					for _, gpu := range gpus {
-						if strings.Contains(gpu.String(), "nvidia") {
-							log.Info().Msgf("[%s] attempting to load with CUDA variant", backend)
-							grpcProcess = backendPath(o.assetDir, LLamaCPPCUDA)
-							if _, err := os.Stat(grpcProcess); err == nil {
-								foundCUDA = true
-							}
-						}
-					}
-				}
-
-				if !foundCUDA {
-					if cpu.X86.HasAVX2 {
-						log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
-						grpcProcess = backendPath(o.assetDir, LLamaCPPAVX2)
-					} else if cpu.X86.HasAVX {
-						log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
-						grpcProcess = backendPath(o.assetDir, LLamaCPPAVX)
-					} else {
-						log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
-						grpcProcess = backendPath(o.assetDir, LLamaCPPFallback)
-					}
+			if os.Getenv("DISABLE_AUTODETECT") != "true" {
+				// autoDetect GRPC process to start based on system capabilities
+				if selectedProcess := selectGRPCProcess(backend, o.assetDir); selectedProcess != "" {
+					grpcProcess = selectedProcess
 				}
 			}
 
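
In short, the selection order introduced by selectGRPCProcess is: the LLAMACPP_GRPC_SERVERS environment variable forces the llama-cpp-grpc variant, an NVIDIA GPU (with the CUDA build present on disk) selects llama-cpp-cuda, and otherwise the CPU flags pick llama-cpp-avx2, llama-cpp-avx, or llama-cpp-fallback; setting DISABLE_AUTODETECT=true skips detection entirely. The standalone sketch below only mirrors that decision order for illustration; it is not part of the commit, and hasNvidiaGPU, hasAVX2 and hasAVX are hypothetical stand-ins for the real detection helpers (xsysinfo.GPUs, cpu.X86.*).

package main

import (
	"fmt"
	"os"
)

// Hypothetical stand-ins for the real detection helpers used by LocalAI.
// Here they are hard-coded assumptions for the sake of the example.
func hasNvidiaGPU() bool { return false }
func hasAVX2() bool      { return true }
func hasAVX() bool       { return true }

// pickVariant mirrors the decision order of selectGRPCProcess:
// grpc (distributed) > cuda > avx2 > avx > fallback.
func pickVariant() string {
	if os.Getenv("LLAMACPP_GRPC_SERVERS") != "" {
		return "llama-cpp-grpc"
	}
	if hasNvidiaGPU() {
		return "llama-cpp-cuda"
	}
	if hasAVX2() {
		return "llama-cpp-avx2"
	}
	if hasAVX() {
		return "llama-cpp-avx"
	}
	return "llama-cpp-fallback"
}

func main() {
	// e.g. LLAMACPP_GRPC_SERVERS=host1:50052,host2:50052 would force the grpc variant
	fmt.Println("selected variant:", pickVariant())
}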