# LocalAI model configuration reference (continued)
expect_strings_after_json: false # Expect string after JSON data.
no_action_function_name: "" # Function name to call when no action is determined.
no_action_description_name: "" # Description name for no-action functions.
response_regex: [] # Regular expressions to match the response from the model.
json_regex_match: [] # Regular expressions to match JSON data when in tool mode.
replace_function_results: [] # Replace function call results with arbitrary strings or patterns.
replace_llm_results: [] # Replace language model results with arbitrary strings or patterns.
capture_llm_results: [] # Capture language model results as text result, among JSON, in function calls. For instance, if a model returns a block for "thinking" and a block for "response", this will allow you to capture the thinking block.
return_name_in_function_response: false # Some models might prefer to use "name" rather than "function" when returning JSON data. This allows using "name" as a key in the JSON response.
# Feature gating flags to enable experimental or optional features.
feature_flags: {}

# System prompt to use by default.
system_prompt: ""

# Configuration for splitting tensors across GPUs.
tensor_split: ""
# Identifier for the main GPU used in multi-GPU setups.
main_gpu: ""

# Small value added to the denominator in RMS normalization to prevent division by zero.
rms_norm_eps: 0

# Natural question generation model parameter.
ngqa: 0

# Path where the prompt cache is stored.
prompt_cache_path: ""

# Whether to cache all prompts.
prompt_cache_all: false

# Whether the prompt cache is read-only.
prompt_cache_ro: false
# Mirostat sampling settings.
mirostat_eta: null
mirostat_tau: null
mirostat: null

# GPU-specific layers configuration.
gpu_layers: null

# Memory mapping for efficient I/O operations.
mmap: null

# Memory locking to ensure data remains in RAM.
mmlock: null

# Mode to use minimal VRAM for GPU operations.
low_vram: null

# Words or phrases that halt processing.
stopwords: []
# Strings to cut from responses to maintain context or relevance.
cutstrings: []

# Strings to trim from responses for cleaner outputs.
trimspace: []
trimsuffix: []

# Default context size for the model's understanding of the conversation or text.
context_size: null

# Non-uniform memory access settings, useful for systems with multiple CPUs.
numa: false

# Configuration for LoRA adapters.
lora_adapter: ""
lora_base: ""
lora_scale: 0

# Disable matrix multiplication queuing in GPU operations.
no_mulmatq: false
# Model for generating draft responses.
draft_model: ""
n_draft: 0

# Quantization settings for the model, impacting memory and processing speed.
quantization: ""

# Utilization percentage of GPU memory to allocate for the model. (vLLM)
gpu_memory_utilization: 0

# Whether to trust and execute remote code.
trust_remote_code: false

# Force eager-mode execution instead of CUDA graph capture, if applicable. (vLLM)
enforce_eager: false

# Space allocated for swapping data in and out of memory. (vLLM)
swap_space: 0

# Maximum model context length, in tokens. (vLLM)
max_model_len: 0

# Size of the tensor parallelism in distributed computing environments. (vLLM)
tensor_parallel_size: 0

# Vision model to use for multimodal inputs.
mmproj: ""

# Disables offloading of key/value pairs in transformer models to save memory.
no_kv_offloading: false
# Scaling factor for the RoPE (rotary position embedding) penalty.
rope_scaling: ""

# Type of configuration, often related to the type of task or model architecture.
type: ""

# YaRN settings.
yarn_ext_factor: 0
yarn_attn_factor: 0
yarn_beta_fast: 0
yarn_beta_slow: 0

# AutoGPTQ settings, for configurations specific to GPTQ-quantized models.
autogptq:
  model_base_name: "" # Base name of the model.
  device: "" # Device to run the model on.
  triton: false # Whether to use Triton kernels.
  use_fast_tokenizer: false # Whether to use a fast tokenizer for quicker processing.
# Configuration for diffusers models.
diffusers:
  cuda: false # Whether to use CUDA.
  pipeline_type: "" # Type of pipeline to use.
  scheduler_type: "" # Type of scheduler for controlling operations.
  enable_parameters: "" # Parameters to enable in the diffuser.
  cfg_scale: 0 # Scale for CFG (classifier-free guidance) in the diffuser setup.
  img2img: false # Whether image-to-image transformation is supported.
  clip_skip: 0 # Number of steps to skip in CLIP operations.
  clip_model: "" # Model to use for CLIP operations.
  clip_subfolder: "" # Subfolder for storing CLIP-related data.
  control_net: "" # ControlNet model to use.

# Step count, usually for image processing models.
step: 0
# Configuration for gRPC communication.
grpc:
  attempts: 0 # Number of retry attempts for gRPC calls.
  attempts_sleep_time: 0 # Sleep time between retries.

# Text-to-Speech (TTS) configuration.
tts:
  voice: "" # Voice setting for TTS.
  vall-e:
    audio_path: "" # Path to audio files for Vall-E.

# Whether to use CUDA for GPU-based operations.
cuda: false

# List of files to download as part of the setup or operations.
| Environment variable | Description |
| --- | --- |
| **DOCKER_INSTALL** | Set to "true" to enable the installation of Docker images. |
| **USE_AIO** | Set to "true" to use the all-in-one LocalAI Docker image. |
| **API_KEY** | Specify an API key for accessing LocalAI, if required. |
| **CORE_IMAGES** | Set to "true" to download core LocalAI images. |
| **PORT** | Specifies the port on which LocalAI will run (default is 8080). |
| **THREADS** | Number of processor threads the application should use. Defaults to the number of logical cores minus one. |
| **VERSION** | Specifies the version of LocalAI to install. Defaults to the latest available version. |
| **MODELS_PATH** | Directory path where LocalAI models are stored (default is /usr/share/local-ai/models). |

We are looking into improving the installer, and as this is a first iteration any feedback is welcome! Open up an [issue](https://github.com/mudler/LocalAI/issues/new/choose) if something doesn't work for you!