My llama-swap Configuration
This is my llama-swap configuration. Each model entry can also be used on its own to run llama.cpp standalone.
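To run one of the entries standalone, expand the ${llama-cpp} macro by hand and pick a port yourself. A minimal sketch using the Gemma3-4B entry below (port 8080 is an arbitrary choice here, not something llama-swap assigns):

/home/tipu/Applications/llamacpp/llama-server --port 8080 --api-key 12345 \
  -m /home/tipu/.lmstudio/models/unsloth/gemma3-4B/gemma-3-4b-it-UD-Q5_K_XL.gguf \
  --mmproj /home/tipu/.lmstudio/models/unsloth/gemma3-4B/mmproj-BF16.gguf \
  --jinja -n -1 -ngl 99 --repeat-penalty 1.0 --min-p 0.01 --top-k 64 --top-p 0.95 \
  -t 4 -c 10240 --no-context-shift --mlock --no-mmap --no-warmup --seed 3501 \
  --swa-full --no-webui -a Gemma3-4B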
~/Applications/llamacpp/config.yaml
# Seconds to wait for llama.cpp to be available to serve requests
# Default (and minimum): 15 seconds
healthCheckTimeout: 120
# valid log levels: debug, info (default), warn, error
logLevel: debug
groups:
# "Other" keeps the default llama-swap behaviour where only one model is allowed
# to run at a time across the whole llama-swap instance
"Other":
# swap: controls the model swapping behaviour within the group
# - optional, default: true
# - true : only one model is allowed to run at a time
# - false: all models can run together, no swapping
swap: true
# exclusive: controls how the group affects other groups
# - optional, default: true
# - true: causes all other groups to unload when this group runs a model
# - false: does not affect other groups
exclusive: true
# members references the models defined in the models section below
# required
members:
- "DeepSeek-R1-0528-8B"
- "Devstral-Small-2505"
- "THUDM_GLM-4-9B-0414"
- "Tesslate_Tessa-Rust-T1-7B"
- "Hunyuan-A13B-Instruct-Thinking"
- "Hunyuan-A13B-Instruct-Non-Thinking"
"Gemma3":
# swap: controls the model swapping behaviour within the group
# - optional, default: true
# - true : only one model is allowed to run at a time
# - false: all models can run together, no swapping
swap: true
# exclusive: controls how the group affects other groups
# - optional, default: true
# - true: causes all other groups to unload when this group runs a model
# - false: does not affect other groups
exclusive: true
# members references the models defined in the models section below
# required
members:
- "Gemma3-4B"
- "Gemma3-12B"
"Nanonets-OCR":
# swap: controls the model swapping behaviour within the group
# - optional, default: true
# - true : only one model is allowed to run at a time
# - false: all models can run together, no swapping
swap: true
# exclusive: controls how the group affects other groups
# - optional, default: true
# - true: causes all other groups to unload when this group runs a model
# - false: does not affect other groups
exclusive: true
# members references the models defined in the models section below
# required
members:
- "Nanonets-OCR"
"Qwen3-30B-A3B":
# swap: controls the model swapping behaviour within the group
# - optional, default: true
# - true : only one model is allowed to run at a time
# - false: all models can run together, no swapping
swap: true
# exclusive: controls how the group affects other groups
# - optional, default: true
# - true: causes all other groups to unload when this group runs a model
# - false: does not affect other groups
exclusive: true
# members references the models defined in the models section below
# required
members:
- "Qwen-A3B-Q4-Thinking"
- "Qwen-A3B-Q6-Thinking"
- "Qwen-A3B-Q4-No-Thinking"
- "Qwen-A3B-Q6-No-Thinking"
macros:
"llama-cpp": >
/home/tipu/Applications/llamacpp/llama-server
--port ${PORT}
--api-key 12345
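# Illustration, not part of the config: inside each cmd below, ${llama-cpp} is
# replaced with the command above and ${PORT} with a port that llama-swap
# assigns, so a cmd of
#   ${llama-cpp}
#   -m /path/to/model.gguf        (placeholder path)
# effectively runs:
#   /home/tipu/Applications/llamacpp/llama-server --port <assigned-port> --api-key 12345 -m /path/to/model.gguf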
models:
"Qwen-A3B-Q4-Thinking":
# cmd: the command to run to start the inference server.
# - required
# - it is just a string, similar to what you would run on the CLI
# - using `|` allows for comments in the command, these will be parsed out
# - macros can be used within cmd
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/Qwen3-30B-A3B-GGUF/Qwen3-30B-A3B-128K-UD-Q4_K_XL.gguf
--jinja
--reasoning-format none
-n -1
-ngl 99
--temp 0.6
--top-k 20
--top-p 0.95
--min-p 0
-c 10240
--no-context-shift
--mlock
--ubatch-size 128
--batch-size 2048
--seed -1
-t 4
-fa
--no-mmap
--no-warmup
--presence-penalty 1.1
-a Qwen-A3B-Q4-Thinking
--no-webui
# env: define an array of environment variables to inject into cmd's environment
# - optional, default: empty array
# - each value is a single string
# - in the format: ENV_NAME=value
#env:
# - "CUDA_VISIBLE_DEVICES=0,1,2"
# proxy: the URL where llama-swap routes API requests
# - optional, default: http://localhost:${PORT}
# - if you used ${PORT} in cmd this can be omitted
# - if you use a custom port in cmd this *must* be set
#proxy: http://127.0.0.1:8999
# aliases: alternative model names that this model configuration is used for
# - optional, default: empty array
# - aliases must be unique globally
# - useful for impersonating a specific model
aliases:
- "Qwen-A3B-Q4-Thinking"
# checkEndpoint: URL path to check if the server is ready
# - optional, default: /health
# - use "none" to skip endpoint ready checking
# - endpoint is expected to return an HTTP 200 response
# - all requests wait until the endpoint is ready (or fails)
#checkEndpoint: /custom-endpoint
# ttl: automatically unload the model after this many seconds
# - optional, default: 0
# - must be greater than 0 to take effect
# - a value of 0 (the default) disables automatic unloading of the model
ttl: 120
# useModelName: overrides the model name that is sent to upstream server
# - optional, default: ""
# - useful when the upstream server expects a specific model name or format
useModelName: "Qwen-A3B-Q4-Thinking"
# filters: a dictionary of filter settings
# - optional, default: empty dictionary
filters:
# strip_params: a comma separated list of parameters to remove from the request
# - optional, default: ""
# - useful for preventing overriding of default server params by requests
# - `model` parameter is never removed
# - can be any JSON key in the request body
# - recommended to stick to sampling parameters
strip_params: "temperature, top_p, top_k, min-p, min-k"
"Qwen-A3B-Q6-Thinking":
# cmd: the command to run to start the inference server.
# - required
# - it is just a string, similar to what you would run on the CLI
# - using `|` allows for comments in the command, these will be parsed out
# - macros can be used within cmd
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/Qwen3-30B-A3B-GGUF/Qwen3-30B-A3B-128K-UD-Q6_K_XL.gguf
--jinja
--reasoning-format none
-n -1
-ngl 99
--temp 0.6
--top-k 20
--top-p 0.95
--min-p 0
-c 10240
--no-context-shift
--mlock
--ubatch-size 128
--batch-size 2048
--seed -1
-t 4
-fa
--no-mmap
--no-warmup
--presence-penalty 1.1
-a Qwen-A3B-Q6-Thinking
--no-webui
# env: define an array of environment variables to inject into cmd's environment
# - optional, default: empty array
# - each value is a single string
# - in the format: ENV_NAME=value
#env:
# - "CUDA_VISIBLE_DEVICES=0,1,2"
# proxy: the URL where llama-swap routes API requests
# - optional, default: http://localhost:${PORT}
# - if you used ${PORT} in cmd this can be omitted
# - if you use a custom port in cmd this *must* be set
#proxy: http://127.0.0.1:8999
# aliases: alternative model names that this model configuration is used for
# - optional, default: empty array
# - aliases must be unique globally
# - useful for impersonating a specific model
aliases:
- "Qwen-A3B-Q6-Thinking"
# checkEndpoint: URL path to check if the server is ready
# - optional, default: /health
# - use "none" to skip endpoint ready checking
# - endpoint is expected to return an HTTP 200 response
# - all requests wait until the endpoint is ready (or fails)
#checkEndpoint: /custom-endpoint
# ttl: automatically unload the model after this many seconds
# - optional, default: 0
# - must be greater than 0 to take effect
# - a value of 0 (the default) disables automatic unloading of the model
ttl: 120
# useModelName: overrides the model name that is sent to upstream server
# - optional, default: ""
# - useful when the upstream server expects a specific model name or format
useModelName: "Qwen-A3B-Q6-Thinking"
# filters: a dictionary of filter settings
# - optional, default: empty dictionary
filters:
# strip_params: a comma separated list of parameters to remove from the request
# - optional, default: ""
# - useful for preventing overriding of default server params by requests
# - `model` parameter is never removed
# - can be any JSON key in the request body
# - recommended to stick to sampling parameters
strip_params: "temperature, top_p, top_k, min-p, min-k"
"Qwen-A3B-Q4-No-Thinking":
# cmd: the command to run to start the inference server.
# - required
# - it is just a string, similar to what you would run on the CLI
# - using `|` allows for comments in the command, these will be parsed out
# - macros can be used within cmd
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/Qwen3-30B-A3B-GGUF/Qwen3-30B-A3B-128K-UD-Q4_K_XL.gguf
--jinja
--reasoning-budget 0
-n -1
-ngl 99
--temp 0.7
--top-k 20
--top-p 0.8
--min-p 0
-c 10240
--no-context-shift
--mlock
--no-mmap
--no-warmup
--ubatch-size 128
--batch-size 2048
--seed -1
-t 4
-fa
--presence-penalty 1.1
-a Qwen-A3B-Q4-No-Thinking
--no-webui
# env: define an array of environment variables to inject into cmd's environment
# - optional, default: empty array
# - each value is a single string
# - in the format: ENV_NAME=value
#env:
# - "CUDA_VISIBLE_DEVICES=0,1,2"
# proxy: the URL where llama-swap routes API requests
# - optional, default: http://localhost:${PORT}
# - if you used ${PORT} in cmd this can be omitted
# - if you use a custom port in cmd this *must* be set
#proxy: http://127.0.0.1:8999
# aliases: alternative model names that this model configuration is used for
# - optional, default: empty array
# - aliases must be unique globally
# - useful for impersonating a specific model
aliases:
- "Qwen-A3B-Q4-No-Thinking"
# checkEndpoint: URL path to check if the server is ready
# - optional, default: /health
# - use "none" to skip endpoint ready checking
# - endpoint is expected to return an HTTP 200 response
# - all requests wait until the endpoint is ready (or fails)
#checkEndpoint: /custom-endpoint
# ttl: automatically unload the model after this many seconds
# - optional, default: 0
# - must be greater than 0 to take effect
# - a value of 0 (the default) disables automatic unloading of the model
ttl: 120
# useModelName: overrides the model name that is sent to upstream server
# - optional, default: ""
# - useful when the upstream server expects a specific model name or format
useModelName: "Qwen-A3B-Q4-No-Thinking"
# filters: a dictionary of filter settings
# - optional, default: empty dictionary
filters:
# strip_params: a comma separated list of parameters to remove from the request
# - optional, default: ""
# - useful for preventing overriding of default server params by requests
# - `model` parameter is never removed
# - can be any JSON key in the request body
# - recommended to stick to sampling parameters
strip_params: "temperature, top_p, top_k, min-p, min-k"
"Qwen-A3B-Q6-No-Thinking":
# cmd: the command to run to start the inference server.
# - required
# - it is just a string, similar to what you would run on the CLI
# - using `|` allows for comments in the command, these will be parsed out
# - macros can be used within cmd
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/Qwen3-30B-A3B-GGUF/Qwen3-30B-A3B-128K-UD-Q6_K_XL.gguf
--jinja
--reasoning-budget 0
-n -1
-ngl 99
--temp 0.7
--top-k 20
--top-p 0.8
--min-p 0
-c 10240
--no-context-shift
--mlock
--no-mmap
--no-warmup
--ubatch-size 128
--batch-size 2048
--seed -1
-t 4
-fa
--presence-penalty 1.1
-a Qwen-A3B-Q6-No-Thinking
--no-webui
# env: define an array of environment variables to inject into cmd's environment
# - optional, default: empty array
# - each value is a single string
# - in the format: ENV_NAME=value
#env:
# - "CUDA_VISIBLE_DEVICES=0,1,2"
# proxy: the URL where llama-swap routes API requests
# - optional, default: http://localhost:${PORT}
# - if you used ${PORT} in cmd this can be omitted
# - if you use a custom port in cmd this *must* be set
#proxy: http://127.0.0.1:8999
# aliases: alternative model names that this model configuration is used for
# - optional, default: empty array
# - aliases must be unique globally
# - useful for impersonating a specific model
aliases:
- "Qwen-A3B-Q6-No-Thinking"
# checkEndpoint: URL path to check if the server is ready
# - optional, default: /health
# - use "none" to skip endpoint ready checking
# - endpoint is expected to return an HTTP 200 response
# - all requests wait until the endpoint is ready (or fails)
#checkEndpoint: /custom-endpoint
# ttl: automatically unload the model after this many seconds
# - optional, default: 0
# - must be greater than 0 to take effect
# - a value of 0 (the default) disables automatic unloading of the model
ttl: 120
# useModelName: overrides the model name that is sent to upstream server
# - optional, default: ""
# - useful when the upstream server expects a specific model name or format
useModelName: "Qwen-A3B-Q6-No-Thinking"
# filters: a dictionary of filter settings
# - optional, default: empty dictionary
filters:
# strip_params: a comma separated list of parameters to remove from the request
# - optional, default: ""
# - useful for preventing overriding of default server params by requests
# - `model` parameter is never removed
# - can be any JSON key in the request body
# - recommended to stick to sampling parameters
strip_params: "temperature, top_p, top_k, min-p, min-k"
"Gemma3-4B":
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/gemma3-4B/gemma-3-4b-it-UD-Q5_K_XL.gguf
--mmproj /home/tipu/.lmstudio/models/unsloth/gemma3-4B/mmproj-BF16.gguf
--jinja
-n -1
-ngl 99
--repeat-penalty 1.0
--min-p 0.01
--top-k 64
--top-p 0.95
-t 4
--no-webui
-a Gemma3-4B
-c 10240
--no-context-shift
--mlock
--no-mmap
--no-warmup
--seed 3501
--swa-full
aliases:
- "Gemma3-4B"
ttl: 120
useModelName: "Gemma3-4B"
filters:
strip_params: "temperature, top_p, top_k, min-p, min-k"
"Gemma3-12B":
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/gemma3_12B/gemma-3-12b-it-UD-Q5_K_XL.gguf
--mmproj /home/tipu/.lmstudio/models/unsloth/gemma3_12B/mmproj-BF16.gguf
--jinja
-n -1
-ngl 99
--repeat-penalty 1.0
--min-p 0.01
--top-k 64
--top-p 0.95
-t 4
--no-mmap
--no-warmup
--no-webui
-a Gemma3-12B
-c 10240
--no-context-shift
--mlock
--seed 3503
--swa-full
aliases:
- "Gemma3-12B"
ttl: 120
useModelName: "Gemma3-12B"
filters:
strip_params: "temperature, top_p, top_k, min-p, min-k"
"Nanonets-OCR":
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/Nanonets-OCR/Nanonets-OCR-s-Q8_0.gguf
--mmproj /home/tipu/.lmstudio/models/unsloth/Nanonets-OCR/nanonets-mmproj-F16.gguf
-n -1
-ngl 99
--jinja
--repeat-penalty 1.1
--temp 0.0
--min-p 0.01
-t 4
--no-webui
-a Nanonets-OCR
-c 10240
--ubatch-size 128
--batch-size 2048
--mlock
--seed -1
--swa-full
--no-escape
-fa
--no-mmap
--no-warmup
aliases:
- "Nanonets-OCR"
ttl: 120
useModelName: "Nanonets-OCR"
filters:
strip_params: "temperature, top_p, top_k, min-p, min-k"
"Devstral-Small-2505":
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/Devstral-Small-2505/Devstral-Small-2505-UD-Q5_K_XL.gguf
--jinja
-n -1
-ngl 99
--repeat-penalty 1.0
--temp 0.15
--top-p 0.95
--min-p 0.01
--top-k 40
-t 4
--ubatch-size 128
--batch-size 2048
--no-webui
-a Devstral-Small-2505
-c 10240
--no-context-shift
--mlock
-fa
--no-mmap
--no-warmup
aliases:
- "Devstral-Small-2505"
ttl: 120
useModelName: "Devstral-Small-2505"
filters:
strip_params: "temperature, top_p, top_k, min-p, min-k"
"DeepSeek-R1-0528-8B":
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/DeepSeek-R1-0528-8B-GGUF/DeepSeek-R1-0528-Qwen3-8B-UD-Q6_K_XL.gguf
--jinja
-n -1
-ngl 99
--repeat-penalty 1.05
--temp 0.6
--top-p 0.95
--min-p 0.00
--top-k 20
-t 4
--no-webui
-a DeepSeek-R1-0528-8B
-c 10240
--no-context-shift
--mlock
--ubatch-size 128
--batch-size 2048
-fa
--no-mmap
--no-warmup
aliases:
- "DeepSeek-R1-0528-8B"
ttl: 120
useModelName: "DeepSeek-R1-0528-8B"
filters:
strip_params: "temperature, top_p, top_k, min-p, min-k"
"THUDM_GLM-4-9B-0414":
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/GLM-4-9B-0414/GLM-4-9B-0414-UD-Q5_K_XL.gguf
--jinja
-n -1
--temp 0.2
--top-p 1
--min-p 0.01
-ngl 99
-t 4
--no-webui
-a THUDM_GLM-4-9B-0414
-c 10240
--no-context-shift
--ubatch-size 128
--batch-size 2048
--mlock
-fa
--swa-full
--no-mmap
--no-warmup
aliases:
- "THUDM_GLM-4-9B-0414"
ttl: 120
useModelName: "THUDM_GLM-4-9B-0414"
filters:
strip_params: "temperature, top_p, top_k, min-p, min-k"
"Tesslate_Tessa-Rust-T1-7B":
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/bartowski/Tesslate_Tessa-Rust-T1-7B-GGUF/Tesslate_Tessa-Rust-T1-7B-Q6_K_L.gguf
--jinja
-n -1
-ngl 99
-t 4
--temp 0.2
--top-p 1
--min-p 0.01
--no-webui
-a Tesslate_Tessa-Rust-T1-7B
-c 10240
--no-context-shift
--mlock
-fa
--no-mmap
--no-warmup
aliases:
- "Tesslate_Tessa-Rust-T1-7B"
ttl: 120
useModelName: "Tesslate_Tessa-Rust-T1-7B"
filters:
strip_params: "temperature, top_p, top_k, min-p, min-k"
"Hunyuan-A13B-Thinking":
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/Hunyuan-A13B-Instruct/Hunyuan-A13B-Instruct-IQ4_XS.gguf
--jinja
--reasoning-format none
--reasoning-budget -1
-n -1
-ngl 7
--temp 0.7
--top-k 20
--top-p 0.8
--repeat-penalty 1.05
-c 4096
--seed -1
-t 4
-fa
--mlock
--no-warmup
--ubatch-size 128
--batch-size 2048
-a Hunyuan-A13B-Thinking
--no-webui
--no-kv-offload
--cache-type-k q8_0
--cache-type-v q8_0
aliases:
- "Hunyuan-A13B-Thinking"
ttl: 120
useModelName: "Hunyuan-A13B-Thinking"
filters:
strip_params: "temperature, top_p, top_k, min-p, min-k, reasoning-budget"
"Hunyuan-A13B-Non-Thinking":
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/Hunyuan-A13B-Instruct/Hunyuan-A13B-Instruct-IQ4_XS.gguf
--jinja
--reasoning-format none
--reasoning-budget 0
-n -1
-ngl 7
--temp 0.5
--top-k 20
--top-p 0.7
--repeat-penalty 1.05
-c 4096
--seed -1
-t 4
-fa
--mlock
--no-warmup
-a Hunyuan-A13B-Non-Thinking
--no-webui
--ubatch-size 128
--batch-size 2048
--no-kv-offload
--cache-type-k q8_0
--cache-type-v q8_0
aliases:
- "Hunyuan-A13B-Non-Thinking"
ttl: 120
useModelName: "Hunyuan-A13B-Non-Thinking"
filters:
strip_params: "temperature, top_p, top_k, min-p, min-k, reasoning-budget"
# don't use these, just for testing if things are broken
"broken":
cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
proxy: http://127.0.0.1:8999
unlisted: true
"broken_timeout":
cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
proxy: http://127.0.0.1:9000
unlisted: true
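With this file in place, llama-swap serves the usual OpenAI-compatible endpoints and launches whichever entry matches the model field of the request. A minimal sketch, assuming llama-swap is reachable on localhost:8080 (use whatever listen address you started it with) and that the Authorization header is passed through to llama-server, which the macro starts with --api-key 12345:

curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer 12345" \
  -d '{"model": "Gemma3-4B", "messages": [{"role": "user", "content": "Hello"}]}'

A follow-up request for, say, Qwen-A3B-Q4-Thinking would unload the Gemma3 group first, since every group here is marked exclusive: true.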