My llama-swap Configuration
This is my llama-swap configuration. Each model entry can also be used on its own to run llama.cpp standalone.
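To run one of the entries standalone, expand the ${llama-cpp} macro by hand and pick a port yourself. A minimal sketch using the Gemma3-4B entry below (port 8080 is an arbitrary choice here, not something llama-swap assigns):

/home/tipu/Applications/llamacpp/llama-server --port 8080 --api-key 12345 \
  -m /home/tipu/.lmstudio/models/unsloth/gemma3-4B/gemma-3-4b-it-UD-Q5_K_XL.gguf \
  --mmproj /home/tipu/.lmstudio/models/unsloth/gemma3-4B/mmproj-BF16.gguf \
  --jinja -n -1 -ngl 99 --repeat-penalty 1.0 --min-p 0.01 --top-k 64 --top-p 0.95 \
  -t 4 -c 10240 --no-context-shift --mlock --no-mmap --no-warmup --seed 3501 \
  --swa-full --no-webui -a Gemma3-4B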
~/Applications/llamacpp/config.yaml
# Seconds to wait for llama.cpp to be available to serve requests
# Default (and minimum): 15 seconds
healthCheckTimeout: 120
# valid log levels: debug, info (default), warn, error
logLevel: debug
groups:
# "Other" keeps the default llama-swap behaviour where only one model is allowed
# to run at a time across the whole llama-swap instance
"Other":
# swap: controls the model swapping behaviour within the group
# - optional, default: true
# - true : only one model is allowed to run at a time
# - false: all models can run together, no swapping
swap: true
# exclusive: controls how the group affects other groups
# - optional, default: true
# - true: causes all other groups to unload when this group runs a model
# - false: does not affect other groups
exclusive: true
# members references the models defined in the models section below
# required
members:
- "DeepSeek-R1-0528-8B"
- "Devstral-Small-2505"
- "THUDM_GLM-4-9B-0414"
- "Tesslate_Tessa-Rust-T1-7B"
- "Hunyuan-A13B-Instruct-Thinking"
- "Hunyuan-A13B-Instruct-Non-Thinking"
"Gemma3":
# swap: controls the model swapping behaviour within the group
# - optional, default: true
# - true : only one model is allowed to run at a time
# - false: all models can run together, no swapping
swap: true
# exclusive: controls how the group affects other groups
# - optional, default: true
# - true: causes all other groups to unload when this group runs a model
# - false: does not affect other groups
exclusive: true
# members references the models defined in the models section below
# required
members:
- "Gemma3-4B"
- "Gemma3-12B"
"Nanonets-OCR":
# swap: controls the model swapping behaviour within the group
# - optional, default: true
# - true : only one model is allowed to run at a time
# - false: all models can run together, no swapping
swap: true
# exclusive: controls how the group affects other groups
# - optional, default: true
# - true: causes all other groups to unload when this group runs a model
# - false: does not affect other groups
exclusive: true
# members references the models defined in the models section below
# required
members:
- "Nanonets-OCR"
"Qwen3-30B-A3B":
# swap: controls the model swapping behaviour within the group
# - optional, default: true
# - true : only one model is allowed to run at a time
# - false: all models can run together, no swapping
swap: true
# exclusive: controls how the group affects other groups
# - optional, default: true
# - true: causes all other groups to unload when this group runs a model
# - false: does not affect other groups
exclusive: true
# members references the models defined in the models section below
# required
members:
- "Qwen-A3B-Q4-Thinking"
- "Qwen-A3B-Q6-Thinking"
- "Qwen-A3B-Q4-No-Thinking"
- "Qwen-A3B-Q6-No-Thinking"
macros:
"llama-cpp": >
/home/tipu/Applications/llamacpp/llama-server
--port ${PORT}
--api-key 12345
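# Illustration, not part of the config: inside each cmd below, ${llama-cpp} is
# replaced with the command above and ${PORT} with a port that llama-swap
# assigns, so a cmd of
#   ${llama-cpp}
#   -m /path/to/model.gguf        (placeholder path)
# effectively runs:
#   /home/tipu/Applications/llamacpp/llama-server --port <assigned-port> --api-key 12345 -m /path/to/model.gguf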
models:
"Qwen-A3B-Q4-Thinking":
# cmd: the command to run to start the inference server.
# - required
# - it is just a string, similar to what you would run on the CLI
# - using `|` allows for comments in the command, these will be parsed out
# - macros can be used within cmd
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/Qwen3-30B-A3B-GGUF/Qwen3-30B-A3B-128K-UD-Q4_K_XL.gguf
--jinja
--reasoning-format none
-n -1
-ngl 99
--temp 0.6
--top-k 20
--top-p 0.95
--min-p 0
-c 10240
--no-context-shift
--mlock
--ubatch-size 128
--batch-size 2048
--seed -1
-t 4
-fa
--no-mmap
--no-warmup
--presence-penalty 1.1
-a Qwen-A3B-Q4-Thinking
--no-webui
# env: define an array of environment variables to inject into cmd's environment
# - optional, default: empty array
# - each value is a single string
# - in the format: ENV_NAME=value
#env:
# - "CUDA_VISIBLE_DEVICES=0,1,2"
# proxy: the URL where llama-swap routes API requests
# - optional, default: http://localhost:${PORT}
# - if you used ${PORT} in cmd this can be omitted
# - if you use a custom port in cmd this *must* be set
#proxy: http://127.0.0.1:8999
# aliases: alternative model names that this model configuration is used for
# - optional, default: empty array
# - aliases must be unique globally
# - useful for impersonating a specific model
aliases:
- "Qwen-A3B-Q4-Thinking"
# checkEndpoint: URL path to check if the server is ready
# - optional, default: /health
# - use "none" to skip endpoint ready checking
# - endpoint is expected to return an HTTP 200 response
# - all requests wait until the endpoint is ready (or fails)
#checkEndpoint: /custom-endpoint
# ttl: automatically unload the model after this many seconds
# - optional, default: 0
# - must be greater than 0 to take effect
# - a value of 0 (the default) disables automatic unloading of the model
ttl: 120
# useModelName: overrides the model name that is sent to upstream server
# - optional, default: ""
# - useful when the upstream server expects a specific model name or format
useModelName: "Qwen-A3B-Q4-Thinking"
# filters: a dictionary of filter settings
# - optional, default: empty dictionary
filters:
# strip_params: a comma separated list of parameters to remove from the request
# - optional, default: ""
# - useful for preventing overriding of default server params by requests
# - `model` parameter is never removed
# - can be any JSON key in the request body
# - recommended to stick to sampling parameters
strip_params: "temperature, top_p, top_k, min-p, min-k"
"Qwen-A3B-Q6-Thinking":
# cmd: the command to run to start the inference server.
# - required
# - it is just a string, similar to what you would run on the CLI
# - using `|` allows for comments in the command, these will be parsed out
# - macros can be used within cmd
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/Qwen3-30B-A3B-GGUF/Qwen3-30B-A3B-128K-UD-Q6_K_XL.gguf
--jinja
--reasoning-format none
-n -1
-ngl 99
--temp 0.6
--top-k 20
--top-p 0.95
--min-p 0
-c 10240
--no-context-shift
--mlock
--ubatch-size 128
--batch-size 2048
--seed -1
-t 4
-fa
--no-mmap
--no-warmup
--presence-penalty 1.1
-a Qwen-A3B-Q6-Thinking
--no-webui
# env: define an array of environment variables to inject into cmd's environment
# - optional, default: empty array
# - each value is a single string
# - in the format: ENV_NAME=value
#env:
# - "CUDA_VISIBLE_DEVICES=0,1,2"
# proxy: the URL where llama-swap routes API requests
# - optional, default: http://localhost:${PORT}
# - if you used ${PORT} in cmd this can be omitted
# - if you use a custom port in cmd this *must* be set
#proxy: http://127.0.0.1:8999
# aliases: alternative model names that this model configuration is used for
# - optional, default: empty array
# - aliases must be unique globally
# - useful for impersonating a specific model
aliases:
- "Qwen-A3B-Q6-Thinking"
# checkEndpoint: URL path to check if the server is ready
# - optional, default: /health
# - use "none" to skip endpoint ready checking
# - endpoint is expected to return an HTTP 200 response
# - all requests wait until the endpoint is ready (or fails)
#checkEndpoint: /custom-endpoint
# ttl: automatically unload the model after this many seconds
# - optional, default: 0
# - must be greater than 0 to take effect
# - a value of 0 (the default) disables automatic unloading of the model
ttl: 120
# useModelName: overrides the model name that is sent to upstream server
# - optional, default: ""
# - useful when the upstream server expects a specific model name or format
useModelName: "Qwen-A3B-Q6-Thinking"
# filters: a dictionary of filter settings
# - optional, default: empty dictionary
filters:
# strip_params: a comma separated list of parameters to remove from the request
# - optional, default: ""
# - useful for preventing overriding of default server params by requests
# - `model` parameter is never removed
# - can be any JSON key in the request body
# - recommended to stick to sampling parameters
strip_params: "temperature, top_p, top_k, min-p, min-k"
"Qwen-A3B-Q4-No-Thinking":
# cmd: the command to run to start the inference server.
# - required
# - it is just a string, similar to what you would run on the CLI
# - using `|` allows for comments in the command, these will be parsed out
# - macros can be used within cmd
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/Qwen3-30B-A3B-GGUF/Qwen3-30B-A3B-128K-UD-Q4_K_XL.gguf
--jinja
--reasoning-budget 0
-n -1
-ngl 99
--temp 0.7
--top-k 20
--top-p 0.8
--min-p 0
-c 10240
--no-context-shift
--mlock
--no-mmap
--no-warmup
--ubatch-size 128
--batch-size 2048
--seed -1
-t 4
-fa
--presence-penalty 1.1
-a Qwen-A3B-Q4-No-Thinking
--no-webui
# env: define an array of environment variables to inject into cmd's environment
# - optional, default: empty array
# - each value is a single string
# - in the format: ENV_NAME=value
#env:
# - "CUDA_VISIBLE_DEVICES=0,1,2"
# proxy: the URL where llama-swap routes API requests
# - optional, default: http://localhost:${PORT}
# - if you used ${PORT} in cmd this can be omitted
# - if you use a custom port in cmd this *must* be set
#proxy: http://127.0.0.1:8999
# aliases: alternative model names that this model configuration is used for
# - optional, default: empty array
# - aliases must be unique globally
# - useful for impersonating a specific model
aliases:
- "Qwen-A3B-Q4-No-Thinking"
# checkEndpoint: URL path to check if the server is ready
# - optional, default: /health
# - use "none" to skip endpoint ready checking
# - endpoint is expected to return an HTTP 200 response
# - all requests wait until the endpoint is ready (or fails)
#checkEndpoint: /custom-endpoint
# ttl: automatically unload the model after this many seconds
# - optional, default: 0
# - must be greater than 0 to take effect
# - a value of 0 (the default) disables automatic unloading of the model
ttl: 120
# useModelName: overrides the model name that is sent to upstream server
# - optional, default: ""
# - useful when the upstream server expects a specific model name or format
useModelName: "Qwen-A3B-Q4-No-Thinking"
# filters: a dictionary of filter settings
# - optional, default: empty dictionary
filters:
# strip_params: a comma separated list of parameters to remove from the request
# - optional, default: ""
# - useful for preventing overriding of default server params by requests
# - `model` parameter is never removed
# - can be any JSON key in the request body
# - recommended to stick to sampling parameters
strip_params: "temperature, top_p, top_k, min-p, min-k"
"Qwen-A3B-Q6-No-Thinking":
# cmd: the command to run to start the inference server.
# - required
# - it is just a string, similar to what you would run on the CLI
# - using `|` allows for comments in the command, these will be parsed out
# - macros can be used within cmd
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/Qwen3-30B-A3B-GGUF/Qwen3-30B-A3B-128K-UD-Q6_K_XL.gguf
--jinja
--reasoning-budget 0
-n -1
-ngl 99
--temp 0.7
--top-k 20
--top-p 0.8
--min-p 0
-c 10240
--no-context-shift
--mlock
--no-mmap
--no-warmup
--ubatch-size 128
--batch-size 2048
--seed -1
-t 4
-fa
--presence-penalty 1.1
-a Qwen-A3B-Q6-No-Thinking
--no-webui
# env: define an array of environment variables to inject into cmd's environment
# - optional, default: empty array
# - each value is a single string
# - in the format: ENV_NAME=value
#env:
# - "CUDA_VISIBLE_DEVICES=0,1,2"
# proxy: the URL where llama-swap routes API requests
# - optional, default: http://localhost:${PORT}
# - if you used ${PORT} in cmd this can be omitted
# - if you use a custom port in cmd this *must* be set
#proxy: http://127.0.0.1:8999
# aliases: alternative model names that this model configuration is used for
# - optional, default: empty array
# - aliases must be unique globally
# - useful for impersonating a specific model
aliases:
- "Qwen-A3B-Q6-No-Thinking"
# checkEndpoint: URL path to check if the server is ready
# - optional, default: /health
# - use "none" to skip endpoint ready checking
# - endpoint is expected to return an HTTP 200 response
# - all requests wait until the endpoint is ready (or fails)
#checkEndpoint: /custom-endpoint
# ttl: automatically unload the model after this many seconds
# - optional, default: 0
# - must be greater than 0 to take effect
# - a value of 0 (the default) disables automatic unloading of the model
ttl: 120
# useModelName: overrides the model name that is sent to upstream server
# - optional, default: ""
# - useful when the upstream server expects a specific model name or format
useModelName: "Qwen-A3B-Q6-No-Thinking"
# filters: a dictionary of filter settings
# - optional, default: empty dictionary
filters:
# strip_params: a comma separated list of parameters to remove from the request
# - optional, default: ""
# - useful for preventing overriding of default server params by requests
# - `model` parameter is never removed
# - can be any JSON key in the request body
# - recommended to stick to sampling parameters
strip_params: "temperature, top_p, top_k, min-p, min-k"
"Gemma3-4B":
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/gemma3-4B/gemma-3-4b-it-UD-Q5_K_XL.gguf
--mmproj /home/tipu/.lmstudio/models/unsloth/gemma3-4B/mmproj-BF16.gguf
--jinja
-n -1
-ngl 99
--repeat-penalty 1.0
--min-p 0.01
--top-k 64
--top-p 0.95
-t 4
--no-webui
-a Gemma3-4B
-c 10240
--no-context-shift
--mlock
--no-mmap
--no-warmup
--seed 3501
--swa-full
aliases:
- "Gemma3-4B"
ttl: 120
useModelName: "Gemma3-4B"
filters:
strip_params: "temperature, top_p, top_k, min-p, min-k"
"Gemma3-12B":
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/gemma3_12B/gemma-3-12b-it-UD-Q5_K_XL.gguf
--mmproj /home/tipu/.lmstudio/models/unsloth/gemma3_12B/mmproj-BF16.gguf
--jinja
-n -1
-ngl 99
--repeat-penalty 1.0
--min-p 0.01
--top-k 64
--top-p 0.95
-t 4
--no-mmap
--no-warmup
--no-webui
-a Gemma3-12B
-c 10240
--no-context-shift
--mlock
--seed 3503
--swa-full
aliases:
- "Gemma3-12B"
ttl: 120
useModelName: "Gemma3-12B"
filters:
strip_params: "temperature, top_p, top_k, min-p, min-k"
"Nanonets-OCR":
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/Nanonets-OCR/Nanonets-OCR-s-Q8_0.gguf
--mmproj /home/tipu/.lmstudio/models/unsloth/Nanonets-OCR/nanonets-mmproj-F16.gguf
-n -1
-ngl 99
--jinja
--repeat-penalty 1.1
--temp 0.0
--min-p 0.01
-t 4
--no-webui
-a Nanonets-OCR
-c 10240
--ubatch-size 128
--batch-size 2048
--mlock
--seed -1
--swa-full
--no-escape
-fa
--no-mmap
--no-warmup
aliases:
- "Nanonets-OCR"
ttl: 120
useModelName: "Nanonets-OCR"
filters:
strip_params: "temperature, top_p, top_k, min-p, min-k"
"Devstral-Small-2505":
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/Devstral-Small-2505/Devstral-Small-2505-UD-Q5_K_XL.gguf
--jinja
-n -1
-ngl 99
--repeat-penalty 1.0
--temp 0.15
--top-p 0.95
--min-p 0.01
--top-k 40
-t 4
--ubatch-size 128
--batch-size 2048
--no-webui
-a Devstral-Small-2505
-c 10240
--no-context-shift
--mlock
-fa
--no-mmap
--no-warmup
aliases:
- "Devstral-Small-2505"
ttl: 120
useModelName: "Devstral-Small-2505"
filters:
strip_params: "temperature, top_p, top_k, min-p, min-k"
"DeepSeek-R1-0528-8B":
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/DeepSeek-R1-0528-8B-GGUF/DeepSeek-R1-0528-Qwen3-8B-UD-Q6_K_XL.gguf
--jinja
-n -1
-ngl 99
--repeat-penalty 1.05
--temp 0.6
--top-p 0.95
--min-p 0.00
--top-k 20
-t 4
--no-webui
-a DeepSeek-R1-0528-8B
-c 10240
--no-context-shift
--mlock
--ubatch-size 128
--batch-size 2048
-fa
--no-mmap
--no-warmup
aliases:
- "DeepSeek-R1-0528-8B"
ttl: 120
useModelName: "DeepSeek-R1-0528-8B"
filters:
strip_params: "temperature, top_p, top_k, min-p, min-k"
"THUDM_GLM-4-9B-0414":
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/GLM-4-9B-0414/GLM-4-9B-0414-UD-Q5_K_XL.gguf
--jinja
-n -1
--temp 0.2
--top-p 1
--min-p 0.01
-ngl 99
-t 4
--no-webui
-a THUDM_GLM-4-9B-0414
-c 10240
--no-context-shift
--ubatch-size 128
--batch-size 2048
--mlock
-fa
--swa-full
--no-mmap
--no-warmup
aliases:
- "THUDM_GLM-4-9B-0414"
ttl: 120
useModelName: "THUDM_GLM-4-9B-0414"
filters:
strip_params: "temperature, top_p, top_k, min-p, min-k"
"Tesslate_Tessa-Rust-T1-7B":
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/bartowski/Tesslate_Tessa-Rust-T1-7B-GGUF/Tesslate_Tessa-Rust-T1-7B-Q6_K_L.gguf
--jinja
-n -1
-ngl 99
-t 4
--temp 0.2
--top-p 1
--min-p 0.01
--no-webui
-a Tesslate_Tessa-Rust-T1-7B
-c 10240
--no-context-shift
--mlock
-fa
--no-mmap
--no-warmup
aliases:
- "Tesslate_Tessa-Rust-T1-7B"
ttl: 120
useModelName: "Tesslate_Tessa-Rust-T1-7B"
filters:
strip_params: "temperature, top_p, top_k, min-p, min-k"
"Hunyuan-A13B-Thinking":
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/Hunyuan-A13B-Instruct/Hunyuan-A13B-Instruct-IQ4_XS.gguf
--jinja
--reasoning-format none
--reasoning-budget -1
-n -1
-ngl 7
--temp 0.7
--top-k 20
--top-p 0.8
--repeat-penalty 1.05
-c 4096
--seed -1
-t 4
-fa
--mlock
--no-warmup
--ubatch-size 128
--batch-size 2048
-a Hunyuan-A13B-Thinking
--no-webui
--no-kv-offload
--cache-type-k q8_0
--cache-type-v q8_0
aliases:
- "Hunyuan-A13B-Thinking"
ttl: 120
useModelName: "Hunyuan-A13B-Thinking"
filters:
strip_params: "temperature, top_p, top_k, min-p, min-k, reasoning-budget"
"Hunyuan-A13B-Non-Thinking":
cmd: |
${llama-cpp}
-m /home/tipu/.lmstudio/models/unsloth/Hunyuan-A13B-Instruct/Hunyuan-A13B-Instruct-IQ4_XS.gguf
--jinja
--reasoning-format none
--reasoning-budget 0
-n -1
-ngl 7
--temp 0.5
--top-k 20
--top-p 0.7
--repeat-penalty 1.05
-c 4096
--seed -1
-t 4
-fa
--mlock
--no-warmup
-a Hunyuan-A13B-Non-Thinking
--no-webui
--ubatch-size 128
--batch-size 2048
--no-kv-offload
--cache-type-k q8_0
--cache-type-v q8_0
aliases:
- "Hunyuan-A13B-Non-Thinking"
ttl: 120
useModelName: "Hunyuan-A13B-Non-Thinking"
filters:
strip_params: "temperature, top_p, top_k, min-p, min-k, reasoning-budget"
# don't use these, just for testing if things are broken
"broken":
cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
proxy: http://127.0.0.1:8999
unlisted: true
"broken_timeout":
cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
proxy: http://127.0.0.1:9000
unlisted: true
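With this file in place, llama-swap serves the usual OpenAI-compatible endpoints and launches whichever entry matches the model field of the request. A minimal sketch, assuming llama-swap is reachable on localhost:8080 (use whatever listen address you started it with) and that the Authorization header is passed through to llama-server, which the macro starts with --api-key 12345:

curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer 12345" \
  -d '{"model": "Gemma3-4B", "messages": [{"role": "user", "content": "Hello"}]}'

A follow-up request for, say, Qwen-A3B-Q4-Thinking would unload the Gemma3 group first, since every group here is marked exclusive: true.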