Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions SECURITY.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Security Policy

## Supported Versions

The table below lists which versions of this project currently receive
security updates.

| Version | Supported |
| ------- | ------------------ |
| 5.1.x | :white_check_mark: |
| 5.0.x | :x: |
| 4.0.x | :white_check_mark: |
| < 4.0 | :x: |

## Reporting a Vulnerability

Please report vulnerabilities privately rather than in a public issue —
use GitHub's "Report a vulnerability" feature under the repository's
Security tab. You will receive an acknowledgement of your report, periodic
status updates while it is investigated, and a final notification of
whether the vulnerability was accepted (with a fix timeline) or declined.
9 changes: 9 additions & 0 deletions docs/server.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,21 @@ python3 -m llama_cpp.server --model <model_path>
You can also pass chat-template kwargs at model load time from the CLI:

```bash
# Linux / macOS (bash)
python3 -m llama_cpp.server \
--model <model_path> \
--chat_format chatml \
--chat_template_kwargs '{"enable_thinking": true}'
```

```powershell
# Windows (PowerShell) – use a backtick ` for line continuation, not a backslash \
python -m llama_cpp.server `
--model <model_path> `
--chat_format chatml `
--chat_template_kwargs '{\"enable_thinking\": true}'
```

### Server options

For a full list of options, run:
Expand Down
223 changes: 223 additions & 0 deletions examples/high_level_api/legion_slim5_rtx4060.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
"""
Optimized llama-cpp-python configuration for:
Lenovo Legion Slim 5 (16" RH8)
- CPU: Intel Core i7-13700H (6P + 8E cores)
- GPU: NVIDIA GeForce RTX 4060 Laptop (8 GB VRAM, GDDR6)
- RAM: 16 GB DDR5-5200
- SSD: 1 TB NVMe

Install with CUDA support first:

Bash / Linux / macOS:
CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python --force-reinstall --no-cache-dir

PowerShell (Windows):
$env:CMAKE_ARGS = "-DGGML_CUDA=on"
python -m pip install llama-cpp-python --force-reinstall --no-cache-dir

Tip (Windows): install into a virtual environment to avoid dependency conflicts
with other tools in your global environment:
python -m venv .venv-llama
./.venv-llama/Scripts/Activate.ps1
$env:CMAKE_ARGS = "-DGGML_CUDA=on"
python -m pip install llama-cpp-python --force-reinstall --no-cache-dir
"""

import argparse
import json
import os
import sys

from llama_cpp import Llama

# ---------------------------------------------------------------------------
# Hardware constants for this machine
# ---------------------------------------------------------------------------
# NOTE: VRAM_GB is informational only — nothing in this script reads it; the
# actual VRAM budget is controlled indirectly via n_gpu_layers.
VRAM_GB = 8 # RTX 4060 Laptop VRAM
N_PHYSICAL_CORES = 6 # P-cores only (best single-thread perf on i7-13700H)

# ---------------------------------------------------------------------------
# Recommended quantisation levels (pick one based on your model size)
# ---------------------------------------------------------------------------
# Model 7B / 8B:
# Q5_K_M → ~5.5 GB VRAM ✅ recommended
# Q6_K → ~6.5 GB VRAM ✅ excellent quality
# Q8_0 → ~8.5 GB VRAM ⚠️ tight fit, may spill to CPU RAM
#
# Model 13B:
# Q4_K_M → ~7.5 GB VRAM ✅ fits
# Q5_K_M → ~9.0 GB VRAM ❌ exceeds VRAM

def build_llm(
    model_path: str,
    n_ctx: int = 4096,
    n_gpu_layers: int = -1,  # -1 = offload every layer to the GPU
    n_batch: int = 512,
    verbose: bool = False,
) -> Llama:
    """Build a Llama instance tuned for the Legion Slim 5 / RTX 4060 laptop.

    Args:
        model_path: Path to the .gguf model file.
        n_ctx: Context window in tokens; 4096 is a safe fit for 8 GB VRAM.
        n_gpu_layers: Transformer layers to offload to the GPU (-1 = all,
            the default). Lower this if CUDA reports out-of-memory errors.
        n_batch: Batch size used during prompt evaluation.
        verbose: Emit llama.cpp's loading diagnostics.

    Returns:
        A configured, ready-to-use Llama instance.
    """
    settings = {
        "model_path": model_path,
        # GPU offload: the RTX 4060 has 8 GB, so push as many layers as fit,
        # and keep the KV-cache on the GPU for faster inference.
        "n_gpu_layers": n_gpu_layers,
        "offload_kqv": True,
        # CPU threads: use the P-cores only for best throughput.
        "n_threads": N_PHYSICAL_CORES,
        "n_threads_batch": N_PHYSICAL_CORES,
        # Context window and prompt-evaluation batch size.
        "n_ctx": n_ctx,
        "n_batch": n_batch,
        # Memory: mmap gives fast loading from the NVMe SSD; skip mlock so
        # the OS keeps headroom in the 16 GB of system RAM.
        "use_mmap": True,
        "use_mlock": False,
        "verbose": verbose,
    }
    return Llama(**settings)


def _parse_args() -> argparse.Namespace:
    """Define and parse the command-line interface for this example script."""
    parser = argparse.ArgumentParser(
        description="Run inference optimised for the Lenovo Legion Slim 5 / RTX 4060"
    )
    parser.add_argument(
        "-m", "--model",
        required=True,
        help="Path to the .gguf model file (e.g. mistral-7b-Q5_K_M.gguf)",
    )
    parser.add_argument(
        "-p", "--prompt",
        default="What are the names of the planets in the solar system?",
        help="Prompt text",
    )
    parser.add_argument(
        "--system-prompt",
        default=None,
        help="Optional system prompt prepended before the user prompt",
    )
    parser.add_argument(
        "--max-tokens", type=int, default=256,
        help="Maximum number of tokens to generate",
    )
    parser.add_argument(
        "--n-ctx", type=int, default=4096,
        help="Context window size",
    )
    parser.add_argument(
        "--n-gpu-layers", type=int, default=-1,
        help="GPU layers to offload (-1 = all)",
    )
    parser.add_argument(
        "--seed", type=int, default=-1,
        help="RNG seed for reproducible output (-1 = random)",
    )
    parser.add_argument(
        "--temperature", type=float, default=0.8,
        help="Sampling temperature (0.0 = greedy, higher = more creative)",
    )
    parser.add_argument(
        "--top-p", type=float, default=0.95,
        help="Nucleus sampling probability threshold",
    )
    parser.add_argument(
        "--repeat-penalty", type=float, default=1.1,
        help="Penalty applied to repeated tokens (1.0 = disabled)",
    )
    parser.add_argument(
        "--json-output", action="store_true",
        help="Print only raw JSON output (no banner); useful for piping",
    )
    parser.add_argument(
        "--verbose", action="store_true",
        help="Print llama.cpp loading messages",
    )
    return parser.parse_args()


def _maybe_print_vram_hint(err: str, n_gpu_layers: int, stage: str) -> None:
    """Print a stderr hint when a failure looks like a CUDA out-of-VRAM error.

    Only fires when every layer was offloaded (n_gpu_layers == -1), since the
    suggested remedy is to reduce the offload count.

    Args:
        err: Stringified exception message to inspect.
        n_gpu_layers: The --n-gpu-layers value the user ran with.
        stage: Human-readable phase description, e.g. "during inference".
    """
    if n_gpu_layers == -1 and (
        "out of memory" in err.lower() or "cuda" in err.lower()
    ):
        print(
            f"  Hint: GPU ran out of VRAM {stage}.\n"
            "  Try reducing --n-gpu-layers (e.g. --n-gpu-layers 28) to keep\n"
            "  some layers on CPU RAM instead.",
            file=sys.stderr,
        )


def main() -> None:
    """Parse CLI args, load the model, run one completion, and print JSON.

    Exits with status 1 (after a message on stderr) if the model file is
    missing, the model fails to load, or inference raises.
    """
    args = _parse_args()

    # --- Validate model path -------------------------------------------------
    model_path = os.path.abspath(args.model)
    if not os.path.isfile(model_path):
        print(
            f"ERROR: model file not found: {model_path}\n"
            "  Make sure the path is correct and the file exists.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Banner is suppressed in --json-output mode so stdout stays machine-parseable.
    if not args.json_output:
        print(f"Loading model: {model_path}")
        print(f"GPU layers   : {'all' if args.n_gpu_layers == -1 else args.n_gpu_layers}")
        print(f"Context size : {args.n_ctx} tokens\n")

    # --- Load model ----------------------------------------------------------
    try:
        llm = build_llm(
            model_path=model_path,
            n_ctx=args.n_ctx,
            n_gpu_layers=args.n_gpu_layers,
            verbose=args.verbose,
        )
    except Exception as exc:
        err = str(exc)
        print(f"ERROR: failed to load model – {err}", file=sys.stderr)
        _maybe_print_vram_hint(err, args.n_gpu_layers, "while loading all layers")
        sys.exit(1)

    # --- Build prompt --------------------------------------------------------
    if args.system_prompt:
        full_prompt = f"{args.system_prompt}\n\n{args.prompt}"
    else:
        full_prompt = args.prompt

    # --- Run inference -------------------------------------------------------
    try:
        output = llm(
            full_prompt,
            max_tokens=args.max_tokens,
            stop=["Q:", "\n\n"],  # stop on a new Q&A turn or paragraph break
            echo=True,            # include the prompt in the returned text
            seed=args.seed,
            temperature=args.temperature,
            top_p=args.top_p,
            repeat_penalty=args.repeat_penalty,
        )
    except Exception as exc:
        err = str(exc)
        print(f"ERROR: inference failed – {err}", file=sys.stderr)
        _maybe_print_vram_hint(err, args.n_gpu_layers, "during inference")
        sys.exit(1)

    # Completion dict from llama-cpp-python, pretty-printed as UTF-8 JSON.
    print(json.dumps(output, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()
13 changes: 13 additions & 0 deletions llama_cpp/server/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import os
import json
import logging
import typing
import contextlib

Expand All @@ -17,6 +18,7 @@
from fastapi import Depends, FastAPI, APIRouter, Request, HTTPException, status, Body
from fastapi.middleware import Middleware
from fastapi.middleware.cors import CORSMiddleware
from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware
from fastapi.security import HTTPBearer
from sse_starlette.sse import EventSourceResponse
from starlette_context.plugins import RequestIdPlugin # type: ignore
Expand Down Expand Up @@ -130,7 +132,18 @@ def create_app(
)

set_server_settings(server_settings)

logger = logging.getLogger(__name__)
ssl_enabled = bool(server_settings.ssl_keyfile and server_settings.ssl_certfile)
if not ssl_enabled:
logger.warning(
"SSL is not configured. The server is running over plain HTTP, "
"which is not secure. Pass --ssl_keyfile and --ssl_certfile to enable HTTPS."
)

middleware = [Middleware(RawContextMiddleware, plugins=(RequestIdPlugin(),))]
if ssl_enabled:
middleware.append(Middleware(HTTPSRedirectMiddleware))
app = FastAPI(
middleware=middleware,
title="🦙 llama.cpp Python API",
Expand Down
2 changes: 1 addition & 1 deletion llama_cpp/server/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ class ServerSettings(BaseSettings):

# Uvicorn Settings
host: str = Field(default="localhost", description="Listen address")
port: int = Field(default=8000, description="Listen port")
port: int = Field(default=8080, description="Listen port")
ssl_keyfile: Optional[str] = Field(
default=None, description="SSL key file for HTTPS"
)
Expand Down