Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions SECURITY.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Security Policy

## Supported Versions

The table below lists which versions of this project currently receive
security updates.

| Version | Supported |
| ------- | ------------------ |
| 5.1.x | :white_check_mark: |
| 5.0.x | :x: |
| 4.0.x | :white_check_mark: |
| < 4.0 | :x: |

## Reporting a Vulnerability

Please report vulnerabilities privately rather than in a public issue —
use GitHub's "Report a vulnerability" feature under the repository's
Security tab. You will receive an acknowledgement of your report, periodic
status updates while it is investigated, and a final notification of
whether the vulnerability was accepted (with a fix timeline) or declined.
9 changes: 9 additions & 0 deletions docs/server.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,21 @@ python3 -m llama_cpp.server --model <model_path>
You can also pass chat-template kwargs at model load time from the CLI:

```bash
# Linux / macOS (bash)
python3 -m llama_cpp.server \
--model <model_path> \
--chat_format chatml \
--chat_template_kwargs '{"enable_thinking": true}'
```

```powershell
# Windows (PowerShell) – use a backtick ` for line continuation, not a backslash \
python -m llama_cpp.server `
--model <model_path> `
--chat_format chatml `
--chat_template_kwargs '{\"enable_thinking\": true}'
```

### Server options

For a full list of options, run:
Expand Down
223 changes: 223 additions & 0 deletions examples/high_level_api/legion_slim5_rtx4060.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
"""
Optimized llama-cpp-python configuration for:
Lenovo Legion Slim 5 (16" RH8)
- CPU: Intel Core i7-13700H (6P + 8E cores)
- GPU: NVIDIA GeForce RTX 4060 Laptop (8 GB VRAM, GDDR6)
- RAM: 16 GB DDR5-5200
- SSD: 1 TB NVMe

Install with CUDA support first:

Bash / Linux / macOS:
CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python --force-reinstall --no-cache-dir

PowerShell (Windows):
$env:CMAKE_ARGS = "-DGGML_CUDA=on"
python -m pip install llama-cpp-python --force-reinstall --no-cache-dir

Tip (Windows): install into a virtual environment to avoid dependency conflicts
with other tools in your global environment:
python -m venv .venv-llama
./.venv-llama/Scripts/Activate.ps1
$env:CMAKE_ARGS = "-DGGML_CUDA=on"
python -m pip install llama-cpp-python --force-reinstall --no-cache-dir
"""

import argparse
import json
import os
import sys

from llama_cpp import Llama

# ---------------------------------------------------------------------------
# Hardware constants for this machine
# ---------------------------------------------------------------------------
# NOTE: VRAM_GB is informational only — nothing in this script reads it; the
# actual VRAM budget is controlled indirectly via n_gpu_layers.
VRAM_GB = 8 # RTX 4060 Laptop VRAM
N_PHYSICAL_CORES = 6 # P-cores only (best single-thread perf on i7-13700H)

# ---------------------------------------------------------------------------
# Recommended quantisation levels (pick one based on your model size)
# ---------------------------------------------------------------------------
# Model 7B / 8B:
# Q5_K_M → ~5.5 GB VRAM ✅ recommended
# Q6_K → ~6.5 GB VRAM ✅ excellent quality
# Q8_0 → ~8.5 GB VRAM ⚠️ tight fit, may spill to CPU RAM
#
# Model 13B:
# Q4_K_M → ~7.5 GB VRAM ✅ fits
# Q5_K_M → ~9.0 GB VRAM ❌ exceeds VRAM

def build_llm(
    model_path: str,
    n_ctx: int = 4096,
    n_gpu_layers: int = -1,  # -1 = offload every layer to the GPU
    n_batch: int = 512,
    verbose: bool = False,
) -> Llama:
    """Build a Llama instance tuned for the Legion Slim 5 / RTX 4060 laptop.

    Args:
        model_path: Path to the .gguf model file.
        n_ctx: Context window in tokens; 4096 is a safe fit for 8 GB VRAM.
        n_gpu_layers: Transformer layers to offload to the GPU (-1 = all,
            the default). Lower this if CUDA reports out-of-memory errors.
        n_batch: Batch size used during prompt evaluation.
        verbose: Emit llama.cpp's loading diagnostics.

    Returns:
        A configured, ready-to-use Llama instance.
    """
    settings = {
        "model_path": model_path,
        # GPU offload: the RTX 4060 has 8 GB, so push as many layers as fit,
        # and keep the KV-cache on the GPU for faster inference.
        "n_gpu_layers": n_gpu_layers,
        "offload_kqv": True,
        # CPU threads: use the P-cores only for best throughput.
        "n_threads": N_PHYSICAL_CORES,
        "n_threads_batch": N_PHYSICAL_CORES,
        # Context window and prompt-evaluation batch size.
        "n_ctx": n_ctx,
        "n_batch": n_batch,
        # Memory: mmap gives fast loading from the NVMe SSD; skip mlock so
        # the OS keeps headroom in the 16 GB of system RAM.
        "use_mmap": True,
        "use_mlock": False,
        "verbose": verbose,
    }
    return Llama(**settings)


def _parse_args() -> argparse.Namespace:
    """Define and parse the command-line interface for this example script."""
    parser = argparse.ArgumentParser(
        description="Run inference optimised for the Lenovo Legion Slim 5 / RTX 4060"
    )
    parser.add_argument(
        "-m", "--model",
        required=True,
        help="Path to the .gguf model file (e.g. mistral-7b-Q5_K_M.gguf)",
    )
    parser.add_argument(
        "-p", "--prompt",
        default="What are the names of the planets in the solar system?",
        help="Prompt text",
    )
    parser.add_argument(
        "--system-prompt",
        default=None,
        help="Optional system prompt prepended before the user prompt",
    )
    parser.add_argument(
        "--max-tokens", type=int, default=256,
        help="Maximum number of tokens to generate",
    )
    parser.add_argument(
        "--n-ctx", type=int, default=4096,
        help="Context window size",
    )
    parser.add_argument(
        "--n-gpu-layers", type=int, default=-1,
        help="GPU layers to offload (-1 = all)",
    )
    parser.add_argument(
        "--seed", type=int, default=-1,
        help="RNG seed for reproducible output (-1 = random)",
    )
    parser.add_argument(
        "--temperature", type=float, default=0.8,
        help="Sampling temperature (0.0 = greedy, higher = more creative)",
    )
    parser.add_argument(
        "--top-p", type=float, default=0.95,
        help="Nucleus sampling probability threshold",
    )
    parser.add_argument(
        "--repeat-penalty", type=float, default=1.1,
        help="Penalty applied to repeated tokens (1.0 = disabled)",
    )
    parser.add_argument(
        "--json-output", action="store_true",
        help="Print only raw JSON output (no banner); useful for piping",
    )
    parser.add_argument(
        "--verbose", action="store_true",
        help="Print llama.cpp loading messages",
    )
    return parser.parse_args()


def _maybe_print_vram_hint(err: str, n_gpu_layers: int, stage: str) -> None:
    """Print a stderr hint when a failure looks like a CUDA out-of-VRAM error.

    Only fires when every layer was offloaded (n_gpu_layers == -1), since the
    suggested remedy is to reduce the offload count.

    Args:
        err: Stringified exception message to inspect.
        n_gpu_layers: The --n-gpu-layers value the user ran with.
        stage: Human-readable phase description, e.g. "during inference".
    """
    if n_gpu_layers == -1 and (
        "out of memory" in err.lower() or "cuda" in err.lower()
    ):
        print(
            f"  Hint: GPU ran out of VRAM {stage}.\n"
            "  Try reducing --n-gpu-layers (e.g. --n-gpu-layers 28) to keep\n"
            "  some layers on CPU RAM instead.",
            file=sys.stderr,
        )


def main() -> None:
    """Parse CLI args, load the model, run one completion, and print JSON.

    Exits with status 1 (after a message on stderr) if the model file is
    missing, the model fails to load, or inference raises.
    """
    args = _parse_args()

    # --- Validate model path -------------------------------------------------
    model_path = os.path.abspath(args.model)
    if not os.path.isfile(model_path):
        print(
            f"ERROR: model file not found: {model_path}\n"
            "  Make sure the path is correct and the file exists.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Banner is suppressed in --json-output mode so stdout stays machine-parseable.
    if not args.json_output:
        print(f"Loading model: {model_path}")
        print(f"GPU layers   : {'all' if args.n_gpu_layers == -1 else args.n_gpu_layers}")
        print(f"Context size : {args.n_ctx} tokens\n")

    # --- Load model ----------------------------------------------------------
    try:
        llm = build_llm(
            model_path=model_path,
            n_ctx=args.n_ctx,
            n_gpu_layers=args.n_gpu_layers,
            verbose=args.verbose,
        )
    except Exception as exc:
        err = str(exc)
        print(f"ERROR: failed to load model – {err}", file=sys.stderr)
        _maybe_print_vram_hint(err, args.n_gpu_layers, "while loading all layers")
        sys.exit(1)

    # --- Build prompt --------------------------------------------------------
    if args.system_prompt:
        full_prompt = f"{args.system_prompt}\n\n{args.prompt}"
    else:
        full_prompt = args.prompt

    # --- Run inference -------------------------------------------------------
    try:
        output = llm(
            full_prompt,
            max_tokens=args.max_tokens,
            stop=["Q:", "\n\n"],  # stop on a new Q&A turn or paragraph break
            echo=True,            # include the prompt in the returned text
            seed=args.seed,
            temperature=args.temperature,
            top_p=args.top_p,
            repeat_penalty=args.repeat_penalty,
        )
    except Exception as exc:
        err = str(exc)
        print(f"ERROR: inference failed – {err}", file=sys.stderr)
        _maybe_print_vram_hint(err, args.n_gpu_layers, "during inference")
        sys.exit(1)

    # Completion dict from llama-cpp-python, pretty-printed as UTF-8 JSON.
    print(json.dumps(output, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()
13 changes: 13 additions & 0 deletions llama_cpp/server/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import os
import json
import logging
import typing
import contextlib

Expand All @@ -17,6 +18,7 @@
from fastapi import Depends, FastAPI, APIRouter, Request, HTTPException, status, Body
from fastapi.middleware import Middleware
from fastapi.middleware.cors import CORSMiddleware
from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware
from fastapi.security import HTTPBearer
from sse_starlette.sse import EventSourceResponse
from starlette_context.plugins import RequestIdPlugin # type: ignore
Expand Down Expand Up @@ -130,7 +132,18 @@ def create_app(
)

set_server_settings(server_settings)

logger = logging.getLogger(__name__)
ssl_enabled = bool(server_settings.ssl_keyfile and server_settings.ssl_certfile)
if not ssl_enabled:
logger.warning(
"SSL is not configured. The server is running over plain HTTP, "
"which is not secure. Pass --ssl_keyfile and --ssl_certfile to enable HTTPS."
)

middleware = [Middleware(RawContextMiddleware, plugins=(RequestIdPlugin(),))]
if ssl_enabled:
middleware.append(Middleware(HTTPSRedirectMiddleware))
app = FastAPI(
middleware=middleware,
title="🦙 llama.cpp Python API",
Expand Down
2 changes: 1 addition & 1 deletion llama_cpp/server/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ class ServerSettings(BaseSettings):

# Uvicorn Settings
host: str = Field(default="localhost", description="Listen address")
port: int = Field(default=8000, description="Listen port")
port: int = Field(default=8080, description="Listen port")
ssl_keyfile: Optional[str] = Field(
default=None, description="SSL key file for HTTPS"
)
Expand Down