From 2c35b393c82dd740c60db8f2f74777f487847974 Mon Sep 17 00:00:00 2001
From: Tai An
Date: Mon, 20 Apr 2026 17:00:15 -0700
Subject: [PATCH] fix: improve error message when model loading fails

When llama_model_load_from_file returns None, the error was a generic
'Failed to load model from file'. With verbose=False (the default), the
llama.cpp log is suppressed, leaving users with no actionable information.

The error message now:
- includes the file size in GB to help diagnose OOM
- lists common causes (insufficient RAM/VRAM, unsupported format, corrupt file)
- suggests verbose=True to see the full llama.cpp error log
- suggests n_gpu_layers=-1 for GPU offloading

Fixes #2145
---
 llama_cpp/_internals.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index cde52c8c8..9ffabdf46 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -56,7 +56,19 @@ def __init__(
         )

         if model is None:
-            raise ValueError(f"Failed to load model from file: {path_model}")
+            try:
+                size_hint = f" (file size: {os.path.getsize(path_model) / (1024**3):.1f} GB)"
+            except OSError:
+                size_hint = ""
+            raise ValueError(
+                f"Failed to load model from file: {path_model}{size_hint}.\n"
+                "Common causes: insufficient RAM or VRAM for the model size, "
+                "unsupported quantization format, or corrupt file.\n"
+                "Tip: set verbose=True to see the full llama.cpp log, "
+                "or use n_gpu_layers=-1 to offload layers to GPU."
+            )

         vocab = llama_cpp.llama_model_get_vocab(model)