From d815bba0e3f2b5505d38344844cd1b8c32f66a3f Mon Sep 17 00:00:00 2001
From: Nausicaa Li <2239638+nausicaalii@users.noreply.github.com>
Date: Sat, 11 Apr 2026 15:51:51 -0700
Subject: [PATCH 1/2] perf: vectorize prefix matching with numpy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the O(n) Python for-loops in KV cache prefix matching and
longest_token_prefix() with a vectorized numpy comparison.

The element-wise numpy comparison runs in optimized C/SIMD
instead of Python's interpreter loop, which matters as
conversation history grows (10K+ tokens).

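No change in the returned value — the old loop and the new
vectorized code both yield the first position where the cached
and new token sequences diverge. Unlike the loop, the vectorized
comparison always covers the full overlapping range rather than
stopping at the first mismatch.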
---
 llama_cpp/llama.py | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 11fe169cf..6529a75c8 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -887,12 +887,18 @@ def generate(
 
         # Check for kv cache prefix match
         if reset and self.n_tokens > 0:
-            longest_prefix = 0
-            for a, b in zip(self._input_ids, tokens[:-1]):
-                if a == b:
-                    longest_prefix += 1
-                else:
-                    break
+            cached = self._input_ids
+            n = min(len(cached), len(tokens) - 1)
+            if n > 0:
+                eq = np.asarray(cached[:n]) == np.asarray(
+                    tokens[:n]
+                )
+                mismatch = np.argmin(eq)
+                longest_prefix = (
+                    int(n) if eq[mismatch] else int(mismatch)
+                )
+            else:
+                longest_prefix = 0
             if longest_prefix > 0:
                 if self._ctx.kv_cache_seq_rm(-1, longest_prefix, -1):
                     reset = False
@@ -2252,13 +2258,12 @@ def logits_to_logprobs(
 
     @staticmethod
     def longest_token_prefix(a: Sequence[int], b: Sequence[int]):
-        longest_prefix = 0
-        for _a, _b in zip(a, b):
-            if _a == _b:
-                longest_prefix += 1
-            else:
-                break
-        return longest_prefix
+        n = min(len(a), len(b))
+        if n == 0:
+            return 0
+        eq = np.asarray(a[:n]) == np.asarray(b[:n])
+        mismatch = np.argmin(eq)
+        return int(n) if eq[mismatch] else int(mismatch)
 
     @classmethod
     def from_pretrained(

From aeb7d7cfc67370dbd807cbbaf2ef03eb09e51dcb Mon Sep 17 00:00:00 2001
From: Nausicaa Li <2239638+nausicaalii@users.noreply.github.com>
Date: Sun, 19 Apr 2026 22:25:11 -0700
Subject: [PATCH 2/2] refactor: deduplicate prefix matching and eliminate
 .tolist() overhead

Replace the inline prefix matching in generate() with a call to
longest_token_prefix(). Remove .tolist() conversions in
_create_completion() so numpy arrays are compared directly, avoiding
list conversion overhead. Widen the longest_token_prefix() type hints
to accept numpy arrays and annotate its int return type.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 llama_cpp/llama.py | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 6529a75c8..5899c3687 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -887,18 +887,9 @@ def generate(
 
         # Check for kv cache prefix match
         if reset and self.n_tokens > 0:
-            cached = self._input_ids
-            n = min(len(cached), len(tokens) - 1)
-            if n > 0:
-                eq = np.asarray(cached[:n]) == np.asarray(
-                    tokens[:n]
-                )
-                mismatch = np.argmin(eq)
-                longest_prefix = (
-                    int(n) if eq[mismatch] else int(mismatch)
-                )
-            else:
-                longest_prefix = 0
+            longest_prefix = Llama.longest_token_prefix(
+                self._input_ids, tokens[:-1]
+            )
             if longest_prefix > 0:
                 if self._ctx.kv_cache_seq_rm(-1, longest_prefix, -1):
                     reset = False
@@ -1319,10 +1310,10 @@ def logit_bias_processor(
             try:
                 cache_item = self.cache[prompt_tokens]
                 cache_prefix_len = Llama.longest_token_prefix(
-                    cache_item.input_ids.tolist(), prompt_tokens
+                    cache_item.input_ids, prompt_tokens
                 )
                 eval_prefix_len = Llama.longest_token_prefix(
-                    self._input_ids.tolist(), prompt_tokens
+                    self._input_ids, prompt_tokens
                 )
                 if cache_prefix_len > eval_prefix_len:
                     self.load_state(cache_item)
@@ -2257,7 +2248,10 @@ def logits_to_logprobs(
         return subtract_maxs - out
 
     @staticmethod
-    def longest_token_prefix(a: Sequence[int], b: Sequence[int]):
+    def longest_token_prefix(
+        a: Union[Sequence[int], npt.NDArray[np.intc]],
+        b: Union[Sequence[int], npt.NDArray[np.intc]],
+    ) -> int:
         n = min(len(a), len(b))
         if n == 0:
             return 0