From d815bba0e3f2b5505d38344844cd1b8c32f66a3f Mon Sep 17 00:00:00 2001 From: Nausicaa Li <2239638+nausicaalii@users.noreply.github.com> Date: Sat, 11 Apr 2026 15:51:51 -0700 Subject: [PATCH 1/2] perf: vectorize prefix matching with numpy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace O(n) Python for-loop in KV cache prefix matching and longest_token_prefix() with numpy vectorized comparison. The element-wise numpy comparison runs in optimized C/SIMD instead of Python's interpreter loop, which matters as conversation history grows (10K+ tokens). No change in behavior — both paths find the first position where cached and new token sequences diverge. --- llama_cpp/llama.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 11fe169cf..6529a75c8 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -887,12 +887,18 @@ def generate( # Check for kv cache prefix match if reset and self.n_tokens > 0: - longest_prefix = 0 - for a, b in zip(self._input_ids, tokens[:-1]): - if a == b: - longest_prefix += 1 - else: - break + cached = self._input_ids + n = min(len(cached), len(tokens) - 1) + if n > 0: + eq = np.asarray(cached[:n]) == np.asarray( + tokens[:n] + ) + mismatch = np.argmin(eq) + longest_prefix = ( + int(n) if eq[mismatch] else int(mismatch) + ) + else: + longest_prefix = 0 if longest_prefix > 0: if self._ctx.kv_cache_seq_rm(-1, longest_prefix, -1): reset = False @@ -2252,13 +2258,12 @@ def logits_to_logprobs( @staticmethod def longest_token_prefix(a: Sequence[int], b: Sequence[int]): - longest_prefix = 0 - for _a, _b in zip(a, b): - if _a == _b: - longest_prefix += 1 - else: - break - return longest_prefix + n = min(len(a), len(b)) + if n == 0: + return 0 + eq = np.asarray(a[:n]) == np.asarray(b[:n]) + mismatch = np.argmin(eq) + return int(n) if eq[mismatch] else int(mismatch) @classmethod def from_pretrained( From aeb7d7cfc67370dbd807cbbaf2ef03eb09e51dcb Mon Sep 17 00:00:00 2001 From: Nausicaa Li <2239638+nausicaalii@users.noreply.github.com> Date: Sun, 19 Apr 2026 22:25:11 -0700 Subject: [PATCH 2/2] refactor: deduplicate prefix matching and eliminate .tolist() overhead Replace the inline prefix matching in generate() with a call to longest_token_prefix(). Remove .tolist() conversions in _create_completion() so numpy arrays are compared directly, avoiding list conversion overhead. Co-Authored-By: Claude Opus 4.6 --- llama_cpp/llama.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6529a75c8..5899c3687 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -887,18 +887,9 @@ def generate( # Check for kv cache prefix match if reset and self.n_tokens > 0: - cached = self._input_ids - n = min(len(cached), len(tokens) - 1) - if n > 0: - eq = np.asarray(cached[:n]) == np.asarray( - tokens[:n] - ) - mismatch = np.argmin(eq) - longest_prefix = ( - int(n) if eq[mismatch] else int(mismatch) - ) - else: - longest_prefix = 0 + longest_prefix = Llama.longest_token_prefix( + self._input_ids, tokens[:-1] + ) if longest_prefix > 0: if self._ctx.kv_cache_seq_rm(-1, longest_prefix, -1): reset = False @@ -1319,10 +1310,10 @@ def logit_bias_processor( try: cache_item = self.cache[prompt_tokens] cache_prefix_len = Llama.longest_token_prefix( - cache_item.input_ids.tolist(), prompt_tokens + cache_item.input_ids, prompt_tokens ) eval_prefix_len = Llama.longest_token_prefix( - self._input_ids.tolist(), prompt_tokens + self._input_ids, prompt_tokens ) if cache_prefix_len > eval_prefix_len: self.load_state(cache_item) @@ -2257,7 +2248,10 @@ def logits_to_logprobs( return subtract_maxs - out @staticmethod - def longest_token_prefix(a: Sequence[int], b: Sequence[int]): + def longest_token_prefix( + a: Union[Sequence[int], npt.NDArray[np.intc]], + b: Union[Sequence[int], npt.NDArray[np.intc]], + ) -> int: n = min(len(a), len(b)) if n == 0: return 0