Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,20 @@ Settings are initialized by `openkb init`, and stored in `.openkb/config.yaml`:
model: gpt-5.4 # LLM model (any LiteLLM-supported provider)
language: en # Wiki output language
pageindex_threshold: 20 # PDF pages threshold for PageIndex
storage_backend: sqlite # Storage backend: sqlite (default) or json
```

### Storage Backend

OpenKB supports two storage backends for the file hash registry:

| Backend | Description | Use Case |
|---------|-------------|----------|
| `sqlite` | SQLite database (default) | Better concurrency, scalability, recommended for production |
| `json` | JSON file | Simple, human-readable, for small installations |

Migration from JSON to SQLite happens automatically when you switch to the `sqlite` backend and a `hashes.json` file exists. The JSON file is preserved but is no longer used.

Model names use `provider/model` LiteLLM [format](https://docs.litellm.ai/docs/providers) (OpenAI models can omit the prefix):

| Provider | Model example |
Expand Down
31 changes: 18 additions & 13 deletions openkb/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,14 +138,15 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
4. Else: compile_short_doc.
"""
from openkb.agent.compiler import compile_long_doc, compile_short_doc
from openkb.state import HashRegistry
from openkb.state import get_registry

logger = logging.getLogger(__name__)
openkb_dir = kb_dir / ".openkb"
config = load_config(openkb_dir / "config.yaml")
_setup_llm_key(kb_dir)
model: str = config.get("model", DEFAULT_CONFIG["model"])
registry = HashRegistry(openkb_dir / "hashes.json")
backend = config.get("storage_backend", "sqlite")
registry = get_registry(openkb_dir, backend=backend)

# 2. Convert document
click.echo(f"Adding: {file_path.name}")
Expand Down Expand Up @@ -299,9 +300,10 @@ def init():
"model": model,
"language": DEFAULT_CONFIG["language"],
"pageindex_threshold": DEFAULT_CONFIG["pageindex_threshold"],
"storage_backend": DEFAULT_CONFIG["storage_backend"],
}
save_config(openkb_dir / "config.yaml", config)
(openkb_dir / "hashes.json").write_text(json.dumps({}), encoding="utf-8")
# The SQLite DB is created lazily by get_registry() on first access; no pre-creation needed

# Write API key to KB-local .env (0600) if the user provided one
if api_key:
Expand Down Expand Up @@ -591,13 +593,13 @@ def lint(ctx, fix):

def print_list(kb_dir: Path) -> None:
"""Print all documents in the knowledge base. Usable from CLI and chat REPL."""
openkb_dir = kb_dir / ".openkb"
hashes_file = openkb_dir / "hashes.json"
if not hashes_file.exists():
click.echo("No documents indexed yet.")
return
from openkb.state import get_registry

hashes = json.loads(hashes_file.read_text(encoding="utf-8"))
openkb_dir = kb_dir / ".openkb"
config = load_config(openkb_dir / "config.yaml")
backend = config.get("storage_backend", "sqlite")
registry = get_registry(openkb_dir, backend=backend)
hashes = registry.all_entries()
if not hashes:
click.echo("No documents indexed yet.")
return
Expand Down Expand Up @@ -678,11 +680,14 @@ def print_status(kb_dir: Path) -> None:
click.echo(f" {'raw':<20} {raw_count:<10}")

# Hash registry summary
from openkb.state import get_registry

openkb_dir = kb_dir / ".openkb"
hashes_file = openkb_dir / "hashes.json"
if hashes_file.exists():
hashes = json.loads(hashes_file.read_text(encoding="utf-8"))
click.echo(f"\n Total indexed: {len(hashes)} document(s)")
config = load_config(openkb_dir / "config.yaml")
backend = config.get("storage_backend", "sqlite")
registry = get_registry(openkb_dir, backend=backend)
hashes = registry.all_entries()
click.echo(f"\n Total indexed: {len(hashes)} document(s)")

# Last compile time: newest file in wiki/summaries/
summaries_dir = wiki_dir / "summaries"
Expand Down
1 change: 1 addition & 0 deletions openkb/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"model": "gpt-5.4-mini",
"language": "en",
"pageindex_threshold": 20,
"storage_backend": "sqlite",
}

GLOBAL_CONFIG_DIR = Path.home() / ".config" / "openkb"
Expand Down
7 changes: 4 additions & 3 deletions openkb/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from openkb.config import load_config
from openkb.images import copy_relative_images, extract_base64_images, convert_pdf_with_images
from openkb.state import HashRegistry
from openkb.state import get_registry

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -50,12 +50,13 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
openkb_dir = kb_dir / ".openkb"
config = load_config(openkb_dir / "config.yaml")
threshold: int = config.get("pageindex_threshold", 20)
registry = HashRegistry(openkb_dir / "hashes.json")
backend = config.get("storage_backend", "sqlite")
registry = get_registry(openkb_dir, backend=backend)

# ------------------------------------------------------------------
# 1. Hash check
# ------------------------------------------------------------------
file_hash = HashRegistry.hash_file(src)
file_hash = registry.hash_file(src)
if registry.is_known(file_hash):
logger.info("Skipping already-known file: %s", src.name)
return ConvertResult(skipped=True)
Expand Down
168 changes: 163 additions & 5 deletions openkb/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,19 @@

import hashlib
import json
import sqlite3
from contextlib import contextmanager
from pathlib import Path
from typing import Iterator


def _hash_file(path: Path) -> str:
    """Return the SHA-256 hex digest (64 chars) of the file at path."""
    digest = hashlib.sha256()
    # Stream the file in 64 KiB chunks so large files never load fully into memory.
    with path.open("rb") as stream:
        while chunk := stream.read(65536):
            digest.update(chunk)
    return digest.hexdigest()


class HashRegistry:
Expand Down Expand Up @@ -57,8 +69,154 @@ def _persist(self) -> None:
@staticmethod
def hash_file(path: Path) -> str:
"""Return the SHA-256 hex digest (64 chars) of the file at path."""
h = hashlib.sha256()
with path.open("rb") as fh:
for chunk in iter(lambda: fh.read(65536), b""):
h.update(chunk)
return h.hexdigest()
return _hash_file(path)


class DbRegistry:
    """SQLite-backed registry mapping file SHA-256 hashes to metadata dicts.

    Drop-in alternative to the JSON-backed ``HashRegistry`` with better
    scalability and concurrency (WAL journal mode).  Every public call opens
    a short-lived connection, so no connection state is held between
    operations.
    """

    def __init__(self, path: Path, migrate_from: Path | None = None) -> None:
        """Initialize the registry, creating the schema if needed.

        Args:
            path: Path to the SQLite database file.
            migrate_from: Optional JSON file (``HashRegistry`` format) whose
                entries are imported.  Migration only runs when the database
                file does not exist yet, so an already-migrated DB is never
                touched again.
        """
        self._path = path
        # Decide BEFORE _init_db() creates the file, otherwise the
        # exists() check below would always be True.
        should_migrate = migrate_from is not None and not path.exists()
        self._init_db()
        if should_migrate and migrate_from is not None:
            self._migrate_from_json(migrate_from)

    def _migrate_from_json(self, json_path: Path) -> None:
        """Import all hash -> metadata entries from a HashRegistry JSON file.

        The JSON file itself is left untouched. Missing file is a no-op.
        """
        if not json_path.exists():
            return

        with json_path.open("r", encoding="utf-8") as fh:
            data: dict[str, dict] = json.load(fh)

        rows = [
            (file_hash, json.dumps(metadata, ensure_ascii=False))
            for file_hash, metadata in data.items()
        ]
        with self._connect() as conn:
            # One executemany inside a single transaction: faster than
            # per-row execute and atomic if migration is interrupted.
            conn.executemany(
                "INSERT OR REPLACE INTO registry (file_hash, metadata_json)"
                " VALUES (?, ?)",
                rows,
            )

    def _init_db(self) -> None:
        """Create the database file and schema if they do not exist yet."""
        self._path.parent.mkdir(parents=True, exist_ok=True)

        with self._connect() as conn:
            # WAL mode is persistent (stored in the DB file) and allows
            # concurrent readers while one writer is active.
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute("""
                CREATE TABLE IF NOT EXISTS registry (
                    file_hash TEXT PRIMARY KEY,
                    metadata_json TEXT NOT NULL,
                    created_at TEXT DEFAULT CURRENT_TIMESTAMP,
                    updated_at TEXT DEFAULT CURRENT_TIMESTAMP
                )
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_created_at ON registry(created_at)
            """)

    @contextmanager
    def _connect(self) -> Iterator[sqlite3.Connection]:
        """Yield a connection, committing on success and always closing.

        If the body raises, no commit happens; SQLite rolls back the open
        transaction when the connection is closed.
        """
        conn = sqlite3.connect(str(self._path))
        try:
            yield conn
            conn.commit()
        finally:
            conn.close()

    def is_known(self, file_hash: str) -> bool:
        """Return True if file_hash is already registered."""
        with self._connect() as conn:
            cursor = conn.execute(
                "SELECT 1 FROM registry WHERE file_hash = ?",
                (file_hash,)
            )
            return cursor.fetchone() is not None

    def get(self, file_hash: str) -> dict | None:
        """Return metadata for file_hash, or None if not found."""
        with self._connect() as conn:
            cursor = conn.execute(
                "SELECT metadata_json FROM registry WHERE file_hash = ?",
                (file_hash,)
            )
            row = cursor.fetchone()
            if row is None:
                return None
            return json.loads(row[0])

    def all_entries(self) -> dict[str, dict]:
        """Return a shallow copy of all hash -> metadata entries."""
        with self._connect() as conn:
            cursor = conn.execute(
                "SELECT file_hash, metadata_json FROM registry"
            )
            return {
                row[0]: json.loads(row[1])
                for row in cursor.fetchall()
            }

    def add(self, file_hash: str, metadata: dict) -> None:
        """Register file_hash with metadata and persist to disk.

        If file_hash already exists, updates the metadata (upsert) and
        bumps updated_at; created_at is preserved.
        """
        metadata_json = json.dumps(metadata, ensure_ascii=False)
        with self._connect() as conn:
            conn.execute("""
                INSERT INTO registry (file_hash, metadata_json, updated_at)
                VALUES (?, ?, CURRENT_TIMESTAMP)
                ON CONFLICT(file_hash) DO UPDATE SET
                    metadata_json = excluded.metadata_json,
                    updated_at = CURRENT_TIMESTAMP
            """, (file_hash, metadata_json))

    @staticmethod
    def hash_file(path: Path) -> str:
        """Return the SHA-256 hex digest (64 chars) of the file at path."""
        return _hash_file(path)


def get_registry(
    openkb_dir: Path,
    backend: str = "sqlite",
) -> HashRegistry | DbRegistry:
    """Factory function to get the appropriate registry implementation.

    Args:
        openkb_dir: Path to .openkb directory.
        backend: Storage backend - "sqlite" or "json".

    Returns:
        HashRegistry for "json" backend, DbRegistry for "sqlite" backend.

    Raises:
        ValueError: If backend is neither "sqlite" nor "json".

    When switching from json to sqlite and a JSON file exists,
    the data is migrated automatically.
    """
    if backend not in ("sqlite", "json"):
        raise ValueError(f"Unknown storage_backend: {backend!r}")

    if backend == "json":
        return HashRegistry(openkb_dir / "hashes.json")

    # DbRegistry itself skips migration when the DB already exists or the
    # JSON file is missing, so the candidate path can always be passed.
    return DbRegistry(openkb_dir / "hashes.db", migrate_from=openkb_dir / "hashes.json")
9 changes: 5 additions & 4 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from unittest.mock import patch

import pytest
import yaml
from click.testing import CliRunner

from openkb.cli import cli
Expand Down Expand Up @@ -30,11 +31,11 @@ def test_init_creates_structure(tmp_path):
assert (cwd / "wiki" / "log.md").is_file()
assert (cwd / "wiki" / "index.md").is_file()
assert (cwd / ".openkb" / "config.yaml").is_file()
assert (cwd / ".openkb" / "hashes.json").is_file()
# The SQLite DB is created lazily by get_registry() on first access
assert not (cwd / ".openkb" / "hashes.json").exists()

# hashes.json is empty object
hashes = json.loads((cwd / ".openkb" / "hashes.json").read_text())
assert hashes == {}
config = yaml.safe_load((cwd / ".openkb" / "config.yaml").read_text())
assert config["storage_backend"] == "sqlite"

# index.md header
index_content = (cwd / "wiki" / "index.md").read_text()
Expand Down
37 changes: 37 additions & 0 deletions tests/test_config_storage_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""Tests for storage_backend config option."""
from __future__ import annotations

from pathlib import Path

from openkb.config import DEFAULT_CONFIG, load_config, save_config


def test_default_config_has_storage_backend():
    """The storage_backend key must be part of DEFAULT_CONFIG."""
    missing = "storage_backend" not in DEFAULT_CONFIG
    assert not missing


def test_default_storage_backend_is_sqlite():
    """'sqlite' is the out-of-the-box storage backend."""
    backend = DEFAULT_CONFIG.get("storage_backend")
    assert backend == "sqlite"


def test_load_config_includes_storage_backend(tmp_path):
    """A storage_backend value written with save_config round-trips through load_config."""
    cfg_file = tmp_path / "config.yaml"
    save_config(cfg_file, {"storage_backend": "json"})
    assert load_config(cfg_file)["storage_backend"] == "json"


def test_storage_backend_valid_values(tmp_path):
    """Both 'sqlite' and 'json' survive a save/load round trip."""
    cfg_file = tmp_path / "config.yaml"
    for backend in ("sqlite", "json"):
        save_config(cfg_file, {"storage_backend": backend})
        assert load_config(cfg_file)["storage_backend"] == backend
5 changes: 3 additions & 2 deletions tests/test_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,15 @@ def test_md_file_copied_to_wiki_sources(self, kb_dir):

def test_md_duplicate_skipped(self, kb_dir):
"""Second call with same file returns skipped=True when hash is registered."""
from openkb.state import HashRegistry
from openkb.state import get_registry

src = kb_dir / "raw" / "notes.md"
src.write_text("# Notes\n\nSome content here.", encoding="utf-8")

result1 = convert_document(src, kb_dir) # first call
# Simulate CLI registering the hash after successful compilation
registry = HashRegistry(kb_dir / ".openkb" / "hashes.json")
openkb_dir = kb_dir / ".openkb"
registry = get_registry(openkb_dir, backend="sqlite")
registry.add(result1.file_hash, {"name": src.name, "type": "md"})

result2 = convert_document(src, kb_dir) # second call
Expand Down
Loading