AutoSQL/AutoSQL.py at main · PrajwalAmte/AutoSQL · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
#!/usr/bin/env python3
"""
AutoSQL — Self-optimizing SQL Query Pipeline
─────────────────────────────────────────────
Inspired by karpathy/autoresearch.

The loop:
  LLM rewrites query → run it → measure (speed + correctness) → keep if better → repeat

Usage:
  python AutoSQL.py --query slow.sql --db mydb.sqlite
  python AutoSQL.py --query "SELECT ..." --db mydb.sqlite --iterations 15
  python AutoSQL.py --query "SELECT ..." --db mydb.sqlite --model gpt-4o --provider openai
  python AutoSQL.py --query "SELECT ..." --db mydb.sqlite --model claude-opus-4-5 --provider anthropic
  python AutoSQL.py --query "SELECT ..." --db mydb.sqlite --model llama3 --provider ollama
  python AutoSQL.py --query "SELECT ..." --db mydb.sqlite --model llama3 --base-url http://localhost:11434/v1

Requirements:
  pip install openai            # Groq / OpenAI / Ollama / Together / OpenRouter
  pip install anthropic         # only for Anthropic / Claude models
  export GROQ_API_KEY=gsk_...   # free at https://console.groq.com  (default provider)
  export OPENAI_API_KEY=sk-...  # for OpenAI models
  export ANTHROPIC_API_KEY=...  # for Claude models
"""

import os
import sqlite3
import time
import json
import argparse
import hashlib
from pathlib import Path

# Defaults used when --model / --provider are not supplied.
DEFAULT_PROVIDER = "groq"
DEFAULT_MODEL = "llama-3.3-70b-versatile"

# Provider registry: name → OpenAI-compatible base URL + env-var for the API key.
# A base URL of None means no explicit URL is passed to the client; a key env
# of None means no API key is read from the environment for that provider.
PROVIDERS: dict[str, dict] = {
    name: {"base_url": endpoint, "key_env": env_var}
    for name, endpoint, env_var in (
        ("groq", "https://api.groq.com/openai/v1", "GROQ_API_KEY"),
        ("openai", None, "OPENAI_API_KEY"),
        ("anthropic", None, "ANTHROPIC_API_KEY"),
        ("ollama", "http://localhost:11434/v1", None),
        ("together", "https://api.together.xyz/v1", "TOGETHER_API_KEY"),
        ("openrouter", "https://openrouter.ai/api/v1", "OPENROUTER_API_KEY"),
    )
}

# ─── Helpers ──────────────────────────────────────────────────────────────────

def detect_provider(model: str, base_url: str | None) -> str:
    """Infer provider from model name when --provider is not given."""
    if base_url:
        return "openai"  # treat any custom URL as OpenAI-compatible
    m = model.lower()
    if m.startswith("claude"):
        return "anthropic"
    if m.startswith(("gpt-", "o1", "o3", "ft:gpt")):
        return "openai"
    return "groq"  # default — backwards-compatible


def llm_generate(model: str, prompt: str, provider: str, base_url: str | None = None) -> str:
    """Call the chosen LLM provider and return the stripped completion text.

    Args:
        model: Model identifier passed through to the provider SDK.
        prompt: The full user prompt (sent as a single user message).
        provider: Provider name; "anthropic" uses the Anthropic SDK, every
            other value goes through the OpenAI-compatible client.
        base_url: Optional endpoint override; takes precedence over the
            registry's base URL for OpenAI-compatible providers.

    Returns:
        The non-empty completion text with surrounding whitespace removed.

    Raises:
        RuntimeError: if the required SDK is not installed, the API key
            environment variable is unset, or the provider returned no text.
            SDK/network exceptions are propagated unchanged.
    """
    cfg = PROVIDERS.get(provider, {})

    # ── Anthropic ─────────────────────────────────────────────────────────────
    if provider == "anthropic":
        try:
            from anthropic import Anthropic
        except ImportError as exc:
            raise RuntimeError("Run: pip install anthropic") from exc
        api_key = os.environ.get("ANTHROPIC_API_KEY")
        if not api_key:
            raise RuntimeError("ANTHROPIC_API_KEY not set. Get a key → https://console.anthropic.com")
        client = Anthropic(api_key=api_key)
        msg = client.messages.create(
            model=model,
            max_tokens=1024,
            messages=[{"role": "user", "content": prompt}],
        )
        # The content list may be empty or start with a non-text block, so take
        # the first block that actually carries text instead of indexing [0].
        text = next(
            (block.text for block in (msg.content or []) if getattr(block, "text", None)),
            "",
        )
    else:
        # ── OpenAI-compatible (openai / groq / ollama / together / openrouter / custom) ──
        try:
            from openai import OpenAI
        except ImportError as exc:
            raise RuntimeError("Run: pip install openai") from exc

        url     = base_url or cfg.get("base_url")
        key_env = cfg.get("key_env", "OPENAI_API_KEY")
        api_key = os.environ.get(key_env) if key_env else "ollama"  # Ollama ignores the key
        if not api_key:
            raise RuntimeError(f"{key_env} not set.")

        client_kwargs: dict = {"api_key": api_key}
        if url:
            client_kwargs["base_url"] = url

        client = OpenAI(**client_kwargs)
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
            max_tokens=1024,
        )
        # `message.content` can be None (e.g. refusals / filtered output) and
        # `choices` can be empty; treat both as "no text" rather than crashing.
        choices = resp.choices
        text = (choices[0].message.content if choices else None) or ""

    text = text.strip()
    if not text:
        # Surface as RuntimeError so callers that already handle provider
        # failures this way do not crash with AttributeError/IndexError.
        raise RuntimeError("LLM returned an empty response.")
    return text

def get_schema(conn: sqlite3.Connection) -> str:
    """Return a compact text representation of every table and its columns.

    Each table listed in `sqlite_master` (ordered by name) becomes one line of
    the form ``  table(col1 TYPE1, col2 TYPE2)``; lines are joined with
    newlines. An empty database yields an empty string.
    """
    tables = conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
    ).fetchall()
    parts: list[str] = []
    for (tbl,) in tables:
        # Quote the identifier (doubling embedded quotes) so table names with
        # spaces, keywords or punctuation do not break the PRAGMA statement.
        quoted = '"' + tbl.replace('"', '""') + '"'
        cols = conn.execute(f"PRAGMA table_info({quoted})").fetchall()
        # table_info rows: index 1 is the column name, index 2 its declared type.
        col_str = ", ".join(f"{c[1]} {c[2]}" for c in cols)
        parts.append(f"  {tbl}({col_str})")
    return "\n".join(parts)


def get_query_plan(conn: sqlite3.Connection, query: str) -> str:
    """Render SQLite's EXPLAIN QUERY PLAN output for `query`, one step per line.

    Each line is the fourth column of a plan row, indented by two spaces.
    Any failure while obtaining the plan is swallowed on purpose and replaced
    by a placeholder, since the plan is only advisory context.
    """
    steps: list[str] = []
    try:
        for plan_row in conn.execute(f"EXPLAIN QUERY PLAN {query}"):
            steps.append(f"  {plan_row[3]}")
    except Exception:
        return "  (plan unavailable)"
    return "\n".join(steps)


def run_query(
    conn: sqlite3.Connection, query: str, runs: int = 3, timeout_ms: float = 0
) -> tuple[float, str | None, str | None]:
    """
    Execute `query` multiple times and return (avg_ms, result_hash, error).
    The hash is computed on the first run; timing is averaged over all runs.
    If timeout_ms > 0, SQLite's progress handler interrupts any run that exceeds
    the limit (checked every ~1 000 SQLite VM opcodes).
    """
    times: list[float] = []
    result_hash: str | None = None

    for i in range(runs):
        deadline: list[float] = []

        if timeout_ms > 0:
            deadline.append(time.perf_counter() + timeout_ms / 1_000)

            def _check_timeout() -> bool:
                return time.perf_counter() >= deadline[0]

            conn.set_progress_handler(_check_timeout, 1000)

        try:
            t0   = time.perf_counter()
            rows = conn.execute(query).fetchall()
            elapsed_ms = (time.perf_counter() - t0) * 1_000
        except Exception as exc:
            if timeout_ms > 0:
                conn.set_progress_handler(None, 0)
            msg = str(exc)
            if "interrupted" in msg.lower() or "callback" in msg.lower():
                return 0.0, None, f"timeout after {timeout_ms:.0f} ms"
            return 0.0, None, msg
        finally:
            if timeout_ms > 0:
                conn.set_progress_handler(None, 0)

        times.append(elapsed_ms)
        if i == 0:
            result_hash = hashlib.md5(str(sorted(rows)).encode()).hexdigest()

    return sum(times) / len(times), result_hash, None


# ─── LLM Optimizer ────────────────────────────────────────────────────────────

def build_prompt(
    schema: str,
    original_query: str,
    current_best: str,
    history: list[dict],
    iteration: int,
    query_plan: str = "",
) -> str:
    """Assemble the optimisation prompt sent to the LLM.

    The prompt contains the schema, the baseline query, the current best query
    (labelled with `iteration - 1`), an optional query-plan section, and a
    summary of at most the five most recent entries of `history`. Each history
    entry is read via the keys "iteration", "time_ms", "speedup", "correct",
    "error" and "query" (the query is cut to its first 300 characters).
    """

    def _describe(entry: dict) -> str:
        # Two lines per attempt: stats + verdict, then the (truncated) query.
        verdict = "✓ correct" if entry["correct"] else "✗ wrong result"
        failure = f"  error: {entry['error']}" if entry["error"] else ""
        return (
            f"  [{entry['iteration']}] {entry['time_ms']:.1f}ms | {entry['speedup']:.2f}x speedup | {verdict}{failure}\n"
            f"       {entry['query'][:300].strip()}\n\n"
        )

    recent = history[-5:] if history else []
    if recent:
        attempts = "\n\nPrevious attempts (learn from these):\n" + "".join(
            _describe(entry) for entry in recent
        )
    else:
        attempts = ""

    if query_plan:
        plan_section = f"\nQUERY PLAN (current best):\n{query_plan}\n"
    else:
        plan_section = ""

    return f"""You are an expert SQL optimizer. Rewrite the query below to run as fast as possible
while returning byte-for-byte IDENTICAL results (same rows, same order).

SCHEMA:
{schema}

ORIGINAL QUERY (baseline):
{original_query}

CURRENT BEST QUERY (iteration {iteration - 1}):
{current_best}
{plan_section}{attempts}
OPTIMIZATION TECHNIQUES TO CONSIDER:
  • Replace correlated subqueries with pre-aggregated JOINs or CTEs
  • Use window functions (AVG OVER, SUM OVER) instead of self-joins
  • Push WHERE filters as early as possible (before joins)
  • Avoid re-scanning large tables multiple times
  • Minimise columns projected inside subqueries

Return ONLY the raw SQL — no explanation, no markdown fences, no backticks."""


def optimize(
    model: str,
    schema: str,
    original_query: str,
    current_best: str,
    history: list[dict],
    iteration: int,
    provider: str = DEFAULT_PROVIDER,
    base_url: str | None = None,
    query_plan: str = "",
) -> str:
    """Request a rewritten query from the LLM and return it as plain SQL.

    Builds the prompt with `build_prompt`, sends it through `llm_generate`, and
    — when the reply begins with a markdown fence — drops every line that
    starts with a fence marker before returning the remainder.
    """
    reply = llm_generate(
        model,
        build_prompt(schema, original_query, current_best, history, iteration, query_plan),
        provider,
        base_url,
    )

    # Nothing to clean up unless the model opened with a code fence.
    if not reply.startswith("```"):
        return reply

    sql_lines = [line for line in reply.splitlines() if not line.startswith("```")]
    return "\n".join(sql_lines).strip()


# ─── Main Loop ────────────────────────────────────────────────────────────────

def autosql(
    conn: sqlite3.Connection,
    query: str,
    iterations: int = 10,
    model: str = DEFAULT_MODEL,
    provider: str | None = None,
    base_url: str | None = None,
    min_speedup: float = 0.0,
    timeout_ms: float = 0,
) -> str | None:
    """Run the optimise → execute → compare loop and return the best query.

    The baseline query is timed and hashed first. Each iteration then asks the
    LLM for a rewrite, executes it, and accepts it as the new best only when
    its result hash equals the baseline hash and it ran faster than the current
    best. Progress is printed as it goes, and a JSON log is written to
    ``autosql_log.json`` in the current working directory.

    Returns:
        The fastest correct query found (the original if nothing beat it), or
        None when the baseline query itself fails.
    """
    schema = get_schema(conn)
    if not provider:
        provider = detect_provider(model, base_url)

    rule = "─" * 62
    print(f"\n{rule}")
    print("  AutoSQL — Self-optimizing Query Pipeline")
    print(f"  Model    : {model}")
    print(f"  Provider : {provider}")
    print(rule)
    print(f"\nSchema:\n{schema}\n")
    print(f"Query:\n{query}\n")
    print(rule)

    # ── Baseline ──────────────────────────────────────────────────────────────
    print("\n  Measuring baseline …", end=" ", flush=True)
    baseline_ms, baseline_hash, baseline_error = run_query(conn, query, timeout_ms=timeout_ms)
    if baseline_error:
        print(f"\n✗ Baseline query failed: {baseline_error}")
        return None
    print(f"{baseline_ms:.1f} ms\n")

    # The baseline is the initial "best"; it is replaced only by correct, faster rewrites.
    best_query, best_ms, best_speedup = query, baseline_ms, 1.0
    history: list[dict] = []

    # ── Optimisation loop ─────────────────────────────────────────────────────
    for step in range(1, iterations + 1):
        print(f"  [{step:02d}/{iterations:02d}] Generating … ", end="", flush=True)

        plan = get_query_plan(conn, best_query)
        try:
            candidate = optimize(model, schema, query, best_query, history, step, provider, base_url, plan)
        except RuntimeError as exc:
            # Provider/configuration failure: report it and stop iterating.
            print(f"✗  {exc}")
            break

        candidate_ms, candidate_hash, failure = run_query(conn, candidate, timeout_ms=timeout_ms)

        matches_baseline = (not failure) and (candidate_hash == baseline_hash)
        if matches_baseline and candidate_ms > 0:
            speedup = baseline_ms / candidate_ms
        else:
            speedup = 0.0
        is_new_best = matches_baseline and candidate_ms < best_ms

        if is_new_best:
            best_query, best_ms, best_speedup = candidate, candidate_ms, speedup

        # Verdict string
        if failure:
            verdict = f"✗  error: {failure[:55]}"
        elif not matches_baseline:
            verdict = "✗  wrong result"
        elif is_new_best:
            verdict = f"✓  {candidate_ms:.1f} ms  ({speedup:.2f}× faster)  ← NEW BEST"
        else:
            verdict = f"✓  {candidate_ms:.1f} ms  ({speedup:.2f}×)  no improvement"

        print(verdict)

        history.append({
            "iteration": step,
            "query":     candidate,
            "time_ms":   candidate_ms,
            "speedup":   speedup,
            "correct":   matches_baseline,
            "error":     failure,
        })

        target_reached = min_speedup > 0 and best_speedup >= min_speedup
        if target_reached:
            print(f"\n  Target speedup {min_speedup:.2f}× reached — stopping early.")
            break

    # ── Report ────────────────────────────────────────────────────────────────
    print(f"\n{rule}")
    print("  Final Report")
    print(rule)
    print(f"  Baseline : {baseline_ms:.1f} ms")
    print(f"  Best     : {best_ms:.1f} ms  ({best_speedup:.2f}× faster)\n")
    print("  Best Query:\n")
    for sql_line in best_query.splitlines():
        print(f"    {sql_line}")

    # Persist the full run (including every attempt) next to the caller's cwd.
    report = {
        "model":            model,
        "schema":           schema,
        "baseline_query":   query,
        "baseline_ms":      baseline_ms,
        "best_query":       best_query,
        "best_ms":          best_ms,
        "speedup":          best_speedup,
        "iterations":       history,
    }
    log_path = Path("autosql_log.json")
    log_path.write_text(json.dumps(report, indent=2))
    print(f"\n  Log → {log_path.resolve()}")
    print(rule + "\n")

    return best_query


# ─── CLI ──────────────────────────────────────────────────────────────────────

def main() -> None:
    """Command-line entry point: parse arguments and run the optimisation loop.

    `--query` is read from disk when its value ends with ".sql"; otherwise it
    is used verbatim as the SQL text. The SQLite connection opened for `--db`
    is always closed before returning, even if the run raises.
    """
    parser = argparse.ArgumentParser(
        description="AutoSQL: self-optimizing SQL query pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python AutoSQL.py --query slow.sql --db app.sqlite
  python AutoSQL.py --query "SELECT ..." --db app.sqlite --iterations 15
  python AutoSQL.py --query "SELECT ..." --db app.sqlite --model gpt-4o --provider openai
  python AutoSQL.py --query "SELECT ..." --db app.sqlite --model claude-opus-4-5 --provider anthropic
  python AutoSQL.py --query "SELECT ..." --db app.sqlite --model llama3 --provider ollama
        """,
    )
    parser.add_argument("--query", type=str, required=True,
                        help="SQL query string or path to a .sql file")
    parser.add_argument("--db", type=str, required=True,
                        help="Path to SQLite database file")
    parser.add_argument("--iterations", type=int, default=10,
                        help="Optimisation iterations (default: 10)")
    parser.add_argument("--model", type=str, default=DEFAULT_MODEL,
                        help=f"LLM model to use (default: {DEFAULT_MODEL})")
    parser.add_argument("--provider", type=str, default=None,
                        choices=list(PROVIDERS.keys()),
                        help="LLM provider (auto-detected from model name if omitted)")
    parser.add_argument("--base-url", type=str, default=None,
                        help="Custom OpenAI-compatible base URL (e.g. http://localhost:11434/v1)")
    parser.add_argument("--min-speedup", type=float, default=0.0,
                        help="Stop early once this speedup multiplier is reached (e.g. 3.0)")
    parser.add_argument("--timeout", type=float, default=0,
                        help="Per-execution timeout in milliseconds; 0 = unlimited (default: 0)")
    args = parser.parse_args()

    # Resolve the query text before opening the database so a missing or
    # unreadable .sql file cannot leave a connection open.
    query = (
        Path(args.query).read_text()
        if args.query.endswith(".sql")
        else args.query
    )

    conn = sqlite3.connect(args.db)
    try:
        autosql(conn, query, iterations=args.iterations, model=args.model,
                provider=args.provider, base_url=args.base_url,
                min_speedup=args.min_speedup, timeout_ms=args.timeout)
    finally:
        # Release the database handle on every exit path.
        conn.close()


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()