From 2bf28bf4e102a8aa1a8cfdd2cacd947e231797b9 Mon Sep 17 00:00:00 2001 From: Tim Felgentreff Date: Fri, 10 Apr 2026 10:33:46 +0200 Subject: [PATCH 1/7] Add jsonrpc pipe microbenchmark to mx harness --- .../python/micro/jsonrpc-pipe.py | 431 ++++++++++++++++++ mx.graalpython/mx_graalpython_bench_param.py | 2 + 2 files changed, 433 insertions(+) create mode 100644 graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py diff --git a/graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py b/graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py new file mode 100644 index 0000000000..d104a3ad00 --- /dev/null +++ b/graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py @@ -0,0 +1,431 @@ +# Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. +# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +# +# The Universal Permissive License (UPL), Version 1.0 +# +# Subject to the condition set forth below, permission is hereby granted to any +# person obtaining a copy of this software, associated documentation and/or +# data (collectively the "Software"), free of charge and under any and all +# copyright rights in the Software, and any and all patent rights owned or +# freely licensable by each licensor hereunder covering either (i) the +# unmodified Software as contributed to or provided by such licensor, or (ii) +# the Larger Works (as defined below), to deal in both +# +# (a) the Software, and +# +# (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if +# one is included with the Software each a "Larger Work" to which the Software +# is contributed by such licensors), +# +# without restriction, including without limitation the rights to copy, create +# derivative works of, display, perform, and distribute the Software and make, +# use, sell, offer for sale, import, export, have made, and have sold the +# Software and the Larger Work(s), and to 
sublicense the foregoing rights on +# either these or other terms. +# +# This license is subject to the following condition: +# +# The above copyright notice and either this complete permission notice or at a +# minimum a reference to the UPL must be included in all copies or substantial +# portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import annotations + +import argparse +import io +import json +import os +import re +import subprocess +import sys +import time + + +EMAIL_RE = re.compile(r"\s+") +NON_DIGIT_RE = re.compile(r"\D+") +_STATE = None + + +class Endpoint: + def __init__(self, mode, reader, writer, closeables=()): + self.mode = mode + self.reader = reader + self.writer = writer + self.closeables = closeables + + def write_message(self, message): + line = json.dumps(message, separators=(",", ":")) + if self.mode == "text": + self.writer.write(line) + self.writer.write("\n") + self.writer.flush() + return + payload = (line + "\n").encode("utf-8") + if self.mode == "buffer": + self.writer.write(payload) + self.writer.flush() + return + write_all(self.writer, payload) + + def read_message(self): + if self.mode == "text": + line = self.reader.readline() + else: + data = self.reader.readline() + line = data.decode("utf-8") if data else "" + if not line: + raise EOFError("unexpected EOF while reading line") + return json.loads(line) + + def close(self): + streams = self.closeables if self.closeables else (self.reader, self.writer) + for stream in streams: + if hasattr(stream, "close"): + 
try: + stream.close() + except OSError: + pass + + +class FDLineReader: + def __init__(self, fd): + self.fd = fd + self.pending = bytearray() + + def readline(self): + while True: + newline = self.pending.find(b"\n") + if newline >= 0: + line = bytes(self.pending[: newline + 1]) + del self.pending[: newline + 1] + return line + chunk = os.read(self.fd, 4096) + if not chunk: + if not self.pending: + return b"" + line = bytes(self.pending) + self.pending.clear() + return line + self.pending.extend(chunk) + + +class State: + def __init__(self, roundtrips, client_io, worker_io, workload, payload_bytes, batch_size): + self.roundtrips = roundtrips + self.client_io = client_io + self.worker_io = worker_io + self.workload = workload + self.payload_bytes = payload_bytes + self.batch_size = batch_size + self.next_request_id = 1 + self.process = None + self.endpoint = None + + +def write_all(fd, data): + view = memoryview(data) + while view: + written = os.write(fd, view) + view = view[written:] + + +def create_text_endpoint(read_raw, write_raw): + reader_buffer = io.BufferedReader(read_raw, buffer_size=8192) + writer_buffer = io.BufferedWriter(write_raw, buffer_size=8192) + reader = io.TextIOWrapper(reader_buffer, encoding="utf-8", newline=None) + writer = io.TextIOWrapper(writer_buffer, encoding="utf-8", newline="\n", line_buffering=False, write_through=False) + return Endpoint("text", reader, writer) + + +def create_buffer_endpoint(read_raw, write_raw): + reader = io.BufferedReader(read_raw, buffer_size=8192) + writer = io.BufferedWriter(write_raw, buffer_size=8192) + return Endpoint("buffer", reader, writer) + + +def create_fd_endpoint(read_fd, write_fd, closeables=()): + return Endpoint("fd", FDLineReader(read_fd), write_fd, closeables) + + +def create_parent_endpoint(process, mode): + if mode == "text": + return create_text_endpoint(process.stdout, process.stdin) + if mode == "buffer": + return create_buffer_endpoint(process.stdout, process.stdin) + return 
create_fd_endpoint(process.stdout.fileno(), process.stdin.fileno(), (process.stdout, process.stdin)) + + +def create_worker_endpoint(mode): + if mode == "text": + return Endpoint("text", sys.stdin, sys.stdout) + if mode == "buffer": + return Endpoint("buffer", sys.stdin.buffer, sys.stdout.buffer) + return create_fd_endpoint(0, 1) + + +def normalize_email(value): + return EMAIL_RE.sub("", value.strip().lower()) + + +def normalize_phone(value): + digits = NON_DIGIT_RE.sub("", value) + if digits.startswith("00"): + digits = digits[2:] + return digits + + +def mask_email(value): + name, _, domain = value.partition("@") + if not domain: + return "***" + return "%s***@%s" % (name[:1], domain) + + +def mask_phone(value): + if len(value) <= 4: + return "*" * len(value) + return "*" * (len(value) - 4) + value[-4:] + + +def mask_row(row): + email = normalize_email(str(row.get("email", ""))) + phone = normalize_phone(str(row.get("phone", ""))) + return { + "email_normalized": email, + "phone_normalized": phone, + "email_masked": mask_email(email) if email else None, + "phone_masked": mask_phone(phone) if phone else None, + "region": str(row.get("region", "")).upper(), + "source": str(row.get("source", "")).lower(), + } + + +def make_echo_payload(payload_bytes): + if payload_bytes <= 0: + return "" + unit = "payload-" + return (unit * ((payload_bytes // len(unit)) + 1))[:payload_bytes] + + +def make_mask_row(index, payload_bytes): + suffix = make_echo_payload(max(payload_bytes, 8)) + return { + "email": " User%s.%s@Example.COM " % (index, suffix), + "phone": "+49 (170) %04d-%s" % (index, suffix[:8]), + "region": "eu", + "source": "microbench", + } + + +def build_request(kind, request_id, payload_bytes, batch_size): + if kind == "health": + method = "health" + params = {} + elif kind == "echo": + method = "echo" + params = {"payload": make_echo_payload(payload_bytes)} + elif kind == "mask": + method = "mask" + params = make_mask_row(request_id, payload_bytes) + elif kind == 
"mask_batch": + method = "mask_batch" + params = {"rows": [make_mask_row(request_id + i, payload_bytes) for i in range(batch_size)]} + else: + raise AssertionError("unsupported request kind: %s" % kind) + return {"jsonrpc": "2.0", "id": request_id, "method": method, "params": params} + + +def handle_request(message): + request_id = message.get("id") + method = message.get("method") + params = message.get("params", {}) + if method == "health": + result = {"ok": True, "worker": "jsonrpc-pipe", "protocol": "json-rpc-2.0-ndjson"} + elif method == "echo": + payload = str(dict(params).get("payload", "")) + result = {"ok": True, "echo": payload, "size": len(payload)} + elif method == "mask": + result = {"ok": True, "normalized": mask_row(dict(params))} + elif method == "mask_batch": + rows = [mask_row(dict(row)) for row in dict(params).get("rows", [])] + result = {"ok": True, "normalized": rows, "count": len(rows)} + else: + return {"jsonrpc": "2.0", "id": request_id, "error": {"code": -32601, "message": "method not found"}} + return {"jsonrpc": "2.0", "id": request_id, "result": result} + + +def validate_response(request, response, kind, payload_bytes, batch_size): + if response.get("id") != request["id"]: + raise AssertionError("mismatched response id") + if "error" in response: + raise AssertionError("worker returned error: %s" % (response["error"],)) + result = response.get("result") + if not isinstance(result, dict) or not result.get("ok"): + raise AssertionError("unexpected response payload: %s" % (response,)) + if kind == "echo": + if result.get("echo") != make_echo_payload(payload_bytes): + raise AssertionError("echo payload mismatch") + elif kind == "mask": + expected = mask_row(make_mask_row(int(request["id"]), payload_bytes)) + if result.get("normalized") != expected: + raise AssertionError("mask result mismatch") + elif kind == "mask_batch": + expected_rows = [mask_row(make_mask_row(int(request["id"]) + i, payload_bytes)) for i in range(batch_size)] + if 
result.get("count") != batch_size or result.get("normalized") != expected_rows: + raise AssertionError("mask_batch result mismatch") + + +def run_roundtrips(state): + completed = 0 + for _ in range(state.roundtrips): + request = build_request(state.workload, state.next_request_id, state.payload_bytes, state.batch_size) + state.next_request_id += 1 + state.endpoint.write_message(request) + response = state.endpoint.read_message() + validate_response(request, response, state.workload, state.payload_bytes, state.batch_size) + completed += 1 + return completed + + +def parse_int(value): + if isinstance(value, int): + return value + return int(str(value).replace("_", "")) + + +def __process_args__(roundtrips=500, client_io="text", worker_io="text", workload="mask", payload_bytes=64, batch_size=8): + return [ + parse_int(roundtrips), + str(client_io), + str(worker_io), + str(workload), + parse_int(payload_bytes), + parse_int(batch_size), + ] + + +def __setup__(roundtrips=500, client_io="text", worker_io="text", workload="mask", payload_bytes=64, batch_size=8): + global _STATE + __teardown__() + state = State(roundtrips, client_io, worker_io, workload, payload_bytes, batch_size) + command = [ + sys.executable, + __file__, + "--worker", + "--worker-io=%s" % worker_io, + ] + process = subprocess.Popen( + command, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + bufsize=0, + ) + state.process = process + state.endpoint = create_parent_endpoint(process, client_io) + _STATE = state + + +def __benchmark__(roundtrips=500, client_io="text", worker_io="text", workload="mask", payload_bytes=64, batch_size=8): + if _STATE is None: + __setup__(roundtrips, client_io, worker_io, workload, payload_bytes, batch_size) + return run_roundtrips(_STATE) + + +def __teardown__(): + global _STATE + state = _STATE + _STATE = None + if state is None: + return + try: + if state.endpoint is not None: + state.endpoint.close() + finally: + if state.process is not None: + 
stderr = b"" + try: + stderr = state.process.stderr.read() if state.process.stderr is not None else b"" + except OSError: + pass + return_code = state.process.wait() + if return_code != 0: + raise RuntimeError("worker exited with status %d: %s" % (return_code, stderr.decode("utf-8", errors="replace"))) + + +def run_worker(worker_io): + endpoint = create_worker_endpoint(worker_io) + try: + while True: + try: + request = endpoint.read_message() + except EOFError: + return 0 + endpoint.write_message(handle_request(request)) + finally: + endpoint.close() + + +def run_direct(roundtrips, client_io, worker_io, workload, payload_bytes, batch_size): + start = time.perf_counter() + __setup__(roundtrips, client_io, worker_io, workload, payload_bytes, batch_size) + try: + completed = __benchmark__(roundtrips, client_io, worker_io, workload, payload_bytes, batch_size) + finally: + __teardown__() + wall = time.perf_counter() - start + print("roundtrips=%d" % completed) + print("wall_s=%s" % wall) + print("throughput_ops_s=%s" % (completed / wall if wall else 0.0)) + return 0 + + +def main(argv=None): + parser = argparse.ArgumentParser(description="Strict JSON-RPC-like pipe roundtrip microbenchmark.") + parser.add_argument("--worker", action="store_true") + parser.add_argument("--worker-io", choices=("text", "buffer", "fd"), default="text") + parser.add_argument("--roundtrips", type=parse_int, default=500) + parser.add_argument("--client-io", choices=("text", "buffer", "fd"), default="text") + parser.add_argument("--workload", choices=("health", "echo", "mask", "mask_batch"), default="mask") + parser.add_argument("--payload-bytes", type=parse_int, default=64) + parser.add_argument("--batch-size", type=parse_int, default=8) + args = parser.parse_args(argv) + if args.worker: + return run_worker(args.worker_io) + return run_direct(args.roundtrips, args.client_io, args.worker_io, args.workload, args.payload_bytes, args.batch_size) + + +def run(): + __setup__() + try: + 
__benchmark__() + finally: + __teardown__() + + +def warmupIterations(): + return 5 + + +def iterations(): + return 10 + + +def summary(): + return { + "name": "OutlierRemovalAverageSummary", + "lower-threshold": 0, + "upper-threshold": 0.3, + } + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/mx.graalpython/mx_graalpython_bench_param.py b/mx.graalpython/mx_graalpython_bench_param.py index 78d5bf06c6..fe9eb3ef0e 100644 --- a/mx.graalpython/mx_graalpython_bench_param.py +++ b/mx.graalpython/mx_graalpython_bench_param.py @@ -121,6 +121,7 @@ 'virtualize-in-try-catch-oom': ITER_10, 'phase_shift_warmup_baseline': ITER_5 + ['--self-measurement'] + ['500'], 'phase_shift_warmup': ITER_3 + ['--self-measurement'] + ['1600', '500'], + 'jsonrpc-pipe': ITER_10 + ['500', 'text', 'text', 'mask', '64'], 'startup': ITER_5 + ['50'], 'startup-imports': ITER_5 + ['20'], } @@ -130,6 +131,7 @@ 'nano-arith': ITER_6 + WARMUP_2, 'nano-loop': ITER_6 + WARMUP_2, 'nano-if': ITER_6 + WARMUP_2, + 'jsonrpc-pipe': ITER_6 + WARMUP_2 + ['100', 'text', 'text', 'mask', '64'], 'arith-modulo-sized': ITER_6 + WARMUP_2 + ['1'], 'if-generic': ITER_10 + WARMUP_2 + ['500000'], 'if-generic-non-builtin': ITER_10 + WARMUP_2 + ['500000'], From 0c27038cc5fe4dc7a14ad5259adc23b952c0c517 Mon Sep 17 00:00:00 2001 From: Tim Felgentreff Date: Fri, 10 Apr 2026 10:42:24 +0200 Subject: [PATCH 2/7] Add notes on building standalones for agents --- AGENTS.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/AGENTS.md b/AGENTS.md index 5c385a78db..01d3c96e8c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -68,6 +68,10 @@ It consists of: Java (Truffle) + C (CPython C-API compatibility) + Python stdlib * Style / formatting `mx python-style --fix` `mx python-gate --tags style` +* Building standalones for benchmarking + - use `mx --env native-ee sforceimports && mx --env native-ee checkout-downstream compiler graal-enterprise` to get the right revisions + - use `mx -p ../graal/vm fetch-jdk -jdk-id 
labsjdk-ce-latest` and set JAVA_HOME as per that command's output + - use `mx --env jvm-ee-libgraal` and `mx --env native-ee` to build the JAVA and NATIVE standalone distributions ## NOTES - When searching for implementation, prefer `graalpython/com.oracle.graal.python/src/...` over vendored `lib-python` unless you are intentionally modifying upstream stdlib/tests. From 13811b1a996b9be5cdfda464b667d4c4bf173c68 Mon Sep 17 00:00:00 2001 From: Tim Felgentreff Date: Fri, 10 Apr 2026 11:51:06 +0200 Subject: [PATCH 3/7] Reduce tiny flush copying in _io Cuts copies in TextIOWrapper pending-byte flushes and avoids a buffer slice copy in BufferedWriter flush when writePos is 0. On the jsonrpc-pipe microbenchmark (jvm-ee, mx benchmark micro:jsonrpc-pipe --tracker none -- --python-vm=graalpython --python-vm-config=default --), AVG (no warmup) went from 0.188 s before these changes to 0.089 s after them. --- .../modules/io/BufferedWriterNodes.java | 14 ++++- .../python/builtins/modules/io/PTextIO.java | 16 ++--- .../modules/io/PendingBytesOutputStream.java | 62 +++++++++++++++++++ .../modules/io/TextIOWrapperNodes.java | 6 +- 4 files changed, 82 insertions(+), 16 deletions(-) create mode 100644 graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PendingBytesOutputStream.java diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedWriterNodes.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedWriterNodes.java index 5c7a52b178..c273e58231 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedWriterNodes.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedWriterNodes.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved. 
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -272,8 +272,16 @@ protected static void bufferedwriterFlushUnlocked(VirtualFrame frame, PBuffered self.incRawPos(-rewind); } while (self.getWritePos() < self.getWriteEnd()) { - byte[] buf = PythonUtils.arrayCopyOfRange(self.getBuffer(), self.getWritePos(), self.getWriteEnd()); - int n = rawWriteNode.execute(frame, inliningTarget, self, buf, buf.length); + byte[] buf; + int len; + if (self.getWritePos() == 0) { + buf = self.getBuffer(); + len = self.getWriteEnd(); + } else { + buf = PythonUtils.arrayCopyOfRange(self.getBuffer(), self.getWritePos(), self.getWriteEnd()); + len = buf.length; + } + int n = rawWriteNode.execute(frame, inliningTarget, self, buf, len); if (n == -2) { throw raiseBlockingIOError.get(inliningTarget).raiseEAGAIN(WRITE_COULD_NOT_COMPLETE_WITHOUT_BLOCKING, 0); } diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PTextIO.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PTextIO.java index 200ff15b56..155ac163ad 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PTextIO.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PTextIO.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2021, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -41,13 +41,9 @@ package com.oracle.graal.python.builtins.modules.io; import static com.oracle.graal.python.builtins.objects.bytes.BytesUtils.append; -import static com.oracle.graal.python.builtins.objects.bytes.BytesUtils.createOutputStream; -import static com.oracle.graal.python.builtins.objects.bytes.BytesUtils.toByteArray; import static com.oracle.graal.python.nodes.StringLiterals.T_EMPTY_STRING; import static com.oracle.graal.python.util.PythonUtils.TS_ENCODING; -import java.io.ByteArrayOutputStream; - import com.oracle.graal.python.builtins.objects.ints.IntBuiltins; import com.oracle.graal.python.builtins.objects.ints.IntNodes; import com.oracle.graal.python.builtins.objects.ints.PInt; @@ -93,7 +89,7 @@ public final class PTextIO extends PTextIOBase { private int decodedCharsUsed; /* offset (in code points) into _decoded_chars for read() */ private int decodedCharsLen; /* code point length of decodedChars */ - private ByteArrayOutputStream pendingBytes; // data waiting to be written. + private PendingBytesOutputStream pendingBytes; // data waiting to be written. 
/* * snapshot is either NULL, or a tuple (dec_flags, next_input) where dec_flags is the second @@ -112,7 +108,7 @@ public final class PTextIO extends PTextIOBase { public PTextIO(Object cls, Shape instanceShape) { super(cls, instanceShape); - pendingBytes = createOutputStream(); + pendingBytes = new PendingBytesOutputStream(); } @Override @@ -324,11 +320,11 @@ TruffleString consumeAllDecodedChars(TruffleString.SubstringNode substringNode, } public void clearPendingBytes() { - pendingBytes = createOutputStream(); + pendingBytes = new PendingBytesOutputStream(); } - public byte[] getAndClearPendingBytes() { - byte[] b = toByteArray(pendingBytes); + public PendingBytesOutputStream getAndClearPendingBytes() { + PendingBytesOutputStream b = pendingBytes; clearPendingBytes(); return b; } diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PendingBytesOutputStream.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PendingBytesOutputStream.java new file mode 100644 index 0000000000..809711b9f3 --- /dev/null +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PendingBytesOutputStream.java @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * The Universal Permissive License (UPL), Version 1.0 + * + * Subject to the condition set forth below, permission is hereby granted to any + * person obtaining a copy of this software, associated documentation and/or + * data (collectively the "Software"), free of charge and under any and all + * copyright rights in the Software, and any and all patent rights owned or + * freely licensable by each licensor hereunder covering either (i) the + * unmodified Software as contributed to or provided by such licensor, or (ii) + * the Larger Works (as defined below), to deal in both + * + * (a) the Software, and + * + * (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if + * one is included with the Software each a "Larger Work" to which the Software + * is contributed by such licensors), + * + * without restriction, including without limitation the rights to copy, create + * derivative works of, display, perform, and distribute the Software and make, + * use, sell, offer for sale, import, export, have made, and have sold the + * Software and the Larger Work(s), and to sublicense the foregoing rights on + * either these or other terms. + * + * This license is subject to the following condition: + * + * The above copyright notice and either this complete permission notice or at a + * minimum a reference to the UPL must be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +package com.oracle.graal.python.builtins.modules.io; + +import java.io.ByteArrayOutputStream; + +final class PendingBytesOutputStream extends ByteArrayOutputStream { + + PendingBytesOutputStream() { + super(); + } + + PendingBytesOutputStream(int size) { + super(size); + } + + byte[] getBuffer() { + return buf; + } + + int getCount() { + return count; + } +} diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/TextIOWrapperNodes.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/TextIOWrapperNodes.java index 5eb40a3447..f2afa5cb9d 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/TextIOWrapperNodes.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/TextIOWrapperNodes.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2021, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -207,8 +207,8 @@ static void nothingTodo(@SuppressWarnings("unused") PTextIO self) { static void writeflush(VirtualFrame frame, Node inliningTarget, PTextIO self, @Bind PythonLanguage language, @Cached PyObjectCallMethodObjArgs callMethod) { - byte[] pending = self.getAndClearPendingBytes(); - PBytes b = PFactory.createBytes(language, pending); + PendingBytesOutputStream pending = self.getAndClearPendingBytes(); + PBytes b = PFactory.createBytes(language, pending.getBuffer(), pending.getCount()); callMethod.execute(frame, inliningTarget, self.getBuffer(), T_WRITE, b); // TODO: check _PyIO_trap_eintr } From 72a791bd10f9949412b566ad4901ff9101179107 Mon Sep 17 00:00:00 2001 From: Tim Felgentreff Date: Fri, 10 Apr 2026 14:05:57 +0200 Subject: [PATCH 4/7] Add lower-level readInto fast path for buffered reads Adds a PosixSupportLibrary readInto primitive and uses it for BufferedReader fill-buffer refills when the raw object is cached PFileIO and the refill starts at offset 0. On the jsonrpc-pipe microbenchmark with a heavier repeated-run protocol (graalpy harness.py micro/jsonrpc-pipe.py -i 12 5000 text text mask 64), the current baseline had median AVG (no warmup) 0.551 s and mean 0.537 s across 5 runs. This change measured median 0.476 s and mean 0.503 s across 5 runs. 
--- .../io/BufferedReaderMixinBuiltins.java | 80 +++++++++++++++++-- .../python/runtime/EmulatedPosixSupport.java | 25 ++++++ .../python/runtime/LoggingPosixSupport.java | 11 +++ .../graal/python/runtime/NFIPosixSupport.java | 11 +++ .../python/runtime/PosixSupportLibrary.java | 2 + .../python/runtime/PreInitPosixSupport.java | 9 +++ 6 files changed, 131 insertions(+), 7 deletions(-) diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedReaderMixinBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedReaderMixinBuiltins.java index 8f5fb49222..101ca493d0 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedReaderMixinBuiltins.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedReaderMixinBuiltins.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2021, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -82,6 +82,7 @@ import com.oracle.graal.python.builtins.modules.io.BufferedIONodesFactory.CheckIsClosedNodeGen; import com.oracle.graal.python.builtins.objects.PNone; import com.oracle.graal.python.builtins.objects.buffer.PythonBufferAccessLibrary; +import com.oracle.graal.python.builtins.objects.exception.OSErrorEnum; import com.oracle.graal.python.builtins.objects.bytes.BytesNodes; import com.oracle.graal.python.builtins.objects.bytes.PByteArray; import com.oracle.graal.python.builtins.objects.bytes.PBytes; @@ -97,7 +98,12 @@ import com.oracle.graal.python.nodes.function.PythonBuiltinBaseNode; import com.oracle.graal.python.nodes.function.builtins.clinic.ArgumentClinicProvider; import com.oracle.graal.python.nodes.object.GetClassNode; +import com.oracle.graal.python.nodes.PConstructAndRaiseNode; +import com.oracle.graal.python.runtime.GilNode; import com.oracle.graal.python.runtime.IndirectCallData.InteropCallData; +import com.oracle.graal.python.runtime.PosixSupport; +import com.oracle.graal.python.runtime.PosixSupportLibrary; +import com.oracle.graal.python.runtime.PythonContext; import com.oracle.graal.python.runtime.exception.PException; import com.oracle.graal.python.runtime.object.PFactory; import com.oracle.graal.python.util.PythonUtils; @@ -114,6 +120,7 @@ import com.oracle.truffle.api.frame.VirtualFrame; import com.oracle.truffle.api.library.CachedLibrary; import com.oracle.truffle.api.nodes.Node; +import com.oracle.truffle.api.profiles.InlinedBranchProfile; import com.oracle.truffle.api.profiles.InlinedConditionProfile; import com.oracle.truffle.api.strings.TruffleString; @@ -190,7 +197,8 @@ abstract static class FillBufferNode extends PNodeWithContext { @Specialization static int bufferedreaderFillBuffer(VirtualFrame frame, Node inliningTarget, PBuffered self, - @Cached RawReadNode rawReadNode) { + @Cached RawReadNode rawReadNode, + @Cached RawReadIntoBufferNode rawReadIntoBufferNode) { int 
start; if (isValidReadBuffer(self)) { start = self.getReadEnd(); @@ -198,21 +206,79 @@ static int bufferedreaderFillBuffer(VirtualFrame frame, Node inliningTarget, PBu start = 0; } int len = self.getBufferSize() - start; - byte[] fill = rawReadNode.execute(frame, inliningTarget, self, len); - if (fill == BLOCKED) { - return -2; + int n; + if (start == 0 && self.isFastClosedChecks()) { + n = rawReadIntoBufferNode.execute(frame, inliningTarget, self.getFileIORaw(), self.getBuffer(), len); + if (n == -2) { + return -2; + } + } else { + byte[] fill = rawReadNode.execute(frame, inliningTarget, self, len); + if (fill == BLOCKED) { + return -2; + } + n = fill.length; + if (n > 0) { + PythonUtils.arraycopy(fill, 0, self.getBuffer(), start, n); + } } - int n = fill.length; if (n == 0) { return n; } - PythonUtils.arraycopy(fill, 0, self.getBuffer(), start, n); self.setReadEnd(start + n); self.setRawPos(start + n); return n; } } + @GenerateInline + @GenerateCached(false) + abstract static class RawReadIntoBufferNode extends PNodeWithContext { + + public abstract int execute(VirtualFrame frame, Node inliningTarget, PFileIO raw, byte[] buffer, int len); + + @Specialization + static int readIntoBuffer(VirtualFrame frame, Node inliningTarget, PFileIO raw, byte[] buffer, int len, + @Bind PythonContext context, + @CachedLibrary("context.getPosixSupport()") PosixSupportLibrary posixLib, + @Cached InlinedBranchProfile readErrorProfile, + @Cached InlinedBranchProfile readErrorProfile2, + @Cached GilNode gil, + @Cached PConstructAndRaiseNode.Lazy constructAndRaiseNode) { + try { + return readInto(raw.getFD(), buffer, len, inliningTarget, posixLib, context.getPosixSupport(), readErrorProfile, gil); + } catch (PosixSupportLibrary.PosixException e) { + if (e.getErrorCode() == OSErrorEnum.EAGAIN.getNumber()) { + readErrorProfile2.enter(inliningTarget); + return -2; + } + throw constructAndRaiseNode.get(inliningTarget).raiseOSErrorFromPosixException(frame, e); + } + } + + private static int 
readInto(int fd, byte[] buffer, int len, + Node inliningTarget, PosixSupportLibrary posixLib, PosixSupport posixSupport, + InlinedBranchProfile errorProfile, GilNode gil) throws PosixSupportLibrary.PosixException { + gil.release(true); + try { + while (true) { + try { + return (int) posixLib.readInto(posixSupport, fd, new PosixSupportLibrary.Buffer(buffer, len)); + } catch (PosixSupportLibrary.PosixException e) { + errorProfile.enter(inliningTarget); + if (e.getErrorCode() == OSErrorEnum.EINTR.getNumber()) { + PythonContext.triggerAsyncActions(inliningTarget); + } else { + throw e; + } + } + } + } finally { + gil.acquire(); + } + } + } + @Builtin(name = J_READABLE, minNumOfPositionalArgs = 1) @GenerateNodeFactory abstract static class ReadableNode extends PythonUnaryWithInitErrorBuiltinNode { diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/EmulatedPosixSupport.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/EmulatedPosixSupport.java index 88ec3a30f9..af6c851da1 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/EmulatedPosixSupport.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/EmulatedPosixSupport.java @@ -526,6 +526,31 @@ public Buffer read(int fd, long length, } } + @ExportMessage + @SuppressWarnings({"unused", "static-method"}) + public long readInto(int fd, Buffer data, + @Bind Node inliningTarget, + @Shared("errorBranch") @Cached InlinedBranchProfile errorBranch, + @Shared("eq") @Cached TruffleString.EqualNode eqNode) throws PosixException { + Channel channel = getFileChannel(fd); + if (!(channel instanceof ReadableByteChannel readableChannel)) { + errorBranch.enter(inliningTarget); + throw posixException(OSErrorEnum.EBADF); + } + try { + int n = doReadIntoChannel(readableChannel, data.data, (int) data.length); + return n < 0 ? 
0 : n; + } catch (Exception e) { + errorBranch.enter(inliningTarget); + throw posixException(OSErrorEnum.fromException(e, eqNode)); + } + } + + @TruffleBoundary(allowInlining = true) + private static int doReadIntoChannel(ReadableByteChannel channel, byte[] data, int length) throws IOException { + return channel.read(ByteBuffer.wrap(data, 0, length)); + } + @TruffleBoundary private static Buffer readBytesFromChannel(ReadableByteChannel channel, long sizeIn) throws IOException { long size = sizeIn; diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/LoggingPosixSupport.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/LoggingPosixSupport.java index e7371fadb9..7172f5b5ce 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/LoggingPosixSupport.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/LoggingPosixSupport.java @@ -184,6 +184,17 @@ final Buffer read(int fd, long length, } } + @ExportMessage + final long readInto(int fd, Buffer data, + @CachedLibrary("this.delegate") PosixSupportLibrary lib) throws PosixException { + logEnter("readInto", "%d, %d", fd, data.length); + try { + return logExit("readInto", "%d", lib.readInto(delegate, fd, data)); + } catch (PosixException e) { + throw logException("readInto", e); + } + } + @ExportMessage final long write(int fd, Buffer data, @CachedLibrary("this.delegate") PosixSupportLibrary lib) throws PosixException { diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java index 1bf99341f3..9c343d9c34 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java @@ -565,6 +565,17 @@ public Buffer read(int fd, long length, 
return buffer.withLength(n); } + @ExportMessage + public long readInto(int fd, Buffer data, + @Shared("invoke") @Cached InvokeNativeFunction invokeNode) throws PosixException { + setErrno(invokeNode, 0); + long n = invokeNode.callLong(this, PosixNativeFunction.call_read, fd, data.data, data.length); + if (n < 0) { + throw getErrnoAndThrowPosixException(invokeNode); + } + return n; + } + @ExportMessage public long write(int fd, Buffer data, @Shared("invoke") @Cached InvokeNativeFunction invokeNode) throws PosixException { diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PosixSupportLibrary.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PosixSupportLibrary.java index 5ad6aaac3e..351f8d601c 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PosixSupportLibrary.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PosixSupportLibrary.java @@ -93,6 +93,8 @@ public abstract class PosixSupportLibrary extends Library { public abstract Buffer read(Object receiver, int fd, long length) throws PosixException; + public abstract long readInto(Object receiver, int fd, Buffer data) throws PosixException; + public abstract long write(Object receiver, int fd, Buffer data) throws PosixException; public abstract int dup(Object receiver, int fd) throws PosixException; diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PreInitPosixSupport.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PreInitPosixSupport.java index 9b36dc9fd0..26916c89df 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PreInitPosixSupport.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PreInitPosixSupport.java @@ -203,6 +203,15 @@ final Buffer read(int fd, long length, return nativeLib.read(nativePosixSupport, fd, length); } + @ExportMessage + final long 
readInto(int fd, Buffer data, + @CachedLibrary("this.nativePosixSupport") PosixSupportLibrary nativeLib) throws PosixException { + if (inPreInitialization) { + return PosixSupportLibrary.getUncached().readInto(emulatedPosixSupport, fd, data); + } + return nativeLib.readInto(nativePosixSupport, fd, data); + } + @ExportMessage final long write(int fd, Buffer data, @CachedLibrary("this.nativePosixSupport") PosixSupportLibrary nativeLib) throws PosixException { From 565c1c6c9b7f97291d89455605ea40aee0aef317 Mon Sep 17 00:00:00 2001 From: Tim Felgentreff Date: Fri, 10 Apr 2026 15:52:46 +0200 Subject: [PATCH 5/7] Add io perf investigation notes and scripts --- .../python/micro/jsonrpc-pipe.py | 14 +- investigations/io_perf/notes.org | 525 ++++++++++++++++++ scripts/profile-jsonrpc-pipe-async-buffer.sh | 51 ++ scripts/profile-jsonrpc-pipe-async-text.sh | 51 ++ .../profile-jsonrpc-pipe-gprofng-buffer.sh | 51 ++ scripts/profile-jsonrpc-pipe-gprofng-text.sh | 51 ++ scripts/profile_jsonrpc_pipe_worker.py | 202 +++++++ 7 files changed, 944 insertions(+), 1 deletion(-) create mode 100644 investigations/io_perf/notes.org create mode 100755 scripts/profile-jsonrpc-pipe-async-buffer.sh create mode 100755 scripts/profile-jsonrpc-pipe-async-text.sh create mode 100755 scripts/profile-jsonrpc-pipe-gprofng-buffer.sh create mode 100755 scripts/profile-jsonrpc-pipe-gprofng-text.sh create mode 100755 scripts/profile_jsonrpc_pipe_worker.py diff --git a/graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py b/graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py index d104a3ad00..83ee8cfd06 100644 --- a/graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py +++ b/graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py @@ -301,6 +301,18 @@ def parse_int(value): return int(str(value).replace("_", "")) +def get_subprocess_launcher_args(): + orig_argv = getattr(sys, "orig_argv", None) + if not orig_argv: + return 
[sys.executable] + launcher_args = [sys.executable] + for arg in orig_argv[1:]: + if not arg.startswith("-"): + break + launcher_args.append(arg) + return launcher_args + + def __process_args__(roundtrips=500, client_io="text", worker_io="text", workload="mask", payload_bytes=64, batch_size=8): return [ parse_int(roundtrips), @@ -317,7 +329,7 @@ def __setup__(roundtrips=500, client_io="text", worker_io="text", workload="mask __teardown__() state = State(roundtrips, client_io, worker_io, workload, payload_bytes, batch_size) command = [ - sys.executable, + *get_subprocess_launcher_args(), __file__, "--worker", "--worker-io=%s" % worker_io, diff --git a/investigations/io_perf/notes.org b/investigations/io_perf/notes.org new file mode 100644 index 0000000000..6a92d8ae8d --- /dev/null +++ b/investigations/io_perf/notes.org @@ -0,0 +1,525 @@ +* IO Perf Investigation Notes + +** Scope + +Investigation target: remaining performance gap in workload 07 / jsonrpc tokenizer style +strict request-response traffic, especially tiny write/flush + pipe readline overhead in +GraalPy's =_io= stack. 
+ +Current reference commit for runtime changes: +- =930c9f5b09= Add lower-level readInto fast path for buffered reads + +Related commits in this investigation: +- =39bde29f49= Reduce tiny flush copying in =_io= +- =3cc0650e9c= WIP: add jsonrpc pipe profiling scripts +- =9d27960795= Add jsonrpc pipe microbenchmark to mx harness + +** Benchmark Shape + +In-repo harness benchmark: +- =graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py= + +Representative command: +#+begin_src bash +graalpy graalpython/com.oracle.graal.python.benchmarks/python/harness.py \ + graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py \ + -i 12 5000 text text mask 64 +#+end_src + +Interpretation: +- =5000= roundtrips per benchmark iteration +- =text text= means parent and worker both use text I/O wrappers +- =mask= uses the tokenizer-like normalize/mask request shape +- =64= payload bytes + +The heavier =5000=-roundtrip mode is preferred for A/B work over the smaller =500= mode, +because the smaller mode is too noisy for subtle changes. + +** Profiling Tooling + +Worker-only profiling scripts: +- =scripts/profile_jsonrpc_pipe_worker.py= +- =scripts/profile-jsonrpc-pipe-async-text.sh= +- =scripts/profile-jsonrpc-pipe-async-buffer.sh= +- =scripts/profile-jsonrpc-pipe-gprofng-text.sh= +- =scripts/profile-jsonrpc-pipe-gprofng-buffer.sh= + +Typical async-profiler runs: +#+begin_src bash +REQS=20000 scripts/profile-jsonrpc-pipe-async-text.sh /tmp/jsonrpc-ee-worker-text-current.svg +REQS=20000 scripts/profile-jsonrpc-pipe-async-buffer.sh /tmp/jsonrpc-ee-worker-buffer-current.svg +#+end_src + +Current useful profiler artifacts: +- =/tmp/jsonrpc-ee-worker-text-current.svg= +- =/tmp/jsonrpc-ee-worker-buffer-current.svg= + +Async-profiler conclusions: +- Whole-process launcher/harness profiles are dominated by compilation noise. +- Worker-only profiles are the useful ones. 
+- The comparison between =text= and =buffer= modes clearly isolates the remaining text-layer cost. + +gprofng status: +- Installed locally and usable. +- Produced coarse output dominated by == under this WSL2 setup. +- Useful as a secondary cross-check, not the primary driver for this work. + +** Current Hotspots + +From worker-only async-profiler on current committed state (=930c9f5b09=): + +Text-mode still shows: +- =TextIOWrapperBuiltins$WriteNode.write= +- =TextIOWrapperNodes$ReadlineNode.readline= +- =TextIOWrapperNodes$ReadChunkNode.readChunk= +- =BufferedWriterNodes$FlushUnlockedNode.bufferedwriterFlushUnlocked= +- =BufferedWriterNodes$RawWriteNode.bufferedwriterRawWrite= +- =FileIOBuiltins$ReadintoNode.readinto= +- =PosixModuleBuiltins$ReadNode.read= +- =FileIOBuiltins$WriteNode.write= +- =PosixModuleBuiltins$WriteNode.write= + +Buffer-mode drops the text wrapper layer and shows mainly: +- =BufferedReaderMixinBuiltins$BufferedReadlineNode.readline= +- =BufferedWriterNodes$FlushUnlockedNode.bufferedwriterFlushUnlocked= +- =BufferedWriterNodes$RawWriteNode.bufferedwriterRawWrite= +- =FileIOBuiltins$ReadintoNode.readinto= +- =PosixModuleBuiltins$ReadNode.read= +- =FileIOBuiltins$WriteNode.write= +- =PosixModuleBuiltins$WriteNode.write= + +Interpretation: +- Lower buffered/file/posix path has improved. +- Remaining gap is increasingly concentrated in the text wrapper read path. 
+ +** Kept Changes + +*** =39bde29f49= Reduce tiny flush copying in =_io= + +What changed: +- Replaced pending text output buffering with a stealable byte buffer: + - =PTextIO= + - =PendingBytesOutputStream= +- Avoided =toByteArray()= copy on =TextIOWrapper= flush: + - =TextIOWrapperNodes.WriteFlushNode= +- Avoided slice copy in =BufferedWriterNodes.FlushUnlockedNode= when =writePos == 0= + +Measured effect on the small =mx benchmark micro:jsonrpc-pipe= configuration: +- =AVG (no warmup)= improved from =0.188 s= to =0.089 s= + +*** =930c9f5b09= Add lower-level readInto fast path for buffered reads + +What changed: +- Added =PosixSupportLibrary.readInto(Object receiver, int fd, Buffer data)= +- Implemented it in: + - =NFIPosixSupport= + - =EmulatedPosixSupport= + - =LoggingPosixSupport= + - =PreInitPosixSupport= +- Used it from =BufferedReaderMixinBuiltins.FillBufferNode= only when: + - refill starts at offset =0= + - raw object is cached =PFileIO= + +This stays below the Python protocol layer and avoids: +- temporary =PByteArray= +- Python-level =raw.readinto()= call +- extra copy back into the buffered reader's internal byte array + +Repeated heavy-run comparison against prior committed state: + +Baseline (=39bde29f49=), 5 runs: +- median =AVG (no warmup)=: =0.551 s= +- mean =AVG (no warmup)=: =0.537 s= +- min/max: =0.344 s= / =0.707 s= + +Candidate (=930c9f5b09=), 5 runs: +- median =AVG (no warmup)=: =0.476 s= +- mean =AVG (no warmup)=: =0.503 s= +- min/max: =0.346 s= / =0.682 s= + +Interpretation: +- This lower-level read-side change was worth keeping. 
+ +** Rejected Experiments + +*** Rejected: Python-level direct fill-buffer shortcut + +Location: +- =BufferedReaderMixinBuiltins.FillBufferNode= + +Idea: +- When =start == 0=, call =raw.readinto()= directly on =self.getBuffer()= + +Why rejected: +- went back through Python-level dispatch +- added significant hot-node complexity +- repeated runs gave mixed signal and no clear win + +*** Rejected: =ReadChunkNode= telling / non-telling split + +Location: +- =TextIOWrapperNodes.ReadChunkNode= + +Idea: +- Split =self.isTelling()= and non-=telling= cases into separate specializations + +Repeated heavy-run results: +- runs: =0.446 s=, =0.804 s=, =0.515 s=, =0.617 s=, =0.621 s= +- median =0.617 s= +- mean =0.601 s= + +Compared to current baseline (=930c9f5b09=): +- median =0.476 s= +- mean =0.503 s= + +Why rejected: +- regression on repeated runs +- likely extra node shape / cache sharing cost outweighed saved work + +*** Rejected: =ReadChunkNode= =PBytes= fast path + +Location: +- =TextIOWrapperNodes.ReadChunkNode= + +Idea: +- If =inputChunk instanceof PBytes=, skip the generic buffer acquire/release path + +Repeated heavy-run results: +- runs: =0.515 s=, =0.647 s=, =0.510 s=, =0.707 s=, =0.613 s= +- median =0.613 s= +- mean =0.598 s= + +Compared to current baseline (=930c9f5b09=): +- median =0.476 s= +- mean =0.503 s= + +Why rejected: +- regression on repeated runs +- too small/local a fast path to beat the resulting code shape in practice + +*** Blocked: direct TextIOWrapper readline -> buffered byte readline delegation + +Location: +- =TextIOWrapperNodes.ReadlineNode= + +Idea: +- In the common unlimited, non-=tell()= case with no decoded-char backlog, delegate directly to + the underlying buffered byte =readline= node and decode just that one line. 
+ +Reasoning: +- This would bypass: + - =TextIOWrapperNodes.ReadChunkNode= + - =FindLineEndingNode= + - some =TruffleString= churn in the common NDJSON case + +Why not pursued further yet: +- Cross-file Truffle DSL node construction blocked the straightforward implementation. +- Attempting to cache =BufferedReaderMixinBuiltins.BufferedReadlineNode= from + =TextIOWrapperNodes= did not compile: + - implicit =create()= is not available there + - explicit =...BufferedReadlineNodeGen.create()= was not accepted by the DSL expression parser +- Retried with explicit generated-node cache expressions and still hit DSL/parser visibility issues. + +This direction may still be worthwhile, but likely requires one of: +- moving a reusable helper into a place both nodes can access cleanly +- a small refactor in =BufferedReaderMixinBuiltins= +- or a different Java-level delegation approach that does not require cross-file cached-node construction + +*** Rejected: shared bufferedReadline helper + TextIOWrapper fast path + +Location: +- =BufferedReaderMixinBuiltins= +- =TextIOWrapperNodes.ReadlineNode= + +Idea: +- Extract the buffered byte-line acquisition logic into a reusable pure Java helper in + =BufferedReaderMixinBuiltins= +- Cache the necessary nodes at the call site in =TextIOWrapperNodes= +- In the common case (=limit < 0=, no decoded backlog, no =tell()= tracking), acquire a byte line + from the buffered layer and decode just that line + +Why this was attractive: +- avoids cross-file generated-node construction +- keeps lower-layer logic shared instead of duplicated +- bypasses =ReadChunkNode= and =FindLineEndingNode= in the common NDJSON case + +Repeated heavy-run results: +- runs: =0.520 s=, =0.391 s=, =0.526 s=, =0.665 s=, =0.550 s= +- median =AVG (no warmup)=: =0.526 s= +- mean =AVG (no warmup)=: =0.530 s= +- median =BEST=: =0.278 s= + +Compared to current baseline (=930c9f5b09=): +- baseline median =AVG (no warmup)=: =0.476 s= +- baseline mean =AVG (no warmup)=: =0.503 
s= +- baseline median =BEST=: =0.288 s= + +Conclusion: +- not good enough to keep +- slightly better tail minima / bests, but worse median and mean +- reverted + +** Measurement Guidance + +Preferred comparison protocol for future changes: +- Use the current committed state as A +- Use a single candidate patch as B +- Rebuild once for B +- Run 5 repeated harness invocations with: + - =-i 12 5000 text text mask 64= +- Compare at least: + - median =AVG (no warmup)= + - mean =AVG (no warmup)= + - min/max =AVG (no warmup)= + - median =BEST= + +Avoid using single-run =AVG (no warmup)= from the small =500=-roundtrip benchmark for go/no-go +decisions on subtle changes. + +** Current CPython Comparison + +Repeated heavy-run comparison using: +#+begin_src bash +graalpython/com.oracle.graal.python.benchmarks/python/harness.py \ + graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py \ + -i 12 5000 mask 64 +#+end_src + +Five-run summaries: + +| runtime/mode | median AVG (no warmup) | mean AVG (no warmup) | median BEST | +|--------------------+------------------------+-----------------------+-------------| +| CPython text/text | 0.529 s | 0.554 s | 0.508 s | +| CPython buffer/buffer | 0.513 s | 0.517 s | 0.488 s | +| GraalPy text/text | 0.446 s | 0.475 s | 0.277 s | +| GraalPy buffer/buffer | 0.615 s | 0.598 s | 0.298 s | + +Implications: +- On the current committed state, GraalPy =text/text= is *faster* than CPython on this harness benchmark. +- GraalPy =buffer/buffer= is still *slower* than CPython and also slower than GraalPy =text/text=. +- Therefore, the remaining end-to-end issue on this benchmark is not simply "TextIOWrapper is slower". +- The lower buffered/file/posix path still matters, and some previous assumptions should be re-checked + against the heavier benchmark protocol. 
+ +** Interpreter (Compilation=false): native-ee standalone vs CPython + +*** Protocol + +Benchmark entrypoint and workload stayed the same as the prior investigation: +- =graalpython/com.oracle.graal.python.benchmarks/python/harness.py= +- =graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py= +- workload parameters: =5000 text text mask 64= + +Pinned harness commands used for this section: +#+begin_src bash +env LC_ALL=C.UTF-8 PYTHONHASHSEED=0 \ + GRAAL_PYTHON_VM_ARGS='--experimental-options --engine.Compilation=false' \ + taskset -c 2 ./mxbuild/linux-amd64/GRAALPY_NATIVE_STANDALONE/bin/graalpy \ + graalpython/com.oracle.graal.python.benchmarks/python/harness.py \ + graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py \ + -r 1 -i 12 5000 text text mask 64 + +env LC_ALL=C.UTF-8 PYTHONHASHSEED=0 \ + taskset -c 2 python3 \ + graalpython/com.oracle.graal.python.benchmarks/python/harness.py \ + graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py \ + -r 1 -i 12 5000 text text mask 64 +#+end_src + +Protocol details: +- =taskset -c 2= pins both parent and worker to one CPU +- =LC_ALL=C.UTF-8= and =PYTHONHASHSEED=0= kept constant across runs +- =-r 1= gives one unmeasured in-process pre-run before the 12 measured iterations +- repeated each harness invocation 5 times per runtime +- primary summary metric: cross-run median/mean of harness =AVG (all runs)= +- steady-state cross-check: mean of the last 6 raw durations from each 12-iteration harness run + +Why use =GRAAL_PYTHON_VM_ARGS= here: +- it ensures the worker subprocess launched by =jsonrpc-pipe.py= also sees + =--experimental-options --engine.Compilation=false= + +Hardware/software context for this section: +- host: WSL2 Linux =6.6.87.2-microsoft-standard-WSL2= +- CPU: =13th Gen Intel(R) Core(TM) i9-13900H=, 20 online CPUs +- native standalone: =GraalPy 3.12.8 (Oracle GraalVM Native 25.1.0)= +- CPython: =Python 3.12.11= + +*** Harness Results + 
+Five-run summaries for the pinned =-r 1 -i 12 5000 text text mask 64= protocol: + +| runtime | median AVG (all runs) | mean AVG (all runs) | median tail-6 avg | mean tail-6 avg | median BEST | +|----------------------+------------------------+---------------------+-------------------+-----------------+-------------| +| native-ee standalone | 1.203 s | 1.234 s | 1.162 s | 1.190 s | 1.124 s | +| CPython | 0.244 s | 0.244 s | 0.242 s | 0.244 s | 0.236 s | + +Observed gap: +- native-ee standalone is about =4.93x= slower than CPython by median harness =AVG (all runs)= +- even on the trailing-6 steady-state cross-check, native-ee standalone is still about =4.80x= slower + +Raw per-run summaries: +- native-ee standalone: + - run 1: =AVG(all)= =1.217 s=, =BEST= =1.104 s=, tail-6 avg =1.134 s= + - run 2: =AVG(all)= =1.160 s=, =BEST= =1.105 s=, tail-6 avg =1.127 s= + - run 3: =AVG(all)= =1.195 s=, =BEST= =1.124 s=, tail-6 avg =1.164 s= + - run 4: =AVG(all)= =1.203 s=, =BEST= =1.129 s=, tail-6 avg =1.162 s= + - run 5: =AVG(all)= =1.394 s=, =BEST= =1.304 s=, tail-6 avg =1.361 s= +- CPython: + - run 1: =AVG(all)= =0.250 s=, =BEST= =0.236 s=, tail-6 avg =0.253 s= + - run 2: =AVG(all)= =0.240 s=, =BEST= =0.236 s=, tail-6 avg =0.239 s= + - run 3: =AVG(all)= =0.243 s=, =BEST= =0.237 s=, tail-6 avg =0.242 s= + - run 4: =AVG(all)= =0.244 s=, =BEST= =0.235 s=, tail-6 avg =0.242 s= + - run 5: =AVG(all)= =0.244 s=, =BEST= =0.239 s=, tail-6 avg =0.242 s= + +Interpretation: +- the current interpreter-mode gap is large and repeatable +- the outlier native run 5 moves the mean a bit, but not the overall conclusion +- the steady-state tail still shows a large gap, so this is not just cold startup + +*** Worker Breakdown + +Worker-only =gprofng= profile for native-ee standalone with =Compilation=false=: +- collected by driving =15000= text-mode requests into a worker-only + =gprofng collect app -O /tmp/jsonrpc-native-text-compfalse.er -F off -- ...= + launch of: + - 
=./mxbuild/linux-amd64/GRAALPY_NATIVE_STANDALONE/bin/graalpy= + - =graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py --worker --worker-io=text= +- with =GRAAL_PYTHON_VM_ARGS='--experimental-options --engine.Compilation=false'= + +Top native-ee standalone interpreter-mode functions/suspects: +- =PBytecodeDSLRootNodeGen$CachedBytecodeNode.continueAt=: =65.47%= inclusive CPU +- =SubstrateEnterpriseOptimizedCallTarget.invokeFromInterpreter=: =18.83%= inclusive CPU +- =OptimizedCallTarget.callBoundary=: =16.14%= inclusive CPU +- =CallDispatchersFactory$FunctionCachedCallNodeGen$Inlined.execute=: =11.21%= inclusive CPU +- allocation/GC shows up materially: + - =G1Library.allocateArray=: =9.87%= inclusive CPU + - =MemAllocator::allocate=: =9.87%= inclusive CPU + - =slowPathNewInstance=: =4.93%= inclusive CPU +- text-layer work is present but not dominant on its own: + - =TextIOWrapperNodesFactory$ReadChunkNodeGen$Inlined.execute=: =5.38%= inclusive CPU +- JSON/string work is also visible: + - =JSONUtils.appendString= + - =JSONEncoderBuiltins...AppendSimpleObject= + - =JSONScannerBuiltins...scanOnceUnicode= + - =PatternBuiltins...SubnInnerNode= +- raw syscall wrappers are not the main bucket in this profile: + - =write=: =9.42%= exclusive CPU + - =read=: =0.90%= exclusive CPU + +Worker-only CPython =cProfile= on the same text-mode worker shape (15000 requests): +- =TextIOWrapper.readline=: =0.780 s= cumulative inside a =1.658 s= worker profile +- =TextIOWrapper.flush=: =0.227 s= +- =write_message=: =0.419 s= cumulative +- =read_message=: =0.946 s= cumulative +- =json.dumps=: =0.159 s= +- =json.loads=: =0.143 s= +- =mask_row=: =0.206 s= + +Interpretation: +- native-ee standalone spends a large fraction above the syscall boundary in bytecode interpreter + dispatch, call boundaries, dynamic dispatch, and allocation/GC +- CPython still pays most of its visible worker cost in text I/O and JSON, but those hot paths stay + largely in optimized C 
implementations rather than showing a large interpreter-dispatch bucket +- therefore the native-ee interpreter-mode gap is not just "readline/flush are slower"; a broader + dispatch/allocation cost is visible in the worker profile + +*** Syscall Cross-Check + +One =strace -f -c= run per runtime, same pinned direct benchmark shape: +- native-ee standalone direct =text/text=, =5000= roundtrips: + - clean wall time without =strace=: =1.979 s= + - traced syscall mix: + - =futex=: =95.09%= traced syscall time, 852 calls + - =read=: 10200 calls + - =write=: 10003 calls +- CPython direct =text/text=, =5000= roundtrips: + - clean wall time without =strace=: =0.573 s= + - traced syscall mix: + - =wait4=: =78.22%= traced syscall time, 85 calls + - =write=: 10043 calls + - =read=: 10502 calls + - =futex=: only 44 calls, =0.05%= traced syscall time + +Important caveat: +- =strace= perturbs wall times substantially under WSL2, so use it only for syscall mix, not for + timing conclusions + +Interpretation: +- native-ee standalone and CPython issue roughly the same order of magnitude of =read= and =write= + syscalls for this workload +- the main gap therefore is not "native-ee does far more pipe syscalls" +- native-ee shows much heavier =futex= activity, which suggests extra runtime coordination/synchronization + on top of the same basic I/O pattern + +*** Isolation Experiment: Drop Text Wrappers Only + +Direct-mode check using the same benchmark, pinned to CPU 2, 3 runs each: + +| runtime/mode | median wall time | +|----------------------------+------------------| +| native-ee standalone text/text | 1.701 s | +| native-ee standalone buffer/buffer | 1.377 s | +| CPython text/text | 0.280 s | +| CPython buffer/buffer | 0.276 s | + +Interpretation: +- removing =TextIOWrapper= helps native-ee standalone by about =19%= in this interpreter-mode direct check +- CPython changes very little between =text/text= and =buffer/buffer= on this workload +- but native-ee standalone 
=buffer/buffer= is still about =4.99x= slower than CPython =buffer/buffer= +- therefore text I/O is a meaningful contributor, but it does not explain the full interpreter-mode gap + +*** Current Hypotheses + +Most likely contributors to the remaining interpreter-mode gap: +- bytecode interpreter dispatch / call-boundary overhead in the native standalone +- object allocation and GC churn in request decode/normalize/encode paths +- text read path cost still matters, especially =ReadChunkNode=, but it is only part of the total gap +- extra runtime synchronization (visible in =futex= activity) may be contributing to end-to-end time + +Most likely productive next targets: +- reduce allocation and dispatch churn in the short-request text/JSON path +- re-check =TextIOWrapperNodes.ReadlineNode= and =ReadChunkNode=, but do not assume that fixing them + alone will close the gap +- inspect why the interpreter-mode native standalone still stays ~5x behind CPython even in + =buffer/buffer= mode + +*** Experiment Log + +- Kept for protocol: + - switched from command-line-only flags to =GRAAL_PYTHON_VM_ARGS= for native-ee standalone + - reason: ensures the worker subprocess also runs with =Compilation=false= +- Tried as isolation only: + - direct =buffer/buffer= runs under =Compilation=false= + - result: useful diagnostic, but not a runtime change +- No runtime code micro-optimization was committed in this pass: + - the new measurements point to multiple cost centers + - better to keep the notes reproducible first, then patch one hotspot at a time + +** WSL2 Notes + +Environment: +- WSL2 kernel detected +- =perf= is present but kernel-matched tooling is not configured cleanly +- =gprofng= is installed + +Practical consequence: +- use worker-only async-profiler as primary guide +- use gprofng only as a coarse cross-check + +** Next Likely Target + +Most likely remaining productive area: +- =TextIOWrapperNodes.ReadlineNode= + +Rationale: +- lower buffered/file/posix stack has 
already been improved +- text-mode still pays in: + - =TextIOWrapperNodes$ReadlineNode.readline= + - =TextIOWrapperNodes$ReadChunkNode.readChunk= + +But: +- recent attempts show that naive local fast paths in =ReadChunkNode= are easy to get wrong +- likely next useful change must avoid growing node shape too much +- focus should be on reducing actual =TruffleString= / substring / concat churn in the + common short-line case, not just adding more conditionals +- the promising "delegate to buffered byte readline" idea is currently blocked by DSL wiring issues diff --git a/scripts/profile-jsonrpc-pipe-async-buffer.sh b/scripts/profile-jsonrpc-pipe-async-buffer.sh new file mode 100755 index 0000000000..5ca134c571 --- /dev/null +++ b/scripts/profile-jsonrpc-pipe-async-buffer.sh @@ -0,0 +1,51 @@ +# Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. +# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +# +# The Universal Permissive License (UPL), Version 1.0 +# +# Subject to the condition set forth below, permission is hereby granted to any +# person obtaining a copy of this software, associated documentation and/or +# data (collectively the "Software"), free of charge and under any and all +# copyright rights in the Software, and any and all patent rights owned or +# freely licensable by each licensor hereunder covering either (i) the +# unmodified Software as contributed to or provided by such licensor, or (ii) +# the Larger Works (as defined below), to deal in both +# +# (a) the Software, and +# +# (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if +# one is included with the Software each a "Larger Work" to which the Software +# is contributed by such licensors), +# +# without restriction, including without limitation the rights to copy, create +# derivative works of, display, perform, and distribute the Software and make, +# use, sell, offer for sale, import, export, have made, and have sold the +# Software 
and the Larger Work(s), and to sublicense the foregoing rights on +# either these or other terms. +# +# This license is subject to the following condition: +# +# The above copyright notice and either this complete permission notice or at a +# minimum a reference to the UPL must be included in all copies or substantial +# portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +#!/usr/bin/env bash +set -euo pipefail +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +OUT="${1:-/tmp/jsonrpc-ee-worker-buffer.svg}" +REQS="${REQS:-30000}" +GRAALPY="${GRAALPY:-$ROOT/mxbuild/linux-amd64/GRAALPY_JVM_STANDALONE/bin/graalpy}" +exec python3 "$ROOT/scripts/profile_jsonrpc_pipe_worker.py" \ + --graalpy "$GRAALPY" \ + --worker-io buffer \ + --profiler async \ + --requests "$REQS" \ + --output "$OUT" diff --git a/scripts/profile-jsonrpc-pipe-async-text.sh b/scripts/profile-jsonrpc-pipe-async-text.sh new file mode 100755 index 0000000000..2b75b1741d --- /dev/null +++ b/scripts/profile-jsonrpc-pipe-async-text.sh @@ -0,0 +1,51 @@ +# Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. +# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+# +# The Universal Permissive License (UPL), Version 1.0 +# +# Subject to the condition set forth below, permission is hereby granted to any +# person obtaining a copy of this software, associated documentation and/or +# data (collectively the "Software"), free of charge and under any and all +# copyright rights in the Software, and any and all patent rights owned or +# freely licensable by each licensor hereunder covering either (i) the +# unmodified Software as contributed to or provided by such licensor, or (ii) +# the Larger Works (as defined below), to deal in both +# +# (a) the Software, and +# +# (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if +# one is included with the Software each a "Larger Work" to which the Software +# is contributed by such licensors), +# +# without restriction, including without limitation the rights to copy, create +# derivative works of, display, perform, and distribute the Software and make, +# use, sell, offer for sale, import, export, have made, and have sold the +# Software and the Larger Work(s), and to sublicense the foregoing rights on +# either these or other terms. +# +# This license is subject to the following condition: +# +# The above copyright notice and either this complete permission notice or at a +# minimum a reference to the UPL must be included in all copies or substantial +# portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +#!/usr/bin/env bash +set -euo pipefail +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +OUT="${1:-/tmp/jsonrpc-ee-worker-text.svg}" +REQS="${REQS:-30000}" +GRAALPY="${GRAALPY:-$ROOT/mxbuild/linux-amd64/GRAALPY_JVM_STANDALONE/bin/graalpy}" +exec python3 "$ROOT/scripts/profile_jsonrpc_pipe_worker.py" \ + --graalpy "$GRAALPY" \ + --worker-io text \ + --profiler async \ + --requests "$REQS" \ + --output "$OUT" diff --git a/scripts/profile-jsonrpc-pipe-gprofng-buffer.sh b/scripts/profile-jsonrpc-pipe-gprofng-buffer.sh new file mode 100755 index 0000000000..3aed4f4026 --- /dev/null +++ b/scripts/profile-jsonrpc-pipe-gprofng-buffer.sh @@ -0,0 +1,51 @@ +# Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. +# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +# +# The Universal Permissive License (UPL), Version 1.0 +# +# Subject to the condition set forth below, permission is hereby granted to any +# person obtaining a copy of this software, associated documentation and/or +# data (collectively the "Software"), free of charge and under any and all +# copyright rights in the Software, and any and all patent rights owned or +# freely licensable by each licensor hereunder covering either (i) the +# unmodified Software as contributed to or provided by such licensor, or (ii) +# the Larger Works (as defined below), to deal in both +# +# (a) the Software, and +# +# (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if +# one is included with the Software each a "Larger Work" to which the Software +# is contributed by such licensors), +# +# without restriction, including without limitation the rights to copy, create +# derivative works of, display, perform, and distribute the Software and make, +# use, sell, offer for sale, import, export, have made, and have sold the +# Software and the Larger Work(s), and to sublicense the foregoing rights on +# either these or other terms. 
+# +# This license is subject to the following condition: +# +# The above copyright notice and either this complete permission notice or at a +# minimum a reference to the UPL must be included in all copies or substantial +# portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +#!/usr/bin/env bash +set -euo pipefail +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +OUT="${1:-/tmp/jsonrpc-ee-worker-buffer.er}" +REQS="${REQS:-30000}" +GRAALPY="${GRAALPY:-$ROOT/mxbuild/linux-amd64/GRAALPY_JVM_STANDALONE/bin/graalpy}" +exec python3 "$ROOT/scripts/profile_jsonrpc_pipe_worker.py" \ + --graalpy "$GRAALPY" \ + --worker-io buffer \ + --profiler gprofng \ + --requests "$REQS" \ + --output "$OUT" diff --git a/scripts/profile-jsonrpc-pipe-gprofng-text.sh b/scripts/profile-jsonrpc-pipe-gprofng-text.sh new file mode 100755 index 0000000000..63be3fb887 --- /dev/null +++ b/scripts/profile-jsonrpc-pipe-gprofng-text.sh @@ -0,0 +1,51 @@ +# Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. +# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+# +# The Universal Permissive License (UPL), Version 1.0 +# +# Subject to the condition set forth below, permission is hereby granted to any +# person obtaining a copy of this software, associated documentation and/or +# data (collectively the "Software"), free of charge and under any and all +# copyright rights in the Software, and any and all patent rights owned or +# freely licensable by each licensor hereunder covering either (i) the +# unmodified Software as contributed to or provided by such licensor, or (ii) +# the Larger Works (as defined below), to deal in both +# +# (a) the Software, and +# +# (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if +# one is included with the Software each a "Larger Work" to which the Software +# is contributed by such licensors), +# +# without restriction, including without limitation the rights to copy, create +# derivative works of, display, perform, and distribute the Software and make, +# use, sell, offer for sale, import, export, have made, and have sold the +# Software and the Larger Work(s), and to sublicense the foregoing rights on +# either these or other terms. +# +# This license is subject to the following condition: +# +# The above copyright notice and either this complete permission notice or at a +# minimum a reference to the UPL must be included in all copies or substantial +# portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +#!/usr/bin/env bash +set -euo pipefail +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +OUT="${1:-/tmp/jsonrpc-ee-worker-text.er}" +REQS="${REQS:-30000}" +GRAALPY="${GRAALPY:-$ROOT/mxbuild/linux-amd64/GRAALPY_JVM_STANDALONE/bin/graalpy}" +exec python3 "$ROOT/scripts/profile_jsonrpc_pipe_worker.py" \ + --graalpy "$GRAALPY" \ + --worker-io text \ + --profiler gprofng \ + --requests "$REQS" \ + --output "$OUT" diff --git a/scripts/profile_jsonrpc_pipe_worker.py b/scripts/profile_jsonrpc_pipe_worker.py new file mode 100755 index 0000000000..aed3bb4b90 --- /dev/null +++ b/scripts/profile_jsonrpc_pipe_worker.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. +# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +# +# The Universal Permissive License (UPL), Version 1.0 +# +# Subject to the condition set forth below, permission is hereby granted to any +# person obtaining a copy of this software, associated documentation and/or +# data (collectively the "Software"), free of charge and under any and all +# copyright rights in the Software, and any and all patent rights owned or +# freely licensable by each licensor hereunder covering either (i) the +# unmodified Software as contributed to or provided by such licensor, or (ii) +# the Larger Works (as defined below), to deal in both +# +# (a) the Software, and +# +# (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if +# one is included with the Software each a "Larger Work" to which the Software +# is contributed by such licensors), +# +# without restriction, including without limitation the rights to copy, create +# derivative works of, display, perform, and distribute the Software and make, +# use, sell, offer for sale, import, export, have made, and have sold the +# Software and the Larger Work(s), and to sublicense the foregoing rights on +# either these or other terms. 
+# +# This license is subject to the following condition: +# +# The above copyright notice and either this complete permission notice or at a +# minimum a reference to the UPL must be included in all copies or substantial +# portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import annotations + +import argparse +import json +from pathlib import Path +import subprocess +import sys +from typing import BinaryIO, TextIO + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Drive the jsonrpc-pipe worker under a profiler.") + parser.add_argument("--graalpy", required=True, help="Path to GraalPy launcher") + parser.add_argument("--worker-io", choices=("text", "buffer"), required=True) + parser.add_argument("--profiler", choices=("async", "gprofng"), required=True) + parser.add_argument("--requests", type=int, default=30000) + parser.add_argument("--output", required=True, help="Profile output file (async) or experiment dir (gprofng)") + parser.add_argument("--async-profiler-dir", default="/tmp/async-profiler-1.8.3-linux-x64") + parser.add_argument("--benchmark", default="graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py") + return parser.parse_args() + + +def build_worker_cmd(args: argparse.Namespace) -> list[str]: + benchmark = str(Path(args.benchmark).resolve()) + worker = [args.graalpy, benchmark, "--worker", f"--worker-io={args.worker_io}"] + if args.profiler == "async": + lib = Path(args.async_profiler_dir) / "build" / 
"libasyncProfiler.so" + return [ + args.graalpy, + f"--vm.agentpath:{lib}=start,event=cpu,file={args.output}", + "--vm.XX:+UnlockDiagnosticVMOptions", + "--vm.XX:+DebugNonSafepoints", + benchmark, + "--worker", + f"--worker-io={args.worker_io}", + ] + return [ + "gprofng", + "collect", + "app", + "-O", + args.output, + "-F", + "off", + "--", + *worker, + ] + + +def make_request(index: int) -> dict[str, object]: + return { + "jsonrpc": "2.0", + "id": index, + "method": "mask", + "params": { + "email": f" User{index}.payload-payload@Example.COM ", + "phone": f"+49 (170) {index:04d}-payload-", + "region": "eu", + "source": "microbench", + }, + } + + +def read_json_line_text(stream: TextIO, stderr: TextIO) -> dict[str, object]: + while True: + line = stream.readline() + if not line: + raise RuntimeError(f"worker terminated early: {stderr.read()}") + if line.lstrip().startswith("{"): + return json.loads(line) + + +def read_json_line_binary(stream: BinaryIO, stderr: BinaryIO) -> dict[str, object]: + while True: + line = stream.readline() + if not line: + raise RuntimeError(f"worker terminated early: {stderr.read().decode('utf-8', errors='replace')}") + if line.lstrip().startswith(b"{"): + return json.loads(line) + + +def drive_text(process: subprocess.Popen[str], requests: int) -> None: + assert process.stdin is not None + assert process.stdout is not None + assert process.stderr is not None + for i in range(requests): + process.stdin.write(json.dumps(make_request(i), separators=(",", ":")) + "\n") + process.stdin.flush() + read_json_line_text(process.stdout, process.stderr) + + +def drive_binary(process: subprocess.Popen[bytes], requests: int) -> None: + assert process.stdin is not None + assert process.stdout is not None + assert process.stderr is not None + for i in range(requests): + payload = (json.dumps(make_request(i), separators=(",", ":")) + "\n").encode("utf-8") + process.stdin.write(payload) + process.stdin.flush() + read_json_line_binary(process.stdout, 
process.stderr) + + +def main() -> int: + args = parse_args() + output = Path(args.output) + output.parent.mkdir(parents=True, exist_ok=True) + if args.profiler == "gprofng" and output.exists(): + if output.is_dir(): + subprocess.check_call(["rm", "-rf", str(output)]) + else: + output.unlink() + cmd = build_worker_cmd(args) + process_cwd = None + if args.profiler == "gprofng": + process_cwd = str(output.parent) + cmd[4] = output.name + text_mode = args.worker_io == "text" + if text_mode: + process = subprocess.Popen( + cmd, + cwd=process_cwd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + encoding="utf-8", + bufsize=1, + ) + try: + drive_text(process, args.requests) + process.stdin.close() + rc = process.wait(timeout=120) + if rc != 0: + raise RuntimeError(process.stderr.read()) + sys.stdout.write(process.stderr.read()) + finally: + if process.poll() is None: + process.kill() + else: + process = subprocess.Popen( + cmd, + cwd=process_cwd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + bufsize=0, + ) + try: + drive_binary(process, args.requests) + process.stdin.close() + rc = process.wait(timeout=120) + if rc != 0: + raise RuntimeError(process.stderr.read().decode("utf-8", errors="replace")) + sys.stdout.write(process.stderr.read().decode("utf-8", errors="replace")) + finally: + if process.poll() is None: + process.kill() + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From f0b4ac9a3839e3ad367adc1c292d95b161b094ba Mon Sep 17 00:00:00 2001 From: Tim Felgentreff Date: Wed, 15 Apr 2026 16:40:12 +0200 Subject: [PATCH 6/7] Revert buffered _io readInto fast path The direct buffered readInto refill path caused correctness regressions in\nseek/readline-sensitive code paths. 
In particular, it broke traceback source extraction and tokenize/linecache-based reads, which reproduced as failing test_traceback assertions and zipimport source-location failures. Remove the optimization and delete the now-dead PosixSupport readInto plumbing until there is a version that preserves buffered IO invariants. --- .../io/BufferedReaderMixinBuiltins.java | 78 +-- .../python/runtime/EmulatedPosixSupport.java | 25 - .../python/runtime/LoggingPosixSupport.java | 11 - .../graal/python/runtime/NFIPosixSupport.java | 11 - .../python/runtime/PosixSupportLibrary.java | 2 - .../python/runtime/PreInitPosixSupport.java | 9 - investigations/io_perf/notes.org | 525 ------------------ 7 files changed, 6 insertions(+), 655 deletions(-) delete mode 100644 investigations/io_perf/notes.org diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedReaderMixinBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedReaderMixinBuiltins.java index 101ca493d0..f32954f9e4 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedReaderMixinBuiltins.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedReaderMixinBuiltins.java @@ -82,7 +82,6 @@ import com.oracle.graal.python.builtins.modules.io.BufferedIONodesFactory.CheckIsClosedNodeGen; import com.oracle.graal.python.builtins.objects.PNone; import com.oracle.graal.python.builtins.objects.buffer.PythonBufferAccessLibrary; -import com.oracle.graal.python.builtins.objects.exception.OSErrorEnum; import com.oracle.graal.python.builtins.objects.bytes.BytesNodes; import com.oracle.graal.python.builtins.objects.bytes.PByteArray; import com.oracle.graal.python.builtins.objects.bytes.PBytes; @@ -98,12 +97,7 @@ import com.oracle.graal.python.nodes.function.PythonBuiltinBaseNode; import 
com.oracle.graal.python.nodes.function.builtins.clinic.ArgumentClinicProvider; import com.oracle.graal.python.nodes.object.GetClassNode; -import com.oracle.graal.python.nodes.PConstructAndRaiseNode; -import com.oracle.graal.python.runtime.GilNode; import com.oracle.graal.python.runtime.IndirectCallData.InteropCallData; -import com.oracle.graal.python.runtime.PosixSupport; -import com.oracle.graal.python.runtime.PosixSupportLibrary; -import com.oracle.graal.python.runtime.PythonContext; import com.oracle.graal.python.runtime.exception.PException; import com.oracle.graal.python.runtime.object.PFactory; import com.oracle.graal.python.util.PythonUtils; @@ -120,7 +114,6 @@ import com.oracle.truffle.api.frame.VirtualFrame; import com.oracle.truffle.api.library.CachedLibrary; import com.oracle.truffle.api.nodes.Node; -import com.oracle.truffle.api.profiles.InlinedBranchProfile; import com.oracle.truffle.api.profiles.InlinedConditionProfile; import com.oracle.truffle.api.strings.TruffleString; @@ -197,8 +190,7 @@ abstract static class FillBufferNode extends PNodeWithContext { @Specialization static int bufferedreaderFillBuffer(VirtualFrame frame, Node inliningTarget, PBuffered self, - @Cached RawReadNode rawReadNode, - @Cached RawReadIntoBufferNode rawReadIntoBufferNode) { + @Cached RawReadNode rawReadNode) { int start; if (isValidReadBuffer(self)) { start = self.getReadEnd(); @@ -206,79 +198,21 @@ static int bufferedreaderFillBuffer(VirtualFrame frame, Node inliningTarget, PBu start = 0; } int len = self.getBufferSize() - start; - int n; - if (start == 0 && self.isFastClosedChecks()) { - n = rawReadIntoBufferNode.execute(frame, inliningTarget, self.getFileIORaw(), self.getBuffer(), len); - if (n == -2) { - return -2; - } - } else { - byte[] fill = rawReadNode.execute(frame, inliningTarget, self, len); - if (fill == BLOCKED) { - return -2; - } - n = fill.length; - if (n > 0) { - PythonUtils.arraycopy(fill, 0, self.getBuffer(), start, n); - } + byte[] fill = 
rawReadNode.execute(frame, inliningTarget, self, len); + if (fill == BLOCKED) { + return -2; } + int n = fill.length; if (n == 0) { return n; } + PythonUtils.arraycopy(fill, 0, self.getBuffer(), start, n); self.setReadEnd(start + n); self.setRawPos(start + n); return n; } } - @GenerateInline - @GenerateCached(false) - abstract static class RawReadIntoBufferNode extends PNodeWithContext { - - public abstract int execute(VirtualFrame frame, Node inliningTarget, PFileIO raw, byte[] buffer, int len); - - @Specialization - static int readIntoBuffer(VirtualFrame frame, Node inliningTarget, PFileIO raw, byte[] buffer, int len, - @Bind PythonContext context, - @CachedLibrary("context.getPosixSupport()") PosixSupportLibrary posixLib, - @Cached InlinedBranchProfile readErrorProfile, - @Cached InlinedBranchProfile readErrorProfile2, - @Cached GilNode gil, - @Cached PConstructAndRaiseNode.Lazy constructAndRaiseNode) { - try { - return readInto(raw.getFD(), buffer, len, inliningTarget, posixLib, context.getPosixSupport(), readErrorProfile, gil); - } catch (PosixSupportLibrary.PosixException e) { - if (e.getErrorCode() == OSErrorEnum.EAGAIN.getNumber()) { - readErrorProfile2.enter(inliningTarget); - return -2; - } - throw constructAndRaiseNode.get(inliningTarget).raiseOSErrorFromPosixException(frame, e); - } - } - - private static int readInto(int fd, byte[] buffer, int len, - Node inliningTarget, PosixSupportLibrary posixLib, PosixSupport posixSupport, - InlinedBranchProfile errorProfile, GilNode gil) throws PosixSupportLibrary.PosixException { - gil.release(true); - try { - while (true) { - try { - return (int) posixLib.readInto(posixSupport, fd, new PosixSupportLibrary.Buffer(buffer, len)); - } catch (PosixSupportLibrary.PosixException e) { - errorProfile.enter(inliningTarget); - if (e.getErrorCode() == OSErrorEnum.EINTR.getNumber()) { - PythonContext.triggerAsyncActions(inliningTarget); - } else { - throw e; - } - } - } - } finally { - gil.acquire(); - } - } - } - 
@Builtin(name = J_READABLE, minNumOfPositionalArgs = 1) @GenerateNodeFactory abstract static class ReadableNode extends PythonUnaryWithInitErrorBuiltinNode { diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/EmulatedPosixSupport.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/EmulatedPosixSupport.java index af6c851da1..88ec3a30f9 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/EmulatedPosixSupport.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/EmulatedPosixSupport.java @@ -526,31 +526,6 @@ public Buffer read(int fd, long length, } } - @ExportMessage - @SuppressWarnings({"unused", "static-method"}) - public long readInto(int fd, Buffer data, - @Bind Node inliningTarget, - @Shared("errorBranch") @Cached InlinedBranchProfile errorBranch, - @Shared("eq") @Cached TruffleString.EqualNode eqNode) throws PosixException { - Channel channel = getFileChannel(fd); - if (!(channel instanceof ReadableByteChannel readableChannel)) { - errorBranch.enter(inliningTarget); - throw posixException(OSErrorEnum.EBADF); - } - try { - int n = doReadIntoChannel(readableChannel, data.data, (int) data.length); - return n < 0 ? 
0 : n; - } catch (Exception e) { - errorBranch.enter(inliningTarget); - throw posixException(OSErrorEnum.fromException(e, eqNode)); - } - } - - @TruffleBoundary(allowInlining = true) - private static int doReadIntoChannel(ReadableByteChannel channel, byte[] data, int length) throws IOException { - return channel.read(ByteBuffer.wrap(data, 0, length)); - } - @TruffleBoundary private static Buffer readBytesFromChannel(ReadableByteChannel channel, long sizeIn) throws IOException { long size = sizeIn; diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/LoggingPosixSupport.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/LoggingPosixSupport.java index 7172f5b5ce..e7371fadb9 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/LoggingPosixSupport.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/LoggingPosixSupport.java @@ -184,17 +184,6 @@ final Buffer read(int fd, long length, } } - @ExportMessage - final long readInto(int fd, Buffer data, - @CachedLibrary("this.delegate") PosixSupportLibrary lib) throws PosixException { - logEnter("readInto", "%d, %d", fd, data.length); - try { - return logExit("readInto", "%d", lib.readInto(delegate, fd, data)); - } catch (PosixException e) { - throw logException("readInto", e); - } - } - @ExportMessage final long write(int fd, Buffer data, @CachedLibrary("this.delegate") PosixSupportLibrary lib) throws PosixException { diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java index 9c343d9c34..1bf99341f3 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java @@ -565,17 +565,6 @@ public Buffer read(int fd, long length, 
return buffer.withLength(n); } - @ExportMessage - public long readInto(int fd, Buffer data, - @Shared("invoke") @Cached InvokeNativeFunction invokeNode) throws PosixException { - setErrno(invokeNode, 0); - long n = invokeNode.callLong(this, PosixNativeFunction.call_read, fd, data.data, data.length); - if (n < 0) { - throw getErrnoAndThrowPosixException(invokeNode); - } - return n; - } - @ExportMessage public long write(int fd, Buffer data, @Shared("invoke") @Cached InvokeNativeFunction invokeNode) throws PosixException { diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PosixSupportLibrary.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PosixSupportLibrary.java index 351f8d601c..5ad6aaac3e 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PosixSupportLibrary.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PosixSupportLibrary.java @@ -93,8 +93,6 @@ public abstract class PosixSupportLibrary extends Library { public abstract Buffer read(Object receiver, int fd, long length) throws PosixException; - public abstract long readInto(Object receiver, int fd, Buffer data) throws PosixException; - public abstract long write(Object receiver, int fd, Buffer data) throws PosixException; public abstract int dup(Object receiver, int fd) throws PosixException; diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PreInitPosixSupport.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PreInitPosixSupport.java index 26916c89df..9b36dc9fd0 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PreInitPosixSupport.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PreInitPosixSupport.java @@ -203,15 +203,6 @@ final Buffer read(int fd, long length, return nativeLib.read(nativePosixSupport, fd, length); } - @ExportMessage - final long 
readInto(int fd, Buffer data, - @CachedLibrary("this.nativePosixSupport") PosixSupportLibrary nativeLib) throws PosixException { - if (inPreInitialization) { - return PosixSupportLibrary.getUncached().readInto(emulatedPosixSupport, fd, data); - } - return nativeLib.readInto(nativePosixSupport, fd, data); - } - @ExportMessage final long write(int fd, Buffer data, @CachedLibrary("this.nativePosixSupport") PosixSupportLibrary nativeLib) throws PosixException { diff --git a/investigations/io_perf/notes.org b/investigations/io_perf/notes.org deleted file mode 100644 index 6a92d8ae8d..0000000000 --- a/investigations/io_perf/notes.org +++ /dev/null @@ -1,525 +0,0 @@ -* IO Perf Investigation Notes - -** Scope - -Investigation target: remaining performance gap in workload 07 / jsonrpc tokenizer style -strict request-response traffic, especially tiny write/flush + pipe readline overhead in -GraalPy's =_io= stack. - -Current reference commit for runtime changes: -- =930c9f5b09= Add lower-level readInto fast path for buffered reads - -Related commits in this investigation: -- =39bde29f49= Reduce tiny flush copying in =_io= -- =3cc0650e9c= WIP: add jsonrpc pipe profiling scripts -- =9d27960795= Add jsonrpc pipe microbenchmark to mx harness - -** Benchmark Shape - -In-repo harness benchmark: -- =graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py= - -Representative command: -#+begin_src bash -graalpy graalpython/com.oracle.graal.python.benchmarks/python/harness.py \ - graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py \ - -i 12 5000 text text mask 64 -#+end_src - -Interpretation: -- =5000= roundtrips per benchmark iteration -- =text text= means parent and worker both use text I/O wrappers -- =mask= uses the tokenizer-like normalize/mask request shape -- =64= payload bytes - -The heavier =5000=-roundtrip mode is preferred for A/B work over the smaller =500= mode, -because the smaller mode is too noisy for subtle changes. 
- -** Profiling Tooling - -Worker-only profiling scripts: -- =scripts/profile_jsonrpc_pipe_worker.py= -- =scripts/profile-jsonrpc-pipe-async-text.sh= -- =scripts/profile-jsonrpc-pipe-async-buffer.sh= -- =scripts/profile-jsonrpc-pipe-gprofng-text.sh= -- =scripts/profile-jsonrpc-pipe-gprofng-buffer.sh= - -Typical async-profiler runs: -#+begin_src bash -REQS=20000 scripts/profile-jsonrpc-pipe-async-text.sh /tmp/jsonrpc-ee-worker-text-current.svg -REQS=20000 scripts/profile-jsonrpc-pipe-async-buffer.sh /tmp/jsonrpc-ee-worker-buffer-current.svg -#+end_src - -Current useful profiler artifacts: -- =/tmp/jsonrpc-ee-worker-text-current.svg= -- =/tmp/jsonrpc-ee-worker-buffer-current.svg= - -Async-profiler conclusions: -- Whole-process launcher/harness profiles are dominated by compilation noise. -- Worker-only profiles are the useful ones. -- The comparison between =text= and =buffer= modes clearly isolates the remaining text-layer cost. - -gprofng status: -- Installed locally and usable. -- Produced coarse output dominated by == under this WSL2 setup. -- Useful as a secondary cross-check, not the primary driver for this work. 
- -** Current Hotspots - -From worker-only async-profiler on current committed state (=930c9f5b09=): - -Text-mode still shows: -- =TextIOWrapperBuiltins$WriteNode.write= -- =TextIOWrapperNodes$ReadlineNode.readline= -- =TextIOWrapperNodes$ReadChunkNode.readChunk= -- =BufferedWriterNodes$FlushUnlockedNode.bufferedwriterFlushUnlocked= -- =BufferedWriterNodes$RawWriteNode.bufferedwriterRawWrite= -- =FileIOBuiltins$ReadintoNode.readinto= -- =PosixModuleBuiltins$ReadNode.read= -- =FileIOBuiltins$WriteNode.write= -- =PosixModuleBuiltins$WriteNode.write= - -Buffer-mode drops the text wrapper layer and shows mainly: -- =BufferedReaderMixinBuiltins$BufferedReadlineNode.readline= -- =BufferedWriterNodes$FlushUnlockedNode.bufferedwriterFlushUnlocked= -- =BufferedWriterNodes$RawWriteNode.bufferedwriterRawWrite= -- =FileIOBuiltins$ReadintoNode.readinto= -- =PosixModuleBuiltins$ReadNode.read= -- =FileIOBuiltins$WriteNode.write= -- =PosixModuleBuiltins$WriteNode.write= - -Interpretation: -- Lower buffered/file/posix path has improved. -- Remaining gap is increasingly concentrated in the text wrapper read path. 
- -** Kept Changes - -*** =39bde29f49= Reduce tiny flush copying in =_io= - -What changed: -- Replaced pending text output buffering with a stealable byte buffer: - - =PTextIO= - - =PendingBytesOutputStream= -- Avoided =toByteArray()= copy on =TextIOWrapper= flush: - - =TextIOWrapperNodes.WriteFlushNode= -- Avoided slice copy in =BufferedWriterNodes.FlushUnlockedNode= when =writePos == 0= - -Measured effect on the small =mx benchmark micro:jsonrpc-pipe= configuration: -- =AVG (no warmup)= improved from =0.188 s= to =0.089 s= - -*** =930c9f5b09= Add lower-level readInto fast path for buffered reads - -What changed: -- Added =PosixSupportLibrary.readInto(Object receiver, int fd, Buffer data)= -- Implemented it in: - - =NFIPosixSupport= - - =EmulatedPosixSupport= - - =LoggingPosixSupport= - - =PreInitPosixSupport= -- Used it from =BufferedReaderMixinBuiltins.FillBufferNode= only when: - - refill starts at offset =0= - - raw object is cached =PFileIO= - -This stays below the Python protocol layer and avoids: -- temporary =PByteArray= -- Python-level =raw.readinto()= call -- extra copy back into the buffered reader's internal byte array - -Repeated heavy-run comparison against prior committed state: - -Baseline (=39bde29f49=), 5 runs: -- median =AVG (no warmup)=: =0.551 s= -- mean =AVG (no warmup)=: =0.537 s= -- min/max: =0.344 s= / =0.707 s= - -Candidate (=930c9f5b09=), 5 runs: -- median =AVG (no warmup)=: =0.476 s= -- mean =AVG (no warmup)=: =0.503 s= -- min/max: =0.346 s= / =0.682 s= - -Interpretation: -- This lower-level read-side change was worth keeping. 
- -** Rejected Experiments - -*** Rejected: Python-level direct fill-buffer shortcut - -Location: -- =BufferedReaderMixinBuiltins.FillBufferNode= - -Idea: -- When =start == 0=, call =raw.readinto()= directly on =self.getBuffer()= - -Why rejected: -- went back through Python-level dispatch -- added significant hot-node complexity -- repeated runs gave mixed signal and no clear win - -*** Rejected: =ReadChunkNode= telling / non-telling split - -Location: -- =TextIOWrapperNodes.ReadChunkNode= - -Idea: -- Split =self.isTelling()= and non-=telling= cases into separate specializations - -Repeated heavy-run results: -- runs: =0.446 s=, =0.804 s=, =0.515 s=, =0.617 s=, =0.621 s= -- median =0.617 s= -- mean =0.601 s= - -Compared to current baseline (=930c9f5b09=): -- median =0.476 s= -- mean =0.503 s= - -Why rejected: -- regression on repeated runs -- likely extra node shape / cache sharing cost outweighed saved work - -*** Rejected: =ReadChunkNode= =PBytes= fast path - -Location: -- =TextIOWrapperNodes.ReadChunkNode= - -Idea: -- If =inputChunk instanceof PBytes=, skip the generic buffer acquire/release path - -Repeated heavy-run results: -- runs: =0.515 s=, =0.647 s=, =0.510 s=, =0.707 s=, =0.613 s= -- median =0.613 s= -- mean =0.598 s= - -Compared to current baseline (=930c9f5b09=): -- median =0.476 s= -- mean =0.503 s= - -Why rejected: -- regression on repeated runs -- too small/local a fast path to beat the resulting code shape in practice - -*** Blocked: direct TextIOWrapper readline -> buffered byte readline delegation - -Location: -- =TextIOWrapperNodes.ReadlineNode= - -Idea: -- In the common unlimited, non-=tell()= case with no decoded-char backlog, delegate directly to - the underlying buffered byte =readline= node and decode just that one line. 
- -Reasoning: -- This would bypass: - - =TextIOWrapperNodes.ReadChunkNode= - - =FindLineEndingNode= - - some =TruffleString= churn in the common NDJSON case - -Why not pursued further yet: -- Cross-file Truffle DSL node construction blocked the straightforward implementation. -- Attempting to cache =BufferedReaderMixinBuiltins.BufferedReadlineNode= from - =TextIOWrapperNodes= did not compile: - - implicit =create()= is not available there - - explicit =...BufferedReadlineNodeGen.create()= was not accepted by the DSL expression parser -- Retried with explicit generated-node cache expressions and still hit DSL/parser visibility issues. - -This direction may still be worthwhile, but likely requires one of: -- moving a reusable helper into a place both nodes can access cleanly -- a small refactor in =BufferedReaderMixinBuiltins= -- or a different Java-level delegation approach that does not require cross-file cached-node construction - -*** Rejected: shared bufferedReadline helper + TextIOWrapper fast path - -Location: -- =BufferedReaderMixinBuiltins= -- =TextIOWrapperNodes.ReadlineNode= - -Idea: -- Extract the buffered byte-line acquisition logic into a reusable pure Java helper in - =BufferedReaderMixinBuiltins= -- Cache the necessary nodes at the call site in =TextIOWrapperNodes= -- In the common case (=limit < 0=, no decoded backlog, no =tell()= tracking), acquire a byte line - from the buffered layer and decode just that line - -Why this was attractive: -- avoids cross-file generated-node construction -- keeps lower-layer logic shared instead of duplicated -- bypasses =ReadChunkNode= and =FindLineEndingNode= in the common NDJSON case - -Repeated heavy-run results: -- runs: =0.520 s=, =0.391 s=, =0.526 s=, =0.665 s=, =0.550 s= -- median =AVG (no warmup)=: =0.526 s= -- mean =AVG (no warmup)=: =0.530 s= -- median =BEST=: =0.278 s= - -Compared to current baseline (=930c9f5b09=): -- baseline median =AVG (no warmup)=: =0.476 s= -- baseline mean =AVG (no warmup)=: =0.503 
s= -- baseline median =BEST=: =0.288 s= - -Conclusion: -- not good enough to keep -- slightly better tail minima / bests, but worse median and mean -- reverted - -** Measurement Guidance - -Preferred comparison protocol for future changes: -- Use the current committed state as A -- Use a single candidate patch as B -- Rebuild once for B -- Run 5 repeated harness invocations with: - - =-i 12 5000 text text mask 64= -- Compare at least: - - median =AVG (no warmup)= - - mean =AVG (no warmup)= - - min/max =AVG (no warmup)= - - median =BEST= - -Avoid using single-run =AVG (no warmup)= from the small =500=-roundtrip benchmark for go/no-go -decisions on subtle changes. - -** Current CPython Comparison - -Repeated heavy-run comparison using: -#+begin_src bash -graalpython/com.oracle.graal.python.benchmarks/python/harness.py \ - graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py \ - -i 12 5000 mask 64 -#+end_src - -Five-run summaries: - -| runtime/mode | median AVG (no warmup) | mean AVG (no warmup) | median BEST | -|--------------------+------------------------+-----------------------+-------------| -| CPython text/text | 0.529 s | 0.554 s | 0.508 s | -| CPython buffer/buffer | 0.513 s | 0.517 s | 0.488 s | -| GraalPy text/text | 0.446 s | 0.475 s | 0.277 s | -| GraalPy buffer/buffer | 0.615 s | 0.598 s | 0.298 s | - -Implications: -- On the current committed state, GraalPy =text/text= is *faster* than CPython on this harness benchmark. -- GraalPy =buffer/buffer= is still *slower* than CPython and also slower than GraalPy =text/text=. -- Therefore, the remaining end-to-end issue on this benchmark is not simply "TextIOWrapper is slower". -- The lower buffered/file/posix path still matters, and some previous assumptions should be re-checked - against the heavier benchmark protocol. 
- -** Interpreter (Compilation=false): native-ee standalone vs CPython - -*** Protocol - -Benchmark entrypoint and workload stayed the same as the prior investigation: -- =graalpython/com.oracle.graal.python.benchmarks/python/harness.py= -- =graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py= -- workload parameters: =5000 text text mask 64= - -Pinned harness commands used for this section: -#+begin_src bash -env LC_ALL=C.UTF-8 PYTHONHASHSEED=0 \ - GRAAL_PYTHON_VM_ARGS='--experimental-options --engine.Compilation=false' \ - taskset -c 2 ./mxbuild/linux-amd64/GRAALPY_NATIVE_STANDALONE/bin/graalpy \ - graalpython/com.oracle.graal.python.benchmarks/python/harness.py \ - graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py \ - -r 1 -i 12 5000 text text mask 64 - -env LC_ALL=C.UTF-8 PYTHONHASHSEED=0 \ - taskset -c 2 python3 \ - graalpython/com.oracle.graal.python.benchmarks/python/harness.py \ - graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py \ - -r 1 -i 12 5000 text text mask 64 -#+end_src - -Protocol details: -- =taskset -c 2= pins both parent and worker to one CPU -- =LC_ALL=C.UTF-8= and =PYTHONHASHSEED=0= kept constant across runs -- =-r 1= gives one unmeasured in-process pre-run before the 12 measured iterations -- repeated each harness invocation 5 times per runtime -- primary summary metric: cross-run median/mean of harness =AVG (all runs)= -- steady-state cross-check: mean of the last 6 raw durations from each 12-iteration harness run - -Why use =GRAAL_PYTHON_VM_ARGS= here: -- it ensures the worker subprocess launched by =jsonrpc-pipe.py= also sees - =--experimental-options --engine.Compilation=false= - -Hardware/software context for this section: -- host: WSL2 Linux =6.6.87.2-microsoft-standard-WSL2= -- CPU: =13th Gen Intel(R) Core(TM) i9-13900H=, 20 online CPUs -- native standalone: =GraalPy 3.12.8 (Oracle GraalVM Native 25.1.0)= -- CPython: =Python 3.12.11= - -*** Harness Results - 
-Five-run summaries for the pinned =-r 1 -i 12 5000 text text mask 64= protocol: - -| runtime | median AVG (all runs) | mean AVG (all runs) | median tail-6 avg | mean tail-6 avg | median BEST | -|----------------------+------------------------+---------------------+-------------------+-----------------+-------------| -| native-ee standalone | 1.203 s | 1.234 s | 1.162 s | 1.190 s | 1.124 s | -| CPython | 0.244 s | 0.244 s | 0.242 s | 0.244 s | 0.236 s | - -Observed gap: -- native-ee standalone is about =4.93x= slower than CPython by median harness =AVG (all runs)= -- even on the trailing-6 steady-state cross-check, native-ee standalone is still about =4.80x= slower - -Raw per-run summaries: -- native-ee standalone: - - run 1: =AVG(all)= =1.217 s=, =BEST= =1.104 s=, tail-6 avg =1.134 s= - - run 2: =AVG(all)= =1.160 s=, =BEST= =1.105 s=, tail-6 avg =1.127 s= - - run 3: =AVG(all)= =1.195 s=, =BEST= =1.124 s=, tail-6 avg =1.164 s= - - run 4: =AVG(all)= =1.203 s=, =BEST= =1.129 s=, tail-6 avg =1.162 s= - - run 5: =AVG(all)= =1.394 s=, =BEST= =1.304 s=, tail-6 avg =1.361 s= -- CPython: - - run 1: =AVG(all)= =0.250 s=, =BEST= =0.236 s=, tail-6 avg =0.253 s= - - run 2: =AVG(all)= =0.240 s=, =BEST= =0.236 s=, tail-6 avg =0.239 s= - - run 3: =AVG(all)= =0.243 s=, =BEST= =0.237 s=, tail-6 avg =0.242 s= - - run 4: =AVG(all)= =0.244 s=, =BEST= =0.235 s=, tail-6 avg =0.242 s= - - run 5: =AVG(all)= =0.244 s=, =BEST= =0.239 s=, tail-6 avg =0.242 s= - -Interpretation: -- the current interpreter-mode gap is large and repeatable -- the outlier native run 5 moves the mean a bit, but not the overall conclusion -- the steady-state tail still shows a large gap, so this is not just cold startup - -*** Worker Breakdown - -Worker-only =gprofng= profile for native-ee standalone with =Compilation=false=: -- collected by driving =15000= text-mode requests into a worker-only - =gprofng collect app -O /tmp/jsonrpc-native-text-compfalse.er -F off -- ...= - launch of: - - 
=./mxbuild/linux-amd64/GRAALPY_NATIVE_STANDALONE/bin/graalpy= - - =graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py --worker --worker-io=text= -- with =GRAAL_PYTHON_VM_ARGS='--experimental-options --engine.Compilation=false'= - -Top native-ee standalone interpreter-mode functions/suspects: -- =PBytecodeDSLRootNodeGen$CachedBytecodeNode.continueAt=: =65.47%= inclusive CPU -- =SubstrateEnterpriseOptimizedCallTarget.invokeFromInterpreter=: =18.83%= inclusive CPU -- =OptimizedCallTarget.callBoundary=: =16.14%= inclusive CPU -- =CallDispatchersFactory$FunctionCachedCallNodeGen$Inlined.execute=: =11.21%= inclusive CPU -- allocation/GC shows up materially: - - =G1Library.allocateArray=: =9.87%= inclusive CPU - - =MemAllocator::allocate=: =9.87%= inclusive CPU - - =slowPathNewInstance=: =4.93%= inclusive CPU -- text-layer work is present but not dominant on its own: - - =TextIOWrapperNodesFactory$ReadChunkNodeGen$Inlined.execute=: =5.38%= inclusive CPU -- JSON/string work is also visible: - - =JSONUtils.appendString= - - =JSONEncoderBuiltins...AppendSimpleObject= - - =JSONScannerBuiltins...scanOnceUnicode= - - =PatternBuiltins...SubnInnerNode= -- raw syscall wrappers are not the main bucket in this profile: - - =write=: =9.42%= exclusive CPU - - =read=: =0.90%= exclusive CPU - -Worker-only CPython =cProfile= on the same text-mode worker shape (15000 requests): -- =TextIOWrapper.readline=: =0.780 s= cumulative inside a =1.658 s= worker profile -- =TextIOWrapper.flush=: =0.227 s= -- =write_message=: =0.419 s= cumulative -- =read_message=: =0.946 s= cumulative -- =json.dumps=: =0.159 s= -- =json.loads=: =0.143 s= -- =mask_row=: =0.206 s= - -Interpretation: -- native-ee standalone spends a large fraction above the syscall boundary in bytecode interpreter - dispatch, call boundaries, dynamic dispatch, and allocation/GC -- CPython still pays most of its visible worker cost in text I/O and JSON, but those hot paths stay - largely in optimized C 
implementations rather than showing a large interpreter-dispatch bucket -- therefore the native-ee interpreter-mode gap is not just "readline/flush are slower"; a broader - dispatch/allocation cost is visible in the worker profile - -*** Syscall Cross-Check - -One =strace -f -c= run per runtime, same pinned direct benchmark shape: -- native-ee standalone direct =text/text=, =5000= roundtrips: - - clean wall time without =strace=: =1.979 s= - - traced syscall mix: - - =futex=: =95.09%= traced syscall time, 852 calls - - =read=: 10200 calls - - =write=: 10003 calls -- CPython direct =text/text=, =5000= roundtrips: - - clean wall time without =strace=: =0.573 s= - - traced syscall mix: - - =wait4=: =78.22%= traced syscall time, 85 calls - - =write=: 10043 calls - - =read=: 10502 calls - - =futex=: only 44 calls, =0.05%= traced syscall time - -Important caveat: -- =strace= perturbs wall times substantially under WSL2, so use it only for syscall mix, not for - timing conclusions - -Interpretation: -- native-ee standalone and CPython issue roughly the same order of magnitude of =read= and =write= - syscalls for this workload -- the main gap therefore is not "native-ee does far more pipe syscalls" -- native-ee shows much heavier =futex= activity, which suggests extra runtime coordination/synchronization - on top of the same basic I/O pattern - -*** Isolation Experiment: Drop Text Wrappers Only - -Direct-mode check using the same benchmark, pinned to CPU 2, 3 runs each: - -| runtime/mode | median wall time | -|----------------------------+------------------| -| native-ee standalone text/text | 1.701 s | -| native-ee standalone buffer/buffer | 1.377 s | -| CPython text/text | 0.280 s | -| CPython buffer/buffer | 0.276 s | - -Interpretation: -- removing =TextIOWrapper= helps native-ee standalone by about =19%= in this interpreter-mode direct check -- CPython changes very little between =text/text= and =buffer/buffer= on this workload -- but native-ee standalone 
=buffer/buffer= is still about =4.99x= slower than CPython =buffer/buffer= -- therefore text I/O is a meaningful contributor, but it does not explain the full interpreter-mode gap - -*** Current Hypotheses - -Most likely contributors to the remaining interpreter-mode gap: -- bytecode interpreter dispatch / call-boundary overhead in the native standalone -- object allocation and GC churn in request decode/normalize/encode paths -- text read path cost still matters, especially =ReadChunkNode=, but it is only part of the total gap -- extra runtime synchronization (visible in =futex= activity) may be contributing to end-to-end time - -Most likely productive next targets: -- reduce allocation and dispatch churn in the short-request text/JSON path -- re-check =TextIOWrapperNodes.ReadlineNode= and =ReadChunkNode=, but do not assume that fixing them - alone will close the gap -- inspect why the interpreter-mode native standalone still stays ~5x behind CPython even in - =buffer/buffer= mode - -*** Experiment Log - -- Kept for protocol: - - switched from command-line-only flags to =GRAAL_PYTHON_VM_ARGS= for native-ee standalone - - reason: ensures the worker subprocess also runs with =Compilation=false= -- Tried as isolation only: - - direct =buffer/buffer= runs under =Compilation=false= - - result: useful diagnostic, but not a runtime change -- No runtime code micro-optimization was committed in this pass: - - the new measurements point to multiple cost centers - - better to keep the notes reproducible first, then patch one hotspot at a time - -** WSL2 Notes - -Environment: -- WSL2 kernel detected -- =perf= is present but kernel-matched tooling is not configured cleanly -- =gprofng= is installed - -Practical consequence: -- use worker-only async-profiler as primary guide -- use gprofng only as a coarse cross-check - -** Next Likely Target - -Most likely remaining productive area: -- =TextIOWrapperNodes.ReadlineNode= - -Rationale: -- lower buffered/file/posix stack has 
already been improved -- text-mode still pays in: - - =TextIOWrapperNodes$ReadlineNode.readline= - - =TextIOWrapperNodes$ReadChunkNode.readChunk= - -But: -- recent attempts show that naive local fast paths in =ReadChunkNode= are easy to get wrong -- likely next useful change must avoid growing node shape too much -- focus should be on reducing actual =TruffleString= / substring / concat churn in the - common short-line case, not just adding more conditionals -- the promising "delegate to buffered byte readline" idea is currently blocked by DSL wiring issues From e054fcb401ed4909b15d734a7f9822be3ecee2f9 Mon Sep 17 00:00:00 2001 From: Tim Felgentreff Date: Thu, 16 Apr 2026 16:17:18 +0200 Subject: [PATCH 7/7] [GR-74843] Preserve safe buffered io write fast path --- .../builtins/modules/io/BufferedIONodes.java | 4 +- .../modules/io/BufferedWriterNodes.java | 48 ++++++++++++++++++- .../python/builtins/modules/io/PBuffered.java | 4 +- .../modules/io/TextIOWrapperNodes.java | 2 +- 4 files changed, 51 insertions(+), 7 deletions(-) diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedIONodes.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedIONodes.java index c126f5f927..f6de4820a8 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedIONodes.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedIONodes.java @@ -131,12 +131,12 @@ static boolean isClosed(PBuffered self) { } @SuppressWarnings("unused") - @Specialization(guards = {"self.getBuffer() != null", "self.isFastClosedChecks()"}) + @Specialization(guards = {"self.getBuffer() != null", "self.hasFileIORaw()"}) static boolean isClosedFileIO(PBuffered self) { return self.getFileIORaw().isClosed(); } - @Specialization(guards = {"self.getBuffer() != null", "!self.isFastClosedChecks()"}) + @Specialization(guards = 
{"self.getBuffer() != null", "!self.hasFileIORaw()"}) static boolean isClosedBuffered(VirtualFrame frame, Node inliningTarget, PBuffered self, @Cached PyObjectGetAttr getAttr, @Cached PyObjectIsTrueNode isTrue) { diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedWriterNodes.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedWriterNodes.java index c273e58231..76944609fa 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedWriterNodes.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedWriterNodes.java @@ -48,11 +48,16 @@ import static com.oracle.graal.python.builtins.modules.io.IONodes.T_WRITE; import static com.oracle.graal.python.nodes.ErrorMessages.IO_S_INVALID_LENGTH; import static com.oracle.graal.python.nodes.ErrorMessages.WRITE_COULD_NOT_COMPLETE_WITHOUT_BLOCKING; +import static com.oracle.graal.python.nodes.ErrorMessages.IO_CLOSED; +import static com.oracle.graal.python.nodes.ErrorMessages.FILE_NOT_OPEN_FOR_S; +import static com.oracle.graal.python.runtime.exception.PythonErrorType.IOUnsupportedOperation; import static com.oracle.graal.python.runtime.exception.PythonErrorType.OSError; import static com.oracle.graal.python.runtime.exception.PythonErrorType.ValueError; import com.oracle.graal.python.PythonLanguage; +import com.oracle.graal.python.builtins.modules.PosixModuleBuiltins; import com.oracle.graal.python.builtins.objects.PNone; +import com.oracle.graal.python.builtins.objects.exception.OSErrorEnum; import com.oracle.graal.python.builtins.objects.buffer.PythonBufferAccessLibrary; import com.oracle.graal.python.builtins.objects.bytes.PBytes; import com.oracle.graal.python.builtins.objects.common.SequenceStorageNodes; @@ -61,8 +66,12 @@ import com.oracle.graal.python.lib.PyNumberAsSizeNode; import com.oracle.graal.python.lib.PyObjectCallMethodObjArgs; 
import com.oracle.graal.python.nodes.PNodeWithContext; +import com.oracle.graal.python.nodes.PConstructAndRaiseNode; import com.oracle.graal.python.nodes.PRaiseNode; import com.oracle.graal.python.nodes.object.BuiltinClassProfiles.IsBuiltinObjectProfile; +import com.oracle.graal.python.runtime.PosixSupportLibrary; +import com.oracle.graal.python.runtime.PosixSupportLibrary.PosixException; +import com.oracle.graal.python.runtime.PythonContext; import com.oracle.graal.python.runtime.exception.PException; import com.oracle.graal.python.runtime.object.PFactory; import com.oracle.graal.python.util.PythonUtils; @@ -72,8 +81,10 @@ import com.oracle.truffle.api.dsl.GenerateInline; import com.oracle.truffle.api.dsl.Specialization; import com.oracle.truffle.api.frame.VirtualFrame; +import com.oracle.truffle.api.profiles.InlinedBranchProfile; import com.oracle.truffle.api.library.CachedLibrary; import com.oracle.truffle.api.nodes.Node; +import com.oracle.graal.python.runtime.GilNode; public class BufferedWriterNodes { @@ -222,7 +233,40 @@ abstract static class RawWriteNode extends PNodeWithContext { /** * implementation of cpython/Modules/_io/bufferedio.c:_bufferedwriter_raw_write */ - @Specialization + @SuppressWarnings("truffle-sharing") + @Specialization(guards = "self.hasFileIORaw()") + static int bufferedwriterRawWriteFileIO(VirtualFrame frame, Node inliningTarget, PBuffered self, byte[] buf, int len, + @Bind PythonContext context, + @CachedLibrary("context.getPosixSupport()") PosixSupportLibrary posixLib, + @Cached InlinedBranchProfile errorProfile, + @Cached GilNode gil, + @Cached PConstructAndRaiseNode.Lazy constructAndRaiseNode, + @Cached PRaiseNode raiseNode) { + PFileIO fileIO = self.getFileIORaw(); + if (fileIO.isClosed()) { + throw raiseNode.raise(inliningTarget, ValueError, IO_CLOSED); + } + if (!fileIO.isWritable()) { + throw raiseNode.raise(inliningTarget, IOUnsupportedOperation, FILE_NOT_OPEN_FOR_S, "writing"); + } + final int n; + try { + n = 
Math.toIntExact(PosixModuleBuiltins.WriteNode.write(fileIO.getFD(), buf, len, + inliningTarget, posixLib, context.getPosixSupport(), errorProfile, gil)); + } catch (PosixException e) { + if (e.getErrorCode() == OSErrorEnum.EAGAIN.getNumber()) { + return -2; + } + throw constructAndRaiseNode.get(inliningTarget).raiseOSErrorFromPosixException(frame, e); + } + if (n > 0 && self.getAbsPos() != -1) { + self.incAbsPos(n); + } + return n; + } + + @SuppressWarnings("truffle-sharing") + @Specialization(guards = "!self.hasFileIORaw()") static int bufferedwriterRawWrite(VirtualFrame frame, Node inliningTarget, PBuffered self, byte[] buf, int len, @Bind PythonLanguage language, @Cached PyObjectCallMethodObjArgs callMethod, @@ -274,7 +318,7 @@ protected static void bufferedwriterFlushUnlocked(VirtualFrame frame, PBuffered while (self.getWritePos() < self.getWriteEnd()) { byte[] buf; int len; - if (self.getWritePos() == 0) { + if (self.hasFileIORaw() && self.getWritePos() == 0) { buf = self.getBuffer(); len = self.getWriteEnd(); } else { diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PBuffered.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PBuffered.java index bbcf567d5c..bf71cb80ba 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PBuffered.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PBuffered.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -158,7 +158,7 @@ public void setFinalizing(boolean finalizing) { this.finalizing = finalizing; } - public boolean isFastClosedChecks() { + public boolean hasFileIORaw() { return fileioRaw != null; } diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/TextIOWrapperNodes.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/TextIOWrapperNodes.java index f2afa5cb9d..71d9832ae4 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/TextIOWrapperNodes.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/TextIOWrapperNodes.java @@ -884,7 +884,7 @@ static void init(VirtualFrame frame, Node inliningTarget, PTextIO self, Object b if (buffer instanceof PBuffered) { /* Cache the raw FileIO object to speed up 'closed' checks */ - if (((PBuffered) buffer).isFastClosedChecks()) { + if (((PBuffered) buffer).hasFileIORaw()) { PFileIO f = ((PBuffered) buffer).getFileIORaw(); self.setFileIO(f); }