From 2bf28bf4e102a8aa1a8cfdd2cacd947e231797b9 Mon Sep 17 00:00:00 2001 From: Tim Felgentreff Date: Fri, 10 Apr 2026 10:33:46 +0200 Subject: [PATCH 1/7] Add jsonrpc pipe microbenchmark to mx harness --- .../python/micro/jsonrpc-pipe.py | 431 ++++++++++++++++++ mx.graalpython/mx_graalpython_bench_param.py | 2 + 2 files changed, 433 insertions(+) create mode 100644 graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py diff --git a/graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py b/graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py new file mode 100644 index 0000000000..d104a3ad00 --- /dev/null +++ b/graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py @@ -0,0 +1,431 @@ +# Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. +# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +# +# The Universal Permissive License (UPL), Version 1.0 +# +# Subject to the condition set forth below, permission is hereby granted to any +# person obtaining a copy of this software, associated documentation and/or +# data (collectively the "Software"), free of charge and under any and all +# copyright rights in the Software, and any and all patent rights owned or +# freely licensable by each licensor hereunder covering either (i) the +# unmodified Software as contributed to or provided by such licensor, or (ii) +# the Larger Works (as defined below), to deal in both +# +# (a) the Software, and +# +# (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if +# one is included with the Software each a "Larger Work" to which the Software +# is contributed by such licensors), +# +# without restriction, including without limitation the rights to copy, create +# derivative works of, display, perform, and distribute the Software and make, +# use, sell, offer for sale, import, export, have made, and have sold the +# Software and the Larger Work(s), and to 
sublicense the foregoing rights on +# either these or other terms. +# +# This license is subject to the following condition: +# +# The above copyright notice and either this complete permission notice or at a +# minimum a reference to the UPL must be included in all copies or substantial +# portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import annotations + +import argparse +import io +import json +import os +import re +import subprocess +import sys +import time + + +EMAIL_RE = re.compile(r"\s+") +NON_DIGIT_RE = re.compile(r"\D+") +_STATE = None + + +class Endpoint: + def __init__(self, mode, reader, writer, closeables=()): + self.mode = mode + self.reader = reader + self.writer = writer + self.closeables = closeables + + def write_message(self, message): + line = json.dumps(message, separators=(",", ":")) + if self.mode == "text": + self.writer.write(line) + self.writer.write("\n") + self.writer.flush() + return + payload = (line + "\n").encode("utf-8") + if self.mode == "buffer": + self.writer.write(payload) + self.writer.flush() + return + write_all(self.writer, payload) + + def read_message(self): + if self.mode == "text": + line = self.reader.readline() + else: + data = self.reader.readline() + line = data.decode("utf-8") if data else "" + if not line: + raise EOFError("unexpected EOF while reading line") + return json.loads(line) + + def close(self): + streams = self.closeables if self.closeables else (self.reader, self.writer) + for stream in streams: + if hasattr(stream, "close"): + 
try: + stream.close() + except OSError: + pass + + +class FDLineReader: + def __init__(self, fd): + self.fd = fd + self.pending = bytearray() + + def readline(self): + while True: + newline = self.pending.find(b"\n") + if newline >= 0: + line = bytes(self.pending[: newline + 1]) + del self.pending[: newline + 1] + return line + chunk = os.read(self.fd, 4096) + if not chunk: + if not self.pending: + return b"" + line = bytes(self.pending) + self.pending.clear() + return line + self.pending.extend(chunk) + + +class State: + def __init__(self, roundtrips, client_io, worker_io, workload, payload_bytes, batch_size): + self.roundtrips = roundtrips + self.client_io = client_io + self.worker_io = worker_io + self.workload = workload + self.payload_bytes = payload_bytes + self.batch_size = batch_size + self.next_request_id = 1 + self.process = None + self.endpoint = None + + +def write_all(fd, data): + view = memoryview(data) + while view: + written = os.write(fd, view) + view = view[written:] + + +def create_text_endpoint(read_raw, write_raw): + reader_buffer = io.BufferedReader(read_raw, buffer_size=8192) + writer_buffer = io.BufferedWriter(write_raw, buffer_size=8192) + reader = io.TextIOWrapper(reader_buffer, encoding="utf-8", newline=None) + writer = io.TextIOWrapper(writer_buffer, encoding="utf-8", newline="\n", line_buffering=False, write_through=False) + return Endpoint("text", reader, writer) + + +def create_buffer_endpoint(read_raw, write_raw): + reader = io.BufferedReader(read_raw, buffer_size=8192) + writer = io.BufferedWriter(write_raw, buffer_size=8192) + return Endpoint("buffer", reader, writer) + + +def create_fd_endpoint(read_fd, write_fd, closeables=()): + return Endpoint("fd", FDLineReader(read_fd), write_fd, closeables) + + +def create_parent_endpoint(process, mode): + if mode == "text": + return create_text_endpoint(process.stdout, process.stdin) + if mode == "buffer": + return create_buffer_endpoint(process.stdout, process.stdin) + return 
create_fd_endpoint(process.stdout.fileno(), process.stdin.fileno(), (process.stdout, process.stdin)) + + +def create_worker_endpoint(mode): + if mode == "text": + return Endpoint("text", sys.stdin, sys.stdout) + if mode == "buffer": + return Endpoint("buffer", sys.stdin.buffer, sys.stdout.buffer) + return create_fd_endpoint(0, 1) + + +def normalize_email(value): + return EMAIL_RE.sub("", value.strip().lower()) + + +def normalize_phone(value): + digits = NON_DIGIT_RE.sub("", value) + if digits.startswith("00"): + digits = digits[2:] + return digits + + +def mask_email(value): + name, _, domain = value.partition("@") + if not domain: + return "***" + return "%s***@%s" % (name[:1], domain) + + +def mask_phone(value): + if len(value) <= 4: + return "*" * len(value) + return "*" * (len(value) - 4) + value[-4:] + + +def mask_row(row): + email = normalize_email(str(row.get("email", ""))) + phone = normalize_phone(str(row.get("phone", ""))) + return { + "email_normalized": email, + "phone_normalized": phone, + "email_masked": mask_email(email) if email else None, + "phone_masked": mask_phone(phone) if phone else None, + "region": str(row.get("region", "")).upper(), + "source": str(row.get("source", "")).lower(), + } + + +def make_echo_payload(payload_bytes): + if payload_bytes <= 0: + return "" + unit = "payload-" + return (unit * ((payload_bytes // len(unit)) + 1))[:payload_bytes] + + +def make_mask_row(index, payload_bytes): + suffix = make_echo_payload(max(payload_bytes, 8)) + return { + "email": " User%s.%s@Example.COM " % (index, suffix), + "phone": "+49 (170) %04d-%s" % (index, suffix[:8]), + "region": "eu", + "source": "microbench", + } + + +def build_request(kind, request_id, payload_bytes, batch_size): + if kind == "health": + method = "health" + params = {} + elif kind == "echo": + method = "echo" + params = {"payload": make_echo_payload(payload_bytes)} + elif kind == "mask": + method = "mask" + params = make_mask_row(request_id, payload_bytes) + elif kind == 
"mask_batch": + method = "mask_batch" + params = {"rows": [make_mask_row(request_id + i, payload_bytes) for i in range(batch_size)]} + else: + raise AssertionError("unsupported request kind: %s" % kind) + return {"jsonrpc": "2.0", "id": request_id, "method": method, "params": params} + + +def handle_request(message): + request_id = message.get("id") + method = message.get("method") + params = message.get("params", {}) + if method == "health": + result = {"ok": True, "worker": "jsonrpc-pipe", "protocol": "json-rpc-2.0-ndjson"} + elif method == "echo": + payload = str(dict(params).get("payload", "")) + result = {"ok": True, "echo": payload, "size": len(payload)} + elif method == "mask": + result = {"ok": True, "normalized": mask_row(dict(params))} + elif method == "mask_batch": + rows = [mask_row(dict(row)) for row in dict(params).get("rows", [])] + result = {"ok": True, "normalized": rows, "count": len(rows)} + else: + return {"jsonrpc": "2.0", "id": request_id, "error": {"code": -32601, "message": "method not found"}} + return {"jsonrpc": "2.0", "id": request_id, "result": result} + + +def validate_response(request, response, kind, payload_bytes, batch_size): + if response.get("id") != request["id"]: + raise AssertionError("mismatched response id") + if "error" in response: + raise AssertionError("worker returned error: %s" % (response["error"],)) + result = response.get("result") + if not isinstance(result, dict) or not result.get("ok"): + raise AssertionError("unexpected response payload: %s" % (response,)) + if kind == "echo": + if result.get("echo") != make_echo_payload(payload_bytes): + raise AssertionError("echo payload mismatch") + elif kind == "mask": + expected = mask_row(make_mask_row(int(request["id"]), payload_bytes)) + if result.get("normalized") != expected: + raise AssertionError("mask result mismatch") + elif kind == "mask_batch": + expected_rows = [mask_row(make_mask_row(int(request["id"]) + i, payload_bytes)) for i in range(batch_size)] + if 
result.get("count") != batch_size or result.get("normalized") != expected_rows: + raise AssertionError("mask_batch result mismatch") + + +def run_roundtrips(state): + completed = 0 + for _ in range(state.roundtrips): + request = build_request(state.workload, state.next_request_id, state.payload_bytes, state.batch_size) + state.next_request_id += 1 + state.endpoint.write_message(request) + response = state.endpoint.read_message() + validate_response(request, response, state.workload, state.payload_bytes, state.batch_size) + completed += 1 + return completed + + +def parse_int(value): + if isinstance(value, int): + return value + return int(str(value).replace("_", "")) + + +def __process_args__(roundtrips=500, client_io="text", worker_io="text", workload="mask", payload_bytes=64, batch_size=8): + return [ + parse_int(roundtrips), + str(client_io), + str(worker_io), + str(workload), + parse_int(payload_bytes), + parse_int(batch_size), + ] + + +def __setup__(roundtrips=500, client_io="text", worker_io="text", workload="mask", payload_bytes=64, batch_size=8): + global _STATE + __teardown__() + state = State(roundtrips, client_io, worker_io, workload, payload_bytes, batch_size) + command = [ + sys.executable, + __file__, + "--worker", + "--worker-io=%s" % worker_io, + ] + process = subprocess.Popen( + command, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + bufsize=0, + ) + state.process = process + state.endpoint = create_parent_endpoint(process, client_io) + _STATE = state + + +def __benchmark__(roundtrips=500, client_io="text", worker_io="text", workload="mask", payload_bytes=64, batch_size=8): + if _STATE is None: + __setup__(roundtrips, client_io, worker_io, workload, payload_bytes, batch_size) + return run_roundtrips(_STATE) + + +def __teardown__(): + global _STATE + state = _STATE + _STATE = None + if state is None: + return + try: + if state.endpoint is not None: + state.endpoint.close() + finally: + if state.process is not None: + 
stderr = b"" + try: + stderr = state.process.stderr.read() if state.process.stderr is not None else b"" + except OSError: + pass + return_code = state.process.wait() + if return_code != 0: + raise RuntimeError("worker exited with status %d: %s" % (return_code, stderr.decode("utf-8", errors="replace"))) + + +def run_worker(worker_io): + endpoint = create_worker_endpoint(worker_io) + try: + while True: + try: + request = endpoint.read_message() + except EOFError: + return 0 + endpoint.write_message(handle_request(request)) + finally: + endpoint.close() + + +def run_direct(roundtrips, client_io, worker_io, workload, payload_bytes, batch_size): + start = time.perf_counter() + __setup__(roundtrips, client_io, worker_io, workload, payload_bytes, batch_size) + try: + completed = __benchmark__(roundtrips, client_io, worker_io, workload, payload_bytes, batch_size) + finally: + __teardown__() + wall = time.perf_counter() - start + print("roundtrips=%d" % completed) + print("wall_s=%s" % wall) + print("throughput_ops_s=%s" % (completed / wall if wall else 0.0)) + return 0 + + +def main(argv=None): + parser = argparse.ArgumentParser(description="Strict JSON-RPC-like pipe roundtrip microbenchmark.") + parser.add_argument("--worker", action="store_true") + parser.add_argument("--worker-io", choices=("text", "buffer", "fd"), default="text") + parser.add_argument("--roundtrips", type=parse_int, default=500) + parser.add_argument("--client-io", choices=("text", "buffer", "fd"), default="text") + parser.add_argument("--workload", choices=("health", "echo", "mask", "mask_batch"), default="mask") + parser.add_argument("--payload-bytes", type=parse_int, default=64) + parser.add_argument("--batch-size", type=parse_int, default=8) + args = parser.parse_args(argv) + if args.worker: + return run_worker(args.worker_io) + return run_direct(args.roundtrips, args.client_io, args.worker_io, args.workload, args.payload_bytes, args.batch_size) + + +def run(): + __setup__() + try: + 
__benchmark__() + finally: + __teardown__() + + +def warmupIterations(): + return 5 + + +def iterations(): + return 10 + + +def summary(): + return { + "name": "OutlierRemovalAverageSummary", + "lower-threshold": 0, + "upper-threshold": 0.3, + } + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/mx.graalpython/mx_graalpython_bench_param.py b/mx.graalpython/mx_graalpython_bench_param.py index 78d5bf06c6..fe9eb3ef0e 100644 --- a/mx.graalpython/mx_graalpython_bench_param.py +++ b/mx.graalpython/mx_graalpython_bench_param.py @@ -121,6 +121,7 @@ 'virtualize-in-try-catch-oom': ITER_10, 'phase_shift_warmup_baseline': ITER_5 + ['--self-measurement'] + ['500'], 'phase_shift_warmup': ITER_3 + ['--self-measurement'] + ['1600', '500'], + 'jsonrpc-pipe': ITER_10 + ['500', 'text', 'text', 'mask', '64'], 'startup': ITER_5 + ['50'], 'startup-imports': ITER_5 + ['20'], } @@ -130,6 +131,7 @@ 'nano-arith': ITER_6 + WARMUP_2, 'nano-loop': ITER_6 + WARMUP_2, 'nano-if': ITER_6 + WARMUP_2, + 'jsonrpc-pipe': ITER_6 + WARMUP_2 + ['100', 'text', 'text', 'mask', '64'], 'arith-modulo-sized': ITER_6 + WARMUP_2 + ['1'], 'if-generic': ITER_10 + WARMUP_2 + ['500000'], 'if-generic-non-builtin': ITER_10 + WARMUP_2 + ['500000'], From 0c27038cc5fe4dc7a14ad5259adc23b952c0c517 Mon Sep 17 00:00:00 2001 From: Tim Felgentreff Date: Fri, 10 Apr 2026 10:42:24 +0200 Subject: [PATCH 2/7] Add notes on building standalones for agents --- AGENTS.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/AGENTS.md b/AGENTS.md index 5c385a78db..01d3c96e8c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -68,6 +68,10 @@ It consists of: Java (Truffle) + C (CPython C-API compatibility) + Python stdlib * Style / formatting `mx python-style --fix` `mx python-gate --tags style` +* Building standalones for benchmarking + - use `mx --env native-ee sforceimports && mx --env native-ee checkout-downstream compiler graal-enterprise` to get the right revisions + - use `mx -p ../graal/vm fetch-jdk -jdk-id 
labsjdk-ce-latest` and set JAVA_HOME as per that command's output + - use `mx --env jvm-ee-libgraal` and `mx --env native-ee` to build the JAVA and NATIVE standalone distributions ## NOTES - When searching for implementation, prefer `graalpython/com.oracle.graal.python/src/...` over vendored `lib-python` unless you are intentionally modifying upstream stdlib/tests. From 13811b1a996b9be5cdfda464b667d4c4bf173c68 Mon Sep 17 00:00:00 2001 From: Tim Felgentreff Date: Fri, 10 Apr 2026 11:51:06 +0200 Subject: [PATCH 3/7] Reduce tiny flush copying in _io Cuts copies in TextIOWrapper pending-byte flushes and avoids a buffer slice copy in BufferedWriter flush when writePos is 0. On the jsonrpc-pipe microbenchmark (jvm-ee, mx benchmark micro:jsonrpc-pipe --tracker none -- --python-vm=graalpython --python-vm-config=default --), AVG (no warmup) went from 0.188 s before these changes to 0.089 s after them. --- .../modules/io/BufferedWriterNodes.java | 14 ++++- .../python/builtins/modules/io/PTextIO.java | 16 ++--- .../modules/io/PendingBytesOutputStream.java | 62 +++++++++++++++++++ .../modules/io/TextIOWrapperNodes.java | 6 +- 4 files changed, 82 insertions(+), 16 deletions(-) create mode 100644 graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PendingBytesOutputStream.java diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedWriterNodes.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedWriterNodes.java index 5c7a52b178..c273e58231 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedWriterNodes.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedWriterNodes.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved. 
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -272,8 +272,16 @@ protected static void bufferedwriterFlushUnlocked(VirtualFrame frame, PBuffered self.incRawPos(-rewind); } while (self.getWritePos() < self.getWriteEnd()) { - byte[] buf = PythonUtils.arrayCopyOfRange(self.getBuffer(), self.getWritePos(), self.getWriteEnd()); - int n = rawWriteNode.execute(frame, inliningTarget, self, buf, buf.length); + byte[] buf; + int len; + if (self.getWritePos() == 0) { + buf = self.getBuffer(); + len = self.getWriteEnd(); + } else { + buf = PythonUtils.arrayCopyOfRange(self.getBuffer(), self.getWritePos(), self.getWriteEnd()); + len = buf.length; + } + int n = rawWriteNode.execute(frame, inliningTarget, self, buf, len); if (n == -2) { throw raiseBlockingIOError.get(inliningTarget).raiseEAGAIN(WRITE_COULD_NOT_COMPLETE_WITHOUT_BLOCKING, 0); } diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PTextIO.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PTextIO.java index 200ff15b56..155ac163ad 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PTextIO.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PTextIO.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2021, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -41,13 +41,9 @@ package com.oracle.graal.python.builtins.modules.io; import static com.oracle.graal.python.builtins.objects.bytes.BytesUtils.append; -import static com.oracle.graal.python.builtins.objects.bytes.BytesUtils.createOutputStream; -import static com.oracle.graal.python.builtins.objects.bytes.BytesUtils.toByteArray; import static com.oracle.graal.python.nodes.StringLiterals.T_EMPTY_STRING; import static com.oracle.graal.python.util.PythonUtils.TS_ENCODING; -import java.io.ByteArrayOutputStream; - import com.oracle.graal.python.builtins.objects.ints.IntBuiltins; import com.oracle.graal.python.builtins.objects.ints.IntNodes; import com.oracle.graal.python.builtins.objects.ints.PInt; @@ -93,7 +89,7 @@ public final class PTextIO extends PTextIOBase { private int decodedCharsUsed; /* offset (in code points) into _decoded_chars for read() */ private int decodedCharsLen; /* code point length of decodedChars */ - private ByteArrayOutputStream pendingBytes; // data waiting to be written. + private PendingBytesOutputStream pendingBytes; // data waiting to be written. 
/* * snapshot is either NULL, or a tuple (dec_flags, next_input) where dec_flags is the second @@ -112,7 +108,7 @@ public final class PTextIO extends PTextIOBase { public PTextIO(Object cls, Shape instanceShape) { super(cls, instanceShape); - pendingBytes = createOutputStream(); + pendingBytes = new PendingBytesOutputStream(); } @Override @@ -324,11 +320,11 @@ TruffleString consumeAllDecodedChars(TruffleString.SubstringNode substringNode, } public void clearPendingBytes() { - pendingBytes = createOutputStream(); + pendingBytes = new PendingBytesOutputStream(); } - public byte[] getAndClearPendingBytes() { - byte[] b = toByteArray(pendingBytes); + public PendingBytesOutputStream getAndClearPendingBytes() { + PendingBytesOutputStream b = pendingBytes; clearPendingBytes(); return b; } diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PendingBytesOutputStream.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PendingBytesOutputStream.java new file mode 100644 index 0000000000..809711b9f3 --- /dev/null +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PendingBytesOutputStream.java @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * The Universal Permissive License (UPL), Version 1.0 + * + * Subject to the condition set forth below, permission is hereby granted to any + * person obtaining a copy of this software, associated documentation and/or + * data (collectively the "Software"), free of charge and under any and all + * copyright rights in the Software, and any and all patent rights owned or + * freely licensable by each licensor hereunder covering either (i) the + * unmodified Software as contributed to or provided by such licensor, or (ii) + * the Larger Works (as defined below), to deal in both + * + * (a) the Software, and + * + * (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if + * one is included with the Software each a "Larger Work" to which the Software + * is contributed by such licensors), + * + * without restriction, including without limitation the rights to copy, create + * derivative works of, display, perform, and distribute the Software and make, + * use, sell, offer for sale, import, export, have made, and have sold the + * Software and the Larger Work(s), and to sublicense the foregoing rights on + * either these or other terms. + * + * This license is subject to the following condition: + * + * The above copyright notice and either this complete permission notice or at a + * minimum a reference to the UPL must be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +package com.oracle.graal.python.builtins.modules.io; + +import java.io.ByteArrayOutputStream; + +final class PendingBytesOutputStream extends ByteArrayOutputStream { + + PendingBytesOutputStream() { + super(); + } + + PendingBytesOutputStream(int size) { + super(size); + } + + byte[] getBuffer() { + return buf; + } + + int getCount() { + return count; + } +} diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/TextIOWrapperNodes.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/TextIOWrapperNodes.java index 5eb40a3447..f2afa5cb9d 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/TextIOWrapperNodes.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/TextIOWrapperNodes.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2021, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -207,8 +207,8 @@ static void nothingTodo(@SuppressWarnings("unused") PTextIO self) { static void writeflush(VirtualFrame frame, Node inliningTarget, PTextIO self, @Bind PythonLanguage language, @Cached PyObjectCallMethodObjArgs callMethod) { - byte[] pending = self.getAndClearPendingBytes(); - PBytes b = PFactory.createBytes(language, pending); + PendingBytesOutputStream pending = self.getAndClearPendingBytes(); + PBytes b = PFactory.createBytes(language, pending.getBuffer(), pending.getCount()); callMethod.execute(frame, inliningTarget, self.getBuffer(), T_WRITE, b); // TODO: check _PyIO_trap_eintr } From 72a791bd10f9949412b566ad4901ff9101179107 Mon Sep 17 00:00:00 2001 From: Tim Felgentreff Date: Fri, 10 Apr 2026 14:05:57 +0200 Subject: [PATCH 4/7] Add lower-level readInto fast path for buffered reads Adds a PosixSupportLibrary readInto primitive and uses it for BufferedReader fill-buffer refills when the raw object is cached PFileIO and the refill starts at offset 0. On the jsonrpc-pipe microbenchmark with a heavier repeated-run protocol (graalpy harness.py micro/jsonrpc-pipe.py -i 12 5000 text text mask 64), the current baseline had median AVG (no warmup) 0.551 s and mean 0.537 s across 5 runs. This change measured median 0.476 s and mean 0.503 s across 5 runs. 
--- .../io/BufferedReaderMixinBuiltins.java | 80 +++++++++++++++++-- .../python/runtime/EmulatedPosixSupport.java | 25 ++++++ .../python/runtime/LoggingPosixSupport.java | 11 +++ .../graal/python/runtime/NFIPosixSupport.java | 11 +++ .../python/runtime/PosixSupportLibrary.java | 2 + .../python/runtime/PreInitPosixSupport.java | 9 +++ 6 files changed, 131 insertions(+), 7 deletions(-) diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedReaderMixinBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedReaderMixinBuiltins.java index 8f5fb49222..101ca493d0 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedReaderMixinBuiltins.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedReaderMixinBuiltins.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2021, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -82,6 +82,7 @@ import com.oracle.graal.python.builtins.modules.io.BufferedIONodesFactory.CheckIsClosedNodeGen; import com.oracle.graal.python.builtins.objects.PNone; import com.oracle.graal.python.builtins.objects.buffer.PythonBufferAccessLibrary; +import com.oracle.graal.python.builtins.objects.exception.OSErrorEnum; import com.oracle.graal.python.builtins.objects.bytes.BytesNodes; import com.oracle.graal.python.builtins.objects.bytes.PByteArray; import com.oracle.graal.python.builtins.objects.bytes.PBytes; @@ -97,7 +98,12 @@ import com.oracle.graal.python.nodes.function.PythonBuiltinBaseNode; import com.oracle.graal.python.nodes.function.builtins.clinic.ArgumentClinicProvider; import com.oracle.graal.python.nodes.object.GetClassNode; +import com.oracle.graal.python.nodes.PConstructAndRaiseNode; +import com.oracle.graal.python.runtime.GilNode; import com.oracle.graal.python.runtime.IndirectCallData.InteropCallData; +import com.oracle.graal.python.runtime.PosixSupport; +import com.oracle.graal.python.runtime.PosixSupportLibrary; +import com.oracle.graal.python.runtime.PythonContext; import com.oracle.graal.python.runtime.exception.PException; import com.oracle.graal.python.runtime.object.PFactory; import com.oracle.graal.python.util.PythonUtils; @@ -114,6 +120,7 @@ import com.oracle.truffle.api.frame.VirtualFrame; import com.oracle.truffle.api.library.CachedLibrary; import com.oracle.truffle.api.nodes.Node; +import com.oracle.truffle.api.profiles.InlinedBranchProfile; import com.oracle.truffle.api.profiles.InlinedConditionProfile; import com.oracle.truffle.api.strings.TruffleString; @@ -190,7 +197,8 @@ abstract static class FillBufferNode extends PNodeWithContext { @Specialization static int bufferedreaderFillBuffer(VirtualFrame frame, Node inliningTarget, PBuffered self, - @Cached RawReadNode rawReadNode) { + @Cached RawReadNode rawReadNode, + @Cached RawReadIntoBufferNode rawReadIntoBufferNode) { int 
start; if (isValidReadBuffer(self)) { start = self.getReadEnd(); @@ -198,21 +206,79 @@ static int bufferedreaderFillBuffer(VirtualFrame frame, Node inliningTarget, PBu start = 0; } int len = self.getBufferSize() - start; - byte[] fill = rawReadNode.execute(frame, inliningTarget, self, len); - if (fill == BLOCKED) { - return -2; + int n; + if (start == 0 && self.isFastClosedChecks()) { + n = rawReadIntoBufferNode.execute(frame, inliningTarget, self.getFileIORaw(), self.getBuffer(), len); + if (n == -2) { + return -2; + } + } else { + byte[] fill = rawReadNode.execute(frame, inliningTarget, self, len); + if (fill == BLOCKED) { + return -2; + } + n = fill.length; + if (n > 0) { + PythonUtils.arraycopy(fill, 0, self.getBuffer(), start, n); + } } - int n = fill.length; if (n == 0) { return n; } - PythonUtils.arraycopy(fill, 0, self.getBuffer(), start, n); self.setReadEnd(start + n); self.setRawPos(start + n); return n; } } + @GenerateInline + @GenerateCached(false) + abstract static class RawReadIntoBufferNode extends PNodeWithContext { + + public abstract int execute(VirtualFrame frame, Node inliningTarget, PFileIO raw, byte[] buffer, int len); + + @Specialization + static int readIntoBuffer(VirtualFrame frame, Node inliningTarget, PFileIO raw, byte[] buffer, int len, + @Bind PythonContext context, + @CachedLibrary("context.getPosixSupport()") PosixSupportLibrary posixLib, + @Cached InlinedBranchProfile readErrorProfile, + @Cached InlinedBranchProfile readErrorProfile2, + @Cached GilNode gil, + @Cached PConstructAndRaiseNode.Lazy constructAndRaiseNode) { + try { + return readInto(raw.getFD(), buffer, len, inliningTarget, posixLib, context.getPosixSupport(), readErrorProfile, gil); + } catch (PosixSupportLibrary.PosixException e) { + if (e.getErrorCode() == OSErrorEnum.EAGAIN.getNumber()) { + readErrorProfile2.enter(inliningTarget); + return -2; + } + throw constructAndRaiseNode.get(inliningTarget).raiseOSErrorFromPosixException(frame, e); + } + } + + private static int 
readInto(int fd, byte[] buffer, int len, + Node inliningTarget, PosixSupportLibrary posixLib, PosixSupport posixSupport, + InlinedBranchProfile errorProfile, GilNode gil) throws PosixSupportLibrary.PosixException { + gil.release(true); + try { + while (true) { + try { + return (int) posixLib.readInto(posixSupport, fd, new PosixSupportLibrary.Buffer(buffer, len)); + } catch (PosixSupportLibrary.PosixException e) { + errorProfile.enter(inliningTarget); + if (e.getErrorCode() == OSErrorEnum.EINTR.getNumber()) { + PythonContext.triggerAsyncActions(inliningTarget); + } else { + throw e; + } + } + } + } finally { + gil.acquire(); + } + } + } + @Builtin(name = J_READABLE, minNumOfPositionalArgs = 1) @GenerateNodeFactory abstract static class ReadableNode extends PythonUnaryWithInitErrorBuiltinNode { diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/EmulatedPosixSupport.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/EmulatedPosixSupport.java index 88ec3a30f9..af6c851da1 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/EmulatedPosixSupport.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/EmulatedPosixSupport.java @@ -526,6 +526,31 @@ public Buffer read(int fd, long length, } } + @ExportMessage + @SuppressWarnings({"unused", "static-method"}) + public long readInto(int fd, Buffer data, + @Bind Node inliningTarget, + @Shared("errorBranch") @Cached InlinedBranchProfile errorBranch, + @Shared("eq") @Cached TruffleString.EqualNode eqNode) throws PosixException { + Channel channel = getFileChannel(fd); + if (!(channel instanceof ReadableByteChannel readableChannel)) { + errorBranch.enter(inliningTarget); + throw posixException(OSErrorEnum.EBADF); + } + try { + int n = doReadIntoChannel(readableChannel, data.data, (int) data.length); + return n < 0 ? 
0 : n; + } catch (Exception e) { + errorBranch.enter(inliningTarget); + throw posixException(OSErrorEnum.fromException(e, eqNode)); + } + } + + @TruffleBoundary(allowInlining = true) + private static int doReadIntoChannel(ReadableByteChannel channel, byte[] data, int length) throws IOException { + return channel.read(ByteBuffer.wrap(data, 0, length)); + } + @TruffleBoundary private static Buffer readBytesFromChannel(ReadableByteChannel channel, long sizeIn) throws IOException { long size = sizeIn; diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/LoggingPosixSupport.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/LoggingPosixSupport.java index e7371fadb9..7172f5b5ce 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/LoggingPosixSupport.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/LoggingPosixSupport.java @@ -184,6 +184,17 @@ final Buffer read(int fd, long length, } } + @ExportMessage + final long readInto(int fd, Buffer data, + @CachedLibrary("this.delegate") PosixSupportLibrary lib) throws PosixException { + logEnter("readInto", "%d, %d", fd, data.length); + try { + return logExit("readInto", "%d", lib.readInto(delegate, fd, data)); + } catch (PosixException e) { + throw logException("readInto", e); + } + } + @ExportMessage final long write(int fd, Buffer data, @CachedLibrary("this.delegate") PosixSupportLibrary lib) throws PosixException { diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java index 1bf99341f3..9c343d9c34 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java @@ -565,6 +565,17 @@ public Buffer read(int fd, long length, 
return buffer.withLength(n); } + @ExportMessage + public long readInto(int fd, Buffer data, + @Shared("invoke") @Cached InvokeNativeFunction invokeNode) throws PosixException { + setErrno(invokeNode, 0); + long n = invokeNode.callLong(this, PosixNativeFunction.call_read, fd, data.data, data.length); + if (n < 0) { + throw getErrnoAndThrowPosixException(invokeNode); + } + return n; + } + @ExportMessage public long write(int fd, Buffer data, @Shared("invoke") @Cached InvokeNativeFunction invokeNode) throws PosixException { diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PosixSupportLibrary.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PosixSupportLibrary.java index 5ad6aaac3e..351f8d601c 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PosixSupportLibrary.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PosixSupportLibrary.java @@ -93,6 +93,8 @@ public abstract class PosixSupportLibrary extends Library { public abstract Buffer read(Object receiver, int fd, long length) throws PosixException; + public abstract long readInto(Object receiver, int fd, Buffer data) throws PosixException; + public abstract long write(Object receiver, int fd, Buffer data) throws PosixException; public abstract int dup(Object receiver, int fd) throws PosixException; diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PreInitPosixSupport.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PreInitPosixSupport.java index 9b36dc9fd0..26916c89df 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PreInitPosixSupport.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PreInitPosixSupport.java @@ -203,6 +203,15 @@ final Buffer read(int fd, long length, return nativeLib.read(nativePosixSupport, fd, length); } + @ExportMessage + final long 
readInto(int fd, Buffer data, + @CachedLibrary("this.nativePosixSupport") PosixSupportLibrary nativeLib) throws PosixException { + if (inPreInitialization) { + return PosixSupportLibrary.getUncached().readInto(emulatedPosixSupport, fd, data); + } + return nativeLib.readInto(nativePosixSupport, fd, data); + } + @ExportMessage final long write(int fd, Buffer data, @CachedLibrary("this.nativePosixSupport") PosixSupportLibrary nativeLib) throws PosixException { From 565c1c6c9b7f97291d89455605ea40aee0aef317 Mon Sep 17 00:00:00 2001 From: Tim Felgentreff Date: Fri, 10 Apr 2026 15:52:46 +0200 Subject: [PATCH 5/7] Add io perf investigation notes and scripts --- .../python/micro/jsonrpc-pipe.py | 14 +- investigations/io_perf/notes.org | 525 ++++++++++++++++++ scripts/profile-jsonrpc-pipe-async-buffer.sh | 51 ++ scripts/profile-jsonrpc-pipe-async-text.sh | 51 ++ .../profile-jsonrpc-pipe-gprofng-buffer.sh | 51 ++ scripts/profile-jsonrpc-pipe-gprofng-text.sh | 51 ++ scripts/profile_jsonrpc_pipe_worker.py | 202 +++++++ 7 files changed, 944 insertions(+), 1 deletion(-) create mode 100644 investigations/io_perf/notes.org create mode 100755 scripts/profile-jsonrpc-pipe-async-buffer.sh create mode 100755 scripts/profile-jsonrpc-pipe-async-text.sh create mode 100755 scripts/profile-jsonrpc-pipe-gprofng-buffer.sh create mode 100755 scripts/profile-jsonrpc-pipe-gprofng-text.sh create mode 100755 scripts/profile_jsonrpc_pipe_worker.py diff --git a/graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py b/graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py index d104a3ad00..83ee8cfd06 100644 --- a/graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py +++ b/graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py @@ -301,6 +301,18 @@ def parse_int(value): return int(str(value).replace("_", "")) +def get_subprocess_launcher_args(): + orig_argv = getattr(sys, "orig_argv", None) + if not orig_argv: + return 
[sys.executable] + launcher_args = [sys.executable] + for arg in orig_argv[1:]: + if not arg.startswith("-"): + break + launcher_args.append(arg) + return launcher_args + + def __process_args__(roundtrips=500, client_io="text", worker_io="text", workload="mask", payload_bytes=64, batch_size=8): return [ parse_int(roundtrips), @@ -317,7 +329,7 @@ def __setup__(roundtrips=500, client_io="text", worker_io="text", workload="mask __teardown__() state = State(roundtrips, client_io, worker_io, workload, payload_bytes, batch_size) command = [ - sys.executable, + *get_subprocess_launcher_args(), __file__, "--worker", "--worker-io=%s" % worker_io, diff --git a/investigations/io_perf/notes.org b/investigations/io_perf/notes.org new file mode 100644 index 0000000000..6a92d8ae8d --- /dev/null +++ b/investigations/io_perf/notes.org @@ -0,0 +1,525 @@ +* IO Perf Investigation Notes + +** Scope + +Investigation target: remaining performance gap in workload 07 / jsonrpc tokenizer style +strict request-response traffic, especially tiny write/flush + pipe readline overhead in +GraalPy's =_io= stack. 
+ +Current reference commit for runtime changes: +- =930c9f5b09= Add lower-level readInto fast path for buffered reads + +Related commits in this investigation: +- =39bde29f49= Reduce tiny flush copying in =_io= +- =3cc0650e9c= WIP: add jsonrpc pipe profiling scripts +- =9d27960795= Add jsonrpc pipe microbenchmark to mx harness + +** Benchmark Shape + +In-repo harness benchmark: +- =graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py= + +Representative command: +#+begin_src bash +graalpy graalpython/com.oracle.graal.python.benchmarks/python/harness.py \ + graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py \ + -i 12 5000 text text mask 64 +#+end_src + +Interpretation: +- =5000= roundtrips per benchmark iteration +- =text text= means parent and worker both use text I/O wrappers +- =mask= uses the tokenizer-like normalize/mask request shape +- =64= payload bytes + +The heavier =5000=-roundtrip mode is preferred for A/B work over the smaller =500= mode, +because the smaller mode is too noisy for subtle changes. + +** Profiling Tooling + +Worker-only profiling scripts: +- =scripts/profile_jsonrpc_pipe_worker.py= +- =scripts/profile-jsonrpc-pipe-async-text.sh= +- =scripts/profile-jsonrpc-pipe-async-buffer.sh= +- =scripts/profile-jsonrpc-pipe-gprofng-text.sh= +- =scripts/profile-jsonrpc-pipe-gprofng-buffer.sh= + +Typical async-profiler runs: +#+begin_src bash +REQS=20000 scripts/profile-jsonrpc-pipe-async-text.sh /tmp/jsonrpc-ee-worker-text-current.svg +REQS=20000 scripts/profile-jsonrpc-pipe-async-buffer.sh /tmp/jsonrpc-ee-worker-buffer-current.svg +#+end_src + +Current useful profiler artifacts: +- =/tmp/jsonrpc-ee-worker-text-current.svg= +- =/tmp/jsonrpc-ee-worker-buffer-current.svg= + +Async-profiler conclusions: +- Whole-process launcher/harness profiles are dominated by compilation noise. +- Worker-only profiles are the useful ones. 
+- The comparison between =text= and =buffer= modes clearly isolates the remaining text-layer cost. + +gprofng status: +- Installed locally and usable. +- Produced coarse output dominated by == under this WSL2 setup. +- Useful as a secondary cross-check, not the primary driver for this work. + +** Current Hotspots + +From worker-only async-profiler on current committed state (=930c9f5b09=): + +Text-mode still shows: +- =TextIOWrapperBuiltins$WriteNode.write= +- =TextIOWrapperNodes$ReadlineNode.readline= +- =TextIOWrapperNodes$ReadChunkNode.readChunk= +- =BufferedWriterNodes$FlushUnlockedNode.bufferedwriterFlushUnlocked= +- =BufferedWriterNodes$RawWriteNode.bufferedwriterRawWrite= +- =FileIOBuiltins$ReadintoNode.readinto= +- =PosixModuleBuiltins$ReadNode.read= +- =FileIOBuiltins$WriteNode.write= +- =PosixModuleBuiltins$WriteNode.write= + +Buffer-mode drops the text wrapper layer and shows mainly: +- =BufferedReaderMixinBuiltins$BufferedReadlineNode.readline= +- =BufferedWriterNodes$FlushUnlockedNode.bufferedwriterFlushUnlocked= +- =BufferedWriterNodes$RawWriteNode.bufferedwriterRawWrite= +- =FileIOBuiltins$ReadintoNode.readinto= +- =PosixModuleBuiltins$ReadNode.read= +- =FileIOBuiltins$WriteNode.write= +- =PosixModuleBuiltins$WriteNode.write= + +Interpretation: +- Lower buffered/file/posix path has improved. +- Remaining gap is increasingly concentrated in the text wrapper read path. 
+ +** Kept Changes + +*** =39bde29f49= Reduce tiny flush copying in =_io= + +What changed: +- Replaced pending text output buffering with a stealable byte buffer: + - =PTextIO= + - =PendingBytesOutputStream= +- Avoided =toByteArray()= copy on =TextIOWrapper= flush: + - =TextIOWrapperNodes.WriteFlushNode= +- Avoided slice copy in =BufferedWriterNodes.FlushUnlockedNode= when =writePos == 0= + +Measured effect on the small =mx benchmark micro:jsonrpc-pipe= configuration: +- =AVG (no warmup)= improved from =0.188 s= to =0.089 s= + +*** =930c9f5b09= Add lower-level readInto fast path for buffered reads + +What changed: +- Added =PosixSupportLibrary.readInto(Object receiver, int fd, Buffer data)= +- Implemented it in: + - =NFIPosixSupport= + - =EmulatedPosixSupport= + - =LoggingPosixSupport= + - =PreInitPosixSupport= +- Used it from =BufferedReaderMixinBuiltins.FillBufferNode= only when: + - refill starts at offset =0= + - raw object is cached =PFileIO= + +This stays below the Python protocol layer and avoids: +- temporary =PByteArray= +- Python-level =raw.readinto()= call +- extra copy back into the buffered reader's internal byte array + +Repeated heavy-run comparison against prior committed state: + +Baseline (=39bde29f49=), 5 runs: +- median =AVG (no warmup)=: =0.551 s= +- mean =AVG (no warmup)=: =0.537 s= +- min/max: =0.344 s= / =0.707 s= + +Candidate (=930c9f5b09=), 5 runs: +- median =AVG (no warmup)=: =0.476 s= +- mean =AVG (no warmup)=: =0.503 s= +- min/max: =0.346 s= / =0.682 s= + +Interpretation: +- This lower-level read-side change was worth keeping. 
+ +** Rejected Experiments + +*** Rejected: Python-level direct fill-buffer shortcut + +Location: +- =BufferedReaderMixinBuiltins.FillBufferNode= + +Idea: +- When =start == 0=, call =raw.readinto()= directly on =self.getBuffer()= + +Why rejected: +- went back through Python-level dispatch +- added significant hot-node complexity +- repeated runs gave mixed signal and no clear win + +*** Rejected: =ReadChunkNode= telling / non-telling split + +Location: +- =TextIOWrapperNodes.ReadChunkNode= + +Idea: +- Split =self.isTelling()= and non-=telling= cases into separate specializations + +Repeated heavy-run results: +- runs: =0.446 s=, =0.804 s=, =0.515 s=, =0.617 s=, =0.621 s= +- median =0.617 s= +- mean =0.601 s= + +Compared to current baseline (=930c9f5b09=): +- median =0.476 s= +- mean =0.503 s= + +Why rejected: +- regression on repeated runs +- likely extra node shape / cache sharing cost outweighed saved work + +*** Rejected: =ReadChunkNode= =PBytes= fast path + +Location: +- =TextIOWrapperNodes.ReadChunkNode= + +Idea: +- If =inputChunk instanceof PBytes=, skip the generic buffer acquire/release path + +Repeated heavy-run results: +- runs: =0.515 s=, =0.647 s=, =0.510 s=, =0.707 s=, =0.613 s= +- median =0.613 s= +- mean =0.598 s= + +Compared to current baseline (=930c9f5b09=): +- median =0.476 s= +- mean =0.503 s= + +Why rejected: +- regression on repeated runs +- too small/local a fast path to beat the resulting code shape in practice + +*** Blocked: direct TextIOWrapper readline -> buffered byte readline delegation + +Location: +- =TextIOWrapperNodes.ReadlineNode= + +Idea: +- In the common unlimited, non-=tell()= case with no decoded-char backlog, delegate directly to + the underlying buffered byte =readline= node and decode just that one line. 
+ +Reasoning: +- This would bypass: + - =TextIOWrapperNodes.ReadChunkNode= + - =FindLineEndingNode= + - some =TruffleString= churn in the common NDJSON case + +Why not pursued further yet: +- Cross-file Truffle DSL node construction blocked the straightforward implementation. +- Attempting to cache =BufferedReaderMixinBuiltins.BufferedReadlineNode= from + =TextIOWrapperNodes= did not compile: + - implicit =create()= is not available there + - explicit =...BufferedReadlineNodeGen.create()= was not accepted by the DSL expression parser +- Retried with explicit generated-node cache expressions and still hit DSL/parser visibility issues. + +This direction may still be worthwhile, but likely requires one of: +- moving a reusable helper into a place both nodes can access cleanly +- a small refactor in =BufferedReaderMixinBuiltins= +- or a different Java-level delegation approach that does not require cross-file cached-node construction + +*** Rejected: shared bufferedReadline helper + TextIOWrapper fast path + +Location: +- =BufferedReaderMixinBuiltins= +- =TextIOWrapperNodes.ReadlineNode= + +Idea: +- Extract the buffered byte-line acquisition logic into a reusable pure Java helper in + =BufferedReaderMixinBuiltins= +- Cache the necessary nodes at the call site in =TextIOWrapperNodes= +- In the common case (=limit < 0=, no decoded backlog, no =tell()= tracking), acquire a byte line + from the buffered layer and decode just that line + +Why this was attractive: +- avoids cross-file generated-node construction +- keeps lower-layer logic shared instead of duplicated +- bypasses =ReadChunkNode= and =FindLineEndingNode= in the common NDJSON case + +Repeated heavy-run results: +- runs: =0.520 s=, =0.391 s=, =0.526 s=, =0.665 s=, =0.550 s= +- median =AVG (no warmup)=: =0.526 s= +- mean =AVG (no warmup)=: =0.530 s= +- median =BEST=: =0.278 s= + +Compared to current baseline (=930c9f5b09=): +- baseline median =AVG (no warmup)=: =0.476 s= +- baseline mean =AVG (no warmup)=: =0.503 
s= +- baseline median =BEST=: =0.288 s= + +Conclusion: +- not good enough to keep +- slightly better tail minima / bests, but worse median and mean +- reverted + +** Measurement Guidance + +Preferred comparison protocol for future changes: +- Use the current committed state as A +- Use a single candidate patch as B +- Rebuild once for B +- Run 5 repeated harness invocations with: + - =-i 12 5000 text text mask 64= +- Compare at least: + - median =AVG (no warmup)= + - mean =AVG (no warmup)= + - min/max =AVG (no warmup)= + - median =BEST= + +Avoid using single-run =AVG (no warmup)= from the small =500=-roundtrip benchmark for go/no-go +decisions on subtle changes. + +** Current CPython Comparison + +Repeated heavy-run comparison using: +#+begin_src bash +graalpython/com.oracle.graal.python.benchmarks/python/harness.py \ + graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py \ + -i 12 5000 mask 64 +#+end_src + +Five-run summaries: + +| runtime/mode | median AVG (no warmup) | mean AVG (no warmup) | median BEST | +|--------------------+------------------------+-----------------------+-------------| +| CPython text/text | 0.529 s | 0.554 s | 0.508 s | +| CPython buffer/buffer | 0.513 s | 0.517 s | 0.488 s | +| GraalPy text/text | 0.446 s | 0.475 s | 0.277 s | +| GraalPy buffer/buffer | 0.615 s | 0.598 s | 0.298 s | + +Implications: +- On the current committed state, GraalPy =text/text= is *faster* than CPython on this harness benchmark. +- GraalPy =buffer/buffer= is still *slower* than CPython and also slower than GraalPy =text/text=. +- Therefore, the remaining end-to-end issue on this benchmark is not simply "TextIOWrapper is slower". +- The lower buffered/file/posix path still matters, and some previous assumptions should be re-checked + against the heavier benchmark protocol. 
+ +** Interpreter (Compilation=false): native-ee standalone vs CPython + +*** Protocol + +Benchmark entrypoint and workload stayed the same as the prior investigation: +- =graalpython/com.oracle.graal.python.benchmarks/python/harness.py= +- =graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py= +- workload parameters: =5000 text text mask 64= + +Pinned harness commands used for this section: +#+begin_src bash +env LC_ALL=C.UTF-8 PYTHONHASHSEED=0 \ + GRAAL_PYTHON_VM_ARGS='--experimental-options --engine.Compilation=false' \ + taskset -c 2 ./mxbuild/linux-amd64/GRAALPY_NATIVE_STANDALONE/bin/graalpy \ + graalpython/com.oracle.graal.python.benchmarks/python/harness.py \ + graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py \ + -r 1 -i 12 5000 text text mask 64 + +env LC_ALL=C.UTF-8 PYTHONHASHSEED=0 \ + taskset -c 2 python3 \ + graalpython/com.oracle.graal.python.benchmarks/python/harness.py \ + graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py \ + -r 1 -i 12 5000 text text mask 64 +#+end_src + +Protocol details: +- =taskset -c 2= pins both parent and worker to one CPU +- =LC_ALL=C.UTF-8= and =PYTHONHASHSEED=0= kept constant across runs +- =-r 1= gives one unmeasured in-process pre-run before the 12 measured iterations +- repeated each harness invocation 5 times per runtime +- primary summary metric: cross-run median/mean of harness =AVG (all runs)= +- steady-state cross-check: mean of the last 6 raw durations from each 12-iteration harness run + +Why use =GRAAL_PYTHON_VM_ARGS= here: +- it ensures the worker subprocess launched by =jsonrpc-pipe.py= also sees + =--experimental-options --engine.Compilation=false= + +Hardware/software context for this section: +- host: WSL2 Linux =6.6.87.2-microsoft-standard-WSL2= +- CPU: =13th Gen Intel(R) Core(TM) i9-13900H=, 20 online CPUs +- native standalone: =GraalPy 3.12.8 (Oracle GraalVM Native 25.1.0)= +- CPython: =Python 3.12.11= + +*** Harness Results + 
+Five-run summaries for the pinned =-r 1 -i 12 5000 text text mask 64= protocol: + +| runtime | median AVG (all runs) | mean AVG (all runs) | median tail-6 avg | mean tail-6 avg | median BEST | +|----------------------+------------------------+---------------------+-------------------+-----------------+-------------| +| native-ee standalone | 1.203 s | 1.234 s | 1.162 s | 1.190 s | 1.124 s | +| CPython | 0.244 s | 0.244 s | 0.242 s | 0.244 s | 0.236 s | + +Observed gap: +- native-ee standalone is about =4.93x= slower than CPython by median harness =AVG (all runs)= +- even on the trailing-6 steady-state cross-check, native-ee standalone is still about =4.80x= slower + +Raw per-run summaries: +- native-ee standalone: + - run 1: =AVG(all)= =1.217 s=, =BEST= =1.104 s=, tail-6 avg =1.134 s= + - run 2: =AVG(all)= =1.160 s=, =BEST= =1.105 s=, tail-6 avg =1.127 s= + - run 3: =AVG(all)= =1.195 s=, =BEST= =1.124 s=, tail-6 avg =1.164 s= + - run 4: =AVG(all)= =1.203 s=, =BEST= =1.129 s=, tail-6 avg =1.162 s= + - run 5: =AVG(all)= =1.394 s=, =BEST= =1.304 s=, tail-6 avg =1.361 s= +- CPython: + - run 1: =AVG(all)= =0.250 s=, =BEST= =0.236 s=, tail-6 avg =0.253 s= + - run 2: =AVG(all)= =0.240 s=, =BEST= =0.236 s=, tail-6 avg =0.239 s= + - run 3: =AVG(all)= =0.243 s=, =BEST= =0.237 s=, tail-6 avg =0.242 s= + - run 4: =AVG(all)= =0.244 s=, =BEST= =0.235 s=, tail-6 avg =0.242 s= + - run 5: =AVG(all)= =0.244 s=, =BEST= =0.239 s=, tail-6 avg =0.242 s= + +Interpretation: +- the current interpreter-mode gap is large and repeatable +- the outlier native run 5 moves the mean a bit, but not the overall conclusion +- the steady-state tail still shows a large gap, so this is not just cold startup + +*** Worker Breakdown + +Worker-only =gprofng= profile for native-ee standalone with =Compilation=false=: +- collected by driving =15000= text-mode requests into a worker-only + =gprofng collect app -O /tmp/jsonrpc-native-text-compfalse.er -F off -- ...= + launch of: + - 
=./mxbuild/linux-amd64/GRAALPY_NATIVE_STANDALONE/bin/graalpy= + - =graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py --worker --worker-io=text= +- with =GRAAL_PYTHON_VM_ARGS='--experimental-options --engine.Compilation=false'= + +Top native-ee standalone interpreter-mode functions/suspects: +- =PBytecodeDSLRootNodeGen$CachedBytecodeNode.continueAt=: =65.47%= inclusive CPU +- =SubstrateEnterpriseOptimizedCallTarget.invokeFromInterpreter=: =18.83%= inclusive CPU +- =OptimizedCallTarget.callBoundary=: =16.14%= inclusive CPU +- =CallDispatchersFactory$FunctionCachedCallNodeGen$Inlined.execute=: =11.21%= inclusive CPU +- allocation/GC shows up materially: + - =G1Library.allocateArray=: =9.87%= inclusive CPU + - =MemAllocator::allocate=: =9.87%= inclusive CPU + - =slowPathNewInstance=: =4.93%= inclusive CPU +- text-layer work is present but not dominant on its own: + - =TextIOWrapperNodesFactory$ReadChunkNodeGen$Inlined.execute=: =5.38%= inclusive CPU +- JSON/string work is also visible: + - =JSONUtils.appendString= + - =JSONEncoderBuiltins...AppendSimpleObject= + - =JSONScannerBuiltins...scanOnceUnicode= + - =PatternBuiltins...SubnInnerNode= +- raw syscall wrappers are not the main bucket in this profile: + - =write=: =9.42%= exclusive CPU + - =read=: =0.90%= exclusive CPU + +Worker-only CPython =cProfile= on the same text-mode worker shape (15000 requests): +- =TextIOWrapper.readline=: =0.780 s= cumulative inside a =1.658 s= worker profile +- =TextIOWrapper.flush=: =0.227 s= +- =write_message=: =0.419 s= cumulative +- =read_message=: =0.946 s= cumulative +- =json.dumps=: =0.159 s= +- =json.loads=: =0.143 s= +- =mask_row=: =0.206 s= + +Interpretation: +- native-ee standalone spends a large fraction above the syscall boundary in bytecode interpreter + dispatch, call boundaries, dynamic dispatch, and allocation/GC +- CPython still pays most of its visible worker cost in text I/O and JSON, but those hot paths stay + largely in optimized C 
implementations rather than showing a large interpreter-dispatch bucket +- therefore the native-ee interpreter-mode gap is not just "readline/flush are slower"; a broader + dispatch/allocation cost is visible in the worker profile + +*** Syscall Cross-Check + +One =strace -f -c= run per runtime, same pinned direct benchmark shape: +- native-ee standalone direct =text/text=, =5000= roundtrips: + - clean wall time without =strace=: =1.979 s= + - traced syscall mix: + - =futex=: =95.09%= traced syscall time, 852 calls + - =read=: 10200 calls + - =write=: 10003 calls +- CPython direct =text/text=, =5000= roundtrips: + - clean wall time without =strace=: =0.573 s= + - traced syscall mix: + - =wait4=: =78.22%= traced syscall time, 85 calls + - =write=: 10043 calls + - =read=: 10502 calls + - =futex=: only 44 calls, =0.05%= traced syscall time + +Important caveat: +- =strace= perturbs wall times substantially under WSL2, so use it only for syscall mix, not for + timing conclusions + +Interpretation: +- native-ee standalone and CPython issue roughly the same order of magnitude of =read= and =write= + syscalls for this workload +- the main gap therefore is not "native-ee does far more pipe syscalls" +- native-ee shows much heavier =futex= activity, which suggests extra runtime coordination/synchronization + on top of the same basic I/O pattern + +*** Isolation Experiment: Drop Text Wrappers Only + +Direct-mode check using the same benchmark, pinned to CPU 2, 3 runs each: + +| runtime/mode | median wall time | +|----------------------------+------------------| +| native-ee standalone text/text | 1.701 s | +| native-ee standalone buffer/buffer | 1.377 s | +| CPython text/text | 0.280 s | +| CPython buffer/buffer | 0.276 s | + +Interpretation: +- removing =TextIOWrapper= helps native-ee standalone by about =19%= in this interpreter-mode direct check +- CPython changes very little between =text/text= and =buffer/buffer= on this workload +- but native-ee standalone 
=buffer/buffer= is still about =4.99x= slower than CPython =buffer/buffer= +- therefore text I/O is a meaningful contributor, but it does not explain the full interpreter-mode gap + +*** Current Hypotheses + +Most likely contributors to the remaining interpreter-mode gap: +- bytecode interpreter dispatch / call-boundary overhead in the native standalone +- object allocation and GC churn in request decode/normalize/encode paths +- text read path cost still matters, especially =ReadChunkNode=, but it is only part of the total gap +- extra runtime synchronization (visible in =futex= activity) may be contributing to end-to-end time + +Most likely productive next targets: +- reduce allocation and dispatch churn in the short-request text/JSON path +- re-check =TextIOWrapperNodes.ReadlineNode= and =ReadChunkNode=, but do not assume that fixing them + alone will close the gap +- inspect why the interpreter-mode native standalone still stays ~5x behind CPython even in + =buffer/buffer= mode + +*** Experiment Log + +- Kept for protocol: + - switched from command-line-only flags to =GRAAL_PYTHON_VM_ARGS= for native-ee standalone + - reason: ensures the worker subprocess also runs with =Compilation=false= +- Tried as isolation only: + - direct =buffer/buffer= runs under =Compilation=false= + - result: useful diagnostic, but not a runtime change +- No runtime code micro-optimization was committed in this pass: + - the new measurements point to multiple cost centers + - better to keep the notes reproducible first, then patch one hotspot at a time + +** WSL2 Notes + +Environment: +- WSL2 kernel detected +- =perf= is present but kernel-matched tooling is not configured cleanly +- =gprofng= is installed + +Practical consequence: +- use worker-only async-profiler as primary guide +- use gprofng only as a coarse cross-check + +** Next Likely Target + +Most likely remaining productive area: +- =TextIOWrapperNodes.ReadlineNode= + +Rationale: +- lower buffered/file/posix stack has 
already been improved +- text-mode still pays in: + - =TextIOWrapperNodes$ReadlineNode.readline= + - =TextIOWrapperNodes$ReadChunkNode.readChunk= + +But: +- recent attempts show that naive local fast paths in =ReadChunkNode= are easy to get wrong +- likely next useful change must avoid growing node shape too much +- focus should be on reducing actual =TruffleString= / substring / concat churn in the + common short-line case, not just adding more conditionals +- the promising "delegate to buffered byte readline" idea is currently blocked by DSL wiring issues diff --git a/scripts/profile-jsonrpc-pipe-async-buffer.sh b/scripts/profile-jsonrpc-pipe-async-buffer.sh new file mode 100755 index 0000000000..5ca134c571 --- /dev/null +++ b/scripts/profile-jsonrpc-pipe-async-buffer.sh @@ -0,0 +1,51 @@ +# Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. +# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +# +# The Universal Permissive License (UPL), Version 1.0 +# +# Subject to the condition set forth below, permission is hereby granted to any +# person obtaining a copy of this software, associated documentation and/or +# data (collectively the "Software"), free of charge and under any and all +# copyright rights in the Software, and any and all patent rights owned or +# freely licensable by each licensor hereunder covering either (i) the +# unmodified Software as contributed to or provided by such licensor, or (ii) +# the Larger Works (as defined below), to deal in both +# +# (a) the Software, and +# +# (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if +# one is included with the Software each a "Larger Work" to which the Software +# is contributed by such licensors), +# +# without restriction, including without limitation the rights to copy, create +# derivative works of, display, perform, and distribute the Software and make, +# use, sell, offer for sale, import, export, have made, and have sold the +# Software 
and the Larger Work(s), and to sublicense the foregoing rights on +# either these or other terms. +# +# This license is subject to the following condition: +# +# The above copyright notice and either this complete permission notice or at a +# minimum a reference to the UPL must be included in all copies or substantial +# portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +#!/usr/bin/env bash +set -euo pipefail +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +OUT="${1:-/tmp/jsonrpc-ee-worker-buffer.svg}" +REQS="${REQS:-30000}" +GRAALPY="${GRAALPY:-$ROOT/mxbuild/linux-amd64/GRAALPY_JVM_STANDALONE/bin/graalpy}" +exec python3 "$ROOT/scripts/profile_jsonrpc_pipe_worker.py" \ + --graalpy "$GRAALPY" \ + --worker-io buffer \ + --profiler async \ + --requests "$REQS" \ + --output "$OUT" diff --git a/scripts/profile-jsonrpc-pipe-async-text.sh b/scripts/profile-jsonrpc-pipe-async-text.sh new file mode 100755 index 0000000000..2b75b1741d --- /dev/null +++ b/scripts/profile-jsonrpc-pipe-async-text.sh @@ -0,0 +1,51 @@ +# Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. +# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+# +# The Universal Permissive License (UPL), Version 1.0 +# +# Subject to the condition set forth below, permission is hereby granted to any +# person obtaining a copy of this software, associated documentation and/or +# data (collectively the "Software"), free of charge and under any and all +# copyright rights in the Software, and any and all patent rights owned or +# freely licensable by each licensor hereunder covering either (i) the +# unmodified Software as contributed to or provided by such licensor, or (ii) +# the Larger Works (as defined below), to deal in both +# +# (a) the Software, and +# +# (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if +# one is included with the Software each a "Larger Work" to which the Software +# is contributed by such licensors), +# +# without restriction, including without limitation the rights to copy, create +# derivative works of, display, perform, and distribute the Software and make, +# use, sell, offer for sale, import, export, have made, and have sold the +# Software and the Larger Work(s), and to sublicense the foregoing rights on +# either these or other terms. +# +# This license is subject to the following condition: +# +# The above copyright notice and either this complete permission notice or at a +# minimum a reference to the UPL must be included in all copies or substantial +# portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +#!/usr/bin/env bash +set -euo pipefail +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +OUT="${1:-/tmp/jsonrpc-ee-worker-text.svg}" +REQS="${REQS:-30000}" +GRAALPY="${GRAALPY:-$ROOT/mxbuild/linux-amd64/GRAALPY_JVM_STANDALONE/bin/graalpy}" +exec python3 "$ROOT/scripts/profile_jsonrpc_pipe_worker.py" \ + --graalpy "$GRAALPY" \ + --worker-io text \ + --profiler async \ + --requests "$REQS" \ + --output "$OUT" diff --git a/scripts/profile-jsonrpc-pipe-gprofng-buffer.sh b/scripts/profile-jsonrpc-pipe-gprofng-buffer.sh new file mode 100755 index 0000000000..3aed4f4026 --- /dev/null +++ b/scripts/profile-jsonrpc-pipe-gprofng-buffer.sh @@ -0,0 +1,51 @@ +# Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. +# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +# +# The Universal Permissive License (UPL), Version 1.0 +# +# Subject to the condition set forth below, permission is hereby granted to any +# person obtaining a copy of this software, associated documentation and/or +# data (collectively the "Software"), free of charge and under any and all +# copyright rights in the Software, and any and all patent rights owned or +# freely licensable by each licensor hereunder covering either (i) the +# unmodified Software as contributed to or provided by such licensor, or (ii) +# the Larger Works (as defined below), to deal in both +# +# (a) the Software, and +# +# (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if +# one is included with the Software each a "Larger Work" to which the Software +# is contributed by such licensors), +# +# without restriction, including without limitation the rights to copy, create +# derivative works of, display, perform, and distribute the Software and make, +# use, sell, offer for sale, import, export, have made, and have sold the +# Software and the Larger Work(s), and to sublicense the foregoing rights on +# either these or other terms. 
+# +# This license is subject to the following condition: +# +# The above copyright notice and either this complete permission notice or at a +# minimum a reference to the UPL must be included in all copies or substantial +# portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +#!/usr/bin/env bash +set -euo pipefail +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +OUT="${1:-/tmp/jsonrpc-ee-worker-buffer.er}" +REQS="${REQS:-30000}" +GRAALPY="${GRAALPY:-$ROOT/mxbuild/linux-amd64/GRAALPY_JVM_STANDALONE/bin/graalpy}" +exec python3 "$ROOT/scripts/profile_jsonrpc_pipe_worker.py" \ + --graalpy "$GRAALPY" \ + --worker-io buffer \ + --profiler gprofng \ + --requests "$REQS" \ + --output "$OUT" diff --git a/scripts/profile-jsonrpc-pipe-gprofng-text.sh b/scripts/profile-jsonrpc-pipe-gprofng-text.sh new file mode 100755 index 0000000000..63be3fb887 --- /dev/null +++ b/scripts/profile-jsonrpc-pipe-gprofng-text.sh @@ -0,0 +1,51 @@ +# Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. +# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+# +# The Universal Permissive License (UPL), Version 1.0 +# +# Subject to the condition set forth below, permission is hereby granted to any +# person obtaining a copy of this software, associated documentation and/or +# data (collectively the "Software"), free of charge and under any and all +# copyright rights in the Software, and any and all patent rights owned or +# freely licensable by each licensor hereunder covering either (i) the +# unmodified Software as contributed to or provided by such licensor, or (ii) +# the Larger Works (as defined below), to deal in both +# +# (a) the Software, and +# +# (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if +# one is included with the Software each a "Larger Work" to which the Software +# is contributed by such licensors), +# +# without restriction, including without limitation the rights to copy, create +# derivative works of, display, perform, and distribute the Software and make, +# use, sell, offer for sale, import, export, have made, and have sold the +# Software and the Larger Work(s), and to sublicense the foregoing rights on +# either these or other terms. +# +# This license is subject to the following condition: +# +# The above copyright notice and either this complete permission notice or at a +# minimum a reference to the UPL must be included in all copies or substantial +# portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +#!/usr/bin/env bash +set -euo pipefail +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +OUT="${1:-/tmp/jsonrpc-ee-worker-text.er}" +REQS="${REQS:-30000}" +GRAALPY="${GRAALPY:-$ROOT/mxbuild/linux-amd64/GRAALPY_JVM_STANDALONE/bin/graalpy}" +exec python3 "$ROOT/scripts/profile_jsonrpc_pipe_worker.py" \ + --graalpy "$GRAALPY" \ + --worker-io text \ + --profiler gprofng \ + --requests "$REQS" \ + --output "$OUT" diff --git a/scripts/profile_jsonrpc_pipe_worker.py b/scripts/profile_jsonrpc_pipe_worker.py new file mode 100755 index 0000000000..aed3bb4b90 --- /dev/null +++ b/scripts/profile_jsonrpc_pipe_worker.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. +# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +# +# The Universal Permissive License (UPL), Version 1.0 +# +# Subject to the condition set forth below, permission is hereby granted to any +# person obtaining a copy of this software, associated documentation and/or +# data (collectively the "Software"), free of charge and under any and all +# copyright rights in the Software, and any and all patent rights owned or +# freely licensable by each licensor hereunder covering either (i) the +# unmodified Software as contributed to or provided by such licensor, or (ii) +# the Larger Works (as defined below), to deal in both +# +# (a) the Software, and +# +# (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if +# one is included with the Software each a "Larger Work" to which the Software +# is contributed by such licensors), +# +# without restriction, including without limitation the rights to copy, create +# derivative works of, display, perform, and distribute the Software and make, +# use, sell, offer for sale, import, export, have made, and have sold the +# Software and the Larger Work(s), and to sublicense the foregoing rights on +# either these or other terms. 
+# +# This license is subject to the following condition: +# +# The above copyright notice and either this complete permission notice or at a +# minimum a reference to the UPL must be included in all copies or substantial +# portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import annotations + +import argparse +import json +from pathlib import Path +import subprocess +import sys +from typing import BinaryIO, TextIO + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Drive the jsonrpc-pipe worker under a profiler.") + parser.add_argument("--graalpy", required=True, help="Path to GraalPy launcher") + parser.add_argument("--worker-io", choices=("text", "buffer"), required=True) + parser.add_argument("--profiler", choices=("async", "gprofng"), required=True) + parser.add_argument("--requests", type=int, default=30000) + parser.add_argument("--output", required=True, help="Profile output file (async) or experiment dir (gprofng)") + parser.add_argument("--async-profiler-dir", default="/tmp/async-profiler-1.8.3-linux-x64") + parser.add_argument("--benchmark", default="graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py") + return parser.parse_args() + + +def build_worker_cmd(args: argparse.Namespace) -> list[str]: + benchmark = str(Path(args.benchmark).resolve()) + worker = [args.graalpy, benchmark, "--worker", f"--worker-io={args.worker_io}"] + if args.profiler == "async": + lib = Path(args.async_profiler_dir) / "build" / 
"libasyncProfiler.so" + return [ + args.graalpy, + f"--vm.agentpath:{lib}=start,event=cpu,file={args.output}", + "--vm.XX:+UnlockDiagnosticVMOptions", + "--vm.XX:+DebugNonSafepoints", + benchmark, + "--worker", + f"--worker-io={args.worker_io}", + ] + return [ + "gprofng", + "collect", + "app", + "-O", + args.output, + "-F", + "off", + "--", + *worker, + ] + + +def make_request(index: int) -> dict[str, object]: + return { + "jsonrpc": "2.0", + "id": index, + "method": "mask", + "params": { + "email": f" User{index}.payload-payload@Example.COM ", + "phone": f"+49 (170) {index:04d}-payload-", + "region": "eu", + "source": "microbench", + }, + } + + +def read_json_line_text(stream: TextIO, stderr: TextIO) -> dict[str, object]: + while True: + line = stream.readline() + if not line: + raise RuntimeError(f"worker terminated early: {stderr.read()}") + if line.lstrip().startswith("{"): + return json.loads(line) + + +def read_json_line_binary(stream: BinaryIO, stderr: BinaryIO) -> dict[str, object]: + while True: + line = stream.readline() + if not line: + raise RuntimeError(f"worker terminated early: {stderr.read().decode('utf-8', errors='replace')}") + if line.lstrip().startswith(b"{"): + return json.loads(line) + + +def drive_text(process: subprocess.Popen[str], requests: int) -> None: + assert process.stdin is not None + assert process.stdout is not None + assert process.stderr is not None + for i in range(requests): + process.stdin.write(json.dumps(make_request(i), separators=(",", ":")) + "\n") + process.stdin.flush() + read_json_line_text(process.stdout, process.stderr) + + +def drive_binary(process: subprocess.Popen[bytes], requests: int) -> None: + assert process.stdin is not None + assert process.stdout is not None + assert process.stderr is not None + for i in range(requests): + payload = (json.dumps(make_request(i), separators=(",", ":")) + "\n").encode("utf-8") + process.stdin.write(payload) + process.stdin.flush() + read_json_line_binary(process.stdout, 
process.stderr) + + +def main() -> int: + args = parse_args() + output = Path(args.output) + output.parent.mkdir(parents=True, exist_ok=True) + if args.profiler == "gprofng" and output.exists(): + if output.is_dir(): + subprocess.check_call(["rm", "-rf", str(output)]) + else: + output.unlink() + cmd = build_worker_cmd(args) + process_cwd = None + if args.profiler == "gprofng": + process_cwd = str(output.parent) + cmd[4] = output.name + text_mode = args.worker_io == "text" + if text_mode: + process = subprocess.Popen( + cmd, + cwd=process_cwd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + encoding="utf-8", + bufsize=1, + ) + try: + drive_text(process, args.requests) + process.stdin.close() + rc = process.wait(timeout=120) + if rc != 0: + raise RuntimeError(process.stderr.read()) + sys.stdout.write(process.stderr.read()) + finally: + if process.poll() is None: + process.kill() + else: + process = subprocess.Popen( + cmd, + cwd=process_cwd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + bufsize=0, + ) + try: + drive_binary(process, args.requests) + process.stdin.close() + rc = process.wait(timeout=120) + if rc != 0: + raise RuntimeError(process.stderr.read().decode("utf-8", errors="replace")) + sys.stdout.write(process.stderr.read().decode("utf-8", errors="replace")) + finally: + if process.poll() is None: + process.kill() + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From f0b4ac9a3839e3ad367adc1c292d95b161b094ba Mon Sep 17 00:00:00 2001 From: Tim Felgentreff Date: Wed, 15 Apr 2026 16:40:12 +0200 Subject: [PATCH 6/7] Revert buffered _io readInto fast path The direct buffered readInto refill path caused correctness regressions in\nseek/readline-sensitive code paths. 
In particular, it broke traceback source extraction and tokenize/linecache-based reads, which reproduced as failing test_traceback assertions and zipimport source-location failures. Remove the optimization and delete the now-dead PosixSupport readInto plumbing until there is a version that preserves buffered IO invariants. --- .../io/BufferedReaderMixinBuiltins.java | 78 +-- .../python/runtime/EmulatedPosixSupport.java | 25 - .../python/runtime/LoggingPosixSupport.java | 11 - .../graal/python/runtime/NFIPosixSupport.java | 11 - .../python/runtime/PosixSupportLibrary.java | 2 - .../python/runtime/PreInitPosixSupport.java | 9 - investigations/io_perf/notes.org | 525 ------------------ 7 files changed, 6 insertions(+), 655 deletions(-) delete mode 100644 investigations/io_perf/notes.org diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedReaderMixinBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedReaderMixinBuiltins.java index 101ca493d0..f32954f9e4 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedReaderMixinBuiltins.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedReaderMixinBuiltins.java @@ -82,7 +82,6 @@ import com.oracle.graal.python.builtins.modules.io.BufferedIONodesFactory.CheckIsClosedNodeGen; import com.oracle.graal.python.builtins.objects.PNone; import com.oracle.graal.python.builtins.objects.buffer.PythonBufferAccessLibrary; -import com.oracle.graal.python.builtins.objects.exception.OSErrorEnum; import com.oracle.graal.python.builtins.objects.bytes.BytesNodes; import com.oracle.graal.python.builtins.objects.bytes.PByteArray; import com.oracle.graal.python.builtins.objects.bytes.PBytes; @@ -98,12 +97,7 @@ import com.oracle.graal.python.nodes.function.PythonBuiltinBaseNode; import 
com.oracle.graal.python.nodes.function.builtins.clinic.ArgumentClinicProvider; import com.oracle.graal.python.nodes.object.GetClassNode; -import com.oracle.graal.python.nodes.PConstructAndRaiseNode; -import com.oracle.graal.python.runtime.GilNode; import com.oracle.graal.python.runtime.IndirectCallData.InteropCallData; -import com.oracle.graal.python.runtime.PosixSupport; -import com.oracle.graal.python.runtime.PosixSupportLibrary; -import com.oracle.graal.python.runtime.PythonContext; import com.oracle.graal.python.runtime.exception.PException; import com.oracle.graal.python.runtime.object.PFactory; import com.oracle.graal.python.util.PythonUtils; @@ -120,7 +114,6 @@ import com.oracle.truffle.api.frame.VirtualFrame; import com.oracle.truffle.api.library.CachedLibrary; import com.oracle.truffle.api.nodes.Node; -import com.oracle.truffle.api.profiles.InlinedBranchProfile; import com.oracle.truffle.api.profiles.InlinedConditionProfile; import com.oracle.truffle.api.strings.TruffleString; @@ -197,8 +190,7 @@ abstract static class FillBufferNode extends PNodeWithContext { @Specialization static int bufferedreaderFillBuffer(VirtualFrame frame, Node inliningTarget, PBuffered self, - @Cached RawReadNode rawReadNode, - @Cached RawReadIntoBufferNode rawReadIntoBufferNode) { + @Cached RawReadNode rawReadNode) { int start; if (isValidReadBuffer(self)) { start = self.getReadEnd(); @@ -206,79 +198,21 @@ static int bufferedreaderFillBuffer(VirtualFrame frame, Node inliningTarget, PBu start = 0; } int len = self.getBufferSize() - start; - int n; - if (start == 0 && self.isFastClosedChecks()) { - n = rawReadIntoBufferNode.execute(frame, inliningTarget, self.getFileIORaw(), self.getBuffer(), len); - if (n == -2) { - return -2; - } - } else { - byte[] fill = rawReadNode.execute(frame, inliningTarget, self, len); - if (fill == BLOCKED) { - return -2; - } - n = fill.length; - if (n > 0) { - PythonUtils.arraycopy(fill, 0, self.getBuffer(), start, n); - } + byte[] fill = 
rawReadNode.execute(frame, inliningTarget, self, len); + if (fill == BLOCKED) { + return -2; } + int n = fill.length; if (n == 0) { return n; } + PythonUtils.arraycopy(fill, 0, self.getBuffer(), start, n); self.setReadEnd(start + n); self.setRawPos(start + n); return n; } } - @GenerateInline - @GenerateCached(false) - abstract static class RawReadIntoBufferNode extends PNodeWithContext { - - public abstract int execute(VirtualFrame frame, Node inliningTarget, PFileIO raw, byte[] buffer, int len); - - @Specialization - static int readIntoBuffer(VirtualFrame frame, Node inliningTarget, PFileIO raw, byte[] buffer, int len, - @Bind PythonContext context, - @CachedLibrary("context.getPosixSupport()") PosixSupportLibrary posixLib, - @Cached InlinedBranchProfile readErrorProfile, - @Cached InlinedBranchProfile readErrorProfile2, - @Cached GilNode gil, - @Cached PConstructAndRaiseNode.Lazy constructAndRaiseNode) { - try { - return readInto(raw.getFD(), buffer, len, inliningTarget, posixLib, context.getPosixSupport(), readErrorProfile, gil); - } catch (PosixSupportLibrary.PosixException e) { - if (e.getErrorCode() == OSErrorEnum.EAGAIN.getNumber()) { - readErrorProfile2.enter(inliningTarget); - return -2; - } - throw constructAndRaiseNode.get(inliningTarget).raiseOSErrorFromPosixException(frame, e); - } - } - - private static int readInto(int fd, byte[] buffer, int len, - Node inliningTarget, PosixSupportLibrary posixLib, PosixSupport posixSupport, - InlinedBranchProfile errorProfile, GilNode gil) throws PosixSupportLibrary.PosixException { - gil.release(true); - try { - while (true) { - try { - return (int) posixLib.readInto(posixSupport, fd, new PosixSupportLibrary.Buffer(buffer, len)); - } catch (PosixSupportLibrary.PosixException e) { - errorProfile.enter(inliningTarget); - if (e.getErrorCode() == OSErrorEnum.EINTR.getNumber()) { - PythonContext.triggerAsyncActions(inliningTarget); - } else { - throw e; - } - } - } - } finally { - gil.acquire(); - } - } - } - 
@Builtin(name = J_READABLE, minNumOfPositionalArgs = 1) @GenerateNodeFactory abstract static class ReadableNode extends PythonUnaryWithInitErrorBuiltinNode { diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/EmulatedPosixSupport.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/EmulatedPosixSupport.java index af6c851da1..88ec3a30f9 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/EmulatedPosixSupport.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/EmulatedPosixSupport.java @@ -526,31 +526,6 @@ public Buffer read(int fd, long length, } } - @ExportMessage - @SuppressWarnings({"unused", "static-method"}) - public long readInto(int fd, Buffer data, - @Bind Node inliningTarget, - @Shared("errorBranch") @Cached InlinedBranchProfile errorBranch, - @Shared("eq") @Cached TruffleString.EqualNode eqNode) throws PosixException { - Channel channel = getFileChannel(fd); - if (!(channel instanceof ReadableByteChannel readableChannel)) { - errorBranch.enter(inliningTarget); - throw posixException(OSErrorEnum.EBADF); - } - try { - int n = doReadIntoChannel(readableChannel, data.data, (int) data.length); - return n < 0 ? 
0 : n; - } catch (Exception e) { - errorBranch.enter(inliningTarget); - throw posixException(OSErrorEnum.fromException(e, eqNode)); - } - } - - @TruffleBoundary(allowInlining = true) - private static int doReadIntoChannel(ReadableByteChannel channel, byte[] data, int length) throws IOException { - return channel.read(ByteBuffer.wrap(data, 0, length)); - } - @TruffleBoundary private static Buffer readBytesFromChannel(ReadableByteChannel channel, long sizeIn) throws IOException { long size = sizeIn; diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/LoggingPosixSupport.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/LoggingPosixSupport.java index 7172f5b5ce..e7371fadb9 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/LoggingPosixSupport.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/LoggingPosixSupport.java @@ -184,17 +184,6 @@ final Buffer read(int fd, long length, } } - @ExportMessage - final long readInto(int fd, Buffer data, - @CachedLibrary("this.delegate") PosixSupportLibrary lib) throws PosixException { - logEnter("readInto", "%d, %d", fd, data.length); - try { - return logExit("readInto", "%d", lib.readInto(delegate, fd, data)); - } catch (PosixException e) { - throw logException("readInto", e); - } - } - @ExportMessage final long write(int fd, Buffer data, @CachedLibrary("this.delegate") PosixSupportLibrary lib) throws PosixException { diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java index 9c343d9c34..1bf99341f3 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java @@ -565,17 +565,6 @@ public Buffer read(int fd, long length, 
return buffer.withLength(n); } - @ExportMessage - public long readInto(int fd, Buffer data, - @Shared("invoke") @Cached InvokeNativeFunction invokeNode) throws PosixException { - setErrno(invokeNode, 0); - long n = invokeNode.callLong(this, PosixNativeFunction.call_read, fd, data.data, data.length); - if (n < 0) { - throw getErrnoAndThrowPosixException(invokeNode); - } - return n; - } - @ExportMessage public long write(int fd, Buffer data, @Shared("invoke") @Cached InvokeNativeFunction invokeNode) throws PosixException { diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PosixSupportLibrary.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PosixSupportLibrary.java index 351f8d601c..5ad6aaac3e 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PosixSupportLibrary.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PosixSupportLibrary.java @@ -93,8 +93,6 @@ public abstract class PosixSupportLibrary extends Library { public abstract Buffer read(Object receiver, int fd, long length) throws PosixException; - public abstract long readInto(Object receiver, int fd, Buffer data) throws PosixException; - public abstract long write(Object receiver, int fd, Buffer data) throws PosixException; public abstract int dup(Object receiver, int fd) throws PosixException; diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PreInitPosixSupport.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PreInitPosixSupport.java index 26916c89df..9b36dc9fd0 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PreInitPosixSupport.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PreInitPosixSupport.java @@ -203,15 +203,6 @@ final Buffer read(int fd, long length, return nativeLib.read(nativePosixSupport, fd, length); } - @ExportMessage - final long 
readInto(int fd, Buffer data, - @CachedLibrary("this.nativePosixSupport") PosixSupportLibrary nativeLib) throws PosixException { - if (inPreInitialization) { - return PosixSupportLibrary.getUncached().readInto(emulatedPosixSupport, fd, data); - } - return nativeLib.readInto(nativePosixSupport, fd, data); - } - @ExportMessage final long write(int fd, Buffer data, @CachedLibrary("this.nativePosixSupport") PosixSupportLibrary nativeLib) throws PosixException { diff --git a/investigations/io_perf/notes.org b/investigations/io_perf/notes.org deleted file mode 100644 index 6a92d8ae8d..0000000000 --- a/investigations/io_perf/notes.org +++ /dev/null @@ -1,525 +0,0 @@ -* IO Perf Investigation Notes - -** Scope - -Investigation target: remaining performance gap in workload 07 / jsonrpc tokenizer style -strict request-response traffic, especially tiny write/flush + pipe readline overhead in -GraalPy's =_io= stack. - -Current reference commit for runtime changes: -- =930c9f5b09= Add lower-level readInto fast path for buffered reads - -Related commits in this investigation: -- =39bde29f49= Reduce tiny flush copying in =_io= -- =3cc0650e9c= WIP: add jsonrpc pipe profiling scripts -- =9d27960795= Add jsonrpc pipe microbenchmark to mx harness - -** Benchmark Shape - -In-repo harness benchmark: -- =graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py= - -Representative command: -#+begin_src bash -graalpy graalpython/com.oracle.graal.python.benchmarks/python/harness.py \ - graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py \ - -i 12 5000 text text mask 64 -#+end_src - -Interpretation: -- =5000= roundtrips per benchmark iteration -- =text text= means parent and worker both use text I/O wrappers -- =mask= uses the tokenizer-like normalize/mask request shape -- =64= payload bytes - -The heavier =5000=-roundtrip mode is preferred for A/B work over the smaller =500= mode, -because the smaller mode is too noisy for subtle changes. 
- -** Profiling Tooling - -Worker-only profiling scripts: -- =scripts/profile_jsonrpc_pipe_worker.py= -- =scripts/profile-jsonrpc-pipe-async-text.sh= -- =scripts/profile-jsonrpc-pipe-async-buffer.sh= -- =scripts/profile-jsonrpc-pipe-gprofng-text.sh= -- =scripts/profile-jsonrpc-pipe-gprofng-buffer.sh= - -Typical async-profiler runs: -#+begin_src bash -REQS=20000 scripts/profile-jsonrpc-pipe-async-text.sh /tmp/jsonrpc-ee-worker-text-current.svg -REQS=20000 scripts/profile-jsonrpc-pipe-async-buffer.sh /tmp/jsonrpc-ee-worker-buffer-current.svg -#+end_src - -Current useful profiler artifacts: -- =/tmp/jsonrpc-ee-worker-text-current.svg= -- =/tmp/jsonrpc-ee-worker-buffer-current.svg= - -Async-profiler conclusions: -- Whole-process launcher/harness profiles are dominated by compilation noise. -- Worker-only profiles are the useful ones. -- The comparison between =text= and =buffer= modes clearly isolates the remaining text-layer cost. - -gprofng status: -- Installed locally and usable. -- Produced coarse output dominated by == under this WSL2 setup. -- Useful as a secondary cross-check, not the primary driver for this work. 
- -** Current Hotspots - -From worker-only async-profiler on current committed state (=930c9f5b09=): - -Text-mode still shows: -- =TextIOWrapperBuiltins$WriteNode.write= -- =TextIOWrapperNodes$ReadlineNode.readline= -- =TextIOWrapperNodes$ReadChunkNode.readChunk= -- =BufferedWriterNodes$FlushUnlockedNode.bufferedwriterFlushUnlocked= -- =BufferedWriterNodes$RawWriteNode.bufferedwriterRawWrite= -- =FileIOBuiltins$ReadintoNode.readinto= -- =PosixModuleBuiltins$ReadNode.read= -- =FileIOBuiltins$WriteNode.write= -- =PosixModuleBuiltins$WriteNode.write= - -Buffer-mode drops the text wrapper layer and shows mainly: -- =BufferedReaderMixinBuiltins$BufferedReadlineNode.readline= -- =BufferedWriterNodes$FlushUnlockedNode.bufferedwriterFlushUnlocked= -- =BufferedWriterNodes$RawWriteNode.bufferedwriterRawWrite= -- =FileIOBuiltins$ReadintoNode.readinto= -- =PosixModuleBuiltins$ReadNode.read= -- =FileIOBuiltins$WriteNode.write= -- =PosixModuleBuiltins$WriteNode.write= - -Interpretation: -- Lower buffered/file/posix path has improved. -- Remaining gap is increasingly concentrated in the text wrapper read path. 
- -** Kept Changes - -*** =39bde29f49= Reduce tiny flush copying in =_io= - -What changed: -- Replaced pending text output buffering with a stealable byte buffer: - - =PTextIO= - - =PendingBytesOutputStream= -- Avoided =toByteArray()= copy on =TextIOWrapper= flush: - - =TextIOWrapperNodes.WriteFlushNode= -- Avoided slice copy in =BufferedWriterNodes.FlushUnlockedNode= when =writePos == 0= - -Measured effect on the small =mx benchmark micro:jsonrpc-pipe= configuration: -- =AVG (no warmup)= improved from =0.188 s= to =0.089 s= - -*** =930c9f5b09= Add lower-level readInto fast path for buffered reads - -What changed: -- Added =PosixSupportLibrary.readInto(Object receiver, int fd, Buffer data)= -- Implemented it in: - - =NFIPosixSupport= - - =EmulatedPosixSupport= - - =LoggingPosixSupport= - - =PreInitPosixSupport= -- Used it from =BufferedReaderMixinBuiltins.FillBufferNode= only when: - - refill starts at offset =0= - - raw object is cached =PFileIO= - -This stays below the Python protocol layer and avoids: -- temporary =PByteArray= -- Python-level =raw.readinto()= call -- extra copy back into the buffered reader's internal byte array - -Repeated heavy-run comparison against prior committed state: - -Baseline (=39bde29f49=), 5 runs: -- median =AVG (no warmup)=: =0.551 s= -- mean =AVG (no warmup)=: =0.537 s= -- min/max: =0.344 s= / =0.707 s= - -Candidate (=930c9f5b09=), 5 runs: -- median =AVG (no warmup)=: =0.476 s= -- mean =AVG (no warmup)=: =0.503 s= -- min/max: =0.346 s= / =0.682 s= - -Interpretation: -- This lower-level read-side change was worth keeping. 
- -** Rejected Experiments - -*** Rejected: Python-level direct fill-buffer shortcut - -Location: -- =BufferedReaderMixinBuiltins.FillBufferNode= - -Idea: -- When =start == 0=, call =raw.readinto()= directly on =self.getBuffer()= - -Why rejected: -- went back through Python-level dispatch -- added significant hot-node complexity -- repeated runs gave mixed signal and no clear win - -*** Rejected: =ReadChunkNode= telling / non-telling split - -Location: -- =TextIOWrapperNodes.ReadChunkNode= - -Idea: -- Split =self.isTelling()= and non-=telling= cases into separate specializations - -Repeated heavy-run results: -- runs: =0.446 s=, =0.804 s=, =0.515 s=, =0.617 s=, =0.621 s= -- median =0.617 s= -- mean =0.601 s= - -Compared to current baseline (=930c9f5b09=): -- median =0.476 s= -- mean =0.503 s= - -Why rejected: -- regression on repeated runs -- likely extra node shape / cache sharing cost outweighed saved work - -*** Rejected: =ReadChunkNode= =PBytes= fast path - -Location: -- =TextIOWrapperNodes.ReadChunkNode= - -Idea: -- If =inputChunk instanceof PBytes=, skip the generic buffer acquire/release path - -Repeated heavy-run results: -- runs: =0.515 s=, =0.647 s=, =0.510 s=, =0.707 s=, =0.613 s= -- median =0.613 s= -- mean =0.598 s= - -Compared to current baseline (=930c9f5b09=): -- median =0.476 s= -- mean =0.503 s= - -Why rejected: -- regression on repeated runs -- too small/local a fast path to beat the resulting code shape in practice - -*** Blocked: direct TextIOWrapper readline -> buffered byte readline delegation - -Location: -- =TextIOWrapperNodes.ReadlineNode= - -Idea: -- In the common unlimited, non-=tell()= case with no decoded-char backlog, delegate directly to - the underlying buffered byte =readline= node and decode just that one line. 
- -Reasoning: -- This would bypass: - - =TextIOWrapperNodes.ReadChunkNode= - - =FindLineEndingNode= - - some =TruffleString= churn in the common NDJSON case - -Why not pursued further yet: -- Cross-file Truffle DSL node construction blocked the straightforward implementation. -- Attempting to cache =BufferedReaderMixinBuiltins.BufferedReadlineNode= from - =TextIOWrapperNodes= did not compile: - - implicit =create()= is not available there - - explicit =...BufferedReadlineNodeGen.create()= was not accepted by the DSL expression parser -- Retried with explicit generated-node cache expressions and still hit DSL/parser visibility issues. - -This direction may still be worthwhile, but likely requires one of: -- moving a reusable helper into a place both nodes can access cleanly -- a small refactor in =BufferedReaderMixinBuiltins= -- or a different Java-level delegation approach that does not require cross-file cached-node construction - -*** Rejected: shared bufferedReadline helper + TextIOWrapper fast path - -Location: -- =BufferedReaderMixinBuiltins= -- =TextIOWrapperNodes.ReadlineNode= - -Idea: -- Extract the buffered byte-line acquisition logic into a reusable pure Java helper in - =BufferedReaderMixinBuiltins= -- Cache the necessary nodes at the call site in =TextIOWrapperNodes= -- In the common case (=limit < 0=, no decoded backlog, no =tell()= tracking), acquire a byte line - from the buffered layer and decode just that line - -Why this was attractive: -- avoids cross-file generated-node construction -- keeps lower-layer logic shared instead of duplicated -- bypasses =ReadChunkNode= and =FindLineEndingNode= in the common NDJSON case - -Repeated heavy-run results: -- runs: =0.520 s=, =0.391 s=, =0.526 s=, =0.665 s=, =0.550 s= -- median =AVG (no warmup)=: =0.526 s= -- mean =AVG (no warmup)=: =0.530 s= -- median =BEST=: =0.278 s= - -Compared to current baseline (=930c9f5b09=): -- baseline median =AVG (no warmup)=: =0.476 s= -- baseline mean =AVG (no warmup)=: =0.503 
s= -- baseline median =BEST=: =0.288 s= - -Conclusion: -- not good enough to keep -- slightly better tail minima / bests, but worse median and mean -- reverted - -** Measurement Guidance - -Preferred comparison protocol for future changes: -- Use the current committed state as A -- Use a single candidate patch as B -- Rebuild once for B -- Run 5 repeated harness invocations with: - - =-i 12 5000 text text mask 64= -- Compare at least: - - median =AVG (no warmup)= - - mean =AVG (no warmup)= - - min/max =AVG (no warmup)= - - median =BEST= - -Avoid using single-run =AVG (no warmup)= from the small =500=-roundtrip benchmark for go/no-go -decisions on subtle changes. - -** Current CPython Comparison - -Repeated heavy-run comparison using: -#+begin_src bash -graalpython/com.oracle.graal.python.benchmarks/python/harness.py \ - graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py \ - -i 12 5000 mask 64 -#+end_src - -Five-run summaries: - -| runtime/mode | median AVG (no warmup) | mean AVG (no warmup) | median BEST | -|--------------------+------------------------+-----------------------+-------------| -| CPython text/text | 0.529 s | 0.554 s | 0.508 s | -| CPython buffer/buffer | 0.513 s | 0.517 s | 0.488 s | -| GraalPy text/text | 0.446 s | 0.475 s | 0.277 s | -| GraalPy buffer/buffer | 0.615 s | 0.598 s | 0.298 s | - -Implications: -- On the current committed state, GraalPy =text/text= is *faster* than CPython on this harness benchmark. -- GraalPy =buffer/buffer= is still *slower* than CPython and also slower than GraalPy =text/text=. -- Therefore, the remaining end-to-end issue on this benchmark is not simply "TextIOWrapper is slower". -- The lower buffered/file/posix path still matters, and some previous assumptions should be re-checked - against the heavier benchmark protocol. 
- -** Interpreter (Compilation=false): native-ee standalone vs CPython - -*** Protocol - -Benchmark entrypoint and workload stayed the same as the prior investigation: -- =graalpython/com.oracle.graal.python.benchmarks/python/harness.py= -- =graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py= -- workload parameters: =5000 text text mask 64= - -Pinned harness commands used for this section: -#+begin_src bash -env LC_ALL=C.UTF-8 PYTHONHASHSEED=0 \ - GRAAL_PYTHON_VM_ARGS='--experimental-options --engine.Compilation=false' \ - taskset -c 2 ./mxbuild/linux-amd64/GRAALPY_NATIVE_STANDALONE/bin/graalpy \ - graalpython/com.oracle.graal.python.benchmarks/python/harness.py \ - graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py \ - -r 1 -i 12 5000 text text mask 64 - -env LC_ALL=C.UTF-8 PYTHONHASHSEED=0 \ - taskset -c 2 python3 \ - graalpython/com.oracle.graal.python.benchmarks/python/harness.py \ - graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py \ - -r 1 -i 12 5000 text text mask 64 -#+end_src - -Protocol details: -- =taskset -c 2= pins both parent and worker to one CPU -- =LC_ALL=C.UTF-8= and =PYTHONHASHSEED=0= kept constant across runs -- =-r 1= gives one unmeasured in-process pre-run before the 12 measured iterations -- repeated each harness invocation 5 times per runtime -- primary summary metric: cross-run median/mean of harness =AVG (all runs)= -- steady-state cross-check: mean of the last 6 raw durations from each 12-iteration harness run - -Why use =GRAAL_PYTHON_VM_ARGS= here: -- it ensures the worker subprocess launched by =jsonrpc-pipe.py= also sees - =--experimental-options --engine.Compilation=false= - -Hardware/software context for this section: -- host: WSL2 Linux =6.6.87.2-microsoft-standard-WSL2= -- CPU: =13th Gen Intel(R) Core(TM) i9-13900H=, 20 online CPUs -- native standalone: =GraalPy 3.12.8 (Oracle GraalVM Native 25.1.0)= -- CPython: =Python 3.12.11= - -*** Harness Results - 
-Five-run summaries for the pinned =-r 1 -i 12 5000 text text mask 64= protocol: - -| runtime | median AVG (all runs) | mean AVG (all runs) | median tail-6 avg | mean tail-6 avg | median BEST | -|----------------------+------------------------+---------------------+-------------------+-----------------+-------------| -| native-ee standalone | 1.203 s | 1.234 s | 1.162 s | 1.190 s | 1.124 s | -| CPython | 0.244 s | 0.244 s | 0.242 s | 0.244 s | 0.236 s | - -Observed gap: -- native-ee standalone is about =4.93x= slower than CPython by median harness =AVG (all runs)= -- even on the trailing-6 steady-state cross-check, native-ee standalone is still about =4.80x= slower - -Raw per-run summaries: -- native-ee standalone: - - run 1: =AVG(all)= =1.217 s=, =BEST= =1.104 s=, tail-6 avg =1.134 s= - - run 2: =AVG(all)= =1.160 s=, =BEST= =1.105 s=, tail-6 avg =1.127 s= - - run 3: =AVG(all)= =1.195 s=, =BEST= =1.124 s=, tail-6 avg =1.164 s= - - run 4: =AVG(all)= =1.203 s=, =BEST= =1.129 s=, tail-6 avg =1.162 s= - - run 5: =AVG(all)= =1.394 s=, =BEST= =1.304 s=, tail-6 avg =1.361 s= -- CPython: - - run 1: =AVG(all)= =0.250 s=, =BEST= =0.236 s=, tail-6 avg =0.253 s= - - run 2: =AVG(all)= =0.240 s=, =BEST= =0.236 s=, tail-6 avg =0.239 s= - - run 3: =AVG(all)= =0.243 s=, =BEST= =0.237 s=, tail-6 avg =0.242 s= - - run 4: =AVG(all)= =0.244 s=, =BEST= =0.235 s=, tail-6 avg =0.242 s= - - run 5: =AVG(all)= =0.244 s=, =BEST= =0.239 s=, tail-6 avg =0.242 s= - -Interpretation: -- the current interpreter-mode gap is large and repeatable -- the outlier native run 5 moves the mean a bit, but not the overall conclusion -- the steady-state tail still shows a large gap, so this is not just cold startup - -*** Worker Breakdown - -Worker-only =gprofng= profile for native-ee standalone with =Compilation=false=: -- collected by driving =15000= text-mode requests into a worker-only - =gprofng collect app -O /tmp/jsonrpc-native-text-compfalse.er -F off -- ...= - launch of: - - 
=./mxbuild/linux-amd64/GRAALPY_NATIVE_STANDALONE/bin/graalpy= - - =graalpython/com.oracle.graal.python.benchmarks/python/micro/jsonrpc-pipe.py --worker --worker-io=text= -- with =GRAAL_PYTHON_VM_ARGS='--experimental-options --engine.Compilation=false'= - -Top native-ee standalone interpreter-mode functions/suspects: -- =PBytecodeDSLRootNodeGen$CachedBytecodeNode.continueAt=: =65.47%= inclusive CPU -- =SubstrateEnterpriseOptimizedCallTarget.invokeFromInterpreter=: =18.83%= inclusive CPU -- =OptimizedCallTarget.callBoundary=: =16.14%= inclusive CPU -- =CallDispatchersFactory$FunctionCachedCallNodeGen$Inlined.execute=: =11.21%= inclusive CPU -- allocation/GC shows up materially: - - =G1Library.allocateArray=: =9.87%= inclusive CPU - - =MemAllocator::allocate=: =9.87%= inclusive CPU - - =slowPathNewInstance=: =4.93%= inclusive CPU -- text-layer work is present but not dominant on its own: - - =TextIOWrapperNodesFactory$ReadChunkNodeGen$Inlined.execute=: =5.38%= inclusive CPU -- JSON/string work is also visible: - - =JSONUtils.appendString= - - =JSONEncoderBuiltins...AppendSimpleObject= - - =JSONScannerBuiltins...scanOnceUnicode= - - =PatternBuiltins...SubnInnerNode= -- raw syscall wrappers are not the main bucket in this profile: - - =write=: =9.42%= exclusive CPU - - =read=: =0.90%= exclusive CPU - -Worker-only CPython =cProfile= on the same text-mode worker shape (15000 requests): -- =TextIOWrapper.readline=: =0.780 s= cumulative inside a =1.658 s= worker profile -- =TextIOWrapper.flush=: =0.227 s= -- =write_message=: =0.419 s= cumulative -- =read_message=: =0.946 s= cumulative -- =json.dumps=: =0.159 s= -- =json.loads=: =0.143 s= -- =mask_row=: =0.206 s= - -Interpretation: -- native-ee standalone spends a large fraction above the syscall boundary in bytecode interpreter - dispatch, call boundaries, dynamic dispatch, and allocation/GC -- CPython still pays most of its visible worker cost in text I/O and JSON, but those hot paths stay - largely in optimized C 
implementations rather than showing a large interpreter-dispatch bucket -- therefore the native-ee interpreter-mode gap is not just "readline/flush are slower"; a broader - dispatch/allocation cost is visible in the worker profile - -*** Syscall Cross-Check - -One =strace -f -c= run per runtime, same pinned direct benchmark shape: -- native-ee standalone direct =text/text=, =5000= roundtrips: - - clean wall time without =strace=: =1.979 s= - - traced syscall mix: - - =futex=: =95.09%= traced syscall time, 852 calls - - =read=: 10200 calls - - =write=: 10003 calls -- CPython direct =text/text=, =5000= roundtrips: - - clean wall time without =strace=: =0.573 s= - - traced syscall mix: - - =wait4=: =78.22%= traced syscall time, 85 calls - - =write=: 10043 calls - - =read=: 10502 calls - - =futex=: only 44 calls, =0.05%= traced syscall time - -Important caveat: -- =strace= perturbs wall times substantially under WSL2, so use it only for syscall mix, not for - timing conclusions - -Interpretation: -- native-ee standalone and CPython issue roughly the same order of magnitude of =read= and =write= - syscalls for this workload -- the main gap therefore is not "native-ee does far more pipe syscalls" -- native-ee shows much heavier =futex= activity, which suggests extra runtime coordination/synchronization - on top of the same basic I/O pattern - -*** Isolation Experiment: Drop Text Wrappers Only - -Direct-mode check using the same benchmark, pinned to CPU 2, 3 runs each: - -| runtime/mode | median wall time | -|----------------------------+------------------| -| native-ee standalone text/text | 1.701 s | -| native-ee standalone buffer/buffer | 1.377 s | -| CPython text/text | 0.280 s | -| CPython buffer/buffer | 0.276 s | - -Interpretation: -- removing =TextIOWrapper= helps native-ee standalone by about =19%= in this interpreter-mode direct check -- CPython changes very little between =text/text= and =buffer/buffer= on this workload -- but native-ee standalone 
=buffer/buffer= is still about =4.99x= slower than CPython =buffer/buffer= -- therefore text I/O is a meaningful contributor, but it does not explain the full interpreter-mode gap - -*** Current Hypotheses - -Most likely contributors to the remaining interpreter-mode gap: -- bytecode interpreter dispatch / call-boundary overhead in the native standalone -- object allocation and GC churn in request decode/normalize/encode paths -- text read path cost still matters, especially =ReadChunkNode=, but it is only part of the total gap -- extra runtime synchronization (visible in =futex= activity) may be contributing to end-to-end time - -Most likely productive next targets: -- reduce allocation and dispatch churn in the short-request text/JSON path -- re-check =TextIOWrapperNodes.ReadlineNode= and =ReadChunkNode=, but do not assume that fixing them - alone will close the gap -- inspect why the interpreter-mode native standalone still stays ~5x behind CPython even in - =buffer/buffer= mode - -*** Experiment Log - -- Kept for protocol: - - switched from command-line-only flags to =GRAAL_PYTHON_VM_ARGS= for native-ee standalone - - reason: ensures the worker subprocess also runs with =Compilation=false= -- Tried as isolation only: - - direct =buffer/buffer= runs under =Compilation=false= - - result: useful diagnostic, but not a runtime change -- No runtime code micro-optimization was committed in this pass: - - the new measurements point to multiple cost centers - - better to keep the notes reproducible first, then patch one hotspot at a time - -** WSL2 Notes - -Environment: -- WSL2 kernel detected -- =perf= is present but kernel-matched tooling is not configured cleanly -- =gprofng= is installed - -Practical consequence: -- use worker-only async-profiler as primary guide -- use gprofng only as a coarse cross-check - -** Next Likely Target - -Most likely remaining productive area: -- =TextIOWrapperNodes.ReadlineNode= - -Rationale: -- lower buffered/file/posix stack has 
already been improved -- text-mode still pays in: - - =TextIOWrapperNodes$ReadlineNode.readline= - - =TextIOWrapperNodes$ReadChunkNode.readChunk= - -But: -- recent attempts show that naive local fast paths in =ReadChunkNode= are easy to get wrong -- likely next useful change must avoid growing node shape too much -- focus should be on reducing actual =TruffleString= / substring / concat churn in the - common short-line case, not just adding more conditionals -- the promising "delegate to buffered byte readline" idea is currently blocked by DSL wiring issues From e054fcb401ed4909b15d734a7f9822be3ecee2f9 Mon Sep 17 00:00:00 2001 From: Tim Felgentreff Date: Thu, 16 Apr 2026 16:17:18 +0200 Subject: [PATCH 7/7] [GR-74843] Preserve safe buffered io write fast path --- .../builtins/modules/io/BufferedIONodes.java | 4 +- .../modules/io/BufferedWriterNodes.java | 48 ++++++++++++++++++- .../python/builtins/modules/io/PBuffered.java | 4 +- .../modules/io/TextIOWrapperNodes.java | 2 +- 4 files changed, 51 insertions(+), 7 deletions(-) diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedIONodes.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedIONodes.java index c126f5f927..f6de4820a8 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedIONodes.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedIONodes.java @@ -131,12 +131,12 @@ static boolean isClosed(PBuffered self) { } @SuppressWarnings("unused") - @Specialization(guards = {"self.getBuffer() != null", "self.isFastClosedChecks()"}) + @Specialization(guards = {"self.getBuffer() != null", "self.hasFileIORaw()"}) static boolean isClosedFileIO(PBuffered self) { return self.getFileIORaw().isClosed(); } - @Specialization(guards = {"self.getBuffer() != null", "!self.isFastClosedChecks()"}) + @Specialization(guards = 
{"self.getBuffer() != null", "!self.hasFileIORaw()"}) static boolean isClosedBuffered(VirtualFrame frame, Node inliningTarget, PBuffered self, @Cached PyObjectGetAttr getAttr, @Cached PyObjectIsTrueNode isTrue) { diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedWriterNodes.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedWriterNodes.java index c273e58231..76944609fa 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedWriterNodes.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/BufferedWriterNodes.java @@ -48,11 +48,16 @@ import static com.oracle.graal.python.builtins.modules.io.IONodes.T_WRITE; import static com.oracle.graal.python.nodes.ErrorMessages.IO_S_INVALID_LENGTH; import static com.oracle.graal.python.nodes.ErrorMessages.WRITE_COULD_NOT_COMPLETE_WITHOUT_BLOCKING; +import static com.oracle.graal.python.nodes.ErrorMessages.IO_CLOSED; +import static com.oracle.graal.python.nodes.ErrorMessages.FILE_NOT_OPEN_FOR_S; +import static com.oracle.graal.python.runtime.exception.PythonErrorType.IOUnsupportedOperation; import static com.oracle.graal.python.runtime.exception.PythonErrorType.OSError; import static com.oracle.graal.python.runtime.exception.PythonErrorType.ValueError; import com.oracle.graal.python.PythonLanguage; +import com.oracle.graal.python.builtins.modules.PosixModuleBuiltins; import com.oracle.graal.python.builtins.objects.PNone; +import com.oracle.graal.python.builtins.objects.exception.OSErrorEnum; import com.oracle.graal.python.builtins.objects.buffer.PythonBufferAccessLibrary; import com.oracle.graal.python.builtins.objects.bytes.PBytes; import com.oracle.graal.python.builtins.objects.common.SequenceStorageNodes; @@ -61,8 +66,12 @@ import com.oracle.graal.python.lib.PyNumberAsSizeNode; import com.oracle.graal.python.lib.PyObjectCallMethodObjArgs; 
import com.oracle.graal.python.nodes.PNodeWithContext; +import com.oracle.graal.python.nodes.PConstructAndRaiseNode; import com.oracle.graal.python.nodes.PRaiseNode; import com.oracle.graal.python.nodes.object.BuiltinClassProfiles.IsBuiltinObjectProfile; +import com.oracle.graal.python.runtime.PosixSupportLibrary; +import com.oracle.graal.python.runtime.PosixSupportLibrary.PosixException; +import com.oracle.graal.python.runtime.PythonContext; import com.oracle.graal.python.runtime.exception.PException; import com.oracle.graal.python.runtime.object.PFactory; import com.oracle.graal.python.util.PythonUtils; @@ -72,8 +81,10 @@ import com.oracle.truffle.api.dsl.GenerateInline; import com.oracle.truffle.api.dsl.Specialization; import com.oracle.truffle.api.frame.VirtualFrame; +import com.oracle.truffle.api.profiles.InlinedBranchProfile; import com.oracle.truffle.api.library.CachedLibrary; import com.oracle.truffle.api.nodes.Node; +import com.oracle.graal.python.runtime.GilNode; public class BufferedWriterNodes { @@ -222,7 +233,40 @@ abstract static class RawWriteNode extends PNodeWithContext { /** * implementation of cpython/Modules/_io/bufferedio.c:_bufferedwriter_raw_write */ - @Specialization + @SuppressWarnings("truffle-sharing") + @Specialization(guards = "self.hasFileIORaw()") + static int bufferedwriterRawWriteFileIO(VirtualFrame frame, Node inliningTarget, PBuffered self, byte[] buf, int len, + @Bind PythonContext context, + @CachedLibrary("context.getPosixSupport()") PosixSupportLibrary posixLib, + @Cached InlinedBranchProfile errorProfile, + @Cached GilNode gil, + @Cached PConstructAndRaiseNode.Lazy constructAndRaiseNode, + @Cached PRaiseNode raiseNode) { + PFileIO fileIO = self.getFileIORaw(); + if (fileIO.isClosed()) { + throw raiseNode.raise(inliningTarget, ValueError, IO_CLOSED); + } + if (!fileIO.isWritable()) { + throw raiseNode.raise(inliningTarget, IOUnsupportedOperation, FILE_NOT_OPEN_FOR_S, "writing"); + } + final int n; + try { + n = 
Math.toIntExact(PosixModuleBuiltins.WriteNode.write(fileIO.getFD(), buf, len, + inliningTarget, posixLib, context.getPosixSupport(), errorProfile, gil)); + } catch (PosixException e) { + if (e.getErrorCode() == OSErrorEnum.EAGAIN.getNumber()) { + return -2; + } + throw constructAndRaiseNode.get(inliningTarget).raiseOSErrorFromPosixException(frame, e); + } + if (n > 0 && self.getAbsPos() != -1) { + self.incAbsPos(n); + } + return n; + } + + @SuppressWarnings("truffle-sharing") + @Specialization(guards = "!self.hasFileIORaw()") static int bufferedwriterRawWrite(VirtualFrame frame, Node inliningTarget, PBuffered self, byte[] buf, int len, @Bind PythonLanguage language, @Cached PyObjectCallMethodObjArgs callMethod, @@ -274,7 +318,7 @@ protected static void bufferedwriterFlushUnlocked(VirtualFrame frame, PBuffered while (self.getWritePos() < self.getWriteEnd()) { byte[] buf; int len; - if (self.getWritePos() == 0) { + if (self.hasFileIORaw() && self.getWritePos() == 0) { buf = self.getBuffer(); len = self.getWriteEnd(); } else { diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PBuffered.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PBuffered.java index bbcf567d5c..bf71cb80ba 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PBuffered.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/PBuffered.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -158,7 +158,7 @@ public void setFinalizing(boolean finalizing) { this.finalizing = finalizing; } - public boolean isFastClosedChecks() { + public boolean hasFileIORaw() { return fileioRaw != null; } diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/TextIOWrapperNodes.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/TextIOWrapperNodes.java index f2afa5cb9d..71d9832ae4 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/TextIOWrapperNodes.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/io/TextIOWrapperNodes.java @@ -884,7 +884,7 @@ static void init(VirtualFrame frame, Node inliningTarget, PTextIO self, Object b if (buffer instanceof PBuffered) { /* Cache the raw FileIO object to speed up 'closed' checks */ - if (((PBuffered) buffer).isFastClosedChecks()) { + if (((PBuffered) buffer).hasFileIORaw()) { PFileIO f = ((PBuffered) buffer).getFileIORaw(); self.setFileIO(f); }