diff --git a/compute/capture_guard_test.go b/compute/capture_guard_test.go index 91eb5a1..b1acbcd 100644 --- a/compute/capture_guard_test.go +++ b/compute/capture_guard_test.go @@ -2,7 +2,12 @@ package compute import ( "errors" + "fmt" "testing" + "unsafe" + + "github.com/zerfoo/ztensor/internal/cuda" + "github.com/zerfoo/ztensor/internal/gpuapi" ) // TestEnsureNotCapturing_NilStream verifies that ensureNotCapturing returns @@ -15,6 +20,73 @@ func TestEnsureNotCapturing_NilStream(t *testing.T) { } } +// TestEnsureNotCapturing_NilPtr verifies that ensureNotCapturing returns nil +// when the engine has a stream whose Ptr() is nil. This can happen when a +// stream object is present but the underlying vendor handle was never +// assigned (CPU-shim runtimes). +func TestEnsureNotCapturing_NilPtr(t *testing.T) { + e := &GPUEngine[float32]{stream: nilPtrStream{}} + if err := e.ensureNotCapturing(); err != nil { + t.Fatalf("ensureNotCapturing on nil-ptr stream: got %v, want nil", err) + } +} + +// TestEnsureNotCapturing_ProbeStatuses is a table-driven test that walks +// every cudaStreamCaptureStatus value through ensureNotCapturing and asserts +// the mapping to the guard's outcome: +// - None -> nil (allocation allowed) +// - Active -> ErrCaptureIncompatibleAllocation +// - Invalidated -> nil (guard only blocks Active; fallback logic handles Invalidated) +func TestEnsureNotCapturing_ProbeStatuses(t *testing.T) { + tests := []struct { + name string + status cuda.CaptureStatus + want error + }{ + {name: "None allows allocation", status: cuda.CaptureStatusNone, want: nil}, + {name: "Active blocks allocation", status: cuda.CaptureStatusActive, want: ErrCaptureIncompatibleAllocation}, + {name: "Invalidated does not trip the active guard", status: cuda.CaptureStatusInvalidated, want: nil}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + restore := swapCaptureStatusFn(func(_ *cuda.Stream) (cuda.CaptureStatus, error) { + return tc.status, nil + }) + 
defer restore() + + e := &GPUEngine[float32]{stream: fakePtrStream{}} + got := e.ensureNotCapturing() + if !errors.Is(got, tc.want) && got != tc.want { + t.Fatalf("ensureNotCapturing(status=%v): got %v, want %v", tc.status, got, tc.want) + } + }) + } +} + +// TestEnsureNotCapturing_ProbeError verifies that when cudaStreamGetCaptureInfo +// itself fails, ensureNotCapturing returns that error (wrapped for context) and +// does NOT silently treat the stream as safe. Probe failure must propagate so +// callers fail loud instead of racing a hang on GB10. +func TestEnsureNotCapturing_ProbeError(t *testing.T) { + probeErr := errors.New("cudaStreamGetCaptureInfo failed: synthetic") + restore := swapCaptureStatusFn(func(_ *cuda.Stream) (cuda.CaptureStatus, error) { + return cuda.CaptureStatusNone, probeErr + }) + defer restore() + + e := &GPUEngine[float32]{stream: fakePtrStream{}} + err := e.ensureNotCapturing() + if err == nil { + t.Fatal("ensureNotCapturing: expected error from failing probe, got nil") + } + if !errors.Is(err, probeErr) { + t.Fatalf("ensureNotCapturing: expected error to wrap probe error, got %v", err) + } + if errors.Is(err, ErrCaptureIncompatibleAllocation) { + t.Fatal("ensureNotCapturing: probe error must not be surfaced as ErrCaptureIncompatibleAllocation") + } +} + // TestErrCaptureIncompatibleAllocation_Is verifies that // ErrCaptureIncompatibleAllocation is a sentinel error usable with // errors.Is, both directly and when wrapped. @@ -28,6 +100,17 @@ func TestErrCaptureIncompatibleAllocation_Is(t *testing.T) { } } +// TestErrCaptureIncompatibleAllocation_FmtErrorfWrap verifies that the sentinel +// survives fmt.Errorf("...: %w", ...) wrapping — the idiom callers in +// allocWeight / uploadBytes use indirectly via ensureNotCapturing and that +// downstream callers use when adding their own context. 
+func TestErrCaptureIncompatibleAllocation_FmtErrorfWrap(t *testing.T) { + wrapped := fmt.Errorf("upload layer %d: %w", 7, ErrCaptureIncompatibleAllocation) + if !errors.Is(wrapped, ErrCaptureIncompatibleAllocation) { + t.Fatalf("errors.Is through fmt.Errorf wrap: got false, want true (err=%v)", wrapped) + } +} + // wrapErr emulates a caller that wraps the sentinel error with %w. // Kept local to the test to avoid leaking helpers into the package API. func wrapErr(err error) error { @@ -38,3 +121,40 @@ type wrappedErr struct{ inner error } func (w *wrappedErr) Error() string { return "wrapped: " + w.inner.Error() } func (w *wrappedErr) Unwrap() error { return w.inner } + +// swapCaptureStatusFn replaces the package-level captureStatusFn for a test +// and returns a restore closure. Callers defer restore() to keep tests hermetic. +func swapCaptureStatusFn(fn func(*cuda.Stream) (cuda.CaptureStatus, error)) func() { + prev := captureStatusFn + captureStatusFn = fn + return func() { captureStatusFn = prev } +} + +// fakeStreamSentinel backs fakePtrStream.Ptr() with a stable address so that +// escape-analysis does not re-allocate per call and returned pointers remain +// valid for the lifetime of the test binary. The probe is stubbed, so the +// handle is never dereferenced. +var fakeStreamSentinel byte + +// fakePtrStream satisfies gpuapi.Stream and returns a non-nil Ptr so that +// ensureNotCapturing proceeds past the early-return guards and exercises the +// probe path. Synchronize / Destroy are never called by the guard. +type fakePtrStream struct{} + +func (fakePtrStream) Synchronize() error { return nil } +func (fakePtrStream) Destroy() error { return nil } +func (fakePtrStream) Ptr() unsafe.Pointer { return unsafe.Pointer(&fakeStreamSentinel) } + +// nilPtrStream satisfies gpuapi.Stream but returns a nil Ptr. Used to cover +// the "stream present but unbacked" branch of ensureNotCapturing. 
+type nilPtrStream struct{} + +func (nilPtrStream) Synchronize() error { return nil } +func (nilPtrStream) Destroy() error { return nil } +func (nilPtrStream) Ptr() unsafe.Pointer { return nil } + +// Compile-time assertions that the fakes satisfy gpuapi.Stream. +var ( + _ gpuapi.Stream = fakePtrStream{} + _ gpuapi.Stream = nilPtrStream{} +) diff --git a/compute/gpu_engine.go b/compute/gpu_engine.go index b11ff91..6f6c64e 100644 --- a/compute/gpu_engine.go +++ b/compute/gpu_engine.go @@ -573,6 +573,11 @@ func (e *GPUEngine[T]) UploadWeights(tensors []*tensor.TensorNumeric[float32]) e return nil } +// captureStatusFn is the indirection point for cuda.StreamCaptureStatus used +// by ensureNotCapturing. Tests swap it to inject synthetic capture state +// without requiring real CUDA hardware. +var captureStatusFn = cuda.StreamCaptureStatus + // ensureNotCapturing returns ErrCaptureIncompatibleAllocation if the // engine's stream is currently capturing a CUDA graph. On CPU-only // runtimes or when the stream handle is nil, returns nil (no capture @@ -587,7 +592,7 @@ func (e *GPUEngine[T]) ensureNotCapturing() error { return nil } s := cuda.StreamFromPtr(ptr) - status, err := cuda.StreamCaptureStatus(s) + status, err := captureStatusFn(s) if err != nil { return fmt.Errorf("ensureNotCapturing: %w", err) } diff --git a/compute/gpu_engine_alloc_guard_test.go b/compute/gpu_engine_alloc_guard_test.go new file mode 100644 index 0000000..1cd45aa --- /dev/null +++ b/compute/gpu_engine_alloc_guard_test.go @@ -0,0 +1,113 @@ +package compute + +import ( + "errors" + "testing" + + "github.com/zerfoo/ztensor/internal/cuda" +) + +// TestAllocWeight_PropagatesCaptureSentinel confirms the capture guard's +// sentinel flows out of allocWeight unchanged. A caller wrapping the error +// with fmt.Errorf("%w") must still match the sentinel via errors.Is so that +// fallback paths (CaptureSafe, later epics) can catch the exact failure mode. 
+func TestAllocWeight_PropagatesCaptureSentinel(t *testing.T) { + restore := swapCaptureStatusFn(func(_ *cuda.Stream) (cuda.CaptureStatus, error) { + return cuda.CaptureStatusActive, nil + }) + defer restore() + + e := &GPUEngine[float32]{stream: fakePtrStream{}} + ptr, err := e.allocWeight(4096) + if err == nil { + t.Fatal("allocWeight under active capture: expected error, got nil") + } + if !errors.Is(err, ErrCaptureIncompatibleAllocation) { + t.Fatalf("allocWeight: expected ErrCaptureIncompatibleAllocation, got %v", err) + } + if ptr != nil { + t.Fatalf("allocWeight: expected nil pointer on guard trip, got %p", ptr) + } +} + +// TestAllocWeight_PropagatesProbeError confirms that if the capture probe +// itself fails, allocWeight returns the wrapped probe error — not the +// sentinel, and not a nil error that would let a hang happen silently. +func TestAllocWeight_PropagatesProbeError(t *testing.T) { + probeErr := errors.New("cudaStreamGetCaptureInfo failed: synthetic") + restore := swapCaptureStatusFn(func(_ *cuda.Stream) (cuda.CaptureStatus, error) { + return cuda.CaptureStatusNone, probeErr + }) + defer restore() + + e := &GPUEngine[float32]{stream: fakePtrStream{}} + ptr, err := e.allocWeight(4096) + if err == nil { + t.Fatal("allocWeight with failing probe: expected error, got nil") + } + if !errors.Is(err, probeErr) { + t.Fatalf("allocWeight: expected wrapped probe error, got %v", err) + } + if errors.Is(err, ErrCaptureIncompatibleAllocation) { + t.Fatal("allocWeight: probe failure must not be reported as capture sentinel") + } + if ptr != nil { + t.Fatalf("allocWeight: expected nil pointer on probe failure, got %p", ptr) + } +} + +// TestUploadBytes_PropagatesCaptureSentinel mirrors the allocWeight test on +// the upload path. uploadBytes is the second weight-load entry point touched +// during UploadWeights, so both must fail loud under active capture. 
+func TestUploadBytes_PropagatesCaptureSentinel(t *testing.T) {
+	restore := swapCaptureStatusFn(func(_ *cuda.Stream) (cuda.CaptureStatus, error) {
+		return cuda.CaptureStatusActive, nil
+	})
+	defer restore()
+
+	e := &GPUEngine[float32]{stream: fakePtrStream{}}
+	src := []byte{0x01, 0x02, 0x03, 0x04}
+	err := e.uploadBytes(nil, src)
+	if err == nil {
+		t.Fatal("uploadBytes under active capture: expected error, got nil")
+	}
+	if !errors.Is(err, ErrCaptureIncompatibleAllocation) {
+		t.Fatalf("uploadBytes: expected ErrCaptureIncompatibleAllocation, got %v", err)
+	}
+}
+
+// TestUploadBytes_PropagatesProbeError confirms probe failures propagate out
+// of uploadBytes the same way they do out of allocWeight.
+func TestUploadBytes_PropagatesProbeError(t *testing.T) {
+	probeErr := errors.New("cudaStreamGetCaptureInfo failed: synthetic")
+	restore := swapCaptureStatusFn(func(_ *cuda.Stream) (cuda.CaptureStatus, error) {
+		return cuda.CaptureStatusNone, probeErr
+	})
+	defer restore()
+
+	e := &GPUEngine[float32]{stream: fakePtrStream{}}
+	src := []byte{0x01, 0x02}
+	err := e.uploadBytes(nil, src)
+	if err == nil {
+		t.Fatal("uploadBytes with failing probe: expected error, got nil")
+	}
+	if !errors.Is(err, probeErr) {
+		t.Fatalf("uploadBytes: expected wrapped probe error, got %v", err)
+	}
+	if errors.Is(err, ErrCaptureIncompatibleAllocation) {
+		t.Fatal("uploadBytes: probe failure must not be reported as capture sentinel")
+	}
+}
+
+// TestEnsureNotCapturing_AllowsAllocationWhenStreamAbsent is a negative
+// control: on an engine with a nil stream (CPU-only path), allocWeight must
+// NOT be short-circuited by the guard. We cannot safely drive it into the
+// real runtime Malloc here (no GPU), but we can confirm the guard returns nil
+// and the failure, if any, comes from downstream (runtime == nil panic would
+// indicate the guard path is wrong).
+func TestEnsureNotCapturing_AllowsAllocationWhenStreamAbsent(t *testing.T) { + e := &GPUEngine[float32]{} + if err := e.ensureNotCapturing(); err != nil { + t.Fatalf("ensureNotCapturing with nil stream: got %v, want nil", err) + } +} diff --git a/compute/gpu_engine_gb10_test.go b/compute/gpu_engine_gb10_test.go new file mode 100644 index 0000000..7b8ee6e --- /dev/null +++ b/compute/gpu_engine_gb10_test.go @@ -0,0 +1,184 @@ +//go:build dgxgb10 + +package compute + +import ( + "context" + "errors" + "fmt" + "testing" + "time" + + "github.com/zerfoo/ztensor/tensor" +) + +// TestCUDAGraph_MultiTensorUpload_GB10 reproduces the GB10 hang where a +// capture region starts, an allocation-during-capture happens, and +// StreamEndCapture deadlocks. It is gated by //go:build dgxgb10 so it +// only runs on the DGX Spark host; the DGX runner is expected to pass +// -tags dgxgb10. +// +// The test accepts three outcomes so pre-fix and post-fix states are +// both observable: +// +// 1. EndCapture returns a valid graph: E2 fix is in place. The test +// passes. +// 2. BeginCapture or EndCapture returns ErrCaptureIncompatibleAllocation +// (or any wrapping of it): the probe from T1.2 caught the unsafe +// allocation synchronously. The test records this and passes. +// 3. The capture body does not complete inside a 30 second timeout: +// the hang is still present. The test calls t.Fatal. This is the +// signal that the fix regressed (or is not yet in place). +// +// Hangs manifest as a deadlock inside StreamEndCapture on GB10 with +// allocations issued during capture, so the 30s guard is the only +// reliable way to surface the bug without hanging the whole test +// binary. +func TestCUDAGraph_MultiTensorUpload_GB10(t *testing.T) { + eng := newTestGPUEngine(t) + + uploadTensors := buildGB10StressTensors(t) + if err := eng.UploadWeights(uploadTensors); err != nil { + t.Fatalf("UploadWeights: %v", err) + } + + // Pair of tensors used inside the capture region for MatMul. 
+ // 256x1024 * 1024x256 matches a tensor uploaded above and exercises + // the dense float32 kernel that triggers the hang on GB10. + aData := make([]float32, 256*1024) + for i := range aData { + aData[i] = float32(i%7) * 0.125 + } + bData := make([]float32, 1024*256) + for i := range bData { + bData[i] = float32(i%5) * 0.0625 + } + a, err := tensor.New[float32]([]int{256, 1024}, aData) + if err != nil { + t.Fatalf("tensor.New A: %v", err) + } + b, err := tensor.New[float32]([]int{1024, 256}, bData) + if err != nil { + t.Fatalf("tensor.New B: %v", err) + } + if err := eng.UploadWeights([]*tensor.TensorNumeric[float32]{a, b}); err != nil { + t.Fatalf("UploadWeights(matmul operands): %v", err) + } + + // 30 second watchdog: if the capture lifecycle does not complete, + // the goroutine is leaked but the test fails the offending run so + // the CI job surfaces the bug instead of spinning forever. + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + type captureResult struct { + handle GraphHandle + err error + // phase identifies where the failure originated so the log + // distinguishes BeginCapture errors (T1.2 probe) from + // EndCapture errors (post-fix graph instantiation failures). + phase string + } + done := make(chan captureResult, 1) + + go func() { + if err := eng.BeginCapture(); err != nil { + done <- captureResult{err: err, phase: "BeginCapture"} + return + } + // Run a MatMul inside the capture region. On the pre-fix path + // this is the op whose cudaMallocAsync call deadlocks + // StreamEndCapture downstream. + if _, err := eng.MatMul(context.Background(), a, b); err != nil { + // If MatMul itself fails synchronously we still need to + // clean up the capture state before surfacing the error. 
+ _, endErr := eng.EndCapture() + if endErr != nil { + err = fmt.Errorf("%w (EndCapture cleanup: %v)", err, endErr) + } + done <- captureResult{err: err, phase: "MatMul"} + return + } + handle, err := eng.EndCapture() + done <- captureResult{handle: handle, err: err, phase: "EndCapture"} + }() + + select { + case <-ctx.Done(): + t.Fatal("hang detected -- capture lifecycle did not complete within 30s") + case res := <-done: + // Ensure any captured graph is released even if the test fails + // later in its assertions. + t.Cleanup(func() { + if res.err == nil { + _ = eng.DestroyGraph(res.handle) + } + }) + + if res.err == nil { + t.Logf("capture completed cleanly in phase=%s; fix is in place", res.phase) + return + } + if errors.Is(res.err, ErrCaptureIncompatibleAllocation) { + t.Logf("observed ErrCaptureIncompatibleAllocation in phase=%s (expected pre-fix outcome): %v", res.phase, res.err) + return + } + t.Fatalf("unexpected capture error in phase=%s: %v", res.phase, res.err) + } +} + +// buildGB10StressTensors constructs >=50 float32 tensors spanning a mix +// of shapes that matches the production upload pattern that triggers the +// hang: several row-major matrices, a 256x1024 dense matrix, and a +// handful of long 1-D vectors. Each tensor is populated with a cheap +// deterministic pattern so MatMul inside the capture region produces +// non-zero work. +func buildGB10StressTensors(t *testing.T) []*tensor.TensorNumeric[float32] { + t.Helper() + + // 50 varied tensors. The 256x1024 matrix is mandatory because it is + // the shape that reproduces on GB10; the remainder is spread across + // smaller shapes to force the allocator to touch multiple size + // buckets in the pool. 
+ shapes := [][]int{ + {256, 1024}, + {64, 64}, {64, 64}, {64, 64}, {64, 64}, + {128, 256}, {128, 256}, {128, 256}, {128, 256}, + {1024}, + {512, 128}, {512, 128}, + {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, + {256}, {256}, {256}, + {128, 128}, {128, 128}, {128, 128}, {128, 128}, + {16, 16}, {16, 16}, {16, 16}, {16, 16}, {16, 16}, {16, 16}, + {512}, + {64, 128}, {64, 128}, + {8, 8}, {8, 8}, {8, 8}, {8, 8}, {8, 8}, {8, 8}, {8, 8}, {8, 8}, + {2048}, + {96, 96}, {96, 96}, {96, 96}, + {4, 4}, {4, 4}, {4, 4}, {4, 4}, {4, 4}, + {1024, 64}, + } + if len(shapes) < 50 { + t.Fatalf("shape list too short: %d", len(shapes)) + } + + out := make([]*tensor.TensorNumeric[float32], 0, len(shapes)) + for i, shape := range shapes { + n := 1 + for _, d := range shape { + n *= d + } + data := make([]float32, n) + for j := range data { + // Mix the tensor index into the value to avoid identical + // payloads being deduped by any future cache layer. + data[j] = float32((i+1)*(j+1)%131) * 0.03125 + } + tn, err := tensor.New[float32](shape, data) + if err != nil { + t.Fatalf("tensor.New shape=%v: %v", shape, err) + } + out = append(out, tn) + } + return out +} diff --git a/docs/plan.md b/docs/plan.md index 054e4d5..2727822 100644 --- a/docs/plan.md +++ b/docs/plan.md @@ -231,16 +231,16 @@ All estimates are rough; refine when a task starts. - [x] T1.2 Add `ensureNotCapturing()` guard to `allocWeight` and `uploadBytes` in `compute/gpu_engine.go`. If status is `Active`, return a typed error `ErrCaptureIncompatibleAllocation`. Owner: task-T1.2. Est: 60m. verifies: [UC-003] Completed: 2026-04-15 - Acceptance: Existing non-capture tests unaffected. New unit test with a mock stream in `Active` state triggers the error. - Dependencies: T1.1. -- [ ] T1.3 Write `TestCUDAGraph_MultiTensorUpload_GB10` in `compute/gpu_engine_test.go` gated behind `//go:build dgxgb10` build tag. 
The test uploads 50 tensors (including a 256x1024 float32 matrix), then invokes `BeginCapture`, runs a MatMul, `EndCapture`. Owner: TBD. Est: 2h. verifies: [UC-001, UC-002] +- [x] T1.3 Write `TestCUDAGraph_MultiTensorUpload_GB10` in `compute/gpu_engine_gb10_test.go` gated behind `//go:build dgxgb10` build tag. The test uploads 50 tensors (including a 256x1024 float32 matrix), then invokes `BeginCapture`, runs a MatMul, `EndCapture`. Owner: task-T1.3. Est: 2h. verifies: [UC-001, UC-002] Completed: 2026-04-15 - Acceptance: Without the fix the test fails with either a hang (caught by a 30s `context.WithTimeout`) or the new typed error. - Dependencies: T1.2. - [ ] T1.4 Package the test into a Spark manifest `docs/bench/manifests/cuda-graph-gb10-repro.yaml` and submit. Collect logs for evidence. Owner: TBD. Est: 90m. verifies: [UC-002] - Acceptance: Manifest submitted via `curl -X POST $SPARK/api/v1/pods ...`; log output includes the hang signature or the new typed error. File one zerfoo-side GitHub issue if a new failure mode surfaces. - Dependencies: T1.3. -- [ ] T1.5 Add unit and integration tests covering T1.1 to T1.3 code paths. Owner: TBD. Est: 60m. verifies: [infrastructure] +- [x] T1.5 Add unit and integration tests covering T1.1 to T1.3 code paths. Owner: task-T1.5. Est: 60m. verifies: [infrastructure] Completed: 2026-04-15 - Acceptance: CPU-mock unit tests pass in `go test ./compute/... ./internal/cuda/...`. - Dependencies: T1.1, T1.2. -- [ ] T1.6 Run `gofmt -s -w`, `goimports`, and `golangci-lint run ./...` after the E1 changes. Owner: TBD. Est: 15m. verifies: [infrastructure] +- [x] T1.6 Run `gofmt -s -w`, `goimports`, and `golangci-lint run ./...` after the E1 changes. Owner: coordinator. Est: 15m. verifies: [infrastructure] Completed: 2026-04-15 - Dependencies: T1.5. ### E2 Fix the silent hang path (capture-aware allocation) @@ -339,9 +339,9 @@ count equals the number of task IDs listed on that wave. 
#### Wave 2: Reproduction harness (3 agents) -- [ ] T1.3 Write `TestCUDAGraph_MultiTensorUpload_GB10` verifies: [UC-001, UC-002] -- [ ] T1.5 Unit and integration tests for E1 verifies: [infrastructure] -- [ ] T1.6 Lint and format E1 verifies: [infrastructure] +- [x] T1.3 Write `TestCUDAGraph_MultiTensorUpload_GB10` verifies: [UC-001, UC-002] 2026-04-15 +- [x] T1.5 Unit and integration tests for E1 verifies: [infrastructure] 2026-04-15 +- [x] T1.6 Lint and format E1 verifies: [infrastructure] 2026-04-15 #### Wave 3: Repro on hardware (1 agent) diff --git a/internal/cuda/purego.go b/internal/cuda/purego.go index b22313d..2db0f93 100644 --- a/internal/cuda/purego.go +++ b/internal/cuda/purego.go @@ -13,25 +13,25 @@ type CUDALib struct { handle uintptr // dlopen handle for libcudart // CUDA runtime function pointers - cudaMalloc uintptr - cudaFree uintptr - cudaMemcpy uintptr - cudaMemcpyAsync uintptr - cudaMallocManaged uintptr - cudaStreamCreate uintptr - cudaStreamSynchronize uintptr - cudaStreamDestroy uintptr - cudaGetDeviceCount uintptr - cudaSetDevice uintptr - cudaGetErrorString uintptr - cudaGetDeviceProperties uintptr + cudaMalloc uintptr + cudaFree uintptr + cudaMemcpy uintptr + cudaMemcpyAsync uintptr + cudaMallocManaged uintptr + cudaStreamCreate uintptr + cudaStreamSynchronize uintptr + cudaStreamDestroy uintptr + cudaGetDeviceCount uintptr + cudaSetDevice uintptr + cudaGetErrorString uintptr + cudaGetDeviceProperties uintptr cudaMemcpyPeer uintptr cudaDeviceGetAttribute uintptr // Async alloc/free (optional, available since CUDA 11.2) - cudaMallocAsync uintptr - cudaFreeAsync uintptr - cudaMemsetAsync uintptr + cudaMallocAsync uintptr + cudaFreeAsync uintptr + cudaMemsetAsync uintptr // CUDA graph API (optional, resolved separately -- may not exist on older runtimes) cudaStreamBeginCapture uintptr @@ -179,12 +179,12 @@ const ( // We also check common CUDA installation directories and the ztensor module // source tree for development builds. 
var kernelLibPaths = []string{ - "libkernels.so", // LD_LIBRARY_PATH + system default - "./libkernels.so", // current working directory - "./internal/cuda/kernels/libkernels.so", // ztensor source tree (dev) - "/usr/local/lib/libkernels.so", // standard local install - "/usr/local/cuda/lib64/libkernels.so", // CUDA install directory - "/opt/zerfoo/lib/libkernels.so", // packaged install + "libkernels.so", // LD_LIBRARY_PATH + system default + "./libkernels.so", // current working directory + "./internal/cuda/kernels/libkernels.so", // ztensor source tree (dev) + "/usr/local/lib/libkernels.so", // standard local install + "/usr/local/cuda/lib64/libkernels.so", // CUDA install directory + "/opt/zerfoo/lib/libkernels.so", // packaged install } // DlopenKernels loads the custom kernels shared library (libkernels.so) diff --git a/internal/cuda/runtime_purego_test.go b/internal/cuda/runtime_purego_test.go index b1f4a61..14cf035 100644 --- a/internal/cuda/runtime_purego_test.go +++ b/internal/cuda/runtime_purego_test.go @@ -31,6 +31,41 @@ func TestStreamCaptureStatus_NoRuntime(t *testing.T) { } } +// TestStreamFromPtr_NilHandle verifies StreamFromPtr accepts a nil input and +// produces a Stream whose Ptr() reports nil. This is the path compute's +// ensureNotCapturing uses to short-circuit before invoking the CUDA probe on +// a stream that was never bound to a vendor handle. +func TestStreamFromPtr_NilHandle(t *testing.T) { + s := StreamFromPtr(nil) + if s == nil { + t.Fatal("StreamFromPtr(nil) returned nil Stream") + } + if got := s.Ptr(); got != nil { + t.Fatalf("StreamFromPtr(nil).Ptr(): got %p, want nil", got) + } +} + +// TestStreamCaptureStatus_ZeroStream exercises the path where the caller +// hands in a Stream whose handle is the zero value (e.g. a freshly wrapped +// nil pointer). When the CUDA runtime is unavailable, the binding must still +// return CaptureStatusNone with no error rather than panicking on the zero +// handle. 
+func TestStreamCaptureStatus_ZeroStream(t *testing.T) { + if Available() { + // On CUDA-enabled hosts the zero handle is invalid; skip instead of + // probing the driver with an illegal argument. + t.Skip("zero-handle probe is only safe when CUDA is unavailable") + } + var s Stream // handle == 0 + status, err := StreamCaptureStatus(&s) + if err != nil { + t.Fatalf("StreamCaptureStatus(zero stream) returned error: %v", err) + } + if status != CaptureStatusNone { + t.Fatalf("StreamCaptureStatus(zero stream): got %v, want CaptureStatusNone", status) + } +} + func TestCaptureStatus_EnumValues(t *testing.T) { // Compile-time exhaustive switch — ensures enum values stay stable and // every variant remains addressable from client code.