From e11bbf2267b7efb1fe72c4024f69c46e716471af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86gir=20M=C3=A1ni=20Hauksson?= <54936225+sourcehawk@users.noreply.github.com> Date: Wed, 22 Apr 2026 18:58:37 +0100 Subject: [PATCH] update owner status outside of component reconciliation --- README.md | 27 ++- docs/component.md | 71 +++++- docs/guidelines.md | 41 +++- e2e/framework/cluster_reconciler.go | 12 +- e2e/framework/reconciler.go | 12 +- .../component-prerequisites/app/controller.go | 13 +- examples/custom-resource/app/controller.go | 23 +- .../extraction-and-guards/app/controller.go | 23 +- .../grace-inconsistency/app/controller.go | 23 +- .../mutations-and-gating/app/controller.go | 23 +- pkg/component/component.go | 137 ++++++++---- pkg/component/component_test.go | 6 + pkg/component/conditions.go | 35 +-- pkg/component/conditions_test.go | 208 ++++++++++++++---- 14 files changed, 484 insertions(+), 170 deletions(-) diff --git a/README.md b/README.md index 5cb7b340..ee15dc77 100644 --- a/README.md +++ b/README.md @@ -238,27 +238,38 @@ func NewWebInterfaceComponent(owner *MyOperatorCR) (*component.Component, error) The controller builds the component and hands it to the framework. ```go -func (r *MyReconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { +func (r *MyReconciler) Reconcile(ctx context.Context, req reconcile.Request) (_ reconcile.Result, err error) { owner := &MyOperatorCR{} if err := r.Get(ctx, req.NamespacedName, owner); err != nil { return reconcile.Result{}, client.IgnoreNotFound(err) } - comp, err := NewWebInterfaceComponent(owner) - if err != nil { - return reconcile.Result{}, err - } - - return reconcile.Result{}, comp.Reconcile(ctx, component.ReconcileContext{ + recCtx := component.ReconcileContext{ Client: r.Client, Scheme: r.Scheme, Recorder: r.Recorder, Metrics: r.Metrics, Owner: owner, - }) + } + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() + + comp, err := NewWebInterfaceComponent(owner) + if err != nil { + return reconcile.Result{}, err + } + + return reconcile.Result{}, comp.Reconcile(ctx, recCtx) } ``` +Components stage their conditions on `owner` in memory; a single deferred `component.FlushStatus` at the end of the +reconcile loop persists every condition with one `Status().Update` call. This keeps controllers with multiple components +free of self-induced 409 conflicts. + ## Beyond the Basics The Quick Start shows the common path. The sections below highlight capabilities that matter once your operator grows diff --git a/docs/component.md b/docs/component.md index d3b923bc..90ed7149 100644 --- a/docs/component.md +++ b/docs/component.md @@ -32,6 +32,7 @@ reports their aggregate health through one condition on the owner CRD. - [Grace Period](#grace-period) - [Suspension Lifecycle](#suspension-lifecycle) - [ReconcileContext](#reconcilecontext) +- [Persisting Status with FlushStatus](#persisting-status-with-flushstatus) - [Guards](#guards) - [Registering a Guard](#registering-a-guard) - [Guard Behavior](#guard-behavior) @@ -262,7 +263,9 @@ This means a read-only resource registered before a managed resource can extract resource's guard or mutations. **Phase 5: Status aggregation and condition update.** The health of each resource is collected, the grace period is -consulted, and a single aggregate condition is written to the owner object's status. +consulted, and a single aggregate condition is written to the owner object's conditions **in memory**. `Reconcile` never +calls the Kubernetes API to persist status; the controller does that in a single write at the end of its reconcile loop. +See [Persisting Status with FlushStatus](#persisting-status-with-flushstatus). **Phase 6: Resource deletion.** Resources registered for deletion are removed from the cluster. @@ -428,7 +431,7 @@ recCtx := component.ReconcileContext{ Client: r.Client, // sigs.k8s.io/controller-runtime/pkg/client Scheme: r.Scheme, // *runtime.Scheme Recorder: r.Recorder, // record.EventRecorder - Metrics: r.Metrics, // component.Recorder (condition metrics) + Metrics: r.Metrics, // component.Recorder (condition metrics), optional Owner: owner, // the CRD that owns this component } @@ -437,9 +440,57 @@ err = comp.Reconcile(ctx, recCtx) Dependencies are passed explicitly so components remain testable and decoupled from global state. -The `Metrics` field is required. The framework records Prometheus metrics for every condition state transition during -reconciliation. The recorder implementation is provided by -[go-crd-condition-metrics](https://github.com/sourcehawk/go-crd-condition-metrics). +The `Metrics` field is optional. When set, the framework records Prometheus metrics for every condition reported during +a reconcile. The recorder implementation is provided by +[go-crd-condition-metrics](https://github.com/sourcehawk/go-crd-condition-metrics). Leave the field `nil` to opt out of +metric recording. + +## Persisting Status with FlushStatus + +`Component.Reconcile` only mutates the owner's status conditions in memory. The controller is responsible for writing +those conditions to the Kubernetes API by calling `component.FlushStatus` once per reconcile, typically from a deferred +call so that conditions set on error paths are still persisted: + +```go +func (r *MyReconciler) Reconcile(ctx context.Context, req reconcile.Request) (_ reconcile.Result, err error) { + owner := &v1alpha1.MyApp{} + if err := r.Get(ctx, req.NamespacedName, owner); err != nil { + return reconcile.Result{}, client.IgnoreNotFound(err) + } + + recCtx := component.ReconcileContext{ + Client: r.Client, + Scheme: r.Scheme, + Recorder: r.Recorder, + Metrics: r.Metrics, + Owner: owner, + } + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() + + comp, err := buildMyComponent(owner) + if err != nil { + return reconcile.Result{}, err + } + return reconcile.Result{}, comp.Reconcile(ctx, recCtx) +} +``` + +`FlushStatus` performs one `Status().Update` call that writes every condition currently on the owner in memory, wrapped +in `retry.RetryOnConflict`. If another writer updated the owner between the controller's initial `Get` and this call, +`FlushStatus` refetches, reapplies the conditions staged during the reconcile, and retries. Conditions managed by other +writers on the same owner are preserved because `meta.SetStatusCondition` merges by condition type. + +After the update succeeds, `FlushStatus` records metrics for every condition on the owner. If `rec.Metrics` is nil, +metric recording is skipped. + +This split is what allows a controller with several components (see [Keep Controllers Thin](./guidelines.md) and +[One Component Per Logical Condition](./guidelines.md)) to stage several conditions during one reconcile and persist +them all in a single write. Persisting after every component would race the components' writes against each other and +produce 409 conflicts. ## Guards @@ -460,8 +511,14 @@ The following example shows the complete pattern. A cloud provider role resource bucket resource uses that ARN in its spec and guards against being applied before the ARN is available: ```go -func (r *MyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - // ...fetch owner... +func (r *MyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, err error) { + // ...fetch owner and build recCtx... + + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() // roleARN is scoped to this reconcile call. The role resource's data extractor // populates it after the role is applied. Because extraction runs per-resource diff --git a/docs/guidelines.md b/docs/guidelines.md index 1c229749..8b52f679 100644 --- a/docs/guidelines.md +++ b/docs/guidelines.md @@ -301,34 +301,55 @@ clearer than splitting them into `DeploymentReady` and `ServiceReady`. ## Keep Controllers Thin -Controllers should fetch the owner, decide which components to build, and call `Reconcile()`. Business logic, resource -construction, and feature decisions belong in components and their resource builders. +Controllers should fetch the owner, decide which components to build, call `Reconcile()`, and defer a single +`component.FlushStatus` to persist status. Business logic, resource construction, and feature decisions belong in +components and their resource builders. ```go -func (r *MyReconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { +func (r *MyReconciler) Reconcile(ctx context.Context, req reconcile.Request) (_ reconcile.Result, err error) { owner := &v1alpha1.MyApp{} if err := r.Get(ctx, req.NamespacedName, owner); err != nil { return reconcile.Result{}, client.IgnoreNotFound(err) } - comp, err := buildWebComponent(owner) - if err != nil { - return reconcile.Result{}, err - } - - return reconcile.Result{}, comp.Reconcile(ctx, component.ReconcileContext{ + recCtx := component.ReconcileContext{ Client: r.Client, Scheme: r.Scheme, Recorder: r.Recorder, Metrics: r.Metrics, Owner: owner, - }) + } + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() + + comp, err := buildWebComponent(owner) + if err != nil { + return reconcile.Result{}, err + } + + return reconcile.Result{}, comp.Reconcile(ctx, recCtx) } ``` This keeps controller logic trivial to test (there is almost nothing to test) and makes component construction functions independently testable as pure functions: owner in, component out, no cluster required. +### Flushing status is the controller's job + +`Component.Reconcile` only mutates the owner's conditions in memory. Persisting them is explicitly the controller's +responsibility, via one `component.FlushStatus` call per reconcile, typically deferred so that conditions set by error +paths (for example, `fail()` in the framework) are still written when `Reconcile` returns an error. + +Do not call `FlushStatus` in between component reconciles. With several components per controller the point of the split +is to stage all their conditions in memory first and write them once at the end. Flushing between components brings back +the exact 409 conflict pattern the split was introduced to eliminate. + +If you do not want to emit condition metrics, leave `ReconcileContext.Metrics` as `nil`. `FlushStatus` tolerates a nil +recorder and simply skips metric emission. + ## Resource Registration Order Is Execution Order Resources are reconciled in the exact order they are registered with `WithResource()`. This is deliberate: guards and diff --git a/e2e/framework/cluster_reconciler.go b/e2e/framework/cluster_reconciler.go index 8f634787..9c113fb8 100644 --- a/e2e/framework/cluster_reconciler.go +++ b/e2e/framework/cluster_reconciler.go @@ -82,7 +82,7 @@ func (r *ClusterE2EReconciler) Unregister(name string) { // Reconcile implements reconcile.Reconciler. It fetches the ClusterTestApp, looks up // the registered factory, builds the component, and reconciles it. -func (r *ClusterE2EReconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { +func (r *ClusterE2EReconciler) Reconcile(ctx context.Context, req reconcile.Request) (_ reconcile.Result, err error) { logger := log.FromContext(ctx).WithValues("clustertestapp", req.NamespacedName) owner := &ClusterTestApp{} @@ -98,20 +98,19 @@ func (r *ClusterE2EReconciler) Reconcile(ctx context.Context, req reconcile.Requ r.mu.RUnlock() var comp *component.Component - var err error switch { case hasComp: comp, err = compFactory(owner) case hasRes: - res, buildErr := resFactory(owner) + resource, buildErr := resFactory(owner) if buildErr != nil { return reconcile.Result{}, buildErr } comp, err = component.NewComponentBuilder(). WithName("e2e-test"). WithConditionType("E2EReady"). - WithResource(res, component.ResourceOptions{}). + WithResource(resource, component.ResourceOptions{}). Suspend(owner.Spec.Suspended). Build() default: @@ -130,6 +129,11 @@ func (r *ClusterE2EReconciler) Reconcile(ctx context.Context, req reconcile.Requ Metrics: r.Metrics, Owner: owner, } + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() if err := comp.Reconcile(ctx, recCtx); err != nil { return reconcile.Result{}, err diff --git a/e2e/framework/reconciler.go b/e2e/framework/reconciler.go index 2bfd629c..c366f19a 100644 --- a/e2e/framework/reconciler.go +++ b/e2e/framework/reconciler.go @@ -82,7 +82,7 @@ func (r *E2EReconciler) Unregister(key types.NamespacedName) { // Reconcile implements reconcile.Reconciler. It fetches the TestApp, looks up // the registered factory, builds the component, and reconciles it. -func (r *E2EReconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { +func (r *E2EReconciler) Reconcile(ctx context.Context, req reconcile.Request) (_ reconcile.Result, err error) { logger := log.FromContext(ctx).WithValues("testapp", req.NamespacedName) owner := &TestApp{} @@ -98,20 +98,19 @@ func (r *E2EReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r r.mu.RUnlock() var comp *component.Component - var err error switch { case hasComp: comp, err = compFactory(owner) case hasRes: - res, buildErr := resFactory(owner) + resource, buildErr := resFactory(owner) if buildErr != nil { return reconcile.Result{}, buildErr } comp, err = component.NewComponentBuilder(). WithName("e2e-test"). WithConditionType("E2EReady"). - WithResource(res, component.ResourceOptions{}). + WithResource(resource, component.ResourceOptions{}). Suspend(owner.Spec.Suspended). Build() default: @@ -130,6 +129,11 @@ func (r *E2EReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r Metrics: r.Metrics, Owner: owner, } + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() if err := comp.Reconcile(ctx, recCtx); err != nil { return reconcile.Result{}, err diff --git a/examples/component-prerequisites/app/controller.go b/examples/component-prerequisites/app/controller.go index 91f04406..6f1cefd4 100644 --- a/examples/component-prerequisites/app/controller.go +++ b/examples/component-prerequisites/app/controller.go @@ -27,7 +27,13 @@ type Controller struct { } // Reconcile builds and reconciles the infra and app components in order. -func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) error { +// +// Both components share the same ReconcileContext and stage their conditions +// on the owner in memory; a single deferred FlushStatus at the end of +// reconciliation persists both conditions in one API call. That is what +// prevents the sequential components from racing two separate status updates +// against the same owner and hitting conflicts. +func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) (err error) { recCtx := component.ReconcileContext{ Client: r.Client, Scheme: r.Scheme, @@ -35,6 +41,11 @@ func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) error { Metrics: r.Metrics, Owner: owner, } + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() // --- Infra component: no prerequisites --- cmResource, err := r.NewConfigMapResource(owner) diff --git a/examples/custom-resource/app/controller.go b/examples/custom-resource/app/controller.go index dfd16dd9..30ed308a 100644 --- a/examples/custom-resource/app/controller.go +++ b/examples/custom-resource/app/controller.go @@ -23,7 +23,20 @@ type Controller struct { } // Reconcile builds and reconciles a single component managing the certificate. -func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) error { +func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) (err error) { + recCtx := component.ReconcileContext{ + Client: r.Client, + Scheme: r.Scheme, + Recorder: r.Recorder, + Metrics: r.Metrics, + Owner: owner, + } + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() + certResource, err := r.NewCertificateResource(owner) if err != nil { return err @@ -38,11 +51,5 @@ func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) error { return err } - return comp.Reconcile(ctx, component.ReconcileContext{ - Client: r.Client, - Scheme: r.Scheme, - Recorder: r.Recorder, - Metrics: r.Metrics, - Owner: owner, - }) + return comp.Reconcile(ctx, recCtx) } diff --git a/examples/extraction-and-guards/app/controller.go b/examples/extraction-and-guards/app/controller.go index 8f00c9ec..464d7816 100644 --- a/examples/extraction-and-guards/app/controller.go +++ b/examples/extraction-and-guards/app/controller.go @@ -30,7 +30,20 @@ type Controller struct { // Reconcile builds and reconciles a component where the ConfigMap is registered // before the Secret. Registration order matters: the guard on the Secret can // only read data extracted by a preceding resource. -func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) error { +func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) (err error) { + recCtx := component.ReconcileContext{ + Client: r.Client, + Scheme: r.Scheme, + Recorder: r.Recorder, + Metrics: r.Metrics, + Owner: owner, + } + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() + // Shared state: the ConfigMap extractor writes here, the Secret guard reads it. var dbHost string @@ -54,11 +67,5 @@ func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) error { return err } - return comp.Reconcile(ctx, component.ReconcileContext{ - Client: r.Client, - Scheme: r.Scheme, - Recorder: r.Recorder, - Metrics: r.Metrics, - Owner: owner, - }) + return comp.Reconcile(ctx, recCtx) } diff --git a/examples/grace-inconsistency/app/controller.go b/examples/grace-inconsistency/app/controller.go index 42c8ebb7..297205c0 100644 --- a/examples/grace-inconsistency/app/controller.go +++ b/examples/grace-inconsistency/app/controller.go @@ -25,7 +25,20 @@ type Controller struct { // Reconcile builds and reconciles a component with grace period and // inconsistency suppression. -func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) error { +func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) (err error) { + recCtx := component.ReconcileContext{ + Client: r.Client, + Scheme: r.Scheme, + Recorder: r.Recorder, + Metrics: r.Metrics, + Owner: owner, + } + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() + deployResource, err := r.NewDeploymentResource(owner) if err != nil { return err @@ -52,11 +65,5 @@ func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) error { return err } - return comp.Reconcile(ctx, component.ReconcileContext{ - Client: r.Client, - Scheme: r.Scheme, - Recorder: r.Recorder, - Metrics: r.Metrics, - Owner: owner, - }) + return comp.Reconcile(ctx, recCtx) } diff --git a/examples/mutations-and-gating/app/controller.go b/examples/mutations-and-gating/app/controller.go index 9e0f15b0..d2fc5305 100644 --- a/examples/mutations-and-gating/app/controller.go +++ b/examples/mutations-and-gating/app/controller.go @@ -24,7 +24,20 @@ type Controller struct { } // Reconcile builds and reconciles a single component containing both resources. -func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) error { +func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) (err error) { + recCtx := component.ReconcileContext{ + Client: r.Client, + Scheme: r.Scheme, + Recorder: r.Recorder, + Metrics: r.Metrics, + Owner: owner, + } + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() + deployResource, err := r.NewDeploymentResource(owner) if err != nil { return err @@ -55,11 +68,5 @@ func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) error { return err } - return comp.Reconcile(ctx, component.ReconcileContext{ - Client: r.Client, - Scheme: r.Scheme, - Recorder: r.Recorder, - Metrics: r.Metrics, - Owner: owner, - }) + return comp.Reconcile(ctx, recCtx) } diff --git a/pkg/component/component.go b/pkg/component/component.go index c6c1ec45..28cd24ae 100644 --- a/pkg/component/component.go +++ b/pkg/component/component.go @@ -5,10 +5,12 @@ import ( "fmt" "time" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" + "k8s.io/client-go/util/retry" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" @@ -30,6 +32,8 @@ type OperatorCRD interface { } // Recorder is an interface for recording status condition changes as metrics. +// It is optional: a [ReconcileContext] may leave [ReconcileContext.Metrics] +// nil, in which case [FlushStatus] skips metric emission. type Recorder interface { // RecordConditionFor records a condition change for a specific object and kind. RecordConditionFor( @@ -47,7 +51,8 @@ type ReconcileContext struct { Scheme *runtime.Scheme // Recorder is the event recorder for publishing Kubernetes events. Recorder record.EventRecorder - // Metrics is the recorder for status condition metrics. + // Metrics is the recorder for status condition metrics. It is optional; if + // nil, [FlushStatus] will skip metric emission. Metrics Recorder // Owner is the custom resource that owns and is updated by the components. Owner OperatorCRD @@ -134,8 +139,12 @@ func (c *Component) GetCondition(owner OperatorCRD) Condition { // Reconcile converges the component to the desired state. // -// A component manages its own condition on the parent and updates it accordingly -// to represent currently observable facts about the component status. +// A component manages its own condition on the parent and updates it in-memory +// to represent currently observable facts about the component status. Reconcile +// never writes to the Kubernetes API status subresource; the controller is +// responsible for persisting the final status by calling [FlushStatus] exactly +// once per reconciliation, typically via defer so that conditions set on error +// paths are still written. // // Reconciliation follows these steps: // @@ -170,7 +179,9 @@ func (c *Component) GetCondition(owner OperatorCRD) Condition { // // 6. Condition Update: Derives a new component condition using a stateful // progression model that considers the aggregate resource status, the -// previous condition, and the configured grace period to avoid churn. +// previous condition, and the configured grace period to avoid churn. The +// new condition is written to the owner in memory only; call [FlushStatus] +// to persist. // // 7. Resource Deletion: Finally, it deletes any resources registered for deletion. func (c *Component) Reconcile(ctx context.Context, rec ReconcileContext) error { @@ -184,7 +195,7 @@ func (c *Component) Reconcile(ctx context.Context, rec ReconcileContext) error { mapper := rec.Client.RESTMapper() if mapper == nil { return fail( - ctx, rec, c.conditionType, fmt.Errorf( + rec, c.conditionType, fmt.Errorf( "ReconcileContext.Client.RESTMapper() returned nil; a valid RESTMapper is required for reconciliation", ), ) @@ -195,17 +206,18 @@ func (c *Component) Reconcile(ctx context.Context, rec ReconcileContext) error { enabled, err := c.featureGate.Enabled() if err != nil { cond := conditionFeatureGateError(c.conditionType, err, rec.Owner.GetGeneration()) - _ = setStatusCondition(ctx, rec, cond) + applyStatusCondition(rec, cond) return err } if !enabled { if err := deleteResources(ctx, rec, c.allManagedResources(), withDeletionReason("disabled feature gate")); err != nil { - return fail(ctx, rec, c.conditionType, err) + return fail(rec, c.conditionType, err) } cond := conditionDisabled(c.conditionType, rec.Owner.GetGeneration()) - return setStatusCondition(ctx, rec, cond) + applyStatusCondition(rec, cond) + return nil } } @@ -219,13 +231,14 @@ func (c *Component) Reconcile(ctx context.Context, rec ReconcileContext) error { result, err := c.evaluatePrerequisites(rec) if err != nil { cond := conditionPrerequisiteNotMet(c.conditionType, err.Error(), rec.Owner.GetGeneration()) - _ = setStatusCondition(ctx, rec, cond) + applyStatusCondition(rec, cond) return err } if result.Status == PrerequisiteStatusNotMet { cond := conditionPrerequisiteNotMet(c.conditionType, result.Reason, rec.Owner.GetGeneration()) - return setStatusCondition(ctx, rec, cond) + applyStatusCondition(rec, cond) + return nil } } } @@ -236,7 +249,7 @@ func (c *Component) Reconcile(ctx context.Context, rec ReconcileContext) error { managed := c.managedResources() results, err := suspendResources(ctx, rec, managed, c.name, mapper) if err != nil { - return fail(ctx, rec, c.conditionType, err) + return fail(rec, c.conditionType, err) } cond := suspendingCondition( @@ -244,12 +257,10 @@ func (c *Component) Reconcile(ctx context.Context, rec ReconcileContext) error { suspensionResults(results).summary(), rec.Owner.GetGeneration(), ) - if err := setStatusCondition(ctx, rec, cond); err != nil { - return err - } + applyStatusCondition(rec, cond) if err := deleteResources(ctx, rec, c.deleteResources); err != nil { - return fail(ctx, rec, c.conditionType, err) + return fail(rec, c.conditionType, err) } return nil @@ -261,7 +272,7 @@ func (c *Component) Reconcile(ctx context.Context, rec ReconcileContext) error { // is available to subsequent resources' guards and mutations. results, err := reconcileResources(ctx, rec, c.reconcileResources, c.name, mapper) if err != nil { - return fail(ctx, rec, c.conditionType, err) + return fail(rec, c.conditionType, err) } // Determine new condition for component @@ -272,12 +283,10 @@ func (c *Component) Reconcile(ctx context.Context, rec ReconcileContext) error { c.gracePeriod, c.GetCondition(rec.Owner), ) - if err := setStatusCondition(ctx, rec, cond); err != nil { - return err - } + applyStatusCondition(rec, cond) if err := deleteResources(ctx, rec, c.deleteResources); err != nil { - return fail(ctx, rec, c.conditionType, err) + return fail(rec, c.conditionType, err) } return nil @@ -348,23 +357,77 @@ func (c *Component) managedResources() []Resource { return managed } -// fail sets the component's error status condition on the owner and returns the -// provided error. -// -// This helper centralizes the common reconciliation pattern where a failure -// should both: -// 1. Update the component condition on the owner to reflect the error. -// 2. Propagate the error to stop further reconciliation. -// -// The error from setting the status condition is intentionally ignored because -// the original reconciliation error is considered the primary failure. -func fail( - ctx context.Context, - rec ReconcileContext, - conditionType ConditionType, - err error, -) error { +// fail writes an error condition for the component to the owner in memory and +// returns the provided error. Persistence of the error condition happens when +// the controller calls [FlushStatus], typically from a deferred call that +// still runs even when Reconcile returns an error. +func fail(rec ReconcileContext, conditionType ConditionType, err error) error { cond := conditionError(conditionType, err, rec.Owner.GetGeneration()) - _ = setStatusCondition(ctx, rec, cond) + applyStatusCondition(rec, cond) return err } + +// FlushStatus persists the owner's current status conditions to the Kubernetes +// API and records condition metrics for every condition on the owner. +// +// Controllers must call FlushStatus exactly once per reconciliation, typically +// via defer so that conditions set on error paths are still persisted: +// +// func (r *MyReconciler) Reconcile(ctx context.Context, req reconcile.Request) (res reconcile.Result, err error) { +// owner := &v1alpha1.MyApp{} +// if err := r.Get(ctx, req.NamespacedName, owner); err != nil { +// return reconcile.Result{}, client.IgnoreNotFound(err) +// } +// rec := component.ReconcileContext{ /* ... */ Owner: owner} +// defer func() { +// if flushErr := component.FlushStatus(ctx, rec); flushErr != nil && err == nil { +// err = flushErr +// } +// }() +// return reconcile.Result{}, comp.Reconcile(ctx, rec) +// } +// +// On a 409 Conflict (for example if an external writer updated the owner +// between the controller fetching it and this call) FlushStatus refetches the +// owner, re-applies the conditions staged during reconciliation using +// meta.SetStatusCondition, and retries. Conditions managed by other writers on +// the owner are preserved because meta.SetStatusCondition merges by condition +// type. +// +// If rec.Metrics is nil, metric recording is skipped. All other fields of rec +// must be populated. +func FlushStatus(ctx context.Context, rec ReconcileContext) error { + desired := append([]metav1.Condition(nil), *rec.Owner.GetStatusConditions()...) + + err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + updateErr := rec.Client.Status().Update(ctx, rec.Owner) + if updateErr == nil { + return nil + } + if !apierrors.IsConflict(updateErr) { + return updateErr + } + key := client.ObjectKeyFromObject(rec.Owner) + if getErr := rec.Client.Get(ctx, key, rec.Owner); getErr != nil { + return getErr + } + for _, cond := range desired { + meta.SetStatusCondition(rec.Owner.GetStatusConditions(), cond) + } + return updateErr + }) + if err != nil { + return err + } + + if rec.Metrics == nil { + return nil + } + for _, cond := range *rec.Owner.GetStatusConditions() { + rec.Metrics.RecordConditionFor( + rec.Owner.GetKind(), rec.Owner, cond.Type, string(cond.Status), + cond.Reason, cond.LastTransitionTime.Time, + ) + } + return nil +} diff --git a/pkg/component/component_test.go b/pkg/component/component_test.go index 05b6d536..7aa1ebf9 100644 --- a/pkg/component/component_test.go +++ b/pkg/component/component_test.go @@ -42,7 +42,13 @@ var _ = Describe("Component Reconciler", func() { } }) + // getOwnerCondition mirrors what a controller does in production: after + // Reconcile mutates the owner's status conditions in memory, a deferred + // FlushStatus writes them to the API. The helper flushes then refetches so + // that the returned condition reflects the persisted state, including + // error conditions set via fail(). getOwnerCondition := func() Condition { + Expect(FlushStatus(ctx, recCtx)).To(Succeed()) updatedOwner := &MockOperatorCRD{} Expect(k8sClient.Get(ctx, client.ObjectKey{Name: owner.Name, Namespace: namespace}, updatedOwner)).To(Succeed()) return comp.GetCondition(updatedOwner) diff --git a/pkg/component/conditions.go b/pkg/component/conditions.go index d1e3fcc8..8d79b188 100644 --- a/pkg/component/conditions.go +++ b/pkg/component/conditions.go @@ -1,8 +1,6 @@ package component import ( - "context" - "github.com/sourcehawk/operator-component-framework/pkg/component/concepts" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -153,29 +151,12 @@ func conditionUnknown(component ConditionType, observedGeneration int64) Conditi } } -// setStatusCondition updates the component condition on the owner CRD's status. -// It performs three key actions: -// 1. Updates the condition in the owner's status condition slice. -// 2. Records the condition change in the metrics recorder. -// 3. If the condition has changed, it persists the update to the Kubernetes API. -func setStatusCondition( - ctx context.Context, rec ReconcileContext, cond Condition, -) error { - changed := meta.SetStatusCondition(rec.Owner.GetStatusConditions(), metav1.Condition(cond)) - updated := meta.FindStatusCondition(*rec.Owner.GetStatusConditions(), cond.Type) - - if updated != nil { - rec.Metrics.RecordConditionFor( - rec.Owner.GetKind(), rec.Owner, updated.Type, string(updated.Status), - updated.Reason, updated.LastTransitionTime.Time, - ) - } - - if changed { - if err := rec.Client.Status().Update(ctx, rec.Owner); err != nil { - return err - } - } - - return nil +// applyStatusCondition updates the component condition on the owner's in-memory +// status conditions. It does not call the Kubernetes API and does not record +// metrics; persistence and metrics recording are performed once per reconcile +// by [FlushStatus]. Keeping this function purely in-memory is what allows a +// controller with several components to share a single status write at the end +// of reconciliation instead of racing multiple writes against the same owner. +func applyStatusCondition(rec ReconcileContext, cond Condition) { + meta.SetStatusCondition(rec.Owner.GetStatusConditions(), metav1.Condition(cond)) } diff --git a/pkg/component/conditions_test.go b/pkg/component/conditions_test.go index 35cbc939..4a72ef6e 100644 --- a/pkg/component/conditions_test.go +++ b/pkg/component/conditions_test.go @@ -3,6 +3,7 @@ package component import ( "context" "errors" + "fmt" "testing" "time" @@ -10,8 +11,10 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -191,10 +194,7 @@ func TestConditionMethods(t *testing.T) { }) } -func TestSetStatusCondition(t *testing.T) { - scheme := runtime.NewScheme() - _ = AddToScheme(scheme) - +func TestApplyStatusCondition(t *testing.T) { owner := &MockOperatorCRD{ ObjectMeta: metav1.ObjectMeta{ Name: "test-owner", @@ -203,7 +203,12 @@ func TestSetStatusCondition(t *testing.T) { }, } - ctx := context.Background() + rec := ReconcileContext{ + Client: failingClient{}, + Metrics: metricsThatPanic{}, + Owner: owner, + } + cond := Condition{ Type: "TestComponent", Status: metav1.ConditionTrue, @@ -212,56 +217,123 @@ func TestSetStatusCondition(t *testing.T) { ObservedGeneration: 1, } - t.Run("should update status and record metrics", func(t *testing.T) { - k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithStatusSubresource(owner).WithObjects(owner).Build() - metrics := &MockMetrics{} + // Pure in-memory mutation: no client call, no metrics call. Using a client + // that would fail on any API access and a metrics recorder that would + // panic on any invocation proves applyStatusCondition never reaches either. + applyStatusCondition(rec, cond) - rec := ReconcileContext{ - Client: k8sClient, - Metrics: metrics, - Owner: owner, + conditions := owner.GetStatusConditions() + require.Len(t, *conditions, 1) + assert.Equal(t, cond.Type, (*conditions)[0].Type) + assert.Equal(t, metav1.ConditionTrue, (*conditions)[0].Status) + assert.Equal(t, cond.Reason, (*conditions)[0].Reason) +} + +func TestFlushStatus(t *testing.T) { + ctx := context.Background() + scheme := runtime.NewScheme() + require.NoError(t, AddToScheme(scheme)) + + newOwner := func() *MockOperatorCRD { + return &MockOperatorCRD{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-owner", + Namespace: "default", + Generation: 1, + }, } + } + + cond := func(ctype, reason string, status metav1.ConditionStatus) Condition { + return Condition{Type: ctype, Status: status, Reason: reason, ObservedGeneration: 1} + } - metrics.On("RecordConditionFor", - owner.GetKind(), owner, cond.Type, string(cond.Status), - cond.Reason, mock.AnythingOfType("time.Time"), mock.Anything, - ).Return() + t.Run("persists every condition on the owner and records a metric per condition", func(t *testing.T) { + owner := newOwner() + applyStatusCondition(ReconcileContext{Owner: owner}, cond("InfraReady", "Ready", metav1.ConditionTrue)) + applyStatusCondition(ReconcileContext{Owner: owner}, cond("AppReady", "Ready", metav1.ConditionTrue)) - err := setStatusCondition(ctx, rec, cond) - require.NoError(t, err) + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithStatusSubresource(owner).WithObjects(owner).Build() + metrics := &MockMetrics{} + metrics.On("RecordConditionFor", owner.GetKind(), owner, "InfraReady", + string(metav1.ConditionTrue), "Ready", mock.Anything, mock.Anything).Return().Once() + metrics.On("RecordConditionFor", owner.GetKind(), owner, "AppReady", + string(metav1.ConditionTrue), "Ready", mock.Anything, mock.Anything).Return().Once() - // Verify condition set in owner - updated := owner.GetStatusConditions() - assert.Len(t, *updated, 1) - assert.Equal(t, cond.Type, (*updated)[0].Type) + require.NoError(t, FlushStatus(ctx, ReconcileContext{Client: k8sClient, Metrics: metrics, Owner: owner})) + persisted := &MockOperatorCRD{} + require.NoError(t, k8sClient.Get(ctx, client.ObjectKeyFromObject(owner), persisted)) + assert.Len(t, persisted.Status.Conditions, 2) metrics.AssertExpectations(t) }) - t.Run("should return error if update fails", func(t *testing.T) { - // Use a manual mock client to simulate error - innerClient := fake.NewClientBuilder().WithScheme(scheme).WithStatusSubresource(owner).WithObjects(owner).Build() - k8sClient := &errorMockClient{Client: innerClient} - metrics := &MockMetrics{} + t.Run("is a no-op when metrics recorder is nil", func(t *testing.T) { + owner := newOwner() + applyStatusCondition(ReconcileContext{Owner: owner}, cond("InfraReady", "Ready", metav1.ConditionTrue)) - rec := ReconcileContext{ - Client: k8sClient, - Metrics: metrics, - Owner: owner, - } + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithStatusSubresource(owner).WithObjects(owner).Build() - metrics.On("RecordConditionFor", - mock.Anything, mock.Anything, mock.Anything, mock.Anything, - mock.Anything, mock.Anything, mock.Anything, - ).Return() + require.NoError(t, FlushStatus(ctx, ReconcileContext{Client: k8sClient, Owner: owner})) - // New condition to ensure changed=true - newCond := cond - newCond.Message = "Something changed" + persisted := &MockOperatorCRD{} + require.NoError(t, k8sClient.Get(ctx, client.ObjectKeyFromObject(owner), persisted)) + assert.Len(t, persisted.Status.Conditions, 1) + }) - err := setStatusCondition(ctx, rec, newCond) + t.Run("surfaces non-conflict update errors without recording metrics", func(t *testing.T) { + owner := newOwner() + applyStatusCondition(ReconcileContext{Owner: owner}, cond("InfraReady", "Ready", metav1.ConditionTrue)) + + inner := fake.NewClientBuilder().WithScheme(scheme).WithStatusSubresource(owner).WithObjects(owner).Build() + k8sClient := &errorMockClient{Client: inner} + metrics := &MockMetrics{} + + err := FlushStatus(ctx, ReconcileContext{Client: k8sClient, Metrics: metrics, Owner: owner}) require.Error(t, err) assert.Equal(t, "update failed", err.Error()) + + metrics.AssertNotCalled(t, "RecordConditionFor") + }) + + t.Run("retries on conflict by refetching the owner and reapplying staged conditions", func(t *testing.T) { + owner := newOwner() + + // External writer got there first with a condition the framework does not manage. + serverSide := newOwner() + serverSide.Status.Conditions = []metav1.Condition{{ + Type: "ExternalReady", + Status: metav1.ConditionTrue, + Reason: "ExternalReason", + LastTransitionTime: metav1.Now(), + }} + inner := fake.NewClientBuilder().WithScheme(scheme).WithStatusSubresource(owner).WithObjects(serverSide).Build() + // The in-memory owner has a stale ResourceVersion (empty), so the first + // Update must conflict. The conflict wrapper refetches, reapplies our + // staged condition, and lets retry.RetryOnConflict succeed on the retry. + k8sClient := &conflictOnceClient{Client: inner} + + applyStatusCondition(ReconcileContext{Owner: owner}, cond("InfraReady", "Ready", metav1.ConditionTrue)) + + metrics := &MockMetrics{} + metrics.On("RecordConditionFor", owner.GetKind(), owner, "ExternalReady", + string(metav1.ConditionTrue), "ExternalReason", mock.Anything, mock.Anything).Return().Once() + metrics.On("RecordConditionFor", owner.GetKind(), owner, "InfraReady", + string(metav1.ConditionTrue), "Ready", mock.Anything, mock.Anything).Return().Once() + + require.NoError(t, FlushStatus(ctx, ReconcileContext{Client: k8sClient, Metrics: metrics, Owner: owner})) + + assert.GreaterOrEqual(t, k8sClient.gets, 1, "expected at least one Get for conflict refetch") + persisted := &MockOperatorCRD{} + require.NoError(t, k8sClient.Get(ctx, client.ObjectKeyFromObject(owner), persisted)) + // Both the externally written condition and the framework-managed + // condition must be present on the persisted owner. + types := make([]string, 0, len(persisted.Status.Conditions)) + for _, c := range persisted.Status.Conditions { + types = append(types, c.Type) + } + assert.ElementsMatch(t, []string{"ExternalReady", "InfraReady"}, types) + metrics.AssertExpectations(t) }) } @@ -280,3 +352,59 @@ type errorStatusWriter struct { func (e *errorStatusWriter) Update(_ context.Context, _ client.Object, _ ...client.SubResourceUpdateOption) error { return errors.New("update failed") } + +// conflictOnceClient wraps a real client and causes the first status Update to +// return a Conflict error, forcing FlushStatus to refetch and retry. Subsequent +// Updates fall through to the underlying client. +type conflictOnceClient struct { + client.Client + conflicts int + gets int +} + +func (c *conflictOnceClient) Status() client.SubResourceWriter { + return &conflictOnceStatusWriter{SubResourceWriter: c.Client.Status(), parent: c} +} + +func (c *conflictOnceClient) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error { + c.gets++ + return c.Client.Get(ctx, key, obj, opts...) +} + +type conflictOnceStatusWriter struct { + client.SubResourceWriter + parent *conflictOnceClient +} + +func (w *conflictOnceStatusWriter) Update(ctx context.Context, obj client.Object, opts ...client.SubResourceUpdateOption) error { + if w.parent.conflicts == 0 { + w.parent.conflicts++ + return apierrors.NewConflict( + schema.GroupResource{Group: "example.io", Resource: "mockoperatorcrds"}, + obj.GetName(), + fmt.Errorf("the object has been modified; please apply your changes to the latest version and try again"), + ) + } + return w.SubResourceWriter.Update(ctx, obj, opts...) +} + +// failingClient is a client whose every method returns an error. Used to prove +// that applyStatusCondition never calls the API. +type failingClient struct { + client.Client +} + +func (failingClient) Status() client.SubResourceWriter { + panic("applyStatusCondition must not reach the client") +} +func (failingClient) Get(context.Context, client.ObjectKey, client.Object, ...client.GetOption) error { + panic("applyStatusCondition must not reach the client") +} + +// metricsThatPanic is a Recorder whose every method panics. Used to prove that +// applyStatusCondition never records metrics. +type metricsThatPanic struct{} + +func (metricsThatPanic) RecordConditionFor(string, ocm.ObjectLike, string, string, string, time.Time, ...string) { + panic("applyStatusCondition must not record metrics") +}