fix: allow lifecycle code path to retry failed stop jobs (#26277) · coder/coder@0847137
@@ -653,6 +653,72 @@ func TestWorkspaceAutobuild(t *testing.T) {
653653require.Equal(t, stats.Transitions[ws.ID], database.WorkspaceTransitionStop)
654654 })
655655656+// FailureTTLStopOK verifies that a workspace whose latest build is a failed
657+// stop is retried by issuing another stop after the failure TTL elapses.
658+t.Run("FailureTTLStopOK", func(t *testing.T) {
659+t.Parallel()
660+661+var (
662+ticker = make(chan time.Time)
663+statCh = make(chan autobuild.Stats)
664+logger = slogtest.Make(t, &slogtest.Options{
665+// We ignore errors here since we expect to fail
666+// builds.
667+IgnoreErrors: true,
668+ })
669+failureTTL = time.Minute
670+ )
671+672+client, db, user := coderdenttest.NewWithDatabase(t, &coderdenttest.Options{
673+Options: &coderdtest.Options{
674+Logger: &logger,
675+AutobuildTicker: ticker,
676+IncludeProvisionerDaemon: true,
677+AutobuildStats: statCh,
678+TemplateScheduleStore: schedule.NewEnterpriseTemplateScheduleStore(agplUserQuietHoursScheduleStore(), notifications.NewNoopEnqueuer(), logger, nil),
679+ },
680+LicenseOptions: &coderdenttest.LicenseOptions{
681+Features: license.Features{codersdk.FeatureAdvancedTemplateScheduling: 1},
682+ },
683+ })
684+685+// The start build succeeds, but the stop build fails. This leaves the
686+// workspace's latest build as a failed stop.
687+version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
688+Parse: echo.ParseComplete,
689+ProvisionPlan: echo.PlanComplete,
690+ProvisionApplyMap: map[proto.WorkspaceTransition][]*proto.Response{
691+proto.WorkspaceTransition_START: echo.ApplyComplete,
692+proto.WorkspaceTransition_STOP: echo.ApplyFailed,
693+ },
694+ })
695+template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID, func(ctr *codersdk.CreateTemplateRequest) {
696+ctr.FailureTTLMillis = ptr.Ref[int64](failureTTL.Milliseconds())
697+ })
698+coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID)
699+ws := coderdtest.CreateWorkspace(t, client, template.ID)
700+coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, ws.LatestBuild.ID)
701+702+ctx := testutil.Context(t, testutil.WaitLong)
703+stopBuild, err := client.CreateWorkspaceBuild(ctx, ws.ID, codersdk.CreateWorkspaceBuildRequest{
704+Transition: codersdk.WorkspaceTransitionStop,
705+ })
706+require.NoError(t, err)
707+build := coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, stopBuild.ID)
708+require.Equal(t, codersdk.WorkspaceStatusFailed, build.Status)
709+require.Equal(t, codersdk.WorkspaceTransitionStop, build.Transition)
710+tickTime := build.Job.CompletedAt.Add(failureTTL * 2)
711+712+p, err := coderdtest.GetProvisionerForTags(db, time.Now(), ws.OrganizationID, nil)
713+require.NoError(t, err)
714+coderdtest.UpdateProvisionerLastSeenAt(t, db, p.ID, tickTime)
715+ticker <- tickTime
716+stats := <-statCh
717+// Expect the workspace to be stopped again for breaching failure TTL.
718+require.Len(t, stats.Transitions, 1)
719+require.Equal(t, stats.Transitions[ws.ID], database.WorkspaceTransitionStop)
720+ })
721+656722t.Run("FailureTTLTooEarly", func(t *testing.T) {
657723t.Parallel()
658724