From 5a3b1372bf14dc0944b096c44b54f93a4b485862 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Thu, 25 Sep 2025 11:34:20 -0400 Subject: [PATCH 1/3] RSDK-11248 Remove restart checking logic and update restart_status endpoint --- robot/web/web.go | 13 ++++++-- web/cmd/droid/droid.go | 6 +++- web/server/entrypoint.go | 35 ++------------------ web/server/restart_checker.go | 60 ----------------------------------- 4 files changed, 18 insertions(+), 96 deletions(-) delete mode 100644 web/server/restart_checker.go diff --git a/robot/web/web.go b/robot/web/web.go index 4391749a523..1cf6a533bdb 100644 --- a/robot/web/web.go +++ b/robot/web/web.go @@ -971,14 +971,23 @@ func (svc *webService) Stats() any { // RestartStatusResponse is the JSON response of the `restart_status` HTTP // endpoint. type RestartStatusResponse struct { - // RestartAllowed represents whether this instance of the viam-server can be + // RestartAllowed represents whether this instance of the viamserver can be // safely restarted. RestartAllowed bool `json:"restart_allowed"` + // DoesNotHandleNeedsRestart represents whether this instance of the viamserver does + // not check for the need to restart against app itself and, thus, needs agent to do so. + // Newer versions of viamserver (>= v0.9x.0) will report true for this value, while + // older versions won't report it at all, and agent should let viamserver handle + // NeedsRestart logic. + DoesNotHandleNeedsRestart bool `json:"does_not_handle_needs_restart,omitempty"` } // Handles the `/restart_status` endpoint. func (svc *webService) handleRestartStatus(w http.ResponseWriter, r *http.Request) { - response := RestartStatusResponse{RestartAllowed: svc.r.RestartAllowed()} + response := RestartStatusResponse{ + RestartAllowed: svc.r.RestartAllowed(), + DoesNotHandleNeedsRestart: true, + } w.Header().Set("Content-Type", "application/json") // Only log errors from encoding here. A failure to encode should never diff --git a/web/cmd/droid/droid.go b/web/cmd/droid/droid.go index 0a8ebec96f9..7b2de086286 100644 --- a/web/cmd/droid/droid.go +++ b/web/cmd/droid/droid.go @@ -19,7 +19,11 @@ var logger = logging.NewDebugLogger("droid-entrypoint") // DroidStopHook used by android harness to stop the RDK. func DroidStopHook() { //nolint:revive - server.ForceRestart = true + // NOTE(benjirewis): In RSDK-11248, we removed the restart checking logic from + // viam-server and put it in viam-agent. This method used to set a flag used by restart + // checking logic, but since that no longer exists and we have no users of this droid + // code, we no longer support this method. + logger.Error("DroidStopHook is no longer supported") } // MainEntry is called by our android app to start the RDK. diff --git a/web/server/entrypoint.go b/web/server/entrypoint.go index 381f6f5cda0..f384100aab7 100644 --- a/web/server/entrypoint.go +++ b/web/server/entrypoint.go @@ -444,15 +444,11 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err defer func() { <-forceShutdown }() var ( - theRobot robot.LocalRobot - theRobotLock sync.Mutex - cloudRestartCheckerActive chan struct{} + theRobot robot.LocalRobot + theRobotLock sync.Mutex ) rpcDialer := rpc.NewCachedDialer() defer func() { - if cloudRestartCheckerActive != nil { - <-cloudRestartCheckerActive - } err = multierr.Combine(err, rpcDialer.Close()) }() defer cancel() @@ -529,33 +525,6 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err // This functionality is tested in `TestLogPropagation` in `local_robot_test.go`. config.UpdateLoggerRegistryFromConfig(s.registry, fullProcessedConfig, s.logger) - if fullProcessedConfig.Cloud != nil { - cloudRestartCheckerActive = make(chan struct{}) - utils.PanicCapturingGo(func() { - defer close(cloudRestartCheckerActive) - restartCheck := newRestartChecker(cfg.Cloud, s.logger, s.conn) - restartInterval := defaultNeedsRestartCheckInterval - - for { - if !utils.SelectContextOrWait(ctx, restartInterval) { - return - } - - mustRestart, newRestartInterval, err := restartCheck.needsRestart(ctx) - if err != nil { - s.logger.Infow("failed to check restart", "error", err) - continue - } - - restartInterval = newRestartInterval - - if mustRestart { - logStackTraceAndCancel(cancel, s.logger) - } - } - }) - } - robotOptions := createRobotOptions() if s.args.RevealSensitiveConfigDiffs { robotOptions = append(robotOptions, robotimpl.WithRevealSensitiveConfigDiffs()) diff --git a/web/server/restart_checker.go b/web/server/restart_checker.go deleted file mode 100644 index 214f860b64f..00000000000 --- a/web/server/restart_checker.go +++ /dev/null @@ -1,60 +0,0 @@ -// Package server implements the entry point for running a robot web server. -package server - -import ( - "context" - "time" - - apppb "go.viam.com/api/app/v1" - "go.viam.com/utils/rpc" - - "go.viam.com/rdk/config" - "go.viam.com/rdk/logging" -) - -const ( - defaultNeedsRestartCheckInterval = time.Second * 5 - minNeedsRestartCheckInterval = time.Second * 1 -) - -type needsRestartChecker interface { - needsRestart(ctx context.Context) (bool, time.Duration, error) -} - -type needsRestartCheckerGRPC struct { - cfg *config.Cloud - logger logging.Logger - client rpc.ClientConn -} - -// ForceRestart lets random other parts of the app request an exit. -var ForceRestart bool - -func (c *needsRestartCheckerGRPC) needsRestart(ctx context.Context) (bool, time.Duration, error) { - if ForceRestart { - return true, minNeedsRestartCheckInterval, nil - } - service := apppb.NewRobotServiceClient(c.client) - res, err := service.NeedsRestart(ctx, &apppb.NeedsRestartRequest{Id: c.cfg.ID}) - if err != nil { - return false, defaultNeedsRestartCheckInterval, err - } - - restartInterval := res.RestartCheckInterval.AsDuration() - if restartInterval < minNeedsRestartCheckInterval { - c.logger.Warnf("received restart interval less than %s not using was %d", - minNeedsRestartCheckInterval, - res.RestartCheckInterval.AsDuration()) - restartInterval = defaultNeedsRestartCheckInterval - } - - return res.MustRestart, restartInterval, nil -} - -func newRestartChecker(cfg *config.Cloud, logger logging.Logger, client rpc.ClientConn) needsRestartChecker { - return &needsRestartCheckerGRPC{ - cfg: cfg, - logger: logger, - client: client, - } -} From 45ac69f7d4ccd1dc20ea8af6dfce5c2f40c7f5b3 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Tue, 30 Sep 2025 15:05:47 -0400 Subject: [PATCH 2/3] Conditionally handle restart logic based on VIAM_AGENT_HANDLES_NEEDS_RESTART --- utils/env.go | 10 +++++++ web/server/entrypoint.go | 38 ++++++++++++++++++++++-- web/server/restart_checker.go | 54 +++++++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 web/server/restart_checker.go diff --git a/utils/env.go b/utils/env.go index 2a6d0975dad..aa9c4bc2f84 100644 --- a/utils/env.go +++ b/utils/env.go @@ -79,6 +79,16 @@ const ( // GetImagesInStreamServerEnvVar is the environment variable that enables the GetImages feature flag in stream server. GetImagesInStreamServerEnvVar = "VIAM_GET_IMAGES_IN_STREAM_SERVER" + + // ViamAgentHandlesNeedsRestartChecking is the environment variable that viam-agent will + // set before starting viam-server to indicate that agent is a new enough version to + // have its own background loop that runs NeedsRestart against app.viam.com to determine + // if the system needs a restart. MUST be kept in line with the equivalent value in the + // agent repo. + // + // TODO(RSDK-12057): Remove sensitivity to this environment variable once we fully + // remove all NeedsRestart checking logic from viam-server. + ViamAgentHandlesNeedsRestartChecking = "VIAM_AGENT_HANDLES_NEEDS_RESTART_CHECKING" ) // EnvTrueValues contains strings that we interpret as boolean true in env vars. diff --git a/web/server/entrypoint.go b/web/server/entrypoint.go index f384100aab7..605d7e73f5b 100644 --- a/web/server/entrypoint.go +++ b/web/server/entrypoint.go @@ -444,11 +444,15 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err defer func() { <-forceShutdown }() var ( - theRobot robot.LocalRobot - theRobotLock sync.Mutex + theRobot robot.LocalRobot + theRobotLock sync.Mutex + cloudRestartCheckerActive chan struct{} ) rpcDialer := rpc.NewCachedDialer() defer func() { + if cloudRestartCheckerActive != nil { + <-cloudRestartCheckerActive + } err = multierr.Combine(err, rpcDialer.Close()) }() defer cancel() @@ -525,6 +529,36 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err // This functionality is tested in `TestLogPropagation` in `local_robot_test.go`. config.UpdateLoggerRegistryFromConfig(s.registry, fullProcessedConfig, s.logger) + // Only start cloud restart checker if cloud config is non-nil, and viam-agent is not + // handling restart checking for us (relevant environment variable is unset). + if fullProcessedConfig.Cloud != nil && os.Getenv(rutils.ViamAgentHandlesNeedsRestartChecking) == "" { + s.logger.CInfo(ctx, "Agent does not handle checking needs restart functionality; will handle in server") + cloudRestartCheckerActive = make(chan struct{}) + utils.PanicCapturingGo(func() { + defer close(cloudRestartCheckerActive) + restartCheck := newRestartChecker(cfg.Cloud, s.logger, s.conn) + restartInterval := defaultNeedsRestartCheckInterval + + for { + if !utils.SelectContextOrWait(ctx, restartInterval) { + return + } + + mustRestart, newRestartInterval, err := restartCheck.needsRestart(ctx) + if err != nil { + s.logger.Infow("failed to check restart", "error", err) + continue + } + + restartInterval = newRestartInterval + + if mustRestart { + logStackTraceAndCancel(cancel, s.logger) + } + } + }) + } + robotOptions := createRobotOptions() if s.args.RevealSensitiveConfigDiffs { robotOptions = append(robotOptions, robotimpl.WithRevealSensitiveConfigDiffs()) diff --git a/web/server/restart_checker.go b/web/server/restart_checker.go new file mode 100644 index 00000000000..abe0d846c68 --- /dev/null +++ b/web/server/restart_checker.go @@ -0,0 +1,54 @@ +// Package server implements the entry point for running a robot web server. +package server + +import ( + "context" + "time" + + apppb "go.viam.com/api/app/v1" + "go.viam.com/utils/rpc" + + "go.viam.com/rdk/config" + "go.viam.com/rdk/logging" +) + +const ( + defaultNeedsRestartCheckInterval = time.Second * 5 + minNeedsRestartCheckInterval = time.Second * 1 +) + +type needsRestartChecker interface { + needsRestart(ctx context.Context) (bool, time.Duration, error) +} + +type needsRestartCheckerGRPC struct { + cfg *config.Cloud + logger logging.Logger + client rpc.ClientConn +} + +func (c *needsRestartCheckerGRPC) needsRestart(ctx context.Context) (bool, time.Duration, error) { + service := apppb.NewRobotServiceClient(c.client) + res, err := service.NeedsRestart(ctx, &apppb.NeedsRestartRequest{Id: c.cfg.ID}) + if err != nil { + return false, defaultNeedsRestartCheckInterval, err + } + + restartInterval := res.RestartCheckInterval.AsDuration() + if restartInterval < minNeedsRestartCheckInterval { + c.logger.Warnf("received restart interval less than %s not using was %d", + minNeedsRestartCheckInterval, + res.RestartCheckInterval.AsDuration()) + restartInterval = defaultNeedsRestartCheckInterval + } + + return res.MustRestart, restartInterval, nil +} + +func newRestartChecker(cfg *config.Cloud, logger logging.Logger, client rpc.ClientConn) needsRestartChecker { + return &needsRestartCheckerGRPC{ + cfg: cfg, + logger: logger, + client: client, + } +} From 25ddb35fb2ed31ec3acee7cc973cf0c64795a9e3 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Tue, 30 Sep 2025 15:11:33 -0400 Subject: [PATCH 3/3] Update droid comment --- web/cmd/droid/droid.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/web/cmd/droid/droid.go b/web/cmd/droid/droid.go index 7b2de086286..149685f7c88 100644 --- a/web/cmd/droid/droid.go +++ b/web/cmd/droid/droid.go @@ -19,10 +19,7 @@ var logger = logging.NewDebugLogger("droid-entrypoint") // DroidStopHook used by android harness to stop the RDK. func DroidStopHook() { //nolint:revive - // NOTE(benjirewis): In RSDK-11248, we removed the restart checking logic from - // viam-server and put it in viam-agent. This method used to set a flag used by restart - // checking logic, but since that no longer exists and we have no users of this droid - // code, we no longer support this method. + // We have no users of this droid code, so we no longer support this method. logger.Error("DroidStopHook is no longer supported") }