Restartable workers

When the worker is started with `-restart-exit-code 47` or has
`restart_exit_code=47` in `flamenco-worker.yaml`, it's marked as
'restartable'. This will enable two worker actions 'Restart
(immediately)' and 'Restart (after task is finished)' in the Manager web
interface. When a worker is asked to restart, it will exit with exit
code `47`. Of course any positive exit code can be used here.
This commit is contained in:
Sybren A. Stüvel 2023-08-14 15:00:32 +02:00
parent 1eb7764d00
commit 3e72391cbf
14 changed files with 148 additions and 26 deletions

@ -47,6 +47,8 @@ var cliArgs struct {
manager string
register bool
restartExitCode int
}
func main() {
@ -84,17 +86,22 @@ func main() {
// Load configuration, and override things from the CLI arguments if necessary.
configWrangler := worker.NewConfigWrangler()
// Before the config can be overridden, it has to be loaded.
if _, err := configWrangler.WorkerConfig(); err != nil {
log.Fatal().Err(err).Msg("error loading worker configuration")
}
if cliArgs.managerURL != nil {
url := cliArgs.managerURL.String()
log.Info().Str("manager", url).Msg("using Manager URL from commandline")
// Before the config can be overridden, it has to be loaded.
if _, err := configWrangler.WorkerConfig(); err != nil {
log.Fatal().Err(err).Msg("error loading worker configuration")
}
configWrangler.SetManagerURL(url)
}
if cliArgs.restartExitCode != 0 {
log.Info().Int("exitCode", cliArgs.restartExitCode).
Msg("will tell Manager this Worker can restart")
configWrangler.SetRestartExitCode(cliArgs.restartExitCode)
}
findBlender()
findFFmpeg()
@ -163,7 +170,8 @@ func main() {
go w.Start(workerCtx, startupState)
if w.WaitForShutdown(workerCtx) {
shutdownReason := w.WaitForShutdown(workerCtx)
if shutdownReason != worker.ReasonContextClosed {
go shutdown()
}
<-shutdownComplete
@ -172,6 +180,8 @@ func main() {
wg.Wait()
log.Debug().Msg("process shutting down")
config, _ := configWrangler.WorkerConfig()
stopProcess(config, shutdownReason)
}
func shutdown() {
@ -203,6 +213,17 @@ func shutdown() {
close(shutdownComplete)
}
func stopProcess(config worker.WorkerConfig, shutdownReason worker.ShutdownReason) {
switch shutdownReason {
case worker.ReasonContextClosed:
os.Exit(1)
case worker.ReasonShutdownReq:
os.Exit(0)
case worker.ReasonRestartReq:
os.Exit(config.RestartExitCode)
}
}
func parseCliArgs() {
flag.BoolVar(&cliArgs.version, "version", false, "Shows the application version, then exits.")
flag.BoolVar(&cliArgs.flush, "flush", false, "Flush any buffered task updates to the Manager, then exits.")
@ -216,6 +237,9 @@ func parseCliArgs() {
flag.BoolVar(&cliArgs.register, "register", false, "(Re-)register at the Manager.")
flag.BoolVar(&cliArgs.findManager, "find-manager", false, "Autodiscover a Manager, then quit.")
flag.IntVar(&cliArgs.restartExitCode, "restart-exit-code", 0,
"Mark this Worker as restartable. It will exit with this code to signify it needs to be restarted.")
flag.Parse()
if cliArgs.manager != "" {

@ -159,6 +159,13 @@ func (f *Flamenco) RequestWorkerStatusChange(e echo.Context, workerUUID string)
Str("requested", string(change.Status)).
Bool("lazy", change.IsLazy).
Logger()
if change.Status == api.WorkerStatusRestart && !dbWorker.CanRestart {
logger.Error().Msg("worker cannot be restarted, rejecting status change request")
return sendAPIError(e, http.StatusPreconditionFailed,
"worker %q does not know how to restart", workerUUID)
}
logger.Info().Msg("worker status change requested")
if dbWorker.Status == change.Status {
@ -380,10 +387,11 @@ func (f *Flamenco) CreateWorkerTag(e echo.Context) error {
func workerSummary(w persistence.Worker) api.WorkerSummary {
summary := api.WorkerSummary{
Id: w.UUID,
Name: w.Name,
Status: w.Status,
Version: w.Software,
Id: w.UUID,
Name: w.Name,
Status: w.Status,
Version: w.Software,
CanRestart: w.CanRestart,
}
if w.StatusRequested != "" {
summary.StatusChange = &api.WorkerStatusChangeRequest{

@ -30,6 +30,13 @@ var rememberableWorkerStates = map[api.WorkerStatus]bool{
api.WorkerStatusAwake: true,
}
// offlineWorkerStates contains worker statuses that are automatically
// acknowledged on sign-off.
var offlineWorkerStates = map[api.WorkerStatus]bool{
api.WorkerStatusOffline: true,
api.WorkerStatusRestart: true,
}
// RegisterWorker registers a new worker and stores it in the database.
func (f *Flamenco) RegisterWorker(e echo.Context) error {
logger := requestLogger(e)
@ -137,6 +144,7 @@ func (f *Flamenco) workerUpdateAfterSignOn(e echo.Context, update api.SignOnJSON
w.Address = e.RealIP()
w.Name = update.Name
w.Software = update.SoftwareVersion
w.CanRestart = update.CanRestart != nil && *update.CanRestart
// Remove trailing spaces from task types, and convert to lower case.
for idx := range update.SupportedTaskTypes {
@ -168,7 +176,7 @@ func (f *Flamenco) SignOff(e echo.Context) error {
w := requestWorkerOrPanic(e)
prevStatus := w.Status
w.Status = api.WorkerStatusOffline
if w.StatusRequested == api.WorkerStatusOffline {
if offlineWorkerStates[w.StatusRequested] {
w.StatusChangeClear()
}

@ -25,6 +25,7 @@ type Worker struct {
Software string `gorm:"type:varchar(32);default:''"`
Status api.WorkerStatus `gorm:"type:varchar(16);default:''"`
LastSeenAt time.Time `gorm:"index"` // Should contain UTC timestamps.
CanRestart bool `gorm:"type:smallint;default:false"`
StatusRequested api.WorkerStatus `gorm:"type:varchar(16);default:''"`
LazyStatusRequest bool `gorm:"type:smallint;default:false"`

@ -14,11 +14,12 @@ import (
// the caller.
func NewWorkerUpdate(worker *persistence.Worker) api.SocketIOWorkerUpdate {
workerUpdate := api.SocketIOWorkerUpdate{
Id: worker.UUID,
Name: worker.Name,
Status: worker.Status,
Version: worker.Software,
Updated: worker.UpdatedAt,
Id: worker.UUID,
Name: worker.Name,
Status: worker.Status,
Version: worker.Software,
Updated: worker.UpdatedAt,
CanRestart: worker.CanRestart,
}
if worker.StatusRequested != "" {

@ -46,7 +46,8 @@ type WorkerConfig struct {
// configuration file, but also from autodiscovery via UPnP/SSDP.
ManagerURL string `yaml:"-"`
TaskTypes []string `yaml:"task_types"`
TaskTypes []string `yaml:"task_types"`
RestartExitCode int `yaml:"restart_exit_code"`
}
type WorkerCredentials struct {
@ -145,6 +146,10 @@ func (fcw *FileConfigWrangler) SetManagerURL(managerURL string) {
fcw.wc.ManagerURL = managerURL
}
func (fcw *FileConfigWrangler) SetRestartExitCode(code int) {
fcw.wc.RestartExitCode = code
}
// DefaultConfig returns a fairly sane default configuration.
func (fcw FileConfigWrangler) DefaultConfig() WorkerConfig {
return defaultConfig

@ -150,10 +150,12 @@ func repeatSignOnUntilAnswer(ctx context.Context, cfg WorkerConfig, client Flame
func signOn(ctx context.Context, cfg WorkerConfig, client FlamencoClient) (api.WorkerStatus, error) {
logger := log.With().Str("manager", cfg.ManagerURL).Logger()
canRestart := cfg.RestartExitCode != 0
req := api.SignOnJSONRequestBody{
Name: workerName(),
SupportedTaskTypes: cfg.TaskTypes,
SoftwareVersion: appinfo.ExtendedVersion(),
CanRestart: &canRestart,
}
logger.Info().

@ -16,6 +16,13 @@ func (w *Worker) gotoStateOffline(context.Context) {
defer w.stateMutex.Unlock()
w.state = api.WorkerStatusOffline
w.requestShutdown(false)
}
// requestShutdown closes the w.shutdown channel, to indicate to the main
// function that it should proceed with the shutdown procedure.
func (w *Worker) requestShutdown(requestRestart bool) {
w.restartAfterShutdown = requestRestart
// Signal that the Worker should shut down.
log.Debug().Msg("closing the shutdown channel")

@ -0,0 +1,17 @@
package worker
// SPDX-License-Identifier: GPL-3.0-or-later
import (
"context"
"projects.blender.org/studio/flamenco/pkg/api"
)
func (w *Worker) gotoStateRestart(ctx context.Context) {
w.stateMutex.Lock()
defer w.stateMutex.Unlock()
w.state = api.WorkerStatusRestart
w.requestShutdown(true)
}

@ -15,6 +15,7 @@ func (w *Worker) setupStateMachine() {
w.stateStarters[api.WorkerStatusAsleep] = w.gotoStateAsleep
w.stateStarters[api.WorkerStatusAwake] = w.gotoStateAwake
w.stateStarters[api.WorkerStatusOffline] = w.gotoStateOffline
w.stateStarters[api.WorkerStatusRestart] = w.gotoStateRestart
}
// Called whenever the Flamenco Manager has a change in current status for us.

@ -17,7 +17,8 @@ type Worker struct {
doneWg *sync.WaitGroup
// Will be closed by the Worker when it wants to shut down. See Worker.WaitForShutdown().
shutdown chan struct{}
shutdown chan struct{}
restartAfterShutdown bool
client FlamencoClient
@ -69,14 +70,24 @@ func (w *Worker) Close() {
w.doneWg.Wait()
}
type ShutdownReason int
const (
ReasonContextClosed ShutdownReason = iota // Main Context closed.
ReasonShutdownReq // Manager requested a shutdown.
ReasonRestartReq // Manager requested a restart.
)
// WaitForShutdown waits until Flamenco wants to shut down the application.
// Returns `true` when the Worker has signalled it wants to shut down.
// Returns `false` when the shutdown was caused by the context closing.
func (w *Worker) WaitForShutdown(ctx context.Context) bool {
// Returns the reason of the shutdown.
func (w *Worker) WaitForShutdown(ctx context.Context) ShutdownReason {
select {
case <-ctx.Done():
return false
return ReasonContextClosed
case <-w.shutdown:
return true
if w.restartAfterShutdown {
return ReasonRestartReq
}
return ReasonShutdownReq
}
}

@ -4,7 +4,9 @@
<template v-if="!hasActiveWorker">Select a Worker</template>
<template v-else>Choose an action...</template>
</option>
<option v-for="(action, key) in WORKER_ACTIONS" :value="key">{{ action.label }}</option>
<template v-for="(action, key) in WORKER_ACTIONS">
<option :value="key" v-if="action.condition()">{{ action.label }}</option>
</template>
</select>
<button :disabled="!canPerformAction" class="btn" @click.prevent="performWorkerAction">Apply</button>
</template>
@ -25,6 +27,7 @@ const WORKER_ACTIONS = Object.freeze({
title: 'Shut down the worker after the current task finishes. The worker may automatically restart.',
target_status: 'offline',
lazy: true,
condition: () => true,
},
offline_immediate: {
label: 'Shut Down (immediately)',
@ -32,6 +35,23 @@ const WORKER_ACTIONS = Object.freeze({
title: 'Immediately shut down the worker. It may automatically restart.',
target_status: 'offline',
lazy: false,
condition: () => true,
},
restart_lazy: {
label: 'Restart (after task is finished)',
icon: '✝',
title: 'Restart the worker after the current task finishes.',
target_status: 'restart',
lazy: true,
condition: () => workers.canRestart(),
},
restart_immediate: {
label: 'Restart (immediately)',
icon: '✝!',
title: 'Immediately restart the worker.',
target_status: 'restart',
lazy: false,
condition: () => workers.canRestart(),
},
asleep_lazy: {
label: 'Send to Sleep (after task is finished)',
@ -39,6 +59,7 @@ const WORKER_ACTIONS = Object.freeze({
title: 'Let the worker sleep after finishing this task.',
target_status: 'asleep',
lazy: true,
condition: () => true,
},
asleep_immediate: {
label: 'Send to Sleep (immediately)',
@ -46,6 +67,7 @@ const WORKER_ACTIONS = Object.freeze({
title: 'Let the worker sleep immediately.',
target_status: 'asleep',
lazy: false,
condition: () => true,
},
wakeup: {
label: 'Wake Up',
@ -53,6 +75,7 @@ const WORKER_ACTIONS = Object.freeze({
title: 'Wake the worker up. A sleeping worker can take a minute to respond.',
target_status: 'awake',
lazy: false,
condition: () => true,
},
});
@ -75,7 +98,9 @@ function performWorkerAction() {
console.log("Requesting worker status change", statuschange);
api.requestWorkerStatusChange(workerID, statuschange)
.then((result) => notifs.add(`Worker status change to ${action.target_status} confirmed.`))
.catch((error) => notifs.add(`Error requesting worker status change: ${error}`));
.catch((error) => {
notifs.add(`Error requesting worker status change: ${error.body.message}`)
});
}
</script>

@ -32,6 +32,11 @@
<dd>
<link-worker-task :workerTask="workerData.task" />
</dd>
<template v-if="workerData.can_restart">
<dt class="field-can-restart">Can Restart</dt>
<dd>{{ workerData.can_restart }}</dd>
</template>
</dl>
<section class="worker-tags" v-if="workers.tags && workers.tags.length">

@ -63,5 +63,12 @@ export const useWorkers = defineStore('workers', {
this.tagsByID = tagsByID;
});
},
/**
* @returns whether the active worker understands how to get restarted.
*/
canRestart() {
return !!this.activeWorker && !!this.activeWorker.can_restart;
},
},
});