diff --git a/CHANGELOG.md b/CHANGELOG.md index 0da38d44..36080c0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ bugs in actually-released versions. - Fix an issue where the columns in the web interface wouldn't correctly resize when the shown information changed. - Add-on: replace the different 'refresh' buttons (for Manager info & storage location, job types, and worker tags) with a single button that just refreshes everything in one go. The information obtained from Flamenco Manager is now stored in a JSON file on disk, making it independent from Blender auto-saving the user preferences. - Ensure the web frontend connects to the backend correctly when served over HTTPS ([#104296](https://projects.blender.org/studio/flamenco/pulls/104296)). +- For Workers running on Linux, it is now possible to configure the "OOM score adjustment" for sub-processes. This makes it possible for the out-of-memory killer to target Blender, and not Flamenco Worker itself. - Security updates of some dependencies: - [Incorrect forwarding of sensitive headers and cookies on HTTP redirect in net/http](https://pkg.go.dev/vuln/GO-2024-2600) - [Memory exhaustion in multipart form parsing in net/textproto and net/http](https://pkg.go.dev/vuln/GO-2024-2599) diff --git a/cmd/flamenco-worker/main.go b/cmd/flamenco-worker/main.go index 0cfd6393..b4d1ee46 100644 --- a/cmd/flamenco-worker/main.go +++ b/cmd/flamenco-worker/main.go @@ -23,6 +23,7 @@ import ( "projects.blender.org/studio/flamenco/internal/appinfo" "projects.blender.org/studio/flamenco/internal/worker" "projects.blender.org/studio/flamenco/internal/worker/cli_runner" + "projects.blender.org/studio/flamenco/pkg/oomscore" "projects.blender.org/studio/flamenco/pkg/sysinfo" "projects.blender.org/studio/flamenco/pkg/website" ) @@ -114,6 +115,10 @@ func main() { findBlender() findFFmpeg() + // Create the CLI runner before the auto-discovery, to make any configuration + // problems clear before waiting for the Manager to respond. + cliRunner := createCLIRunner(&configWrangler) + // Give the auto-discovery some time to find a Manager. discoverTimeout := 10 * time.Minute discoverCtx, discoverCancel := context.WithTimeout(context.Background(), discoverTimeout) @@ -149,7 +154,6 @@ func main() { return } - cliRunner := cli_runner.NewCLIRunner() listener = worker.NewListener(client, buffer) cmdRunner := worker.NewCommandExecutor(cliRunner, listener, timeService) taskRunner := worker.NewTaskExecutor(cmdRunner, listener) @@ -304,3 +308,27 @@ func logFatalManagerDiscoveryError(err error, discoverTimeout time.Duration) { Msgf("auto-discovery error, see %s", website.CannotFindManagerHelpURL) } } + +func createCLIRunner(configWrangler *worker.FileConfigWrangler) *cli_runner.CLIRunner { + config, err := configWrangler.WorkerConfig() + if err != nil { + log.Fatal().Err(err).Msg("error loading worker configuration") + } + + if config.LinuxOOMScoreAdjust == nil { + log.Debug().Msg("executables will be run without OOM score adjustment") + return cli_runner.NewCLIRunner() + } + + if !oomscore.Available() { + log.Warn(). + Msgf("config: oom_score_adjust configured, but that is only available on Linux, not this platform. See %s for more information.", + website.OOMScoreAdjURL) + return cli_runner.NewCLIRunner() + } + + adjustment := *config.LinuxOOMScoreAdjust + log.Info().Int("oom_score_adjust", adjustment).Msg("executables will be run with OOM score adjustment") + + return cli_runner.NewCLIRunnerWithOOMScoreAdjuster(adjustment) +} diff --git a/internal/worker/cli_runner/cli_runner.go b/internal/worker/cli_runner/cli_runner.go index 7fe843b7..9369c3cf 100644 --- a/internal/worker/cli_runner/cli_runner.go +++ b/internal/worker/cli_runner/cli_runner.go @@ -11,6 +11,7 @@ import ( "github.com/alessio/shellescape" "github.com/rs/zerolog" + "projects.blender.org/studio/flamenco/pkg/oomscore" ) // The buffer size used to read stdout/stderr output from subprocesses, in @@ -20,11 +21,19 @@ const StdoutBufferSize = 40 * 1024 // CLIRunner is a wrapper around exec.CommandContext() to allow mocking. type CLIRunner struct { + oomScoreAdjust int + useOOMScoreAdjust bool } func NewCLIRunner() *CLIRunner { return &CLIRunner{} } +func NewCLIRunnerWithOOMScoreAdjuster(oomScoreAdjust int) *CLIRunner { + return &CLIRunner{ + oomScoreAdjust: oomScoreAdjust, + useOOMScoreAdjust: true, + } +} func (cli *CLIRunner) CommandContext(ctx context.Context, name string, arg ...string) *exec.Cmd { return exec.CommandContext(ctx, name, arg...) @@ -55,7 +64,7 @@ func (cli *CLIRunner) RunWithTextOutput( return err } - if err := execCmd.Start(); err != nil { + if err := cli.startWithOOMAdjust(execCmd); err != nil { logger.Error().Err(err).Msg("error starting CLI execution") return err } @@ -171,3 +180,13 @@ func (cli *CLIRunner) logCmd( } return nil } + +// startWithOOMAdjust runs the command with its OOM score adjusted. +func (cli *CLIRunner) startWithOOMAdjust(execCmd *exec.Cmd) error { + if cli.useOOMScoreAdjust { + oomScoreRestore := oomscore.Adjust(cli.oomScoreAdjust) + defer oomScoreRestore() + } + + return execCmd.Start() +} diff --git a/internal/worker/config.go b/internal/worker/config.go index 0ad0d6eb..e786cbb8 100644 --- a/internal/worker/config.go +++ b/internal/worker/config.go @@ -58,6 +58,18 @@ type WorkerConfig struct { TaskTypes []string `yaml:"task_types"` RestartExitCode int `yaml:"restart_exit_code"` + + // LinuxOOMScoreAdjust controls the Linux out-of-memory killer. Is used when + // spawning a sub-process, to adjust the likelyness that that subprocess is + // killed rather than Flamenco Worker itself. That way Flamenco Worker can + // report the failure to the Manager. + // + // If the Worker itself would be OOM-killed, it would just be restarted and + // get the task it was already working on, causing an infinite OOM-loop. + // + // If this value is not specified in the configuration file, Flamenco Worker + // will not attempt to adjust its OOM score. + LinuxOOMScoreAdjust *int `yaml:"oom_score_adjust"` } type WorkerCredentials struct { diff --git a/pkg/oomscore/oomscore.go b/pkg/oomscore/oomscore.go new file mode 100644 index 00000000..30ac61f4 --- /dev/null +++ b/pkg/oomscore/oomscore.go @@ -0,0 +1,86 @@ +// package oomscore provides some functions to adjust the Linux +// out-of-memory (OOM) score, i.e. the number that determines how likely it is +// that a process is killed in an out-of-memory situation. +// +// It is available only on Linux. On other platforms ErrNotImplemented will be returned. +package oomscore + +import ( + "errors" + + "github.com/rs/zerolog/log" +) + +var ErrNotImplemented = errors.New("OOM score functionality not implemented on this platform") + +// Available returns whether the functionality in this package is available for +// the current platform. +func Available() bool { + return available +} + +// GetOOMScore returns the current process' OOM score. +func GetOOMScore() (int, error) { + return getOOMScore() +} + +// GetOOMScoreAdj returns the current process' OOM score adjustment. +func GetOOMScoreAdj() (int, error) { + return getOOMScoreAdj() +} + +// SetOOMScoreAdj sets the current process' OOM score adjustment. +func SetOOMScoreAdj(score int) error { + return setOOMScoreAdj(score) +} + +type ScoreRestoreFunc func() + +var emptyRestoreFunc ScoreRestoreFunc = func() {} + +// Adjust temporarily sets the OOM score adjustment. +// The returned function MUST be called to restore the original value. +// Any problems changing the score are logged, but not otherwise returned. +func Adjust(score int) (restoreFunc ScoreRestoreFunc) { + restoreFunc = emptyRestoreFunc + + if !Available() { + return + } + + origScore, err := getOOMScoreAdj() + if err != nil { + log.Error(). + AnErr("cause", err). + Msg("could not get the current process' oom_score_adj value") + return + } + + log.Trace(). + Int("oom_score_adj", score). + Msg("setting oom_score_adj") + + err = setOOMScoreAdj(score) + if err != nil { + log.Error(). + Int("oom_score_adj", score). + AnErr("cause", err). + Msg("could not set the current process' oom_score_adj value") + return + } + + return func() { + log.Trace(). + Int("oom_score_adj", origScore). + Msg("restoring oom_score_adj") + + err = setOOMScoreAdj(origScore) + if err != nil { + log.Error(). + Int("oom_score_adj", origScore). + AnErr("cause", err). + Msg("could not restore the current process' oom_score_adj value") + return + } + } +} diff --git a/pkg/oomscore/oomscore_linux.go b/pkg/oomscore/oomscore_linux.go new file mode 100644 index 00000000..225925fd --- /dev/null +++ b/pkg/oomscore/oomscore_linux.go @@ -0,0 +1,66 @@ +//go:build linux + +package oomscore + +import ( + "fmt" + "os" + "path/filepath" +) + +const ( + available = true +) + +// getOOMScore returns the current process' OOM score. +func getOOMScore() (int, error) { + return readInt("oom_score") +} + +// getOOMScoreAdj returns the current process' OOM score adjustment. +func getOOMScoreAdj() (int, error) { + return readInt("oom_score_adj") +} + +// setOOMScoreAdj sets the current process' OOM score adjustment. +func setOOMScoreAdj(newScore int) error { + return writeInt(newScore, "oom_score_adj") +} + +// readInt reads an integer from /proc/{pid}/{filename} +func readInt(filename string) (int, error) { + fullPath := procPidPath(filename) + + file, err := os.Open(fullPath) + if err != nil { + return 0, fmt.Errorf("opening %s: %w", fullPath, err) + } + + var valueInFile int + n, err := fmt.Fscan(file, &valueInFile) + if err != nil { + return 0, fmt.Errorf("reading %s: %w", fullPath, err) + } + if n < 1 { + return 0, fmt.Errorf("reading %s: did not find a number", fullPath) + } + + return valueInFile, nil +} + +// writeInt writes an integer to /proc/{pid}/{filename} +func writeInt(value int, filename string) error { + fullPath := procPidPath(filename) + contents := fmt.Sprint(value) + err := os.WriteFile(fullPath, []byte(contents), os.ModePerm) + if err != nil { + return fmt.Errorf("writing %s: %w", fullPath, err) + } + return nil +} + +// procPidPath returns "/proc/{pid}/{filename}". +func procPidPath(filename string) string { + pid := os.Getpid() + return filepath.Join("/proc", fmt.Sprint(pid), filename) +} diff --git a/pkg/oomscore/oomscore_nonlinux.go b/pkg/oomscore/oomscore_nonlinux.go new file mode 100644 index 00000000..be9d56a8 --- /dev/null +++ b/pkg/oomscore/oomscore_nonlinux.go @@ -0,0 +1,19 @@ +//go:build !linux + +package oomscore + +const ( + available = false +) + +func getOOMScore() (int, error) { + return 0, ErrNotImplemented +} + +func getOOMScoreAdj() (int, error) { + return 0, ErrNotImplemented +} + +func setOOMScoreAdj(int) error { + return ErrNotImplemented +} diff --git a/pkg/website/urls.go b/pkg/website/urls.go index a8285cf7..bccd4d2c 100644 --- a/pkg/website/urls.go +++ b/pkg/website/urls.go @@ -9,4 +9,5 @@ const ( BugReportURL = "https://flamenco.blender.org/get-involved" ShamanRequirementsURL = "https://flamenco.blender.org/usage/shared-storage/shaman/#requirements" WorkerConfigURL = "https://flamenco.blender.org/usage/worker-configuration/" + OOMScoreAdjURL = WorkerConfigURL ) diff --git a/web/project-website/content/usage/worker-configuration/_index.md b/web/project-website/content/usage/worker-configuration/_index.md index a3272b67..6a2d345f 100644 --- a/web/project-website/content/usage/worker-configuration/_index.md +++ b/web/project-website/content/usage/worker-configuration/_index.md @@ -19,6 +19,9 @@ This is an example of such a configuration file: manager_url: http://flamenco.local:8080/ task_types: [blender, ffmpeg, file-management, misc] restart_exit_code: 47 + +# Optional advanced option, available on Linux only: +oom_score_adjust: 500 ``` - `manager_url`: The URL of the Manager to connect to. If the setting is blank @@ -31,10 +34,17 @@ restart_exit_code: 47 - `restart_exit_code`: Having this set to a non-zero value will mark this Worker as 'restartable'. See [Shut Down & Restart Actions][restarting] for more information. +- `oom_score_adjust`: an optional value between 0 and 1000, only available on + Linux. It configures the Out Of Memory behaviour of the Linux kernel. This is + the `oom_score_adj` value for all sub-processes started by the Worker. Set + this to a high value, so that when the machine runs out of memory when + rendering, it is Blender that gets killed, and not Flamenco Worker itself. For + more information, see [Linux Kernel: Per-Process Parameters][per-process-proc]. [scripts]: {{< ref "usage/job-types" >}} [task-types]: {{< ref "usage/job-types" >}}#task-types [restarting]: {{< ref "usage/worker-actions" >}}#shut-down--restart-actions +[per-process-proc]: https://docs.kernel.org/filesystems/proc.html#chapter-3-per-process-parameters ## Worker-Specific Files