Worker: add configuration for Linux out-of-memory killer

Add a Worker configuration option to configure the Linux out-of-memory
behaviour. Add `oom_score_adjust=500` to `flamenco-worker.yaml` to increase
the chance that Blender gets killed when the machine runs out of memory,
instead of Flamenco Worker itself.
This commit is contained in:
Sybren A. Stüvel 2024-04-15 17:21:11 +02:00
parent 3974770f36
commit e2bca9ad61
9 changed files with 244 additions and 2 deletions

@ -13,6 +13,7 @@ bugs in actually-released versions.
- Fix an issue where the columns in the web interface wouldn't correctly resize when the shown information changed.
- Add-on: replace the different 'refresh' buttons (for Manager info & storage location, job types, and worker tags) with a single button that just refreshes everything in one go. The information obtained from Flamenco Manager is now stored in a JSON file on disk, making it independent from Blender auto-saving the user preferences.
- Ensure the web frontend connects to the backend correctly when served over HTTPS ([#104296](https://projects.blender.org/studio/flamenco/pulls/104296)).
- For Workers running on Linux, it is now possible to configure the "OOM score adjustment" for sub-processes. This makes it possible for the out-of-memory killer to target Blender, and not Flamenco Worker itself.
- Security updates of some dependencies:
- [Incorrect forwarding of sensitive headers and cookies on HTTP redirect in net/http](https://pkg.go.dev/vuln/GO-2024-2600)
- [Memory exhaustion in multipart form parsing in net/textproto and net/http](https://pkg.go.dev/vuln/GO-2024-2599)

@ -23,6 +23,7 @@ import (
"projects.blender.org/studio/flamenco/internal/appinfo"
"projects.blender.org/studio/flamenco/internal/worker"
"projects.blender.org/studio/flamenco/internal/worker/cli_runner"
"projects.blender.org/studio/flamenco/pkg/oomscore"
"projects.blender.org/studio/flamenco/pkg/sysinfo"
"projects.blender.org/studio/flamenco/pkg/website"
)
@ -114,6 +115,10 @@ func main() {
findBlender()
findFFmpeg()
// Create the CLI runner before the auto-discovery, to make any configuration
// problems clear before waiting for the Manager to respond.
cliRunner := createCLIRunner(&configWrangler)
// Give the auto-discovery some time to find a Manager.
discoverTimeout := 10 * time.Minute
discoverCtx, discoverCancel := context.WithTimeout(context.Background(), discoverTimeout)
@ -149,7 +154,6 @@ func main() {
return
}
cliRunner := cli_runner.NewCLIRunner()
listener = worker.NewListener(client, buffer)
cmdRunner := worker.NewCommandExecutor(cliRunner, listener, timeService)
taskRunner := worker.NewTaskExecutor(cmdRunner, listener)
@ -304,3 +308,27 @@ func logFatalManagerDiscoveryError(err error, discoverTimeout time.Duration) {
Msgf("auto-discovery error, see %s", website.CannotFindManagerHelpURL)
}
}
func createCLIRunner(configWrangler *worker.FileConfigWrangler) *cli_runner.CLIRunner {
config, err := configWrangler.WorkerConfig()
if err != nil {
log.Fatal().Err(err).Msg("error loading worker configuration")
}
if config.LinuxOOMScoreAdjust == nil {
log.Debug().Msg("executables will be run without OOM score adjustment")
return cli_runner.NewCLIRunner()
}
if !oomscore.Available() {
log.Warn().
Msgf("config: oom_score_adjust configured, but that is only available on Linux, not this platform. See %s for more information.",
website.OOMScoreAdjURL)
return cli_runner.NewCLIRunner()
}
adjustment := *config.LinuxOOMScoreAdjust
log.Info().Int("oom_score_adjust", adjustment).Msg("executables will be run with OOM score adjustment")
return cli_runner.NewCLIRunnerWithOOMScoreAdjuster(adjustment)
}

@ -11,6 +11,7 @@ import (
"github.com/alessio/shellescape"
"github.com/rs/zerolog"
"projects.blender.org/studio/flamenco/pkg/oomscore"
)
// The buffer size used to read stdout/stderr output from subprocesses, in
@ -20,11 +21,19 @@ const StdoutBufferSize = 40 * 1024
// CLIRunner is a wrapper around exec.CommandContext() to allow mocking.
type CLIRunner struct {
oomScoreAdjust int
useOOMScoreAdjust bool
}
func NewCLIRunner() *CLIRunner {
return &CLIRunner{}
}
func NewCLIRunnerWithOOMScoreAdjuster(oomScoreAdjust int) *CLIRunner {
return &CLIRunner{
oomScoreAdjust: oomScoreAdjust,
useOOMScoreAdjust: true,
}
}
func (cli *CLIRunner) CommandContext(ctx context.Context, name string, arg ...string) *exec.Cmd {
return exec.CommandContext(ctx, name, arg...)
@ -55,7 +64,7 @@ func (cli *CLIRunner) RunWithTextOutput(
return err
}
if err := execCmd.Start(); err != nil {
if err := cli.startWithOOMAdjust(execCmd); err != nil {
logger.Error().Err(err).Msg("error starting CLI execution")
return err
}
@ -171,3 +180,13 @@ func (cli *CLIRunner) logCmd(
}
return nil
}
// startWithOOMAdjust runs the command with its OOM score adjusted.
func (cli *CLIRunner) startWithOOMAdjust(execCmd *exec.Cmd) error {
if cli.useOOMScoreAdjust {
oomScoreRestore := oomscore.Adjust(cli.oomScoreAdjust)
defer oomScoreRestore()
}
return execCmd.Start()
}

@ -58,6 +58,18 @@ type WorkerConfig struct {
TaskTypes []string `yaml:"task_types"`
RestartExitCode int `yaml:"restart_exit_code"`
// LinuxOOMScoreAdjust controls the Linux out-of-memory killer. Is used when
// spawning a sub-process, to adjust the likelyness that that subprocess is
// killed rather than Flamenco Worker itself. That way Flamenco Worker can
// report the failure to the Manager.
//
// If the Worker itself would be OOM-killed, it would just be restarted and
// get the task it was already working on, causing an infinite OOM-loop.
//
// If this value is not specified in the configuration file, Flamenco Worker
// will not attempt to adjust its OOM score.
LinuxOOMScoreAdjust *int `yaml:"oom_score_adjust"`
}
type WorkerCredentials struct {

86
pkg/oomscore/oomscore.go Normal file

@ -0,0 +1,86 @@
// package oomscore provides some functions to adjust the Linux
// out-of-memory (OOM) score, i.e. the number that determines how likely it is
// that a process is killed in an out-of-memory situation.
//
// It is available only on Linux. On other platforms ErrNotImplemented will be returned.
package oomscore
import (
"errors"
"github.com/rs/zerolog/log"
)
var ErrNotImplemented = errors.New("OOM score functionality not implemented on this platform")
// Available returns whether the functionality in this package is available for
// the current platform.
func Available() bool {
return available
}
// GetOOMScore returns the current process' OOM score.
func GetOOMScore() (int, error) {
return getOOMScore()
}
// GetOOMScoreAdj returns the current process' OOM score adjustment.
func GetOOMScoreAdj() (int, error) {
return getOOMScoreAdj()
}
// SetOOMScoreAdj sets the current process' OOM score adjustment.
func SetOOMScoreAdj(score int) error {
return setOOMScoreAdj(score)
}
type ScoreRestoreFunc func()
var emptyRestoreFunc ScoreRestoreFunc = func() {}
// Adjust temporarily sets the OOM score adjustment.
// The returned function MUST be called to restore the original value.
// Any problems changing the score are logged, but not otherwise returned.
func Adjust(score int) (restoreFunc ScoreRestoreFunc) {
restoreFunc = emptyRestoreFunc
if !Available() {
return
}
origScore, err := getOOMScoreAdj()
if err != nil {
log.Error().
AnErr("cause", err).
Msg("could not get the current process' oom_score_adj value")
return
}
log.Trace().
Int("oom_score_adj", score).
Msg("setting oom_score_adj")
err = setOOMScoreAdj(score)
if err != nil {
log.Error().
Int("oom_score_adj", score).
AnErr("cause", err).
Msg("could not set the current process' oom_score_adj value")
return
}
return func() {
log.Trace().
Int("oom_score_adj", origScore).
Msg("restoring oom_score_adj")
err = setOOMScoreAdj(origScore)
if err != nil {
log.Error().
Int("oom_score_adj", origScore).
AnErr("cause", err).
Msg("could not restore the current process' oom_score_adj value")
return
}
}
}

@ -0,0 +1,66 @@
//go:build linux
package oomscore
import (
"fmt"
"os"
"path/filepath"
)
const (
available = true
)
// getOOMScore returns the current process' OOM score.
func getOOMScore() (int, error) {
return readInt("oom_score")
}
// getOOMScoreAdj returns the current process' OOM score adjustment.
func getOOMScoreAdj() (int, error) {
return readInt("oom_score_adj")
}
// setOOMScoreAdj sets the current process' OOM score adjustment.
func setOOMScoreAdj(newScore int) error {
return writeInt(newScore, "oom_score_adj")
}
// readInt reads an integer from /proc/{pid}/{filename}
func readInt(filename string) (int, error) {
fullPath := procPidPath(filename)
file, err := os.Open(fullPath)
if err != nil {
return 0, fmt.Errorf("opening %s: %w", fullPath, err)
}
var valueInFile int
n, err := fmt.Fscan(file, &valueInFile)
if err != nil {
return 0, fmt.Errorf("reading %s: %w", fullPath, err)
}
if n < 1 {
return 0, fmt.Errorf("reading %s: did not find a number", fullPath)
}
return valueInFile, nil
}
// writeInt writes an integer to /proc/{pid}/{filename}
func writeInt(value int, filename string) error {
fullPath := procPidPath(filename)
contents := fmt.Sprint(value)
err := os.WriteFile(fullPath, []byte(contents), os.ModePerm)
if err != nil {
return fmt.Errorf("writing %s: %w", fullPath, err)
}
return nil
}
// procPidPath returns "/proc/{pid}/{filename}".
func procPidPath(filename string) string {
pid := os.Getpid()
return filepath.Join("/proc", fmt.Sprint(pid), filename)
}

@ -0,0 +1,19 @@
//go:build !linux
package oomscore
const (
available = false
)
func getOOMScore() (int, error) {
return 0, ErrNotImplemented
}
func getOOMScoreAdj() (int, error) {
return 0, ErrNotImplemented
}
func setOOMScoreAdj(int) error {
return ErrNotImplemented
}

@ -9,4 +9,5 @@ const (
BugReportURL = "https://flamenco.blender.org/get-involved"
ShamanRequirementsURL = "https://flamenco.blender.org/usage/shared-storage/shaman/#requirements"
WorkerConfigURL = "https://flamenco.blender.org/usage/worker-configuration/"
OOMScoreAdjURL = WorkerConfigURL
)

@ -19,6 +19,9 @@ This is an example of such a configuration file:
manager_url: http://flamenco.local:8080/
task_types: [blender, ffmpeg, file-management, misc]
restart_exit_code: 47
# Optional advanced option, available on Linux only:
oom_score_adjust: 500
```
- `manager_url`: The URL of the Manager to connect to. If the setting is blank
@ -31,10 +34,17 @@ restart_exit_code: 47
- `restart_exit_code`: Having this set to a non-zero value will mark this Worker
as 'restartable'. See [Shut Down & Restart Actions][restarting] for more
information.
- `oom_score_adjust`: an optional value between 0 and 1000, only available on
Linux. It configures the Out Of Memory behaviour of the Linux kernel. This is
the `oom_score_adj` value for all sub-processes started by the Worker. Set
this to a high value, so that when the machine runs out of memory when
rendering, it is Blender that gets killed, and not Flamenco Worker itself. For
more information, see [Linux Kernel: Per-Process Parameters][per-process-proc].
[scripts]: {{< ref "usage/job-types" >}}
[task-types]: {{< ref "usage/job-types" >}}#task-types
[restarting]: {{< ref "usage/worker-actions" >}}#shut-down--restart-actions
[per-process-proc]: https://docs.kernel.org/filesystems/proc.html#chapter-3-per-process-parameters
## Worker-Specific Files