Add jitter to :exponentially_longer

Prior to this change, exponentially_longer had adverse consequences
during system-wide downstream failures.  This change adds a random value to the
back off calculation in order to prevent the thundering herd
problem, whereby all retry jobs would retry at the same time.

Specifically, this change adds a jitter option to retry_on so that users can
bound the random component to a reasonable amount.  The default is
15% of the exponential back off calculation.
This commit is contained in:
Anthony Ross 2018-02-02 16:50:57 +00:00 committed by Jeremy Daer
parent be2473b2ef
commit 5f7621878d
3 changed files with 66 additions and 43 deletions

@ -34,5 +34,16 @@
*Vlado Cingel*
* Add jitter to :exponentially_longer
ActiveJob::Exceptions.retry_on with :exponentially_longer now uses a random amount of jitter in order to
prevent the [thundering herd effect](https://en.wikipedia.org/wiki/Thundering_herd_problem). Defaults to
15% (represented as 0.15) but overridable via the `:jitter` option when using `retry_on`.
Jitter is applied when an `Integer`, an `ActiveSupport::Duration`, or `:exponentially_longer` is passed to the `wait` argument in `retry_on`.
retry_on(MyError, wait: :exponentially_longer, jitter: 0.30)
*Anthony Ross*
Please check [6-0-stable](https://github.com/rails/rails/blob/6-0-stable/activejob/CHANGELOG.md) for previous changes.

@ -19,22 +19,24 @@ module ClassMethods
# ==== Options
# * <tt>:wait</tt> - Re-enqueues the job with a delay specified either in seconds (default: 3 seconds),
# as a computing proc that takes the number of executions so far as an argument, or as a symbol reference of
# <tt>:exponentially_longer</tt>, which applies the wait algorithm of <tt>(executions ** 4) + 2</tt>
# (first wait 3s, then 18s, then 83s, etc)
# <tt>:exponentially_longer</tt>, which applies the wait algorithm of <tt>((executions**4) + (Kernel.rand((executions**4) * jitter))) + 2</tt>
# (first wait ~3s, then ~18s, then ~83s, etc)
# * <tt>:attempts</tt> - Re-enqueues the job the specified number of times (default: 5 attempts)
# * <tt>:queue</tt> - Re-enqueues the job on a different queue
# * <tt>:priority</tt> - Re-enqueues the job with a different priority
# * <tt>:jitter</tt> - A random delay of wait time used when calculating backoff. The default is 15% (0.15), which represents the upper bound of possible wait time (expressed as a percentage)
#
# ==== Examples
#
# class RemoteServiceJob < ActiveJob::Base
# retry_on CustomAppException # defaults to 3s wait, 5 attempts
# retry_on CustomAppException # defaults to ~3s wait, 5 attempts
# retry_on AnotherCustomAppException, wait: ->(executions) { executions * 2 }
#
# retry_on ActiveRecord::Deadlocked, wait: 5.seconds, attempts: 3
# retry_on Net::OpenTimeout, Timeout::Error, wait: :exponentially_longer, attempts: 10 # retries at most 10 times for Net::OpenTimeout and Timeout::Error combined
# # To retry at most 10 times for each individual exception:
# # retry_on Net::OpenTimeout, wait: :exponentially_longer, attempts: 10
# # retry_on Net::ReadTimeout, wait: 5.seconds, jitter: 0.30, attempts: 10
# # retry_on Timeout::Error, wait: :exponentially_longer, attempts: 10
#
# retry_on(YetAnotherCustomAppException) do |job, error|
@ -47,12 +49,11 @@ module ClassMethods
# # Might raise Net::OpenTimeout or Timeout::Error when the remote service is down
# end
# end
def retry_on(*exceptions, wait: 3.seconds, attempts: 5, queue: nil, priority: nil)
def retry_on(*exceptions, wait: 3.seconds, attempts: 5, queue: nil, priority: nil, jitter: 0.15)
rescue_from(*exceptions) do |error|
executions = executions_for(exceptions)
if executions < attempts
retry_job wait: determine_delay(seconds_or_duration_or_algorithm: wait, executions: executions), queue: queue, priority: priority, error: error
retry_job wait: determine_delay(seconds_or_duration_or_algorithm: wait, executions: executions, jitter: jitter), queue: queue, priority: priority, error: error
else
if block_given?
instrument :retry_stopped, error: error do
@ -121,16 +122,16 @@ def retry_job(options = {})
end
private
def determine_delay(seconds_or_duration_or_algorithm:, executions:)
def determine_delay(seconds_or_duration_or_algorithm:, executions:, jitter:)
case seconds_or_duration_or_algorithm
when :exponentially_longer
(executions**4) + 2
((executions**4) + (Kernel.rand((executions**4) * jitter))) + 2
when ActiveSupport::Duration
duration = seconds_or_duration_or_algorithm
duration.to_i
duration = seconds_or_duration_or_algorithm.to_i
duration + Kernel.rand(duration * jitter)
when Integer
seconds = seconds_or_duration_or_algorithm
seconds
seconds + (Kernel.rand(seconds * jitter).ceil)
when Proc
algorithm = seconds_or_duration_or_algorithm
algorithm.call(executions)

@ -3,6 +3,7 @@
require "helper"
require "jobs/retry_job"
require "models/person"
require "minitest/mock"
class ExceptionsTest < ActiveSupport::TestCase
setup do
@ -94,32 +95,38 @@ class ExceptionsTest < ActiveSupport::TestCase
test "long wait job" do
travel_to Time.now
random_amount = 1
RetryJob.perform_later "LongWaitError", 2, :log_scheduled_at
assert_equal [
"Raised LongWaitError for the 1st time",
"Next execution scheduled at #{(Time.now + 3600.seconds).to_f}",
"Successfully completed job"
], JobBuffer.values
Kernel.stub(:rand, random_amount) do
RetryJob.perform_later "LongWaitError", 2, :log_scheduled_at
assert_equal [
"Raised LongWaitError for the 1st time",
"Next execution scheduled at #{(Time.now + 3600.seconds + random_amount).to_f}",
"Successfully completed job"
], JobBuffer.values
end
end
test "exponentially retrying job" do
test "exponentially retrying job includes jitter" do
travel_to Time.now
RetryJob.perform_later "ExponentialWaitTenAttemptsError", 5, :log_scheduled_at
random_amount = 2
assert_equal [
"Raised ExponentialWaitTenAttemptsError for the 1st time",
"Next execution scheduled at #{(Time.now + 3.seconds).to_f}",
"Raised ExponentialWaitTenAttemptsError for the 2nd time",
"Next execution scheduled at #{(Time.now + 18.seconds).to_f}",
"Raised ExponentialWaitTenAttemptsError for the 3rd time",
"Next execution scheduled at #{(Time.now + 83.seconds).to_f}",
"Raised ExponentialWaitTenAttemptsError for the 4th time",
"Next execution scheduled at #{(Time.now + 258.seconds).to_f}",
"Successfully completed job"
], JobBuffer.values
Kernel.stub(:rand, random_amount) do
RetryJob.perform_later "ExponentialWaitTenAttemptsError", 5, :log_scheduled_at
assert_equal [
"Raised ExponentialWaitTenAttemptsError for the 1st time",
"Next execution scheduled at #{(Time.now + 3.seconds + random_amount).to_f}",
"Raised ExponentialWaitTenAttemptsError for the 2nd time",
"Next execution scheduled at #{(Time.now + 18.seconds + random_amount).to_f}",
"Raised ExponentialWaitTenAttemptsError for the 3rd time",
"Next execution scheduled at #{(Time.now + 83.seconds + random_amount).to_f}",
"Raised ExponentialWaitTenAttemptsError for the 4th time",
"Next execution scheduled at #{(Time.now + 258.seconds + random_amount).to_f}",
"Successfully completed job"
], JobBuffer.values
end
end
test "custom wait retrying job" do
@ -145,19 +152,23 @@ class ExceptionsTest < ActiveSupport::TestCase
exceptions_to_raise = %w(ExponentialWaitTenAttemptsError CustomWaitTenAttemptsError ExponentialWaitTenAttemptsError CustomWaitTenAttemptsError)
RetryJob.perform_later exceptions_to_raise, 5, :log_scheduled_at
random_amount = 1
assert_equal [
"Raised ExponentialWaitTenAttemptsError for the 1st time",
"Next execution scheduled at #{(Time.now + 3.seconds).to_f}",
"Raised CustomWaitTenAttemptsError for the 2nd time",
"Next execution scheduled at #{(Time.now + 2.seconds).to_f}",
"Raised ExponentialWaitTenAttemptsError for the 3rd time",
"Next execution scheduled at #{(Time.now + 18.seconds).to_f}",
"Raised CustomWaitTenAttemptsError for the 4th time",
"Next execution scheduled at #{(Time.now + 4.seconds).to_f}",
"Successfully completed job"
], JobBuffer.values
Kernel.stub(:rand, random_amount) do
RetryJob.perform_later exceptions_to_raise, 5, :log_scheduled_at
assert_equal [
"Raised ExponentialWaitTenAttemptsError for the 1st time",
"Next execution scheduled at #{(Time.now + 3.seconds + random_amount).to_f}",
"Raised CustomWaitTenAttemptsError for the 2nd time",
"Next execution scheduled at #{(Time.now + 2.seconds).to_f}",
"Raised ExponentialWaitTenAttemptsError for the 3rd time",
"Next execution scheduled at #{(Time.now + 18.seconds + random_amount).to_f}",
"Raised CustomWaitTenAttemptsError for the 4th time",
"Next execution scheduled at #{(Time.now + 4.seconds).to_f}",
"Successfully completed job"
], JobBuffer.values
end
end
test "successfully retry job throwing one of two retryable exceptions" do