Optimize Active Record batching for whole table iterations

This commit is contained in:
fatkodima 2022-06-21 02:13:38 +03:00
parent 19f9922523
commit 620f247829
4 changed files with 75 additions and 4 deletions

@ -1,3 +1,27 @@
* Optimize Active Record batching for whole table iterations.
Previously, `in_batches` got all the ids and constructed an `IN`-based query for each batch.
When iterating over a whole table, this approach is not optimal, as it loads ids that are not needed and
`IN` queries with lots of items are slow.
Now, whole table iterations use range iteration (`id >= x AND id <= y`) by default, which can make iteration
several times faster. E.g., tested on a PostgreSQL table with 10 million records: querying (`253s` vs `30s`),
updating (`288s` vs `124s`), deleting (`268s` vs `83s`).
Only whole table iterations use this style of iteration by default. You can disable this behavior by passing `use_ranges: false`.
If you iterate over the table and the only condition is, e.g., `archived_at: nil` (and only a tiny fraction
of the records are archived), it makes sense to opt in to this approach:
```ruby
Project.where(archived_at: nil).in_batches(use_ranges: true) do |relation|
# do something
end
```
See #45414 for more details.
*fatkodima*
* `.with` query method added. Construct common table expressions with ease and get `ActiveRecord::Relation` back.
```ruby

@ -168,6 +168,11 @@ def find_in_batches(start: nil, finish: nil, batch_size: 1000, error_on_ignore:
# * <tt>:error_on_ignore</tt> - Overrides the application config to specify if an error should be raised when
# an order is present in the relation.
# * <tt>:order</tt> - Specifies the primary key order (can be +:asc+ or +:desc+). Defaults to +:asc+.
# * <tt>:use_ranges</tt> - Specifies whether to use range iteration (id >= x AND id <= y).
# It can make iterating over a whole table, or almost a whole table, several times faster.
# Only whole table iterations use this style of iteration by default. You can disable this behavior by passing +false+.
# If you iterate over the table and the only condition is, e.g., <tt>archived_at: nil</tt> (and only a tiny fraction
# of the records are archived), it makes sense to opt in to this approach.
#
# Limits are honored, and if present there is no requirement for the batch
# size, it can be less than, equal, or greater than the limit.
@ -201,7 +206,7 @@ def find_in_batches(start: nil, finish: nil, batch_size: 1000, error_on_ignore:
#
# NOTE: By its nature, batch processing is subject to race conditions if
# other processes are modifying the database.
def in_batches(of: 1000, start: nil, finish: nil, load: false, error_on_ignore: nil, order: :asc)
def in_batches(of: 1000, start: nil, finish: nil, load: false, error_on_ignore: nil, order: :asc, use_ranges: nil)
relation = self
unless [:asc, :desc].include?(order)
@ -209,7 +214,7 @@ def in_batches(of: 1000, start: nil, finish: nil, load: false, error_on_ignore:
end
unless block_given?
return BatchEnumerator.new(of: of, start: start, finish: finish, relation: self, order: order)
return BatchEnumerator.new(of: of, start: start, finish: finish, relation: self, order: order, use_ranges: use_ranges)
end
if arel.orders.present?
@ -226,6 +231,7 @@ def in_batches(of: 1000, start: nil, finish: nil, load: false, error_on_ignore:
relation = apply_limits(relation, start, finish, order)
relation.skip_query_cache! # Retaining the results in the query cache would undermine the point of batching
batch_relation = relation
empty_scope = to_sql == klass.unscoped.all.to_sql
loop do
if load
@ -233,6 +239,14 @@ def in_batches(of: 1000, start: nil, finish: nil, load: false, error_on_ignore:
ids = records.map(&:id)
yielded_relation = where(primary_key => ids)
yielded_relation.load_records(records)
elsif (empty_scope && use_ranges != false) || use_ranges
ids = batch_relation.pluck(primary_key)
finish = ids.last
if finish
yielded_relation = apply_finish_limit(batch_relation, finish, order)
yielded_relation = yielded_relation.except(:limit, :order)
yielded_relation.skip_query_cache!(false)
end
else
ids = batch_relation.pluck(primary_key)
yielded_relation = where(primary_key => ids)

@ -5,12 +5,13 @@ module Batches
class BatchEnumerator
include Enumerable
def initialize(of: 1000, start: nil, finish: nil, relation:, order: :asc) # :nodoc:
def initialize(of: 1000, start: nil, finish: nil, relation:, order: :asc, use_ranges: nil) # :nodoc:
@of = of
@relation = relation
@start = start
@finish = finish
@order = order
@use_ranges = use_ranges
end
# The primary key value from which the BatchEnumerator starts, inclusive of the value.
@ -91,7 +92,7 @@ def destroy_all
# relation.update_all(awesome: true)
# end
def each(&block)
enum = @relation.to_enum(:in_batches, of: @of, start: @start, finish: @finish, load: false, order: @order)
enum = @relation.to_enum(:in_batches, of: @of, start: @start, finish: @finish, load: false, order: @order, use_ranges: @use_ranges)
return enum.each(&block) if block_given?
enum
end

@ -441,6 +441,38 @@ def test_in_batches_should_end_at_the_finish_option
end
end
# With no conditions on the relation, in_batches is expected to issue
# range-style SQL (id > x AND id <= y) for the batches it yields.
def test_in_batches_executes_range_queries_when_unconstrained
  quoted_posts_id = Regexp.escape(Post.connection.quote_table_name("posts.id"))

  assert_sql(/WHERE #{quoted_posts_id} > .+ AND #{quoted_posts_id} <= .+/i) do
    Post.in_batches(of: 2) do |batch|
      assert_kind_of Post, batch.first
    end
  end
end
# Passing use_ranges: false on an otherwise unconstrained relation is
# expected to produce IN-based SQL for the yielded batches.
def test_in_batches_executes_in_queries_when_unconstrained_and_opted_out_of_ranges
  quoted_posts_id = Regexp.escape(Post.connection.quote_table_name("posts.id"))

  assert_sql(/#{quoted_posts_id} IN \(.+\)/i) do
    Post.in_batches(of: 2, use_ranges: false) do |batch|
      assert_kind_of Post, batch.first
    end
  end
end
# A relation that already carries a condition is expected to keep using
# IN-based SQL for its batches when use_ranges is not given.
def test_in_batches_executes_in_queries_when_constrained
  quoted_posts_id = Regexp.escape(Post.connection.quote_table_name("posts.id"))

  assert_sql(/#{quoted_posts_id} IN \(.+\)/i) do
    Post.where("id < ?", 5).in_batches(of: 2) do |batch|
      assert_kind_of Post, batch.first
    end
  end
end
# Passing use_ranges: true on a relation that already carries a condition
# is expected to switch its batches to range-style SQL.
def test_in_batches_executes_range_queries_when_constrained_and_opted_in_into_ranges
  quoted_posts_id = Regexp.escape(Post.connection.quote_table_name("posts.id"))

  assert_sql(/#{quoted_posts_id} > .+ AND #{quoted_posts_id} <= .+/i) do
    Post.where("id < ?", 5).in_batches(of: 2, use_ranges: true) do |batch|
      assert_kind_of Post, batch.first
    end
  end
end
def test_in_batches_shouldnt_execute_query_unless_needed
assert_queries(2) do
Post.in_batches(of: @total) { |relation| assert_kind_of ActiveRecord::Relation, relation }