Optimize Active Record batching for whole table iterations
This commit is contained in:
parent
19f9922523
commit
620f247829
@ -1,3 +1,27 @@
|
||||
* Optimize Active Record batching for whole table iterations.
|
||||
|
||||
Previously, `in_batches` got all the ids and constructed an `IN`-based query for each batch.
|
||||
When iterating over whole tables, this approach is not optimal as it loads unneeded ids and
|
||||
`IN` queries with lots of items are slow.
|
||||
|
||||
Now, whole table iterations use range iteration (`id >= x AND id <= y`) by default, which can make iteration
|
||||
several times faster. E.g., tested on a PostgreSQL table with 10 million records: querying (`253s` vs `30s`),
|
||||
updating (`288s` vs `124s`), deleting (`268s` vs `83s`).
|
||||
|
||||
Only whole table iterations use this style of iteration by default. You can disable this behavior by passing `use_ranges: false`.
|
||||
If you iterate over the table and the only condition is, e.g., `archived_at: nil` (and only a tiny fraction
|
||||
of the records are archived), it makes sense to opt in to this approach:
|
||||
|
||||
```ruby
|
||||
Project.where(archived_at: nil).in_batches(use_ranges: true) do |relation|
|
||||
# do something
|
||||
end
|
||||
```
|
||||
|
||||
See #45414 for more details.
|
||||
|
||||
*fatkodima*
|
||||
|
||||
* `.with` query method added. Construct common table expressions with ease and get `ActiveRecord::Relation` back.
|
||||
|
||||
```ruby
|
||||
|
@ -168,6 +168,11 @@ def find_in_batches(start: nil, finish: nil, batch_size: 1000, error_on_ignore:
|
||||
# * <tt>:error_on_ignore</tt> - Overrides the application config to specify if an error should be raised when
|
||||
# an order is present in the relation.
|
||||
# * <tt>:order</tt> - Specifies the primary key order (can be +:asc+ or +:desc+). Defaults to +:asc+.
|
||||
# * <tt>:use_ranges</tt> - Specifies whether to use range iteration (id >= x AND id <= y).
|
||||
# It can make iterating over whole or almost whole tables several times faster.
|
||||
# Only whole table iterations use this style of iteration by default. You can disable this behavior by passing +false+.
|
||||
# If you iterate over the table and the only condition is, e.g., `archived_at: nil` (and only a tiny fraction
|
||||
# of the records are archived), it makes sense to opt in to this approach.
|
||||
#
|
||||
# Limits are honored, and if present there is no requirement for the batch
|
||||
# size, it can be less than, equal, or greater than the limit.
|
||||
@ -201,7 +206,7 @@ def find_in_batches(start: nil, finish: nil, batch_size: 1000, error_on_ignore:
|
||||
#
|
||||
# NOTE: By its nature, batch processing is subject to race conditions if
|
||||
# other processes are modifying the database.
|
||||
def in_batches(of: 1000, start: nil, finish: nil, load: false, error_on_ignore: nil, order: :asc)
|
||||
def in_batches(of: 1000, start: nil, finish: nil, load: false, error_on_ignore: nil, order: :asc, use_ranges: nil)
|
||||
relation = self
|
||||
|
||||
unless [:asc, :desc].include?(order)
|
||||
@ -209,7 +214,7 @@ def in_batches(of: 1000, start: nil, finish: nil, load: false, error_on_ignore:
|
||||
end
|
||||
|
||||
unless block_given?
|
||||
return BatchEnumerator.new(of: of, start: start, finish: finish, relation: self, order: order)
|
||||
return BatchEnumerator.new(of: of, start: start, finish: finish, relation: self, order: order, use_ranges: use_ranges)
|
||||
end
|
||||
|
||||
if arel.orders.present?
|
||||
@ -226,6 +231,7 @@ def in_batches(of: 1000, start: nil, finish: nil, load: false, error_on_ignore:
|
||||
relation = apply_limits(relation, start, finish, order)
|
||||
relation.skip_query_cache! # Retaining the results in the query cache would undermine the point of batching
|
||||
batch_relation = relation
|
||||
empty_scope = to_sql == klass.unscoped.all.to_sql
|
||||
|
||||
loop do
|
||||
if load
|
||||
@ -233,6 +239,14 @@ def in_batches(of: 1000, start: nil, finish: nil, load: false, error_on_ignore:
|
||||
ids = records.map(&:id)
|
||||
yielded_relation = where(primary_key => ids)
|
||||
yielded_relation.load_records(records)
|
||||
elsif (empty_scope && use_ranges != false) || use_ranges
|
||||
ids = batch_relation.pluck(primary_key)
|
||||
finish = ids.last
|
||||
if finish
|
||||
yielded_relation = apply_finish_limit(batch_relation, finish, order)
|
||||
yielded_relation = yielded_relation.except(:limit, :order)
|
||||
yielded_relation.skip_query_cache!(false)
|
||||
end
|
||||
else
|
||||
ids = batch_relation.pluck(primary_key)
|
||||
yielded_relation = where(primary_key => ids)
|
||||
|
@ -5,12 +5,13 @@ module Batches
|
||||
class BatchEnumerator
|
||||
include Enumerable
|
||||
|
||||
def initialize(of: 1000, start: nil, finish: nil, relation:, order: :asc) # :nodoc:
|
||||
def initialize(of: 1000, start: nil, finish: nil, relation:, order: :asc, use_ranges: nil) # :nodoc:
|
||||
@of = of
|
||||
@relation = relation
|
||||
@start = start
|
||||
@finish = finish
|
||||
@order = order
|
||||
@use_ranges = use_ranges
|
||||
end
|
||||
|
||||
# The primary key value from which the BatchEnumerator starts, inclusive of the value.
|
||||
@ -91,7 +92,7 @@ def destroy_all
|
||||
# relation.update_all(awesome: true)
|
||||
# end
|
||||
def each(&block)
|
||||
enum = @relation.to_enum(:in_batches, of: @of, start: @start, finish: @finish, load: false, order: @order)
|
||||
enum = @relation.to_enum(:in_batches, of: @of, start: @start, finish: @finish, load: false, order: @order, use_ranges: @use_ranges)
|
||||
return enum.each(&block) if block_given?
|
||||
enum
|
||||
end
|
||||
|
@ -441,6 +441,38 @@ def test_in_batches_should_end_at_the_finish_option
|
||||
end
|
||||
end
|
||||
|
||||
# Runs Post.in_batches(of: 2) with no extra conditions and no +use_ranges+
# argument, and expects SQL containing a range predicate on the quoted
# posts.id column ("WHERE id > ... AND id <= ...").
# NOTE(review): assert_sql is defined elsewhere; presumably it passes when an
# executed query matches the pattern -- confirm against the test helper.
def test_in_batches_executes_range_queries_when_unconstrained
|
||||
c = Post.connection
|
||||
# Quote the column the way the current adapter does, then escape it for use inside the regexp.
quoted_posts_id = Regexp.escape(c.quote_table_name("posts.id"))
|
||||
assert_sql(/WHERE #{quoted_posts_id} > .+ AND #{quoted_posts_id} <= .+/i) do
|
||||
Post.in_batches(of: 2) { |relation| assert_kind_of Post, relation.first }
|
||||
end
|
||||
end
|
||||
|
||||
# Runs Post.in_batches(of: 2, use_ranges: false) with no extra conditions and
# expects SQL containing an "id IN (...)" predicate on the quoted posts.id
# column, i.e. passing +use_ranges: false+ opts out of range queries.
# NOTE(review): assert_sql is defined elsewhere; presumably it passes when an
# executed query matches the pattern -- confirm against the test helper.
def test_in_batches_executes_in_queries_when_unconstrained_and_opted_out_of_ranges
|
||||
c = Post.connection
|
||||
# Quote the column the way the current adapter does, then escape it for use inside the regexp.
quoted_posts_id = Regexp.escape(c.quote_table_name("posts.id"))
|
||||
assert_sql(/#{quoted_posts_id} IN \(.+\)/i) do
|
||||
Post.in_batches(of: 2, use_ranges: false) { |relation| assert_kind_of Post, relation.first }
|
||||
end
|
||||
end
|
||||
|
||||
# Runs in_batches(of: 2) on a relation that already has a condition
# (id < 5) and no +use_ranges+ argument, and expects SQL containing an
# "id IN (...)" predicate on the quoted posts.id column.
# NOTE(review): assert_sql is defined elsewhere; presumably it passes when an
# executed query matches the pattern -- confirm against the test helper.
def test_in_batches_executes_in_queries_when_constrained
|
||||
c = Post.connection
|
||||
# Quote the column the way the current adapter does, then escape it for use inside the regexp.
quoted_posts_id = Regexp.escape(c.quote_table_name("posts.id"))
|
||||
assert_sql(/#{quoted_posts_id} IN \(.+\)/i) do
|
||||
Post.where("id < ?", 5).in_batches(of: 2) { |relation| assert_kind_of Post, relation.first }
|
||||
end
|
||||
end
|
||||
|
||||
# Runs in_batches(of: 2, use_ranges: true) on a relation that already has a
# condition (id < 5) and expects SQL containing a range predicate on the
# quoted posts.id column ("id > ... AND id <= ..."). Unlike the unconstrained
# range test, the pattern is not anchored on WHERE.
# NOTE(review): assert_sql is defined elsewhere; presumably it passes when an
# executed query matches the pattern -- confirm against the test helper.
def test_in_batches_executes_range_queries_when_constrained_and_opted_in_into_ranges
|
||||
c = Post.connection
|
||||
# Quote the column the way the current adapter does, then escape it for use inside the regexp.
quoted_posts_id = Regexp.escape(c.quote_table_name("posts.id"))
|
||||
assert_sql(/#{quoted_posts_id} > .+ AND #{quoted_posts_id} <= .+/i) do
|
||||
Post.where("id < ?", 5).in_batches(of: 2, use_ranges: true) { |relation| assert_kind_of Post, relation.first }
|
||||
end
|
||||
end
|
||||
|
||||
def test_in_batches_shouldnt_execute_query_unless_needed
|
||||
assert_queries(2) do
|
||||
Post.in_batches(of: @total) { |relation| assert_kind_of ActiveRecord::Relation, relation }
|
||||
|
Loading…
Reference in New Issue
Block a user