Optimize Active Record batching for whole table iterations

2022-06-21 02:13:38 +03:00 · 2022-06-21 02:13:38 +03:00 · 620f247829
commit 620f247829
parent 19f9922523
4 changed files with 75 additions and 4 deletions
--- a/activerecord/CHANGELOG.md
+++ b/activerecord/CHANGELOG.md
@ -1,3 +1,27 @@
+*   Optimize Active Record batching for whole table iterations.
+
+    Previously, `in_batches` got all the ids and constructed an `IN`-based query for each batch.
+    When iterating over whole tables, this approach is not optimal, as it loads unneeded ids and
+    `IN` queries with lots of items are slow.
+
+    Now, whole table iterations use range iteration (`id >= x AND id <= y`) by default, which can make iteration
+    several times faster. E.g., tested on a PostgreSQL table with 10 million records: querying (`253s` vs `30s`),
+    updating (`288s` vs `124s`), deleting (`268s` vs `83s`).
+
+    Only whole table iterations use this style of iteration by default. You can disable this behavior by passing `use_ranges: false`.
+    If you iterate over the table and the only condition is, e.g., `archived_at: nil` (and only a tiny fraction
+    of the records are archived), it makes sense to opt in to this approach:
+
+    ```ruby
+    Project.where(archived_at: nil).in_batches(use_ranges: true) do |relation|
+      # do something
+    end
+    ```
+
+    See #45414 for more details.
+
+    *fatkodima*
+
 *   `.with` query method added. Construct common table expressions with ease and get `ActiveRecord::Relation` back.

    ```ruby
--- a/activerecord/lib/active_record/relation/batches.rb
+++ b/activerecord/lib/active_record/relation/batches.rb
@ -168,6 +168,11 @@ def find_in_batches(start: nil, finish: nil, batch_size: 1000, error_on_ignore:
    # * <tt>:error_on_ignore</tt> - Overrides the application config to specify if an error should be raised when
    #   an order is present in the relation.
    # * <tt>:order</tt> - Specifies the primary key order (can be +:asc+ or +:desc+). Defaults to +:asc+.
+    # * <tt>:use_ranges</tt> - Specifies whether to use range iteration (id >= x AND id <= y).
+    #   It can make iterating over whole or almost whole tables several times faster.
+    #   Only whole table iterations use this style of iteration by default. You can disable this behavior by passing +false+.
+    #   If you iterate over the table and the only condition is, e.g., <tt>archived_at: nil</tt> (and only a tiny fraction
+    #   of the records are archived), it makes sense to opt in to this approach.
    #
    # Limits are honored, and if present there is no requirement for the batch
    # size, it can be less than, equal, or greater than the limit.
@ -201,7 +206,7 @@ def find_in_batches(start: nil, finish: nil, batch_size: 1000, error_on_ignore:
    #
    # NOTE: By its nature, batch processing is subject to race conditions if
    # other processes are modifying the database.
-    def in_batches(of: 1000, start: nil, finish: nil, load: false, error_on_ignore: nil, order: :asc)
+    def in_batches(of: 1000, start: nil, finish: nil, load: false, error_on_ignore: nil, order: :asc, use_ranges: nil)
      relation = self

      unless [:asc, :desc].include?(order)
@ -209,7 +214,7 @@ def in_batches(of: 1000, start: nil, finish: nil, load: false, error_on_ignore:
      end

      unless block_given?
-        return BatchEnumerator.new(of: of, start: start, finish: finish, relation: self, order: order)
+        return BatchEnumerator.new(of: of, start: start, finish: finish, relation: self, order: order, use_ranges: use_ranges)
      end

      if arel.orders.present?
@ -226,6 +231,7 @@ def in_batches(of: 1000, start: nil, finish: nil, load: false, error_on_ignore:
      relation = apply_limits(relation, start, finish, order)
      relation.skip_query_cache! # Retaining the results in the query cache would undermine the point of batching
      batch_relation = relation
+      empty_scope = to_sql == klass.unscoped.all.to_sql

      loop do
        if load
@ -233,6 +239,14 @@ def in_batches(of: 1000, start: nil, finish: nil, load: false, error_on_ignore:
          ids = records.map(&:id)
          yielded_relation = where(primary_key => ids)
          yielded_relation.load_records(records)
+        elsif (empty_scope && use_ranges != false) || use_ranges
+          ids = batch_relation.pluck(primary_key)
+          finish = ids.last
+          if finish
+            yielded_relation = apply_finish_limit(batch_relation, finish, order)
+            yielded_relation = yielded_relation.except(:limit, :order)
+            yielded_relation.skip_query_cache!(false)
+          end
        else
          ids = batch_relation.pluck(primary_key)
          yielded_relation = where(primary_key => ids)
--- a/activerecord/lib/active_record/relation/batches/batch_enumerator.rb
+++ b/activerecord/lib/active_record/relation/batches/batch_enumerator.rb
@ -5,12 +5,13 @@ module Batches
    class BatchEnumerator
      include Enumerable

-      def initialize(of: 1000, start: nil, finish: nil, relation:, order: :asc) # :nodoc:
+      def initialize(of: 1000, start: nil, finish: nil, relation:, order: :asc, use_ranges: nil) # :nodoc:
        @of       = of
        @relation = relation
        @start = start
        @finish = finish
        @order = order
+        @use_ranges = use_ranges
      end

      # The primary key value from which the BatchEnumerator starts, inclusive of the value.
@ -91,7 +92,7 @@ def destroy_all
      #     relation.update_all(awesome: true)
      #   end
      def each(&block)
-        enum = @relation.to_enum(:in_batches, of: @of, start: @start, finish: @finish, load: false, order: @order)
+        enum = @relation.to_enum(:in_batches, of: @of, start: @start, finish: @finish, load: false, order: @order, use_ranges: @use_ranges)
        return enum.each(&block) if block_given?
        enum
      end
--- a/activerecord/test/cases/batches_test.rb
+++ b/activerecord/test/cases/batches_test.rb
@ -441,6 +441,38 @@ def test_in_batches_should_end_at_the_finish_option
    end
  end

+  def test_in_batches_executes_range_queries_when_unconstrained
+    c = Post.connection
+    quoted_posts_id = Regexp.escape(c.quote_table_name("posts.id"))
+    assert_sql(/WHERE #{quoted_posts_id} > .+ AND #{quoted_posts_id} <= .+/i) do
+      Post.in_batches(of: 2) { |relation| assert_kind_of Post, relation.first }
+    end
+  end
+
+  def test_in_batches_executes_in_queries_when_unconstrained_and_opted_out_of_ranges
+    c = Post.connection
+    quoted_posts_id = Regexp.escape(c.quote_table_name("posts.id"))
+    assert_sql(/#{quoted_posts_id} IN \(.+\)/i) do
+      Post.in_batches(of: 2, use_ranges: false) { |relation| assert_kind_of Post, relation.first }
+    end
+  end
+
+  def test_in_batches_executes_in_queries_when_constrained
+    c = Post.connection
+    quoted_posts_id = Regexp.escape(c.quote_table_name("posts.id"))
+    assert_sql(/#{quoted_posts_id} IN \(.+\)/i) do
+      Post.where("id < ?", 5).in_batches(of: 2) { |relation| assert_kind_of Post, relation.first }
+    end
+  end
+
+  def test_in_batches_executes_range_queries_when_constrained_and_opted_in_into_ranges
+    c = Post.connection
+    quoted_posts_id = Regexp.escape(c.quote_table_name("posts.id"))
+    assert_sql(/#{quoted_posts_id} > .+ AND #{quoted_posts_id} <= .+/i) do
+      Post.where("id < ?", 5).in_batches(of: 2, use_ranges: true) { |relation| assert_kind_of Post, relation.first }
+    end
+  end
+
  def test_in_batches_shouldnt_execute_query_unless_needed
    assert_queries(2) do
      Post.in_batches(of: @total) { |relation| assert_kind_of ActiveRecord::Relation, relation }