From 4c86641af2498f54d039e6768b256a2dbe79dc5b Mon Sep 17 00:00:00 2001 From: Adrianna Chang Date: Mon, 10 May 2021 16:46:16 -0400 Subject: [PATCH 1/4] Support BatchEnumerator collections --- README.md | 40 +++++++++++++++++++ .../maintenance_tasks/task_job_concern.rb | 21 ++++++++-- .../update_posts_in_batches_task.rb | 16 ++++++++ test/jobs/maintenance_tasks/task_job_test.rb | 22 +++++++++- .../maintenance_tasks/task_data_test.rb | 1 + test/system/maintenance_tasks/tasks_test.rb | 1 + test/tasks/maintenance_tasks/task_test.rb | 1 + 7 files changed, 97 insertions(+), 5 deletions(-) create mode 100644 test/dummy/app/tasks/maintenance/update_posts_in_batches_task.rb diff --git a/README.md b/README.md index 5b1335df..dea5fe28 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,46 @@ title,content My Title,Hello World! ``` +### Processing Batch Collections + +The Maintenance Tasks gem supports processing Active Records in batches. This +can reduce the number of calls your Task makes to the database. Use +`ActiveRecord::Batches#in_batches` on the relation returned by your collection to specify that your Task should process +batches instead of records. Active Record defaults to 1000 records by batch, but a custom size can be +specified. + +```ruby +# app/tasks/maintenance/update_posts_in_batches_task.rb +module Maintenance + class UpdatePostsInBatchesTask < MaintenanceTasks::Task + def collection + Post.in_batches + end + + def process(batch_of_posts) + batch_of_posts.update_all(content: "New content added on #{Time.now.utc}") + end + end +end +``` + +Ensure that you've implemented the following methods: + +* `collection`: return an `ActiveRecord::Batches::BatchEnumerator`. +* `process`: do the work of your Task on a batch (`ActiveRecord::Relation`). + +Note that `#count` is calculated automatically based on the number of batches in +your collection, and your Task's progress will be displayed in terms of batches +(not the number of records in the relation). + +**Important!** Batches should only be used if `#process` is performing a batch +operation such as `#update_all` or `#delete_all`. If you need to iterate over +individual records, you should define a collection that [returns an +`ActiveRecord::Relation`](#creating-a-task). This uses batching +internally, but loads the records with one SQL query. Conversely, batch +collections load the primary keys of the records of the batch first, and then perform an additional query to load the +records when calling `each` (or any `Enumerable` method) inside `#process`. + ### Throttling Maintenance Tasks often modify a lot of data and can be taxing on your database. diff --git a/app/jobs/concerns/maintenance_tasks/task_job_concern.rb b/app/jobs/concerns/maintenance_tasks/task_job_concern.rb index 99522943..b13398f9 100644 --- a/app/jobs/concerns/maintenance_tasks/task_job_concern.rb +++ b/app/jobs/concerns/maintenance_tasks/task_job_concern.rb @@ -36,13 +36,27 @@ def build_enumerator(_run, cursor:) collection_enum = case collection when ActiveRecord::Relation enumerator_builder.active_record_on_records(collection, cursor: cursor) + when ActiveRecord::Batches::BatchEnumerator + relation = collection.instance_variable_get(:@relation) + batch_size = collection.instance_variable_get(:@of) + enumerator_builder.active_record_on_batch_relations( + relation, + cursor: cursor, + batch_size: batch_size, + as_relation: true + ) + @run.update!(tick_total: enumerator.size) + enumerator when Array enumerator_builder.build_array_enumerator(collection, cursor: cursor) when CSV JobIteration::CsvEnumerator.new(collection).rows(cursor: cursor) else - raise ArgumentError, "#{@task.class.name}#collection must be either "\ - "an Active Record Relation, Array, or CSV." + raise ArgumentError, <<~MSG.squish + #{@task.class.name}#collection must be either an + Active Record Relation, ActiveRecord::Batches::BatchEnumerator, + Array, or CSV. + MSG end @task.throttle_conditions.reduce(collection_enum) do |enum, condition| @@ -85,7 +99,8 @@ def before_perform end def on_start - @run.update!(started_at: Time.now, tick_total: @task.count) + @run.tick_total = @task.count unless @run.tick_total + @run.update!(started_at: Time.now) end def on_complete diff --git a/test/dummy/app/tasks/maintenance/update_posts_in_batches_task.rb b/test/dummy/app/tasks/maintenance/update_posts_in_batches_task.rb new file mode 100644 index 00000000..f6c2f78e --- /dev/null +++ b/test/dummy/app/tasks/maintenance/update_posts_in_batches_task.rb @@ -0,0 +1,16 @@ +# frozen_string_literal: true +module Maintenance + class UpdatePostsInBatchesTask < MaintenanceTasks::Task + def collection + Post.in_batches(of: 5) + end + + def count + Post.count + end + + def process(batch_of_posts) + batch_of_posts.update_all(content: "New content added on #{Time.now.utc}") + end + end +end diff --git a/test/jobs/maintenance_tasks/task_job_test.rb b/test/jobs/maintenance_tasks/task_job_test.rb index d53fa233..47c54a50 100644 --- a/test/jobs/maintenance_tasks/task_job_test.rb +++ b/test/jobs/maintenance_tasks/task_job_test.rb @@ -247,8 +247,11 @@ class << self assert_predicate @run, :errored? assert_equal "ArgumentError", @run.error_class assert_empty @run.backtrace - expected_message = "Maintenance::TestTask#collection "\ - "must be either an Active Record Relation, Array, or CSV." + expected_message = <<~MSG.squish + Maintenance::TestTask#collection must be either an + Active Record Relation, ActiveRecord::Batches::BatchEnumerator, + Array, or CSV. + MSG assert_equal expected_message, @run.error_message end @@ -368,5 +371,20 @@ class << self assert_predicate run.reload, :succeeded? end + + test ".perform_now handles batch relation tasks" do + 5.times do |i| + Post.create!(title: "Another Post ##{i}", content: "Content ##{i}") + end + # We expect 2 batches (7 posts => 5 + 2) + Maintenance::UpdatePostsInBatchesTask.any_instance.expects(:process).twice + + run = Run.create!(task_name: "Maintenance::UpdatePostsInBatchesTask") + TaskJob.perform_now(run) + + run.reload + assert_equal 2, run.tick_total + assert_equal 2, run.tick_count + end end end diff --git a/test/models/maintenance_tasks/task_data_test.rb b/test/models/maintenance_tasks/task_data_test.rb index 80cf6641..2b30d94e 100644 --- a/test/models/maintenance_tasks/task_data_test.rb +++ b/test/models/maintenance_tasks/task_data_test.rb @@ -27,6 +27,7 @@ class TaskDataTest < ActiveSupport::TestCase "Maintenance::ImportPostsTask", "Maintenance::ParamsTask", "Maintenance::TestTask", + "Maintenance::UpdatePostsInBatchesTask", "Maintenance::UpdatePostsTask", "Maintenance::UpdatePostsThrottledTask", ] diff --git a/test/system/maintenance_tasks/tasks_test.rb b/test/system/maintenance_tasks/tasks_test.rb index 8f70e2b1..f5874fa5 100644 --- a/test/system/maintenance_tasks/tasks_test.rb +++ b/test/system/maintenance_tasks/tasks_test.rb @@ -24,6 +24,7 @@ class TasksTest < ApplicationSystemTestCase "Maintenance::ImportPostsTask\nNew", "Maintenance::ParamsTask\nNew", "Maintenance::TestTask\nNew", + "Maintenance::UpdatePostsInBatchesTask\nNew", "Maintenance::UpdatePostsThrottledTask\nNew", "Completed Tasks", "Maintenance::UpdatePostsTask\nSucceeded", diff --git a/test/tasks/maintenance_tasks/task_test.rb b/test/tasks/maintenance_tasks/task_test.rb index 9463cf48..920a4df6 100644 --- a/test/tasks/maintenance_tasks/task_test.rb +++ b/test/tasks/maintenance_tasks/task_test.rb @@ -11,6 +11,7 @@ class TaskTest < ActiveSupport::TestCase "Maintenance::ImportPostsTask", "Maintenance::ParamsTask", "Maintenance::TestTask", + "Maintenance::UpdatePostsInBatchesTask", "Maintenance::UpdatePostsTask", "Maintenance::UpdatePostsThrottledTask", ] From abb5befe60aa5f9130bb46a101a3cf698f2661fe Mon Sep 17 00:00:00 2001 From: Adrianna Chang Date: Mon, 17 May 2021 21:31:06 -0400 Subject: [PATCH 2/4] Disable use of start and finish options on batch enumerator --- .../maintenance_tasks/task_job_concern.rb | 8 +++++++- test/jobs/maintenance_tasks/task_job_test.rb | 19 +++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/app/jobs/concerns/maintenance_tasks/task_job_concern.rb b/app/jobs/concerns/maintenance_tasks/task_job_concern.rb index b13398f9..0fcb15e2 100644 --- a/app/jobs/concerns/maintenance_tasks/task_job_concern.rb +++ b/app/jobs/concerns/maintenance_tasks/task_job_concern.rb @@ -37,13 +37,19 @@ def build_enumerator(_run, cursor:) when ActiveRecord::Relation enumerator_builder.active_record_on_records(collection, cursor: cursor) when ActiveRecord::Batches::BatchEnumerator + if collection.instance_variable_get(:@start) || + collection.instance_variable_get(:@finish) + raise ArgumentError, <<~MSG.squish + #{@task.class.name}#collection cannot support + a batch enumerator with the "start" or "finish" options. + MSG + end relation = collection.instance_variable_get(:@relation) batch_size = collection.instance_variable_get(:@of) enumerator_builder.active_record_on_batch_relations( relation, cursor: cursor, batch_size: batch_size, - as_relation: true ) @run.update!(tick_total: enumerator.size) enumerator diff --git a/test/jobs/maintenance_tasks/task_job_test.rb b/test/jobs/maintenance_tasks/task_job_test.rb index 47c54a50..503abf1c 100644 --- a/test/jobs/maintenance_tasks/task_job_test.rb +++ b/test/jobs/maintenance_tasks/task_job_test.rb @@ -386,5 +386,24 @@ class << self assert_equal 2, run.tick_total assert_equal 2, run.tick_count end + + test ".perform_now raises if +start+ or +finish+ options are used on batch enumerator" do + batch_enumerator = Post.in_batches(of: 5, start: 1, finish: 10) + + Maintenance::UpdatePostsInBatchesTask.any_instance + .expects(:collection).returns(batch_enumerator) + + run = Run.create!(task_name: "Maintenance::UpdatePostsInBatchesTask") + TaskJob.perform_now(run) + + assert_predicate run.reload, :errored? + assert_equal "ArgumentError", run.error_class + assert_empty run.backtrace + expected_message = <<~MSG.squish + Maintenance::UpdatePostsInBatchesTask#collection cannot support a batch + enumerator with the "start" or "finish" options. + MSG + assert_equal expected_message, run.error_message + end end end From e59b27bfda037788234e038f33d67cfd4b6b89e0 Mon Sep 17 00:00:00 2001 From: Adrianna Chang Date: Mon, 31 May 2021 14:14:58 -0400 Subject: [PATCH 3/4] Bring in patch from Rails to expose public attr readers on BatchEnumerator --- .../maintenance_tasks/task_job_concern.rb | 9 +++----- lib/maintenance_tasks.rb | 2 ++ lib/patches/active_record_batch_enumerator.rb | 23 +++++++++++++++++++ 3 files changed, 28 insertions(+), 6 deletions(-) create mode 100644 lib/patches/active_record_batch_enumerator.rb diff --git a/app/jobs/concerns/maintenance_tasks/task_job_concern.rb b/app/jobs/concerns/maintenance_tasks/task_job_concern.rb index 0fcb15e2..3f92754e 100644 --- a/app/jobs/concerns/maintenance_tasks/task_job_concern.rb +++ b/app/jobs/concerns/maintenance_tasks/task_job_concern.rb @@ -37,19 +37,16 @@ def build_enumerator(_run, cursor:) when ActiveRecord::Relation enumerator_builder.active_record_on_records(collection, cursor: cursor) when ActiveRecord::Batches::BatchEnumerator - if collection.instance_variable_get(:@start) || - collection.instance_variable_get(:@finish) + if collection.start || collection.finish raise ArgumentError, <<~MSG.squish #{@task.class.name}#collection cannot support a batch enumerator with the "start" or "finish" options. MSG end - relation = collection.instance_variable_get(:@relation) - batch_size = collection.instance_variable_get(:@of) enumerator_builder.active_record_on_batch_relations( - relation, + collection.relation, cursor: cursor, - batch_size: batch_size, + batch_size: collection.batch_size, ) @run.update!(tick_total: enumerator.size) enumerator diff --git a/lib/maintenance_tasks.rb b/lib/maintenance_tasks.rb index fda5157e..ff32c57d 100644 --- a/lib/maintenance_tasks.rb +++ b/lib/maintenance_tasks.rb @@ -7,6 +7,8 @@ require "job-iteration" require "maintenance_tasks/engine" +require "patches/active_record_batch_enumerator" + # The engine's namespace module. It provides isolation between the host # application's code and the engine-specific code. Top-level engine constants # and variables are defined under this module. diff --git a/lib/patches/active_record_batch_enumerator.rb b/lib/patches/active_record_batch_enumerator.rb new file mode 100644 index 00000000..d843d5a2 --- /dev/null +++ b/lib/patches/active_record_batch_enumerator.rb @@ -0,0 +1,23 @@ +# frozen_string_literal: true + +# TODO: Remove this patch once all supported Rails versions include the changes +# upstream - https://github.com/rails/rails/pull/42312/commits/a031a43d969c87542c4ee8d0d338d55fcbb53376 +module ActiveRecordBatchEnumerator + # The primary key value from which the BatchEnumerator starts, + # inclusive of the value. + attr_reader :start + + # The primary key value at which the BatchEnumerator ends, + # inclusive of the value. + attr_reader :finish + + # The relation from which the BatchEnumerator yields batches. + attr_reader :relation + + # The size of the batches yielded by the BatchEnumerator. + def batch_size + @of + end +end + +ActiveRecord::Batches::BatchEnumerator.include(ActiveRecordBatchEnumerator) From d7b7dc56eb10d1f236ea798fd3aedb66bc768f17 Mon Sep 17 00:00:00 2001 From: Adrianna Chang Date: Thu, 3 Jun 2021 08:54:41 -0400 Subject: [PATCH 4/4] Define #count automatically for batch collections using the enumerator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Étienne Barrié --- .../concerns/maintenance_tasks/task_job_concern.rb | 12 +++++++----- app/tasks/maintenance_tasks/task.rb | 1 + .../maintenance/update_posts_in_batches_task.rb | 4 ---- test/tasks/maintenance_tasks/task_test.rb | 4 ++-- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/app/jobs/concerns/maintenance_tasks/task_job_concern.rb b/app/jobs/concerns/maintenance_tasks/task_job_concern.rb index 3f92754e..417aa756 100644 --- a/app/jobs/concerns/maintenance_tasks/task_job_concern.rb +++ b/app/jobs/concerns/maintenance_tasks/task_job_concern.rb @@ -32,6 +32,7 @@ def retry_on(*, **) def build_enumerator(_run, cursor:) cursor ||= @run.cursor collection = @task.collection + @enumerator = nil collection_enum = case collection when ActiveRecord::Relation @@ -43,13 +44,13 @@ def build_enumerator(_run, cursor:) a batch enumerator with the "start" or "finish" options. MSG end - enumerator_builder.active_record_on_batch_relations( + # For now, only support automatic count based on the enumerator for + # batches + @enumerator = enumerator_builder.active_record_on_batch_relations( collection.relation, cursor: cursor, batch_size: collection.batch_size, ) - @run.update!(tick_total: enumerator.size) - enumerator when Array enumerator_builder.build_array_enumerator(collection, cursor: cursor) when CSV @@ -102,8 +103,9 @@ def before_perform end def on_start - @run.tick_total = @task.count unless @run.tick_total - @run.update!(started_at: Time.now) + count = @task.count + count = @enumerator&.size if count == :no_count + @run.update!(started_at: Time.now, tick_total: count) end def on_complete diff --git a/app/tasks/maintenance_tasks/task.rb b/app/tasks/maintenance_tasks/task.rb index ee6d742f..c854a83f 100644 --- a/app/tasks/maintenance_tasks/task.rb +++ b/app/tasks/maintenance_tasks/task.rb @@ -131,6 +131,7 @@ def process(_item) # # @return [Integer, nil] def count + :no_count end end end diff --git a/test/dummy/app/tasks/maintenance/update_posts_in_batches_task.rb b/test/dummy/app/tasks/maintenance/update_posts_in_batches_task.rb index f6c2f78e..91f614bf 100644 --- a/test/dummy/app/tasks/maintenance/update_posts_in_batches_task.rb +++ b/test/dummy/app/tasks/maintenance/update_posts_in_batches_task.rb @@ -5,10 +5,6 @@ def collection Post.in_batches(of: 5) end - def count - Post.count - end - def process(batch_of_posts) batch_of_posts.update_all(content: "New content added on #{Time.now.utc}") end diff --git a/test/tasks/maintenance_tasks/task_test.rb b/test/tasks/maintenance_tasks/task_test.rb index 920a4df6..7c0d97cc 100644 --- a/test/tasks/maintenance_tasks/task_test.rb +++ b/test/tasks/maintenance_tasks/task_test.rb @@ -54,9 +54,9 @@ class TaskTest < ActiveSupport::TestCase assert_equal 2, Maintenance::TestTask.count end - test "#count is nil by default" do + test "#count is :no_count by default" do task = Task.new - assert_nil task.count + assert_equal(:no_count, task.count) end test "#collection raises NoMethodError" do