Commit

HELIO-4704 Attempt to speed up sitemap generation
sethaj committed Aug 5, 2024
1 parent 4d4e153 commit 1adcaab
Showing 1 changed file with 29 additions and 13 deletions.
config/sitemap.rb (42 changes: 29 additions & 13 deletions)

@@ -36,28 +36,44 @@
   # add '/static/about_daily', changefreq: 'monthly'
   # add '/static/about_project', changefreq: 'monthly'
   # add '/static/rights', changefreq: 'monthly'
-  docs = ActiveFedora::SolrService.query("+has_model_ssim:Monograph AND -(press_sim:demo OR press_sim:heliotrope OR press_sim:monitoringservicetarget)", rows: 100_000)
-  docs.each do |d|
-    next unless d['visibility_ssi'] == 'open'
-    next if d['tombstone_ssim']&.first == 'yes'
+  ActiveFedora::SolrService.query("+has_model_ssim:Monograph AND -(press_sim:demo OR press_sim:heliotrope OR press_sim:monitoringservicetarget)",
+                                  fl: ['id',
+                                       'date_modified_dtsi',
+                                       'visibility_ssi',
+                                       'tombstone_ssim',
+                                       'representative_id_ssim',
+                                       'file_set_ids_ssim'],
+                                  rows: 100_000).each do |doc|
+    next unless doc['visibility_ssi'] == 'open'
+    next if doc['tombstone_ssim']&.first == 'yes'

-    cover_id = d['representative_id_ssim']&.first
+    file_set_ids = doc['file_set_ids_ssim']
+    cover_id = doc['representative_id_ssim']&.first
+    rep_file_set_ids = FeaturedRepresentative.where(work_id: doc['id'], file_set_id: file_set_ids).pluck(:file_set_id)

-    d['file_set_ids_ssim']&.each do |fsid|
-      fs = ActiveFedora::SolrService.query("{!terms f=id}#{fsid}", rows: 1).first
+    file_set_docs = []
+    # We have books with so many file_sets that the query is too big for solr, so we break them up to manage query size if needed
+    if file_set_ids.present?
+      until file_set_ids.empty?
+        file_set_docs << ActiveFedora::SolrService.query("{!terms f=id}#{file_set_ids.shift(999).join(",")}", fl: ['id', 'visibility_ssi'], rows: 1000)
+      end
+      file_set_docs = file_set_docs.flatten
+    end
+
+    file_set_docs.each do |fs|
       next unless fs.present? && fs['visibility_ssi'] == 'open'
       # Monograph cover and "featured representative" file_set URLs don't need to be in sitemaps.
       # Crawlers cannot parse content from CSB, anyway. They can only read the page title.
-      rep = FeaturedRepresentative.where(work_id: d['id'], file_set_id: fsid).first
-      next if rep&.kind.present? || fsid == cover_id
+      next if fs['id'] == cover_id
+      next if rep_file_set_ids.any?(fs['id'])

       # the majority of FileSets won't be featured reps at all, so get a 'normal' url
-      url = Rails.application.routes.url_helpers.hyrax_file_set_path(fsid)
-      add url, lastmod: d['date_modified_dtsi'], priority: 0.5, changefreq: 'monthly'
+      url = Rails.application.routes.url_helpers.hyrax_file_set_path(fs['id'])
+      add url, lastmod: doc['date_modified_dtsi'], priority: 0.5, changefreq: 'monthly'
     end

     # monographs are always in sitemaps (unless they're in Draft)
-    url = Rails.application.routes.url_helpers.hyrax_monograph_path(d['id'])
-    add url, lastmod: d['date_modified_dtsi'], priority: 1, changefreq: 'monthly'
+    url = Rails.application.routes.url_helpers.hyrax_monograph_path(doc['id'])
+    add url, lastmod: doc['date_modified_dtsi'], priority: 1, changefreq: 'monthly'
   end
 end
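
Taken together, the change trades many small queries for a few large ones: the Monograph query now returns every field the loop needs via fl:, the per-FileSet FeaturedRepresentative lookup becomes a single pluck(:file_set_id) per Monograph checked in memory, and FileSet visibility is fetched in bulk. The sketch below is only an illustration of that last batching idea, under the assumption of a stand-in fake_solr_terms_query helper and invented sample IDs in place of ActiveFedora::SolrService; it shows how shift(999) drains the ID list a chunk at a time so each {!terms f=id} query stays under the practical query-size limit the inline comment mentions.

# Standalone sketch (illustration only): fake_solr_terms_query and the sample
# IDs are invented here; the real code calls ActiveFedora::SolrService.query
# with "{!terms f=id}..." as shown in the diff above.
def fake_solr_terms_query(ids)
  # Pretend every FileSet comes back open, with just the fields the sitemap needs.
  ids.map { |id| { 'id' => id, 'visibility_ssi' => 'open' } }
end

file_set_ids = (1..2500).map { |n| "fileset#{n}" } # a Monograph with many FileSets

file_set_docs = []
until file_set_ids.empty?
  batch = file_set_ids.shift(999)               # destructive: drains the array 999 IDs at a time
  file_set_docs << fake_solr_terms_query(batch) # 3 bulk lookups here instead of 2,500 single-ID queries
end
file_set_docs = file_set_docs.flatten

puts file_set_docs.length # => 2500

Each shift(999) removes a batch from the front of the array, so the until loop terminates on its own, and the number of Solr round trips per Monograph drops from one per FileSet to roughly n / 999.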
