diff --git a/app/models/whitehall_migration/document_export.rb b/app/models/whitehall_migration/document_export.rb
new file mode 100644
index 0000000000..ffb9ffcfb4
--- /dev/null
+++ b/app/models/whitehall_migration/document_export.rb
@@ -0,0 +1,129 @@
+# Builds plain-hash exports of live documents for the export:* rake tasks.
+class WhitehallMigration::DocumentExport
+  # Returns an Array of every Document that has a live edition whose state is
+  # not "removed".
+  #
+  # Deliberately not memoised on the class: a cached list would go stale as
+  # soon as documents change and would be retained for the life of the process.
+  def self.exportable_documents
+    Document
+      .includes(:live_edition)
+      .select do |document|
+        live_edition = document.live_edition
+        live_edition && live_edition.state != "removed"
+      end
+  end
+
+  # Maps a Document to a Hash describing its live edition: identifiers,
+  # timestamps, author emails, content, tags, history, images and attachments.
+  #
+  # created_by and last_edited_by are nil when no matching User exists, so a
+  # missing user cannot abort an export.
+  #
+  # Raises NoMethodError if the document has no live edition.
+  def self.export_to_hash(document)
+    live_edition = document.live_edition
+    revision = live_edition.revision
+    content_revision = revision.content_revision
+
+    {
+      content_id: document[:content_id],
+      state: live_edition.state,
+      created_at: document[:created_at],
+      first_published_at: PublishingApiPayload::History.new(live_edition).first_published_at,
+      updated_at: document[:updated_at],
+      created_by: user_email(document.created_by_id),
+      last_edited_by: user_email(revision.created_by_id),
+      document_type: revision.metadata_revision.document_type_id,
+      title: content_revision.title,
+      base_path: content_revision.base_path,
+      summary: content_revision.summary,
+      body: content_revision.contents["body"],
+      tags: revision.tags_revision.tags,
+      political: live_edition.political?,
+      government_id: live_edition.government_id,
+      change_notes: change_notes(document),
+      internal_history: internal_history(document),
+      images: export_images(document),
+      attachments: export_attachments(document),
+    }
+  end
+
+  # Returns the email of the User with the given ID, or nil if there is none.
+  def self.user_email(user_id)
+    User.find_by(id: user_id)&.email
+  end
+  private_class_method :user_email
+
+  # Returns the live edition's change history from PublishingApiPayload::History.
+  def self.change_notes(document)
+    PublishingApiPayload::History.new(document.live_edition).change_history
+  end
+
+  # Returns one Hash per TimelineEntry of the document, newest first.
+  # entry_content is the note body for internal notes, the public explanation
+  # for withdrawn entries, and nil otherwise.
+  def self.internal_history(document)
+    timeline_entries = TimelineEntry.where(document:)
+      .includes(:created_by, :details, :edition)
+      .order(created_at: :desc)
+
+    timeline_entries.map do |entry|
+      entry_content = if entry.internal_note? && entry.details
+                        entry.details.body
+                      elsif (entry.withdrawn? || entry.withdrawn_updated?) && entry.details
+                        entry.details.public_explanation
+                      end
+
+      {
+        edition_number: entry.edition.number,
+        entry_type: entry.entry_type,
+        date: entry.created_at.to_fs(:date),
+        time: entry.created_at.to_fs(:time),
+        user: entry.created_by&.email,
+        entry_content:,
+      }
+    end
+  end
+
+  # Returns one Hash per image revision of the live edition's revision.
+  # lead_image is true only for the lead image revision; variants lists each
+  # asset's variant and file URL.
+  def self.export_images(document)
+    revision = document.live_edition.revision
+    lead_image_revision = revision.lead_image_revision
+    all_image_revisions = revision.image_revisions
+
+    all_image_revisions.map do |image_revision|
+      {
+        created_at: image_revision.created_at,
+        caption: image_revision.caption,
+        alt_text: image_revision.alt_text,
+        credit: image_revision.credit,
+        lead_image: image_revision == lead_image_revision,
+        variants: image_revision.blob_revision.assets.map do |asset|
+          {
+            variant: asset.variant,
+            file_url: asset.file_url,
+          }
+        end,
+      }
+    end
+  end
+
+  # Returns one Hash (file URL, title, creation time) per file attachment
+  # revision of the live edition's revision.
+  def self.export_attachments(document)
+    revision = document.live_edition.revision
+    all_file_attachment_revisions = revision.file_attachment_revisions
+
+    all_file_attachment_revisions.map do |file_attachment_revision|
+      metadata = file_attachment_revision.metadata_revision
+      {
+        file_url: file_attachment_revision.asset.file_url,
+        title: metadata.title,
+        created_at: file_attachment_revision.created_at,
+      }
+    end
+  end
+end
diff --git a/lib/tasks/export.rake b/lib/tasks/export.rake
new file mode 100644
index 0000000000..88c83cbab0
--- /dev/null
+++ b/lib/tasks/export.rake
@@ -0,0 +1,36 @@
+require "json"
+
+namespace :export do
+  desc "Export a specific live document and its assets, by its content ID"
+  task :live_document_and_assets, %i[content_id output_file] => :environment do |_, args|
+    # find_by! so that an unknown content ID fails with RecordNotFound instead
+    # of a NoMethodError on nil inside the exporter.
+    document = Document.find_by!(content_id: args[:content_id])
+    hash = WhitehallMigration::DocumentExport.export_to_hash(document)
+
+    if args[:output_file]
+      File.write(args[:output_file], JSON.pretty_generate(hash))
+    else
+      pp hash
+    end
+  end
+
+  desc "Export all live documents and assets"
+  task :live_documents_and_assets, %i[output_directory] => :environment do |_, args|
+    documents = WhitehallMigration::DocumentExport.exportable_documents
+
+    puts "Exporting #{documents.count} live editions"
+
+    documents.each do |document|
+      hash = WhitehallMigration::DocumentExport.export_to_hash(document)
+
+      if args[:output_directory]
+        # One file per document, named after the last segment of its base path.
+        file_name = "#{File.basename(hash[:base_path])}.json"
+        File.write(File.join(args[:output_directory], file_name), JSON.pretty_generate(hash))
+      else
+        pp hash
+      end
+    end
+  end
+end
diff --git a/lib/versioning/revision_updater.rb b/lib/versioning/revision_updater.rb
index 2ee0ad20dc..6a94d5034d 100644
--- a/lib/versioning/revision_updater.rb
+++ b/lib/versioning/revision_updater.rb
@@ -1,5 +1,6 @@
 module Versioning
   class RevisionUpdater < BaseUpdater
+    require_relative "./revision_updater/image"
     include RevisionUpdater::Image
     include RevisionUpdater::FileAttachment
 
diff --git a/spec/factories/edition_factory.rb b/spec/factories/edition_factory.rb
index 2524261cf1..21dee74696 100644
--- a/spec/factories/edition_factory.rb
+++ b/spec/factories/edition_factory.rb
@@ -106,6 +106,14 @@
       end
     end
 
+    trait :published_but_needs_2i do
+      published
+
+      transient do
+        state { "published_but_needs_2i" }
+      end
+    end
+
     trait :withdrawn do
       summary { SecureRandom.alphanumeric(10) }
       live { true }
diff --git a/spec/lib/tasks/export_spec.rb b/spec/lib/tasks/export_spec.rb
new file mode 100644
index 0000000000..d801e190d0
--- /dev/null
+++ b/spec/lib/tasks/export_spec.rb
@@ -0,0 +1,84 @@
+RSpec.describe "Export tasks" do
+  include ActiveJob::TestHelper
+
+  describe "export:live_document_and_assets" do
+    before do
+      Rake::Task["export:live_document_and_assets"].reenable
+    end
+
+    it "calls WhitehallMigration::DocumentExport.export_to_hash with correct arguments" do
+      document = create(:document, :with_live_edition)
+      allow(WhitehallMigration::DocumentExport).to receive(:export_to_hash)
+      Rake::Task["export:live_document_and_assets"].invoke(document.content_id)
+      expect(WhitehallMigration::DocumentExport).to have_received(:export_to_hash).with(document)
+    end
+
+    it "pretty-prints the result to STDOUT if no output_file is specified" do
+      document = create(:document, :with_live_edition)
+      allow(WhitehallMigration::DocumentExport).to receive(:export_to_hash).and_return({ foo: "bar" })
+      expect { Rake::Task["export:live_document_and_assets"].invoke(document.content_id) }.to output("{:foo=>\"bar\"}\n").to_stdout
+    end
+
+    it "writes the result as JSON to the given output_file if specified" do
+      document = create(:document, :with_live_edition)
+      allow(WhitehallMigration::DocumentExport).to receive(:export_to_hash).and_return({ foo: "bar", baz: "qux" })
+
+      output_file = Tempfile.new("export")
+      Rake::Task["export:live_document_and_assets"].invoke(document.content_id, output_file.path)
+
+      expected = <<~JSON
+        {
+          "foo": "bar",
+          "baz": "qux"
+        }
+      JSON
+      expect(File.read(output_file.path)).to eq(expected.strip)
+    end
+  end
+
+  describe "export:live_documents_and_assets" do
+    before do
+      allow($stdout).to receive(:puts) # suppress output for cleanliness
+      Rake::Task["export:live_documents_and_assets"].reenable
+      Document.find_each(&:destroy) # Clean slate
+      allow(WhitehallMigration::DocumentExport).to receive(:exportable_documents).and_return(documents)
+    end
+
+    let(:documents) do
+      [
+        create(:document, :with_live_edition),
+        create(:document, :with_live_edition),
+        create(:document, :with_live_edition),
+      ]
+    end
+
+    it "lists how many documents it is about to export" do
+      expect { Rake::Task["export:live_documents_and_assets"].invoke }.to output(/^Exporting 3 live editions/).to_stdout
+    end
+
+    it "calls WhitehallMigration::DocumentExport.export_to_hash with correct arguments" do
+      allow(WhitehallMigration::DocumentExport).to receive(:export_to_hash)
+      Rake::Task["export:live_documents_and_assets"].invoke
+      expect(WhitehallMigration::DocumentExport).to have_received(:export_to_hash).with(documents[0])
+      expect(WhitehallMigration::DocumentExport).to have_received(:export_to_hash).with(documents[1])
+      expect(WhitehallMigration::DocumentExport).to have_received(:export_to_hash).with(documents[2])
+    end
+
+    it "pretty-prints the result to STDOUT if no output_directory is specified" do
+      allow(WhitehallMigration::DocumentExport).to receive(:export_to_hash).and_return({ foo: "bar" })
+      expect { Rake::Task["export:live_documents_and_assets"].invoke }.to output(/{:foo=>"bar"}\n{:foo=>"bar"}\n{:foo=>"bar"}\n$/).to_stdout
+    end
+
+    it "writes the result as JSON files to the given output_directory if specified" do
+      allow(WhitehallMigration::DocumentExport).to receive(:export_to_hash) do |document|
+        { base_path: "/news/example-path-#{document.id}" }
+      end
+      output_directory = Dir.mktmpdir
+      Rake::Task["export:live_documents_and_assets"].invoke(output_directory)
+
+      expected_files = documents.map { |doc| "#{output_directory}/example-path-#{doc.id}.json" }
+      actual_files = Dir.glob("#{output_directory}/*.json").sort
+      expect(actual_files).to match_array(expected_files)
+    end
+  end
+end
diff --git a/spec/models/whitehall_migration/document_export_spec.rb b/spec/models/whitehall_migration/document_export_spec.rb
new file mode 100644
index 0000000000..0570d86d5a
--- /dev/null
+++ b/spec/models/whitehall_migration/document_export_spec.rb
@@ -0,0 +1,362 @@
+RSpec.describe WhitehallMigration::DocumentExport do
+  describe ".exportable_documents" do
+    it "returns documents that are published (with or without 2i), or withdrawn" do
+      Document.find_each(&:destroy) # Clean slate
+
+      withdrawn_edition = create(:edition, :withdrawn)
+      live_but_needs_2i = create(:edition, :published_but_needs_2i)
+      documents_to_be_processed = [
+        create(:document, :with_live_edition),
+        create(:document, :with_current_and_live_editions),
+        live_but_needs_2i.document,
+        withdrawn_edition.document,
+      ]
+
+      # documents to be ignored
+      create(:document, :with_current_edition)
+      create(:edition, state: "submitted_for_review")
+      create(:edition, :removed, removal: create(:removal, redirect: true, alternative_url: "/somewhere"))
+
+      expect(described_class.exportable_documents.sort_by(&:id)).to eq(documents_to_be_processed.sort_by(&:id))
+    end
+  end
+
+  describe ".export_to_hash" do
+    it "takes a Document and maps it to a hash" do
+      document = create(:document, :with_live_edition)
+      expect(described_class.export_to_hash(document)).to be_a(Hash)
+    end
+
+    it "has a `content_id` property" do
+      document = create(:document, :with_live_edition)
+      expect(described_class.export_to_hash(document)[:content_id]).to eq(document.content_id)
+    end
+
+    it "has a `state` property" do
+      document = create(:document, :with_live_edition)
+      expect(described_class.export_to_hash(document)[:state]).to eq("published")
+    end
+
+    it "has a `created_at` property" do
+      document = create(:document, :with_live_edition)
+      expect(described_class.export_to_hash(document)[:created_at]).to eq(document.created_at)
+    end
+
+    describe "the `first_published_at` property" do
+      it "delegates to PublishingApiPayload::History to populate first_published_at" do
+        document = build(:document, :live)
+        document.live_edition = create(:edition, :published, document:)
+        history = instance_double(
+          PublishingApiPayload::History,
+          change_history: [],
+          first_published_at: 1.year.ago,
+        )
+        allow(PublishingApiPayload::History).to receive(:new).and_return(history)
+
+        expect(described_class.export_to_hash(document)[:first_published_at]).to match(history.first_published_at)
+      end
+    end
+
+    it "has a `updated_at` property" do
+      document = create(:document, :with_live_edition)
+      expect(described_class.export_to_hash(document)[:updated_at]).to eq(document.updated_at)
+    end
+
+    it "has a `created_by` property" do
+      email = "foo@example.com"
+      document = create(:document, :with_live_edition, created_by: build(:user, email:))
+      expect(described_class.export_to_hash(document)[:created_by]).to eq(email)
+    end
+
+    it "has a `last_edited_by` property" do
+      email = "foo@example.com"
+      document = build(:document, :live)
+      document.live_edition = create(:edition, :published, created_by: build(:user, email:), document:)
+      expect(described_class.export_to_hash(document)[:last_edited_by]).to eq(email)
+    end
+
+    it "has a `document_type` property" do
+      document = build(:document, :live)
+      document.live_edition = create(:edition, :published, document_type: DocumentType.find("news_story"), document:)
+      expect(described_class.export_to_hash(document)[:document_type]).to eq("news_story")
+    end
+
+    it "has a `title` property" do
+      title = "Here is a title"
+      document = build(:document, :live)
+      document.live_edition = create(:edition, :published, title:, document:)
+      expect(described_class.export_to_hash(document)[:title]).to eq(title)
+    end
+
+    it "has a `base_path` property" do
+      base_path = "/foo/bar"
+      document = build(:document, :live)
+      document.live_edition = create(:edition, :published, base_path:, document:)
+      expect(described_class.export_to_hash(document)[:base_path]).to eq(base_path)
+    end
+
+    it "has a `summary` property" do
+      summary = "Here is a summary"
+      document = build(:document, :live)
+      document.live_edition = create(:edition, :published, summary:, document:)
+      expect(described_class.export_to_hash(document)[:summary]).to eq(summary)
+    end
+
+    it "has a `body` property" do
+      body = <<~GOVSPEAK
+        Here are some contents
+
+        And here are some more!
+      GOVSPEAK
+      document = build(:document, :live)
+      document.live_edition = create(:edition, :published, document:, contents: { "body" => body })
+      expect(described_class.export_to_hash(document)[:body]).to eq(body)
+    end
+
+    it "has a `tags` property" do
+      tags = { "primary_publishing_organisation" => [SecureRandom.uuid] }
+      document = build(:document, :live)
+      document.live_edition = create(:edition, :published, document:, tags:)
+
+      expect(described_class.export_to_hash(document)[:tags]).to eq(tags)
+    end
+
+    it "has a `political` property" do
+      document = build(:document, :live)
+      document.live_edition = create(:edition, :published, document:)
+      allow(document.live_edition).to receive(:political?).and_return(true)
+
+      expect(described_class.export_to_hash(document)[:political]).to be(true)
+    end
+
+    it "has a `government_id` property" do
+      document = build(:document, :live)
+      document.live_edition = create(:edition, :published, document:)
+      government_id = SecureRandom.uuid
+      allow(document.live_edition).to receive(:government_id).and_return(government_id)
+
+      expect(described_class.export_to_hash(document)[:government_id]).to eq(government_id)
+    end
+
+    describe "the `change_notes` property" do
+      it "delegates to PublishingApiPayload::History to populate change_notes" do
+        document = build(:document, :live)
+        document.live_edition = create(:edition, :published, document:)
+        history = instance_double(
+          PublishingApiPayload::History,
+          change_history: [{ note: "note", public_timestamp: Time.zone.now }],
+          public_updated_at: Time.zone.now,
+          first_published_at: Time.zone.now,
+        )
+        allow(PublishingApiPayload::History).to receive(:new).and_return(history)
+
+        expect(described_class.export_to_hash(document)[:change_notes]).to match(history.change_history)
+      end
+    end
+
+    describe "the `internal_history` property" do
+      let(:document) { instance_double(Document) }
+
+      it "includes internal notes" do
+        details = instance_double(InternalNote, body: "This is an internal note")
+        e = entry_double(
+          internal_note?: true,
+          details:,
+          entry_type: "internal_note",
+          edition: instance_double(Edition, number: 1),
+          created_by: instance_double(User, email: "example@gov.uk"),
+        )
+        stub_chain_with([e])
+
+        expect(described_class.internal_history(document)).to eq([
+          {
+            edition_number: 1,
+            entry_type: "internal_note",
+            date: "2024-01-01",
+            time: "10:00",
+            user: "example@gov.uk",
+            entry_content: "This is an internal note",
+          },
+        ])
+      end
+
+      it "includes withdrawn/updated entries with public explanation" do
+        details = instance_double(Withdrawal, public_explanation: "Withdrawn explanation")
+        e = entry_double(
+          withdrawn_updated?: true,
+          details:,
+          entry_type: "withdrawn",
+          edition: instance_double(Edition, number: 2),
+          created_at: build_time(date: "2024-02-01", time: "11:00"),
+          created_by: instance_double(User, email: "withdrawn-author@gov.uk"),
+        )
+        stub_chain_with([e])
+
+        expect(described_class.internal_history(document)).to eq([
+          {
+            edition_number: 2,
+            entry_type: "withdrawn",
+            date: "2024-02-01",
+            time: "11:00",
+            user: "withdrawn-author@gov.uk",
+            entry_content: "Withdrawn explanation",
+          },
+        ])
+      end
+
+      it "returns entries ordered by created_at desc (we respect the chain's order)" do
+        newer = entry_double(
+          entry_type: "withdrawn",
+          withdrawn_updated?: true,
+          details: instance_double(Withdrawal, public_explanation: "Later"),
+          edition: instance_double(Edition, number: 2),
+          created_at: build_time(date: "2024-02-01", time: "11:00"),
+          created_by: instance_double(User, email: "b@gov.uk"),
+        )
+
+        older = entry_double(
+          internal_note?: true,
+          details: instance_double(InternalNote, body: "Earlier note"),
+          entry_type: "internal_note",
+          edition: instance_double(Edition, number: 1),
+          created_at: build_time(date: "2024-01-01", time: "10:00"),
+          created_by: instance_double(User, email: "a@gov.uk"),
+        )
+
+        # We hand back [newer, older] from the ordered chain
+        stub_chain_with([newer, older])
+
+        result = described_class.internal_history(document)
+        expect(result.map { |h| h[:edition_number] }).to eq([2, 1])
+      end
+
+      def build_time(date:, time:)
+        t = instance_double(Time)
+        allow(t).to receive(:to_fs).with(:date).and_return(date)
+        allow(t).to receive(:to_fs).with(:time).and_return(time)
+        t
+      end
+
+      def stub_chain_with(entries)
+        # Simulate: TimelineEntry.where(document: doc).includes(...).order(...).includes(...)
+        allow(TimelineEntry).to receive(:where).with(document:).and_return(entries)
+        allow(entries).to receive(:includes).and_return(entries)
+        allow(entries).to receive(:order).with(created_at: :desc).and_return(entries)
+      end
+
+      def entry_double(overrides = {})
+        defaults = {
+          edition: instance_double(Edition, number: 1),
+          entry_type: "internal_note",
+          created_at: build_time(date: "2024-01-01", time: "10:00"),
+          created_by: instance_double(User, email: "example@gov.uk"),
+          details: nil,
+          internal_note?: false,
+          withdrawn?: false,
+          withdrawn_updated?: false,
+          backdated?: false,
+          revision: nil,
+        }
+        instance_double(TimelineEntry, **defaults.merge(overrides))
+      end
+    end
+
+    describe "the 'images' property" do
+      it "exports image data with all required properties" do
+        image_revision = create(
+          :image_revision,
+          caption: "Image caption text",
+          alt_text: "Alt text description",
+          credit: "Photo credit",
+        )
+
+        blob_revision = instance_double(Image::BlobRevision)
+        assets = [
+          instance_double(Image::Asset, variant: "300", file_url: "https://assets.publishing.service.gov.uk/media/123/image-300.jpg"),
+          instance_double(Image::Asset, variant: "960", file_url: "https://assets.publishing.service.gov.uk/media/123/image-960.jpg"),
+        ]
+        allow(image_revision).to receive(:blob_revision).and_return(blob_revision)
+        allow(blob_revision).to receive(:assets).and_return(assets)
+
+        document = build(:document, :live)
+        document.live_edition = create(:edition, :published, document:, lead_image_revision: image_revision)
+
+        result = described_class.export_to_hash(document)
+
+        expect(result[:images]).to be_an(Array)
+        expect(result[:images].length).to eq(1)
+
+        image = result[:images].first
+        expect(image).to include({
+          caption: "Image caption text",
+          alt_text: "Alt text description",
+          credit: "Photo credit",
+          lead_image: true,
+          created_at: image_revision.created_at,
+        })
+        expect(image[:variants]).to eq([
+          { variant: "300", file_url: "https://assets.publishing.service.gov.uk/media/123/image-300.jpg" },
+          { variant: "960", file_url: "https://assets.publishing.service.gov.uk/media/123/image-960.jpg" },
+        ])
+      end
+
+      it "sets lead_image flag correctly for multiple images" do
+        lead_image = create(:image_revision)
+        other_image = create(:image_revision)
+
+        [lead_image, other_image].each do |image_revision|
+          blob_revision = instance_double(Image::BlobRevision)
+          assets = [instance_double(Image::Asset, variant: "300", file_url: "https://assets.publishing.service.gov.uk/media/test.jpg")]
+          allow(image_revision).to receive(:blob_revision).and_return(blob_revision)
+          allow(blob_revision).to receive(:assets).and_return(assets)
+        end
+
+        document = build(:document, :live)
+        document.live_edition = create(
+          :edition,
+          :published,
+          document:,
+          lead_image_revision: lead_image,
+          image_revisions: [lead_image, other_image],
+        )
+
+        result = described_class.export_to_hash(document)
+
+        expect(result[:images].length).to eq(2)
+        lead = result[:images].find { |img| img[:lead_image] }
+        non_lead = result[:images].find { |img| !img[:lead_image] }
+
+        expect(lead[:lead_image]).to be(true)
+        expect(non_lead[:lead_image]).to be(false)
+        expect(lead[:variants]).to eq([{ variant: "300", file_url: "https://assets.publishing.service.gov.uk/media/test.jpg" }])
+        expect(non_lead[:variants]).to eq([{ variant: "300", file_url: "https://assets.publishing.service.gov.uk/media/test.jpg" }])
+      end
+    end
+
+    describe "the 'attachments' property" do
+      it "returns all of the document's attachments in hash form" do
+        file_url = "https://assets.publishing.service.gov.uk/media/5e5f9a16d3bf7f1090676df2/sample.pdf"
+        asset = instance_double(FileAttachment::Asset, file_url:)
+        file_attachment_revision = create(
+          :file_attachment_revision,
+          :on_asset_manager,
+          title: "Sample title",
+          asset:,
+        )
+        allow(file_attachment_revision).to receive(:asset).and_return(asset)
+        expected = [
+          {
+            file_url:,
+            title: "Sample title",
+            created_at: file_attachment_revision.created_at,
+          },
+        ]
+
+        document = build(:document, :live)
+        document.live_edition = create(:edition, :published, document:, file_attachment_revisions: [file_attachment_revision])
+        result = described_class.export_to_hash(document)
+        expect(result[:attachments]).to eq(expected)
+      end
+    end
+  end
+end