From d79fd0343162561e9aefe5449cfa75bff4b3bdbd Mon Sep 17 00:00:00 2001 From: Manu Vasconcelos <87862340+vasconsaurus@users.noreply.github.com> Date: Tue, 24 Oct 2023 08:52:53 -0300 Subject: [PATCH] Retry ArchiveOrgErrors (#410) * update so ArchiveOrgError is retried and TooManyCaptures is not * Send status_ext and message when ArchiveOrgError is raised (so we don't lose this information) * update test "should update media with error when Archive.org can't archive the url" --- .../concerns/media_archive_org_archiver.rb | 18 ++-- test/models/archiver_test.rb | 94 ++++++++++--------- 2 files changed, 57 insertions(+), 55 deletions(-) diff --git a/app/models/concerns/media_archive_org_archiver.rb b/app/models/concerns/media_archive_org_archiver.rb index 29f82e39..f2395d19 100644 --- a/app/models/concerns/media_archive_org_archiver.rb +++ b/app/models/concerns/media_archive_org_archiver.rb @@ -27,17 +27,17 @@ def send_to_archive_org(url, key_id, _supported = nil) if body['job_id'] Media.delay_for(2.minutes).get_archive_org_status(body['job_id'], url, key_id) else - klass = Pender::Exception::ArchiveOrgError - if body['message']&.include?('The same snapshot') || body['status_ext'] == 'error:too-many-daily-captures' - klass = Pender::Exception::TooManyCaptures - end - PenderSentry.notify( - klass.new(body["message"]), - url: url, - response_body: body - ) data = snapshot_data.to_h.merge({ error: { message: "(#{body['status_ext']}) #{body['message']}", code: Lapis::ErrorCodes::const_get('ARCHIVER_ERROR') }}) Media.notify_webhook_and_update_cache('archive_org', url, data, key_id) + if body['message']&.include?('The same snapshot') || body['status_ext'] == 'error:too-many-daily-captures' + PenderSentry.notify( + Pender::Exception::TooManyCaptures.new(body["message"]), + url: url, + response_body: body + ) + else + raise Pender::Exception::ArchiveOrgError, "(#{body['status_ext']}) #{body['message']}" + end end end end diff --git a/test/models/archiver_test.rb b/test/models/archiver_test.rb index 61f93c2f..ad99aa93 100644 --- a/test/models/archiver_test.rb +++ b/test/models/archiver_test.rb @@ -53,7 +53,27 @@ def quietly_redefine_constant(klass, constant, new_value) WebMock.disable! end - test "when archive.org fails to archive, it should add to data the available archive.org snapshot and the error" do + test "should archive Arabics url to Archive.org" do + Media.any_instance.unstub(:archive_to_archive_org) + a = create_api_key application_settings: { 'webhook_url': 'https://example.com/webhook.php', 'webhook_token': 'test' } + + url = 'http://www.yallakora.com/ar/news/342470/%D8%A7%D8%AA%D8%AD%D8%A7%D8%AF-%D8%A7%D9%84%D9%83%D8%B1%D8%A9-%D8%B9%D9%86-%D8%A3%D8%B2%D9%85%D8%A9-%D8%A7%D9%84%D8%B3%D8%B9%D9%8A%D8%AF-%D9%84%D8%A7%D8%A8%D8%AF-%D9%85%D9%86-%D8%AD%D9%84-%D9%85%D8%B9-%D8%A7%D9%84%D8%B2%D9%85%D8%A7%D9%84%D9%83/2504' + WebMock.enable! + allowed_sites = lambda{ |uri| uri.host != 'web.archive.org' } + WebMock.disable_net_connect!(allow: allowed_sites) + WebMock.stub_request(:post, /example.com\/webhook/).to_return(status: 200, body: '') + WebMock.stub_request(:post, /web.archive.org\/save/).to_return(body: {url: url, job_id: 'ebb13d31-7fcf-4dce-890c-c256e2823ca0' }.to_json) + WebMock.stub_request(:get, /web.archive.org\/save\/status/).to_return(body: {status: 'success', timestamp: 'timestamp'}.to_json) + + assert_nothing_raised do + m = create_media url: url, key: a + data = m.as_json + end + ensure + WebMock.disable! + end + + test "when archive.org fails to archive, it should add to data the available archive.org snapshot (if available) and the error" do a = create_api_key application_settings: { 'webhook_url': 'https://example.com/webhook.php', 'webhook_token': 'test' } url = 'https://example.com/' @@ -64,7 +84,7 @@ def quietly_redefine_constant(klass, constant, new_value) WebMock.stub_request(:get, url).to_return(status: 200, body: 'A page') WebMock.stub_request(:post, /example.com\/webhook/).to_return(status: 200, body: '') WebMock.stub_request(:post, /web.archive.org\/save/).to_return(status: 200, body: { message: 'The same snapshot had been made 12 hours, 13 minutes ago. You can make new capture of this URL after 24 hours.', url: url}.to_json) - + media = create_media url: url, key: a id = Media.get_id(media.url) data = media.as_json(archivers: 'archive_org') @@ -77,27 +97,7 @@ def quietly_redefine_constant(klass, constant, new_value) WebMock.disable! end - test "should archive Arabics url to Archive.org" do - Media.any_instance.unstub(:archive_to_archive_org) - a = create_api_key application_settings: { 'webhook_url': 'https://example.com/webhook.php', 'webhook_token': 'test' } - - url = 'http://www.yallakora.com/ar/news/342470/%D8%A7%D8%AA%D8%AD%D8%A7%D8%AF-%D8%A7%D9%84%D9%83%D8%B1%D8%A9-%D8%B9%D9%86-%D8%A3%D8%B2%D9%85%D8%A9-%D8%A7%D9%84%D8%B3%D8%B9%D9%8A%D8%AF-%D9%84%D8%A7%D8%A8%D8%AF-%D9%85%D9%86-%D8%AD%D9%84-%D9%85%D8%B9-%D8%A7%D9%84%D8%B2%D9%85%D8%A7%D9%84%D9%83/2504' - WebMock.enable! - allowed_sites = lambda{ |uri| uri.host != 'web.archive.org' } - WebMock.disable_net_connect!(allow: allowed_sites) - WebMock.stub_request(:post, /example.com\/webhook/).to_return(status: 200, body: '') - WebMock.stub_request(:post, /web.archive.org\/save/).to_return(body: {url: url, job_id: 'ebb13d31-7fcf-4dce-890c-c256e2823ca0' }.to_json) - WebMock.stub_request(:get, /web.archive.org\/save\/status/).to_return(body: {status: 'success', timestamp: 'timestamp'}.to_json) - - assert_nothing_raised do - m = create_media url: url, key: a - data = m.as_json - end - ensure - WebMock.disable! - end - - test "should update media with error when archive to Archive.org fails too many times" do + test "should update media with error when Archive.org can't archive the url" do WebMock.enable! allowed_sites = lambda{ |uri| uri.host != 'web.archive.org' } WebMock.disable_net_connect!(allow: allowed_sites) @@ -110,25 +110,30 @@ def quietly_redefine_constant(klass, constant, new_value) Media.any_instance.stubs(:archive) a = create_api_key application_settings: { 'webhook_url': 'https://example.com/webhook.php', 'webhook_token': 'test' } - url = 'https://www.facebook.com/permalink.php?story_fbid=1649526595359937&id=100009078379548' - - assert_raises Pender::Exception::RetryLater do - m = Media.new url: url - m.as_json(archivers: 'none') - assert_nil m.data.dig('archives', 'archive_org') - WebMock.stub_request(:post, /web.archive.org\/save/).to_return(body: {url: url, job_id: 'ebb13d31-7fcf-4dce-890c-c256e2823ca0' }.to_json) - WebMock.stub_request(:get, /web.archive.org\/save\/status/).to_return(body: {status: 'error', status_ext: 'error:not-found', message: 'The server cannot find the requested resource'}.to_json) + urls = { + 'http://localhost:3333/unreachable-url' => {status_ext: 'error:invalid-url-syntax', message: 'URL syntax is not valid'}, + 'http://www.dutertenewsupdate.info/2018/01/duterte-turned-philippines-into.html' => {status_ext: 'error:invalid-host-resolution', message: 'Cannot resolve host'}, + } - Media.send_to_archive_org(url.to_s, a.id) - media_data = Pender::Store.current.read(Media.get_id(url), :json) - assert_equal Lapis::ErrorCodes::const_get('ARCHIVER_FAILURE'), media_data.dig('archives', 'archive_org', 'error', 'code') - assert_equal "#{data[:code]} #{data[:message]}", media_data.dig('archives', 'archive_org', 'error', 'message') - end + urls.each_pair do |url, data| + m = Media.new url: url + m.as_json(archivers: 'none') + assert_nil m.data.dig('archives', 'archive_org') + WebMock.stub_request(:any, /web.archive.org\/save/).to_return(body: {status: 'error', status_ext: data[:status_ext], message: data[:message]}.to_json) + WebMock.stub_request(:get, /archive.org\/wayback/).to_return(body: {"archived_snapshots":{}}.to_json, headers: {}) + + assert_raises Pender::Exception::RetryLater do + Media.send_to_archive_org(url.to_s, a.id) + end + media_data = Pender::Store.current.read(Media.get_id(url), :json) + assert_equal Lapis::ErrorCodes::const_get('ARCHIVER_ERROR'), media_data.dig('archives', 'archive_org', 'error', 'code') + assert_equal "(#{data[:status_ext]}) #{data[:message]}", media_data.dig('archives', 'archive_org', 'error', 'message') + end ensure WebMock.disable! end - test "should update media with error when Archive.org can't archive the url" do + test "should update media with error when archive to Archive.org fails too many times" do WebMock.enable! allowed_sites = lambda{ |uri| uri.host != 'web.archive.org' } WebMock.disable_net_connect!(allow: allowed_sites) @@ -141,22 +146,19 @@ def quietly_redefine_constant(klass, constant, new_value) Media.any_instance.stubs(:archive) a = create_api_key application_settings: { 'webhook_url': 'https://example.com/webhook.php', 'webhook_token': 'test' } - urls = { - 'http://localhost:3333/unreachable-url' => {status_ext: 'error:invalid-url-syntax', message: 'URL syntax is not valid'}, - 'http://www.dutertenewsupdate.info/2018/01/duterte-turned-philippines-into.html' => {status_ext: 'error:invalid-host-resolution', message: 'Cannot resolve host'}, - } + url = 'https://www.facebook.com/permalink.php?story_fbid=1649526595359937&id=100009078379548' - urls.each_pair do |url, data| + assert_raises Pender::Exception::RetryLater do m = Media.new url: url m.as_json(archivers: 'none') assert_nil m.data.dig('archives', 'archive_org') - WebMock.stub_request(:any, /web.archive.org\/save/).to_return(body: {status: 'error', status_ext: data[:status_ext], message: data[:message]}.to_json) - WebMock.stub_request(:get, /archive.org\/wayback/).to_return(body: {"archived_snapshots":{}}.to_json, headers: {}) + WebMock.stub_request(:post, /web.archive.org\/save/).to_return(body: {url: url, job_id: 'ebb13d31-7fcf-4dce-890c-c256e2823ca0' }.to_json) + WebMock.stub_request(:get, /web.archive.org\/save\/status/).to_return(body: {status: 'error', status_ext: 'error:not-found', message: 'The server cannot find the requested resource'}.to_json) Media.send_to_archive_org(url.to_s, a.id) media_data = Pender::Store.current.read(Media.get_id(url), :json) - assert_equal Lapis::ErrorCodes::const_get('ARCHIVER_ERROR'), media_data.dig('archives', 'archive_org', 'error', 'code') - assert_equal "(#{data[:status_ext]}) #{data[:message]}", media_data.dig('archives', 'archive_org', 'error', 'message') + assert_equal Lapis::ErrorCodes::const_get('ARCHIVER_FAILURE'), media_data.dig('archives', 'archive_org', 'error', 'code') + assert_equal "#{data[:code]} #{data[:message]}", media_data.dig('archives', 'archive_org', 'error', 'message') end ensure WebMock.disable!