Skip to content

Commit

Permalink
Retry ArchiveOrgErrors (#410)
Browse files Browse the repository at this point in the history
* Update error handling so that ArchiveOrgError is raised (and therefore retried), while TooManyCaptures is only reported and not retried

* Send status_ext and message when ArchiveOrgError is raised (so we don't lose this information)

* Update test "should update media with error when Archive.org can't archive the url"
  • Loading branch information
vasconsaurus authored Oct 24, 2023
1 parent 28941ec commit d79fd03
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 55 deletions.
18 changes: 9 additions & 9 deletions app/models/concerns/media_archive_org_archiver.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,17 @@ def send_to_archive_org(url, key_id, _supported = nil)
if body['job_id']
Media.delay_for(2.minutes).get_archive_org_status(body['job_id'], url, key_id)
else
klass = Pender::Exception::ArchiveOrgError
if body['message']&.include?('The same snapshot') || body['status_ext'] == 'error:too-many-daily-captures'
klass = Pender::Exception::TooManyCaptures
end
PenderSentry.notify(
klass.new(body["message"]),
url: url,
response_body: body
)
data = snapshot_data.to_h.merge({ error: { message: "(#{body['status_ext']}) #{body['message']}", code: Lapis::ErrorCodes::const_get('ARCHIVER_ERROR') }})
Media.notify_webhook_and_update_cache('archive_org', url, data, key_id)
if body['message']&.include?('The same snapshot') || body['status_ext'] == 'error:too-many-daily-captures'
PenderSentry.notify(
Pender::Exception::TooManyCaptures.new(body["message"]),
url: url,
response_body: body
)
else
raise Pender::Exception::ArchiveOrgError, "(#{body['status_ext']}) #{body['message']}"
end
end
end
end
Expand Down
94 changes: 48 additions & 46 deletions test/models/archiver_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,27 @@ def quietly_redefine_constant(klass, constant, new_value)
WebMock.disable!
end

test "when archive.org fails to archive, it should add to data the available archive.org snapshot and the error" do
# Smoke test: creating and serializing a Media whose URL contains
# percent-encoded (Arabic) path segments must not raise when the Archive.org
# endpoints are stubbed. No assertion is made on the archived data itself.
test "should archive Arabics url to Archive.org" do
# Drop the stub on archive_to_archive_org so the archiver method itself runs
# (presumably the stub is installed in shared test setup — not visible here; confirm).
Media.any_instance.unstub(:archive_to_archive_org)
# API key configured with the webhook endpoint that is stubbed below.
a = create_api_key application_settings: { 'webhook_url': 'https://example.com/webhook.php', 'webhook_token': 'test' }

url = 'http://www.yallakora.com/ar/news/342470/%D8%A7%D8%AA%D8%AD%D8%A7%D8%AF-%D8%A7%D9%84%D9%83%D8%B1%D8%A9-%D8%B9%D9%86-%D8%A3%D8%B2%D9%85%D8%A9-%D8%A7%D9%84%D8%B3%D8%B9%D9%8A%D8%AF-%D9%84%D8%A7%D8%A8%D8%AF-%D9%85%D9%86-%D8%AD%D9%84-%D9%85%D8%B9-%D8%A7%D9%84%D8%B2%D9%85%D8%A7%D9%84%D9%83/2504'
WebMock.enable!
# The predicate is true for every host except web.archive.org, so only
# Archive.org traffic is forced through the stubs registered below.
allowed_sites = lambda{ |uri| uri.host != 'web.archive.org' }
WebMock.disable_net_connect!(allow: allowed_sites)
# Webhook notification endpoint: always answers 200 with an empty body.
WebMock.stub_request(:post, /example.com\/webhook/).to_return(status: 200, body: '')
# Save request answers with a job_id, i.e. the capture was accepted.
WebMock.stub_request(:post, /web.archive.org\/save/).to_return(body: {url: url, job_id: 'ebb13d31-7fcf-4dce-890c-c256e2823ca0' }.to_json)
# Status lookup for that job reports success.
WebMock.stub_request(:get, /web.archive.org\/save\/status/).to_return(body: {status: 'success', timestamp: 'timestamp'}.to_json)

# The only check: building and serializing the media raises nothing.
assert_nothing_raised do
m = create_media url: url, key: a
# `data` is not read afterwards; as_json is called only to exercise the code path.
data = m.as_json
end
ensure
# Always turn WebMock back off, even if the test body failed.
WebMock.disable!
end

test "when archive.org fails to archive, it should add to data the available archive.org snapshot (if available) and the error" do
a = create_api_key application_settings: { 'webhook_url': 'https://example.com/webhook.php', 'webhook_token': 'test' }
url = 'https://example.com/'

Expand All @@ -64,7 +84,7 @@ def quietly_redefine_constant(klass, constant, new_value)
WebMock.stub_request(:get, url).to_return(status: 200, body: '<html>A page</html>')
WebMock.stub_request(:post, /example.com\/webhook/).to_return(status: 200, body: '')
WebMock.stub_request(:post, /web.archive.org\/save/).to_return(status: 200, body: { message: 'The same snapshot had been made 12 hours, 13 minutes ago. You can make new capture of this URL after 24 hours.', url: url}.to_json)

media = create_media url: url, key: a
id = Media.get_id(media.url)
data = media.as_json(archivers: 'archive_org')
Expand All @@ -77,27 +97,7 @@ def quietly_redefine_constant(klass, constant, new_value)
WebMock.disable!
end

# Smoke test: creating and serializing a Media whose URL contains
# percent-encoded (Arabic) path segments must not raise when the Archive.org
# endpoints are stubbed. No assertion is made on the archived data itself.
test "should archive Arabics url to Archive.org" do
# Drop the stub on archive_to_archive_org so the archiver method itself runs
# (presumably the stub is installed in shared test setup — not visible here; confirm).
Media.any_instance.unstub(:archive_to_archive_org)
# API key configured with the webhook endpoint that is stubbed below.
a = create_api_key application_settings: { 'webhook_url': 'https://example.com/webhook.php', 'webhook_token': 'test' }

url = 'http://www.yallakora.com/ar/news/342470/%D8%A7%D8%AA%D8%AD%D8%A7%D8%AF-%D8%A7%D9%84%D9%83%D8%B1%D8%A9-%D8%B9%D9%86-%D8%A3%D8%B2%D9%85%D8%A9-%D8%A7%D9%84%D8%B3%D8%B9%D9%8A%D8%AF-%D9%84%D8%A7%D8%A8%D8%AF-%D9%85%D9%86-%D8%AD%D9%84-%D9%85%D8%B9-%D8%A7%D9%84%D8%B2%D9%85%D8%A7%D9%84%D9%83/2504'
WebMock.enable!
# The predicate is true for every host except web.archive.org, so only
# Archive.org traffic is forced through the stubs registered below.
allowed_sites = lambda{ |uri| uri.host != 'web.archive.org' }
WebMock.disable_net_connect!(allow: allowed_sites)
# Webhook notification endpoint: always answers 200 with an empty body.
WebMock.stub_request(:post, /example.com\/webhook/).to_return(status: 200, body: '')
# Save request answers with a job_id, i.e. the capture was accepted.
WebMock.stub_request(:post, /web.archive.org\/save/).to_return(body: {url: url, job_id: 'ebb13d31-7fcf-4dce-890c-c256e2823ca0' }.to_json)
# Status lookup for that job reports success.
WebMock.stub_request(:get, /web.archive.org\/save\/status/).to_return(body: {status: 'success', timestamp: 'timestamp'}.to_json)

# The only check: building and serializing the media raises nothing.
assert_nothing_raised do
m = create_media url: url, key: a
# `data` is not read afterwards; as_json is called only to exercise the code path.
data = m.as_json
end
ensure
# Always turn WebMock back off, even if the test body failed.
WebMock.disable!
end

test "should update media with error when archive to Archive.org fails too many times" do
test "should update media with error when Archive.org can't archive the url" do
WebMock.enable!
allowed_sites = lambda{ |uri| uri.host != 'web.archive.org' }
WebMock.disable_net_connect!(allow: allowed_sites)
Expand All @@ -110,25 +110,30 @@ def quietly_redefine_constant(klass, constant, new_value)
Media.any_instance.stubs(:archive)

a = create_api_key application_settings: { 'webhook_url': 'https://example.com/webhook.php', 'webhook_token': 'test' }
url = 'https://www.facebook.com/permalink.php?story_fbid=1649526595359937&id=100009078379548'

assert_raises Pender::Exception::RetryLater do
m = Media.new url: url
m.as_json(archivers: 'none')
assert_nil m.data.dig('archives', 'archive_org')
WebMock.stub_request(:post, /web.archive.org\/save/).to_return(body: {url: url, job_id: 'ebb13d31-7fcf-4dce-890c-c256e2823ca0' }.to_json)
WebMock.stub_request(:get, /web.archive.org\/save\/status/).to_return(body: {status: 'error', status_ext: 'error:not-found', message: 'The server cannot find the requested resource'}.to_json)
urls = {
'http://localhost:3333/unreachable-url' => {status_ext: 'error:invalid-url-syntax', message: 'URL syntax is not valid'},
'http://www.dutertenewsupdate.info/2018/01/duterte-turned-philippines-into.html' => {status_ext: 'error:invalid-host-resolution', message: 'Cannot resolve host'},
}

Media.send_to_archive_org(url.to_s, a.id)
media_data = Pender::Store.current.read(Media.get_id(url), :json)
assert_equal Lapis::ErrorCodes::const_get('ARCHIVER_FAILURE'), media_data.dig('archives', 'archive_org', 'error', 'code')
assert_equal "#{data[:code]} #{data[:message]}", media_data.dig('archives', 'archive_org', 'error', 'message')
end
urls.each_pair do |url, data|
m = Media.new url: url
m.as_json(archivers: 'none')
assert_nil m.data.dig('archives', 'archive_org')
WebMock.stub_request(:any, /web.archive.org\/save/).to_return(body: {status: 'error', status_ext: data[:status_ext], message: data[:message]}.to_json)
WebMock.stub_request(:get, /archive.org\/wayback/).to_return(body: {"archived_snapshots":{}}.to_json, headers: {})

assert_raises Pender::Exception::RetryLater do
Media.send_to_archive_org(url.to_s, a.id)
end
media_data = Pender::Store.current.read(Media.get_id(url), :json)
assert_equal Lapis::ErrorCodes::const_get('ARCHIVER_ERROR'), media_data.dig('archives', 'archive_org', 'error', 'code')
assert_equal "(#{data[:status_ext]}) #{data[:message]}", media_data.dig('archives', 'archive_org', 'error', 'message')
end
ensure
WebMock.disable!
end

test "should update media with error when Archive.org can't archive the url" do
test "should update media with error when archive to Archive.org fails too many times" do
WebMock.enable!
allowed_sites = lambda{ |uri| uri.host != 'web.archive.org' }
WebMock.disable_net_connect!(allow: allowed_sites)
Expand All @@ -141,22 +146,19 @@ def quietly_redefine_constant(klass, constant, new_value)
Media.any_instance.stubs(:archive)

a = create_api_key application_settings: { 'webhook_url': 'https://example.com/webhook.php', 'webhook_token': 'test' }
urls = {
'http://localhost:3333/unreachable-url' => {status_ext: 'error:invalid-url-syntax', message: 'URL syntax is not valid'},
'http://www.dutertenewsupdate.info/2018/01/duterte-turned-philippines-into.html' => {status_ext: 'error:invalid-host-resolution', message: 'Cannot resolve host'},
}
url = 'https://www.facebook.com/permalink.php?story_fbid=1649526595359937&id=100009078379548'

urls.each_pair do |url, data|
assert_raises Pender::Exception::RetryLater do
m = Media.new url: url
m.as_json(archivers: 'none')
assert_nil m.data.dig('archives', 'archive_org')
WebMock.stub_request(:any, /web.archive.org\/save/).to_return(body: {status: 'error', status_ext: data[:status_ext], message: data[:message]}.to_json)
WebMock.stub_request(:get, /archive.org\/wayback/).to_return(body: {"archived_snapshots":{}}.to_json, headers: {})
WebMock.stub_request(:post, /web.archive.org\/save/).to_return(body: {url: url, job_id: 'ebb13d31-7fcf-4dce-890c-c256e2823ca0' }.to_json)
WebMock.stub_request(:get, /web.archive.org\/save\/status/).to_return(body: {status: 'error', status_ext: 'error:not-found', message: 'The server cannot find the requested resource'}.to_json)

Media.send_to_archive_org(url.to_s, a.id)
media_data = Pender::Store.current.read(Media.get_id(url), :json)
assert_equal Lapis::ErrorCodes::const_get('ARCHIVER_ERROR'), media_data.dig('archives', 'archive_org', 'error', 'code')
assert_equal "(#{data[:status_ext]}) #{data[:message]}", media_data.dig('archives', 'archive_org', 'error', 'message')
assert_equal Lapis::ErrorCodes::const_get('ARCHIVER_FAILURE'), media_data.dig('archives', 'archive_org', 'error', 'code')
assert_equal "#{data[:code]} #{data[:message]}", media_data.dig('archives', 'archive_org', 'error', 'message')
end
ensure
WebMock.disable!
Expand Down

0 comments on commit d79fd03

Please sign in to comment.