Skip to content

Commit

Permalink
3843 – Unrelated instagram links from the same page being matched (#405)
Browse files Browse the repository at this point in the history
* update instagram to set title to original_url when redirected to main page

we saw a bug where some unrelated items were being matched, because their
data was the same. They were probably being redirected, and the title for all
of them was the instagram home url. They were also being parsed by the page
parser and not the instagram item parser.
  • Loading branch information
vasconsaurus authored Oct 11, 2023
1 parent 14349d3 commit f0ba82d
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 2 deletions.
4 changes: 4 additions & 0 deletions app/models/concerns/provider_instagram.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ class ApiAuthenticationError < StandardError; end
class_methods do
def ignored_urls
[
{
pattern: /^https:\/\/(www\.)?instagram\.com/,
reason: :login_page
},
{
pattern: /^https:\/\/www\.instagram\.com\/accounts\/login/,
reason: :login_page
Expand Down
5 changes: 3 additions & 2 deletions app/models/parser/instagram_item.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,22 @@ class InstagramItem < Base
include ProviderInstagram

INSTAGRAM_ITEM_URL = /^https?:\/\/(www\.)?instagram\.com\/(p|tv|reel)\/([^\/]+)/
INSTAGRAM_HOME_URL = /^https?:\/\/(www\.)?instagram\.com\/?$/

class << self
def type
'instagram_item'.freeze
end

def patterns
[INSTAGRAM_ITEM_URL]
[INSTAGRAM_ITEM_URL, INSTAGRAM_HOME_URL]
end
end

private

# Main function for class
def parse_data_for_parser(doc, _original_url, _jsonld_array)
def parse_data_for_parser(doc, original_url, _jsonld_array)
id = url.match(INSTAGRAM_ITEM_URL)[3]
@parsed_data.merge!(external_id: id)

Expand Down
26 changes: 26 additions & 0 deletions test/models/parser/instagram_item_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@ class InstagramItemIntegrationTest < ActiveSupport::TestCase
assert !data['title'].blank?
end

# Builds a Media for the bare Instagram home URL (no request stubs are set up in
# this block) and checks that it is still attributed to the instagram provider,
# with the URL itself used as the title.
test "should parse Instagram item when the final url is instagram.com" do
m = Media.new url: 'https://instagram.com/'
data = m.as_json
assert_equal 'instagram', data['provider']
# The title is expected to be the URL itself.
assert_equal 'https://instagram.com/', data['title']
end

test "should get canonical URL parsed from html tags" do
media1 = create_media url: 'https://www.instagram.com/p/CAdW7PMlTWc/?taken-by=kikoloureiro'
assert_match /https:\/\/www.instagram.com\/p\/CAdW7PMlTWc/, media1.url
Expand Down Expand Up @@ -50,6 +57,9 @@ def doc

match_three = Parser::InstagramItem.match?('https://www.instagram.com/reel/CAdW7PMlTWc')
assert_equal true, match_three.is_a?(Parser::InstagramItem)

match_four = Parser::InstagramItem.match?('https://www.instagram.com/')
assert_equal true, match_four.is_a?(Parser::InstagramItem)
end

test "should set profile defaults to URL upon error" do
Expand Down Expand Up @@ -158,4 +168,20 @@ def doc
assert data['raw']['metatags'].present?
assert data['raw']['api'].present?
end

test "should return url as title when redirected to instagram main page" do
url = 'https://www.instagram.com/p/CdOk-lLKmyH/'
instagram_main_page = 'https://instagram.com/'

WebMock.stub_request(:get, url).to_return(status: 302, headers: { 'location' => instagram_main_page })
WebMock.stub_request(:get, instagram_main_page).to_return(status: 200, body: '<html>Instagram</html>')
WebMock.stub_request(:get, "https://www.instagram.com/p/CdOk-lLKmyH/?__a=1&__d=a").to_return(status: 200)

media = Media.new(url: url)
data = media.as_json

assert_equal 'https://www.instagram.com/p/CdOk-lLKmyH', data['title']
assert_equal 'instagram', data['provider']
assert_equal 'item', data['type']
end
end

0 comments on commit f0ba82d

Please sign in to comment.