3843 – Unrelated instagram links from the same page being matched (#405)

* Update Instagram to set the title to original_url when redirected to the main page. We saw a bug where some unrelated items were being matched because their data was the same. They were probably being redirected, and the title for all of them was the Instagram home URL. They were also being parsed by the page parser and not the Instagram item parser.
meedan · Oct 11, 2023 · f0ba82d · f0ba82d
1 parent 14349d3
commit f0ba82d
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 2 deletions.
diff --git a/app/models/concerns/provider_instagram.rb b/app/models/concerns/provider_instagram.rb
@@ -8,6 +8,10 @@ class ApiAuthenticationError < StandardError; end
   class_methods do
     def ignored_urls
       [
+        {
+          pattern: /^https:\/\/(www\.)?instagram\.com/,
+          reason: :login_page
+        },
         {
           pattern: /^https:\/\/www\.instagram\.com\/accounts\/login/,
           reason: :login_page

diff --git a/app/models/parser/instagram_item.rb b/app/models/parser/instagram_item.rb
@@ -3,21 +3,22 @@ class InstagramItem < Base
     include ProviderInstagram
 
     INSTAGRAM_ITEM_URL = /^https?:\/\/(www\.)?instagram\.com\/(p|tv|reel)\/([^\/]+)/
+    INSTAGRAM_HOME_URL = /^https?:\/\/(www\.)?instagram\.com\/?$/
 
     class << self
       def type
         'instagram_item'.freeze
       end
 
       def patterns
-        [INSTAGRAM_ITEM_URL]
+        [INSTAGRAM_ITEM_URL, INSTAGRAM_HOME_URL]
       end
     end
 
     private
 
     # Main function for class
-    def parse_data_for_parser(doc, _original_url, _jsonld_array)
+    def parse_data_for_parser(doc, original_url, _jsonld_array)
       id = url.match(INSTAGRAM_ITEM_URL)[3]
       @parsed_data.merge!(external_id: id)
 

diff --git a/test/models/parser/instagram_item_test.rb b/test/models/parser/instagram_item_test.rb
@@ -9,6 +9,13 @@ class InstagramItemIntegrationTest < ActiveSupport::TestCase
     assert !data['title'].blank?
   end
 
+  test "should parse Instagram item when the final url is instagram.com" do
+    m = Media.new url: 'https://instagram.com/'
+    data = m.as_json
+    assert_equal 'instagram', data['provider']
+    assert_equal 'https://instagram.com/', data['title']
+  end
+
   test "should get canonical URL parsed from html tags" do
     media1 = create_media url: 'https://www.instagram.com/p/CAdW7PMlTWc/?taken-by=kikoloureiro'
     assert_match /https:\/\/www.instagram.com\/p\/CAdW7PMlTWc/, media1.url
@@ -50,6 +57,9 @@ def doc
 
     match_three = Parser::InstagramItem.match?('https://www.instagram.com/reel/CAdW7PMlTWc')
     assert_equal true, match_three.is_a?(Parser::InstagramItem)
+
+    match_four = Parser::InstagramItem.match?('https://www.instagram.com/')
+    assert_equal true, match_four.is_a?(Parser::InstagramItem)
   end
 
   test "should set profile defaults to URL upon error" do
@@ -158,4 +168,20 @@ def doc
     assert data['raw']['metatags'].present?
     assert data['raw']['api'].present?
   end
+
+  test "should return url as title when redirected to instagram main page" do
+    url = 'https://www.instagram.com/p/CdOk-lLKmyH/'
+    instagram_main_page = 'https://instagram.com/'
+
+    WebMock.stub_request(:get, url).to_return(status: 302, headers: { 'location' => instagram_main_page })
+    WebMock.stub_request(:get, instagram_main_page).to_return(status: 200, body: '<html>Instagram</html>')
+    WebMock.stub_request(:get, "https://www.instagram.com/p/CdOk-lLKmyH/?__a=1&__d=a").to_return(status: 200)
+
+    media = Media.new(url: url)
+    data = media.as_json
+
+    assert_equal 'https://www.instagram.com/p/CdOk-lLKmyH', data['title']
+    assert_equal 'instagram', data['provider']
+    assert_equal 'item', data['type']
+  end
 end