Allow parsers to define parameters for URL normalization

Allow parsers to define URL parameters for normalization. The provided parameters will be stripped from the URL.
meedan · Jun 9, 2024 · f31bebc · f31bebc
1 parent 09bcc8c
commit f31bebc
Show file tree

Hide file tree

Showing 5 changed files with 46 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -454,6 +454,16 @@ To enable sampling for Honeycomb, set the following configuration (either in `co
 
 **Note**: If sampling behavior is changed in Pender, we will also need to update the behavior to match in any other application reporting to Honeycomb. More [here](https://docs.honeycomb.io/getting-data-in/opentelemetry/ruby/#sampling)
 
+### URL Parameters Normalization
+
+Some service providers include URL parameters for tracking purposes that can be safely removed. Pender parsers can define a list of such parameters to be removed during the URL normalization process.
+
+To define URL parameters to be removed, a parser class should implement the class-level `urls_parameters_to_remove` method (it is called on the parser class itself, like `patterns`), which returns an array of strings naming the query parameters to be stripped. For example:
+
+```ruby
+def urls_parameters_to_remove
+  ['igsh']
+end
+```
 
 #### Environment overrides
 

diff --git a/app/models/media.rb b/app/models/media.rb
@@ -64,6 +64,7 @@ def initialize(attributes = {})
     self.follow_redirections
     self.url = RequestHelper.normalize_url(self.url) unless self.get_canonical_url
     self.try_https
+    self.remove_parser_specific_parameters
     self.parser = nil
   end
 
@@ -275,6 +276,24 @@ def try_https
     end
   end
 
+  # Strips from `self.url` the query parameters that the matching parser class
+  # declares as removable through its class-level `urls_parameters_to_remove`.
+  #
+  # The URL is left untouched when no parser matches, when the parser does not
+  # declare any parameters, when the URL has no query string, or when none of
+  # the declared parameters are present.
+  def remove_parser_specific_parameters
+    parser_class = self.class.find_parser_class(self.url)
+    return unless parser_class&.respond_to?(:urls_parameters_to_remove)
+
+    uri = URI.parse(self.url)
+    return if uri.query.nil?
+
+    params_to_remove = parser_class.urls_parameters_to_remove
+    query_params = URI.decode_www_form(uri.query)
+    kept_params = query_params.reject { |key, _| params_to_remove.include?(key) }
+    # Nothing was stripped: keep the URL as-is instead of re-encoding its query.
+    return if kept_params.size == query_params.size
+
+    # An empty (non-nil) query would be serialized as a dangling "?".
+    uri.query = kept_params.empty? ? nil : URI.encode_www_form(kept_params)
+    self.url = uri.to_s
+  end
+
+  # Returns the first entry of PARSERS with at least one pattern matching
+  # `url`, or nil when no parser matches.
+  def self.find_parser_class(url)
+    PARSERS.find do |candidate|
+      candidate.patterns.any? { |regex| regex.match?(url) }
+    end
+  end
+
   def get_html(header_options = {}, force_proxy = false)
     RequestHelper.get_html(self.url, self.method(:set_error), header_options, force_proxy)
   end

diff --git a/app/models/parser/instagram_item.rb b/app/models/parser/instagram_item.rb
@@ -13,6 +13,10 @@ def type
       def patterns
         [INSTAGRAM_ITEM_URL, INSTAGRAM_HOME_URL]
       end
+
+      # Query parameter names to strip from URLs handled by this parser.
+      def urls_parameters_to_remove
+        %w[igsh]
+      end
     end
 
     private

diff --git a/test/models/media_test.rb b/test/models/media_test.rb
@@ -618,4 +618,10 @@ def teardown
     assert_equal "201", response.code
     assert_equal 'fake response body', response.body
   end
+
+  # `igsh` is the parameter the Instagram parser declares for removal, so the
+  # URL under test must carry exactly that parameter to exercise the stripping.
+  test 'should remove parser specific URL parameters' do
+    url = 'https://www.instagram.com/p/xyz/?igsh=1'
+    media = Media.new(url: url)
+    assert_not_includes media.url, 'igsh=1'
+  end
 end
diff --git a/test/models/parser/instagram_item_test.rb b/test/models/parser/instagram_item_test.rb
@@ -175,12 +175,16 @@ def doc
 
     WebMock.stub_request(:get, url).to_return(status: 302, headers: { 'location' => instagram_main_page })
     WebMock.stub_request(:get, instagram_main_page).to_return(status: 200, body: '<html>Instagram</html>')
-    WebMock.stub_request(:get, "https://www.instagram.com/p/CdOk-lLKmyH/?__a=1&__d=a").to_return(status: 200)
-
+    WebMock.stub_request(:get, "https://www.instagram.com/p/CdOk-lLKmyH?/?__a=1&__d=a").with(
+      headers: {
+        'Accept'=>'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
+      }
+    ).to_return(status: 200, body: "", headers: {})
+
     media = Media.new(url: url)
     data = media.as_json
 
-    assert_equal 'https://www.instagram.com/p/CdOk-lLKmyH', data['title']
+    assert_equal 'https://www.instagram.com/p/CdOk-lLKmyH?', data['title']
     assert_equal 'instagram', data['provider']
     assert_equal 'item', data['type']
   end