From ea22506b2371205a0e72da288af30ff7f580126f Mon Sep 17 00:00:00 2001 From: Jay Joshua <7008757+jayjay-w@users.noreply.github.com> Date: Tue, 17 Sep 2024 17:06:52 +0200 Subject: [PATCH] Add more URL schemes for YouTube (#471) * Add more URL schemes for YouTube Add new URL schemes for the YouTube Parser: 1. Support for different subdomains: Handles `www.`, `m.` (mobile), and no subdomain variations. 2. Domain variations: Supports `youtube.com`, `youtu.be`, and `youtube-nocookie.com`. 3. Path variations: Supports paths like `/watch`, `/embed/`, `/v/`, `/e/`, `/shorts/`, `/live/`, `/playlist`, etc. 4. Query parameters: Allows for multiple query parameters like feature, list, index, etc. 5. Timestamps: Supports timestamps like `#t=1m30s`, `#t=10s`, or `t=60s`. 6. Special cases: Includes cases like `oembed?url=`, `attribution_link?a=`, and others. --- app/models/parser/youtube_item.rb | 2 +- test/models/parser/youtube_item_test.rb | 69 +++++++++++++++++++++---- 2 files changed, 60 insertions(+), 11 deletions(-) diff --git a/app/models/parser/youtube_item.rb b/app/models/parser/youtube_item.rb index 8e56fad5..62967315 100644 --- a/app/models/parser/youtube_item.rb +++ b/app/models/parser/youtube_item.rb @@ -2,7 +2,7 @@ module Parser class YoutubeItem < Base include ProviderYoutube - YOUTUBE_ITEM_URL = /^https?:\/\/(www\.)?(youtube\.com|youtu\.be)\/(watch\?v=|embed\/|v\/|shorts\/|playlist\?list=)?(?[a-zA-Z0-9_-]{9,11})(\S+)?$/ + YOUTUBE_ITEM_URL = /^https?:\/\/(www\.|m\.)?(youtube(-nocookie)?\.com|youtu\.be)\/((watch\?v=|embed\/|v\/|e\/|shorts\/|live\/|playlist\?list=)?(?[a-zA-Z0-9_-]{9,11})([&?]([^#]+))?(#t=[\dhms]+)?)|(oembed\?url=.+)|(attribution_link\?a=.+&u=%2Fwatch%3Fv%3D(?[a-zA-Z0-9_-]{9,11})(.+))$/ DIRECT_ATTRIBUTES = %w[ description diff --git a/test/models/parser/youtube_item_test.rb b/test/models/parser/youtube_item_test.rb index f105c4e5..61d682a1 100644 --- a/test/models/parser/youtube_item_test.rb +++ b/test/models/parser/youtube_item_test.rb @@ -72,18
+72,67 @@ def doc assert_equal Parser::YoutubeItem.type, 'youtube_item' end - test "matches known URL patterns, and returns instance on success" do - assert_nil Parser::YoutubeItem.match?('https://example.com') - assert_nil Parser::YoutubeItem.match?('https://www.youtube.com/channel/UCZbgt7KIEF_755Xm14JpkCQm') - assert_nil Parser::YoutubeItem.match?('https://www.youtube.com/user/portadosfundos') + test "does not match invalid URL patterns" do + invalid_urls = [ + 'https://example.com', + 'https://www.youtube.com/channel/UCZbgt7KIEF_755Xm14JpkCQm', + 'https://www.youtube.com/user/portadosfundos' + ] - match_one = Parser::YoutubeItem.match?('https://www.youtube.com/watch?v=mtLxD7r4BZQ') - match_two = Parser::YoutubeItem.match?('https://www.youtube.com/shorts/uZG3Y-ulMsc?si=yAE7bJpPPbsevBO7') - match_three = Parser::YoutubeItem.match?('https://youtu.be/cMQuVvFvSIA?si=JpFRMHlGMiQ6aMJO') + invalid_urls.each do |url| + assert_nil Parser::YoutubeItem.match?(url), "Expected #{url} to not match, but it did." 
+ end + end - assert_equal true, match_one.is_a?(Parser::YoutubeItem) - assert_equal true, match_two.is_a?(Parser::YoutubeItem) - assert_equal true, match_three.is_a?(Parser::YoutubeItem) + test "matches known URL patterns, and returns instance on success" do + valid_urls = [ + # Standard YouTube URLs + 'http://www.youtube.com/watch?v=-wtIMTCHWuI', + 'http://youtube.com/watch?v=-wtIMTCHWuI', + 'http://m.youtube.com/watch?v=-wtIMTCHWuI', + 'https://www.youtube.com/watch?v=lalOy8Mbfdc', + 'https://youtube.com/watch?v=lalOy8Mbfdc', + 'https://m.youtube.com/watch?v=lalOy8Mbfdc', + + # URLs with additional parameters and features + 'http://www.youtube.com/watch?v=yZv2daTWRZU&feature=em-uploademail', + 'https://www.youtube.com/watch?v=0zM3nApSvMg#t=0m10s', + 'http://www.youtube.com/watch?v=cKZDdG9FTKY&feature=channel', + 'http://www.youtube.com/watch?v=lalOy8Mbfdc&playnext_from=TL&videos=osPknwzXEas&feature=sub', + + # Shortened YouTube URLs + 'http://youtu.be/dQw4w9WgXcQ', + 'https://youtu.be/oTJRivZTMLs?list=PLToa5JuFMsXTNkrLJbRlB--76IAOjRM9b', + + # Embedded URLs + 'https://www.youtube.com/embed/0zM3nApSvMg', + 'http://www.youtube.com/embed/lalOy8Mbfdc?rel=0', + + # YouTube no-cookie embedded URLs + 'http://www.youtube-nocookie.com/embed/lalOy8Mbfdc?rel=0', # Added this URL + + # Attribution links + 'http://www.youtube.com/attribution_link?a=JdfC0C9V6ZI&u=%2Fwatch%3Fv%3DEhxJLojIE_o%26feature%3Dshare', + + # oEmbed URLs + 'https://www.youtube.com/oembed?url=http%3A//www.youtube.com/watch?v%3D-wtIMTCHWuI&format=json', + + # Shorts URLs + 'https://www.youtube.com/shorts/j9rZxAF3C0I', + 'https://m.youtube.com/shorts/j9rZxAF3C0I', + + # Live URLs + 'https://www.youtube.com/live/8hBmepWUJoc', + + # Various video player URLs + 'http://www.youtube.com/v/dQw4w9WgXcQ', + 'https://m.youtube.com/v/-wtIMTCHWuI?version=3&autohide=1' + ] + + valid_urls.each do |url| + match = Parser::YoutubeItem.match?(url) + assert_equal true, match.is_a?(Parser::YoutubeItem), "Expected #{url} to 
match and return a Parser::YoutubeItem instance, but it did not." + end end test "should selectively assign YouTube fields to raw api data" do