Skip to content

Commit

Permalink
Add more URL schemes for YouTube (#471)
Browse files Browse the repository at this point in the history
* Add more URL schemes for YouTube

Add new URL schemes for the YouTube Parser:

1. Support for different subdomains: Handles `www.`, `m.` (mobile), and no subdomain variations.
2. Domain variations: Supports `youtube.com`, `youtu.be`, and `youtube-nocookie.com`.
3. Path variations: Supports paths like `/watch`, `/embed/`, `/v/`, `/e/`, `/shorts/`, `/live/`, `/playlist`, etc.
4. Query parameters: Allows for multiple query parameters like feature, list, index, etc.
5. Timestamps: Supports timestamps like `#t=1m30s`, `#t=10s`, or `t=60s`.
6. Special cases: Includes cases like `oembed?url=`, `attribution_link?a=`, and others.
  • Loading branch information
jayjay-w authored Sep 17, 2024
1 parent 8f057ca commit ea22506
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 11 deletions.
2 changes: 1 addition & 1 deletion app/models/parser/youtube_item.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ module Parser
class YoutubeItem < Base
include ProviderYoutube

# NOTE(review): this appears to be the removed side of the diff (the hunk reports
# one addition and one deletion). The pattern accepts http/https, an optional
# `www.`, the hosts youtube.com or youtu.be, an optional path prefix, then a
# 9-11 character id exposed as the named capture `id`, followed by optional
# non-whitespace up to the end of the line.
YOUTUBE_ITEM_URL = /^https?:\/\/(www\.)?(youtube\.com|youtu\.be)\/(watch\?v=|embed\/|v\/|shorts\/|playlist\?list=)?(?<id>[a-zA-Z0-9_-]{9,11})(\S+)?$/
# Matches URLs that point at a single YouTube item and exposes the video id as
# the named capture `id` (no id is captured for oembed URLs).
#
# Accepted: http/https; optional `www.` or `m.`; hosts youtube.com,
# youtube-nocookie.com and youtu.be; followed by one of
#   * attribution_link?a=...&u=%2Fwatch%3Fv%3D<id>...  (id read from the encoded `u`)
#   * oembed?url=...
#   * an optional watch?v= / embed/ / v/ / e/ / shorts/ / live/ / playlist?list=
#     prefix, the 9-11 character id, and any non-whitespace tail (query
#     string, #t= timestamp, remainder of a longer playlist id, ...)
#
# The scheme/host prefix and the \A ... \z anchors wrap every alternative, so
# the oembed and attribution forms cannot match on a foreign host, and a URL
# sitting on a later line of a multi-line string is rejected. The attribution
# alternative is tried first; otherwise the generic alternative would capture
# the literal word "attribution" as the id.
YOUTUBE_ITEM_URL = %r{
  \A
  https?://(?:www\.|m\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)/
  (?:
    attribution_link\?a=\S+&u=%2Fwatch%3Fv%3D(?<id>[a-zA-Z0-9_-]{9,11})\S+
    |
    oembed\?url=\S+
    |
    (?:watch\?v=|embed/|v/|e/|shorts/|live/|playlist\?list=)?(?<id>[a-zA-Z0-9_-]{9,11})\S*
  )
  \z
}x

DIRECT_ATTRIBUTES = %w[
description
Expand Down
69 changes: 59 additions & 10 deletions test/models/parser/youtube_item_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -72,18 +72,67 @@ def doc
assert_equal Parser::YoutubeItem.type, 'youtube_item'
end

test "matches known URL patterns, and returns instance on success" do
assert_nil Parser::YoutubeItem.match?('https://example.com')
assert_nil Parser::YoutubeItem.match?('https://www.youtube.com/channel/UCZbgt7KIEF_755Xm14JpkCQm')
assert_nil Parser::YoutubeItem.match?('https://www.youtube.com/user/portadosfundos')
# Negative cases: a non-YouTube host, a channel page and a user page must each
# make Parser::YoutubeItem.match? return nil.
test "does not match invalid URL patterns" do
invalid_urls = [
'https://example.com',
'https://www.youtube.com/channel/UCZbgt7KIEF_755Xm14JpkCQm',
'https://www.youtube.com/user/portadosfundos'
]

# NOTE(review): the three match_* assignments below look like the removed side
# of this diff hunk (part of the old positive test); nothing in this block
# reads their values — confirm against the real file.
match_one = Parser::YoutubeItem.match?('https://www.youtube.com/watch?v=mtLxD7r4BZQ')
match_two = Parser::YoutubeItem.match?('https://www.youtube.com/shorts/uZG3Y-ulMsc?si=yAE7bJpPPbsevBO7')
match_three = Parser::YoutubeItem.match?('https://youtu.be/cMQuVvFvSIA?si=JpFRMHlGMiQ6aMJO')
invalid_urls.each do |url|
# The message names the offending URL so a failure is attributable to one entry.
assert_nil Parser::YoutubeItem.match?(url), "Expected #{url} to not match, but it did."
end
end

assert_equal true, match_one.is_a?(Parser::YoutubeItem)
assert_equal true, match_two.is_a?(Parser::YoutubeItem)
assert_equal true, match_three.is_a?(Parser::YoutubeItem)
# Positive cases: every URL shape listed below must make
# Parser::YoutubeItem.match? return a Parser::YoutubeItem instance.
test "matches known URL patterns, and returns instance on success" do
  valid_urls = [
    # Standard YouTube URLs
    'http://www.youtube.com/watch?v=-wtIMTCHWuI',
    'http://youtube.com/watch?v=-wtIMTCHWuI',
    'http://m.youtube.com/watch?v=-wtIMTCHWuI',
    'https://www.youtube.com/watch?v=lalOy8Mbfdc',
    'https://youtube.com/watch?v=lalOy8Mbfdc',
    'https://m.youtube.com/watch?v=lalOy8Mbfdc',

    # URLs with additional parameters and features
    'http://www.youtube.com/watch?v=yZv2daTWRZU&feature=em-uploademail',
    'https://www.youtube.com/watch?v=0zM3nApSvMg#t=0m10s',
    'http://www.youtube.com/watch?v=cKZDdG9FTKY&feature=channel',
    'http://www.youtube.com/watch?v=lalOy8Mbfdc&playnext_from=TL&videos=osPknwzXEas&feature=sub',

    # Shortened YouTube URLs
    'http://youtu.be/dQw4w9WgXcQ',
    'https://youtu.be/oTJRivZTMLs?list=PLToa5JuFMsXTNkrLJbRlB--76IAOjRM9b',

    # Embedded URLs
    'https://www.youtube.com/embed/0zM3nApSvMg',
    'http://www.youtube.com/embed/lalOy8Mbfdc?rel=0',

    # YouTube no-cookie embedded URLs
    'http://www.youtube-nocookie.com/embed/lalOy8Mbfdc?rel=0',

    # Attribution links
    'http://www.youtube.com/attribution_link?a=JdfC0C9V6ZI&u=%2Fwatch%3Fv%3DEhxJLojIE_o%26feature%3Dshare',

    # oEmbed URLs
    'https://www.youtube.com/oembed?url=http%3A//www.youtube.com/watch?v%3D-wtIMTCHWuI&format=json',

    # Shorts URLs
    'https://www.youtube.com/shorts/j9rZxAF3C0I',
    'https://m.youtube.com/shorts/j9rZxAF3C0I',

    # Live URLs
    'https://www.youtube.com/live/8hBmepWUJoc',

    # Various video player URLs
    'http://www.youtube.com/v/dQw4w9WgXcQ',
    'https://m.youtube.com/v/-wtIMTCHWuI?version=3&autohide=1',
    'https://www.youtube.com/e/dQw4w9WgXcQ',

    # Playlist URLs
    'https://www.youtube.com/playlist?list=PLToa5JuFMsXTNkrLJbRlB--76IAOjRM9b'
  ]

  valid_urls.each do |url|
    # assert_kind_of reports the actual class on failure, unlike comparing
    # the result of is_a? against true.
    assert_kind_of Parser::YoutubeItem, Parser::YoutubeItem.match?(url),
                   "Expected #{url} to match and return a Parser::YoutubeItem instance, but it did not."
  end
end

test "should selectively assign YouTube fields to raw api data" do
Expand Down

0 comments on commit ea22506

Please sign in to comment.