Skip to content

Commit

Permalink
PROD-29813: Security fixes for embeddable links
Browse files Browse the repository at this point in the history
  • Loading branch information
tregismoreira committed Aug 10, 2024
1 parent f57616c commit 7ae1cb5
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 219 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -64,215 +64,117 @@ public static function create(ContainerInterface $container, array $configuratio
* {@inheritdoc}
*/
public function process($text, $langcode): FilterProcessResult {
// Check for whitelisted URL.
if ($this->embedHelper->whiteList($text)) {
$result = new FilterProcessResult(static::convertUrls($text));
}
else {
$result = new FilterProcessResult($text);
}
// Convert Urls.
$pattern = $this->embedHelper->getCombinedPatterns();
$result = new FilterProcessResult($this->convertUrls($text, $pattern));

// Not whitelisted is return the string as is.
// Also, add the required dependencies and cache tags.
return $this->embedHelper->addDependencies($result, 'social_embed:filter.convert_url');
}

/**
* Replaces appearances of supported URLs with placeholder embed elements.
* Replaces appearances of supported URLs with <drupal-url> embed elements.
*
* Logic of this function is copied from _filter_url() and slightly adopted
* for our use case. _filter_url() is unfortunately not general enough to
* re-use it.
*
* If something wrong here, you can reference to _filter_url() function.
*
* @param string $text
* Text to be processed.
* @param string $pattern
* URL pattern to match.
*
* @return mixed
* @return string
* Processed text.
*/
public static function convertUrls(string $text): mixed {
// Store the current text in case any of the preg_* functions fail.
$saved_text = $text;

public function convertUrls(string $text, string $pattern) {
// Tags to skip and not recurse into.
$ignore_tags = 'a|script|style|code|pre';

// Prepare protocols pattern for absolute URLs.
// \Drupal\Component\Utility\UrlHelper::stripDangerousProtocols()
// will replace any bad protocols with HTTP, so we need to support the
// identical list.
// While '//' is technically optional for MAILTO only, we cannot cleanly
// differ between protocols here without hard-coding MAILTO, so '//' is
// optional for all protocols.
// @see \Drupal\Component\Utility\UrlHelper::stripDangerousProtocols()
$protocols = \Drupal::getContainer()->getParameter('filter_protocols');
assert(is_string($protocols) || is_array($protocols), "Invalid filter_protocols parameter configuration, must be either array or string");
$protocols = is_array($protocols) ? implode(':(?://)?|', $protocols) . ':(?://)?' : $protocols;

$valid_url_path_characters = "[\p{L}\p{M}\p{N}!\*\';:=\+,\.\$\/%#\[\]\-_~@&]";

// Allow URL paths to contain balanced parens
// 1. Used in Wikipedia URLs like /Primer_(film)
// 2. Used in IIS sessions like /S(dfd346)/.
$valid_url_balanced_parens = '\(' . $valid_url_path_characters . '+\)';

// Valid end-of-path characters (so /foo. does not gobble the period).
// 1. Allow =&# for empty URL parameters and other URL-join artifacts.
$valid_url_ending_characters = '[\p{L}\p{M}\p{N}:_+~#=/]|(?:' . $valid_url_balanced_parens . ')';

$valid_url_query_chars = '[a-zA-Z0-9!?\*\'@\(\);:&=\+\$\/%#\[\]\-_\.,~|]';
$valid_url_query_ending_chars = '[a-zA-Z0-9_&=#\/]';

// Full path and allow @ in a url, but only in the middle. Catch things
// like http://example.com/@user/
$valid_url_path = '(?:(?:' . $valid_url_path_characters . '*(?:' . $valid_url_balanced_parens . $valid_url_path_characters . '*)*' . $valid_url_ending_characters . ')|(?:@' . $valid_url_path_characters . '+\/))';

// Prepare domain name pattern.
// The ICANN seems to be on track towards accepting more diverse top level
// domains, so this pattern has been "future-proofed" to allow for TLDs
// of length 2-64.
$domain = '(?:[\p{L}\p{M}\p{N}._+-]+\.)?[\p{L}\p{M}]{2,64}\b';
$ip = '(?:[0-9]{1,3}\.){3}[0-9]{1,3}';
$auth = '[\p{L}\p{M}\p{N}:%_+*~#?&=.,/;-]+@';
$trail = '(' . $valid_url_path . '*)?(\\?' . $valid_url_query_chars . '*' . $valid_url_query_ending_chars . ')?';

// Match absolute URLs.
$url_pattern = "(?:$auth)?(?:$domain|$ip)\/?(?:$trail)?";
$pattern = "`((?:$protocols)(?:$url_pattern))`u";
$tasks['socialFullUrlToEmbed'] = $pattern;

// Match www. domains.
$url_pattern = "www\.(?:$domain)\/?(?:$trail)?";
$pattern = "`($url_pattern)`u";
$tasks['socialPartialUrlToEmbed'] = $pattern;

// Match short whitelisted domains (without http/https and www.).
$url_pattern = implode("|", \Drupal::service('social_embed.helper_service')->getPatterns());
$pattern = "`($url_pattern)`u";
$tasks['socialShortUrlToEmbed'] = $pattern;

// Each type of URL needs to be processed separately. The text is joined and
// re-split after each task, since all injected HTML tags must be correctly
// protected before the next task.
foreach ($tasks as $task => $pattern) {
// HTML comments need to be handled separately, as they may contain HTML
// markup, especially a '>'. Therefore, remove all comment contents and
// add them back later.
_filter_url_escape_comments([], TRUE);

$text = ($text !== NULL) ? preg_replace_callback('`<!--(.*?)-->`s', '_filter_url_escape_comments', $text) : $text;

// Split at all tags; ensures that no tags or attributes are processed.
$chunks = ($text !== NULL) ? preg_split('/(<.+?>)/is', $text, -1, PREG_SPLIT_DELIM_CAPTURE) : [];

// Do not attempt to convert links into URLs if preg_split() fails.
if ($chunks !== FALSE) {
// PHP ensures that the array consists of alternating delimiters and
// literals, and begins and ends with a literal (inserting NULL as
// required). Therefore, the first chunk is always text:
$chunk_type = 'text';
// If a tag of $ignore_tags is found, it is stored in $open_tag and only
// removed when the closing tag is found. Until the closing tag is
// found,no replacements are made.
$open_tag = '';
for ($i = 0; $i < count($chunks); $i++) {
if ($chunk_type == 'text') {
// Only process this text if there are no unclosed $ignore_tags.
if ($open_tag == '') {
// If there is a match, inject a link into this chunk via the
// callback function contained in $task.
if ($chunks[$i] !== NULL) {
$chunks[$i] = preg_replace_callback($pattern, [static::class, $task], $chunks[$i]);
}
// Split at all tags; ensures that no tags or attributes are processed.
$chunks = is_null($text) ? [''] : preg_split('/(<.+?>)/is', $text, -1, PREG_SPLIT_DELIM_CAPTURE);

// Do not attempt to convert links into URLs if preg_split() fails.
if ($chunks !== FALSE) {
// PHP ensures that the array consists of alternating delimiters and
// literals, and begins and ends with a literal (inserting NULL as
// required). Therefore, the first chunk is always text:
$chunk_type = 'text';
// If a tag of $ignore_tags is found, it is stored in $open_tag and only
// removed when the closing tag is found. Until the closing tag is found,
// no replacements are made.
$open_tag = '';
for ($i = 0; $i < count($chunks); $i++) {
if ($chunk_type == 'text') {
// Only process this text if there are no unclosed $ignore_tags.
if ($open_tag == '') {
// If there is a match, inject a link into this chunk via the
// callback function contained in $task.
$chunks[$i] = preg_replace_callback(
$pattern,
function ($match) {
// Replace URL by the embed code.
return self::socialUrlToEmbed($match[1]);
},
$chunks[$i]
);
}
// Text chunk is done, so next chunk must be a tag.
$chunk_type = 'tag';
}
else {
// Only process this tag if there are no unclosed $ignore_tags.
if ($open_tag == '') {
// Check whether this tag is contained in $ignore_tags.
if (preg_match("`<($ignore_tags)(?:\s|>)`i", $chunks[$i], $matches)) {
$open_tag = $matches[1];
}
// Text chunk is done, so next chunk must be a tag.
$chunk_type = 'tag';
}
// Otherwise, check whether this is the closing tag for $open_tag.
else {
// Only process this tag if there are no unclosed $ignore_tags.
if ($open_tag == '') {
// Check whether this tag is contained in $ignore_tags.
if (preg_match("`<($ignore_tags)(?:\s|>)`i", $chunks[$i], $matches)) {
$open_tag = $matches[1];
}
}
// Otherwise, check whether this is the closing tag for $open_tag.
else {
if (preg_match("`<\/$open_tag>`i", $chunks[$i], $matches)) {
$open_tag = '';
}
if (preg_match("`<\/$open_tag>`i", $chunks[$i], $matches)) {
$open_tag = '';
}
// Tag chunk is done, so next chunk must be text.
$chunk_type = 'text';
}
// Tag chunk is done, so next chunk must be text.
$chunk_type = 'text';
}

$text = implode($chunks);
}

// Revert to the original comment contents.
_filter_url_escape_comments([], FALSE);

if ($text !== NULL) {
$text = preg_replace_callback('`<!--(.*?)-->`', '_filter_url_escape_comments', $text);
}
$text = implode($chunks);
}

// If there is no text at this point revert to the previous text.
return strlen((string) $text) > 0 ? $text : $saved_text;
}

/**
* Makes links out of absolute URLs.
*
* Callback for preg_replace_callback() within convertUrls().
*/
public static function socialFullUrlToEmbed(array $match): string {
return self::socialUrlToEmbed($match);
}

/**
* Makes links out of domain names starting with "www.".
*
* Callback for preg_replace_callback() within convertUrls().
*/
public static function socialPartialUrlToEmbed(array $match): string {
return self::socialUrlToEmbed($match, 'https://');
}

/**
* Makes links out of domain names starting with domain name.
*
* Callback for preg_replace_callback() within convertUrls().
*/
public static function socialShortUrlToEmbed(array $match): string {
return self::socialPartialUrlToEmbed($match);
return $text;
}

/**
* Process URL for embedding.
* Replace URL by the embed code.
*
* @param array $match
* The array with matched strings.
* @param string $full_url_prefix
* The internet protocol
* Send your value (https:// or https://www.) for partial and short links.
* @param string $url
* The matched url.
*
* @return string
* Processed link.
*/
public static function socialUrlToEmbed(array $match, string $full_url_prefix = ''): string {
public static function socialUrlToEmbed(string $url): string {
// Add protocol if does not exist.
if (!preg_match('/^https?:\/\//', $url)) {
$url = 'https://' . $url;
}

try {
$url_for_processing = $match[1];
$url_for_processing = $url;
$social_embed_helper = \Drupal::service('social_embed.helper_service');

// Decode URL.
$url_for_processing = Html::decodeEntities($url_for_processing);

// Default URL for return.
$result_link = $url_for_processing;

// Full URL with protocol (http, https etc.).
$full_url = $full_url_prefix . $url_for_processing;
$info = \Drupal::service('url_embed')->getUrlInfo($full_url);
$info = \Drupal::service('url_embed')->getUrlInfo($url);

if ($info) {
/** @var \Drupal\user\Entity\User $user */
Expand All @@ -288,41 +190,24 @@ public static function socialUrlToEmbed(array $match, string $full_url_prefix =
)
) {
// Replace URL with consent button.
return $social_embed_helper->getPlaceholderMarkupForProvider($info['providerName'], $full_url);
return $social_embed_helper->getPlaceholderMarkupForProvider($info['providerName'], $url);
}
else {
$white_list = $social_embed_helper->getPatterns();

// Check if the URL provided is from a whitelisted site.
foreach ($white_list as $item) {
// Testing pattern.
$testing_pattern = '/' . $item . '/';

// For "Facebook" and "Instagram" links embedding require to
// set up an application. Sometimes it doesn't have a sake
// to do it. Let's return a link without embedding
// if the application isn't connected.
if (
preg_match("/facebook.com\/|instagram.com\//i", $result_link) &&
(
!\Drupal::config('url_embed.settings')->get('facebook_app_id') ||
!\Drupal::config('url_embed.settings')->get('facebook_app_secret')
)
) {
return $result_link;
}

// Twitter: Return not supported for embedding
// specific link as a link.
if (preg_match('(twitter.com\/(.*)\/status\/(.*)\/(.*))', $result_link)) {
return $result_link;
}

// Check if it matches.
if (preg_match($testing_pattern, $url_for_processing)) {
$result_link = '<drupal-url data-embed-url="' . $full_url . '"></drupal-url>';
}
// For "Facebook" and "Instagram" links embedding require to
// set up an application. Sometimes it doesn't have a sake
// to do it. Let's return a link without embedding
// if the application isn't connected.
if (
preg_match("/facebook.com\/|instagram.com\//i", $result_link) &&
(
!\Drupal::config('url_embed.settings')->get('facebook_app_id') ||
!\Drupal::config('url_embed.settings')->get('facebook_app_secret')
)
) {
return $result_link;
}

$result_link = '<drupal-url data-embed-url="' . $url . '"></drupal-url>';
}
}

Expand All @@ -334,7 +219,7 @@ public static function socialUrlToEmbed(array $match, string $full_url_prefix =
$logger = \Drupal::logger('social_embed');
Error::logException($logger, $e);

return $match[1];
return $url;
}
}

Expand Down
Loading

0 comments on commit 7ae1cb5

Please sign in to comment.