From 7ae1cb5fa9912b8d172cbf5960a9c3dedf99a2c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thiago=20R=C3=A9gis?= Date: Sat, 10 Aug 2024 11:57:53 -0300 Subject: [PATCH] PROD-29813: Security fixes for embeddable links --- .../SocialEmbedConvertUrlToEmbedFilter.php | 281 ++++++------------ .../src/Service/SocialEmbedHelper.php | 44 +-- .../src/SocialEmbedConfigOverride.php | 3 - 3 files changed, 109 insertions(+), 219 deletions(-) diff --git a/modules/social_features/social_embed/src/Plugin/Filter/SocialEmbedConvertUrlToEmbedFilter.php b/modules/social_features/social_embed/src/Plugin/Filter/SocialEmbedConvertUrlToEmbedFilter.php index 431a893da71..92437139005 100644 --- a/modules/social_features/social_embed/src/Plugin/Filter/SocialEmbedConvertUrlToEmbedFilter.php +++ b/modules/social_features/social_embed/src/Plugin/Filter/SocialEmbedConvertUrlToEmbedFilter.php @@ -64,215 +64,117 @@ public static function create(ContainerInterface $container, array $configuratio * {@inheritdoc} */ public function process($text, $langcode): FilterProcessResult { - // Check for whitelisted URL. - if ($this->embedHelper->whiteList($text)) { - $result = new FilterProcessResult(static::convertUrls($text)); - } - else { - $result = new FilterProcessResult($text); - } + // Convert Urls. + $pattern = $this->embedHelper->getCombinedPatterns(); + $result = new FilterProcessResult($this->convertUrls($text, $pattern)); + // Not whitelisted is return the string as is. // Also, add the required dependencies and cache tags. return $this->embedHelper->addDependencies($result, 'social_embed:filter.convert_url'); } /** - * Replaces appearances of supported URLs with placeholder embed elements. + * Replaces appearances of supported URLs with embed elements. * * Logic of this function is copied from _filter_url() and slightly adopted * for our use case. _filter_url() is unfortunately not general enough to * re-use it. * - * If something wrong here, you can reference to _filter_url() function. - * * @param string $text * Text to be processed. + * @param string $pattern + * URL pattern to match. * - * @return mixed + * @return string * Processed text. */ - public static function convertUrls(string $text): mixed { - // Store the current text in case any of the preg_* functions fail. - $saved_text = $text; - + public function convertUrls(string $text, string $pattern) { // Tags to skip and not recurse into. $ignore_tags = 'a|script|style|code|pre'; - // Prepare protocols pattern for absolute URLs. - // \Drupal\Component\Utility\UrlHelper::stripDangerousProtocols() - // will replace any bad protocols with HTTP, so we need to support the - // identical list. - // While '//' is technically optional for MAILTO only, we cannot cleanly - // differ between protocols here without hard-coding MAILTO, so '//' is - // optional for all protocols. - // @see \Drupal\Component\Utility\UrlHelper::stripDangerousProtocols() - $protocols = \Drupal::getContainer()->getParameter('filter_protocols'); - assert(is_string($protocols) || is_array($protocols), "Invalid filter_protocols parameter configuration, must be either array or string"); - $protocols = is_array($protocols) ? implode(':(?://)?|', $protocols) . ':(?://)?' : $protocols; - - $valid_url_path_characters = "[\p{L}\p{M}\p{N}!\*\';:=\+,\.\$\/%#\[\]\-_~@&]"; - - // Allow URL paths to contain balanced parens - // 1. Used in Wikipedia URLs like /Primer_(film) - // 2. Used in IIS sessions like /S(dfd346)/. - $valid_url_balanced_parens = '\(' . $valid_url_path_characters . '+\)'; - - // Valid end-of-path characters (so /foo. does not gobble the period). - // 1. Allow =&# for empty URL parameters and other URL-join artifacts. - $valid_url_ending_characters = '[\p{L}\p{M}\p{N}:_+~#=/]|(?:' . $valid_url_balanced_parens . ')'; - - $valid_url_query_chars = '[a-zA-Z0-9!?\*\'@\(\);:&=\+\$\/%#\[\]\-_\.,~|]'; - $valid_url_query_ending_chars = '[a-zA-Z0-9_&=#\/]'; - - // Full path and allow @ in a url, but only in the middle. Catch things - // like http://example.com/@user/ - $valid_url_path = '(?:(?:' . $valid_url_path_characters . '*(?:' . $valid_url_balanced_parens . $valid_url_path_characters . '*)*' . $valid_url_ending_characters . ')|(?:@' . $valid_url_path_characters . '+\/))'; - - // Prepare domain name pattern. - // The ICANN seems to be on track towards accepting more diverse top level - // domains, so this pattern has been "future-proofed" to allow for TLDs - // of length 2-64. - $domain = '(?:[\p{L}\p{M}\p{N}._+-]+\.)?[\p{L}\p{M}]{2,64}\b'; - $ip = '(?:[0-9]{1,3}\.){3}[0-9]{1,3}'; - $auth = '[\p{L}\p{M}\p{N}:%_+*~#?&=.,/;-]+@'; - $trail = '(' . $valid_url_path . '*)?(\\?' . $valid_url_query_chars . '*' . $valid_url_query_ending_chars . ')?'; - - // Match absolute URLs. - $url_pattern = "(?:$auth)?(?:$domain|$ip)\/?(?:$trail)?"; - $pattern = "`((?:$protocols)(?:$url_pattern))`u"; - $tasks['socialFullUrlToEmbed'] = $pattern; - - // Match www. domains. - $url_pattern = "www\.(?:$domain)\/?(?:$trail)?"; - $pattern = "`($url_pattern)`u"; - $tasks['socialPartialUrlToEmbed'] = $pattern; - - // Match short whitelisted domains (without http/https and www.). - $url_pattern = implode("|", \Drupal::service('social_embed.helper_service')->getPatterns()); - $pattern = "`($url_pattern)`u"; - $tasks['socialShortUrlToEmbed'] = $pattern; - - // Each type of URL needs to be processed separately. The text is joined and - // re-split after each task, since all injected HTML tags must be correctly - // protected before the next task. - foreach ($tasks as $task => $pattern) { - // HTML comments need to be handled separately, as they may contain HTML - // markup, especially a '>'. Therefore, remove all comment contents and - // add them back later. - _filter_url_escape_comments([], TRUE); - - $text = ($text !== NULL) ? preg_replace_callback('``s', '_filter_url_escape_comments', $text) : $text; - - // Split at all tags; ensures that no tags or attributes are processed. - $chunks = ($text !== NULL) ? preg_split('/(<.+?>)/is', $text, -1, PREG_SPLIT_DELIM_CAPTURE) : []; - - // Do not attempt to convert links into URLs if preg_split() fails. - if ($chunks !== FALSE) { - // PHP ensures that the array consists of alternating delimiters and - // literals, and begins and ends with a literal (inserting NULL as - // required). Therefore, the first chunk is always text: - $chunk_type = 'text'; - // If a tag of $ignore_tags is found, it is stored in $open_tag and only - // removed when the closing tag is found. Until the closing tag is - // found,no replacements are made. - $open_tag = ''; - for ($i = 0; $i < count($chunks); $i++) { - if ($chunk_type == 'text') { - // Only process this text if there are no unclosed $ignore_tags. - if ($open_tag == '') { - // If there is a match, inject a link into this chunk via the - // callback function contained in $task. - if ($chunks[$i] !== NULL) { - $chunks[$i] = preg_replace_callback($pattern, [static::class, $task], $chunks[$i]); - } + // Split at all tags; ensures that no tags or attributes are processed. + $chunks = is_null($text) ? [''] : preg_split('/(<.+?>)/is', $text, -1, PREG_SPLIT_DELIM_CAPTURE); + + // Do not attempt to convert links into URLs if preg_split() fails. + if ($chunks !== FALSE) { + // PHP ensures that the array consists of alternating delimiters and + // literals, and begins and ends with a literal (inserting NULL as + // required). Therefore, the first chunk is always text: + $chunk_type = 'text'; + // If a tag of $ignore_tags is found, it is stored in $open_tag and only + // removed when the closing tag is found. Until the closing tag is found, + // no replacements are made. + $open_tag = ''; + for ($i = 0; $i < count($chunks); $i++) { + if ($chunk_type == 'text') { + // Only process this text if there are no unclosed $ignore_tags. + if ($open_tag == '') { + // If there is a match, inject a link into this chunk via the + // callback function contained in $task. + $chunks[$i] = preg_replace_callback( + $pattern, + function ($match) { + // Replace URL by the embed code. + return self::socialUrlToEmbed($match[1]); + }, + $chunks[$i] + ); + } + // Text chunk is done, so next chunk must be a tag. + $chunk_type = 'tag'; + } + else { + // Only process this tag if there are no unclosed $ignore_tags. + if ($open_tag == '') { + // Check whether this tag is contained in $ignore_tags. + if (preg_match("`<($ignore_tags)(?:\s|>)`i", $chunks[$i], $matches)) { + $open_tag = $matches[1]; } - // Text chunk is done, so next chunk must be a tag. - $chunk_type = 'tag'; } + // Otherwise, check whether this is the closing tag for $open_tag. else { - // Only process this tag if there are no unclosed $ignore_tags. - if ($open_tag == '') { - // Check whether this tag is contained in $ignore_tags. - if (preg_match("`<($ignore_tags)(?:\s|>)`i", $chunks[$i], $matches)) { - $open_tag = $matches[1]; - } - } - // Otherwise, check whether this is the closing tag for $open_tag. - else { - if (preg_match("`<\/$open_tag>`i", $chunks[$i], $matches)) { - $open_tag = ''; - } + if (preg_match("`<\/$open_tag>`i", $chunks[$i], $matches)) { + $open_tag = ''; } - // Tag chunk is done, so next chunk must be text. - $chunk_type = 'text'; } + // Tag chunk is done, so next chunk must be text. + $chunk_type = 'text'; } - - $text = implode($chunks); } - // Revert to the original comment contents. - _filter_url_escape_comments([], FALSE); - - if ($text !== NULL) { - $text = preg_replace_callback('``', '_filter_url_escape_comments', $text); - } + $text = implode($chunks); } - // If there is no text at this point revert to the previous text. - return strlen((string) $text) > 0 ? $text : $saved_text; - } - - /** - * Makes links out of absolute URLs. - * - * Callback for preg_replace_callback() within convertUrls(). - */ - public static function socialFullUrlToEmbed(array $match): string { - return self::socialUrlToEmbed($match); - } - - /** - * Makes links out of domain names starting with "www.". - * - * Callback for preg_replace_callback() within convertUrls(). - */ - public static function socialPartialUrlToEmbed(array $match): string { - return self::socialUrlToEmbed($match, 'https://'); - } - - /** - * Makes links out of domain names starting with domain name. - * - * Callback for preg_replace_callback() within convertUrls(). - */ - public static function socialShortUrlToEmbed(array $match): string { - return self::socialPartialUrlToEmbed($match); + return $text; } /** - * Process URL for embedding. + * Replace URL by the embed code. * - * @param array $match - * The array with matched strings. - * @param string $full_url_prefix - * The internet protocol - * Send your value (https:// or https://www.) for partial and short links. + * @param string $url + * The matched url. * * @return string * Processed link. */ - public static function socialUrlToEmbed(array $match, string $full_url_prefix = ''): string { + public static function socialUrlToEmbed(string $url): string { + // Add protocol if does not exist. + if (!preg_match('/^https?:\/\//', $url)) { + $url = 'https://' . $url; + } + try { - $url_for_processing = $match[1]; + $url_for_processing = $url; $social_embed_helper = \Drupal::service('social_embed.helper_service'); + // Decode URL. $url_for_processing = Html::decodeEntities($url_for_processing); + // Default URL for return. $result_link = $url_for_processing; + // Full URL with protocol (http, https etc.). - $full_url = $full_url_prefix . $url_for_processing; - $info = \Drupal::service('url_embed')->getUrlInfo($full_url); + $info = \Drupal::service('url_embed')->getUrlInfo($url); if ($info) { /** @var \Drupal\user\Entity\User $user */ @@ -288,41 +190,24 @@ public static function socialUrlToEmbed(array $match, string $full_url_prefix = ) ) { // Replace URL with consent button. - return $social_embed_helper->getPlaceholderMarkupForProvider($info['providerName'], $full_url); + return $social_embed_helper->getPlaceholderMarkupForProvider($info['providerName'], $url); } else { - $white_list = $social_embed_helper->getPatterns(); - - // Check if the URL provided is from a whitelisted site. - foreach ($white_list as $item) { - // Testing pattern. - $testing_pattern = '/' . $item . '/'; - - // For "Facebook" and "Instagram" links embedding require to - // set up an application. Sometimes it doesn't have a sake - // to do it. Let's return a link without embedding - // if the application isn't connected. - if ( - preg_match("/facebook.com\/|instagram.com\//i", $result_link) && - ( - !\Drupal::config('url_embed.settings')->get('facebook_app_id') || - !\Drupal::config('url_embed.settings')->get('facebook_app_secret') - ) - ) { - return $result_link; - } - - // Twitter: Return not supported for embedding - // specific link as a link. - if (preg_match('(twitter.com\/(.*)\/status\/(.*)\/(.*))', $result_link)) { - return $result_link; - } - - // Check if it matches. - if (preg_match($testing_pattern, $url_for_processing)) { - $result_link = ''; - } + // For "Facebook" and "Instagram" links embedding require to + // set up an application. Sometimes it doesn't have a sake + // to do it. Let's return a link without embedding + // if the application isn't connected. + if ( + preg_match("/facebook.com\/|instagram.com\//i", $result_link) && + ( + !\Drupal::config('url_embed.settings')->get('facebook_app_id') || + !\Drupal::config('url_embed.settings')->get('facebook_app_secret') + ) + ) { + return $result_link; } + + $result_link = ''; } } @@ -334,7 +219,7 @@ public static function socialUrlToEmbed(array $match, string $full_url_prefix = $logger = \Drupal::logger('social_embed'); Error::logException($logger, $e); - return $match[1]; + return $url; } } diff --git a/modules/social_features/social_embed/src/Service/SocialEmbedHelper.php b/modules/social_features/social_embed/src/Service/SocialEmbedHelper.php index 5c2fd2b3457..6c2ced667c4 100644 --- a/modules/social_features/social_embed/src/Service/SocialEmbedHelper.php +++ b/modules/social_features/social_embed/src/Service/SocialEmbedHelper.php @@ -90,30 +90,20 @@ public function addDependencies(FilterProcessResult $result, string $tag): Filte } /** - * Checks if item is on the whitelist. + * Checks if a given URL is whitelisted. * - * @param string $text - * The item to check for. + * @param string $url + * The URL to check. * * @return bool - * Return if the item is on the whitelist or not. + * Returns TRUE if the URL is whitelisted, otherwise FALSE. */ - public function whiteList(string $text): bool { + public function whiteList(string $url): bool { // Fetch allowed patterns. - $patterns = $this->getPatterns(); - - // Check if the URL provided is from a whitelisted site. - foreach ($patterns as $pattern) { - // Testing pattern. - $testing_pattern = '/' . $pattern . '/'; - - // Check if it matches. - if (preg_match($testing_pattern, $text)) { - return TRUE; - } - } + $patterns = $this->getCombinedPatterns(); - return FALSE; + // Check if the URL matches any of the allowed patterns. + return preg_match($patterns, $url) === 1; } /** @@ -132,6 +122,7 @@ public function getPatterns(): array { '(instagram.com\/p\/(.*))', '(open.spotify.com\/track\/(.*))', '(twitter.com\/(.*)\/status\/(.*))', + '(x.com\/(.*)\/status\/(.*))', '(vimeo.com\/\d{7,9})', '(youtube.com\/watch[?]v=(.*))', '(youtu.be\/(.*))', @@ -139,6 +130,23 @@ public function getPatterns(): array { ]; } + /** + * The combined version of the patterns. + * + * @return string + * The string of combined patterns. + */ + public function getCombinedPatterns(): string { + // Fetch allowed patterns. + $patterns = $this->getPatterns(); + + // Combine the patterns in a single string. + $combined_patterns = implode('|', $patterns); + + // Return the combined version of the patterns. + return "/\b(?:https?:\/\/)?(?:www\.)?($combined_patterns)/i"; + } + /** * Returns markup for a placeholder, based on the provider and the url. * diff --git a/modules/social_features/social_embed/src/SocialEmbedConfigOverride.php b/modules/social_features/social_embed/src/SocialEmbedConfigOverride.php index fbe01fd995f..6c8249c611d 100644 --- a/modules/social_features/social_embed/src/SocialEmbedConfigOverride.php +++ b/modules/social_features/social_embed/src/SocialEmbedConfigOverride.php @@ -117,9 +117,6 @@ protected function addFilterOverride($text_format, $convert_url, array &$overrid 'provider' => 'social_embed', 'status' => TRUE, 'weight' => (isset($filters['filter_url']['weight']) ? $filters['filter_url']['weight'] - 1 : 99), - 'settings' => [ - 'url_prefix' => '', - ], ]; if (isset($filters['filter_html'])) {