From 9915b836e62a0243bdfd429f31b3f791a07cd4e3 Mon Sep 17 00:00:00 2001
From: drmathias
Date: Sun, 27 Aug 2023 22:37:51 +0100
Subject: [PATCH] Improve rule matching to support query strings

---
 README.md                        |   1 +
 src/Robots.Txt.Parser/UrlRule.cs | 125 ++++++++++++--
 .../UrlRuleTests.cs              | 165 ++++++++++++++++--
 3 files changed, 269 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index b61c221..5865cc9 100644
--- a/README.md
+++ b/README.md
@@ -53,6 +53,7 @@ There is also the possibility to extend this library to support protocols other
 | RSS 2.0 feeds | ❌ | 0.8 |
 | Atom 0.3/1.0 feeds | ❌ | 0.8 |
 | Simple text sitemaps | ❌ | 0.5 |
+| Memory management (500 KiB parsing limit) | ✔️ | |
 | Caching support | ❌ | 0.3 |
 
 # Usage
diff --git a/src/Robots.Txt.Parser/UrlRule.cs b/src/Robots.Txt.Parser/UrlRule.cs
index dbc0178..5b3f313 100644
--- a/src/Robots.Txt.Parser/UrlRule.cs
+++ b/src/Robots.Txt.Parser/UrlRule.cs
@@ -1,5 +1,7 @@
+using System;
+using System.Collections.Generic;
 using System.Linq;
-using System.Web;
+using System.Text;
 
 namespace Robots.Txt.Parser;
 
@@ -10,6 +12,14 @@ namespace Robots.Txt.Parser;
 /// <param name="Pattern">URL path pattern</param>
 public record UrlRule(RuleType Type, UrlPathPattern Pattern);
 
+/// <summary>
+/// Robots.txt rule type
+/// </summary>
+public enum RuleType
+{
+    Allow, Disallow
+}
+
 public class UrlPathPattern
 {
     private readonly bool _matchSubPaths;
@@ -20,8 +30,8 @@ private UrlPathPattern(string value)
         Length = value.Length;
         if (value.EndsWith('$')) value = value[..^1];
         else _matchSubPaths = true;
-        _patternParts = value.Split('*', System.StringSplitOptions.None)
-                             .Select(part => HttpUtility.UrlDecode(part.Replace("%2F", "%252F")))
+        _patternParts = value.Split('*', StringSplitOptions.None)
+                             .Select(PathHelpers.PreparePathForComparison)
                              .ToArray();
     }
 
@@ -52,20 +62,113 @@ public class UrlPath
 {
     internal readonly string _value;
 
-    private UrlPath(string value)
-    {
-        _value = HttpUtility.UrlDecode(value.Replace("%2F", "%252F"));
-    }
+    private UrlPath(string value) => _value = PathHelpers.PreparePathForComparison(value);
 
     public int Length => _value.Length;
 
     public static implicit operator UrlPath(string value) => new(value);
 }
 
 /// <summary>
-/// Robots.txt rule type
+/// Normalizes robots.txt rule paths and URL paths so that they can be compared
 /// </summary>
-public enum RuleType
+static class PathHelpers
 {
-    Allow, Disallow
-}
\ No newline at end of file
+    private const string UpperCaseHexDigits = "0123456789ABCDEF";
+
+    // gen-delims and sub-delims, as defined by RFC 3986 section 2.2
+    private static readonly HashSet<char> _reservedChars = new()
+    {
+        ':' , '/' , '?' , '#' , '[' , ']' , '@',
+        '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '='
+    };
+
+    /// <summary>
+    /// Normalizes a rule path or URL path (with optional query string) for comparison
+    /// </summary>
+    /// <param name="value">Raw path, as written in robots.txt or as taken from a URL</param>
+    /// <returns>Path with normalized percent-encoding and without any fragment that follows a query string</returns>
+    internal static string PreparePathForComparison(string value)
+    {
+        var pathAndTheRest = value.Split('?', 2);
+        var normalizedPathBuilder = new StringBuilder(value.Length);
+
+        // forward slashes delimit path segments, so they are only kept as-is within the path
+        AppendNormalized(normalizedPathBuilder, pathAndTheRest[0], keepForwardSlash: true);
+        if (pathAndTheRest.Length == 1) return normalizedPathBuilder.ToString();
+
+        // fragment can be discarded for path rule matching
+        var query = pathAndTheRest[1].Split('#', 2)[0];
+        normalizedPathBuilder.Append('?');
+        AppendNormalized(normalizedPathBuilder, query, keepForwardSlash: false);
+
+        return normalizedPathBuilder.ToString();
+    }
+
+    private static void AppendNormalized(StringBuilder builder, string value, bool keepForwardSlash)
+    {
+        for (int i = 0; i < value.Length; i++)
+        {
+            var character = value[i];
+
+            if (character == '%'
+                && i < value.Length - 2
+                && char.IsAsciiHexDigit(value[i + 1])
+                && char.IsAsciiHexDigit(value[i + 2]))
+            {
+                /*
+                If a percent-encoded ASCII octet is encountered in the URI, it MUST be unencoded prior to comparison,
+                unless it is a reserved character in the URI as defined by [RFC3986] or the character is outside the unreserved character range.
+                */
+                var decoded = (char)((Uri.FromHex(value[i + 1]) << 4) | Uri.FromHex(value[i + 2]));
+                if (IsUnreserved(decoded))
+                {
+                    builder.Append(decoded);
+                }
+                else
+                {
+                    // percent-encoding is case-insensitive, so normalize the casing
+                    builder.Append('%');
+                    builder.Append(char.ToUpperInvariant(value[i + 1]));
+                    builder.Append(char.ToUpperInvariant(value[i + 2]));
+                }
+
+                i += 2;
+                continue;
+            }
+
+            if ((keepForwardSlash && character == '/') || (char.IsAscii(character) && !_reservedChars.Contains(character)))
+            {
+                builder.Append(character);
+                continue;
+            }
+
+            /*
+            Octets in the URI and robots.txt paths outside the range of the ASCII coded character set, and those in the
+            reserved range defined by [RFC3986], MUST be percent-encoded as defined by [RFC3986] prior to comparison.
+            */
+            // a surrogate pair is a single code point, so both of its chars are encoded together
+            var charCount = char.IsSurrogatePair(value, i) ? 2 : 1;
+            AppendPercentEncodedUtf8(builder, value.AsSpan(i, charCount));
+            i += charCount - 1;
+        }
+    }
+
+    private static void AppendPercentEncodedUtf8(StringBuilder builder, ReadOnlySpan<char> characters)
+    {
+        // Uri.HexEscape only supports characters up to U+00FF, so the UTF-8 octets are escaped individually
+        Span<byte> utf8Octets = stackalloc byte[4];
+        var octetCount = Encoding.UTF8.GetBytes(characters, utf8Octets);
+
+        for (int i = 0; i < octetCount; i++)
+        {
+            builder.Append('%');
+            builder.Append(UpperCaseHexDigits[utf8Octets[i] >> 4]);
+            builder.Append(UpperCaseHexDigits[utf8Octets[i] & 0xF]);
+        }
+    }
+
+    // ALPHA / DIGIT / "-" / "." / "_" / "~", as defined by RFC 3986 section 2.3
+    private static bool IsUnreserved(char character) =>
+        char.IsAsciiLetterOrDigit(character) || character is '-' or '.' or '_' or '~';
+}
diff --git a/tests/Robots.Txt.Parser.Tests.Unit/UrlRuleTests.cs b/tests/Robots.Txt.Parser.Tests.Unit/UrlRuleTests.cs
index d4d7e20..14c6987 100644
--- a/tests/Robots.Txt.Parser.Tests.Unit/UrlRuleTests.cs
+++ b/tests/Robots.Txt.Parser.Tests.Unit/UrlRuleTests.cs
@@ -84,7 +84,7 @@ public void Matches_SubdirectoryMatch_ReturnTrue()
     }
 
     [Fact]
-    public void Matches_OctectBothLowercase_ReturnTrue()
+    public void Matches_PercentEncodedCharacterBothLowercase_ReturnTrue()
     {
         // Arrange
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path%3c");
@@ -97,7 +97,7 @@ public void Matches_OctectBothLowercase_ReturnTrue()
     }
 
     [Fact]
-    public void Matches_OctectBothUppercase_ReturnTrue()
+    public void Matches_PercentEncodedCharacterBothUppercase_ReturnTrue()
     {
         // Arrange
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path%3C");
@@ -110,7 +110,7 @@ public void Matches_OctectBothUppercase_ReturnTrue()
     }
 
     [Fact]
-    public void Matches_OctectRuleLowercasePathUppercase_ReturnTrue()
+    public void Matches_PercentEncodedCharacterRuleLowercasePathUppercase_ReturnTrue()
     {
         // Arrange
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path%3c");
@@ -123,7 +123,7 @@ public void Matches_OctectRuleLowercasePathUppercase_ReturnTrue()
     }
 
     [Fact]
-    public void Matches_OctectRuleUppercasePathLowercase_ReturnTrue()
+    public void Matches_PercentEncodedCharacterRuleUppercasePathLowercase_ReturnTrue()
     {
         // Arrange
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path%3C");
@@ -136,7 +136,7 @@ public void Matches_OctectRuleUppercasePathLowercase_ReturnTrue()
     }
 
     [Fact]
-    public void Matches_OctectForwardSlashBothUrl_ReturnTrue()
+    public void Matches_PercentEncodedCharacterForwardSlashBothUrl_ReturnTrue()
     {
         // Arrange
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path%2F");
@@ -149,7 +149,7 @@ public void Matches_OctectForwardSlashBothUrl_ReturnTrue()
     }
 
     [Fact]
-    public void Matches_OctectForwardSlashOnlyInRule_ReturnFalse()
+    public void Matches_PercentEncodedCharacterForwardSlashOnlyInRule_ReturnFalse()
     {
         // Arrange
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path%2F");
@@ -162,7 +162,7 @@ public void Matches_OctectForwardSlashOnlyInRule_ReturnFalse()
     }
 
     [Fact]
-    public void Matches_OctectForwardSlashOnlyInPath_ReturnFalse()
+    public void Matches_PercentEncodedCharacterForwardSlashOnlyInPath_ReturnFalse()
     {
         // Arrange
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path/");
@@ -175,7 +175,85 @@ public void Matches_OctectForwardSlashOnlyInPath_ReturnFalse()
     }
 
     [Fact]
-    public void Matches_OctectNotForwardSlashLowercaseOnlyInRule_ReturnTrue()
+    public void Matches_PercentEncodedCharacterAsteriskBothUrl_ReturnTrue()
+    {
+        // Arrange
+        var urlRule = new UrlRule(RuleType.Disallow, "/some%2Apath");
+
+        // Act
+        var matches = urlRule.Pattern.Matches("/some%2Apath");
+
+        // Assert
+        matches.Should().Be(true);
+    }
+
+    [Fact]
+    public void Matches_PercentEncodedCharacterAsteriskOnlyInRule_ReturnTrue()
+    {
+        // Arrange
+        var urlRule = new UrlRule(RuleType.Disallow, "/some%2Apath");
+
+        // Act
+        var matches = urlRule.Pattern.Matches("/some*path");
+
+        // Assert
+        matches.Should().Be(true);
+    }
+
+    [Fact]
+    public void Matches_PercentEncodedCharacterAsteriskOnlyInPath_ReturnTrue()
+    {
+        // Arrange
+        var urlRule = new UrlRule(RuleType.Disallow, "/some*path");
+
+        // Act
+        var matches = urlRule.Pattern.Matches("/some%2Apath");
+
+        // Assert
+        matches.Should().Be(true);
+    }
+
+    [Fact]
+    public void Matches_PercentEncodedCharacterReservedBothUrl_ReturnTrue()
+    {
+        // Arrange
+        var urlRule = new UrlRule(RuleType.Disallow, "/some%24path");
+
+        // Act
+        var matches = urlRule.Pattern.Matches("/some%24path");
+
+        // Assert
+        matches.Should().Be(true);
+    }
+
+    [Fact]
+    public void Matches_PercentEncodedCharacterReservedOnlyInRule_ReturnTrue()
+    {
+        // Arrange
+        var urlRule = new UrlRule(RuleType.Disallow, "/some%24path");
+
+        // Act
+        var matches = urlRule.Pattern.Matches("/some$path");
+
+        // Assert
+        matches.Should().Be(true);
+    }
+
+    [Fact]
+    public void Matches_PercentEncodedCharacterReservedOnlyInPath_ReturnTrue()
+    {
+        // Arrange
+        var urlRule = new UrlRule(RuleType.Disallow, "/some$path");
+
+        // Act
+        var matches = urlRule.Pattern.Matches("/some%24path");
+
+        // Assert
+        matches.Should().Be(true);
+    }
+
+    [Fact]
+    public void Matches_PercentEncodedCharacterNotSpecialLowercaseOnlyInRule_ReturnTrue()
     {
         // Arrange
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path%7e");
@@ -188,7 +266,7 @@ public void Matches_OctectNotForwardSlashLowercaseOnlyInRule_ReturnTrue()
     }
 
     [Fact]
-    public void Matches_OctectNotForwardSlashLowercaseOnlyInPath_ReturnTrue()
+    public void Matches_PercentEncodedCharacterNotSpecialLowercaseOnlyInPath_ReturnTrue()
     {
         // Arrange
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path~");
@@ -201,7 +279,7 @@ public void Matches_OctectNotForwardSlashLowercaseOnlyInPath_ReturnTrue()
     }
 
     [Fact]
-    public void Matches_OctectNotForwardSlashUppercaseOnlyInRule_ReturnTrue()
+    public void Matches_PercentEncodedCharacterNotSpecialUppercaseOnlyInRule_ReturnTrue()
     {
         // Arrange
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path%7E");
@@ -214,7 +292,7 @@ public void Matches_OctectNotForwardSlashUppercaseOnlyInRule_ReturnTrue()
     }
 
     [Fact]
-    public void Matches_OctectNotForwardSlashUppercaseOnlyInPath_ReturnTrue()
+    public void Matches_PercentEncodedCharacterNotSpecialUppercaseOnlyInPath_ReturnTrue()
     {
         // Arrange
         var urlRule = new UrlRule(RuleType.Disallow, "/some/path~");
@@ -225,4 +303,69 @@ public void Matches_OctectNotForwardSlashUppercaseOnlyInPath_ReturnTrue()
         // Assert
         matches.Should().Be(true);
     }
+
+    [Fact]
+    public void Matches_UnescapedQueryStringInRuleAndPath_ReturnTrue()
+    {
+        // Arrange
+        var urlRule = new UrlRule(RuleType.Disallow, "/foo/bar?baz=https://foo.bar");
+
+        // Act
+        var matches = urlRule.Pattern.Matches("/foo/bar?baz=https://foo.bar");
+
+        // Assert
+        matches.Should().Be(true);
+    }
+
+    [Fact]
+    public void Matches_UnescapedQueryStringInRuleButPathEscaped_ReturnTrue()
+    {
+        // Arrange
+        var urlRule = new UrlRule(RuleType.Disallow, "/foo/bar?baz=https://foo.bar");
+
+        // Act
+        var matches = urlRule.Pattern.Matches("/foo/bar?baz=https%3A%2F%2Ffoo.bar");
+
+        // Assert
+        matches.Should().Be(true);
+    }
+
+    [Fact]
+    public void Matches_UnescapedQueryStringInPathButRuleEscaped_ReturnTrue()
+    {
+        // Arrange
+        var urlRule = new UrlRule(RuleType.Disallow, "/foo/bar?baz=https%3A%2F%2Ffoo.bar");
+
+        // Act
+        var matches = urlRule.Pattern.Matches("/foo/bar?baz=https://foo.bar");
+
+        // Assert
+        matches.Should().Be(true);
+    }
+
+    [Fact]
+    public void Matches_PercentEncodedQueryStringRuleLowercasePathUppercase_ReturnTrue()
+    {
+        // Arrange
+        var urlRule = new UrlRule(RuleType.Disallow, "/foo/bar?baz=https%3a%2f%2ffoo.bar");
+
+        // Act
+        var matches = urlRule.Pattern.Matches("/foo/bar?baz=https%3A%2F%2Ffoo.bar");
+
+        // Assert
+        matches.Should().Be(true);
+    }
+
+    [Fact]
+    public void Matches_NonAsciiCharacterInRuleButPathPercentEncoded_ReturnTrue()
+    {
+        // Arrange
+        var urlRule = new UrlRule(RuleType.Disallow, "/foo/bar/ツ");
+
+        // Act
+        var matches = urlRule.Pattern.Matches("/foo/bar/%E3%83%84");
+
+        // Assert
+        matches.Should().Be(true);
+    }
 }