Skip to content

Commit

Permalink
Improve rule matching to support query strings
Browse files Browse the repository at this point in the history
  • Loading branch information
drmathias committed Aug 27, 2023
1 parent 65bbc5a commit 9915b83
Show file tree
Hide file tree
Showing 3 changed files with 250 additions and 24 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ There is also the possibility to extend this library to support protocols other
| RSS 2.0 feeds || 0.8 |
| Atom 0.3/1.0 feeds || 0.8 |
| Simple text sitemaps || 0.5 |
| Memory management (500 KiB parsing limit) | ✔️ | |
| Caching support || 0.3 |

# Usage
Expand Down
134 changes: 121 additions & 13 deletions src/Robots.Txt.Parser/UrlRule.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Text;

namespace Robots.Txt.Parser;

Expand All @@ -10,6 +12,14 @@ namespace Robots.Txt.Parser;
/// <param name="Pattern">URL path pattern</param>
public record UrlRule(RuleType Type, UrlPathPattern Pattern);

/// <summary>
/// Kind of rule that can appear in a robots.txt file.
/// </summary>
public enum RuleType
{
    /// <summary>An <c>Allow</c> rule.</summary>
    Allow,

    /// <summary>A <c>Disallow</c> rule.</summary>
    Disallow,
}

public class UrlPathPattern
{
private readonly bool _matchSubPaths;
Expand All @@ -20,8 +30,8 @@ private UrlPathPattern(string value)
Length = value.Length;
if (value.EndsWith('$')) value = value[..^1];
else _matchSubPaths = true;
_patternParts = value.Split('*', System.StringSplitOptions.None)
.Select(part => HttpUtility.UrlDecode(part.Replace("%2F", "%252F")))
_patternParts = value.Split('*', StringSplitOptions.None)
.Select(PathHelpers.PreparePathForComparison)
.ToArray();
}

Expand Down Expand Up @@ -52,20 +62,118 @@ public class UrlPath
{
internal readonly string _value;

private UrlPath(string value)
{
_value = HttpUtility.UrlDecode(value.Replace("%2F", "%252F"));
}
private UrlPath(string value) => _value = PathHelpers.PreparePathForComparison(value);

public int Length => _value.Length;

public static implicit operator UrlPath(string value) => new(value);
}

/// <summary>
/// Robots.txt rule type
/// </summary>
public enum RuleType
/// <summary>
/// Normalises robots.txt rule patterns and URL paths into a canonical percent-encoded
/// form so that the two can be compared character by character.
/// </summary>
static class PathHelpers
{
    private const string UpperHexDigits = "0123456789ABCDEF";

    // Reserved characters (gen-delims and sub-delims) as defined by RFC 3986, section 2.2.
    private static readonly HashSet<char> _reservedChars = new()
    {
        ':' , '/' , '?' , '#' , '[' , ']' , '@',
        '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '='
    };

    /// <summary>
    /// Produces the canonical form of a path (optionally followed by a query string) used for rule matching.
    /// </summary>
    /// <param name="value">Raw path or pattern fragment; may already contain percent-encoded octets.</param>
    /// <returns>
    /// The value with reserved and non-ASCII characters percent-encoded (upper-case hex, UTF-8 octets)
    /// and with percent-encoded unreserved characters decoded.
    /// </returns>
    internal static string PreparePathForComparison(string value)
    {
        /*
          Octets in the URI and robots.txt paths outside the range of the ASCII coded character set, and those in the
          reserved range defined by [RFC3986], MUST be percent-encoded as defined by [RFC3986] prior to comparison.
        */
        var encodedPath = EncodeUrlPath(value);

        /*
          If a percent-encoded ASCII octet is encountered in the URI, it MUST be unencoded prior to comparison,
          unless it is a reserved character in the URI as defined by [RFC3986] or the character is outside the unreserved character range.
        */
        return DecodePercentEncodedUnreservedCharacters(encodedPath);
    }

    /// <summary>
    /// Percent-encodes the path and, when present, the query string of <paramref name="value"/>.
    /// Forward slashes are kept literal in the path but encoded inside the query string.
    /// Anything after a '#' inside the query string is dropped.
    /// </summary>
    private static string EncodeUrlPath(string value)
    {
        var pathAndTheRest = value.Split('?', 2);
        var encodedUrlPathBuilder = new StringBuilder(value.Length);

        AppendPercentEncoded(encodedUrlPathBuilder, pathAndTheRest[0], keepForwardSlash: true);

        if (pathAndTheRest.Length == 1) return encodedUrlPathBuilder.ToString();

        // fragment can be discarded for path rule matching
        var query = pathAndTheRest[1].Split('#', 2)[0];
        encodedUrlPathBuilder.Append('?');

        AppendPercentEncoded(encodedUrlPathBuilder, query, keepForwardSlash: false);

        return encodedUrlPathBuilder.ToString();
    }

    /// <summary>
    /// Appends <paramref name="segment"/> to <paramref name="builder"/>, percent-encoding reserved and
    /// non-ASCII characters and upper-casing the hex digits of existing percent-encoded octets.
    /// </summary>
    /// <param name="builder">Destination for the encoded text.</param>
    /// <param name="segment">Path or query text to encode.</param>
    /// <param name="keepForwardSlash">When true, '/' is copied literally instead of being encoded.</param>
    private static void AppendPercentEncoded(StringBuilder builder, string segment, bool keepForwardSlash)
    {
        // A single Unicode scalar value needs at most four UTF-8 octets.
        Span<byte> utf8 = stackalloc byte[4];

        for (var i = 0; i < segment.Length; i++)
        {
            var character = segment[i];

            // Copy octets that are already percent-encoded, normalising the hex digit casing
            // so that "%3c" and "%3C" compare equal.
            if (character == '%'
                && i < segment.Length - 2
                && char.IsAsciiHexDigit(segment[i + 1])
                && char.IsAsciiHexDigit(segment[i + 2]))
            {
                builder.Append('%');
                builder.Append(char.ToUpperInvariant(segment[i + 1]));
                builder.Append(char.ToUpperInvariant(segment[i + 2]));
                i += 2;
                continue;
            }

            if ((keepForwardSlash && character == '/')
                || (char.IsAscii(character) && !_reservedChars.Contains(character)))
            {
                builder.Append(character);
                continue;
            }

            // Encode as UTF-8 octets. Uri.HexEscape cannot be used here: it throws for
            // characters above U+00FF and emits a single Latin-1 octet for U+0080..U+00FF.
            if (!Rune.TryGetRuneAt(segment, i, out var rune))
            {
                // Unpaired surrogate: encode the Unicode replacement character instead.
                rune = Rune.ReplacementChar;
            }
            else
            {
                // Skip the low surrogate of a pair that has just been consumed.
                i += rune.Utf16SequenceLength - 1;
            }

            var octetCount = rune.EncodeToUtf8(utf8);
            for (var octetIndex = 0; octetIndex < octetCount; octetIndex++)
            {
                var octet = utf8[octetIndex];
                builder.Append('%');
                builder.Append(UpperHexDigits[octet >> 4]);
                builder.Append(UpperHexDigits[octet & 0xF]);
            }
        }
    }

    /// <summary>
    /// Decodes percent-encoded octets that represent RFC 3986 unreserved characters
    /// (ASCII letters, digits, '-', '_', '.', '~'); every other octet is left encoded.
    /// </summary>
    /// <remarks>
    /// The input is scanned exactly once, so text produced by decoding one octet can never be
    /// combined with neighbouring characters and decoded a second time.
    /// </remarks>
    private static string DecodePercentEncodedUnreservedCharacters(string value)
    {
        if (!value.Contains('%')) return value;

        var decodedBuilder = new StringBuilder(value.Length);

        for (var i = 0; i < value.Length; i++)
        {
            var character = value[i];

            if (character == '%'
                && i < value.Length - 2
                && char.IsAsciiHexDigit(value[i + 1])
                && char.IsAsciiHexDigit(value[i + 2]))
            {
                var decoded = Convert.ToChar(Convert.ToUInt32(value.Substring(i + 1, 2), 16));

                if (IsUnreserved(decoded)) decodedBuilder.Append(decoded);
                else decodedBuilder.Append(value, i, 3);

                i += 2;
                continue;
            }

            decodedBuilder.Append(character);
        }

        return decodedBuilder.ToString();
    }

    /// <summary>
    /// Returns true for RFC 3986 unreserved characters: A-Z, a-z, 0-9, '-', '_', '.', '~'.
    /// </summary>
    private static bool IsUnreserved(char character) =>
        char.IsAsciiLetterOrDigit(character) || character is '-' or '_' or '.' or '~';
}
139 changes: 128 additions & 11 deletions tests/Robots.Txt.Parser.Tests.Unit/UrlRuleTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ public void Matches_SubdirectoryMatch_ReturnTrue()
}

[Fact]
public void Matches_OctectBothLowercase_ReturnTrue()
public void Matches_PercentEncodedCharacterBothLowercase_ReturnTrue()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path%3c");
Expand All @@ -97,7 +97,7 @@ public void Matches_OctectBothLowercase_ReturnTrue()
}

[Fact]
public void Matches_OctectBothUppercase_ReturnTrue()
public void Matches_PercentEncodedCharacterBothUppercase_ReturnTrue()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path%3C");
Expand All @@ -110,7 +110,7 @@ public void Matches_OctectBothUppercase_ReturnTrue()
}

[Fact]
public void Matches_OctectRuleLowercasePathUppercase_ReturnTrue()
public void Matches_PercentEncodedCharacterRuleLowercasePathUppercase_ReturnTrue()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path%3c");
Expand All @@ -123,7 +123,7 @@ public void Matches_OctectRuleLowercasePathUppercase_ReturnTrue()
}

[Fact]
public void Matches_OctectRuleUppercasePathLowercase_ReturnTrue()
public void Matches_PercentEncodedCharacterRuleUppercasePathLowercase_ReturnTrue()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path%3C");
Expand All @@ -136,7 +136,7 @@ public void Matches_OctectRuleUppercasePathLowercase_ReturnTrue()
}

[Fact]
public void Matches_OctectForwardSlashBothUrl_ReturnTrue()
public void Matches_PercentEncodedCharacterForwardSlashBothUrl_ReturnTrue()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path%2F");
Expand All @@ -149,7 +149,7 @@ public void Matches_OctectForwardSlashBothUrl_ReturnTrue()
}

[Fact]
public void Matches_OctectForwardSlashOnlyInRule_ReturnFalse()
public void Matches_PercentEncodedCharacterForwardSlashOnlyInRule_ReturnFalse()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path%2F");
Expand All @@ -162,7 +162,7 @@ public void Matches_OctectForwardSlashOnlyInRule_ReturnFalse()
}

[Fact]
public void Matches_OctectForwardSlashOnlyInPath_ReturnFalse()
public void Matches_PercentEncodedCharacterForwardSlashOnlyInPath_ReturnFalse()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path/");
Expand All @@ -175,7 +175,85 @@ public void Matches_OctectForwardSlashOnlyInPath_ReturnFalse()
}

[Fact]
public void Matches_OctectNotForwardSlashLowercaseOnlyInRule_ReturnTrue()
public void Matches_PercentEncodedCharacterAsteriskBothUrl_ReturnTrue()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some%2Apath");

// Act
var matches = urlRule.Pattern.Matches("/some%2Apath");

// Assert
matches.Should().Be(true);
}

[Fact]
public void Matches_PercentEncodedCharacterAsteriskOnlyInRule_ReturnTrue()
{
    // Arrange: the rule carries a percent-encoded asterisk (%2A)
    var pattern = new UrlRule(RuleType.Disallow, "/some%2Apath").Pattern;

    // Act: the candidate path contains the raw asterisk
    var isMatch = pattern.Matches("/some*path");

    // Assert
    isMatch.Should().Be(true);
}

[Fact]
public void Matches_PercentEncodedCharacterAsteriskOnlyInPath_ReturnTrue()
{
    // Arrange: the rule contains a raw asterisk
    var pattern = new UrlRule(RuleType.Disallow, "/some*path").Pattern;

    // Act: the candidate path carries the percent-encoded form (%2A)
    var isMatch = pattern.Matches("/some%2Apath");

    // Assert
    isMatch.Should().Be(true);
}

[Fact]
public void Matches_PercentEncodedCharacterReservedBothUrl_ReturnTrue()
{
    // Arrange: rule with a percent-encoded dollar sign (%24)
    var pattern = new UrlRule(RuleType.Disallow, "/some%24path").Pattern;

    // Act: candidate path uses the same percent-encoded form
    var isMatch = pattern.Matches("/some%24path");

    // Assert
    isMatch.Should().Be(true);
}

[Fact]
public void Matches_PercentEncodedCharacterReservedOnlyInRule_ReturnTrue()
{
    // Arrange: only the rule percent-encodes the dollar sign (%24)
    var pattern = new UrlRule(RuleType.Disallow, "/some%24path").Pattern;

    // Act: candidate path contains the raw dollar sign
    var isMatch = pattern.Matches("/some$path");

    // Assert
    isMatch.Should().Be(true);
}

[Fact]
public void Matches_PercentEncodedCharacterReservedOnlyInPath_ReturnTrue()
{
    // Arrange: the rule contains the raw dollar sign (not in trailing position)
    var pattern = new UrlRule(RuleType.Disallow, "/some$path").Pattern;

    // Act: only the candidate path percent-encodes it (%24)
    var isMatch = pattern.Matches("/some%24path");

    // Assert
    isMatch.Should().Be(true);
}

[Fact]
public void Matches_PercentEncodedCharacterNotSpecialLowercaseOnlyInRule_ReturnTrue()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path%7e");
Expand All @@ -188,7 +266,7 @@ public void Matches_OctectNotForwardSlashLowercaseOnlyInRule_ReturnTrue()
}

[Fact]
public void Matches_OctectNotForwardSlashLowercaseOnlyInPath_ReturnTrue()
public void Matches_PercentEncodedCharacterNotSpecialLowercaseOnlyInPath_ReturnTrue()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path~");
Expand All @@ -201,7 +279,7 @@ public void Matches_OctectNotForwardSlashLowercaseOnlyInPath_ReturnTrue()
}

[Fact]
public void Matches_OctectNotForwardSlashUppercaseOnlyInRule_ReturnTrue()
public void Matches_PercentEncodedCharacterNotSpecialUppercaseOnlyInRule_ReturnTrue()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path%7E");
Expand All @@ -214,7 +292,7 @@ public void Matches_OctectNotForwardSlashUppercaseOnlyInRule_ReturnTrue()
}

[Fact]
public void Matches_OctectNotForwardSlashUppercaseOnlyInPath_ReturnTrue()
public void Matches_PercentEncodedCharacterNotSpecialUppercaseOnlyInPath_ReturnTrue()
{
// Arrange
var urlRule = new UrlRule(RuleType.Disallow, "/some/path~");
Expand All @@ -225,4 +303,43 @@ public void Matches_OctectNotForwardSlashUppercaseOnlyInPath_ReturnTrue()
// Assert
matches.Should().Be(true);
}

[Fact]
public void Matches_UnescapedQueryStringInRuleAndPath_ReturnTrue()
{
    // Arrange: rule whose query string value is an unescaped URL
    var pattern = new UrlRule(RuleType.Disallow, "/foo/bar?baz=https://foo.bar").Pattern;

    // Act: candidate path is textually identical to the rule
    var isMatch = pattern.Matches("/foo/bar?baz=https://foo.bar");

    // Assert
    isMatch.Should().Be(true);
}

[Fact]
public void Matches_UnescapedQueryStringInRuleButPathEscaped_ReturnTrue()
{
    // Arrange: rule whose query string value is an unescaped URL
    var pattern = new UrlRule(RuleType.Disallow, "/foo/bar?baz=https://foo.bar").Pattern;

    // Act: candidate path percent-encodes ':' and '/' inside the query string
    var isMatch = pattern.Matches("/foo/bar?baz=https%3A%2F%2Ffoo.bar");

    // Assert
    isMatch.Should().Be(true);
}

[Fact]
public void Matches_UnescapedQueryStringInPathButRuleEscaped_ReturnTrue()
{
    // Arrange: rule percent-encodes ':' and '/' inside the query string
    var pattern = new UrlRule(RuleType.Disallow, "/foo/bar?baz=https%3A%2F%2Ffoo.bar").Pattern;

    // Act: candidate path leaves the query string value unescaped
    var isMatch = pattern.Matches("/foo/bar?baz=https://foo.bar");

    // Assert
    isMatch.Should().Be(true);
}
}

0 comments on commit 9915b83

Please sign in to comment.