Site text sitemap support and test improvements
drmathias committed Aug 28, 2023
1 parent c83390e commit ab58b84
Showing 20 changed files with 159,527 additions and 8,918 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -475,3 +475,6 @@ $RECYCLE.BIN/

# Windows shortcuts
*.lnk

# Custom directory
exclude/
7 changes: 3 additions & 4 deletions README.md
@@ -49,12 +49,11 @@ There is also the possibility to extend this library to support protocols other
| Sitemap entries | ✔️ | |
| Host directive | ✔️ | |
| Crawl-delay directive | ✔️ | |
| Sitemaps XML format | ✔️ | |
| RSS 2.0 feeds || 0.8 |
| Atom 0.3/1.0 feeds || 0.8 |
| Simple text sitemaps | | 0.5 |
| Memory management (500 KiB parsing limit) | ✔️ | |
| Caching support | | 0.3 |
| Sitemaps XML format | ✔️ | |
| Simple text sitemaps | ✔️ | |
| Memory management | ✔️ | |

# Usage

11 changes: 9 additions & 2 deletions src/Robots.Txt.Parser/Http/RobotWebClient.cs
@@ -1,6 +1,7 @@
using System;
using System.Collections.Generic;
using System.Net.Http;
using System.Net.Mime;
using System.Threading;
using System.Threading.Tasks;

@@ -86,11 +87,17 @@ the 500-599 range.
foreach (var uri in uris)
{
var request = new HttpRequestMessage(HttpMethod.Get, uri);
request.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml,*/*");
request.Headers.Add("Accept", "application/xml,text/plain,text/xml,*/*");
var response = await _httpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, cancellationToken);
if (!response.IsSuccessStatusCode) return null;
using var stream = await response.Content.ReadAsStreamAsync(cancellationToken);
var parsedSitemap = await SitemapParser.ReadFromStreamAsync(stream, modifiedSince, cancellationToken);

var parsedSitemap = response.Content.Headers.ContentType?.MediaType switch
{
MediaTypeNames.Text.Plain => await SimpleTextSitemapParser.ReadFromStreamAsync(stream, cancellationToken),
MediaTypeNames.Text.Xml or MediaTypeNames.Application.Xml or _
=> await SitemapParser.ReadFromStreamAsync(stream, modifiedSince, cancellationToken)
};

if (parsedSitemap is null)
{
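For orientation, here is a minimal sketch of the content-type dispatch this change introduces, written as a standalone helper. The `HttpClient` plumbing and the helper name are illustrative; `SimpleTextSitemapParser.ReadFromStreamAsync` and `SitemapParser.ReadFromStreamAsync` are the entry points shown in this commit.

```csharp
using System;
using System.Net.Http;
using System.Net.Mime;
using System.Threading.Tasks;
using Robots.Txt.Parser;

public static class SitemapFetchSketch
{
    // Hypothetical helper mirroring the RobotWebClient change above
    public static async Task<Sitemap?> FetchAsync(HttpClient httpClient, Uri sitemapUri)
    {
        var request = new HttpRequestMessage(HttpMethod.Get, sitemapUri);
        // Prefer XML but also accept plain-text sitemaps, as in the new Accept header
        request.Headers.Add("Accept", "application/xml,text/plain,text/xml,*/*");

        var response = await httpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead);
        if (!response.IsSuccessStatusCode) return null;

        await using var stream = await response.Content.ReadAsStreamAsync();

        // text/plain goes to the simple line-based parser; anything else is parsed as sitemap XML
        return response.Content.Headers.ContentType?.MediaType == MediaTypeNames.Text.Plain
            ? await SimpleTextSitemapParser.ReadFromStreamAsync(stream)
            : await SitemapParser.ReadFromStreamAsync(stream);
    }
}
```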
1 change: 1 addition & 0 deletions src/Robots.Txt.Parser/ProductToken.cs
@@ -20,6 +20,7 @@ public partial class ProductToken : IEquatable<string>, IEquatable<ProductToken>
/// </summary>
/// <param name="value">Raw product token value</param>
/// <returns><see cref="ProductToken"/> that identifies a robot rule group</returns>
/// <exception cref="ArgumentOutOfRangeException">Product token is formatted incorrectly</exception>
public static ProductToken Parse(string value)
{
if (value != Wildcard._value && !ValidationPattern.IsMatch(value))
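As a quick illustration of the documented behaviour, a small sketch of the two parsing entry points; the token strings below are made up for the example.

```csharp
using System;
using Robots.Txt.Parser;

public static class ProductTokenSketch
{
    public static void Run()
    {
        // Parse throws ArgumentOutOfRangeException when the token is formatted incorrectly
        var exampleBot = ProductToken.Parse("ExampleBot");

        // TryParse is the non-throwing alternative used by the robots.txt parser
        if (!ProductToken.TryParse("not a valid token!", out _))
        {
            Console.WriteLine("Rejected malformed product token");
        }

        // The wildcard token matches any user agent rule group
        var wildcard = ProductToken.Wildcard;
        Console.WriteLine($"{exampleBot} and {wildcard} parsed");
    }
}
```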
26 changes: 26 additions & 0 deletions src/Robots.Txt.Parser/RobotsTxtException.cs
@@ -0,0 +1,26 @@
using System;
using System.Runtime.Serialization;

namespace Robots.Txt.Parser;

/// <summary>
/// Exception raised when parsing a robots.txt file
/// </summary>
public class RobotsTxtException : Exception
{
internal RobotsTxtException()
{
}

internal RobotsTxtException(string? message) : base(message)
{
}

internal RobotsTxtException(string? message, Exception? innerException) : base(message, innerException)
{
}

protected RobotsTxtException(SerializationInfo info, StreamingContext context) : base(info, context)
{
}
}
108 changes: 58 additions & 50 deletions src/Robots.Txt.Parser/RobotsTxtParser.cs
@@ -38,6 +38,7 @@ public RobotsTxtParser(IRobotClient robotClient)
/// <param name="stream">The input stream</param>
/// <param name="cancellationToken">Cancellation token</param>
/// <returns>Parsed <see cref="IRobotsTxt"/></returns>
/// <exception cref="RobotsTxtException">Raised when there is an error parsing the robots.txt file</exception>
public async Task<IRobotsTxt> ReadFromStreamAsync(Stream stream, CancellationToken cancellationToken = default)
{
string? line;
@@ -53,71 +54,78 @@ Crawlers MUST use case-insensitive matching to find the group that matches the p
var userAgentRules = new Dictionary<ProductToken, HashSet<UrlRule>>();
var userAgentCrawlDirectives = new Dictionary<ProductToken, int>();

/*
The file MUST be UTF-8 encoded
*/
using var streamReader = new StreamReader(stream, Encoding.UTF8);
while ((line = await streamReader.ReadLineAsync(cancellationToken)) is not null)
try
{
if (stream.Position > ByteCount500KiB) throw new OutOfMemoryException("Reached parsing limit");
/*
The file MUST be UTF-8 encoded
*/
using var streamReader = new StreamReader(stream, Encoding.UTF8);
while ((line = await streamReader.ReadLineAsync(cancellationToken)) is not null && !cancellationToken.IsCancellationRequested)
{
if (stream.Position > ByteCount500KiB) throw new RobotsTxtException("Reached parsing limit");

if (line.StartsWith('#')) continue;
if (line.StartsWith('#')) continue;

if (line.StartsWith(UserAgentDirective, StringComparison.InvariantCultureIgnoreCase))
{
if (!previousLineWasUserAgent) currentUserAgents.Clear();
var currentUserAgent = GetValueOfDirective(line, UserAgentDirective);
if (ProductToken.TryParse(currentUserAgent, out var productToken))
if (line.StartsWith(UserAgentDirective, StringComparison.InvariantCultureIgnoreCase))
{
currentUserAgents.Add(productToken);
userAgentRules.TryAdd(productToken, new HashSet<UrlRule>());
previousLineWasUserAgent = true;
if (!previousLineWasUserAgent) currentUserAgents.Clear();
var currentUserAgent = GetValueOfDirective(line, UserAgentDirective);
if (ProductToken.TryParse(currentUserAgent, out var productToken))
{
currentUserAgents.Add(productToken);
userAgentRules.TryAdd(productToken, new HashSet<UrlRule>());
previousLineWasUserAgent = true;
}
continue;
}
continue;
}

if (currentUserAgents.Count == 0)
{
if (line.StartsWith(SitemapDirective, StringComparison.InvariantCultureIgnoreCase))
{
var sitemapValue = GetValueOfDirective(line, SitemapDirective);
if (Uri.TryCreate(sitemapValue, UriKind.Absolute, out var sitemapAddress)) sitemaps.Add(sitemapAddress);
}
else if (host is null && line.StartsWith(HostDirective, StringComparison.InvariantCultureIgnoreCase))
{
var hostValue = GetValueOfDirective(line, HostDirective);
if (Uri.IsWellFormedUriString(hostValue, UriKind.Absolute)
&& Uri.TryCreate(hostValue, UriKind.Absolute, out var uri)) hostValue = uri.Host;
var hostNameType = Uri.CheckHostName(hostValue);
if (hostNameType != UriHostNameType.Unknown && hostNameType != UriHostNameType.Basic) host = hostValue;
}
}
else
{
if (line.StartsWith(DisallowDirective, StringComparison.InvariantCultureIgnoreCase))
if (currentUserAgents.Count == 0)
{
var disallowValue = GetValueOfDirective(line, DisallowDirective);
foreach (var userAgent in currentUserAgents) userAgentRules[userAgent].Add(new UrlRule(RuleType.Disallow, disallowValue));
}
else if (line.StartsWith(AllowDirective, StringComparison.InvariantCultureIgnoreCase))
{
var allowedValue = GetValueOfDirective(line, AllowDirective);
foreach (var userAgent in currentUserAgents) userAgentRules[userAgent].Add(new UrlRule(RuleType.Allow, allowedValue));
if (line.StartsWith(SitemapDirective, StringComparison.InvariantCultureIgnoreCase))
{
var sitemapValue = GetValueOfDirective(line, SitemapDirective);
if (Uri.TryCreate(sitemapValue, UriKind.Absolute, out var sitemapAddress)) sitemaps.Add(sitemapAddress);
}
else if (host is null && line.StartsWith(HostDirective, StringComparison.InvariantCultureIgnoreCase))
{
var hostValue = GetValueOfDirective(line, HostDirective);
if (Uri.IsWellFormedUriString(hostValue, UriKind.Absolute)
&& Uri.TryCreate(hostValue, UriKind.Absolute, out var uri)) hostValue = uri.Host;
var hostNameType = Uri.CheckHostName(hostValue);
if (hostNameType != UriHostNameType.Unknown && hostNameType != UriHostNameType.Basic) host = hostValue;
}
}
else if (line.StartsWith(CrawlDelayDirective, StringComparison.InvariantCultureIgnoreCase))
else
{
var crawlDelayValue = GetValueOfDirective(line, CrawlDelayDirective);
if (int.TryParse(crawlDelayValue, out var parsedCrawlDelay))
if (line.StartsWith(DisallowDirective, StringComparison.InvariantCultureIgnoreCase))
{
var disallowValue = GetValueOfDirective(line, DisallowDirective);
foreach (var userAgent in currentUserAgents) userAgentRules[userAgent].Add(new UrlRule(RuleType.Disallow, disallowValue));
}
else if (line.StartsWith(AllowDirective, StringComparison.InvariantCultureIgnoreCase))
{
var allowedValue = GetValueOfDirective(line, AllowDirective);
foreach (var userAgent in currentUserAgents) userAgentRules[userAgent].Add(new UrlRule(RuleType.Allow, allowedValue));
}
else if (line.StartsWith(CrawlDelayDirective, StringComparison.InvariantCultureIgnoreCase))
{
foreach (var userAgent in currentUserAgents) userAgentCrawlDirectives.TryAdd(userAgent, parsedCrawlDelay);
var crawlDelayValue = GetValueOfDirective(line, CrawlDelayDirective);
if (int.TryParse(crawlDelayValue, out var parsedCrawlDelay))
{
foreach (var userAgent in currentUserAgents) userAgentCrawlDirectives.TryAdd(userAgent, parsedCrawlDelay);
}
}
}

previousLineWasUserAgent = false;
}

previousLineWasUserAgent = false;
return new RobotsTxt(_robotClient, userAgentRules, userAgentCrawlDirectives, host, sitemaps);
}
catch (Exception e) when (e is not RobotsTxtException)
{
throw new RobotsTxtException("Unable to parse robots.txt", e);
}

return new RobotsTxt(_robotClient, userAgentRules, userAgentCrawlDirectives, host, sitemaps);
}

private static string GetValueOfDirective(string line, string directive)
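A minimal sketch of how a caller might consume this parser and the new `RobotsTxtException`, assuming an `IRobotClient` instance (for example a configured `RobotWebClient`) is already available:

```csharp
using System;
using System.IO;
using System.Threading.Tasks;
using Robots.Txt.Parser;

public static class RobotsTxtParseSketch
{
    public static async Task<IRobotsTxt?> TryParseAsync(IRobotClient client, Stream robotsTxtStream)
    {
        var parser = new RobotsTxtParser(client);
        try
        {
            // Throws RobotsTxtException when the 500 KiB limit is exceeded
            // or the stream cannot be parsed for any other reason
            return await parser.ReadFromStreamAsync(robotsTxtStream);
        }
        catch (RobotsTxtException)
        {
            return null;
        }
    }
}
```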
65 changes: 65 additions & 0 deletions src/Robots.Txt.Parser/SimpleTextSitemapParser.cs
@@ -0,0 +1,65 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Threading;
using System.Threading.Tasks;

namespace Robots.Txt.Parser;

/// <summary>
/// Parses a <see cref="Sitemap"/> TXT document
/// </summary>
public static class SimpleTextSitemapParser
{
private const int MaxLines = 50000;
private const int ByteCount50MiB = 52_428_800;

/// <summary>
/// Parses a <see cref="Sitemap"/> from a <see cref="Stream"/>
/// </summary>
/// <param name="stream">Sitemap document stream</param>
/// <param name="cancellationToken">Cancellation token</param>
/// <returns>The parsed <see cref="Sitemap"/></returns>
/// <exception cref="SitemapException">Raised when there is an error parsing the Sitemap</exception>
public static async Task<Sitemap> ReadFromStreamAsync(Stream stream, CancellationToken cancellationToken = default)
{
var urlSet = new HashSet<UrlSetItem>();
try
{
using var streamReader = new StreamReader(stream);
string? line;
var lineCount = 0;
while (((line = await streamReader.ReadLineAsync(cancellationToken)) is not null) && !cancellationToken.IsCancellationRequested)
{
/*
Each text file ... and must be no larger than 50MiB (52,428,800 bytes)
*/
if (stream.Position > ByteCount50MiB) throw new SitemapException("Reached parsing limit");

if (string.IsNullOrWhiteSpace(line)) continue;

lineCount++;

/*
Each text file can contain a maximum of 50,000 URLs
*/
if (lineCount > MaxLines) throw new SitemapException("Reached line limit");

/*
The text file must have one URL per line. The URLs cannot contain embedded new lines.
You must fully specify URLs, including the http.
The text file must use UTF-8 encoding.
The text file should contain no information other than the list of URLs.
The text file should contain no header or footer information.
*/
urlSet.Add(new UrlSetItem(new Uri(line), null, null, null));
}

return new Sitemap(urlSet);
}
catch (Exception e) when (e is not SitemapException)
{
throw new SitemapException("Unable to parse sitemap", e);
}
}
}
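To make the accepted format concrete, a short self-contained sketch that feeds the parser an in-memory text sitemap; the URLs are placeholders.

```csharp
using System.IO;
using System.Text;
using System.Threading.Tasks;
using Robots.Txt.Parser;

public static class SimpleTextSitemapSketch
{
    public static async Task<Sitemap?> ParseExampleAsync()
    {
        // One fully-specified URL per line, UTF-8, no header or footer lines
        const string sitemapText = "https://example.com/\nhttps://example.com/about\nhttps://example.com/contact\n";
        using var stream = new MemoryStream(Encoding.UTF8.GetBytes(sitemapText));

        try
        {
            return await SimpleTextSitemapParser.ReadFromStreamAsync(stream);
        }
        catch (SitemapException)
        {
            // Raised when the 50,000-line or 50 MiB limit is exceeded,
            // or when a line is not a valid absolute URL
            return null;
        }
    }
}
```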
4 changes: 2 additions & 2 deletions src/Robots.Txt.Parser/SitemapParser.cs
@@ -8,7 +8,7 @@
namespace Robots.Txt.Parser;

/// <summary>
/// Parses a <see cref="Sitemap"/> document
/// Parses a <see cref="Sitemap"/> XML document
/// </summary>
public class SitemapParser
{
@@ -21,7 +21,7 @@ public class SitemapParser
/// <param name="modifiedSince">Filters the sitemap on the modified date</param>
/// <param name="cancellationToken">Cancellation token</param>
/// <returns>The parsed <see cref="Sitemap"/></returns>
/// <exception cref="SitemapException">Thrown when the sitemap document is formatted incorrectly</exception>
/// <exception cref="SitemapException">Raised when there is an error parsing the Sitemap</exception>
public static async Task<Sitemap> ReadFromStreamAsync(Stream stream, DateTime? modifiedSince = null, CancellationToken cancellationToken = default)
{
try
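For completeness, a hedged sketch of the XML parser's `modifiedSince` filter; the file path is illustrative, and `ReadFromStreamAsync` is the method documented above.

```csharp
using System;
using System.IO;
using System.Threading.Tasks;
using Robots.Txt.Parser;

public static class XmlSitemapSketch
{
    public static async Task<Sitemap> ParseRecentEntriesAsync(string sitemapPath)
    {
        await using var stream = File.OpenRead(sitemapPath);

        // Filter entries on the modified date, as described in the doc comment above;
        // a SitemapException is thrown if the document cannot be parsed
        return await SitemapParser.ReadFromStreamAsync(stream, modifiedSince: DateTime.UtcNow.AddDays(-7));
    }
}
```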
8 changes: 4 additions & 4 deletions tests/Robots.Txt.Parser.Tests.Unit/ProductTokenTests.cs
@@ -145,12 +145,12 @@ public void TryParse_ValidProductToken_ReturnTrue()
public void Equals_Null_NotEqual()
{
// Arrange
var a = ProductToken.Wildcard;
var productToken = ProductToken.Wildcard;

// Act
var isEqualProductToken = a.Equals((ProductToken?)null);
var isEqualString = a.Equals((string?)null);
var isEqualObject = a.Equals((object?)null);
var isEqualProductToken = productToken.Equals((ProductToken?)null);
var isEqualString = productToken!.Equals((string?)null);
var isEqualObject = productToken!.Equals((object?)null);

// Assert
isEqualProductToken.Should().Be(false);
@@ -25,8 +25,12 @@
</ItemGroup>

<ItemGroup>
<EmbeddedResource Include="over-50kib-robots.txt" />
<EmbeddedResource Include="under-50kib-robots.txt" />
<EmbeddedResource Include="over-500kib-robots.txt" />
<EmbeddedResource Include="exactly-500kib-robots.txt" />
<EmbeddedResource Include="over-50k-lines-sitemap.txt" />
<EmbeddedResource Include="exactly-50k-lines-sitemap.txt" />
<EmbeddedResource Include="over-50mib-sitemap.txt" />
<EmbeddedResource Include="exactly-50mib-sitemap.txt" />
</ItemGroup>

<ItemGroup>
11 changes: 5 additions & 6 deletions tests/Robots.Txt.Parser.Tests.Unit/RobotsTxtParserTests.cs
@@ -1,4 +1,3 @@
using System;
using System.IO;
using System.Reflection;
using System.Text;
@@ -71,11 +70,11 @@ public async Task ReadFromStreamAsync_WithEndOfLineComments_CommentsIgnored()
}

[Fact]
public async Task ReadFromStreamAsync_Under50KiB_DoNotThrow()
public async Task ReadFromStreamAsync_Exactly500KiB_DoNotThrow()
{
// Arrange
var fileProvider = new EmbeddedFileProvider(Assembly.GetExecutingAssembly());
var stream = fileProvider.GetFileInfo("under-50kib-robots.txt").CreateReadStream();
var stream = fileProvider.GetFileInfo("exactly-500kib-robots.txt").CreateReadStream();

// Act
var parse = async () => await _parser.ReadFromStreamAsync(stream);
Expand All @@ -85,17 +84,17 @@ public async Task ReadFromStreamAsync_Under50KiB_DoNotThrow()
}

[Fact]
public async Task ReadFromStreamAsync_Over50KiB_ThrowOutOfMemoryException()
public async Task ReadFromStreamAsync_Over500KiB_ThrowRobotsTxtException()
{
// Arrange
var fileProvider = new EmbeddedFileProvider(Assembly.GetExecutingAssembly());
var stream = fileProvider.GetFileInfo("over-50kib-robots.txt").CreateReadStream();
var stream = fileProvider.GetFileInfo("over-500kib-robots.txt").CreateReadStream();

// Act
var parse = async () => await _parser.ReadFromStreamAsync(stream);

// Assert
await parse.Should().ThrowAsync<OutOfMemoryException>();
await parse.Should().ThrowAsync<RobotsTxtException>();
}

[Fact]