diff --git a/README.md b/README.md index d4ef9d6..09ff1aa 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ There is also the possibility to extend this library to support protocols other # Features | Name | Supported | Priority | -|------|-----------|---------| +|------|-----------|----------| | HTTP/HTTPS | ✔️ | | | FTPS/FTPS | ❌ | 0.1 | | Wildcard (`*`) User-agent | ✔️ | | diff --git a/src/Robots.Txt.Parser/Http/RobotWebClient.cs b/src/Robots.Txt.Parser/Http/RobotWebClient.cs index 88aaecb..228faf1 100644 --- a/src/Robots.Txt.Parser/Http/RobotWebClient.cs +++ b/src/Robots.Txt.Parser/Http/RobotWebClient.cs @@ -110,7 +110,7 @@ MediaTypeNames.Text.Xml or MediaTypeNames.Application.Xml or _ continue; } - if (parsedSitemap is SitemapRoot sitemapRoot) + if (parsedSitemap is SitemapIndex sitemapRoot) { var sitemaps = await (this as IRobotWebClient).LoadSitemapsAsync(sitemapRoot.SitemapUris, modifiedSince, cancellationToken); if (sitemaps is not null) sitemap = sitemaps.Combine(sitemaps); diff --git a/src/Robots.Txt.Parser/ISitemap.cs b/src/Robots.Txt.Parser/ISitemap.cs index e6a3329..74a71c0 100644 --- a/src/Robots.Txt.Parser/ISitemap.cs +++ b/src/Robots.Txt.Parser/ISitemap.cs @@ -34,9 +34,9 @@ internal Sitemap Combine(Sitemap other) } } -internal class SitemapRoot : Sitemap +internal class SitemapIndex : Sitemap { - public SitemapRoot(HashSet sitemapUris) : base(new HashSet()) + public SitemapIndex(HashSet sitemapUris) : base(new HashSet()) { SitemapUris = sitemapUris; } diff --git a/src/Robots.Txt.Parser/UrlSetItem.cs b/src/Robots.Txt.Parser/SitemapItem.cs similarity index 50% rename from src/Robots.Txt.Parser/UrlSetItem.cs rename to src/Robots.Txt.Parser/SitemapItem.cs index 10c0525..4d9f315 100644 --- a/src/Robots.Txt.Parser/UrlSetItem.cs +++ b/src/Robots.Txt.Parser/SitemapItem.cs @@ -2,16 +2,47 @@ namespace Robots.Txt.Parser; +public record SitemapItem +{ + internal SitemapItem(Uri Location, DateTime? LastModified) + { + this.Location = Location; + this.LastModified = LastModified; + } + + /// + /// URL location + /// + public Uri Location { get; } + + /// + /// Date and time that the contents of the URL was last modified + /// + public DateTime? LastModified { get; } +} + /// /// Url item described in a sitemap /// -/// URL location -/// Date that the contents of the URL was last modified -/// Hint for how often the URL is expected to change -/// Hint for the priority that should be assigned to the URL -public record UrlSetItem(Uri Location, DateTime? LastModified, ChangeFrequency? ChangeFrequency, decimal? Priority); +public record UrlSetItem : SitemapItem +{ + internal UrlSetItem(Uri location, DateTime? lastModified, ChangeFrequency? changeFrequency, decimal? priority) + : base(location, lastModified) + { + ChangeFrequency = changeFrequency; + Priority = priority; + } -internal record SitemapItem(Uri Location, DateTime? LastModified); + /// + /// Hint for how often the URL is expected to change + /// + public ChangeFrequency? ChangeFrequency { get; } + + /// + /// Hint for the priority that should be assigned to the URL + /// + public decimal? Priority { get; } +} /// /// Change frequency values used in the sitemap specification diff --git a/src/Robots.Txt.Parser/SitemapParser.cs b/src/Robots.Txt.Parser/SitemapParser.cs index 99ca2ba..56493ac 100644 --- a/src/Robots.Txt.Parser/SitemapParser.cs +++ b/src/Robots.Txt.Parser/SitemapParser.cs @@ -1,8 +1,9 @@ using System; +using System.Collections.Generic; using System.IO; -using System.Linq; using System.Threading; using System.Threading.Tasks; +using System.Xml; using System.Xml.Linq; namespace Robots.Txt.Parser; @@ -12,6 +13,8 @@ namespace Robots.Txt.Parser; /// public class SitemapParser { + private const int ByteCount50MiB = 52_428_800; + private static readonly XNamespace sitemapNamespace = "http://www.sitemaps.org/schemas/sitemap/0.9"; /// @@ -26,58 +29,85 @@ public static async Task ReadFromStreamAsync(Stream stream, DateTime? m { try { - var document = await XDocument.LoadAsync(stream, LoadOptions.None, cancellationToken); - var urlSetElement = document.Element(sitemapNamespace + "urlset"); - if (urlSetElement is not null) return ReadUrlSet(urlSetElement, modifiedSince); + using var reader = XmlReader.Create(stream, new XmlReaderSettings { Async = true }); + await reader.MoveToContentAsync(); - var sitemapIndexElement = document.Element(sitemapNamespace + "sitemapindex"); - if (sitemapIndexElement is not null) return ReadSitemapIndex(sitemapIndexElement, modifiedSince); + return reader switch + { + XmlReader when reader.NamespaceURI == sitemapNamespace && reader.Name == "urlset" + => await ParseUrlSet(stream, reader, modifiedSince, cancellationToken), + XmlReader when reader.NamespaceURI == sitemapNamespace && reader.Name == "sitemapindex" + => await ParseSitemapIndex(stream, reader, modifiedSince, cancellationToken), + _ => throw new SitemapException("Unable to find root sitemap element") + }; } catch (Exception e) when (e is not SitemapException) { throw new SitemapException("Unable to parse sitemap", e); } - - throw new SitemapException("Unable to find root sitemap element"); } - private static SitemapRoot ReadSitemapIndex(XElement sitemapIndexElement, DateTime? modifiedSince) + private static async Task ParseSitemapIndex(Stream stream, XmlReader reader, DateTime? modifiedSince, CancellationToken cancellationToken) { - var sitemapElements = sitemapIndexElement.Elements(sitemapNamespace + "sitemap"); - var sitemaps = sitemapElements - .Select(sitemapElement => + await reader.ReadAsync(); + + var uris = new HashSet(); + while (!reader.EOF && reader.ReadState is ReadState.Interactive && !cancellationToken.IsCancellationRequested) + { + if (reader.NodeType is not XmlNodeType.Element || reader.Name != "sitemap" || reader.NamespaceURI != sitemapNamespace) { - var location = new Uri(sitemapElement.Element(sitemapNamespace + "loc")!.Value); - var lastModifiedString = sitemapElement.Element(sitemapNamespace + "lastmod")?.Value; - DateTime? lastModified = lastModifiedString is not null ? DateTime.Parse(lastModifiedString) : null; - return new SitemapItem(location, lastModified); - }) - .Where(sitemap => modifiedSince is null || sitemap.LastModified is null || sitemap.LastModified >= modifiedSince) - .Select(sitemap => sitemap.Location) - .ToHashSet(); - return new SitemapRoot(sitemaps); + await reader.ReadAsync(); + continue; + } + + var node = (XElement)await XNode.ReadFromAsync(reader, cancellationToken); + + if (stream.Position > ByteCount50MiB) throw new SitemapException("Reached parsing limit"); + + var lastModifiedString = node.Element(sitemapNamespace + "lastmod")?.Value; + DateTime? lastModified = lastModifiedString is not null ? DateTime.Parse(lastModifiedString) : null; + + if (modifiedSince is not null && lastModified is not null && lastModified < modifiedSince) continue; + + var location = new Uri(node.Element(sitemapNamespace + "loc")!.Value); + + uris.Add(location); + } + return new SitemapIndex(uris); } - private static Sitemap ReadUrlSet(XElement urlSetElement, DateTime? modifiedSince) + private static async Task ParseUrlSet(Stream stream, XmlReader reader, DateTime? modifiedSince, CancellationToken cancellationToken) { - var urlElements = urlSetElement.Elements(sitemapNamespace + "url"); - var urlSet = urlElements - .Select(urlElement => + await reader.ReadAsync(); + + var items = new HashSet(); + while (!reader.EOF && reader.ReadState is ReadState.Interactive && !cancellationToken.IsCancellationRequested) + { + if (reader.NodeType is not XmlNodeType.Element || reader.Name != "url" || reader.NamespaceURI != sitemapNamespace) { - var location = new Uri(urlElement.Element(sitemapNamespace + "loc")!.Value); - var lastModifiedString = urlElement.Element(sitemapNamespace + "lastmod")?.Value; - var changeFrequencyString = urlElement.Element(sitemapNamespace + "changefreq")?.Value; - var priorityString = urlElement.Element(sitemapNamespace + "priority")?.Value; - DateTime? lastModified = lastModifiedString is not null ? DateTime.Parse(lastModifiedString) : null; - ChangeFrequency? changeFrequency = changeFrequencyString is not null - ? Enum.Parse(changeFrequencyString, ignoreCase: true) - : null; - decimal? priority = priorityString is not null ? decimal.Parse(priorityString) : null; - return new UrlSetItem(location, lastModified, changeFrequency, priority); - }) - .Where(url => modifiedSince is null || url.LastModified is null || url.LastModified >= modifiedSince) - .ToHashSet(); - - return new Sitemap(urlSet); + await reader.ReadAsync(); + continue; + } + + var node = (XElement)await XNode.ReadFromAsync(reader, cancellationToken); + + if (stream.Position > ByteCount50MiB) throw new SitemapException("Reached parsing limit"); + + var lastModifiedString = node.Element(sitemapNamespace + "lastmod")?.Value; + DateTime? lastModified = lastModifiedString is not null ? DateTime.Parse(lastModifiedString) : null; + + if (modifiedSince is not null && lastModified is not null && lastModified < modifiedSince) continue; + + var location = new Uri(node.Element(sitemapNamespace + "loc")!.Value); + var changeFrequencyString = node.Element(sitemapNamespace + "changefreq")?.Value; + var priorityString = node.Element(sitemapNamespace + "priority")?.Value; + ChangeFrequency? changeFrequency = changeFrequencyString is not null + ? Enum.Parse(changeFrequencyString, ignoreCase: true) + : null; + decimal? priority = priorityString is not null ? decimal.Parse(priorityString) : null; + + items.Add(new UrlSetItem(location, lastModified, changeFrequency, priority)); + } + return new Sitemap(items); } } diff --git a/tests/Robots.Txt.Parser.Tests.Unit/SitemapParserTests.cs b/tests/Robots.Txt.Parser.Tests.Unit/SitemapParserTests.cs index 92d554c..59c8cd4 100644 --- a/tests/Robots.Txt.Parser.Tests.Unit/SitemapParserTests.cs +++ b/tests/Robots.Txt.Parser.Tests.Unit/SitemapParserTests.cs @@ -9,6 +9,20 @@ namespace Robots.Txt.Parser.Tests.Unit; public class SitemapParserTests { + [Fact] + public async Task ReadFromStreamAsync_EmptyFile_ThrowSitemapException() + { + // Arrange + var file = @""; + var stream = new MemoryStream(Encoding.UTF8.GetBytes(file)); + + // Act + var parse = async () => await SitemapParser.ReadFromStreamAsync(stream); + + // Assert + await parse.Should().ThrowExactlyAsync(); + } + [Fact] public async Task ReadFromStreamAsync_ImproperXmlFormat_ThrowSitemapException() { @@ -176,7 +190,7 @@ public async Task ReadFromStreamAsync_SitemapIndexNoModifiedDateFilter_ParseCorr var sitemap = await SitemapParser.ReadFromStreamAsync(stream); // Assert - var sitemapRoot = sitemap.Should().BeOfType().Subject; + var sitemapRoot = sitemap.Should().BeOfType().Subject; sitemap.UrlSet.Should().BeEmpty(); sitemapRoot.SitemapUris.Should().BeEquivalentTo(new[] { @@ -207,7 +221,7 @@ public async Task ReadFromStreamAsync_SitemapIndexEarlierModifiedDateFilter_Pars var sitemap = await SitemapParser.ReadFromStreamAsync(stream, new DateTime(2023, 08, 22)); // Assert - var sitemapRoot = sitemap.Should().BeOfType().Subject; + var sitemapRoot = sitemap.Should().BeOfType().Subject; sitemap.UrlSet.Should().BeEmpty(); sitemapRoot.SitemapUris.Should().BeEquivalentTo(new[] { @@ -238,7 +252,7 @@ public async Task ReadFromStreamAsync_SitemapIndexSameModifiedDateFilter_ParseCo var sitemap = await SitemapParser.ReadFromStreamAsync(stream, new DateTime(2023, 08, 23)); // Assert - var sitemapRoot = sitemap.Should().BeOfType().Subject; + var sitemapRoot = sitemap.Should().BeOfType().Subject; sitemap.UrlSet.Should().BeEmpty(); sitemapRoot.SitemapUris.Should().BeEquivalentTo(new[] { @@ -269,7 +283,7 @@ public async Task ReadFromStreamAsync_SitemapIndexExceedsModifiedDateFilter_Pars var sitemap = await SitemapParser.ReadFromStreamAsync(stream, new DateTime(2023, 08, 24)); // Assert - var sitemapRoot = sitemap.Should().BeOfType().Subject; + var sitemapRoot = sitemap.Should().BeOfType().Subject; sitemap.UrlSet.Should().BeEmpty(); sitemapRoot.SitemapUris.Should().BeEquivalentTo(new[] { new Uri("https://www.github.com/people.xml") }); }