Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure CopyElement does not break UTF-8 characters #192

Merged
merged 4 commits into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 48 additions & 5 deletions xoai-common/src/main/java/io/gdcc/xoai/xml/CopyElement.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
package io.gdcc.xoai.xml;

import io.gdcc.xoai.xmlio.exceptions.XmlWriteException;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
Expand Down Expand Up @@ -82,19 +83,61 @@ protected void writeXml(XmlWriter writer) throws IOException {
* and the application must take care of it.
*/

try (xmlInputStream; ) {
// fill the buffer once with the first 1024 chars and read as string
byte[] buffer = xmlInputStream.readNBytes(1024);
String firstChars = new String(buffer, StandardCharsets.UTF_8);
try (xmlInputStream;
// create a buffered stream to allow for rewind
BufferedInputStream bufferedXmlInputStream =
new BufferedInputStream(xmlInputStream)) {

final int n = 1024;
// mark the start of the stream, leave 1 byte headroom for reading ahead
bufferedXmlInputStream.mark(n + 1);
// read the first bytes
byte[] bytes = bufferedXmlInputStream.readNBytes(n);

// broken UTF-8 multibyte char only possible when >= 1024 chars available
if (bytes.length == n) {
// analyze how far we can read without risk of broken char
final int maxN = maxBytesWithCompleteUTF8Chars(bytes);
// if we detected a (potentially broken) mbchar at the end, re-read.
if (maxN < n) {
// reset the stream to begin
bufferedXmlInputStream.reset();
// replace bytes by reading again, but only so far as maxN
bytes = bufferedXmlInputStream.readNBytes(maxN);
}
}

String firstChars = new String(bytes, StandardCharsets.UTF_8);
// match the start with the compiled regex and replace with nothing when matching.
firstChars = xmlDeclaration.reset(firstChars).replaceFirst("");

// write the chars to the output stream
writer.getOutputStream().write(firstChars.getBytes(StandardCharsets.UTF_8));

// now send the rest of the stream
xmlInputStream.transferTo(writer.getOutputStream());
bufferedXmlInputStream.transferTo(writer.getOutputStream());
}
}

public static int maxBytesWithCompleteUTF8Chars(final byte[] buffer) {
final int n = buffer.length;

if (Byte.toUnsignedInt(buffer[n - 1])
> 127) { // 0b0xxxxxxx are all 1-byte chars, so only seek for 0b1xxxxxxx
for (int i = n - 1;
i > n - 5;
i--) { // go max four bytes back (max UTF-8 mbchar length)
if (Byte.toUnsignedInt(buffer[i])
> 191) { // 0b110xxxxx / 0b1110xxxx / 0b11110xxx are the UTF-8 multibyte
poikilotherm marked this conversation as resolved.
Show resolved Hide resolved
// char start bytes
// -> this is a multibyte start char => use "i" as perfect new length
// (array index starts at 0, so is already -1 for read length)
return i;
}
// -> this is a byte somewhere inside the mbchar, go -1 byte and try again
}
}

return n;
}
}
109 changes: 109 additions & 0 deletions xoai-common/src/test/java/io/gdcc/xoai/xml/CopyElementTest.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package io.gdcc.xoai.xml;

import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.junit.jupiter.api.Assertions.*;
import static org.xmlunit.matchers.CompareMatcher.isSimilarTo;
Expand All @@ -15,11 +16,19 @@
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Map;
import java.util.stream.Stream;
import javax.xml.stream.XMLStreamException;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import org.xmlunit.matchers.EvaluateXPathMatcher;

public class CopyElementTest {

Expand Down Expand Up @@ -199,4 +208,104 @@ void writeLargeStream() throws IOException, XMLStreamException, XmlWriteExceptio
// then
assertThat("Large stream XML wrapped in <metadata>", result, hasXPath("//metadata"));
}

/**
* This test checks that we do not bodge multibyte UTF-8 chars from the input stream. See <a
* href="https://github.com/gdcc/xoai/issues/188">issue 188 at gdcc/xoai</a>.
*/
@Test
void ensureIssue188Fixed() throws IOException, XMLStreamException {
// given
final Path sourceFile =
Path.of("src", "test", "resources", "188-copy-element-multibyte.xml");
final String sourceXml = Files.readString(sourceFile, StandardCharsets.UTF_8);
final ByteArrayOutputStream resultStream = new ByteArrayOutputStream();

final String description =
"This dataset contains information of experiments carried out upland rice in two"
+ " regions of Nicaragua (Caribbean and Pacific Region), as well as a"
+ " compilation of soils data from different regions in Nicaragua collected"
+ " from 2015 to 2019. The experiments were designed to explore the effects of"
+ " micronutrients in the yield of upland rice. The experiments were carried"
+ " out on farmer’s field during the 2018 production cycle, the dataset"
+ " contains yield and aerial biomass of the experiments.";

assertThat(
"Description is intact in source",
sourceXml,
EvaluateXPathMatcher.hasXPath("//dc:description/text()", equalTo(description))
.withNamespaceContext(Map.of("dc", "http://purl.org/dc/elements/1.1/")));

// when
try (resultStream;
InputStream stream =
new ByteArrayInputStream(sourceXml.getBytes(StandardCharsets.UTF_8));
XmlWriter writer = new XmlWriter(resultStream)) {
writer.writeStartDocument();
writer.writeStartElement("metadata");
writer.write(new CopyElement(stream));
writer.writeEndElement();
writer.writeEndDocument();
}
String result = resultStream.toString();

// then
assertThat("Large stream XML wrapped in <metadata>", result, hasXPath("//metadata"));
assertThat(
"Has a <dc:description> element",
result,
hasXPath("//dc:description")
.withNamespaceContext(Map.of("dc", "http://purl.org/dc/elements/1.1/")));
assertThat(
"Description is still intact",
result,
EvaluateXPathMatcher.hasXPath("//dc:description/text()", equalTo(description))
.withNamespaceContext(Map.of("dc", "http://purl.org/dc/elements/1.1/")));
}

static Stream<Arguments> multicharString() {
return Stream.of(
Arguments.of(4, "test", 0), // 1-byte ending
// 2-byte ending (complete, but we can't know that!)
Arguments.of(4, "testß", 0),
// 2-byte ending on 1st byte
Arguments.of(4, "testß", 1),
// 3-byte ending on 2nd byte
Arguments.of(4, "test’", 1),
// 3-byte ending on 3rd byte (complete, but we can't know that!)
Arguments.of(4, "test♥", 0),
// 3-byte ending on 2nd byte
Arguments.of(4, "test♥", 1),
// 3-byte ending on 1st byte
Arguments.of(4, "test♥", 2),
// 4-byte ending on 3rd byte
Arguments.of(4, "test\uD83D\uDD2B", 1));
}

@MethodSource("multicharString")
@ParameterizedTest
void ensureBufferAnalysisCorrectness(int expectedLength, String text, int bytesToCutOf)
throws UnsupportedEncodingException {
// given
// System.out.println(convertToBinary(text));
byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
// System.out.println(Arrays.toString(bytes));

// when
byte[] sut = Arrays.copyOf(bytes, bytes.length - bytesToCutOf);
int length = CopyElement.maxBytesWithCompleteUTF8Chars(sut);

// then
assertEquals(expectedLength, length);
}

static String convertToBinary(String input) {
byte[] bytes = input.getBytes(StandardCharsets.UTF_8);
StringBuilder sb = new StringBuilder();
for (byte b : bytes) {
sb.append(String.format("%8s", Integer.toBinaryString(b & 0xFF)).replace(" ", "0"));
sb.append(" ");
}
return sb.toString();
}
}
7 changes: 7 additions & 0 deletions xoai-common/src/test/resources/188-copy-element-multibyte.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<oai_dc:dc xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:dc="http://purl.org/dc/elements/1.1/" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"><dc:title>Impact of micronutrients on upland rice yield, Nicaragua</dc:title><dc:identifier>https://doi.org/10.7910/DVN/IBDWAV</dc:identifier><dc:creator>Siles, Pablo</dc:creator><dc:creator>Tellez, Orlando</dc:creator><dc:creator>Peng, Yuan-Ching</dc:creator><dc:creator>Zeledón, Yasser</dc:creator><dc:publisher>Harvard Dataverse</dc:publisher><dc:description>This dataset contains information of experiments carried out upland rice in two regions of Nicaragua (Caribbean and Pacific Region), as well as a compilation of soils data from different regions in Nicaragua collected from 2015 to 2019. The experiments were designed to explore the effects of micronutrients in the yield of upland rice. The experiments were carried out on farmer’s field during the 2018 production cycle, the dataset contains yield and aerial biomass of the experiments.</dc:description><dc:subject>Agricultural Sciences</dc:subject><dc:subject>Earth and Environmental Sciences</dc:subject><dc:subject>Upland rice</dc:subject><dc:subject>Micronutrients</dc:subject><dc:subject>Crop yield</dc:subject><dc:subject>Zinc</dc:subject><dc:subject>Boron</dc:subject><dc:subject>Latin America and the Caribbean</dc:subject><dc:subject>Multifunctional Landscapes</dc:subject><dc:language>Spanish, Castilian</dc:language><dc:date>2019-02</dc:date><dc:contributor>Garcia, Carolina</dc:contributor><dc:type>Experimental Data</dc:type><dc:type>Environmental Data</dc:type></oai_dc:dc>

<!--
This data came from the original, unfixed https://dataverse.harvard.edu/api/datasets/export?exporter=oai_dc&persistentId=doi%3A10.7910/DVN/IBDWAV,
and included in the OAI-PMH response at https://dataverse.harvard.edu/oai?verb=GetRecord&identifier=doi:10.7910/DVN/IBDWAV&metadataPrefix=oai_dc
as referenced from https://github.com/gdcc/xoai/issues/188. Stored here as reproducer / test data.
-->