From 1afd5cf7e99bf9937ff10de8b276bdcbfc3280c6 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Fri, 15 Dec 2023 02:22:39 +0200 Subject: [PATCH] [`@huggingface/jinja`] Fix escaped characters (#416) Better parsing of escaped newlines, tabs, etc. Also adds a unit test for this. --- packages/jinja/src/lexer.ts | 29 +++++++++++++++++++++++ packages/jinja/src/runtime.ts | 7 +----- packages/jinja/test/templates.test.js | 33 +++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 6 deletions(-) diff --git a/packages/jinja/src/lexer.ts b/packages/jinja/src/lexer.ts index 1e6015164..3dd6eaafb 100644 --- a/packages/jinja/src/lexer.ts +++ b/packages/jinja/src/lexer.ts @@ -123,6 +123,18 @@ const ORDERED_MAPPING_TABLE: [string, TokenType][] = [ ["=", TOKEN_TYPES.Equals], ]; +const ESCAPE_CHARACTERS = new Map([ + ["n", "\n"], // New line + ["t", "\t"], // Horizontal tab + ["r", "\r"], // Carriage return + ["b", "\b"], // Backspace + ["f", "\f"], // Form feed + ["v", "\v"], // Vertical tab + ["'", "'"], // Single quote + ['"', '"'], // Double quote + ["\\", "\\"], // Backslash +]); + /** * Generate a list of tokens from a source string. */ @@ -135,6 +147,23 @@ export function tokenize(source: string): Token[] { const consumeWhile = (predicate: (char: string) => boolean): string => { let str = ""; while (predicate(src[cursorPosition])) { + // Check for escaped characters + if (src[cursorPosition] === "\\") { + // Consume the backslash + ++cursorPosition; + // Check for end of input + if (cursorPosition >= src.length) throw new SyntaxError("Unexpected end of input"); + + // Add the escaped character + const escaped = src[cursorPosition++]; + const unescaped = ESCAPE_CHARACTERS.get(escaped); + if (unescaped === undefined) { + throw new SyntaxError(`Unexpected escaped character: ${escaped}`); + } + str += unescaped; + continue; + } + str += src[cursorPosition++]; if (cursorPosition >= src.length) throw new SyntaxError("Unexpected end of input"); } diff --git a/packages/jinja/src/runtime.ts b/packages/jinja/src/runtime.ts index da02dffee..ec44705ef 100644 --- a/packages/jinja/src/runtime.ts +++ b/packages/jinja/src/runtime.ts @@ -486,12 +486,7 @@ export class Interpreter { case "NumericLiteral": return new NumericValue(Number((statement as NumericLiteral).value)); case "StringLiteral": - return new StringValue( - (statement as StringLiteral).value - // Unescape special characters - .replaceAll("\\n", "\n") - .replaceAll("\\t", "\t") - ); + return new StringValue((statement as StringLiteral).value); case "BooleanLiteral": return new BooleanValue((statement as BooleanLiteral).value); case "Identifier": diff --git a/packages/jinja/test/templates.test.js b/packages/jinja/test/templates.test.js index 1525cd98c..10e4d2f1b 100644 --- a/packages/jinja/test/templates.test.js +++ b/packages/jinja/test/templates.test.js @@ -60,6 +60,9 @@ const TEST_STRINGS = { MEMBERSHIP: `|{{ 0 in arr }}|{{ 1 in arr }}|{{ true in arr }}|{{ false in arr }}|{{ 'a' in arr }}|{{ 'b' in arr }}|`, MEMBERSHIP_NEGATION_1: `|{{ not 0 in arr }}|{{ not 1 in arr }}|{{ not true in arr }}|{{ not false in arr }}|{{ not 'a' in arr }}|{{ not 'b' in arr }}|`, MEMBERSHIP_NEGATION_2: `|{{ 0 not in arr }}|{{ 1 not in arr }}|{{ true not in arr }}|{{ false not in arr }}|{{ 'a' not in arr }}|{{ 'b' not in arr }}|`, + + // Escaped characters + ESCAPED_CHARS: `{{ '\\n' }}{{ '\\t' }}{{ '\\'' }}{{ '\\"' }}{{ '\\\\' }}{{ '|\\n|\\t|\\'|\\"|\\\\|' }}`, }; const TEST_PARSED = { @@ -1019,6 +1022,28 @@ const TEST_PARSED = { { value: "}}", type: "CloseExpression" }, { value: "|", type: "Text" }, ], + + // Escaped characters + ESCAPED_CHARS: [ + { value: "{{", type: "OpenExpression" }, + { value: "\n", type: "StringLiteral" }, + { value: "}}", type: "CloseExpression" }, + { value: "{{", type: "OpenExpression" }, + { value: "\t", type: "StringLiteral" }, + { value: "}}", type: "CloseExpression" }, + { value: "{{", type: "OpenExpression" }, + { value: "'", type: "StringLiteral" }, + { value: "}}", type: "CloseExpression" }, + { value: "{{", type: "OpenExpression" }, + { value: '"', type: "StringLiteral" }, + { value: "}}", type: "CloseExpression" }, + { value: "{{", type: "OpenExpression" }, + { value: "\\", type: "StringLiteral" }, + { value: "}}", type: "CloseExpression" }, + { value: "{{", type: "OpenExpression" }, + { value: `|\n|\t|'|"|\\|`, type: "StringLiteral" }, + { value: "}}", type: "CloseExpression" }, + ], }; const TEST_CONTEXT = { @@ -1115,6 +1140,9 @@ const TEST_CONTEXT = { MEMBERSHIP_NEGATION_2: { arr: [0, true, "a"], }, + + // Escaped characters + ESCAPED_CHARS: {}, }; const EXPECTED_OUTPUTS = { @@ -1173,6 +1201,11 @@ const EXPECTED_OUTPUTS = { MEMBERSHIP: "|true|false|true|false|true|false|", MEMBERSHIP_NEGATION_1: "|false|true|false|true|false|true|", MEMBERSHIP_NEGATION_2: "|false|true|false|true|false|true|", + + // Escaped characters + // NOTE: Since `trim_blocks` is enabled, we remove the first newline after the template tag, + // meaning the first newline in the output is not present + ESCAPED_CHARS: `\t'"\\|\n|\t|'|"|\\|`, }; describe("Templates", () => {