From 8aa500a1f0a59abe22d4ef5e6842d563510dd594 Mon Sep 17 00:00:00 2001
From: Mark Callow <2244683+MarkCallow@users.noreply.github.com>
Date: Thu, 10 Oct 2024 11:15:13 +0900
Subject: [PATCH] Fix: handle spaces and non-latin characters in filenames in
 loadtest apps. (#949)

Non-latin characters were a problem only for vkloadtests because only it
has a text overlay.
---
 .../VulkanAppSDL/vulkantextoverlay.hpp        | 129 +++++++++++++++++-
 .../glloadtests/shader-based/GL3LoadTests.cpp |   6 +
 tests/loadtests/vkloadtests.cmake             |   6 +
 .../loadtests/vkloadtests/VulkanLoadTests.cpp |   5 +
 utils/argparser.cpp                           |  50 ++++++-
 5 files changed, 183 insertions(+), 13 deletions(-)
diff --git a/tests/loadtests/appfwSDL/VulkanAppSDL/vulkantextoverlay.hpp b/tests/loadtests/appfwSDL/VulkanAppSDL/vulkantextoverlay.hpp
index 6ff559da44..0a0d236f00 100644
--- a/tests/loadtests/appfwSDL/VulkanAppSDL/vulkantextoverlay.hpp
+++ b/tests/loadtests/appfwSDL/VulkanAppSDL/vulkantextoverlay.hpp
@@ -29,10 +29,113 @@
 #define STB_FONT_HEIGHT STB_FONT_consolas_24_latin1_BITMAP_HEIGHT 
 #define STB_FIRST_CHAR STB_FONT_consolas_24_latin1_FIRST_CHAR
 #define STB_NUM_CHARS STB_FONT_consolas_24_latin1_NUM_CHARS
+#define STB_MISSING_GLPYH 0x80 // Actually a control character.
 
 // Max. number of chars the text overlay buffer can hold
 #define MAX_CHAR_COUNT 1024
 
+
+/**
+ * @internal
+ * @brief Given the lead byte of a UTF-8 sequence returns the expected length of the codepoint
+ * @param[in] leadByte The lead byte of a UTF-8 sequence
+ * @return The expected length of the codepoint */
+[[nodiscard]] constexpr inline int sequenceLength(uint8_t leadByte) noexcept {
+    if ((leadByte & 0b1000'0000u) == 0b0000'0000u)
+        return 1;
+    if ((leadByte & 0b1110'0000u) == 0b1100'0000u)
+        return 2;
+    if ((leadByte & 0b1111'0000u) == 0b1110'0000u)
+        return 3;
+    if ((leadByte & 0b1111'1000u) == 0b1111'0000u)
+        return 4;
+
+    return 0;
+}
+
+/**
+ * @internal
+ * @brief Checks if the codepoint was coded as a longer than required sequence
+ * @param[in] codepoint The unicode codepoint
+ * @param[in] length The UTF-8 sequence length
+ * @return True if the sequence length was inappropriate for the given codepoint */
+[[nodiscard]] constexpr inline bool isOverlongSequence(uint32_t codepoint, int length) noexcept {
+    if (codepoint < 0x80)
+        return length != 1;
+    else if (codepoint < 0x800)
+        return length != 2;
+    else if (codepoint < 0x10000)
+        return length != 3;
+    else
+        return false;
+}
+
+/**
+ * @internal
+ * @brief Checks if the codepoint is valid
+ * @param[in] codepoint The unicode codepoint
+ * @return True if the codepoint is a valid unicode codepoint */
+[[nodiscard]] constexpr inline bool isCodepointValid(uint32_t codepoint) noexcept {
+    return codepoint <= 0x0010FFFFu
+            && !(0xD800u <= codepoint && codepoint <= 0xDBFFu);
+}
+
+/**
+ * @internal
+ * @brief Safely checks and advances a UTF-8 sequence iterator to the start of the next unicode codepoint
+ * @param[in] it iterator to be advanced
+ * @param[in] end iterator pointing to the end of the range
+ * @return True if the advance operation was successful and the advanced codepoint was a valid UTF-8 sequence */
+template <typename Iterator>
+[[nodiscard]] constexpr bool advanceUTF8(Iterator& it, Iterator end,
+                                         uint32_t& codepoint) noexcept {
+    if (it == end)
+        return false;
+
+    const auto length = sequenceLength(*it);
+    if (length == 0)
+        return false;
+
+    if (std::distance(it, end) < length)
+        return false;
+
+    for (int i = 1; i < length; ++i) {
+        const auto trailByte = *(it + i);
+        if ((static_cast<uint8_t>(trailByte) & 0b1100'0000u) != 0b1000'0000u)
+            return false;
+    }
+
+    codepoint = 0;
+    switch (length) {
+    case 1:
+        codepoint |= *it++;
+        break;
+    case 2:
+        codepoint |= (*it++ & 0b0001'1111u) << 6u;
+        codepoint |= (*it++ & 0b0011'1111u);
+        break;
+    case 3:
+        codepoint |= (*it++ & 0b0000'1111u) << 12u;
+        codepoint |= (*it++ & 0b0011'1111u) << 6u;
+        codepoint |= (*it++ & 0b0011'1111u);
+        break;
+    case 4:
+        codepoint |= (*it++ & 0b0000'0111u) << 18u;
+        codepoint |= (*it++ & 0b0011'1111u) << 12u;
+        codepoint |= (*it++ & 0b0011'1111u) << 6u;
+        codepoint |= (*it++ & 0b0011'1111u);
+        break;
+    }
+
+    if (!isCodepointValid(codepoint))
+        return false;
+
+    if (isOverlongSequence(codepoint, length))
+        return false;
+
+    return true;
+}
+
 // Mostly self-contained text overlay class
 // todo : comment
 class VulkanTextOverlay
@@ -91,6 +194,7 @@ class VulkanTextOverlay
         // todo : throw error
         return 0;
     }
+
 public:
 
     enum TextAlign { alignLeft, alignCenter, alignRight };
@@ -573,12 +677,20 @@ class VulkanTextOverlay
 
         // Calculate text width
         float textWidth = 0;
-        for (auto letter : text)
+        uint32_t codepoint;
+        auto it = text.begin();
+        while (it != text.end())
         {
-            stb_fontchar *charData = &stbFontData[(uint32_t)letter - STB_FIRST_CHAR];
+            if (!advanceUTF8(it, text.end(), codepoint))
+                break;
+            // TODO: Get a UTF8 font. Consider changing to Dear ImGUI
+            // https://github.com/ocornut/imgui.
+            // Placeholder to avoid crashing.
+            if (codepoint > STB_NUM_CHARS + STB_FIRST_CHAR)
+                codepoint = STB_MISSING_GLPYH;
+            stb_fontchar *charData = &stbFontData[(uint32_t)codepoint - STB_FIRST_CHAR];
             textWidth += charData->advance * charW;
         }
-
         switch (align)
         {
         case alignRight:
@@ -592,9 +704,15 @@ class VulkanTextOverlay
         }
 
         // Generate a uv mapped quad per char in the new text
-        for (auto letter : text)
+        it = text.begin();
+        while (it != text.end())
         {
-            stb_fontchar *charData = &stbFontData[(uint32_t)letter - STB_FIRST_CHAR];
+            if (!advanceUTF8(it, text.end(), codepoint))
+                break;
+            if (codepoint > STB_NUM_CHARS + STB_FIRST_CHAR)
+                codepoint = STB_MISSING_GLPYH;
+
+            stb_fontchar *charData = &stbFontData[(uint32_t)codepoint - STB_FIRST_CHAR];
 
             mappedLocal->x = (x + (float)charData->x0 * charW);
             mappedLocal->y = (y + (float)charData->y0 * charH);
@@ -628,7 +746,6 @@ class VulkanTextOverlay
                 break; // Truncate the text.
         }
     }
-
     // Unmap buffer and update command buffers
     void endTextUpdate()
     {
diff --git a/tests/loadtests/glloadtests/shader-based/GL3LoadTests.cpp b/tests/loadtests/glloadtests/shader-based/GL3LoadTests.cpp
index edd4057c98..87ab152ab0 100644
--- a/tests/loadtests/glloadtests/shader-based/GL3LoadTests.cpp
+++ b/tests/loadtests/glloadtests/shader-based/GL3LoadTests.cpp
@@ -15,6 +15,9 @@
  *        OpenGL ES 3.x
  */
 
+#include <string>
+#include <regex>
+
 #include "GLLoadTests.h"
 #include "EncodeTexture.h"
 #include "DrawTexture.h"
@@ -71,6 +74,9 @@ GLLoadTests::showFile(std::string& filename)
         createViewer = DrawTexture::create;
     }
     ktxTexture_Destroy(kTexture);
+
+    // Escape any spaces in filename.
+    filename = std::regex_replace( filename, std::regex(" "), "\\ " );
     std::string args = "--external " + filename;
     pViewer = createViewer(w_width, w_height, args.c_str(), sBasePath);
     return pViewer;
diff --git a/tests/loadtests/vkloadtests.cmake b/tests/loadtests/vkloadtests.cmake
index 3ebf51ee1b..b36f756187 100644
--- a/tests/loadtests/vkloadtests.cmake
+++ b/tests/loadtests/vkloadtests.cmake
@@ -165,6 +165,12 @@ add_executable( vkloadtests
 
 set_code_sign(vkloadtests)
 
+# If VulkanAppSDL is ever made into its own target change the target here.
+target_compile_features(vkloadtests
+PRIVATE
+    cxx_std_14
+)
+
 target_include_directories(vkloadtests
 PRIVATE
     ${SDL2_INCLUDE_DIRS}
diff --git a/tests/loadtests/vkloadtests/VulkanLoadTests.cpp b/tests/loadtests/vkloadtests/VulkanLoadTests.cpp
index 50d2124bb5..116c5713a6 100644
--- a/tests/loadtests/vkloadtests/VulkanLoadTests.cpp
+++ b/tests/loadtests/vkloadtests/VulkanLoadTests.cpp
@@ -20,6 +20,8 @@
 
 #define _USE_MATH_DEFINES
 #include <math.h>
+#include <string>
+#include <regex>
 
 #include "VulkanLoadTests.h"
 #include "Texture.h"
@@ -355,6 +357,9 @@ VulkanLoadTests::showFile(std::string& filename)
         createViewer = Texture::create;
     }
     ktxTexture_Destroy(kTexture);
+
+    // Escape any spaces in filename.
+    filename = std::regex_replace( filename, std::regex(" "), "\\ " );
     std::string args = "--external " + filename;
     pViewer = createViewer(vkctx, w_width, w_height, args.c_str(), sBasePath);
     return pViewer;
diff --git a/utils/argparser.cpp b/utils/argparser.cpp
index 48620971f7..9c2e260add 100644
--- a/utils/argparser.cpp
+++ b/utils/argparser.cpp
@@ -21,7 +21,9 @@
  */
 
 #include <assert.h>
+#include <iostream>
 #include "argparser.h"
+#include <regex>
 
 using namespace std;
 
@@ -30,18 +32,52 @@ using namespace std;
  */
 argvector::argvector(const string& sArgs)
 {
-    const string sep(" \t\n\r\v\f");
+    const string sep(" \t");
+    // std::regex does not support negative lookbehind assertions. Darn!
+    // RE to extract argument between string start and separator.
+    // - Match 0 is whole matching string including separator.
+    // - Match 1 is the argument.
+    // - Match 2 is an empty string or a backslash.
+    // Use negative character class as it is easiest way to handle
+    // utf-8 file names with non-Latin characters. $ must not be last
+    // in the character class or it will be taken literally not as
+    // end of string.
+    // Use raw literal to avoid excess backslashes.
+    regex re(R"--(^([^$\\ \t]+)(\\?)(?:[ \t]+|$))--");
     size_t pos;
 
     pos = sArgs.find_first_not_of(sep);
     assert(pos != string::npos);
 
-    do {
-        size_t epos = sArgs.find_first_of(sep, pos);
-        size_t len = epos == string::npos ? epos : epos - pos;
-        push_back(sArgs.substr(pos, len));
-        pos = sArgs.find_first_not_of(sep, epos);
-    } while (pos != string::npos);
+    auto first = sArgs.begin() + pos;
+    auto last = sArgs.end();
+    bool continuation = false;
+    for (smatch sm; regex_search(first, last, sm, re);) {
+        bool needContinuation = false;
+#if DEBUG_REGEX
+        std::cout << "prefix: " << sm.prefix() << '\n';
+        std::cout << "suffix: " << sm.suffix() << '\n';
+        std::cout << "match size: " << sm.size() << '\n';
+        for(uint32_t i = 0; i < sm.size(); i++) {
+            std::cout << "match " << i << ": " << "\"" << sm.str(i) << "\"" << '\n';
+        }
+#endif
+        string arg;
+        // All this because std::regex does not support negative
+        // lookbehind assertions.
+        arg = sm.str(1);
+        if (*sm[2].first == '\\') {
+            arg += " ";
+            needContinuation = true;
+        }
+        if (continuation) {
+            this->back() += arg;
+        } else {
+            push_back(arg);
+        }
+        continuation = needContinuation;
+        first = sm.suffix().first;
+    }
 }
 
 /*