From 8aa500a1f0a59abe22d4ef5e6842d563510dd594 Mon Sep 17 00:00:00 2001 From: Mark Callow <2244683+MarkCallow@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:15:13 +0900 Subject: [PATCH] Fix: handle spaces and non-latin characters in filenames in loadtest apps. (#949) Non-latin characters were a problem only for vkloadtests because only it has a text overlay. --- .../VulkanAppSDL/vulkantextoverlay.hpp | 129 +++++++++++++++++- .../glloadtests/shader-based/GL3LoadTests.cpp | 6 + tests/loadtests/vkloadtests.cmake | 6 + .../loadtests/vkloadtests/VulkanLoadTests.cpp | 5 + utils/argparser.cpp | 50 ++++++- 5 files changed, 183 insertions(+), 13 deletions(-) diff --git a/tests/loadtests/appfwSDL/VulkanAppSDL/vulkantextoverlay.hpp b/tests/loadtests/appfwSDL/VulkanAppSDL/vulkantextoverlay.hpp index 6ff559da44..0a0d236f00 100644 --- a/tests/loadtests/appfwSDL/VulkanAppSDL/vulkantextoverlay.hpp +++ b/tests/loadtests/appfwSDL/VulkanAppSDL/vulkantextoverlay.hpp @@ -29,10 +29,113 @@ #define STB_FONT_HEIGHT STB_FONT_consolas_24_latin1_BITMAP_HEIGHT #define STB_FIRST_CHAR STB_FONT_consolas_24_latin1_FIRST_CHAR #define STB_NUM_CHARS STB_FONT_consolas_24_latin1_NUM_CHARS +#define STB_MISSING_GLPYH 0x80 // Actually a control character. // Max. number of chars the text overlay buffer can hold #define MAX_CHAR_COUNT 1024 + +/** + * @internal + * @brief Given the lead byte of a UTF-8 sequence returns the expected length of the codepoint + * @param[in] leadByte The lead byte of a UTF-8 sequence + * @return The expected length of the codepoint */ +[[nodiscard]] constexpr inline int sequenceLength(uint8_t leadByte) noexcept { + if ((leadByte & 0b1000'0000u) == 0b0000'0000u) + return 1; + if ((leadByte & 0b1110'0000u) == 0b1100'0000u) + return 2; + if ((leadByte & 0b1111'0000u) == 0b1110'0000u) + return 3; + if ((leadByte & 0b1111'1000u) == 0b1111'0000u) + return 4; + + return 0; +} + +/** + * @internal + * @brief Checks if the codepoint was coded as a longer than required sequence + * @param[in] codepoint The unicode codepoint + * @param[in] length The UTF-8 sequence length + * @return True if the sequence length was inappropriate for the given codepoint */ +[[nodiscard]] constexpr inline bool isOverlongSequence(uint32_t codepoint, int length) noexcept { + if (codepoint < 0x80) + return length != 1; + else if (codepoint < 0x800) + return length != 2; + else if (codepoint < 0x10000) + return length != 3; + else + return false; +} + +/** + * @internal + * @brief Checks if the codepoint is valid + * @param[in] codepoint The unicode codepoint + * @return True if the codepoint is a valid unicode codepoint */ +[[nodiscard]] constexpr inline bool isCodepointValid(uint32_t codepoint) noexcept { + return codepoint <= 0x0010FFFFu + && !(0xD800u <= codepoint && codepoint <= 0xDBFFu); +} + +/** + * @internal + * @brief Safely checks and advances a UTF-8 sequence iterator to the start of the next unicode codepoint + * @param[in] it iterator to be advanced + * @param[in] end iterator pointing to the end of the range + * @return True if the advance operation was successful and the advanced codepoint was a valid UTF-8 sequence */ +template +[[nodiscard]] constexpr bool advanceUTF8(Iterator& it, Iterator end, + uint32_t& codepoint) noexcept { + if (it == end) + return false; + + const auto length = sequenceLength(*it); + if (length == 0) + return false; + + if (std::distance(it, end) < length) + return false; + + for (int i = 1; i < length; ++i) { + const auto trailByte = *(it + i); + if ((static_cast(trailByte) & 0b1100'0000u) != 0b1000'0000u) + return false; + } + + codepoint = 0; + switch (length) { + case 1: + codepoint |= *it++; + break; + case 2: + codepoint |= (*it++ & 0b0001'1111u) << 6u; + codepoint |= (*it++ & 0b0011'1111u); + break; + case 3: + codepoint |= (*it++ & 0b0000'1111u) << 12u; + codepoint |= (*it++ & 0b0011'1111u) << 6u; + codepoint |= (*it++ & 0b0011'1111u); + break; + case 4: + codepoint |= (*it++ & 0b0000'0111u) << 18u; + codepoint |= (*it++ & 0b0011'1111u) << 12u; + codepoint |= (*it++ & 0b0011'1111u) << 6u; + codepoint |= (*it++ & 0b0011'1111u); + break; + } + + if (!isCodepointValid(codepoint)) + return false; + + if (isOverlongSequence(codepoint, length)) + return false; + + return true; +} + // Mostly self-contained text overlay class // todo : comment class VulkanTextOverlay @@ -91,6 +194,7 @@ class VulkanTextOverlay // todo : throw error return 0; } + public: enum TextAlign { alignLeft, alignCenter, alignRight }; @@ -573,12 +677,20 @@ class VulkanTextOverlay // Calculate text width float textWidth = 0; - for (auto letter : text) + uint32_t codepoint; + auto it = text.begin(); + while (it != text.end()) { - stb_fontchar *charData = &stbFontData[(uint32_t)letter - STB_FIRST_CHAR]; + if (!advanceUTF8(it, text.end(), codepoint)) + break; + // TODO: Get a UTF8 font. Consider changing to Dear ImGUI + // https://github.com/ocornut/imgui. + // Placeholder to avoid crashing. + if (codepoint > STB_NUM_CHARS + STB_FIRST_CHAR) + codepoint = STB_MISSING_GLPYH; + stb_fontchar *charData = &stbFontData[(uint32_t)codepoint - STB_FIRST_CHAR]; textWidth += charData->advance * charW; } - switch (align) { case alignRight: @@ -592,9 +704,15 @@ class VulkanTextOverlay } // Generate a uv mapped quad per char in the new text - for (auto letter : text) + it = text.begin(); + while (it != text.end()) { - stb_fontchar *charData = &stbFontData[(uint32_t)letter - STB_FIRST_CHAR]; + if (!advanceUTF8(it, text.end(), codepoint)) + break; + if (codepoint > STB_NUM_CHARS + STB_FIRST_CHAR) + codepoint = STB_MISSING_GLPYH; + + stb_fontchar *charData = &stbFontData[(uint32_t)codepoint - STB_FIRST_CHAR]; mappedLocal->x = (x + (float)charData->x0 * charW); mappedLocal->y = (y + (float)charData->y0 * charH); @@ -628,7 +746,6 @@ class VulkanTextOverlay break; // Truncate the text. } } - // Unmap buffer and update command buffers void endTextUpdate() { diff --git a/tests/loadtests/glloadtests/shader-based/GL3LoadTests.cpp b/tests/loadtests/glloadtests/shader-based/GL3LoadTests.cpp index edd4057c98..87ab152ab0 100644 --- a/tests/loadtests/glloadtests/shader-based/GL3LoadTests.cpp +++ b/tests/loadtests/glloadtests/shader-based/GL3LoadTests.cpp @@ -15,6 +15,9 @@ * OpenGL ES 3.x */ +#include +#include + #include "GLLoadTests.h" #include "EncodeTexture.h" #include "DrawTexture.h" @@ -71,6 +74,9 @@ GLLoadTests::showFile(std::string& filename) createViewer = DrawTexture::create; } ktxTexture_Destroy(kTexture); + + // Escape any spaces in filename. + filename = std::regex_replace( filename, std::regex(" "), "\\ " ); std::string args = "--external " + filename; pViewer = createViewer(w_width, w_height, args.c_str(), sBasePath); return pViewer; diff --git a/tests/loadtests/vkloadtests.cmake b/tests/loadtests/vkloadtests.cmake index 3ebf51ee1b..b36f756187 100644 --- a/tests/loadtests/vkloadtests.cmake +++ b/tests/loadtests/vkloadtests.cmake @@ -165,6 +165,12 @@ add_executable( vkloadtests set_code_sign(vkloadtests) +# If VulkanAppSDL is ever made into its own target change the target here. +target_compile_features(vkloadtests +PRIVATE + cxx_std_14 +) + target_include_directories(vkloadtests PRIVATE ${SDL2_INCLUDE_DIRS} diff --git a/tests/loadtests/vkloadtests/VulkanLoadTests.cpp b/tests/loadtests/vkloadtests/VulkanLoadTests.cpp index 50d2124bb5..116c5713a6 100644 --- a/tests/loadtests/vkloadtests/VulkanLoadTests.cpp +++ b/tests/loadtests/vkloadtests/VulkanLoadTests.cpp @@ -20,6 +20,8 @@ #define _USE_MATH_DEFINES #include +#include +#include #include "VulkanLoadTests.h" #include "Texture.h" @@ -355,6 +357,9 @@ VulkanLoadTests::showFile(std::string& filename) createViewer = Texture::create; } ktxTexture_Destroy(kTexture); + + // Escape any spaces in filename. + filename = std::regex_replace( filename, std::regex(" "), "\\ " ); std::string args = "--external " + filename; pViewer = createViewer(vkctx, w_width, w_height, args.c_str(), sBasePath); return pViewer; diff --git a/utils/argparser.cpp b/utils/argparser.cpp index 48620971f7..9c2e260add 100644 --- a/utils/argparser.cpp +++ b/utils/argparser.cpp @@ -21,7 +21,9 @@ */ #include +#include #include "argparser.h" +#include using namespace std; @@ -30,18 +32,52 @@ using namespace std; */ argvector::argvector(const string& sArgs) { - const string sep(" \t\n\r\v\f"); + const string sep(" \t"); + // std::regex does not support negative lookbehind assertions. Darn! + // RE to extract argument between string start and separator. + // - Match 0 is whole matching string including separator. + // - Match 1 is the argument. + // - Match 2 is an empty string or a backslash. + // Use negative character class as it is easiest way to handle + // utf-8 file names with non-Latin characters. $ must not be last + // in the character class or it will be taken literally not as + // end of string. + // Use raw literal to avoid excess backslashes. + regex re(R"--(^([^$\\ \t]+)(\\?)(?:[ \t]+|$))--"); size_t pos; pos = sArgs.find_first_not_of(sep); assert(pos != string::npos); - do { - size_t epos = sArgs.find_first_of(sep, pos); - size_t len = epos == string::npos ? epos : epos - pos; - push_back(sArgs.substr(pos, len)); - pos = sArgs.find_first_not_of(sep, epos); - } while (pos != string::npos); + auto first = sArgs.begin() + pos; + auto last = sArgs.end(); + bool continuation = false; + for (smatch sm; regex_search(first, last, sm, re);) { + bool needContinuation = false; +#if DEBUG_REGEX + std::cout << "prefix: " << sm.prefix() << '\n'; + std::cout << "suffix: " << sm.suffix() << '\n'; + std::cout << "match size: " << sm.size() << '\n'; + for(uint32_t i = 0; i < sm.size(); i++) { + std::cout << "match " << i << ": " << "\"" << sm.str(i) << "\"" << '\n'; + } +#endif + string arg; + // All this because std::regex does not support negative + // lookbehind assertions. + arg = sm.str(1); + if (*sm[2].first == '\\') { + arg += " "; + needContinuation = true; + } + if (continuation) { + this->back() += arg; + } else { + push_back(arg); + } + continuation = needContinuation; + first = sm.suffix().first; + } } /*