Add Win32API-based encoding option to C++ runtime

kaitai-io · Jul 12, 2023 · 058cfdc · 058cfdc
1 parent cd8e2d0
commit 058cfdc
Show file tree

Hide file tree

Showing 6 changed files with 218 additions and 19 deletions.
diff --git a/.build/build.ps1 b/.build/build.ps1
@@ -11,7 +11,9 @@ Requires:
 [CmdletBinding()]
 param (
     [Parameter(Mandatory=$true)]
-    [string] $GTestPath
+    [string] $GTestPath,
+    [Parameter(Mandatory=$false)]
+    [string] $EncodingType = "WIN32API"
 )
 
 # Standard boilerplate
@@ -26,7 +28,7 @@ Push-Location $repoRoot
 $null = New-Item build -ItemType Directory -Force
 cd build
 
-cmake -DCMAKE_PREFIX_PATH="$GTestPath" -DSTRING_ENCODING_TYPE=NONE ..
+cmake -DCMAKE_PREFIX_PATH="$GTestPath" -DSTRING_ENCODING_TYPE="$EncodingType" ..
 cmake --build . --config Debug
 cp $GTestPath\debug\bin\*.dll tests\Debug
 cp Debug\kaitai_struct_cpp_stl_runtime.dll tests\Debug

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,5 +1,5 @@
-project (kaitai_struct_cpp_stl_runtime CXX)
 cmake_minimum_required (VERSION 3.3)
+project (kaitai_struct_cpp_stl_runtime CXX)
 enable_testing()
 
 set (CMAKE_INCLUDE_CURRENT_DIR ON)
@@ -26,7 +26,7 @@ set (SOURCES
     kaitai/kaitaistream.cpp
 )
 
-set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|NONE|...)")
+set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|WIN32API|NONE|...)")
 
 set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
 

diff --git a/Common.cmake b/Common.cmake
@@ -1,5 +1,7 @@
 if (STRING_ENCODING_TYPE STREQUAL "ICONV")
     target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_ICONV)
+elseif (STRING_ENCODING_TYPE STREQUAL "WIN32API")
+    target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_WIN32API)
 elseif (STRING_ENCODING_TYPE STREQUAL "NONE")
     target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_NONE)
 else()

diff --git a/kaitai/kaitaistream.cpp b/kaitai/kaitaistream.cpp
@@ -635,9 +635,10 @@ uint8_t kaitai::kstream::byte_array_max(const std::string val) {
 #include <iconv.h>
 #include <cerrno>
 #include <stdexcept>
+#include "kaitaistream.h"
 
-std::string kaitai::kstream::bytes_to_str(std::string src, std::string src_enc) {
-    iconv_t cd = iconv_open(KS_STR_DEFAULT_ENCODING, src_enc.c_str());
+std::string kaitai::kstream::bytes_to_str(const std::string src, const char* src_enc) {
+    iconv_t cd = iconv_open(KS_STR_DEFAULT_ENCODING, src_enc);
 
     if (cd == (iconv_t)-1) {
         if (errno == EINVAL) {
@@ -655,7 +656,9 @@ std::string kaitai::kstream::bytes_to_str(std::string src, std::string src_enc)
     std::string dst(dst_len, ' ');
     size_t dst_left = dst_len;
 
-    char *src_ptr = &src[0];
+    // NB: this should be const char*, but for some reason iconv() requires non-const in its 2nd argument,
+    // so we force it with a cast.
+    char *src_ptr = const_cast<char*>(src.data());
     char *dst_ptr = &dst[0];
 
     while (true) {
@@ -691,9 +694,143 @@ std::string kaitai::kstream::bytes_to_str(std::string src, std::string src_enc)
     return dst;
 }
 #elif defined(KS_STR_ENCODING_NONE)
-std::string kaitai::kstream::bytes_to_str(std::string src, std::string src_enc) {
+std::string kaitai::kstream::bytes_to_str(const std::string src, const char* src_enc) {
     return src;
 }
+#elif defined(KS_STR_ENCODING_WIN32API)
+#include <windows.h>
+#include <limits>
+
+// Unbreak std::numeric_limits<T>::max, as otherwise MSVC substitutes "useful" max() macro.
+#undef max
+
+/**
+ * Maps an encoding name to a Windows codepage identifier.
+ *
+ * Names are compared exactly (case-sensitive) against a fixed table of supported encodings.
+ *
+ * @param src_enc NUL-terminated encoding name; a null pointer is treated like an unknown name
+ * @return Windows codepage number, KAITAI_CP_UTF16LE / KAITAI_CP_UTF16BE for the UTF-16 encodings,
+ *     or KAITAI_CP_UNSUPPORTED if the name is not in the table
+ */
+int kaitai::kstream::encoding_to_win_codepage(const char* src_enc) {
+    // Building a std::string from a null pointer is undefined behavior, so reject it up front.
+    if (src_enc == 0) {
+        return KAITAI_CP_UNSUPPORTED;
+    }
+
+    struct known_encoding {
+        const char* name;
+        int codepage;
+    };
+
+    // Keep this table in sync with the list of encoding names accepted by the other runtimes.
+    static const known_encoding known_encodings[] = {
+        {"UTF-8", CP_UTF8},
+        {"UTF-16LE", KAITAI_CP_UTF16LE},
+        {"UTF-16BE", KAITAI_CP_UTF16BE},
+        {"IBM437", 437},
+        {"IBM850", 850},
+        {"SHIFT_JIS", 932},
+        {"GB2312", 936},
+        {"ASCII", 20127},
+        {"EUC-JP", 20932},
+        {"ISO-8859-1", 28591},
+        {"ISO-8859-2", 28592},
+        {"ISO-8859-3", 28593},
+        {"ISO-8859-4", 28594},
+        {"ISO-8859-5", 28595},
+        {"ISO-8859-6", 28596},
+        {"ISO-8859-7", 28597},
+        {"ISO-8859-8", 28598},
+        {"ISO-8859-9", 28599},
+        {"ISO-8859-10", 28600},
+        {"ISO-8859-11", 28601},
+        {"ISO-8859-13", 28603},
+        {"ISO-8859-14", 28604},
+        {"ISO-8859-15", 28605},
+        {"ISO-8859-16", 28606},
+    };
+
+    const std::string enc(src_enc);
+    const size_t known_count = sizeof(known_encodings) / sizeof(known_encodings[0]);
+    for (size_t i = 0; i < known_count; i++) {
+        if (enc == known_encodings[i].name) {
+            return known_encodings[i].codepage;
+        }
+    }
+
+    return KAITAI_CP_UNSUPPORTED;
+}
+
+/**
+ * Converts bytes whose encoding is given by name: the name is resolved to a codepage
+ * identifier and the conversion itself is delegated to the codepage-based overload.
+ */
+std::string kaitai::kstream::bytes_to_str(const std::string src, const char* src_enc) {
+    // Name -> codepage lookup happens first; the overload below handles unsupported results.
+    return bytes_to_str(src, encoding_to_win_codepage(src_enc));
+}
+
+/**
+ * Converts bytes in the encoding identified by `codepage` into a UTF-8 string.
+ *
+ * The conversion goes through an intermediate UTF-16 ("wide char") string: the source bytes are
+ * first decoded to UTF-16, which is then encoded as UTF-8.
+ *
+ * @param src bytes to be converted
+ * @param codepage Windows codepage number or member of KAITAI_CP_* enum
+ * @return UTF-8 string; `src` itself if `codepage` is CP_UTF8; an empty string if there is nothing
+ *     to convert
+ * @throws std::runtime_error if `codepage` is KAITAI_CP_UNSUPPORTED, if `src` is longer than
+ *     int32_t can express, or if one of the Win32 conversion calls fails
+ */
+std::string kaitai::kstream::bytes_to_str(const std::string src, int codepage) {
+    if (codepage == KAITAI_CP_UNSUPPORTED) {
+        throw std::runtime_error("bytes_to_str: unsupported encoding name");
+    }
+
+    // Shortcut: if we're already in UTF-8, no need to convert anything
+    if (codepage == CP_UTF8) {
+        return src;
+    }
+
+    // Empty input trivially converts to an empty string. This must be handled explicitly:
+    // MultiByteToWideChar and WideCharToMultiByte both fail (return 0) when given a zero input
+    // length, which would otherwise surface as a bogus "length calculation error".
+    if (src.empty()) {
+        return std::string();
+    }
+
+    // The Win32 conversion functions take lengths as int, so larger buffers can't be passed.
+    if (src.length() > static_cast<std::string::size_type>(std::numeric_limits<int32_t>::max())) {
+        throw std::runtime_error("bytes_to_str: buffers longer than int32_t are unsupported");
+    }
+    const int32_t src_len = static_cast<int32_t>(src.length());
+
+    // Step 2: convert bytes to UTF16 ("wide char") string
+    std::wstring utf16;
+    int32_t utf16_len;
+
+    switch (codepage) {
+    case KAITAI_CP_UTF16LE:
+    case KAITAI_CP_UTF16BE: {
+        // Source is already UTF-16: assemble each code unit from its two bytes instead of
+        // reinterpreting the byte buffer as wchar_t. A trailing odd byte is ignored.
+        const int hi = (codepage == KAITAI_CP_UTF16BE) ? 0 : 1;
+        const int lo = 1 - hi;
+
+        utf16_len = src_len / 2;
+        utf16 = std::wstring(utf16_len, L'\0');
+        for (int32_t i = 0; i < utf16_len; i++) {
+            utf16[i] = static_cast<wchar_t>(
+                (static_cast<uint8_t>(src[i * 2 + hi]) << 8) | static_cast<uint8_t>(src[i * 2 + lo]));
+        }
+        break;
+    }
+    default:
+        // Calculate the length of the UTF16 string
+        utf16_len = MultiByteToWideChar(codepage, 0, src.c_str(), src_len, 0, 0);
+        if (utf16_len == 0) {
+            throw std::runtime_error("bytes_to_str: MultiByteToWideChar length calculation error");
+        }
+
+        // Convert to UTF16 string
+        utf16 = std::wstring(utf16_len, L'\0');
+        if (MultiByteToWideChar(codepage, 0, src.c_str(), src_len, &utf16[0], utf16_len) == 0) {
+            throw std::runtime_error("bytes_to_str: MultiByteToWideChar conversion error");
+        }
+        break;
+    }
+
+    // A single byte of UTF-16 input contains no complete code unit; as with any other trailing
+    // odd byte it is dropped, leaving nothing to encode (WideCharToMultiByte rejects length 0).
+    if (utf16_len == 0) {
+        return std::string();
+    }
+
+    // Step 3: convert UTF16 string to UTF8 string
+
+    // Calculate the length of the UTF8 string
+    const int utf8_len = WideCharToMultiByte(CP_UTF8, 0, &utf16[0], utf16_len, 0, 0, 0, 0);
+    if (utf8_len == 0) {
+        throw std::runtime_error("bytes_to_str: WideCharToMultiByte length calculation error");
+    }
+
+    // Convert to UTF8 string
+    std::string utf8(utf8_len, '\0');
+    if (WideCharToMultiByte(CP_UTF8, 0, &utf16[0], utf16_len, &utf8[0], utf8_len, 0, 0) == 0) {
+        throw std::runtime_error("bytes_to_str: WideCharToMultiByte conversion error");
+    }
+
+    return utf8;
+}
+
 #else
-#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_NONE
+#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_WIN32API, KS_STR_ENCODING_NONE
 #endif
diff --git a/kaitai/kaitaistream.h b/kaitai/kaitaistream.h
@@ -165,7 +165,7 @@ class kstream {
 
     static std::string bytes_strip_right(std::string src, char pad_byte);
     static std::string bytes_terminate(std::string src, char term, bool include);
-    static std::string bytes_to_str(std::string src, std::string src_enc);
+    static std::string bytes_to_str(const std::string src, const char* src_enc);
 
     //@}
 
@@ -319,6 +319,32 @@ class kstream {
 
     static void unsigned_to_decimal(uint64_t number, char *buffer);
 
+#ifdef KS_STR_ENCODING_WIN32API
+    /**
+     * Special codepage identifiers used alongside real Windows codepage numbers. They are
+     * negative so that they stay distinct from the codepage numbers returned for other encodings.
+     */
+    enum {
+        // Encoding name was not recognized; bytes_to_str() throws when given this value
+        KAITAI_CP_UNSUPPORTED = -1,
+        // UTF-16 little-endian; converted by the runtime itself, without MultiByteToWideChar
+        KAITAI_CP_UTF16LE = -2,
+        // UTF-16 big-endian; byte-swapped by the runtime itself, without MultiByteToWideChar
+        KAITAI_CP_UTF16BE = -3,
+    };
+
+    /**
+     * Converts string name of the encoding into a Windows codepage number. We extend the standard Windows codepage
+     * list with a few special meanings (see KAITAI_CP_* enum), reserving negative integer values for that.
+     * The name is compared exactly (case-sensitive) against the list of supported encodings.
+     * @param src_enc string name of the encoding; this should match the canonical name of the encoding as per the
+     *     discussion in https://github.com/kaitai-io/kaitai_struct/issues/116
+     * @return Windows codepage number or member of KAITAI_CP_* enum; KAITAI_CP_UNSUPPORTED if the name is not
+     *     one of the supported encodings.
+     * @see https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
+     */
+    static int encoding_to_win_codepage(const char* src_enc);
+
+    /**
+     * Converts bytes packed in std::string into a UTF-8 string, based on given source encoding indicated by `codepage`.
+     * If `codepage` is CP_UTF8, the bytes are returned as they are, without any conversion or validation.
+     * @param src bytes to be converted
+     * @param codepage Windows codepage number or member of KAITAI_CP_* enum.
+     * @return UTF-8 string
+     * @throws std::runtime_error if `codepage` is KAITAI_CP_UNSUPPORTED, if `src` is too long to be passed to
+     *     the Win32 conversion functions, or if one of those functions reports a failure
+     */
+    static std::string bytes_to_str(const std::string src, int codepage);
+#endif
+
     static const int ZLIB_BUF_SIZE = 128 * 1024;
 };
 

diff --git a/tests/unittest.cpp b/tests/unittest.cpp
@@ -67,29 +67,61 @@ TEST(KaitaiStreamTest, bytes_to_str_ascii)
 }
 
 #ifndef KS_STR_ENCODING_NONE
-TEST(KaitaiStreamTest, bytes_to_str_iso8859_1)
+TEST(KaitaiStreamTest, bytes_to_str_iso_8859_1)
 {
-    std::string res = kaitai::kstream::bytes_to_str("\xC4\xD6\xDC\xE4\xF6\xFC\xDF", "ISO8859-1");
-    EXPECT_EQ(res, "\xC3\x84\xC3\x96\xC3\x9C\xC3\xA4\xC3\xB6\xC3\xBC\xC3\x9F");
+    std::string res = kaitai::kstream::bytes_to_str("\xC4\xD6\xDC\xE4\xF6\xFC\xDF", "ISO-8859-1");
+    EXPECT_EQ(res,
+        "\xC3\x84"  // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
+        "\xC3\x96"  // U+00D6 LATIN CAPITAL LETTER O WITH DIAERESIS
+        "\xC3\x9C"  // U+00DC LATIN CAPITAL LETTER U WITH DIAERESIS
+        "\xC3\xA4"  // U+00E4 LATIN SMALL LETTER A WITH DIAERESIS
+        "\xC3\xB6"  // U+00F6 LATIN SMALL LETTER O WITH DIAERESIS
+        "\xC3\xBC"  // U+00FC LATIN SMALL LETTER U WITH DIAERESIS
+        "\xC3\x9F"  // U+00DF LATIN SMALL LETTER SHARP S
+    );
 }
 
 TEST(KaitaiStreamTest, bytes_to_str_gb2312)
 {
     std::string res = kaitai::kstream::bytes_to_str("\xC4\xE3\xBA\xC3\xCA\xC0\xBD\xE7", "GB2312");
-    EXPECT_EQ(res, "\xE4\xBD\xA0\xE5\xA5\xBD\xE4\xB8\x96\xE7\x95\x8C");
+    EXPECT_EQ(res,
+        "\xE4\xBD\xA0"  // U+4F60 CJK UNIFIED IDEOGRAPH-4F60
+        "\xE5\xA5\xBD"  // U+597D CJK UNIFIED IDEOGRAPH-597D
+        "\xE4\xB8\x96"  // U+4E16 CJK UNIFIED IDEOGRAPH-4E16
+        "\xE7\x95\x8C"  // U+754C CJK UNIFIED IDEOGRAPH-754C
+    );
 }
 
-TEST(KaitaiStreamTest, bytes_to_str_cp437)
+TEST(KaitaiStreamTest, bytes_to_str_ibm437)
 {
-    std::string res = kaitai::kstream::bytes_to_str("\xCC\xB2\x40", "cp437");
-    EXPECT_EQ(res, "\xE2\x95\xA0\xE2\x96\x93\x40");
+    std::string res = kaitai::kstream::bytes_to_str("\xCC\xB2\x40", "IBM437");
+    EXPECT_EQ(res,
+        "\xE2\x95\xA0"  // U+2560 BOX DRAWINGS DOUBLE VERTICAL AND RIGHT
+        "\xE2\x96\x93"  // U+2593 DARK SHADE
+        "\x40"          // U+0040 COMMERCIAL AT
+    );
 }
 
-TEST(KaitaiStreamTest, bytes_to_str_utf16_le)
+TEST(KaitaiStreamTest, bytes_to_str_utf16le)
 {
     // NB: UTF16 bytes representation will have binary zeroes in the middle, so we need to convert it to std::string with explicit length
     std::string res = kaitai::kstream::bytes_to_str(std::string("\x41\x00\x42\x00\x91\x25\x70\x24", 8), "UTF-16LE");
-    EXPECT_EQ(res, "AB\xE2\x96\x91\xE2\x91\xB0");
+    EXPECT_EQ(res,
+        "AB"
+        "\xE2\x96\x91"  // U+2591 LIGHT SHADE
+        "\xE2\x91\xB0"  // U+2470 CIRCLED NUMBER SEVENTEEN
+    );
+}
+
+TEST(KaitaiStreamTest, bytes_to_str_utf16be)
+{
+    // NB: UTF-16 data has zero bytes in the middle, so the input must be built with an explicit length,
+    // otherwise std::string would stop at the first of them
+    const std::string utf16be_bytes("\x00\x41\x00\x42\x25\x91\x24\x70", 8);
+    const std::string expected_utf8 =
+        "AB"
+        "\xE2\x96\x91"  // U+2591 LIGHT SHADE
+        "\xE2\x91\xB0"; // U+2470 CIRCLED NUMBER SEVENTEEN
+
+    std::string res = kaitai::kstream::bytes_to_str(utf16be_bytes, "UTF-16BE");
+    EXPECT_EQ(res, expected_utf8);
+}
 #endif