Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Win32API-based encoding option to C++ runtime #61

Merged
merged 4 commits into from
Jul 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .build/build.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ Requires:
[CmdletBinding()]
param (
[Parameter(Mandatory=$true)]
[string] $GTestPath
[string] $GTestPath,
[Parameter(Mandatory=$false)]
[string] $EncodingType = "WIN32API"
)

# Standard boilerplate
Expand All @@ -26,7 +28,7 @@ Push-Location $repoRoot
$null = New-Item build -ItemType Directory -Force
cd build

cmake -DCMAKE_PREFIX_PATH="$GTestPath" -DSTRING_ENCODING_TYPE=NONE ..
cmake -DCMAKE_PREFIX_PATH="$GTestPath" -DSTRING_ENCODING_TYPE="$EncodingType" ..
cmake --build . --config Debug
cp $GTestPath\debug\bin\*.dll tests\Debug
cp Debug\kaitai_struct_cpp_stl_runtime.dll tests\Debug
Expand Down
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
project (kaitai_struct_cpp_stl_runtime CXX)
cmake_minimum_required (VERSION 3.3)
project (kaitai_struct_cpp_stl_runtime CXX)
enable_testing()

set (CMAKE_INCLUDE_CURRENT_DIR ON)
Expand All @@ -26,7 +26,7 @@ set (SOURCES
kaitai/kaitaistream.cpp
)

set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|NONE|...)")
set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|WIN32API|NONE|...)")

set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)

Expand Down
2 changes: 2 additions & 0 deletions Common.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
if (STRING_ENCODING_TYPE STREQUAL "ICONV")
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_ICONV)
elseif (STRING_ENCODING_TYPE STREQUAL "WIN32API")
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_WIN32API)
elseif (STRING_ENCODING_TYPE STREQUAL "NONE")
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_NONE)
else()
Expand Down
146 changes: 141 additions & 5 deletions kaitai/kaitaistream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -636,8 +636,8 @@ uint8_t kaitai::kstream::byte_array_max(const std::string val) {
#include <cerrno>
#include <stdexcept>

std::string kaitai::kstream::bytes_to_str(std::string src, std::string src_enc) {
iconv_t cd = iconv_open(KS_STR_DEFAULT_ENCODING, src_enc.c_str());
std::string kaitai::kstream::bytes_to_str(const std::string src, const char *src_enc) {
iconv_t cd = iconv_open(KS_STR_DEFAULT_ENCODING, src_enc);

if (cd == (iconv_t)-1) {
if (errno == EINVAL) {
Expand All @@ -655,7 +655,9 @@ std::string kaitai::kstream::bytes_to_str(std::string src, std::string src_enc)
std::string dst(dst_len, ' ');
size_t dst_left = dst_len;

char *src_ptr = &src[0];
// NB: this should be const char *, but for some reason iconv() requires non-const in its 2nd argument,
// so we force it with a cast.
char *src_ptr = const_cast<char*>(src.data());
char *dst_ptr = &dst[0];

while (true) {
Expand Down Expand Up @@ -691,9 +693,143 @@ std::string kaitai::kstream::bytes_to_str(std::string src, std::string src_enc)
return dst;
}
#elif defined(KS_STR_ENCODING_NONE)
std::string kaitai::kstream::bytes_to_str(std::string src, std::string src_enc) {
std::string kaitai::kstream::bytes_to_str(const std::string src, const char *src_enc) {
return src;
}
#elif defined(KS_STR_ENCODING_WIN32API)
#include <windows.h>
#include <limits>

// Unbreak std::numeric_limits<T>::max, as otherwise MSVC substitutes "useful" max() macro.
#undef max
GreyCat marked this conversation as resolved.
Show resolved Hide resolved

int kaitai::kstream::encoding_to_win_codepage(const char *src_enc) {
std::string enc(src_enc);
if (enc == "UTF-8") {
return CP_UTF8;
} else if (enc == "UTF-16LE") {
return KAITAI_CP_UTF16LE;
} else if (enc == "UTF-16BE") {
return KAITAI_CP_UTF16BE;
} else if (enc == "IBM437") {
return 437;
} else if (enc == "IBM850") {
return 850;
} else if (enc == "SHIFT_JIS") {
return 932;
} else if (enc == "GB2312") {
return 936;
} else if (enc == "ASCII") {
return 20127;
} else if (enc == "EUC-JP") {
return 20932;
} else if (enc == "ISO-8859-1") {
return 28591;
} else if (enc == "ISO-8859-2") {
return 28592;
} else if (enc == "ISO-8859-3") {
return 28593;
} else if (enc == "ISO-8859-4") {
return 28594;
} else if (enc == "ISO-8859-5") {
return 28595;
} else if (enc == "ISO-8859-6") {
return 28596;
} else if (enc == "ISO-8859-7") {
return 28597;
} else if (enc == "ISO-8859-8") {
return 28598;
} else if (enc == "ISO-8859-9") {
return 28599;
} else if (enc == "ISO-8859-10") {
return 28600;
} else if (enc == "ISO-8859-11") {
return 28601;
} else if (enc == "ISO-8859-13") {
return 28603;
} else if (enc == "ISO-8859-14") {
return 28604;
} else if (enc == "ISO-8859-15") {
return 28605;
} else if (enc == "ISO-8859-16") {
return 28606;
}

return KAITAI_CP_UNSUPPORTED;
}

std::string kaitai::kstream::bytes_to_str(const std::string src, const char *src_enc) {
// Step 1: convert encoding name to codepage number
int codepage = encoding_to_win_codepage(src_enc);
return bytes_to_str(src, codepage);
}

std::string kaitai::kstream::bytes_to_str(const std::string src, int codepage) {
if (codepage == KAITAI_CP_UNSUPPORTED) {
throw std::runtime_error("bytes_to_str: unsupported encoding name");
}

// Shortcut: if we're already in UTF-8, no need to convert anything
if (codepage == CP_UTF8) {
return src;
}

// Step 2: convert bytes to UTF-16 ("wide char") string
std::wstring utf16;
int32_t utf16_len;
int32_t src_len;
if (src.length() > std::numeric_limits<int32_t>::max()) {
throw std::runtime_error("bytes_to_str: buffers longer than int32_t are unsupported");
} else {
src_len = static_cast<int32_t>(src.length());
}

switch (codepage) {
case KAITAI_CP_UTF16LE:
// If our source is already UTF-16LE, just copy it
utf16_len = src_len / 2;
utf16 = std::wstring((wchar_t*)src.c_str(), utf16_len);
break;
case KAITAI_CP_UTF16BE:
// If our source is in UTF-16BE, convert it to UTF-16LE by swapping bytes
utf16_len = src_len / 2;

utf16 = std::wstring(utf16_len, L'\0');
for (int32_t i = 0; i < utf16_len; i++) {
utf16[i] = (static_cast<uint8_t>(src[i * 2]) << 8) | static_cast<uint8_t>(src[i * 2 + 1]);
}
break;
default:
// Calculate the length of the UTF-16 string
utf16_len = MultiByteToWideChar(codepage, 0, src.c_str(), src_len, 0, 0);
if (utf16_len == 0) {
throw std::runtime_error("bytes_to_str: MultiByteToWideChar length calculation error");
}

// Convert to UTF-16 string
utf16 = std::wstring(utf16_len, L'\0');
if (MultiByteToWideChar(codepage, 0, src.c_str(), src_len, &utf16[0], utf16_len) == 0) {
throw std::runtime_error("bytes_to_str: MultiByteToWideChar conversion error");
}
}

// Step 3: convert UTF-16 string to UTF-8 string

// Calculate the length of the UTF-8 string
int utf8_len = WideCharToMultiByte(CP_UTF8, 0, &utf16[0], utf16_len, 0, 0, 0, 0);
if (utf8_len == 0) {
throw std::runtime_error("bytes_to_str: WideCharToMultiByte length calculation error");
}

// Convert to UTF-8 string
std::string utf8(utf8_len, '\0');
if (WideCharToMultiByte(CP_UTF8, 0, &utf16[0], utf16_len, &utf8[0], utf8_len, 0, 0) == 0) {
throw std::runtime_error("bytes_to_str: WideCharToMultiByte conversion error");
}

return utf8;
}

#else
#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_NONE
#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_WIN32API, KS_STR_ENCODING_NONE
#endif
28 changes: 27 additions & 1 deletion kaitai/kaitaistream.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ class kstream {

static std::string bytes_strip_right(std::string src, char pad_byte);
static std::string bytes_terminate(std::string src, char term, bool include);
static std::string bytes_to_str(std::string src, std::string src_enc);
static std::string bytes_to_str(const std::string src, const char *src_enc);
Copy link
Member

@generalmimon generalmimon Jul 14, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@GreyCat As evidenced by our CI dashboard (https://ci.kaitai.io/), this change breaks half of our tests for target cpp_stl_11/msvc141_windows_x64 (see kaitai-io/ci_artifacts@cf98b22#diff-ddeb848e50b284745134d0b3371b48f0cf63c52ed1d2f1134ff625c1217b57a3). The ci.json diff starts as (I simplified it a bit and only showed the first 2 tests that regressed, out of a total of 62):

diff --git 1/ci_json-cpp_stl_11-msvc141_windows_x64-f8948e3f-good.json 2/ci_json-cpp_stl_11-msvc141_windows_x64-cf98b222-bad.json
index d9b49a7..fe807ae 100644
--- 1/ci_json-cpp_stl_11-msvc141_windows_x64-f8948e3f-good.json
+++ 2/ci_json-cpp_stl_11-msvc141_windows_x64-cf98b222-bad.json
@@ -1,13 +1,13 @@
 {
   "$meta": {
     "lang": "cpp_stl",
-    "timestamp": "2023-07-08T22:54:28Z",
+    "timestamp": "2023-07-14T19:03:17Z",
     "ci": {
       "ci": "appveyor",
-      "build_id": "47500100",
-      "job_id": "a5x62yaygwv8fuxc",
+      "build_id": "47552134",
+      "job_id": "1rj2gtnxn51dxib8",
       "job_number": "3",
-      "url": "https://ci.appveyor.com/project/kaitai-io/ci-targets/builds/47500100/job/a5x62yaygwv8fuxc"
+      "url": "https://ci.appveyor.com/project/kaitai-io/ci-targets/builds/47552134/job/1rj2gtnxn51dxib8"
     }
   },
   "BcdUserTypeBe": "passed",
@@ -29,13 +29,13 @@
   "BitsUnalignedB64Le": "passed",
   "BufferedStruct": "passed",
   "BytesPadTerm": "passed",
-  "CastNested": "passed",
+  "CastNested": "format_build_failed",
   "CastToImported": "passed",
   "CastToTop": "passed",
   "CombineBool": "passed",
   "CombineBytes": "passed",
   "CombineEnum": "passed",
-  "CombineStr": "passed",
+  "CombineStr": "format_build_failed",
   "Debug0": "passed",
   "DebugArrayUser": "passed",
   "DebugEnumName": "passed",
@@ ... @@

The error is the following (https://ci.appveyor.com/project/kaitai-io/ci-targets/builds/47552134/job/1rj2gtnxn51dxib8?fullLog=true#L492):

  cast_nested.cpp
C:\projects\ci-targets\tests\compiled\cpp_stl_11\cast_nested.cpp(90):
  error C2664: 'std::string kaitai::kstream::bytes_to_str(const std::string,const char *)':
    cannot convert argument 2 from 'std::string' to 'const char *'
    [C:\projects\ci-targets\tests\compiled\cpp_stl_11\bin\ks_tests.vcxproj]
  C:\projects\ci-targets\tests\compiled\cpp_stl_11\cast_nested.cpp(90):
    note: No user-defined-conversion operator available that can perform this conversion, or the operator cannot be called

Line compiled/cpp_stl_11/cast_nested.cpp:90 is:

    m_value = kaitai::kstream::bytes_to_str(m__io->read_bytes_term(0, false, true, true), std::string("ASCII"));

But this apparently isn't a MSVC-specific issue. I've just checked cpp_stl_11/gcc4.8 and the impact looks very much the same (see kaitai-io/ci_artifacts@4ab4ae1#diff-ddeb848e50b284745134d0b3371b48f0cf63c52ed1d2f1134ff625c1217b57a3), again the same 62 regressions in total:

diff --git 1/ci_json-cpp_stl_11-gcc4.8-00689cd1-good.json 2/ci_json-cpp_stl_11-gcc4.8-4ab4ae15-bad.json
index fd80bd6..942318f 100644
--- 1/ci_json-cpp_stl_11-gcc4.8-00689cd1-good.json
+++ 2/ci_json-cpp_stl_11-gcc4.8-4ab4ae15-bad.json
@@ -1,13 +1,13 @@
 {
   "$meta": {
     "lang": "cpp_stl",
-    "timestamp": "2023-07-08T22:37:56Z",
+    "timestamp": "2023-07-14T18:46:24Z",
     "ci": {
       "ci": "github",
-      "run_id": "5496725152",
-      "run_number": "254",
-      "job_id": "14883558372",
-      "url": "https://github.com/kaitai-io/ci_targets/actions/runs/5496725152/jobs/10016910686"
+      "run_id": "5557284492",
+      "run_number": "255",
+      "job_id": "15054460788",
+      "url": "https://github.com/kaitai-io/ci_targets/actions/runs/5557284492/jobs/10150864944"
     }
   },
   "BcdUserTypeBe": "passed",
@@ -29,13 +29,13 @@
   "BitsUnalignedB64Le": "passed",
   "BufferedStruct": "passed",
   "BytesPadTerm": "passed",
-  "CastNested": "passed",
+  "CastNested": "format_build_failed",
   "CastToImported": "passed",
   "CastToTop": "passed",
   "CombineBool": "passed",
   "CombineBytes": "passed",
   "CombineEnum": "passed",
-  "CombineStr": "passed",
+  "CombineStr": "format_build_failed",
   "Debug0": "passed",
   "DebugArrayUser": "passed",
   "DebugEnumName": "passed",
@@ ... @@

The cpp_stl_11/gcc4.8 target isn't visible at https://ci.kaitai.io/ though, because the CI dashboard is configured to display cpp_stl_11/gcc4.8_linux, see src/App.vue:63:

  ["cpp_stl_11", "gcc4.8_linux"],

I assume pushing to cpp_stl_11/gcc4.8 was an oversight? I guess keeping the _linux suffix makes sense - we want to test C++ on different platforms, so it's a good idea to specify the platform in all C++ target names.


//@}

Expand Down Expand Up @@ -319,6 +319,32 @@ class kstream {

static void unsigned_to_decimal(uint64_t number, char *buffer);

#ifdef KS_STR_ENCODING_WIN32API
enum {
KAITAI_CP_UNSUPPORTED = -1,
KAITAI_CP_UTF16LE = -2,
KAITAI_CP_UTF16BE = -3,
};
GreyCat marked this conversation as resolved.
Show resolved Hide resolved

/**
* Converts string name of the encoding into a Windows codepage number. We extend standard Windows codepage list
* with a few special meanings (see KAITAI_CP_* enum), reserving negative values of integer for that.
* @param src_enc string name of the encoding; this should match canonical name of the encoding as per discussion
* in https://github.com/kaitai-io/kaitai_struct/issues/116
* @return Windows codepage number or member of KAITAI_CP_* enum.
* @ref https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
*/
static int encoding_to_win_codepage(const char *src_enc);

/**
* Converts bytes packed in std::string into a UTF-8 string, based on given source encoding indicated by `codepage`.
* @param src bytes to be converted
* @param codepage Windows codepage number or member of KAITAI_CP_* enum.
* @return UTF-8 string
*/
static std::string bytes_to_str(const std::string src, int codepage);
#endif

static const int ZLIB_BUF_SIZE = 128 * 1024;
};

Expand Down
66 changes: 57 additions & 9 deletions tests/unittest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,29 +67,77 @@ TEST(KaitaiStreamTest, bytes_to_str_ascii)
}

#ifndef KS_STR_ENCODING_NONE
TEST(KaitaiStreamTest, bytes_to_str_iso8859_1)
TEST(KaitaiStreamTest, bytes_to_str_iso_8859_1)
{
std::string res = kaitai::kstream::bytes_to_str("\xC4\xD6\xDC\xE4\xF6\xFC\xDF", "ISO8859-1");
EXPECT_EQ(res, "\xC3\x84\xC3\x96\xC3\x9C\xC3\xA4\xC3\xB6\xC3\xBC\xC3\x9F");
std::string res = kaitai::kstream::bytes_to_str("\xC4\xD6\xDC\xE4\xF6\xFC\xDF\xA6", "ISO-8859-1");
EXPECT_EQ(res,
"\xC3\x84" // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
"\xC3\x96" // U+00D6 LATIN CAPITAL LETTER O WITH DIAERESIS
"\xC3\x9C" // U+00DC LATIN CAPITAL LETTER U WITH DIAERESIS
"\xC3\xA4" // U+00E4 LATIN SMALL LETTER A WITH DIAERESIS
"\xC3\xB6" // U+00F6 LATIN SMALL LETTER O WITH DIAERESIS
"\xC3\xBC" // U+00FC LATIN SMALL LETTER U WITH DIAERESIS
"\xC3\x9F" // U+00DF LATIN SMALL LETTER SHARP S
"\xC2\xA6" // U+00A6 BROKEN BAR
GreyCat marked this conversation as resolved.
Show resolved Hide resolved
);
}

TEST(KaitaiStreamTest, bytes_to_str_iso_8859_15)
{
std::string res = kaitai::kstream::bytes_to_str("\xC4\xD6\xDC\xE4\xF6\xFC\xDF\xA6", "ISO-8859-15");
EXPECT_EQ(res,
"\xC3\x84" // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
"\xC3\x96" // U+00D6 LATIN CAPITAL LETTER O WITH DIAERESIS
"\xC3\x9C" // U+00DC LATIN CAPITAL LETTER U WITH DIAERESIS
"\xC3\xA4" // U+00E4 LATIN SMALL LETTER A WITH DIAERESIS
"\xC3\xB6" // U+00F6 LATIN SMALL LETTER O WITH DIAERESIS
"\xC3\xBC" // U+00FC LATIN SMALL LETTER U WITH DIAERESIS
"\xC3\x9F" // U+00DF LATIN SMALL LETTER SHARP S
"\xC5\xA0" // U+0160 LATIN CAPITAL LETTER S WITH CARON
);
}

TEST(KaitaiStreamTest, bytes_to_str_gb2312)
{
std::string res = kaitai::kstream::bytes_to_str("\xC4\xE3\xBA\xC3\xCA\xC0\xBD\xE7", "GB2312");
EXPECT_EQ(res, "\xE4\xBD\xA0\xE5\xA5\xBD\xE4\xB8\x96\xE7\x95\x8C");
EXPECT_EQ(res,
"\xE4\xBD\xA0" // U+4F60 CJK UNIFIED IDEOGRAPH-4F60
"\xE5\xA5\xBD" // U+597D CJK UNIFIED IDEOGRAPH-597D
"\xE4\xB8\x96" // U+4E16 CJK UNIFIED IDEOGRAPH-4E16
"\xE7\x95\x8C" // U+754C CJK UNIFIED IDEOGRAPH-754C
);
}

TEST(KaitaiStreamTest, bytes_to_str_cp437)
TEST(KaitaiStreamTest, bytes_to_str_ibm437)
{
std::string res = kaitai::kstream::bytes_to_str("\xCC\xB2\x40", "cp437");
EXPECT_EQ(res, "\xE2\x95\xA0\xE2\x96\x93\x40");
std::string res = kaitai::kstream::bytes_to_str("\xCC\xB2\x40", "IBM437");
EXPECT_EQ(res,
"\xE2\x95\xA0" // U+2560 BOX DRAWINGS DOUBLE VERTICAL AND RIGHT
"\xE2\x96\x93" // U+2593 DARK SHADE
"\x40" // U+0040 COMMERCIAL AT
);
}

TEST(KaitaiStreamTest, bytes_to_str_utf16_le)
TEST(KaitaiStreamTest, bytes_to_str_utf16le)
{
// NB: UTF16 bytes representation will have binary zeroes in the middle, so we need to convert it to std::string with explicit length
std::string res = kaitai::kstream::bytes_to_str(std::string("\x41\x00\x42\x00\x91\x25\x70\x24", 8), "UTF-16LE");
EXPECT_EQ(res, "AB\xE2\x96\x91\xE2\x91\xB0");
EXPECT_EQ(res,
"AB"
"\xE2\x96\x91" // U+2591 LIGHT SHADE
"\xE2\x91\xB0" // U+2470 CIRCLED NUMBER SEVENTEEN
);
}

TEST(KaitaiStreamTest, bytes_to_str_utf16be)
{
// NB: UTF16 bytes representation will have binary zeroes in the middle, so we need to convert it to std::string with explicit length
std::string res = kaitai::kstream::bytes_to_str(std::string("\x00\x41\x00\x42\x25\x91\x24\x70", 8), "UTF-16BE");
EXPECT_EQ(res,
"AB"
"\xE2\x96\x91" // U+2591 LIGHT SHADE
"\xE2\x91\xB0" // U+2470 CIRCLED NUMBER SEVENTEEN
);
}
#endif

Expand Down