Skip to content

Commit

Permalink
Add Win32API-based encoding option to C++ runtime
Browse files Browse the repository at this point in the history
  • Loading branch information
GreyCat committed Jul 12, 2023
1 parent cd8e2d0 commit 058cfdc
Show file tree
Hide file tree
Showing 6 changed files with 218 additions and 19 deletions.
6 changes: 4 additions & 2 deletions .build/build.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ Requires:
[CmdletBinding()]
param (
[Parameter(Mandatory=$true)]
[string] $GTestPath
[string] $GTestPath,
[Parameter(Mandatory=$false)]
[string] $EncodingType = "WIN32API"
)

# Standard boilerplate
Expand All @@ -26,7 +28,7 @@ Push-Location $repoRoot
$null = New-Item build -ItemType Directory -Force
cd build

cmake -DCMAKE_PREFIX_PATH="$GTestPath" -DSTRING_ENCODING_TYPE=NONE ..
cmake -DCMAKE_PREFIX_PATH="$GTestPath" -DSTRING_ENCODING_TYPE="$EncodingType" ..
cmake --build . --config Debug
cp $GTestPath\debug\bin\*.dll tests\Debug
cp Debug\kaitai_struct_cpp_stl_runtime.dll tests\Debug
Expand Down
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
project (kaitai_struct_cpp_stl_runtime CXX)
cmake_minimum_required (VERSION 3.3)
project (kaitai_struct_cpp_stl_runtime CXX)
enable_testing()

set (CMAKE_INCLUDE_CURRENT_DIR ON)
Expand All @@ -26,7 +26,7 @@ set (SOURCES
kaitai/kaitaistream.cpp
)

set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|NONE|...)")
set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|WIN32API|NONE|...)")

set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)

Expand Down
2 changes: 2 additions & 0 deletions Common.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
if (STRING_ENCODING_TYPE STREQUAL "ICONV")
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_ICONV)
elseif (STRING_ENCODING_TYPE STREQUAL "WIN32API")
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_WIN32API)
elseif (STRING_ENCODING_TYPE STREQUAL "NONE")
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_NONE)
else()
Expand Down
147 changes: 142 additions & 5 deletions kaitai/kaitaistream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -635,9 +635,10 @@ uint8_t kaitai::kstream::byte_array_max(const std::string val) {
#include <iconv.h>
#include <cerrno>
#include <stdexcept>
#include "kaitaistream.h"

std::string kaitai::kstream::bytes_to_str(std::string src, std::string src_enc) {
iconv_t cd = iconv_open(KS_STR_DEFAULT_ENCODING, src_enc.c_str());
std::string kaitai::kstream::bytes_to_str(const std::string src, const char* src_enc) {
iconv_t cd = iconv_open(KS_STR_DEFAULT_ENCODING, src_enc);

if (cd == (iconv_t)-1) {
if (errno == EINVAL) {
Expand All @@ -655,7 +656,9 @@ std::string kaitai::kstream::bytes_to_str(std::string src, std::string src_enc)
std::string dst(dst_len, ' ');
size_t dst_left = dst_len;

char *src_ptr = &src[0];
// NB: this should be const char*, but for some reason iconv() requires non-const in its 2nd argument,
// so we force it with a cast.
char *src_ptr = const_cast<char*>(src.data());
char *dst_ptr = &dst[0];

while (true) {
Expand Down Expand Up @@ -691,9 +694,143 @@ std::string kaitai::kstream::bytes_to_str(std::string src, std::string src_enc)
return dst;
}
#elif defined(KS_STR_ENCODING_NONE)
std::string kaitai::kstream::bytes_to_str(std::string src, std::string src_enc) {
std::string kaitai::kstream::bytes_to_str(const std::string src, const char* src_enc) {
return src;
}
#elif defined(KS_STR_ENCODING_WIN32API)
#include <windows.h>
#include <limits>

// Unbreak std::numeric_limits<T>::max, as otherwise MSVC substitutes "useful" max() macro.
#undef max

int kaitai::kstream::encoding_to_win_codepage(const char* src_enc) {
std::string enc(src_enc);
if (enc == "UTF-8") {
return CP_UTF8;
} else if (enc == "UTF-16LE") {
return KAITAI_CP_UTF16LE;
} else if (enc == "UTF-16BE") {
return KAITAI_CP_UTF16BE;
} else if (enc == "IBM437") {
return 437;
} else if (enc == "IBM850") {
return 850;
} else if (enc == "SHIFT_JIS") {
return 932;
} else if (enc == "GB2312") {
return 936;
} else if (enc == "ASCII") {
return 20127;
} else if (enc == "EUC-JP") {
return 20932;
} else if (enc == "ISO-8859-1") {
return 28591;
} else if (enc == "ISO-8859-2") {
return 28592;
} else if (enc == "ISO-8859-3") {
return 28593;
} else if (enc == "ISO-8859-4") {
return 28594;
} else if (enc == "ISO-8859-5") {
return 28595;
} else if (enc == "ISO-8859-6") {
return 28596;
} else if (enc == "ISO-8859-7") {
return 28597;
} else if (enc == "ISO-8859-8") {
return 28598;
} else if (enc == "ISO-8859-9") {
return 28599;
} else if (enc == "ISO-8859-10") {
return 28600;
} else if (enc == "ISO-8859-11") {
return 28601;
} else if (enc == "ISO-8859-13") {
return 28603;
} else if (enc == "ISO-8859-14") {
return 28604;
} else if (enc == "ISO-8859-15") {
return 28605;
} else if (enc == "ISO-8859-16") {
return 28606;
}

return KAITAI_CP_UNSUPPORTED;
}

std::string kaitai::kstream::bytes_to_str(const std::string src, const char* src_enc) {
// Step 1: convert encoding name to codepage number
int codePage = encoding_to_win_codepage(src_enc);
return bytes_to_str(src, codePage);
}

std::string kaitai::kstream::bytes_to_str(const std::string src, int codepage) {
if (codepage == KAITAI_CP_UNSUPPORTED) {
throw std::runtime_error("bytes_to_str: unsupported encoding name");
}

// Shortcut: if we're already in UTF-8, no need to convert anything
if (codepage == CP_UTF8) {
return src;
}

// Step 2: convert bytes to UTF16 ("wide char") string
std::wstring utf16;
int32_t utf16_len;
int32_t src_len;
if (src.length() > std::numeric_limits<int32_t>::max()) {
throw std::runtime_error("bytes_to_str: buffers longer than int32_t are unsupported");
} else {
src_len = static_cast<int32_t>(src.length());
}

switch (codepage) {
case KAITAI_CP_UTF16LE:
// If our source is already UTF16LE, just copy it
utf16_len = src_len / 2;
utf16 = std::wstring((wchar_t*)src.c_str(), utf16_len);
break;
case KAITAI_CP_UTF16BE:
// If our source is in UTF16BE, convert it to UTF16LE by swapping bytes
utf16_len = src_len / 2;

utf16 = std::wstring(utf16_len, L'\0');
for (int i = 0; i < utf16_len; i++) {
utf16[i] = (static_cast<uint8_t>(src[i * 2]) << 8) | static_cast<uint8_t>(src[i * 2 + 1]);
}
break;
default:
// Calculate the length of the UTF16 string
utf16_len = MultiByteToWideChar(codepage, 0, src.c_str(), src_len, 0, 0);
if (utf16_len == 0) {
throw std::runtime_error("bytes_to_str: MultiByteToWideChar length calculation error");
}

// Convert to UTF16 string
utf16 = std::wstring(utf16_len, L'\0');
if (MultiByteToWideChar(codepage, 0, src.c_str(), src_len, &utf16[0], utf16_len) == 0) {
throw std::runtime_error("bytes_to_str: MultiByteToWideChar conversion error");
}
}

// Step 3: convert UTF16 string to UTF8 string

// Calculate the length of the UTF8 string
int utf8_len = WideCharToMultiByte(CP_UTF8, 0, &utf16[0], utf16_len, 0, 0, 0, 0);
if (utf8_len == 0) {
throw std::runtime_error("bytes_to_str: WideCharToMultiByte length calculation error");
}

// Convert to UTF8 string
std::string utf8(utf8_len, '\0');
if (WideCharToMultiByte(CP_UTF8, 0, &utf16[0], utf16_len, &utf8[0], utf8_len, 0, 0) == 0) {
throw std::runtime_error("bytes_to_str: WideCharToMultiByte conversion error");
}

return utf8;
}

#else
#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_NONE
#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_WIN32API, KS_STR_ENCODING_NONE
#endif
28 changes: 27 additions & 1 deletion kaitai/kaitaistream.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ class kstream {

static std::string bytes_strip_right(std::string src, char pad_byte);
static std::string bytes_terminate(std::string src, char term, bool include);
static std::string bytes_to_str(std::string src, std::string src_enc);
static std::string bytes_to_str(const std::string src, const char* src_enc);

//@}

Expand Down Expand Up @@ -319,6 +319,32 @@ class kstream {

static void unsigned_to_decimal(uint64_t number, char *buffer);

#ifdef KS_STR_ENCODING_WIN32API
enum {
KAITAI_CP_UNSUPPORTED = -1,
KAITAI_CP_UTF16LE = -2,
KAITAI_CP_UTF16BE = -3,
};

/**
* Converts string name of the encoding into a Windows codepage number. We extend standard Windows codepage list
* with a few special meanings (see KAITAI_CP_* enum), reserving negative values of integer for that.
* @param src_enc string name of the encoding; this should match canonical name of the encoding as per discussion
* in https://github.com/kaitai-io/kaitai_struct/issues/116
* @return Windows codepage number or member of KAITAI_CP_* enum.
* @ref https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
*/
static int encoding_to_win_codepage(const char* src_enc);

/**
* Converts bytes packed in std::string into a UTF-8 string, based on given source encoding indicated by `codepage`.
* @param src bytes to be converted
* @param codepage Windows codepage number or member of KAITAI_CP_* enum.
* @return UTF-8 string
*/
static std::string bytes_to_str(const std::string src, int codepage);
#endif

static const int ZLIB_BUF_SIZE = 128 * 1024;
};

Expand Down
50 changes: 41 additions & 9 deletions tests/unittest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,29 +67,61 @@ TEST(KaitaiStreamTest, bytes_to_str_ascii)
}

#ifndef KS_STR_ENCODING_NONE
TEST(KaitaiStreamTest, bytes_to_str_iso8859_1)
TEST(KaitaiStreamTest, bytes_to_str_iso_8859_1)
{
std::string res = kaitai::kstream::bytes_to_str("\xC4\xD6\xDC\xE4\xF6\xFC\xDF", "ISO8859-1");
EXPECT_EQ(res, "\xC3\x84\xC3\x96\xC3\x9C\xC3\xA4\xC3\xB6\xC3\xBC\xC3\x9F");
std::string res = kaitai::kstream::bytes_to_str("\xC4\xD6\xDC\xE4\xF6\xFC\xDF", "ISO-8859-1");
EXPECT_EQ(res,
"\xC3\x84" // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
"\xC3\x96" // U+00D6 LATIN CAPITAL LETTER O WITH DIAERESIS
"\xC3\x9C" // U+00DC LATIN CAPITAL LETTER U WITH DIAERESIS
"\xC3\xA4" // U+00E4 LATIN SMALL LETTER A WITH DIAERESIS
"\xC3\xB6" // U+00F6 LATIN SMALL LETTER O WITH DIAERESIS
"\xC3\xBC" // U+00FC LATIN SMALL LETTER U WITH DIAERESIS
"\xC3\x9F" // U+00DF LATIN SMALL LETTER SHARP S
);
}

TEST(KaitaiStreamTest, bytes_to_str_gb2312)
{
std::string res = kaitai::kstream::bytes_to_str("\xC4\xE3\xBA\xC3\xCA\xC0\xBD\xE7", "GB2312");
EXPECT_EQ(res, "\xE4\xBD\xA0\xE5\xA5\xBD\xE4\xB8\x96\xE7\x95\x8C");
EXPECT_EQ(res,
"\xE4\xBD\xA0" // U+4F60 CJK UNIFIED IDEOGRAPH-4F60
"\xE5\xA5\xBD" // U+597D CJK UNIFIED IDEOGRAPH-597D
"\xE4\xB8\x96" // U+4E16 CJK UNIFIED IDEOGRAPH-4E16
"\xE7\x95\x8C" // U+754C CJK UNIFIED IDEOGRAPH-754C
);
}

TEST(KaitaiStreamTest, bytes_to_str_cp437)
TEST(KaitaiStreamTest, bytes_to_str_ibm437)
{
std::string res = kaitai::kstream::bytes_to_str("\xCC\xB2\x40", "cp437");
EXPECT_EQ(res, "\xE2\x95\xA0\xE2\x96\x93\x40");
std::string res = kaitai::kstream::bytes_to_str("\xCC\xB2\x40", "IBM437");
EXPECT_EQ(res,
"\xE2\x95\xA0" // U+2560 BOX DRAWINGS DOUBLE VERTICAL AND RIGHT
"\xE2\x96\x93" // U+2593 DARK SHADE
"\x40" // U+0040 COMMERCIAL AT
);
}

TEST(KaitaiStreamTest, bytes_to_str_utf16_le)
TEST(KaitaiStreamTest, bytes_to_str_utf16le)
{
// NB: UTF16 bytes representation will have binary zeroes in the middle, so we need to convert it to std::string with explicit length
std::string res = kaitai::kstream::bytes_to_str(std::string("\x41\x00\x42\x00\x91\x25\x70\x24", 8), "UTF-16LE");
EXPECT_EQ(res, "AB\xE2\x96\x91\xE2\x91\xB0");
EXPECT_EQ(res,
"AB"
"\xE2\x96\x91" // U+2591 LIGHT SHADE
"\xE2\x91\xB0" // U+2470 CIRCLED NUMBER SEVENTEEN
);
}

TEST(KaitaiStreamTest, bytes_to_str_utf16be)
{
// NB: UTF16 bytes representation will have binary zeroes in the middle, so we need to convert it to std::string with explicit length
std::string res = kaitai::kstream::bytes_to_str(std::string("\x00\x41\x00\x42\x25\x91\x24\x70", 8), "UTF-16BE");
EXPECT_EQ(res,
"AB"
"\xE2\x96\x91" // U+2591 LIGHT SHADE
"\xE2\x91\xB0" // U+2470 CIRCLED NUMBER SEVENTEEN
);
}
#endif

Expand Down

0 comments on commit 058cfdc

Please sign in to comment.