Skip to content

Commit

Permalink
Add 4 more testcases, showing that the same bug is present in utf-16 as well. Also fix the bug (really, now!)
Browse files Browse the repository at this point in the history
  • Loading branch information
jan.nijtmans committed Feb 9, 2023
1 parent d8e90de commit 14dd24c
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 8 deletions.
44 changes: 36 additions & 8 deletions generic/tclEncoding.c
Original file line number Diff line number Diff line change
Expand Up @@ -2531,7 +2531,7 @@ Utf32ToUtfProc(
const char *srcStart, *srcEnd;
const char *dstEnd, *dstStart;
int result, numChars, charLimit = INT_MAX;
int ch;
int ch = 0;

flags |= PTR2INT(clientData);
if (flags & TCL_ENCODING_CHAR_LIMIT) {
Expand All @@ -2548,6 +2548,19 @@ Utf32ToUtfProc(
srcLen &= -4;
}

/*
* If last code point is a high surrogate, we cannot handle that yet,
* unless we are at the end.
*/

if (!(flags & TCL_ENCODING_END) && (srcLen >= 4) &&
((src[srcLen - ((flags & TCL_ENCODING_LE)?3:2)] & 0xFC) == 0xD8) &&
((src[srcLen - ((flags & TCL_ENCODING_LE)?2:3)]) == 0) &&
((src[srcLen - ((flags & TCL_ENCODING_LE)?1:4)]) == 0)) {
result = TCL_CONVERT_MULTIBYTE;
srcLen-= 4;
}

srcStart = src;
srcEnd = src + srcLen;

Expand All @@ -2560,11 +2573,16 @@ Utf32ToUtfProc(
break;
}

int prev = ch;
if (flags & TCL_ENCODING_LE) {
ch = (src[3] & 0xFF) << 24 | (src[2] & 0xFF) << 16 | (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
} else {
ch = (src[0] & 0xFF) << 24 | (src[1] & 0xFF) << 16 | (src[2] & 0xFF) << 8 | (src[3] & 0xFF);
}
if (((prev & ~0x3FF) == 0xD800) && ((ch & ~0x3FF) != 0xDC00)) {
/* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */
dst += Tcl_UniCharToUtf(-1, dst);
}
if ((unsigned)ch > 0x10FFFF || (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)
&& ((ch & ~0x7FF) == 0xD800))) {
if (STOPONERROR) {
Expand All @@ -2582,14 +2600,14 @@ Utf32ToUtfProc(
*dst++ = (ch & 0xFF);
} else {
dst += Tcl_UniCharToUtf(ch, dst);
if ((ch & ~0x3FF) == 0xD800) {
/* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
dst += Tcl_UniCharToUtf(-1, dst);
}
}
src += sizeof(unsigned int);
}

if ((ch & ~0x3FF) == 0xD800) {
/* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
dst += Tcl_UniCharToUtf(-1, dst);
}
*srcReadPtr = src - srcStart;
*dstWrotePtr = dst - dstStart;
*dstCharsPtr = numChars;
Expand Down Expand Up @@ -2734,7 +2752,7 @@ Utf16ToUtfProc(
const char *srcStart, *srcEnd;
const char *dstEnd, *dstStart;
int result, numChars, charLimit = INT_MAX;
unsigned short ch;
unsigned short ch = 0;

flags |= PTR2INT(clientData);
if (flags & TCL_ENCODING_CHAR_LIMIT) {
Expand All @@ -2752,10 +2770,11 @@ Utf16ToUtfProc(
}

/*
* If last code point is a high surrogate, we cannot handle that yet.
* If last code point is a high surrogate, we cannot handle that yet,
* unless we are at the end.
*/

if ((srcLen >= 2) &&
if (!(flags & TCL_ENCODING_END) && (srcLen >= 2) &&
((src[srcLen - ((flags & TCL_ENCODING_LE)?1:2)] & 0xFC) == 0xD8)) {
result = TCL_CONVERT_MULTIBYTE;
srcLen-= 2;
Expand All @@ -2773,11 +2792,16 @@ Utf16ToUtfProc(
break;
}

unsigned short prev = ch;
if (flags & TCL_ENCODING_LE) {
ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
} else {
ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF);
}
if (((prev & ~0x3FF) == 0xD800) && ((ch & ~0x3FF) != 0xDC00)) {
/* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */
dst += Tcl_UniCharToUtf(-1, dst);
}

/*
* Special case for 1-byte utf chars for speed. Make sure we work with
Expand All @@ -2792,6 +2816,10 @@ Utf16ToUtfProc(
src += sizeof(unsigned short);
}

if ((ch & ~0x3FF) == 0xD800) {
/* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
dst += Tcl_UniCharToUtf(-1, dst);
}
*srcReadPtr = src - srcStart;
*dstWrotePtr = dst - dstStart;
*dstCharsPtr = numChars;
Expand Down
12 changes: 12 additions & 0 deletions tests/encoding.test
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,18 @@ test encoding-16.11 {Utf32ToUtfProc} -body {
# UTF-32LE input holding U+DC00 followed by U+D800, i.e. a low surrogate
# before a high surrogate. The expected result keeps both code points in
# input order.
test encoding-16.12 {Utf32ToUtfProc} -body {
encoding convertfrom utf-32le \x00\xDC\x00\x00\x00\xD8\x00\x00
} -result \uDC00\uD800
# UTF-16LE input consisting of a single code unit 0xD800, a high surrogate
# with nothing after it. The expected result is that same code point.
test encoding-16.13 {Utf16ToUtfProc} -body {
encoding convertfrom utf-16le \x00\xD8
} -result \uD800
# UTF-16LE input consisting of a single code unit 0xDC00, a low surrogate
# with nothing before it. The expected result is that same code point.
test encoding-16.14 {Utf16ToUtfProc} -body {
encoding convertfrom utf-16le \x00\xDC
} -result \uDC00
# UTF-16LE input with code unit 0xD800 followed by 0xDC00, a high surrogate
# followed by a low surrogate. The expected result is written as the two
# escapes in the same order.
test encoding-16.15 {Utf16ToUtfProc} -body {
encoding convertfrom utf-16le \x00\xD8\x00\xDC
} -result \uD800\uDC00
# UTF-16LE input with code unit 0xDC00 followed by 0xD800, the surrogates in
# reversed order. The expected result keeps both code points in input order.
test encoding-16.16 {Utf16ToUtfProc} -body {
encoding convertfrom utf-16le \x00\xDC\x00\xD8
} -result \uDC00\uD800

test encoding-16.9 {
Utf16ToUtfProc, Tcl_UniCharToUtf, surrogate pairs in utf-16
Expand Down

0 comments on commit 14dd24c

Please sign in to comment.