Skip to content

Commit

Permalink
Implement UTF-16LE encoding, update tests, adjust codec interface
Browse files Browse the repository at this point in the history
Three major reasons for reimplementing UTF-16 instead of using the native codec:
 1. We want to remove StringDecoder & Buffer references due to #235.
 2. StringDecoder is inconsistent with handling surrogates on Node v6-9
 3. NPM module string_decoder gives strange results when processing chunks -
    it sometimes prepends '\u0000', likely due to a bug.

Performance was and is a major concern here. The decoder shouldn't be affected because it uses
backend methods directly. The encoder is affected because it introduces a character-level loop. It's
still very fast (~450Mb/s), so I'm not too worried. If needed, we can make it about 4x faster
in Node.js by introducing a dedicated backend method. Browser speeds will be the same.
  • Loading branch information
ashtuchkin committed Jul 16, 2020
1 parent e567849 commit 84ee650
Show file tree
Hide file tree
Showing 6 changed files with 360 additions and 87 deletions.
3 changes: 1 addition & 2 deletions encodings/internal.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@ module.exports = {
cesu8: { type: "_internal", bomAware: true},
unicode11utf8: "utf8",

ucs2: { type: "_internal", bomAware: true},
utf16le: "ucs2",
// NOTE: utf-16le/ucs2 are in utf16.js.

binary: { type: "_internal" },
base64: { type: "_internal" },
Expand Down
177 changes: 149 additions & 28 deletions encodings/utf16.js
Original file line number Diff line number Diff line change
@@ -1,17 +1,123 @@
"use strict";

// Note: UTF16-LE (or UCS2) codec is Node.js native. See encodings/internal.js
// == UTF16-LE codec. ==========================================================
// Note: We're not using Node.js native codec because StringDecoder implementation is buggy
// (adds \0 in some chunks; doesn't flag non-even number of bytes). We do use raw encoding/decoding
// routines for performance, though.

exports.utf16le = class Utf16LECodec {
createEncoder(options, iconv) {
return new Utf16LEEncoder(iconv.backend);
}
createDecoder(options, iconv) {
return new Utf16LEDecoder(iconv.backend, iconv.defaultCharUnicode);
}
get bomAware() { return true; }
}

// Typed-array views read and write multi-byte values in the host's byte order,
// so the Uint16Array fast path below only yields little-endian output on
// little-endian hosts. Probe once at load time.
const HOST_IS_LITTLE_ENDIAN = new Uint8Array(new Uint16Array([1]).buffer)[0] === 1;

/**
 * Streaming UTF-16LE encoder.
 *
 * Every UTF-16 code unit of the input string becomes two bytes, low byte
 * first. Code units are copied verbatim; surrogates are not validated.
 */
class Utf16LEEncoder {
    /**
     * @param {object} backend - Must provide `allocBytes(n)` and
     *     `bytesToResult(bytes, length)`.
     */
    constructor(backend) {
        this.backend = backend;
    }

    /**
     * Encode one chunk.
     *
     * @param {string} str - Text to encode.
     * @returns {*} Whatever `backend.bytesToResult` produces for the encoded bytes.
     */
    write(str) {
        const len = str.length;
        const bytes = this.backend.allocBytes(len * 2);

        if (HOST_IS_LITTLE_ENDIAN && (bytes.byteOffset & 1) === 0) {
            // Fast path: store whole code units through a 16-bit view.
            const chars = new Uint16Array(bytes.buffer, bytes.byteOffset, len);
            for (let i = 0; i < len; i++) {
                chars[i] = str.charCodeAt(i);
            }
        } else {
            // A Uint16Array cannot start at an odd byte offset (RangeError), and
            // on a big-endian host it would write the bytes in the wrong order.
            // Emit the two bytes explicitly instead.
            for (let i = 0, pos = 0; i < len; i++) {
                const code = str.charCodeAt(i);
                bytes[pos++] = code & 0xFF;
                bytes[pos++] = code >>> 8;
            }
        }
        return this.backend.bytesToResult(bytes, bytes.length);
    }

    /** Nothing is buffered between writes, so there is nothing to flush. */
    end() {}
}

/**
 * Streaming UTF-16LE decoder.
 *
 * Input may be split at arbitrary byte positions. Two pieces of state are
 * carried between write() calls:
 *  - `overflowByte`: the first byte of a code unit whose second byte has not
 *    arrived yet (-1 when there is none);
 *  - `prefixSurrogate`: a high surrogate that ended the previous chunk and is
 *    held back so it can be emitted together with whatever follows it.
 */
class Utf16LEDecoder {
    /**
     * @param {object} backend - Must provide `allocRawChars(n)` and
     *     `rawCharsToResult(chars, length)`.
     * @param {string} defaultChar - Emitted by end() when the input stopped
     *     in the middle of a code unit.
     */
    constructor(backend, defaultChar) {
        this.backend = backend;
        this.defaultChar = defaultChar;
        this.overflowByte = -1;
        this.prefixSurrogate = undefined;
    }

    /**
     * Decode one chunk of bytes.
     *
     * @param {Uint8Array} buf - Next chunk of UTF-16LE bytes.
     * @returns {string} Text decoded so far, minus any trailing high surrogate
     *     or odd byte, which are kept for the next call.
     */
    write(buf) {
        if (buf.length === 0) {
            return '';
        }
        let byteOffset = buf.byteOffset;
        let byteLen = buf.length;

        // Complete the code unit that was split by the previous chunk boundary.
        let prefix = '';
        if (this.overflowByte !== -1) {
            byteOffset++; byteLen--;
            prefix = String.fromCharCode(this.overflowByte + (buf[0] << 8));
        }

        // An odd number of remaining bytes means the last one starts a code
        // unit that finishes in the next chunk.
        if (byteLen & 1) {
            this.overflowByte = buf[buf.length-1];
            byteLen--;
        } else {
            this.overflowByte = -1;
        }

        let chars;
        // Parentheses are required: `byteOffset & 1 === 0` parses as
        // `byteOffset & (1 === 0)`, which is always 0 and would disable this branch.
        if ((byteOffset & 1) === 0) {
            // If byteOffset is aligned, just use the ArrayBuffer from input buf.
            chars = new Uint16Array(buf.buffer, byteOffset, byteLen >> 1);
        } else {
            // If byteOffset is NOT aligned, create a new aligned buffer and copy the data.
            chars = this.backend.allocRawChars(byteLen >> 1);
            const srcByteView = new Uint8Array(buf.buffer, byteOffset, byteLen);
            const destByteView = new Uint8Array(chars.buffer, chars.byteOffset, byteLen);
            destByteView.set(srcByteView);
        }

        let res = prefix + this.backend.rawCharsToResult(chars, chars.length);
        if (res) {
            // Add high surrogate from previous chunk.
            if (this.prefixSurrogate) {
                res = this.prefixSurrogate + res;
                this.prefixSurrogate = undefined;
            }

            // Slice off a new high surrogate at the end of the current chunk.
            const lastChar = res.charCodeAt(res.length-1);
            if (0xD800 <= lastChar && lastChar < 0xDC00) {
                this.prefixSurrogate = res[res.length-1];
                res = res.slice(0, -1);
            }
        }
        return res;
    }

    /**
     * Flush pending state at end of input.
     *
     * @returns {string|undefined} The held-back high surrogate (if any) followed
     *     by `defaultChar` when a dangling byte is pending; undefined when
     *     nothing was pending.
     */
    end() {
        if (this.prefixSurrogate || this.overflowByte !== -1) {
            const res = (this.prefixSurrogate ? this.prefixSurrogate : '') + (this.overflowByte !== -1 ? this.defaultChar : '');
            this.prefixSurrogate = undefined;
            this.overflowByte = -1;
            return res;
        }
    }
}
exports.ucs2 = "utf16le"; // Alias


// == UTF16-BE codec. ==========================================================

exports.utf16be = class Utf16BECodec {
get encoder() { return Utf16BEEncoder; }
get decoder() { return Utf16BEDecoder; }
createEncoder(options, iconv) {
return new Utf16BEEncoder(iconv.backend);
}
createDecoder(options, iconv) {
return new Utf16BEDecoder(iconv.backend, iconv.defaultCharUnicode);
}
get bomAware() { return true; }
}

class Utf16BEEncoder {
constructor(opts, codec, backend) {
constructor(backend) {
this.backend = backend;
}

Expand All @@ -30,30 +136,59 @@ class Utf16BEEncoder {
}

class Utf16BEDecoder {
constructor(opts, codec, backend) {
constructor(backend, defaultChar) {
this.backend = backend;
this.defaultChar = defaultChar;
this.overflowByte = -1;
this.prefixSurrogate = undefined;
}

write(buf) {
if (buf.length === 0) {
return '';
}

const chars = this.backend.allocRawChars((buf.length+1) >> 1);
let charsPos = 0, i = 0;

if (this.overflowByte !== -1 && i < buf.length) {
if (this.overflowByte !== -1) {
chars[charsPos++] = (this.overflowByte << 8) + buf[i++];
}

// NOTE: we can win another 10% perf by using chars[i >> 1].
// NOTE: the double-reverse method takes almost the same time.
for (; i < buf.length-1; i += 2) {
chars[charsPos++] = (buf[i] << 8) + buf[i+1];
}

this.overflowByte = (i == buf.length-1) ? buf[i] : -1;

return this.backend.rawCharsToResult(chars, charsPos);
let res = this.backend.rawCharsToResult(chars, charsPos);
if (res) {
// Add high surrogate from previous chunk.
if (this.prefixSurrogate) {
res = this.prefixSurrogate + res;
this.prefixSurrogate = undefined;
}

// Slice off a new high surrogate at the end of the current chunk.
const lastChar = res.charCodeAt(res.length-1);
if (0xD800 <= lastChar && lastChar < 0xDC00) {
this.prefixSurrogate = res[res.length-1];
res = res.slice(0, -1);
}
}
return res;

}

end() {
this.overflowByte = -1;
if (this.prefixSurrogate || this.overflowByte !== -1) {
const res = (this.prefixSurrogate ? this.prefixSurrogate : '') + (this.overflowByte !== -1 ? this.defaultChar : '');
this.prefixSurrogate = undefined;
this.overflowByte = -1;
return res;
}
}
}

Expand All @@ -67,39 +202,25 @@ class Utf16BEDecoder {
// Encoder uses UTF-16LE and prepends BOM (which can be overridden with addBOM: false).

exports.utf16 = class Utf16Codec {
constructor(opts, iconv) {
this.iconv = iconv;
}
get encoder() { return Utf16Encoder; }
get decoder() { return Utf16Decoder; }
}

class Utf16Encoder {
constructor(options, codec) {
createEncoder(options, iconv) {
options = options || {};
if (options.addBOM === undefined)
options.addBOM = true;
this.encoder = codec.iconv.getEncoder(options.use || 'utf-16le', options);
return iconv.getEncoder('utf-16le', options);
}

// Pass-through to this.encoder
write(str) {
return this.encoder.write(str);
}

end() {
return this.encoder.end();
createDecoder(options, iconv) {
return new Utf16Decoder(options, iconv);
}
}

class Utf16Decoder {
constructor(options, codec) {
constructor(options, iconv) {
this.decoder = null;
this.initialBufs = [];
this.initialBufsLen = 0;

this.options = options || {};
this.iconv = codec.iconv;
this.iconv = iconv;
}

write(buf) {
Expand Down
14 changes: 10 additions & 4 deletions lib/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,11 @@ iconv._canonicalizeEncoding = function(encoding) {
}

iconv.getEncoder = function getEncoder(encoding, options) {
var codec = iconv.getCodec(encoding),
encoder = new codec.encoder(options, codec, iconv.backend);
const codec = iconv.getCodec(encoding);

let encoder = codec.createEncoder
? codec.createEncoder(options, iconv)
: new codec.encoder(options, codec, iconv.backend);

if (codec.bomAware && options && options.addBOM)
encoder = new bomHandling.PrependBOM(encoder, options);
Expand All @@ -115,8 +118,11 @@ iconv.getEncoder = function getEncoder(encoding, options) {
}

iconv.getDecoder = function getDecoder(encoding, options) {
var codec = iconv.getCodec(encoding),
decoder = new codec.decoder(options, codec, iconv.backend);
const codec = iconv.getCodec(encoding);

let decoder = codec.createDecoder
? codec.createDecoder(options, iconv)
: new codec.decoder(options, codec, iconv.backend);

if (codec.bomAware && !(options && options.stripBOM === false))
decoder = new bomHandling.StripBOM(decoder, options);
Expand Down
16 changes: 3 additions & 13 deletions test/streams-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -213,17 +213,7 @@ describe("Streaming mode", function() {
encoding: "ucs2",
input: [[0x3D], [0xD8, 0x3B], [0xDE]], // U+1F63B, 😻, SMILING CAT FACE WITH HEART-SHAPED EYES
outputType: false, // Don't concat
checkOutput: function(res) {
if (semver.satisfies(process.version, '>= 6.2.1 < 10.0.0')) {
// After a string_decoder rewrite in https://github.com/nodejs/node/pull/6777, which
// was merged in Node v6.2.1, we don't merge chunks anymore.
// Not really correct, but it seems we cannot do anything with it.
// Though it has been fixed again in Node v10.0.0
assert.deepEqual(res, ["\uD83D", "\uDE3B"]);
} else {
assert.deepEqual(res, ["\uD83D\uDE3B"]); // We should have only 1 chunk.
}
},
checkOutput: function(res) { assert.deepEqual(res, ["\uD83D\uDE3B"]); }, // We should have only 1 chunk.
}));

it("Encoding using internal modules: utf8", checkEncodeStream({
Expand Down Expand Up @@ -264,13 +254,13 @@ describe("Streaming mode", function() {

it("Decoding of uneven length buffers from UTF-16BE - 2", checkDecodeStream({
encoding: "UTF-16BE",
input: [[0x00, 0x61, 0x00], [0x62, 0x00, 0x63]],
input: [[0x00, 0x61, 0x00], [0x62, 0x00], [0x63]],
output: "abc"
}));

it("Decoding of uneven length buffers from UTF-16", checkDecodeStream({
encoding: "UTF-16",
input: [[0x61], [0x0], [0x20], [0x0]],
input: [[0x61], [0x0, 0x20], [0x0]],
output: "a "
}));

Expand Down
Loading

0 comments on commit 84ee650

Please sign in to comment.