Implement UTF-16LE encoding, update tests, adjust codec interface

Three major reasons for reimplementing UTF-16 rather than using the native codec: 1. We want to remove StringDecoder & Buffer references due to #235. 2. StringDecoder is inconsistent in handling surrogates on Node v6-9. 3. The NPM module string_decoder gives strange results when processing chunks - it sometimes prepends '\u0000', likely due to a bug. Performance was and is a major concern here. The decoder shouldn't be affected because it uses backend methods directly. The encoder is affected by the newly introduced character-level loop. It's still very fast (~450Mb/s), so I'm not too worried. If needed, we can make it about 4x faster in Node.js by introducing a dedicated backend method. Browser speeds will be the same.
ashtuchkin · Jul 16, 2020 · 84ee650 · 84ee650
1 parent e567849
commit 84ee650
Show file tree

Hide file tree

Showing 6 changed files with 360 additions and 87 deletions.
diff --git a/encodings/internal.js b/encodings/internal.js
@@ -9,8 +9,7 @@ module.exports = {
     cesu8:  { type: "_internal", bomAware: true},
     unicode11utf8: "utf8",
 
-    ucs2:   { type: "_internal", bomAware: true},
-    utf16le: "ucs2",
+    // NOTE: utf-16le/ucs2 are in utf16.js.
 
     binary: { type: "_internal" },
     base64: { type: "_internal" },

diff --git a/encodings/utf16.js b/encodings/utf16.js
@@ -1,17 +1,123 @@
 "use strict";
 
-// Note: UTF16-LE (or UCS2) codec is Node.js native. See encodings/internal.js
+// == UTF16-LE codec. ==========================================================
+// Note: We're not using Node.js native codec because StringDecoder implementation is buggy
+// (adds \0 in some chunks; doesn't flag non-even number of bytes). We do use raw encoding/decoding
+// routines for performance, though.
+
+/**
+ * UTF-16LE codec entry: a factory for the streaming encoder and decoder
+ * defined in this file.
+ */
+exports.utf16le = class Utf16LECodec {
+    // Truthy flag read by iconv.getEncoder/getDecoder when deciding on BOM wrappers.
+    get bomAware() {
+        return true;
+    }
+
+    // `options` is accepted for interface compatibility but not used here.
+    createEncoder(options, iconv) {
+        const { backend } = iconv;
+        return new Utf16LEEncoder(backend);
+    }
+
+    // `options` is accepted for interface compatibility but not used here.
+    createDecoder(options, iconv) {
+        const { backend, defaultCharUnicode } = iconv;
+        return new Utf16LEDecoder(backend, defaultCharUnicode);
+    }
+};
+
+/**
+ * Streaming UTF-16LE encoder: every UTF-16 code unit of the input string is
+ * stored as one 16-bit element of the output. Keeps no state between chunks.
+ */
+class Utf16LEEncoder {
+    constructor(backend) {
+        this.backend = backend;
+    }
+
+    /**
+     * Encode one chunk of text.
+     * @param {string} str - text to encode
+     * @returns the value produced by backend.bytesToResult() for the filled byte buffer
+     */
+    write(str) {
+        const unitCount = str.length;
+        const bytes = this.backend.allocBytes(unitCount * 2);
+        // 16-bit view over the same memory as `bytes`.
+        // NOTE(review): typed-array stores use the platform byte order, so this
+        // presumably relies on a little-endian host - confirm.
+        const units = new Uint16Array(bytes.buffer, bytes.byteOffset, unitCount);
+        let pos = 0;
+        while (pos < unitCount) {
+            units[pos] = str.charCodeAt(pos);
+            pos += 1;
+        }
+        return this.backend.bytesToResult(bytes, bytes.length);
+    }
+
+    // Nothing is buffered between writes, so there is nothing to flush.
+    end() {}
+}
+
+/**
+ * Streaming UTF-16LE decoder.
+ *
+ * State carried between write() calls:
+ *  - overflowByte: first byte of a code unit that was split across chunks, or -1;
+ *  - prefixSurrogate: a high surrogate that ended the previous chunk's text, held
+ *    back so it is emitted together with the text of a later chunk.
+ */
+class Utf16LEDecoder {
+    constructor(backend, defaultChar) {
+        this.backend = backend;
+        this.defaultChar = defaultChar;
+        this.overflowByte = -1;
+        this.prefixSurrogate = undefined;
+    }
+
+    /**
+     * Decode one chunk of bytes.
+     * @param buf - byte view; its `buffer`, `byteOffset`, `length` and indexed bytes are read
+     * @returns {string} decoded text, minus any trailing high surrogate that is held back
+     */
+    write(buf) {
+        if (buf.length === 0) {
+            return '';
+        }
+        let byteOffset = buf.byteOffset;
+        let byteLen = buf.length;
+
+        // Complete the code unit started by the previous chunk's dangling byte.
+        let prefix = '';
+        if (this.overflowByte !== -1) {
+            byteOffset++; byteLen--;
+            prefix = String.fromCharCode(this.overflowByte + (buf[0] << 8));
+        }
+
+        // An odd number of remaining bytes leaves a new dangling byte for the next call.
+        if (byteLen & 1) {
+            this.overflowByte = buf[buf.length-1];
+            byteLen--;
+        } else {
+            this.overflowByte = -1;
+        }
+
+        let chars;
+        // Parentheses are required: `===` binds tighter than `&`, so without them the
+        // test is always falsy and the zero-copy branch never runs.
+        if ((byteOffset & 1) === 0) {
+            // Aligned offset: view the input's ArrayBuffer directly, no copy.
+            // NOTE(review): this hands the backend a view over the caller's memory;
+            // assumes rawCharsToResult neither mutates nor retains it - confirm.
+            chars = new Uint16Array(buf.buffer, byteOffset, byteLen >> 1);
+        } else {
+            // Uint16Array requires an even byte offset, so copy into a fresh aligned buffer.
+            chars = this.backend.allocRawChars(byteLen >> 1);
+            const srcByteView = new Uint8Array(buf.buffer, byteOffset, byteLen);
+            const destByteView = new Uint8Array(chars.buffer, chars.byteOffset, byteLen);
+            destByteView.set(srcByteView);
+        }
+
+        let res = prefix + this.backend.rawCharsToResult(chars, chars.length);
+        if (res) {
+            // Re-attach the high surrogate held back from the previous chunk.
+            if (this.prefixSurrogate) {
+                res = this.prefixSurrogate + res;
+                this.prefixSurrogate = undefined;
+            }
+
+            // Hold back a trailing high surrogate (0xD800..0xDBFF) for the next chunk.
+            const lastChar = res.charCodeAt(res.length-1);
+            if (0xD800 <= lastChar && lastChar < 0xDC00) {
+                this.prefixSurrogate = res[res.length-1];
+                res = res.slice(0, -1);
+            }
+        }
+        return res;
+    }
+
+    /**
+     * Flush pending state at end of input and reset it.
+     * @returns {string|undefined} the held-back surrogate (if any) followed by
+     *     defaultChar for a dangling odd byte (if any); undefined when nothing is pending
+     */
+    end() {
+        if (this.prefixSurrogate || this.overflowByte !== -1) {
+            const res = (this.prefixSurrogate ? this.prefixSurrogate : '') + (this.overflowByte !== -1 ? this.defaultChar : '');
+            this.prefixSurrogate = undefined;
+            this.overflowByte = -1;
+            return res;
+        }
+    }
+}
+exports.ucs2 = "utf16le";  // Alias
+
 
 // == UTF16-BE codec. ==========================================================
 
 exports.utf16be = class Utf16BECodec {
-    get encoder() { return Utf16BEEncoder; }
-    get decoder() { return Utf16BEDecoder; }
+    createEncoder(options, iconv) {
+        return new Utf16BEEncoder(iconv.backend);
+    }
+    createDecoder(options, iconv) {
+        return new Utf16BEDecoder(iconv.backend, iconv.defaultCharUnicode);
+    }
     get bomAware() { return true; }
 }
 
 class Utf16BEEncoder {
-    constructor(opts, codec, backend) {
+    constructor(backend) {
         this.backend = backend;
     }
 
@@ -30,30 +136,59 @@ class Utf16BEEncoder {
 }
 
 class Utf16BEDecoder {
-    constructor(opts, codec, backend) {
+    constructor(backend, defaultChar) {
         this.backend = backend;
+        this.defaultChar = defaultChar;
         this.overflowByte = -1;
+        this.prefixSurrogate = undefined;
     }
 
     write(buf) {
+        if (buf.length === 0) {
+            return '';
+        }
+
         const chars = this.backend.allocRawChars((buf.length+1) >> 1);
         let charsPos = 0, i = 0;
 
-        if (this.overflowByte !== -1 && i < buf.length) {
+        if (this.overflowByte !== -1) {
             chars[charsPos++] = (this.overflowByte << 8) + buf[i++];
         }
 
+        // NOTE: we can win another 10% perf by using chars[i >> 1].
+        // NOTE: the double-reverse method takes almost the same time.
         for (; i < buf.length-1; i += 2) {
             chars[charsPos++] = (buf[i] << 8) + buf[i+1];
         }
 
         this.overflowByte = (i == buf.length-1) ? buf[i] : -1;
 
-        return this.backend.rawCharsToResult(chars, charsPos);
+        let res = this.backend.rawCharsToResult(chars, charsPos);
+        if (res) {
+            // Add high surrogate from previous chunk.
+            if (this.prefixSurrogate) {
+                res = this.prefixSurrogate + res;
+                this.prefixSurrogate = undefined;
+            }
+
+            // Slice off a new high surrogate at the end of the current chunk.
+            const lastChar = res.charCodeAt(res.length-1);
+            if (0xD800 <= lastChar && lastChar < 0xDC00) {
+                this.prefixSurrogate = res[res.length-1];
+                res = res.slice(0, -1);
+            }
+        }
+        return res;
+
     }
 
     end() {
-        this.overflowByte = -1;
+        if (this.prefixSurrogate || this.overflowByte !== -1) {
+            const res = (this.prefixSurrogate ? this.prefixSurrogate : '') + (this.overflowByte !== -1 ? this.defaultChar : '');
+            this.prefixSurrogate = undefined;
+            this.overflowByte = -1;
+            return res;
+        }
     }
 }
 
@@ -67,39 +202,25 @@ class Utf16BEDecoder {
 // Encoder uses UTF-16LE and prepends BOM (which can be overridden with addBOM: false).
 
 exports.utf16 = class Utf16Codec {
-    constructor(opts, iconv) {
-        this.iconv = iconv;
-    }
-    get encoder() { return Utf16Encoder; }
-    get decoder() { return Utf16Decoder; }
-}
-
-class Utf16Encoder {
-    constructor(options, codec) {
+    createEncoder(options, iconv) {
         options = options || {};
         if (options.addBOM === undefined)
             options.addBOM = true;
-        this.encoder = codec.iconv.getEncoder(options.use || 'utf-16le', options);
+        return iconv.getEncoder('utf-16le', options);
     }
-
-    // Pass-through to this.encoder
-    write(str) {
-        return this.encoder.write(str);
-    }
-
-    end() {
-        return this.encoder.end();
+    createDecoder(options, iconv) {
+        return new Utf16Decoder(options, iconv);
     }
 }
 
 class Utf16Decoder {
-    constructor(options, codec) {
+    constructor(options, iconv) {
         this.decoder = null;
         this.initialBufs = [];
         this.initialBufsLen = 0;
 
         this.options = options || {};
-        this.iconv = codec.iconv;
+        this.iconv = iconv;
     }
 
     write(buf) {

diff --git a/lib/index.js b/lib/index.js
@@ -105,8 +105,11 @@ iconv._canonicalizeEncoding = function(encoding) {
 }
 
 iconv.getEncoder = function getEncoder(encoding, options) {
-    var codec = iconv.getCodec(encoding),
-        encoder = new codec.encoder(options, codec, iconv.backend);
+    const codec = iconv.getCodec(encoding);
+
+    let encoder = codec.createEncoder
+        ? codec.createEncoder(options, iconv)
+        : new codec.encoder(options, codec, iconv.backend);
 
     if (codec.bomAware && options && options.addBOM)
         encoder = new bomHandling.PrependBOM(encoder, options);
@@ -115,8 +118,11 @@ iconv.getEncoder = function getEncoder(encoding, options) {
 }
 
 iconv.getDecoder = function getDecoder(encoding, options) {
-    var codec = iconv.getCodec(encoding),
-        decoder = new codec.decoder(options, codec, iconv.backend);
+    const codec = iconv.getCodec(encoding);
+
+    let decoder = codec.createDecoder
+        ? codec.createDecoder(options, iconv)
+        : new codec.decoder(options, codec, iconv.backend);
 
     if (codec.bomAware && !(options && options.stripBOM === false))
         decoder = new bomHandling.StripBOM(decoder, options);

diff --git a/test/streams-test.js b/test/streams-test.js
@@ -213,17 +213,7 @@ describe("Streaming mode", function() {
         encoding: "ucs2",
         input: [[0x3D], [0xD8, 0x3B], [0xDE]], // U+1F63B, 😻, SMILING CAT FACE WITH HEART-SHAPED EYES
         outputType: false, // Don't concat
-        checkOutput: function(res) {
-            if (semver.satisfies(process.version, '>= 6.2.1 < 10.0.0')) {
-                // After a string_decoder rewrite in https://github.com/nodejs/node/pull/6777, which
-                // was merged in Node v6.2.1, we don't merge chunks anymore.
-                // Not really correct, but it seems we cannot do anything with it.
-                // Though it has been fixed again in Node v10.0.0
-                assert.deepEqual(res, ["\uD83D", "\uDE3B"]);
-            } else {
-                assert.deepEqual(res, ["\uD83D\uDE3B"]); // We should have only 1 chunk.
-            }
-        },
+        checkOutput: function(res) { assert.deepEqual(res, ["\uD83D\uDE3B"]); }, // We should have only 1 chunk.
     }));
 
     it("Encoding using internal modules: utf8", checkEncodeStream({
@@ -264,13 +254,13 @@ describe("Streaming mode", function() {
 
     it("Decoding of uneven length buffers from UTF-16BE - 2", checkDecodeStream({
         encoding: "UTF-16BE",
-        input: [[0x00, 0x61, 0x00], [0x62, 0x00, 0x63]],
+        input: [[0x00, 0x61, 0x00], [0x62, 0x00], [0x63]],
         output: "abc"
     }));
 
     it("Decoding of uneven length buffers from UTF-16", checkDecodeStream({
         encoding: "UTF-16",
-        input: [[0x61], [0x0], [0x20], [0x0]],
+        input: [[0x61], [0x0, 0x20], [0x0]],
         output: "a "
     }));