From 5e356a1a3b9c82de9f2e8afb0e8a755fadb19430 Mon Sep 17 00:00:00 2001
From: Joshua Bell <inexorabletash@gmail.com>
Date: Wed, 6 Apr 2016 20:12:14 -0700
Subject: [PATCH] Drop support for encoding to UTF-16, per spec update

---
 README.md                   | 63 ++++++++++++++++------------
 bower.json                  |  2 +-
 lib/encoding.js             | 48 ++++++++++-----------
 package.json                |  2 +-
 test/test-misc.js           | 84 +++++++++++++++++++++++--------------
 test/test-utf.js            | 23 +++++-----
 test/test-x-user-defined.js |  2 +-
 7 files changed, 130 insertions(+), 94 deletions(-)

diff --git a/README.md b/README.md
index d1e3006..c0f0c18 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,16 @@
 text-encoding
 ==============
 
-This is a polyfill for the [Encoding Living Standard](https://encoding.spec.whatwg.org/)
-API for the Web, allowing encoding and decoding of textual data to and from Typed Array
-buffers for binary data in JavaScript.
+This is a polyfill for the [Encoding Living
+Standard](https://encoding.spec.whatwg.org/) API for the Web, allowing
+encoding and decoding of textual data to and from Typed Array buffers
+for binary data in JavaScript.
 
-By default it adheres to the spec and does not support *encoding* to non-UTF encodings,
-only *decoding*. It is also implemented to match the specification's algorithms, rather
-than for performance. The intended use is within Web pages, so it has no dependency
-on server frameworks or particular module schemes.
+By default it adheres to the spec and does not support *encoding* to
+legacy encodings, only *decoding*. It is also implemented to match the
+specification's algorithms, rather than for performance. The intended
+use is within Web pages, so it has no dependency on server frameworks
+or particular module schemes.
 
 Basic examples and tests are included.
 
@@ -49,7 +51,7 @@ Or add it to your `bower.json` dependencies.
 Basic Usage
 
 ```js
-  var uint8array = new TextEncoder(encoding).encode(string);
+  var uint8array = new TextEncoder().encode(string);
   var string = new TextDecoder(encoding).decode(uint8array);
 ```
 
@@ -67,30 +69,38 @@ Streaming Decode
 
 All encodings from the Encoding specification are supported:
 
-utf-8 ibm866 iso-8859-2 iso-8859-3 iso-8859-4 iso-8859-5 iso-8859-6 iso-8859-7 iso-8859-8 iso-8859-8-i iso-8859-10 iso-8859-13 iso-8859-14 iso-8859-15 iso-8859-16 koi8-r koi8-u macintosh windows-874 windows-1250 windows-1251 windows-1252 windows-1253 windows-1254 windows-1255 windows-1256 windows-1257 windows-1258 x-mac-cyrillic gb18030 hz-gb-2312 big5 euc-jp iso-2022-jp shift_jis euc-kr replacement utf-16be utf-16le x-user-defined
-
-(Some encodings may be supported under other names, e.g. ascii, iso-8859-1, etc.
-See [Encoding](https://encoding.spec.whatwg.org/) for additional labels for each encoding.)
-
-Encodings other than **utf-8**, **utf-16le** and **utf-16be** require an additional
-`encoding-indexes.js` file to be included. It is rather large
-(596kB uncompressed, 188kB gzipped); portions may be deleted if
+utf-8 ibm866 iso-8859-2 iso-8859-3 iso-8859-4 iso-8859-5 iso-8859-6
+iso-8859-7 iso-8859-8 iso-8859-8-i iso-8859-10 iso-8859-13 iso-8859-14
+iso-8859-15 iso-8859-16 koi8-r koi8-u macintosh windows-874
+windows-1250 windows-1251 windows-1252 windows-1253 windows-1254
+windows-1255 windows-1256 windows-1257 windows-1258 x-mac-cyrillic
+gb18030 hz-gb-2312 big5 euc-jp iso-2022-jp shift_jis euc-kr
+replacement utf-16be utf-16le x-user-defined
+
+(Some encodings may be supported under other names, e.g. ascii,
+iso-8859-1, etc. See [Encoding](https://encoding.spec.whatwg.org/) for
+additional labels for each encoding.)
+
+Encodings other than **utf-8**, **utf-16le** and **utf-16be** require
+an additional `encoding-indexes.js` file to be included. It is rather
+large (596kB uncompressed, 188kB gzipped); portions may be deleted if
 support for some encodings is not required.
 
 ### Non-Standard Behavior ###
 
-As required by the specification, only encoding to **utf-8**,
-**utf-16le** and **utf-16be** is supported. If you want to try it out, you can
-force a non-standard behavior by passing the `NONSTANDARD_allowLegacyEncoding`
-option to TextEncoder. For example:
+As required by the specification, only encoding to **utf-8** is
+supported. If you want to try it out, you can force a non-standard
+behavior by passing the `NONSTANDARD_allowLegacyEncoding` option to
+TextEncoder and a label. For example:
 
 ```js
 var uint8array = new TextEncoder(
   'windows-1252', { NONSTANDARD_allowLegacyEncoding: true }).encode(text);
 ```
 
-But note that the above won't work if you're using the polyfill in a browser that
-natively supports the TextEncoder API natively, since the polyfill won't be used!
+But note that the above won't work if you're using the polyfill in a
+browser that natively supports the TextEncoder API, since the
+polyfill won't be used!
 
 You can force the polyfill to be used by using this before the polyfill:
 
@@ -100,7 +110,8 @@ window.TextEncoder = window.TextDecoder = null;
 </script>
 ```
 
-To support the legacy encodings (which may be stateful), the TextEncoder `encode()`
-method accepts an optional dictionary and `stream` option,
-e.g. `encoder.encode(string, {stream: true});` This is not needed for the
-stateless UTF encodings since the input is always in complete code points.
+To support the legacy encodings (which may be stateful), the
+TextEncoder `encode()` method accepts an optional dictionary and
+`stream` option, e.g. `encoder.encode(string, {stream: true});` This
+is not needed for standard encoding since the input is always in
+complete code points.
diff --git a/bower.json b/bower.json
index 8d8146e..9c83327 100644
--- a/bower.json
+++ b/bower.json
@@ -1,6 +1,6 @@
 {
   "name": "text-encoding",
-  "version": "0.5.5",
+  "version": "0.6.0",
   "homepage": "https://github.com/inexorabletash/text-encoding",
   "authors": [
     "Joshua Bell <inexorabletash@gmail.com>",
diff --git a/lib/encoding.js b/lib/encoding.js
index 3ed40de..1f021d6 100644
--- a/lib/encoding.js
+++ b/lib/encoding.js
@@ -1192,15 +1192,13 @@ if (typeof module !== "undefined" && module.exports) {
 
   /**
    * @constructor
-   * @param {string=} label The label of the encoding;
-   *     defaults to 'utf-8'.
-   * @param {Object=} options
+   * @param {string=} label The label of the encoding. NONSTANDARD.
+   * @param {Object=} options NONSTANDARD.
    */
   function TextEncoder(label, options) {
     // Web IDL conventions
     if (!(this instanceof TextEncoder))
       throw TypeError('Called as a function. Did you forget \'new\'?');
-    label = label !== undefined ? String(label) : DEFAULT_ENCODING;
     options = ToDictionary(options);
 
     // A TextEncoder object has an associated encoding and encoder.
@@ -1216,34 +1214,36 @@ if (typeof module !== "undefined" && module.exports) {
     /** @private @type {string} */
     this._fatal = Boolean(options['fatal']) ? 'fatal' : 'replacement';
 
-    // 1. Let encoding be the result of getting an encoding from utfLabel.
-    var encoding = getEncoding(label);
-
-    // 2. If encoding is failure, or is not UTF-8, UTF-16BE, or
-    // UTF-16LE, throw a RangeError.
-    if (encoding === null || encoding.name === 'replacement' ||
-        (!includes(['UTF-8','UTF-16LE', 'UTF-16BE'], encoding.name) &&
-         !Boolean(options['NONSTANDARD_allowLegacyEncoding'])))
-      throw RangeError('Unknown encoding: ' + label);
-    if (!encoders[encoding.name]) {
-      throw Error('Encoder not present.' +
-                  ' Did you forget to include encoding-indexes.js?');
-    }
-
-    // 3. Let enc be a new TextEncoder object.
+    // 1. Let enc be a new TextEncoder object.
     var enc = this;
 
-    // 4. Set enc's encoding to encoding.
-    enc._encoding = encoding;
+    // 2. Set enc's encoding to UTF-8's encoder.
+    if (Boolean(options['NONSTANDARD_allowLegacyEncoding'])) {
+      // NONSTANDARD behavior.
+      label = label !== undefined ? String(label) : DEFAULT_ENCODING;
+      var encoding = getEncoding(label);
+      if (encoding === null || encoding.name === 'replacement')
+        throw RangeError('Unknown encoding: ' + label);
+      if (!encoders[encoding.name]) {
+        throw Error('Encoder not present.' +
+                    ' Did you forget to include encoding-indexes.js?');
+      }
+      enc._encoding = encoding;
+    } else {
+      // Standard behavior.
+      enc._encoding = getEncoding('utf-8');
 
-    // 5. Set enc's encoder to a new enc's encoding's encoder.
-    // (Done during encode itself, due to nonstandard streaming support.)
+      if (label !== undefined && 'console' in global) {
+        console.warn('TextEncoder constructor called with encoding label, '
+                     + 'which is ignored.');
+      }
+    }
 
     // For pre-ES5 runtimes:
     if (!Object.defineProperty)
       this.encoding = enc._encoding.name.toLowerCase();
 
-    // 6. Return enc.
+    // 3. Return enc.
     return enc;
   }
 
diff --git a/package.json b/package.json
index 16a267a..d7a21cc 100644
--- a/package.json
+++ b/package.json
@@ -12,7 +12,7 @@
     "Pierre Queinnec <pierre@queinnec.org>",
     "Zack Weinberg <zackw@panix.com>"
   ],
-  "version": "0.5.5",
+  "version": "0.6.0",
   "description": "Polyfill for the Encoding Living Standard's API.",
   "main": "index.js",
   "files": [
diff --git a/test/test-misc.js b/test/test-misc.js
index c6bf393..dc12036 100644
--- a/test/test-misc.js
+++ b/test/test-misc.js
@@ -1,7 +1,7 @@
 // This is free and unencumbered software released into the public domain.
 // See LICENSE.md for more information.
 
-var UTF_ENCODINGS = ['utf-8', 'utf-16le', 'utf-16be'];
+var THE_ENCODING = ['utf-8'];
 
 var LEGACY_ENCODINGS = [
   'ibm866', 'iso-8859-2', 'iso-8859-3', 'iso-8859-4', 'iso-8859-5',
@@ -11,9 +11,14 @@ var LEGACY_ENCODINGS = [
   'windows-1252', 'windows-1253', 'windows-1254', 'windows-1255',
   'windows-1256', 'windows-1257', 'windows-1258', 'x-mac-cyrillic',
   'gbk', 'gb18030', 'big5', 'euc-jp', 'iso-2022-jp', 'shift_jis',
-  'euc-kr'
+  'euc-kr', 'utf-16le', 'utf-16be'
 ];
 
+var ASCII_SUPERSETS = THE_ENCODING.concat(LEGACY_ENCODINGS)
+      .filter(function(e) {
+        return e !== 'utf-16le' && e !== 'utf-16be';
+      });
+
 // Miscellaneous tests
 
 test(function() {
@@ -29,7 +34,6 @@ test(function() {
 test(function() {
   assert_true('encoding' in new TextEncoder());
   assert_equals(new TextEncoder().encoding, 'utf-8');
-  assert_equals(new TextEncoder('utf-16le').encoding, 'utf-16le');
 
   assert_true('encoding' in new TextDecoder());
   assert_equals(new TextDecoder().encoding, 'utf-8');
@@ -53,8 +57,8 @@ test(function() {
 
   badStrings.forEach(
     function(t) {
-      var encoded = new TextEncoder('utf-8').encode(t.input);
-      var decoded = new TextDecoder('utf-8').decode(encoded);
+      var encoded = new TextEncoder().encode(t.input);
+      var decoded = new TextDecoder().decode(encoded);
       assert_equals(t.expected, decoded);
     });
 }, 'bad data');
@@ -167,21 +171,40 @@ test(function() {
 }, 'Encoding names');
 
 test(function() {
-  ['utf-8', 'utf-16le', 'utf-16be'].forEach(function(encoding) {
-    var string = '\x00123ABCabc\x80\xFF\u0100\u1000\uFFFD\uD800\uDC00\uDBFF\uDFFF';
-    var encoded = new TextEncoder(encoding).encode(string);
+  var string = '\x00123ABCabc\x80\xFF\u0100\u1000\uFFFD\uD800\uDC00\uDBFF\uDFFF';
+  var cases = [
+    {
+      encoding: 'utf-8',
+      encoded: [0, 49, 50, 51, 65, 66, 67, 97, 98, 99, 194, 128, 195, 191, 196,
+                128, 225, 128, 128, 239, 191, 189, 240, 144, 128, 128, 244, 143,
+                191, 191]
+    },
+    {
+      encoding: 'utf-16le',
+      encoded: [0, 0, 49, 0, 50, 0, 51, 0, 65, 0, 66, 0, 67, 0, 97, 0, 98, 0,
+                99, 0, 128, 0, 255, 0, 0, 1, 0, 16, 253, 255, 0, 216, 0, 220,
+                255, 219, 255, 223]
+    },
+    {
+      encoding: 'utf-16be',
+      encoded: [0, 0, 0, 49, 0, 50, 0, 51, 0, 65, 0, 66, 0, 67, 0, 97, 0, 98, 0,
+                99, 0, 128, 0, 255, 1, 0, 16, 0, 255, 253, 216, 0, 220, 0, 219,
+                255, 223, 255]
+      }
+  ];
 
+  cases.forEach(function(c) {
     for (var len = 1; len <= 5; ++len) {
-      var out = '', decoder = new TextDecoder(encoding);
-      for (var i = 0; i < encoded.length; i += len) {
+      var out = '', decoder = new TextDecoder(c.encoding);
+      for (var i = 0; i < c.encoded.length; i += len) {
         var sub = [];
-        for (var j = i; j < encoded.length && j < i + len; ++j) {
-          sub.push(encoded[j]);
+        for (var j = i; j < c.encoded.length && j < i + len; ++j) {
+          sub.push(c.encoded[j]);
         }
         out += decoder.decode(new Uint8Array(sub), {stream: true});
       }
       out += decoder.decode();
-      assert_equals(out, string, 'streaming decode ' + encoding);
+      assert_equals(out, string, 'streaming decode ' + c.encoding);
     }
   });
 }, 'Streaming Decode');
@@ -193,9 +216,7 @@ test(function() {
 }, 'Shift_JIS Decode');
 
 test(function() {
-  var encodings = ['utf-8'].concat(LEGACY_ENCODINGS);
-
-  encodings.forEach(function(encoding) {
+  ASCII_SUPERSETS.forEach(function(encoding) {
     var string = '', bytes = [];
     for (var i = 0; i < 128; ++i) {
 
@@ -207,9 +228,8 @@ test(function() {
       string += String.fromCharCode(i);
       bytes.push(i);
     }
-    var ascii_encoded = new TextEncoder('utf-8').encode(string);
+    var ascii_encoded = new TextEncoder().encode(string);
     assert_equals(new TextDecoder(encoding).decode(ascii_encoded), string, encoding);
-      //assert_array_equals(new TextEncoder(encoding).encode(string), bytes, encoding);
   });
 }, 'Supersets of ASCII decode ASCII correctly');
 
@@ -228,16 +248,11 @@ test(function() {
 }, 'Non-fatal errors at EOF');
 
 test(function() {
-  UTF_ENCODINGS.forEach(function(encoding) {
-    assert_equals(new TextDecoder(encoding).encoding, encoding);
-    assert_equals(new TextEncoder(encoding).encoding, encoding);
-  });
-
   LEGACY_ENCODINGS.forEach(function(encoding) {
     assert_equals(new TextDecoder(encoding).encoding, encoding);
-    assert_throws({name: 'RangeError'}, function() { new TextEncoder(encoding); });
+    assert_equals(new TextEncoder(encoding).encoding, 'utf-8');
   });
-}, 'Non-UTF encodings supported only for decode, not encode');
+}, 'Legacy encodings supported only for decode, not encode');
 
 test(function() {
   [
@@ -248,8 +263,7 @@ test(function() {
     'iso-2022-kr'
   ].forEach(function(encoding) {
 
-    assert_throws({name: 'RangeError'},
-                  function() { new TextEncoder(encoding); });
+    assert_equals(new TextEncoder(encoding).encoding, 'utf-8');
 
     assert_throws({name: 'RangeError'},
                   function() {
@@ -300,10 +314,6 @@ test(function() {
   assert_throws({name: 'TypeError'},
                 function() { new TextDecoder('utf-8').decode(null, ''); },
                 'String should not coerce to dictionary.');
-
-  assert_throws({name: 'RangeError'},
-                function() { new TextEncoder(null); },
-                'Null should coerce to "null" and be invalid encoding name.');
 }, 'Invalid parameters');
 
 test(function() {
@@ -357,3 +367,15 @@ test(function() {
   });
 
 }, 'NONSTANDARD - iso-2022-jp encoding attack (encoding)');
+
+['utf-16le', 'utf-16be'].forEach(function(encoding) {
+  test(function() {
+    var encoder = new TextEncoder(encoding, {NONSTANDARD_allowLegacyEncoding: true});
+    var decoder = new TextDecoder(encoding);
+
+    var sample = "z\xA2\u6C34\uD834\uDD1E\uDBFF\uDFFD";
+
+    assert_equals(decoder.decode(encoder.encode(sample)), sample);
+
+  }, 'NONSTANDARD - ' + encoding + ' (encoding)');
+});
diff --git a/test/test-utf.js b/test/test-utf.js
index 786c945..e469f5c 100644
--- a/test/test-utf.js
+++ b/test/test-utf.js
@@ -71,19 +71,26 @@ function genblock(from, len, skip) {
   return block.join('');
 }
 
+function encode_utf16le(s) { return encode_utf16(s, true); }
+function encode_utf16be(s) { return encode_utf16(s, false); }
+function encode_utf16(s, le) {
+  var a = new Uint8Array(s.length * 2), view = new DataView(a.buffer);
+  s.split('').forEach(function(c, i) {
+    view.setUint16(i * 2, c.charCodeAt(0), le);
+  });
+  return a;
+}
+
 function test_utf_roundtrip () {
   var MIN_CODEPOINT = 0;
   var MAX_CODEPOINT = 0x10FFFF;
   var BLOCK_SIZE = 0x1000;
   var SKIP_SIZE = 31;
 
-  var TE_U16LE = new TextEncoder("UTF-16LE");
   var TD_U16LE = new TextDecoder("UTF-16LE");
-
-  var TE_U16BE = new TextEncoder("UTF-16BE");
   var TD_U16BE = new TextDecoder("UTF-16BE");
 
-  var TE_U8    = new TextEncoder("UTF-8");
+  var TE_U8    = new TextEncoder();
   var TD_U8    = new TextDecoder("UTF-8");
 
   for (var i = MIN_CODEPOINT; i < MAX_CODEPOINT; i += BLOCK_SIZE) {
@@ -91,11 +98,11 @@ function test_utf_roundtrip () {
     var block = genblock(i, BLOCK_SIZE, SKIP_SIZE);
 
     // test UTF-16LE, UTF-16BE, and UTF-8 encodings against themselves
-    var encoded = TE_U16LE.encode(block);
+    var encoded = encode_utf16le(block);
     var decoded = TD_U16LE.decode(encoded);
     assert_string_equals(block, decoded, "UTF-16LE round trip " + block_tag);
 
-    encoded = TE_U16BE.encode(block);
+    encoded = encode_utf16be(block);
     decoded = TD_U16BE.decode(encoded);
     assert_string_equals(block, decoded, "UTF-16BE round trip " + block_tag);
 
@@ -130,10 +137,6 @@ function test_utf_samples () {
 
   cases.forEach(
     function(t) {
-      var encoded = new TextEncoder(t.encoding).encode(sample);
-      assert_array_equals(encoded, t.expected,
-                          "expected equal encodings - " + t.encoding);
-
       var decoded = new TextDecoder(t.encoding)
                         .decode(new Uint8Array(t.expected));
       assert_equals(decoded, sample,
diff --git a/test/test-x-user-defined.js b/test/test-x-user-defined.js
index 401511a..e8df0da 100644
--- a/test/test-x-user-defined.js
+++ b/test/test-x-user-defined.js
@@ -3,7 +3,7 @@
 
 test(
   function() {
-    assert_throws({name: 'RangeError'}, function() { new TextEncoder('x-user-defined'); });
+    assert_equals(new TextEncoder('x-user-defined').encoding, 'utf-8');
 
     var decoder = new TextDecoder('x-user-defined');
     for (var i = 0; i < 0x80; ++i) {