From 5e356a1a3b9c82de9f2e8afb0e8a755fadb19430 Mon Sep 17 00:00:00 2001 From: Joshua Bell Date: Wed, 6 Apr 2016 20:12:14 -0700 Subject: [PATCH] Drop support for encoding to UTF-16, per spec update --- README.md | 63 ++++++++++++++++------------ bower.json | 2 +- lib/encoding.js | 48 ++++++++++----------- package.json | 2 +- test/test-misc.js | 84 +++++++++++++++++++++++-------------- test/test-utf.js | 23 +++++----- test/test-x-user-defined.js | 2 +- 7 files changed, 130 insertions(+), 94 deletions(-) diff --git a/README.md b/README.md index d1e3006..c0f0c18 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,16 @@ text-encoding ============== -This is a polyfill for the [Encoding Living Standard](https://encoding.spec.whatwg.org/) -API for the Web, allowing encoding and decoding of textual data to and from Typed Array -buffers for binary data in JavaScript. +This is a polyfill for the [Encoding Living +Standard](https://encoding.spec.whatwg.org/) API for the Web, allowing +encoding and decoding of textual data to and from Typed Array buffers +for binary data in JavaScript. -By default it adheres to the spec and does not support *encoding* to non-UTF encodings, -only *decoding*. It is also implemented to match the specification's algorithms, rather -than for performance. The intended use is within Web pages, so it has no dependency -on server frameworks or particular module schemes. +By default it adheres to the spec and does not support *encoding* to +legacy encodings, only *decoding*. It is also implemented to match the +specification's algorithms, rather than for performance. The intended +use is within Web pages, so it has no dependency on server frameworks +or particular module schemes. Basic examples and tests are included. @@ -49,7 +51,7 @@ Or add it to your `bower.json` dependencies. 
Basic Usage ```js - var uint8array = new TextEncoder(encoding).encode(string); + var uint8array = new TextEncoder().encode(string); var string = new TextDecoder(encoding).decode(uint8array); ``` @@ -67,30 +69,38 @@ Streaming Decode All encodings from the Encoding specification are supported: -utf-8 ibm866 iso-8859-2 iso-8859-3 iso-8859-4 iso-8859-5 iso-8859-6 iso-8859-7 iso-8859-8 iso-8859-8-i iso-8859-10 iso-8859-13 iso-8859-14 iso-8859-15 iso-8859-16 koi8-r koi8-u macintosh windows-874 windows-1250 windows-1251 windows-1252 windows-1253 windows-1254 windows-1255 windows-1256 windows-1257 windows-1258 x-mac-cyrillic gb18030 hz-gb-2312 big5 euc-jp iso-2022-jp shift_jis euc-kr replacement utf-16be utf-16le x-user-defined - -(Some encodings may be supported under other names, e.g. ascii, iso-8859-1, etc. -See [Encoding](https://encoding.spec.whatwg.org/) for additional labels for each encoding.) - -Encodings other than **utf-8**, **utf-16le** and **utf-16be** require an additional -`encoding-indexes.js` file to be included. It is rather large -(596kB uncompressed, 188kB gzipped); portions may be deleted if +utf-8 ibm866 iso-8859-2 iso-8859-3 iso-8859-4 iso-8859-5 iso-8859-6 +iso-8859-7 iso-8859-8 iso-8859-8-i iso-8859-10 iso-8859-13 iso-8859-14 +iso-8859-15 iso-8859-16 koi8-r koi8-u macintosh windows-874 +windows-1250 windows-1251 windows-1252 windows-1253 windows-1254 +windows-1255 windows-1256 windows-1257 windows-1258 x-mac-cyrillic +gb18030 hz-gb-2312 big5 euc-jp iso-2022-jp shift_jis euc-kr +replacement utf-16be utf-16le x-user-defined + +(Some encodings may be supported under other names, e.g. ascii, +iso-8859-1, etc. See [Encoding](https://encoding.spec.whatwg.org/) for +additional labels for each encoding.) + +Encodings other than **utf-8**, **utf-16le** and **utf-16be** require +an additional `encoding-indexes.js` file to be included. 
It is rather +large (596kB uncompressed, 188kB gzipped); portions may be deleted if support for some encodings is not required. ### Non-Standard Behavior ### -As required by the specification, only encoding to **utf-8**, -**utf-16le** and **utf-16be** is supported. If you want to try it out, you can -force a non-standard behavior by passing the `NONSTANDARD_allowLegacyEncoding` -option to TextEncoder. For example: +As required by the specification, only encoding to **utf-8** is +supported. If you want to try it out, you can force a non-standard +behavior by passing the `NONSTANDARD_allowLegacyEncoding` option to +TextEncoder and a label. For example: ```js var uint8array = new TextEncoder( 'windows-1252', { NONSTANDARD_allowLegacyEncoding: true }).encode(text); ``` -But note that the above won't work if you're using the polyfill in a browser that -natively supports the TextEncoder API natively, since the polyfill won't be used! +But note that the above won't work if you're using the polyfill in a +browser that natively supports the TextEncoder API, since the +polyfill won't be used! You can force the polyfill to be used by using this before the polyfill: @@ -100,7 +110,8 @@ window.TextEncoder = window.TextDecoder = null; ``` -To support the legacy encodings (which may be stateful), the TextEncoder `encode()` -method accepts an optional dictionary and `stream` option, -e.g. `encoder.encode(string, {stream: true});` This is not needed for the -stateless UTF encodings since the input is always in complete code points. +To support the legacy encodings (which may be stateful), the +TextEncoder `encode()` method accepts an optional dictionary and +`stream` option, e.g. `encoder.encode(string, {stream: true});` This +is not needed for standard encoding since the input is always in +complete code points. 
diff --git a/bower.json b/bower.json index 8d8146e..9c83327 100644 --- a/bower.json +++ b/bower.json @@ -1,6 +1,6 @@ { "name": "text-encoding", - "version": "0.5.5", + "version": "0.6.0", "homepage": "https://github.com/inexorabletash/text-encoding", "authors": [ "Joshua Bell ", diff --git a/lib/encoding.js b/lib/encoding.js index 3ed40de..1f021d6 100644 --- a/lib/encoding.js +++ b/lib/encoding.js @@ -1192,15 +1192,13 @@ if (typeof module !== "undefined" && module.exports) { /** * @constructor - * @param {string=} label The label of the encoding; - * defaults to 'utf-8'. - * @param {Object=} options + * @param {string=} label The label of the encoding. NONSTANDARD. + * @param {Object=} options NONSTANDARD. */ function TextEncoder(label, options) { // Web IDL conventions if (!(this instanceof TextEncoder)) throw TypeError('Called as a function. Did you forget \'new\'?'); - label = label !== undefined ? String(label) : DEFAULT_ENCODING; options = ToDictionary(options); // A TextEncoder object has an associated encoding and encoder. @@ -1216,34 +1214,36 @@ if (typeof module !== "undefined" && module.exports) { /** @private @type {string} */ this._fatal = Boolean(options['fatal']) ? 'fatal' : 'replacement'; - // 1. Let encoding be the result of getting an encoding from utfLabel. - var encoding = getEncoding(label); - - // 2. If encoding is failure, or is not UTF-8, UTF-16BE, or - // UTF-16LE, throw a RangeError. - if (encoding === null || encoding.name === 'replacement' || - (!includes(['UTF-8','UTF-16LE', 'UTF-16BE'], encoding.name) && - !Boolean(options['NONSTANDARD_allowLegacyEncoding']))) - throw RangeError('Unknown encoding: ' + label); - if (!encoders[encoding.name]) { - throw Error('Encoder not present.' + - ' Did you forget to include encoding-indexes.js?'); - } - - // 3. Let enc be a new TextEncoder object. + // 1. Let enc be a new TextEncoder object. var enc = this; - // 4. Set enc's encoding to encoding. - enc._encoding = encoding; + // 2. 
Set enc's encoding to UTF-8's encoder. + if (Boolean(options['NONSTANDARD_allowLegacyEncoding'])) { + // NONSTANDARD behavior. + label = label !== undefined ? String(label) : DEFAULT_ENCODING; + var encoding = getEncoding(label); + if (encoding === null || encoding.name === 'replacement') + throw RangeError('Unknown encoding: ' + label); + if (!encoders[encoding.name]) { + throw Error('Encoder not present.' + + ' Did you forget to include encoding-indexes.js?'); + } + enc._encoding = encoding; + } else { + // Standard behavior. + enc._encoding = getEncoding('utf-8'); - // 5. Set enc's encoder to a new enc's encoding's encoder. - // (Done during encode itself, due to nonstandard streaming support.) + if (label !== undefined && 'console' in global) { + console.warn('TextEncoder constructor called with encoding label, ' + + 'which is ignored.'); + } + } // For pre-ES5 runtimes: if (!Object.defineProperty) this.encoding = enc._encoding.name.toLowerCase(); - // 6. Return enc. + // 3. Return enc. return enc; } diff --git a/package.json b/package.json index 16a267a..d7a21cc 100644 --- a/package.json +++ b/package.json @@ -12,7 +12,7 @@ "Pierre Queinnec ", "Zack Weinberg " ], - "version": "0.5.5", + "version": "0.6.0", "description": "Polyfill for the Encoding Living Standard's API.", "main": "index.js", "files": [ diff --git a/test/test-misc.js b/test/test-misc.js index c6bf393..dc12036 100644 --- a/test/test-misc.js +++ b/test/test-misc.js @@ -1,7 +1,7 @@ // This is free and unencumbered software released into the public domain. // See LICENSE.md for more information. 
-var UTF_ENCODINGS = ['utf-8', 'utf-16le', 'utf-16be']; +var THE_ENCODING = ['utf-8']; var LEGACY_ENCODINGS = [ 'ibm866', 'iso-8859-2', 'iso-8859-3', 'iso-8859-4', 'iso-8859-5', @@ -11,9 +11,14 @@ var LEGACY_ENCODINGS = [ 'windows-1252', 'windows-1253', 'windows-1254', 'windows-1255', 'windows-1256', 'windows-1257', 'windows-1258', 'x-mac-cyrillic', 'gbk', 'gb18030', 'big5', 'euc-jp', 'iso-2022-jp', 'shift_jis', - 'euc-kr' + 'euc-kr', 'utf-16le', 'utf-16be' ]; +var ASCII_SUPERSETS = THE_ENCODING.concat(LEGACY_ENCODINGS) + .filter(function(e) { + return e !== 'utf-16le' && e !== 'utf-16be'; + }); + // Miscellaneous tests test(function() { @@ -29,7 +34,6 @@ test(function() { test(function() { assert_true('encoding' in new TextEncoder()); assert_equals(new TextEncoder().encoding, 'utf-8'); - assert_equals(new TextEncoder('utf-16le').encoding, 'utf-16le'); assert_true('encoding' in new TextDecoder()); assert_equals(new TextDecoder().encoding, 'utf-8'); @@ -53,8 +57,8 @@ test(function() { badStrings.forEach( function(t) { - var encoded = new TextEncoder('utf-8').encode(t.input); - var decoded = new TextDecoder('utf-8').decode(encoded); + var encoded = new TextEncoder().encode(t.input); + var decoded = new TextDecoder().decode(encoded); assert_equals(t.expected, decoded); }); }, 'bad data'); @@ -167,21 +171,40 @@ test(function() { }, 'Encoding names'); test(function() { - ['utf-8', 'utf-16le', 'utf-16be'].forEach(function(encoding) { - var string = '\x00123ABCabc\x80\xFF\u0100\u1000\uFFFD\uD800\uDC00\uDBFF\uDFFF'; - var encoded = new TextEncoder(encoding).encode(string); + var string = '\x00123ABCabc\x80\xFF\u0100\u1000\uFFFD\uD800\uDC00\uDBFF\uDFFF'; + var cases = [ + { + encoding: 'utf-8', + encoded: [0, 49, 50, 51, 65, 66, 67, 97, 98, 99, 194, 128, 195, 191, 196, + 128, 225, 128, 128, 239, 191, 189, 240, 144, 128, 128, 244, 143, + 191, 191] + }, + { + encoding: 'utf-16le', + encoded: [0, 0, 49, 0, 50, 0, 51, 0, 65, 0, 66, 0, 67, 0, 97, 0, 98, 0, + 99, 0, 128, 0, 255, 
0, 0, 1, 0, 16, 253, 255, 0, 216, 0, 220, + 255, 219, 255, 223] + }, + { + encoding: 'utf-16be', + encoded: [0, 0, 0, 49, 0, 50, 0, 51, 0, 65, 0, 66, 0, 67, 0, 97, 0, 98, 0, + 99, 0, 128, 0, 255, 1, 0, 16, 0, 255, 253, 216, 0, 220, 0, 219, + 255, 223, 255] + } + ]; + cases.forEach(function(c) { for (var len = 1; len <= 5; ++len) { - var out = '', decoder = new TextDecoder(encoding); - for (var i = 0; i < encoded.length; i += len) { + var out = '', decoder = new TextDecoder(c.encoding); + for (var i = 0; i < c.encoded.length; i += len) { var sub = []; - for (var j = i; j < encoded.length && j < i + len; ++j) { - sub.push(encoded[j]); + for (var j = i; j < c.encoded.length && j < i + len; ++j) { + sub.push(c.encoded[j]); } out += decoder.decode(new Uint8Array(sub), {stream: true}); } out += decoder.decode(); - assert_equals(out, string, 'streaming decode ' + encoding); + assert_equals(out, string, 'streaming decode ' + c.encoding); } }); }, 'Streaming Decode'); @@ -193,9 +216,7 @@ test(function() { }, 'Shift_JIS Decode'); test(function() { - var encodings = ['utf-8'].concat(LEGACY_ENCODINGS); - - encodings.forEach(function(encoding) { + ASCII_SUPERSETS.forEach(function(encoding) { var string = '', bytes = []; for (var i = 0; i < 128; ++i) { @@ -207,9 +228,8 @@ test(function() { string += String.fromCharCode(i); bytes.push(i); } - var ascii_encoded = new TextEncoder('utf-8').encode(string); + var ascii_encoded = new TextEncoder().encode(string); assert_equals(new TextDecoder(encoding).decode(ascii_encoded), string, encoding); - //assert_array_equals(new TextEncoder(encoding).encode(string), bytes, encoding); }); }, 'Supersets of ASCII decode ASCII correctly'); @@ -228,16 +248,11 @@ test(function() { }, 'Non-fatal errors at EOF'); test(function() { - UTF_ENCODINGS.forEach(function(encoding) { - assert_equals(new TextDecoder(encoding).encoding, encoding); - assert_equals(new TextEncoder(encoding).encoding, encoding); - }); - LEGACY_ENCODINGS.forEach(function(encoding) { 
assert_equals(new TextDecoder(encoding).encoding, encoding); - assert_throws({name: 'RangeError'}, function() { new TextEncoder(encoding); }); + assert_equals(new TextEncoder(encoding).encoding, 'utf-8'); }); -}, 'Non-UTF encodings supported only for decode, not encode'); +}, 'Legacy encodings supported only for decode, not encode'); test(function() { [ @@ -248,8 +263,7 @@ test(function() { 'iso-2022-kr' ].forEach(function(encoding) { - assert_throws({name: 'RangeError'}, - function() { new TextEncoder(encoding); }); + assert_equals(new TextEncoder(encoding).encoding, 'utf-8'); assert_throws({name: 'RangeError'}, function() { @@ -300,10 +314,6 @@ test(function() { assert_throws({name: 'TypeError'}, function() { new TextDecoder('utf-8').decode(null, ''); }, 'String should not coerce to dictionary.'); - - assert_throws({name: 'RangeError'}, - function() { new TextEncoder(null); }, - 'Null should coerce to "null" and be invalid encoding name.'); }, 'Invalid parameters'); test(function() { @@ -357,3 +367,15 @@ test(function() { }); }, 'NONSTANDARD - iso-2022-jp encoding attack (encoding)'); + +['utf-16le', 'utf-16be'].forEach(function(encoding) { + test(function() { + var encoder = new TextEncoder(encoding, {NONSTANDARD_allowLegacyEncoding: true}); + var decoder = new TextDecoder(encoding); + + var sample = "z\xA2\u6C34\uD834\uDD1E\uDBFF\uDFFD"; + + assert_equals(decoder.decode(encoder.encode(sample)), sample); + + }, 'NONSTANDARD - ' + encoding + ' (encoding)'); +}); diff --git a/test/test-utf.js b/test/test-utf.js index 786c945..e469f5c 100644 --- a/test/test-utf.js +++ b/test/test-utf.js @@ -71,19 +71,26 @@ function genblock(from, len, skip) { return block.join(''); } +function encode_utf16le(s) { return encode_utf16(s, true); } +function encode_utf16be(s) { return encode_utf16(s, false); } +function encode_utf16(s, le) { + var a = new Uint8Array(s.length * 2), view = new DataView(a.buffer); + s.split('').forEach(function(c, i) { + view.setUint16(i * 2, 
c.charCodeAt(0), le); + }); + return a; +} + function test_utf_roundtrip () { var MIN_CODEPOINT = 0; var MAX_CODEPOINT = 0x10FFFF; var BLOCK_SIZE = 0x1000; var SKIP_SIZE = 31; - var TE_U16LE = new TextEncoder("UTF-16LE"); var TD_U16LE = new TextDecoder("UTF-16LE"); - - var TE_U16BE = new TextEncoder("UTF-16BE"); var TD_U16BE = new TextDecoder("UTF-16BE"); - var TE_U8 = new TextEncoder("UTF-8"); + var TE_U8 = new TextEncoder(); var TD_U8 = new TextDecoder("UTF-8"); for (var i = MIN_CODEPOINT; i < MAX_CODEPOINT; i += BLOCK_SIZE) { @@ -91,11 +98,11 @@ function test_utf_roundtrip () { var block = genblock(i, BLOCK_SIZE, SKIP_SIZE); // test UTF-16LE, UTF-16BE, and UTF-8 encodings against themselves - var encoded = TE_U16LE.encode(block); + var encoded = encode_utf16le(block); var decoded = TD_U16LE.decode(encoded); assert_string_equals(block, decoded, "UTF-16LE round trip " + block_tag); - encoded = TE_U16BE.encode(block); + encoded = encode_utf16be(block); decoded = TD_U16BE.decode(encoded); assert_string_equals(block, decoded, "UTF-16BE round trip " + block_tag); @@ -130,10 +137,6 @@ function test_utf_samples () { cases.forEach( function(t) { - var encoded = new TextEncoder(t.encoding).encode(sample); - assert_array_equals(encoded, t.expected, - "expected equal encodings - " + t.encoding); - var decoded = new TextDecoder(t.encoding) .decode(new Uint8Array(t.expected)); assert_equals(decoded, sample, diff --git a/test/test-x-user-defined.js b/test/test-x-user-defined.js index 401511a..e8df0da 100644 --- a/test/test-x-user-defined.js +++ b/test/test-x-user-defined.js @@ -3,7 +3,7 @@ test( function() { - assert_throws({name: 'RangeError'}, function() { new TextEncoder('x-user-defined'); }); + assert_equals(new TextEncoder('x-user-defined').encoding, 'utf-8'); var decoder = new TextDecoder('x-user-defined'); for (var i = 0; i < 0x80; ++i) {