From 9af762c0cee90bcea0009788bf9616cfd1129351 Mon Sep 17 00:00:00 2001 From: syumai Date: Tue, 28 May 2024 17:16:47 +0900 Subject: [PATCH 1/4] add 'ignore' to fallback option --- src/encoding-convert.js | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/encoding-convert.js b/src/encoding-convert.js index 7ffe9c0..ba23802 100644 --- a/src/encoding-convert.js +++ b/src/encoding-convert.js @@ -1672,5 +1672,8 @@ function handleFallback(results, bytes, fallbackOption) { } results[results.length] = 0x3B; // ; } + break; + case 'ignore': + break; } } From 55d48d29d62a67f74e773dbf305a9549912ca495 Mon Sep 17 00:00:00 2001 From: syumai Date: Tue, 28 May 2024 17:21:14 +0900 Subject: [PATCH 2/4] run npm run build --- encoding.js | 3 +++ 1 file changed, 3 insertions(+) diff --git a/encoding.js b/encoding.js index 496f673..9442071 100644 --- a/encoding.js +++ b/encoding.js @@ -1824,6 +1824,9 @@ function handleFallback(results, bytes, fallbackOption) { } results[results.length] = 0x3B; // ; } + break; + case 'ignore': + break; } } From efcbf06a66e559c3170cca0555b623230d7323ff Mon Sep 17 00:00:00 2001 From: syumai Date: Tue, 28 May 2024 17:21:19 +0900 Subject: [PATCH 3/4] add ignoring untranslatable unknown characters test case --- tests/test.js | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tests/test.js b/tests/test.js index f65a04a..402c3dd 100644 --- a/tests/test.js +++ b/tests/test.js @@ -636,6 +636,50 @@ describe('encoding', function() { assert.deepEqual(decoded, '🍣寿司ビール🍺'); }); }); + + describe('Ignore untranslatable unknown characters', function() { + it('SJIS', function() { + // Characters that cannot be converted to Shift_JIS ('🍣', '🍺') will be ignored. 
+ var sjis = encoding.convert(utf8, { + to: 'sjis', + from: 'utf-8', + fallback: 'ignore' + }); + var decoded = encoding.convert(sjis, { + to: 'unicode', + from: 'sjis' + }); + assert.deepEqual(decoded, '寿司ビール'); + }); + + it('EUC-JP', function() { + // Characters that cannot be converted to EUC-JP ('🍣', '🍺') will be ignored. + var eucjp = encoding.convert(utf8, { + to: 'euc-jp', + from: 'utf-8', + fallback: 'ignore' + }); + var decoded = encoding.convert(eucjp, { + to: 'unicode', + from: 'euc-jp' + }); + assert.deepEqual(decoded, '寿司ビール'); + }); + + it('JIS', function() { + // Characters that cannot be converted to JIS ('🍣', '🍺') will be ignored. + var jis = encoding.convert(utf8, { + to: 'jis', + from: 'utf-8', + fallback: 'ignore' + }); + var decoded = encoding.convert(jis, { + to: 'unicode', + from: 'jis' + }); + assert.deepEqual(decoded, '寿司ビール'); + }); + }); }); }); From 5055f4000958bc210a001d7ac5ebe1f262dbc63c Mon Sep 17 00:00:00 2001 From: syumai Date: Tue, 28 May 2024 17:49:10 +0900 Subject: [PATCH 4/4] update README --- README.md | 25 +++++++++++++++++++++++++ README_ja.md | 25 +++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/README.md b/README.md index 70a5ae6..0772ac6 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ Convert and detect character encoding in JavaScript. 
+ [Specify conversion options to the argument `to` as an object](#specify-conversion-options-to-the-argument-to-as-an-object) + [Specify the return type by the `type` option](#specify-the-return-type-by-the-type-option) + [Replacing characters with HTML entities when they cannot be represented](#replacing-characters-with-html-entities-when-they-cannot-be-represented) + + [Ignoring characters when they cannot be represented](#ignoring-characters-when-they-cannot-be-represented) + [Specify BOM in UTF-16](#specify-bom-in-utf-16) * [urlEncode : Encodes to percent-encoded string](#encodingurlencode-data) * [urlDecode : Decodes from percent-encoded string](#encodingurldecode-string) @@ -405,6 +406,30 @@ const sjisArray = Encoding.convert(unicodeArray, { console.log(sjisArray); // Converted to a code array of 'ホッケの漢字は𩸽' ``` +#### Ignoring characters when they cannot be represented + +By specifying `ignore` as a `fallback` option, characters that cannot be represented in the target encoding format can be ignored. + +Example of specifying `{ fallback: 'ignore' }` option: + +```javascript +const unicodeArray = Encoding.stringToCode("寿司🍣ビール🍺"); +// No fallback specified +let sjisArray = Encoding.convert(unicodeArray, { + to: "SJIS", + from: "UNICODE", +}); +console.log(sjisArray); // Converted to a code array of '寿司?ビール?' + +// Specify `fallback: ignore` +sjisArray = Encoding.convert(unicodeArray, { + to: "SJIS", + from: "UNICODE", + fallback: "ignore", +}); +console.log(sjisArray); // Converted to a code array of '寿司ビール' +``` + #### Specify BOM in UTF-16 You can add a BOM (byte order mark) by specifying the `bom` option when converting to `UTF16`. 
diff --git a/README_ja.md b/README_ja.md index e3cb4b5..7cae3f8 100644 --- a/README_ja.md +++ b/README_ja.md @@ -28,6 +28,7 @@ JavaScript で文字コードの変換や判定をします。 + [引数 `to` にオブジェクトで変換オプションを指定する](#引数-to-にオブジェクトで変換オプションを指定する) + [`type` オプションで戻り値の型を指定する](#type-オプションで戻り値の型を指定する) + [変換できない文字を HTML エンティティ (HTML 数値文字参照) に置き換える](#変換できない文字を-html-エンティティ-html-数値文字参照-に置き換える) + + [変換できない文字を無視する](#変換できない文字を無視する) + [UTF-16 に BOM をつける](#utf-16-に-bom-をつける) * [urlEncode : 文字コードの配列をURLエンコードする](#encodingurlencode-data) * [urlDecode : 文字コードの配列にURLデコードする](#encodingurldecode-string) @@ -395,6 +396,30 @@ const sjisArray = Encoding.convert(unicodeArray, { console.log(sjisArray); // 'ホッケの漢字は𩸽' の数値配列に変換されます ``` +#### 変換できない文字を無視する + +変換先の文字コードで表現できない文字を無視するには、 `fallback` オプションに `ignore` を指定します。 + +`{ fallback: 'ignore' }` オプションを指定する例: + +```javascript +const unicodeArray = Encoding.stringToCode('寿司🍣ビール🍺'); +// fallback指定なし +let sjisArray = Encoding.convert(unicodeArray, { + to: 'SJIS', + from: 'UNICODE' +}); +console.log(sjisArray); // '寿司?ビール?' の数値配列に変換されます + +// `fallback: ignore`を指定 +sjisArray = Encoding.convert(unicodeArray, { + to: 'SJIS', + from: 'UNICODE', + fallback: 'ignore' +}); +console.log(sjisArray); // '寿司ビール' の数値配列に変換されます +``` + #### UTF-16 に BOM をつける `UTF16` に変換する際に `bom` オプションを指定すると BOM (byte order mark) の付加を指定できます。