creationix · creationix · Mar 13, 2013 · Mar 6, 2013 · Mar 7, 2013
diff --git a/jsonparse.js b/jsonparse.js
@@ -75,6 +75,9 @@ function Parser() {
   this.mode = undefined;
   this.stack = [];
   this.state = VALUE;
+  this.bytes_remaining = 0; // bytes still missing from a multi byte utf8 char that was cut off at the end of the previous chunk
+  this.bytes_in_sequence = 0; // total length in bytes (2, 3 or 4) of the multi byte utf8 char currently being read
+  this.temp_buffs = { "2": new Buffer(2), "3": new Buffer(3), "4": new Buffer(4) }; // scratch buffers, keyed by sequence length, used to reassemble a char split across chunks
 }
 var proto = Parser.prototype;
 proto.charError = function (buffer, i) {
@@ -109,11 +112,38 @@ proto.write = function (buffer) {
         } else { this.charError(buffer, i); }
       }
     }else if (this.tState === STRING1){ // After open quote
-      n = buffer[i];
-      if (n >= 128) {
-        for (var j = i; buffer[j] >= 128 && j < buffer.length; j++);
-        this.string += buffer.slice(i, j).toString();
-        i = j - 1;
+      n = buffer[i]; // get current byte from buffer
+      // A multi byte char was cut off by an earlier chunk boundary: keep copying its
+      // continuation bytes into the temp buffer. This chunk may itself end before the
+      // char is complete, so never read past buffer.length and only decode once it is whole.
+      if (this.bytes_remaining > 0) {
+        for (var j = 0; j < this.bytes_remaining && (i + j) < buffer.length; j++) {
+          this.temp_buffs[this.bytes_in_sequence][this.bytes_in_sequence - this.bytes_remaining + j] = buffer[i + j];
+        }
+        this.bytes_remaining -= j;
+        if (this.bytes_remaining === 0) { // every byte of the char has arrived, decode it
+          this.string += this.temp_buffs[this.bytes_in_sequence].toString();
+          this.bytes_in_sequence = 0;
+        }
+        i = i + j - 1; // j >= 1 here, so the outer loop always moves forward
+      } else if (n >= 128) { // no carry over pending: parse multi byte (>=128) chars one at a time
+        this.bytes_in_sequence = 0; // forget the length of the previous char before classifying this lead byte
+        if ((n >= 194) && (n <= 223)) this.bytes_in_sequence = 2;
+        if ((n >= 224) && (n <= 239)) this.bytes_in_sequence = 3;
+        if ((n >= 240) && (n <= 244)) this.bytes_in_sequence = 4;
+        if (this.bytes_in_sequence === 0) {
+          // 128-193 and 245-255 never start a utf8 sequence; report the byte instead of slicing zero bytes and stepping i back forever
+          this.charError(buffer, i);
+        } else if ((this.bytes_in_sequence + i) > buffer.length) { // if bytes needed to complete char fall outside buffer length, we have a boundary split
+          for (var k = 0; k <= (buffer.length - 1 - i); k++) {
+            this.temp_buffs[this.bytes_in_sequence][k] = buffer[i + k]; // fill temp buffer of correct size with bytes available in this chunk
+          }
+          this.bytes_remaining = (i + this.bytes_in_sequence) - buffer.length;
+          i = buffer.length - 1;
+        } else {
+          this.string += buffer.slice(i, (i + this.bytes_in_sequence)).toString();
+          i = i + this.bytes_in_sequence - 1;
+        }
       } else if (n === 0x22) { this.tState = START; this.onToken(STRING, this.string); this.string = undefined; }
       else if (n === 0x5c) { this.tState = STRING2; }
       else if (n >= 0x20) { this.string += String.fromCharCode(n); }

diff --git a/test/boundary.js b/test/boundary.js
@@ -0,0 +1,110 @@
+var test = require('tape');
+var Parser = require('../');
+
+test('2 byte utf8 \'De\' character: д', function (assert) {
+  // The two byte character arrives whole in a single chunk.
+  var deBytes = new Buffer([0xd0, 0xb4]);
+  var parser = new Parser();
+
+  assert.plan(1);
+  parser.onValue = function (parsed) {
+    assert.equal(parsed, 'д');
+  };
+
+  // The quotes are written separately so the middle chunk holds only the character bytes.
+  parser.write('"');
+  parser.write(deBytes);
+  parser.write('"');
+});
+
+test('3 byte utf8 \'Han\' character: 我', function (assert) {
+  // The three byte character arrives whole in a single chunk.
+  var hanBytes = new Buffer([0xe6, 0x88, 0x91]);
+  var parser = new Parser();
+  assert.plan(1);
+  parser.onValue = function (parsed) {
+    assert.equal(parsed, '我');
+  };
+
+  ['"', hanBytes, '"'].forEach(function (chunk) {
+    parser.write(chunk);
+  });
+});
+
+test('4 byte utf8 character (unicode scalar U+2070E): 𠜎', function (assert) {
+  // All four bytes of the character arrive in one chunk, between the two quote chunks.
+  var chunks = ['"', new Buffer([0xf0, 0xa0, 0x9c, 0x8e]), '"'];
+  var parser = new Parser();
+  assert.plan(1);
+  parser.onValue = function (parsed) {
+    assert.equal(parsed, '𠜎');
+  };
+
+  for (var idx = 0; idx < chunks.length; idx++) {
+    parser.write(chunks[idx]);
+  }
+});
+
+test('3 byte utf8 \'Han\' character chunked inbetween 2nd and 3rd byte: 我', function (assert) {
+  // The character is delivered as two chunks: its first two bytes, then its last byte.
+  var leadingBytes = new Buffer([0xe6, 0x88]);
+  var trailingByte = new Buffer([0x91]);
+  var parser = new Parser();
+  assert.plan(1);
+  parser.onValue = function (parsed) {
+    assert.equal(parsed, '我');
+  };
+
+  parser.write('"');
+  parser.write(leadingBytes);
+  parser.write(trailingByte);
+  parser.write('"');
+});
+
+test('4 byte utf8 character (unicode scalar U+2070E) chunked inbetween 2nd and 3rd byte: 𠜎', function (assert) {
+  // The character is delivered as two chunks of two bytes each.
+  var leadingBytes = new Buffer([0xf0, 0xa0]);
+  var trailingBytes = new Buffer([0x9c, 0x8e]);
+  var parser = new Parser();
+  assert.plan(1);
+  parser.onValue = function (parsed) {
+    assert.equal(parsed, '𠜎');
+  };
+
+  parser.write('"');
+  parser.write(leadingBytes);
+  parser.write(trailingBytes);
+  parser.write('"');
+});
+
+test('1-4 byte utf8 character string chunked inbetween every byte offset: Aж文𠜱B', function (t) {
+  var eclectic_buffer = new Buffer([0x41, // A
+                                    0xd0, 0xb6, // ж
+                                    0xe6, 0x96, 0x87, // 文
+                                    0xf0, 0xa0, 0x9c, 0xb1, // 𠜱
+                                    0x42]); // B
+
+  // One assertion per split offset.
+  t.plan(eclectic_buffer.length);
+
+  // Split at every offset rather than at a single Math.random() one, so each
+  // run covers the same boundaries and any failure is reproducible.
+  // Offset 0 leaves the first chunk empty; every other offset cuts the
+  // string either between characters or inside a multi byte sequence.
+  for (var offset = 0; offset < eclectic_buffer.length; offset++) {
+    checkSplitAt(offset);
+  }
+
+  // Parses the quoted string as two chunks cut at `offset`, using a fresh
+  // parser so no carry over state leaks from one offset to the next.
+  function checkSplitAt(offset) {
+    var p = new Parser();
+    p.onValue = function (value) {
+      t.equal(value, 'Aж文𠜱B', 'chunked after offset ' + offset);
+    };
+    p.write('"');
+    p.write(eclectic_buffer.slice(0, offset));
+    p.write(eclectic_buffer.slice(offset));
+    p.write('"');
+  }
+});