Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

utf8 boundary patch #14

Merged
merged 2 commits into from
Mar 13, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 27 additions & 5 deletions jsonparse.js
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ function Parser() {
this.mode = undefined;
this.stack = [];
this.state = VALUE;
this.bytes_remaining = 0; // number of bytes remaining in multi byte utf8 char to read after split boundary
this.bytes_in_sequence = 0; // bytes in multi byte utf8 char to read
this.temp_buffs = { "2": new Buffer(2), "3": new Buffer(3), "4": new Buffer(4) }; // for rebuilding chars split before boundary is reached
}
var proto = Parser.prototype;
proto.charError = function (buffer, i) {
Expand Down Expand Up @@ -109,11 +112,30 @@ proto.write = function (buffer) {
} else { this.charError(buffer, i); }
}
}else if (this.tState === STRING1){ // After open quote
n = buffer[i];
if (n >= 128) {
for (var j = i; buffer[j] >= 128 && j < buffer.length; j++);
this.string += buffer.slice(i, j).toString();
i = j - 1;
n = buffer[i]; // get current byte from buffer
// check for carry over of a multi byte char split between data chunks
// & fill temp buffer it with start of this data chunk up to the boundary limit set in the last iteration
if (this.bytes_remaining > 0) {
for (var j = 0; j < this.bytes_remaining; j++) {
this.temp_buffs[this.bytes_in_sequence][this.bytes_in_sequence - this.bytes_remaining + j] = buffer[j];
}
this.string += this.temp_buffs[this.bytes_in_sequence].toString();
this.bytes_in_sequence = this.bytes_remaining = 0;
i = i + j - 1;
} else if (this.bytes_remaining === 0 && n >= 128) { // else if no remainder bytes carried over, parse multi byte (>=128) chars one at a time
if ((n >= 194) && (n <= 223)) this.bytes_in_sequence = 2;
if ((n >= 224) && (n <= 239)) this.bytes_in_sequence = 3;
if ((n >= 240) && (n <= 244)) this.bytes_in_sequence = 4;
if ((this.bytes_in_sequence + i) > buffer.length) { // if bytes needed to complete char fall outside buffer length, we have a boundary split
for (var k = 0; k <= (buffer.length - 1 - i); k++) {
this.temp_buffs[this.bytes_in_sequence][k] = buffer[i + k]; // fill temp buffer of correct size with bytes available in this chunk
}
this.bytes_remaining = (i + this.bytes_in_sequence) - buffer.length;
i = buffer.length - 1;
} else {
this.string += buffer.slice(i, (i + this.bytes_in_sequence)).toString();
i = i + this.bytes_in_sequence - 1;
}
} else if (n === 0x22) { this.tState = START; this.onToken(STRING, this.string); this.string = undefined; }
else if (n === 0x5c) { this.tState = STRING2; }
else if (n >= 0x20) { this.string += String.fromCharCode(n); }
Expand Down
110 changes: 110 additions & 0 deletions test/boundary.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
var test = require('tape');
var Parser = require('../');

// A whole 2-byte UTF-8 sequence arriving in a single chunk, framed by
// separately written quote characters, should surface as the string 'д'.
test('2 byte utf8 \'De\' character: д', function (t) {
  t.plan(1);

  var parser = new Parser();
  var deBytes = new Buffer([0xd0, 0xb4]); // bytes the test name identifies as д

  parser.onValue = function (parsed) {
    t.equal(parsed, 'д');
  };

  // opening quote, character bytes, closing quote: one write per chunk
  ['"', deBytes, '"'].forEach(function (chunk) {
    parser.write(chunk);
  });
});

// A whole 3-byte UTF-8 sequence arriving in a single chunk between the two
// quote writes should surface as the string '我'.
test('3 byte utf8 \'Han\' character: 我', function (t) {
  t.plan(1);

  var parser = new Parser();
  var hanBytes = new Buffer([0xe6, 0x88, 0x91]); // bytes the test name identifies as 我

  parser.onValue = function (parsed) {
    t.equal(parsed, '我');
  };

  // feed the JSON string literal piecewise: quote, payload, quote
  var chunks = ['"', hanBytes, '"'];
  for (var idx = 0; idx < chunks.length; idx++) {
    parser.write(chunks[idx]);
  }
});

// A whole 4-byte UTF-8 sequence arriving in a single chunk between the two
// quote writes should surface as the string '𠜎'.
test('4 byte utf8 character (unicode scalar U+2070E): 𠜎', function (t) {
  t.plan(1);

  var parser = new Parser();
  var scalarBytes = new Buffer([0xf0, 0xa0, 0x9c, 0x8e]); // bytes the test name identifies as U+2070E

  function onParsed(parsed) {
    t.equal(parsed, '𠜎');
  }
  parser.onValue = onParsed;

  // quote, payload, quote: each delivered as its own write
  ['"', scalarBytes, '"'].forEach(function (chunk) {
    parser.write(chunk);
  });
});

// The 3-byte sequence for '我' is delivered in two writes, cut after its second
// byte, so the parser has to carry the partial character across the chunk
// boundary and still emit the complete string.
test('3 byte utf8 \'Han\' character chunked inbetween 2nd and 3rd byte: 我', function (t) {
  t.plan(1);

  var parser = new Parser();
  var leadingBytes = new Buffer([0xe6, 0x88]); // first two bytes of the character
  var trailingByte = new Buffer([0x91]);       // final byte, arriving in the next chunk

  parser.onValue = function (parsed) {
    t.equal(parsed, '我');
  };

  // quote, first part, second part, quote: one write each, in order
  ['"', leadingBytes, trailingByte, '"'].forEach(function (chunk) {
    parser.write(chunk);
  });
});

// The 4-byte sequence for '𠜎' is delivered in two writes of two bytes each, so
// the parser has to carry the partial character across the chunk boundary and
// still emit the complete string.
test('4 byte utf8 character (unicode scalar U+2070E) chunked inbetween 2nd and 3rd byte: 𠜎', function (t) {
  t.plan(1);

  var parser = new Parser();
  var leadingBytes = new Buffer([0xf0, 0xa0]);  // first half of the character
  var trailingBytes = new Buffer([0x9c, 0x8e]); // second half, arriving in the next chunk

  parser.onValue = function (parsed) {
    t.equal(parsed, '𠜎');
  };

  // quote, first half, second half, quote: one write each, in order
  var chunks = ['"', leadingBytes, trailingBytes, '"'];
  for (var idx = 0; idx < chunks.length; idx++) {
    parser.write(chunks[idx]);
  }
});

// Feeds a string mixing 1-, 2-, 3- and 4-byte UTF-8 characters to the parser in
// two chunks, once for EVERY split offset in [0, length). The earlier version
// picked a single offset with Math.random(), so a given run exercised only one
// boundary and a failure could not be reproduced; iterating all offsets makes
// the test deterministic and covers every mid-character split on each run.
test('1-4 byte utf8 character string chunked at every byte offset: Aж文𠜱B', function (t) {
  var eclectic_buffer = new Buffer([0x41, // A
                                    0xd0, 0xb6, // ж
                                    0xe6, 0x96, 0x87, // 文
                                    0xf0, 0xa0, 0x9c, 0xb1, // 𠜱
                                    0x42]); // B

  // One assertion is expected per split offset.
  t.plan(eclectic_buffer.length);

  // Parses the quoted string with the payload cut into [0, split) and
  // [split, end), using a fresh parser so no state leaks between offsets.
  function checkSplit(split) {
    var p = new Parser();
    p.onValue = function (value) {
      // The offset goes in the assertion message so a failing boundary is identifiable.
      t.equal(value, 'Aж文𠜱B', 'chunked after offset ' + split);
    };

    p.write('"');
    p.write(eclectic_buffer.slice(0, split)); // empty when split is 0
    p.write(eclectic_buffer.slice(split));
    p.write('"');
  }

  for (var offset = 0; offset < eclectic_buffer.length; offset++) {
    checkSplit(offset);
  }
});