// Transform hook: decodes incoming chunks from this.charset to UTF-8 output.
// The charset is sniffed once from the first chunk (a <meta charset=...>-style
// match via `regex`); decoding is delegated to a single persistent iconv
// decode stream so that multibyte sequences split across chunk boundaries
// are reassembled correctly instead of being decoded chunk-by-chunk.
//
// @param {Buffer} chunk    - raw bytes from the upstream response
// @param {string} encoding - ignored (chunks are Buffers)
// @param {Function} done   - transform callback
StreamDecoder.prototype._transform = function(chunk, encoding, done) {

  // try to get charset from chunk, just once
  if (!this.parsed_chunk && (this.charset == 'utf8' || this.charset == 'utf-8')) {
    this.parsed_chunk = true;

    // use exec() and the match array rather than the deprecated RegExp.$1
    // static; this is also safe should `regex` ever carry the /g flag.
    var matches = regex.exec(chunk.toString());
    if (matches) {
      var found = matches[1].toLowerCase().replace('utf8', 'utf-8'); // canonicalize
      // override only if iconv can handle it
      if (iconv.encodingExists(found)) this.charset = found;
    }
  }

  // no need to decode utf-8, pass the chunk through untouched.
  // Accept both spellings: the sniffing branch above treats 'utf8' and
  // 'utf-8' alike, so 'utf8' must not fall through to an iconv round-trip.
  if (this.charset == 'utf-8' || this.charset == 'utf8') {
    this.push(chunk);
    return done();
  }

  // lazily initialize the stateful decode stream; it buffers incomplete
  // multibyte sequences between _transform() calls.
  var self = this;
  if (!this.decoder) {
    this.decoder = iconv.decodeStream(this.charset);
    this.decoder.on('data', function(decoded_chunk) {
      // forward each decoded piece downstream
      self.push(decoded_chunk);
    });
    // NOTE(review): bytes buffered inside the decode stream are only emitted
    // after this.decoder.end() — confirm _flush() calls it, or trailing
    // multibyte characters will be silently dropped at end-of-stream.
  }

  // write chunk to decoder
  this.decoder.write(chunk);
  done();
}
describe('multibyte characters split across chunks', function () {

  // Each fixture writes the single character '慶' (U+6176) one byte per
  // chunk, guaranteeing every multibyte sequence is split across chunks.
  var fixtures = [
    { encoding: 'utf-8',   bytes: [0xE6, 0x85, 0xB6] },
    { encoding: 'euc-jp',  bytes: [0xB7, 0xC4] },
    { encoding: 'gb18030', bytes: [0x91, 0x63] }
  ];

  fixtures.forEach(function (fixture) {

    describe('with encoding = ' + fixture.encoding, function () {

      var d,
          result = Buffer.allocUnsafe(0);

      before(function (done) {
        d = decoder(fixture.encoding);
        done();
      });

      it('reassembles split multibyte characters', function (done) {

        d.on('data', function (chunk) {
          result = Buffer.concat([result, chunk]);
        });

        d.on('end', function () {
          result.toString('utf-8').should.eql('慶');
          done();
        });

        // write '慶' split across chunks, one byte at a time
        fixture.bytes.forEach(function (byte) {
          d.write(Buffer.from([byte]));
        });
        d.end();

      });
    });
  });

});