Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use iconv.decodeStream instead of iconv.decode; Fixes #374 #375

Merged
merged 6 commits into from
Apr 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 22 additions & 13 deletions lib/decoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,33 @@ function StreamDecoder(charset) {
// Transform step of StreamDecoder: on the first chunk it looks for a charset
// declaration, then forwards each chunk either untouched or decoded via iconv.
//
// NOTE(review): this span looks like a rendered diff in which the removed and
// the added lines are interleaved without +/- markers (the braces do not
// balance as shown). Read it as "old version vs. new version" of the same
// function rather than as one runnable body; confirm against the repository.
//
// Parameters as visible here:
//   chunk    - incoming data; the code calls chunk.toString() on it and also
//              hands it to iconv / this.push().
//   encoding - not referenced anywhere in the visible body.
//   done     - callback invoked once the chunk has been handed off.
StreamDecoder.prototype._transform = function(chunk, encoding, done) {
var res, found;

// try get charset from chunk, just once
if (this.charset == 'utf8' && !this.parsed_chunk) {
// try to get charset from chunk, just once
if (!this.parsed_chunk && (this.charset == 'utf8' || this.charset == 'utf-8')) {
// set the flag first so the sniffing below never runs for later chunks
this.parsed_chunk = true;

// NOTE(review): `regex` is defined outside this view; presumably it captures
// the charset name in group 1 — verify against its definition.
var matches = regex.exec(chunk.toString());
if (matches) {
found = matches[1].toLowerCase();
this.charset = found == 'utf-8' ? 'utf8' : found;
// look for charset
if (regex.test(chunk.toString())) {
// RegExp.$1 is the legacy static property holding capture group 1 of the
// most recent successful match, i.e. the regex.test() call just above.
var charset = (RegExp.$1).toLowerCase().replace('utf8','utf-8'); // canonicalize
// override if iconv can handle it
if (iconv.encodingExists(charset)) this.charset = charset;
}
}

// one-shot decode of the chunk on its own; on failure the raw chunk is
// forwarded instead
try {
res = iconv.decode(chunk, this.charset);
} catch(e) { // something went wrong, just return original chunk
res = chunk;
}

this.push(res);
// no need to decode utf-8, pass through
// (comma expression: push the chunk, then call done(), then return)
if (this.charset == 'utf-8') return this.push(chunk), done();

// initialize stream decoder if not present
// The decoder is created once and stored on the instance, so every later
// chunk is written to the same iconv stream.
const self = this;
if (!this.decoder) {
this.decoder = iconv.decodeStream(this.charset);
this.decoder.on("data", function(decoded_chunk){
// push decoded chunk
self.push(decoded_chunk);
});
};

// write chunk to decoder
// NOTE(review): nothing in this view ever calls this.decoder.end(); confirm
// that a flush step elsewhere ends the inner decoder when the stream finishes.
this.decoder.write(chunk);
done();
}

Expand Down
94 changes: 94 additions & 0 deletions test/decoder_spec.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
var should = require('should'),
needle = require('./../'),
decoder = require('./../lib/decoder'),
Q = require('q'),
chardet = require('jschardet'),
helpers = require('./helpers');
Expand Down Expand Up @@ -118,4 +119,97 @@ describe('character encoding', function() {
})

})

describe('multibyte characters split across chunks', function () {

describe('with encoding = utf-8', function() {

  // '慶' encoded as utf-8; each byte is written to the decoder separately
  var split_bytes = [0xE6, 0x85, 0xB6];

  var stream;
  var collected = Buffer.alloc(0);

  before(function() {
    stream = decoder('utf-8');
  });

  it('reassembles split multibyte characters', function(done) {

    // accumulate everything the decoder emits
    stream.on('data', function(piece) {
      collected = Buffer.concat([collected, piece]);
    });

    // once the stream ends, the accumulated output must be the whole character
    stream.on('end', function() {
      collected.toString('utf-8').should.eql('慶');
      done();
    });

    split_bytes.forEach(function(byte) {
      stream.write(Buffer.from([byte]));
    });
    stream.end();

  });
});

describe('with encoding = euc-jp', function() {

  // '慶' encoded as euc-jp; each byte is written to the decoder separately
  var split_bytes = [0xB7, 0xC4];

  var stream;
  var collected = Buffer.alloc(0);

  before(function() {
    stream = decoder('euc-jp');
  });

  it('reassembles split multibyte characters', function(done) {

    // accumulate everything the decoder emits
    stream.on('data', function(piece) {
      collected = Buffer.concat([collected, piece]);
    });

    // once the stream ends, the accumulated output must be the whole character
    stream.on('end', function() {
      collected.toString('utf-8').should.eql('慶');
      done();
    });

    split_bytes.forEach(function(byte) {
      stream.write(Buffer.from([byte]));
    });
    stream.end();

  });
});

describe('with encoding = gb18030', function() {

  // '慶' encoded as gb18030; each byte is written to the decoder separately
  var split_bytes = [0x91, 0x63];

  var stream;
  var collected = Buffer.alloc(0);

  before(function() {
    stream = decoder('gb18030');
  });

  it('reassembles split multibyte characters', function(done) {

    // accumulate everything the decoder emits
    stream.on('data', function(piece) {
      collected = Buffer.concat([collected, piece]);
    });

    // once the stream ends, the accumulated output must be the whole character
    stream.on('end', function() {
      collected.toString('utf-8').should.eql('慶');
      done();
    });

    split_bytes.forEach(function(byte) {
      stream.write(Buffer.from([byte]));
    });
    stream.end();

  });
});

})

})