Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(chunker): Fix chunk length calculation for unicode characters #726

Merged
merged 3 commits into from
Jan 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 45 additions & 17 deletions packages/ssr/src/utils/chunker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,62 @@ interface Chunk {
value: string;
}

/**
 * Build a global regular expression that splits a string into consecutive
 * pieces of at most `chunkSize` UTF-16 code units.
 *
 * The dotAll (`s`) flag is set so that line terminators (`\n`, `\r`, U+2028,
 * U+2029) are counted like any other character. Without it, `.` skips them and
 * `String.prototype.match` silently drops them from the resulting pieces.
 *
 * @param chunkSize - Maximum length of each piece; must be a positive integer.
 * @returns A `RegExp` with the `g` and `s` flags, suitable for `value.match(re)`.
 * @throws RangeError if `chunkSize` is not a positive integer. Such values
 * would otherwise produce a pattern that is invalid or is interpreted as
 * literal text rather than as a quantifier.
 */
function createChunkRegExp(chunkSize: number): RegExp {
  if (!Number.isInteger(chunkSize) || chunkSize < 1) {
    throw new RangeError(`chunkSize must be a positive integer, received ${chunkSize}`);
  }
  return new RegExp(`.{1,${chunkSize}}`, 'gs');
}

// Default upper bound on chunk length, used when createChunks is called without an explicit chunkSize.
const MAX_CHUNK_SIZE = 3180;
// Splitter regex for the default size, built once at module load so the default path does not rebuild it on every call.
const MAX_CHUNK_REGEXP = createChunkRegExp(MAX_CHUNK_SIZE);

/**
* Create chunks from a string and return them as an array of objects.
*/
export function createChunks(key: string, value: string, chunkSize?: number): Chunk[] {
const re = chunkSize !== undefined ? createChunkRegExp(chunkSize) : MAX_CHUNK_REGEXP;
// check the length of the string to work out if it should be returned or chunked
const chunkCount = Math.ceil(value.length / (chunkSize ?? MAX_CHUNK_SIZE));
const resolvedChunkSize = chunkSize ?? MAX_CHUNK_SIZE;

let encodedValue = encodeURIComponent(value);

if (chunkCount === 1) {
if (encodedValue.length <= resolvedChunkSize) {
return [{ name: key, value }];
}

const chunks: Chunk[] = [];
// split string into a array based on the regex
const values = value.match(re);
values?.forEach((value, i) => {
const name = `${key}.${i}`;
chunks.push({ name, value });
});
const chunks: string[] = [];

while (encodedValue.length > 0) {
let encodedChunkHead = encodedValue.slice(0, resolvedChunkSize);

const lastEscapePos = encodedChunkHead.lastIndexOf('%');

// Check if the last escaped character is truncated.
if (lastEscapePos > resolvedChunkSize - 3) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to clarify: we subtract 3 here because an escaped character takes up 3 characters in the string (a % followed by 2 hexadecimal characters), right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's exactly it.

// If so, reslice the string to exclude the whole escape sequence.
// We only reduce the size of the string as the chunk must
// be smaller than the chunk size.
encodedChunkHead = encodedChunkHead.slice(0, lastEscapePos);
}

let valueHead: string = '';

// Check if the chunk was split along a valid unicode boundary.
while (encodedChunkHead.length > 0) {
try {
// Try to decode the chunk back and see if it is valid.
// Stop when the chunk is valid.
valueHead = decodeURIComponent(encodedChunkHead);
break;
} catch (error) {
if (
error instanceof URIError &&
encodedChunkHead.at(-3) === '%' &&
encodedChunkHead.length > 3
) {
encodedChunkHead = encodedChunkHead.slice(0, encodedChunkHead.length - 3);
} else {
throw error;
}
}
}

chunks.push(valueHead);
encodedValue = encodedValue.slice(encodedChunkHead.length);
}

return chunks;
return chunks.map((value, i) => ({ name: `${key}.${i}`, value }));
}

// Get fully constructed chunks
Expand Down
79 changes: 79 additions & 0 deletions packages/ssr/tests/chunker.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,83 @@ describe('chunker', () => {
expect(len(`${key}=${DOUBLE_CHUNK_STRING}`)).toBe(7257);
expect(combined).toBe(DOUBLE_CHUNK_STRING);
});

it('should correctly break between unicode boundaries in escaped characters', () => {
const test = ' ';
const chunks = createChunks('key', test, 4);
expect(chunks).toEqual([
{
name: 'key.0',
value: ' '
},
{
name: 'key.1',
value: ' '
},
{
name: 'key.2',
value: ' '
}
]);

expect(chunks.map((char) => char.value).join('')).toEqual(test);
});

describe('should correctly break between unicode boundaries in long unicode', () => {
it('should correctly break between unicode boundaries in long unicode (4 bytes)', () => {
const test = '🤦🏻‍♂️';
const chunksAtStartBorder = createChunks('key', test, 12);
const chunksAtEndBorder = createChunks('key', test, 17);
expect(chunksAtStartBorder).toEqual(chunksAtEndBorder);
expect(chunksAtStartBorder).toEqual([
{
name: 'key.0',
value: '🤦'
},
{
name: 'key.1',
value: '🏻'
},
{
name: 'key.2',
value: '‍'
},
{
name: 'key.3',
value: '♂'
},
{
name: 'key.4',
value: '️'
}
]);
expect(chunksAtStartBorder.map((char) => char.value).join('')).toEqual(test);
});

it('should correctly break between unicode boundaries in long unicode (5 bytes)', () => {
const test = '🤦🏻‍♂️';
const chunksAtStartBorder = createChunks('key', test, 18);
const chunksAtEndBorder = createChunks('key', test, 20);
expect(chunksAtStartBorder).toEqual(chunksAtEndBorder);
expect(chunksAtStartBorder).toEqual([
{
name: 'key.0',
value: '🤦'
},
{
name: 'key.1',
value: '🏻'
},
{
name: 'key.2',
value: '‍♂'
},
{
name: 'key.3',
value: '️'
}
]);
expect(chunksAtStartBorder.map((char) => char.value).join('')).toEqual(test);
});
});
});