From ec88ee18d4d3e503613faf9183d920e44c0301bd Mon Sep 17 00:00:00 2001 From: Pierre Date: Fri, 6 Sep 2024 19:39:46 -0300 Subject: [PATCH 1/5] fix: message parser being slow to process very long messages with too many symbols --- .changeset/short-drinks-itch.md | 6 ++ .../message-parser/loaders/pegtransform.js | 2 + packages/message-parser/src/grammar.pegjs | 71 +++++-------------- packages/message-parser/src/index.ts | 18 ++--- packages/message-parser/src/tracer.ts | 53 ++++++++++++++ packages/message-parser/src/utils.ts | 1 + packages/message-parser/tests/abuse.test.ts | 69 ++++++++++++++++++ .../message-parser/tests/emphasis.test.ts | 30 ++++++++ packages/message-parser/tests/link.test.ts | 24 +++++++ packages/peggy-loader/src/index.ts | 2 + 10 files changed, 210 insertions(+), 66 deletions(-) create mode 100644 .changeset/short-drinks-itch.md create mode 100644 packages/message-parser/src/tracer.ts create mode 100644 packages/message-parser/tests/abuse.test.ts diff --git a/.changeset/short-drinks-itch.md b/.changeset/short-drinks-itch.md new file mode 100644 index 000000000000..ee57330ffc86 --- /dev/null +++ b/.changeset/short-drinks-itch.md @@ -0,0 +1,6 @@ +--- +'@rocket.chat/message-parser': patch +'@rocket.chat/peggy-loader': patch +--- + +Improved the performance of the message parser diff --git a/packages/message-parser/loaders/pegtransform.js b/packages/message-parser/loaders/pegtransform.js index 9df49c61dfea..c4f430cffb95 100644 --- a/packages/message-parser/loaders/pegtransform.js +++ b/packages/message-parser/loaders/pegtransform.js @@ -5,6 +5,8 @@ module.exports = { code: pegjs.generate(content, { output: 'source', format: 'commonjs', + // We rely on a tracer to prevent recurrence on rules like Bold, Italic, Strikethrough and References + trace: true, }), }), }; diff --git a/packages/message-parser/src/grammar.pegjs b/packages/message-parser/src/grammar.pegjs index 182653a9c664..4a79ca75eefd 100644 --- a/packages/message-parser/src/grammar.pegjs +++ b/packages/message-parser/src/grammar.pegjs @@ -212,7 +212,7 @@ Inline = value:(InlineItem / Any)+ EndOfLine? { return reducePlainTexts(value); InlineItem = Whitespace / Timestamp - / References + / MaybeReferences / AutolinkedPhone / AutolinkedEmail / AutolinkedURL @@ -240,7 +240,7 @@ References = "[" title:LinkTitle* "](" href:LinkRef ")" { return title.length ? link(href, reducePlainTexts(title)) : link(href); } / "<" href:LinkRef "|" title:LinkTitle2 ">" { return link(href, [plain(title)]); } -LinkTitle = (Whitespace / EmphasisForReferences) / anyTitle:$(!("](" .) .) { return plain(anyTitle) } +LinkTitle = (Whitespace / Emphasis) / anyTitle:$(!("](" .) .) { return plain(anyTitle) } LinkTitle2 = $([\x20-\x3B\x3D\x3F-\x60\x61-\x7B\x7D-\xFF] / NonASCII)+ @@ -349,14 +349,7 @@ AutoLinkURLBody = !(Extra* (Whitespace / EndOfLine)) . * Emphasis * */ -Emphasis = Bold / Italic / Strikethrough - -/** - * - * Emphasis for References - * -*/ -EmphasisForReferences = BoldForReferences / ItalicForReferences / StrikethroughForReferences +Emphasis = MaybeBold / MaybeItalic / MaybeStrikethrough /** * @@ -365,6 +358,14 @@ EmphasisForReferences = BoldForReferences / ItalicForReferences / StrikethroughF * */ +MaybeBold = (! { return options.ruleStack.includes('Bold'); }) text:Bold { return text; } + +MaybeStrikethrough = (! { return options.ruleStack.includes('Strikethrough'); }) text:Strikethrough { return text; } + +MaybeItalic = (! { return options.ruleStack.includes('Italic'); }) text:Italic { return text; } + +MaybeReferences = (! { return options.ruleStack.includes('References'); }) text:References { return text; } + /* Italic */ Italic = value:$([a-zA-Z0-9]+ [\x5F] [\x5F]?) { return plain(value); } @@ -384,11 +385,11 @@ ItalicContentItems = text:ItalicContentItem+ { return reducePlainTexts(text); } ItalicContentItem = Whitespace / InlineCode - / References + / MaybeReferences / UserMention / ChannelMention - / Bold - / Strikethrough + / MaybeBold + / MaybeStrikethrough / Emoji / Emoticon / AnyItalic @@ -399,52 +400,12 @@ Bold = [\x2A] [\x2A] @BoldContent [\x2A] [\x2A] / [\x2A] @BoldContent [\x2A] BoldContent = text:BoldContentItem+ { return bold(reducePlainTexts(text)); } -BoldContentItem = Whitespace / InlineCode / References / UserMention / ChannelMention / Italic / Strikethrough / Emoji / Emoticon / AnyBold / Line +BoldContentItem = Whitespace / InlineCode / MaybeReferences / UserMention / ChannelMention / MaybeItalic / MaybeStrikethrough / Emoji / Emoticon / AnyBold / Line /* Strike */ Strikethrough = [\x7E] [\x7E] @StrikethroughContent [\x7E] [\x7E] / [\x7E] @StrikethroughContent [\x7E] -StrikethroughContent = text:(Timestamp / InlineCode / Whitespace / References / UserMention / ChannelMention / Italic / Bold / Emoji / Emoticon / AnyStrike / Line)+ { - return strike(reducePlainTexts(text)); - } - -/* Italic for References */ -ItalicForReferences - = value:$([a-zA-Z0-9]+ [\x5F] [\x5F]?) { return plain(value); } - / [\x5F] [\x5F] i:ItalicContentItemsForReferences [\x5F] [\x5F] t:$[a-zA-Z0-9]+ { - return reducePlainTexts([plain('__'), ...i, plain('__'), plain(t)])[0]; - } - / [\x5F] i:ItalicContentItemsForReferences [\x5F] t:$[a-zA-Z]+ { - return reducePlainTexts([plain('_'), ...i, plain('_'), plain(t)])[0]; - } - / [\x5F] [\x5F] @ItalicContentForReferences [\x5F] [\x5F] - / [\x5F] @ItalicContentForReferences [\x5F] - -ItalicContentForReferences = text:ItalicContentItemsForReferences { return italic(text); } - -ItalicContentItemsForReferences = text:ItalicContentItemForReferences+ { return reducePlainTexts(text); } - -ItalicContentItemForReferences - = Whitespace - / UserMention - / ChannelMention - / BoldForReferences - / StrikethroughForReferences - / Emoji - / Emoticon - / AnyItalic - / Line - / InlineCode - -/* Bold for References */ -BoldForReferences = [\x2A] [\x2A] @BoldContentForReferences [\x2A] [\x2A] / [\x2A] @BoldContentForReferences [\x2A] - -BoldContentForReferences = text:(Whitespace / UserMention / ChannelMention / ItalicForReferences / StrikethroughForReferences / Emoji / Emoticon / AnyBold / Line / InlineCode)+ { return bold(reducePlainTexts(text)); } - -/* Strike for References */ -StrikethroughForReferences = [\x7E] [\x7E] @StrikethroughContentForReferences [\x7E] [\x7E] / [\x7E] @StrikethroughContentForReferences [\x7E] - -StrikethroughContentForReferences = text:(Whitespace / UserMention / ChannelMention / ItalicForReferences / BoldForReferences / Emoji / Emoticon / AnyStrike / Line / InlineCode)+ { +StrikethroughContent = text:(Timestamp / Whitespace / InlineCode / MaybeReferences / UserMention / ChannelMention / MaybeItalic / MaybeBold / Emoji / Emoticon / AnyStrike / Line)+ { return strike(reducePlainTexts(text)); } diff --git a/packages/message-parser/src/index.ts b/packages/message-parser/src/index.ts index cbdae1382cb5..42eddb2d35b5 100644 --- a/packages/message-parser/src/index.ts +++ b/packages/message-parser/src/index.ts @@ -1,26 +1,22 @@ import type { Root } from './definitions'; import * as grammar from './grammar.pegjs'; +import { type Options, Tracer } from './tracer'; export * from './definitions'; export { isNodeOfType } from './guards'; -export type Options = { - colors?: boolean; - emoticons?: boolean; - katex?: { - dollarSyntax?: boolean; - parenthesisSyntax?: boolean; - }; - customDomains?: string[]; -}; +export const parse = (input: string, options?: Options): Root => { + const parserOptions = Tracer.createParserOptions(options); -export const parse = (input: string, options?: Options): Root => - grammar.parse(input, options); + const result = grammar.parse(input, parserOptions); + return result; +}; export { /** @deprecated */ parse as parser, /** @deprecated */ Root as MarkdownAST, + Options, }; diff --git a/packages/message-parser/src/tracer.ts b/packages/message-parser/src/tracer.ts new file mode 100644 index 000000000000..edbe709b1d68 --- /dev/null +++ b/packages/message-parser/src/tracer.ts @@ -0,0 +1,53 @@ +import type { ParserTracer, ParserOptions } from 'peggy'; + +export type Options = { + colors?: boolean; + emoticons?: boolean; + katex?: { + dollarSyntax?: boolean; + parenthesisSyntax?: boolean; + }; + customDomains?: string[]; +}; + +type TracedOptions = ParserOptions & { ruleStack: string[] }; + +export class Tracer implements ParserTracer { + constructor(private options: TracedOptions) { + // + } + + trace(event: any): void { + if (event.type === 'rule.enter') { + this.options.ruleStack.push(event.rule); + return; + } + + while (this.options.ruleStack.length > 0) { + const lastRule = + this.options.ruleStack[this.options.ruleStack.length - 1]; + + if ( + lastRule !== event.rule && + !this.options.ruleStack.includes(event.rule) + ) { + break; + } + this.options.ruleStack.pop(); + + if (lastRule === event.rule) { + break; + } + } + } + + public static createParserOptions(options?: Options): ParserOptions { + const parserOptions: TracedOptions = { + ...options, + ruleStack: [], + }; + + parserOptions.tracer = new Tracer(parserOptions); + return parserOptions; + } +} diff --git a/packages/message-parser/src/utils.ts b/packages/message-parser/src/utils.ts index 1f684b56d6ed..a48be3cfd182 100644 --- a/packages/message-parser/src/utils.ts +++ b/packages/message-parser/src/utils.ts @@ -200,6 +200,7 @@ export const reducePlainTexts = ( ): Paragraph['value'] => values .flatMap((item) => item) + .filter((item) => item) .reduce((result, item, index, values) => { const next = values[index + 1]; const current = joinEmoji(item, values[index - 1], next); diff --git a/packages/message-parser/tests/abuse.test.ts b/packages/message-parser/tests/abuse.test.ts new file mode 100644 index 000000000000..61cabef8ad73 --- /dev/null +++ b/packages/message-parser/tests/abuse.test.ts @@ -0,0 +1,69 @@ +import { parse } from '../src'; +import { paragraph, plain, bold, italic, strike } from '../src/utils'; + +test.each([ + [ + `This a message designed to stress test the message parser, trying to force several rules to stack at the same time !!@#$%^&*()_+, overloading the symbols {}:"|<>?, some more text ,./;'\\[], numbers too 1234567890-= let it call s o s ok~, from now on we repeat some. , REPEATx2 This a message designed to stress test the message parser, trying to force several rules to stack at the same time !!@#$%^&*()_+, overloading the symbols {}:"|<>?, some more text ,./;'\\[], numbers too 1234567890-= let it call s o s ok~, from now on we repeat some. REPEAT x3 This a message designed to stress test the message parser, trying to force several rules to stack at the same time !!@#$%^&*()_+, overloading the symbols {}:"|<>?, some more text ,./;'\\[], numbers too 1234567890-= let it call s o s ok~, from now on we repeat some. REPEAT x4 This a message designed to stress test the message parser, trying to force several rules to stack at the same time !!@#$%^&*()_+, overloading the symbols {}:"|<>?, some more text ,./;'\\[], numbers too 1234567890-= let it call s o s ok~, from now on we repeat some. REPEATx 5 This a message designed to stress test the message parser, trying to force several rules to stack at the same time !!@#$%^&*()_+, overloading the symbols {}:"|<>?, some more text ,./;'\\[], numbers too 1234567890-= let it call s o s ok~, from now on we repeat some. , REPEAT x6 This a message designed to stress test the message parser, trying to force several rules to stack at the same time !!@#$%^&*()_+, overloading the symbols {}:"|<>?, some more text ,./;'\\[], numbers too 1234567890-= let it call s o s ok~, from now on we repeat some. this can go long for some time, repeat x7 This a message designed to stress test the message parser, trying to force several rules to stack at the same time !!@#$%^&*()_+, overloading the symbols {}:"|<>?, some more text ,./;'\\[], numbers too 1234567890-= let it call s o s ok~, from now on we repeat some. ,repeat x8 This a message designed to stress test the message parser, trying to force several rules to stack at the same time !!@#$%^&*()_+, overloading the symbols {}:"|<>?, some more text ,./;'\\[], numbers too 1234567890-= let it call s o s ok~, from now on we repeat some.`, + [ + paragraph([ + plain( + 'This a message designed to stress test the message parser, trying to force several rules to stack at the same time !!@#$%^&' + ), + bold([ + plain('()'), + italic([ + plain( + `+, overloading the symbols {}:"|<>?, some more text ,./;'\\[], numbers too 1234567890-= let it call s o s ok` + ), + strike([ + plain( + `, from now on we repeat some. , REPEATx2 This a message designed to stress test the message parser, trying to force several rules to stack at the same time !!@#$%^&*()_+, overloading the symbols {}:"|<>?, some more text ,./;'\\[], numbers too 1234567890-= let it call s o s ok` + ), + ]), + plain( + ', from now on we repeat some. REPEAT x3 This a message designed to stress test the message parser, trying to force several rules to stack at the same time !!@#$%^&*()' + ), + ]), + plain( + `+, overloading the symbols {}:"|<>?, some more text ,./;'\\[], numbers too 1234567890-= let it call s o s ok` + ), + strike([ + plain( + ', from now on we repeat some. REPEAT x4 This a message designed to stress test the message parser, trying to force several rules to stack at the same time !!@#$%^&*()' + ), + italic([ + plain( + `+, overloading the symbols {}:"|<>?, some more text ,./;'\\[], numbers too 1234567890-= let it call s o s ok~, from now on we repeat some. REPEATx 5 This a message designed to stress test the message parser, trying to force several rules to stack at the same time !!@#$%^&*()` + ), + ]), + plain( + `+, overloading the symbols {}:"|<>?, some more text ,./;'\\[], numbers too 1234567890-= let it call s o s ok` + ), + ]), + plain( + `, from now on we repeat some. , REPEAT x6 This a message designed to stress test the message parser, trying to force several rules to stack at the same time !!@#$%^&` + ), + ]), + plain( + `()_+, overloading the symbols {}:"|<>?, some more text ,./;'\\[], numbers too 1234567890-= let it call s o s ok` + ), + strike([ + plain( + `, from now on we repeat some. this can go long for some time, repeat x7 This a message designed to stress test the message parser, trying to force several rules to stack at the same time !!@#$%^&*()` + ), + italic([ + plain( + `+, overloading the symbols {}:"|<>?, some more text ,./;'\\[], numbers too 1234567890-= let it call s o s ok~, from now on we repeat some. ,repeat x8 This a message designed to stress test the message parser, trying to force several rules to stack at the same time !!@#$%^&*()` + ), + ]), + plain( + `+, overloading the symbols {}:"|<>?, some more text ,./;'\\[], numbers too 1234567890-= let it call s o s ok` + ), + ]), + plain(', from now on we repeat some.'), + ]), + ], + ], +])('parses %p', (input, output) => { + expect(parse(input)).toMatchObject(output); +}); diff --git a/packages/message-parser/tests/emphasis.test.ts b/packages/message-parser/tests/emphasis.test.ts index e8e72a5882f1..3b56e4c88b55 100644 --- a/packages/message-parser/tests/emphasis.test.ts +++ b/packages/message-parser/tests/emphasis.test.ts @@ -185,6 +185,36 @@ test.each([ ]), ], ], + [ + '**bold ~~and strike~~** **not bold ~~but strike** ~~ not strike~~', + [ + paragraph([ + bold([plain('bold '), strike([plain('and strike')])]), + plain(' **not bold '), + strike([plain('but strike** ')]), + plain(' not strike~~'), + ]), + ], + ], + [ + '**bold** **another bold** ~~strike~~ ~~another strike~~ **bold ~~and strike~~** **not bold ~~but strike** ~~ not strike~~', + [ + paragraph([ + bold([plain('bold')]), + plain(' '), + bold([plain('another bold')]), + plain(' '), + strike([plain('strike')]), + plain(' '), + strike([plain('another strike')]), + plain(' '), + bold([plain('bold '), strike([plain('and strike')])]), + plain(' **not bold '), + strike([plain('but strike** ')]), + plain(' not strike~~'), + ]), + ], + ], ])('parses %p', (input, output) => { expect(parse(input)).toMatchObject(output); }); diff --git a/packages/message-parser/tests/link.test.ts b/packages/message-parser/tests/link.test.ts index 1e083fde5d70..fcf371474bf2 100644 --- a/packages/message-parser/tests/link.test.ts +++ b/packages/message-parser/tests/link.test.ts @@ -584,6 +584,30 @@ Text after line break`, ]), ], ], + [ + '[test **bold** and __italic__](https://rocket.chat)', + [ + paragraph([ + link('https://rocket.chat', [ + plain('test '), + bold([plain('bold')]), + plain(' and '), + italic([plain('italic')]), + ]), + ]), + ], + ], + [ + '[test **bold with __italic__**](https://rocket.chat)', + [ + paragraph([ + link('https://rocket.chat', [ + plain('test '), + bold([plain('bold with '), italic([plain('italic')])]), + ]), + ]), + ], + ], ])('parses %p', (input, output) => { expect(parse(input)).toMatchObject(output); }); diff --git a/packages/peggy-loader/src/index.ts b/packages/peggy-loader/src/index.ts index 4d805d2be14d..9d3ea3b582fb 100644 --- a/packages/peggy-loader/src/index.ts +++ b/packages/peggy-loader/src/index.ts @@ -32,6 +32,8 @@ function peggyLoader( return peggy.generate(grammarContent, { output: 'source', + // We rely on a tracer to prevent recurrence on rules like Bold, Italic, Strikethrough and References + trace: true, ...options, }); } From 42c6de806aa89728e738f5ebe38a93a4d7f3f6d1 Mon Sep 17 00:00:00 2001 From: Pierre Date: Fri, 6 Sep 2024 19:42:17 -0300 Subject: [PATCH 2/5] missing type --- packages/message-parser/src/tracer.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/message-parser/src/tracer.ts b/packages/message-parser/src/tracer.ts index edbe709b1d68..d188c4040ab3 100644 --- a/packages/message-parser/src/tracer.ts +++ b/packages/message-parser/src/tracer.ts @@ -1,4 +1,4 @@ -import type { ParserTracer, ParserOptions } from 'peggy'; +import type { ParserTracer, ParserTracerEvent, ParserOptions } from 'peggy'; export type Options = { colors?: boolean; @@ -17,7 +17,7 @@ export class Tracer implements ParserTracer { // } - trace(event: any): void { + trace(event: ParserTracerEvent): void { if (event.type === 'rule.enter') { this.options.ruleStack.push(event.rule); return; From c457390a9ca43da713ed1e0129abd635ef86b104 Mon Sep 17 00:00:00 2001 From: Pierre Date: Fri, 6 Sep 2024 20:03:14 -0300 Subject: [PATCH 3/5] extra test --- packages/message-parser/tests/abuse.test.ts | 199 ++++++++++++++++++++ 1 file changed, 199 insertions(+) diff --git a/packages/message-parser/tests/abuse.test.ts b/packages/message-parser/tests/abuse.test.ts index 61cabef8ad73..c280ee75b4a0 100644 --- a/packages/message-parser/tests/abuse.test.ts +++ b/packages/message-parser/tests/abuse.test.ts @@ -64,6 +64,205 @@ test.each([ ]), ], ], + [ + '**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__**_**__', + [ + paragraph([ + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + italic([bold([plain('_')])]), + bold([italic([plain('**')]), italic([plain('**')])]), + plain('__'), + ]), + ], + ], ])('parses %p', (input, output) => { expect(parse(input)).toMatchObject(output); }); From ac163eed7aa841849cfab9b2811c6219fa2d3d92 Mon Sep 17 00:00:00 2001 From: Pierre Date: Mon, 9 Sep 2024 12:03:09 -0300 Subject: [PATCH 4/5] added tests for italic prevention rules --- .../message-parser/tests/emphasis.test.ts | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/packages/message-parser/tests/emphasis.test.ts b/packages/message-parser/tests/emphasis.test.ts index 3b56e4c88b55..b035999204ce 100644 --- a/packages/message-parser/tests/emphasis.test.ts +++ b/packages/message-parser/tests/emphasis.test.ts @@ -215,6 +215,38 @@ test.each([ ]), ], ], + [ + 'some_snake_case_text and even_more', + [paragraph([plain('some_snake_case_text and even_more')])], + ], + [ + 'some_snake_case_text and some __italic__ text', + [ + paragraph([ + plain('some_snake_case_text and some '), + italic([plain('italic')]), + plain(' text'), + ]), + ], + ], + [ + 'some__double__snake__case__text and even_more', + [paragraph([plain('some__double__snake__case__text and even_more')])], + ], + [ + 'some__double__snake__case__text and some __italic__ text', + [ + paragraph([ + plain('some__double__snake__case__text and some '), + italic([plain('italic')]), + plain(' text'), + ]), + ], + ], + [ + 'something__ __and italic__', + [paragraph([plain('something__ '), italic([plain('and italic')])])], + ], ])('parses %p', (input, output) => { expect(parse(input)).toMatchObject(output); }); From c7bdefb4186adb08214214519e03c610697817b1 Mon Sep 17 00:00:00 2001 From: Pierre Date: Tue, 10 Sep 2024 13:50:05 -0300 Subject: [PATCH 5/5] replaced the tracer solution with a variable-based solution --- .../message-parser/loaders/pegtransform.js | 2 - packages/message-parser/src/grammar.pegjs | 69 +++++++++++++++++-- packages/message-parser/src/index.ts | 18 +++-- packages/message-parser/src/tracer.ts | 53 -------------- packages/message-parser/src/utils.ts | 37 ++++++---- packages/peggy-loader/src/index.ts | 2 - 6 files changed, 95 insertions(+), 86 deletions(-) delete mode 100644 packages/message-parser/src/tracer.ts diff --git a/packages/message-parser/loaders/pegtransform.js b/packages/message-parser/loaders/pegtransform.js index c4f430cffb95..9df49c61dfea 100644 --- a/packages/message-parser/loaders/pegtransform.js +++ b/packages/message-parser/loaders/pegtransform.js @@ -5,8 +5,6 @@ module.exports = { code: pegjs.generate(content, { output: 'source', format: 'commonjs', - // We rely on a tracer to prevent recurrence on rules like Bold, Italic, Strikethrough and References - trace: true, }), }), }; diff --git a/packages/message-parser/src/grammar.pegjs b/packages/message-parser/src/grammar.pegjs index 4a79ca75eefd..a6cae97facbf 100644 --- a/packages/message-parser/src/grammar.pegjs +++ b/packages/message-parser/src/grammar.pegjs @@ -10,6 +10,7 @@ emoji, emojiUnicode, emoticon, + extractFirstResult, heading, image, inlineCode, @@ -33,6 +34,11 @@ unorderedList, timestamp, } = require('./utils'); + +let skipBold = false; +let skipItalic = false; +let skipStrikethrough = false; +let skipReferences = false; }} Start @@ -358,13 +364,62 @@ Emphasis = MaybeBold / MaybeItalic / MaybeStrikethrough * */ -MaybeBold = (! { return options.ruleStack.includes('Bold'); }) text:Bold { return text; } - -MaybeStrikethrough = (! { return options.ruleStack.includes('Strikethrough'); }) text:Strikethrough { return text; } - -MaybeItalic = (! { return options.ruleStack.includes('Italic'); }) text:Italic { return text; } - -MaybeReferences = (! { return options.ruleStack.includes('References'); }) text:References { return text; } +// This rule is used inside expressions that have a JS code ensuring they always fail, +// Without any pattern to match, peggy will think the rule may end up succedding without consuming any input, which could cause infinite loops +// So this unreachable rule is added to them to satisfy peggy's requirement. +BlockedByJavascript = 'unreachable' + +MaybeBold + = result:( + & { + if (skipBold) { return false; } + skipBold = true; + return true; + } + ( + (text:Bold { skipBold = false; return text; }) + / (& { skipBold = false; return false; } BlockedByJavascript) + ) + ) { return extractFirstResult(result); } + +MaybeStrikethrough + = result:( + & { + if (skipStrikethrough) { return false; } + skipStrikethrough = true; + return true; + } + ( + (text:Strikethrough { skipStrikethrough = false; return text; }) + / (& { skipStrikethrough = false; return false; } BlockedByJavascript) + ) + ) { return extractFirstResult(result); } + +MaybeItalic + = result:( + & { + if (skipItalic) { return false; } + skipItalic = true; + return true; + } + ( + (text:Italic { skipItalic = false; return text; }) + / (& { skipItalic = false; return false; } BlockedByJavascript) + ) + ) { return extractFirstResult(result); } + +MaybeReferences + = result:( + & { + if (skipReferences) { return false; } + skipReferences = true; + return true; + } + ( + (text:References { skipReferences = false; return text; }) + / (& { skipReferences = false; return false; } BlockedByJavascript) + ) + ) { return extractFirstResult(result); } /* Italic */ Italic diff --git a/packages/message-parser/src/index.ts b/packages/message-parser/src/index.ts index 42eddb2d35b5..cbdae1382cb5 100644 --- a/packages/message-parser/src/index.ts +++ b/packages/message-parser/src/index.ts @@ -1,22 +1,26 @@ import type { Root } from './definitions'; import * as grammar from './grammar.pegjs'; -import { type Options, Tracer } from './tracer'; export * from './definitions'; export { isNodeOfType } from './guards'; -export const parse = (input: string, options?: Options): Root => { - const parserOptions = Tracer.createParserOptions(options); - - const result = grammar.parse(input, parserOptions); - return result; +export type Options = { + colors?: boolean; + emoticons?: boolean; + katex?: { + dollarSyntax?: boolean; + parenthesisSyntax?: boolean; + }; + customDomains?: string[]; }; +export const parse = (input: string, options?: Options): Root => + grammar.parse(input, options); + export { /** @deprecated */ parse as parser, /** @deprecated */ Root as MarkdownAST, - Options, }; diff --git a/packages/message-parser/src/tracer.ts b/packages/message-parser/src/tracer.ts deleted file mode 100644 index d188c4040ab3..000000000000 --- a/packages/message-parser/src/tracer.ts +++ /dev/null @@ -1,53 +0,0 @@ -import type { ParserTracer, ParserTracerEvent, ParserOptions } from 'peggy'; - -export type Options = { - colors?: boolean; - emoticons?: boolean; - katex?: { - dollarSyntax?: boolean; - parenthesisSyntax?: boolean; - }; - customDomains?: string[]; -}; - -type TracedOptions = ParserOptions & { ruleStack: string[] }; - -export class Tracer implements ParserTracer { - constructor(private options: TracedOptions) { - // - } - - trace(event: ParserTracerEvent): void { - if (event.type === 'rule.enter') { - this.options.ruleStack.push(event.rule); - return; - } - - while (this.options.ruleStack.length > 0) { - const lastRule = - this.options.ruleStack[this.options.ruleStack.length - 1]; - - if ( - lastRule !== event.rule && - !this.options.ruleStack.includes(event.rule) - ) { - break; - } - this.options.ruleStack.pop(); - - if (lastRule === event.rule) { - break; - } - } - } - - public static createParserOptions(options?: Options): ParserOptions { - const parserOptions: TracedOptions = { - ...options, - ruleStack: [], - }; - - parserOptions.tracer = new Tracer(parserOptions); - return parserOptions; - } -} diff --git a/packages/message-parser/src/utils.ts b/packages/message-parser/src/utils.ts index a48be3cfd182..6c5d605c5c7a 100644 --- a/packages/message-parser/src/utils.ts +++ b/packages/message-parser/src/utils.ts @@ -198,22 +198,19 @@ const joinEmoji = ( export const reducePlainTexts = ( values: Paragraph['value'] ): Paragraph['value'] => - values - .flatMap((item) => item) - .filter((item) => item) - .reduce((result, item, index, values) => { - const next = values[index + 1]; - const current = joinEmoji(item, values[index - 1], next); - const previous: Inlines = result[result.length - 1]; - - if (previous) { - if (current.type === 'PLAIN_TEXT' && current.type === previous.type) { - previous.value += current.value; - return result; - } + values.flat().reduce((result, item, index, values) => { + const next = values[index + 1]; + const current = joinEmoji(item, values[index - 1], next); + const previous: Inlines = result[result.length - 1]; + + if (previous) { + if (current.type === 'PLAIN_TEXT' && current.type === previous.type) { + previous.value += current.value; + return result; } - return [...result, current]; - }, [] as Paragraph['value']); + } + return [...result, current]; + }, [] as Paragraph['value']); export const lineBreak = (): LineBreak => ({ type: 'LINE_BREAK', value: undefined, @@ -250,3 +247,13 @@ export const timestamp = ( fallback: plain(``), }; }; + +export const extractFirstResult = ( + value: Types[keyof Types]['value'] +): Types[keyof Types]['value'] => { + if (typeof value !== 'object' || !Array.isArray(value)) { + return value; + } + + return value.filter((item) => item).shift() as Types[keyof Types]['value']; +}; diff --git a/packages/peggy-loader/src/index.ts b/packages/peggy-loader/src/index.ts index 9d3ea3b582fb..4d805d2be14d 100644 --- a/packages/peggy-loader/src/index.ts +++ b/packages/peggy-loader/src/index.ts @@ -32,8 +32,6 @@ function peggyLoader( return peggy.generate(grammarContent, { output: 'source', - // We rely on a tracer to prevent recurrence on rules like Bold, Italic, Strikethrough and References - trace: true, ...options, }); }