From b59888a54117aef42f6ac2a9d375321e204177ff Mon Sep 17 00:00:00 2001 From: Ivan Ponomarev Date: Fri, 17 Nov 2023 22:57:44 +0000 Subject: [PATCH] Fix bugs in SQL`ScriptScanner` with big String literals and PostgreSQL identifiers (as introduced by #7646) (#7818) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Eddú Meléndez Gonzales --- .../org/testcontainers/ext/ScriptScanner.java | 31 ++++++++--- .../testcontainers/ext/ScriptScannerTest.java | 54 +++++++++++++++++++ 2 files changed, 78 insertions(+), 7 deletions(-) create mode 100644 modules/database-commons/src/test/java/org/testcontainers/ext/ScriptScannerTest.java diff --git a/modules/database-commons/src/main/java/org/testcontainers/ext/ScriptScanner.java b/modules/database-commons/src/main/java/org/testcontainers/ext/ScriptScanner.java index 528acd139c7..686527e9364 100644 --- a/modules/database-commons/src/main/java/org/testcontainers/ext/ScriptScanner.java +++ b/modules/database-commons/src/main/java/org/testcontainers/ext/ScriptScanner.java @@ -28,11 +28,7 @@ class ScriptScanner { private final Pattern whitespace = Pattern.compile("\\s+"); - private final Pattern identifier = Pattern.compile("[a-z][a-z0-9_]*", Pattern.CASE_INSENSITIVE); - - private final Pattern singleQuotedString = Pattern.compile("'(\\\\'|[^'])*'"); - - private final Pattern ansiQuotedString = Pattern.compile("\"(\\\\\"|[^\"])*\""); + private final Pattern identifier = Pattern.compile("[a-z][a-z0-9_$]*", Pattern.CASE_INSENSITIVE); private final Pattern dollarQuotedStringDelimiter = Pattern.compile("\\$\\w*\\$"); @@ -54,7 +50,8 @@ private boolean matches(String substring) { private boolean matches(Pattern regexp) { Matcher m = regexp.matcher(script); - if (m.find(offset) && m.start() == offset) { + m.region(offset, script.length()); + if (m.lookingAt()) { currentMatch = m.group(); offset = m.end(); return true; @@ -99,6 +96,26 @@ private boolean matchesMultilineComment() { return 
false; } + private boolean matchesQuotedString(final char quote) { + if (script.charAt(offset) == quote) { + boolean escaped = false; + for (int i = offset + 1; i < script.length(); i++) { + char c = script.charAt(i); + if (escaped) { + //just skip the escaped character and drop the flag + escaped = false; + } else if (c == '\\') { + escaped = true; + } else if (c == quote) { + currentMatch = script.substring(offset, i + 1); + offset = i + 1; + return true; + } + } + } + return false; + } + private boolean matchesDollarQuotedString() { //Matches $$ .... $$ if (matches(dollarQuotedStringDelimiter)) { @@ -124,7 +141,7 @@ Lexem next() { return Lexem.SEPARATOR; } else if (matchesSingleLineComment() || matchesMultilineComment()) { return Lexem.COMMENT; - } else if (matches(singleQuotedString) || matches(ansiQuotedString) || matchesDollarQuotedString()) { + } else if (matchesQuotedString('\'') || matchesQuotedString('"') || matchesDollarQuotedString()) { return Lexem.QUOTED_STRING; } else if (matches(identifier)) { return Lexem.IDENTIFIER; diff --git a/modules/database-commons/src/test/java/org/testcontainers/ext/ScriptScannerTest.java b/modules/database-commons/src/test/java/org/testcontainers/ext/ScriptScannerTest.java new file mode 100644 index 00000000000..7e24026c4ec --- /dev/null +++ b/modules/database-commons/src/test/java/org/testcontainers/ext/ScriptScannerTest.java @@ -0,0 +1,54 @@ +package org.testcontainers.ext; + +import org.apache.commons.lang3.StringUtils; +import org.junit.Test; + +import java.util.regex.Pattern; + +import static org.assertj.core.api.Assertions.assertThat; + +public class ScriptScannerTest { + + @Test + public void testHugeStringLiteral() { + String script = "/* a comment */ \"" + StringUtils.repeat('~', 10000) + "\";"; + ScriptScanner scanner = scanner(script); + assertThat(scanner.next()).isEqualTo(ScriptScanner.Lexem.COMMENT); + assertThat(scanner.next()).isEqualTo(ScriptScanner.Lexem.WHITESPACE); + 
assertThat(scanner.next()).isEqualTo(ScriptScanner.Lexem.QUOTED_STRING); + assertThat(scanner.getCurrentMatch()).matches(Pattern.compile("\"~+\"")); + } + + @Test + public void testPgIdentifierWithDollarSigns() { + ScriptScanner scanner = scanner( + "this$is$a$valid$postgreSQL$identifier " + + "$a$While this is a quoted string$a$$ --just followed by a dollar sign" + ); + assertThat(scanner.next()).isEqualTo(ScriptScanner.Lexem.IDENTIFIER); + assertThat(scanner.next()).isEqualTo(ScriptScanner.Lexem.WHITESPACE); + assertThat(scanner.next()).isEqualTo(ScriptScanner.Lexem.QUOTED_STRING); + assertThat(scanner.next()).isEqualTo(ScriptScanner.Lexem.OTHER); + } + + @Test + public void testQuotedLiterals() { + ScriptScanner scanner = scanner("'this \\'is a literal' \"this \\\" is a literal\""); + assertThat(scanner.next()).isEqualTo(ScriptScanner.Lexem.QUOTED_STRING); + assertThat(scanner.getCurrentMatch()).isEqualTo("'this \\'is a literal'"); + assertThat(scanner.next()).isEqualTo(ScriptScanner.Lexem.WHITESPACE); + assertThat(scanner.next()).isEqualTo(ScriptScanner.Lexem.QUOTED_STRING); + assertThat(scanner.getCurrentMatch()).isEqualTo("\"this \\\" is a literal\""); + } + + private static ScriptScanner scanner(String script) { + return new ScriptScanner( + "dummy", + script, + ScriptUtils.DEFAULT_STATEMENT_SEPARATOR, + ScriptUtils.DEFAULT_COMMENT_PREFIX, + ScriptUtils.DEFAULT_BLOCK_COMMENT_START_DELIMITER, + ScriptUtils.DEFAULT_BLOCK_COMMENT_END_DELIMITER + ); + } +}