From 300cd199872b577769f749c18e19b39f22476725 Mon Sep 17 00:00:00 2001 From: Martijn Dekker Date: Sun, 5 Jul 2020 19:24:41 +0200 Subject: [PATCH] Fix corrupt UTF-8 char processing & shellquoting after aborted read MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the processing of a multibyte character was interrupted in UTF-8 locales, e.g. by reading just one byte of a two-byte character 'ΓΌ' (\303\274) with a command like: print -nr $'\303\274' | read -n1 g then the shellquoting algorithm was corrupted in such a way that the final quote in simple single-quoted string was missing. This bug may have had other, as yet undiscovered, effects as well. The problem was with corrupted multibyte character processing and not with the shell-quoting routine sh_fmtq() itself. Full trace and discussion at: https://github.com/ksh93/ksh/issues/5 (which is also an attempt to begin to understand the esoteric workings of the libast mb* macros that process UTF-8 characters). src/lib/libast/comp/setlocale.c: utf8_mbtowc(): - If called from the mbinit() macro (i.e. if both pointer parameters are null), reset the global multibyte character synchronisation state variable. This fixes the problem with interrupted processing leaving an inconsistent state, provided that mbinit() is called before processing multibyte characters (which it is, in most (?) places that do this). Before this fix, calling mbinit() in UTF-8 locales was a no-op. src/cmd/ksh93/sh/string.c: sh_fmtq(): - Call mbinit() before potentially processing multibyte characters. Testing suggests that this could be superfluous, but at worst, it's harmless; better be sure. src/cmd/ksh93/tests/builtins.sh: - Add regression test for shellquoting with 'printf %q' after interrupting the processing of a multibyte characeter with 'read -n1'. This test only fails in a UTF-8 locale, e.g. when running: bin/shtests -u builtins SHELL=/buggy/ksh-2012-08-01 Fixes #5. --- NEWS | 6 ++++++ src/cmd/ksh93/include/version.h | 2 +- src/cmd/ksh93/sh/string.c | 3 +++ src/cmd/ksh93/tests/builtins.sh | 16 ++++++++++++++++ src/lib/libast/comp/setlocale.c | 2 ++ 5 files changed, 28 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index 44cc5ed33d35..d55fd75fa36d 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,12 @@ For full details, see the git log at: https://github.com/ksh93/ksh Any uppercase BUG_* names are modernish shell bug IDs. +2020-07-05: + +- In UTF-8 locales, fix corruption of the shell's internal string quoting + algorithm (as used by xtrace, 'printf %q', and more) that occurred when + the processing of a multibyte character was interrupted. + 2020-07-03: - Backslashes are no longer escaped in the raw Bourne Shell-like editing diff --git a/src/cmd/ksh93/include/version.h b/src/cmd/ksh93/include/version.h index 0f0a4fb8c52d..2ee19b9b1b31 100644 --- a/src/cmd/ksh93/include/version.h +++ b/src/cmd/ksh93/include/version.h @@ -17,4 +17,4 @@ * David Korn * * * ***********************************************************************/ -#define SH_RELEASE "93u+m 2020-07-02" +#define SH_RELEASE "93u+m 2020-07-05" diff --git a/src/cmd/ksh93/sh/string.c b/src/cmd/ksh93/sh/string.c index 79b37f482209..5eb124b75b23 100644 --- a/src/cmd/ksh93/sh/string.c +++ b/src/cmd/ksh93/sh/string.c @@ -336,6 +336,9 @@ char *sh_fmtq(const char *string) int offset; if(!cp) return((char*)0); +#if SHOPT_MULTIBYTE + mbinit(); +#endif /* SHOPT_MULTIBYTE */ offset = staktell(); state = ((c= mbchar(cp))==0); if(isaletter(c)) diff --git a/src/cmd/ksh93/tests/builtins.sh b/src/cmd/ksh93/tests/builtins.sh index 470644be4852..fd093c66bf15 100755 --- a/src/cmd/ksh93/tests/builtins.sh +++ b/src/cmd/ksh93/tests/builtins.sh @@ -302,7 +302,23 @@ fi if [[ $(printf '%..*s\n' : abc def) != abc:def ]] then err_exit "printf '%..*s' not working" fi + +# ====== +# shell-quoting using printf %q (same algorithm used for xtrace and output of 'set', 'trap', ...) + [[ $(printf '%q\n') == '' ]] || err_exit 'printf "%q" with missing arguments' + +# the following fails on 2012-08-01 in UTF-8 locales +expect="'shell-quoted string'" +actual=$( + print -nr $'\303\274' | read -n1 foo # interrupt processing of 2-byte UTF-8 char after reading 1 byte + printf '%q\n' "shell-quoted string" +) +LC_CTYPE=POSIX true # on buggy ksh, a locale re-init via temp assignment restores correct shellquoting +[[ $actual == "$expect" ]] || err_exit 'shell-quoting corrupted after interrupted processing of UTF-8 char' \ + "(expected $expect; got $actual)" + +# ====== # we won't get hit by the one second boundary twice, right? expect= actual= { expect=$(LC_ALL=C date | sed 's/ GMT / UTC /') && actual=$(LC_ALL=C printf '%T\n' now) && [[ $actual == "$expect" ]]; } || diff --git a/src/lib/libast/comp/setlocale.c b/src/lib/libast/comp/setlocale.c index ba31fe914478..65553f8bb108 100644 --- a/src/lib/libast/comp/setlocale.c +++ b/src/lib/libast/comp/setlocale.c @@ -600,6 +600,8 @@ utf8_mbtowc(wchar_t* wp, const char* str, size_t n) register int c; register wchar_t w = 0; + if (!wp && !sp) + ast.mb_sync = 0; /* assume call from mbinit() macro: reset global multibyte sync state */ if (!sp || !n) return 0; if ((m = utf8tab[*sp]) > 0)