Skip to content

Commit

Permalink
fix: Handle UTF-16 escapes and invalid surrogate sequences in Python …
Browse files Browse the repository at this point in the history
…and Go (#2926)

Fixes #1980. Fixes #2925.

Should fix the root cause of #2934, but I'm not claiming the complete
fix, as attempting to include unescaped, non-ASCII characters in a Dafny
source file is revealing platform-specific headaches in the integration
test runner I'd like to address in a separate PR.

Printing non-ASCII characters, especially invalid UTF-16 sequences, is
still inconsistent across backends, but at least should not crash after
this change. Again this is difficult to test effectively across
platforms in our testing architecture.

<small>By submitting this pull request, I confirm that my contribution
is made under the terms of the [MIT
license](https://github.com/dafny-lang/dafny/blob/master/LICENSE.txt).</small>

Co-authored-by: Clément Pit-Claudel <[email protected]>
  • Loading branch information
robin-aws and cpitclaudel authored Nov 3, 2022
1 parent 8e2c073 commit c81c3ca
Show file tree
Hide file tree
Showing 10 changed files with 171 additions and 38 deletions.
53 changes: 45 additions & 8 deletions Source/DafnyCore/Compilers/Compiler-go.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2059,14 +2059,10 @@ protected override void EmitLiteralExpr(ConcreteSyntaxTree wr, LiteralExpr e) {
wr.Write("({0})(nil)", TypeName(e.Type, wr, e.tok));
} else if (e.Value is bool) {
wr.Write((bool)e.Value ? "true" : "false");
} else if (e is CharLiteralExpr) {
var v = (string)e.Value;
wr.Write("_dafny.Char('{0}')", TranslateEscapes(v, isChar: true));
} else if (e is StringLiteralExpr) {
var str = (StringLiteralExpr)e;
wr.Write("_dafny.SeqOfString(");
TrStringLiteral(str, wr);
wr.Write(")");
} else if (e is CharLiteralExpr chrLit) {
TrCharLiteral(chrLit, wr);
} else if (e is StringLiteralExpr strLit) {
TrStringLiteral(strLit, wr);
} else if (AsNativeType(e.Type) is NativeType nt) {
wr.Write("{0}({1})", GetNativeTypeName(nt), (BigInteger)e.Value);
} else if (e.Value is BigInteger i) {
Expand Down Expand Up @@ -2097,6 +2093,47 @@ void EmitIntegerLiteral(BigInteger i, ConcreteSyntaxTree wr) {
}
}

protected void TrCharLiteral(CharLiteralExpr chr, ConcreteSyntaxTree wr) {
var v = (string)chr.Value;
wr.Write("_dafny.Char(");
// See comment in TrStringLiteral for why we can't just translate directly sometimes.
if (Util.MightContainNonAsciiCharacters(v, false)) {
var c = Util.UnescapedCharacters(v, false).Single();
wr.Write($"{(int)c}");
} else {
wr.Write("'{0}'", TranslateEscapes(v, isChar: true));
}
wr.Write(")");
}

protected override void TrStringLiteral(StringLiteralExpr str, ConcreteSyntaxTree wr) {
Contract.Requires(str != null);
Contract.Requires(wr != null);
// It may not be possible to translate a Dafny string into a valid Go string,
// since Go string literals have to be encodable in UTF-8,
// but Dafny allows invalid sequences of surrogate characters.
// In addition, _dafny.SeqOfString iterates over the runes in the Go string
// rather than the equivalent UTF-16 code units.
// That means in many cases we can't create a Dafny string value by emitting
// _dafny.SeqOfString("..."), since there's no way to encode the right data in the Go string literal.
// Instead, if any non-ascii characters might be present, just emit a sequence of the direct UTF-16 code units instead.
var s = (string)str.Value;
if (Util.MightContainNonAsciiCharacters(s, false)) {
wr.Write("_dafny.SeqOfChars(");
var comma = "";
foreach (var c in Util.UnescapedCharacters(s, str.IsVerbatim)) {
wr.Write(comma);
wr.Write($"{(int)c}");
comma = ", ";
}
wr.Write(")");
} else {
wr.Write("_dafny.SeqOfString(");
EmitStringLiteral(s, str.IsVerbatim, wr);
wr.Write(")");
}
}

protected override void EmitStringLiteral(string str, bool isVerbatim, ConcreteSyntaxTree wr) {
var n = str.Length;
if (!isVerbatim) {
Expand Down
2 changes: 1 addition & 1 deletion Source/DafnyCore/Compilers/Compiler-java.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2276,7 +2276,7 @@ public override bool CompileTargetProgram(string dafnyProgramName, string target
files.Add($"\"{Path.GetFullPath(file)}\"");
}
var classpath = GetClassPath(targetFilename);
var psi = new ProcessStartInfo("javac", string.Join(" ", files)) {
var psi = new ProcessStartInfo("javac", "-encoding UTF8 " + string.Join(" ", files)) {
CreateNoWindow = true,
UseShellExecute = false,
RedirectStandardOutput = true,
Expand Down
2 changes: 1 addition & 1 deletion Source/DafnyCore/Compilers/SinglePassCompiler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5138,7 +5138,7 @@ protected ConcreteSyntaxTree StringLiteral(StringLiteralExpr str) {
return result;
}

protected void TrStringLiteral(StringLiteralExpr str, ConcreteSyntaxTree wr) {
protected virtual void TrStringLiteral(StringLiteralExpr str, ConcreteSyntaxTree wr) {
Contract.Requires(str != null);
Contract.Requires(wr != null);
EmitStringLiteral((string)str.Value, str.IsVerbatim, wr);
Expand Down
77 changes: 57 additions & 20 deletions Source/DafnyCore/Util.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
using System.Diagnostics.Contracts;
using System.Reactive.Disposables;
using System.Reactive.Linq;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using JetBrains.Annotations;
using Microsoft.Boogie;
Expand Down Expand Up @@ -185,6 +186,14 @@ public static string RemoveParsedStringQuotes(string s, out bool isVerbatimStrin
return s.Substring(1, len - 2);
}
}

public static bool MightContainNonAsciiCharacters(string s, bool isVerbatimString) {
// This is conservative since \u escapes could be ASCII characters,
// but that's fine since this method is just used as a conservative guard.
return TokensWithEscapes(s, isVerbatimString).Any(e =>
e.Any(c => !char.IsAscii(c)) || e.StartsWith(@"\u"));
}

/// <summary>
/// Replaced any escaped characters in s by the actual character that the escaping represents.
/// Assumes s to be a well-parsed string.
Expand All @@ -195,54 +204,82 @@ public static string RemoveEscaping(string s, bool isVerbatimString) {
UnescapedCharacters(s, isVerbatimString).Iter(ch => sb.Append(ch));
return sb.ToString();
}

/// <summary>
/// Returns the characters of the well-parsed string p, replacing any
/// escaped characters by the actual characters.
/// </summary>
public static IEnumerable<char> UnescapedCharacters(string p, bool isVerbatimString) {
if (isVerbatimString) {
foreach (var s in TokensWithEscapes(p, true)) {
if (s == "\"\"") {
yield return '"';
} else {
foreach (var c in s) {
yield return c;
}
}
}
} else {
foreach (var s in TokensWithEscapes(p, false)) {
switch (s) {
case @"\'": yield return '\''; break;
case @"\""": yield return '"'; break;
case @"\\": yield return '\\'; break;
case @"\0": yield return '\0'; break;
case @"\n": yield return '\n'; break;
case @"\r": yield return '\r'; break;
case @"\t": yield return '\t'; break;
case { } when s.StartsWith(@"\u"):
yield return (char)Convert.ToInt32(s[2..], 16);
break;
default:
foreach (var c in s) {
yield return c;
}
break;
}
}
}
}

/// <summary>
/// Enumerates the sequence of regular characters and escape sequences in the given string.
/// For example, "ab\tuv\u12345" may be broken up as ["a", "b", "\t", "u", "v", "\u1234", "5"].
/// Consecutive non-escaped characters may or may not be enumerated as a single string.
/// </summary>
public static IEnumerable<string> TokensWithEscapes(string p, bool isVerbatimString) {
Contract.Requires(p != null);
if (isVerbatimString) {
var skipNext = false;
foreach (var ch in p) {
if (skipNext) {
skipNext = false;
} else {
yield return ch;
if (ch == '"') {
skipNext = true;
yield return "\"";
} else {
yield return ch.ToString();
}
}
}
} else {
var i = 0;
while (i < p.Length) {
char special = ' '; // ' ' indicates not special
if (p[i] == '\\') {
switch (p[i + 1]) {
case '\'': special = '\''; break;
case '\"': special = '\"'; break;
case '\\': special = '\\'; break;
case '0': special = '\0'; break;
case 'n': special = '\n'; break;
case 'r': special = '\r'; break;
case 't': special = '\t'; break;
case 'u':
int ch = HexValue(p[i + 2]);
ch = 16 * ch + HexValue(p[i + 3]);
ch = 16 * ch + HexValue(p[i + 4]);
ch = 16 * ch + HexValue(p[i + 5]);
yield return (char)ch;
yield return p[i..(i + 6)];
i += 6;
continue;
break;
default:
yield return p[i..(i + 2)];
i += 2;
break;
}
}
if (special != ' ') {
yield return special;
i += 2;
} else {
yield return p[i];
yield return p[i].ToString();
i++;
}
}
Expand Down
2 changes: 2 additions & 0 deletions Source/DafnyRuntime/DafnyRuntime.go
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,8 @@ func (seq Seq) UniqueElements() Set {
func (seq Seq) String() string {
if seq.isString {
s := ""
// Note this doesn't produce the right string in UTF-8,
// since it converts surrogates independently.
for _, c := range seq.contents {
s += c.(Char).String()
}
Expand Down
13 changes: 12 additions & 1 deletion Source/DafnyRuntime/DafnyRuntime.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,24 @@ def __get__(self, instance, owner):
def print(value):
builtins.print(string_of(value), end="")

# Dafny strings are currently sequences of UTF-16 code units.
# To make a best effort attempt at printing the right characters we attempt to decode,
# but have to allow for invalid sequences.
def string_from_utf_16(utf_16_code_units):
return b''.join(ord(c).to_bytes(2, 'little') for c in utf_16_code_units).decode("utf-16-le", errors = 'replace')

def string_of(value) -> str:
if hasattr(value, '__dafnystr__'):
return value.__dafnystr__()
elif value is None:
return "null"
elif isinstance(value, bool):
return "true" if value else "false"
elif isinstance(value, str):
# This is only for Dafny char values.
# Dafny strings are represented as Seq's of individual char values,
# and Seq defines __dafnystr__.
return string_from_utf_16(value)
elif isinstance(value, tuple):
return '(' + ', '.join(map(string_of, value)) + ')'
elif isinstance(value, FunctionType):
Expand Down Expand Up @@ -82,7 +93,7 @@ def UniqueElements(self):

def __dafnystr__(self) -> str:
if self.isStr:
return ''.join(self)
return string_from_utf_16(self)
return '[' + ', '.join(map(string_of, self)) + ']'

def __add__(self, other):
Expand Down
4 changes: 3 additions & 1 deletion Test/allocated1/dafny0/Strings.dfy.expect
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

Dafny program verifier finished with 6 verified, 0 errors
Dafny program verifier finished with 8 verified, 0 errors
ch = D
The string is: DDD
Escape X: I say "hello" \ you say 'good bye'
Expand All @@ -8,3 +8,5 @@ Escape Z: There needs to be R&D
Yes, sir
Here is the end
* * *
I'm afraid you'll find escape quite impossible, \u007
Luckily I have this nifty gadget from my good friend, \Q
51 changes: 47 additions & 4 deletions Test/dafny0/Strings.dfy
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// RUN: %dafny /compile:3 /print:"%t.print" /dprint:"%t.dprint" "%s" > "%t"
// RUN: %diff "%s.expect" "%t"
// RUN: %testDafnyForEachCompiler "%s"

method Char(a: char, s: string, i: int) returns (b: char)
{
Expand All @@ -25,18 +24,44 @@ method M(a: char, b: char) returns (s: string, t: seq<char>)
s := t[0..|s|];
}

// Note that actually printing strings with non-ASCII characters
// is currently inconsistent across both languages and platforms.
// See https://github.com/dafny-lang/dafny/pull/2938 for an
// attempt to fix this that had to be reverted.
// For now we're just making runtimes assertions about the contents of
// strings rather than printing them and relying on the diff with the expect file.
method Main()
{
var ch: char;
var s, t := M(ch, ch);
print "ch = ", ch, "\n";
print "The string is: " + s + "\n";
var x, y, z := Escapes();
var x, y, z, zz := Escapes();
print "Escape X: ", x, "\n";
print "Escape Y: ", y, "\n";
print "Escape Z: ", z, "\n";
// Not printing zz as per the comment above
var c, d := CharEscapes();
print "Here is the end" + [c, d] + [' ', ' ', ' '] + [[d]][0] + " ", d, "\n";

// Testing invalid UTF-16 content that Dafny allows (at least until --unicode-char lands)

var x?, y?, z? := WeirdStrings();

expect |x?| == 30;
expect x?[29] as int == 55296;
expect |y?| == 30;
expect x?[29] as int == 55296;
expect |z?| > 2;
expect z?[0..2] == ['\ude0e', '\ud83d'];

var c?, d? := WeirdChars();
expect c? == 0xD800 as char;
expect d? == '\uDFFF';

// Ensuring we're precise enough about identifying \u escapes
print "I'm afraid you'll find escape quite impossible, \\u007", "\n";
print "Luckily I have this nifty gadget from my good friend, \\\u0051", "\n";
}

method GimmieAChar(s: string) returns (ch: char)
Expand All @@ -50,12 +75,14 @@ method GimmieAChar(s: string) returns (ch: char)
}
}

method Escapes() returns (x: string, y: string, z: string)
method Escapes() returns (x: string, y: string, z: string, zz: string)
ensures |zz| > 2
{
x := "I say \"hello\" \\ you say \'good bye'";
y := @"I say ""hello"" \ you say 'good bye'";
assert x == y;
z := "There needs to be \u0052\u0026\u0044\n\tYes, sir";
zz := "\ud83d\ude0e is the UTF-16 for a very cool emoji";
}

method CharEscapes() returns (c: char, d: char)
Expand All @@ -67,3 +94,19 @@ method CharEscapes() returns (c: char, d: char)
c := '\n';
d := '*';
}

// Strings that aren't valid UTF-16 sequences
method WeirdStrings() returns (x: string, y: string, z: string)
{
x := "What even is this character: \uD800";
y := "What even is this character: " + [0xD800 as char];
assert x == y;
z := "\ude0e\ud83d is not using surrogates correctly";
}

// Surrogate code points
method WeirdChars() returns (c: char, d: char)
{
c := '\uD800';
d := 0xDFFF as char;
}
4 changes: 2 additions & 2 deletions Test/dafny0/Strings.dfy.expect
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@

Dafny program verifier finished with 6 verified, 0 errors
ch = D
The string is: DDD
Escape X: I say "hello" \ you say 'good bye'
Expand All @@ -8,3 +6,5 @@ Escape Z: There needs to be R&D
Yes, sir
Here is the end
* * *
I'm afraid you'll find escape quite impossible, \u007
Luckily I have this nifty gadget from my good friend, \Q
1 change: 1 addition & 0 deletions docs/dev/news/2926.fix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
The Python and Go backends now encode non-ASCII characters in string literals correctly

0 comments on commit c81c3ca

Please sign in to comment.