dafny-lang · robin-aws · Nov 3, 2022 · Oct 25, 2022 · Oct 25, 2022 · Oct 25, 2022
diff --git a/Source/DafnyCore/Compilers/Compiler-go.cs b/Source/DafnyCore/Compilers/Compiler-go.cs
@@ -2061,12 +2061,36 @@ protected override void EmitLiteralExpr(ConcreteSyntaxTree wr, LiteralExpr e) {
         wr.Write((bool)e.Value ? "true" : "false");
       } else if (e is CharLiteralExpr) {
         var v = (string)e.Value;
-        wr.Write("_dafny.Char('{0}')", TranslateEscapes(v, isChar: true));
+        wr.Write("_dafny.Char(");
+        // See comment on the StringLiteralExpr case below.
+        if (Util.Utf16Escape.IsMatch(v)) {
+          char c = Util.UnescapedCharacters(v, false).Single();
+          wr.Write($"{(int)c}");
+        } else {
+          wr.Write("'{0}'", TranslateEscapes(v, isChar: true));
+        }
+        wr.Write(")");
       } else if (e is StringLiteralExpr) {
         var str = (StringLiteralExpr)e;
-        wr.Write("_dafny.SeqOfString(");
-        TrStringLiteral(str, wr);
-        wr.Write(")");
+        // It may not be possible to translate \u escapes into a valid Go string,
+        // since Go string literals have to be encodable in UTF-8,
+        // but Dafny allows invalid sequences of surrogate characters.
+        // So if any are present, just emit a sequence of the direct UTF-16 code units instead.
+        var s = (string)str.Value;
+        if (!str.IsVerbatim && Util.Utf16Escape.IsMatch(s)) {
+          wr.Write("_dafny.SeqOfChars(");
+          var comma = "";
+          foreach (var c in Util.UnescapedCharacters(s, str.IsVerbatim)) {
+            wr.Write(comma);
+            wr.Write($"{(int)c}");
+            comma = ", ";
+          }
+          wr.Write(")");
+        } else {
+          wr.Write("_dafny.SeqOfString(");
+          TrStringLiteral(str, wr);
+          wr.Write(")");
+        }
       } else if (AsNativeType(e.Type) is NativeType nt) {
         wr.Write("{0}({1})", GetNativeTypeName(nt), (BigInteger)e.Value);
       } else if (e.Value is BigInteger i) {
@@ -2097,6 +2121,8 @@ void EmitIntegerLiteral(BigInteger i, ConcreteSyntaxTree wr) {
       }
     }
 
+
+
     protected override void EmitStringLiteral(string str, bool isVerbatim, ConcreteSyntaxTree wr) {
-
-
-    protected override void EmitStringLiteral(string str, bool isVerbatim, ConcreteSyntaxTree wr) {
+    protected override void EmitStringLiteral(string str, bool isVerbatim, ConcreteSyntaxTree wr) {
-
-
-    protected override void EmitStringLiteral(string str, bool isVerbatim, ConcreteSyntaxTree wr) {
+    protected override void EmitStringLiteral(string str, bool isVerbatim, ConcreteSyntaxTree wr) {
       var n = str.Length;
       if (!isVerbatim) {

diff --git a/Source/DafnyCore/Util.cs b/Source/DafnyCore/Util.cs
@@ -8,6 +8,7 @@
 using System.Diagnostics.Contracts;
 using System.Reactive.Disposables;
 using System.Reactive.Linq;
+using System.Text.RegularExpressions;
 using System.Threading.Tasks;
 using JetBrains.Annotations;
 using Microsoft.Boogie;
@@ -195,6 +196,9 @@ public static string RemoveEscaping(string s, bool isVerbatimString) {
       UnescapedCharacters(s, isVerbatimString).Iter(ch => sb.Append(ch));
       return sb.ToString();
     }
+
+    // Matches a genuine \uXXXX escape: one preceded by an even number of backslashes (so "\\\uD800" matches, "\\u0041" does not).
+    public static readonly Regex Utf16Escape = new Regex(@"(?<=(?<!\\)(?:\\\\)*)\\u([0-9a-fA-F]{4})");
     /// <summary>
     /// Returns the characters of the well-parsed string p, replacing any
     /// escaped characters by the actual characters.

diff --git a/Source/DafnyRuntime/DafnyRuntime.go b/Source/DafnyRuntime/DafnyRuntime.go
@@ -597,6 +597,8 @@ func (seq Seq) UniqueElements() Set {
 func (seq Seq) String() string {
   if seq.isString {
     s := ""
+    // Note this doesn't produce the right string in UTF-8,
+    // since it converts surrogates independently.
     for _, c := range seq.contents {
       s += c.(Char).String()
     }

diff --git a/Source/DafnyRuntime/DafnyRuntime.py b/Source/DafnyRuntime/DafnyRuntime.py
@@ -17,13 +17,24 @@ def __get__(self, instance, owner):
 def print(value):
     builtins.print(string_of(value), end="")
 
+# Dafny strings are currently sequences of UTF-16 code units, possibly not valid UTF-16,
+# so decoding is best effort: units that cannot be decoded are replaced with U+FFFD.
+# Decode as "utf-16-le" (matching the bytes built here); plain "utf-16" treats a leading U+FEFF/U+FFFE as a BOM.
+def string_from_utf_16(utf_16_code_units):
+    return b''.join([ord(c).to_bytes(2, 'little') for c in utf_16_code_units]).decode("utf-16-le", errors = 'replace')
+
 def string_of(value) -> str:
     if hasattr(value, '__dafnystr__'):
         return value.__dafnystr__()
     elif value is None:
         return "null"
     elif isinstance(value, bool):
         return "true" if value else "false"
+    elif isinstance(value, str):
+        # This is only for Dafny char values.
+        # Dafny strings are represented as Seq's of individual char values,
+        # and Seq defines __dafnystr__.
+        return string_from_utf_16(value)
     elif isinstance(value, tuple):
         return '(' + ', '.join(map(string_of, value)) + ')'
     elif isinstance(value, FunctionType):
@@ -82,7 +93,7 @@ def UniqueElements(self):
 
     def __dafnystr__(self) -> str:
         if self.isStr:
-            return ''.join(self)
+            return string_from_utf_16(self)
         return '[' + ', '.join(map(string_of, self)) + ']'
 
     def __add__(self, other):

diff --git a/Test/dafny0/Strings.dfy b/Test/dafny0/Strings.dfy
@@ -1,4 +1,9 @@
-// RUN: %dafny /compile:3 /print:"%t.print" /dprint:"%t.dprint" "%s" > "%t"
+// RUN: %baredafny verify %args "%s" > "%t"
+// RUN: %baredafny run --no-verify --target=cs %args "%s" >> "%t"
+// RUN: %baredafny run --no-verify --target=js %args  "%s" >> "%t"
+// RUN: %baredafny run --no-verify --target=go %args  "%s" >> "%t"
+// RUN: %baredafny run --no-verify --target=java %args  "%s" >> "%t"
+// RUN: %baredafny run --no-verify --target=py %args  "%s" >> "%t"
 // RUN: %diff "%s.expect" "%t"
 
 method Char(a: char, s: string, i: int) returns (b: char)
@@ -31,12 +36,31 @@ method Main()
   var s, t := M(ch, ch);
   print "ch = ", ch, "\n";
   print "The string is: " + s + "\n";
-  var x, y, z := Escapes();
+  var x, y, z, zz := Escapes();
   print "Escape X: ", x, "\n";
   print "Escape Y: ", y, "\n";
   print "Escape Z: ", z, "\n";
+  print "Escape ZZ: ", zz, "\n";
   var c, d := CharEscapes();
   print "Here is the end" + [c, d] + [' ', ' ', ' '] + [[d]][0] + "   ", d, "\n";
+
+  var x?, y?, z? := WeirdStrings();
+
+  // Printing these invalid (in UTF-16) strings can lead to at least inconsistent
+  // output across the backends, but they should never crash.
+  // We assert that the invalid state is modelled correctly as well.
+  expect |x?| == 30;
+  expect x?[29] as int == 55296;
+  print "Weird string X: ", x?, "\n";
+  expect |y?| == 30;
+  expect y?[29] as int == 55296;
+  print "Weird string Y: ", y?, "\n";
+  expect |z?| > 2;
+  expect z?[0..2] == ['\ude0e', '\ud83d'];
+  print "Weird string Z: ", z?, "\n";
+
+  var c?, d? := WeirdChars();
+  print "These characters are quite confused: ", c?, ' ', d?, "\n";
 }
 
 method GimmieAChar(s: string) returns (ch: char)
@@ -50,12 +74,14 @@ method GimmieAChar(s: string) returns (ch: char)
   }
 }
 
-method Escapes() returns (x: string, y: string, z: string)
+method Escapes() returns (x: string, y: string, z: string, zz: string)
+  ensures |zz| > 2
 {
   x := "I say \"hello\" \\ you say \'good bye'";
   y := @"I say ""hello"" \ you say 'good bye'";
   assert x == y;
   z := "There needs to be \u0052\u0026\u0044\n\tYes, sir";
+  zz := "\ud83d\ude0e is the UTF-16 for a very cool emoji";
 }
 
 method CharEscapes() returns (c: char, d: char)
@@ -67,3 +93,19 @@ method CharEscapes() returns (c: char, d: char)
   c := '\n';
   d := '*';
 }
+
+// Strings that aren't valid UTF-16 sequences
+method WeirdStrings() returns (x: string, y: string, z: string)
+{
+  x := "What even is this character: \uD800";
+  y := "What even is this character: " + [0xD800 as char];
+  assert x == y;
+  z := "\ude0e\ud83d is not using surrogates correctly";
+}
+
+// Surrogate code points
+method WeirdChars() returns (c: char, d: char)
+{
+  c := '\uD800';
+  d := 0xDFFF as char;
+}
diff --git a/Test/dafny0/Strings.dfy.expect b/Test/dafny0/Strings.dfy.expect
@@ -1,10 +1,77 @@
 
-Dafny program verifier finished with 6 verified, 0 errors
+Dafny program verifier finished with 8 verified, 0 errors
+
+Dafny program verifier did not attempt verification
+ch = D
+The string is: DDD
+Escape X: I say "hello" \ you say 'good bye'
+Escape Y: I say "hello" \ you say 'good bye'
+Escape Z: There needs to be R&D
+	Yes, sir
+Escape ZZ: 😎 is the UTF-16 for a very cool emoji
+Here is the end
+*   *   *
+Weird string X: What even is this character: �
+Weird string Y: What even is this character: �
+Weird string Z: �� is not using surrogates correctly
+These characters are quite confused: � �
+
+Dafny program verifier did not attempt verification
+ch = D
+The string is: DDD
+Escape X: I say "hello" \ you say 'good bye'
+Escape Y: I say "hello" \ you say 'good bye'
+Escape Z: There needs to be R&D
+	Yes, sir
+Escape ZZ: 😎 is the UTF-16 for a very cool emoji
+Here is the end
+*   *   *
+Weird string X: What even is this character: �
+Weird string Y: What even is this character: �
+Weird string Z: �� is not using surrogates correctly
+These characters are quite confused: � �
+
+Dafny program verifier did not attempt verification
+ch = D
+The string is: DDD
+Escape X: I say "hello" \ you say 'good bye'
+Escape Y: I say "hello" \ you say 'good bye'
+Escape Z: There needs to be R&D
+	Yes, sir
+Escape ZZ: �� is the UTF-16 for a very cool emoji
+Here is the end
+*   *   *
+Weird string X: What even is this character: �
+Weird string Y: What even is this character: �
+Weird string Z: �� is not using surrogates correctly
+These characters are quite confused: � �
+
+Dafny program verifier did not attempt verification
+ch = D
+The string is: DDD
+Escape X: I say "hello" \ you say 'good bye'
+Escape Y: I say "hello" \ you say 'good bye'
+Escape Z: There needs to be R&D
+	Yes, sir
+Escape ZZ: 😎 is the UTF-16 for a very cool emoji
+Here is the end
+*   *   *
+Weird string X: What even is this character: ?
+Weird string Y: What even is this character: ?
+Weird string Z: ?? is not using surrogates correctly
+These characters are quite confused: ? ?
+
+Dafny program verifier did not attempt verification
 ch = D
 The string is: DDD
 Escape X: I say "hello" \ you say 'good bye'
 Escape Y: I say "hello" \ you say 'good bye'
 Escape Z: There needs to be R&D
 	Yes, sir
+Escape ZZ: 😎 is the UTF-16 for a very cool emoji
 Here is the end
 *   *   *
+Weird string X: What even is this character: �
+Weird string Y: What even is this character: �
+Weird string Z: �� is not using surrogates correctly
+These characters are quite confused: � �