From 25a4cf6794f4df150d4cb4346d724b5048e826a3 Mon Sep 17 00:00:00 2001
From: Vinicius Stock <vinicius.stock@shopify.com>
Date: Tue, 8 Oct 2024 10:47:08 -0400
Subject: [PATCH] Avoid breaking code units offset on binary encoding

Co-authored-by: Kevin Newton <kddnewton@users.noreply.github.com>
---
 lib/prism/parse_result.rb        |  2 +-
 test/prism/ruby/location_test.rb | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/lib/prism/parse_result.rb b/lib/prism/parse_result.rb
index ae026b42ac0..aea5dee9faf 100644
--- a/lib/prism/parse_result.rb
+++ b/lib/prism/parse_result.rb
@@ -90,7 +90,7 @@ def character_column(byte_offset)
     # concept of code units that differs from the number of characters in other
     # encodings, it is not captured here.
     def code_units_offset(byte_offset, encoding)
-      byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding)
+      byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding, invalid: :replace, undef: :replace)
 
       if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
         byteslice.bytesize / 2
diff --git a/test/prism/ruby/location_test.rb b/test/prism/ruby/location_test.rb
index fc80a5b875d..e360a0db724 100644
--- a/test/prism/ruby/location_test.rb
+++ b/test/prism/ruby/location_test.rb
@@ -140,6 +140,25 @@ def test_code_units
       assert_equal 7, location.end_code_units_column(Encoding::UTF_32LE)
     end
 
+    def test_code_units_handles_binary_encoding_with_multibyte_characters
+      # With a binary magic comment the emoji bytes cannot be transcoded, so
+      # code unit lookups must not raise. The columns asserted below are 4 for
+      # every encoding; they are still not the true code unit offsets.
+
+      program = Prism.parse(<<~RUBY).value
+        # -*- encoding: binary -*-
+
+        😀 + 😀
+      RUBY
+
+      # first 😀
+      location = program.statements.body.first.receiver.location
+
+      assert_equal 4, location.end_code_units_column(Encoding::UTF_8)
+      assert_equal 4, location.end_code_units_column(Encoding::UTF_16LE)
+      assert_equal 4, location.end_code_units_column(Encoding::UTF_32LE)
+    end
+
     def test_chop
       location = Prism.parse("foo").value.location