Fixes encoding for Chinese characters #28

joshy · Dec 17, 2021 · 82b83aa
1 parent 62240ba
commit 82b83aa
Show file tree

Hide file tree

Showing 4 changed files with 45 additions and 9 deletions.
diff --git a/striprtf/striprtf.py b/striprtf/striprtf.py
@@ -86,11 +86,11 @@ def rtf_to_text(text, errors="strict"):
     Parameters
     ----------
     text : str
-        The rtf text 
+        The rtf text
     errors : str
         How to handle encoding errors. Default is "strict", which throws an error. Another
         option is "ignore" which, as the name says, ignores encoding errors.
-    
+
     Returns
     -------
     str
@@ -104,7 +104,7 @@ def rtf_to_text(text, errors="strict"):
     encoding = 'utf8'
 
     for match in PATTERN.finditer(text):
-        word, arg, hex, char, brace, tchar = match.groups()
+        word, arg, _hex, char, brace, tchar = match.groups()
         if brace:
             curskip = 0
             if brace == "{":
@@ -143,11 +143,12 @@ def rtf_to_text(text, errors="strict"):
             curskip = 0
             if word in destinations:
                 ignorable = True
-
             # http://www.biblioscape.com/rtf15_spec.htm#Heading8
             elif word == "ansicpg":
                 encoding = f"cp{arg}"
-
+            # fix for issue #28
+            elif word == "fcharset" and arg == "134":
+                encoding = "gbk"
             elif ignorable:
                 pass
             elif word in specialchars:
@@ -164,16 +165,15 @@ def rtf_to_text(text, errors="strict"):
                         c += 0x10000
                     out = out + chr(c).encode(encoding, errors)
                     curskip = ucskip
-        elif hex:  # \'xx
+        elif _hex:  # \'xx
             if curskip > 0:
                 curskip -= 1
             elif not ignorable:
-                c = int(hex, 16)
-                out = out + bytes.fromhex(hex)
+                c = int(_hex, 16)
+                out = out + bytes.fromhex(_hex)
         elif tchar:
             if curskip > 0:
                 curskip -= 1
             elif not ignorable:
                 out = out + tchar.encode(encoding, errors)
-
     return out.decode(encoding, errors)
diff --git a/tests/rtf/issue_28.rtf b/tests/rtf/issue_28.rtf
@@ -0,0 +1,13 @@
+{\rtf1\ansi\ansicpg1252\cocoartf2580
+\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fnil\fcharset134 STSongti-SC-Regular;}
+{\colortbl;\red255\green255\blue255;}
+{\*\expandedcolortbl;;}
+{\info
+{\author Yi Lu}}\vieww11520\viewh8400\viewkind0
+\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
+
+\f0\fs24 \cf0 \'d3\'a1\'cb\'a2\'c7\'e9\'bf\'f6\'b7\'b4\'d3\'b3\'a3\'ba\
+201-003-00155 (Multiple)\
+\
+\'ca\'d0\'d5\'fe\'b8\'ae\'c7\'e9\'bf\'f6\'b7\'b4\'d3\'b3\'a3\'ba\
+022-021-00768 (Multiple)}
diff --git a/tests/test_issue_28.py b/tests/test_issue_28.py
@@ -0,0 +1,18 @@
+import unittest
+from pathlib import Path
+
+from striprtf.striprtf import rtf_to_text
+
+RTF_DIR = Path.cwd() / "tests" / "rtf"
+TEXT_DIR = Path.cwd() / "tests" / "text"
+
+
+class TestUnicodeJapanese(unittest.TestCase):
+    def test_empty(self):
+        example_rtf = RTF_DIR / "issue_28.rtf"
+        example_txt = TEXT_DIR / "issue_28.txt"
+
+        with example_rtf.open() as source:
+            result = rtf_to_text(source.read(), errors="ignore")
+        with example_txt.open(encoding="utf-8") as destination:
+            self.assertEqual(destination.read(), result)
diff --git a/tests/text/issue_28.txt b/tests/text/issue_28.txt
@@ -0,0 +1,5 @@
+印刷情况反映：
+201-003-00155 (Multiple)
+
+市政府情况反映：
+022-021-00768 (Multiple)