Skip to content

Commit

Permalink
Fixes encoding for Chinese characters #28
Browse files Browse the repository at this point in the history
  • Loading branch information
joshy authored Dec 17, 2021
1 parent 62240ba commit 82b83aa
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 9 deletions.
18 changes: 9 additions & 9 deletions striprtf/striprtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,11 @@ def rtf_to_text(text, errors="strict"):
Parameters
----------
text : str
The rtf text
The rtf text
errors : str
How to handle encoding errors. Default is "strict", which throws an error. Another
option is "ignore" which, as the name says, ignores encoding errors.
Returns
-------
str
Expand All @@ -104,7 +104,7 @@ def rtf_to_text(text, errors="strict"):
encoding = 'utf8'

for match in PATTERN.finditer(text):
word, arg, hex, char, brace, tchar = match.groups()
word, arg, _hex, char, brace, tchar = match.groups()
if brace:
curskip = 0
if brace == "{":
Expand Down Expand Up @@ -143,11 +143,12 @@ def rtf_to_text(text, errors="strict"):
curskip = 0
if word in destinations:
ignorable = True

# http://www.biblioscape.com/rtf15_spec.htm#Heading8
elif word == "ansicpg":
encoding = f"cp{arg}"

# Fix for issue #28: \fcharset134 is the GB2312 (Simplified Chinese) charset, so decode \'xx bytes as GBK.
elif word == "fcharset" and arg == "134":
encoding = "gbk"
elif ignorable:
pass
elif word in specialchars:
Expand All @@ -164,16 +165,15 @@ def rtf_to_text(text, errors="strict"):
c += 0x10000
out = out + chr(c).encode(encoding, errors)
curskip = ucskip
elif hex: # \'xx
elif _hex: # \'xx
if curskip > 0:
curskip -= 1
elif not ignorable:
c = int(hex, 16)
out = out + bytes.fromhex(hex)
c = int(_hex, 16)
out = out + bytes.fromhex(_hex)
elif tchar:
if curskip > 0:
curskip -= 1
elif not ignorable:
out = out + tchar.encode(encoding, errors)

return out.decode(encoding, errors)
13 changes: 13 additions & 0 deletions tests/rtf/issue_28.rtf
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{\rtf1\ansi\ansicpg1252\cocoartf2580
\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fnil\fcharset134 STSongti-SC-Regular;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
{\info
{\author Yi Lu}}\vieww11520\viewh8400\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0

\f0\fs24 \cf0 \'d3\'a1\'cb\'a2\'c7\'e9\'bf\'f6\'b7\'b4\'d3\'b3\'a3\'ba\
201-003-00155 (Multiple)\
\
\'ca\'d0\'d5\'fe\'b8\'ae\'c7\'e9\'bf\'f6\'b7\'b4\'d3\'b3\'a3\'ba\
022-021-00768 (Multiple)}
18 changes: 18 additions & 0 deletions tests/test_issue_28.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import unittest
from pathlib import Path

from striprtf.striprtf import rtf_to_text

RTF_DIR = Path.cwd() / "tests" / "rtf"
TEXT_DIR = Path.cwd() / "tests" / "text"


class TestUnicodeJapanese(unittest.TestCase):
    """Regression test for issue #28: Chinese text stored as ``\\'xx`` escapes.

    NOTE(review): the class and method names look inherited from another test
    module -- the fixture holds Chinese text and is not empty. They are kept
    unchanged so existing test selectors keep working.
    """

    def test_empty(self):
        """Convert ``issue_28.rtf`` and compare it with the expected text."""
        example_rtf = RTF_DIR / "issue_28.rtf"
        example_txt = TEXT_DIR / "issue_28.txt"

        # Read both fixtures with an explicit encoding so the test does not
        # depend on the platform's locale default. The RTF fixture is plain
        # ASCII (non-ASCII characters appear only as \'xx escapes), so UTF-8
        # reads it unchanged.
        with example_rtf.open(encoding="utf-8") as source:
            result = rtf_to_text(source.read(), errors="ignore")
        with example_txt.open(encoding="utf-8") as destination:
            self.assertEqual(destination.read(), result)
5 changes: 5 additions & 0 deletions tests/text/issue_28.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
印刷情况反映:
201-003-00155 (Multiple)

市政府情况反映:
022-021-00768 (Multiple)

0 comments on commit 82b83aa

Please sign in to comment.