diff --git a/striprtf/striprtf.py b/striprtf/striprtf.py index 2e14342..1c8a260 100644 --- a/striprtf/striprtf.py +++ b/striprtf/striprtf.py @@ -86,11 +86,11 @@ def rtf_to_text(text, errors="strict"): Parameters ---------- text : str - The rtf text + The rtf text errors : str How to handle encoding errors. Default is "strict", which throws an error. Another option is "ignore" which, as the name says, ignores encoding errors. - + Returns ------- str @@ -104,7 +104,7 @@ def rtf_to_text(text, errors="strict"): encoding = 'utf8' for match in PATTERN.finditer(text): - word, arg, hex, char, brace, tchar = match.groups() + word, arg, _hex, char, brace, tchar = match.groups() if brace: curskip = 0 if brace == "{": @@ -143,11 +143,12 @@ def rtf_to_text(text, errors="strict"): curskip = 0 if word in destinations: ignorable = True - # http://www.biblioscape.com/rtf15_spec.htm#Heading8 elif word == "ansicpg": encoding = f"cp{arg}" - + # fix for issue #28 + elif word == "fcharset" and arg == "134": + encoding = "gbk" elif ignorable: pass elif word in specialchars: @@ -164,16 +165,15 @@ def rtf_to_text(text, errors="strict"): c += 0x10000 out = out + chr(c).encode(encoding, errors) curskip = ucskip - elif hex: # \'xx + elif _hex: # \'xx if curskip > 0: curskip -= 1 elif not ignorable: - c = int(hex, 16) - out = out + bytes.fromhex(hex) + c = int(_hex, 16) + out = out + bytes.fromhex(_hex) elif tchar: if curskip > 0: curskip -= 1 elif not ignorable: out = out + tchar.encode(encoding, errors) - return out.decode(encoding, errors) diff --git a/tests/rtf/issue_28.rtf b/tests/rtf/issue_28.rtf new file mode 100644 index 0000000..377dc4a --- /dev/null +++ b/tests/rtf/issue_28.rtf @@ -0,0 +1,13 @@ +{\rtf1\ansi\ansicpg1252\cocoartf2580 +\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fnil\fcharset134 STSongti-SC-Regular;} +{\colortbl;\red255\green255\blue255;} +{\*\expandedcolortbl;;} +{\info +{\author Yi Lu}}\vieww11520\viewh8400\viewkind0 
+\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
+
+\f0\fs24 \cf0 \'d3\'a1\'cb\'a2\'c7\'e9\'bf\'f6\'b7\'b4\'d3\'b3\'a3\'ba\
+201-003-00155 (Multiple)\
+\
+\'ca\'d0\'d5\'fe\'b8\'ae\'c7\'e9\'bf\'f6\'b7\'b4\'d3\'b3\'a3\'ba\
+022-021-00768 (Multiple)}
\ No newline at end of file
diff --git a/tests/test_issue_28.py b/tests/test_issue_28.py
new file mode 100644
index 0000000..8ea529d
--- /dev/null
+++ b/tests/test_issue_28.py
@@ -0,0 +1,31 @@
+"""Regression test for issue #28: hex escapes in a font with fcharset 134."""
+
+import unittest
+from pathlib import Path
+
+from striprtf.striprtf import rtf_to_text
+
+# Resolve fixtures relative to this file so the test does not depend on the
+# directory the test runner was started from.
+TESTS_DIR = Path(__file__).resolve().parent
+RTF_DIR = TESTS_DIR / "rtf"
+TEXT_DIR = TESTS_DIR / "text"
+
+
+class TestUnicodeJapanese(unittest.TestCase):
+    """Check that the issue_28.rtf fixture converts to the text in issue_28.txt.
+
+    The class and method names are kept as committed; the fixture itself is
+    Chinese text written as hex escapes.
+    """
+
+    def test_empty(self):
+        example_rtf = RTF_DIR / "issue_28.rtf"
+        example_txt = TEXT_DIR / "issue_28.txt"
+
+        # The RTF fixture is plain ASCII; name the encoding explicitly so the
+        # read does not depend on the platform's locale default.
+        with example_rtf.open(encoding="utf-8") as source:
+            result = rtf_to_text(source.read(), errors="ignore")
+        with example_txt.open(encoding="utf-8") as destination:
+            self.assertEqual(destination.read(), result)
diff --git a/tests/text/issue_28.txt b/tests/text/issue_28.txt
new file mode 100644
index 0000000..8a534c8
--- /dev/null
+++ b/tests/text/issue_28.txt
@@ -0,0 +1,5 @@
+印刷情况反映:
+201-003-00155 (Multiple)
+
+市政府情况反映:
+022-021-00768 (Multiple)
\ No newline at end of file