Support non-ASCII characters in ISO-8859-x charset encodings (#3310)

Co-authored-by: Michael Grund <[email protected]> Co-authored-by: Yvonne Fröhlich <[email protected]>
GenericMappingTools · Jul 23, 2024 · 3502252 · 3502252
1 parent e746156
commit 3502252
Show file tree

Hide file tree

Showing 7 changed files with 215 additions and 38 deletions.
diff --git a/doc/techref/encodings.md b/doc/techref/encodings.md
@@ -1,14 +1,12 @@
 # Supported Encodings and Non-ASCII Characters
 
-GMT supports a number of encodings and each encoding contains a set of ASCII and non-ASCII
-characters. Below are some of the most common encodings and characters that are supported.
+GMT supports a number of encodings and each encoding contains a set of ASCII and
+non-ASCII characters. In PyGMT, you can use any of these ASCII and non-ASCII characters
+in arguments and text strings. When using non-ASCII characters in PyGMT, the easiest way
+is to copy and paste the character from the encoding tables below.
 
-In PyGMT, you can use any of these ASCII and non-ASCII characters in arguments and text
-strings. When using non-ASCII characters in PyGMT, the easiest way is to copy and paste
-the character from the tables below.
-
-**Note**: The special character &#xfffd; (REPLACEMENT CHARACTER) is used to indicate that
-the character is not defined in the encoding.
+**Note**: The special character &#xfffd; (REPLACEMENT CHARACTER) is used to indicate
+that the character is not defined in the encoding.
 
 ## Adobe ISOLatin1+ Encoding
 
@@ -106,3 +104,27 @@ the Unicode character set.
 | **\35x** | &#x27a8; | &#x27a9; | &#x27aa; | &#x27ab; | &#x27ac; | &#x27ad; | &#x27ae; | &#x27af; |
 | **\36x** | &#xfffd; | &#x27b1; | &#x27b2; | &#x27b3; | &#x27b4; | &#x27b5; | &#x27b6; | &#x27b7; |
 | **\37x** | &#x27b8; | &#x27b9; | &#x27ba; | &#x27bb; | &#x27bc; | &#x27bd; | &#x27be; | &#xfffd; |
+
+## ISO/IEC 8859
+
+GMT also supports the ISO/IEC 8859 standard for 8-bit character encodings. Refer to
+<https://en.wikipedia.org/wiki/ISO/IEC_8859> for descriptions of the different parts of
+the standard.
+
+For a list of the characters in each part of the standard, refer to the following links:
+
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-1>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-2>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-3>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-4>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-5>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-6>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-7>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-8>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-9>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-10>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-11>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-13>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-14>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-15>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-16>
diff --git a/pygmt/encodings.py b/pygmt/encodings.py
@@ -1,13 +1,13 @@
 """
-Adobe character encodings supported by GMT.
+Character encodings supported by GMT.
 
-Currently, only Adobe Symbol, Adobe ZapfDingbats, and Adobe ISOLatin1+ encodings are
-supported.
+Currently, Adobe Symbol, Adobe ZapfDingbats, Adobe ISOLatin1+ and ISO-8859-x (x can be
+1-11, 13-16) encodings are supported. Adobe Standard encoding is not supported.
 
-The corresponding Unicode characters in each Adobe character encoding are generated
-from the mapping table and conversion script in the GMT-octal-codes
-(https://github.com/seisman/GMT-octal-codes) repository. Refer to that repository for
-details.
+The corresponding Unicode characters in each Adobe character encoding are generated from
+the mapping tables and conversion scripts in the
+`GMT-octal-codes repository <https://github.com/seisman/GMT-octal-codes>`__. Refer to
+that repository for details.
 
 Some code points are undefined and are assigned the replacement character
 (``\ufffd``).
@@ -16,14 +16,17 @@
 ----------
 
 - GMT-octal-codes: https://github.com/seisman/GMT-octal-codes
-- GMT official documentation: https://docs.generic-mapping-tools.org/dev/reference/octal-codes.html
+- GMT documentation: https://docs.generic-mapping-tools.org/dev/reference/octal-codes.html
 - Adobe PostScript Language Reference: https://www.adobe.com/jp/print/postscript/pdfs/PLRM.pdf
-- ISOLatin1+: https://en.wikipedia.org/wiki/PostScript_Latin_1_Encoding
+- Adobe ISOLatin1+: https://en.wikipedia.org/wiki/PostScript_Latin_1_Encoding
 - Adobe Symbol: https://en.wikipedia.org/wiki/Symbol_(typeface)
-- Zapf Dingbats: https://en.wikipedia.org/wiki/Zapf_Dingbats
+- Adobe ZapfDingbats: https://en.wikipedia.org/wiki/Zapf_Dingbats
 - Adobe Glyph List: https://github.com/adobe-type-tools/agl-aglfn
+- ISO-8859: https://en.wikipedia.org/wiki/ISO/IEC_8859
 """
 
+import codecs
+
 # Dictionary of character mappings for different encodings.
 charset: dict = {}
 
@@ -129,3 +132,12 @@
         strict=False,
     )
 )
+
+# ISO-8859-x charsets, where x can be 1-11 or 13-16.
+for i in range(1, 17):
+    if i == 12:  # ISO-8859-12 was abandoned.
+        continue
+    charset[f"ISO-8859-{i}"] = {
+        code: codecs.decode(bytes([code]), f"iso8859_{i}", errors="replace")
+        for code in [*range(0o040, 0o200), *range(0o240, 0o400)]
+    }
diff --git a/pygmt/helpers/__init__.py b/pygmt/helpers/__init__.py
@@ -15,6 +15,7 @@
     unique_name,
 )
 from pygmt.helpers.utils import (
+    _check_encoding,
     _validate_data_input,
     args_in_kwargs,
     build_arg_list,

diff --git a/pygmt/helpers/utils.py b/pygmt/helpers/utils.py
@@ -115,6 +115,78 @@ def _validate_data_input(
                 raise GMTInvalidInput("data must provide x, y, and z columns.")
 
 
+def _check_encoding(
+    argstr: str,
+) -> Literal[
+    "ascii",
+    "ISOLatin1+",
+    "ISO-8859-1",
+    "ISO-8859-2",
+    "ISO-8859-3",
+    "ISO-8859-4",
+    "ISO-8859-5",
+    "ISO-8859-6",
+    "ISO-8859-7",
+    "ISO-8859-8",
+    "ISO-8859-9",
+    "ISO-8859-10",
+    "ISO-8859-11",
+    "ISO-8859-13",
+    "ISO-8859-14",
+    "ISO-8859-15",
+    "ISO-8859-16",
+]:
+    """
+    Check the charset encoding of a string.
+
+    All characters in the string must be in the same charset encoding, otherwise the
+    default ``ISOLatin1+`` encoding is returned. Characters in the Adobe Symbol and
+    ZapfDingbats encodings are also checked because they're independent of the choice of
+    encodings.
+
+    Parameters
+    ----------
+    argstr
+        The string to be checked.
+
+    Returns
+    -------
+    encoding
+        The encoding of the string.
+
+    Examples
+    --------
+    >>> _check_encoding("123ABC+-?!")  # ASCII characters only
+    'ascii'
+    >>> _check_encoding("12AB±β①②")  # Characters in ISOLatin1+
+    'ISOLatin1+'
+    >>> _check_encoding("12ABāáâãäåβ①②")  # Characters in ISO-8859-4
+    'ISO-8859-4'
+    >>> _check_encoding("12ABŒā")  # Mix characters in ISOLatin1+ (Œ) and ISO-8859-4 (ā)
+    'ISOLatin1+'
+    >>> _check_encoding("123AB中文")  # Characters not in any charset encoding
+    'ISOLatin1+'
+    """
+    # Return "ascii" if the string only contains printable ASCII characters.
+    if all(32 <= ord(c) <= 126 for c in argstr):
+        return "ascii"
+    # Loop through all supported encodings and check if all characters in the string
+    # are in the charset of the encoding. If all characters are in the charset, return
+    # the encoding. The ISOLatin1+ encoding is checked first because it is the default
+    # and most common encoding.
+    adobe_chars = set(charset["Symbol"].values()) | set(
+        charset["ZapfDingbats"].values()
+    )
+    for encoding in ["ISOLatin1+"] + [f"ISO-8859-{i}" for i in range(1, 17)]:
+        if encoding == "ISO-8859-12":  # ISO-8859-12 was abandoned. Skip it.
+            continue
+        if all(c in (set(charset[encoding].values()) | adobe_chars) for c in argstr):
+            return encoding  # type: ignore[return-value]
+    # Return the "ISOLatin1+" encoding if the string contains characters from multiple
+    # charset encodings or contains characters that are not in any charset encoding.
+    return "ISOLatin1+"
+
+
 def data_kind(
     data: Any = None, required: bool = True
 ) -> Literal["arg", "file", "geojson", "grid", "image", "matrix", "vectors"]:
@@ -192,17 +264,41 @@ def data_kind(
     return kind
 
 
-def non_ascii_to_octal(argstr: str) -> str:
+def non_ascii_to_octal(
+    argstr: str,
+    encoding: Literal[
+        "ascii",
+        "ISOLatin1+",
+        "ISO-8859-1",
+        "ISO-8859-2",
+        "ISO-8859-3",
+        "ISO-8859-4",
+        "ISO-8859-5",
+        "ISO-8859-6",
+        "ISO-8859-7",
+        "ISO-8859-8",
+        "ISO-8859-9",
+        "ISO-8859-10",
+        "ISO-8859-11",
+        "ISO-8859-13",
+        "ISO-8859-14",
+        "ISO-8859-15",
+        "ISO-8859-16",
+    ] = "ISOLatin1+",
+) -> str:
     r"""
     Translate non-ASCII characters to their corresponding octal codes.
 
-    Currently, only characters in the ISOLatin1+ charset and Symbol/ZapfDingbats fonts
-    are supported.
+    Currently, only non-ASCII characters in the Adobe ISOLatin1+, Adobe Symbol, Adobe
+    ZapfDingbats, and ISO-8859-x (x can be 1-11, 13-16) encodings are supported.
+    The Adobe Standard encoding is not supported yet.
 
     Parameters
     ----------
     argstr
         The string to be translated.
+    encoding
+        The encoding of characters in the string.
 
     Returns
     -------
@@ -219,9 +315,11 @@ def non_ascii_to_octal(argstr: str) -> str:
     '@%34%\\041@%%@%34%\\176@%%@%34%\\241@%%@%34%\\376@%%'
     >>> non_ascii_to_octal("ABC ±120° DEF α ♥")
     'ABC \\261120\\260 DEF @~\\141@~ @%34%\\252@%%'
+    >>> non_ascii_to_octal("12ABāáâãäåβ①②", encoding="ISO-8859-4")
+    '12AB\\340\\341\\342\\343\\344\\345@~\\142@~@%34%\\254@%%@%34%\\255@%%'
     """  # noqa: RUF002
-    # Return the string if it only contains printable ASCII characters from 32 to 126.
-    if all(32 <= ord(c) <= 126 for c in argstr):
+    # Return the input string if it only contains printable ASCII characters.
+    if encoding == "ascii" or all(32 <= ord(c) <= 126 for c in argstr):
         return argstr
 
     # Dictionary mapping non-ASCII characters to octal codes
@@ -232,15 +330,15 @@ def non_ascii_to_octal(argstr: str) -> str:
     mapping.update(
         {c: f"@%34%\\{i:03o}@%%" for i, c in charset["ZapfDingbats"].items()}
     )
-    # Adobe ISOLatin1+ charset. Put at the end.
-    mapping.update({c: f"\\{i:03o}" for i, c in charset["ISOLatin1+"].items()})
+    # ISOLatin1+ or ISO-8859-x charset.
+    mapping.update({c: f"\\{i:03o}" for i, c in charset[encoding].items()})
 
     # Remove any printable characters
     mapping = {k: v for k, v in mapping.items() if k not in string.printable}
     return argstr.translate(str.maketrans(mapping))
 
 
-def build_arg_list(
+def build_arg_list(  # noqa: PLR0912
     kwdict: dict[str, Any],
     confdict: dict[str, str] | None = None,
     infile: str | pathlib.PurePath | Sequence[str | pathlib.PurePath] | None = None,
@@ -310,6 +408,10 @@ def build_arg_list(
     ...     )
     ... )
     ['f1.txt', 'f2.txt', '-A0', '-B', '--FORMAT_DATE_MAP=o dd', '->out.txt']
+    >>> build_arg_list(dict(B="12ABāβ①②"))
+    ['-B12AB\\340@~\\142@~@%34%\\254@%%@%34%\\255@%%', '--PS_CHAR_ENCODING=ISO-8859-4']
+    >>> build_arg_list(dict(B="12ABāβ①②"), confdict=dict(PS_CHAR_ENCODING="ISO-8859-5"))
+    ['-B12AB\\340@~\\142@~@%34%\\254@%%@%34%\\255@%%', '--PS_CHAR_ENCODING=ISO-8859-5']
     >>> print(build_arg_list(dict(R="1/2/3/4", J="X4i", watre=True)))
     Traceback (most recent call last):
       ...
@@ -324,11 +426,22 @@ def build_arg_list(
         elif value is True:
             gmt_args.append(f"-{key}")
         elif is_nonstr_iter(value):
-            gmt_args.extend(non_ascii_to_octal(f"-{key}{_value}") for _value in value)
+            gmt_args.extend(f"-{key}{_value}" for _value in value)
         else:
-            gmt_args.append(non_ascii_to_octal(f"-{key}{value}"))
+            gmt_args.append(f"-{key}{value}")
+
+    # Convert non-ASCII characters (if any) in the arguments to octal codes
+    encoding = _check_encoding("".join(gmt_args))
+    if encoding != "ascii":
+        gmt_args = [non_ascii_to_octal(arg, encoding=encoding) for arg in gmt_args]
     gmt_args = sorted(gmt_args)
 
+    # Set --PS_CHAR_ENCODING=encoding if necessary
+    if encoding not in {"ascii", "ISOLatin1+"} and not (
+        confdict and "PS_CHAR_ENCODING" in confdict
+    ):
+        gmt_args.append(f"--PS_CHAR_ENCODING={encoding}")
+
     if confdict:
         gmt_args.extend(f"--{key}={value}" for key, value in confdict.items())
 

diff --git a/pygmt/src/text.py b/pygmt/src/text.py
@@ -6,6 +6,7 @@
 from pygmt.clib import Session
 from pygmt.exceptions import GMTInvalidInput
 from pygmt.helpers import (
+    _check_encoding,
     build_arg_list,
     data_kind,
     fmt_docstring,
@@ -59,13 +60,12 @@ def text_(  # noqa: PLR0912
     - ``x``/``y``, and ``text``
     - ``position`` and ``text``
 
-    The text strings passed via the ``text`` parameter can contain ASCII
-    characters and non-ASCII characters defined in the ISOLatin1+ encoding
-    (i.e., IEC_8859-1), and the Symbol and ZapfDingbats character sets.
-    See :gmt-docs:`reference/octal-codes.html` for the full list of supported
-    non-ASCII characters.
+    The text strings passed via the ``text`` parameter can contain ASCII characters and
+    non-ASCII characters defined in the Adobe ISOLatin1+, Adobe Symbol, Adobe
+    ZapfDingbats and ISO-8859-x (x can be 1-11, 13-16) encodings. Refer to
+    :doc:`/techref/encodings` for the full list of supported non-ASCII characters.
 
-    Full option list at :gmt-docs:`text.html`
+    Full option list at :gmt-docs:`text.html`.
 
     {aliases}
 
@@ -226,13 +226,24 @@ def text_(  # noqa: PLR0912
         kwargs["t"] = ""
 
     # Append text at last column. Text must be passed in as str type.
+    confdict = {}
     if kind == "vectors":
-        extra_arrays.append(
-            np.vectorize(non_ascii_to_octal)(np.atleast_1d(text).astype(str))
-        )
+        text = np.atleast_1d(text).astype(str)
+        encoding = _check_encoding("".join(text))
+        if encoding != "ascii":
+            text = np.vectorize(non_ascii_to_octal, excluded="encoding")(
+                text, encoding=encoding
+            )
+        extra_arrays.append(text)
+
+        if encoding not in {"ascii", "ISOLatin1+"}:
+            confdict = {"PS_CHAR_ENCODING": encoding}
 
     with Session() as lib:
         with lib.virtualfile_in(
             check_kind="vector", data=textfiles, x=x, y=y, extra_arrays=extra_arrays
         ) as vintbl:
-            lib.call_module(module="text", args=build_arg_list(kwargs, infile=vintbl))
+            lib.call_module(
+                module="text",
+                args=build_arg_list(kwargs, infile=vintbl, confdict=confdict),
+            )
diff --git a/pygmt/tests/baseline/test_text_nonascii_iso8859.png.dvc b/pygmt/tests/baseline/test_text_nonascii_iso8859.png.dvc
@@ -0,0 +1,5 @@
+outs:
+- md5: a0f35a1d58c95e6589c7397e7660e946
+  size: 17089
+  hash: md5
+  path: test_text_nonascii_iso8859.png
diff --git a/pygmt/tests/test_text.py b/pygmt/tests/test_text.py
@@ -434,3 +434,16 @@ def test_text_quotation_marks():
     fig.basemap(projection="X4c/2c", region=[0, 4, 0, 2], frame=0)
     fig.text(x=2, y=1, text='\\234 ‘ ’ " “ ”', font="20p")  # noqa: RUF001
     return fig
+
+
+@pytest.mark.mpl_image_compare
+def test_text_nonascii_iso8859():
+    """
+    Test passing text strings with non-ASCII characters in ISO-8859-4 encoding.
+    """
+    fig = Figure()
+    fig.basemap(region=[0, 10, 0, 10], projection="X10c", frame=["WSEN+tAāáâãäåB"])
+    fig.text(position="TL", text="position-text:1ÉĘËĖ2")
+    fig.text(x=1, y=1, text="xytext:1éęëė2")
+    fig.text(x=[5, 5], y=[3, 5], text=["xytext1:ųúûüũūαζ∆❡", "xytext2:íîī∑π∇✉"])
+    return fig