Merge pull request #13 from apple1417/master

improve html plain text conversions
bl-sdk · Jan 29, 2024 · 6b0d0ae · 6b0d0ae
2 parents 0b95cbe + 370d80b
commit 6b0d0ae
Show file tree

Hide file tree

Showing 6 changed files with 128 additions and 35 deletions.
diff --git a/src/console_mod_menu/draw.py b/src/console_mod_menu/draw.py
@@ -23,3 +23,15 @@ def draw(msg: str, indent: int = 0) -> None:
 
     for line in wrapper.fill(html_to_plain_text(msg)).splitlines():
         print(prefix, line)
+
+
+def draw_description(description: str, indent: int = 0) -> None:
+    """
+    Draws a message coming from a mod/option description - honoring existing newlines.
+
+    The description is converted from HTML to plain text once, then drawn line by line so that
+    the newlines it contains are kept rather than being re-wrapped arbitrarily.
+
+    Args:
+        description: The description to write.
+        indent: How much to indent the message.
+    """
+    # Imported locally so this function does not rely on the module's import list
+    from html import escape
+
+    for line in html_to_plain_text(description).splitlines():
+        # `draw` runs its own HTML to plain text conversion on whatever it is given. Re-escape the
+        # already converted line so that second pass round-trips, instead of decoding entities
+        # twice or stripping literal angle brackets which originally came from `&lt;`/`&gt;`.
+        draw(escape(line, quote=False), indent)
diff --git a/src/console_mod_menu/option_formatting.py b/src/console_mod_menu/option_formatting.py
@@ -2,7 +2,7 @@
 
 from mods_base import JSON, BaseOption, BoolOption, KeybindOption, ValueOption
 
-from .draw import draw
+from .draw import draw, draw_description
 from .screens import draw_stack_header
 
 _J = TypeVar("_J", bound=JSON)
@@ -64,8 +64,6 @@ def draw_option_header(option: BaseOption) -> None:
 
     if len(option.description) > 0:
         draw("=" * 32)
-        # Respect newlines - passing everything at once would let them get wrapped arbitrarily
-        for line in option.description.splitlines():
-            draw(line)
+        draw_description(option.description)
 
     draw("")
diff --git a/src/console_mod_menu/screens/mod.py b/src/console_mod_menu/screens/mod.py
@@ -17,7 +17,7 @@
 )
 from unrealsdk import logging
 
-from console_mod_menu.draw import draw
+from console_mod_menu.draw import draw, draw_description
 from console_mod_menu.option_formatting import draw_option_header, get_option_value_str
 
 from . import (
@@ -140,7 +140,7 @@ def draw(self) -> None:  # noqa: D102
         draw("")
 
         if self.mod.description:
-            draw(self.mod.description)
+            draw_description(self.mod.description)
             draw("")
 
         if not self.mod.enabling_locked:

diff --git a/src/mods_base/__init__.py b/src/mods_base/__init__.py
@@ -36,14 +36,14 @@
     command,
     remove_next_console_line_capture,
 )
-from .hook import hook
+from .hook import HookProtocol, hook
+from .html_to_plain_text import html_to_plain_text
 from .keybinds import EInputEvent, KeybindType, keybind
 from .mod import Game, Library, Mod, ModType
 from .mod_factory import build_mod
 from .mod_list import (
     deregister_mod,
     get_ordered_mod_list,
-    html_to_plain_text,
     register_mod,
 )
 from .options import (
@@ -83,6 +83,7 @@
     "GroupedOption",
     "HiddenOption",
     "hook",
+    "HookProtocol",
     "html_to_plain_text",
     "JSON",
     "keybind",
@@ -91,9 +92,9 @@
     "Library",
     "Mod",
     "MODS_DIR",
-    "open_in_mod_dir",
     "ModType",
     "NestedOption",
+    "open_in_mod_dir",
     "raw_keybinds",
     "register_mod",
     "remove_next_console_line_capture",

diff --git a/src/mods_base/html_to_plain_text.py b/src/mods_base/html_to_plain_text.py
@@ -0,0 +1,103 @@
+# ruff: noqa: D102
+
+from dataclasses import dataclass
+from functools import cache
+from html.parser import HTMLParser
+
+
+@dataclass
+class OrderedList:
+    """State tracked for an `<ol>` element while it is open."""
+
+    # Number used as the prefix of the next list item; incremented after each `<li>`
+    num: int = 1
+
+
+@dataclass
+class UnorderedList:
+    """Marker pushed for an `<ul>` element while it is open - it carries no state."""
+
+    pass
+
+
+class PlainTextHTMLConverter(HTMLParser):
+    """
+    HTML parser which builds up a plain text rendering of whatever is fed to it.
+
+    Text is appended as the parser reports it. `<br>`, `<ol>`/`<ul>`/`<li>` and `<img>` are
+    substituted for plain text equivalents, all other tags are dropped.
+
+    Attributes:
+        plain_text: The plain text extracted so far.
+        list_item_stack: State for each currently open list, innermost last.
+    """
+
+    plain_text: str
+    list_item_stack: list[OrderedList | UnorderedList]
+
+    def __init__(self) -> None:
+        super().__init__()
+
+        self.plain_text = ""
+        self.list_item_stack = []
+
+    def handle_data(self, data: str) -> None:
+        self.plain_text += data
+
+    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+        match tag.lower():
+            case "br":
+                self.plain_text += "\n"
+
+            case "ol":
+                self.plain_text += "\n"
+                self.list_item_stack.append(OrderedList())
+
+            case "ul":
+                self.plain_text += "\n"
+                self.list_item_stack.append(UnorderedList())
+
+            case "li":
+                # A list item outside of any list gets no prefix
+                if len(self.list_item_stack) >= 1:
+                    list_state = self.list_item_stack[-1]
+                    match list_state:
+                        case OrderedList():
+                            self.plain_text += f"{list_state.num}. "
+                            list_state.num += 1
+                        case UnorderedList():
+                            self.plain_text += "- "
+
+            case "img":
+                # Only the first alt attribute with a value is used
+                for name, val in attrs:
+                    if name.lower() == "alt" and val is not None:
+                        self.plain_text += val
+                        break
+
+            case _:
+                pass
+
+    def handle_endtag(self, tag: str) -> None:
+        match tag.lower():
+            case "ol":
+                self._close_list(OrderedList)
+
+            case "ul":
+                self._close_list(UnorderedList)
+
+            case "li":
+                self.plain_text += "\n"
+
+            case _:
+                pass
+
+    def _close_list(self, list_type: type[OrderedList | UnorderedList]) -> None:
+        """
+        Pops the innermost open list, if it is of the given type.
+
+        Does nothing when no list is open, so that a stray closing tag in malformed input is
+        ignored rather than raising an IndexError. A closing tag which doesn't match the
+        innermost list is also ignored.
+
+        Args:
+            list_type: The type of list the closing tag belongs to.
+        """
+        if self.list_item_stack and isinstance(self.list_item_stack[-1], list_type):
+            self.list_item_stack.pop()
+
+
+@cache
+def html_to_plain_text(html: str) -> str:
+    """
+    Extracts plain text from HTML-containing text. This is *NOT* input sanitisation.
+
+    Removes most tags in place, and decodes entities - `<b>&amp;</b>` becomes `&`.
+
+    A few tags are substituted for plain text equivalents:
+    - `<br>` becomes a newline
+    - `<ol><li>` becomes `1. ` (incrementing with each list item)
+    - `<ul><li>` becomes `- `
+    - `<img alt='xyz'>` becomes its alt text
+
+    Intended for use when accessing a mod name/description/option/etc., which may contain HTML tags,
+    but in a situation where such tags would be inappropriate.
+
+    Results are cached per input string.
+
+    Args:
+        html: The HTML-containing text.
+    Returns:
+        The extracted plain text.
+    """
+    parser = PlainTextHTMLConverter()
+    parser.feed(html)
+    # The parser holds back trailing text it can't classify yet in case more input follows - e.g.
+    # the `&T` in `AT&T` might be the start of an entity. Closing marks the end of the input and
+    # flushes that text, which would otherwise be silently dropped from the result.
+    parser.close()
+    return parser.plain_text
diff --git a/src/mods_base/mod_list.py b/src/mods_base/mod_list.py
@@ -1,7 +1,6 @@
 import os
 from dataclasses import dataclass, field
 from functools import cmp_to_key
-from html.parser import HTMLParser
 from pathlib import Path
 
 import pyunrealsdk
@@ -10,6 +9,7 @@
 from . import MODS_DIR, __version__
 from .command import AbstractCommand
 from .hook import HookProtocol
+from .html_to_plain_text import html_to_plain_text
 from .keybinds import KeybindType
 from .mod import Game, Library, Mod, ModType
 from .options import BaseOption, ButtonOption
@@ -56,9 +56,11 @@ def description(self) -> str:
         # Once already sorted, re-sorting should be relatively quick
         self.components.sort(key=lambda c: c.name.lower())
 
-        description = "Components:\n"
+        description = "Components:"
+        description += "<ul>"
         for comp in self.components:
-            description += f"- {comp.name}: {comp.version}\n"
+            description += f"<li>{comp.name}: {comp.version}</li>"
+        description += "</ul>"
 
         return description
 
@@ -118,29 +120,6 @@ def deregister_mod(mod: Mod) -> None:
     mod_list.remove(mod)
 
 
-def html_to_plain_text(html: str) -> str:
-    """
-    Extracts plain text from HTML-containing text. This is *NOT* input sanitisation.
-
-    Removes tags, and decodes entities - `<b>&amp;</b>` becomes `&`.
-
-    Intended for use when accessing a mod name/description/option/etc., which may contain HTML tags,
-    but in a situation where such tags would be inappropriate.
-
-    Args:
-        html: The HTML-containing text.
-    Returns:
-        The extracted plain text.
-    """
-    extracted_data: list[str] = []
-
-    parser = HTMLParser()
-    parser.handle_data = lambda data: extracted_data.append(data)
-    parser.feed(html)
-
-    return "".join(extracted_data)
-
-
 def get_ordered_mod_list() -> list[Mod]:
     """
     Gets the list of mods, in display order.