Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Abbr Extension: Definition Sorting and Glossary storage #1467

Merged
merged 7 commits into from
Jun 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,21 @@ better reflects what it is. `AbbrPreprocessor` has been deprecated.

A call to `Markdown.reset()` now clears all previously defined abbreviations.

Abbreviations are now sorted by length, longest first, before `AbbrTreeprocessor`
runs. This ensures that a multi-word abbreviation is matched even when one of its
component words is itself defined as an abbreviation. (#1465)

Abbreviations without a definition are now ignored. This avoids applying
abbr tags to text without a title value.

Added an optional `glossary` configuration option to the abbreviations extension.
This provides a simple and efficient way to apply a dictionary of abbreviations
to every page.

Abbreviations can now be disabled by setting their definition to `""` or `''`.
This can be useful when using the `glossary` option.


### Fixed

* Fixed links to source code on GitHub from the documentation (#1453).
Expand Down
17 changes: 16 additions & 1 deletion docs/extensions/abbreviations.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,25 @@ Usage
See [Extensions](index.md) for general extension usage. Use `abbr` as the name
of the extension.

This extension does not accept any special configuration options.
The following options are provided to configure the output:

* **`glossary`**:
A dictionary where the `key` is the abbreviation and the `value` is the definition.

A trivial example:

```python
markdown.markdown(some_text, extensions=['abbr'])
```

Disabling Abbreviations
-----------------------

When using the `glossary` option, there may be times when you need to turn off
a specific abbreviation. To do this, set the abbreviation's definition to `''` or `""`.

```md
The HTML abbreviation is disabled on this page.

*[HTML]: ''
```
60 changes: 45 additions & 15 deletions markdown/extensions/abbr.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,38 @@ class AbbrExtension(Extension):

def __init__(self, **kwargs):
    """ Initiate Extension and set up configs. """
    # Each option maps to a two-item list. The first item is the value (read back elsewhere in
    # this class as `self.config['glossary'][0]`); the second is its human-readable description.
    self.config = {
        'glossary': [
            {},
            # The trailing space keeps the two adjacent literals from running together
            # ("...definition.Default: ...") when Python concatenates them.
            'A dictionary where the `key` is the abbreviation and the `value` is the definition. '
            'Default: `{}`'
        ],
    }
    """ Default configuration options. """
    # Keyword arguments are handed to the base class initializer unchanged.
    super().__init__(**kwargs)
    # Abbreviations currently in effect: abbreviation -> definition.
    self.abbrs = {}
    # Abbreviations loaded through `load_glossary()`; copied into `self.abbrs` by `reset()`.
    self.glossary = {}

def reset(self):
    """ Drop every active abbreviation, then re-seed the active set from the glossary. """
    # `clear()` empties the existing dict object in place; that same object is what
    # `extendMarkdown` passes to both processors.
    self.abbrs.clear()
    if not self.glossary:
        return
    self.abbrs.update(self.glossary)

def reset_glossary(self):
    """ Remove every entry from the glossary. """
    # Only the glossary dict is emptied (in place); `self.abbrs` is not touched here.
    self.glossary.clear()

def load_glossary(self, dictionary: dict[str, str]):
    """
    Merge `dictionary` into the glossary.

    Entries already in the glossary take precedence: when an abbreviation is present in both,
    the existing definition is kept and the one from `dictionary` is ignored. A falsy
    `dictionary` (empty or `None`) leaves the glossary untouched.
    """
    if dictionary:
        # In a dict display the later `**` operand wins on duplicate keys, so placing
        # `self.glossary` last preserves existing definitions. Note that this rebinds
        # `self.glossary` to a new dict rather than mutating the old one.
        self.glossary = {**dictionary, **self.glossary}

def extendMarkdown(self, md):
    """ Register `AbbrTreeprocessor` and `AbbrBlockprocessor` on `md`. """
    # Seed the glossary from the `glossary` config value when one is set.
    configured = self.config['glossary'][0]
    if configured:
        self.load_glossary(configured)
    # Copy the glossary into the dict that both processors below receive.
    self.abbrs.update(self.glossary)
    md.registerExtension(self)
    md.treeprocessors.register(AbbrTreeprocessor(md, self.abbrs), 'abbr', 7)
    md.parser.blockprocessors.register(AbbrBlockprocessor(md.parser, self.abbrs), 'abbr', 16)
Expand All @@ -69,13 +92,14 @@ def iter_element(self, el: etree.Element, parent: etree.Element | None = None) -
self.iter_element(child, el)
if text := el.text:
for m in reversed(list(self.RE.finditer(text))):
abbr = etree.Element('abbr', {'title': self.abbrs[m.group(0)]})
abbr.text = AtomicString(m.group(0))
abbr.tail = text[m.end():]
el.insert(0, abbr)
text = text[:m.start()]
if self.abbrs[m.group(0)]:
waylan marked this conversation as resolved.
Show resolved Hide resolved
abbr = etree.Element('abbr', {'title': self.abbrs[m.group(0)]})
abbr.text = AtomicString(m.group(0))
abbr.tail = text[m.end():]
el.insert(0, abbr)
text = text[:m.start()]
el.text = text
if parent and el.tail:
if parent is not None and el.tail:
tail = el.tail
index = list(parent).index(el) + 1
for m in reversed(list(self.RE.finditer(tail))):
Expand All @@ -92,7 +116,9 @@ def run(self, root: etree.Element) -> etree.Element | None:
# No abbreviations defined. Skip running processor.
return
# Build and compile regex
self.RE = re.compile(f"\\b(?:{ '|'.join(re.escape(key) for key in self.abbrs) })\\b")
abbr_list = list(self.abbrs.keys())
abbr_list.sort(key=len, reverse=True)
self.RE = re.compile(f"\\b(?:{ '|'.join(re.escape(key) for key in abbr_list) })\\b")
# Step through tree and modify on matches
self.iter_element(root)

Expand Down Expand Up @@ -120,14 +146,18 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool:
if m:
abbr = m.group('abbr').strip()
title = m.group('title').strip()
self.abbrs[abbr] = title
if block[m.end():].strip():
# Add any content after match back to blocks as separate block
blocks.insert(0, block[m.end():].lstrip('\n'))
if block[:m.start()].strip():
# Add any content before match back to blocks as separate block
blocks.insert(0, block[:m.start()].rstrip('\n'))
return True
if title and abbr:
if title == "''" or title == '""':
self.abbrs.pop(abbr)
else:
self.abbrs[abbr] = title
if block[m.end():].strip():
# Add any content after match back to blocks as separate block
blocks.insert(0, block[m.end():].lstrip('\n'))
if block[:m.start()].strip():
# Add any content before match back to blocks as separate block
blocks.insert(0, block[:m.start()].rstrip('\n'))
return True
# No match. Restore block.
blocks.insert(0, block)
return False
Expand Down
136 changes: 136 additions & 0 deletions tests/test_syntax/extensions/test_abbr.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,69 @@ def test_abbr_override(self):
)
)

def test_abbr_glossary(self):
    """ Abbreviations supplied only through the `glossary` option are applied to the text. """
    # The document below contains no `*[...]:` definitions; every expected title comes
    # from this dictionary.
    glossary = {
        "ABBR": "Abbreviation",
        "abbr": "Abbreviation",
        "HTML": "Hyper Text Markup Language",
        "W3C": "World Wide Web Consortium"
    }

    # The literal text is kept flush-left so the string contents stay exactly as they were.
    self.assertMarkdownRenders(
        self.dedent(
            """
ABBR
abbr

HTML
W3C
"""
        ),
        self.dedent(
            """
<p><abbr title="Abbreviation">ABBR</abbr>
<abbr title="Abbreviation">abbr</abbr></p>
<p><abbr title="Hyper Text Markup Language">HTML</abbr>
<abbr title="World Wide Web Consortium">W3C</abbr></p>
"""
        ),
        extensions=[AbbrExtension(glossary=glossary)]
    )

def test_abbr_glossary_2(self):
# Checks precedence between two glossary sources: `glossary` is given to the constructor as
# the `glossary` option, then `glossary_2` is added with `load_glossary()` before the
# extension is used. Both define `ABBR`; the expected output carries the `glossary_2`
# title ("New Abbreviation") for `ABBR` and the constructor titles for everything else.

glossary = {
"ABBR": "Abbreviation",
"abbr": "Abbreviation",
"HTML": "Hyper Text Markup Language",
"W3C": "World Wide Web Consortium"
}

glossary_2 = {
"ABBR": "New Abbreviation"
}

abbr_ext = AbbrExtension(glossary=glossary)
abbr_ext.load_glossary(glossary_2)

self.assertMarkdownRenders(
self.dedent(
"""
ABBR abbr HTML W3C
"""
),
self.dedent(
"""
<p><abbr title="New Abbreviation">ABBR</abbr> """
+ """<abbr title="Abbreviation">abbr</abbr> """
+ """<abbr title="Hyper Text Markup Language">HTML</abbr> """
+ """<abbr title="World Wide Web Consortium">W3C</abbr></p>
"""
),
extensions=[abbr_ext]
)

def test_abbr_nested(self):
self.assertMarkdownRenders(
self.dedent(
Expand Down Expand Up @@ -383,6 +446,79 @@ def test_abbr_with_attr_list(self):
extensions=['abbr', 'attr_list']
)

def test_abbr_superset_vs_subset(self):
# `abbr-SS` contains both `abbr` and `SS`, each of which has its own definition.
# The expected output wraps the whole of `abbr-SS` in a single tag carrying the
# "Abbreviation Superset Definition" title, while standalone `abbr` and `SS` keep theirs.
self.assertMarkdownRenders(
self.dedent(
"""
abbr, SS, and abbr-SS should have different definitions.

*[abbr]: Abbreviation Definition
*[abbr-SS]: Abbreviation Superset Definition
*[SS]: Superset Definition
"""
),
self.dedent(
"""
<p><abbr title="Abbreviation Definition">abbr</abbr>, """
+ """<abbr title="Superset Definition">SS</abbr>, """
+ """and <abbr title="Abbreviation Superset Definition">abbr-SS</abbr> """
+ """should have different definitions.</p>
"""
)
)

def test_abbr_empty(self):
# Covers definitions with a missing abbreviation or a missing title. Per the expected output:
# - `*[abbr]:` followed by its title on the next line is accepted as a definition;
# - `*[]: Empty` and `*[ ]: Empty` (no abbreviation) are rendered as ordinary paragraphs;
# - `*[abbr]:` and `*[ABBR]:` with no title are rendered as ordinary paragraphs, and the
#   earlier `abbr` definition is still applied to the word `abbr` inside that text.
self.assertMarkdownRenders(
self.dedent(
"""
*[abbr]:
Abbreviation Definition

abbr

*[]: Empty

*[ ]: Empty

*[abbr]:

*[ABBR]:

Testing document text.
"""
),
self.dedent(
"""
<p><abbr title="Abbreviation Definition">abbr</abbr></p>\n"""
+ """<p>*[]: Empty</p>\n"""
+ """<p>*[ ]: Empty</p>\n"""
+ """<p>*[<abbr title="Abbreviation Definition">abbr</abbr>]:</p>\n"""
+ """<p>*[ABBR]:</p>\n"""
+ """<p>Testing document text.</p>
"""
)
)

def test_abbr_clear(self):
# `abbr` and `ABBR` are defined at the top of the document and then redefined at the bottom
# with `""` and `''` respectively. The expected output contains no `<abbr>` tags, i.e. both
# quote styles disable the abbreviation for the whole document, including text that appears
# before the disabling lines.
self.assertMarkdownRenders(
self.dedent(
"""
*[abbr]: Abbreviation Definition
*[ABBR]: Abbreviation Definition

abbr ABBR

*[abbr]: ""
*[ABBR]: ''
"""
),
self.dedent(
"""
<p>abbr ABBR</p>
"""
)
)

def test_abbr_reset(self):
ext = AbbrExtension()
md = Markdown(extensions=[ext])
Expand Down
Loading