From 6b13c560f5bf76889b1c603a5b83941892279c76 Mon Sep 17 00:00:00 2001 From: Sigurd Spieckermann <2206639+sisp@users.noreply.github.com> Date: Tue, 16 Jul 2024 15:49:13 +0200 Subject: [PATCH] Fixed blog readtime calculation to ignore non-content text (#7370) --- material/plugins/blog/readtime/parser.py | 35 ++++++++++++++++++++++-- src/plugins/blog/readtime/parser.py | 35 ++++++++++++++++++++++-- 2 files changed, 64 insertions(+), 6 deletions(-) diff --git a/material/plugins/blog/readtime/parser.py b/material/plugins/blog/readtime/parser.py index e78e75c13f2..17662a4f393 100644 --- a/material/plugins/blog/readtime/parser.py +++ b/material/plugins/blog/readtime/parser.py @@ -20,6 +20,10 @@ from html.parser import HTMLParser +# TODO: Refactor the `void` set into a common module and import it from there +# and not from the search plugin. +from material.plugins.search.plugin import void + # ----------------------------------------------------------------------------- # Classes # ----------------------------------------------------------------------------- @@ -31,15 +35,40 @@ class ReadtimeParser(HTMLParser): def __init__(self): super().__init__(convert_charrefs = True) + # Tags to skip + self.skip = set([ + "object", # Objects + "script", # Scripts + "style", # Styles + "svg" # SVGs + ]) + + # Current context + self.context = [] + # Keep track of text and images self.text = [] self.images = 0 - # Collect images + # Called at the start of every HTML tag def handle_starttag(self, tag, attrs): + # Collect images if tag == "img": self.images += 1 - # Collect text + # Ignore self-closing tags + if tag not in void: + # Add tag to context + self.context.append(tag) + + # Called for the text contents of each tag def handle_data(self, data): - self.text.append(data) + # Collect text if not inside skip context + if not self.skip.intersection(self.context): + self.text.append(data) + + # Called at the end of every HTML tag + def handle_endtag(self, tag): + if self.context and self.context[-1] == tag: + # Remove tag from context + self.context.pop() diff --git a/src/plugins/blog/readtime/parser.py b/src/plugins/blog/readtime/parser.py index e78e75c13f2..17662a4f393 100644 --- a/src/plugins/blog/readtime/parser.py +++ b/src/plugins/blog/readtime/parser.py @@ -20,6 +20,10 @@ from html.parser import HTMLParser +# TODO: Refactor the `void` set into a common module and import it from there +# and not from the search plugin. +from material.plugins.search.plugin import void + # ----------------------------------------------------------------------------- # Classes # ----------------------------------------------------------------------------- @@ -31,15 +35,40 @@ class ReadtimeParser(HTMLParser): def __init__(self): super().__init__(convert_charrefs = True) + # Tags to skip + self.skip = set([ + "object", # Objects + "script", # Scripts + "style", # Styles + "svg" # SVGs + ]) + + # Current context + self.context = [] + # Keep track of text and images self.text = [] self.images = 0 - # Collect images + # Called at the start of every HTML tag def handle_starttag(self, tag, attrs): + # Collect images if tag == "img": self.images += 1 - # Collect text + # Ignore self-closing tags + if tag not in void: + # Add tag to context + self.context.append(tag) + + # Called for the text contents of each tag def handle_data(self, data): - self.text.append(data) + # Collect text if not inside skip context + if not self.skip.intersection(self.context): + self.text.append(data) + + # Called at the end of every HTML tag + def handle_endtag(self, tag): + if self.context and self.context[-1] == tag: + # Remove tag from context + self.context.pop()