Skip to content

Commit

Permalink
Don't count HTML markup in auto summaries
Browse files Browse the repository at this point in the history
This commit also fixes a bug where a `</picture>` end tag was wrongly used to detect a end paragraph. This should be very rare, though.

Closes gohugoio#12837
  • Loading branch information
bep committed Sep 10, 2024
1 parent 84ee00b commit a09c9f8
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 1 deletion.
27 changes: 26 additions & 1 deletion resources/page/page_markup.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ package page

import (
"context"
"fmt"
"html/template"
"regexp"
"strings"
Expand Down Expand Up @@ -161,6 +162,20 @@ func (s *HtmlSummary) resolveParagraphTagAndSetWrapper(mt media.Type) tagReStart
return ptag
}

func deb(s string, lohi types.LowHigh[string]) {
fmt.Printf("%q\n", lohi.Value(s))
}

// Avoid counting words that are most likely HTML tokens.
var (
isProbablyHTMLTag = regexp.MustCompile(`^<\/?[A-Za-z]+>?$`)
isProablyHTMLAttribute = regexp.MustCompile(`^[A-Za-z]+="`)
)

func isProbablyHTMLToken(s string) bool {
return isProbablyHTMLTag.MatchString(s) || isProablyHTMLAttribute.MatchString(s) || s == ">"
}

// ExtractSummaryFromHTML extracts a summary from the given HTML content.
func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK bool) (result HtmlSummary) {
result.source = input
Expand All @@ -170,9 +185,19 @@ func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK boo
return result
}

// deb(input, ptag)

var count int

countWord := func(word string) int {
word = strings.TrimSpace(word)
if len(word) == 0 {
return 0
}
if isProbablyHTMLToken(word) {
return 0
}

if isCJK {
word = tpl.StripHTML(word)
runeCount := utf8.RuneCountInString(word)
Expand All @@ -193,7 +218,7 @@ func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK boo

for j := result.WrapperStart.High; j < high; {
s := input[j:]
closingIndex := strings.Index(s, "</"+ptag.tagName)
closingIndex := strings.Index(s, "</"+ptag.tagName+">")

if closingIndex == -1 {
break
Expand Down
56 changes: 56 additions & 0 deletions resources/page/page_markup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,46 @@ func TestExtractSummaryFromHTML(t *testing.T) {
}
}

// See https://discourse.gohugo.io/t/automatic-summarys-summarylength-seems-broken-in-the-case-of-plainify/51466/4
// Also issue 12837
func TestExtractSummaryFromHTMLLotsOfHTMLInSummary(t *testing.T) {
c := qt.New(t)

input := `
<p>
<div>
<picture>
<img src="imgs/1.jpg" alt="1"/>
</picture>
<picture>
<img src="imgs/2.jpg" alt="2"/>
</picture>
<picture>
<img src="imgs/3.jpg" alt="3"/>
</picture>
<picture>
<img src="imgs/4.jpg" alt="4"/>
</picture>
<picture>
<img src="imgs/5.jpg" alt="5"/>
</picture>
</div>
</p>
<p>
This is a story about a cat.
</p>
<p>
The cat was white and fluffy.
</p>
<p>
And it liked milk.
</p>
`

summary := ExtractSummaryFromHTML(media.Builtin.MarkdownType, input, 10, false)
c.Assert(strings.HasSuffix(summary.Summary(), "<p>\nThis is a story about a cat.\n</p>\n<p>\nThe cat was white and fluffy.\n</p>"), qt.IsTrue)
}

func TestExtractSummaryFromHTMLWithDivider(t *testing.T) {
c := qt.New(t)

Expand Down Expand Up @@ -114,6 +154,22 @@ func TestExpandDivider(t *testing.T) {
}
}

func TestIsProbablyHTMLToken(t *testing.T) {
c := qt.New(t)

for i, test := range []struct {
input string
expect bool
}{
{"<p>", true},
{"<p", true},
{"width=\"32\"", true},
{"<p>Æøå", false},
} {
c.Assert(isProbablyHTMLToken(test.input), qt.Equals, test.expect, qt.Commentf("[%d] Test.expect %q", i, test.input))
}
}

func BenchmarkSummaryFromHTML(b *testing.B) {
b.StopTimer()
input := "<p>First paragraph</p><p>Second paragraph</p>"
Expand Down

0 comments on commit a09c9f8

Please sign in to comment.