Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added func, that keeps track of whether the current node is within a … #10

Merged
merged 1 commit into from
Jul 16, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 66 additions & 4 deletions internal/qaparser/questionanswer/questionanswer.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (

"github.com/PuerkitoBio/goquery"
"github.com/terratensor/svodd-server/internal/lib/httpclient"
"golang.org/x/net/html"
)

type Entry struct {
Expand Down Expand Up @@ -94,10 +95,16 @@ func (e *Entry) Parse(resBytes []byte) error {
}

els := doc.Find("#answer-content").First()
e.Html, err = goquery.OuterHtml(els)
if err != nil {
log.Printf("failed to get html: %v", err)
}
// Populate the `Html` field of the `Entry` struct with the outer HTML of the
// "#answer-content" element.
//
// This field will be used later to create a new QuestionAnswer struct.
//
// Parameters:
// - els: The goquery Selection representing the "#answer-content" element.
//
// Return type: None.
e.Html = populateOuterHtml(els)
e.SplitIntoChunks(els)

els = doc.Find(".comment-list").First()
Expand Down Expand Up @@ -304,3 +311,58 @@ func WrapPhrase(phrase, text string) string {

return wrapped
}

// populateOuterHtml returns the outer HTML of els, as produced by
// goquery.OuterHtml, with "style" attributes stripped from every element that
// is not a <table> or nested inside one.
//
// The markup is serialized and re-parsed into a separate tree before it is
// edited, so removeStyle works on that copy rather than on the nodes held by
// els.
//
// NOTE(review): html.Parse builds a complete document tree, so the rendered
// result is presumably wrapped in <html><head></head><body>…</body></html>
// instead of being the bare element — confirm that consumers of Entry.Html
// expect this.
//
// Parameters:
//   - els: the goquery Selection to render.
//
// Returns:
//   - the rendered HTML with styles removed outside tables;
//   - "" when the selection cannot be serialized;
//   - the unmodified serialized markup when re-parsing or rendering fails.
//
// Failures are logged rather than propagated; the function never panics on a
// parse error.
func populateOuterHtml(els *goquery.Selection) string {
	htmlStr, err := goquery.OuterHtml(els)
	if err != nil {
		// The string returned alongside an error carries no meaning, so
		// there is nothing sensible to sanitize.
		log.Printf("failed to get html: %v", err)
		return ""
	}

	doc, err := html.Parse(strings.NewReader(htmlStr))
	if err != nil {
		// Best effort: fall back to the original markup instead of taking
		// the whole process down.
		log.Printf("failed to parse html: %v", err)
		return htmlStr
	}

	removeStyle(doc, false)

	var buf strings.Builder
	if err := html.Render(&buf, doc); err != nil {
		// A partially written buffer would be malformed HTML; prefer the
		// untouched markup.
		log.Printf("failed to render html: %v", err)
		return htmlStr
	}
	return buf.String()
}

// removeStyle walks the tree rooted at n and deletes every "style" attribute
// from element nodes that are neither a <table> nor a descendant of one.
//
// inTable is passed by value, so once a <table> element sets it to true the
// flag applies only to that element's subtree; siblings that follow the table
// still see the caller's value. No explicit "leave table" step is needed
// (element nodes carry the tag name, never a closing-tag name such as
// "/table").
//
// The <table> element itself also keeps its "style" attribute.
//
// Parameters:
//   - n: root of the subtree to process; a nil node is ignored.
//   - inTable: whether n is already located inside a <table>.
func removeStyle(n *html.Node, inTable bool) {
	if n == nil {
		return
	}

	if n.Type == html.ElementNode {
		switch {
		case n.Data == "table":
			// Everything below this node is table content and keeps its
			// inline styles.
			inTable = true
		case !inTable:
			// Filter in place: reuse the backing array and keep every
			// attribute except "style", preserving their order.
			kept := n.Attr[:0]
			for _, attr := range n.Attr {
				if attr.Key != "style" {
					kept = append(kept, attr)
				}
			}
			n.Attr = kept
		}
	}

	for c := n.FirstChild; c != nil; c = c.NextSibling {
		removeStyle(c, inTable)
	}
}
Loading