Skip to content

Commit

Permalink
fix sanitised code symbols
Browse files Browse the repository at this point in the history
should close #40
  • Loading branch information
samgozman committed Dec 12, 2023
1 parent b7a4ca2 commit 9cd69d2
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 4 deletions.
13 changes: 13 additions & 0 deletions journalist/news.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"encoding/json"
"fmt"
"github.com/microcosm-cc/bluemonday"
"html"
"regexp"
"strings"
"time"
Expand All @@ -22,6 +23,9 @@ type News struct {
// TODO: Add creator field if possible
}

// NewNews creates a new News instance from the given parameters.
// It sanitizes the title and description from HTML tags and styles.
// It also generates the ID of the news by hashing the link, title, description and date.
func NewNews(title, description, link, date, provider string) (*News, error) {
dateTime, err := parseDate(date)
if err != nil {
Expand All @@ -33,6 +37,15 @@ func NewNews(title, description, link, date, provider string) (*News, error) {
title = p.Sanitize(title)
description = p.Sanitize(description)

// Replace HTML character references (e.g. &#39; or &amp;) with the characters they represent.
// This is placed after sanitization, because sanitization may replace some symbols along the way.
title = html.UnescapeString(title)
description = html.UnescapeString(description)

// Replace Unicode escape sequences (e.g., \u0026)
title = replaceUnicodeSymbols(title)
description = replaceUnicodeSymbols(description)

if len(description) > 1024 {
description = description[:1024]
}
Expand Down
6 changes: 3 additions & 3 deletions journalist/news_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,15 @@ func TestNewNews(t *testing.T) {
name: "valid news with html tags",
args: args{
title: "title <i>bonk</i>",
description: "description <b>bold</b> <i>italic</i> <a href=\"link\">link</a>",
description: "description <b>bold</b> <i>S\\u0026P 500</i> <a href=\"link\">G&#38;T</a>",
link: "link",
date: "Mon, 02 Jan 2006 15:04:05 MST",
providerName: "provider",
},
want: &News{
ID: "309e1c0cfc773eccc628ba376378eaa1",
ID: "91e9909e2e1a1555d1d0aaca96aede63",
Title: "title bonk",
Description: "description bold italic link",
Description: "description bold S&P 500 G&T",
Link: "link",
Date: time.Date(2006, 1, 2, 15, 4, 5, 0, time.UTC),
ProviderName: "provider",
Expand Down
23 changes: 22 additions & 1 deletion journalist/utils.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
package journalist

import "time"
import (
"regexp"
"strconv"
"time"
)

// parseDate parses a date string into a time.Time object in UTC
func parseDate(dateString string) (time.Time, error) {
Expand All @@ -23,3 +27,20 @@ func parseDate(dateString string) (time.Time, error) {

return parsedTime.UTC(), err
}

// unicodeEscapeRun matches one or more consecutive \uXXXX escape sequences.
// Matching whole runs (rather than single escapes) lets the decoder see both
// halves of a UTF-16 surrogate pair at once. It is compiled once at package
// scope so callers do not pay for compilation on every invocation.
var unicodeEscapeRun = regexp.MustCompile(`(?:\\u[0-9A-Fa-f]{4})+`)

// replaceUnicodeSymbols replaces literal Unicode escape sequences of the form
// \uXXXX (four hex digits, either case) with the characters they encode,
// e.g. `S\u0026P 500` becomes `S&P 500`.
//
// A high surrogate (\uD800-\uDBFF) immediately followed by a low surrogate
// (\uDC00-\uDFFF) is combined into the single code point the pair represents,
// so text such as `\uD83D\uDE00` decodes to one character. A surrogate that
// has no partner is not a valid code point and is emitted as U+FFFD.
// Text that does not form a complete \uXXXX sequence is returned unchanged.
func replaceUnicodeSymbols(s string) string {
	const (
		escapeLen = 6 // length of one `\uXXXX` sequence in bytes
		prefixLen = 2 // length of the leading `\u`

		highStart = 0xD800
		highEnd   = 0xDBFF
		lowStart  = 0xDC00
		lowEnd    = 0xDFFF
	)

	return unicodeEscapeRun.ReplaceAllStringFunc(s, func(run string) string {
		// Parse every escape in the run into its 16-bit code unit.
		units := make([]rune, 0, len(run)/escapeLen)
		for i := 0; i < len(run); i += escapeLen {
			num, err := strconv.ParseUint(run[i+prefixLen:i+escapeLen], 16, 16)
			if err != nil {
				// The pattern guarantees four hex digits, so this should not
				// happen; keep the original text rather than corrupt it.
				return run
			}
			units = append(units, rune(num))
		}

		// Merge surrogate pairs into single code points.
		decoded := make([]rune, 0, len(units))
		for i := 0; i < len(units); i++ {
			r := units[i]
			if r >= highStart && r <= highEnd && i+1 < len(units) {
				if next := units[i+1]; next >= lowStart && next <= lowEnd {
					r = 0x10000 + ((r - highStart) << 10) + (next - lowStart)
					i++ // the low surrogate has been consumed
				}
			}
			decoded = append(decoded, r)
		}

		// Converting []rune to string encodes as UTF-8; any remaining lone
		// surrogate is not a valid code point and becomes U+FFFD.
		return string(decoded)
	})
}
24 changes: 24 additions & 0 deletions journalist/utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,27 @@ func Test_parseDate(t *testing.T) {
})
}
}

// Test_replaceUnicodeSymbols checks that literal \uXXXX escape sequences are
// replaced by the characters they encode and that other text is left alone.
func Test_replaceUnicodeSymbols(t *testing.T) {
	type args struct {
		s string
	}
	tests := []struct {
		name string
		args args
		want string
	}{
		{
			"ampersand test",
			args{"S\\u0026P 500"},
			"S&P 500",
		},
		{
			"no escape sequences",
			args{"plain text"},
			"plain text",
		},
		{
			"lowercase hex digits",
			args{"caf\\u00e9"},
			"caf\u00e9",
		},
		{
			"incomplete sequence is left untouched",
			args{"\\u12"},
			"\\u12",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// %q makes invisible or escaped characters obvious in failures.
			if got := replaceUnicodeSymbols(tt.args.s); got != tt.want {
				t.Errorf("replaceUnicodeSymbols() = %q, want %q", got, tt.want)
			}
		})
	}
}

0 comments on commit 9cd69d2

Please sign in to comment.