Skip to content

Commit

Permalink
remove advertisement
Browse files Browse the repository at this point in the history
  • Loading branch information
hjhee committed Feb 3, 2018
1 parent bf36a2b commit 9c82d4e
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 12 deletions.
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,10 @@

# Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736
.glide/

# Fetched pages
output/

# Releases
*.zip
*.7z
17 changes: 16 additions & 1 deletion main.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package main

import (
"fmt"
"html/template"
"log"
"math/rand"
Expand All @@ -9,7 +10,7 @@ import (
)

const (
numFetcher = 100
numFetcher = 10
numParser = 50
numRenderer = 5

Expand All @@ -18,15 +19,29 @@ const (

var outputTemplate *template.Template

// logWriter is an io.Writer that prints each write to standard output,
// prefixed with the current UTC time. It is installed as the standard
// logger's output so that log lines carry a custom timestamp format.
type logWriter struct{}

// Write prints bytes to standard output preceded by a UTC timestamp in the
// form "2006-01-02 15:04:05 ".
//
// It returns len(bytes) on success, as required by the io.Writer contract
// (0 <= n <= len(p)); the timestamp prefix is not counted because it is not
// part of the caller's data. If printing fails, it returns 0 and the error.
func (writer logWriter) Write(bytes []byte) (int, error) {
	stamp := time.Now().UTC().Format("2006-01-02 15:04:05 ")
	if _, err := fmt.Print(stamp + string(bytes)); err != nil {
		return 0, err
	}
	return len(bytes), nil
}

func init() {
// setup log time format
// https://stackoverflow.com/a/36140590/6091246
log.SetFlags(0)
log.SetOutput(new(logWriter))

outputPath := "./output"
if _, err := os.Stat(outputPath); os.IsNotExist(err) {
err = os.Mkdir(outputPath, 0644)
if err != nil {
log.Fatalf("Error creating output folder: %v", err)
}
}

rand.Seed(time.Now().UnixNano())

// outputTemplate is used to render output
outputTemplate = template.Must(template.New(templateName).Funcs(
template.FuncMap{"convertTime": func(ts int64) string {
Expand Down
28 changes: 19 additions & 9 deletions parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@ import (
"encoding/json"
"errors"
"fmt"
"html"
"html/template"
"log"
"math/rand"
"net/url"
"regexp"
"strconv"
"strings"
"sync"
"time"

Expand All @@ -35,7 +35,7 @@ func htmlParse(pc *PageChannel, page *HTMLPage, tmMap *TemplateMap, callback fun
}

// get threadID and forumID using regex
posts := doc.Find("div.l_post.l_post_bright.j_l_post")
posts := doc.Find("div.l_post.l_post_bright.j_l_post.clearfix")
threadRegex := regexp.MustCompile(`\b"?thread_id"?:"?(\d+)"?\b`)
match := threadRegex.FindStringSubmatch(string(page.Content))
strInt, _ := strconv.ParseInt(match[1], 10, 64)
Expand Down Expand Up @@ -149,21 +149,28 @@ func pageParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *Te
}()
defer tf.AddPosts(-1)
posts.Each(func(i int, s *goquery.Selection) {
// filter out elements that have more than 4 classes (probably an advertisement)
classStr, _ := s.Attr("class") // get class string
if len(strings.Fields(classStr)) > 4 {
return
}

dataField, ok := s.Attr("data-field")
if !ok {
// maybe not an error, but an older version of data-field
// log.Printf("#%d data-field not found: %s", i, page.URL.String()) // there's a error on the page, maybe Tieba updated the syntax
return
}

var tiebaPost TiebaField
var res OutputField
err := json.Unmarshal([]byte(html.UnescapeString(dataField)), &tiebaPost)
err := json.Unmarshal([]byte(dataField), &tiebaPost)
if err != nil {
log.Printf("#%d data-field unmarshal failed: %v, url: %s", i, err, page.URL.String()) // there's a error on the page, maybe Tieba updated the syntax
return
}
res.UserName = tiebaPost.Author.UserName
res.Content = template.HTML(html.UnescapeString(tiebaPost.Content.Content))
res.Content = template.HTML(tiebaPost.Content.Content)
res.PostNO = tiebaPost.Content.PostNO
res.PostID = tiebaPost.Content.PostID

Expand All @@ -175,14 +182,17 @@ func pageParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *Te
if err != nil {
log.Printf("#%d: post_content_%d parse failed, %s", i, res.PostID, err)
} else {
res.Content = template.HTML(html.UnescapeString(content))
res.Content = template.HTML(content)
}
}

// get post time
// Jquery过滤选择器,选择前几个元素,后几个元素,内容过滤选择器等
// http://www.cnblogs.com/alone2015/p/4962687.html
res.Time = s.Find("div.post-tail-wrap span.tail-info:nth-child(4)").Text()
res.Time = s.Find("span.tail-info:nth-child(4)").Text() // posted from device other than PC
if res.Time == "" {
res.Time = s.Find("span.tail-info:nth-child(3)").Text() // posted from PC
}

tf.Append(&res)
// log.Printf("#%d data-field found: %v\n", i, tiebaPost)
Expand Down Expand Up @@ -223,7 +233,7 @@ func commentParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap
defer tf.AddLzls(-1)

var lzl LzlField
err := json.Unmarshal([]byte(html.UnescapeString(string(page.Content))), &lzl)
err := json.Unmarshal([]byte(string(page.Content)), &lzl)
if err != nil {
return fmt.Errorf("Error parsing content file %s: %v", page.URL.String(), err)
}
Expand All @@ -238,7 +248,7 @@ func commentParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap
return nil // comment list empty, stop
}
comments := make(map[uint64]*LzlComment)
err = json.Unmarshal([]byte(html.UnescapeString(string(commentList))), &comments)
err = json.Unmarshal([]byte(string(commentList)), &comments)
if err != nil {
return fmt.Errorf("Error parsing comment_list from %s: %v\ncomment_list:\n%s", page.URL.String(), err, commentList)
}
Expand Down Expand Up @@ -275,7 +285,7 @@ func templateParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap
tf = tmMap.Get(threadID)

tf.mutex.Lock()
err := json.Unmarshal([]byte(html.UnescapeString(string(page.Content))), tf)
err := json.Unmarshal([]byte(string(page.Content)), tf)
tf.mutex.Unlock()
if err != nil {
return fmt.Errorf("Error parsing template file %s: %v", page.URL.String(), err)
Expand Down
1 change: 0 additions & 1 deletion template.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@ func renderHTML(done <-chan struct{}, tempc <-chan *TemplateField, tmpl *templat

filename = fmt.Sprintf("output/file_%s.html", t.Title)
err = writeOutput(filename, func(w *bufio.Writer) error {

if err := tmpl.Execute(w, struct {
Title string
Posts []*OutputField
Expand Down
3 changes: 2 additions & 1 deletion type.go
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,8 @@ func (t *TemplateField) Merge() {
}

// Unique removes any duplicate posts
// naive
// too naive
// TODO: improve result with NLP technique
func (t *TemplateField) Unique() {
// Idiomatic way to remove duplicates in a slice
// https://www.reddit.com/r/golang/comments/5ia523/idiomatic_way_to_remove_duplicates_in_a_slice/db6qa2e/
Expand Down

0 comments on commit 9c82d4e

Please sign in to comment.