-
Notifications
You must be signed in to change notification settings - Fork 1
/
processDocuments.R
67 lines (60 loc) · 2.17 KB
/
processDocuments.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
### Lukas Huber
### 2015
## Build one text document from the scraped content of a single thread.
##
## Arguments:
##   content  Scraper result for one thread. The first element is used as the
##            thread title; the remaining elements are treated as per-post
##            fields, of which the second one is used as the post text
##            (presumably the order is author, text, date, related entries --
##            TODO confirm against scrapeContent()). NULL, a zero-length
##            object, "" or a single NA all mean "nothing was scraped".
##
## Returns:
##   c(title, document). `document` holds the text of every post with tabs
##   and newlines removed, the posts being joined by " NEXTENTRY ". When
##   nothing was scraped the result is c("", ""); when the thread has no
##   usable post fields the document part is "".
##
## TODO should be refactored to return all information (author, date, ...)
getDocument <- function(content) {
  ## `content == ""` on a multi-element list is a logical vector, which `if`
  ## rejects (an error since R 4.2), so the "empty" sentinel is tested
  ## explicitly instead.
  is.blank.scalar <- is.atomic(content) && length(content) == 1L &&
    (is.na(content) || content == "")
  if (length(content) == 0L || is.blank.scalar) {
    return(c("", ""))
  }

  contentTitle <- as.vector(unlist(content[1]))

  ## Drop the title, then the related-entries field (5th element of
  ## `content`, i.e. the 4th one once the title is gone).
  clearcontent <- content[-1]
  clearcontent <- clearcontent[-4]
  if (length(clearcontent) == 0L) {
    return(c(contentTitle, ""))
  }

  ## Threads without a date get a placeholder of the same length as the text
  ## so that every field contributes one value per post.
  if (is.null(clearcontent$date)) {
    clearcontent$date <- rep(0, length(clearcontent$text))
  }

  ## One row per field, one column per post. Using the actual field count
  ## (3 in the regular case) keeps the text on row 2 even when an extra
  ## field is present.
  dfcontent <- tryCatch(
    withCallingHandlers(
      data.frame(
        matrix(unlist(clearcontent), nrow = length(clearcontent), byrow = TRUE),
        stringsAsFactors = FALSE
      ),
      warning = function(w) {
        ## Only identify the offending thread; the warning itself is not
        ## muffled and still reaches the caller.
        message(
          "getDocument: warning while tabulating thread '",
          paste(contentTitle, collapse = " "), "': ", conditionMessage(w)
        )
      }
    ),
    error = function(e) {
      warning(
        "getDocument: could not tabulate thread '",
        paste(contentTitle, collapse = " "), "': ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
  if (is.null(dfcontent)) {
    return(c(contentTitle, ""))
  }

  ### create a Document
  posts <- as.character(unlist(dfcontent[2, ], use.names = FALSE))
  doc <- paste(
    gsub("[\t\n]", "", posts, useBytes = TRUE),
    collapse = " NEXTENTRY "
  )
  c(contentTitle, doc)
}
## Scrape every thread listed in `df.threads` and turn each one into a
## document string.
##
## Arguments:
##   df.threads  Two-dimensional table with one thread per column; row 2
##               holds the thread's path, which is appended to the site root.
##   base.url    Site root the thread paths are appended to. Defaults to the
##               variable `url` visible from this function's environment,
##               which is what the function always used.
##
## Returns:
##   A list with one element per thread: the second element of
##   getDocument()'s result (the document text). An empty table gives an
##   empty list.
##
## TODO make the loop asynchronous
getAllDocumentsofThematic <- function(df.threads, base.url = url) {
  ## Nothing to scrape; also avoids iterating over c(1, 0).
  if (length(df.threads) == 0L) {
    return(list())
  }
  site.root <- base.url

  ### Loop over Threads and get Content
  ## ncol() counts threads for data frames and matrices alike.
  lapply(seq_len(ncol(df.threads)), function(j) {
    content <- scrapeContent(paste0(site.root, "/", df.threads[2, j]))
    getDocument(content)[2]
  })
}
### Clear the NEXTENTRY markers that split a thread's document into entries.
##
## Arguments:
##   doc  Character vector of documents.
##
## Returns:
##   `doc` with every occurrence of "NEXTENTRY" replaced by a single space.
clearNE <- function(doc) {
  ## Base gsub() with a literal (fixed) pattern: this file never attaches
  ## stringr, and the marker is plain text rather than a regular expression.
  gsub("NEXTENTRY", " ", doc, fixed = TRUE)
}