-
Notifications
You must be signed in to change notification settings - Fork 0
/
AI.Rout
103 lines (92 loc) · 3.18 KB
/
AI.Rout
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
R version 3.2.2 (2015-08-14) -- "Fire Safety"
Copyright (C) 2015 The R Foundation for Statistical Computing
Platform: x86_64-w64-mingw32/x64 (64-bit)
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
Natural language support but running in an English locale
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
[Previously saved workspace restored]
> setwd("C:/Users/crobertson/Dropbox/nsercCREATE/Scraping/tweets/outputs/AITweets")
> x <- read.csv("test2_clean.csv")
> removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
> library(plyr)
Warning message:
package 'plyr' was built under R version 3.2.5
> library(ggplot2)
Warning message:
package 'ggplot2' was built under R version 3.2.5
> library(XML)
> library(tm)
Loading required package: NLP
Attaching package: 'NLP'
The following object is masked from 'package:ggplot2':
annotate
> library(wordcloud)
Loading required package: RColorBrewer
> library(RColorBrewer)
>
> x$D <- as.POSIXct(strptime(as.character(x$created_at), "%a %b %d %H:%M:%S %z %Y"))
> x$day <- as.Date(x$D)
> x$textC <- removeURL(x$text)
> x <- x[!duplicated(x$textC), ]
> daily <- ddply(x, .(day), summarise, numTweets = length(day)) #summarize by mean
>
> png("FULL_dailyTweetsAI.png", width=1280,height=800)
> ggplot(daily, aes(day, numTweets)) + geom_line() + xlab("") + ylab("Daily AI-related Tweets")
> dev.off()
null device
1
>
> recent <- subset(daily, day >= max(daily$day)-14)
>
> png("Recent_dailyTweetsAI.png", width=1280,height=800)
> ggplot(recent, aes(day, numTweets)) + geom_line() + xlab("") + ylab("Daily AI-related Tweets - last 14 days")
> dev.off()
null device
1
>
>
>
>
> recent14 <- subset(x, day >= max(x$day)-14)
>
>
> ap.corpus <- Corpus(DataframeSource(data.frame(recent14$text)))
> ap.corpus <- tm_map(ap.corpus, content_transformer(tolower))
> ap.corpus <- tm_map(ap.corpus, removeWords, stopwords("english"))
> ap.corpus <- tm_map(ap.corpus, removeWords, c("bird", "flu", "avian", "influenza", "poultry"))
> ap.corpus <- tm_map(ap.corpus, removePunctuation)
> ap.corpus <- tm_map(ap.corpus, PlainTextDocument)
> ap.corpus <- tm_map(ap.corpus, stemDocument)
>
> ap.corpus <- Corpus(VectorSource(ap.corpus))
> ap.tdm <- TermDocumentMatrix(ap.corpus)
> ap.m <- as.matrix(ap.tdm)
> ap.v <- sort(rowSums(ap.m),decreasing=TRUE)
> ap.d <- data.frame(word = names(ap.v),freq=ap.v)
> head(ap.d)
word freq
may may 1835
chicken chicken 1159
ang ang 810
yung yung 619
ako ako 592
manok manok 499
> pal2 <- brewer.pal(8,"Dark2")
> png("wordCloud_14day.png", width=8,height=6, units="in", res=300)
> wordcloud(ap.d$word,ap.d$freq, scale=c(6,.2),min.freq=3,max.words=Inf, random.order=FALSE, rot.per=.15, colors=pal2)
> dev.off()
null device
1
>
>
>
> proc.time()
user system elapsed
68.50 1.62 73.38