Reddit

library(tm)
## Loading required package: NLP
library(SnowballC)
library(stringi)
## Warning: package 'stringi' was built under R version 3.4.4
library(wordcloud)
## Loading required package: RColorBrewer
library(RedditExtractoR)
# Search Reddit's 'World News' threads for posts mentioning 'italy' and 'france'.
Subreddit.Topics <- "World News"
Search.Topics <- c("italy", "france")

# Fetch one page of the newest 'italy' thread URLs from the 'World News' subreddit.
reddit_links.italy <- reddit_urls(search_terms   = Search.Topics[1],
                                  subreddit      = Subreddit.Topics,
                                  sort_by        = "new",
                                  page_threshold = 1)
# Show the structure of the returned link data frame (date, num_comments, title, subreddit, URL).
str(reddit_links.italy)
## 'data.frame':    25 obs. of  5 variables:
##  $ date        : chr  "03-05-18" "31-03-18" "31-03-18" "24-02-18" ...
##  $ num_comments: num  17 250 2 6 4 378 5 13 701 3 ...
##  $ title       : chr  "As giant rodents thrive in Italy, mayor comes up with novel solution - eat them" "Italy On High Alert: Officials Warn of Flood of Terrorists Across the Sea" "French armed officers spark Italy migrant centre row" "Italy steps up security in major cities amid protests by rival activists" ...
##  $ subreddit   : chr  "news" "news" "news" "news" ...
##  $ URL         : chr  "http://www.reddit.com/r/news/comments/8gsfjz/as_giant_rodents_thrive_in_italy_mayor_comes_up/" "http://www.reddit.com/r/news/comments/88lbmq/italy_on_high_alert_officials_warn_of_flood_of/" "http://www.reddit.com/r/news/comments/88jslu/french_armed_officers_spark_italy_migrant_centre/" "http://www.reddit.com/r/news/comments/7zxr4t/italy_steps_up_security_in_major_cities_amid/" ...
# Fetch one page of the newest 'france' thread URLs from the 'World News' subreddit.
reddit_links.france <- reddit_urls(search_terms   = Search.Topics[2],
                                   subreddit      = Subreddit.Topics,
                                   sort_by        = "new",
                                   page_threshold = 1)
# Show the structure of the returned link data frame (date, num_comments, title, subreddit, URL).
str(reddit_links.france)
## 'data.frame':    25 obs. of  5 variables:
##  $ date        : chr  "20-05-18" "11-05-18" "20-04-18" "12-04-18" ...
##  $ num_comments: num  4092 485 156 19 16 ...
##  $ title       : chr  "France to fine men up to €750 for wolf-whistling or making sexual comments to women" "France: Europe isn’t US ‘vassal,’ should trade with Iran" "France to ban use of meat terms to describe vegetable-based products -" "Iran tells France not to be influenced by Saudi prince on nuclear deal" ...
##  $ subreddit   : chr  "news" "news" "news" "news" ...
##  $ URL         : chr  "http://www.reddit.com/r/news/comments/8kr6rm/france_to_fine_men_up_to_750_for_wolfwhistling_or/" "http://www.reddit.com/r/news/comments/8imppm/france_europe_isnt_us_vassal_should_trade_with/" "http://www.reddit.com/r/news/comments/8dpnak/france_to_ban_use_of_meat_terms_to_describe/" "http://www.reddit.com/r/news/comments/8brnl8/iran_tells_france_not_to_be_influenced_by_saudi/" ...
# Determine which URLs have '\' character in them due to Unicode translations.
# fixed = TRUE matches the backslash literally (no regex escaping needed);
# -1 at every position means no italy URL contains a raw backslash.
regexpr("\\", reddit_links.italy$URL, fixed = TRUE)  # No errors generated
##  [1] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [24] -1 -1
## attr(,"match.length")
##  [1] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [24] -1 -1
## attr(,"useBytes")
## [1] TRUE
regexpr("\\", reddit_links.france$URL, fixed = TRUE)  # Error generated in eighth URL
## Warning in regexpr("\\", reddit_links.france$URL, fixed = TRUE): input
## string 8 is invalid in this locale
##  [1] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [24] -1 -1
## attr(,"match.length")
##  [1] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [24] -1 -1
# Inspect the URL flagged as invalid in this locale.
reddit_links.france$URL[8]
## [1] "http://www.reddit.com/r/news/comments/873q3p/france_gun_attack_tr\xe8bes_held_memorial_mass_for/"
# Re-encode ALL latin1-contaminated URLs to UTF-8, not just the hard-coded
# eighth entry: validUTF8() flags any element that is not valid UTF-8, so the
# repair still works if a future scrape returns bad bytes at other positions.
bad.urls <- !validUTF8(reddit_links.france$URL)
reddit_links.france$URL[bad.urls] <- iconv(reddit_links.france$URL[bad.urls],
                                           "latin1", "UTF-8")
# Confirm the previously-bad entry is now valid UTF-8.
reddit_links.france$URL[8]
## [1] "http://www.reddit.com/r/news/comments/873q3p/france_gun_attack_trèbes_held_memorial_mass_for/"
# Retrieve posts from the URLs in the 'italy' and 'france' Reddit links.
# reddit_content() fetches each thread's comments over the network, one URL at
# a time -- hence the per-URL progress bar captured in the output below.
reddit_thread.italy <- reddit_content(reddit_links.italy$URL)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |===                                                              |   4%
  |                                                                       
  |=====                                                            |   8%
  |                                                                       
  |========                                                         |  12%
  |                                                                       
  |==========                                                       |  16%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |================                                                 |  24%
  |                                                                       
  |==================                                               |  28%
  |                                                                       
  |=====================                                            |  32%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |=============================                                    |  44%
  |                                                                       
  |===============================                                  |  48%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |====================================                             |  56%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |============================================                     |  68%
  |                                                                       
  |===============================================                  |  72%
  |                                                                       
  |=================================================                |  76%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |=======================================================          |  84%
  |                                                                       
  |=========================================================        |  88%
  |                                                                       
  |============================================================     |  92%
  |                                                                       
  |==============================================================   |  96%
  |                                                                       
  |=================================================================| 100%
reddit_thread.france <- reddit_content(reddit_links.france$URL)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |===                                                              |   4%
  |                                                                       
  |=====                                                            |   8%
  |                                                                       
  |========                                                         |  12%
  |                                                                       
  |==========                                                       |  16%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |================                                                 |  24%
  |                                                                       
  |==================                                               |  28%
  |                                                                       
  |=====================                                            |  32%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |=============================                                    |  44%
  |                                                                       
  |===============================                                  |  48%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |====================================                             |  56%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |============================================                     |  68%
  |                                                                       
  |===============================================                  |  72%
  |                                                                       
  |=================================================                |  76%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |=======================================================          |  84%
  |                                                                       
  |=========================================================        |  88%
  |                                                                       
  |============================================================     |  92%
  |                                                                       
  |==============================================================   |  96%
  |                                                                       
  |=================================================================| 100%
# Extract the first 100 Reddit comments for each country.
# BUG FIX: the original assigned the 'france' thread comments to the 'italy'
# variable and vice versa; each source is now matched to its own variable.
# lapply(x, function(x) x) was an identity no-op, so unlist() alone suffices.
# (Indices past the end of the comment list yield NULLs, which unlist() drops.)
reddit.comments.italy <- unlist(reddit_thread.italy$comment[1:100])
reddit.comments.france <- unlist(reddit_thread.france$comment[1:100])
# Pre-processing of the corpora.
# Build a tm VCorpus from each country's comment vector and snapshot the raw
# corpora to disk so the network scrape does not need to be repeated.
reddit_corpus.italy <- VCorpus(VectorSource(reddit.comments.italy))     # Create corpus for 'italy'
save(reddit_corpus.italy, file = "OriginalCorpus_Italy.RData")
reddit_corpus.france <- VCorpus(VectorSource(reddit.comments.france))    # Create corpus for 'france'
save(reddit_corpus.france, file = "OriginalCorpus_France.RData")

# Strip URLs: delete any run starting with "http" up to the next whitespace.
removeURL <- function(x) {
  gsub("http[^[:space:]]*", "", x)
}
# Keep only letters, digits, whitespace and apostrophes (so contractions
# like "don't" survive); every other character is removed.
allButApost <- function(x) {
  gsub("[^[:alnum:][:space:]']", "", x)
}

# Apply the standard cleaning pipeline to a tm corpus: strip URLs and numbers,
# drop punctuation except apostrophes, lower-case, remove SMART stopwords,
# and stem. Returns the transformed corpus.
getTransCorpus <- function(data.corpus) {
  data.corpus <- tm_map(data.corpus, content_transformer(removeURL))
  data.corpus <- tm_map(data.corpus, removeNumbers)
  data.corpus <- tm_map(data.corpus, content_transformer(allButApost))
  # FIX: bare tolower must be wrapped in content_transformer(); mapping the
  # plain function over a VCorpus degrades documents to character vectors.
  # The original compensated with a trailing PlainTextDocument pass, which is
  # no longer needed once the documents stay PlainTextDocument throughout.
  data.corpus <- tm_map(data.corpus, content_transformer(tolower))
  data.corpus <- tm_map(data.corpus, removeWords, stopwords("SMART"))
  # data.corpus <- tm_map(data.corpus, removePunctuation)
  data.corpus <- tm_map(data.corpus, stemDocument)
  # Explicit return (the original relied on the value of the final assignment).
  data.corpus
}
# Modify the corpora and save the modified objects.
# Run each raw corpus through the cleaning pipeline and persist the results
# so the transformation step can be skipped on reruns.
data.Trans.corpus.italy <- getTransCorpus(reddit_corpus.italy)
save(data.Trans.corpus.italy, file = "ModifiedCorpus_Italy.RData")
data.Trans.corpus.france <- getTransCorpus(reddit_corpus.france)
save(data.Trans.corpus.france, file = "ModifiedCorpus_France.RData")
# Create TDMs for the corpora.
tdm_reddit.italy <- TermDocumentMatrix(data.Trans.corpus.italy)
tdm_reddit.france <- TermDocumentMatrix(data.Trans.corpus.france)
# Word frequency.
# Row sums of the TDM give each term's total count across all documents.
wordFreq.italy <- rowSums(as.matrix(tdm_reddit.italy))
# NOTE(review): 'wordFreq >= 0' keeps every term (counts are never negative);
# presumably a minimum-frequency cutoff such as >= 2 was intended -- confirm.
wordFreq.italy <- subset(wordFreq.italy, wordFreq.italy >= 0)
sorted_wordFreq.italy <- sort(wordFreq.italy, decreasing = TRUE)
wordFreq.france <- rowSums(as.matrix(tdm_reddit.france))
# NOTE(review): same no-op >= 0 filter as above.
wordFreq.france <- subset(wordFreq.france, wordFreq.france >= 0)
sorted_wordFreq.france <- sort(wordFreq.france, decreasing = TRUE)
# Display the first 15 terms in each TDM using a table from the 'knitr' package.
tdm_df_reddit.italy <- data.frame(Term = names(sorted_wordFreq.italy), Freq = sorted_wordFreq.italy)
# FIX: the format.args element is 'big.mark' (one dot); the original's
# 'big..mark' was a typo and would not produce thousands separators.
knitr::kable(tdm_df_reddit.italy[1:15, c(1, 2)],
             digits = 2, format.args = list(big.mark = ","))
Term Freq
batteri batteri 79
appl appl 71
phone phone 59
slow slow 40
year year 26
updat updat 23
android android 21
make make 20
peopl peopl 20
samsung samsung 19
iphon iphon 16
power power 16
devic devic 15
issu issu 15
cpu cpu 14
# Build and display the top-15 term-frequency table for 'france'.
tdm_df_reddit.france <- data.frame(Term = names(sorted_wordFreq.france), Freq = sorted_wordFreq.france)
# FIX: 'big..mark' was a typo for the format() argument 'big.mark'.
knitr::kable(tdm_df_reddit.france[1:15, c(1, 2)],
             digits = 2, format.args = list(big.mark = ","))
Term Freq
peopl peopl 24
migrant migrant 19
countri countri 18
fund fund 16
china china 15
place place 13
expat expat 11
local local 11
world world 10
economi economi 9
emiss emiss 9
good good 8
invad invad 8
live live 8
part part 8
# Display word cloud for 'italy'.
# Seed the RNG so the word-cloud layout is reproducible across knits.
set.seed(125)
# FIX: dropped the redundant require(wordcloud) -- the package is already
# attached via library(wordcloud) at the top of the script, and require()
# should not be used for loading dependencies (it only returns FALSE on
# failure instead of erroring).
wordcloud(words = names(wordFreq.italy),
          freq = wordFreq.italy,
          min.freq = 2,
          random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))

# Display word cloud for 'france'.
# Uses the same settings as the 'italy' cloud: terms occurring at least
# twice, most frequent terms placed first (random.order = FALSE).
# The "could not be fit on page" warnings below are cosmetic -- those terms
# are simply omitted from the plot.
wordcloud(words = names(wordFreq.france),
          freq = wordFreq.france,
          min.freq = 2,
          random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : argument could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : carbon could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : cheap could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : comment could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : comparison could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : complet could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : condemn could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : consequ could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : continu could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : control could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : crap could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : damag could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : desper could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : destroy could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : direct could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : ensur could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : environ could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : evid could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : exampl could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : expens could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : failur could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : florenc could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : footprint could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : fourth could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : fuck could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : general could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : guy could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : head could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : heavili could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : help could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : histori could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : hold could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : home could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : hungari could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : ignor could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : import could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : incom could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : industri could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : interest could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : lankan could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : learn could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : left could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : liter could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : love could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : median could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : middl could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : muslim could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : negoti could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : number could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : occupi could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : offer could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : orang could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : panama could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : pari could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : past could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : posit could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : poverti could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : presid could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : pretti could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : price could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : print could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : produc could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : propos could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : public could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : refuge could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : rent could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : rental could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : republican could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : restrict could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : result could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : ridicul could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : roman could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : scari could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : selfi could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : share could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : shutup could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : side could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : space could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : specul could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : spend could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : stick could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : structur could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : technolog could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : tourist could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : travel could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : trinket could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : understand could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : useless could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : wait could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : wife could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = names(wordFreq.france), freq =
## wordFreq.france, : yeah could not be fit on page. It will not be plotted.

# Initiate bigram creation.
library(RWeka)
# Tokenizer that emits all 2-grams, for use by TermDocumentMatrix below.
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
# NOTE(review): the bigram TDMs are built from the ORIGINAL (uncleaned)
# corpora, so stopword pairs like "of the" dominate the counts -- confirm
# the modified corpora were not intended here.
bigram.italy <- TermDocumentMatrix(reddit_corpus.italy, control = list(tokenize = BigramTokenizer))
bigram.france <- TermDocumentMatrix(reddit_corpus.france, control = list(tokenize = BigramTokenizer))

# Display most frequent bigrams.
# Total each bigram's count across documents and sort descending.
bi_freq.italy <- sort(rowSums(as.matrix(bigram.italy)), decreasing = TRUE)
bi_freq.italy.df <- data.frame(word = names(bi_freq.italy), freq = bi_freq.italy)
bi_freq.france <- sort(rowSums(as.matrix(bigram.france)), decreasing = TRUE)
bi_freq.france.df <- data.frame(word = names(bi_freq.france), freq = bi_freq.france)
# Show the five most frequent bigrams for each country side by side.
head(bi_freq.italy.df, 5) ; head(bi_freq.france.df, 5)
##                    word freq
## the battery the battery   25
## it s               it s   21
## don t             don t   17
## i m                 i m   17
## in the           in the   16
##            word freq
## of the   of the   16
## in the   in the   12
## it s       it s   11
## don t     don t   10
## and the and the    8
# Modified 'sentiment' function: lexicon-based score for a single text,
# computed as (# positive word matches) - (# negative word matches).
#
# Args:
#   text:      a single character string to score.
#   pos.words: character vector of positive lexicon terms.
#   neg.words: character vector of negative lexicon terms.
# Returns: the integer score p - n, or NA when the text contains no
#   sentiment-bearing words at all (so neutral texts can be excluded).
sentiment.na <- function(text, pos.words, neg.words) {
  # Normalise: strip punctuation, control characters and digits; lower-case.
  text <- gsub('[[:punct:]]', '', text)
  text <- gsub('[[:cntrl:]]', '', text)
  text <- gsub('\\d+', '', text)
  text <- tolower(text)
  # split the text into a vector of words
  words <- unlist(strsplit(text, '\\s+'))
  # find which words are positive / negative in the lexicons
  pos.matches <- !is.na(match(words, pos.words))
  neg.matches <- !is.na(match(words, neg.words))
  # calculate the sentiment score
  p <- sum(pos.matches)
  n <- sum(neg.matches)
  #cat(" Positive: ", words[pos.matches], "\n")
  #cat(" Negative: ", words[neg.matches], "\n")
  # FIX: scalar if-condition uses short-circuit && (the original's vectorised
  # & works here only by accident of p and n being length-1).
  if (p == 0 && n == 0) {
    return(NA)
  }
  p - n
}

# Save positive and negative words for sentiment analysis.
# The opinion-lexicon files use ';' to introduce comment lines, which
# scan() skips via comment.char.
# FIX: use '<-' for assignment rather than '=' (R style convention).
pos.words <- scan('positive-words.txt',
                  what = 'character',
                  comment.char = ';')
neg.words <- scan('negative-words.txt',
                  what = 'character',
                  comment.char = ';')
# Perform sentiment analysis on 'italy' and 'france' Reddit comments using existing functions.
# sapply applies sentiment.na per comment; extra args are the two lexicons.
rscores.italy.na <- sapply(reddit.comments.italy, sentiment.na, pos.words, neg.words)
# table() tallies the scores and silently drops the NA (no-sentiment) comments.
table(rscores.italy.na)
## rscores.italy.na
## -10  -7  -6  -4  -3  -2  -1   0   1   2   3   5   6 
##   1   1   2   4   3   5  25  14  13   7   3   1   2
rscores.france.na <- sapply(reddit.comments.france, sentiment.na, pos.words, neg.words)
table(rscores.france.na)
## rscores.france.na
## -7 -6 -3 -2 -1  0  1  2  3  4  5  7 
##  2  1  1  8 13  9 13 10  4  1  1  2
# Create data frames for tables of sentiment scores without non-contributing scores for both countries.
# Long format for ggplot: one row per (country, score) pair, with
#   Country -- repeated label sized to each country's score table,
#   Values  -- the frequency of each score,
#   xVal    -- the numeric score itself (table names are characters).
df.rscores <- data.frame("Country" = rep(c("italy", "france"),
                                           times = c(length(table(rscores.italy.na)),
                                                     length(table(rscores.france.na)))),
                           "Values" = c(table(rscores.italy.na), table(rscores.france.na)),
                           "xVal" = as.numeric(c(names(table(rscores.italy.na)), names(table(rscores.france.na)))))
# Display a side-by-side bar graph of the sentiment-score distributions.
# FIX: use library() rather than require() for loading -- require() only
# returns FALSE on failure instead of stopping with an error.
library(ggplot2)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
ggplot(df.rscores, aes(xVal, Values, fill = Country)) +
  geom_bar(stat = "identity", position = "dodge", color = "white") +
  scale_fill_manual(values = c("darkblue", "forestgreen")) +
  labs(x = "Sentiment Score", y = "Frequency of Scores") +
  # NOTE(review): xVal is numeric, so a discrete scale works here only via the
  # explicit limits vector; scale_x_continuous(breaks = ...) would be cleaner.
  scale_x_discrete(limits = c(sort(unique(df.rscores$xVal)))) +
  # Label each bar with its count, nudged just beyond the bar's end.
  geom_text(aes(x = xVal, y = Values + 0.3 * sign(Values), label = Values),
            position = position_dodge(0.9), vjust = -0.5, size = 3.3)

#

References.