Twitter

Load Libraries.

# Load libraries.
library(twitteR)
library(ROAuth)
library(tm)
## Loading required package: NLP
library(SnowballC)
library(stringi)
## Warning: package 'stringi' was built under R version 3.4.4
library(wordcloud)
## Loading required package: RColorBrewer

Twitter in R.

We first need to authenticate our access to Twitter using the setup_twitter_oauth function. I have loaded my credentials contained within the t.api.key and t.api.secret objects.

# Authenticate Twitter session
# load() restores the objects stored in the "twitter_credentials" file; the
# call below expects it to provide t.api.key and t.api.secret.
load(file = "twitter_credentials")
# Both access_token and access_secret are left NULL; the run recorded below
# reported "Using browser based authentication" for this call.
setup_twitter_oauth(t.api.key, t.api.secret,
                   access_token = NULL, access_secret = NULL)
## [1] "Using browser based authentication"
# Search for two countries, Italy and France.
italyTweets <- searchTwitter('italy', lang = "en", n = 200)
head(italyTweets, n = 3)
## [[1]]
## [1] "Saliltoday: RT @yogashar99: Jews and the Parsis are living for nearly thousand years in India but Christians most ungrateful religion have problems htt…"
## 
## [[2]]
## [1] "DTonez1: RT @sevenneuter: This is a medieval depiction of King David on the Walls of the Giants at the Palazzo Trinci in Italy. https://t.co/fimB4su…"
## 
## [[3]]
## [1] "alyza_lustre: RT @eclipsemyg_: As someone who was born and lived in Italy for 16 yrs I'm not surprised to see racist bull**** being said about bts. Itali…"
franceTweets <- searchTwitter('france', lang = "en", n = 200)
head(franceTweets, n = 3)
## [[1]]
## [1] "bunny_suhana: RT @equalitynow: \"I’ve never spoken to her. I don’t know who she is. But she is my sister… and I can’t let her die for defending herself ag…"
## 
## [[2]]
## [1] "basketofjenn: RT @OzraeliAvi: Good. #bantheburqa https://t.co/MsXwu7Xt9n"
## 
## [[3]]
## [1] "HarfenistAndrew: I liked a @YouTube video https://t.co/im2djCkvsx France visit USSR - Anthems"
# Helper that prints a single tweet: the author's screen name, then the text.
# `tweet` must expose getScreenName() and getText(), both called through `$`.
# Output is written with cat(); every tweet is followed by a blank line.
display.tweet <- function(tweet) {
  author <- tweet$getScreenName()
  body <- tweet$getText()
  cat("Screen name:", author, "\nText:", body, "\n\n")
}
# Display 'italy' tweets.
# head() returns at most three tweets, so a search that came back with fewer
# results does not hand NULL entries to display.tweet() the way `[1:3]` would.
# The loop variable is not called `t`, which would shadow base::t().
for (tweet in head(italyTweets, n = 3)) {
  display.tweet(tweet)
}
## Screen name: Saliltoday 
## Text: RT @yogashar99: Jews and the Parsis are living for nearly thousand years in India but Christians most ungrateful religion have problems htt… 
## 
## Screen name: DTonez1 
## Text: RT @sevenneuter: This is a medieval depiction of King David on the Walls of the Giants at the Palazzo Trinci in Italy. https://t.co/fimB4su… 
## 
## Screen name: alyza_lustre 
## Text: RT @eclipsemyg_: As someone who was born and lived in Italy for 16 yrs I'm not surprised to see racist bull**** being said about bts. Itali…
# Display 'france' tweets.
# head() returns at most three tweets, so a search that came back with fewer
# results does not hand NULL entries to display.tweet() the way `[1:3]` would.
# The loop variable is not called `t`, which would shadow base::t().
for (tweet in head(franceTweets, n = 3)) {
  display.tweet(tweet)
}
## Screen name: bunny_suhana 
## Text: RT @equalitynow: "I’ve never spoken to her. I don’t know who she is. But she is my sister… and I can’t let her die for defending herself ag… 
## 
## Screen name: basketofjenn 
## Text: RT @OzraeliAvi: Good. #bantheburqa https://t.co/MsXwu7Xt9n 
## 
## Screen name: HarfenistAndrew 
## Text: I liked a @YouTube video https://t.co/im2djCkvsx France visit USSR - Anthems

Corpora Creation.

# Create the corpora for the 'italy' and 'france' tweets.
# Convert each tweet set into text documents for Corpus function.
# lapply() calls getText() on every tweet and returns a list with one entry
# per tweet.
# NOTE(review): VectorSource() is handed a list here rather than a character
# vector -- confirm tm coerces it as intended.
italyTweets.text <- lapply(italyTweets, function(t) { t$getText()})
franceTweets.text <- lapply(franceTweets, function(t) { t$getText()})
italy.Corpus.source <- VectorSource(italyTweets.text)
france.Corpus.source <- VectorSource(franceTweets.text)
# The inspect() output recorded further down reports both corpora as
# SimpleCorpus objects.
italy.Corpus <- Corpus(italy.Corpus.source)
france.Corpus <- Corpus(france.Corpus.source)

# Save original versions of the corpora.
# These snapshots are written before any transformation is applied.
save(italy.Corpus, file = "OriginalItalyCorpus.RData")
save(france.Corpus, file = "OriginalFranceCorpus.RData")

Transform the Corpora.

We begin by removing URLs using a custom function removeURL [Bos?18], numbers, and accent marks via format conversion to latin-ascii.

# Remove URLs.
# Deletes every token that starts with "http" -- this covers http:// and
# https:// links as well as links that were cut off by tweet truncation --
# up to, but not including, the next whitespace character.
#   x: character vector of texts.
# Returns a character vector of the same length with the URLs removed.
removeURL <- function(x) {
  # One pattern handles both schemes. The gsub() result has to be the value of
  # the function; a gsub() whose result is not used has no effect at all.
  # [^[:space:]] (rather than [^ ]) stops the match at newlines and tabs too,
  # so text on the line after a URL is kept.
  gsub("http[^[:space:]]*", "", x)
}
# Apply removeURL to both corpora; content_transformer() wraps the plain
# string function so tm_map() can use it.
italy.Corpus <- tm_map(italy.Corpus, content_transformer(removeURL))
france.Corpus <- tm_map(france.Corpus, content_transformer(removeURL))

# Remove numbers.
# Digits inside tokens are dropped as well: in the recorded output
# "@yogashar99" shows up as "@yogashar".
italy.Corpus <- tm_map(italy.Corpus, removeNumbers)
france.Corpus <- tm_map(france.Corpus, removeNumbers)

# Remove accent marks.
# "latin-ascii" is passed through tm_map() to stri_trans_general(); in the
# recorded output curly apostrophes and the ellipsis character come out as
# plain ASCII (' and ...).
# NOTE(review): unlike removeURL, stri_trans_general is passed without
# content_transformer() -- confirm this is intended for the corpus class in use.
italy.Corpus <- tm_map(italy.Corpus, stri_trans_general, "latin-ascii")
france.Corpus <- tm_map(france.Corpus, stri_trans_general, "latin-ascii")

inspect(italy.Corpus[1:5])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] RT @yogashar: Jews and the Parsis are living for nearly thousand years in India but Christians most ungrateful religion have problems htt...        
## [2] RT @sevenneuter: This is a medieval depiction of King David on the Walls of the Giants at the Palazzo Trinci in Italy.                              
## [3] RT @eclipsemyg_: As someone who was born and lived in Italy for  yrs I'm not surprised to see racist bull**** being said about bts. Itali...        
## [4] @londoncafe I keep forgetting Italy didn't qualify, but yeah I get it the world cup gets everyone hyped up.                                         
## [5] RT @Drebae_: We going from New York to Cali\nLondon to Paris \nEl Mariachi\nEl Mariachi\nEl Mariachi\nWe going from Tokyo, Italy\nHong Kong to Br...
inspect(france.Corpus[1:5])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] RT @equalitynow: "I've never spoken to her. I don't know who she is. But she is my sister... and I can't let her die for defending herself ag...
## [2] RT @OzraeliAvi: Good. #bantheburqa                                                                                                              
## [3] I liked a @YouTube video  France visit USSR - Anthems                                                                                           
## [4] RT @BiafraHouse: #Cameroon Paul #Biya's Army in Action. Continued GENOCIDE in #Ambazonia Territory by these Beasts! BLACK MAN AND HIS PREDI...  
## [5] RT @elguapo: #TuesdayThoughts \n\nUS soldiers soothe a French girl with a puppy, during the liberation of France,  \n\n#KeepAmericaGreat...

We can see that there is a character denoted by a black diamond with a question mark. This refers to a character that cannot be displayed correctly on screen. Since this can be any character or combination of characters, let’s see if we can find the black-diamond character and save it so we can perform our analyses.

# Change black-diamond-question-mark character to 'illegibleword' string.
# The character was captured once from a tweet that contained it and saved to
# disk; the commented-out lines below record how that was done.
# NOTE(review): the original note said the tweet was the 4th document, while
# the commented code indexes document 3 -- confirm before re-running it.
#splitString <- stringr::str_split(italy.Corpus[[3]]$content, " ")
# Character appears in fifth split string ; determine character at first position.
#charToRemove <- substr(splitString[[1]][5], 1, 1) ; charToRemove
# save(charToRemove, file = "blackdiamondchar.RData")
# Restores the saved object (expected to be `charToRemove`, per the save()
# call recorded above).
load("blackdiamondchar.RData")
# Substitute the black-diamond-question-mark character with ' illegibleword '.
# seq_len() is used instead of 1:length(): for an empty corpus 1:0 would
# iterate over the indices 1 and 0 instead of skipping the loop.
for (i in seq_len(length(italy.Corpus))) {
  italy.Corpus[[i]]$content <- gsub(charToRemove, " illegibleword ", italy.Corpus[[i]]$content)
}
for (i in seq_len(length(france.Corpus))) {
  france.Corpus[[i]]$content <- gsub(charToRemove, " illegibleword ", france.Corpus[[i]]$content)
}

Rather than eliminating the character, I have decided to replace it with the string, illegibleword. While we could remove it as it does not add explicit value, it will be interesting to see how often it appears nonetheless.

Next, we would want to remove any offensive words in the tweets. We can use a list provided by James Parker [Par18] (note: this list has words with mature content). Rather than removing them, let us instead replace them with profaneword much like illegibleword.

# Remove profanity by substituting with 'profaneword' string.
# Create list of bad words provided by James Parker [Par18] via text file download.
# warn = FALSE silences only readLines()'s own warnings (such as the one about
# an incomplete final line), so the original file needs no manual change and
# unrelated warnings are no longer hidden by a blanket suppressWarnings().
profaneWords <- readLines("full-list-of-bad-words-text-file_2018_03_26.txt", warn = FALSE)

# Remove profanity from 'italy.Corpus' and 'france.Corpus'.
#
# maskProfanity() replaces every space-separated token of every document that
# appears in `badWords` with `placeholder`.
#   corpus:      corpus whose documents are read and written through
#                corpus[[i]]$content.
#   badWords:    character vector of words to mask.
#   placeholder: replacement token (default "profaneword").
# Returns the modified corpus.
maskProfanity <- function(corpus, badWords, placeholder = "profaneword") {
  # The comparison ignores case: this step runs before the lowercase
  # conversion later in the script, so a capitalised word would otherwise slip
  # through here and reappear in lower case afterwards.
  badWords <- tolower(badWords)
  # seq_len() skips the loop for an empty corpus (1:0 would not).
  for (i in seq_len(length(corpus))) {
    # Split sentence string into words.
    tokens <- stringr::str_split(corpus[[i]]$content, " ")[[1]]
    # Replace profanity with the placeholder in one vectorised assignment.
    tokens[tolower(tokens) %in% badWords] <- placeholder
    # Recombine words into new sentence.
    corpus[[i]]$content <- paste0(tokens, collapse = " ")
  }
  corpus
}

italy.Corpus <- maskProfanity(italy.Corpus, profaneWords)
france.Corpus <- maskProfanity(france.Corpus, profaneWords)

inspect(italy.Corpus[1:5])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] RT @yogashar: Jews and the Parsis are living for nearly thousand years in India but Christians most ungrateful religion have problems htt...        
## [2] RT @sevenneuter: This is a medieval depiction of King David on the Walls of the Giants at the Palazzo Trinci in Italy.                              
## [3] RT @eclipsemyg_: As someone who was born and lived in Italy for  yrs I'm not surprised to see racist bull**** being said about bts. Itali...        
## [4] @londoncafe I keep forgetting Italy didn't qualify, but yeah I get it the world cup gets everyone hyped up.                                         
## [5] RT @Drebae_: We going from New York to Cali\nLondon to Paris \nEl Mariachi\nEl Mariachi\nEl Mariachi\nWe going from Tokyo, Italy\nHong Kong to Br...
inspect(france.Corpus[1:5])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] RT @equalitynow: "I've never spoken to her. I don't know who she is. But she is my sister... and I can't let her die for defending herself ag...
## [2] RT @OzraeliAvi: Good. #bantheburqa                                                                                                              
## [3] I liked a @YouTube video  France visit USSR - Anthems                                                                                           
## [4] RT @BiafraHouse: #Cameroon Paul #Biya's Army in Action. Continued GENOCIDE in #Ambazonia Territory by these Beasts! BLACK MAN AND HIS PREDI...  
## [5] RT @elguapo: #TuesdayThoughts \n\nUS soldiers soothe a French girl with a puppy, during the liberation of France,  \n\n#KeepAmericaGreat...

Perform more transformation techniques such as lowercase conversions, removal of English stop words, and document stemming.

# Convert to lowercase.
# tolower is a plain string function, so it is wrapped in content_transformer().
italy.Corpus <- tm_map(italy.Corpus, content_transformer(tolower))
france.Corpus <- tm_map(france.Corpus, content_transformer(tolower))

# Remove English stop words.
# Runs after lower-casing and before punctuation is stripped.
italy.Corpus <- tm_map(italy.Corpus, removeWords, stopwords("english"))
france.Corpus <- tm_map(france.Corpus, removeWords, stopwords("english"))

# Stem document.
# NOTE(review): stemming runs while punctuation is still attached to tokens.
# In the recorded output "Italy." in document 2 ends up as "italy" while other
# occurrences become "itali", presumably for that reason -- consider moving the
# punctuation removal ahead of this step so the two are counted as one term.
italy.Corpus <- tm_map(italy.Corpus, content_transformer(stemDocument))
france.Corpus <- tm_map(france.Corpus, content_transformer(stemDocument))

# Remove punctuation.
# In the recorded output this also strips "@", "#", "_" and the censoring
# asterisks ("bull****" becomes "bull").
italy.Corpus <- tm_map(italy.Corpus, removePunctuation)
france.Corpus <- tm_map(france.Corpus, removePunctuation)

# Remove whitespace.
# Last step, so gaps left behind by the removals above are cleaned up.
italy.Corpus <- tm_map(italy.Corpus, content_transformer(stripWhitespace))
france.Corpus <- tm_map(france.Corpus, content_transformer(stripWhitespace))

inspect(italy.Corpus[1:3])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 3
## 
## [1] rt yogashar jew parsi live near thousand year india christian ungrat religion problem htt
## [2] rt sevenneuter mediev depict king david wall giant palazzo trinci italy                  
## [3] rt eclipsemyg someon born live itali yrs surpris see racist bull said bts itali
inspect(france.Corpus[1:3])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 3
## 
## [1] rt equalitynow never spoken know sister let die defend ag
## [2] rt ozraeliavi good bantheburqa                           
## [3] like youtub video franc visit ussr anthem
# Save modified versions of the corpora.
save(italy.Corpus, file = "ModifiedItalyCorpus.RData")
save(france.Corpus, file = "ModifiedFranceCorpus.RData")

Term-Document Matrix and Word Clouds.

# Term-document matrices: one row per term, one column per tweet.
italy.tdm <- TermDocumentMatrix(italy.Corpus)
italy.wordFreq <- sort(rowSums(as.matrix(italy.tdm)), decreasing = TRUE)

# Row sums give the total count of each term across all tweets; sorting in
# decreasing order puts the most frequent terms first.
france.tdm <- TermDocumentMatrix(france.Corpus)
france.wordFreq <- sort(rowSums(as.matrix(france.tdm)), decreasing = TRUE)

# Display wordcloud for 'italy'.
# The seed is set once, here; the 'france' cloud below does not reset it and
# therefore continues from whatever random state this call leaves behind.
set.seed(125)
# Only terms that occur at least 3 times are drawn (min.freq = 3).
# brewer.pal() comes from RColorBrewer, which is loaded together with wordcloud.
wordcloud(words = names(italy.wordFreq),
          freq = italy.wordFreq,
          min.freq = 3,
          random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))

# Display wordcloud for 'france'.
# Same settings as the 'italy' cloud.
wordcloud(words = names(france.wordFreq),
          freq = france.wordFreq,
          min.freq = 3,
          random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))

Sentiment Analysis.

# Create a function to perform sentiment analysis on a corpus.
#
# Scores text as (number of positive words) - (number of negative words).
#   text:      character vector; all elements are pooled into one bag of words.
#   pos.words: character vector of words counted as positive.
#   neg.words: character vector of words counted as negative.
# Returns a single integer; 0 when nothing matches or when the positive and
# negative matches cancel out.
sentiment <- function(text, pos.words, neg.words) {
  text <- gsub('[[:punct:]]', '', text)
  # Control characters include newlines and tabs. They are replaced by a space
  # rather than deleted, so that "great\nday" stays two words instead of
  # becoming the unmatched token "greatday".
  text <- gsub('[[:cntrl:]]', ' ', text)
  text <- gsub('\\d+', '', text)
  text <- tolower(text)
  # split the text into a vector of words
  words <- unlist(strsplit(text, '\\s+'))
  # flag the words found in each lexicon
  is.pos <- words %in% pos.words
  is.neg <- words %in% neg.words
  #cat(" Positive: ", words[is.pos], "\n")
  #cat(" Negative: ", words[is.neg], "\n")
  # calculate the sentiment score
  sum(is.pos) - sum(is.neg)
}

We can also modify our sentiment analysis function to remove instances of scores which do not contribute to the analysis.

# Modified 'sentiment' function: texts that contain no lexicon word at all are
# reported as NA instead of 0, so they can be told apart from texts whose
# positive and negative words cancel out.
#   text:      character vector; all elements are pooled into one bag of words.
#   pos.words: character vector of words counted as positive.
#   neg.words: character vector of words counted as negative.
# Returns NA when no word matches either list, otherwise the integer
# (number of positive words) - (number of negative words).
sentiment.na <- function(text, pos.words, neg.words) {
  text <- gsub('[[:punct:]]', '', text)
  # Control characters include newlines and tabs. They are replaced by a space
  # rather than deleted, so that words separated only by a line break are not
  # glued together into one unmatched token.
  text <- gsub('[[:cntrl:]]', ' ', text)
  text <- gsub('\\d+', '', text)
  text <- tolower(text)
  # split the text into a vector of words
  words <- unlist(strsplit(text, '\\s+'))
  # count the words found in each lexicon
  is.pos <- words %in% pos.words
  is.neg <- words %in% neg.words
  p <- sum(is.pos)
  n <- sum(is.neg)
  #cat(" Positive: ", words[is.pos], "\n")
  #cat(" Negative: ", words[is.neg], "\n")
  # p and n are scalars, so the short-circuit && is the right operator here.
  if (p == 0 && n == 0) {
    return(NA)
  }
  p - n
}

We can use lists of “positive” and “negative” words against which to compare the contents of our tweets.

# Read the positive and negative word lists used for sentiment scoring.
# scan() returns each list as a character vector; everything after a ';' on a
# line of the lexicon files is treated as a comment and skipped.
pos.words <- scan(file = "positive-words.txt", what = "character", comment.char = ";")
neg.words <- scan(file = "negative-words.txt", what = "character", comment.char = ";")

# Determine sentiment analysis score of each corpus.
# vapply() is used instead of sapply() so the result is always a numeric
# vector with one score per document -- also for an empty corpus, where
# sapply() would return an empty list. numeric(1) accepts both the integer
# scores and the NA that sentiment.na() returns for tweets without any
# lexicon word.
scores.italy <- vapply(italy.Corpus,
                       sentiment,
                       FUN.VALUE = numeric(1),
                       pos.words, neg.words)
scores.italy.na <- vapply(italy.Corpus,
                          sentiment.na,
                          FUN.VALUE = numeric(1),
                          pos.words, neg.words)

scores.france <- vapply(france.Corpus,
                        sentiment,
                        FUN.VALUE = numeric(1),
                        pos.words, neg.words)
scores.france.na <- vapply(france.Corpus,
                           sentiment.na,
                           FUN.VALUE = numeric(1),
                           pos.words, neg.words)

Let’s see the sentiment scores for each country’s tweets.

# Display table of sentiment scores for 'italy' and 'france'.
table(scores.italy) ; table(scores.france)
## scores.italy
##  -2  -1   0   1   2   3 
##   2  35 135  26   1   1
## scores.france
##  -3  -2  -1   0   1   2   3 
##   2   9  22 133  26   7   1
# Display table of sentiment scores without non-contributing scores.
table(scores.italy.na) ; table(scores.france.na)
## scores.italy.na
## -2 -1  0  1  2  3 
##  2 35  1 26  1  1
## scores.france.na
## -3 -2 -1  0  1  2  3 
##  2  9 22  9 26  7  1

We can also convert the tables of scores into data frames, and use ggplot2 to create a side-by-side bar plot of the original sentiment scores.

# Long-format data frame of score frequencies for both countries:
#   Country - "italy" or "france", repeated once per distinct score,
#   Values  - number of tweets with that score,
#   xVal    - the score itself, converted back to a number.
# The tables are computed once inside local() so no helper variables are left
# behind in the workspace.
df.scores <- local({
  italy.counts <- table(scores.italy)
  france.counts <- table(scores.france)
  data.frame(
    "Country" = rep(c("italy", "france"),
                    times = c(length(italy.counts), length(france.counts))),
    "Values" = c(italy.counts, france.counts),
    "xVal" = as.numeric(c(names(italy.counts), names(france.counts)))
  )
})

# Display a side-by-side bar graph of 'df.scores' using ggplot2.
# library() stops with an error right here if ggplot2 is not installed;
# require() would only return FALSE and let the script fail later at the
# first ggplot call.
library(ggplot2)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
# Dodged bar chart: one bar per country for every sentiment score, with the
# count printed above each bar.
# xVal is numeric, so a continuous x scale with one break per observed score
# is used; a discrete scale given numeric limits makes ggplot2 warn about
# continuous limits supplied to a discrete scale.
ggplot(df.scores, aes(xVal, Values, fill = Country)) +
  geom_bar(stat = "identity", position = "dodge", color = "white") +
  scale_fill_manual(values = c("darkblue", "forestgreen")) +
  labs(x = "Sentiment Score", y = "Frequency of Scores") +
  scale_x_continuous(breaks = sort(unique(df.scores$xVal))) +
  # Labels use the same dodge width as the bars so they sit above them.
  geom_text(aes(x = xVal, y = Values + 0.3 * sign(Values), label = Values),
            position = position_dodge(0.9), vjust = -0.5, size = 3.3)

We can do the same for the modified sentiment score function results, too.

# Long-format data frame of score frequencies for both countries, built from
# the scores that exclude tweets without any lexicon word:
#   Country - "italy" or "france", repeated once per distinct score,
#   Values  - number of tweets with that score,
#   xVal    - the score itself, converted back to a number.
# The tables are computed once inside local() so no helper variables are left
# behind in the workspace.
df.scores.na <- local({
  italy.counts <- table(scores.italy.na)
  france.counts <- table(scores.france.na)
  data.frame(
    "Country" = rep(c("italy", "france"),
                    times = c(length(italy.counts), length(france.counts))),
    "Values" = c(italy.counts, france.counts),
    "xVal" = as.numeric(c(names(italy.counts), names(france.counts)))
  )
})

# Display a side-by-side bar graph of 'df.scores.na' using ggplot2.
# Dodged bar chart: one bar per country for every sentiment score, with the
# count printed above each bar.
# xVal is numeric, so a continuous x scale with one break per observed score
# is used; a discrete scale given numeric limits makes ggplot2 warn about
# continuous limits supplied to a discrete scale.
# ggplot() is called without the ggplot2:: prefix, like every other ggplot2
# function in this call, all of which need the package to be attached.
ggplot(df.scores.na, aes(xVal, Values, fill = Country)) +
  geom_bar(stat = "identity", position = "dodge", color = "white") +
  scale_fill_manual(values = c("darkblue", "forestgreen")) +
  labs(x = "Sentiment Score", y = "Frequency of Scores") +
  scale_x_continuous(breaks = sort(unique(df.scores.na$xVal))) +
  # Labels use the same dodge width as the bars so they sit above them.
  geom_text(aes(x = xVal, y = Values + 0.3 * sign(Values), label = Values),
            position = position_dodge(0.9), vjust = -0.5, size = 3.3)

References.