Text Mining with DataCamp: Sentiment Analysis in R
1. Fast & Dirty: Polarity Scoring
1.1 Let’s talk about our feelings (video)
1.2 Jump right in! Visualize polarity
# Examine the text data
text_df
# Calc overall polarity score
text_df %$% polarity(text)
# Calc polarity score by person
(datacamp_conversation <- text_df %$% polarity(text, person))
# Counts table from datacamp_conversation
counts(datacamp_conversation)
# Plot the conversation polarity
plot(datacamp_conversation)
1.3 TM refresher (I)
# clean_corpus(), tm_define are pre-defined
clean_corpus
tm_define
# Create a VectorSource
tm_vector <- VectorSource(tm_define)
# Apply VCorpus
tm_corpus <- VCorpus(tm_vector)
# Examine the first document's contents
content(tm_corpus[[1]])
# Clean the text
tm_clean <- clean_corpus(tm_corpus)
# Reexamine the contents of the first doc
content(tm_clean[[1]])
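clean_corpus() itself is pre-defined by the exercise; a typical definition (a sketch only, the exercise's exact preprocessing steps may differ) chains a few tm_map() transformations:
clean_corpus <- function(corpus) {
  # tolower() is not a tm transformation, so wrap it in content_transformer()
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stopwords("en"))
  corpus <- tm_map(corpus, stripWhitespace)
  corpus
}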
1.4 TM refresher (II)
Instruction:
# clean_text is pre-defined
clean_text
# Create tf_dtm
tf_dtm <- DocumentTermMatrix(clean_text)
# Create tf_dtm_m
tf_dtm_m <- as.matrix(tf_dtm)
# Dimensions of DTM matrix
dim(tf_dtm_m)
# Subset part of tf_dtm_m for comparison
tf_dtm_m[16:20, 2975:2985]
1.5 How many words do YOU know? Zipf’s law & subjectivity lexicon (video)
1.6 What is a subjectivity lexicon?
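A subjectivity lexicon is a curated word list in which each term carries a polarity or emotion tag. qdap's default lexicon ships as key.pol; a quick way to peek at it (assuming qdap is loaded):
library(qdap)
head(key.pol)  # a data.table: word (x) and polarity value (y)
nrow(key.pol)  # several thousand scored terms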
1.7 Where can you observe Zipf’s law?
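Zipf's law states that a term's frequency is roughly inversely proportional to its rank, so the word at rank r should occur about freq[1] / r times; the expectations column computed below encodes exactly that.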
Instruction:
# Examine sb_words
head(sb_words)
# Create expectations
sb_words$expectations <- sb_words %$%
{freq[1] / rank}
# Create metrics plot
sb_plot <- mjs_plot(sb_words, x = rank, y = freq, show_rollover_text = FALSE)
# Add 1st line
sb_plot <- mjs_line(sb_plot)
# Add 2nd line
sb_plot <- mjs_add_line(sb_plot, expectations)
# Add legend
sb_plot <- mjs_add_legend(sb_plot, legend = c("Frequency", "Expectation"))
# Display plot
sb_plot
1.8 Polarity on actual text
Instruction 1:
# Example statement
positive <- "DataCamp courses are good for learning"
# Calculate polarity of statement
(pos_score <- polarity(positive))
Instruction 2:
# From previous step
positive <- "DataCamp courses are good for learning"
pos_score <- polarity(positive)
# Get counts
(pos_counts <- counts(pos_score))
# Number of positive words
n_good <- length(pos_counts$pos.words[[1]])
# Total number of words
n_words <- pos_counts$wc
# Verify polarity score
n_good / sqrt(n_words)
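Here the statement contains six words, one of which ("good") is in the positive lexicon, so the polarity works out to 1 / sqrt(6) ≈ 0.408, matching the output of polarity().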
1.9 Explore qdap’s polarity & built-in lexicon (video)
1.10 Happy songs!
Instruction:
# Examine conversation
conversation
# Polarity - All
polarity(conversation$text)
# Polarity - Grouped
student_pol <- conversation %$%
polarity(text, student)
# Student results
scores(student_pol)
# Sentence by sentence
counts(student_pol)
# qdap plot
plot(student_pol)
1.11 LOL, this song is wicked good
Instruction 1:
# Examine the key.pol subjectivity lexicon
key.pol
# Negators
negation.words
# Amplifiers
amplification.words
# De-amplifiers
deamplification.words
# Examine
text
Instruction 2:
# Complete the polarity parameters
polarity(
text.var = text$words,
grouping.var = text$speaker,
polarity.frame = key.pol,
negators = negation.words,
amplifiers = amplification.words,
deamplifiers = deamplification.words
)
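qdap searches a small context window around each polarized word (by default roughly four words before and two after) for these valence shifters: negators flip the sign, while amplifiers and deamplifiers raise or lower the word's weight.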
1.12 Stressed Out!
Instruction:
# stressed_out has been pre-defined
head(stressed_out)
# Basic lexicon score
polarity(stressed_out)
# Check the subjectivity lexicon
key.pol[grep("stress", x)]
# New lexicon
custom_pol <- sentiment_frame(positive.words, c(negative.words, "stressed", "turn back"))
# Compare new score
polarity(stressed_out, polarity.frame = custom_pol)
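If the custom frame works as intended, the song's score should drop well below the basic-lexicon score, because "stressed" now contributes negatively instead of being missed.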
2. Sentiment Analysis the Tidytext Way
2.1 Plutchik’s wheel of emotion, polarity vs. sentiment (video)
2.2 One theory of emotion
2.3 DTM vs. tidytext matrix
Instruction:
# As matrix
ag_dtm_m <- as.matrix(ag_dtm)
# Examine line 2206 and columns 245:250
ag_dtm_m[2206, 245:250]
# Tidy up the DTM
ag_tidy <- tidy(ag_dtm)
# Examine tidy with a word you saw
ag_tidy[831:835, ]
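tidy() returns a sparse three-column tibble (document, term, count) holding only the non-zero DTM entries.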
2.4 Getting Sentiment Lexicons
Instruction 1:
# Subset to AFINN
afinn_lex <- get_sentiments("afinn")
# Count AFINN scores
afinn_lex %>%
count(value)
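AFINN scores each word with an integer from -5 (strongly negative) to +5 (strongly positive), so count(value) shows how the lexicon's mass is distributed across that range.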
Instruction 2:
# Subset to nrc
nrc_lex <- get_sentiments("nrc")
# Make the nrc counts object
nrc_counts <- nrc_lex %>%
count(sentiment)
Instruction 3:
# From previous step
nrc_counts <- get_sentiments("nrc") %>%
count(sentiment)
# Plot n vs. sentiment
ggplot(nrc_counts, aes(x = sentiment, y = n)) +
# Add a col layer
geom_col() +
theme_gdocs()
2.5 Bing lexicon with an inner join explanation (video)
2.6 Bing tidy polarity: Simple example
Instruction:
# Qdap polarity
polarity(ag_txt)
# Get Bing lexicon
bing <- get_sentiments("bing")
# Join text to lexicon
ag_bing_words <- inner_join(ag_tidy, bing, by = c("term" = "word"))
# Examine
ag_bing_words
# Get counts by sentiment
ag_bing_words %>%
count(sentiment)
2.7 Bing tidy polarity: Count & spread the white whale
Instruction:
# Inner join
moby_lex_words <- inner_join(m_dick_tidy, bing, by = c("term" = "word"))
moby_lex_words <- moby_lex_words %>%
# Set index to numeric document
mutate(index = as.numeric(document))
moby_count <- moby_lex_words %>%
# Count by sentiment, index
count(sentiment, index)
# Examine the counts
moby_count
moby_spread <- moby_count %>%
# Spread sentiments
spread(sentiment, n, fill = 0)
# Review the spread data
moby_spread
2.8 Bing tidy polarity: Call me Ishmael (with ggplot2)!
Instruction 1:
moby_polarity <- moby %>%
# Inner join to lexicon
inner_join(bing, by = c("term" = "word")) %>%
# Count the sentiment scores
count(sentiment, index) %>%
# Spread the sentiment into positive and negative columns
spread(sentiment, n, fill = 0) %>%
# Add polarity column
mutate(polarity = positive - negative)
Instruction 2:
# From previous step
moby_polarity <- moby %>%
inner_join(bing, by = c("term" = "word")) %>%
count(sentiment, index) %>%
spread(sentiment, n, fill = 0) %>%
mutate(polarity = positive - negative)
# Plot polarity vs. index
ggplot(moby_polarity, aes(y = polarity, x = index)) +
# Add a smooth trend curve
geom_smooth()
2.9 AFINN & NRC methodologies in more detail (video)
2.10 AFINN: I’m your Huckleberry
Instruction 1:
# See abbreviated line 5400
huck %>% filter(line == 5400)
# What are the scores of the sentiment words?
afinn %>% filter(word %in% c("fun", "glad"))
huck_afinn <- huck %>%
# Inner Join to AFINN lexicon
inner_join(afinn, by = c("term" = "word")) %>%
# Count by value and line
count(value, line)
Instruction 2:
# From previous step
huck_afinn <- huck %>%
inner_join(afinn, by = c("term" = "word")) %>%
count(value, line)
huck_afinn_agg <- huck_afinn %>%
# Group by line
group_by(line) %>%
# Sum values times n (by line)
summarize(total_value = sum(value * n))
huck_afinn_agg %>%
# Filter for line 5400
filter(line == 5400)
Instruction 3:
# From previous steps
huck_afinn_agg <- huck %>%
inner_join(afinn, by = c("term" = "word")) %>%
count(value, line) %>%
group_by(line) %>%
summarize(total_value = sum(value * n))
# Plot total_value vs. line
ggplot(huck_afinn_agg, aes(x = line, y = total_value)) +
# Add a smooth trend curve
geom_smooth()
2.11 The wonderful wizard of NRC
Instruction 1:
oz_plutchik <- oz %>%
# Join to nrc lexicon by term = word
inner_join(nrc, by = c("term" = "word")) %>%
# Only consider Plutchik sentiments
filter(!sentiment %in% c("positive", "negative")) %>%
# Group by sentiment
group_by(sentiment) %>%
# Get total count by sentiment
summarize(total_count = sum(count))
Instruction 2:
# From previous step
oz_plutchik <- oz %>%
inner_join(nrc, by = c("term" = "word")) %>%
filter(!sentiment %in% c("positive", "negative")) %>%
group_by(sentiment) %>%
summarize(total_count = sum(count))
# Plot total_count vs. sentiment
ggplot(oz_plutchik, aes(x = sentiment, y = total_count)) +
# Add a column geom
geom_col()
3. Visualizing Sentiment
3.1 Parlor trick or worthwhile? (video)
3.2 Real insight?
3.3 Unhappy ending? Chronological polarity
Instruction 1:
moby_polarity <- moby %>%
# Inner join to the lexicon
inner_join(bing, by = c("term" = "word")) %>%
# Count by sentiment, index
count(sentiment, index) %>%
# Spread sentiments
spread(sentiment, n, fill = 0) %>%
mutate(
# Add polarity field
polarity = positive - negative,
# Add line number field
line_number = row_number()
)
Instruction 2:
# From previous step
moby_polarity <- moby %>%
inner_join(bing, by = c("term" = "word")) %>%
count(sentiment, index) %>%
spread(sentiment, n, fill = 0) %>%
mutate(
polarity = positive - negative,
line_number = row_number()
)
# Plot polarity vs. line_number
ggplot(moby_polarity, aes(x = line_number, y = polarity)) +
# Add a smooth trend curve
geom_smooth() +
# Add a horizontal line at y = 0
geom_hline(yintercept = 0, color = "red") +
# Add a plot title
ggtitle("Moby Dick Chronological Polarity") +
theme_gdocs()
3.4 Word impact, frequency analysis
Instruction 1:
moby_tidy_sentiment <- moby %>%
# Inner join to bing lexicon by term = word
inner_join(bing, by = c("term" = "word")) %>%
# Count by term and sentiment, weighted by count
count(term, sentiment, wt = count) %>%
# Spread sentiment, using n as values
spread(sentiment, n, fill = 0) %>%
# Mutate to add a polarity column
mutate(polarity = positive - negative)
# Review
moby_tidy_sentiment
Instruction 2:
# From previous step
moby_tidy_sentiment <- moby %>%
inner_join(bing, by = c("term" = "word")) %>%
count(term, sentiment, wt = count) %>%
spread(sentiment, n, fill = 0) %>%
mutate(polarity = positive - negative)
moby_tidy_pol <- moby_tidy_sentiment %>%
# Filter for absolute polarity at least 50
filter(abs(polarity) >= 50) %>%
# Add positive/negative status
mutate(
pos_or_neg = ifelse(polarity > 0, "positive", "negative")
)
Instruction 3:
# From previous steps
moby_tidy_pol <- moby %>%
inner_join(bing, by = c("term" = "word")) %>%
count(term, sentiment, wt = count) %>%
spread(sentiment, n, fill = 0) %>%
mutate(polarity = positive - negative) %>%
filter(abs(polarity) >= 50) %>%
mutate(
pos_or_neg = ifelse(polarity > 0, "positive", "negative")
)
# Plot polarity vs. (term reordered by polarity), filled by pos_or_neg
ggplot(moby_tidy_pol, aes(reorder(term, polarity), polarity, fill = pos_or_neg)) +
geom_col() +
ggtitle("Moby Dick: Sentiment Word Frequency") +
theme_gdocs() +
# Rotate text and vertically justify
theme(axis.text.x = element_text(angle = 90, vjust = -0.1))
3.5 Introspection using sentiment analysis (video)
3.6 Divide & conquer: Using polarity for a comparison cloud
Instruction 1:
oz_df <- oz_pol$all %>%
# Select text.var as text and polarity
select(text = text.var, polarity = polarity)
# Apply custom function pol_subsections()
all_terms <- pol_subsections(oz_df)
all_corpus <- all_terms %>%
# Source from a vector
VectorSource() %>%
# Make a volatile corpus
VCorpus()
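pol_subsections() is supplied by the exercise; a plausible sketch (the exact helper may differ) collapses the positive and negative rows into two large documents:
pol_subsections <- function(df) {
  x.pos <- subset(df$text, df$polarity > 0)
  x.neg <- subset(df$text, df$polarity < 0)
  # Collapse each subset into a single document
  c(paste(x.pos, collapse = " "), paste(x.neg, collapse = " "))
}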
Instruction 2:
# From previous step
all_corpus <- oz_pol$all %>%
select(text = text.var, polarity = polarity) %>%
pol_subsections() %>%
VectorSource() %>%
VCorpus()
all_tdm <- TermDocumentMatrix(
# Create TDM from corpus
all_corpus,
control = list(
# Yes, remove the punctuation
removePunctuation = TRUE,
# Use English stopwords
stopwords = stopwords(kind = "en")
)
) %>%
# Convert to matrix
as.matrix() %>%
# Set column names
set_colnames(c("positive", "negative"))
Instruction 3:
# From previous steps
all_tdm <- oz_pol$all %>%
select(text = text.var, polarity = polarity) %>%
pol_subsections() %>%
VectorSource() %>%
VCorpus() %>%
TermDocumentMatrix(
control = list(
removePunctuation = TRUE,
stopwords = stopwords(kind = "en")
)
) %>%
as.matrix() %>%
set_colnames(c("positive", "negative"))
comparison.cloud(
# Create plot from the all_tdm matrix
all_tdm,
# Limit to 50 words
max.words = 50,
# Use darkgreen and darkred colors
colors = c("darkgreen","darkred")
)
3.7 Emotional introspection
Instruction 1:
moby_tidy <- moby %>%
# Inner join to nrc lexicon
inner_join(nrc, by = c("term" = "word")) %>%
# Drop positive or negative
filter(!grepl("positive|negative", sentiment)) %>%
# Count by sentiment and term
count(sentiment, term) %>%
# Spread sentiment, using n for values
spread(sentiment, n, fill = 0) %>%
# Convert to data.frame, making term the row names
data.frame(row.names = "term")
# Examine
head(moby_tidy)
Instruction 2:
# From previous step
moby_tidy <- moby %>%
inner_join(nrc, by = c("term" = "word")) %>%
filter(!grepl("positive|negative", sentiment)) %>%
count(sentiment, term) %>%
spread(sentiment, n, fill = 0) %>%
data.frame(row.names = "term")
# Plot comparison cloud
comparison.cloud(moby_tidy, max.words = 50, title.size = 1.5)
3.8 Compare & contrast stacked bar chart
Instruction 1:
# Review tail of all_books
tail(all_books)
# Count by book & sentiment
books_sent_count <- all_books %>%
# Inner join to nrc lexicon
inner_join(nrc, by = c("term" = "word")) %>%
# Keep only positive or negative
filter(grepl("positive|negative", sentiment)) %>%
# Count by book and by sentiment
count(book, sentiment)
# Review entire object
books_sent_count
Instruction 2:
# From previous step
books_sent_count <- all_books %>%
inner_join(nrc, by = c("term" = "word")) %>%
filter(grepl("positive|negative", sentiment)) %>%
count(book, sentiment)
book_pos <- books_sent_count %>%
# Group by book
group_by(book) %>%
# Mutate to add % positive column
mutate(percent_positive = 100 * n / sum(n))
Instruction 3:
# From previous steps
book_pos <- all_books %>%
inner_join(nrc, by = c("term" = "word")) %>%
filter(grepl("positive|negative", sentiment)) %>%
count(book, sentiment) %>%
group_by(book) %>%
mutate(percent_positive = 100 * n / sum(n))
# Plot percent_positive vs. book, filled by sentiment
ggplot(book_pos, aes(y = percent_positive, x = book, fill = sentiment)) +
# Add a col layer
geom_col()
3.9 Interpreting visualizations (video)
3.10 Kernel density plot
Instruction 1:
ag_afinn <- ag %>%
# Inner join to afinn lexicon
inner_join(afinn, by = c("term" = "word"))
oz_afinn <- oz %>%
# Inner join to afinn lexicon
inner_join(afinn, by = c("term" = "word"))
# Combine
all_df <- bind_rows(agamemnon = ag_afinn, oz = oz_afinn, .id = "book")
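bind_rows() stacks the two data frames, and .id = "book" records each row's origin in a new book column, which the density plot below maps to fill.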
Instruction 2:
# From previous step
all_df <- bind_rows(
agamemnon = ag %>% inner_join(afinn, by = c("term" = "word")),
oz = oz %>%
inner_join(afinn, by = c("term" = "word")),
.id = "book"
)
# Plot value, filled by book
ggplot(all_df, aes(x = value, fill = book)) +
# Set transparency to 0.3
geom_density(alpha = 0.3) +
theme_gdocs() +
ggtitle("AFINN Score Densities")
3.11 Box plot
Instruction:
# Examine
str(all_book_polarity)
# Summary by document
tapply(all_book_polarity$polarity, all_book_polarity$book, summary)
# Box plot
ggplot(all_book_polarity, aes(x = book, y = polarity)) +
geom_boxplot(fill = c("#bada55", "#F00B42", "#F001ED", "#BA6E15"), col = "darkred") +
geom_jitter(position = position_jitter(width = 0.1, height = 0), alpha = 0.02) +
theme_gdocs() +
ggtitle("Book Polarity")
3.12 Radar chart
Instruction 1:
# Review tail of moby_huck
tail(moby_huck)
scores <- moby_huck %>%
# Inner join to lexicon
inner_join(nrc, by = c("term" = "word")) %>%
# Drop positive or negative
filter(!grepl("positive|negative", sentiment)) %>%
# Count by book and sentiment
count(book, sentiment) %>%
# Spread book, using n as values
spread(book, n)
# Review scores
scores
Instruction 2:
# From previous step
scores <- moby_huck %>%
inner_join(nrc, by = c("term" = "word")) %>%
filter(!grepl("positive|negative", sentiment)) %>%
count(book, sentiment) %>%
spread(book, n)
# JavaScript radar chart
chartJSRadar(scores)
3.13 Treemaps for groups of documents
Instruction 1:
book_length <- all_books %>%
# Count number of words per book
count(book)
# Examine the results
book_length
Instruction 2:
# From previous step
book_length <- all_books %>%
count(book)
book_tree <- all_books %>%
# Inner join to afinn lexicon
inner_join(afinn, by = c("term" = "word")) %>%
# Group by author, book
group_by(author, book) %>%
# Calculate mean book value
summarize(mean_value = mean(value)) %>%
# Inner join by book
inner_join(book_length, by = "book")
# Examine the results
book_tree
Instruction 3:
# From previous steps
book_length <- all_books %>%
count(book)
book_tree <- all_books %>%
inner_join(afinn, by = c("term" = "word")) %>%
group_by(author, book) %>%
summarize(mean_value = mean(value)) %>%
inner_join(book_length, by = "book")
treemap(
# Use the book tree
book_tree,
# Index by author and book
index = c("author", "book"),
# Use n as vertex size
vSize = "n",
# Color vertices by mean_value
vColor = "mean_value",
# Draw a value type
type = "value",
title = "Book Sentiment Scores",
palette = c("red", "white", "green")
)
4. Case study: Airbnb reviews
4.1 Refresher on the text mining workflow (video)
4.2 Step 1: What do you want to know?
4.3 Step 2: Identify Text Sources
Instruction:
# bos_reviews_file has been pre-defined
bos_reviews_file
# Load the raw text
bos_reviews <- read.csv(bos_reviews_file, stringsAsFactors = FALSE)
# Structure
str(bos_reviews)
# Dimensions
dim(bos_reviews)
4.4 Quickly examine the basic polarity
Instruction:
# Practice: apply polarity() to the first 6 reviews
practice_pol <- polarity(bos_reviews$comments[1:6])
# Review the object
practice_pol
# Check out the practice polarity
summary(practice_pol$all$polarity)
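# bos_pol is pre-computed for the full data set, presumably via
# bos_pol <- polarity(bos_reviews$comments)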
# Summary for all reviews
summary(bos_pol$all$polarity)
# Plot Boston polarity all element
ggplot(bos_pol$all, aes(x = polarity, y = ..density..)) +
geom_histogram(binwidth = 0.25, fill = "#bada55", colour = "grey60") +
geom_density(size = 0.75) +
theme_gdocs()
4.5 Step 3: Organize (& clean) the text (video)
4.6 Create Polarity Based Corpora
Instruction 1:
pos_terms <- bos_reviews %>%
# Add polarity column
mutate(polarity = bos_pol$all$polarity) %>%
# Filter for positive polarity
filter(polarity > 0) %>%
# Extract comments column
pull(comments) %>%
# Paste and collapse
paste(collapse = " ")
Instruction 2:
neg_terms <- bos_reviews %>%
# Add polarity column
mutate(polarity = bos_pol$all$polarity) %>%
# Filter for negative polarity
filter(polarity < 0) %>%
# Extract comments column
pull(comments) %>%
# Paste and collapse
paste(collapse = " ")
Instruction 3:
# From previous steps
pos_terms <- bos_reviews %>%
mutate(polarity = bos_pol$all$polarity) %>%
filter(polarity > 0) %>%
pull(comments) %>%
paste(collapse = " ")
neg_terms <- bos_reviews %>%
mutate(polarity = bos_pol$all$polarity) %>%
filter(polarity < 0) %>%
pull(comments) %>%
paste(collapse = " ")
# Concatenate the terms
all_corpus <- c(pos_terms, neg_terms) %>%
# Source from a vector
VectorSource() %>%
# Create a volatile corpus
VCorpus()
Instruction 4:
# From previous steps
pos_terms <- bos_reviews %>%
mutate(polarity = bos_pol$all$polarity) %>%
filter(polarity > 0) %>%
pull(comments) %>%
paste(collapse = " ")
neg_terms <- bos_reviews %>%
mutate(polarity = bos_pol$all$polarity) %>%
filter(polarity < 0) %>%
pull(comments) %>%
paste(collapse = " ")
all_corpus <- c(pos_terms, neg_terms) %>%
VectorSource() %>%
VCorpus()
all_tdm <- TermDocumentMatrix(
# Use all_corpus
all_corpus,
control = list(
# Use TFIDF weighting
weighting = weightTfIdf,
# Remove the punctuation
removePunctuation = TRUE,
# Use English stopwords
stopwords = stopwords(kind = "en")
)
)
# Examine the TDM
all_tdm
4.7 Create a Tidy Text Tibble!
Instruction:
# Vector to tibble
tidy_reviews <- bos_reviews %>%
unnest_tokens(word, comments)
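# Note: unnest_tokens() also lowercases and strips punctuation by default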
# Group by and mutate
tidy_reviews <- tidy_reviews %>%
group_by(id) %>%
mutate(original_word_order = seq_along(word))
# Quick review
tidy_reviews
# Load stopwords
data("stop_words")
# Perform anti-join
tidy_reviews_without_stopwords <- tidy_reviews %>%
anti_join(stop_words)
4.8 Compare Tidy Sentiment to Qdap Polarity
Instruction:
# Get the correct lexicon
bing <- get_sentiments("bing")
# Calculate polarity for each review
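# (tidy_reviews is still grouped by id, so the counts below are per review)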
pos_neg <- tidy_reviews %>%
inner_join(bing) %>%
count(sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(polarity = positive - negative)
# Check outcome
summary(pos_neg)
4.9 Step 4: Feature Extraction & Step 5: Time for analysis… almost there! (video)
4.10 Assessing author effort
Instruction 1:
# Review tidy_reviews and pos_neg
tidy_reviews
pos_neg
pos_neg_pol <- tidy_reviews %>%
# Effort is measured as count by id
count(id) %>%
# Inner join to pos_neg
inner_join(pos_neg) %>%
# Add polarity status
mutate(pol = ifelse(polarity >= 0, "Positive", "Negative"))
# Examine results
pos_neg_pol
Instruction 2:
# From previous step
pos_neg_pol <- tidy_reviews %>%
count(id) %>%
inner_join(pos_neg) %>%
mutate(pol = ifelse(polarity >= 0, "Positive", "Negative"))
# Plot n vs. polarity, colored by pol
ggplot(pos_neg_pol, aes(y = n, x = polarity, color = pol)) +
# Add point layer
geom_point(alpha = 0.25) +
# Add smooth layer
geom_smooth(method = "lm", se = FALSE) +
theme_gdocs() +
ggtitle("Relationship between word effort & polarity")
4.11 Comparison Cloud Scaled
Instruction 1:
# Matrix
all_tdm_m <- as.matrix(all_tdm)
# Column names
colnames(all_tdm_m) <- c("positive", "negative")
# Top pos words
order_by_pos <- order(all_tdm_m[, 1], decreasing = TRUE)
# Review top 10 pos words
all_tdm_m[order_by_pos, ] %>% head(10)
# Top neg words
order_by_neg <- order(all_tdm_m[, 2], decreasing = TRUE)
# Review top 10 neg words
all_tdm_m[order_by_neg, ] %>% head(10)
Instruction 2:
# From previous step
all_tdm_m <- as.matrix(all_tdm)
colnames(all_tdm_m) <- c("positive", "negative")
comparison.cloud(
# Use the term-document matrix
all_tdm_m,
# Limit to 20 words
max.words = 20,
colors = c("darkgreen","darkred")
)
4.12 Comparison Cloud
Instruction:
# Review
bos_pol$all[1:6, 1:3]
# Scale/center & append
bos_reviews$scaled_polarity <- scale(bos_pol$all$polarity)
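# scale() centers at mean 0, so values above 0 are above-average polarity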
# Subset positive comments
pos_comments <- subset(bos_reviews$comments, bos_reviews$scaled_polarity > 0)
# Subset negative comments
neg_comments <- subset(bos_reviews$comments, bos_reviews$scaled_polarity < 0)
# Paste and collapse the positive comments
pos_terms <- paste(pos_comments, collapse = " ")
# Paste and collapse the negative comments
neg_terms <- paste(neg_comments, collapse = " ")
# Organize
all_terms <- c(pos_terms, neg_terms)
# VCorpus
all_corpus <- VCorpus(VectorSource(all_terms))
# TDM
all_tdm <- TermDocumentMatrix(
all_corpus,
control = list(
weighting = weightTfIdf,
removePunctuation = TRUE,
stopwords = stopwords(kind = "en")
)
)
# Column names
all_tdm_m <- as.matrix(all_tdm)
colnames(all_tdm_m) <- c("positive", "negative")
# Comparison cloud
comparison.cloud(
all_tdm_m,
max.words = 100,
colors = c("darkgreen", "darkred")
)
4.13 Step 6: Reach a conclusion (video)
4.14 Confirm an expected conclusion
4.15 Choose a less expected insight