For more information see
Crossley, S. A., Kyle, K., & McNamara, D. S. (2017). Sentiment Analysis and Social Cognition Engine (SEANCE): An Automatic Tool for Sentiment, Social Cognition, and Social Order Analysis. Behavior Research Methods, 49 (3), 803-821.
Also called
A common approach to sentiment analysis is to take the mean score for the sentiment in words for texts in different categories and compare them.
Word level sentiment scores can be derived from sentiment dictionaries.
Dictionaries have a key and a value
Tidytext includes a number of sentiment dictionaries (see pipeline below)
NRC lexicon
The Bing lexicon
The AFINN lexicon
Limitations to Dictionary Approaches
#install.packages("textdata") #this contains the affect datasets
# NOTE(review): rm(list = ls()) is discouraged in scripts -- it clears only the
# global environment (not loaded packages or options); prefer restarting R.
rm(list=ls(all=TRUE))
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
# ggplot2 and stringr are core tidyverse packages (already attached above by
# library(tidyverse) -- see the attach message), so these two calls are
# redundant but harmless.
library(ggplot2)
library(stringr)
library(tidytext) # supplies get_sentiments() and unnest_tokens()
get_sentiments("nrc") #this calls in the NRC sentiment dictionary (may prompt a textdata download on first use)
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ℹ 13,862 more rows
nrc_data <- get_sentiments("nrc") #assign dictionary to a variable: word/sentiment pairs
nrc_data #what does it look like? ~13.9K rows; one word can carry several sentiments (e.g. "abandon")
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ℹ 13,862 more rows
NRC is a problematic dictionary because it is just binary and provides no levels of sentiment.
Call in corpus (forum posts)
# Tokenize the corpus: read the forum posts, then split the `text` column into
# one word per row. unnest_tokens() also lowercases and strips punctuation.
# show_col_types = FALSE suppresses the column-specification message.
forum_posts <- read_csv("final_mooc_baker_data.csv", show_col_types = FALSE) %>%
  unnest_tokens(word, text) #one-token-per-row format tokenization
## Rows: 298 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Annonized_name, text, avereage_lecture, average_forum_reads
## dbl (17): nw, No._contributions, No_of_new_threads_started, Page_View, Lectu...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#note that this will put everything in lowercase and remove punctuation
str(forum_posts) # one row per token: ~112K words, with the student metadata repeated on every row
## spc_tbl_ [112,090 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Annonized_name : chr [1:112090] "Member_90" "Member_90" "Member_90" "Member_90" ...
## $ nw : num [1:112090] 26 26 26 26 26 26 26 26 26 26 ...
## $ No._contributions : num [1:112090] 2 2 2 2 2 2 2 2 2 2 ...
## $ No_of_new_threads_started : num [1:112090] 0 0 0 0 0 0 0 0 0 0 ...
## $ Page_View : num [1:112090] 231 231 231 231 231 231 231 231 231 231 ...
## $ Lecture_Action : num [1:112090] 285 285 285 285 285 285 285 285 285 285 ...
## $ Syllabus_Views : num [1:112090] 4 4 4 4 4 4 4 4 4 4 ...
## $ averege_assignment_score : num [1:112090] 1 1 1 1 1 1 1 1 1 1 ...
## $ avereage_lecture : chr [1:112090] "4.75" "4.75" "4.75" "4.75" ...
## $ Average_num_quizzes : num [1:112090] 3 3 3 3 3 3 3 3 3 3 ...
## $ average_forum_reads : chr [1:112090] "11.5" "11.5" "11.5" "11.5" ...
## $ num_upvotes_(total) : num [1:112090] 0 0 0 0 0 0 0 0 0 0 ...
## $ Forum_reputation : num [1:112090] 1 1 1 1 1 1 1 1 1 1 ...
## $ average_video_viewing : num [1:112090] 0.339 0.339 0.339 0.339 0.339 ...
## $ Time_before_deadline_for_first_attempt(Average): num [1:112090] 112991 112991 112991 112991 112991 ...
## $ Time_before_deadline_for_last_attempt(Average) : num [1:112090] 26778 26778 26778 26778 26778 ...
## $ average_page_views : num [1:112090] 32.3 32.3 32.3 32.3 32.3 ...
## $ average_syllabus_views : num [1:112090] 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.5 ...
## $ Completion : num [1:112090] 0 0 0 0 0 0 0 0 0 0 ...
## $ Final_Score : num [1:112090] 0.167 0.167 0.167 0.167 0.167 ...
## $ word : chr [1:112090] "hi" "i" "have" "the" ...
## - attr(*, "spec")=
## .. cols(
## .. Annonized_name = col_character(),
## .. text = col_character(),
## .. nw = col_double(),
## .. No._contributions = col_double(),
## .. No_of_new_threads_started = col_double(),
## .. Page_View = col_double(),
## .. Lecture_Action = col_double(),
## .. Syllabus_Views = col_double(),
## .. averege_assignment_score = col_double(),
## .. avereage_lecture = col_character(),
## .. Average_num_quizzes = col_double(),
## .. average_forum_reads = col_character(),
## .. `num_upvotes_(total)` = col_double(),
## .. Forum_reputation = col_double(),
## .. average_video_viewing = col_double(),
## .. `Time_before_deadline_for_first_attempt(Average)` = col_double(),
## .. `Time_before_deadline_for_last_attempt(Average)` = col_double(),
## .. average_page_views = col_double(),
## .. average_syllabus_views = col_double(),
## .. Completion = col_double(),
## .. Final_Score = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Match each token in the forum posts against the NRC dictionary.
# The join key is spelled out, and because one word can carry several
# sentiments while also occurring many times in the corpus, the join is
# legitimately many-to-many -- declaring it silences the dplyr warning.
nrc_words <- forum_posts %>%
  inner_join(nrc_data, by = "word", relationship = "many-to-many")
## Joining with `by = join_by(word)`
## Warning in inner_join(., nrc_data): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 60 of `x` matches multiple rows in `y`.
## ℹ Row 4511 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
#An inner join matches pairs of observations whenever their keys are equal
#The output of an inner join is a new data frame that contains the key, the x values, and the y values.
#Unmatched rows are not included in the result
str(nrc_words) #down to ~17.5K rows (word-by-sentiment pairs) that have sentiment attached
## spc_tbl_ [17,475 × 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Annonized_name : chr [1:17475] "Member_90" "Member_365" "Member_304" "Member_304" ...
## $ nw : num [1:17475] 26 27 38 38 38 38 38 38 38 38 ...
## $ No._contributions : num [1:17475] 2 1 2 2 2 2 2 2 2 2 ...
## $ No_of_new_threads_started : num [1:17475] 0 0 0 0 0 0 0 0 0 0 ...
## $ Page_View : num [1:17475] 231 513 780 780 780 780 780 780 780 780 ...
## $ Lecture_Action : num [1:17475] 285 1689 536 536 536 ...
## $ Syllabus_Views : num [1:17475] 4 4 7 7 7 7 7 7 7 7 ...
## $ averege_assignment_score : num [1:17475] 1 0.917 1 1 1 ...
## $ avereage_lecture : chr [1:17475] "4.75" "7.5" "7.25" "7.25" ...
## $ Average_num_quizzes : num [1:17475] 3 4 2.71 2.71 2.71 ...
## $ average_forum_reads : chr [1:17475] "11.5" "9" "20.125" "20.125" ...
## $ num_upvotes_(total) : num [1:17475] 0 0 5 5 5 5 5 5 5 5 ...
## $ Forum_reputation : num [1:17475] 1 0 0 0 0 0 0 0 0 0 ...
## $ average_video_viewing : num [1:17475] 0.339 0.962 0.792 0.792 0.792 ...
## $ Time_before_deadline_for_first_attempt(Average): num [1:17475] 112991 1004320 619588 619588 619588 ...
## $ Time_before_deadline_for_last_attempt(Average) : num [1:17475] 26778 992574 618682 618682 618682 ...
## $ average_page_views : num [1:17475] 32.3 64.1 74.5 74.5 74.5 ...
## $ average_syllabus_views : num [1:17475] 1.5 1 1.17 1.17 1.17 ...
## $ Completion : num [1:17475] 0 1 1 1 1 1 1 1 1 1 ...
## $ Final_Score : num [1:17475] 0.167 0.97 1 1 1 ...
## $ word : chr [1:17475] "question" "lines" "obvious" "obvious" ...
## $ sentiment : chr [1:17475] "positive" "fear" "positive" "trust" ...
## - attr(*, "spec")=
## .. cols(
## .. Annonized_name = col_character(),
## .. text = col_character(),
## .. nw = col_double(),
## .. No._contributions = col_double(),
## .. No_of_new_threads_started = col_double(),
## .. Page_View = col_double(),
## .. Lecture_Action = col_double(),
## .. Syllabus_Views = col_double(),
## .. averege_assignment_score = col_double(),
## .. avereage_lecture = col_character(),
## .. Average_num_quizzes = col_double(),
## .. average_forum_reads = col_character(),
## .. `num_upvotes_(total)` = col_double(),
## .. Forum_reputation = col_double(),
## .. average_video_viewing = col_double(),
## .. `Time_before_deadline_for_first_attempt(Average)` = col_double(),
## .. `Time_before_deadline_for_last_attempt(Average)` = col_double(),
## .. average_page_views = col_double(),
## .. average_syllabus_views = col_double(),
## .. Completion = col_double(),
## .. Final_Score = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
#create a count by text for each group of words
# NOTE: the bare `count` (no parentheses) in the original only worked because
# %>% injects the argument; calling count() explicitly also works with the
# native |> pipe and reads unambiguously.
count_data <- nrc_words %>%
  group_by(Annonized_name, sentiment) %>%
  count() # rows per (student, sentiment) pair; result stays grouped
count_data
## # A tibble: 2,232 × 3
## # Groups: Annonized_name, sentiment [2,232]
## Annonized_name sentiment n
## <chr> <chr> <int>
## 1 Member_1 anger 6
## 2 Member_1 anticipation 10
## 3 Member_1 disgust 5
## 4 Member_1 fear 14
## 5 Member_1 joy 7
## 6 Member_1 negative 24
## 7 Member_1 positive 39
## 8 Member_1 sadness 21
## 9 Member_1 surprise 8
## 10 Member_1 trust 16
## # ℹ 2,222 more rows
#widen out the tibble so that each NRC variable has a column
# values_fill = 0L writes an integer zero directly into combinations that have
# no rows (a text with no words of that sentiment), replacing the earlier
# two-step replace(is.na(.), 0) idiom. A zero (not NA) matters downstream.
count_data_wide <- count_data %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0L)
count_data_wide
## # A tibble: 297 × 11
## # Groups: Annonized_name [297]
## Annonized_name anger anticipation disgust fear joy negative positive
## <chr> <int> <int> <int> <int> <int> <int> <int>
## 1 Member_1 6 10 5 14 7 24 39
## 2 Member_101 1 22 3 23 16 56 64
## 3 Member_103 6 10 1 4 4 24 12
## 4 Member_104 1 0 0 2 0 5 2
## 5 Member_107 4 12 3 5 3 10 17
## 6 Member_108 1 0 0 2 0 4 5
## 7 Member_109 1 6 1 1 5 7 7
## 8 Member_110 4 27 4 12 23 18 42
## 9 Member_111 1 1 1 1 1 3 3
## 10 Member_112 0 3 0 0 3 0 6
## # ℹ 287 more rows
## # ℹ 3 more variables: sadness <int>, surprise <int>, trust <int>
#get word count for texts because we need to norm by text length
# NOTE(review): this silently OVERWRITES the `nw` column already present in
# the CSV (the values differ, e.g. 26 vs 25 for the first row). Also,
# str_count(text, "\\W+") counts runs of non-word characters, which is only an
# approximation of the word count -- confirm this is intended.
nw_texts <- read_csv("final_mooc_baker_data.csv") %>%
  mutate(nw = str_count(text, "\\W+")) #this will create new variable called nw using stringr function str_count.
## Rows: 298 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Annonized_name, text, avereage_lecture, average_forum_reads
## dbl (17): nw, No._contributions, No_of_new_threads_started, Page_View, Lectu...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(nw_texts)
## tibble [298 × 21] (S3: tbl_df/tbl/data.frame)
## $ Annonized_name : chr [1:298] "Member_90" "Member_365" "Member_304" "Member_644" ...
## $ text : chr [1:298] "Hi, I have the same question, where i can see the due date and the hard deadline from the quizzes and the progr"| __truncated__ "Well got into some trouble too. My input data is like this: Any hints.. or someone can put one or two lines wit"| __truncated__ "I am on Linux, but one obvious guess would be to try 64-bit version. It is possible that your Windows is actual"| __truncated__ "Hi mates, I'd want to expand my network of connections with people interested in topics taught in this class. F"| __truncated__ ...
## $ nw : int [1:298] 25 27 40 40 42 40 56 59 52 51 ...
## $ No._contributions : num [1:298] 2 1 2 2 2 1 2 4 2 6 ...
## $ No_of_new_threads_started : num [1:298] 0 0 0 0 1 0 1 1 1 0 ...
## $ Page_View : num [1:298] 231 513 780 199 199 49 175 187 406 179 ...
## $ Lecture_Action : num [1:298] 285 1689 536 697 241 ...
## $ Syllabus_Views : num [1:298] 4 4 7 4 10 1 11 2 5 5 ...
## $ averege_assignment_score : num [1:298] 1 0.917 1 1 1 ...
## $ avereage_lecture : chr [1:298] "4.75" "7.5" "7.25" "7.4" ...
## $ Average_num_quizzes : num [1:298] 3 4 2.71 5.33 6 ...
## $ average_forum_reads : chr [1:298] "11.5" "9" "20.125" "1.75" ...
## $ num_upvotes_(total) : num [1:298] 0 0 5 1 0 0 1 1 3 4 ...
## $ Forum_reputation : num [1:298] 1 0 0 0 2 0 1 3 1 1 ...
## $ average_video_viewing : num [1:298] 0.339 0.962 0.792 0.754 0.355 ...
## $ Time_before_deadline_for_first_attempt(Average): num [1:298] 112991 1004320 619588 855332 740341 ...
## $ Time_before_deadline_for_last_attempt(Average) : num [1:298] 26778 992574 618682 855088 739232 ...
## $ average_page_views : num [1:298] 32.3 64.1 74.5 21.6 99.5 ...
## $ average_syllabus_views : num [1:298] 1.5 1 1.17 1 1 ...
## $ Completion : num [1:298] 0 1 1 0 0 0 0 0 1 0 ...
## $ Final_Score : num [1:298] 0.167 0.97 1 0.485 0.167 ...
#join tibbles (nw and sentiment) together and get normed counts for sentiment/nw by text
# (the inner join drops one student with no sentiment words: 298 -> 297 rows)
final_nrc_forum_tib <- nw_texts %>%
  inner_join(count_data_wide, by = "Annonized_name") %>% #join tibbles together
  # mutate_at()/vars() are superseded; across() with .names creates the same
  # anger_normed:trust_normed columns (raw count divided by text length)
  mutate(across(anger:trust, ~ .x / nw, .names = "{.col}_normed")) %>%
  mutate(Completion_string = case_when(Completion == 1 ~ "Complete",
                                       Completion == 0 ~ "Incomplete")) # readable label for plots/tests
#now we have a tibble!
str(final_nrc_forum_tib)
## tibble [297 × 42] (S3: tbl_df/tbl/data.frame)
## $ Annonized_name : chr [1:297] "Member_90" "Member_365" "Member_304" "Member_644" ...
## $ text : chr [1:297] "Hi, I have the same question, where i can see the due date and the hard deadline from the quizzes and the progr"| __truncated__ "Well got into some trouble too. My input data is like this: Any hints.. or someone can put one or two lines wit"| __truncated__ "I am on Linux, but one obvious guess would be to try 64-bit version. It is possible that your Windows is actual"| __truncated__ "Hi mates, I'd want to expand my network of connections with people interested in topics taught in this class. F"| __truncated__ ...
## $ nw : int [1:297] 25 27 40 40 42 40 56 59 52 51 ...
## $ No._contributions : num [1:297] 2 1 2 2 2 1 2 4 2 6 ...
## $ No_of_new_threads_started : num [1:297] 0 0 0 0 1 0 1 1 1 0 ...
## $ Page_View : num [1:297] 231 513 780 199 199 49 175 187 406 179 ...
## $ Lecture_Action : num [1:297] 285 1689 536 697 241 ...
## $ Syllabus_Views : num [1:297] 4 4 7 4 10 1 11 2 5 5 ...
## $ averege_assignment_score : num [1:297] 1 0.917 1 1 1 ...
## $ avereage_lecture : chr [1:297] "4.75" "7.5" "7.25" "7.4" ...
## $ Average_num_quizzes : num [1:297] 3 4 2.71 5.33 6 ...
## $ average_forum_reads : chr [1:297] "11.5" "9" "20.125" "1.75" ...
## $ num_upvotes_(total) : num [1:297] 0 0 5 1 0 0 1 1 3 4 ...
## $ Forum_reputation : num [1:297] 1 0 0 0 2 0 1 3 1 1 ...
## $ average_video_viewing : num [1:297] 0.339 0.962 0.792 0.754 0.355 ...
## $ Time_before_deadline_for_first_attempt(Average): num [1:297] 112991 1004320 619588 855332 740341 ...
## $ Time_before_deadline_for_last_attempt(Average) : num [1:297] 26778 992574 618682 855088 739232 ...
## $ average_page_views : num [1:297] 32.3 64.1 74.5 21.6 99.5 ...
## $ average_syllabus_views : num [1:297] 1.5 1 1.17 1 1 ...
## $ Completion : num [1:297] 0 1 1 0 0 0 0 0 1 0 ...
## $ Final_Score : num [1:297] 0.167 0.97 1 0.485 0.167 ...
## $ anger : int [1:297] 0 0 1 0 0 0 1 0 0 0 ...
## $ anticipation : int [1:297] 0 0 0 1 0 1 3 1 0 1 ...
## $ disgust : int [1:297] 0 0 1 1 0 0 0 2 0 0 ...
## $ fear : int [1:297] 0 1 1 0 0 1 1 2 0 0 ...
## $ joy : int [1:297] 0 0 0 0 1 2 3 0 1 1 ...
## $ negative : int [1:297] 0 0 3 0 1 0 1 2 0 0 ...
## $ positive : int [1:297] 1 0 1 2 2 3 5 2 3 4 ...
## $ sadness : int [1:297] 0 0 3 1 0 0 1 0 0 0 ...
## $ surprise : int [1:297] 0 0 1 0 0 3 2 2 0 0 ...
## $ trust : int [1:297] 0 0 1 2 1 2 3 0 2 2 ...
## $ anger_normed : num [1:297] 0 0 0.025 0 0 ...
## $ anticipation_normed : num [1:297] 0 0 0 0.025 0 ...
## $ disgust_normed : num [1:297] 0 0 0.025 0.025 0 ...
## $ fear_normed : num [1:297] 0 0.037 0.025 0 0 ...
## $ joy_normed : num [1:297] 0 0 0 0 0.0238 ...
## $ negative_normed : num [1:297] 0 0 0.075 0 0.0238 ...
## $ positive_normed : num [1:297] 0.04 0 0.025 0.05 0.0476 ...
## $ sadness_normed : num [1:297] 0 0 0.075 0.025 0 ...
## $ surprise_normed : num [1:297] 0 0 0.025 0 0 ...
## $ trust_normed : num [1:297] 0 0 0.025 0.05 0.0238 ...
## $ Completion_string : chr [1:297] "Incomplete" "Complete" "Complete" "Incomplete" ...
Visualization
# Save the scored tibble; row.names = FALSE avoids writing a junk index column.
write.csv(final_nrc_forum_tib, "nrc_data.csv", row.names = FALSE)
# Mean normed sentiment per completion group, dodged bars, flipped axes.
bar_plot_nrc <- final_nrc_forum_tib %>%
  select(anger_normed:Completion_string) %>% #just the normed scores + label
  pivot_longer(!Completion_string, names_to = "sentiment_type", values_to = "count") %>%
  group_by(Completion_string, sentiment_type) %>% #group by completion and sentiment
  dplyr::summarize(mean_sent = mean(count, na.rm = TRUE), .groups = "drop") %>% #.groups silences the regroup message
  ggplot(aes(x = factor(sentiment_type), y = mean_sent, fill = Completion_string, colour = Completion_string)) +
  geom_bar(stat = "identity", position = "dodge") + #dodge places bars side by side
  xlab("Sentiment type") + # this axis carries the sentiment categories (shown vertically after coord_flip)
  ylab("Mean sentiment") +
  ggtitle("Bar plot for sentiment by student type") +
  coord_flip()
## `summarise()` has grouped output by 'Completion_string'. You can override using
## the `.groups` argument.
bar_plot_nrc
Some results make sense
Quick statistics
A t-test to examine differences
library(rstatix) #for add_significance
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
##
## filter
# One Welch t-test per sentiment category: normed score ~ completion status.
# NOTE(review): ten tests are run with no multiple-comparison correction;
# consider rstatix::adjust_pvalue() before add_significance().
t_test_nrc <- final_nrc_forum_tib%>%
  select(anger_normed: Completion_string) %>% #select data
  pivot_longer(!Completion_string, names_to = "sentiment", values_to = "count") %>% #pivot longer
  group_by(sentiment) %>% #group by sentiment type
  t_test(count ~ Completion_string) %>% #t-test of normed count by completion group, within each sentiment
  add_significance() #adds a p.signif column
print(t_test_nrc)
## # A tibble: 10 × 10
## sentiment .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 anger_normed count Compl… Incom… 179 118 -1.85 208. 0.0655 ns
## 2 anticipation… count Compl… Incom… 179 118 -1.54 213. 0.124 ns
## 3 disgust_norm… count Compl… Incom… 179 118 -1.92 189. 0.0566 ns
## 4 fear_normed count Compl… Incom… 179 118 -1.03 235. 0.302 ns
## 5 joy_normed count Compl… Incom… 179 118 -0.857 201. 0.392 ns
## 6 negative_nor… count Compl… Incom… 179 118 -0.941 216. 0.348 ns
## 7 positive_nor… count Compl… Incom… 179 118 -1.84 243. 0.0665 ns
## 8 sadness_norm… count Compl… Incom… 179 118 -2.55 220. 0.0113 *
## 9 surprise_nor… count Compl… Incom… 179 118 -1.18 176. 0.241 ns
## 10 trust_normed count Compl… Incom… 179 118 -0.756 229. 0.451 ns
#get descriptive stats as well
# funs() was deprecated in dplyr 0.8.0 (see the warning in the rendered
# output); across() with a named list of lambdas builds the same
# *_mean / *_sd columns (interleaved per variable rather than all means first).
nrc_descriptive <- final_nrc_forum_tib %>%
  select(anger_normed:Completion_string) %>% #select data
  group_by(Completion_string) %>% #group by completion status
  summarise(across(anger_normed:trust_normed,
                   list(mean = ~ mean(.x, na.rm = TRUE),
                        sd = ~ sd(.x, na.rm = TRUE)))) #means and SDs for comparison
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## ℹ Please use a list of either functions or lambdas:
##
## # Simple named list: list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`: tibble::lst(mean, median)
##
## # Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(nrc_descriptive)
## # A tibble: 2 × 21
## Completion_string anger_normed_mean anticipation_normed_…¹ disgust_normed_mean
## <chr> <dbl> <dbl> <dbl>
## 1 Complete 0.00369 0.0160 0.00259
## 2 Incomplete 0.00509 0.0185 0.00389
## # ℹ abbreviated name: ¹anticipation_normed_mean
## # ℹ 17 more variables: fear_normed_mean <dbl>, joy_normed_mean <dbl>,
## # negative_normed_mean <dbl>, positive_normed_mean <dbl>,
## # sadness_normed_mean <dbl>, surprise_normed_mean <dbl>,
## # trust_normed_mean <dbl>, anger_normed_sd <dbl>,
## # anticipation_normed_sd <dbl>, disgust_normed_sd <dbl>,
## # fear_normed_sd <dbl>, joy_normed_sd <dbl>, negative_normed_sd <dbl>, …
Results
Interestingly, we see
Let’s check out another sentiment dictionary
The AFINN lexicon assigns words with a score that runs between -5 and 5, with negative scores indicating negative sentiment and positive scores indicating positive sentiment.
So, a bit different than NRC, but it also only has a single variable…
Data Wrangling
afinn <- get_sentiments("afinn") # word/value pairs; value is an integer score from -5 to 5
afinn # ~2.5K scored words
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ℹ 2,467 more rows
#match the tokenized forum-post words against the AFINN dictionary
str(forum_posts) # ~112K tokens across all students (one row per token)
## spc_tbl_ [112,090 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Annonized_name : chr [1:112090] "Member_90" "Member_90" "Member_90" "Member_90" ...
## $ nw : num [1:112090] 26 26 26 26 26 26 26 26 26 26 ...
## $ No._contributions : num [1:112090] 2 2 2 2 2 2 2 2 2 2 ...
## $ No_of_new_threads_started : num [1:112090] 0 0 0 0 0 0 0 0 0 0 ...
## $ Page_View : num [1:112090] 231 231 231 231 231 231 231 231 231 231 ...
## $ Lecture_Action : num [1:112090] 285 285 285 285 285 285 285 285 285 285 ...
## $ Syllabus_Views : num [1:112090] 4 4 4 4 4 4 4 4 4 4 ...
## $ averege_assignment_score : num [1:112090] 1 1 1 1 1 1 1 1 1 1 ...
## $ avereage_lecture : chr [1:112090] "4.75" "4.75" "4.75" "4.75" ...
## $ Average_num_quizzes : num [1:112090] 3 3 3 3 3 3 3 3 3 3 ...
## $ average_forum_reads : chr [1:112090] "11.5" "11.5" "11.5" "11.5" ...
## $ num_upvotes_(total) : num [1:112090] 0 0 0 0 0 0 0 0 0 0 ...
## $ Forum_reputation : num [1:112090] 1 1 1 1 1 1 1 1 1 1 ...
## $ average_video_viewing : num [1:112090] 0.339 0.339 0.339 0.339 0.339 ...
## $ Time_before_deadline_for_first_attempt(Average): num [1:112090] 112991 112991 112991 112991 112991 ...
## $ Time_before_deadline_for_last_attempt(Average) : num [1:112090] 26778 26778 26778 26778 26778 ...
## $ average_page_views : num [1:112090] 32.3 32.3 32.3 32.3 32.3 ...
## $ average_syllabus_views : num [1:112090] 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.5 ...
## $ Completion : num [1:112090] 0 0 0 0 0 0 0 0 0 0 ...
## $ Final_Score : num [1:112090] 0.167 0.167 0.167 0.167 0.167 ...
## $ word : chr [1:112090] "hi" "i" "have" "the" ...
## - attr(*, "spec")=
## .. cols(
## .. Annonized_name = col_character(),
## .. text = col_character(),
## .. nw = col_double(),
## .. No._contributions = col_double(),
## .. No_of_new_threads_started = col_double(),
## .. Page_View = col_double(),
## .. Lecture_Action = col_double(),
## .. Syllabus_Views = col_double(),
## .. averege_assignment_score = col_double(),
## .. avereage_lecture = col_character(),
## .. Average_num_quizzes = col_double(),
## .. average_forum_reads = col_character(),
## .. `num_upvotes_(total)` = col_double(),
## .. Forum_reputation = col_double(),
## .. average_video_viewing = col_double(),
## .. `Time_before_deadline_for_first_attempt(Average)` = col_double(),
## .. `Time_before_deadline_for_last_attempt(Average)` = col_double(),
## .. average_page_views = col_double(),
## .. average_syllabus_views = col_double(),
## .. Completion = col_double(),
## .. Final_Score = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Keep only tokens that appear in the AFINN lexicon; spelling out the join key
# avoids dplyr having to guess (and report) the shared column. AFINN has one
# row per word, so no many-to-many declaration is needed here.
afinn_words <- forum_posts %>%
  inner_join(afinn, by = "word")
## Joining with `by = join_by(word)`
str(afinn_words) #down to around 5K words now with emotion
## spc_tbl_ [5,261 × 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Annonized_name : chr [1:5261] "Member_90" "Member_90" "Member_365" "Member_365" ...
## $ nw : num [1:5261] 26 26 27 27 27 38 38 38 40 40 ...
## $ No._contributions : num [1:5261] 2 2 1 1 1 2 2 2 2 2 ...
## $ No_of_new_threads_started : num [1:5261] 0 0 0 0 0 0 0 0 0 0 ...
## $ Page_View : num [1:5261] 231 231 513 513 513 780 780 780 199 199 ...
## $ Lecture_Action : num [1:5261] 285 285 1689 1689 1689 ...
## $ Syllabus_Views : num [1:5261] 4 4 4 4 4 7 7 7 4 4 ...
## $ averege_assignment_score : num [1:5261] 1 1 0.917 0.917 0.917 ...
## $ avereage_lecture : chr [1:5261] "4.75" "4.75" "7.5" "7.5" ...
## $ Average_num_quizzes : num [1:5261] 3 3 4 4 4 ...
## $ average_forum_reads : chr [1:5261] "11.5" "11.5" "9" "9" ...
## $ num_upvotes_(total) : num [1:5261] 0 0 0 0 0 5 5 5 1 1 ...
## $ Forum_reputation : num [1:5261] 1 1 0 0 0 0 0 0 0 0 ...
## $ average_video_viewing : num [1:5261] 0.339 0.339 0.962 0.962 0.962 ...
## $ Time_before_deadline_for_first_attempt(Average): num [1:5261] 112991 112991 1004320 1004320 1004320 ...
## $ Time_before_deadline_for_last_attempt(Average) : num [1:5261] 26778 26778 992574 992574 992574 ...
## $ average_page_views : num [1:5261] 32.3 32.3 64.1 64.1 64.1 ...
## $ average_syllabus_views : num [1:5261] 1.5 1.5 1 1 1 ...
## $ Completion : num [1:5261] 0 0 1 1 1 1 1 1 0 0 ...
## $ Final_Score : num [1:5261] 0.167 0.167 0.97 0.97 0.97 ...
## $ word : chr [1:5261] "hard" "thank" "trouble" "like" ...
## $ value : num [1:5261] -1 2 -2 2 2 -3 -2 -2 1 1 ...
## - attr(*, "spec")=
## .. cols(
## .. Annonized_name = col_character(),
## .. text = col_character(),
## .. nw = col_double(),
## .. No._contributions = col_double(),
## .. No_of_new_threads_started = col_double(),
## .. Page_View = col_double(),
## .. Lecture_Action = col_double(),
## .. Syllabus_Views = col_double(),
## .. averege_assignment_score = col_double(),
## .. avereage_lecture = col_character(),
## .. Average_num_quizzes = col_double(),
## .. average_forum_reads = col_character(),
## .. `num_upvotes_(total)` = col_double(),
## .. Forum_reputation = col_double(),
## .. average_video_viewing = col_double(),
## .. `Time_before_deadline_for_first_attempt(Average)` = col_double(),
## .. `Time_before_deadline_for_last_attempt(Average)` = col_double(),
## .. average_page_views = col_double(),
## .. average_syllabus_views = col_double(),
## .. Completion = col_double(),
## .. Final_Score = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
#create an average sentiment score by text
mean_data_afinn <- afinn_words %>%
  group_by(Annonized_name) %>% #group by student (all of a student's forum posts together)
  dplyr::summarize(mean_sent = mean(value, na.rm = TRUE)) #mean AFINN value across the student's scored words
str(mean_data_afinn) # 293 students -- a few of the 298 have no AFINN-scored words and drop out here
## tibble [293 × 2] (S3: tbl_df/tbl/data.frame)
## $ Annonized_name: chr [1:293] "Member_1" "Member_101" "Member_103" "Member_104" ...
## $ mean_sent : num [1:293] 0.05405 0.00775 0.03704 -0.25 -0.25 ...
#use all the data to grab up the complete/incomplete labels
all_data <- read_csv("final_mooc_baker_data.csv")%>%
  select(Annonized_name, Completion) %>% #just the data we need
  mutate(Completion_string = case_when(Completion == 1 ~ "Complete",
                                       Completion == 0 ~ "Incomplete")) # any other Completion value would become NA
## Rows: 298 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Annonized_name, text, avereage_lecture, average_forum_reads
## dbl (17): nw, No._contributions, No_of_new_threads_started, Page_View, Lectu...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Completion_string created above is a character column, not a factor
str(all_data) # 298 students: id, numeric Completion, and its string label
## tibble [298 × 3] (S3: tbl_df/tbl/data.frame)
## $ Annonized_name : chr [1:298] "Member_90" "Member_365" "Member_304" "Member_644" ...
## $ Completion : num [1:298] 0 1 1 0 0 0 0 0 1 0 ...
## $ Completion_string: chr [1:298] "Incomplete" "Complete" "Complete" "Incomplete" ...
str(mean_data_afinn)
## tibble [293 × 2] (S3: tbl_df/tbl/data.frame)
## $ Annonized_name: chr [1:293] "Member_1" "Member_101" "Member_103" "Member_104" ...
## $ mean_sent : num [1:293] 0.05405 0.00775 0.03704 -0.25 -0.25 ...
affin_mean_forum <- mean_data_afinn %>%
  left_join(all_data, by = "Annonized_name") #left join: keep all 293 scored students and attach their completion labels
#now we have a tibble we can use
str(affin_mean_forum)
## tibble [293 × 4] (S3: tbl_df/tbl/data.frame)
## $ Annonized_name : chr [1:293] "Member_1" "Member_101" "Member_103" "Member_104" ...
## $ mean_sent : num [1:293] 0.05405 0.00775 0.03704 -0.25 -0.25 ...
## $ Completion : num [1:293] 1 1 1 0 0 0 1 1 0 0 ...
## $ Completion_string: chr [1:293] "Complete" "Complete" "Complete" "Incomplete" ...
Visualize differences
# Bar plot of mean AFINN sentiment by completion status.
# stat = "summary" with no function defaults to mean_se() (see the message in
# the rendered output); requesting the mean explicitly makes the intent clear
# and silences the message.
bar_plot_afinn <- affin_mean_forum %>%
  ggplot(aes(x = Completion_string, y = mean_sent)) +
  geom_bar(stat = "summary", fun = "mean") + #bar height = group mean
  coord_flip() + #flip it around so it is easier to read
  xlab("MOOC completion") + #label x axis
  ylab("Mean sentiment") + #label y axis
  ggtitle("Mean sentiment for student success")
bar_plot_afinn
## No summary function supplied, defaulting to `mean_se()`
Simple Statistics
Will be a t-test as well
str(affin_mean_forum)
## tibble [293 × 4] (S3: tbl_df/tbl/data.frame)
## $ Annonized_name : chr [1:293] "Member_1" "Member_101" "Member_103" "Member_104" ...
## $ mean_sent : num [1:293] 0.05405 0.00775 0.03704 -0.25 -0.25 ...
## $ Completion : num [1:293] 1 1 1 0 0 0 1 1 0 0 ...
## $ Completion_string: chr [1:293] "Complete" "Complete" "Complete" "Incomplete" ...
t.test(mean_sent ~ Completion_string, affin_mean_forum)
##
## Welch Two Sample t-test
##
## data: mean_sent by Completion_string
## t = 0.0054672, df = 240.81, p-value = 0.9956
## alternative hypothesis: true difference in means between group Complete and group Incomplete is not equal to 0
## 95 percent confidence interval:
## -0.2246057 0.2258559
## sample estimates:
## mean in group Complete mean in group Incomplete
## 0.6641793 0.6635542
There are no differences in the overall valence of students that completed the class and those that did not.
Similarities in Words
Extra information…
What are the most common words in forum posts by those that completed the course and those that did not according to AFINN?
#grab up the words
str(afinn_words) # token-level tibble: one row per AFINN-scored token, with its value
## spc_tbl_ [5,261 × 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Annonized_name : chr [1:5261] "Member_90" "Member_90" "Member_365" "Member_365" ...
## $ nw : num [1:5261] 26 26 27 27 27 38 38 38 40 40 ...
## $ No._contributions : num [1:5261] 2 2 1 1 1 2 2 2 2 2 ...
## $ No_of_new_threads_started : num [1:5261] 0 0 0 0 0 0 0 0 0 0 ...
## $ Page_View : num [1:5261] 231 231 513 513 513 780 780 780 199 199 ...
## $ Lecture_Action : num [1:5261] 285 285 1689 1689 1689 ...
## $ Syllabus_Views : num [1:5261] 4 4 4 4 4 7 7 7 4 4 ...
## $ averege_assignment_score : num [1:5261] 1 1 0.917 0.917 0.917 ...
## $ avereage_lecture : chr [1:5261] "4.75" "4.75" "7.5" "7.5" ...
## $ Average_num_quizzes : num [1:5261] 3 3 4 4 4 ...
## $ average_forum_reads : chr [1:5261] "11.5" "11.5" "9" "9" ...
## $ num_upvotes_(total) : num [1:5261] 0 0 0 0 0 5 5 5 1 1 ...
## $ Forum_reputation : num [1:5261] 1 1 0 0 0 0 0 0 0 0 ...
## $ average_video_viewing : num [1:5261] 0.339 0.339 0.962 0.962 0.962 ...
## $ Time_before_deadline_for_first_attempt(Average): num [1:5261] 112991 112991 1004320 1004320 1004320 ...
## $ Time_before_deadline_for_last_attempt(Average) : num [1:5261] 26778 26778 992574 992574 992574 ...
## $ average_page_views : num [1:5261] 32.3 32.3 64.1 64.1 64.1 ...
## $ average_syllabus_views : num [1:5261] 1.5 1.5 1 1 1 ...
## $ Completion : num [1:5261] 0 0 1 1 1 1 1 1 0 0 ...
## $ Final_Score : num [1:5261] 0.167 0.167 0.97 0.97 0.97 ...
## $ word : chr [1:5261] "hard" "thank" "trouble" "like" ...
## $ value : num [1:5261] -1 2 -2 2 2 -3 -2 -2 1 1 ...
## - attr(*, "spec")=
## .. cols(
## .. Annonized_name = col_character(),
## .. text = col_character(),
## .. nw = col_double(),
## .. No._contributions = col_double(),
## .. No_of_new_threads_started = col_double(),
## .. Page_View = col_double(),
## .. Lecture_Action = col_double(),
## .. Syllabus_Views = col_double(),
## .. averege_assignment_score = col_double(),
## .. avereage_lecture = col_character(),
## .. Average_num_quizzes = col_double(),
## .. average_forum_reads = col_character(),
## .. `num_upvotes_(total)` = col_double(),
## .. Forum_reputation = col_double(),
## .. average_video_viewing = col_double(),
## .. `Time_before_deadline_for_first_attempt(Average)` = col_double(),
## .. `Time_before_deadline_for_last_attempt(Average)` = col_double(),
## .. average_page_views = col_double(),
## .. average_syllabus_views = col_double(),
## .. Completion = col_double(),
## .. Final_Score = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Label each scored token with its author's completion status, then count how
# often every (word, completion status) pair occurs, most frequent first.
afinn_word_counts <- afinn_words %>%
  mutate(Completion_string = case_when(Completion == 1 ~ "Complete",
                                       Completion == 0 ~ "Incomplete")) %>%
  count(word, Completion_string) %>%
  arrange(desc(n)) # equivalent to count(..., sort = TRUE)
afinn_word_counts
## # A tibble: 761 × 3
## word Completion_string n
## <chr> <chr> <int>
## 1 thanks Complete 219
## 2 like Complete 175
## 3 wrong Complete 162
## 4 problem Complete 153
## 5 help Complete 137
## 6 no Complete 131
## 7 true Complete 89
## 8 please Complete 81
## 9 good Complete 80
## 10 big Complete 79
## # ℹ 751 more rows
And now let’s graph them out.
# Top-20 AFINN words per completion group, one facet each.
# With scales = "free_y", plain reorder() imposes a single global order that
# breaks within facets; tidytext's reorder_within() + scale_y_reordered()
# (tidytext is already loaded) order the bars independently inside each facet.
afinn_word_counts %>%
  group_by(Completion_string) %>%
  slice_max(n, n = 20) %>% #top 20 words per completion group
  ungroup() %>% #ungroup for the plotting steps below
  mutate(word = reorder_within(word, n, Completion_string)) %>%
  ggplot(aes(n, word, fill = Completion_string)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Completion_string, scales = "free_y") +
  scale_y_reordered() + # strips the reorder_within() suffix from axis labels
  labs(x = "Word count", # x carries the counts; the words are on y
       y = NULL)