This notebook records how we select a random sample of 1000 paragraphs to form the final online surveys. Half of these paragraphs are collected from RBA internal documents and the rest are from external publications.
Load the packages needed for this program.
# Packages attached by this notebook. The entry "knitr" was spelt "knit",
# which does not name a package: the session information at the end of the
# notebook shows it was never attached.
load.lib <- c(
  "dplyr", "qdap", "tidytext", "xml2", "rvest", "stringr", "stringi",
  "sylcount", "quanteda", "gtools", "knitr", "kableExtra"
)
# require() returns FALSE instead of stopping when a package is missing, so
# collect the results and report every package that failed to load in one place.
load.ok <- vapply(load.lib, require, logical(1), character.only = TRUE)
if (!all(load.ok)) {
  warning(
    "Packages not loaded: ", paste(load.lib[!load.ok], collapse = ", "),
    call. = FALSE
  )
}
Three functions are created here that will be used later in this program:
## function for read multiple csv files and convert into a single dataset
# dir: path of the directory that holds the csv files.
# Returns one data.frame obtained by reading every csv file in `dir` with
# read.csv() and stacking the results with smartbind() (gtools).
# Stops with an explicit error when `dir` contains no csv file.
create_big_data_from_csv_dir <- function(dir) {
  # `pattern` is a regular expression, not a glob, so match a literal ".csv"
  # suffix. Files are listed by bare name so the order is unchanged.
  files <- list.files(dir, pattern = "\\.csv$") #locate the files
  if (length(files) == 0) {
    stop("No csv files found in directory: ", dir, call. = FALSE)
  }
  speech_files_link <- file.path(dir, files) #create the full link
  data.list <- lapply(speech_files_link, read.csv) #read the files into a list of data.frames
  data.cat <- do.call(smartbind, data.list) #concatenate into one big data.frame
  return(data.cat) # return the data
}
## text clean function
# Tidy raw text before further processing.
# text_input: character vector of text.
# Returns the text with newlines, "^" characters and double quotes replaced
# by a space, runs of whitespace collapsed to a single space, and leading and
# trailing whitespace removed. The replacements are applied in that order.
text_clean_fun <- function(text_input) {
  clean_text <- text_input %>%
    str_replace_all(pattern = "\n", replacement = " ") %>%
    str_replace_all(pattern = "[\\^]", replacement = " ") %>%
    str_replace_all(pattern = "\"", replacement = " ") %>%
    str_replace_all(pattern = "\\s+", replacement = " ") %>%
    #str_replace_all(pattern = "['^<U+2011>']", replacement = " ") %>% this is a non-breaking hyphen sign
    str_trim(side = "both")
  return(clean_text)
}
#functions to calculate FK
# Flesch-Kincaid (FK) grade level:
#   0.39 * words / sentences + 11.8 * syllables / words - 15.59
# words:     number of words.
# sentences: number of sentences.
# syllabus:  number of syllables (the parameter name is kept as-is so that
#            existing calls using named arguments keep working).
# Arguments may be vectors; the formula is applied element-wise.
# Returns the numeric grade level. No guarding is done for zero `words` or
# `sentences`; the divisions then yield Inf/NaN.
fk_grade <- function(words, sentences, syllabus) {
  0.39 * words / sentences + 11.8 * syllabus / words - 15.59
}
To form the final online survey, we randomly extract 1000 paragraphs from 9 sources as per the guidance discussed in Table 1 of the main paper.
# Names of the raw paragraph files, one csv per text source, in the order
# returned by list.files(). random_draw_paragraphs() indexes into this vector.
# `pattern` is a regular expression, so the dot is escaped and the match is
# anchored to the end of the file name.
file_raw <- list.files(path = "../Survey Data/1_survey_text_raw_paragraphs", pattern = "\\.csv$")
# new.seed <- as.integer(runif(1)*1000)
# Draw a reproducible random sample of paragraphs from one raw source file
# and write it to "../Survey Data/2_random_selection_result/".
# i: position of the source file in the global vector `file_raw`.
# n: number of paragraphs to draw.
# The seed is reset to 1211 on every call, so each draw depends only on the
# content of its own source file. The output file is named
# "random_<source name><n>.csv" and gains a `source` column holding the
# source file name without its ".csv" extension.
random_draw_paragraphs <- function(i, n) {
  stopifnot(length(i) == 1, i >= 1, i <= length(file_raw))

  set.seed(1211)
  file_location <- paste("../Survey Data/1_survey_text_raw_paragraphs/", file_raw[i], sep = "")
  raw_para <- read.csv(file_location)

  #paragraph exclusions
  raw_para_clean <- raw_para %>% select(-index) %>%
    filter(!grepl("\\), ‘", para, perl=TRUE)) %>%
    filter(!grepl("\\). ‘", para, perl=TRUE)) %>%
    filter(!grepl("\\) ‘", para, perl=TRUE)) %>%
    filter(!grepl("\\),“", para, perl=TRUE)) %>%
    filter(!str_detect(para,"^Footnote")) %>%
    filter(!str_detect(para,"^APRA")) %>%
    filter(!str_detect(para,"^See")) %>%
    filter(!str_detect(para,"^In her 78-page-strong responses")) %>%
    filter(!str_detect(para,"^Gagnon, E., B. K. Johannsen")) %>%
    filter(!str_detect(para,"NBER Working Paper")) %>%
    filter(!str_detect(para,"^Holston, K.,")) %>%
    filter(!str_detect(para,"http://www.")) %>%
    filter(!str_detect(para,", Discussion Paper")) %>%
    filter(!str_detect(para,"Thank you")) %>%
    filter(!str_detect(para,"see IMF")) %>%
    filter(!str_detect(para,"^Coombs")) %>%
    filter(!str_detect(para,"^It is a pleasure")) %>%
    filter(!str_detect(para,"Louisa Macdonald ")) %>%
    filter(!str_detect(para,"estimate the model")) %>%
    filter(nchar(as.character(para)) > 140) # keep paragraphs longer than 140 characters
  raw_para_clean <- unique(raw_para_clean) # drop duplicated rows
  # raw_para_clean %>% arrange(nchar(as.character(para)))

  # sample_n() is kept (rather than a newer equivalent) so that the seeded
  # draw stays exactly as it was.
  random_selected <- raw_para_clean %>% dplyr::sample_n(n) # draw n paragraphs at random
  source_name <- substr(file_raw[i], 1, nchar(file_raw[i]) - 4) # strip the ".csv" suffix
  random_selected$source <- source_name
  output_file_name <- paste("random_", source_name, n, ".csv", sep = "")
  write.csv(random_selected, paste("../Survey Data/2_random_selection_result/", output_file_name, sep = ""), row.names = FALSE)
}
# Run the random draw once per raw source file. The first vector gives the
# position of each file in `file_raw`, the second the number of paragraphs to
# draw from it; calls are made in index order 1 to 11.
invisible(Map(
  random_draw_paragraphs,
  1:11,
  c(
    50,  # 1: fsr
    200, # 2: economist
    100, # 3: grattan
    100, # 4: bulletin
    100, # 5: RBA speeches
    100, # 6: SMP intro
    50,  # 7: SMP main
    100, # 8: SMP boxes
    50,  # 9: boe main
    50,  # 10: boe intro
    100  # 11: boe speeches
  )
))
Then, we divide those 1000 sample paragraphs into 5 online surveys, each containing 200 paragraphs. However, each survey respondent will only receive a random selection of 10 paragraphs from those 200 paragraphs. Because the selection is purely random, two respondents who receive the same survey link may read and rate different paragraphs. This allows us to minimise the number of survey links that we need to send out while still covering a decent number of sample paragraphs.
Check if the number of paragraphs from each source is the same as shown in Table 1 of the main paper.
# Stack the per-source random draws into a single data frame.
external_survey <- create_big_data_from_csv_dir("../Survey Data/2_random_selection_result")

# Map the 11 text sources onto 9 source groups (G1 to G9). Two pairs of
# sources share a group (G1 and G6); a source not listed below gets NA.
# Survey numbers from 1 to 5 are drawn within each group further down.
external_survey$source <- as.character(external_survey$source)

external_survey$source_group <- case_when(
  external_survey$source %in% c("1_frs", "5_smp_main") ~ "G1",
  external_survey$source == "2_bulletin" ~ "G2",
  external_survey$source == "3_rba_speeches" ~ "G3",
  external_survey$source == "4_smp_intro_2006_2019" ~ "G4",
  external_survey$source == "6_smp_boxes_06_19" ~ "G5",
  external_survey$source %in% c("8_boe_ir_intro", "7_boe_main") ~ "G6",
  external_survey$source == "9_boe_speeches" ~ "G7",
  external_survey$source == "11_grattan" ~ "G8",
  external_survey$source == "10_economist" ~ "G9"
)
#check the number of paragraphs for each
external_survey %>%
  group_by(source) %>%
  dplyr::summarise(sample_number = n()) %>%
  kbl(caption = "Count of sample paragraphs by text source") %>%
  kable_classic(full_width = FALSE, html_font = "Cambria")
source | sample_number |
---|---|
1_frs | 50 |
10_economist | 200 |
11_grattan | 100 |
2_bulletin | 100 |
3_rba_speeches | 100 |
4_smp_intro_2006_2019 | 100 |
5_smp_main | 50 |
6_smp_boxes_06_19 | 100 |
7_boe_main | 50 |
8_boe_ir_intro | 50 |
9_boe_speeches | 100 |
Then, we assign a random number from 1 to 5 to each paragraph. This number indicates which survey the paragraph belongs to. After that, we assign a random number from 1 to 10 to indicate which question the paragraph belongs to within the survey. As mentioned previously, there will be 5 survey links, and each question within a survey includes 20 paragraphs.
## For all sources except paragraphs extracted from the journal of The Economist
# The seed, the order of the statements and the order of the grouping
# variables all determine how the random stream is consumed, so they are
# kept exactly as they were to reproduce the same assignment.
set.seed(1210)
# Within each source group, shuffle a balanced vector holding 20 copies of
# each survey number 1 to 5. This requires exactly 100 rows per group.
data_g1_8 <-
  external_survey %>% filter(source_group != "G9") %>%
  group_by(source_group) %>% mutate(survey_group = sample(rep(1:5, 20), 100, replace = FALSE))

# Within each survey and source group (20 rows), shuffle two copies of each
# question number 1 to 10.
data_g1_8_v2 <-
  data_g1_8 %>%
  group_by(survey_group, source_group) %>% mutate(question_group = sample(rep(1:10, 2), 20, replace = FALSE))

## For paragraphs from the journal of The Economist (it is separated because we add this source later)
# Same scheme with 200 rows: 40 paragraphs per survey, then 4 per question.
data_g9 <-
  external_survey %>% filter(source_group == "G9") %>%
  group_by(source_group) %>% mutate(survey_group = sample(rep(1:5, 40), 200, replace = FALSE))

data_g9_v2 <-
  data_g9 %>%
  group_by(source_group, survey_group) %>% mutate(question_group = sample(rep(1:10, 4), 40, replace = FALSE))

#combine the selected paragraph and export into the csv file to be pasted into the survey monkey
survey_sample <- rbind(data_g1_8_v2, data_g9_v2)

survey_sample %>%
  group_by(survey_group, question_group) %>%
  dplyr::summarise(sample_number = n()) %>%
  kbl(caption = "Count of sample paragraphs by survey group and question group") %>%
  kable_classic(full_width = FALSE, html_font = "Cambria")
survey_group | question_group | sample_number |
---|---|---|
1 | 1 | 20 |
1 | 2 | 20 |
1 | 3 | 20 |
1 | 4 | 20 |
1 | 5 | 20 |
1 | 6 | 20 |
1 | 7 | 20 |
1 | 8 | 20 |
1 | 9 | 20 |
1 | 10 | 20 |
2 | 1 | 20 |
2 | 2 | 20 |
2 | 3 | 20 |
2 | 4 | 20 |
2 | 5 | 20 |
2 | 6 | 20 |
2 | 7 | 20 |
2 | 8 | 20 |
2 | 9 | 20 |
2 | 10 | 20 |
3 | 1 | 20 |
3 | 2 | 20 |
3 | 3 | 20 |
3 | 4 | 20 |
3 | 5 | 20 |
3 | 6 | 20 |
3 | 7 | 20 |
3 | 8 | 20 |
3 | 9 | 20 |
3 | 10 | 20 |
4 | 1 | 20 |
4 | 2 | 20 |
4 | 3 | 20 |
4 | 4 | 20 |
4 | 5 | 20 |
4 | 6 | 20 |
4 | 7 | 20 |
4 | 8 | 20 |
4 | 9 | 20 |
4 | 10 | 20 |
5 | 1 | 20 |
5 | 2 | 20 |
5 | 3 | 20 |
5 | 4 | 20 |
5 | 5 | 20 |
5 | 6 | 20 |
5 | 7 | 20 |
5 | 8 | 20 |
5 | 9 | 20 |
5 | 10 | 20 |
Summarise the number of paragraphs for each survey, and save the output in a csv file for each survey. This will be copied and pasted into the online surveys. I set the chunk option eval=FALSE here so that this step is skipped and the output is not saved.
# extract 5 survey groups (each contains 200 paragraphs from 11 sources)
# Each survey's rows are written to
# "../Survey Data/3_survey_group/external_survey_<i>.csv".
for (i in 1:5) {
  survey_temp <- survey_sample %>% filter(survey_group == i)
  out_name <- paste("external_survey_", i, ".csv", sep = "")
  write.csv(survey_temp, paste("../Survey Data/3_survey_group/", out_name, sep = ""))
}
The session information for this program is:
# Print the R version, platform, locale and package versions of this session.
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17763)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_Australia.1252 LC_CTYPE=English_Australia.1252
## [3] LC_MONETARY=English_Australia.1252 LC_NUMERIC=C
## [5] LC_TIME=English_Australia.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] kableExtra_1.3.4 gtools_3.8.2 quanteda_3.0.0
## [4] sylcount_0.2-2 stringi_1.5.3 stringr_1.4.0
## [7] rvest_1.0.0 xml2_1.3.2 tidytext_0.3.1
## [10] qdap_2.4.3 RColorBrewer_1.1-2 qdapTools_1.3.5
## [13] qdapRegex_0.7.2 qdapDictionaries_1.0.7 dplyr_1.0.6
##
## loaded via a namespace (and not attached):
## [1] httr_1.4.2 viridisLite_0.4.0 gender_0.5.4
## [4] RcppParallel_5.1.4 assertthat_0.2.1 highr_0.9
## [7] yaml_2.2.1 slam_0.1-48 pillar_1.6.0
## [10] lattice_0.20-41 glue_1.4.2 chron_2.3-56
## [13] digest_0.6.27 colorspace_2.0-1 htmltools_0.5.1.1
## [16] Matrix_1.2-18 plyr_1.8.6 tm_0.7-8
## [19] XML_3.99-0.6 pkgconfig_2.0.3 purrr_0.3.4
## [22] webshot_0.5.2 scales_1.1.1 svglite_2.0.0
## [25] openxlsx_4.2.3 tibble_3.1.1 openNLP_0.2-7
## [28] generics_0.1.0 ggplot2_3.3.3 ellipsis_0.3.2
## [31] NLP_0.2-1 magrittr_2.0.1 crayon_1.4.1
## [34] evaluate_0.14 stopwords_2.2 tokenizers_0.2.1
## [37] janeaustenr_0.1.5 fansi_0.4.2 SnowballC_0.7.0
## [40] tools_4.0.3 data.table_1.14.0 lifecycle_1.0.0
## [43] munsell_0.5.0 plotrix_3.8-1 zip_2.1.1
## [46] compiler_4.0.3 systemfonts_1.0.1 rlang_0.4.11
## [49] grid_4.0.3 RCurl_1.98-1.3 rstudioapi_0.13
## [52] igraph_1.2.6 bitops_1.0-7 rmarkdown_2.8
## [55] venneuler_1.1-0 gtable_0.3.0 DBI_1.1.1
## [58] reshape2_1.4.4 R6_2.5.0 gridExtra_2.3
## [61] knitr_1.33 utf8_1.2.1 fastmatch_1.1-0
## [64] openNLPdata_1.5.3-4 rJava_1.0-4 parallel_4.0.3
## [67] Rcpp_1.0.6 vctrs_0.3.8 wordcloud_2.6
## [70] tidyselect_1.1.1 xfun_0.22