1 Introduction

This notebook records how we select a random sample of 1000 paragraphs to form the final online surveys. Half of these paragraphs are collected from RBA internal documents and the rest are from external publications.

  • For RBA internal publications, the distribution of paragraphs is:
    • Bulletin - 100 paragraphs
    • RBA Speeches - 100 paragraphs
    • SMP Introduction - 100 paragraphs
    • SMP boxes - 100 paragraphs
    • SMP main body - 50 paragraphs
    • FSR - 50 paragraphs
  • The distribution of paragraphs from external publications is:
    • The Economist - 200 paragraphs
    • Grattan Report - 100 paragraphs
    • Bank of England speeches - 100 paragraphs
    • Bank of England IR Introduction - 50 paragraphs
    • Bank of England IR main text - 50 paragraphs

2 Preparation

2.1 Load packages

Load the packages needed for this program.

# Packages used by this program. "knitr" was previously misspelled as "knit";
# the session information at the end of this notebook lists no attached
# package of that name, i.e. that entry never loaded.
load.lib <- c("dplyr", "qdap", "tidytext", "xml2", "rvest", "stringr", "stringi",
              "sylcount", "quanteda", "gtools", "knitr", "kableExtra")
# require() returns FALSE (instead of stopping) for a package it cannot attach,
# so keep the results and report every failure in one place.
pkg_loaded <- vapply(load.lib, require, logical(1), character.only = TRUE)
if (!all(pkg_loaded)) {
  warning("Packages that could not be loaded: ",
          paste(load.lib[!pkg_loaded], collapse = ", "), call. = FALSE)
}

2.2 Set up functions

Three functions are created here that will be used later in this program:

  • create_big_data_from_csv_dir: combining multiple csv files into a consolidated one
  • text_clean_fun: clean text by removing extra punctuation and unknown symbols in the text data
  • fk_grade: calculating the FK grade level for each paragraph
## Read every csv file in a directory and combine them into a single dataset.
##
## dir: path of the directory that holds the csv files.
## Returns the result of passing all files (one data.frame each) to smartbind().
## Stops with an error when the directory contains no csv file.
create_big_data_from_csv_dir <- function(dir) {
  # `pattern` is a regular expression, not a glob. The previous "*.csv" was
  # unanchored, so names that merely contain "csv" could be picked up;
  # "\\.csv$" only accepts names that end in ".csv".
  csv_paths <- list.files(dir, pattern = "\\.csv$", full.names = TRUE)

  # Without this guard an empty directory made the code try to read "dir/".
  if (length(csv_paths) == 0) {
    stop("No .csv files found in directory: ", dir, call. = FALSE)
  }

  data.list <- lapply(csv_paths, read.csv) # read the files into a list of data.frames
  do.call(smartbind, data.list)            # concatenate into one big data.frame
}
## Clean a text vector: replace line breaks, carets and double quotes with a
## space, collapse runs of whitespace into one space, then trim both ends.
text_clean_fun <- function(text_input) {
  no_breaks <- str_replace_all(text_input, pattern = "\n", replacement = " ")
  no_carets <- str_replace_all(no_breaks, pattern = "[\\^]", replacement = " ")
  no_quotes <- str_replace_all(no_carets, pattern = "\"", replacement = " ")
  squeezed <- str_replace_all(no_quotes, pattern = "\\s+", replacement = " ")
  # Disabled step, kept for reference (non-breaking hyphen sign):
  # str_replace_all(pattern = "['^<U+2011>']", replacement = " ")
  str_trim(squeezed, side = "both")
}

# Flesch-Kincaid (FK) grade level from word, sentence and syllable counts.
#
# words:     number of words
# sentences: number of sentences
# syllabus:  number of syllables
# Works element-wise on vectors of counts.
fk_grade <- function(words, sentences, syllabus) {
  sentence_term <- 0.39 * words / sentences # weight on words per sentence
  syllable_term <- 11.8 * syllabus / words  # weight on syllables per word
  sentence_term + syllable_term - 15.59
}

3 Survey paragraphs extraction

To form the final online survey, we randomly extract 1000 paragraphs from 11 source files (combined into 9 source groups in Section 4.1), as per the guidance discussed in Table 1 of the main paper.

# Names of the raw paragraph files. `pattern` is a regular expression, so the
# dot is escaped and the match is anchored to the end of the file name.
file_raw <- list.files(path = "../Survey Data/1_survey_text_raw_paragraphs", pattern = "\\.csv$")

# new.seed <- as.integer(runif(1)*1000)

# Draw a reproducible random sample of paragraphs from one raw source file.
#
# i: position of the source file in the global `file_raw` vector.
# n: number of paragraphs to draw.
#
# Reads the i-th raw file, removes excluded paragraphs and duplicate rows,
# draws n rows, tags them with the source name and writes the result to
# "../Survey Data/2_random_selection_result/random_<source><n>.csv".
# Returns the sampled data frame invisibly.
random_draw_paragraphs <- function(i, n) {
  # Fail early on an index that does not point at a file; an out-of-range
  # index would otherwise yield NA and the read below would look for "NA".
  if (length(i) != 1 || is.na(i) || i < 1 || i > length(file_raw)) {
    stop("`i` must be a single index between 1 and ", length(file_raw),
         call. = FALSE)
  }

  # The same seed is set on every call, so the draw for a source does not
  # depend on which other sources were processed before it.
  set.seed(1211)
  file_location <- paste0("../Survey Data/1_survey_text_raw_paragraphs/", file_raw[i])
  raw_para <- read.csv(file_location)

  # Paragraph exclusions: drop the `index` column, then remove paragraphs that
  # match any of the patterns below (a closing bracket followed by an opening
  # quote mark, specific openings and phrases, URLs) and paragraphs of 140
  # characters or fewer.
  raw_para_clean <- raw_para %>% select(-index) %>%
    filter(!grepl("\\), ‘", para, perl = TRUE)) %>%
    filter(!grepl("\\). ‘", para, perl = TRUE)) %>%
    filter(!grepl("\\) ‘", para, perl = TRUE)) %>%
    filter(!grepl("\\),“", para, perl = TRUE)) %>%
    filter(!str_detect(para, "^Footnote")) %>%
    filter(!str_detect(para, "^APRA")) %>%
    filter(!str_detect(para, "^See")) %>%
    filter(!str_detect(para, "^In her 78-page-strong responses")) %>%
    filter(!str_detect(para, "^Gagnon, E., B. K. Johannsen")) %>%
    filter(!str_detect(para, "NBER Working Paper")) %>%
    filter(!str_detect(para, "^Holston, K.,")) %>%
    filter(!str_detect(para, "http://www.")) %>%
    filter(!str_detect(para, ", Discussion Paper")) %>%
    filter(!str_detect(para, "Thank you")) %>%
    filter(!str_detect(para, "see IMF")) %>%
    filter(!str_detect(para, "^Coombs")) %>%
    filter(!str_detect(para, "^It is a pleasure")) %>%
    filter(!str_detect(para, "Louisa Macdonald ")) %>%
    filter(!str_detect(para, "estimate the model")) %>%
    filter(nchar(as.character(para)) > 140)

  # Remove exact duplicate rows before sampling.
  raw_para_clean <- unique(raw_para_clean)

  # Draw n paragraphs. sample_n() is kept as is so that the draw obtained
  # under the seed above is not altered by switching sampling functions.
  random_selected <- raw_para_clean %>% dplyr::sample_n(n)

  # Source name = file name without its last 4 characters (the extension).
  source_name <- substr(file_raw[i], 1, nchar(file_raw[i]) - 4)
  random_selected$source <- source_name

  output_file_name <- paste0("random_", source_name, n, ".csv")
  write.csv(
    random_selected,
    paste0("../Survey Data/2_random_selection_result/", output_file_name),
    row.names = FALSE
  )

  invisible(random_selected)
}

# Number of paragraphs to draw from each raw source, keyed by source name
# (file name without its 4-character extension) rather than by position in
# `file_raw`. Positions depend on how list.files() orders the names, which can
# differ between locales, and they shift whenever a file is added or removed.
draw_plan <- c(
  "1_frs" = 50,                  # fsr
  "10_economist" = 200,          # economist
  "11_grattan" = 100,            # grattan
  "2_bulletin" = 100,            # bulletin
  "3_rba_speeches" = 100,        # RBA speeches
  "4_smp_intro_2006_2019" = 100, # SMP intro
  "5_smp_main" = 50,             # SMP main
  "6_smp_boxes_06_19" = 100,     # SMP boxes
  "7_boe_main" = 50,             # boe main
  "8_boe_ir_intro" = 50,         # boe intro
  "9_boe_speeches" = 100         # boe speeches
)

# Same derivation of the source name as inside random_draw_paragraphs().
raw_source_names <- substr(file_raw, 1, nchar(file_raw) - 4)

missing_sources <- setdiff(names(draw_plan), raw_source_names)
if (length(missing_sources) > 0) {
  stop("Raw paragraph file(s) not found for: ",
       paste(missing_sources, collapse = ", "), call. = FALSE)
}

for (source_name in names(draw_plan)) {
  random_draw_paragraphs(
    match(source_name, raw_source_names),
    draw_plan[[source_name]]
  )
}

4 Generate survey paragraphs

Then, we divide those 1000 sample paragraphs into 5 online surveys, each containing 200 paragraphs. However, each survey respondent will only receive a random selection of 10 paragraphs from those 200 paragraphs. Because the selection is purely random, two respondents who receive the same survey link may read and rate different paragraphs. This allows us to minimise the number of survey links that we need to send out while still covering a decent number of sample paragraphs.

4.1 Check source of sample paragraphs

Check if the number of paragraphs from each source is the same as shown in Table 1 of the main paper.

# Combine all randomly selected paragraphs (one csv per source) into one data set.
external_survey <- create_big_data_from_csv_dir("../Survey Data/2_random_selection_result")

# Map the 11 text sources onto 9 source groups (G1-G9). Two pairs of sources
# share a group: "1_frs" with "5_smp_main" (G1) and "8_boe_ir_intro" with
# "7_boe_main" (G6).
external_survey$source <- as.character(external_survey$source)

source_group_lookup <- c(
  "1_frs" = "G1",
  "5_smp_main" = "G1",
  "2_bulletin" = "G2",
  "3_rba_speeches" = "G3",
  "4_smp_intro_2006_2019" = "G4",
  "6_smp_boxes_06_19" = "G5",
  "8_boe_ir_intro" = "G6",
  "7_boe_main" = "G6",
  "9_boe_speeches" = "G7",
  "11_grattan" = "G8",
  "10_economist" = "G9"
)
external_survey$source_group <- unname(source_group_lookup[external_survey$source])

# A source that is missing from the lookup gets NA as its group, and rows with
# an NA group are dropped by the later filters on source_group. Stop here so
# such paragraphs cannot disappear from the surveys unnoticed.
unmapped_sources <- unique(external_survey$source[is.na(external_survey$source_group)])
if (length(unmapped_sources) > 0) {
  stop("No source group defined for source(s): ",
       paste(unmapped_sources, collapse = ", "), call. = FALSE)
}

# check the number of paragraphs for each source
external_survey %>%
  group_by(source) %>%
  dplyr::summarise(sample_number = n()) %>%
  kbl(caption = "Count of sample paragraphs by text source") %>%
  kable_classic(full_width = F, html_font = "Cambria")
Count of sample paragraphs by text source
source sample_number
1_frs 50
10_economist 200
11_grattan 100
2_bulletin 100
3_rba_speeches 100
4_smp_intro_2006_2019 100
5_smp_main 50
6_smp_boxes_06_19 100
7_boe_main 50
8_boe_ir_intro 50
9_boe_speeches 100

4.2 Assign random number

Then, we assign a random number from 1 to 5 to each paragraph. This number indicates which survey the paragraph belongs to. After that, we assign a random number from 1 to 10 to indicate which question the paragraph belongs to within that survey. As mentioned previously, there will be 5 survey links, and each question within a survey includes 20 paragraphs.

## For all sources except paragraphs extracted from the journal of The Economist
# Fix the seed once so that the survey and question assignments below are
# reproducible. The sample() calls consume the random stream in sequence, so
# reordering these statements would change the assignments.
set.seed(1210)

# Within each source group, shuffle the labels 1-5 (20 copies of each) across
# the rows, so every group contributes 20 paragraphs to each survey. The
# hard-coded size 100 assumes each of these groups holds exactly 100 rows.
data_g1_8 <- 
  external_survey %>% filter(source_group!="G9") %>%
  group_by(source_group) %>% mutate(survey_group = sample(rep(1:5, 20), 100, replace=F))

# Within each survey / source group combination (20 rows), shuffle the labels
# 1-10 (2 copies of each) to assign a question number.
data_g1_8_v2 <- 
  data_g1_8 %>% 
  group_by(survey_group, source_group) %>% mutate(question_group = sample(rep(1:10, 2), 20, replace=F))


## For paragraphs from the journal of The Economist (it is separated because we add this source later)
# Same scheme with a hard-coded size of 200: 40 copies of each survey label,
# then 4 copies of each question label within every survey (40 rows).
data_g9 <- 
  external_survey %>% filter(source_group=="G9") %>%
  group_by(source_group) %>% mutate(survey_group = sample(rep(1:5, 40), 200, replace=F))

data_g9_v2 <- 
  data_g9 %>% 
  group_by(source_group,survey_group) %>% mutate(question_group = sample(rep(1:10, 4), 40, replace=F))

#combine the selected paragraph and export into the csv file to be pasted into the survey monkey
survey_sample <- rbind(data_g1_8_v2, data_g9_v2)

# Count the paragraphs in every survey / question combination and show the
# counts as a formatted table.
survey_sample %>% group_by(survey_group, question_group) %>% dplyr::summarise(sample_number = n()) %>%
  kbl(caption = "Count of sample paragraphs by survey group and question group") %>%
  kable_classic(full_width = F, html_font = "Cambria")
Count of sample paragraphs by survey group and question group
survey_group question_group sample_number
1 1 20
1 2 20
1 3 20
1 4 20
1 5 20
1 6 20
1 7 20
1 8 20
1 9 20
1 10 20
2 1 20
2 2 20
2 3 20
2 4 20
2 5 20
2 6 20
2 7 20
2 8 20
2 9 20
2 10 20
3 1 20
3 2 20
3 3 20
3 4 20
3 5 20
3 6 20
3 7 20
3 8 20
3 9 20
3 10 20
4 1 20
4 2 20
4 3 20
4 4 20
4 5 20
4 6 20
4 7 20
4 8 20
4 9 20
4 10 20
5 1 20
5 2 20
5 3 20
5 4 20
5 5 20
5 6 20
5 7 20
5 8 20
5 9 20
5 10 20

4.3 Save output

Summarise the number of paragraphs for each survey, and save the output in a csv file for each survey. These files will be copied and pasted into the online surveys. I set eval=FALSE on this chunk so that the step is skipped and the output is not saved again when the notebook is run.

# Write one csv file per survey group (1 to 5); each file holds the rows of
# `survey_sample` assigned to that survey (200 paragraphs from 11 sources).
survey_dir <- "../Survey Data/3_survey_group"
for (i in seq_len(5)) {
  out_name <- paste0("external_survey_", i, ".csv")
  survey_temp <- filter(survey_sample, survey_group == i)
  write.csv(survey_temp, file.path(survey_dir, out_name))
}

5 Session information

The session information for this program is:

# Print the R version, platform, locale and package versions used for this run.
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17763)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_Australia.1252  LC_CTYPE=English_Australia.1252   
## [3] LC_MONETARY=English_Australia.1252 LC_NUMERIC=C                      
## [5] LC_TIME=English_Australia.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.3.4       gtools_3.8.2           quanteda_3.0.0        
##  [4] sylcount_0.2-2         stringi_1.5.3          stringr_1.4.0         
##  [7] rvest_1.0.0            xml2_1.3.2             tidytext_0.3.1        
## [10] qdap_2.4.3             RColorBrewer_1.1-2     qdapTools_1.3.5       
## [13] qdapRegex_0.7.2        qdapDictionaries_1.0.7 dplyr_1.0.6           
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.2          viridisLite_0.4.0   gender_0.5.4       
##  [4] RcppParallel_5.1.4  assertthat_0.2.1    highr_0.9          
##  [7] yaml_2.2.1          slam_0.1-48         pillar_1.6.0       
## [10] lattice_0.20-41     glue_1.4.2          chron_2.3-56       
## [13] digest_0.6.27       colorspace_2.0-1    htmltools_0.5.1.1  
## [16] Matrix_1.2-18       plyr_1.8.6          tm_0.7-8           
## [19] XML_3.99-0.6        pkgconfig_2.0.3     purrr_0.3.4        
## [22] webshot_0.5.2       scales_1.1.1        svglite_2.0.0      
## [25] openxlsx_4.2.3      tibble_3.1.1        openNLP_0.2-7      
## [28] generics_0.1.0      ggplot2_3.3.3       ellipsis_0.3.2     
## [31] NLP_0.2-1           magrittr_2.0.1      crayon_1.4.1       
## [34] evaluate_0.14       stopwords_2.2       tokenizers_0.2.1   
## [37] janeaustenr_0.1.5   fansi_0.4.2         SnowballC_0.7.0    
## [40] tools_4.0.3         data.table_1.14.0   lifecycle_1.0.0    
## [43] munsell_0.5.0       plotrix_3.8-1       zip_2.1.1          
## [46] compiler_4.0.3      systemfonts_1.0.1   rlang_0.4.11       
## [49] grid_4.0.3          RCurl_1.98-1.3      rstudioapi_0.13    
## [52] igraph_1.2.6        bitops_1.0-7        rmarkdown_2.8      
## [55] venneuler_1.1-0     gtable_0.3.0        DBI_1.1.1          
## [58] reshape2_1.4.4      R6_2.5.0            gridExtra_2.3      
## [61] knitr_1.33          utf8_1.2.1          fastmatch_1.1-0    
## [64] openNLPdata_1.5.3-4 rJava_1.0-4         parallel_4.0.3     
## [67] Rcpp_1.0.6          vctrs_0.3.8         wordcloud_2.6      
## [70] tidyselect_1.1.1    xfun_0.22