#################################################################################################
################################ Name: Text Statistics Functions ################################
#### Creator: Joan Huang ####
#### Date: 8 May 2019 ####
#### Purpose:   
########### 1) count of words; 
########### 2) count of sentences; 
########### 3) count of syllables; 
########### 4) calculate readability scores (FK grade level and FRES scores) ####################
#### Requirement: a dataset with a `paragraph` column of clean paragraphs, such as text_data ####
#### Output: text_stats <- para_stats_function(text_data) ####
#################################################################################################


## load needed libraries - please note that you may need to install any packages that are not installed yet

library(dplyr)
library(qdap) #for wc function
library(tidytext) #for unnest_tokens function
library(xml2)
library(rvest)
library(stringr)
library(stringi)
library(sylcount)
library(quanteda)

## functions to calculate readability scores, given the counts of words, sentences, and syllables
## please note that these two functions are used by the text statistics function further below
## Flesch-Kincaid grade level.
##
## Args:
##   words:     number of words in the text
##   sentences: number of sentences in the text
##   syllabus:  number of syllables in the text
##
## Returns:
##   0.39 * words / sentences + 11.8 * syllabus / words - 15.59
##   (vectorised over its arguments, like ordinary R arithmetic).
fk_grade <- function(words, sentences, syllabus) {
  sentence_term <- 0.39 * words / sentences
  syllable_term <- 11.8 * syllabus / words
  sentence_term + syllable_term - 15.59
}

## Flesch Reading Ease Score (FRES).
##
## Args:
##   words:     number of words in the text
##   sentences: number of sentences in the text
##   syllabus:  number of syllables in the text
##
## Returns:
##   206.835 - 1.015 * words / sentences - 84.6 * syllabus / words
##   (vectorised over its arguments, like ordinary R arithmetic).
FRES <- function(words, sentences, syllabus) {
  sentence_penalty <- 1.015 * words / sentences
  syllable_penalty <- 84.6 * syllabus / words
  206.835 - sentence_penalty - syllable_penalty
}

## Count commas, other punctuation, and digits in each paragraph.
##
## Args:
##   input_data: a data frame with a character column named `paragraph`.
##
## Returns:
##   `input_data` with three added columns:
##     comma_count - number of "," characters in `paragraph`
##     punc_count  - number of runs of consecutive punctuation characters,
##                   with commas excluded
##     digit_count - number of digit characters in `paragraph`
sentence_feature_count <- function(input_data) {
  input_data %>%
    mutate(
      comma_count = str_count(paragraph, ","),
      # Blank out commas before counting punctuation runs. Subtracting the
      # comma count from the run count (the previous approach) undercounts
      # whenever a comma touches other punctuation (e.g. `,"`) and can even
      # go negative for consecutive commas (e.g. `,,`).
      punc_count = str_count(
        str_replace_all(paragraph, ",", " "),
        "[[:punct:]]+"
      ),
      digit_count = str_count(paragraph, "[[:digit:]]")
    )
}


## paragraph statistics calculation function
## This function requires text data with clean, properly punctuated paragraphs; it counts words, sentences, and syllables, and also calculates the readability scores

## Compute per-paragraph text statistics and readability scores.
##
## Args:
##   text_data: a data frame (or an object coercible to one) that has a
##     `paragraph` column holding clean, properly punctuated text.
##
## Returns:
##   `text_data` as a data frame with every original column converted to
##   character, plus these columns:
##     paragraph_clean         - copy of `paragraph` used for processing
##     index                   - row number of the paragraph
##     word_count_stats        - word count from qdap::wc()
##     sentence_count          - number of rows produced by qdap::sentSplit()
##     readability_stats.sylls - `sylls` column of sylcount::readability()
##     readability_stats.polys - `polys` column of sylcount::readability()
##     fk_grade_level          - fk_grade() of the counts above
##     FRES_score              - FRES() of the counts above
##     comma_count, punc_count, digit_count - from sentence_feature_count()
##
## Raises:
##   An error if `text_data` has no `paragraph` column.
para_stats_function <- function(text_data) {
  input_data <- as.data.frame(text_data)
  if (!"paragraph" %in% names(input_data)) {
    stop("`text_data` must contain a `paragraph` column.", call. = FALSE)
  }

  # Every input column is coerced to character (this turns factors into text).
  input_data[] <- lapply(input_data, as.character)
  # Work on a copy so the original `paragraph` column stays untouched.
  input_data$paragraph_clean <- input_data$paragraph
  # seq_len() (unlike 1:nrow()) is safe for zero-row input.
  input_data$index <- seq_len(nrow(input_data))

  # Statistics for a single paragraph, returned as a one-row data frame.
  stats_for_row <- function(i) {
    paragraph_text <- input_data$paragraph_clean[i]

    n_words <- qdap::wc(paragraph_text)

    # One row per sentence after splitting; the row count is the sentence
    # count. No rows means "unknown" (NA) rather than a division by zero.
    n_sentences <- nrow(qdap::sentSplit(input_data[i, ], "paragraph_clean"))
    if (!isTRUE(n_sentences > 0L)) {
      n_sentences <- NA_integer_
    }

    # Only the syllable columns are taken from sylcount; its own readability
    # scores are not used because it counts sentences differently.
    # Namespaced so a later-attached package cannot mask `readability`.
    syl_stats <- sylcount::readability(
      paragraph_text,
      nthreads = sylcount::sylcount.nthreads()
    )
    n_syllables <- syl_stats[["sylls"]]

    data.frame(
      index = input_data$index[i],
      word_count_stats = n_words,
      sentence_count = n_sentences,
      readability_stats.sylls = n_syllables,
      readability_stats.polys = syl_stats[["polys"]],
      fk_grade_level = fk_grade(n_words, n_sentences, n_syllables),
      FRES_score = FRES(n_words, n_sentences, n_syllables)
    )
  }

  row_ids <- seq_len(nrow(input_data))
  if (length(row_ids) == 0L) {
    # Empty input: keep the output schema so the join below still works.
    output_data <- data.frame(
      index = integer(0),
      word_count_stats = integer(0),
      sentence_count = integer(0),
      readability_stats.sylls = integer(0),
      readability_stats.polys = integer(0),
      fk_grade_level = numeric(0),
      FRES_score = numeric(0)
    )
  } else {
    # Build all rows first and bind once instead of rbind()-ing in a loop.
    output_data <- bind_rows(lapply(row_ids, stats_for_row))
  }

  output_final <- left_join(input_data, output_data, by = "index")

  # Return visibly (the last expression is a value, not an assignment).
  sentence_feature_count(output_final)
}

