# Title: Run local projection for all regions; take peak effect; see what it correlates to
# Creator: Calvin He
# Date Created: February 2020

library(lars)
library(outliers)
library(lfe)
library(rGertrude) # RBA package to access internal database
library(data.table)
library(zoo)
library(dummies)
library(rlang)
library(ggplot2)
library(broom)
library(lubridate)
library(tidyverse)

# Step 0: Import functions and data ---------------------------------------
# Import Functions
user.defined.functions<- c(list.files(path = "./R", pattern = ".R",  full.names= T) )
if (length(user.defined.functions)>0){invisible(lapply(user.defined.functions, source))}

# Import Data
corelogic_data_raw <- readRDS("./data/corelogic_data_raw.RDS")
monetary_policy_shocks_raw <- readRDS("./data/monetary_policy_shocks.RDS")
grouping_variables <- readRDS("./data/grouping_variables.RDS")

set.seed(132164)

# Step 0: Set options -----------------------------------------------
opts <- set_options(figure = 3 ) # options for this are the same as that of figure 3

# 1. Manipulate data ------------------------------------------------------------
# filter, group, join shock
corelogic_sa3_data <- build_combined_data(corelogic_data_raw, monetary_policy_shocks_raw, grouping_variables,  opts)

# 2. Create Regression data ----------------------------------
# Join to core_logic data to chosen monetary policy shock; add in decile interactions
reg_data_pre <- build_regression_data(corelogic_sa3_data , opts)

# 3. Run Local Projections for each SA3 -------------------------------------------
output <- reg_data_pre  %>%
  group_by(SA3_CODE_2016) %>%
  nest() %>%
  mutate(output = map(data, individual_reg, m = opts$horizon  ))

# 4. Extract Peak effect -----------------------------------------------------
# Join with grouping variables
peak_effect <- output %>% unnest(output) %>%
  filter(term == "mp_shock", horizon == 8 ) %>%
  select(SA3_CODE_2016, estimate) %>% 
  left_join(., grouping_variables, by = "SA3_CODE_2016") %>%
  ungroup() %>%
  {bind_cols(., sjmisc::to_dummy(.$STATE_NAME_2016) )}

# 5. Remove redundant groups ------------------------------------------------------
redundant_groups <- c("SA3_NAME_2016", "SA4_CODE_2016", "region_name", 
                      "SA4_NAME_2016", "GCCSA_CODE_2016", "GCCSA_NAME_2016", 
                      "STATE_CODE_2016", "STATE_NAME_2016", "AREA_ALBERS_SQKM", "area")
peak_effect <- peak_effect %>% select(-redundant_groups) %>% 
  model_select_remove_vars()


# 6. Add additional variables ------------------------------------------------

#  rental growth
peak_effect <- peak_effect %>% 
  left_join(., corelogic_rental_growth(corelogic_data_raw , 2005, 2018, deflate = T))

# price level 
peak_effect <- peak_effect %>% left_join(., corelogic_sa3_data %>% select(SA3_CODE_2016, average_sales_price) %>% 
                                           distinct(SA3_CODE_2016, .keep_all =T))

#  price growth
peak_effect <- peak_effect %>% left_join(., corelogic_price_growth(corelogic_data_raw , 1994, 2018, deflate = T))


# 7. Transform (log) some grouping variables -----------------------------------------------------------
# take logs of some variables - mainly income and welath variables
transformed_data <- peak_effect %>%
  mutate_at( c(  "nworth", "liqnworth", "illiqnworth", 
                 "average_sales_price", "hh_net_worth",
                 "nworth", "median_total_income_excl_government_pensions", 
                 "median_investment_income", 
                 "mean_investment_income",
                 "median_employee_income", "individual_income_1991", "hh_income_1991") , 
             log) %>% 
  mutate_at(vars(matches("dollars")), log) 



# 8. Replace with NAs, then remove ------------------------------------------
transformed_trim <- transformed_data 
transformed_trim[is.infinite(data.matrix(transformed_trim))] <- NA # 
transformed_trim[is.nan(data.matrix(transformed_trim) )] <- NA
transformed_trim <- transformed_trim %>% na.omit

# 9. Truncate outliers ----------------------------------------------------------
# truncate extreme outliers to boundaries
transformed_trim[, !names(transformed_trim) %in% c("estimate","SA3_CODE_2016")] <- 
  purrr:::map_dfc(transformed_trim[, !names(transformed_trim) %in% c("estimate","SA3_CODE_2016")], function(x){
    
    # find outliers
    outliers_above <- scores(x) > 5
    outliers_below <- scores(x) < -5
    
    # find truncated values
    cap_above <- quantile(x, probs= 0.95, na.rm = T)
    cap_below <- quantile(x, probs= 0.05, na.rm = T)
    
    # replace outliers
    x[outliers_above] <- cap_above
    x[outliers_below] <- cap_below
    
    return(x)
  })


# 10. Model Selection -------------------------------------------------------
# Make data (unstandardised)
cs_data <- transformed_trim %>% na.omit %>% as_tibble %>% select(-SA3_CODE_2016) # remove any row wiht NA (i.e. keep only complete data)
vars_potential <- cs_data  %>% select( -estimate) %>% data.matrix() # potential variabls to be selected
y <- cs_data   %>% select(estimate) %>% data.matrix()  # estimate for cs_variables to explain

coefs <- list() # empty list to store coefficients

# elastic net Regression 
elastic_net_reg <- glmnet::cv.glmnet(vars_potential, y, standardize = T, alpha = 0.2)

coefs$elastic_net <- coef(elastic_net_reg, s = "lambda.1se") %>% data.matrix %>% 
  as_tibble(rownames = "id" ) %>% rename(elastic_net = `1`)

# Least Angle Regression 
least_angle_reg <- lars::lars(vars_potential, y,  type ="lar", normalize = T )
least_angle_reg_cv <- lars::cv.lars(vars_potential, y, index = 1:30 ,type = "lar", mode = "step", normalize = T)

highest_mse <- min(least_angle_reg_cv$cv) + least_angle_reg_cv$cv.error[which(least_angle_reg_cv$cv == min(least_angle_reg_cv$cv ))]
s <- min(which(least_angle_reg_cv$cv <= highest_mse))

temp_coeficients <- predict(least_angle_reg, s = s ,type = "coefficients")$coefficients 

coefs$least_angle_reg <- tibble(id =  names(temp_coeficients), least_angle= unname(temp_coeficients))


# Combine  estimates from models
coefs <- plyr::join_all(coefs, by = "id", type = "left") 
coefs[coefs == 0] <- NA

# take coefficients chosen by at least one model
coefs_partial <- coefs[rowSums(is.na(coefs))< 2,] %>% 
  mutate(sensitivity_elastic_net= case_when(elastic_net >0 ~  "Less",elastic_net <=0  ~ "More", TRUE ~ "NA"),
         sensitivity_least_angle= case_when(least_angle >0 ~  "Less", least_angle <=0  ~ "More", TRUE ~ "NA"))  %>% 
  arrange(sensitivity_least_angle) %>% 
  select(variable = id, contains("sensitivity"))

coefs_partial 




# 12. Liner Regression---------------------------------------------------------------
responses_sa3 <- output %>% unnest(output) %>% 
  filter(term == "mp_shock") %>% 
  select(-data,  SA3_CODE_2016, horizon, estimate)

# join estimates with grouping variables to explain estimates
final_data <- left_join(responses_sa3 , transformed_trim %>% select(-estimate), by = "SA3_CODE_2016")

vars <- c("average_sales_price",
          "prop_land",
          "rent_servicing_ratio",
          "mortgage_servicing_ratio",
          "mortgage_concentration",
          "outright_concentration",
          "investor_density_working_age_pop",
          "median_employee_income",
          "price_growth",
          paste0("STATE_NAME_2016_", c(6,7)))

reg_output <- final_data %>% group_by(horizon) %>% 
  nest() %>% 
  mutate(reg = map(data, ~ estimatr::lm_robust( as.formula(glue::glue("estimate ~ {paste0(vars, collapse = ' + ')}")), data = . , se_type = "stata"))) %>% 
  filter(horizon %in% c(4, 8, 12)) 


# import actual regression data
reg_output <- readRDS("data/regression-output/table-2.RDS")
reg_output
