
clear
set more off

* Step 1: build the county property-tax lookup file.
* The lookup is created before the survey data are loaded, so no
* preserve/restore round trip is needed.
// hand-collected from https://www.attomdata.com/news/heat-maps/2016-property-tax-analysis/
import delimited data/raw/CountyPropTaxRates.csv, encoding(ISO-8859-1) clear
keep fips_code property_tax_rate missing
rename missing prop_tax_missing
save data/intermediate/pt.tmp, replace

* Step 2: load the raw survey data and attach the property tax rates.
use data/raw/FZdata_raw.dta, clear

* fips_code is converted to numeric so it can be matched against the lookup key.
destring fips_code, replace
* keep(1 3): keep every survey row, whether matched or not; drop lookup-only rows.
merge m:1 fips_code using data/intermediate/pt.tmp, nogen keep(1 3)

* Turn the "na" marker into a 0/1 numeric flag.
replace prop_tax_missing = "1" if prop_tax_missing == "na"
destring prop_tax_missing, replace
* NOTE(review): rows that found no match in the lookup also end up with flag 0 here,
* although their property_tax_rate is missing -- confirm this is intended.
replace prop_tax_missing = 0 if prop_tax_missing == .

* Winsorized copy of the property tax rate (winsor is a user-written command).
winsor property_tax_rate, p(0.025) gen(Wproperty_tax_rate)


* Step 3: merge in output from TAXSIM (can be generated via get_taxrates.do)
merge 1:1 userid using data/raw/taxsim_out.dta, nogen

* Winsorized copy of the marginal rate variable.
winsor margrate_noSS, p(0.025) gen(Wmargrate_noSS)

********************************************************************************************************************************************
* Cleaning
********************************************************************************************************************************************

forvalues j = 0/1 {

	* Each pass starts from the merged data in memory, cleans it, and writes
	* data/intermediate/FZdata_drop0.dta (j = 0) or FZdata_drop1.dta (j = 1).
	* preserve/restore puts the merged data back after each pass, which also
	* discards the value labels defined below so they can be defined again.
	preserve

	* ------------------------------------------------------------------
	* Initial drops
	* ------------------------------------------------------------------
	// only keep if filled out all questions
	keep if hypo_q1 < . & hypo_q2 < . & hypo_q3 < . & hypo_q4 < . & hypo_d2 < . & hypo_d3 < . & hypo_d4 < .

	* Age flag. Missing age and the 999 code both compare greater than 70,
	* so they are flagged too (999 is only set to missing further down).
	generate agedrop1 = age < 21 | age > 70

	* Sum of the four question timers divided by 60; flag totals below 3 or above 30.
	generate qtime_hypo = (qtime_hypo_q1 + qtime_hypo_q2 + qtime_hypo_q3 + qtime_hypo_q4) / 60
	generate qtimedrop1 = qtime_hypo < 3 | qtime_hypo > 30

	* 97.5th (xx) and 2.5th (yy) percentiles of homevalue in the retained sample.
	egen xx = pctile(homevalue), p(97.5)
	egen yy = pctile(homevalue), p(2.5)
	summarize xx yy

	* if j = 1, drop based on home value, time spent & age; if j = 0, no drops
	if `j' == 1 {
		drop if homevalue < yy | homevalue > xx
		drop if agedrop1 | qtimedrop1
	}
	drop yy xx

	* ------------------------------------------------------------------
	* Calculator-use indicators
	* ------------------------------------------------------------------
	* used_calc: all seven *_calc fields are non-missing.
	generate used_calc = (hypo_q1_calc != .) & (hypo_q2_calc != .) & (hypo_q3_calc != .) & (hypo_q4_calc != .) & (hypo_d2_calc != .) & (hypo_d3_calc != .) & (hypo_d4_calc != .)
	* used_calc_once: q1 calc field, or both the q and d calc fields of any later scenario.
	generate used_calc_once = (hypo_q1_calc != .) | ((hypo_q2_calc != .) & (hypo_d2_calc != .)) | ((hypo_q3_calc != .) & (hypo_d3_calc != .)) | ((hypo_q4_calc != .) & (hypo_d4_calc != .))

	* Computed before hypo_q4 is floored at 500 below.
	generate hypo_d4_fraction = hypo_d4 / hypo_q4 * 100

	replace age = . if age == 999

	* ------------------------------------------------------------------
	* Interest rates: hypoversion 1 is 4.5 then 6.5, hypoversion 2 the reverse
	* ------------------------------------------------------------------
	generate hypoRate1 = 4.5 if hypoversion == 1
	replace hypoRate1 = 6.5 if hypoversion == 2
	generate hypoRate2 = 6.5 if hypoRate1 == 4.5
	replace hypoRate2 = 4.5 if hypoRate1 == 6.5
	generate startRateLow = 1 if hypoversion == 1
	replace startRateLow = 0 if hypoversion == 2
	generate endRateLow = 0 if startRateLow == 1
	replace endRateLow = 1 if startRateLow == 0

	* ------------------------------------------------------------------
	* Homeownership-related variables
	* ------------------------------------------------------------------
	generate owner = q4 == 1
	replace house_debt = 0 if !owner
	*replace credit_score=99 if credit_score==. // 99 is missing
	generate equity_value = house_value - house_debt
	* FIX: test the owner flag, as for house_debt above. The raw q4 value is
	* nonzero for any code other than 0, so !q4 would only isolate non-owners
	* if q4 were strictly 0/1.
	replace equity_value = 0 if !owner
	generate equity_pct = 100 * equity_value / house_value
	summarize equity_pct, detail
	* Outliers: more than 2 SD above the 99th or below the 1st percentile.
	* NOTE(review): a missing equity_pct compares greater than any number, so it is
	* flagged here and set to 0; equity_pct_missing created below is therefore
	* always 0 -- confirm this is intended.
	generate equity_pct_outlier = (equity_pct > `r(p99)' + 2 * `r(sd)' | equity_pct < `r(p1)' - 2 * `r(sd)')
	replace equity_pct = 0 if equity_pct_outlier

	generate Wpct_q3a_1yr_real = Wpct_q3a_1yr - meanri1

	* ------------------------------------------------------------------
	* Dependent variables
	* ------------------------------------------------------------------
	* Floor the four hypo_q answers at 500 before taking logs.
	foreach v in hypo_q1 hypo_q2 hypo_q3 hypo_q4 {
		replace `v' = 500 if `v' <= 500
	}

	generate ln_price_appraisal_ratio = ln(hypo_q1) - ln(homevalue)
	generate ln_hypo_q1 = ln(hypo_q1)
	generate ln_homevalue = ln(homevalue)
	generate hypo_q2_q1_logchg = ln(hypo_q2) - ln(hypo_q1)

	* Log change oriented as the 6.5 scenario minus the 4.5 scenario,
	* whichever order the respondent saw them in.
	generate hypo_logchange_rateup = ln(hypo_q3) - ln(hypo_q2) if hypoRate1 == 4.5
	replace hypo_logchange_rateup = ln(hypo_q2) - ln(hypo_q3) if hypoRate1 == 6.5

	* d fraction in the 4.5 scenario, rescaled by 1/100.
	generate hypo_dlow = hypo_d2_fraction if hypoRate1 == 4.5
	replace hypo_dlow = hypo_d3_fraction if hypoRate1 == 6.5
	replace hypo_dlow = hypo_dlow / 100

	* Change in the d fraction, oriented the same way as the log change above.
	generate hypo_dchange_rateup = hypo_d3_fraction - hypo_d2_fraction if hypoRate1 == 4.5
	replace hypo_dchange_rateup = hypo_d2_fraction - hypo_d3_fraction if hypoRate1 == 6.5

	generate hypo_change_inher = hypo_q4 - hypo_q3
	generate hypo_logchange_inher = ln(hypo_q4) - ln(hypo_q3)
	generate hypo_fraction_change_d4_d3 = hypo_d4_fraction - hypo_d3_fraction

	forvalues w = 1/4 {
		generate ln_wtp`w'_appr_ratio = ln(hypo_q`w') - ln(homevalue)
	}

	* ------------------------------------------------------------------
	* Missing-value indicators, taken before any sentinel is filled in
	* ------------------------------------------------------------------
	foreach v in liquid_savings non_house_debt Whh_inc_change inc_cont prob_move_3yr q6cnew equity_pct equity_value Wpct_q3a_1yr Wpct_q3a_1yr_real meanri1 Wpct_q3a_5yr total_numeracy risk_aversion {
		generate `v'_missing = `v' == .
	}

	* ------------------------------------------------------------------
	* Recode continuous variables as categorical variables
	* recode(x, a, b, ...) returns the first listed value that is >= x,
	* or the last listed value when x exceeds them all.
	* 9999999 is the sentinel for missing.
	* ------------------------------------------------------------------
	replace liquid_savings = liquid_savings * 1000
	replace liquid_savings = 0 if q16_6 == 1
	replace liquid_savings_missing = 0 if q16_6 == 1
	generate ls_cat = recode(liquid_savings, 3500, 25000, 75000, 375000, 1000000)
	replace ls_cat = 9999999 if liquid_savings_missing == 1
	replace liquid_savings = 9999999 if liquid_savings_missing == 1
	label define liquid_savings 3500 "Less than 5K" 25000 "[5K, 30K)" 75000 "[30K, 100K)" 375000 "[100K, 500K)" 1000000 "500K or more" 9999999 "Missing"
	label values ls_cat liquid_savings

	generate nh_cat = recode(non_house_debt, 750, 3500, 25000, 100000)
	replace nh_cat = 9999999 if non_house_debt_missing == 1
	replace non_house_debt = 9999999 if non_house_debt_missing == 1
	* FIX: the lowest bin takes the value 750 from recode() above, so its label
	* must be attached to 750; it was attached to 1000, which nh_cat never takes.
	label define non_house 750 "Less than 1000" 3500 "[1000, 5000)" 25000 "[5K, 30K)" 100000 "30K or more" 9999999 "Missing"
	label values nh_cat non_house

	generate i_cat = recode(inc_cont, 35000, 67500, 125000, 200000)
	replace i_cat = 9999999 if inc_cont_missing == 1
	replace inc_cont = 9999999 if inc_cont_missing == 1
	label define income 35000 "Less than 40K" 67500 "(40K, 75K]" 125000 "(75K, 150K]" 200000 "Greater than 150K" 9999999 "Missing"
	label values i_cat income

	generate ic_cat = recode(Whh_inc_change, 0, 2, 5, 10, 100)
	replace ic_cat = 9999999 if Whh_inc_change_missing == 1
	label define inc_change 0 "Less than 0" 2 "(0, 2]" 5 "(2, 5]" 10 "(5, 10]" 100 "Greater than 10" 9999999 "Missing"
	label values ic_cat inc_change

	generate equity_pct_cat = recode(equity_pct, 0, 25, 50, 100)
	replace equity_pct_cat = 9999999 if equity_pct_outlier
	generate eq_cat = recode(equity_value, 0, 50000, 125000, 200000, 900000)
	replace eq_cat = 9999999 if eq_cat == .

	* From here on equity_value is expressed in 1000s.
	replace equity_value = equity_value / 1000

	* Collapse q14 into four groups; q14 == 6 gets the missing sentinel.
	generate credit_score_v2 = 1 if q14 == 1 | q14 == 2
	replace credit_score_v2 = 2 if q14 == 3
	replace credit_score_v2 = 3 if q14 == 4
	replace credit_score_v2 = 4 if q14 == 5
	replace credit_score_v2 = 9999999 if q14 == 6

	egen regiongrp = group(region), label
	egen stategrp = group(state), label

	* ------------------------------------------------------------------
	* Home type
	* ------------------------------------------------------------------
	generate homet = 1 if q4b == 1
	// condo or townhouse
	replace homet = 2 if q4b > 1 & q4b < 4
	replace homet = 3 if homet == .
	tabulate homet q4b
	label define ht 1 "Single-family home" 2 "Apartment/Condo/Townhouse" 3 "Other/Unknown"
	label values homet ht

	* FIX: restrict to owners through the owner flag; a bare q4 condition is true
	* for every nonzero value, missing included.
	generate equity_value_owner = equity_value if owner

	* Dummies cs1, cs2, ... for credit_score_v2; the first four are blanked out
	* where the score carries the missing sentinel.
	quietly tabulate credit_score_v2, generate(cs)
	forvalues i = 1/4 {
		replace cs`i' = . if credit_score_v2 == 9999999
	}

	* express all these variables in 1000s
	* NOTE(review): liquid_savings, non_house_debt and inc_cont still hold the
	* 9999999 sentinel for missing rows at this point, so their _1000 versions
	* carry 9999.999 there -- confirm downstream code accounts for this.
	foreach x in liquid_savings non_house_debt inc_cont homevalue {
		generate `x'_1000 = `x' / 1000
	}

	quietly tabulate region, generate(reg)
	quietly tabulate homet, generate(hot)

	* ------------------------------------------------------------------
	* Final fill-in for rows flagged as missing
	* ------------------------------------------------------------------
	foreach v in Wpct_q3a_1yr Wpct_q3a_1yr_real meanri1 Wpct_q3a_5yr total_numeracy liquid_savings non_house_debt inc_cont prob_move_3yr q6cnew equity_pct equity_value {
		replace `v' = 0 if `v'_missing
	}
	replace Whh_inc_change = 100 if Whh_inc_change_missing
	replace risk_aversion = 9999999 if risk_aversion_missing

	save data/intermediate/FZdata_drop`j'.dta, replace

	restore
}

