#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@description: Categorize data with sentiment measures by topic and location
"""

# Before this, run dna_clean and dna_sentiment


import pandas as pd
from moda.dataprep import raw_to_ts, ts_to_range
import numpy as np
import pickle
import datetime as dt

# Load the dataframe previously pickled to the file 'df_sent' in the
# current working directory.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a 'df_sent' that was produced by a trusted run of the pipeline.
with open('df_sent', 'rb') as f:
    raw = pickle.load(f)

#### GET STATE AND CITY

#### Split data by region

# Region codes recognised as states and as cities when classifying the
# origins of an article.
states = [
    'tasman',
    'waustr',
    'victor',
    'queensl',
    'saustr',
    'nswals',
    'auscap',
    'nterry',
]
cities = [
    'canbrr',
    'sydney',
    'brisbn',
    'adelai',
    'melb',
    'perth',
]

def split_state(origin, states):
    """Return the entries of ``origin`` that are listed in ``states``.

    A single match is returned as a one-element list, no match as an empty
    list, and more than one match collapses to ``['multiple']``.
    """
    matched = []
    for code in origin:
        if code in states:
            matched.append(code)
    if len(matched) <= 1:
        return matched
    return ['multiple']

def split_city(origin, states):
    """Return the entries of ``origin`` that appear in the given city codes.

    Args:
        origin: iterable of region codes for one article.
        states: collection of city codes to match against. The parameter
            name is kept for backward compatibility with existing callers;
            it holds city codes, not state codes.

    Returns:
        The matching codes in their original order, ``['multiple']`` when
        more than one entry matches, or ``[]`` when none does.
    """
    # Match against the argument rather than the module-level ``cities``
    # list, so the function honours whatever the caller passes in and does
    # not depend on a global being defined.
    city = [word for word in origin if word in states]
    if len(city) > 1:
        city = ['multiple']
    return city

def list_split(origin):
    """Split a comma-separated string into a list of its pieces."""
    return list(origin.split(','))


# Parse the comma-separated region codes once, then derive one state label
# and one city label per row; both labels end up as the string form of a list.
raw['origins'] = raw['region_codes'].apply(list_split)
raw['state'] = raw['origins'].apply(split_state, args=(states,)).astype(str)
raw['city'] = raw['origins'].apply(split_city, args=(cities,)).astype(str)

#### Alternative encoding for rows with multiple cities or states: one boolean dummy column per city and per state

def split_city_d(origin, city):
    """Report whether ``city`` occurs in the string form of ``origin``.

    The check is a substring search over ``str(origin)``, so it is not
    restricted to whole-element matches.
    """
    as_text = str(origin)
    return city in as_text

# Add one boolean column per city code, named after the code itself.
for y in cities:
    raw[y] = raw['origins'].apply(split_city_d, args=(y,))


def split_state_d(origin, state):
    """Report whether ``state`` occurs in the string form of ``origin``.

    The check is a substring search over ``str(origin)``, so it is not
    restricted to whole-element matches.
    """
    as_text = str(origin)
    return state in as_text

# Add one boolean column per state code, named after the code itself.
for y in states:
    raw[y] = raw['origins'].apply(split_state_d, args=(y,))
    
    
# Derive calendar groupings from the publication date. The `.dt` accessor
# requires raw['pubdate'] to be a datetime-like column.

raw['month_year'] = raw['pubdate'].dt.to_period('M')  # monthly period
raw['quarter_year'] = raw['pubdate'].dt.to_period('Q')  # quarterly period
raw['year'] = raw['pubdate'].dt.to_period('Y')  # yearly period
raw['month'] = raw['pubdate'].dt.month  # month number, without the year
raw['date'] = raw['pubdate'].dt.to_period('D')  # daily period
raw['week'] = raw['pubdate'].dt.to_period('W')  # weekly period

    
#### Documents with uncertainty words 

# Terms whose presence marks a document as mentioning uncertainty.
uncwords = [
    'risk',
    'risky',
    'uncertain',
    'uncertainty',
    'volatile',
    'volatility',
]
def unccount(cleantext, uncwords):
    """Count the tokens of ``cleantext`` that are uncertainty words.

    Args:
        cleantext: iterable of tokens for one document.
        uncwords: collection of uncertainty terms to look for.

    Returns:
        The number of tokens found in ``uncwords``; a token that occurs
        several times is counted each time.
    """
    # Count directly instead of building a throwaway list under a local name
    # that shadows this function.
    return sum(1 for word in cleantext if word in uncwords)

def unc_doc(unccount):
    """Return 1 when the given count is at least one, otherwise 0."""
    return 1 if unccount >= 1 else 0

# Per-row count of uncertainty words in the 'lem' tokens, plus a 0/1 flag
# for rows that contain at least one.
raw['countunc_lite'] = raw['lem'].apply(unccount, args=(uncwords,))
raw['unc_doc'] = raw['countunc_lite'].apply(unc_doc)

#### GET MONETARY POLICY ARTICLES: E12
    
def mp_news(string):
    """Report whether ``'e12'`` is contained in ``string``."""
    has_code = 'e12' in string
    return has_code

# Flag rows whose subject codes contain 'e12', split the frame on that flag,
# and pickle the flagged subset to the file 'raw_mpolicy'.
raw['mpolicy'] = raw['subject_codes'].apply(mp_news)

raw_mpolicy = raw[raw['mpolicy'].eq(True)]
raw_nonmpolicy = raw[raw['mpolicy'].eq(False)]

with open('raw_mpolicy', 'wb') as f:
    pickle.dump(raw_mpolicy, f)
    

#### GET HOUSING ARTICLES: EREAL
    
def real_news(string):
    """Report whether ``'ereal'`` is contained in ``string``."""
    has_code = 'ereal' in string
    return has_code

# Flag rows whose subject codes contain 'ereal' and pickle the flagged subset
# to the file 'raw_real'.
raw['realestate'] = raw['subject_codes'].apply(real_news)

raw_real = raw[raw['realestate'].eq(True)]
with open('raw_real', 'wb') as f:
    pickle.dump(raw_real, f)


#### GET EDITORIAL ARTICLES: NEDI
    
def edit(string):
    """Report whether ``'nedi'`` is contained in ``string``."""
    has_code = 'nedi' in string
    return has_code

# Flag rows whose subject codes contain 'nedi' and keep the flagged subset.
raw['editorial'] = raw['subject_codes'].apply(edit)

raw_edit = raw[raw['editorial'].eq(True)]

def opinion(string):
    """Report whether ``'nedc'`` is contained in ``string``."""
    has_code = 'nedc' in string
    return has_code

# Flag rows whose subject codes contain 'nedc' and keep the flagged subset.
raw['opinions'] = raw['subject_codes'].apply(opinion)

raw_opinion = raw[raw['opinions'].eq(True)]

# Regular rows are those flagged as neither opinion nor editorial.
raw_regular = raw[raw['opinions'].eq(False) & raw['editorial'].eq(False)]


# Persist the dataframe, with every column added above, to the file 'df_cat'.
# NOTE(review): pickle is already imported at the top of the file, so this
# re-import is redundant (and harmless).
import pickle
with open('df_cat', 'wb') as f:
    pickle.dump(raw, f)
