#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Dec  4 22:14:16 2019

@author: menglingyu
"""

#****************************************************************************************#
#       First of all, we have to convert categorical data into a numerical form          #
# before we can pass it on to a machine learning algorithm. We doing so using            #
# the bag-of-words model. The idea behind it is quite simple and can be summarized       #
# as follows:                                                                            #
#  (1) We create a vocabulary of unique tokens - for example, words - from the entire    #
# set of documents.                                                                      #
#  (2) We construct a feature vector from each document that contains the counts of      #
# how often each word occurs in the particular document.                                 #
#****************************************************************************************#


#Transforming words into feature vector
#****************************************************************************************#
#       To construct the bag-of-words model, we can use the CountVectorizer class        #
# implemented in scikit-learn, as follows:                                               #
#****************************************************************************************#

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Build the vocabulary and bag-of-words count matrix for two sample documents.
# CountVectorizer tokenizes each document and counts raw term frequencies.
count = CountVectorizer()
docs = np.array([
    'So Happy The Red Cross Offers Shelters For Us.',
    'FEMA cannot help us.'
])

# fit_transform learns the vocabulary and returns a sparse document-term matrix.
bag = count.fit_transform(docs)

# vocabulary_ maps each token to its column index in the count matrix.
print(count.vocabulary_)
print(bag.toarray())

#Assessing word relevancy via term frequency-inverse document frequency
#**********************************************************************************************#
#       This technique can be used to downweight frequently occurring words that don't         #
# contain useful or discriminatory information in the feature vectors. scikit-learn implements #
# it with the TfidfTransformer                                                                 #
#**********************************************************************************************#

from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw counts by tf-idf so that terms appearing in every document
# are down-weighted; `count` and `docs` come from the bag-of-words demo above.
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)  # compact float display for the printout

print(tfidf.fit_transform(count.fit_transform(docs)).toarray())


# =============================================================================
# #Cleaning text data
# =============================================================================
#**********************************************************************************************#
#       The first important step - before we build our bag-of-words model - is to clean the    #
# text data by stripping it of all unwanted characters. To illustrate, let's display the last  #
# 50 characters from a random document of the dataset:                                         #
#       We will remove all HTML markup as well as punctuation and other non-letter characters  #
# and keep only emoticon characters since those are certainly useful for sentiment analysis.   #
# To accomplish that, we will use Python's regular expressions library                         #
#**********************************************************************************************#
import pandas as pd

df = pd.read_csv('Hurricane_Summary.csv')

import re

def preprocessor(text):
    """Clean one raw document for bag-of-words modelling.

    Strips HTML tags, lower-cases the text, collapses every run of
    non-word characters into a single space, and re-appends any emoticons
    (with their '-' "nose" removed) so sentiment cues survive the cleanup.
    """
    # Fix: use raw strings for the regex patterns -- the original plain
    # strings contained invalid escape sequences ('\)', '\W'), which raise
    # Deprecation/SyntaxWarnings on modern Python.
    text = re.sub(r'<[^>]*>', '', text)  # drop HTML markup such as </a>
    # Capture emoticons like :), :-(, =D before punctuation is stripped.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Lower-case and replace each run of non-word characters with a space.
    text = re.sub(r'[\W]+', ' ', text.lower())
    # Re-attach the emoticons, normalising e.g. ':-)' to ':)'.
    text = text + " ".join(emoticons).replace('-', '')
    return text

# Quick smoke test of the cleaning function on a synthetic string.
preprocessor("</a>This :) is :( a test :-)!")



# Clean every document in the 'text' column in place.
df['text'] = df['text'].apply(preprocessor)

print(df.tail())


#Processing documents into tokens
#**********************************************************************************************#
#       Now that we successfully prepared the dataset, we need to think how to split the text  #
# into individual elements. We can do so:                                                      #
#       Another strategy is word stemming, which is the process of transforming a word into    #
# its root form, allowing us to map related words to the same stem. We will use the Porter     #
# stemming algorithm implemented by the nltk package.                                          #
#       Before we jump into the training of a machine learning model using the bag-of-words,   #
# let's remove those extremely common words (called stop-words) that don't add useful          #
# information to the text.                                                                     #
#**********************************************************************************************#

def tokenizer(text):
    """Naive tokenizer: split *text* on runs of whitespace."""
    tokens = text.split()
    return tokens


from nltk.stem.porter import PorterStemmer
# Single shared stemmer instance, reused by tokenizer_porter below.
porter = PorterStemmer()
def tokenizer_porter(text):
    """Whitespace-tokenize *text* and reduce each token to its Porter stem."""
    return list(map(porter.stem, text.split()))


import nltk
# Downloads the stop-word corpus on first run (no-op if already cached).
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop-word list, intended for the vectorizer's stop_words option.
stop = stopwords.words('english')

#Training a logistic regression model for document classification
#**********************************************************************************************#
#       In this section, we will train a logistic regression model to classify the             #
# documents into positive and negative classes. First, let's divide the DataFrame of cleaned   #
# text documents, using a 70:30 ratio for training and testing.                                #
#**********************************************************************************************#
# Feature column: index 8 -- presumably the cleaned 'text' column, TODO
# confirm against the CSV layout; the label is taken from the last column.
X = df.iloc[:, 8].values
y = df.iloc[:, -1].values


print("Class labels:", np.unique(y))

from sklearn.model_selection import train_test_split 

# Stratified 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.3, random_state = 0, stratify = y)


# Hypertune and Cross Validation
#**********************************************************************************************#
#       Next we will use GridSearchCV to hypertune our logistic regression (despite its        #
# name, it is a classification model) using cross-validation:                                  #
#**********************************************************************************************#
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer turning raw documents into tf-idf features.  lowercase=False
# because preprocessor() already lower-cased the text.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

# NOTE(review): X is rebound to the dense feature matrix here, but the
# prediction section below recomputes the same thing from df again.
X= tfidf.fit_transform(X).toarray()

# NOTE(review): disabled grid-search experiment, kept for reference as a
# string literal.  If re-enabled, penalty='l1' requires a solver such as
# 'liblinear' on modern scikit-learn (the default lbfgs only supports 'l2').
'''
param_grid = [{'vect__ngram_range': [(1,1)],
              'vect__stop_words': [stop, None],
              'vect__tokenizer': [tokenizer, tokenizer_porter],
              'clf__penalty': ['l1', 'l2'],
              'clf__C': [1.0, 10.0, 100.0]},
             {'vect__ngram_range': [(1,1)],
              'vect__stop_words': [stop, None],
              'vect__tokenizer': [tokenizer, tokenizer_porter],
              'vect__use_idf': [False],
              'vect__norm': [None],
              'clf__penalty': ['l1', 'l2'],
              'clf__C': [1.0, 10.0, 100.0]}]


lr_tfidf = Pipeline([('vect', tfidf), ('clf', LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=10, verbose=1, n_jobs=-1)

gs_lr_tfidf.fit(X_train, y_train)

print('Best parameter set: %s' % gs_lr_tfidf.best_params_)
'''


# Prediction
#**********************************************************************************************#
#       Using best parameter to predict                                                        #         
#**********************************************************************************************#
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Feature extraction: column 8 holds the (cleaned) document text, the last
# column holds the class label -- TODO confirm against the CSV layout.
X = df.iloc[:, 8].values
y = df.iloc[:, -1].values
print("Class labels:", np.unique(y))

# Create dense tf-idf feature vectors; lowercase=False because the
# preprocessor already lower-cased the documents.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

X = tfidf.fit_transform(X).toarray()

# Stratified 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0, stratify=y)

# Standardize features.  BUG FIX: the scaler must be fitted on the training
# data only -- the original code additionally called sc.fit(X_test), which
# overwrote the training statistics and leaked test-set information into
# the scaling of both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)


# Logistic regression classifier with the tuned hyperparameters (C=10, L1).
# BUG FIX: penalty='l1' is not supported by the default 'lbfgs' solver on
# scikit-learn >= 0.22 and raises a ValueError, so the solver that supports
# L1 regularization is set explicitly.
lr = LogisticRegression(C=10, random_state=0, penalty='l1', solver='liblinear')

# Train the model on the standardized training data.
lr.fit(X_train_std, y_train)

# Predict on both splits so train/test accuracy can be compared.
y_test_pred = lr.predict(X_test_std)
y_train_pred = lr.predict(X_train_std)

# Report misclassification count and accuracies (accuracy_score and
# lr.score are two equivalent ways of computing the same number).
print('Misclassified samples: %d' % (y_test != y_test_pred).sum())

print('Test_Accuracy: %.2f' % accuracy_score(y_test, y_test_pred))
print('Test_Accuracy: %.2f' % lr.score(X_test_std, y_test))

print('Train_Accuracy: %.2f' % accuracy_score(y_train, y_train_pred))
print('Train_Accuracy: %.2f' % lr.score(X_train_std, y_train))

# Classification report and confusion matrix for the held-out test split.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
print (confusion_matrix(y_test, y_test_pred))
print (classification_report(y_test, y_test_pred))
print (accuracy_score (y_test,y_test_pred))


# NOTE(review): confusion_matrix was already imported and the predictions
# already computed above; this section recomputes both redundantly.
from sklearn.metrics import confusion_matrix
y_test_pred = lr.predict(X_test_std)
y_train_pred = lr.predict(X_train_std)


# Rows are true labels, columns are predicted labels.
confmat = confusion_matrix(y_true=y_test,y_pred=y_test_pred)
print(confmat)

import matplotlib.pyplot as plt
# Render the confusion matrix as a small heat map with the raw count
# annotated in the centre of each cell.
fig, ax = plt.subplots(figsize=(2.5, 2.5))
# Matplotlib's matshow draws the matrix as an image grid.
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i,
                s=confmat[i, j],
                va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label')
plt.show()
