For this part, we conduct statistical modeling.
Preparation: we first have to re-establish the variables from the previous part.
References and ideas for this modeling are collected on the reference page.
# First, we import all the necessary packages:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from string import punctuation
import re
nltk.download('punkt')     # needed by nltk.word_tokenize / nltk.sent_tokenize (added here; assumed not yet downloaded)
nltk.download('stopwords') # needed by nltk.corpus.stopwords (added here)
nltk.download('wordnet')
nltk.download('omw-1.4')
stemmer = nltk.PorterStemmer()
lemmatizer = nltk.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words("english")
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
def lower_case(text):
    text = text.lower()
    return text

train['text'] = train['text'].apply(lambda x: lower_case(x))
test['text'] = test['text'].apply(lambda x: lower_case(x))
def sent_tokenize(text):
    text = nltk.sent_tokenize(text)
    return text

train['text_senttoken'] = train['text'].apply(lambda x: sent_tokenize(x))
test['text_senttoken'] = test['text'].apply(lambda x: sent_tokenize(x))
def word_tokenize(text):
    text = nltk.word_tokenize(text)
    return text

train['text_wordtoken'] = train['text'].apply(lambda x: word_tokenize(x))
test['text_wordtoken'] = test['text'].apply(lambda x: word_tokenize(x))
def slt(text):
    # stemming
    text = [stemmer.stem(w) for w in text]
    # lemmatization
    text = [lemmatizer.lemmatize(w) for w in text]
    # stopword removal
    text = [w for w in text if w not in stopwords]
    # keep only alphanumeric tokens (drops punctuation)
    text = [w for w in text if w.isalnum()]
    return text

train['cleanText'] = train['text_wordtoken'].apply(lambda x: slt(x))
test['cleanText'] = test['text_wordtoken'].apply(lambda x: slt(x))
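To illustrate what the cleaning pipeline produces, here is a minimal sketch that runs the same steps on a made-up sentence (the example text is purely hypothetical and not taken from the data):
# Sketch: apply the cleaning steps above to one hypothetical sentence.
example = "Forest fires are spreading near the lake!"
tokens = word_tokenize(lower_case(example))   # lower-case, then split into word tokens
cleaned = slt(tokens)                         # stem, lemmatize, drop stopwords and punctuation
print(cleaned)                                # e.g. roughly ['forest', 'fire', 'spread', 'near', 'lake']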
from nltk.tokenize.treebank import TreebankWordDetokenizer
train_text = train["text"] # This train_text variable stores all the text in the object format.
test_text = test["text"] # This test_text variable stores all the text in the object format.
train_list = train["cleanText"].tolist()
list = [] # We create an empty list.
for ele in train_list:
reconstruct = TreebankWordDetokenizer().detokenize(ele)
list.append(reconstruct)
train["cleaned"] = list
test_list = test["cleanText"].tolist()
list_2 = []
for element in test_list:
reconstruct = TreebankWordDetokenizer().detokenize(element)
list_2.append(reconstruct)
test["cleaned"] = list_2 # Then, we send it back to the dataframe.
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()
training_vectors = count_vectorizer.fit_transform(train['cleaned'])
testing_vectors = count_vectorizer.transform(test["cleaned"])
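As a quick sanity check (an added sketch; the exact numbers depend on the data), we can inspect the shape of the document-term matrix and the size of the learned vocabulary:
# Sketch: inspect the bag-of-words representation (exact sizes depend on the data).
print(training_vectors.shape)            # (number of training documents, vocabulary size)
print(len(count_vectorizer.vocabulary_)) # number of distinct tokens learned from train['cleaned']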
from sklearn.linear_model import LogisticRegression # We import the required packages:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score
from sklearn import model_selection
import warnings
warnings.filterwarnings('ignore')
LR = LogisticRegression(C = 1.0) # We define our logistic regression model
Approximate_outcome = model_selection.cross_val_score(LR, training_vectors, train["target"], cv = 20, scoring = 'f1')
Approximate_outcome
array([0.74740484, 0.51538462, 0.54915254, 0.44984802, 0.3539823 , 0.51785714, 0.4939759 , 0.34969325, 0.46687697, 0.51552795, 0.54277286, 0.54489164, 0.39716312, 0.65875371, 0.64102564, 0.32067511, 0.69387755, 0.61688312, 0.62170088, 0.73888889])
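To summarize the 20 folds in a single number, we can (as a small added sketch) take the mean of the cross-validated F1 scores:
# Sketch: average F1 over the 20 folds for the count-vector model.
print(Approximate_outcome.mean())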
LR.fit(training_vectors, train["target"])
from sklearn.feature_extraction.text import TfidfVectorizer # We first import the required package:
LR_tf_idf = LogisticRegression(C = 1.0) # We define our logistic regression model for the TF-IDF features
tf_idf = TfidfVectorizer()
train_tf_idf = tf_idf.fit_transform(train["text"]) # We fit the vectorizer on the training text and transform it
test_tf_idf = tf_idf.transform(test["text"]) # We only transform the test text, using the vocabulary and IDF weights learned on train
Approximate_outcome_1 = model_selection.cross_val_score(LR_tf_idf, train_tf_idf, train["target"], cv = 20,
                                                        scoring = "f1")
Approximate_outcome_1
array([0.72413793, 0.54814815, 0.6124031 , 0.54482759, 0.45774648, 0.60899654, 0.59333333, 0.48 , 0.55555556, 0.59183673, 0.55218855, 0.59722222, 0.49056604, 0.68027211, 0.66666667, 0.35964912, 0.70198675, 0.60649819, 0.62046205, 0.80996885])
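As an added sketch, comparing the mean F1 of the two feature representations makes the choice between them explicit:
# Sketch: compare the average cross-validated F1 of counts vs. TF-IDF.
print("CountVectorizer mean F1:", Approximate_outcome.mean())
print("TF-IDF mean F1:", Approximate_outcome_1.mean())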
Final_Modeling = pd.read_csv("sample_submission.csv") # This is our sample csv file.
LR_tf_idf.fit(train_tf_idf, train["target"]) # We fit the TF-IDF model on the full training set
Final_Modeling["target"] = LR_tf_idf.predict(test_tf_idf) # We use the TF-IDF features of the test set to make the prediction
Final_Modeling.head()
|   | id | target |
|---|----|--------|
| 0 | 0  | 0 |
| 1 | 2  | 0 |
| 2 | 3  | 0 |
| 3 | 9  | 0 |
| 4 | 11 | 0 |
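Finally, to produce a file that can be submitted (a small added step; the output filename "submission.csv" is an assumption), we write the dataframe back to CSV:
# Sketch: write the predictions to a CSV file (the filename "submission.csv" is an assumption).
Final_Modeling.to_csv("submission.csv", index=False)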