For this part, we conduct statistical modeling.
Preparation: we first have to re-establish the variables from the previous part.
References and ideas for this modeling are collected on the reference page.
# First, we import all the necessary packages:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from string import punctuation
import re
nltk.download('punkt')     # needed by nltk.word_tokenize / nltk.sent_tokenize (added here; assumed not yet downloaded)
nltk.download('stopwords') # needed by nltk.corpus.stopwords (added here)
nltk.download('wordnet')
nltk.download('omw-1.4')
stemmer = nltk.PorterStemmer()
lemmatizer = nltk.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words("english")
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
def lower_case(text):
    text = text.lower()
    return text

train['text'] = train['text'].apply(lambda x: lower_case(x))
test['text'] = test['text'].apply(lambda x: lower_case(x))
def sent_tokenize(text):
    text = nltk.sent_tokenize(text)
    return text

train['text_senttoken'] = train['text'].apply(lambda x: sent_tokenize(x))
test['text_senttoken'] = test['text'].apply(lambda x: sent_tokenize(x))
def word_tokenize(text):
    text = nltk.word_tokenize(text)
    return text

train['text_wordtoken'] = train['text'].apply(lambda x: word_tokenize(x))
test['text_wordtoken'] = test['text'].apply(lambda x: word_tokenize(x))
def slt(text):
    # stemming
    text = [stemmer.stem(w) for w in text]
    # lemmatization
    text = [lemmatizer.lemmatize(w) for w in text]
    # stopword removal
    text = [w for w in text if w not in stopwords]
    # keep only alphanumeric tokens (drops punctuation)
    text = [w for w in text if w.isalnum()]
    return text

train['cleanText'] = train['text_wordtoken'].apply(lambda x: slt(x))
test['cleanText'] = test['text_wordtoken'].apply(lambda x: slt(x))
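To illustrate what the cleaning pipeline produces, here is a minimal sketch that runs the same steps on a made-up sentence (the example text is purely hypothetical and not taken from the data):
# Sketch: apply the cleaning steps above to one hypothetical sentence.
example = "Forest fires are spreading near the lake!"
tokens = word_tokenize(lower_case(example))   # lower-case, then split into word tokens
cleaned = slt(tokens)                         # stem, lemmatize, drop stopwords and punctuation
print(cleaned)                                # e.g. roughly ['forest', 'fire', 'spread', 'near', 'lake']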
from nltk.tokenize.treebank import TreebankWordDetokenizer
train_text = train["text"] # This train_text variable stores all the text in the object format.
test_text = test["text"] # This test_text variable stores all the text in the object format.
train_list = train["cleanText"].tolist()
list = [] # We create an empty list.
for ele in train_list:
reconstruct = TreebankWordDetokenizer().detokenize(ele)
list.append(reconstruct)
train["cleaned"] = list
test_list = test["cleanText"].tolist()
list_2 = []
for element in test_list:
reconstruct = TreebankWordDetokenizer().detokenize(element)
list_2.append(reconstruct)
test["cleaned"] = list_2 # Then, we send it back to the dataframe.
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()
training_vectors = count_vectorizer.fit_transform(train['cleaned'])
testing_vectors = count_vectorizer.transform(test["cleaned"])
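As a quick sanity check (an added sketch; the exact numbers depend on the data), we can inspect the shape of the document-term matrix and the size of the learned vocabulary:
# Sketch: inspect the bag-of-words representation (exact sizes depend on the data).
print(training_vectors.shape)            # (number of training documents, vocabulary size)
print(len(count_vectorizer.vocabulary_)) # number of distinct tokens learned from train['cleaned']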
from sklearn.linear_model import LogisticRegression # We import the required packages:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score
from sklearn import model_selection
import warnings
warnings.filterwarnings('ignore')
LR = LogisticRegression(C = 1.0) # We define our logistic regression model
Approximate_outcome = model_selection.cross_val_score(LR, training_vectors, train["target"], cv = 20, scoring = 'f1')
Approximate_outcome
array([0.74740484, 0.51538462, 0.54915254, 0.44984802, 0.3539823 , 0.51785714, 0.4939759 , 0.34969325, 0.46687697, 0.51552795, 0.54277286, 0.54489164, 0.39716312, 0.65875371, 0.64102564, 0.32067511, 0.69387755, 0.61688312, 0.62170088, 0.73888889])
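To summarize the 20 folds in a single number, we can (as a small added sketch) take the mean of the cross-validated F1 scores:
# Sketch: average F1 over the 20 folds for the count-vector model.
print(Approximate_outcome.mean())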
LR.fit(training_vectors, train["target"])
from sklearn.feature_extraction.text import TfidfVectorizer # We first import the required package:
LR_tf_idf = LogisticRegression(C = 1.0) # We define our logistic regression model for the TF-IDF features
tf_idf = TfidfVectorizer()
train_tf_idf = tf_idf.fit_transform(train["text"]) # We fit the vectorizer on the training text and transform it
test_tf_idf = tf_idf.transform(test["text"]) # We only transform the test text, using the vocabulary and IDF weights learned on train
Approximate_outcome_1 = model_selection.cross_val_score(LR_tf_idf, train_tf_idf, train["target"], cv = 20,
                                                        scoring = "f1")
Approximate_outcome_1
array([0.72413793, 0.54814815, 0.6124031 , 0.54482759, 0.45774648, 0.60899654, 0.59333333, 0.48 , 0.55555556, 0.59183673, 0.55218855, 0.59722222, 0.49056604, 0.68027211, 0.66666667, 0.35964912, 0.70198675, 0.60649819, 0.62046205, 0.80996885])
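As an added sketch, comparing the mean F1 of the two feature representations makes the choice between them explicit:
# Sketch: compare the average cross-validated F1 of counts vs. TF-IDF.
print("CountVectorizer mean F1:", Approximate_outcome.mean())
print("TF-IDF mean F1:", Approximate_outcome_1.mean())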
Final_Modeling = pd.read_csv("sample_submission.csv") # This is our sample csv file.
LR_tf_idf.fit(train_tf_idf, train["target"]) # We fit the TF-IDF model on the full training set
Final_Modeling["target"] = LR_tf_idf.predict(test_tf_idf) # We use the TF-IDF features of the test set to make the prediction
Final_Modeling.head()
|   | id | target |
|---|----|--------|
| 0 | 0  | 0 |
| 1 | 2  | 0 |
| 2 | 3  | 0 |
| 3 | 9  | 0 |
| 4 | 11 | 0 |
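Finally, to produce a file that can be submitted (a small added step; the output filename "submission.csv" is an assumption), we write the dataframe back to CSV:
# Sketch: write the predictions to a CSV file (the filename "submission.csv" is an assumption).
Final_Modeling.to_csv("submission.csv", index=False)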