Part V: Vectorization

What is Vectorization?

Vectorization converts tokens into a numeric vector so the text can be used in numerical analysis.

We've already completed the primary data processing, so the next step is to transform the text into a meaningful vector of numbers.

We did a very similar analysis during lecture, and now we can apply the same approach to our own data.

Recall the previous part, where we cleaned the text:

We can see above that the main part we want to analyze is the text column of our dataset, so we need to convert its tokens into vectors.

We briefly mentioned vectorization in lecture 7-1, and now we can try it ourselves. (Important: we used the cleaned data.)

Then we can use the .todense() method discussed during the lecture, since .todense() converts a sparse matrix into a dense matrix; a short sketch of this step is shown below.
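The following is a minimal sketch of this vectorization step using scikit-learn's CountVectorizer. The docs list is hypothetical stand-in data; in the actual project it would be replaced by the cleaned text column of our dataset.

```python
from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical stand-in for the cleaned "text" column of the dataset.
docs = [
    "data science is fun",
    "text data needs cleaning",
    "vectorization turns text into numbers",
]

# Build a bag-of-words (term-frequency) representation.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)        # sparse document-term matrix

# .todense() converts the sparse matrix to a dense matrix for inspection.
print(vectorizer.get_feature_names_out())
print(X.todense())
```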

We also discussed one-hot encoding during the lecture. By ignoring frequency altogether and recording only whether a word appears, we avoid emphasizing high-frequency words.

We can then apply the same one-hot encoding to the test dataset, as sketched below.
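A minimal sketch of this idea, assuming hypothetical train/test splits: passing binary=True to CountVectorizer records only the presence or absence of each word (the one-hot style encoding described above), and the vectorizer fitted on the training data is reused to transform the test data.

```python
from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical train/test splits of the cleaned text column.
train_docs = ["the cat sat on the mat", "the dog sat"]
test_docs = ["the cat and the dog"]

# binary=True records only presence/absence, ignoring word frequency.
onehot = CountVectorizer(binary=True)
X_train = onehot.fit_transform(train_docs)  # learn vocabulary from training data
X_test = onehot.transform(test_docs)        # reuse that vocabulary on the test data

print(X_train.todense())
print(X_test.todense())
```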

(Copied from the lecture notes for week 7-1.) Term frequency-inverse document frequency (tf-idf) statistics put terms on approximately the same scale while also emphasizing relatively rare terms. There are several different tf-idf statistics.

The smoothed tf-idf, for a term $t$ and document $d$, is given by:

$$ \operatorname{tf-idf}(t, d) = \operatorname{tf}(t, d) \cdot \log \left( \frac{N}{1 + n_t} \right) $$

where $N$ is the total number of documents and $n_t$ is the number of documents that contain $t$.

The sklearn.feature_extraction.text submodule of scikit-learn provides tools (such as TfidfVectorizer) for computing tf-idf:
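A minimal sketch using TfidfVectorizer on the same hypothetical documents as above; note that scikit-learn's default idf (with smooth_idf=True) differs slightly in form from the lecture formula, though the idea is the same.

```python
from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical stand-in for the cleaned text column.
docs = [
    "data science is fun",
    "text data needs cleaning",
    "vectorization turns text into numbers",
]

# Fit the tf-idf weights and transform the documents in one step.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(docs)

print(tfidf.get_feature_names_out())
print(X_tfidf.todense())
```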

To be more precise:

Term frequency:

$$ \operatorname{tf}(t, d) = \frac{\text{number of times term } t \text{ appears in } d}{\text{total number of terms in } d} $$

Inverse document frequency:

$$ \operatorname{idf}(t) = 1 + \log \left( \frac{N}{n_t} \right) $$

where $N$ is the number of documents and $n_t$ is the number of documents that contain $t$.

Measuring Similarity:

We can measure the similarity of two documents by computing the distance between their term-frequency vectors; one common choice is cosine similarity.

Cosine similarity often works well for language data. The cosine similarity between two vectors $a$ and $b$ is defined as:

$$ \frac{a \cdot b}{\Vert a \Vert \Vert b \Vert} $$

where $\Vert \cdot \Vert$ is the Euclidean norm.
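A minimal sketch, again on hypothetical documents: cosine_similarity from sklearn.metrics.pairwise compares the tf-idf vectors row by row, and transposing the document-term matrix gives term-to-term similarities instead of document-to-document ones.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Hypothetical stand-in for the cleaned text column.
docs = [
    "data science is fun",
    "text data needs cleaning",
    "vectorization turns text into numbers",
]

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(docs)

# Document-to-document cosine similarities.
print(cosine_similarity(X).round(3))

# Term-to-term cosine similarities (transpose the document-term matrix).
print(cosine_similarity(X.T).round(3))
```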

Calculating the cosine similarity shows that we can still find some similarity between different terms. Although the values are quite small, we can conclude that there may exist some similarities between different words in the text column of our original dataset.