from IPython.core.interactiveshell import InteractiveShell
# Show the value of every bare expression statement in a cell, not only the
# last one, so several results can be inspected from a single cell.
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from string import punctuation
from wordcloud import WordCloud

# Load the training and testing splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))


# Data dictionary: one row per training column, paired with a short
# human-readable description (descriptions are listed in column order).
d = dict(
    Factor=train.columns,
    Description=[
        "tweets id",
        "A keyword from that tweet (may be blank!)",
        "The location the tweet was sent from (may be blank)",
        "The text of a tweet",
        "If it is a disaster tweet or not.",
    ],
)
df_dis = pd.DataFrame(d)
df_dis


# Report the dimensions of both data sets; .shape is (rows, columns), so it
# can be unpacked straight into the two placeholders.
print('The training data set contains {} rows and {} columns'.format(*train.shape))
print('The testing data set contains {} rows and {} columns'.format(*test.shape))

The training data set contains 7613 rows and 5 columns
The testing data set contains 3263 rows and 4 columns


# Missing values in the training and testing data sets.
# Per-column count of null entries: training set first, then testing set.
# Both are bare expressions, so both results are shown only because
# ast_node_interactivity is set to "all" at the top of the file.
train.isnull().sum()
test.isnull().sum()
# Per-column non-null counts and dtypes, plus memory usage, for each data set.
train.info()
test.info()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

id             0
keyword       26
location    1105
text           0
dtype: int64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB

II. Data Description

Import Necessary Packages and Reading Datasets

Variable description

Explore the size and composition of the data sets

Explore the missing values in both training and testing data sets

	Factor	Description
0	id	tweets id
1	keyword	A keyword from that tweet (may be blank!)
2	location	The location the tweet was sent from (may be b...
3	text	The text of a tweet
4	target	If it is a disaster tweet or not.