from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
import nltk
from string import punctuation
from wordcloud import WordCloud


train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")


#count the number of keyword
keyword = train['keyword'].value_counts()
keyword.head(20)

fatalities     45
deluge         42
armageddon     42
sinking        41
damage         41
harm           41
body%20bags    41
outbreak       40
evacuate       40
fear           40
collided       40
siren          40
twister        40
windstorm      40
sinkhole       39
sunk           39
hellfire       39
weapon         39
weapons        39
famine         39
Name: keyword, dtype: int64


ax = sns.barplot(train['location'].value_counts()[:20], train['location'].value_counts().index[:20])
ax.set(xlabel="location appearance counts")

c:\users\lenovo\appdata\local\programs\python\python39\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(

[Text(0.5, 0, 'location appearance counts')]


train_dis = train[train['target']==1]
ax = sns.barplot(train_dis['location'].value_counts()[:20], train_dis['location'].value_counts().index[:20])
ax.set(xlabel="location appearance counts", title="bar plot of disaster tweets location")

c:\users\lenovo\appdata\local\programs\python\python39\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(

[Text(0.5, 0, 'location appearance counts'),
 Text(0.5, 1.0, 'bar plot of disaster tweets location')]


train_non = train[train['target']==0]
ax = sns.barplot(train_non['location'].value_counts()[:20], train_non['location'].value_counts().index[:20])
ax.set(xlabel="location appearance counts", title="bar plot of non-disaster tweets location")

c:\users\lenovo\appdata\local\programs\python\python39\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(

[Text(0.5, 0, 'location appearance counts'),
 Text(0.5, 1.0, 'bar plot of non-disaster tweets location')]


#Renaming location names
train['location'].replace({'United States':'USA',
                           'New York':'USA',
                            "London":'UK',
                            "Los Angeles, CA":'USA',
                            "Washington, D.C.":'USA',
                            "California":'USA',
                             "Chicago, IL":'USA',
                             "Chicago":'USA',
                            "New York, NY":'USA',
                            "California, USA":'USA',
                            "FLorida":'USA',
                            "Nigeria":'Africa',
                            "Kenya":'Africa',
                            "Everywhere":'Worldwide',
                            "San Francisco":'USA',
                            "Florida":'USA',
                            "United Kingdom":'UK',
                            "Los Angeles":'USA',
                            "Toronto":'Canada',
                            "San Francisco, CA":'USA',
                            "NYC":'USA',
                            "Seattle":'USA',
                            "Earth":'Worldwide',
                            "Ireland":'UK',
                            "London, England":'UK',
                            "New York City":'USA',
                            "Texas":'USA',
                            "London, UK":'UK',
                            "Atlanta, GA":'USA',
                            "Mumbai":"India"},inplace=True)
ax = sns.barplot(y = train['location'].value_counts()[:5].index, x = train['location'].value_counts()[:5],
            palette='autumn', orient='h')
ax.set(xlabel="location appearance counts", title="bar plot of tweets location")

[Text(0.5, 0, 'location appearance counts'),
 Text(0.5, 1.0, 'bar plot of tweets location')]


#Renaming location names
train_dis['location'].replace({'United States':'USA',
                           'New York':'USA',
                            "London":'UK',
                            "Los Angeles, CA":'USA',
                            "Washington, D.C.":'USA',
                            "California":'USA',
                             "Chicago, IL":'USA',
                             "Chicago":'USA',
                            "New York, NY":'USA',
                            "California, USA":'USA',
                            "FLorida":'USA',
                            "Nigeria":'Africa',
                            "Kenya":'Africa',
                            "Everywhere":'Worldwide',
                            "San Francisco":'USA',
                            "Florida":'USA',
                            "United Kingdom":'UK',
                            "Los Angeles":'USA',
                            "Toronto":'Canada',
                            "San Francisco, CA":'USA',
                            "NYC":'USA',
                            "Seattle":'USA',
                            "Earth":'Worldwide',
                            "Ireland":'UK',
                            "London, England":'UK',
                            "New York City":'USA',
                            "Texas":'USA',
                            "London, UK":'UK',
                            "Atlanta, GA":'USA',
                            "Mumbai":"India"},inplace=True)
ax = sns.barplot(y = train_dis['location'].value_counts()[:5].index, x = train_dis['location'].value_counts()[:5],
            palette='autumn', orient='h')
ax.set(xlabel="location appearance counts", title="bar plot of disaster tweets location")

C:\Users\Lenovo\AppData\Local\Temp/ipykernel_26796/77062732.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dis['location'].replace({'United States':'USA',

[Text(0.5, 0, 'location appearance counts'),
 Text(0.5, 1.0, 'bar plot of disaster tweets location')]


#Renaming location names
train_non['location'].replace({'United States':'USA',
                           'New York':'USA',
                            "London":'UK',
                            "Los Angeles, CA":'USA',
                            "Washington, D.C.":'USA',
                            "California":'USA',
                             "Chicago, IL":'USA',
                             "Chicago":'USA',
                            "New York, NY":'USA',
                            "California, USA":'USA',
                            "FLorida":'USA',
                            "Nigeria":'Africa',
                            "Kenya":'Africa',
                            "Everywhere":'Worldwide',
                            "San Francisco":'USA',
                            "Florida":'USA',
                            "United Kingdom":'UK',
                            "Los Angeles":'USA',
                            "Toronto":'Canada',
                            "San Francisco, CA":'USA',
                            "NYC":'USA',
                            "Seattle":'USA',
                            "Earth":'Worldwide',
                            "Ireland":'UK',
                            "London, England":'UK',
                            "New York City":'USA',
                            "Texas":'USA',
                            "London, UK":'UK',
                            "Atlanta, GA":'USA',
                            "Mumbai":"India"},inplace=True)
ax = sns.barplot(y = train_non['location'].value_counts()[:5].index, x = train_non['location'].value_counts()[:5],
            palette='autumn', orient='h')
ax.set(xlabel="location appearance counts", title="bar plot of disaster tweets location")

C:\Users\Lenovo\AppData\Local\Temp/ipykernel_26796/4173496029.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_non['location'].replace({'United States':'USA',

[Text(0.5, 0, 'location appearance counts'),
 Text(0.5, 1.0, 'bar plot of disaster tweets location')]


fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,5))
sent=train[train['target']==1]['text'].str.split(".").apply(lambda x : [len(i) for i in x])
sns.distplot(sent.map(lambda x: np.mean(x)),ax=ax1,color='b')
ax1.set_title('disaster')
sent=train[train['target']==0]['text'].str.split(".").apply(lambda x : [len(i) for i in x])
sns.distplot(sent.map(lambda x: np.mean(x)),ax=ax2,color='green')
ax2.set_title('Not disaster')
fig.suptitle('Average Sentences length in each tweet')

/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

<AxesSubplot:xlabel='text', ylabel='Density'>

Text(0.5, 1.0, 'disaster')

/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

<AxesSubplot:xlabel='text', ylabel='Density'>

Text(0.5, 1.0, 'Not disaster')

Text(0.5, 0.98, 'Average Sentences length in each tweet')


fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,5))
sent=train[train['target']==1]['text'].str.split().apply(lambda x : [len(i) for i in x])
sns.distplot(sent.map(lambda x: np.mean(x)),ax=ax1,color='b')
ax1.set_title('disaster')
sent=train[train['target']==0]['text'].str.split().apply(lambda x : [len(i) for i in x])
sns.distplot(sent.map(lambda x: np.mean(x)),ax=ax2,color='green')
ax2.set_title('Not disaster')
fig.suptitle('Average Words length in each tweet')

/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

<AxesSubplot:xlabel='text', ylabel='Density'>

Text(0.5, 1.0, 'disaster')

/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

<AxesSubplot:xlabel='text', ylabel='Density'>

Text(0.5, 1.0, 'Not disaster')

Text(0.5, 0.98, 'Average Sentences length in each tweet')


fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,5))
train_len=train[train['target']==1]['text'].str.len()
ax1.hist(train_len,color='blue')
ax1.set_title('disaster tweets')
train_len=train[train['target']==0]['text'].str.len()
ax2.hist(train_len,color='green')
ax2.set_title('Not disaster tweets')
fig.suptitle('Characters in tweets')
plt.show()

(array([ 31.,  54., 118., 157., 302., 448., 374., 427., 936., 424.]),
 array([ 14. ,  27.7,  41.4,  55.1,  68.8,  82.5,  96.2, 109.9, 123.6,
        137.3, 151. ]),
 <BarContainer object of 10 artists>)

Text(0.5, 1.0, 'disaster tweets')

(array([  77.,  238.,  350.,  381.,  470.,  503.,  530.,  570., 1137.,
          86.]),
 array([  7.,  22.,  37.,  52.,  67.,  82.,  97., 112., 127., 142., 157.]),
 <BarContainer object of 10 artists>)

Text(0.5, 1.0, 'Not disaster tweets')

Text(0.5, 0.98, 'Characters in tweets')


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[26, 8])
p1 = WordCloud( background_color='white',
                        width=600,
                        height=400).generate(" ".join(train[train['target']==1]["text"]))
ax1.imshow(p1)
ax1.axis('off')
ax1.set_title('Disaster Tweets',fontsize=40);

p2 = WordCloud( background_color='white',
                        width=600,
                        height=400).generate(" ".join(train[train['target']==0]["text"]))
ax2.imshow(p2)
ax2.axis('off')
ax2.set_title('Non Disaster Tweets',fontsize=40);

III. Data Visualization¶

Explore the target column distribution¶

Explore number of sentences in tweets encoded by target 0 or 1¶

Explore number of words in tweets encoded by target 0 or 1¶

Explore number of characters in tweets encoded by target 0 or 1¶

Word Clouds¶

Comments¶