Before we start getting our hands dirty with the Python and CSV files, we should first understand what is present in the dataset.
The dataset consists of the following columns:

- PassengerId: unique identifier of the passenger
- Survived: survival status (0 = No, 1 = Yes)
- Pclass: ticket class (1, 2 or 3)
- Name: name of the passenger
- Sex: sex of the passenger
- Age: age in years
- SibSp: number of siblings/spouses aboard
- Parch: number of parents/children aboard
- Ticket: ticket number
- Fare: passenger fare
- Cabin: cabin number
- Embarked: port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
Importing the required Python packages for the project:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
We perform the initial inspection by importing the .csv file into the Jupyter notebook and taking a quick look at its contents, since we are not expected to remember every detail present in the file.
# Read the data and refer to it using the variable `data`
data = pd.read_csv("Titanic-DataSet.csv")
data
Index | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
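If scrolling the full frame is inconvenient, a quicker first look (a minimal sketch) is the shape plus the first few rows:

# Compact first impression: dimensions and the first five rows
print(data.shape)
data.head()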
`variable.dtypes` prints the columns of the dataset along with their datatypes in a tabular manner.
data.dtypes
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
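A related one-step check (a small sketch) is `DataFrame.info()`, which combines the datatype listing with non-null counts:

# info() reports each column's dtype together with its non-null count and memory usage
data.info()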
This step is crucial during the inspection stage, as it gives the analyst a rough idea of how many data cells need to be filled with calculated values (or dropped altogether).
data.isnull().sum()
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
As we can see from the given table, the Age, Cabin and Embarked columns contain null values, which will either be filled or dropped during the data cleaning process.
One has to clean the data, i.e. remove any null values and ambiguities (if any), and try to achieve uniformity in the dataset.
Out of the 891 entries, 687 cells in the Cabin column are empty, so we can safely drop that column as it is of little use to us.
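To make that threshold explicit, here is a small sketch of the per-column missing percentage (687/891 is roughly 77% for Cabin):

# Percentage of missing values per column; Cabin is roughly 77% empty
(data.isnull().mean() * 100).round(1)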
# The inplace parameter makes this change permanent in our DataFrame.
# It is False by default.
data.drop('Cabin', axis=1, inplace=True)
We now print the columns of the dataset to check whether 'Cabin' has been dropped.
for i in data.columns:
print(i)
PassengerId
Survived
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Embarked
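A more direct check (a minimal sketch) is a membership test on the columns:

# Returns False once the Cabin column has been dropped
'Cabin' in data.columns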
Here I clean the data by filling the cells of the Age column that have a NULL value with the mean age.
data['Age'].fillna(data['Age'].mean(), inplace=True)
data.isnull().sum()
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
dtype: int64
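Mean imputation is the simplest choice. A common alternative, sketched here for reference only and not applied in this project, is to fill Age with the median within each Pclass, since typical ages differ between classes:

# Alternative (not applied here): fill missing ages with the per-Pclass median
data['Age'] = data['Age'].fillna(data.groupby('Pclass')['Age'].transform('median'))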
We notice that the Embarked column contains some missing values, so we first find the null entries.
# data['Embarked'].isnull().sum() would show the number of null entries, which turns out to be 2
missingports = data[data['Embarked'].isnull()]
missingports
Index | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|
61 | 62 | 1 | 1 | Icard, Miss. Amelie | female | 38.0 | 0 | 0 | 113572 | 80.0 | NaN |
829 | 830 | 1 | 1 | Stone, Mrs. George Nelson (Martha Evelyn) | female | 62.0 | 0 | 0 | 113572 | 80.0 | NaN |
We notice that both passengers were travelling on the same ticket number. We will fill the Embarked column with the mode of the column, so we first have to find the mode.
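The mode can be read off directly (a minimal sketch); below we instead break the counts down by sex with a crosstab, which shows the same picture:

# Most frequent port of embarkation; returns 'S' for this dataset
data['Embarked'].mode()[0]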
table = pd.crosstab(data['Embarked'],data['Sex'])
print(table)
Sex       female  male
Embarked
C             73    95
Q             36    41
S            203   441
From the above table we conclude that most women embarked at Southampton (S is also the most common port overall). Hence we fill the NaN with 'S' in both cases, and then recheck the null status of the Embarked column using isnull().sum().
data['Embarked'].fillna('S',inplace=True)
data['Embarked'].isnull().sum()
0
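As a final sanity check (a small sketch), we can confirm that no null values remain anywhere in the frame:

# Total number of remaining nulls across all columns; expected to be 0
data.isnull().sum().sum()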
This section of the project consists of basic visualizations of the dataset. Their purpose is to give a better understanding of the data in a clearer, more visual manner.
f, ax = plt.subplots(1,1, figsize=(10, 5))
sns.scatterplot(x="Fare", y="Age", data=data, ax=ax);
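To put a rough number on what the scatter plot shows (a sketch, assuming a simple linear association is what we care about):

# Pearson correlation between Fare and Age
data['Fare'].corr(data['Age'])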
ax = sns.countplot(x="Pclass", data=data)
ax.set_title('Passenger count by Pclass');
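The counts behind this plot can also be printed directly (a minimal sketch):

# Passenger counts per class, the same numbers the count plot draws
data['Pclass'].value_counts().sort_index()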
sns.histplot(data['Age'], bins=25,
color = 'navy', alpha = 1,
binwidth=10, shrink=.95)
sns.catplot(x ="Sex", hue ="Survived",
kind ="count", data = data)
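A numeric companion to this plot (a small sketch) is the survival rate by sex:

# Proportion of survivors within each sex
data.groupby('Sex')['Survived'].mean()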
# Group the dataset by Pclass and Survived and then unstack them
group = data.groupby(['Pclass', 'Survived'])
pclass_survived = group.size().unstack()
# Heatmap - Color encoded 2D representation of data.
sns.heatmap(pclass_survived, annot = True, fmt ="d")
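The heatmap counts can also be read as survival rates per class (a sketch reusing the same `pclass_survived` table):

# Row-normalised view: share of non-survivors and survivors within each class
pclass_survived.div(pclass_survived.sum(axis=1), axis=0)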
fig = plt.figure(figsize=(12, 8))
gs = fig.add_gridspec(3,1)
gs.update(hspace= -0.55)
axes = list()
colors = ["#022133", "#5c693b", "#51371c"]
for idx, cls, c in zip(range(3), sorted(data['Pclass'].unique()), colors):
    axes.append(fig.add_subplot(gs[idx, 0]))

    # you can also draw a density plot with matplotlib + scipy.
    sns.kdeplot(x='Age', data=data[data['Pclass'] == cls],
                fill=True, ax=axes[idx], cut=0, bw_method=0.25,
                lw=1.4, edgecolor='lightgray', hue='Survived',
                multiple="stack", palette='PuBu', alpha=0.7)

    axes[idx].set_ylim(0, 0.04)
    axes[idx].set_xlim(0, 85)
    axes[idx].set_yticks([])
    if idx != 2:
        axes[idx].set_xticks([])
    axes[idx].set_ylabel('')
    axes[idx].set_xlabel('')

    spines = ["top", "right", "left", "bottom"]
    for s in spines:
        axes[idx].spines[s].set_visible(False)

    axes[idx].patch.set_alpha(0)
    axes[idx].text(-0.2, 0, f'Pclass {cls}', fontweight="light",
                   fontfamily='serif', fontsize=11, ha="right")
    if idx != 1:
        axes[idx].get_legend().remove()
fig.text(0.13,0.81,"Age distribution by Pclass", fontfamily='serif', fontsize=16)
plt.show()
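If a single number per class is enough, a quick sketch of the mean age by Pclass tells a similar story:

# Mean age within each passenger class (note: Age was mean-imputed above)
data.groupby('Pclass')['Age'].mean()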
# Violin plot: displays the distribution of data across all levels of a category.
sns.violinplot(x ="Sex", y ="Age", hue ="Survived", data = data, split = True)
This graph summarises the age range of the people who were saved. The survival rate is –
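The overall survival rate can be computed directly (a minimal sketch):

# Overall proportion of survivors in the dataset (about 0.38 here)
data['Survived'].mean()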
sns.catplot(x ='Embarked', hue ='Survived',
kind ='count', col ='Pclass',
data = data)
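A tabular companion to this plot (a small sketch):

# Survivor counts broken down by port of embarkation and passenger class
pd.crosstab([data['Embarked'], data['Pclass']], data['Survived'])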
As with most datasets, the more information we have, the better it can be analysed. I believe that we could add the following variables: