# import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# load csv into dataframe
df = pd.read_csv("titanic_passenger_list.csv")

# view the first 10 rows of the dataframe
df.head(10)

# view a 10 row sample of the dataframe
df.sample(10)

# use .shape to determine how many rows and columns are in the dataframe
df.shape

(1309, 14)

# check for duplicate values in the dataframe
df.duplicated().sum()

np.int64(0)

# information on all columns in the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB

# find the data type of each column
df.dtypes

pclass         int64
survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

# find the number of non-null values for each column
df.count()

pclass       1309
survived     1309
name         1309
sex          1309
age          1046
sibsp        1309
parch        1309
ticket       1309
fare         1308
cabin         295
embarked     1307
boat          486
body          121
home.dest     745
dtype: int64

# find the number of null values for each column
df.isna().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

# find statistical information for the numeric columns
df.describe()

df.describe().T

df.nunique()

pclass          3
survived        2
name         1307
sex             2
age            98
sibsp           7
parch           8
ticket        929
fare          281
cabin         186
embarked        3
boat           27
body          121
home.dest     369
dtype: int64

df['survived'].value_counts()

survived
0    809
1    500
Name: count, dtype: int64

df['age'].isna().sum()

np.int64(263)

df.age.describe().T

count    1046.000000
mean       29.881138
std        14.413493
min         0.170000
25%        21.000000
50%        28.000000
75%        39.000000
max        80.000000
Name: age, dtype: float64

df[df['age'].isnull()]

sns.histplot(df['age'])

<Axes: xlabel='age', ylabel='Count'>

# fill every missing age with the constant 30
# NOTE(review): 30 is presumably the mean age (29.88) rounded -- confirm the intended statistic
df.loc[df.age.isnull(), 'age'] = 30
# df['age'] = df['age'].replace(np.nan, ??)

df.age.isnull().sum()

np.int64(0)

# how did this affect the column's statistics?
df.age.describe().T

count    1309.000000
mean       29.905019
std        12.883281
min         0.170000
25%        22.000000
50%        30.000000
75%        35.000000
max        80.000000
Name: age, dtype: float64

df['cabin'].isnull().sum()

np.int64(1014)

df = df.drop(['cabin'], axis=1)
df.sample(10)

df['embarked'].isnull().sum()

np.int64(2)

df[df['embarked'].isnull()]

df['embarked'].mode()

0    S
Name: embarked, dtype: object

df['embarked'].value_counts()

embarked
S    914
C    270
Q    123
Name: count, dtype: int64

df['embarked'] = df['embarked'].replace(np.nan, 'S')

# check which columns still have missing values
df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age             0
sibsp           0
parch           0
ticket          0
fare            1
embarked        0
boat          823
body         1188
home.dest     564
dtype: int64

df['fare'].isnull().sum()

np.int64(1)

df[df['fare'].isnull()]

df.fare.describe().T

count    1308.000000
mean       33.295479
std        51.758668
min         0.000000
25%         7.895800
50%        14.454200
75%        31.275000
max       512.329200
Name: fare, dtype: float64

sns.histplot(df['fare'])

<Axes: xlabel='fare', ylabel='Count'>

df.loc[df.fare.isnull(), 'fare'] = df.fare.median()
# df['fare'] = df['fare'].replace(np.nan, df.fare.median())

df.fare.describe().T

count    1309.000000
mean       33.281086
std        51.741500
min         0.000000
25%         7.895800
50%        14.454200
75%        31.275000
max       512.329200
Name: fare, dtype: float64

df.describe().T

sns.histplot(df['age'])

<Axes: xlabel='age', ylabel='Count'>

df.fare.describe().T

count    1309.000000
mean       33.281086
std        51.741500
min         0.000000
25%         7.895800
50%        14.454200
75%        31.275000
max       512.329200
Name: fare, dtype: float64

sns.histplot(df['fare'])

<Axes: xlabel='fare', ylabel='Count'>

df[df['fare'] > 500]

df.groupby("pclass")["fare"].median()

pclass
1    60.0000
2    15.0458
3     8.0500
Name: fare, dtype: float64

# update the 512.3292 fare prices with the median fare price for pclass 1
# select the outliers with the same `> 500` filter used to inspect them (exact
# float equality is fragile) and compute the median instead of hard-coding 60
df.loc[df['fare'] > 500, 'fare'] = df.loc[df['pclass'] == 1, 'fare'].median()

sns.histplot(df['parch'])

<Axes: xlabel='parch', ylabel='Count'>

sns.histplot(df['sibsp'])

<Axes: xlabel='sibsp', ylabel='Count'>

df.dtypes

pclass         int64
survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

df['embarked'].value_counts()

embarked
S    916
C    270
Q    123
Name: count, dtype: int64

df['parch'].value_counts()

parch
0    1002
1     170
2     113
3       8
4       6
5       6
6       2
9       2
Name: count, dtype: int64

df['sex'].value_counts()

sex
male      843
female    466
Name: count, dtype: int64

df['sibsp'].value_counts()

sibsp
0    891
1    319
2     42
4     22
3     20
8      9
5      6
Name: count, dtype: int64

df.nunique()

pclass          3
survived        2
name         1307
sex             2
age            98
sibsp           7
parch           8
ticket        929
fare          280
embarked        3
boat           27
body          121
home.dest     369
dtype: int64

df = df.drop(['ticket'], axis=1)

sns.set(color_codes=True)

def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Boxplot and histogram combined

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12,7))
    kde: whether to show the density curve (default False)
    bins: number of bins for histogram (default None)
    """
    # two stacked axes sharing the x-axis: a short boxplot (25% of the height)
    # on top of a taller histogram (75% of the height)
    _, (ax_box, ax_hist) = plt.subplots(
        nrows=2,
        sharex=True,
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )

    # showmeans=True adds a marker at the column mean on the boxplot
    sns.boxplot(data=data, x=feature, ax=ax_box, showmeans=True, color="violet")

    # Only forward `bins` when the caller supplied it, so seaborn keeps its own
    # default otherwise.  `is not None` (rather than truthiness) also accepts
    # an array of bin edges, whose truth value is ambiguous.
    # No `palette` is passed: there is no `hue` variable, so seaborn would
    # ignore it.
    hist_kwargs = {"data": data, "x": feature, "kde": kde, "ax": ax_hist}
    if bins is not None:
        hist_kwargs["bins"] = bins
    sns.histplot(**hist_kwargs)

    # mark the mean (green dashed) and the median (black solid) on the histogram
    ax_hist.axvline(data[feature].mean(), color="green", linestyle="--")
    ax_hist.axvline(data[feature].median(), color="black", linestyle="-")

def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot with percentage at the top

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    """

    # denominator for the percentages: every row of the column (len() counts
    # missing values too)
    total = len(data[feature])
    count = data[feature].nunique()  # number of distinct levels of the feature

    # one unit of width per requested bar, plus one unit of margin
    bars = count if n is None else n
    plt.figure(figsize=(bars + 1, 5))

    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        hue=feature,
        legend=False,
        # keep the n most frequent levels, then show them in sorted order
        order=data[feature].value_counts().index[:n].sort_values(),
    )

    for p in ax.patches:
        height = p.get_height()  # bar height = count of this level
        if perc:
            label = f"{100 * height / total:.1f}%"  # share of all rows
        else:
            label = height

        x = p.get_x() + p.get_width() / 2  # horizontal centre of the bar

        # place the label 5 points above the top of the bar
        ax.annotate(
            label,
            (x, height),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )

    plt.show()  # show the plot

def stacked_barplot(data, predictor, target):
    """
    Print the category counts and plot a stacked bar chart

    data: dataframe
    predictor: independent variable
    target: target variable
    """
    count = data[predictor].nunique()  # number of bars; drives the figure width
    # value_counts() orders by frequency (descending), so the last index entry
    # is the least frequent target class; both tables are sorted by that column
    sorter = data[target].value_counts().index[-1]

    # absolute counts, including the "All" margins
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 115)

    # row-normalised shares: each predictor level sums to 1
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    print(tab)

    tab.plot(kind="bar", stacked=True, figsize=(count + 5, 5))
    # anchor the legend just outside the top-right corner of the axes
    # (a single call: a second plt.legend() would simply replace the first)
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()

print(histogram_boxplot.__doc__)

    Boxplot and histogram combined

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12,7))
    kde: whether to show the density curve (default False)
    bins: number of bins for histogram (default None)

print(labeled_barplot.__doc__)

    Barplot with percentage at the top

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)

print(stacked_barplot.__doc__)

    Print the category counts and plot a stacked bar chart

    data: dataframe
    predictor: independent variable
    target: target variable

histogram_boxplot(df, 'age', kde=True)

df.age.describe()

count    1309.000000
mean       29.905019
std        12.883281
min         0.170000
25%        22.000000
50%        30.000000
75%        35.000000
max        80.000000
Name: age, dtype: float64

labeled_barplot(df, 'embarked')

histogram_boxplot(df, 'fare', kde=True)

sns.displot(df['fare'], kde=True)

<seaborn.axisgrid.FacetGrid at 0x1be24bd9d90>

df.fare.describe()

count    1309.000000
mean       31.898873
std        44.448381
min         0.000000
25%         7.895800
50%        14.454200
75%        31.275000
max       263.000000
Name: fare, dtype: float64

histogram_boxplot(df, 'parch', kde=True)

labeled_barplot(df, 'parch')

df.parch.describe()

count    1309.000000
mean        0.385027
std         0.865560
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         9.000000
Name: parch, dtype: float64

labeled_barplot(df, 'pclass')

labeled_barplot(df, 'sex')

sns.countplot(data=df, x='sex')

<Axes: xlabel='sex', ylabel='count'>

histogram_boxplot(df, 'sibsp', kde=True)

labeled_barplot(df, 'sibsp')

df.sibsp.describe()

count    1309.000000
mean        0.498854
std         1.041658
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         8.000000
Name: sibsp, dtype: float64

labeled_barplot(df, 'survived')

pd.crosstab([df.sex,df.survived],df.pclass,margins=True)

plt.figure(figsize=(10,5))
sns.heatmap(df.corr(numeric_only=True), annot=True)

<Axes: >

plt.figure(figsize=(10,5))
sns.pointplot(data=df, x='pclass', y='fare', hue='survived')
#sns.pointplot(data=df, x='fare', y='pclass', hue='survived')
#sns.pointplot(data=df, x='fare', y='survived', hue='pclass')
#sns.pointplot(data=df, x='survived', y='pclass', hue='fare')
#sns.pointplot(data=df, x='pclass', y='survived', hue='fare')
#sns.pointplot(data=df, x='survived', y='fare', hue='pclass')

<Axes: xlabel='pclass', ylabel='fare'>

dt0 = df['age'][df['survived']==0]
dt1 = df['age'][df['survived']==1]

bins = np.arange(0, df['age'].max()+10, 10)

plt.hist([dt0,dt1], label=['Survived: No','Survived: Yes'], bins=bins)

plt.legend(loc='upper right')
plt.title('Frequency of Age')

Text(0.5, 1.0, 'Frequency of Age')

sns.catplot(data=df, x='survived', y='age', kind='box')

<seaborn.axisgrid.FacetGrid at 0x1be24ecdd90>

stacked_barplot(df, 'embarked', 'survived')

survived    0    1   All
embarked                
All       809  500  1309
S         610  306   916
C         120  150   270
Q          79   44   123
-------------------------------------------------------------------------------------------------------------------
survived         0         1
embarked                    
C         0.444444  0.555556
Q         0.642276  0.357724
S         0.665939  0.334061

sns.countplot(x='embarked', hue='survived', data=df)

<Axes: xlabel='embarked', ylabel='count'>

dt0 = df['fare'][df['survived']==0]
dt1 = df['fare'][df['survived']==1]

bins = np.arange(0, df['fare'].max()+25, 25)

plt.hist([dt0,dt1], label=['Survived: No','Survived: Yes'], bins=bins)

plt.legend(loc='upper right')
plt.title('Frequency of Fare')

Text(0.5, 1.0, 'Frequency of Fare')

stacked_barplot(df, 'parch', 'survived')

survived    0    1   All
parch                   
All       809  500  1309
0         666  336  1002
1          70  100   170
2          56   57   113
3           3    5     8
4           5    1     6
5           5    1     6
6           2    0     2
9           2    0     2
-------------------------------------------------------------------------------------------------------------------
survived         0         1
parch                       
3         0.375000  0.625000
1         0.411765  0.588235
2         0.495575  0.504425
0         0.664671  0.335329
4         0.833333  0.166667
5         0.833333  0.166667
6         1.000000  0.000000
9         1.000000  0.000000

sns.countplot(x='parch', hue='survived', data=df)

<Axes: xlabel='parch', ylabel='count'>

stacked_barplot(df, 'pclass', 'survived')

survived    0    1   All
pclass                  
All       809  500  1309
1         123  200   323
3         528  181   709
2         158  119   277
-------------------------------------------------------------------------------------------------------------------
survived         0         1
pclass                      
1         0.380805  0.619195
2         0.570397  0.429603
3         0.744711  0.255289

# df.groupby('pclass')['survived'].value_counts()
sns.countplot(data=df, x='pclass', hue='survived')

<Axes: xlabel='pclass', ylabel='count'>

stacked_barplot(df, 'sex', 'survived')

survived    0    1   All
sex                     
All       809  500  1309
female    127  339   466
male      682  161   843
-------------------------------------------------------------------------------------------------------------------
survived         0         1
sex                         
female    0.272532  0.727468
male      0.809015  0.190985

sns.countplot(x='sex', hue='survived', data=df)

<Axes: xlabel='sex', ylabel='count'>

stacked_barplot(df, 'sibsp', 'survived')

survived    0    1   All
sibsp                   
All       809  500  1309
0         582  309   891
1         156  163   319
2          23   19    42
3          14    6    20
4          19    3    22
5           6    0     6
8           9    0     9
-------------------------------------------------------------------------------------------------------------------
survived         0         1
sibsp                       
1         0.489028  0.510972
2         0.547619  0.452381
0         0.653199  0.346801
3         0.700000  0.300000
4         0.863636  0.136364
5         1.000000  0.000000
8         1.000000  0.000000

sns.countplot(x='sibsp', hue='survived', data=df)

<Axes: xlabel='sibsp', ylabel='count'>

	pclass	survived	name	sex	age	sibsp	parch	ticket	fare	cabin	embarked	boat	body	home.dest
0	1	1	Allen, Miss. Elisabeth Walton	female	29.00	0	0	24160	211.3375	B5	S	2	NaN	St Louis, MO
1	1	1	Allison, Master. Hudson Trevor	male	0.92	1	2	113781	151.5500	C22 C26	S	11	NaN	Montreal, PQ / Chesterville, ON
2	1	0	Allison, Miss. Helen Loraine	female	2.00	1	2	113781	151.5500	C22 C26	S	NaN	NaN	Montreal, PQ / Chesterville, ON
3	1	0	Allison, Mr. Hudson Joshua Creighton	male	30.00	1	2	113781	151.5500	C22 C26	S	NaN	135.0	Montreal, PQ / Chesterville, ON
4	1	0	Allison, Mrs. Hudson J C (Bessie Waldo Daniels)	female	25.00	1	2	113781	151.5500	C22 C26	S	NaN	NaN	Montreal, PQ / Chesterville, ON
5	1	1	Anderson, Mr. Harry	male	48.00	0	0	19952	26.5500	E12	S	3	NaN	New York, NY
6	1	1	Andrews, Miss. Kornelia Theodosia	female	63.00	1	0	13502	77.9583	D7	S	10	NaN	Hudson, NY
7	1	0	Andrews, Mr. Thomas Jr	male	39.00	0	0	112050	0.0000	A36	S	NaN	NaN	Belfast, NI
8	1	1	Appleton, Mrs. Edward Dale (Charlotte Lamson)	female	53.00	2	0	11769	51.4792	C101	S	D	NaN	Bayside, Queens, NY
9	1	0	Artagaveytia, Mr. Ramon	male	71.00	0	0	PC 17609	49.5042	NaN	C	NaN	22.0	Montevideo, Uruguay

	pclass	survived	name	sex	age	sibsp	ticket	fare	cabin	embarked	boat	body	home.dest
915	3	0	Karlsson, Mr. Nils August	male	22.0	0	350060	7.5208	NaN	S	NaN	NaN	NaN
753	3	0	Davies, Mr. Evan	male	22.0	0	SC/A4 23568	8.0500	NaN	S	NaN	NaN	NaN
1272	3	0	Vander Cruyssen, Mr. Victor	male	47.0	0	345765	9.0000	NaN	S	NaN	NaN	NaN
203	1	0	Meyer, Mr. Edgar Joseph	male	28.0	1	PC 17604	82.1708	NaN	C	NaN	NaN	New York, NY
333	2	1	Ball, Mrs. (Ada E Hall)	female	36.0	0	28551	13.0000	D	S	10	NaN	Bristol, Avon / Jacksonville, FL
363	2	0	Campbell, Mr. William	male	NaN	0	239853	0.0000	NaN	S	NaN	NaN	Belfast
391	2	0	del Carlo, Mr. Sebastiano	male	29.0	1	SC/PARIS 2167	27.7208	NaN	C	NaN	295.0	Lucca, Italy / California
257	1	1	Schabert, Mrs. Paul (Emma Mock)	female	35.0	1	13236	57.7500	C28	C	11	NaN	New York, NY
450	2	0	Hodges, Mr. Henry Price	male	50.0	0	250643	13.0000	NaN	S	NaN	149.0	Southampton
582	2	1	Watt, Miss. Bertha J	female	12.0	0	C.A. 33595	15.7500	NaN	S	9	NaN	Aberdeen / Portland, OR

	pclass	survived	name	sex	age	sibsp	parch	ticket	fare	cabin	embarked	boat	body	home.dest
15	1	0	Baumann, Mr. John D	male	NaN	0	0	PC 17318	25.9250	NaN	S	NaN	NaN	New York, NY
37	1	1	Bradley, Mr. George ("George Arthur Brayton")	male	NaN	0	0	111427	26.5500	NaN	S	9	NaN	Los Angeles, CA
40	1	0	Brewe, Dr. Arthur Jackson	male	NaN	0	0	112379	39.6000	NaN	C	NaN	NaN	Philadelphia, PA
46	1	0	Cairns, Mr. Alexander	male	NaN	0	0	113798	31.0000	NaN	S	NaN	NaN	NaN
59	1	1	Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genev...	female	NaN	0	0	17770	27.7208	NaN	C	5	NaN	New York, NY
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1293	3	0	Williams, Mr. Howard Hugh "Harry"	male	NaN	0	0	A/5 2466	8.0500	NaN	S	NaN	NaN	NaN
1297	3	0	Wiseman, Mr. Phillippe	male	NaN	0	0	A/4. 34244	7.2500	NaN	S	NaN	NaN	NaN
1302	3	0	Yousif, Mr. Wazli	male	NaN	0	0	2647	7.2250	NaN	C	NaN	NaN	NaN
1303	3	0	Yousseff, Mr. Gerious	male	NaN	0	0	2627	14.4583	NaN	C	NaN	NaN	NaN
1305	3	0	Zabour, Miss. Thamine	female	NaN	1	0	2665	14.4542	NaN	C	NaN	NaN	NaN

	pclass	survived	name	sex	age	sibsp	parch	ticket	fare	embarked	boat	body	home.dest
902	3	0	Johnston, Mr. Andrew G	male	30.0	1	2	W./C. 6607	23.4500	S	NaN	NaN	NaN
945	3	1	Lam, Mr. Ali	male	30.0	0	0	1601	56.4958	S	C	NaN	NaN
1196	3	1	Sheerlinck, Mr. Jan Baptist	male	29.0	0	0	345779	9.5000	S	11	NaN	NaN
9	1	0	Artagaveytia, Mr. Ramon	male	71.0	0	0	PC 17609	49.5042	C	NaN	22.0	Montevideo, Uruguay
520	2	1	Nourney, Mr. Alfred ("Baron von Drachstedt")	male	20.0	0	0	SC/PARIS 2166	13.8625	C	7	NaN	Cologne, Germany
305	1	0	Weir, Col. John	male	60.0	0	0	113800	26.5500	S	NaN	NaN	England Salt Lake City, Utah
1291	3	0	Willer, Mr. Aaron ("Abi Weller")	male	30.0	0	0	3410	8.7125	S	NaN	NaN	NaN
1175	3	0	Sage, Miss. Stella Anna	female	30.0	8	2	CA. 2343	69.5500	S	NaN	NaN	NaN
715	3	0	Christmann, Mr. Emil	male	29.0	0	0	343276	8.0500	S	NaN	NaN	NaN
705	3	0	Caram, Mr. Joseph	male	30.0	1	0	2689	14.4583	C	NaN	NaN	Ottawa, ON

	pclass	survived	age	sibsp	parch	fare	body
count	1309.000000	1309.000000	1046.000000	1309.000000	1309.000000	1308.000000	121.000000
mean	2.294882	0.381971	29.881138	0.498854	0.385027	33.295479	160.809917
std	0.837836	0.486055	14.413493	1.041658	0.865560	51.758668	97.696922
min	1.000000	0.000000	0.170000	0.000000	0.000000	0.000000	1.000000
25%	2.000000	0.000000	21.000000	0.000000	0.000000	7.895800	72.000000
50%	3.000000	0.000000	28.000000	0.000000	0.000000	14.454200	155.000000
75%	3.000000	1.000000	39.000000	1.000000	0.000000	31.275000	256.000000
max	3.000000	1.000000	80.000000	8.000000	9.000000	512.329200	328.000000

	count	mean	std	min	25%	50%	75%	max
pclass	1309.0	2.294882	0.837836	1.00	2.0000	3.0000	3.000	3.0000
survived	1309.0	0.381971	0.486055	0.00	0.0000	0.0000	1.000	1.0000
age	1046.0	29.881138	14.413493	0.17	21.0000	28.0000	39.000	80.0000
sibsp	1309.0	0.498854	1.041658	0.00	0.0000	0.0000	1.000	8.0000
parch	1309.0	0.385027	0.865560	0.00	0.0000	0.0000	0.000	9.0000
fare	1308.0	33.295479	51.758668	0.00	7.8958	14.4542	31.275	512.3292
body	121.0	160.809917	97.696922	1.00	72.0000	155.0000	256.000	328.0000

	pclass	survived	name	sex	age	sibsp	parch	ticket	fare	embarked	boat	body	home.dest
168	1	1	Icard, Miss. Amelie	female	38.0	0	0	113572	80.0	NaN	6	NaN	NaN
284	1	1	Stone, Mrs. George Nelson (Martha Evelyn)	female	62.0	0	0	113572	80.0	NaN	6	NaN	Cincinatti, OH

	pclass	survived	name	sex	age	parch	ticket	fare	embarked	boat	body	home.dest
49	1	1	Cardeza, Mr. Thomas Drake Martinez	male	36.0	1	PC 17755	512.3292	C	3	NaN	Austria-Hungary / Germantown, Philadelphia, PA
50	1	1	Cardeza, Mrs. James Warburton Martinez (Charlo...	female	58.0	1	PC 17755	512.3292	C	3	NaN	Germantown, Philadelphia, PA
183	1	1	Lesurer, Mr. Gustave J	male	35.0	0	PC 17755	512.3292	C	3	NaN	NaN
302	1	1	Ward, Miss. Anna	female	35.0	0	PC 17755	512.3292	C	3	NaN	NaN

Abstract¶

Objective¶

Data Dictionary¶

Data Overview¶

Observations¶

Data Preprocessing¶

Handling missing values¶

age¶

cabin¶

embarked¶

fare¶

Examining outliers¶

age¶

fare¶

parch¶

sibsp¶

Checking for inconsistent data¶

embarked¶

parch¶

sex¶

sibsp¶

Deleting features that do not affect the prediction¶

Exploratory Data Analysis¶

User functions¶

Univariate Analysis¶

age¶

embarked¶

fare¶

parch¶

pclass¶

sex¶

sibsp¶

survived¶

Multivariate Analysis¶

survived / age¶

survived / embarked¶

survived / fare¶

survived / parch¶

survived / pclass¶

survived / sex¶

survived / sibsp¶

`age`¶

`cabin`¶

`embarked`¶

`fare`¶

`age`¶

`fare`¶

`parch`¶

`sibsp`¶

`embarked`¶

`parch`¶

`sex`¶

`sibsp`¶

`age`¶

`embarked`¶

`fare`¶

`parch`¶

`pclass`¶

`sex`¶

`sibsp`¶

`survived`¶

`survived` / `age`¶

`survived` / `embarked`¶

`survived` / `fare`¶

`survived` / `parch`¶

`survived` / `pclass`¶

`survived` / `sex`¶

`survived` / `sibsp`¶