import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

import sklearn.preprocessing
scaler = sklearn.preprocessing.MinMaxScaler()
labelEncoder = sklearn.preprocessing.LabelEncoder()

sns.set(color_codes=True)
%matplotlib inline

df = pd.read_csv("vacation_package.csv")

def ReturnByPurchase(columnparam):
    """
    Print the mean, median and mode of a column, split by purchase outcome

    Reads the module-level dataframe ``df``. Rows where the column is null
    are left out, and the remaining rows are split on ``ProdTaken``
    (1 is reported as "Purchased", 0 as "Unpurchased").

    columnparam: name of the column in ``df`` to summarise
    """
    has_value = df[columnparam].notnull()
    groups = (
        ("Purchased", df.loc[has_value & (df['ProdTaken'] == 1)]),
        ("Unpurchased", df.loc[has_value & (df['ProdTaken'] == 0)]),
    )

    for position, (label, subset) in enumerate(groups):
        if position:
            # Separator between the two groups
            print("\n-----------\n")
        values = subset[columnparam]
        # print() applies str() to each argument and joins them with a space
        print(f"{label} Mean:", round(values.mean(), 6))
        print(f"{label} Median:", values.median())
        # mode() returns a Series, so every tied mode is printed
        print(f"{label} Mode:", values.mode())

def histogramboxplot(data, feature, figsize=(12, 7), kde=True, bins=None):
    """
    Boxplot and histogram combined

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12,7))
    kde: whether to show the density curve (default True)
    bins: passed to the histogram as its bins argument; when None the
          argument is omitted so the histogram uses its own default
    """
    # Two stacked axes sharing the x-axis: a short one for the boxplot and a
    # taller one for the histogram
    fig, (ax_box, ax_hist) = plt.subplots(
        nrows=2,
        sharex=True,
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )

    # Boxplot; the mean is marked with a red diamond
    sns.boxplot(
        data=data,
        x=feature,
        ax=ax_box,
        showmeans=True,
        meanprops=dict(marker='D', markeredgecolor='black', markerfacecolor='red'),
    )

    # Only forward bins when the caller supplied it. Testing against None
    # (rather than truthiness) keeps array-like bin edges from raising an
    # "ambiguous truth value" error.
    hist_kwargs = {"data": data, "x": feature, "kde": kde, "ax": ax_hist}
    if bins is not None:
        hist_kwargs["bins"] = bins
    sns.histplot(**hist_kwargs)

    # Mean as a solid red line, median as a dashed black line
    ax_hist.axvline(data[feature].mean(), color="red", linestyle="-")
    ax_hist.axvline(data[feature].median(), color="black", linestyle="--")

def stackedbarplot(data, predictor, target):
    """
    Print the category counts and plot a stacked bar chart

    Prints a count crosstab (with margins) and a row-normalised crosstab of
    predictor against target, then plots the normalised table as stacked bars.
    Both tables are sorted in descending order of the least frequent target
    class.

    data: dataframe
    predictor: independent variable
    target: target variable
    """
    count = data[predictor].nunique()
    # Last entry of value_counts() is the least frequent target class
    sorter = data[target].value_counts().index[-1]

    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 115)

    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    print(tab)

    # Widen the figure with the number of predictor levels
    tab.plot(kind="bar", stacked=True, figsize=(count + 5, 5))
    # Legend outside the axes, top right. An earlier "lower left" legend call
    # was removed because this call replaced it immediately.
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()

def descviolinbox(Data, tablename):
    """
    Draw a violin plot and a gray boxplot of one column and return its summary

    Neither plot call is given an explicit axes.

    Data: dataframe
    tablename: column to plot and describe (converted to str before use)

    Returns the result of describe() for that column.
    """
    column = str(tablename)
    summary = Data[column].describe()

    sns.violinplot(data=Data, y=column)
    sns.boxplot(data=Data, y=column, color="gray")

    return summary

def bounds(Data, tablename):
    """
    Print and return the 1.5 * IQR outlier fences for a column

    Data: dataframe
    tablename: column to analyse (converted to str before use)

    Prints Q1, Q3, the IQR and both limits, then returns
    (lower limit, upper limit, Q1, Q3).
    """
    series = Data[str(tablename)]

    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    report = (
        ("Q1", q1),
        ("Q3", q3),
        ("IQR", iqr),
        ("lower limit", lower),
        ("upper limit", upper),
    )
    for label, value in report:
        print(f"{label}: {value!s}")

    return (lower, upper, q1, q3)

def numericdata(Data, tablename):
    """
    Print the mean, median, mode and interquartile range of a numeric column

    Data: dataframe
    tablename: target column (converted to str before use)

    Each value is rounded to 6 decimal places before printing. When several
    values tie for the mode, the first one returned by mode() is reported.
    Nothing is returned.
    """
    series = Data[str(tablename)]

    iqr = series.quantile(0.75) - series.quantile(0.25)
    modes = series.mode()
    # mode() is empty when the column has no non-null values; report NaN
    # instead of failing on the [0] lookup
    mode = modes.iloc[0] if not modes.empty else float("nan")

    print(f"Mean: {round(series.mean(), 6)}")
    print(f"Median: {round(series.median(), 6)}")
    print(f"Mode: {round(mode, 6)}")
    # This is Q3 - Q1, so it is labelled as the IQR rather than "Range"
    print(f"IQR: {round(iqr, 6)}")

def labeldisplot(Data, tablename):
    """
    Distribution plot with a density curve, marking the median and mean

    The median is drawn as a black vertical line and the mean as a red
    dashed vertical line.

    Data: dataframe
    tablename: column to plot (converted to str before use)
    """
    column = str(tablename)
    values = Data[column]

    sns.displot(data=Data, x=column, kde=True)

    markers = (
        (values.median(), {"color": 'black'}),
        (values.mean(), {"color": 'red', "ls": '--'}),
    )
    for position, line_style in markers:
        plt.axvline(x=position, **line_style)

def labeledbarplot(data, feature, perc=False, n=None):
    """
    Barplot with the count or percentage written above each bar

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)

    Percentages are computed against the full length of the column, even
    when only the top n levels are shown.
    """
    total = len(data[feature])  # number of rows in the column
    # One unit of figure width per displayed level, plus one
    shown_levels = data[feature].nunique() if n is None else n
    plt.figure(figsize=(shown_levels + 1, 5))

    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        # Keep the n most frequent levels, then order them by label
        order=data[feature].value_counts().index[:n].sort_values(),
    )

    for p in ax.patches:
        height = p.get_height()  # bar height, i.e. the count for this level
        if perc:
            label = "{:.1f}%".format(100 * height / total)
        else:
            label = height

        ax.annotate(
            label,
            (p.get_x() + p.get_width() / 2, height),  # top centre of the bar
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),  # nudge the text 5 points above the bar
            textcoords="offset points",
        )

    plt.show()  # show the plot

def stackedbox(data, value, hue):
    """
    Box plot of one column for each level of a grouping column

    data: dataframe
    value: column plotted on the y-axis
    hue: column whose levels are placed along the x-axis
    """
    sns.catplot(kind="box", data=data, x=hue, y=value)

def minmeanmax(data, independent, dependent):
    """
    Print the min, mean and max of a column for each value of a binary column

    Uses the dataframe passed as ``data`` (not the module-level dataframe).
    Rows where ``dependent`` equals 0 are reported as "Not Purchased" and
    rows where it equals 1 as "Purchased". The mean is rounded to 6 decimals.

    data: dataframe
    independent: the column to summarise
    dependent: the column to group by (expected to hold 0 / 1)
    """
    # Add further (heading, subset) pairs here if the dependent variable has
    # more than two values; keep this count low
    groups = (
        ("Not Purchased Min Mean and Max:" + '\n', data.loc[data[dependent] == 0]),
        ("\n" + "Purchased Min Mean and Max:" + "\n", data.loc[data[dependent] == 1]),
    )

    for heading, subset in groups:
        values = subset[independent]
        print(heading)
        print(f' Min: {values.min()}')
        print(f' Mean: {round(values.mean(), 6)}')
        print(f' Max: {values.max()}')

# df.sample(10)

# df.sample(10)

df.shape

(4888, 20)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CustomerID                4888 non-null   int64  
 1   ProdTaken                 4888 non-null   int64  
 2   Age                       4662 non-null   float64
 3   TypeofContact             4863 non-null   object 
 4   CityTier                  4888 non-null   int64  
 5   DurationOfPitch           4637 non-null   float64
 6   Occupation                4888 non-null   object 
 7   Gender                    4888 non-null   object 
 8   NumberOfPersonVisiting    4888 non-null   int64  
 9   NumberOfFollowups         4843 non-null   float64
 10  ProductPitched            4888 non-null   object 
 11  PreferredPropertyStar     4862 non-null   float64
 12  MaritalStatus             4888 non-null   object 
 13  NumberOfTrips             4748 non-null   float64
 14  Passport                  4888 non-null   int64  
 15  PitchSatisfactionScore    4888 non-null   int64  
 16  OwnCar                    4888 non-null   int64  
 17  NumberOfChildrenVisiting  4822 non-null   float64
 18  Designation               4888 non-null   object 
 19  MonthlyIncome             4655 non-null   float64
dtypes: float64(7), int64(7), object(6)
memory usage: 763.9+ KB

df.isnull().sum()

CustomerID                    0
ProdTaken                     0
Age                         226
TypeofContact                25
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

df.duplicated().sum()

0

df.describe().T

# df.mode()

# df.corr()

df.nunique()

CustomerID                  4888
ProdTaken                      2
Age                           44
TypeofContact                  2
CityTier                       3
DurationOfPitch               34
Occupation                     4
Gender                         3
NumberOfPersonVisiting         5
NumberOfFollowups              6
ProductPitched                 5
PreferredPropertyStar          3
MaritalStatus                  4
NumberOfTrips                 12
Passport                       2
PitchSatisfactionScore         5
OwnCar                         2
NumberOfChildrenVisiting       4
Designation                    5
MonthlyIncome               2475
dtype: int64

df['Gender'].value_counts()

Male       2916
Female     1817
Fe Male     155
Name: Gender, dtype: int64

df['MaritalStatus'].value_counts()

Married      2340
Divorced      950
Single        916
Unmarried     682
Name: MaritalStatus, dtype: int64

df = df.drop(['CustomerID'], axis = 1)

# df.head(5)

df.isnull().sum()

ProdTaken                     0
Age                         226
TypeofContact                25
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

df.Age.isna().sum()

226

df.Age.describe().T

count    4662.000000
mean       37.622265
std         9.316387
min        18.000000
25%        31.000000
50%        36.000000
75%        44.000000
max        61.000000
Name: Age, dtype: float64

sns.displot(df['Age'], kde = True)
plt.axvline(x = df.Age.median(), color = 'black', ls='--')
plt.axvline(x = df.Age.mean(), color = 'red')

<matplotlib.lines.Line2D at 0x249363edb20>

ReturnByPurchase('Age')

Purchased Mean: 34.770548
Purchased Median: 33.0
Purchased Mode: 0    29.0
Name: Age, dtype: float64

-----------

Unpurchased Mean: 38.282092
Unpurchased Median: 37.0
Unpurchased Mode: 0    36.0
Name: Age, dtype: float64

df.loc[(df['Age'].isnull() == True) & (df['ProdTaken'] == 1), ['Age']] = 33
df.loc[(df['Age'].isnull() == True) & (df['ProdTaken'] == 0),['Age']] = 37

df.Age.isnull().sum()

0

df.Age.describe().T

count    4888.000000
mean       37.557488
std         9.109545
min        18.000000
25%        31.000000
50%        37.000000
75%        43.000000
max        61.000000
Name: Age, dtype: float64

sns.displot(df['Age'], kde = True)
plt.axvline(x = df.Age.median(), color = 'black', ls='--')
plt.axvline(x = df.Age.mean(), color = 'red')

<matplotlib.lines.Line2D at 0x24937bbf520>

df.TypeofContact.isna().sum()

25

# df[df['TypeofContact'].isnull()]

df['TypeofContact'].mode()

0    Self Enquiry
Name: TypeofContact, dtype: object

df.loc[df.TypeofContact.isnull(), 'TypeofContact'] = 'Self Enquiry'

df.TypeofContact.isna().sum()

0

df.DurationOfPitch.isna().sum()

251

df.DurationOfPitch.describe()

count    4637.000000
mean       15.490835
std         8.519643
min         5.000000
25%         9.000000
50%        13.000000
75%        20.000000
max       127.000000
Name: DurationOfPitch, dtype: float64

sns.displot(df['DurationOfPitch'], kde = True)
plt.axvline(x = df.DurationOfPitch.median(), color = 'black', ls='--')
plt.axvline(x = df.DurationOfPitch.mean(), color = 'red')

<matplotlib.lines.Line2D at 0x24937bd7ca0>

ReturnByPurchase('DurationOfPitch')

Purchased Mean: 16.873143
Purchased Median: 15.0
Purchased Mode: 0    9.0
Name: DurationOfPitch, dtype: float64

-----------

Unpurchased Mean: 15.169325
Unpurchased Median: 13.0
Unpurchased Mode: 0    9.0
Name: DurationOfPitch, dtype: float64

df.loc[(df['DurationOfPitch'].isnull() == True) & (df['ProdTaken'] == 1), ['DurationOfPitch']] = 15
df.loc[(df['DurationOfPitch'].isnull() == True) & (df['ProdTaken'] == 0),['DurationOfPitch']] = 13

df.DurationOfPitch.describe()

count    4888.000000
mean       15.381342
std         8.313127
min         5.000000
25%         9.000000
50%        13.000000
75%        19.000000
max       127.000000
Name: DurationOfPitch, dtype: float64

sns.displot(df['DurationOfPitch'], kde = True)
plt.axvline(x = df.DurationOfPitch.median(), color = 'black', ls='--')
plt.axvline(x = df.DurationOfPitch.mean(), color = 'red')

<matplotlib.lines.Line2D at 0x24937e9bfd0>

df.NumberOfFollowups.isna().sum()

45

df.NumberOfFollowups.describe()

count    4843.000000
mean        3.708445
std         1.002509
min         1.000000
25%         3.000000
50%         4.000000
75%         4.000000
max         6.000000
Name: NumberOfFollowups, dtype: float64

sns.displot(df['NumberOfFollowups'], kde = True)
plt.axvline(x = df.NumberOfFollowups.median(), color = 'black', ls='--')
plt.axvline(x = df.NumberOfFollowups.mean(), color = 'red')

<matplotlib.lines.Line2D at 0x24937f74310>

ReturnByPurchase('NumberOfFollowups')

Purchased Mean: 3.941886
Purchased Median: 4.0
Purchased Mode: 0    4.0
Name: NumberOfFollowups, dtype: float64

-----------

Unpurchased Mean: 3.654286
Unpurchased Median: 4.0
Unpurchased Mode: 0    4.0
Name: NumberOfFollowups, dtype: float64

df.loc[df['NumberOfFollowups'].isnull(), ['NumberOfFollowups']] = 4

df.NumberOfFollowups.isna().sum()

0

df.NumberOfFollowups.describe()

count    4888.000000
mean        3.711129
std         0.998271
min         1.000000
25%         3.000000
50%         4.000000
75%         4.000000
max         6.000000
Name: NumberOfFollowups, dtype: float64

sns.displot(df['NumberOfFollowups'], kde = True)
plt.axvline(x = df.NumberOfFollowups.median(), color = 'black', ls='--')
plt.axvline(x = df.NumberOfFollowups.mean(), color = 'red')

<matplotlib.lines.Line2D at 0x24937fe0610>

df.PreferredPropertyStar.isna().sum()

26

df.PreferredPropertyStar.describe()

count    4862.000000
mean        3.581037
std         0.798009
min         3.000000
25%         3.000000
50%         3.000000
75%         4.000000
max         5.000000
Name: PreferredPropertyStar, dtype: float64

sns.displot(df['PreferredPropertyStar'], kde = True)
plt.axvline(x = df.PreferredPropertyStar.median(), color = 'black', ls='--')
plt.axvline(x = df.PreferredPropertyStar.mean(), color = 'red')

<matplotlib.lines.Line2D at 0x249380501f0>

ReturnByPurchase('PreferredPropertyStar')

Purchased Mean: 3.746171
Purchased Median: 3.0
Purchased Mode: 0    3.0
Name: PreferredPropertyStar, dtype: float64

-----------

Unpurchased Mean: 3.542806
Unpurchased Median: 3.0
Unpurchased Mode: 0    3.0
Name: PreferredPropertyStar, dtype: float64

df.loc[df['PreferredPropertyStar'].isnull(), ['PreferredPropertyStar']] = 3

df.PreferredPropertyStar.isna().sum()

0

df.PreferredPropertyStar.describe()

count    4888.000000
mean        3.577946
std         0.797005
min         3.000000
25%         3.000000
50%         3.000000
75%         4.000000
max         5.000000
Name: PreferredPropertyStar, dtype: float64

sns.displot(df['PreferredPropertyStar'], kde = True)
plt.axvline(x = df.PreferredPropertyStar.median(), color = 'black', ls='--')
plt.axvline(x = df.PreferredPropertyStar.mean(), color = 'red')

<matplotlib.lines.Line2D at 0x249380d2f10>

df.NumberOfTrips.isna().sum()

140

df.NumberOfTrips.describe()

count    4748.000000
mean        3.236521
std         1.849019
min         1.000000
25%         2.000000
50%         3.000000
75%         4.000000
max        22.000000
Name: NumberOfTrips, dtype: float64

sns.displot(df['NumberOfTrips'], kde = True)
plt.axvline(x = df.NumberOfTrips.median(), color = 'black', ls='--')
plt.axvline(x = df.NumberOfTrips.mean(), color = 'red')

<matplotlib.lines.Line2D at 0x2493912a880>

ReturnByPurchase('NumberOfTrips')

Purchased Mean: 3.30837
Purchased Median: 3.0
Purchased Mode: 0    2.0
Name: NumberOfTrips, dtype: float64

-----------

Unpurchased Mean: 3.219531
Unpurchased Median: 3.0
Unpurchased Mode: 0    2.0
Name: NumberOfTrips, dtype: float64

df.loc[df['NumberOfTrips'].isnull(), ['NumberOfTrips']] = 3

df.NumberOfTrips.isna().sum()

0

df.NumberOfTrips.describe()

count    4888.000000
mean        3.229746
std         1.822769
min         1.000000
25%         2.000000
50%         3.000000
75%         4.000000
max        22.000000
Name: NumberOfTrips, dtype: float64

sns.displot(df['NumberOfTrips'], kde = True)
plt.axvline(x = df.NumberOfTrips.median(), color = 'black', ls='--')
plt.axvline(x = df.NumberOfTrips.mean(), color = 'red')

<matplotlib.lines.Line2D at 0x249391926a0>

df.NumberOfChildrenVisiting.isna().sum()

66

df.NumberOfChildrenVisiting.describe()

count    4822.000000
mean        1.187267
std         0.857861
min         0.000000
25%         1.000000
50%         1.000000
75%         2.000000
max         3.000000
Name: NumberOfChildrenVisiting, dtype: float64

sns.displot(df['NumberOfChildrenVisiting'], kde = True)
plt.axvline(x = df.NumberOfChildrenVisiting.median(), color = 'black', ls='--')
plt.axvline(x = df.NumberOfChildrenVisiting.mean(), color = 'red')

<matplotlib.lines.Line2D at 0x24939393970>

ReturnByPurchase('NumberOfChildrenVisiting')

Purchased Mean: 1.200438
Purchased Median: 1.0
Purchased Mode: 0    1.0
Name: NumberOfChildrenVisiting, dtype: float64

-----------

Unpurchased Mean: 1.18419
Unpurchased Median: 1.0
Unpurchased Mode: 0    1.0
Name: NumberOfChildrenVisiting, dtype: float64

df.loc[df['NumberOfChildrenVisiting'].isnull(), ['NumberOfChildrenVisiting']] = 1

df.NumberOfChildrenVisiting.isna().sum()

0

df.NumberOfChildrenVisiting.describe()

count    4888.000000
mean        1.184738
std         0.852323
min         0.000000
25%         1.000000
50%         1.000000
75%         2.000000
max         3.000000
Name: NumberOfChildrenVisiting, dtype: float64

sns.displot(df['NumberOfChildrenVisiting'], kde = True)
plt.axvline(x = df.NumberOfChildrenVisiting.median(), color = 'black', ls='--')
plt.axvline(x = df.NumberOfChildrenVisiting.mean(), color = 'red')

<matplotlib.lines.Line2D at 0x2493924f580>

df.MonthlyIncome.isna().sum()

233

df.MonthlyIncome.describe()

count     4655.000000
mean     23619.853491
std       5380.698361
min       1000.000000
25%      20346.000000
50%      22347.000000
75%      25571.000000
max      98678.000000
Name: MonthlyIncome, dtype: float64

sns.displot(df['MonthlyIncome'], kde = True)
plt.axvline(x = df.MonthlyIncome.median(), color = 'black', ls='--')
plt.axvline(x = df.MonthlyIncome.mean(), color = 'red')

<matplotlib.lines.Line2D at 0x2493951fb50>

ReturnByPurchase('MonthlyIncome')

Purchased Mean: 22172.827703
Purchased Median: 21172.0
Purchased Mode: 0    17293.0
1    17404.0
2    20971.0
3    21082.0
Name: MonthlyIncome, dtype: float64

-----------

Unpurchased Mean: 23960.962835
Unpurchased Median: 22729.0
Unpurchased Mode: 0    20855.0
Name: MonthlyIncome, dtype: float64

df.loc[(df['MonthlyIncome'].isnull() == True) & (df['ProdTaken'] == 1), ['MonthlyIncome']] = 21172
df.loc[(df['MonthlyIncome'].isnull() == True) & (df['ProdTaken'] == 0),['MonthlyIncome']] = 22729

df.MonthlyIncome.isna().sum()

0

df.MonthlyIncome.describe()

count     4888.000000
mean     23567.195376
std       5257.438805
min       1000.000000
25%      20485.000000
50%      22595.500000
75%      25424.750000
max      98678.000000
Name: MonthlyIncome, dtype: float64

# Distribution of MonthlyIncome after imputation (median dashed black, mean red)
sns.displot(df['MonthlyIncome'], kde = True)
plt.axvline(x = df.MonthlyIncome.median(), color = 'black', ls='--')
plt.axvline(x = df.MonthlyIncome.mean(), color = 'red')

<matplotlib.lines.Line2D at 0x2493a708070>

df.isna().sum()

ProdTaken                   0
Age                         0
TypeofContact               0
CityTier                    0
DurationOfPitch             0
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups           0
ProductPitched              0
PreferredPropertyStar       0
MaritalStatus               0
NumberOfTrips               0
Passport                    0
PitchSatisfactionScore      0
OwnCar                      0
NumberOfChildrenVisiting    0
Designation                 0
MonthlyIncome               0
dtype: int64

df.describe().T

df.DurationOfPitch.describe()

count    4888.000000
mean       15.381342
std         8.313127
min         5.000000
25%         9.000000
50%        13.000000
75%        19.000000
max       127.000000
Name: DurationOfPitch, dtype: float64

histogramboxplot(data = df, feature = 'DurationOfPitch')

# df[df['DurationOfPitch'] > 120]

df.groupby("ProdTaken")["DurationOfPitch"].median()

ProdTaken
0    13.0
1    15.0
Name: DurationOfPitch, dtype: float64

df.loc[df['DurationOfPitch'] > 125, 'DurationOfPitch'] = 13

histogramboxplot(data = df, feature = 'DurationOfPitch')

df.DurationOfPitch.describe()

count    4888.000000
mean       15.334902
std         8.003437
min         5.000000
25%         9.000000
50%        13.000000
75%        19.000000
max        36.000000
Name: DurationOfPitch, dtype: float64

histogramboxplot(data = df, feature = 'NumberOfPersonVisiting')

df.DurationOfPitch.describe()

count    4888.000000
mean       15.334902
std         8.003437
min         5.000000
25%         9.000000
50%        13.000000
75%        19.000000
max        36.000000
Name: DurationOfPitch, dtype: float64

histogramboxplot(data = df, feature = 'NumberOfFollowups')

df.DurationOfPitch.describe()

count    4888.000000
mean       15.334902
std         8.003437
min         5.000000
25%         9.000000
50%        13.000000
75%        19.000000
max        36.000000
Name: DurationOfPitch, dtype: float64

histogramboxplot(data = df, feature = 'NumberOfTrips')

# df[df['NumberOfTrips'] > 15]

df.groupby("ProdTaken")["NumberOfTrips"].median()

ProdTaken
0    3.0
1    3.0
Name: NumberOfTrips, dtype: float64

df.loc[df['NumberOfTrips'] > 18, 'NumberOfTrips'] = 3

histogramboxplot(data = df, feature = 'NumberOfTrips')

df.DurationOfPitch.describe()

count    4888.000000
mean       15.334902
std         8.003437
min         5.000000
25%         9.000000
50%        13.000000
75%        19.000000
max        36.000000
Name: DurationOfPitch, dtype: float64

histogramboxplot(data = df, feature = 'MonthlyIncome')

# df[df['MonthlyIncome'] < 10000]

# df[df['MonthlyIncome'] > 80000]

df.groupby("ProdTaken")["MonthlyIncome"].median()

ProdTaken
0    22729.0
1    21172.0
Name: MonthlyIncome, dtype: float64

df.loc[df['MonthlyIncome'] > 80000, 'MonthlyIncome'] = 22729
df.loc[df['MonthlyIncome'] < 10000, 'MonthlyIncome'] = 22729

histogramboxplot(data = df, feature = 'MonthlyIncome')

df['Gender'].value_counts()

Male       2916
Female     1817
Fe Male     155
Name: Gender, dtype: int64

df.loc[df['Gender'] == 'Fe Male', ['Gender']] = 'Female'

df['Gender'].value_counts()

Male      2916
Female    1972
Name: Gender, dtype: int64

# df.head(5)

df.describe().T

descviolinbox(df, 'Age')

count    4888.000000
mean       37.557488
std         9.109545
min        18.000000
25%        31.000000
50%        37.000000
75%        43.000000
max        61.000000
Name: Age, dtype: float64

labeldisplot(df, 'Age')

bounds(df, 'Age')

Q1: 31.0
Q3: 43.0
IQR: 12.0
lower limit: 13.0
upper limit: 61.0

(13.0, 61.0, 31.0, 43.0)

numericdata(df, 'Age')

Mean: 37.557488
Median: 37.0
Mode: 37.0
Range: 12.0

descviolinbox(df, 'DurationOfPitch')

count    4888.000000
mean       15.334902
std         8.003437
min         5.000000
25%         9.000000
50%        13.000000
75%        19.000000
max        36.000000
Name: DurationOfPitch, dtype: float64

labeldisplot(df, 'DurationOfPitch')

bounds(df, 'DurationOfPitch')

Q1: 9.0
Q3: 19.0
IQR: 10.0
lower limit: -6.0
upper limit: 34.0

(-6.0, 34.0, 9.0, 19.0)

numericdata(df, 'DurationOfPitch')

Mean: 15.334902
Median: 13.0
Mode: 9.0
Range: 10.0

descviolinbox(df, 'MonthlyIncome')

count     4888.000000
mean     23545.010434
std       5026.428314
min      16009.000000
25%      20486.750000
50%      22596.500000
75%      25407.750000
max      38677.000000
Name: MonthlyIncome, dtype: float64

labeldisplot(df, 'MonthlyIncome')

bounds(df, 'MonthlyIncome')

Q1: 20486.75
Q3: 25407.75
IQR: 4921.0
lower limit: 13105.25
upper limit: 32789.25

(13105.25, 32789.25, 20486.75, 25407.75)

numericdata(df, 'MonthlyIncome')

Mean: 23545.010434
Median: 22596.5
Mode: 22729.0
Range: 4921.0

labeledbarplot(df, 'ProdTaken')

labeledbarplot(df, 'TypeofContact')

labeledbarplot(df, 'CityTier')

labeledbarplot(df, 'Occupation')

labeledbarplot(df, 'Gender', perc= True )

labeledbarplot(df, 'NumberOfPersonVisiting')

labeledbarplot(df, 'NumberOfFollowups')

labeledbarplot(df, 'ProductPitched')

labeledbarplot(df, 'PreferredPropertyStar')

labeledbarplot(df, 'MaritalStatus')

labeledbarplot(df, 'NumberOfTrips')

labeledbarplot(df, 'Passport', perc = True)

labeledbarplot(df, 'PitchSatisfactionScore')

labeledbarplot(df, 'OwnCar', perc =True)

labeledbarplot(df, 'NumberOfChildrenVisiting')

labeledbarplot(df, 'Designation')

# Correlate numeric columns only: the frame still contains object-typed
# columns, which corr() no longer drops silently in newer pandas releases
fig, ax = plt.subplots(figsize=(12,12))
sns.heatmap(df.select_dtypes(include="number").corr(), annot = True, cmap = "mako", square = True, ax = ax)

<AxesSubplot:>

sns.pairplot(df, diag_kind="kde", hue = 'ProdTaken')

<seaborn.axisgrid.PairGrid at 0x2493a99fd30>

# numeric_only=True keeps the object-typed columns out of the mean
df.groupby(['ProdTaken']).mean(numeric_only=True).T

stackedbox(df, 'Age', 'ProdTaken')

minmeanmax(df, 'Age', 'ProdTaken')

Not Purchased Min Mean and Max:

 Min: 18.0
 Mean: 38.223286
 Max: 61.0

Purchased Min Mean and Max:

 Min: 18.0
 Mean: 34.68587
 Max: 60.0

stackedbox(df, 'DurationOfPitch', 'ProdTaken')

minmeanmax(df, 'DurationOfPitch', 'ProdTaken')

Not Purchased Min Mean and Max:

 Min: 5.0
 Mean: 14.999496
 Max: 36.0

Purchased Min Mean and Max:

 Min: 6.0
 Mean: 16.781522
 Max: 36.0

stackedbox(df, 'MonthlyIncome', 'ProdTaken')

minmeanmax(df, 'MonthlyIncome', 'ProdTaken')

Not Purchased Min Mean and Max:

 Min: 16051.0
 Mean: 23871.228831
 Max: 38677.0

Purchased Min Mean and Max:

 Min: 16009.0
 Mean: 22138.016304
 Max: 38537.0

# df.head(5)

stackedbarplot(df, 'TypeofContact', 'ProdTaken')

ProdTaken           0    1   All
TypeofContact                   
All              3968  920  4888
Self Enquiry     2859  610  3469
Company Invited  1109  310  1419
-------------------------------------------------------------------------------------------------------------------
ProdTaken               0         1
TypeofContact                      
Company Invited  0.781536  0.218464
Self Enquiry     0.824157  0.175843

sns.countplot(data= df, x='TypeofContact', hue = 'ProdTaken')

<AxesSubplot:xlabel='TypeofContact', ylabel='count'>

stackedbarplot(df, 'CityTier', 'ProdTaken')

ProdTaken     0    1   All
CityTier                  
All        3968  920  4888
1          2670  520  3190
3          1146  354  1500
2           152   46   198
-------------------------------------------------------------------------------------------------------------------
ProdTaken         0         1
CityTier                     
3          0.764000  0.236000
2          0.767677  0.232323
1          0.836991  0.163009

sns.countplot(data= df, x='CityTier', hue = 'ProdTaken')

<AxesSubplot:xlabel='CityTier', ylabel='count'>

stackedbarplot(df, 'Occupation', 'ProdTaken')

ProdTaken          0    1   All
Occupation                     
All             3968  920  4888
Salaried        1954  414  2368
Small Business  1700  384  2084
Large Business   314  120   434
Free Lancer        0    2     2
-------------------------------------------------------------------------------------------------------------------
ProdTaken              0         1
Occupation                        
Free Lancer     0.000000  1.000000
Large Business  0.723502  0.276498
Small Business  0.815739  0.184261
Salaried        0.825169  0.174831

sns.countplot(data= df, x='Occupation', hue = 'ProdTaken')

<AxesSubplot:xlabel='Occupation', ylabel='count'>

df.loc[df['Occupation'] == 'Free Lancer']

stackedbarplot(df, 'Gender', 'ProdTaken')

ProdTaken     0    1   All
Gender                    
All        3968  920  4888
Male       2338  578  2916
Female     1630  342  1972
-------------------------------------------------------------------------------------------------------------------
ProdTaken         0         1
Gender                       
Male       0.801783  0.198217
Female     0.826572  0.173428

sns.countplot(data= df, x='Gender', hue = 'ProdTaken')

<AxesSubplot:xlabel='Gender', ylabel='count'>

stackedbarplot(df, 'NumberOfPersonVisiting', 'ProdTaken')

ProdTaken                  0    1   All
NumberOfPersonVisiting                 
All                     3968  920  4888
3                       1942  460  2402
2                       1151  267  1418
4                        833  193  1026
1                         39    0    39
5                          3    0     3
-------------------------------------------------------------------------------------------------------------------
ProdTaken                      0         1
NumberOfPersonVisiting                    
3                       0.808493  0.191507
2                       0.811707  0.188293
4                       0.811891  0.188109
1                       1.000000  0.000000
5                       1.000000  0.000000

sns.countplot(data= df, x='NumberOfPersonVisiting', hue = 'ProdTaken')

<AxesSubplot:xlabel='NumberOfPersonVisiting', ylabel='count'>

stackedbarplot(df, 'NumberOfFollowups', 'ProdTaken')

ProdTaken             0    1   All
NumberOfFollowups                 
All                3968  920  4888
4.0                1726  387  2113
3.0                1222  244  1466
5.0                 577  191   768
6.0                  82   54   136
2.0                 205   24   229
1.0                 156   20   176
-------------------------------------------------------------------------------------------------------------------
ProdTaken                 0         1
NumberOfFollowups                    
6.0                0.602941  0.397059
5.0                0.751302  0.248698
4.0                0.816848  0.183152
3.0                0.833561  0.166439
1.0                0.886364  0.113636
2.0                0.895197  0.104803

sns.countplot(data= df, x='NumberOfFollowups', hue = 'ProdTaken')

<AxesSubplot:xlabel='NumberOfFollowups', ylabel='count'>

stackedbarplot(df, 'ProductPitched', 'ProdTaken')

ProdTaken          0    1   All
ProductPitched                 
All             3968  920  4888
Basic           1290  552  1842
Deluxe          1528  204  1732
Standard         618  124   742
King             210   20   230
Super Deluxe     322   20   342
-------------------------------------------------------------------------------------------------------------------
ProdTaken              0         1
ProductPitched                    
Basic           0.700326  0.299674
Standard        0.832884  0.167116
Deluxe          0.882217  0.117783
King            0.913043  0.086957
Super Deluxe    0.941520  0.058480

sns.countplot(data= df, x='ProductPitched', hue = 'ProdTaken')

<AxesSubplot:xlabel='ProductPitched', ylabel='count'>

stackedbarplot(df, 'PreferredPropertyStar', 'ProdTaken')

ProdTaken                 0    1   All
PreferredPropertyStar                 
All                    3968  920  4888
3.0                    2531  488  3019
5.0                     706  250   956
4.0                     731  182   913
-------------------------------------------------------------------------------------------------------------------
ProdTaken                     0         1
PreferredPropertyStar                    
5.0                    0.738494  0.261506
4.0                    0.800657  0.199343
3.0                    0.838357  0.161643

sns.countplot(data= df, x='PreferredPropertyStar', hue = 'ProdTaken')

<AxesSubplot:xlabel='PreferredPropertyStar', ylabel='count'>

stackedbarplot(df, 'MaritalStatus', 'ProdTaken')

ProdTaken         0    1   All
MaritalStatus                 
All            3968  920  4888
Married        2014  326  2340
Single          612  304   916
Unmarried       516  166   682
Divorced        826  124   950
-------------------------------------------------------------------------------------------------------------------
ProdTaken             0         1
MaritalStatus                    
Single         0.668122  0.331878
Unmarried      0.756598  0.243402
Married        0.860684  0.139316
Divorced       0.869474  0.130526

sns.countplot(data= df, x='MaritalStatus', hue = 'ProdTaken')

<AxesSubplot:xlabel='MaritalStatus', ylabel='count'>

stackedbarplot(df, 'NumberOfTrips', 'ProdTaken')

ProdTaken         0    1   All
NumberOfTrips                 
All            3968  920  4888
2.0            1165  299  1464
3.0             992  231  1223
1.0             508  112   620
6.0             258   64   322
5.0             396   62   458
7.0             156   62   218
4.0             417   61   478
8.0              76   29   105
-------------------------------------------------------------------------------------------------------------------
ProdTaken             0         1
NumberOfTrips                    
7.0            0.715596  0.284404
8.0            0.723810  0.276190
2.0            0.795765  0.204235
6.0            0.801242  0.198758
3.0            0.811120  0.188880
1.0            0.819355  0.180645
5.0            0.864629  0.135371
4.0            0.872385  0.127615

sns.countplot(data= df, x='NumberOfTrips', hue = 'ProdTaken')

<AxesSubplot:xlabel='NumberOfTrips', ylabel='count'>

stackedbarplot(df, 'Passport', 'ProdTaken')

ProdTaken     0    1   All
Passport                  
All        3968  920  4888
1           928  494  1422
0          3040  426  3466
-------------------------------------------------------------------------------------------------------------------
ProdTaken         0         1
Passport                     
1          0.652602  0.347398
0          0.877092  0.122908

sns.countplot(data= df, x='Passport', hue = 'ProdTaken')

<AxesSubplot:xlabel='Passport', ylabel='count'>

stackedbarplot(df, 'PitchSatisfactionScore', 'ProdTaken')

ProdTaken                  0    1   All
PitchSatisfactionScore                 
All                     3968  920  4888
3                       1162  316  1478
5                        760  210   970
4                        750  162   912
1                        798  144   942
2                        498   88   586
-------------------------------------------------------------------------------------------------------------------
ProdTaken                      0         1
PitchSatisfactionScore                    
5                       0.783505  0.216495
3                       0.786198  0.213802
4                       0.822368  0.177632
1                       0.847134  0.152866
2                       0.849829  0.150171

sns.countplot(data= df, x='PitchSatisfactionScore', hue = 'ProdTaken')

<AxesSubplot:xlabel='PitchSatisfactionScore', ylabel='count'>

stackedbarplot(df, 'OwnCar', 'ProdTaken')

ProdTaken     0    1   All
OwnCar                    
All        3968  920  4888
1          2472  560  3032
0          1496  360  1856
-------------------------------------------------------------------------------------------------------------------
ProdTaken         0         1
OwnCar                       
0          0.806034  0.193966
1          0.815303  0.184697

sns.countplot(data= df, x='OwnCar', hue = 'ProdTaken')

<AxesSubplot:xlabel='OwnCar', ylabel='count'>

stackedbarplot(df, 'NumberOfChildrenVisiting', 'ProdTaken')

ProdTaken                    0    1   All
NumberOfChildrenVisiting                 
All                       3968  920  4888
1.0                       1747  399  2146
2.0                       1082  253  1335
0.0                        880  202  1082
3.0                        259   66   325
-------------------------------------------------------------------------------------------------------------------
ProdTaken                        0         1
NumberOfChildrenVisiting                    
3.0                       0.796923  0.203077
2.0                       0.810487  0.189513
0.0                       0.813309  0.186691
1.0                       0.814073  0.185927

sns.countplot(data= df, x='NumberOfChildrenVisiting', hue = 'ProdTaken')

<AxesSubplot:xlabel='NumberOfChildrenVisiting', ylabel='count'>

stackedbarplot(df, 'Designation', 'ProdTaken')

ProdTaken          0    1   All
Designation                    
All             3968  920  4888
Executive       1290  552  1842
Manager         1528  204  1732
Senior Manager   618  124   742
AVP              322   20   342
VP               210   20   230
-------------------------------------------------------------------------------------------------------------------
ProdTaken              0         1
Designation                       
Executive       0.700326  0.299674
Senior Manager  0.832884  0.167116
Manager         0.882217  0.117783
VP              0.913043  0.086957
AVP             0.941520  0.058480

sns.countplot(data= df, x='Designation', hue = 'ProdTaken')

<AxesSubplot:xlabel='Designation', ylabel='count'>

plt.figure(figsize=(15, 7))
sns.pointplot(x = 'NumberOfTrips', y='MonthlyIncome', data=df, hue = 'ProdTaken')

<AxesSubplot:xlabel='NumberOfTrips', ylabel='MonthlyIncome'>

plt.figure(figsize=(15, 7))
sns.pointplot(x = 'Designation', y='MonthlyIncome', data=df, hue = 'ProdTaken')

<AxesSubplot:xlabel='Designation', ylabel='MonthlyIncome'>

plt.figure(figsize=(15, 7))
sns.pointplot(x = 'ProductPitched', y='MonthlyIncome', data=df, hue = 'ProdTaken')

<AxesSubplot:xlabel='ProductPitched', ylabel='MonthlyIncome'>

plt.figure(figsize=(15, 7))
sns.pointplot(x = 'PreferredPropertyStar', y='MonthlyIncome', data=df, hue = 'ProdTaken')

<AxesSubplot:xlabel='PreferredPropertyStar', ylabel='MonthlyIncome'>

plt.figure(figsize=(15, 7))
sns.pointplot(x = 'PitchSatisfactionScore', y='DurationOfPitch', data=df, hue = 'ProdTaken')

<AxesSubplot:xlabel='PitchSatisfactionScore', ylabel='DurationOfPitch'>

plt.figure(figsize=(15, 7))
sns.pointplot(data=df, x='MaritalStatus', y='Age', hue='ProdTaken')

<AxesSubplot:xlabel='MaritalStatus', ylabel='Age'>

stackedbarplot(df, 'OwnCar', 'ProdTaken')

ProdTaken     0    1   All
OwnCar                    
All        3968  920  4888
1          2472  560  3032
0          1496  360  1856
-------------------------------------------------------------------------------------------------------------------
ProdTaken         0         1
OwnCar                       
0          0.806034  0.193966
1          0.815303  0.184697

df.head(5)

# Remove the OwnCar column in place. The crosstab printed just before this
# step shows nearly identical purchase rates (about 19.4% without a car vs
# 18.5% with one), which is presumably why it is discarded.
df.drop(columns=['OwnCar'], inplace=True)

df.head(5)

stackedbarplot(df, 'NumberOfPersonVisiting', 'ProdTaken')

ProdTaken                  0    1   All
NumberOfPersonVisiting                 
All                     3968  920  4888
3                       1942  460  2402
2                       1151  267  1418
4                        833  193  1026
1                         39    0    39
5                          3    0     3
-------------------------------------------------------------------------------------------------------------------
ProdTaken                      0         1
NumberOfPersonVisiting                    
3                       0.808493  0.191507
2                       0.811707  0.188293
4                       0.811891  0.188109
1                       1.000000  0.000000
5                       1.000000  0.000000

df.head(5)

# Remove the NumberOfPersonVisiting column in place. Party sizes 2-4 all buy
# at roughly 19%; sizes 1 and 5 show no purchases but cover only 42 rows, so
# the column is presumably judged uninformative.
df.drop(columns=['NumberOfPersonVisiting'], inplace=True)

df.head(5)

stackedbarplot(df, 'NumberOfChildrenVisiting', 'ProdTaken')

ProdTaken                    0    1   All
NumberOfChildrenVisiting                 
All                       3968  920  4888
1.0                       1747  399  2146
2.0                       1082  253  1335
0.0                        880  202  1082
3.0                        259   66   325
-------------------------------------------------------------------------------------------------------------------
ProdTaken                        0         1
NumberOfChildrenVisiting                    
3.0                       0.796923  0.203077
2.0                       0.810487  0.189513
0.0                       0.813309  0.186691
1.0                       0.814073  0.185927

df.head(5)

# Remove the NumberOfChildrenVisiting column in place. Purchase rates in the
# crosstab above stay between about 18.6% and 20.3% for every child count,
# which is presumably why the column is discarded.
df.drop(columns=['NumberOfChildrenVisiting'], inplace=True)

df.head(5)

stackedbarplot(df, 'PitchSatisfactionScore', 'ProdTaken')

ProdTaken                  0    1   All
PitchSatisfactionScore                 
All                     3968  920  4888
3                       1162  316  1478
5                        760  210   970
4                        750  162   912
1                        798  144   942
2                        498   88   586
-------------------------------------------------------------------------------------------------------------------
ProdTaken                      0         1
PitchSatisfactionScore                    
5                       0.783505  0.216495
3                       0.786198  0.213802
4                       0.822368  0.177632
1                       0.847134  0.152866
2                       0.849829  0.150171

df.head(5)

# Remove the PitchSatisfactionScore column in place.
# NOTE(review): purchase rates in the crosstab above range from about 15.0%
# (score 2) to 21.6% (score 5), a wider spread than for the other dropped
# columns -- confirm that discarding it is intended.
df.drop(columns=['PitchSatisfactionScore'], inplace=True)

df.head(5)

sns.countplot(data= df, x='Occupation', hue = 'ProdTaken')

<AxesSubplot:xlabel='Occupation', ylabel='count'>

# Delete the rows whose Occupation is 'Free Lancer', in place. The totals
# printed after this step fall from 4888 to 4886, i.e. two rows are removed.
df.drop(index=df.index[df['Occupation'] == 'Free Lancer'], inplace=True)

sns.countplot(data= df, x='Occupation', hue = 'ProdTaken')

<AxesSubplot:xlabel='Occupation', ylabel='count'>

stackedbarplot(df, 'NumberOfTrips', 'ProdTaken')

ProdTaken         0    1   All
NumberOfTrips                 
All            3968  918  4886
2.0            1165  299  1464
3.0             992  231  1223
1.0             508  112   620
6.0             258   64   322
5.0             396   62   458
4.0             417   61   478
7.0             156   61   217
8.0              76   28   104
-------------------------------------------------------------------------------------------------------------------
ProdTaken             0         1
NumberOfTrips                    
7.0            0.718894  0.281106
8.0            0.730769  0.269231
2.0            0.795765  0.204235
6.0            0.801242  0.198758
3.0            0.811120  0.188880
1.0            0.819355  0.180645
5.0            0.864629  0.135371
4.0            0.872385  0.127615

sns.countplot(data= df, x='NumberOfTrips', hue = 'ProdTaken')

<AxesSubplot:xlabel='NumberOfTrips', ylabel='count'>

# Collapse the raw trip counts into four coarse buckets. np.select walks the
# conditions in order and, per row, takes the label of the first one that
# holds.
conditions = [
    (df['NumberOfTrips'] >= 7),
    (df['NumberOfTrips'] == 6),
    df['NumberOfTrips'].between(4, 5),  # inclusive on both ends
    df['NumberOfTrips'].between(1, 3)   # inclusive on both ends
]

values = [
    '7+',
    '6',
    '4-5',
    '1-3'
]

# The fallback has to be a string like the labels. np.select's implicit
# default is the integer 0, and recent NumPy releases refuse to combine it
# with string choices ("Choicelist and default value do not have a common
# dtype"). Older releases coerced that 0 to the string '0', so passing '0'
# explicitly keeps the result identical for rows that match no condition
# (NaN or a count below 1). The crosstab printed afterwards shows no such
# rows in this dataset.
df['NumberOfTrips'] = np.select(conditions, values, default='0')

sns.countplot(data=df, x='NumberOfTrips', order=['1-3', '4-5', '6', '7+'], hue='ProdTaken')

<AxesSubplot:xlabel='NumberOfTrips', ylabel='count'>

stackedbarplot(df, 'NumberOfTrips', 'ProdTaken')

ProdTaken         0    1   All
NumberOfTrips                 
All            3968  918  4886
1-3            2665  642  3307
4-5             813  123   936
7+              232   89   321
6               258   64   322
-------------------------------------------------------------------------------------------------------------------
ProdTaken             0         1
NumberOfTrips                    
7+             0.722741  0.277259
6              0.801242  0.198758
1-3            0.805866  0.194134
4-5            0.868590  0.131410

stackedbarplot(df, 'Designation', 'ProdTaken')

ProdTaken          0    1   All
Designation                    
All             3968  918  4886
Executive       1290  550  1840
Manager         1528  204  1732
Senior Manager   618  124   742
AVP              322   20   342
VP               210   20   230
-------------------------------------------------------------------------------------------------------------------
ProdTaken              0         1
Designation                       
Executive       0.701087  0.298913
Senior Manager  0.832884  0.167116
Manager         0.882217  0.117783
VP              0.913043  0.086957
AVP             0.941520  0.058480

sns.countplot(data=df, x='Designation', hue='ProdTaken')

<AxesSubplot:xlabel='Designation', ylabel='count'>

plt.figure(figsize=(15, 7))
sns.pointplot(data=df, x='Designation', y='MonthlyIncome')

<AxesSubplot:xlabel='Designation', ylabel='MonthlyIncome'>

# Merge the five job titles into three broader groups:
#   Executive                -> 'Executive'
#   Manager, Senior Manager  -> 'Management'
#   AVP, VP                  -> 'VP and AVP'
conditions = [
    (df['Designation'] == 'Manager'),
    (df['Designation'] == 'Executive'),
    (df['Designation'] == 'Senior Manager'),
    (df['Designation'] == 'AVP'),
    (df['Designation'] == 'VP')
]

# One output label per condition, in the same order as `conditions`.
values = [
    'Management',
    'Executive',
    'Management',
    'VP and AVP',
    'VP and AVP'
]

# The fallback has to be a string like the labels. np.select's implicit
# default is the integer 0, and recent NumPy releases refuse to combine it
# with string choices ("Choicelist and default value do not have a common
# dtype"). Older releases coerced that 0 to the string '0', so passing '0'
# explicitly keeps the result identical for any title not listed above.
df['Designation'] = np.select(conditions, values, default='0')

stackedbarplot(df, 'Designation', 'ProdTaken')

ProdTaken       0    1   All
Designation                 
All          3968  918  4886
Executive    1290  550  1840
Management   2146  328  2474
VP and AVP    532   40   572
-------------------------------------------------------------------------------------------------------------------
ProdTaken           0         1
Designation                    
Executive    0.701087  0.298913
Management   0.867421  0.132579
VP and AVP   0.930070  0.069930

sns.countplot(data=df, x='Designation', hue='ProdTaken')

<AxesSubplot:xlabel='Designation', ylabel='count'>

df.head(5)

labeldisplot(df, 'Age')

# Min-max scale Age with the shared `scaler`: learn the column's range and
# apply the rescaling in a single call, writing the result back in place.
df[['Age']] = scaler.fit_transform(df[['Age']])

labeldisplot(df, 'Age')

labeldisplot(df, 'DurationOfPitch')

sns.kdeplot(data=df, x="DurationOfPitch", fill=False)

<AxesSubplot:xlabel='DurationOfPitch', ylabel='Density'>

df.DurationOfPitch.describe()

count    4886.000000
mean       15.337700
std         8.003873
min         5.000000
25%         9.000000
50%        13.000000
75%        19.000000
max        36.000000
Name: DurationOfPitch, dtype: float64

# Natural-log transform of DurationOfPitch. The describe() output above
# reports a minimum of 5, so no zero or negative value reaches np.log.
# Presumably meant to reduce right skew (mean 15.3 vs median 13) -- confirm.
df['DurationOfPitch'] = np.log(df['DurationOfPitch'])

df.DurationOfPitch.describe()

count    4886.000000
mean        2.603731
std         0.499056
min         1.609438
25%         2.197225
50%         2.564949
75%         2.944439
max         3.583519
Name: DurationOfPitch, dtype: float64

sns.kdeplot(data=df, x="DurationOfPitch", fill=False)

<AxesSubplot:xlabel='DurationOfPitch', ylabel='Density'>

labeldisplot(df, 'DurationOfPitch')

labeldisplot(df, 'MonthlyIncome')

sns.kdeplot(data=df, x="MonthlyIncome", fill=False)

<AxesSubplot:xlabel='MonthlyIncome', ylabel='Density'>

df.MonthlyIncome.describe()

count     4886.000000
mean     23546.899918
std       5026.451397
min      16009.000000
25%      20487.250000
50%      22597.000000
75%      25411.250000
max      38677.000000
Name: MonthlyIncome, dtype: float64

# Natural-log transform of MonthlyIncome. The describe() output above reports
# a minimum of 16009, so no zero or negative value reaches np.log.
# Presumably meant to reduce right skew (mean 23547 vs median 22597) -- confirm.
df['MonthlyIncome'] = np.log(df['MonthlyIncome'])

df.MonthlyIncome.describe()

count    4886.000000
mean       10.045997
std         0.199718
min         9.680906
25%         9.927558
50%        10.025572
75%        10.142947
max        10.563000
Name: MonthlyIncome, dtype: float64

sns.kdeplot(data=df, x="MonthlyIncome", fill=False)

<AxesSubplot:xlabel='MonthlyIncome', ylabel='Density'>

labeldisplot(df, 'MonthlyIncome')

# Min-max scale the (already log-transformed) MonthlyIncome in place.
# NOTE: this refits the shared `scaler`, so the range it learned earlier for
# Age is overwritten; afterwards the object can only invert MonthlyIncome.
df[['MonthlyIncome']] = scaler.fit_transform(df[['MonthlyIncome']])

labeldisplot(df, 'MonthlyIncome')

df.head()

labeledbarplot(df, 'TypeofContact')

df.head(5)

# Replace the TypeofContact labels with integer codes. The shared
# `labelEncoder` is refit on each column it is applied to, so it only retains
# the classes of the column encoded most recently.
df['TypeofContact'] = labelEncoder.fit_transform(df['TypeofContact'])

df.head(5)

labeledbarplot(df, 'TypeofContact')

df.head(5)

labeledbarplot(df, 'Occupation')

# Replace the Occupation labels with integer codes (refits the shared
# `labelEncoder`, discarding the classes learned for the previous column).
df['Occupation'] = labelEncoder.fit_transform(df['Occupation'])

labeledbarplot(df, 'Occupation')

df.head(5)

labeledbarplot(df, 'Gender')

df.head(5)

# Replace the Gender labels with integer codes (refits the shared
# `labelEncoder`, discarding the classes learned for the previous column).
df['Gender'] = labelEncoder.fit_transform(df['Gender'])

df.head(5)

labeledbarplot(df, 'Gender')

labeledbarplot(df, 'ProductPitched')

df.head(5)

# Replace the ProductPitched labels with integer codes (refits the shared
# `labelEncoder`). LabelEncoder numbers classes in sorted order, i.e.
# Basic, Deluxe, King, Standard, Super Deluxe -- alphabetical, not a product
# ranking. NOTE(review): confirm the downstream model does not read these
# codes as ordinal.
df['ProductPitched'] = labelEncoder.fit_transform(df['ProductPitched'])

df.head(5)

labeledbarplot(df, 'ProductPitched')

labeledbarplot(df, 'MaritalStatus')

df.head(5)

# Replace the MaritalStatus labels with integer codes (refits the shared
# `labelEncoder`, discarding the classes learned for the previous column).
df['MaritalStatus'] = labelEncoder.fit_transform(df['MaritalStatus'])

df.head(5)

labeledbarplot(df, 'MaritalStatus')

labeledbarplot(df, 'NumberOfTrips')

df.head(10)

# Replace the NumberOfTrips bucket labels with integer codes (refits the
# shared `labelEncoder`). LabelEncoder numbers classes in sorted order, so
# '1-3', '4-5', '6', '7+' map to 0-3 in their natural order -- assuming only
# those four bucket labels are present in the column.
df['NumberOfTrips'] = labelEncoder.fit_transform(df['NumberOfTrips'])

df.head(10)

labeledbarplot(df, 'NumberOfTrips')

labeledbarplot(df, 'Designation')

df.head(5)

# Replace the grouped Designation labels with integer codes. This is the last
# refit of the shared `labelEncoder` in this section, so afterwards it holds
# the Designation classes only.
df['Designation'] = labelEncoder.fit_transform(df['Designation'])

df.head(5)

labeledbarplot(df, 'Designation')

	CustomerID	ProdTaken	Age	TypeofContact	CityTier	DurationOfPitch	Occupation	Gender	NumberOfPersonVisiting	NumberOfFollowups	ProductPitched	PreferredPropertyStar	MaritalStatus	NumberOfTrips	Passport	PitchSatisfactionScore	OwnCar	NumberOfChildrenVisiting	Designation	MonthlyIncome
0	200000	0.0	35.0	Self Enquiry	1.0	9.0	Salaried	Male	3.0	4.0	Basic	3.0	Married	2.0	0.0	3.0	1.0	1.0	Executive	17342.0
1	200001	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	20855.0
2	200002	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	21020.0
3	200003	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	21288.0
4	200004	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
4883	204883	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4884	204884	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4885	204885	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4886	204886	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4887	204887	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	CustomerID	ProdTaken	Age	CityTier	DurationOfPitch	NumberOfPersonVisiting	NumberOfFollowups	PreferredPropertyStar	NumberOfTrips	Passport	PitchSatisfactionScore	OwnCar	NumberOfChildrenVisiting	MonthlyIncome
CustomerID	1.000000	0.056506	0.032127	0.012975	0.064298	0.604013	0.427539	0.010553	0.224848	0.007974	-0.035847	0.003805	0.511763	0.276833
ProdTaken	0.056506	1.000000	-0.147254	0.086852	0.078257	0.009627	0.112171	0.099577	0.018898	0.260844	0.051394	-0.011508	0.007421	-0.130585
Age	0.032127	-0.147254	1.000000	-0.015625	-0.012063	0.011621	-0.002577	-0.010474	0.184905	0.033399	0.018510	0.048654	0.007370	0.464869
CityTier	0.012975	0.086852	-0.015625	1.000000	0.022703	-0.001671	0.023652	-0.009164	-0.029709	0.001793	-0.042160	0.003817	0.000672	0.051817
DurationOfPitch	0.064298	0.078257	-0.012063	0.022703	1.000000	0.065141	0.009434	-0.006637	0.009715	0.033034	-0.002880	-0.001626	0.031408	-0.006252
NumberOfPersonVisiting	0.604013	0.009627	0.011621	-0.001671	0.065141	1.000000	0.328569	0.033867	0.195223	0.011177	-0.019581	0.010362	0.610621	0.195134
NumberOfFollowups	0.427539	0.112171	-0.002577	0.023652	0.009434	0.328569	1.000000	-0.024176	0.139517	0.004970	0.004054	0.012112	0.286425	0.176503
PreferredPropertyStar	0.010553	0.099577	-0.010474	-0.009164	-0.006637	0.033867	-0.024176	1.000000	0.012115	0.001040	-0.022701	0.015742	0.035798	0.014289
NumberOfTrips	0.224848	0.018898	0.184905	-0.029709	0.009715	0.195223	0.139517	0.012115	1.000000	0.012949	-0.004378	-0.011825	0.168795	0.139105
Passport	0.007974	0.260844	0.033399	0.001793	0.033034	0.011177	0.004970	0.001040	0.012949	1.000000	0.002926	-0.022330	0.020264	0.002545
PitchSatisfactionScore	-0.035847	0.051394	0.018510	-0.042160	-0.002880	-0.019581	0.004054	-0.022701	-0.004378	0.002926	1.000000	0.068850	0.000878	0.030421
OwnCar	0.003805	-0.011508	0.048654	0.003817	-0.001626	0.010362	0.012112	0.015742	-0.011825	-0.022330	0.068850	1.000000	0.026572	0.080262
NumberOfChildrenVisiting	0.511763	0.007421	0.007370	0.000672	0.031408	0.610621	0.286425	0.035798	0.168795	0.020264	0.000878	0.026572	1.000000	0.201643
MonthlyIncome	0.276833	-0.130585	0.464869	0.051817	-0.006252	0.195134	0.176503	0.014289	0.139105	0.002545	0.030421	0.080262	0.201643	1.000000

ProdTaken	0	1
Age	38.223286	34.685870
CityTier	1.615927	1.819565
DurationOfPitch	14.999496	16.781522
NumberOfPersonVisiting	2.901714	2.919565
NumberOfFollowups	3.657510	3.942391
PreferredPropertyStar	3.540071	3.741304
NumberOfTrips	3.203125	3.268478
Passport	0.233871	0.536957
PitchSatisfactionScore	3.044355	3.223913
OwnCar	0.622984	0.608696
NumberOfChildrenVisiting	1.181452	1.198913
MonthlyIncome	23871.228831	22138.016304

	CustomerID	ProdTaken	Age	TypeofContact	CityTier	DurationOfPitch	Occupation	Gender	NumberOfPersonVisiting	NumberOfFollowups	ProductPitched	PreferredPropertyStar	MaritalStatus	NumberOfTrips	Passport	PitchSatisfactionScore	OwnCar	NumberOfChildrenVisiting	Designation	MonthlyIncome
996	200996	0	27.0	Self Enquiry	3	16.0	Small Business	Female	3	4.0	Deluxe	3.0	Divorced	2.0	1	3	1	2.0	Manager	20769.0
100	200100	1	37.0	Self Enquiry	2	12.0	Salaried	Male	3	3.0	Basic	5.0	Married	5.0	1	2	1	0.0	Executive	17073.0
4493	204493	0	35.0	Self Enquiry	1	9.0	Small Business	Female	3	5.0	Basic	5.0	Unmarried	3.0	0	1	1	1.0	Executive	23059.0
3579	203579	0	47.0	Self Enquiry	3	7.0	Salaried	Male	3	2.0	Super Deluxe	5.0	Single	NaN	0	4	1	2.0	AVP	36245.0
320	200320	0	27.0	Self Enquiry	3	NaN	Salaried	Male	3	3.0	Deluxe	3.0	Single	2.0	1	4	1	2.0	Manager	NaN
2585	202585	0	46.0	Self Enquiry	1	36.0	Small Business	Male	3	4.0	Basic	3.0	Unmarried	7.0	0	2	1	1.0	Executive	22130.0
3409	203409	0	26.0	Self Enquiry	1	26.0	Small Business	Male	4	4.0	Basic	3.0	Divorced	5.0	0	5	1	3.0	Executive	22347.0
3718	203718	0	32.0	Self Enquiry	3	36.0	Small Business	Female	4	5.0	Deluxe	3.0	Married	3.0	0	3	1	1.0	Manager	24146.0
2039	202039	0	36.0	Company Invited	3	14.0	Salaried	Male	3	4.0	Standard	5.0	Unmarried	2.0	0	1	0	0.0	Senior Manager	22587.0
3581	203581	0	23.0	Company Invited	1	33.0	Salaried	Female	3	5.0	Basic	3.0	Married	3.0	1	3	0	1.0	Executive	21492.0

	CustomerID	ProdTaken	Age	TypeofContact	CityTier	DurationOfPitch	Occupation	Gender	NumberOfPersonVisiting	NumberOfFollowups	ProductPitched	PreferredPropertyStar	MaritalStatus	NumberOfTrips	Passport	PitchSatisfactionScore	OwnCar	NumberOfChildrenVisiting	Designation	MonthlyIncome
3602	203602	0	38.0	Self Enquiry	1	26.0	Salaried	Male	3	4.0	Deluxe	3.0	Married	5.0	0	1	0	1.0	Manager	24446.0
2235	202235	0	38.0	Company Invited	1	9.0	Salaried	Male	2	3.0	Basic	3.0	Married	4.0	0	3	1	0.0	Executive	17821.0
4820	204820	1	35.0	Self Enquiry	1	14.0	Salaried	Male	4	4.0	Deluxe	3.0	Married	3.0	0	3	0	1.0	Manager	21263.0
2864	202864	0	30.0	Self Enquiry	1	10.0	Small Business	Male	4	5.0	Standard	3.0	Divorced	3.0	0	2	1	3.0	Senior Manager	30613.0
1577	201577	1	25.0	Self Enquiry	3	11.0	Small Business	Male	2	4.0	Deluxe	3.0	Single	2.0	1	3	0	1.0	Manager	20744.0
3342	203342	0	44.0	Self Enquiry	1	10.0	Salaried	Male	4	6.0	King	NaN	Divorced	5.0	0	5	1	3.0	VP	38418.0
1303	201303	0	47.0	Self Enquiry	1	10.0	Salaried	Female	3	4.0	Standard	4.0	Married	1.0	1	4	1	2.0	Senior Manager	25333.0
666	200666	1	NaN	Self Enquiry	1	9.0	Salaried	Female	2	3.0	Deluxe	3.0	Divorced	1.0	1	5	1	1.0	Manager	NaN
1129	201129	0	42.0	Self Enquiry	1	15.0	Salaried	Male	3	4.0	King	3.0	Single	1.0	0	1	1	1.0	VP	34613.0
1232	201232	0	35.0	Self Enquiry	1	33.0	Salaried	Male	2	3.0	Deluxe	3.0	Married	3.0	0	5	0	1.0	Manager	21883.0

	count	mean	std	min	25%	50%	75%	max
CustomerID	4888.0	202443.500000	1411.188388	200000.0	201221.75	202443.5	203665.25	204887.0
ProdTaken	4888.0	0.188216	0.390925	0.0	0.00	0.0	0.00	1.0
Age	4662.0	37.622265	9.316387	18.0	31.00	36.0	44.00	61.0
CityTier	4888.0	1.654255	0.916583	1.0	1.00	1.0	3.00	3.0
DurationOfPitch	4637.0	15.490835	8.519643	5.0	9.00	13.0	20.00	127.0
NumberOfPersonVisiting	4888.0	2.905074	0.724891	1.0	2.00	3.0	3.00	5.0
NumberOfFollowups	4843.0	3.708445	1.002509	1.0	3.00	4.0	4.00	6.0
PreferredPropertyStar	4862.0	3.581037	0.798009	3.0	3.00	3.0	4.00	5.0
NumberOfTrips	4748.0	3.236521	1.849019	1.0	2.00	3.0	4.00	22.0
Passport	4888.0	0.290917	0.454232	0.0	0.00	0.0	1.00	1.0
PitchSatisfactionScore	4888.0	3.078151	1.365792	1.0	2.00	3.0	4.00	5.0
OwnCar	4888.0	0.620295	0.485363	0.0	0.00	1.0	1.00	1.0
NumberOfChildrenVisiting	4822.0	1.187267	0.857861	0.0	1.00	1.0	2.00	3.0
MonthlyIncome	4655.0	23619.853491	5380.698361	1000.0	20346.00	22347.0	25571.00	98678.0

	ProdTaken	Age	TypeofContact	CityTier	DurationOfPitch	Occupation	Gender	NumberOfPersonVisiting	NumberOfFollowups	ProductPitched	PreferredPropertyStar	MaritalStatus	NumberOfTrips	Passport	PitchSatisfactionScore	OwnCar	NumberOfChildrenVisiting	Designation	MonthlyIncome
0	1	41.0	Self Enquiry	3	6.0	Salaried	Female	3	3.0	Deluxe	3.0	Single	1.0	1	2	1	0.0	Manager	20993.0
1	0	49.0	Company Invited	1	14.0	Salaried	Male	3	4.0	Deluxe	4.0	Divorced	2.0	0	3	1	2.0	Manager	20130.0
2	1	37.0	Self Enquiry	1	8.0	Free Lancer	Male	3	4.0	Basic	3.0	Single	7.0	1	3	0	0.0	Executive	17090.0
3	0	33.0	Company Invited	1	9.0	Salaried	Female	2	3.0	Basic	3.0	Divorced	2.0	1	5	1	1.0	Executive	17909.0
4	0	NaN	Self Enquiry	1	8.0	Small Business	Male	2	3.0	Basic	4.0	Divorced	1.0	0	5	1	0.0	Executive	18468.0

	ProdTaken	Age	TypeofContact	CityTier	DurationOfPitch	Occupation	Gender	NumberOfPersonVisiting	NumberOfFollowups	ProductPitched	PreferredPropertyStar	MaritalStatus	NumberOfTrips	PitchSatisfactionScore	OwnCar	NumberOfChildrenVisiting	Designation	MonthlyIncome
224	0	31.0	NaN	1	NaN	Small Business	Male	2	5.0	Deluxe	3.0	Divorced	1.0	3	1	0.0	Manager	NaN
571	0	26.0	NaN	1	NaN	Salaried	Female	3	5.0	Basic	3.0	Married	4.0	4	1	2.0	Executive	NaN
572	0	29.0	NaN	1	NaN	Small Business	Female	3	3.0	Deluxe	3.0	Divorced	5.0	2	1	0.0	Manager	NaN
576	0	27.0	NaN	3	NaN	Small Business	Male	2	3.0	Deluxe	3.0	Divorced	1.0	3	0	1.0	Manager	NaN
579	0	34.0	NaN	1	NaN	Small Business	Female	2	4.0	Basic	5.0	Single	2.0	2	1	1.0	Executive	NaN
598	1	28.0	NaN	1	NaN	Small Business	Male	2	3.0	Basic	3.0	Single	7.0	3	0	0.0	Executive	NaN
622	0	32.0	NaN	3	NaN	Salaried	Male	3	3.0	Deluxe	3.0	Married	3.0	2	0	0.0	Manager	NaN
724	0	24.0	NaN	1	NaN	Small Business	Female	2	4.0	Deluxe	3.0	Married	2.0	3	1	1.0	Manager	NaN
843	0	26.0	NaN	1	NaN	Small Business	Male	2	1.0	Basic	3.0	Divorced	2.0	5	1	1.0	Executive	NaN
1021	1	25.0	NaN	3	NaN	Salaried	Male	3	4.0	Basic	5.0	Divorced	4.0	1	1	0.0	Executive	NaN
1047	0	33.0	NaN	3	NaN	Small Business	Male	2	3.0	Deluxe	5.0	Divorced	1.0	3	0	0.0	Manager	NaN
1143	0	45.0	NaN	3	NaN	Small Business	Male	2	4.0	Deluxe	5.0	Married	2.0	3	0	0.0	Manager	NaN
1182	0	36.0	NaN	1	NaN	Small Business	Female	2	4.0	Deluxe	3.0	Married	1.0	5	1	1.0	Manager	NaN
1217	0	24.0	NaN	1	NaN	Small Business	Male	3	1.0	Basic	3.0	Married	2.0	1	0	0.0	Executive	NaN
1356	0	41.0	NaN	3	NaN	Small Business	Female	2	3.0	Deluxe	4.0	Married	6.0	3	1	1.0	Manager	NaN
1469	0	34.0	NaN	1	NaN	Small Business	Male	2	1.0	Deluxe	3.0	Married	3.0	3	0	1.0	Manager	NaN
1694	0	31.0	NaN	1	NaN	Small Business	Male	2	5.0	Deluxe	3.0	Married	1.0	3	0	0.0	Manager	NaN
2041	0	26.0	NaN	1	NaN	Salaried	Female	3	5.0	Basic	3.0	Married	4.0	4	1	0.0	Executive	NaN
2042	0	29.0	NaN	1	NaN	Small Business	Female	3	3.0	Deluxe	3.0	Married	5.0	1	0	1.0	Manager	NaN
2046	0	27.0	NaN	3	NaN	Small Business	Male	2	3.0	Deluxe	3.0	Married	1.0	3	1	1.0	Manager	NaN
2049	0	34.0	NaN	1	NaN	Small Business	Female	2	4.0	Basic	5.0	Single	2.0	1	1	0.0	Executive	NaN
2068	1	28.0	NaN	1	NaN	Small Business	Male	2	3.0	Basic	3.0	Single	7.0	3	1	1.0	Executive	NaN
2092	0	32.0	NaN	3	NaN	Salaried	Male	3	3.0	Deluxe	3.0	Married	3.0	1	0	2.0	Manager	NaN
2194	0	24.0	NaN	1	NaN	Small Business	Female	2	4.0	Deluxe	3.0	Married	2.0	3	0	0.0	Manager	NaN
2313	0	26.0	NaN	1	NaN	Small Business	Male	2	1.0	Basic	3.0	Married	2.0	5	1	1.0	Executive	NaN

	ProdTaken	Age	TypeofContact	CityTier	DurationOfPitch	Occupation	Gender	NumberOfPersonVisiting	NumberOfFollowups	ProductPitched	PreferredPropertyStar	MaritalStatus	NumberOfTrips	Passport	PitchSatisfactionScore	OwnCar	NumberOfChildrenVisiting	Designation	MonthlyIncome
1434	0	37.0	Company Invited	3	126.0	Salaried	Male	2	3.0	Basic	3.0	Married	3.0	0	1	1	1.0	Executive	18482.0
3878	0	53.0	Company Invited	3	127.0	Salaried	Male	3	4.0	Basic	3.0	Married	4.0	0	1	1	2.0	Executive	22160.0

	ProdTaken	Age	TypeofContact	CityTier	DurationOfPitch	Occupation	Gender	NumberOfPersonVisiting	NumberOfFollowups	ProductPitched	PreferredPropertyStar	MaritalStatus	NumberOfTrips	Passport	PitchSatisfactionScore	OwnCar	NumberOfChildrenVisiting	Designation	MonthlyIncome
142	0	38.0	Self Enquiry	1	9.0	Large Business	Female	2	3.0	Deluxe	3.0	Single	4.0	1	5	0	0.0	Manager	1000.0
2586	0	39.0	Self Enquiry	1	10.0	Large Business	Female	3	4.0	Deluxe	3.0	Single	5.0	1	5	0	1.0	Manager	4678.0

	ProdTaken	Age	TypeofContact	CityTier	DurationOfPitch	Occupation	Gender	NumberOfFollowups	ProductPitched	PreferredPropertyStar	MaritalStatus	NumberOfTrips	Passport	Designation	MonthlyIncome
0	1	0.534884	Self Enquiry	3	1.791759	Salaried	Female	3.0	Deluxe	3.0	Single	1-3	1	Management	0.307267
1	0	0.720930	Company Invited	1	2.639057	Salaried	Male	4.0	Deluxe	4.0	Divorced	1-3	0	Management	0.259678
3	0	0.348837	Company Invited	1	2.197225	Salaried	Female	3.0	Basic	3.0	Divorced	1-3	1	Executive	0.127143
4	0	0.441860	Self Enquiry	1	2.079442	Small Business	Male	3.0	Basic	4.0	Divorced	1-3	0	Executive	0.161988
5	0	0.325581	Company Invited	1	2.079442	Salaried	Male	3.0	Basic	3.0	Single	1-3	0	Executive	0.137164

Vacation Package Project¶

By: Kyle Bowen & Will Kittredge¶

Objective¶

Data Definition¶

Boilerplate¶

Functions¶

Data Overview¶

Observation¶

Observation¶

Observation¶

Observation¶

Observation¶

Exploring Numeric Data¶

Observations¶

Observations¶

Checking for Odd Categorical Data¶

Observations¶

Removing Unnecessary Column(s)¶

Data Preprocessing¶

Handling Null values¶

Age¶

Observations¶

Finding the values¶

Setting the Values¶

Examining Changes¶

Type of Contact¶

Notes¶

Duration of Pitch¶

Observations¶

Finding the Values¶

Setting the Values¶

Examining Changes¶

Number of Followups¶

Observations¶

Finding the Values¶

Setting the Values¶

Checking the Results¶

Preferred Property Star¶

Observations¶

Finding the Values¶

Checking the Results¶

Number of Trips¶

Observations¶

Finding the Values¶

Changing the Values¶

Checking the Changes¶

Number of Children Visiting¶

Observations¶

Finding the Values¶

Changing the Values¶

Checking the Changes¶

Monthly Income¶

Observations¶

Finding the Values¶

Changing the Values¶

Checking the Changes¶

Confirming no more Null Values¶

Handling Outliers¶

Duration of Pitch¶

Number of Persons Visiting¶

Number of Followups¶

Number of Trips¶

Monthly Income¶

Remove erroneous Categorical Data¶

Exploratory Data Analysis¶

Univariate Analysis¶

Age¶

Observations¶

Duration of Pitch¶

Observations¶

Monthly Income¶

Observations¶

Prod Taken¶

Observations¶

Type of Contact¶

Observations¶

City Tier¶

Observations¶

Occupation¶

Observations¶