This problem statement is based on the Shinkansen Bullet Train in Japan and passengers’ experience with that mode of travel. This machine learning exercise aims to determine the relative importance of each parameter in terms of its contribution to the passengers’ overall travel experience. The dataset contains a random sample of individuals who traveled on this train. The on-time performance of the trains, along with passenger information, is published in a file named ‘Traveldata_train.csv’. These passengers were later asked to provide their feedback on various parameters related to the travel, along with their overall experience. These collected details are made available in the survey report labeled ‘Surveydata_train.csv’.
In the survey, each passenger was explicitly asked whether they were satisfied with their overall travel experience or not, and that is captured in the data of the survey report under the variable labeled ‘Overall_Experience’.
The objective of this problem is to understand which parameters play an important role in swaying passenger feedback toward the positive end of the scale. You are provided test data containing the travel data and the survey data of passengers. Both the test data and the train data were collected at the same time and belong to the same population.
The problem consists of 2 separate datasets: Travel data & Survey data. The travel data contains information about the passengers and attributes of the Shinkansen train in which they traveled. The survey data aggregates the passengers’ post-service feedback. You are expected to treat both these datasets as raw data and perform any necessary data cleaning/validation steps.
The data has been split into two groups and provided in the Dataset folder. The folder contains both train and test data separately.
Train_Data and Test_Data
Target Variable: Overall_Experience (1 represents ‘satisfied’, and 0 represents ‘not satisfied’)
The training set can be used to build your machine learning model. The training set has labels for the target column - Overall_Experience.
The testing set should be used to see how well your model performs on unseen data. For the test set, you are expected to predict the ‘Overall_Experience’ level for each participant.
Data Dictionary:
All the data is self-explanatory. The survey levels are explained in the Data Dictionary file.
Submission File Format: You will need to submit a CSV file with exactly 35,602 entries plus a header row. The file should have exactly two columns:
ID
Overall_Experience (contains 0 and 1 values; 1 represents ‘Satisfied’, and 0 represents ‘Not Satisfied’)
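For reference, a minimal sketch of how such a file could be produced at the end of the workflow below (assuming a fitted classifier named final_model, an illustrative name, and the prepared test features x_final_test defined later in the notebook):
# Sketch only: building the submission file from test-set predictions
submission = pd.DataFrame({
    'ID': df_test['ID'],
    'Overall_Experience': final_model.predict(x_final_test)  # final_model is hypothetical
})
submission.to_csv('submission.csv', index=False)  # 35,602 rows plus a header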
Data dictionary for Travel Data:
ID: The unique ID of the passenger
Gender: The gender of the passenger
Customer_Type: Loyalty type of the passenger
Age: The age of the passenger
Type_Travel: Purpose of travel for the passenger
Travel_Class: The train class that the passenger traveled in
Travel_Distance: The distance traveled by the passenger
Departure_Delay_in_Mins: The delay (in minutes) in train departure
Arrival_Delay_in_Mins: The delay (in minutes) in train arrival
Data dictionary for Survey Data:
ID: The unique ID of the passenger
Platform_Location: How convenient the location of the platform is for the passenger
Seat_Class: The type of seat class in the train. Green Car seats are usually more spacious and comfortable than ordinary seats; on the Shinkansen train, there are only four seats per row in the Green Car, versus five in the ordinary car.
Overall_Experience: The overall experience of the passenger
Seat_Comfort: The comfort level of the seat for the passenger
Arrival_Time_Convenient: How convenient the arrival time of the train is for the passenger
Catering: How convenient the catering service is for the passenger
Onboard_Wifi_Service: The quality of the onboard Wi-Fi service for the passenger
Onboard_Entertainment: The quality of the onboard entertainment for the passenger
Online_Support: The quality of the online support for the passenger
Ease_of_Online_Booking: The ease of online booking for the passenger
Onboard_Service: The quality of the onboard service for the passenger
Legroom: Legroom is the general term used in place of the more accurate “seat pitch”, which is the distance between a point on one seat and the same point on the seat in front of it. This variable describes the convenience of the legroom provided for the passenger
Baggage_Handling: The convenience of baggage handling for the passenger
CheckIn_Service: The convenience of the check-in service for the passenger
Cleanliness: The passenger's view of the cleanliness of the service
Online_Boarding: The convenience of the online boarding process for the passenger
# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np
# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# To split data
from sklearn.model_selection import train_test_split
# To impute missing values
from sklearn.impute import SimpleImputer
# To encode categorical values
from sklearn.preprocessing import OneHotEncoder
# To scale the data
from sklearn.preprocessing import StandardScaler
# To help with model building
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# Metrics to evaluate the model
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report,recall_score,precision_score, accuracy_score, f1_score, precision_recall_curve
# To do hyperparameter tuning
from sklearn.model_selection import GridSearchCV
# Importing the XGBClassifier from the xgboost library
from xgboost import XGBClassifier
# To suppress scientific notations for a dataframe
#pd.set_option("display.float_format", lambda x: "%.2f" % x)
# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)
# To suppress warnings
import warnings
warnings.filterwarnings("ignore")
# Connecting to Google Colab
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
# Loading the dataset Survey Train from Google Colab
path='/content/drive/MyDrive/AAA-MIT/Hackathon/Data/Surveydata_train_.csv'
df_survey_train=pd.read_csv(path)
# Loading the dataset Survey Test from Google Colab
path='/content/drive/MyDrive/AAA-MIT/Hackathon/Data/Surveydata_test_.csv'
df_survey_test=pd.read_csv(path)
# Loading the dataset Travel Train from Google Colab
path='/content/drive/MyDrive/AAA-MIT/Hackathon/Data/Traveldata_train_.csv'
df_travel_train=pd.read_csv(path)
# Loading the dataset Travel Test from Google Colab
path='/content/drive/MyDrive/AAA-MIT/Hackathon/Data/Traveldata_test_.csv'
df_travel_test=pd.read_csv(path)
The primary goal of a data overview is to get a high-level understanding of the dataset's structure, size, and general characteristics.
# Checking the shape of the dataset
df_survey_train.shape
(94379, 17)
df_survey_test.shape
(35602, 16)
df_travel_train.shape
(94379, 9)
df_travel_test.shape
(35602, 9)
# Displaying the first rows of the dataset
df_survey_train.head()
 | ID | Overall_Experience | Seat_Comfort | Seat_Class | Arrival_Time_Convenient | Catering | Platform_Location | Onboard_Wifi_Service | Onboard_Entertainment | Online_Support | Ease_of_Online_Booking | Onboard_Service | Legroom | Baggage_Handling | CheckIn_Service | Cleanliness | Online_Boarding
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 98800001 | 0 | Needs Improvement | Green Car | Excellent | Excellent | Very Convenient | Good | Needs Improvement | Acceptable | Needs Improvement | Needs Improvement | Acceptable | Needs Improvement | Good | Needs Improvement | Poor |
1 | 98800002 | 0 | Poor | Ordinary | Excellent | Poor | Needs Improvement | Good | Poor | Good | Good | Excellent | Needs Improvement | Poor | Needs Improvement | Good | Good |
2 | 98800003 | 1 | Needs Improvement | Green Car | Needs Improvement | Needs Improvement | Needs Improvement | Needs Improvement | Good | Excellent | Excellent | Excellent | Excellent | Excellent | Good | Excellent | Excellent |
3 | 98800004 | 0 | Acceptable | Ordinary | Needs Improvement | NaN | Needs Improvement | Acceptable | Needs Improvement | Acceptable | Acceptable | Acceptable | Acceptable | Acceptable | Good | Acceptable | Acceptable |
4 | 98800005 | 1 | Acceptable | Ordinary | Acceptable | Acceptable | Manageable | Needs Improvement | Good | Excellent | Good | Good | Good | Good | Good | Good | Good |
df_survey_test.head()
 | ID | Seat_Comfort | Seat_Class | Arrival_Time_Convenient | Catering | Platform_Location | Onboard_Wifi_Service | Onboard_Entertainment | Online_Support | Ease_of_Online_Booking | Onboard_Service | Legroom | Baggage_Handling | CheckIn_Service | Cleanliness | Online_Boarding
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 99900001 | Acceptable | Green Car | Acceptable | Acceptable | Manageable | Needs Improvement | Excellent | Good | Excellent | Excellent | Excellent | Excellent | Good | Excellent | Poor |
1 | 99900002 | Extremely Poor | Ordinary | Good | Poor | Manageable | Acceptable | Poor | Acceptable | Acceptable | Excellent | Acceptable | Good | Acceptable | Excellent | Acceptable |
2 | 99900003 | Excellent | Ordinary | Excellent | Excellent | Very Convenient | Excellent | Excellent | Excellent | Needs Improvement | Needs Improvement | Needs Improvement | Needs Improvement | Good | Needs Improvement | Excellent |
3 | 99900004 | Acceptable | Green Car | Excellent | Acceptable | Very Convenient | Poor | Acceptable | Excellent | Poor | Acceptable | Needs Improvement | Excellent | Excellent | Excellent | Poor |
4 | 99900005 | Excellent | Ordinary | Extremely Poor | Excellent | Needs Improvement | Excellent | Excellent | Excellent | Excellent | NaN | Acceptable | Excellent | Excellent | Excellent | Excellent |
df_travel_train.head()
 | ID | Gender | Customer_Type | Age | Type_Travel | Travel_Class | Travel_Distance | Departure_Delay_in_Mins | Arrival_Delay_in_Mins
---|---|---|---|---|---|---|---|---|---
0 | 98800001 | Female | Loyal Customer | 52.00000000 | NaN | Business | 272 | 0.00000000 | 5.00000000 |
1 | 98800002 | Male | Loyal Customer | 48.00000000 | Personal Travel | Eco | 2200 | 9.00000000 | 0.00000000 |
2 | 98800003 | Female | Loyal Customer | 43.00000000 | Business Travel | Business | 1061 | 77.00000000 | 119.00000000 |
3 | 98800004 | Female | Loyal Customer | 44.00000000 | Business Travel | Business | 780 | 13.00000000 | 18.00000000 |
4 | 98800005 | Female | Loyal Customer | 50.00000000 | Business Travel | Business | 1981 | 0.00000000 | 0.00000000 |
df_travel_test.head()
 | ID | Gender | Customer_Type | Age | Type_Travel | Travel_Class | Travel_Distance | Departure_Delay_in_Mins | Arrival_Delay_in_Mins
---|---|---|---|---|---|---|---|---|---
0 | 99900001 | Female | NaN | 36.00000000 | Business Travel | Business | 532 | 0.00000000 | 0.00000000 |
1 | 99900002 | Female | Disloyal Customer | 21.00000000 | Business Travel | Business | 1425 | 9.00000000 | 28.00000000 |
2 | 99900003 | Male | Loyal Customer | 60.00000000 | Business Travel | Business | 2832 | 0.00000000 | 0.00000000 |
3 | 99900004 | Female | Loyal Customer | 29.00000000 | Personal Travel | Eco | 1352 | 0.00000000 | 0.00000000 |
4 | 99900005 | Male | Disloyal Customer | 18.00000000 | Business Travel | Business | 1610 | 17.00000000 | 0.00000000 |
# Checking for duplicated values
df_survey_train.duplicated().sum()
0
df_travel_train.duplicated().sum()
0
# Checking for missing values
round(df_survey_train.isnull().sum() / df_survey_train.isnull().count() * 100,2)
ID                         0.00000000
Overall_Experience         0.00000000
Seat_Comfort               0.06000000
Seat_Class                 0.00000000
Arrival_Time_Convenient    9.46000000
Catering                   9.26000000
Platform_Location          0.03000000
Onboard_Wifi_Service       0.03000000
Onboard_Entertainment      0.02000000
Online_Support             0.10000000
Ease_of_Online_Booking     0.08000000
Onboard_Service            8.05000000
Legroom                    0.10000000
Baggage_Handling           0.15000000
CheckIn_Service            0.08000000
Cleanliness                0.01000000
Online_Boarding            0.01000000
dtype: float64
round(df_travel_train.isnull().sum() / df_travel_train.isnull().count() * 100,2)
ID                         0.00000000
Gender                     0.08000000
Customer_Type              9.48000000
Age                        0.03000000
Type_Travel                9.78000000
Travel_Class               0.00000000
Travel_Distance            0.00000000
Departure_Delay_in_Mins    0.06000000
Arrival_Delay_in_Mins      0.38000000
dtype: float64
# Checking the data types and columns for the dataset
df_survey_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94379 entries, 0 to 94378
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   ID                       94379 non-null  int64
 1   Overall_Experience       94379 non-null  int64
 2   Seat_Comfort             94318 non-null  object
 3   Seat_Class               94379 non-null  object
 4   Arrival_Time_Convenient  85449 non-null  object
 5   Catering                 85638 non-null  object
 6   Platform_Location        94349 non-null  object
 7   Onboard_Wifi_Service     94349 non-null  object
 8   Onboard_Entertainment    94361 non-null  object
 9   Online_Support           94288 non-null  object
 10  Ease_of_Online_Booking   94306 non-null  object
 11  Onboard_Service          86778 non-null  object
 12  Legroom                  94289 non-null  object
 13  Baggage_Handling         94237 non-null  object
 14  CheckIn_Service          94302 non-null  object
 15  Cleanliness              94373 non-null  object
 16  Online_Boarding          94373 non-null  object
dtypes: int64(2), object(15)
memory usage: 12.2+ MB
df_travel_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94379 entries, 0 to 94378
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   ID                       94379 non-null  int64
 1   Gender                   94302 non-null  object
 2   Customer_Type            85428 non-null  object
 3   Age                      94346 non-null  float64
 4   Type_Travel              85153 non-null  object
 5   Travel_Class             94379 non-null  object
 6   Travel_Distance          94379 non-null  int64
 7   Departure_Delay_in_Mins  94322 non-null  float64
 8   Arrival_Delay_in_Mins    94022 non-null  float64
dtypes: float64(3), int64(2), object(4)
memory usage: 6.5+ MB
# Checking the number of unique values in each column
df_survey_train.nunique()
ID                         94379
Overall_Experience             2
Seat_Comfort                   6
Seat_Class                     2
Arrival_Time_Convenient        6
Catering                       6
Platform_Location              6
Onboard_Wifi_Service           6
Onboard_Entertainment          6
Online_Support                 6
Ease_of_Online_Booking         6
Onboard_Service                6
Legroom                        6
Baggage_Handling               5
CheckIn_Service                6
Cleanliness                    6
Online_Boarding                6
dtype: int64
df_travel_train.nunique()
ID                         94379
Gender                         2
Customer_Type                  2
Age                           75
Type_Travel                    2
Travel_Class                   2
Travel_Distance             5210
Departure_Delay_in_Mins      437
Arrival_Delay_in_Mins        434
dtype: int64
Summary statistics provide a more detailed and quantitative description of the dataset. They help in understanding the central tendency, spread, and distribution of the data.
# Let's merge the training datasets using the ID variable.
df_train = pd.merge(df_survey_train, df_travel_train, how = 'left', on ='ID')
df_train.shape
(94379, 25)
# Let's merge the testing datasets using the ID variable.
df_test = pd.merge(df_survey_test, df_travel_test, how = 'left', on ='ID')
df_test.shape
(35602, 24)
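Since the merge relies on ID being a unique key, a quick sanity check (a small sketch using only objects already defined above) confirms that the left join neither dropped nor duplicated rows:
# Sanity-checking the merge: each survey row should match exactly one travel row
assert df_train['ID'].is_unique and df_test['ID'].is_unique
assert df_train.shape[0] == df_survey_train.shape[0]
assert df_test.shape[0] == df_survey_test.shape[0]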
# Let's look at the distribution of the Overall_Experience target variable
df_train.Overall_Experience.value_counts(normalize = True)
1    0.54665763
0    0.45334237
Name: Overall_Experience, dtype: float64
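The two classes are fairly balanced (about 55% satisfied vs. 45% not satisfied), so accuracy is a reasonable headline metric and no resampling is required.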
# Checking the information of the merged train dataset
df_train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 94379 entries, 0 to 94378
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   ID                       94379 non-null  int64
 1   Overall_Experience       94379 non-null  int64
 2   Seat_Comfort             94318 non-null  object
 3   Seat_Class               94379 non-null  object
 4   Arrival_Time_Convenient  85449 non-null  object
 5   Catering                 85638 non-null  object
 6   Platform_Location        94349 non-null  object
 7   Onboard_Wifi_Service     94349 non-null  object
 8   Onboard_Entertainment    94361 non-null  object
 9   Online_Support           94288 non-null  object
 10  Ease_of_Online_Booking   94306 non-null  object
 11  Onboard_Service          86778 non-null  object
 12  Legroom                  94289 non-null  object
 13  Baggage_Handling         94237 non-null  object
 14  CheckIn_Service          94302 non-null  object
 15  Cleanliness              94373 non-null  object
 16  Online_Boarding          94373 non-null  object
 17  Gender                   94302 non-null  object
 18  Customer_Type            85428 non-null  object
 19  Age                      94346 non-null  float64
 20  Type_Travel              85153 non-null  object
 21  Travel_Class             94379 non-null  object
 22  Travel_Distance          94379 non-null  int64
 23  Departure_Delay_in_Mins  94322 non-null  float64
 24  Arrival_Delay_in_Mins    94022 non-null  float64
dtypes: float64(3), int64(3), object(19)
memory usage: 18.7+ MB
# Displaying the first rows of the dataset
df_train.head()
 | ID | Overall_Experience | Seat_Comfort | Seat_Class | Arrival_Time_Convenient | Catering | Platform_Location | Onboard_Wifi_Service | Onboard_Entertainment | Online_Support | Ease_of_Online_Booking | Onboard_Service | Legroom | Baggage_Handling | CheckIn_Service | Cleanliness | Online_Boarding | Gender | Customer_Type | Age | Type_Travel | Travel_Class | Travel_Distance | Departure_Delay_in_Mins | Arrival_Delay_in_Mins
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 98800001 | 0 | Needs Improvement | Green Car | Excellent | Excellent | Very Convenient | Good | Needs Improvement | Acceptable | Needs Improvement | Needs Improvement | Acceptable | Needs Improvement | Good | Needs Improvement | Poor | Female | Loyal Customer | 52.00000000 | NaN | Business | 272 | 0.00000000 | 5.00000000 |
1 | 98800002 | 0 | Poor | Ordinary | Excellent | Poor | Needs Improvement | Good | Poor | Good | Good | Excellent | Needs Improvement | Poor | Needs Improvement | Good | Good | Male | Loyal Customer | 48.00000000 | Personal Travel | Eco | 2200 | 9.00000000 | 0.00000000 |
2 | 98800003 | 1 | Needs Improvement | Green Car | Needs Improvement | Needs Improvement | Needs Improvement | Needs Improvement | Good | Excellent | Excellent | Excellent | Excellent | Excellent | Good | Excellent | Excellent | Female | Loyal Customer | 43.00000000 | Business Travel | Business | 1061 | 77.00000000 | 119.00000000 |
3 | 98800004 | 0 | Acceptable | Ordinary | Needs Improvement | NaN | Needs Improvement | Acceptable | Needs Improvement | Acceptable | Acceptable | Acceptable | Acceptable | Acceptable | Good | Acceptable | Acceptable | Female | Loyal Customer | 44.00000000 | Business Travel | Business | 780 | 13.00000000 | 18.00000000 |
4 | 98800005 | 1 | Acceptable | Ordinary | Acceptable | Acceptable | Manageable | Needs Improvement | Good | Excellent | Good | Good | Good | Good | Good | Good | Good | Female | Loyal Customer | 50.00000000 | Business Travel | Business | 1981 | 0.00000000 | 0.00000000 |
# Checking the descriptive statistics of the numerical columns
df_train.describe().T
 | count | mean | std | min | 25% | 50% | 75% | max
---|---|---|---|---|---|---|---|---
ID | 94379.00000000 | 98847190.00000000 | 27245.01486511 | 98800001.00000000 | 98823595.50000000 | 98847190.00000000 | 98870784.50000000 | 98894379.00000000 |
Overall_Experience | 94379.00000000 | 0.54665763 | 0.49782094 | 0.00000000 | 0.00000000 | 1.00000000 | 1.00000000 | 1.00000000 |
Age | 94346.00000000 | 39.41964683 | 15.11663185 | 7.00000000 | 27.00000000 | 40.00000000 | 51.00000000 | 85.00000000 |
Travel_Distance | 94379.00000000 | 1978.88818487 | 1027.96101914 | 50.00000000 | 1359.00000000 | 1923.00000000 | 2538.00000000 | 6951.00000000 |
Departure_Delay_in_Mins | 94322.00000000 | 14.64709188 | 38.13878127 | 0.00000000 | 0.00000000 | 0.00000000 | 12.00000000 | 1592.00000000 |
Arrival_Delay_in_Mins | 94022.00000000 | 15.00522218 | 38.43940923 | 0.00000000 | 0.00000000 | 0.00000000 | 13.00000000 | 1584.00000000 |
# List of all the important numerical variables
num_col = ['ID','Overall_Experience','Age', 'Travel_Distance','Departure_Delay_in_Mins','Arrival_Delay_in_Mins']
# List of all the important categorical variables
cat_col = ['Seat_Comfort', 'Seat_Class', 'Arrival_Time_Convenient', 'Catering', 'Platform_Location', 'Onboard_Wifi_Service', 'Onboard_Entertainment', 'Online_Support', 'Ease_of_Online_Booking', 'Onboard_Service', 'Legroom', 'Baggage_Handling', 'CheckIn_Service', 'Cleanliness', 'Online_Boarding', 'Gender', 'Customer_Type', 'Type_Travel', 'Travel_Class']
# Printing the proportion of each unique value in each categorical column
for column in cat_col:
    print(df_train[column].value_counts(1))
    print('-' * 30)
Acceptable           0.22432622
Needs Improvement    0.22207850
Good                 0.21835705
Poor                 0.16099790
Excellent            0.13752412
Extremely Poor       0.03671622
Name: Seat_Comfort, dtype: float64
------------------------------
Green Car    0.50260121
Ordinary     0.49739879
Name: Seat_Class, dtype: float64
------------------------------
Good                 0.22907231
Excellent            0.20695386
Acceptable           0.17761472
Needs Improvement    0.17542628
Poor                 0.16023593
Extremely Poor       0.05069691
Name: Arrival_Time_Convenient, dtype: float64
------------------------------
Acceptable           0.21565193
Needs Improvement    0.20993017
Good                 0.20982508
Poor                 0.16182069
Excellent            0.15711483
Extremely Poor       0.04565730
Name: Catering, dtype: float64
------------------------------
Manageable           0.25620833
Convenient           0.23224411
Needs Improvement    0.18900041
Inconvenient         0.17434207
Very Convenient      0.14818387
Very Inconvenient    0.00002120
Name: Platform_Location, dtype: float64
------------------------------
Good                 0.24202694
Excellent            0.22223871
Acceptable           0.21322960
Needs Improvement    0.20769695
Poor                 0.11384328
Extremely Poor       0.00096450
Name: Onboard_Wifi_Service, dtype: float64
------------------------------
Good                 0.32265449
Excellent            0.22937442
Acceptable           0.18609383
Needs Improvement    0.14758216
Poor                 0.09157385
Extremely Poor       0.02272125
Name: Onboard_Entertainment, dtype: float64
------------------------------
Good                 0.31834380
Excellent            0.27462668
Acceptable           0.16653233
Needs Improvement    0.13265739
Poor                 0.10782920
Extremely Poor       0.00001061
Name: Online_Support, dtype: float64
------------------------------
Good                 0.30654465
Excellent            0.26237991
Acceptable           0.17379594
Needs Improvement    0.15353212
Poor                 0.10357772
Extremely Poor       0.00016966
Name: Ease_of_Online_Booking, dtype: float64
------------------------------
Good                 0.31419254
Excellent            0.24513125
Acceptable           0.20824402
Needs Improvement    0.13125447
Poor                 0.10113162
Extremely Poor       0.00004609
Name: Onboard_Service, dtype: float64
------------------------------
Good                 0.30618630
Excellent            0.26336052
Acceptable           0.17376364
Needs Improvement    0.16707145
Poor                 0.08601215
Extremely Poor       0.00360593
Name: Legroom, dtype: float64
------------------------------
Good                 0.37080977
Excellent            0.27593196
Acceptable           0.18853529
Needs Improvement    0.10355805
Poor                 0.06116494
Name: Baggage_Handling, dtype: float64
------------------------------
Good                 0.28103328
Acceptable           0.27362092
Excellent            0.20827766
Needs Improvement    0.11895824
Poor                 0.11809930
Extremely Poor       0.00001060
Name: CheckIn_Service, dtype: float64
------------------------------
Good                 0.37539339
Excellent            0.27606413
Acceptable           0.18489398
Needs Improvement    0.10390684
Poor                 0.05968868
Extremely Poor       0.00005298
Name: Cleanliness, dtype: float64
------------------------------
Good                 0.27055408
Acceptable           0.23815074
Excellent            0.23038369
Needs Improvement    0.14253017
Poor                 0.11825416
Extremely Poor       0.00012716
Name: Online_Boarding, dtype: float64
------------------------------
Female    0.50704121
Male      0.49295879
Name: Gender, dtype: float64
------------------------------
Loyal Customer       0.81733155
Disloyal Customer    0.18266845
Name: Customer_Type, dtype: float64
------------------------------
Business Travel    0.68837269
Personal Travel    0.31162731
Name: Type_Travel, dtype: float64
------------------------------
Eco         0.52280698
Business    0.47719302
Name: Travel_Class, dtype: float64
------------------------------
Next, we visualize the numerical variables.
## Function to plot a boxplot and a histogram along the same scale
def histogram_boxplot(data, feature, figsize=(8, 5), kde=False, bins=None):
    """
    Boxplot and histogram combined
    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (8, 5))
    kde: whether to show the density curve (default False)
    bins: number of bins for histogram (default None)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # number of rows of the subplot grid = 2
        sharex=True,  # x-axis will be shared among all subplots
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )  # creating the 2 subplots
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
    )  # boxplot with a star indicating the mean value of the column
    if bins:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins)  # histogram
    else:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2)
    ax_hist2.axvline(
        data[feature].mean(), color="green", linestyle="--"
    )  # add the mean to the histogram
    ax_hist2.axvline(
        data[feature].median(), color="black", linestyle="-"
    )  # add the median to the histogram
# Observations on Age
histogram_boxplot(df_train, 'Age', kde = True, bins=30)
# Observations on Travel Distance
histogram_boxplot(df_train, 'Travel_Distance', kde = True, bins=30)
# Observations on Departure Delay in Mins
histogram_boxplot(df_train, 'Departure_Delay_in_Mins', kde = True, bins=30)
# Observations on Arrival_Delay_in_Mins
histogram_boxplot(df_train, 'Arrival_Delay_in_Mins', kde = True, bins=30)
Next, we visualize the categorical variables summarized above.
# Function to plot a countplot annotated with the percentage of each category
def bar_perc(data, z):
    total = len(data[z])  # length of the column
    plt.figure(figsize=(4, 3))
    plt.xticks(rotation=45)
    # Convert the column to a categorical data type
    data[z] = data[z].astype('category')
    ax = sns.countplot(x=z, data=data, palette='Paired', order=data[z].value_counts().index)
    for p in ax.patches:
        percentage = '{:.1f}%'.format(100 * p.get_height() / total)  # percentage of each class
        x = p.get_x() + p.get_width() / 2  # x-coordinate of the annotation (bar center)
        y = p.get_y() + p.get_height()  # y-coordinate of the annotation (bar top)
        ax.annotate(
            percentage,
            (x, y),
            ha="center",
            va="center",
            size=9,
            xytext=(0, 5),
            textcoords="offset points",
        )  # annotate the percentage above each bar
    plt.show()
# Observations on Overall_Experience
bar_perc(df_train, 'Overall_Experience')
# Observations on Seat_Comfort
bar_perc(df_train, 'Seat_Comfort')
# Observations on Seat_Class
bar_perc(df_train, 'Seat_Class')
# Observations on Arrival_Time_Convenient
bar_perc(df_train, 'Arrival_Time_Convenient')
# Observations on Catering
bar_perc(df_train, 'Catering')
# Observations on Platform_Location
bar_perc(df_train, 'Platform_Location')
# Observations on Onboard_Wifi_Service
bar_perc(df_train, 'Onboard_Wifi_Service')
# Observations on Onboard_Entertainment
bar_perc(df_train, 'Onboard_Entertainment')
# Observations on Online_Support
bar_perc(df_train, 'Online_Support')
# Observations on Ease_of_Online_Booking
bar_perc(df_train, 'Ease_of_Online_Booking')
# Observations on Onboard_Service
bar_perc(df_train, 'Onboard_Service')
# Observations on Legroom
bar_perc(df_train, 'Legroom')
# Observations on Baggage_Handling
bar_perc(df_train, 'Baggage_Handling')
# Observations on CheckIn_Service
bar_perc(df_train, 'CheckIn_Service')
# Observations on Cleanliness
bar_perc(df_train, 'Cleanliness')
# Observations on Online_Boarding
bar_perc(df_train, 'Online_Boarding')
# Observations on Gender
bar_perc(df_train, 'Gender')
# Observations on Customer_Type
bar_perc(df_train, 'Customer_Type')
# Observations on Type_Travel
bar_perc(df_train, 'Type_Travel')
# Observations on Travel_Class
bar_perc(df_train, 'Travel_Class')
Bivariate Analysis and Multivariate Analysis
# Pairplot of the numerical variables, colored by the target
# (sns.pairplot creates its own figure; `height` replaces the deprecated `size` argument)
sns.pairplot(df_train, height=2, hue='Overall_Experience', kind="reg");
# Checking any correlation between variables.
plt.figure(figsize = (8, 4))
sns.heatmap(df_train.corr(), annot = True, fmt = '0.2f', cmap = 'coolwarm');
def stacked_barplot(data, predictor, target):
    """
    Print the category counts and plot a stacked bar chart
    data: dataframe
    predictor: independent variable
    target: target variable
    """
    count = data[predictor].nunique()
    sorter = data[target].value_counts().index[-1]
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 3, 3))
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1), title='Overall_Experience')
    plt.xticks(rotation=45)
    plt.show()
# Observations on Seat_Comfort and Overall_Experience
stacked_barplot(df_train, "Seat_Comfort", "Overall_Experience")
Overall_Experience      0      1    All
Seat_Comfort
All                 42757  51561  94318
Acceptable          13669   7489  21158
Needs Improvement   13464   7482  20946
Poor                 8339   6846  15185
Good                 7181  13414  20595
Excellent              96  12875  12971
Extremely Poor          8   3455   3463
------------------------------------------------------------------------------------------------------------------------
# Observations on Seat_Class and Overall_Experience
stacked_barplot(df_train, "Seat_Class", "Overall_Experience")
Overall_Experience      0      1    All
Seat_Class
All                 42786  51593  94379
Green Car           21434  26001  47435
Ordinary            21352  25592  46944
------------------------------------------------------------------------------------------------------------------------
# Observations on Arrival_Time_Convenient and Overall_Experience
stacked_barplot(df_train, "Arrival_Time_Convenient", "Overall_Experience")
Overall_Experience          0      1    All
Arrival_Time_Convenient
All                     38794  46655  85449
Good                     9307  10267  19574
Excellent                7866   9818  17684
Acceptable               7050   8127  15177
Needs Improvement        6952   8038  14990
Poor                     5638   8054  13692
Extremely Poor           1981   2351   4332
------------------------------------------------------------------------------------------------------------------------
# Observations on Catering and Overall_Experience
stacked_barplot(df_train, "Catering", "Overall_Experience")
Overall_Experience      0      1    All
Catering
All                 38839  46799  85638
Acceptable          10574   7894  18468
Needs Improvement   10226   7752  17978
Good                 7401  10568  17969
Poor                 6814   7044  13858
Excellent            2945  10510  13455
Extremely Poor        879   3031   3910
------------------------------------------------------------------------------------------------------------------------
# Observations on Platform_Location and Overall_Experience
stacked_barplot(df_train, "Platform_Location", "Overall_Experience")
Overall_Experience      0      1    All
Platform_Location
All                 42773  51576  94349
Manageable          12985  11188  24173
Convenient          11009  10903  21912
Needs Improvement    7504  10328  17832
Inconvenient         6422  10027  16449
Very Convenient      4853   9128  13981
Very Inconvenient       0      2      2
------------------------------------------------------------------------------------------------------------------------
# Observations on Onboard_Wifi_Service and Overall_Experience
stacked_barplot(df_train, "Onboard_Wifi_Service", "Overall_Experience")
Overall_Experience      0      1    All
Onboard_Wifi_Service
All                 42773  51576  94349
Acceptable           9857  10261  20118
Needs Improvement    9767   9829  19596
Good                 8235  14600  22835
Poor                 7908   2833  10741
Excellent            6950  14018  20968
Extremely Poor         56     35     91
------------------------------------------------------------------------------------------------------------------------
# Observations on Onboard_Entertainment and Overall_Experience
stacked_barplot(df_train, "Onboard_Entertainment", "Overall_Experience")
Overall_Experience       0      1    All
Onboard_Entertainment
All                  42778  51583  94361
Acceptable           14075   3485  17560
Needs Improvement    11589   2337  13926
Good                  8548  21898  30446
Poor                  6805   1836   8641
Excellent             1022  20622  21644
Extremely Poor         739   1405   2144
------------------------------------------------------------------------------------------------------------------------
# Observations on Online_Support and Overall_Experience
stacked_barplot(df_train, "Online_Support", "Overall_Experience")
Overall_Experience      0      1    All
Online_Support
All                 42755  51533  94288
Acceptable          11300   4402  15702
Good                 9582  20434  30016
Needs Improvement    8790   3718  12508
Poor                 7205   2962  10167
Excellent            5877  20017  25894
Extremely Poor          1      0      1
------------------------------------------------------------------------------------------------------------------------
# Observations on Ease_of_Online_Booking and Overall_Experience
stacked_barplot(df_train, "Ease_of_Online_Booking", "Overall_Experience")
Overall_Experience        0      1    All
Ease_of_Online_Booking
All                   42763  51543  94306
Acceptable            10559   5831  16390
Needs Improvement     10347   4132  14479
Good                   8115  20794  28909
Poor                   7890   1878   9768
Excellent              5836  18908  24744
Extremely Poor           16      0     16
------------------------------------------------------------------------------------------------------------------------
# Observations on Onboard_Service and Overall_Experience
stacked_barplot(df_train, "Onboard_Service", "Overall_Experience")
Overall_Experience      0      1    All
Onboard_Service
All                 39341  47437  86778
Acceptable          10708   7363  18071
Good                 9625  17640  27265
Needs Improvement    7547   3843  11390
Poor                 6425   2351   8776
Excellent            5032  16240  21272
Extremely Poor          4      0      4
------------------------------------------------------------------------------------------------------------------------
# Observations on Legroom and Overall_Experience
stacked_barplot(df_train, "Legroom", "Overall_Experience")
Overall_Experience      0      1    All
Legroom
All                 42750  51539  94289
Acceptable          10321   6063  16384
Needs Improvement    9814   5939  15753
Good                 9488  19382  28870
Excellent            7245  17587  24832
Poor                 5776   2334   8110
Extremely Poor        106    234    340
------------------------------------------------------------------------------------------------------------------------
# Observations on Baggage_Handling and Overall_Experience
stacked_barplot(df_train, "Baggage_Handling", "Overall_Experience")
Overall_Experience      0      1    All
Baggage_Handling
All                 42722  51515  94237
Good                14382  20562  34944
Acceptable          12205   5562  17767
Excellent            6949  19054  26003
Needs Improvement    5875   3884   9759
Poor                 3311   2453   5764
------------------------------------------------------------------------------------------------------------------------
# Observations on CheckIn_Service and Overall_Experience
stacked_barplot(df_train, "CheckIn_Service", "Overall_Experience")
Overall_Experience      0      1    All
CheckIn_Service
All                 42752  51550  94302
Good                11263  15239  26502
Acceptable          11194  14609  25803
Poor                 7574   3563  11137
Needs Improvement    7484   3734  11218
Excellent            5236  14405  19641
Extremely Poor          1      0      1
------------------------------------------------------------------------------------------------------------------------
# Observations on Cleanliness and Overall_Experience
stacked_barplot(df_train, "Cleanliness", "Overall_Experience")
Overall_Experience      0      1    All
Cleanliness
All                 42786  51587  94373
Good                14678  20749  35427
Acceptable          11859   5590  17449
Excellent            7067  18986  26053
Needs Improvement    5849   3957   9806
Poor                 3328   2305   5633
Extremely Poor          5      0      5
------------------------------------------------------------------------------------------------------------------------
# Observations on Online_Boarding and Overall_Experience
stacked_barplot(df_train, "Online_Boarding", "Overall_Experience")
Overall_Experience      0      1    All
Online_Boarding
All                 42786  51587  94373
Acceptable          10125  12350  22475
Needs Improvement    9674   3777  13451
Good                 8869  16664  25533
Poor                 8249   2911  11160
Excellent            5857  15885  21742
Extremely Poor         12      0     12
------------------------------------------------------------------------------------------------------------------------
# Observations on Gender and Overall_Experience
stacked_barplot(df_train, "Gender", "Overall_Experience")
Overall_Experience      0      1    All
Gender
All                 42747  51555  94302
Male                26111  20376  46487
Female              16636  31179  47815
------------------------------------------------------------------------------------------------------------------------
# Observations on Customer_Type and Overall_Experience
stacked_barplot(df_train, "Customer_Type", "Overall_Experience")
Overall_Experience      0      1    All
Customer_Type
All                 38663  46765  85428
Loyal Customer      26794  43029  69823
Disloyal Customer   11869   3736  15605
------------------------------------------------------------------------------------------------------------------------
# Observations on Type_Travel and Overall_Experience
stacked_barplot(df_train, "Type_Travel", "Overall_Experience")
Overall_Experience      0      1    All
Type_Travel
All                 38600  46553  85153
Business Travel     24441  34176  58617
Personal Travel     14159  12377  26536
------------------------------------------------------------------------------------------------------------------------
# Observations on Travel_Class and Overall_Experience
stacked_barplot(df_train, "Travel_Class", "Overall_Experience")
Overall_Experience      0      1    All
Travel_Class
All                 42786  51593  94379
Eco                 29644  19698  49342
Business            13142  31895  45037
------------------------------------------------------------------------------------------------------------------------
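Travel_Class shows one of the strongest splits: roughly 71% of Business-class passengers are satisfied versus about 40% in Eco.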
# The mean of numerical variables grouped by Overall_Experience
df_train.groupby(['Overall_Experience'])[num_col].mean()
Overall_Experience | ID | Age | Travel_Distance | Departure_Delay_in_Mins | Arrival_Delay_in_Mins
---|---|---|---|---|---
0 | 98847140.93032768 | 37.49018003 | 2025.82608797 | 17.73859969 | 18.39237447 |
1 | 98847230.69340803 | 41.01967970 | 1939.96264997 | 12.08310706 | 12.19676277 |
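On average, satisfied passengers are slightly older, traveled somewhat shorter distances, and experienced departure and arrival delays roughly 6 minutes shorter than unsatisfied passengers.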
Next, let's look at how the distributions of the numerical variables differ by Overall_Experience.
# Observations on Age and Overall_Experience
sns.histplot(data=df_train, x='Age', hue='Overall_Experience', kde = True, bins = 30);
# Observations on Travel_Distance and Overall_Experience
sns.histplot(data=df_train, x='Travel_Distance', hue='Overall_Experience', kde = True, bins = 30);
# Observations on Departure_Delay_in_Mins and Overall_Experience
sns.histplot(data=df_train, x='Departure_Delay_in_Mins', hue='Overall_Experience', kde = True, bins = 30);
# Observations on Arrival_Delay_in_Mins and Overall_Experience
sns.histplot(data=df_train, x='Arrival_Delay_in_Mins', hue='Overall_Experience', kde = True, bins = 30);
Missing values treatment
# Selecting the columns where the number of missing values is greater than 0
missing = df_train.isnull().sum()
pd.DataFrame({'Count': missing[missing > 0],
              'Percentage': missing[missing > 0] / df_train.shape[0] * 100})
 | Count | Percentage
---|---|---
Seat_Comfort | 61 | 0.06463302 |
Arrival_Time_Convenient | 8930 | 9.46185062 |
Catering | 8741 | 9.26159421 |
Platform_Location | 30 | 0.03178673 |
Onboard_Wifi_Service | 30 | 0.03178673 |
Onboard_Entertainment | 18 | 0.01907204 |
Online_Support | 91 | 0.09641975 |
Ease_of_Online_Booking | 73 | 0.07734772 |
Onboard_Service | 7601 | 8.05369839 |
Legroom | 90 | 0.09536020 |
Baggage_Handling | 142 | 0.15045720 |
CheckIn_Service | 77 | 0.08158595 |
Cleanliness | 6 | 0.00635735 |
Online_Boarding | 6 | 0.00635735 |
Gender | 77 | 0.08158595 |
Customer_Type | 8951 | 9.48410134 |
Age | 33 | 0.03496541 |
Type_Travel | 9226 | 9.77547971 |
Departure_Delay_in_Mins | 57 | 0.06039479 |
Arrival_Delay_in_Mins | 357 | 0.37826211 |
# Impute the missing values: median for numerical columns, mode for categorical columns
imputer_mode_cat = SimpleImputer(strategy="most_frequent")
imputer_median_num = SimpleImputer(strategy="median")
num_impute_cols = ['Age', 'Travel_Distance', 'Departure_Delay_in_Mins', 'Arrival_Delay_in_Mins']
# Fit the imputers on the train data, then apply the same statistics to the test data
df_train[num_impute_cols] = imputer_median_num.fit_transform(df_train[num_impute_cols])
df_train[cat_col] = imputer_mode_cat.fit_transform(df_train[cat_col])
df_test[num_impute_cols] = imputer_median_num.transform(df_test[num_impute_cols])
df_test[cat_col] = imputer_mode_cat.transform(df_test[cat_col])
# Checking that no column has missing values in train and test datasets
print(df_train.isna().sum())
print(25 * '--')
print(df_test.isna().sum())
ID                         0
Overall_Experience         0
Seat_Comfort               0
Seat_Class                 0
Arrival_Time_Convenient    0
Catering                   0
Platform_Location          0
Onboard_Wifi_Service       0
Onboard_Entertainment      0
Online_Support             0
Ease_of_Online_Booking     0
Onboard_Service            0
Legroom                    0
Baggage_Handling           0
CheckIn_Service            0
Cleanliness                0
Online_Boarding            0
Gender                     0
Customer_Type              0
Age                        0
Type_Travel                0
Travel_Class               0
Travel_Distance            0
Departure_Delay_in_Mins    0
Arrival_Delay_in_Mins      0
dtype: int64
--------------------------------------------------
ID                         0
Seat_Comfort               0
Seat_Class                 0
Arrival_Time_Convenient    0
Catering                   0
Platform_Location          0
Onboard_Wifi_Service       0
Onboard_Entertainment      0
Online_Support             0
Ease_of_Online_Booking     0
Onboard_Service            0
Legroom                    0
Baggage_Handling           0
CheckIn_Service            0
Cleanliness                0
Online_Boarding            0
Gender                     0
Customer_Type              0
Age                        0
Type_Travel                0
Travel_Class               0
Travel_Distance            0
Departure_Delay_in_Mins    0
Arrival_Delay_in_Mins      0
dtype: int64
Outlier treatment
# Outlier detection using boxplots of the numerical variables
plt.figure(figsize=(10, 8))
for i, variable in enumerate(num_col):
    plt.subplot(4, 4, i + 1)
    sns.boxplot(data=df_train, x=variable, whis=1.5, showmeans=True, color='violet')
    plt.title(variable)
plt.tight_layout(pad=0.5)
plt.show()
# To find the 25th and 75th percentiles of the numerical columns
cols = ['Travel_Distance', 'Departure_Delay_in_Mins', 'Arrival_Delay_in_Mins']
Q1 = df_train[cols].quantile(0.25)
Q3 = df_train[cols].quantile(0.75)
IQR = Q3 - Q1 #Inter Quantile Range (75th percentile - 25th percentile)
lower_whisker = Q1 - 1.5*IQR #Finding lower and upper bounds for all values. All values outside these bounds are outliers
upper_whisker = Q3 + 1.5*IQR
# Percentage of outliers in each column in Train dataset
((df_train[cols] < lower_whisker) | (df_train[cols] > upper_whisker)).sum() / df_train.shape[0]*100
Travel_Distance            2.04176777
Departure_Delay_in_Mins   13.90775490
Arrival_Delay_in_Mins     13.39280984
dtype: float64
# Percentage of outliers in each column in Test dataset
((df_test[cols] < lower_whisker) | (df_test[cols] > upper_whisker)).sum() / df_test.shape[0]*100
Travel_Distance            1.97741700
Departure_Delay_in_Mins   13.94865457
Arrival_Delay_in_Mins     13.61440369
dtype: float64
Let's analyze each column to see whether its extreme values should be treated as outliers.
Looking at the numerical variables with outliers, we will go with the IQR method: all values smaller than the lower whisker (Q1 - 1.5 × IQR) will be set to the lower-whisker value, and all values greater than the upper whisker (Q3 + 1.5 × IQR) will be set to the upper-whisker value.
Treating outliers
We will cap/clip the minimum and maximum value of these columns to the lower and upper whisker value of the boxplot found using Q1 - 1.5 x IQR and Q3 + 1.5 x IQR, respectively.
Creating a function to floor and cap/clip outliers in a column
# Create a function to floor and cap/clip outliers in a column
def treat_outliers(data, col):
    """
    Treats outliers in a numerical variable by flooring/capping at the whiskers.
    The whiskers are always computed from the train set so that the same
    bounds are applied to both the train and the test data.
    data: dataframe to treat
    col: name of the numerical column
    """
    Q1 = df_train[col].quantile(0.25)  # 25th percentile, from the train set
    Q3 = df_train[col].quantile(0.75)  # 75th percentile, from the train set
    IQR = Q3 - Q1  # interquartile range (75th percentile - 25th percentile)
    lower_whisker = Q1 - 1.5 * IQR
    upper_whisker = Q3 + 1.5 * IQR
    # All values smaller than lower_whisker are set to lower_whisker and all
    # values greater than upper_whisker are set to upper_whisker, via np.clip
    data[col] = np.clip(data[col], lower_whisker, upper_whisker)
    return data
# Treating outliers of numerical variables on Train dataset
df_train = treat_outliers(df_train, 'Travel_Distance')
df_train = treat_outliers(df_train, 'Departure_Delay_in_Mins')
df_train = treat_outliers(df_train, 'Arrival_Delay_in_Mins')
# Verifying that no outliers remain in the treated train columns
((df_train[cols] < lower_whisker) | (df_train[cols] > upper_whisker)).sum() / df_train.shape[0]*100
Travel_Distance           0.00000000
Departure_Delay_in_Mins   0.00000000
Arrival_Delay_in_Mins     0.00000000
dtype: float64
# Treating outliers of numerical variables on Test dataset
df_test = treat_outliers(df_test, 'Travel_Distance')
df_test = treat_outliers(df_test, 'Departure_Delay_in_Mins')
df_test = treat_outliers(df_test, 'Arrival_Delay_in_Mins')
# Verifying that no outliers remain in the treated test columns
((df_test[cols] < lower_whisker) | (df_test[cols] > upper_whisker)).sum() / df_test.shape[0]*100
Travel_Distance           0.00000000
Departure_Delay_in_Mins   0.00000000
Arrival_Delay_in_Mins     0.00000000
dtype: float64
Encoding the categorical variables
The survey levels are ordinal, so we map them to integers that preserve their order (ordinal encoding) in both the train and test datasets. One-hot (dummy) encoding via get_dummies is kept further below as a commented-out alternative.
# Ensuring Overall_Experience is stored as an integer
df_train['Overall_Experience'] = df_train['Overall_Experience'].astype('int')
df_train = df_train.replace(['Excellent', 'Good', 'Acceptable', 'Needs Improvement', 'Poor', 'Extremely Poor'],
[6, 5, 4, 3, 2, 1])
df_train = df_train.replace(['Male', 'Female'], [1,0])
df_train = df_train.replace(['Loyal Customer', 'Disloyal Customer'], [1,0])
df_train = df_train.replace(['Business Travel', 'Personal Travel'], [1,0])
df_train = df_train.replace(['Business', 'Eco'], [1,0])
df_train = df_train.replace(['Green Car', 'Ordinary'], [1,0])
df_train = df_train.replace(['Very Convenient', 'Convenient', 'Manageable', 'Needs Improvement', 'Inconvenient', 'Very Inconvenient'],
[6, 5, 4, 3, 2, 1])
df_train
 | ID | Overall_Experience | Seat_Comfort | Seat_Class | Arrival_Time_Convenient | Catering | Platform_Location | Onboard_Wifi_Service | Onboard_Entertainment | Online_Support | Ease_of_Online_Booking | Onboard_Service | Legroom | Baggage_Handling | CheckIn_Service | Cleanliness | Online_Boarding | Gender | Customer_Type | Age | Type_Travel | Travel_Class | Travel_Distance | Departure_Delay_in_Mins | Arrival_Delay_in_Mins
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 98800001 | 0 | 3 | 1 | 6 | 6 | 6 | 5 | 3 | 4 | 3 | 3 | 4 | 3 | 5 | 3 | 2 | 0 | 1 | 52.00000000 | 1 | 1 | 272.00000000 | 0.00000000 | 5.00000000 |
1 | 98800002 | 0 | 2 | 0 | 6 | 2 | 3 | 5 | 2 | 5 | 5 | 6 | 3 | 2 | 3 | 5 | 5 | 1 | 1 | 48.00000000 | 0 | 0 | 2200.00000000 | 9.00000000 | 0.00000000 |
2 | 98800003 | 1 | 3 | 1 | 3 | 3 | 3 | 3 | 5 | 6 | 6 | 6 | 6 | 6 | 5 | 6 | 6 | 0 | 1 | 43.00000000 | 1 | 1 | 1061.00000000 | 30.00000000 | 32.50000000 |
3 | 98800004 | 0 | 4 | 0 | 3 | 4 | 3 | 4 | 3 | 4 | 4 | 4 | 4 | 4 | 5 | 4 | 4 | 0 | 1 | 44.00000000 | 1 | 1 | 780.00000000 | 13.00000000 | 18.00000000 |
4 | 98800005 | 1 | 4 | 0 | 4 | 4 | 4 | 3 | 5 | 6 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 0 | 1 | 50.00000000 | 1 | 1 | 1981.00000000 | 0.00000000 | 0.00000000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
94374 | 98894375 | 0 | 2 | 0 | 5 | 5 | 5 | 2 | 2 | 2 | 2 | 5 | 5 | 5 | 3 | 5 | 2 | 1 | 1 | 32.00000000 | 1 | 1 | 1357.00000000 | 30.00000000 | 32.50000000 |
94375 | 98894376 | 1 | 5 | 0 | 5 | 5 | 5 | 3 | 6 | 6 | 4 | 4 | 4 | 4 | 5 | 4 | 5 | 1 | 1 | 44.00000000 | 1 | 1 | 592.00000000 | 5.00000000 | 11.00000000 |
94376 | 98894377 | 1 | 3 | 1 | 3 | 3 | 3 | 5 | 6 | 5 | 5 | 5 | 5 | 5 | 4 | 5 | 4 | 1 | 1 | 63.00000000 | 1 | 1 | 2794.00000000 | 0.00000000 | 0.00000000 |
94377 | 98894378 | 0 | 3 | 0 | 5 | 3 | 5 | 5 | 3 | 5 | 5 | 4 | 5 | 5 | 5 | 6 | 5 | 1 | 1 | 16.00000000 | 0 | 0 | 2744.00000000 | 0.00000000 | 0.00000000 |
94378 | 98894379 | 0 | 4 | 0 | 2 | 4 | 4 | 4 | 4 | 4 | 4 | 2 | 5 | 5 | 2 | 5 | 4 | 1 | 1 | 54.00000000 | 1 | 0 | 2107.00000000 | 28.00000000 | 28.00000000 |
94379 rows × 25 columns
df_test = df_test.replace(['Excellent', 'Good', 'Acceptable', 'Needs Improvement', 'Poor', 'Extremely Poor'],
[6, 5, 4, 3, 2, 1])
df_test = df_test.replace(['Male', 'Female'], [1,0])
df_test = df_test.replace(['Loyal Customer', 'Disloyal Customer'], [1,0])
df_test = df_test.replace(['Business Travel', 'Personal Travel'], [1,0])
df_test = df_test.replace(['Business', 'Eco'], [1,0])
df_test = df_test.replace(['Green Car', 'Ordinary'], [1,0])
df_test = df_test.replace(['Very Convenient', 'Convenient', 'Manageable', 'Needs Improvement', 'Inconvenient', 'Very Inconvenient'],
[6, 5, 4, 3, 2, 1])
df_test
 | ID | Seat_Comfort | Seat_Class | Arrival_Time_Convenient | Catering | Platform_Location | Onboard_Wifi_Service | Onboard_Entertainment | Online_Support | Ease_of_Online_Booking | Onboard_Service | Legroom | Baggage_Handling | CheckIn_Service | Cleanliness | Online_Boarding | Gender | Customer_Type | Age | Type_Travel | Travel_Class | Travel_Distance | Departure_Delay_in_Mins | Arrival_Delay_in_Mins
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 99900001 | 4 | 1 | 4 | 4 | 4 | 3 | 6 | 5 | 6 | 6 | 6 | 6 | 5 | 6 | 2 | 0 | 1 | 36.00000000 | 1 | 1 | 532.00000000 | 0.00000000 | 0.00000000 |
1 | 99900002 | 1 | 0 | 5 | 2 | 4 | 4 | 2 | 4 | 4 | 6 | 4 | 5 | 4 | 6 | 4 | 0 | 0 | 21.00000000 | 1 | 1 | 1425.00000000 | 9.00000000 | 28.00000000 |
2 | 99900003 | 6 | 0 | 6 | 6 | 6 | 6 | 6 | 6 | 3 | 3 | 3 | 3 | 5 | 3 | 6 | 1 | 1 | 60.00000000 | 1 | 1 | 2832.00000000 | 0.00000000 | 0.00000000 |
3 | 99900004 | 4 | 1 | 6 | 4 | 6 | 2 | 4 | 6 | 2 | 4 | 3 | 6 | 6 | 6 | 2 | 0 | 1 | 29.00000000 | 0 | 0 | 1352.00000000 | 0.00000000 | 0.00000000 |
4 | 99900005 | 6 | 0 | 1 | 6 | 3 | 6 | 6 | 6 | 6 | 5 | 4 | 6 | 6 | 6 | 6 | 1 | 0 | 18.00000000 | 1 | 1 | 1610.00000000 | 17.00000000 | 0.00000000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
35597 | 99935598 | 3 | 1 | 6 | 3 | 4 | 4 | 3 | 4 | 4 | 5 | 6 | 5 | 4 | 5 | 4 | 1 | 1 | 8.00000000 | 0 | 0 | 1334.00000000 | 0.00000000 | 0.00000000 |
35598 | 99935599 | 3 | 0 | 3 | 5 | 3 | 4 | 6 | 6 | 5 | 5 | 5 | 5 | 4 | 5 | 5 | 0 | 1 | 53.00000000 | 1 | 1 | 1772.00000000 | 0.00000000 | 0.00000000 |
35599 | 99935600 | 5 | 1 | 1 | 5 | 3 | 3 | 5 | 2 | 3 | 2 | 4 | 2 | 2 | 6 | 3 | 1 | 0 | 22.00000000 | 1 | 0 | 1180.00000000 | 0.00000000 | 0.00000000 |
35600 | 99935601 | 6 | 0 | 6 | 6 | 2 | 4 | 6 | 5 | 6 | 6 | 6 | 6 | 4 | 6 | 5 | 0 | 1 | 67.00000000 | 0 | 0 | 420.00000000 | 23.00000000 | 16.00000000 |
35601 | 99935602 | 5 | 0 | 4 | 5 | 4 | 2 | 5 | 2 | 2 | 4 | 5 | 5 | 3 | 5 | 2 | 1 | 1 | 20.00000000 | 0 | 0 | 1680.00000000 | 0.00000000 | 0.00000000 |
35602 rows × 24 columns
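The chained replace() calls above work, but they hide the ordinal scale being assumed. A sketch of an equivalent, more explicit encoding (not executed here, since the columns have already been converted; the dictionaries simply restate the mappings used above):
# Explicit ordinal maps (illustrative restatement of the replace() calls above)
quality_map = {'Extremely Poor': 1, 'Poor': 2, 'Needs Improvement': 3,
               'Acceptable': 4, 'Good': 5, 'Excellent': 6}
platform_map = {'Very Inconvenient': 1, 'Inconvenient': 2, 'Needs Improvement': 3,
                'Manageable': 4, 'Convenient': 5, 'Very Convenient': 6}
binary_maps = {'Gender': {'Female': 0, 'Male': 1},
               'Customer_Type': {'Disloyal Customer': 0, 'Loyal Customer': 1},
               'Type_Travel': {'Personal Travel': 0, 'Business Travel': 1},
               'Travel_Class': {'Eco': 0, 'Business': 1},
               'Seat_Class': {'Ordinary': 0, 'Green Car': 1}}
# Applied to the raw labels, e.g. df['Platform_Location'].map(platform_map),
# these reproduce the integer codes produced by the replace() calls above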
# Creating the list of columns for which we need to create the dummy variables
#to_get_dummies_for = ['Seat_Comfort', 'Seat_Class', 'Arrival_Time_Convenient', 'Catering', 'Platform_Location', 'Onboard_Wifi_Service', 'Onboard_Entertainment', 'Online_Support', 'Ease_of_Online_Booking', 'Onboard_Service', 'Legroom', 'Baggage_Handling', 'CheckIn_Service', 'Cleanliness', 'Online_Boarding', 'Gender', 'Customer_Type', 'Type_Travel', 'Travel_Class']
# Creating dummy variables
#df_train = pd.get_dummies(df_train, columns = to_get_dummies_for, drop_first = False)
#df_test = pd.get_dummies(df_test, columns = to_get_dummies_for, drop_first = False)
Separating the independent variables (X) and the dependent variable (Y)
# Separating the target variable and the predictor variables
Y = df_train[['Overall_Experience']]
X = df_train.drop(columns = ['Overall_Experience', 'Seat_Class', 'Departure_Delay_in_Mins', 'Arrival_Delay_in_Mins'], axis=1)
# Keeping the prepared test features under a distinct name so that the
# train-validation split below does not overwrite them
x_final_test = df_test.drop(columns = ['Seat_Class', 'Departure_Delay_in_Mins', 'Arrival_Delay_in_Mins'], axis=1)
# Performing a train-validation split (70:30) on the training data
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=13)
display(x_train.shape)
display(x_test.shape)
display(y_train.shape)
display(y_test.shape)
(66065, 21)
(28314, 21)
(66065, 1)
(28314, 1)
# Scaling the data
#scaler = StandardScaler()
##x_train_scaled = scaler.fit_transform(x_train)
#x_train_scaled = pd.DataFrame(x_train_scaled, columns = X.columns)
#x_test_scaled = scaler.transform(x_test)
#x_test_scaled = pd.DataFrame(x_test_scaled, columns = X.columns)
We do not scale the data here, since the tree-based models we focus on (decision trees, random forests, XGBoost) are insensitive to the scale of the features due to their structure and the way they make splits; logistic regression, however, is scale-sensitive, so its results below should be read with that caveat in mind.
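If a scale-sensitive model were tuned seriously, wrapping the scaler and the estimator in a Pipeline would keep the scaling leak-free, because the scaler is then fit only on the training portion of each split. A minimal sketch, not used in the runs that follow:
from sklearn.pipeline import Pipeline
# Sketch only: scaler + logistic regression combined, so cross-validation
# would fit the scaler on the training folds alone
scaled_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000)),
])
# scaled_lr.fit(x_train, y_train.values.ravel())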
Classification algorithms
To address this problem and predict the target variable, we will use classification algorithms.
We will try different approaches with linear and non-linear algorithms.
First, we create a function to calculate and print the accuracy score and confusion matrix, so that we don't have to rewrite the same code repeatedly for each model.
# Function to print the accuracy score and plot the confusion matrix
def metrics_score(actual, predicted):
    print('Accuracy score:', accuracy_score(actual, predicted))
    cm = confusion_matrix(actual, predicted)
    plt.figure(figsize=(6, 3))
    # confusion_matrix orders the classes as [0, 1], i.e. 'Not Satisfied' first
    sns.heatmap(cm, annot=True, fmt='.2f', xticklabels=['Not Satisfied', 'Satisfied'], yticklabels=['Not Satisfied', 'Satisfied'])
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
def model_performance_classification(model, predictors, target):
    """
    Function to compute different metrics to check classification model performance
    model: classifier
    predictors: independent variables
    target: dependent variable
    """
    # Predicting using the independent variables
    pred = model.predict(predictors)
    acc = accuracy_score(target, pred)  # to compute the accuracy score
    # Creating a dataframe of metrics
    data_perf = pd.DataFrame(
        {
            "Accuracy": acc,
        },
        index=[1],
    )
    pd.set_option("display.float_format", lambda x: "%.8f" % x)
    return data_perf
# Fitting the logistic regression model
lg = LogisticRegression()
lg.fit(x_train,y_train)
LogisticRegression()
# Checking the performance on the train dataset
y_pred_train_lr = lg.predict(x_train)
metrics_score(y_train, y_pred_train_lr)
Accuracy score: 0.5471126920457126
# Checking the performance on the test dataset
y_pred_test_lr = lg.predict(x_test)
metrics_score(y_test, y_pred_test_lr)
Accuracy score: 0.545595818323091
# Summary of model performance on test data
logreg_test = model_performance_classification(lg,x_test,y_test)
logreg_test
 | Accuracy
---|---
1 | 0.54559582 |
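Note that ~54.6% test accuracy is essentially the majority-class baseline (54.7% of passengers are satisfied), so this unscaled logistic regression adds almost nothing; the scale-sensitive solver combined with raw, unscaled features such as ID and Travel_Distance is the likely cause.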
# Building decision tree model
dt = DecisionTreeClassifier(random_state = 5)
# Fitting decision tree model
dt.fit(x_train, y_train)
DecisionTreeClassifier(random_state=5)
# Checking performance on the training dataset
y_train_pred_dt = dt.predict(x_train)
metrics_score(y_train, y_train_pred_dt)
Accuracy score: 1.0
# Checking performance on the test dataset
y_test_pred_dt = dt.predict(x_test)
metrics_score(y_test, y_test_pred_dt)
Accuracy score: 0.925902380447835
# Summary of model performance on test data
dtree_test = model_performance_classification(dt, x_test, y_test)
dtree_test
 | Accuracy
---|---
1 | 0.92590238 |
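The perfect training accuracy against 92.6% test accuracy indicates that the unpruned tree overfits, which motivates the hyperparameter search below.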
# Choose the type of classifier
dt_estimator = DecisionTreeClassifier(random_state = 1)
# Grid of parameters to choose from
parameters = {'criterion': ['gini', 'entropy'],
'max_features': [2, 3, 7],
'min_samples_leaf': [1, 2],
'min_samples_split': [5, 10, 20],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(accuracy_score)  # accuracy_score takes no pos_label argument
# Run the grid search
gridCV = GridSearchCV(dt_estimator, parameters, scoring = scorer, cv = 10)
# Fitting the grid search on the train data
gridCV = gridCV.fit(x_train, y_train)
# Set the classifier to the best combination of parameters
dt_estimator = gridCV.best_estimator_
# Fit the best estimator to the data
dt_estimator.fit(x_train, y_train)
DecisionTreeClassifier(max_features=2, min_samples_split=5, random_state=1)
# Checking performance on the training dataset
y_pred_train_dt_tuned = dt_estimator.predict(x_train)
metrics_score(y_train, y_pred_train_dt_tuned)
Accuracy score: 0.9669719215923711
# Checking performance on the test dataset
y_test_pred_dt_tuned = dt_estimator.predict(x_test)
metrics_score(y_test, y_test_pred_dt_tuned)
Accuracy score: 0.8890301617574344
# Summary of model performance on test data
dtree_tuned_test = model_performance_classification(dt_estimator, x_test, y_test)
dtree_tuned_test

 | Accuracy |
---|---|
1 | 0.88903016 |

Note that the tuned tree scores lower on the test set than the untuned one (0.889 vs 0.926): the grid caps max_features at 7 (best found: 2), which likely starves each split of useful features.
# Fitting the random forest tree classifier on the training data
rf_estimator = RandomForestClassifier( random_state = 7, criterion = "entropy")
rf_estimator.fit(x_train,y_train)
RandomForestClassifier(criterion='entropy', random_state=7)
# Checking performance on the training data
y_pred_train_rf = rf_estimator.predict(x_train)
metrics_score(y_train, y_pred_train_rf)
Accuracy score: 1.0
# Checking performance on the test data
y_pred_test_rf = rf_estimator.predict(x_test)
metrics_score(y_test, y_pred_test_rf)
Accuracy score: 0.9503425867062231
# Summary of model performance on test data
rf_estimator_test = model_performance_classification(rf_estimator,x_test,y_test)
rf_estimator_test
 | Accuracy |
---|---|
1 | 0.95034259 |
# Choose the type of classifier
rf_estimator_tuned = RandomForestClassifier(random_state = 7, criterion = "entropy")
# Grid of parameters to choose from
parameters = {"n_estimators": [30, 50, 70],
"max_features": [0.8, 1, 1.3],
"n_jobs": [-1],
}
# Type of scoring used to compare parameter combinations - accuracy
scorer = metrics.make_scorer(accuracy_score)
# Run the grid search
grid_obj = GridSearchCV(rf_estimator_tuned, parameters, scoring = scorer, cv = 5)
grid_obj = grid_obj.fit(x_train, y_train)
# Set the classifier to the best combination of parameters
rf_estimator_tuned_base = grid_obj.best_estimator_
# Fitting the best algorithm to the training data
rf_estimator_tuned_base.fit(x_train, y_train)
RandomForestClassifier(criterion='entropy', max_features=0.8, n_estimators=30, n_jobs=-1, random_state=7)
# Checking performance on the training data
y_pred_train_rf_tuned = rf_estimator_tuned_base.predict(x_train)
metrics_score(y_train, y_pred_train_rf_tuned)
Accuracy score: 0.9996367214107319
# Checking performance on the testing data
y_pred_test_rf_tuned = rf_estimator_tuned_base.predict(x_test)
metrics_score(y_test, y_pred_test_rf_tuned)
Accuracy score: 0.9497421770149043
# Summary of model performance on test data
rf_estimator_tuned_test = model_performance_classification(rf_estimator_tuned_base, x_test, y_test)
rf_estimator_tuned_test

 | Accuracy |
---|---|
1 | 0.94974218 |

The tuned forest (0.9497) is essentially on par with the default one (0.9503), so this small grid did not improve on the defaults.
# Visualize feature importance of the tuned random forest model
importances = rf_estimator_tuned_base.feature_importances_
columns = x_train.columns
importance_data = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)
plt.figure(figsize = (10, 10))
sns.barplot(x=importance_data.Importance, y=importance_data.index);
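Impurity-based importances can be biased toward high-cardinality numeric features, so permutation importance on the held-out split is a useful cross-check; a sketch (it can be slow on data this size, hence the small n_repeats):
from sklearn.inspection import permutation_importance
# Model-agnostic importance: shuffle each feature and measure the accuracy drop
perm = permutation_importance(rf_estimator_tuned_base, x_test, y_test,
                              n_repeats = 5, random_state = 7, n_jobs = -1)
perm_data = pd.Series(perm.importances_mean, index = x_test.columns).sort_values(ascending = False)
plt.figure(figsize = (10, 10))
sns.barplot(x = perm_data.values, y = perm_data.index);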
# XGBoost Classifier
xgb_model = XGBClassifier(random_state = 1)
# Candidate hyperparameters (defined here but not searched; the model below is fit with defaults)
param_grid = {'learning_rate': [0.1, 0.3, 0.5, 0.8],
              'n_estimators': [50, 70, 100, 150],
              'max_depth': [3, 5, 7],
             }
# Fitting the model with its default parameters
xgb_model.fit(x_train, y_train)
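Note that param_grid is never actually searched; the XGBoost score reported below comes from the default parameters. If we wanted to use the grid, a sketch reusing the GridSearchCV pattern from the earlier models:
# Searching the XGBoost grid (not run here; would replace the default fit above)
xgb_grid = GridSearchCV(XGBClassifier(random_state = 1), param_grid, scoring = 'accuracy', cv = 5)
xgb_grid = xgb_grid.fit(x_train, y_train)
xgb_model_tuned = xgb_grid.best_estimator_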
# Summary of model performance on test data
xgb_perf_test = model_performance_classification(xgb_model, x_test, y_test)
xgb_perf_test
 | Accuracy |
---|---|
1 | 0.95175532 |
# Importing the AdaBoost classifier
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost classifier
ada_class = AdaBoostClassifier(n_estimators=50, learning_rate=1.0, random_state=1)
# Fitting the model
ada_class.fit(x_train, y_train)
# Model Performance on the test data
ada_perf_test = model_performance_classification(ada_class, x_test, y_test)
ada_perf_test
 | Accuracy |
---|---|
1 | 0.89782440 |
# Summary of the implemented model on test data
models_test_comp_data = pd.concat(
[
logreg_test.T, dtree_test.T, dtree_tuned_test.T, rf_estimator_test.T, rf_estimator_tuned_test.T, xgb_perf_test.T, ada_perf_test.T
],
axis = 1,
)
models_test_comp_data.columns = [
'Logistic Regression',
"Decision Tree classifier",
"Tuned Decision Tree classifier",
"Random Forest classifier",
"Tuned Random Forest classifier",
"XGBoost classifier",
"AdaBoost classifier"
]
print("Test performance comparison:")
models_test_comp_data
Test performance comparison:
 | Logistic Regression | Decision Tree classifier | Tuned Decision Tree classifier | Random Forest classifier | Tuned Random Forest classifier | XGBoost classifier | AdaBoost classifier |
---|---|---|---|---|---|---|---|
Accuracy | 0.54559582 | 0.92590238 | 0.88903016 | 0.95034259 | 0.94974218 | 0.95175532 | 0.89782440 |

XGBoost achieves the highest test accuracy (0.9518), with the untuned random forest close behind (0.9503); all the tree-based models far outperform logistic regression on this data.
df_test
ID | Seat_Comfort | Seat_Class | Arrival_Time_Convenient | Catering | Platform_Location | Onboard_Wifi_Service | Onboard_Entertainment | Online_Support | Ease_of_Online_Booking | Onboard_Service | Legroom | Baggage_Handling | CheckIn_Service | Cleanliness | Online_Boarding | Gender | Customer_Type | Age | Type_Travel | Travel_Class | Travel_Distance | Departure_Delay_in_Mins | Arrival_Delay_in_Mins | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 99900001 | 4 | 1 | 4 | 4 | 4 | 3 | 6 | 5 | 6 | 6 | 6 | 6 | 5 | 6 | 2 | 0 | 1 | 36.00000000 | 1 | 1 | 532.00000000 | 0.00000000 | 0.00000000 |
1 | 99900002 | 1 | 0 | 5 | 2 | 4 | 4 | 2 | 4 | 4 | 6 | 4 | 5 | 4 | 6 | 4 | 0 | 0 | 21.00000000 | 1 | 1 | 1425.00000000 | 9.00000000 | 28.00000000 |
2 | 99900003 | 6 | 0 | 6 | 6 | 6 | 6 | 6 | 6 | 3 | 3 | 3 | 3 | 5 | 3 | 6 | 1 | 1 | 60.00000000 | 1 | 1 | 2832.00000000 | 0.00000000 | 0.00000000 |
3 | 99900004 | 4 | 1 | 6 | 4 | 6 | 2 | 4 | 6 | 2 | 4 | 3 | 6 | 6 | 6 | 2 | 0 | 1 | 29.00000000 | 0 | 0 | 1352.00000000 | 0.00000000 | 0.00000000 |
4 | 99900005 | 6 | 0 | 1 | 6 | 3 | 6 | 6 | 6 | 6 | 5 | 4 | 6 | 6 | 6 | 6 | 1 | 0 | 18.00000000 | 1 | 1 | 1610.00000000 | 17.00000000 | 0.00000000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
35597 | 99935598 | 3 | 1 | 6 | 3 | 4 | 4 | 3 | 4 | 4 | 5 | 6 | 5 | 4 | 5 | 4 | 1 | 1 | 8.00000000 | 0 | 0 | 1334.00000000 | 0.00000000 | 0.00000000 |
35598 | 99935599 | 3 | 0 | 3 | 5 | 3 | 4 | 6 | 6 | 5 | 5 | 5 | 5 | 4 | 5 | 5 | 0 | 1 | 53.00000000 | 1 | 1 | 1772.00000000 | 0.00000000 | 0.00000000 |
35599 | 99935600 | 5 | 1 | 1 | 5 | 3 | 3 | 5 | 2 | 3 | 2 | 4 | 2 | 2 | 6 | 3 | 1 | 0 | 22.00000000 | 1 | 0 | 1180.00000000 | 0.00000000 | 0.00000000 |
35600 | 99935601 | 6 | 0 | 6 | 6 | 2 | 4 | 6 | 5 | 6 | 6 | 6 | 6 | 4 | 6 | 5 | 0 | 1 | 67.00000000 | 0 | 0 | 420.00000000 | 23.00000000 | 16.00000000 |
35601 | 99935602 | 5 | 0 | 4 | 5 | 4 | 2 | 5 | 2 | 2 | 4 | 5 | 5 | 3 | 5 | 2 | 1 | 1 | 20.00000000 | 0 | 0 | 1680.00000000 | 0.00000000 | 0.00000000 |
35602 rows × 24 columns
# Make predictions on the unseen test set (x_holdout) using the random forest model
predictions = rf_estimator.predict(x_holdout)
# Create a DataFrame with the predictions
sub = pd.DataFrame(x_holdout)
sub["Overall_Experience"] = predictions
# Select 'ID' and 'Overall_Experience' columns and save to CSV (35,602 rows + header)
sub_to_file = sub[['ID', 'Overall_Experience']]
sub_to_file.to_csv("hackathon_predictions2.csv", index=False)
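Before uploading, a quick sanity check against the required submission format (35,602 rows plus a header, two columns, binary values):
# Verify the submission matches the expected format
assert sub_to_file.shape == (35602, 2)
assert list(sub_to_file.columns) == ['ID', 'Overall_Experience']
assert set(sub_to_file['Overall_Experience'].unique()) <= {0, 1}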
# Define a function for making predictions
#def make_predictions(model, test_data):
#    predictions = model.predict(test_data)
#    return predictions
# Call the function to make predictions on the test dataset
# (pass the fitted model itself, not its metrics table)
#res = make_predictions(xgb_model, x_holdout)
# Create a DataFrame with the predictions
#sub = pd.DataFrame(x_holdout)
#sub["Overall_Experience"] = res
# Select 'ID' and 'Overall_Experience' columns and save to CSV
#sub_to_file = sub[['ID', 'Overall_Experience']]
#sub_to_file.to_csv("hackathon_karine4.csv", index=False)
from google.colab import files
files.download('hackathon_predictions2.csv')