import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# For ML Work
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.metrics import confusion_matrix


# Load the training and test CSV files from the project's GitHub repository.
# `df` holds train.csv and `dfTest` holds test.csv.
df = pd.read_csv('https://raw.githubusercontent.com/BayTech-CSUMB/CST383Final/main/train.csv')
dfTest = pd.read_csv('https://raw.githubusercontent.com/BayTech-CSUMB/CST383Final/main/test.csv')


# Print column names, non-null counts and dtypes of the raw training data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 25 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Unnamed: 0                         103904 non-null  int64  
 1   id                                 103904 non-null  int64  
 2   Gender                             103904 non-null  object 
 3   Customer Type                      103904 non-null  object 
 4   Age                                103904 non-null  int64  
 5   Type of Travel                     103904 non-null  object 
 6   Class                              103904 non-null  object 
 7   Flight Distance                    103904 non-null  int64  
 8   Inflight wifi service              103904 non-null  int64  
 9   Departure/Arrival time convenient  103904 non-null  int64  
 10  Ease of Online booking             103904 non-null  int64  
 11  Gate location                      103904 non-null  int64  
 12  Food and drink                     103904 non-null  int64  
 13  Online boarding                    103904 non-null  int64  
 14  Seat comfort                       103904 non-null  int64  
 15  Inflight entertainment             103904 non-null  int64  
 16  On-board service                   103904 non-null  int64  
 17  Leg room service                   103904 non-null  int64  
 18  Baggage handling                   103904 non-null  int64  
 19  Checkin service                    103904 non-null  int64  
 20  Inflight service                   103904 non-null  int64  
 21  Cleanliness                        103904 non-null  int64  
 22  Departure Delay in Minutes         103904 non-null  int64  
 23  Arrival Delay in Minutes           103594 non-null  float64
 24  satisfaction                       103904 non-null  object 
dtypes: float64(1), int64(19), object(5)
memory usage: 19.8+ MB


# Summary statistics of the numeric columns, rounded to one decimal place.
df.describe().round(1)


# List the columns stored with 'object' dtype (the string-valued columns).
df.dtypes[df.dtypes == 'object']

Gender            object
Customer Type     object
Type of Travel    object
Class             object
satisfaction      object
dtype: object


# Number of rows per distinct value of 'Gender'.
print(df['Gender'].value_counts())

Female    52727
Male      51177
Name: Gender, dtype: int64


# Number of rows per distinct value of 'Customer Type'.
print(df['Customer Type'].value_counts())

Loyal Customer       84923
disloyal Customer    18981
Name: Customer Type, dtype: int64


# Number of rows per distinct value of 'Type of Travel'.
print(df['Type of Travel'].value_counts())

Business travel    71655
Personal Travel    32249
Name: Type of Travel, dtype: int64


# Number of rows per distinct value of 'Class'.
print(df['Class'].value_counts())

Business    49665
Eco         46745
Eco Plus     7494
Name: Class, dtype: int64


# Number of rows per distinct value of 'Type of Travel'.
# NOTE(review): this repeats the 'Type of Travel' count already printed earlier in the
# file; a different column (e.g. 'satisfaction') may have been intended — confirm.
print(df['Type of Travel'].value_counts())

Business travel    71655
Personal Travel    32249
Name: Type of Travel, dtype: int64


# Categorical (string) columns that get one-hot encoded in both data sets.
cols = ['Gender', 'Type of Travel', 'Class', 'Customer Type']
# Keep the original string columns around so later plots get readable labels.
classForGraphing = df['Class']
genderForGraphing = df['Gender']

for col in cols:
    # Replace the training column with its dummy (one-hot) columns.
    catCol = pd.get_dummies(df[col], prefix=col)
    df = pd.concat([df.drop(col, axis=1), catCol], axis=1)
    # Encode the test column too, then align it to the TRAINING dummy columns so
    # both frames always expose the same feature set: a category missing from
    # the test split becomes an all-zero column, and a category never seen in
    # training is dropped (the models could not use it anyway).
    catCol2 = pd.get_dummies(dfTest[col], prefix=col).reindex(
        columns=catCol.columns, fill_value=0)
    dfTest = pd.concat([dfTest.drop(col, axis=1), catCol2], axis=1)


# Keep an untouched copy of the string labels so later plots are labelled properly.
satisfactionForGraphing = df['satisfaction'].copy()
# Show the label distribution before and after encoding.
# After encoding: 0 = neutral/dissatisfied, 1 = satisfied.
print(df['satisfaction'].value_counts())
satisfactionAsCategory = df['satisfaction'].astype('category')
df['satisfaction'] = satisfactionAsCategory.cat.codes
print(df['satisfaction'].value_counts())

neutral or dissatisfied    58879
satisfied                  45025
Name: satisfaction, dtype: int64
0    58879
1    45025
Name: satisfaction, dtype: int64


# Re-inspect the training frame after encoding the categorical columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 30 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Unnamed: 0                         103904 non-null  int64  
 1   id                                 103904 non-null  int64  
 2   Age                                103904 non-null  int64  
 3   Flight Distance                    103904 non-null  int64  
 4   Inflight wifi service              103904 non-null  int64  
 5   Departure/Arrival time convenient  103904 non-null  int64  
 6   Ease of Online booking             103904 non-null  int64  
 7   Gate location                      103904 non-null  int64  
 8   Food and drink                     103904 non-null  int64  
 9   Online boarding                    103904 non-null  int64  
 10  Seat comfort                       103904 non-null  int64  
 11  Inflight entertainment             103904 non-null  int64  
 12  On-board service                   103904 non-null  int64  
 13  Leg room service                   103904 non-null  int64  
 14  Baggage handling                   103904 non-null  int64  
 15  Checkin service                    103904 non-null  int64  
 16  Inflight service                   103904 non-null  int64  
 17  Cleanliness                        103904 non-null  int64  
 18  Departure Delay in Minutes         103904 non-null  int64  
 19  Arrival Delay in Minutes           103594 non-null  float64
 20  satisfaction                       103904 non-null  int8   
 21  Gender_Female                      103904 non-null  uint8  
 22  Gender_Male                        103904 non-null  uint8  
 23  Type of Travel_Business travel     103904 non-null  uint8  
 24  Type of Travel_Personal Travel     103904 non-null  uint8  
 25  Class_Business                     103904 non-null  uint8  
 26  Class_Eco                          103904 non-null  uint8  
 27  Class_Eco Plus                     103904 non-null  uint8  
 28  Customer Type_Loyal Customer       103904 non-null  uint8  
 29  Customer Type_disloyal Customer    103904 non-null  uint8  
dtypes: float64(1), int64(19), int8(1), uint8(9)
memory usage: 16.8 MB


# Inspect the test frame after encoding; its 'satisfaction' column has not been encoded yet.
dfTest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25976 entries, 0 to 25975
Data columns (total 30 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Unnamed: 0                         25976 non-null  int64  
 1   id                                 25976 non-null  int64  
 2   Age                                25976 non-null  int64  
 3   Flight Distance                    25976 non-null  int64  
 4   Inflight wifi service              25976 non-null  int64  
 5   Departure/Arrival time convenient  25976 non-null  int64  
 6   Ease of Online booking             25976 non-null  int64  
 7   Gate location                      25976 non-null  int64  
 8   Food and drink                     25976 non-null  int64  
 9   Online boarding                    25976 non-null  int64  
 10  Seat comfort                       25976 non-null  int64  
 11  Inflight entertainment             25976 non-null  int64  
 12  On-board service                   25976 non-null  int64  
 13  Leg room service                   25976 non-null  int64  
 14  Baggage handling                   25976 non-null  int64  
 15  Checkin service                    25976 non-null  int64  
 16  Inflight service                   25976 non-null  int64  
 17  Cleanliness                        25976 non-null  int64  
 18  Departure Delay in Minutes         25976 non-null  int64  
 19  Arrival Delay in Minutes           25893 non-null  float64
 20  satisfaction                       25976 non-null  object 
 21  Gender_Female                      25976 non-null  uint8  
 22  Gender_Male                        25976 non-null  uint8  
 23  Type of Travel_Business travel     25976 non-null  uint8  
 24  Type of Travel_Personal Travel     25976 non-null  uint8  
 25  Class_Business                     25976 non-null  uint8  
 26  Class_Eco                          25976 non-null  uint8  
 27  Class_Eco Plus                     25976 non-null  uint8  
 28  Customer Type_Loyal Customer       25976 non-null  uint8  
 29  Customer Type_disloyal Customer    25976 non-null  uint8  
dtypes: float64(1), int64(19), object(1), uint8(9)
memory usage: 4.4+ MB


# Per-column NaN counts for the training data; the grand total is printed first
# and the per-column breakdown is displayed as the cell result.
trainNaNCounts = df.isna().sum()
print(f'There are {trainNaNCounts.sum()} NaN values in df.')
trainNaNCounts

There are 310 NaN values in df.

Unnamed: 0                             0
id                                     0
Age                                    0
Flight Distance                        0
Inflight wifi service                  0
Departure/Arrival time convenient      0
Ease of Online booking                 0
Gate location                          0
Food and drink                         0
Online boarding                        0
Seat comfort                           0
Inflight entertainment                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Inflight service                       0
Cleanliness                            0
Departure Delay in Minutes             0
Arrival Delay in Minutes             310
satisfaction                           0
Gender_Female                          0
Gender_Male                            0
Type of Travel_Business travel         0
Type of Travel_Personal Travel         0
Class_Business                         0
Class_Eco                              0
Class_Eco Plus                         0
Customer Type_Loyal Customer           0
Customer Type_disloyal Customer        0
dtype: int64


# Per-column NaN counts for the test data; the grand total is printed first
# and the per-column breakdown is displayed as the cell result.
testNaNCounts = dfTest.isna().sum()
print(f'There are {testNaNCounts.sum()} NaN values in our test df.')
testNaNCounts

There are 83 NaN values in our test df.

Unnamed: 0                            0
id                                    0
Age                                   0
Flight Distance                       0
Inflight wifi service                 0
Departure/Arrival time convenient     0
Ease of Online booking                0
Gate location                         0
Food and drink                        0
Online boarding                       0
Seat comfort                          0
Inflight entertainment                0
On-board service                      0
Leg room service                      0
Baggage handling                      0
Checkin service                       0
Inflight service                      0
Cleanliness                           0
Departure Delay in Minutes            0
Arrival Delay in Minutes             83
satisfaction                          0
Gender_Female                         0
Gender_Male                           0
Type of Travel_Business travel        0
Type of Travel_Personal Travel        0
Class_Business                        0
Class_Eco                             0
Class_Eco Plus                        0
Customer Type_Loyal Customer          0
Customer Type_disloyal Customer       0
dtype: int64


# Impute missing arrival delays with the mean of the TRAINING column.
# The same training statistic is used for both splits: computing a separate
# mean from the test data would leak test information into preprocessing and
# would treat the two splits inconsistently.
average_delay = df['Arrival Delay in Minutes'].mean()
# Kept for backward compatibility with any code that reads this name; it now
# deliberately equals the training mean.
average_delay_test = average_delay
# Assign the result back instead of calling fillna(inplace=True) on a selected
# column, which is a chained in-place update and is not guaranteed to modify
# the parent DataFrame (it does not under pandas Copy-on-Write).
df['Arrival Delay in Minutes'] = df['Arrival Delay in Minutes'].fillna(average_delay)
dfTest['Arrival Delay in Minutes'] = dfTest['Arrival Delay in Minutes'].fillna(average_delay_test)


# Re-count NaNs in both frames to confirm the imputation left none behind.
print(f'There are {df.isna().sum().sum()} NaN values in df.')
print(f'There are {dfTest.isna().sum().sum()} NaN values in our test df.')

There are 0 NaN values in df.
There are 0 NaN values in our test df.


# Pairwise correlation of every column (still including 'id' and 'Unnamed: 0'),
# drawn as a heatmap with the column names on both axes.
corrDfOld = df.corr()
sns.heatmap(corrDfOld, xticklabels=corrDfOld.columns, yticklabels=corrDfOld.columns)

<AxesSubplot:>


# Same heatmap, but computed on a copy of the data without the two
# identifier-like columns ('id' and 'Unnamed: 0'); `df` itself is left untouched.
corrDf = df.drop(['id', 'Unnamed: 0'], axis=1)
correlation = corrDf.corr()
axisLabels = correlation.columns
sns.heatmap(correlation, xticklabels=axisLabels, yticklabels=axisLabels)

<AxesSubplot:>


# Remove the identifier-like columns from both data sets, in place.
toDropCols = ['id', 'Unnamed: 0']
for frame in (df, dfTest):
    frame.drop(toDropCols, axis=1, inplace=True)


# Inspect the training frame once more after imputing NaNs and dropping the id columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 28 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Age                                103904 non-null  int64  
 1   Flight Distance                    103904 non-null  int64  
 2   Inflight wifi service              103904 non-null  int64  
 3   Departure/Arrival time convenient  103904 non-null  int64  
 4   Ease of Online booking             103904 non-null  int64  
 5   Gate location                      103904 non-null  int64  
 6   Food and drink                     103904 non-null  int64  
 7   Online boarding                    103904 non-null  int64  
 8   Seat comfort                       103904 non-null  int64  
 9   Inflight entertainment             103904 non-null  int64  
 10  On-board service                   103904 non-null  int64  
 11  Leg room service                   103904 non-null  int64  
 12  Baggage handling                   103904 non-null  int64  
 13  Checkin service                    103904 non-null  int64  
 14  Inflight service                   103904 non-null  int64  
 15  Cleanliness                        103904 non-null  int64  
 16  Departure Delay in Minutes         103904 non-null  int64  
 17  Arrival Delay in Minutes           103904 non-null  float64
 18  satisfaction                       103904 non-null  int8   
 19  Gender_Female                      103904 non-null  uint8  
 20  Gender_Male                        103904 non-null  uint8  
 21  Type of Travel_Business travel     103904 non-null  uint8  
 22  Type of Travel_Personal Travel     103904 non-null  uint8  
 23  Class_Business                     103904 non-null  uint8  
 24  Class_Eco                          103904 non-null  uint8  
 25  Class_Eco Plus                     103904 non-null  uint8  
 26  Customer Type_Loyal Customer       103904 non-null  uint8  
 27  Customer Type_disloyal Customer    103904 non-null  uint8  
dtypes: float64(1), int64(17), int8(1), uint8(9)
memory usage: 15.3 MB


# Count of passengers per on-board service rating, split by satisfaction label.
onboardAx = sns.countplot(x=df['On-board service'], hue=satisfactionForGraphing)
onboardAx.set_title('Satisfaction based on Ratings for On-board Service')
plt.show()


# Count of passengers per travel class (original string labels), split by satisfaction label.
classAx = sns.countplot(x=classForGraphing, hue=satisfactionForGraphing)
classAx.set_title('Satisfaction based on class')
plt.show()


# Passenger's satisfaction based off of their gender, to see whether the two
# groups show near-equal distributions.
sns.countplot(x=genderForGraphing, hue=satisfactionForGraphing)
# The title previously said "class" (copied from the class plot); this plot is by gender.
plt.title('Satisfaction based on gender')
plt.show()


# Passenger's Satisfaction based off of Seat Comfort Ratings
# x is the numeric rating; y is the string satisfaction label saved before encoding.
sns.boxplot(x=df['Seat comfort'], y=satisfactionForGraphing)
plt.title('Satisfaction based on Seat Comfort')
# NOTE(review): the describe() output for this column shows a minimum of 0, so the
# "(1-5)" range in the label may be inaccurate — confirm.
plt.xlabel('Seat Comfort Rating (1-5)')
plt.ylabel('Satisfaction')

Text(0, 0.5, 'Satisfaction')


# Passenger's Satisfaction based off of Cleanliness Ratings
# x and y are passed as explicit Series (data=df is supplied as well);
# y is the string satisfaction label saved before encoding.
sns.boxplot(data=df, x=df['Cleanliness'], y=satisfactionForGraphing)
# NOTE(review): the describe() output for this column shows a minimum of 0, so the
# "(1-5)" range in the label may be inaccurate — confirm.
plt.xlabel('Cleanliness Rating (1-5)')
plt.ylabel('Satisfaction')
plt.title('Satisfaction based on Cleanliness')

Text(0.5, 1.0, 'Satisfaction based on Cleanliness')


# Passenger's Satisfaction based off of Inflight Entertainment
# The x column is looked up by name in `df`; y is the string satisfaction label
# saved before encoding.
sns.boxplot(x='Inflight entertainment', y=satisfactionForGraphing, data=df)
plt.title('Satisfaction by Inflight Entertainment')
plt.xlabel('Inflight Entertainment')
plt.ylabel('Satisfaction')
plt.show()


# Number of passengers per online-boarding rating, split by satisfaction label.
sns.countplot(x='Online boarding', hue=satisfactionForGraphing, data=df)
plt.title('Satisfaction Rating against Online Boarding')
plt.xlabel('Online Boarding Rating')
# A countplot's y-axis is the number of observations per bar, not a satisfaction
# value, so it is labelled as a count.
plt.ylabel('Count')
plt.show()


# Passenger's Satisfaction based off of Inflight Service
# The x column is looked up by name in `df`; y is the string satisfaction label
# saved before encoding.
sns.boxplot(x='Inflight service', y=satisfactionForGraphing, data=df)
plt.title('Satisfaction by Inflight Service')
plt.xlabel('Inflight Service')
plt.ylabel('Satisfaction')
plt.show()


# Number of passengers per food-and-drink rating, split by satisfaction label.
sns.countplot(x='Food and drink', hue=satisfactionForGraphing, data=df)
plt.title('Satisfaction Rating against Food & Drink')
plt.xlabel('Food and Drink')
# A countplot's y-axis is the number of observations per bar, not a satisfaction
# value, so it is labelled as a count.
plt.ylabel('Count')
plt.show()


# Training labels: the already-encoded 0/1 'satisfaction' column as a NumPy array.
y_train = df['satisfaction'].values
# Test labels: the test frame's 'satisfaction' column is still strings here, so it
# is encoded via category codes (this yields a pandas Series, not an array).
# NOTE(review): these codes are derived independently of the training encoding; the
# two agree only if both splits contain the same set of label values — confirm.
y_test = dfTest['satisfaction'].astype('category').cat.codes


def x_train_test_split(predictors):
    """Return standardized train and test feature matrices for *predictors*.

    Reads the module-level ``df`` (training data) and ``dfTest`` (test data).

    Args:
        predictors: Column labels to select from both frames.

    Returns:
        A tuple ``(xPred, testPred)`` of NumPy arrays: the scaled training
        features and the scaled test features, in that order.
    """
    scaler = StandardScaler()
    # Learn the mean and standard deviation from the training data only ...
    xPred = scaler.fit_transform(df[predictors].values)
    # ... and reuse them for the test data. Re-fitting on the test set would
    # scale the two splits with different statistics and leak test information.
    testPred = scaler.transform(dfTest[predictors].values)
    return (xPred, testPred)


# Baseline kNN (default hyper-parameters) on a small hand-picked set of service ratings.
predictors = ['Inflight entertainment', 'On-board service', 'Leg room service','Baggage handling']
X_train, X_test = x_train_test_split(predictors)

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# Accuracy = share of test rows whose predicted label equals the true label.
predictions = knn.predict(X_test)
accuracy = (predictions == y_test).mean()
# Report the real number of predictors; the message used to hard-code "5" while
# the list above contains four features.
print(f'kNN {len(predictors)}-feature accuracy: {accuracy.round(2)}')

c:\Users\warre\anaconda3\lib\site-packages\sklearn\neighbors\_classification.py:228: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)

kNN 5-feature accuracy: 0.73


# kNN on every column except the target.
dfCopy = df.drop('satisfaction', axis=1)
predictors = dfCopy.columns
X_train, X_test = x_train_test_split(predictors)

# Run times noted by the authors for the neighbour-search algorithms:
# 'brute' about 25s, 'kd_tree' about 40s, 'ball_tree' over a minute.
knn = KNeighborsClassifier(algorithm='brute')
knn.fit(X_train, y_train)

# Accuracy = share of test rows whose predicted label equals the true label.
predictions = knn.predict(X_test)
isCorrect = predictions == y_test
accuracy = isCorrect.mean()
print(f'kNN all-feature accuracy: {accuracy.round(2)}')

c:\Users\warre\anaconda3\lib\site-packages\sklearn\neighbors\_classification.py:228: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)

kNN all-feature accuracy: 0.93


# Decision tree on every column except the target.
dfCopy = df.drop('satisfaction', axis=1)
predictors = dfCopy.columns
X_train, X_test = x_train_test_split(predictors)

# Accuracies noted by the authors: max_depth=2 gave 0.86, max_depth=4 gave 0.89.
# Depth 4 was kept as close enough to the kNN accuracy while keeping the speed of trees.
tree = DecisionTreeClassifier(max_depth=4)
tree.fit(X_train, y_train)

# Accuracy = share of test rows whose predicted label equals the true label.
predictions = tree.predict(X_test)
isCorrect = predictions == y_test
accuracy = isCorrect.mean()
print(f'Decision Tree all-feature accuracy: {accuracy.round(2)}')

Decision Tree all-feature accuracy: 0.89


# Compute a learning curve for a decision tree at every max_depth from 1 to k.
# For each depth, learning_curve runs 10-fold cross-validation on the training
# data at several training-set sizes and returns accuracy SCORES (not errors);
# the per-size mean across folds is stored for plotting later.
k = 10
overallTE = []  # mean cross-validated test accuracy per training size, one entry per depth
overallTR = []  # mean training accuracy per training size, one entry per depth
dfCopy = df.copy()
dfCopy.drop('satisfaction', inplace=True, axis=1)
predictors = dfCopy.columns
X_train, X_test = x_train_test_split(predictors)

for depth in range(1, k + 1):
    depthTree = DecisionTreeClassifier(max_depth=depth)
    # The earlier hand-written loop that refit the model on growing subsets was
    # removed: its results were overwritten by this call before ever being used,
    # so it only added run time.
    tr_sizes, tr_scores, te_scores = learning_curve(
        depthTree, X_train, y_train, cv=10, scoring='accuracy')
    overallTR.append(np.mean(tr_scores, axis=1))
    overallTE.append(np.mean(te_scores, axis=1))


# Plot the stored learning curves, one colour per max_depth.
# Dash-dotted lines are the cross-validated test scores; the solid,
# semi-transparent lines are the training scores for the same depth.
color = ['red', 'green', 'blue', 'orange', 'purple', 'gold', 'violet', 'maroon', 'pink', 'lightblue']
# enumerate(..., start=1) yields the depth label directly, replacing the manual
# counter and index-based lookups.
for k, (teScores, trScores) in enumerate(zip(overallTE, overallTR), start=1):
    plt.plot(tr_sizes, teScores, label=f'test {k}', color=color[k - 1], ls='dashdot')
    plt.plot(tr_sizes, trScores, label=f'train {k}', color=color[k - 1], linewidth=1.5, alpha=0.5)
plt.legend(loc='right', prop={'size': 8})
plt.xlabel('Training Sizes')
plt.ylabel('Accuracy')
plt.title('Learning Curve of Decision Tree w/ Different Max Depths')
plt.show()


# Grid-search two decision-tree hyper-parameters with 10-fold cross-validation
# on the training data; max_depth stays fixed at 4.
dfCopy = df.drop('satisfaction', axis=1)
predictors = dfCopy.columns
X_train, X_test = x_train_test_split(predictors)

parameters = [{'min_samples_leaf': [0.1, 0.2, 0.3], 'max_leaf_nodes': [4, 8, 16]}]
tree = DecisionTreeClassifier(max_depth=4)
test = GridSearchCV(estimator=tree, param_grid=parameters, scoring='accuracy', cv=10)
test.fit(X_train, y_train)

# Report the best mean cross-validated accuracy and the parameters that produced it.
print(f'Our best score was: {test.best_score_} and the best params were {test.best_params_}.')

Our best score was: 0.8435478840845556 and the best params were {'max_leaf_nodes': 4, 'min_samples_leaf': 0.1}.


# Find the single feature that gives the best cross-validated accuracy on its own.
dfCopy = df.drop(columns='satisfaction')
predictors = dfCopy.columns
# The provided train/test files are not used here; instead the training frame is
# split 80/20 and only the 80% part is used for cross-validation.
X_train_feat, X_test_feat, y_train_feat, y_test_feat = train_test_split(dfCopy, df['satisfaction'], test_size=0.2, random_state=42)

colName = ''  # name of the best feature so far (a string, not a list)
currentAccuracy = 0
for col in predictors:
    X_train_1 = X_train_feat[[col]]
    scores = cross_val_score(DecisionTreeClassifier(random_state = 42), X_train_1, y_train_feat, scoring='accuracy', cv=5)
    accuracy = scores.mean()
    # A strict '>' keeps the first feature when two features tie.
    if (accuracy > currentAccuracy):
        currentAccuracy = accuracy
        colName = col
# The accuracy is a fraction in [0, 1]; '{:.2%}' converts it to a real percentage
# (the old '{:.2f}%' printed e.g. "0.79%").
print('Best Feature: {}, Best Accuracy: {:.2%}'.format(colName, currentAccuracy))

Best Feature: Online boarding, Best Accuracy: 0.79%


# Greedy forward feature selection: starting from no features, repeatedly add
# the one feature that gives the best 5-fold cross-validated accuracy together
# with the features already chosen, until n features are selected.
dfCopy = df.drop(columns='satisfaction')
predictors = dfCopy.columns
# As in the single-feature search, the training frame is split 80/20 here rather
# than using the provided test file; only the 80% part is used.
X_train_feat, X_test_feat, y_train_feat, y_test_feat = train_test_split(dfCopy, df['satisfaction'], test_size=0.2, random_state=42)

remaining = list(predictors)
selected = []
n = 10

while len(selected) < n:
    currentAccuracy, colName = 0, ''
    for feature in remaining:
        candidateCols = selected + [feature]
        scores = cross_val_score(
            DecisionTreeClassifier(random_state = 42),
            X_train_feat[candidateCols],
            y_train_feat,
            scoring='accuracy',
            cv=5,
        )
        accuracy = scores.mean()
        # A strict '>' keeps the earliest candidate when two candidates tie.
        if accuracy > currentAccuracy:
            currentAccuracy, colName = accuracy, feature

    # Move this round's winner from the candidate pool to the selected list.
    selected.append(colName)
    remaining.remove(colName)
    print('Feature: {}, Accuracy: {:.2f}'.format(colName, currentAccuracy))

Feature: Online boarding, Accuracy: 0.79
Feature: Type of Travel_Business travel, Accuracy: 0.85
Feature: Inflight wifi service, Accuracy: 0.89
Feature: Gate location, Accuracy: 0.92
Feature: Baggage handling, Accuracy: 0.93
Feature: Customer Type_disloyal Customer, Accuracy: 0.94
Feature: Class_Business, Accuracy: 0.95
Feature: Inflight service, Accuracy: 0.95
Feature: Seat comfort, Accuracy: 0.95
Feature: Customer Type_Loyal Customer, Accuracy: 0.95


# Final model: a depth-4 decision tree on the first five features picked by the
# forward selection, trained on the training file and scored on the test file.
predictors = ['Online boarding', 'Type of Travel_Business travel', 'Inflight wifi service', 'Gate location', 'Baggage handling']
X_train, X_test = x_train_test_split(predictors)

tree = DecisionTreeClassifier(max_depth=4, random_state=42)
tree.fit(X_train, y_train)

# Accuracy = share of test rows whose predicted label equals the true label.
predictions = tree.predict(X_test)
accuracy = (predictions == y_test).mean()
# The accuracy is a fraction in [0, 1]; ':.2%' prints it as a real percentage
# (the old message appended '%' to the raw fraction, e.g. "0.88%").
print(f'Final Decision Tree Accuracy: {accuracy:.2%}')

Final Decision Tree Accuracy: 0.88%


# Feature values for a single, made-up demo customer.
# NOTE(review): both 'Type of Travel_*' flags are 0, which is not a valid one-hot
# encoding, and 'Baggage handling' is 0 although the describe() output shows a
# minimum of 1 for that column — confirm these values are intended.
demoCustomer = {
    'Gender_Female': 0, 'Gender_Male': 1,
    'Type of Travel_Business travel': 0, 'Type of Travel_Personal Travel': 0,
    'Class_Business': 0, 'Class_Eco': 1, 'Class_Eco Plus': 0,
    'Customer Type_Loyal Customer': 1, 'Customer Type_disloyal Customer': 0,
    'Age': 35, 'Flight Distance': 1000, 'Inflight wifi service': 1,
    'Departure/Arrival time convenient': 5, 'Ease of Online booking': 4,
    'Gate location': 1, 'Food and drink': 4, 'Online boarding': 1,
    'Seat comfort': 3, 'Inflight entertainment': 4, 'On-board service': 4,
    'Leg room service': 3, 'Baggage handling': 0, 'Checkin service': 5,
    'Inflight service': 5, 'Cleanliness': 4,
    'Departure Delay in Minutes': 1000, 'Arrival Delay in Minutes': 1000,
}

# One-row DataFrame holding the demo customer.
custDf = pd.DataFrame(demoCustomer, index=[0])
# The tree was trained on standardized features, so the demo row must go through
# the same scaling (fitted on the training data) before predicting; feeding raw
# values would compare unscaled numbers against thresholds learned on scaled ones.
demoScaler = StandardScaler().fit(df[predictors].values)
prediction = tree.predict(demoScaler.transform(custDf[predictors].values))
# Map the numeric class back to its label (1 = satisfied, otherwise neutral/dissatisfied).
satisfaction = 'satisfied' if prediction[0] == 1 else 'neutral/dissatisfied'

print(f'The prediction for our demo customer is: {satisfaction}')

The prediction for our demo customer is: satisfied


# Confusion matrix for the final model; `predictions` still holds its test-set output.
confusion = confusion_matrix(y_test, predictions)
# Rebuild both label sequences as plain lists of 0/1 ints
# (0 = neutral/dissatisfied, 1 = satisfied).
predictions = [int(p != 0) for p in predictions]
y_test = [int(y != 0) for y in y_test]
# Flag each prediction as 1 (matches the true label) or 0 (does not), then average.
correct_predictions = [int(p == t) for p, t in zip(predictions, y_test)]
accuracy = sum(correct_predictions) / len(correct_predictions)
# Draw the matrix as a lightly shaded grid and write each count into its cell.
fig, ax = plt.subplots(figsize=(6, 6))
ax.matshow(confusion, cmap=plt.cm.Blues, alpha=0.3)
nRows, nCols = confusion.shape
for row in range(nRows):
    for column in range(nCols):
        ax.text(x=column, y=row, s=confusion[row, column], va='center', ha='center')
# Tick positions are given as [1, 0], so position 1 is labelled 'Satisfied' and
# position 0 'Neutral/Dissatisfied'.
tick_labels = ['Satisfied', 'Neutral/Dissatisfied']
tickPositions = [1, 0]
ax.set_xticks(tickPositions)
ax.set_yticks(tickPositions)
ax.set_xticklabels(tick_labels)
ax.set_yticklabels(tick_labels)
# Columns are predicted classes, rows are actual classes.
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.show()

	Unnamed: 0	id	Age	Flight Distance	Inflight wifi service	Departure/Arrival time convenient	Ease of Online booking	Gate location	Food and drink	Online boarding	Seat comfort	Inflight entertainment	On-board service	Leg room service	Baggage handling	Checkin service	Inflight service	Cleanliness	Departure Delay in Minutes	Arrival Delay in Minutes
count	103904.0	103904.0	103904.0	103904.0	103904.0	103904.0	103904.0	103904.0	103904.0	103904.0	103904.0	103904.0	103904.0	103904.0	103904.0	103904.0	103904.0	103904.0	103904.0	103594.0
mean	51951.5	64924.2	39.4	1189.4	2.7	3.1	2.8	3.0	3.2	3.3	3.4	3.4	3.4	3.4	3.6	3.3	3.6	3.3	14.8	15.2
std	29994.6	37463.8	15.1	997.1	1.3	1.5	1.4	1.3	1.3	1.3	1.3	1.3	1.3	1.3	1.2	1.3	1.2	1.3	38.2	38.7
min	0.0	1.0	7.0	31.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0
25%	25975.8	32533.8	27.0	414.0	2.0	2.0	2.0	2.0	2.0	2.0	2.0	2.0	2.0	2.0	3.0	3.0	3.0	2.0	0.0	0.0
50%	51951.5	64856.5	40.0	843.0	3.0	3.0	3.0	3.0	3.0	3.0	4.0	4.0	4.0	4.0	4.0	3.0	4.0	3.0	0.0	0.0
75%	77927.2	97368.2	51.0	1743.0	4.0	4.0	4.0	4.0	4.0	4.0	5.0	4.0	4.0	4.0	5.0	4.0	5.0	4.0	12.0	13.0
max	103903.0	129880.0	85.0	4983.0	5.0	5.0	5.0	5.0	5.0	5.0	5.0	5.0	5.0	5.0	5.0	5.0	5.0	5.0	1592.0	1584.0

CST 383 - Final Project

Table of Contents:

Intro

Column Contents

Imports

Data Investigation and Preprocessing

Data Encoding

NaN Processing

Correlation Heatmap & Column Dropping

Data Exploration

Visualizations

Training

Conclusions