import pandas as pd
import numpy as np
from collections import Counter
from itertools import groupby
from operator import itemgetter
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from numpy.linalg import norm
from abc import ABC, abstractmethod
import time
from sklearn.preprocessing import PolynomialFeatures
from scipy.stats import multivariate_normal as mv_norm
class DataFetcher:
"""
DataFetcher: Grabs all csv data as pandas dataframes.
"""
def __init__(self, directory, X_name, y_name):
self.directory = directory
self.X_name = X_name
self.y_name = y_name
def _get_training_X_path(self, subset_num):
# 0 <= subset_num <= 9
return './%s/train%s%d.csv' % (self.directory, self.X_name, (subset_num + 1))
def _get_training_y_path(self, subset_num):
# 0 <= subset_num <= 9
return './%s/train%s%d.csv' % (self.directory, self.y_name, (subset_num + 1))
def get_all_training_X_y(self):
training_X_dfs = []
training_y_dfs = []
for i in range(NUM_SUBSETS):
X_path = self._get_training_X_path(i)
y_path = self._get_training_y_path(i)
X_df = pd.read_csv(X_path, header=None)
y_df = pd.read_csv(y_path, header=None)
training_X_dfs.append(X_df)
training_y_dfs.append(y_df)
return training_X_dfs, training_y_dfs
def get_test_X_y(self):
test_X_path = './%s/test%s.csv' % (self.directory, self.X_name)
test_y_path = './%s/test%s.csv' % (self.directory, self.y_name)
test_X_df = pd.read_csv(test_X_path, header=None)
test_y_df = pd.read_csv(test_y_path, header=None)
return test_X_df, test_y_df
class CrossValidationData:
"""
CrossValidationData: Splits list of training dataframes
into validation & training dataframes.
"""
def __init__(self, X_dfs, y_dfs):
assert(len(X_dfs) == len(y_dfs))
self.X = X_dfs
self.y = y_dfs
self.num_subsets = len(X_dfs)
def _split_training_validation(self, dfs, subset_num):
validation_df = dfs[subset_num]
training_dfs = dfs[:subset_num] + dfs[subset_num+1:]
training_df = pd.concat(training_dfs, ignore_index=True)
return [training_df, validation_df]
def _get_training_validation_X(self, subset_num):
assert(subset_num < self.num_subsets)
[training_X, validation_X] = self._split_training_validation(self.X, subset_num)
return [training_X, validation_X]
def _get_training_validation_y(self, subset_num):
assert(subset_num < self.num_subsets)
[training_y, validation_y] = self._split_training_validation(self.y, subset_num)
return [training_y, validation_y]
def get_training_validation_X_y(self, subset_num):
[training_X, validation_X] = self._get_training_validation_X(subset_num)
[training_y, validation_y] = self._get_training_validation_y(subset_num)
return [training_X, training_y, validation_X, validation_y]
def get_all_X_y(self):
training_X = pd.concat(self.X, ignore_index=True)
training_y = pd.concat(self.y, ignore_index=True)
return [training_X, training_y]
def get_accuracy(true_labels, predicted_labels):
assert(len(true_labels) == len(predicted_labels))
return sum(1 for y, y_hat in zip(true_labels, predicted_labels) if y == y_hat ) / len(true_labels)
class Model(ABC):
"""
Model -> Abstract base class for the models we will implement, namely
KNN and RidgeRegression.
"""
def __init__(self, train_X, train_y):
self.train_X = train_X
self.train_y = train_y
@abstractmethod
def predict(self, x):
pass
def predict_df(self, X_df):
predictions = X_df.apply(lambda row: self.predict(row), raw=True, axis=1)
return predictions
DIR_NAME = 'nonlinear-regression-dataset'
X_NAME = 'Input'
Y_NAME = 'Target'
NUM_SUBSETS = 10
SIGMAS = [1,2,3,4,5,6]
HIDDEN_NODES = [1,2,3,4,5,6,7,8,9,10]
DF = DataFetcher(DIR_NAME, X_NAME, Y_NAME)
test_X, test_y = DF.get_test_X_y()
test_X.head()
test_y.head()
training_X_dfs, training_y_dfs = DF.get_all_training_X_y()
CVData = CrossValidationData(training_X_dfs, training_y_dfs)
print("Training X shape: " + str(CVData.get_training_validation_X_y(0)[0].shape))
print("Training y shape: " + str(CVData.get_training_validation_X_y(0)[1].shape))
print("Validation X shape: " + str(CVData.get_training_validation_X_y(0)[2].shape))
print("Validation y shape: " + str(CVData.get_training_validation_X_y(0)[3].shape))
class GeneralizedLinearRegression(Model):
def __init__(self, train_X, train_y, d, lmbda=0):
super().__init__(train_X, train_y)
X_original = train_X.to_numpy()
y = train_y.to_numpy()
# Convert to new space
poly = PolynomialFeatures(d)
X = poly.fit_transform(X_original)
# Get inverse of A
A = X.T.dot(X) + lmbda * np.identity(len(X.T))
A_inv = np.linalg.inv(A)
# Get b
b = X.T.dot(y)
# Get w
w = A_inv.dot(b)
# Persist fields
self.w = w
self.lmbda = lmbda
self.poly = poly
def predict(self, x):
x = self.poly.transform(x.reshape(1, -1)).flatten()
y = np.dot(self.w.T, x)[0]
return y
def get_mse_loss(true_values, predicted_values):
assert(len(true_values) == len(predicted_values))
l2_loss = np.mean(np.square(np.subtract(true_values,predicted_values)))
return l2_loss
def perform_GLRM_CV(d):
np.random.seed(42)
# Perform CV
losses = []
for j in range(NUM_SUBSETS):
train_X, train_y, validation_X, validation_y = CVData.get_training_validation_X_y(j)
# Create hypothesis
model = GeneralizedLinearRegression(train_X, train_y, d, lmbda=1)
predicted_validation_y = model.predict_df(validation_X)
# Get accuracy
loss = get_mse_loss(validation_y.values.flatten(), predicted_validation_y.values.flatten())
losses.append(loss)
avg_loss = np.mean(losses)
return avg_loss
t1 = time.time()
mse_loss_d1 = perform_GLRM_CV(1)
t2 = time.time()
mse_loss_d2 = perform_GLRM_CV(2)
t3 = time.time()
mse_loss_d3 = perform_GLRM_CV(3)
t4 = time.time()
mse_loss_d4 = perform_GLRM_CV(4)
t5 = time.time()
losses = [mse_loss_d1, mse_loss_d2, mse_loss_d3, mse_loss_d4]
plt.plot([1, 2, 3, 4], losses)
plt.xlabel('Degree')
plt.ylabel('Average L2 Loss')
plt.xticks([1,2,3,4])
plt.title('Cross Validation')
plt.show()
optimal_d = np.argmin(losses) + 1
print(optimal_d)
print("The optimal d is %d with an average mse loss of %f" % (optimal_d, losses[optimal_d - 1]))
training_X, training_y = CVData.get_all_X_y()
final_model = GeneralizedLinearRegression(training_X, training_y, optimal_d, lmbda=1)
predicted_test_y = final_model.predict_df(test_X)
test_loss = get_mse_loss(test_y.values.flatten(), predicted_test_y)
print("The loss on the test set is " + str(test_loss))
print("Cross Validation Run Time (s) for d = 1: " + str(t2-t1))
print("Cross Validation Run Time (s) for d = 2: " + str(t3-t2))
print("Cross Validation Run Time (s) for d = 3: " + str(t4-t3))
print("Cross Validation Run Time (s) for d = 4: " + str(t5-t4))
The time complexity of Generalized Linear Regression is cubic in the number of basis functions, $O(n^3)$.
This is because we invert the matrix $A$ in order to solve the system of linear equations defined by linear regression, and $A$ is an $n \times n$ matrix, where $n$ is the number of basis functions.
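In the notation of the code above, with $\Phi$ the matrix of inputs transformed by the polynomial basis, the system being solved is
$$(\Phi^\top \Phi + \lambda I)\, w = \Phi^\top y \quad\Longrightarrow\quad w = (\Phi^\top \Phi + \lambda I)^{-1}\, \Phi^\top y,$$
and inverting the $n \times n$ matrix $A = \Phi^\top \Phi + \lambda I$ is the $O(n^3)$ step.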
The run times above do not clearly reflect this complexity. This could be due to my environment and variance in processing power. The clocked time also includes prediction and the other cross-validation steps, not just training. We could get a better picture by timing only the training step; asymptotically this should make no difference, since training is much more expensive than the other operations.
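As a rough check, here is a minimal sketch (not part of the results above) that times only model construction, i.e. the matrix solve, for each degree:
# Illustrative sketch: time only the training (matrix solve) step for each degree.
train_X_all, train_y_all = CVData.get_all_X_y()
for d in [1, 2, 3, 4]:
    start = time.time()
    GeneralizedLinearRegression(train_X_all, train_y_all, d, lmbda=1)
    print("Training-only time (s) for d = %d: %f" % (d, time.time() - start))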
class BayesianLinearRegression(Model):
def __init__(self, train_X, train_y, d):
super().__init__(train_X, train_y)
X_original = train_X.to_numpy()
y = train_y.to_numpy()
# Convert to new space
poly = PolynomialFeatures(d)
X = poly.fit_transform(X_original)
# Prior Parameters
prior_mean = 0
prior_cov = np.identity(len(X.T))
# Likelihood Parameters
std_dev = 1
# Posterior Parameters
# A
A = ( std_dev ** -2 ) * X.T.dot(X) + np.linalg.inv(prior_cov)
A_inv = np.linalg.inv(A)
# W bar
w_bar = ( std_dev ** -2 ) * A_inv.dot(X.T.dot(y))
# Persist fields
self.w_bar = w_bar
self.poly = poly
self.X = X
self.A_inv = A_inv
def predict(self, x):
x = self.poly.transform(x.reshape(1, -1)).flatten()
y = np.dot(self.w_bar.T, x)[0]
return y
def perform_BLRM_CV(d):
np.random.seed(42)
# Perform CV
losses = []
for j in range(NUM_SUBSETS):
train_X, train_y, validation_X, validation_y = CVData.get_training_validation_X_y(j)
# Create hypothesis
model = BayesianLinearRegression(train_X, train_y, d)
predicted_validation_y = model.predict_df(validation_X)
# Get accuracy
loss = get_mse_loss(validation_y.values.flatten(), predicted_validation_y.values.flatten())
losses.append(loss)
avg_loss = np.mean(losses)
return avg_loss
t1 = time.time()
mse_loss_d1 = perform_BLRM_CV(1)
t2 = time.time()
mse_loss_d2 = perform_BLRM_CV(2)
t3 = time.time()
mse_loss_d3 = perform_BLRM_CV(3)
t4 = time.time()
mse_loss_d4 = perform_BLRM_CV(4)
t5 = time.time()
losses = [mse_loss_d1, mse_loss_d2, mse_loss_d3, mse_loss_d4]
plt.plot([1, 2, 3, 4], losses)
plt.xlabel('Degree')
plt.ylabel('Average L2 Loss')
plt.xticks([1,2,3,4])
plt.title('Cross Validation')
plt.show()
optimal_d = np.argmin(losses) + 1
print(optimal_d)
print("The optimal d is %d with an average mse loss of %f" % (optimal_d, losses[optimal_d - 1]))
training_X, training_y = CVData.get_all_X_y()
final_model = BayesianLinearRegression(training_X, training_y, optimal_d)
predicted_test_y = final_model.predict_df(test_X)
test_loss = get_mse_loss(test_y.values.flatten(), predicted_test_y)
print("The loss on the test set is " + str(test_loss))
print("Cross Validation Run Time (s) for d = 1: " + str(t2-t1))
print("Cross Validation Run Time (s) for d = 2: " + str(t3-t2))
print("Cross Validation Run Time (s) for d = 3: " + str(t4-t3))
print("Cross Validation Run Time (s) for d = 4: " + str(t5-t4))
The time complexity of Bayesian Linear Regression is cubic in the number of basis functions.
The run times above reflect this complexity: as the number of basis functions increases, we see a marked increase in the clock runtime.
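Concretely, the quantities computed in the class above (with zero prior mean, prior covariance $S_0 = I$, and noise standard deviation $\sigma = 1$) are
$$A = \sigma^{-2}\,\Phi^\top \Phi + S_0^{-1}, \qquad \bar{w} = \sigma^{-2}\, A^{-1} \Phi^\top y,$$
and inverting $A$, which is again $n \times n$ in the number of basis functions, is the $O(n^3)$ step.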
What are the similarities and differences between regularized generalized linear regression and Bayesian generalized linear regression?
The two methods are very similar in that they converge to the same result given a Bayesian prior that reflects the penalty term. The two methods are also similar in that they use basis functions to transform X from its input space to a new space where learning takes place.
The two methods differ in what they represent. Regularized generalized linear regression fits a single hyperplane (in the transformed space) to the points while penalizing the weights of that hyperplane. Bayesian generalized linear regression places a Gaussian prior over the weights, uses a Gaussian conditional distribution as the likelihood, and computes a posterior distribution over the weights to make predictions. Each component of the Bayesian model is very interpretable, can be visualized with contour plots (for up to two weights), and shows how observing data reduces our uncertainty over the space of hypotheses by shifting the posterior through the likelihood.
They also differ in how they fundamentally view statistics. Generalized linear regression takes a frequentist approach: we assume there is a single set of true weights and estimate it as well as we can, and that point estimate is all that is used to form predictions. In contrast, Bayesian linear regression maintains the entire space of weight hypotheses: we do not commit to a single $w$, but keep a probability distribution over all weights and use it to form predictions. In practice this means predictions use the expected value of $w$ under the posterior, rather than a single precomputed $w$.
It happens that when we use Gaussian distributions for the prior and the likelihood, the point prediction (the posterior mean of $w$ dotted with a new input) coincides with the prediction of regularized generalized linear regression for the corresponding choice of $\lambda$.
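As a quick sanity check of this equivalence for the settings used above ($\sigma = 1$, $S_0 = I$, $\lambda = 1$), the ridge weights and the Bayesian posterior mean should be numerically identical; here is an illustrative sketch reusing the classes defined earlier:
# Illustrative check: with lmbda = 1, sigma = 1, and S_0 = I, the ridge weights
# and the Bayesian posterior mean should coincide.
check_X, check_y = CVData.get_all_X_y()
ridge_model = GeneralizedLinearRegression(check_X, check_y, d=2, lmbda=1)
bayes_model = BayesianLinearRegression(check_X, check_y, d=2)
print(np.allclose(ridge_model.w, bayes_model.w_bar))  # expected: True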
def gram_matrix(X, kernel):
num_points = X.shape[0]
G = np.zeros((num_points, num_points))
for i in range(num_points):
for j in range(num_points):
G[i][j] = kernel(X[i], X[j])
return G
def identity_kernel(x, x_prime):
return x.dot(x_prime)
def gaussian_kernel(x, x_prime, sigma):
return np.exp(-(norm(x - x_prime)**2)/(2 * (sigma ** 2)))
def polynomial_kernel(x, x_prime, d):
return (x.dot(x_prime) + 1) ** d
class GaussianProcess(Model):
def __init__(self, train_X, train_y, kernel):
super().__init__(train_X, train_y)
X = train_X.to_numpy()
y = train_y.to_numpy()
std_dev = 1
# Gram Matrix
K = gram_matrix(X, kernel)
K_inv = np.linalg.inv(K + (std_dev ** 2) * np.identity(len(K)))  # (K + sigma^2 I)^{-1}
# Persist fields
self.X = X
self.y = y
self.K = K
self.K_inv = K_inv
self.kernel = kernel
def compute_kernel_dists(self, x_prime):
num_points = self.X.shape[0]
k = np.zeros(num_points)
for i in range(num_points):
k[i] = self.kernel(self.X[i], x_prime)
return k
def predict(self, x):
k = self.compute_kernel_dists(x)
y = k.dot(self.K_inv).dot(self.y)[0]
return y
training_X, training_y = CVData.get_all_X_y()
t1 = time.time()
identity_kernel_model = GaussianProcess(training_X, training_y, identity_kernel)
t2 = time.time()
predicted_test_y = identity_kernel_model.predict_df(test_X)
test_loss = get_mse_loss(test_y.values.flatten(), predicted_test_y)
print("The loss on the test set is " + str(test_loss))
print("Run Time (s): " + str(t2-t1))
def perform_GP_GK_CV():
np.random.seed(42)
average_losses = []
for sig in tqdm_notebook(SIGMAS):
# define kernel
def gaussian_kernel_fn(x, x_prime):
return gaussian_kernel(x, x_prime, sig)
# Perform CV
losses = []
for j in range(NUM_SUBSETS):
train_X, train_y, validation_X, validation_y = CVData.get_training_validation_X_y(j)
# Create hypothesis
model = GaussianProcess(train_X, train_y, gaussian_kernel_fn)
predicted_validation_y = model.predict_df(validation_X)
# Get accuracy
loss = get_mse_loss(validation_y.values.flatten(), predicted_validation_y.values.flatten())
losses.append(loss)
avg_loss = np.mean(losses)
average_losses.append(avg_loss)
return average_losses
average_losses = perform_GP_GK_CV()
optimal_sigma = SIGMAS[np.argmin(average_losses)]
print("The optimal sigma is %f with an average mse loss of %f" % (optimal_sigma, min(average_losses)))
plt.plot(SIGMAS, average_losses)
plt.xlabel('Sigma')
plt.ylabel('Average L2 Loss')
plt.title('Cross Validation')
plt.show()
training_X, training_y = CVData.get_all_X_y()
t1 = time.time()
final_model = GaussianProcess(training_X, training_y, lambda x, x_prime: gaussian_kernel(x, x_prime, optimal_sigma))
t2 = time.time()
predicted_test_y = final_model.predict_df(test_X)
test_loss = get_mse_loss(test_y.values.flatten(), predicted_test_y)
print("The loss on the test set is " + str(test_loss))
print("Run Time (s): " + str(t2-t1))
def perform_GP_PK_CV(d):
np.random.seed(42)
# define kernel
def polynomial_kernel_fn(x, x_prime):
return polynomial_kernel(x, x_prime, d)
# Perform CV
losses = []
for j in range(NUM_SUBSETS):
train_X, train_y, validation_X, validation_y = CVData.get_training_validation_X_y(j)
# Create hypothesis
model = GaussianProcess(train_X, train_y, polynomial_kernel_fn)
predicted_validation_y = model.predict_df(validation_X)
# Get accuracy
loss = get_mse_loss(validation_y.values.flatten(), predicted_validation_y.values.flatten())
losses.append(loss)
avg_loss = np.mean(losses)
return avg_loss
t1 = time.time()
mse_loss_d1 = perform_GP_PK_CV(1)
t2 = time.time()
mse_loss_d2 = perform_GP_PK_CV(2)
t3 = time.time()
mse_loss_d3 = perform_GP_PK_CV(3)
t4 = time.time()
mse_loss_d4 = perform_GP_PK_CV(4)
t5 = time.time()
losses = [mse_loss_d1, mse_loss_d2, mse_loss_d3, mse_loss_d4]
plt.plot([1, 2, 3, 4], losses)
plt.xlabel('Degree')
plt.ylabel('Average L2 Loss')
plt.xticks([1,2,3,4])
plt.title('Cross Validation')
plt.show()
optimal_d = np.argmin(losses) + 1
print("The optimal d is %d with an average mse loss of %f" % (optimal_d, losses[optimal_d - 1]))
training_X, training_y = CVData.get_all_X_y()
t_fit_start = time.time()
final_model = GaussianProcess(training_X, training_y, lambda x, x_prime: polynomial_kernel(x, x_prime, optimal_d))
t_fit_end = time.time()
predicted_test_y = final_model.predict_df(test_X)
test_loss = get_mse_loss(test_y.values.flatten(), predicted_test_y)
print("Run Time (s): " + str(t_fit_end - t_fit_start))
print("The loss on the test set is " + str(test_loss))
print("Cross Validation Run Time (s) for d = 1: " + str(t2-t1))
print("Cross Validation Run Time (s) for d = 2: " + str(t3-t2))
print("Cross Validation Run Time (s) for d = 3: " + str(t4-t3))
print("Cross Validation Run Time (s) for d = 4: " + str(t5-t4))
If we take a look at the clock runtimes above for the identity kernel, the Gaussian kernel, and the polynomial kernel, we see that the identity kernel is the fastest, followed by the polynomial kernel, followed by the Gaussian kernel.
The time complexity of training the Gaussian process model is cubic in the number of training points, because we form and invert the $m \times m$ matrix $K + \sigma^2 I$, where $m$ is the number of training points. This is in stark contrast to Bayesian linear regression and generalized linear regression. The kernel method uses the kernel to compute inner products between (implicitly) transformed training points, so we are never required to materialize the data in the transformed space.
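In the notation of the code, each prediction is then
$$\bar{y}(x_*) = k_*^\top (K + \sigma^2 I)^{-1} y, \qquad K_{ij} = k(x_i, x_j), \qquad (k_*)_i = k(x_i, x_*),$$
so only kernel evaluations are needed, never an explicit feature expansion.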
However, there is still a large difference in clock runtime between the kernels. This is because each kernel has its own per-evaluation cost, which sits on top of the cubic cost in the number of training points and is especially visible on a dataset this small.
For an $n$-dimensional input vector (here $n$ is the input dimension, not the number of training points), the Gaussian kernel performs at least $3n$ operations: the element-wise difference, the squared norm, and finally the exponentiation. The identity kernel performs about $2n$ operations, and the polynomial kernel about $2n + d$. Asymptotically all three kernels are $O(n)$, but at this small scale the different constant factors show up in the runtime.
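To make the kernel trick concrete, here is a small illustrative check (not part of the pipeline above) that the degree-2 polynomial kernel on 2-D inputs equals an ordinary dot product in an explicit 6-dimensional feature space:
# Illustrative check: (x . x' + 1)^2 equals phi(x) . phi(x') for an explicit
# 6-dimensional feature map, so the kernel never needs to build phi(x).
def phi_deg2(x):
    x1, x2 = x
    return np.array([1.0,
                     np.sqrt(2) * x1, np.sqrt(2) * x2,
                     x1 ** 2, x2 ** 2,
                     np.sqrt(2) * x1 * x2])
a = np.array([0.5, -1.0])
b = np.array([2.0, 3.0])
print(polynomial_kernel(a, b, 2), phi_deg2(a).dot(phi_deg2(b)))  # should match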
from keras.models import Sequential
from keras.layers import Dense
from keras import backend as KTF
def bilayer_network(hidden_nodes):
model = Sequential()
model.add(Dense(hidden_nodes, input_dim=2, activation="sigmoid", name="hidden"))
model.add(Dense(1, activation='linear', name='output'))
return model
def perform_NN_CV():
np.random.seed(42)
average_losses = []
time_per_iteration = []
for num_nodes in tqdm_notebook(HIDDEN_NODES):
t1 = time.time()
# Perform CV
losses = []
for j in range(NUM_SUBSETS):
train_X, train_y, validation_X, validation_y = CVData.get_training_validation_X_y(j)
# Create hypothesis
model = bilayer_network(num_nodes)
model.compile(optimizer='adam', loss='mse')
model.fit(train_X, train_y, epochs=150, verbose=0, batch_size=10)
predicted_validation_y = model.predict(validation_X)
# Get accuracy
loss = get_mse_loss(validation_y.values.flatten(),predicted_validation_y.flatten())
losses.append(loss)
t2 = time.time()
avg_loss = np.mean(losses)
average_losses.append(avg_loss)
time_per_iteration.append(t2-t1)
return average_losses, time_per_iteration
average_losses, time_per_iteration = perform_NN_CV()
optimal_nodes = HIDDEN_NODES[np.argmin(average_losses)]
print("The optimal number of hidden nodes is %s with an average mse loss of %f" % (optimal_nodes, min(average_losses)))
plt.plot(HIDDEN_NODES, average_losses)
plt.xlabel('Number of Hidden Nodes')
plt.ylabel('Average L2 Loss')
plt.title('Cross Validation')
plt.show()
for i, runtime in enumerate(time_per_iteration):
print("Cross Validation Run Time (s) for %s nodes : " % (i+1) + str(runtime))
For a neural network, we train using forward and backward propagation.
Each node in the model is used once in the forward pass and once in the backward pass (via automatic differentiation), and each node is used at least once for every training example in each epoch. So the time complexity of training the network can be interpreted as $O(w \cdot m \cdot e)$, where $w$ is the number of nodes in the network, $m$ is the number of training examples, and $e$ is the number of epochs.
Holding everything else fixed and varying only the number of hidden nodes, the runtime is therefore linear in the number of nodes, $O(w)$.
This matches the results above: the runtime increases by approximately 5-6 seconds per extra hidden node.
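As a rough cross-check on the linear-in-nodes claim, the parameter count of the bilayer network grows linearly in the number of hidden nodes $h$: the hidden layer has $2h$ weights plus $h$ biases and the output layer has $h$ weights plus one bias, for $4h + 1$ parameters in total. A small illustrative check using Keras's own count:
# Illustrative check: total trainable parameters should equal 4*h + 1 for a
# 2-dimensional input, h sigmoid hidden units, and one linear output.
for h in [1, 5, 10]:
    net = bilayer_network(h)
    print(h, net.count_params(), 4 * h + 1)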
KTF.clear_session()
training_X, training_y = CVData.get_all_X_y()
model = bilayer_network(optimal_nodes)
model.summary()
model.compile(optimizer='adam', loss='mse')
model.fit(training_X, training_y, epochs=200, verbose=0, batch_size=10)
predicted_test_y = model.predict(test_X)
test_loss = get_mse_loss(test_y.values.flatten(), predicted_test_y.flatten())
print("The loss on the test set is " + str(test_loss))