Imports

In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from itertools import groupby
from operator import itemgetter
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from numpy.linalg import norm
from abc import ABC, abstractmethod 
import time
from sklearn.preprocessing import PolynomialFeatures
from scipy.stats import multivariate_normal as mv_norm

Helper Classes

In [3]:
class DataFetcher:
    """
    DataFetcher: Grabs all csv data as pandas dataframes.
    """
    def __init__(self, directory, X_name, y_name):
        self.directory = directory
        self.X_name = X_name
        self.y_name = y_name
        
    def _get_training_X_path(self, subset_num):
        # 0 <= subset_num <= 9
        return './%s/train%s%d.csv' % (self.directory, self.X_name, (subset_num + 1))

    def _get_training_y_path(self, subset_num):
        # 0 <= subset_num <= 9
        return './%s/train%s%d.csv' % (self.directory, self.y_name, (subset_num + 1))

    def get_all_training_X_y(self):
        training_X_dfs = []
        training_y_dfs = []
        for i in range(NUM_SUBSETS):
            X_path = self._get_training_X_path(i)
            y_path = self._get_training_y_path(i)
            X_df = pd.read_csv(X_path, header=None)
            y_df =  pd.read_csv(y_path, header=None)
            training_X_dfs.append(X_df)
            training_y_dfs.append(y_df)
        
        return training_X_dfs, training_y_dfs
    
    def get_test_X_y(self):
        test_X_path = './%s/test%s.csv' % (self.directory, self.X_name)
        test_y_path = './%s/test%s.csv' % (self.directory, self.y_name)
        test_X_df = pd.read_csv(test_X_path, header=None)
        test_y_df = pd.read_csv(test_y_path, header=None)
        return test_X_df, test_y_df
    
class CrossValidationData:
    """
    CrossValidationData: Splits list of training dataframes 
        into validation & training dataframes.
    """
    def __init__(self, X_dfs, y_dfs):
        assert(len(X_dfs) == len(y_dfs))
        self.X = X_dfs
        self.y = y_dfs
        self.num_subsets = len(X_dfs)
        
    def _split_training_validation(self, dfs, subset_num):
        
        validation_df = dfs[subset_num]
        
        training_dfs = dfs[:subset_num] + dfs[subset_num+1:]
        training_df = pd.concat(training_dfs, ignore_index=True)
      
        return [training_df, validation_df]     
        
    def _get_training_validation_X(self, subset_num):
        assert(subset_num < self.num_subsets)
        
        [training_X, validation_X] = self._split_training_validation(self.X, subset_num)
        
        return [training_X, validation_X]
    
    def _get_training_validation_y(self, subset_num):
        assert(subset_num < self.num_subsets)
        
        [training_y, validation_y] = self._split_training_validation(self.y, subset_num)
        
        return [training_y, validation_y]
    
    def get_training_validation_X_y(self, subset_num):
        [training_X, validation_X] = self._get_training_validation_X(subset_num)
        [training_y, validation_y] = self._get_training_validation_y(subset_num)
        return [training_X, training_y, validation_X, validation_y]

    def get_all_X_y(self):
        training_X = pd.concat(self.X, ignore_index=True)
        training_y = pd.concat(self.y, ignore_index=True)
        return [training_X, training_y]
    
def get_accuracy(true_labels, predicted_labels):
    assert(len(true_labels) == len(predicted_labels))
    return sum(1 for y, y_hat in zip(true_labels, predicted_labels) if y == y_hat ) / len(true_labels)

class Model(ABC):
    """
    Model -> Abstract base class for the models we will implement, namely 
            KNN and RidgeRegression.
    """
    def __init__(self, train_X, train_y):
        self.train_X = train_X
        self.train_y = train_y
    
    @abstractmethod
    def predict(self, x):
        pass
        
    def predict_df(self, X_df):
        predictions = X_df.apply(lambda row: self.predict(row), raw=True, axis=1)
        return predictions

Question 2

Importing Data

Constants

In [4]:
DIR_NAME = 'nonlinear-regression-dataset'
X_NAME = 'Input'
Y_NAME = 'Target'
NUM_SUBSETS = 10
SIGMAS = [1,2,3,4,5,6]
HIDDEN_NODES = [1,2,3,4,5,6,7,8,9,10]

Test Data

In [5]:
DF = DataFetcher(DIR_NAME, X_NAME, Y_NAME)
test_X, test_y = DF.get_test_X_y()

test_X.head()
Out[5]:
0 1
0 11 1
1 2 13
2 13 11
3 16 11
4 10 6
In [6]:
test_y.head()
Out[6]:
0
0 1.604
1 -0.064
2 -3.017
3 -2.354
4 -0.394

Training / Validation Data

In [7]:
training_X_dfs, training_y_dfs = DF.get_all_training_X_y()

CVData = CrossValidationData(training_X_dfs, training_y_dfs)

print("Training X shape: " + str(CVData.get_training_validation_X_y(0)[0].shape))
Training X shape: (180, 2)
In [8]:
print("Training y shape: " + str(CVData.get_training_validation_X_y(0)[1].shape))
Training y shape: (180, 1)
In [9]:
print("Validation X shape: " + str(CVData.get_training_validation_X_y(0)[2].shape))
Validation X shape: (20, 2)
In [10]:
print("Validation y shape: " + str(CVData.get_training_validation_X_y(0)[3].shape))
Validation y shape: (20, 1)

Question 2a

Generalized Linear Regression

Model Definition

In [11]:
class GeneralizedLinearRegression(Model):
    
    def __init__(self, train_X, train_y, d, lmbda=0):
        super().__init__(train_X, train_y)
        X_original = train_X.to_numpy()
        y = train_y.to_numpy()
        
        # Convert to new space
        poly = PolynomialFeatures(d)
        X = poly.fit_transform(X_original)
        
        # Get inverse of A
        A = X.T.dot(X) + lmbda * np.identity(len(X.T))
        A_inv = np.linalg.inv(A)
        # Get b
        b = X.T.dot(y)

        # Get w
        w = A_inv.dot(b)

        # Persist fields
        self.w = w
        self.lmbda = lmbda
        self.poly = poly
    

    def predict(self, x):
        x = self.poly.transform(x.reshape(1, -1)).flatten()
        y = np.dot(self.w.T, x)[0]
        return y

def get_mse_loss(true_values, predicted_values):
    assert(len(true_values) == len(predicted_values))
    l2_loss = np.mean(np.square(np.subtract(true_values,predicted_values)))
    return l2_loss

Cross Validation

In [12]:
def perform_GLRM_CV(d):
    np.random.seed(42)
    
    # Perform CV
    losses = []
    for j in range(NUM_SUBSETS):
        train_X, train_y, validation_X, validation_y = CVData.get_training_validation_X_y(j)
            
        # Create hypothesis
        model = GeneralizedLinearRegression(train_X, train_y, d, lmbda=1)
        predicted_validation_y = model.predict_df(validation_X)
            
        # Get accuracy
        loss = get_mse_loss(validation_y.values.flatten(), predicted_validation_y.values.flatten())
        losses.append(loss)
            
    avg_loss = np.mean(losses)
    return avg_loss

Finding the Optimal Degree

In [13]:
t1 = time.time()
mse_loss_d1 = perform_GLRM_CV(1)
t2 = time.time()
mse_loss_d2 = perform_GLRM_CV(2)
t3 = time.time()
mse_loss_d3 = perform_GLRM_CV(3)
t4 = time.time()
mse_loss_d4 = perform_GLRM_CV(4)
t5 = time.time()
In [14]:
losses = [mse_loss_d1,mse_loss_d2, mse_loss_d3, mse_loss_d4]
plt.plot([1,2,3,4], losses )
plt.xlabel('Degree')
plt.ylabel('Average L2 Loss')
plt.xticks([1,2,3,4])
plt.title('Cross Validation')

plt.show()
In [15]:
optimal_d = np.argsort(losses)[0] + 1
print(optimal_d)
print("The optimal d is %f with an average mse loss of %f" % (optimal_d, losses[optimal_d-1]))
4
The optimal d is 4.000000 with an average mse loss of 0.060250

Loss on Test Set with Optimal Degree

In [16]:
training_X, training_y = CVData.get_all_X_y()

final_model = GeneralizedLinearRegression(training_X, training_y, optimal_d, lmbda=1)
predicted_test_y = final_model.predict_df(test_X)

test_loss = get_mse_loss(test_y.values.flatten(), predicted_test_y)
print("The loss on the test set is " + str(test_loss))
The loss on the test set is 0.05044037782525286

Run Time Complexity

In [17]:
print("Cross Validation Run Time (s) for d = 1: " + str(t2-t1))
print("Cross Validation Run Time (s) for d = 2: " + str(t3-t2))
print("Cross Validation Run Time (s) for d = 3: " + str(t4-t3))
print("Cross Validation Run Time (s) for d = 4: " + str(t5-t4))
Cross Validation Run Time (s) for d = 1: 0.06121087074279785
Cross Validation Run Time (s) for d = 2: 0.05336594581604004
Cross Validation Run Time (s) for d = 3: 0.06464838981628418
Cross Validation Run Time (s) for d = 4: 0.07327580451965332

The time complexity of Generalized Linear Regression is cubic in the number of basis functions, $O(n^3)$.

This is because we invert the "A" matrix in order to solve the system of linear equations defined by Linear Regression. The A matrix is of n x n dimension where n is the number of basis functions.

The run times above do not really reflect this time complexity. This could be because of my environment and the variance in processing power. Also, I include the time to predict and to perform other cross validation actions in the clock runtime of the algorithm. Perhaps, we could gain a better insight if I were to just time how long it takes to train. Asymptotically, this should make no difference since training is much more expensive than the other actions.

Question 2b

Bayesian Linear Regression

Model Definition

In [18]:
class BayesianLinearRegression(Model):
    
    def __init__(self, train_X, train_y, d):
        super().__init__(train_X, train_y)
        X_original = train_X.to_numpy()
        y = train_y.to_numpy()
        
        # Convert to new space
        poly = PolynomialFeatures(d)
        X = poly.fit_transform(X_original)
        
        # Prior Parameters
        prior_mean = 0
        prior_cov = np.identity(len(X.T))
        
        # Likelihood Parameters
        std_dev = 1
        
        # Posterior Parameters
        # A
        A = ( std_dev ** -2 ) * X.T.dot(X) + np.linalg.inv(prior_cov)
        A_inv = np.linalg.inv(A)
        
        # W bar
        w_bar = ( std_dev ** -2 ) * A_inv.dot(X.T.dot(y))

        # Persist fields
        self.w_bar = w_bar 
        self.poly = poly
        self.X = X
        self.A_inv = A_inv
    
    def predict(self, x):
        x = self.poly.transform(x.reshape(1, -1)).flatten()
        y = np.dot(self.w_bar.T, x)[0]
        return y

Cross Validation

In [19]:
def perform_BLRM_CV(d):
    np.random.seed(42)
    
    # Perform CV
    losses = []
    for j in range(NUM_SUBSETS):
        train_X, train_y, validation_X, validation_y = CVData.get_training_validation_X_y(j)
            
        # Create hypothesis
        model = BayesianLinearRegression(train_X, train_y, d)
        predicted_validation_y = model.predict_df(validation_X)
            
        # Get accuracy
        loss = get_mse_loss(validation_y.values.flatten(), predicted_validation_y.values.flatten())
        losses.append(loss)
            
    avg_loss = np.mean(losses)
    return avg_loss

Finding the Optimal Degree

In [20]:
t1 = time.time()
mse_loss_d1 = perform_BLRM_CV(1)
t2 = time.time()
mse_loss_d2 = perform_BLRM_CV(2)
t3 = time.time()
mse_loss_d3 = perform_BLRM_CV(3)
t4 = time.time()
mse_loss_d4 = perform_BLRM_CV(4)
t5 = time.time()
In [21]:
losses = [mse_loss_d1,mse_loss_d2, mse_loss_d3, mse_loss_d4]
plt.plot([1,2,3,4], losses )
plt.xlabel('Degree')
plt.ylabel('Average L2 Loss')
plt.xticks([1,2,3,4])
plt.title('Cross Validation')

plt.show()
In [22]:
optimal_d = np.argsort(losses)[0] + 1
print(optimal_d)
print("The optimal d is %f with an average mse loss of %f" % (optimal_d, losses[optimal_d-1]))
4
The optimal d is 4.000000 with an average mse loss of 0.060250

Loss on Test Set with Optimal Lambda

In [23]:
training_X, training_y = CVData.get_all_X_y()

final_model = BayesianLinearRegression(training_X, training_y, optimal_d)
predicted_test_y = final_model.predict_df(test_X)

test_loss = get_mse_loss(test_y.values.flatten(), predicted_test_y)
print("The loss on the test set is " + str(test_loss))
The loss on the test set is 0.05044037782525286

Run Time Complexity

In [24]:
print("Cross Validation Run Time (s) for d = 1: " + str(t2-t1))
print("Cross Validation Run Time (s) for d = 2: " + str(t3-t2))
print("Cross Validation Run Time (s) for d = 3: " + str(t4-t3))
print("Cross Validation Run Time (s) for d = 4: " + str(t5-t4))
Cross Validation Run Time (s) for d = 1: 0.05828285217285156
Cross Validation Run Time (s) for d = 2: 0.0528712272644043
Cross Validation Run Time (s) for d = 3: 0.06827592849731445
Cross Validation Run Time (s) for d = 4: 0.07222414016723633

The time complexity of Bayesian Linear Regression is cubic in the number of basis functions.

The run times above reflect this time complexity because as the number of basis functions increase, we see a marked increase in the clock runtime.

Discussion

What are the similarities and differences between regularized generalized linear regression and Bayesian generalized linear regression?

The two methods are very similar in that they converge to the same result given a Bayesian prior that reflects the penalty term. The two methods are also similar in that they use basis functions to transform X from its input space to a new space where learning takes place.

The two methods are very different in their representational capacity. Regularized generalized linear regression fits a hyperplane to the points while penalizing the weights of this hyperplane. Bayesian generalized linear regression takes a Gaussian prior, uses a Gaussian conditional distribution for likelihood, and then computes a posterior distribution over the weights to make predictions. Each facet of the Bayesian model is very intrepretable, could be graphed with contours (for up to 2 weights), and shows how introducing data can reduce our uncertainity in a world of hypotheses by influencing the posterior via the likelihood.

They are also different in the way they fundamentally view statistics. Generalized linear regression takes a frequentist approach where we maintain that there are true correct weights and we do the best we can to learn these true weights. The weights found by generalized linear regression is all that is used to form predictions. In contrast, Bayesian linear regression maintains the entire world of hypotheses of weights as potential outcomes. In the Bayesian sense, we do not know if the w we have is the correct one, instead we take a probability distribution over all weights (or hypotheses) in order to create predictions. This means we use the expected value of w, instead of a single precomputed w.

It happens to be that when we use Gaussian distributions to model the prior and likelihood, the point estimate for a prediction (the expected value of w dotted with a new incoming point), is the same as the prediction equation for generalized linear regression.

Question 2c

Gaussian Process

Model Definition

In [43]:
def gram_matrix(X, kernel):
    num_points = X.shape[0]
    G = np.zeros((num_points, num_points))
    for i in range(num_points):
        for j in range(num_points):
            G[i][j] = kernel(X[i], X[j])
    
    return G

def identity_kernel(x, x_prime):
    return x.dot(x_prime)

def gaussian_kernel(x, x_prime, sigma):
    return np.exp(-(norm(x - x_prime)**2)/(2 * (sigma ** 2)))

def polynomial_kernel(x, x_prime, d):
    return (x.dot(x_prime) + 1) ** d

class GaussianProcess(Model):
    
    def __init__(self, train_X, train_y, kernel):
        super().__init__(train_X, train_y)
        X = train_X.to_numpy()
        y = train_y.to_numpy()
        
        std_dev = 1
        
        # Gram Matrix
        K = gram_matrix(X, kernel)
        K_inv = np.linalg.inv(K + std_dev * np.identity(len(K)))
        
        # Persist fields
        self.X = X
        self.y = y
        self.K = K
        self.K_inv = K_inv
        self.kernel = kernel
    
    def compute_kernel_dists(self, x_prime):
        num_points = self.X.shape[0]
        k = np.zeros(num_points)
        for i in range(num_points):
            k[i] = self.kernel(self.X[i], x_prime)
        return k
    
    def predict(self, x):
        k = self.compute_kernel_dists(x)
        y = k.dot(self.K_inv).dot(self.y)[0]
        return y

Identity Kernel

In [26]:
training_X, training_y = CVData.get_all_X_y()
t1 = time.time()
identity_kernel_model = GaussianProcess(training_X, training_y, identity_kernel)
t2 = time.time()

predicted_test_y = identity_kernel_model.predict_df(test_X)
test_loss = get_mse_loss(test_y.values.flatten(), predicted_test_y)
print("The loss on the test set is " + str(test_loss))
print("Run Time (s): " + str(t2-t1))
The loss on the test set is 3.5906206854485827
Run Time (s): 0.07543587684631348

Gaussian Kernel

Cross Validation

In [44]:
def perform_GP_GK_CV():
    np.random.seed(42)
    
    average_losses = []
    for sig in tqdm_notebook(SIGMAS):
        
        # define kernel
        def gaussian_kernel_fn(x, x_prime):
            return gaussian_kernel(x, x_prime, sig)
        
        # Perform CV
        losses = []
        for j in range(NUM_SUBSETS):
            train_X, train_y, validation_X, validation_y = CVData.get_training_validation_X_y(j)
            
            # Create hypothesis
            model = GaussianProcess(train_X, train_y, gaussian_kernel_fn)
            predicted_validation_y = model.predict_df(validation_X)
            
            # Get accuracy
            loss = get_mse_loss(validation_y.values.flatten(), predicted_validation_y.values.flatten())
            losses.append(loss)
            
        avg_loss = np.mean(losses)
        average_losses.append(avg_loss)

    return average_losses

Finding the Optimal Sigma

In [45]:
average_losses = perform_GP_GK_CV()

In [46]:
optimal_sigma = SIGMAS[np.argsort(average_losses)[0]]
print("The optimal sigma is %f with an average mse loss of %f" % (optimal_sigma, min(average_losses)))
The optimal sigma is 4.000000 with an average mse loss of 0.057149
In [47]:
plt.plot(SIGMAS, average_losses)
plt.xlabel('Sigma')
plt.ylabel('Average L2 Loss')
plt.title('Cross Validation')
plt.show()

Loss on Test set with Optimal Sigma

In [48]:
training_X, training_y = CVData.get_all_X_y()
t1 = time.time()
final_model = GaussianProcess(training_X, training_y, lambda x, x_prime: gaussian_kernel(x, x_prime, optimal_sigma))
t2 = time.time()
predicted_test_y = final_model.predict_df(test_X)

test_loss = get_mse_loss(test_y.values.flatten(), predicted_test_y)
print("The loss on the test set is " + str(test_loss))
print("Run Time (s): " + str(t2-t1))
The loss on the test set is 0.04302516166820031
Run Time (s): 0.46358609199523926

Polynomial Kernel

Cross Validation

In [32]:
def perform_GP_PK_CV(d):
    np.random.seed(42)
    
    average_losses = []
        
    # define kernel
    def polynomial_kernel_fn(x, x_prime):
        return polynomial_kernel(x, x_prime, d)
        
    # Perform CV
    losses = []
    for j in range(NUM_SUBSETS):
        train_X, train_y, validation_X, validation_y = CVData.get_training_validation_X_y(j)
            
        # Create hypothesis
        model = GaussianProcess(train_X, train_y, polynomial_kernel_fn)
        predicted_validation_y = model.predict_df(validation_X)
            
        # Get accuracy
        loss = get_mse_loss(validation_y.values.flatten(), predicted_validation_y.values.flatten())
        losses.append(loss)
            
    avg_loss = np.mean(losses)

    return avg_loss

Finding the Optimal Degree

In [33]:
t1 = time.time()
mse_loss_d1 = perform_GP_PK_CV(1)
t2 = time.time()
mse_loss_d2 = perform_GP_PK_CV(2)
t3 = time.time()
mse_loss_d3 = perform_GP_PK_CV(3)
t4 = time.time()
mse_loss_d4 = perform_GP_PK_CV(4)
t5 = time.time()
In [34]:
losses = [mse_loss_d1,mse_loss_d2, mse_loss_d3, mse_loss_d4]
plt.plot([1,2,3,4], losses )
plt.xlabel('Degree')
plt.ylabel('Average L2 Loss')
plt.xticks([1,2,3,4])
plt.title('Cross Validation')
plt.show()

Loss on Test Set with Optimal Degree

In [35]:
training_X, training_y = CVData.get_all_X_y()

t1 = time.time()
final_model = GaussianProcess(training_X, training_y, lambda x,y: polynomial_kernel(x, y, optimal_d))
t2 = time.time()

predicted_test_y = final_model.predict_df(test_X)

test_loss = get_mse_loss(test_y.values.flatten(), predicted_test_y)
print("Run Time (s): " + str(t2-t1))
print("The loss on the test set is " + str(test_loss))
Run Time (s): 0.11498212814331055
The loss on the test set is 0.045769543486000624

Runtime complexity

In [36]:
print("Cross Validation Run Time (s) for d = 1: " + str(t2-t1))
print("Cross Validation Run Time (s) for d = 2: " + str(t3-t2))
print("Cross Validation Run Time (s) for d = 3: " + str(t4-t3))
print("Cross Validation Run Time (s) for d = 4: " + str(t5-t4))
Cross Validation Run Time (s) for d = 1: 0.11498212814331055
Cross Validation Run Time (s) for d = 2: -2.095305919647217
Cross Validation Run Time (s) for d = 3: 0.91843581199646
Cross Validation Run Time (s) for d = 4: 0.889031171798706

If we take a look at the clock runtimes above for the identity kernel, the Gaussian kernel, and the polynomial kernel, we see that the identity kernel is the fastest, followed by the polynomial kernel, followed by the Gaussian kernel.

The time complexity of training the Gaussian Process model is cubic in the number of training points. This is in stark contrast to Bayesian linear regression and Generalized linear regression. This kernel method uses the kernel to compute the dot product between training points, and so we are never required to transform the data to its actual representation in the transformed space.

However, we still have a major difference in clock runtime for each of the different kernels. This is because the kernels themselves have their own complexity which is exacerbated over the cubic complexity over the training points, especially for this small dataset.

The Gaussian kernel performs atleast 3n operations while taking the element-wise difference, the norm, the sum, and finally the exponentiation for a n-dimensional input vector. The Identity kernel performs 2n operations. The polynomial kernel performs 2n + d operations. Asymptotically, all three kernels are in $O(n)$ complexity. But at this small scale, we see the effects of the various operations in the runtime.

Question 2d

Neural Network

Model Definition

In [37]:
from keras.models import Sequential
from keras.layers.core import Activation, Dense 
from keras.optimizers import Adam
import keras.backend.tensorflow_backend as KTF
from sklearn.metrics import mean_squared_error

def bilayer_network(hidden_nodes):
        
    model = Sequential()

    model.add(Dense(hidden_nodes, input_dim=2, activation="sigmoid", name="hidden"))

    model.add(Dense(1, activation='linear', name='output'))

    return model
Using TensorFlow backend.

Cross Validation

In [38]:
def perform_NN_CV():
    np.random.seed(42)
    
    average_losses = []
    time_per_iteration = []
    for num_nodes in tqdm_notebook(HIDDEN_NODES):
        t1 = time.time()
        # Perform CV
        losses = []
        for j in range(NUM_SUBSETS):
            train_X, train_y, validation_X, validation_y = CVData.get_training_validation_X_y(j)
            
            # Create hypothesis
            model = bilayer_network(num_nodes)
            model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
            
            model.fit(train_X, train_y, epochs=150, verbose=0, batch_size=10)
            predicted_validation_y = model.predict(validation_X)

            # Get accuracy
            loss = get_mse_loss(validation_y.values.flatten(),predicted_validation_y.flatten()) 
            losses.append(loss)
            
        t2 = time.time()
        avg_loss = np.mean(losses)
        average_losses.append(avg_loss)
        time_per_iteration.append(t2-t1)

    return average_losses, time_per_iteration

Finding the optimal # of hidden nodes

In [39]:
average_losses, time_per_iteration = perform_NN_CV()
optimal_nodes = HIDDEN_NODES[np.argsort(average_losses)[0]]
print("The optimal number of hidden nodes is %s with an average mse loss of %f" % (optimal_nodes, min(average_losses)))
WARNING: Logging before flag parsing goes to stderr.
W0630 22:48:20.222808 4344497600 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0630 22:48:20.249330 4344497600 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0630 22:48:20.255409 4344497600 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0630 22:48:20.283298 4344497600 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0630 22:48:20.496237 4344497600 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.

W0630 22:48:20.555334 4344497600 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:973: The name tf.assign is deprecated. Please use tf.compat.v1.assign instead.

The optimal number of hidden nodes is 10 with an average mse loss of 0.778351
In [40]:
plt.plot(HIDDEN_NODES, average_losses)
plt.xlabel('Number of Hidden Nodes')
plt.ylabel('Average L2 Loss')
plt.title('Cross Validation')
plt.show()

Runtime Complexity

In [41]:
for i, runtime in enumerate(time_per_iteration):
    print("Cross Validation Run Time (s) for %s nodes : " % (i+1) + str(runtime))
Cross Validation Run Time (s) for 1 nodes : 24.123227834701538
Cross Validation Run Time (s) for 2 nodes : 29.28181791305542
Cross Validation Run Time (s) for 3 nodes : 31.126604080200195
Cross Validation Run Time (s) for 4 nodes : 36.65745496749878
Cross Validation Run Time (s) for 5 nodes : 41.401155948638916
Cross Validation Run Time (s) for 6 nodes : 47.95881390571594
Cross Validation Run Time (s) for 7 nodes : 54.396154165267944
Cross Validation Run Time (s) for 8 nodes : 60.853068828582764
Cross Validation Run Time (s) for 9 nodes : 74.93994808197021
Cross Validation Run Time (s) for 10 nodes : 88.44564294815063

For a neural network, we train by utilizing forward and backwards propagation.

For each node in the model, we utilize it once in forwards propagation and once in backwards propagation, given automatic differentiation. We use each node atleast once for every training example, in each epoch. So the time complexity of neural network can be interpreted as $O(w * m * e)$ where $w$ is the number of nodes in the neural network, $m$ is the number of training examples, and $e$ is the number of epochs.

Looking at only the number of nodes, we see that the runtime is linear in the number of nodes, $O(w)$.

This matches our results as the runtime increases by approximately 5-6 seconds per extra hidden node.

Loss on Test Set with Optimal Nodes

In [42]:
KTF.clear_session() 
training_X, training_y = CVData.get_all_X_y()
model = bilayer_network(optimal_nodes)
model.summary()
model.compile(optimizer='adam', loss='mse')

model.fit(training_X, training_y, epochs=200, verbose=0, batch_size=10)
predicted_test_y = model.predict(test_X)

test_loss = get_mse_loss(test_y.values.flatten(), predicted_test_y.flatten())
print("The loss on the test set is " + str(test_loss))
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
hidden (Dense)               (None, 10)                30        
_________________________________________________________________
output (Dense)               (None, 1)                 11        
=================================================================
Total params: 41
Trainable params: 41
Non-trainable params: 0
_________________________________________________________________
The loss on the test set is 0.7125737639221129