import pandas as pd
import numpy as np
from collections import Counter
from itertools import groupby
from operator import itemgetter
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from numpy.linalg import norm
from abc import ABC, abstractmethod
import time
from sklearn.preprocessing import PolynomialFeatures
from scipy.stats import multivariate_normal as mv_norm
class DataFetcher:
"""
DataFetcher: Grabs all csv data as pandas dataframes.
"""
def __init__(self, directory, X_name, y_name):
self.directory = directory
self.X_name = X_name
self.y_name = y_name
def _get_training_X_path(self, subset_num):
# 0 <= subset_num <= 9
return './%s/train%s%d.csv' % (self.directory, self.X_name, (subset_num + 1))
def _get_training_y_path(self, subset_num):
# 0 <= subset_num <= 9
return './%s/train%s%d.csv' % (self.directory, self.y_name, (subset_num + 1))
def get_all_training_X_y(self):
training_X_dfs = []
training_y_dfs = []
for i in range(NUM_SUBSETS):
X_path = self._get_training_X_path(i)
y_path = self._get_training_y_path(i)
X_df = pd.read_csv(X_path, header=None)
y_df = pd.read_csv(y_path, header=None)
training_X_dfs.append(X_df)
training_y_dfs.append(y_df)
return training_X_dfs, training_y_dfs
def get_test_X_y(self):
test_X_path = './%s/test%s.csv' % (self.directory, self.X_name)
test_y_path = './%s/test%s.csv' % (self.directory, self.y_name)
test_X_df = pd.read_csv(test_X_path, header=None)
test_y_df = pd.read_csv(test_y_path, header=None)
return test_X_df, test_y_df
class CrossValidationData:
"""
CrossValidationData: Splits list of training dataframes
into validation & training dataframes.
"""
def __init__(self, X_dfs, y_dfs):
assert(len(X_dfs) == len(y_dfs))
self.X = X_dfs
self.y = y_dfs
self.num_subsets = len(X_dfs)
def _split_training_validation(self, dfs, subset_num):
validation_df = dfs[subset_num]
training_dfs = dfs[:subset_num] + dfs[subset_num+1:]
training_df = pd.concat(training_dfs, ignore_index=True)
return [training_df, validation_df]
def _get_training_validation_X(self, subset_num):
assert(subset_num < self.num_subsets)
[training_X, validation_X] = self._split_training_validation(self.X, subset_num)
return [training_X, validation_X]
def _get_training_validation_y(self, subset_num):
assert(subset_num < self.num_subsets)
[training_y, validation_y] = self._split_training_validation(self.y, subset_num)
return [training_y, validation_y]
def get_training_validation_X_y(self, subset_num):
[training_X, validation_X] = self._get_training_validation_X(subset_num)
[training_y, validation_y] = self._get_training_validation_y(subset_num)
return [training_X, training_y, validation_X, validation_y]
def get_all_X_y(self):
training_X = pd.concat(self.X, ignore_index=True)
training_y = pd.concat(self.y, ignore_index=True)
return [training_X, training_y]
def get_accuracy(true_labels, predicted_labels):
assert(len(true_labels) == len(predicted_labels))
return sum(1 for y, y_hat in zip(true_labels, predicted_labels) if y == y_hat ) / len(true_labels)
class Model(ABC):
"""
Model -> Abstract base class for the models we will implement, namely
KNN and RidgeRegression.
"""
def __init__(self, train_X, train_y):
self.train_X = train_X
self.train_y = train_y
@abstractmethod
def predict(self, x):
pass
def predict_df(self, X_df):
predictions = X_df.apply(lambda row: self.predict(row), raw=True, axis=1)
return predictions
DIR_NAME = 'nonlinear-regression-dataset'
X_NAME = 'Input'
Y_NAME = 'Target'
NUM_SUBSETS = 10
SIGMAS = [1,2,3,4,5,6]
HIDDEN_NODES = [1,2,3,4,5,6,7,8,9,10]
DF = DataFetcher(DIR_NAME, X_NAME, Y_NAME)
test_X, test_y = DF.get_test_X_y()
test_X.head()
test_y.head()
training_X_dfs, training_y_dfs = DF.get_all_training_X_y()
CVData = CrossValidationData(training_X_dfs, training_y_dfs)
print("Training X shape: " + str(CVData.get_training_validation_X_y(0)[0].shape))
print("Training y shape: " + str(CVData.get_training_validation_X_y(0)[1].shape))
print("Validation X shape: " + str(CVData.get_training_validation_X_y(0)[2].shape))
print("Validation y shape: " + str(CVData.get_training_validation_X_y(0)[3].shape))
class GeneralizedLinearRegression(Model):
def __init__(self, train_X, train_y, d, lmbda=0):
super().__init__(train_X, train_y)
X_original = train_X.to_numpy()
y = train_y.to_numpy()
# Convert to new space
poly = PolynomialFeatures(d)
X = poly.fit_transform(X_original)
# Get inverse of A
A = X.T.dot(X) + lmbda * np.identity(len(X.T))
A_inv = np.linalg.inv(A)
# Get b
b = X.T.dot(y)
# Get w
w = A_inv.dot(b)
# Persist fields
self.w = w
self.lmbda = lmbda
self.poly = poly
def predict(self, x):
x = self.poly.transform(x.reshape(1, -1)).flatten()
y = np.dot(self.w.T, x)[0]
return y
def get_mse_loss(true_values, predicted_values):
assert(len(true_values) == len(predicted_values))
l2_loss = np.mean(np.square(np.subtract(true_values,predicted_values)))
return l2_loss
def perform_GLRM_CV(d):
np.random.seed(42)
# Perform CV
losses = []
for j in range(NUM_SUBSETS):
train_X, train_y, validation_X, validation_y = CVData.get_training_validation_X_y(j)
# Create hypothesis
model = GeneralizedLinearRegression(train_X, train_y, d, lmbda=1)
predicted_validation_y = model.predict_df(validation_X)
# Get accuracy
loss = get_mse_loss(validation_y.values.flatten(), predicted_validation_y.values.flatten())
losses.append(loss)
avg_loss = np.mean(losses)
return avg_loss
t1 = time.time()
mse_loss_d1 = perform_GLRM_CV(1)
t2 = time.time()
mse_loss_d2 = perform_GLRM_CV(2)
t3 = time.time()
mse_loss_d3 = perform_GLRM_CV(3)
t4 = time.time()
mse_loss_d4 = perform_GLRM_CV(4)
t5 = time.time()
losses = [mse_loss_d1, mse_loss_d2, mse_loss_d3, mse_loss_d4]
plt.plot([1, 2, 3, 4], losses)
plt.xlabel('Degree')
plt.ylabel('Average L2 Loss')
plt.xticks([1,2,3,4])
plt.title('Cross Validation')
plt.show()
optimal_d = np.argmin(losses) + 1
print(optimal_d)
print("The optimal d is %d with an average mse loss of %f" % (optimal_d, losses[optimal_d - 1]))
training_X, training_y = CVData.get_all_X_y()
final_model = GeneralizedLinearRegression(training_X, training_y, optimal_d, lmbda=1)
predicted_test_y = final_model.predict_df(test_X)
test_loss = get_mse_loss(test_y.values.flatten(), predicted_test_y)
print("The loss on the test set is " + str(test_loss))
print("Cross Validation Run Time (s) for d = 1: " + str(t2-t1))
print("Cross Validation Run Time (s) for d = 2: " + str(t3-t2))
print("Cross Validation Run Time (s) for d = 3: " + str(t4-t3))
print("Cross Validation Run Time (s) for d = 4: " + str(t5-t4))
The time complexity of Generalized Linear Regression is cubic in the number of basis functions, $O(n^3)$.
This is because we invert the matrix $A$ in order to solve the system of linear equations defined by linear regression, and $A$ is an $n \times n$ matrix, where $n$ is the number of basis functions.
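In the notation of the code above, with $\Phi$ the matrix of inputs transformed by the polynomial basis, the system being solved is
$$(\Phi^\top \Phi + \lambda I)\, w = \Phi^\top y \quad\Longrightarrow\quad w = (\Phi^\top \Phi + \lambda I)^{-1}\, \Phi^\top y,$$
and inverting the $n \times n$ matrix $A = \Phi^\top \Phi + \lambda I$ is the $O(n^3)$ step.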
The run times above do not clearly reflect this complexity. This could be due to my environment and variance in processing power. The clocked time also includes prediction and the other cross-validation steps, not just training. We could get a better picture by timing only the training step; asymptotically this should make no difference, since training is much more expensive than the other operations.
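As a rough check, here is a minimal sketch (not part of the results above) that times only model construction, i.e. the matrix solve, for each degree:
# Illustrative sketch: time only the training (matrix solve) step for each degree.
train_X_all, train_y_all = CVData.get_all_X_y()
for d in [1, 2, 3, 4]:
    start = time.time()
    GeneralizedLinearRegression(train_X_all, train_y_all, d, lmbda=1)
    print("Training-only time (s) for d = %d: %f" % (d, time.time() - start))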
class BayesianLinearRegression(Model):
def __init__(self, train_X, train_y, d):
super().__init__(train_X, train_y)
X_original = train_X.to_numpy()
y = train_y.to_numpy()
# Convert to new space
poly = PolynomialFeatures(d)
X = poly.fit_transform(X_original)
# Prior Parameters
prior_mean = 0
prior_cov = np.identity(len(X.T))
# Likelihood Parameters
std_dev = 1
# Posterior Parameters
# A
A = ( std_dev ** -2 ) * X.T.dot(X) + np.linalg.inv(prior_cov)
A_inv = np.linalg.inv(A)
# W bar
w_bar = ( std_dev ** -2 ) * A_inv.dot(X.T.dot(y))
# Persist fields
self.w_bar = w_bar
self.poly = poly
self.X = X
self.A_inv = A_inv
def predict(self, x):
x = self.poly.transform(x.reshape(1, -1)).flatten()
y = np.dot(self.w_bar.T, x)[0]
return y
def perform_BLRM_CV(d):
np.random.seed(42)
# Perform CV
losses = []
for j in range(NUM_SUBSETS):
train_X, train_y, validation_X, validation_y = CVData.get_training_validation_X_y(j)
# Create hypothesis
model = BayesianLinearRegression(train_X, train_y, d)
predicted_validation_y = model.predict_df(validation_X)
# Get accuracy
loss = get_mse_loss(validation_y.values.flatten(), predicted_validation_y.values.flatten())
losses.append(loss)
avg_loss = np.mean(losses)
return avg_loss
t1 = time.time()
mse_loss_d1 = perform_BLRM_CV(1)
t2 = time.time()
mse_loss_d2 = perform_BLRM_CV(2)
t3 = time.time()
mse_loss_d3 = perform_BLRM_CV(3)
t4 = time.time()
mse_loss_d4 = perform_BLRM_CV(4)
t5 = time.time()
losses = [mse_loss_d1, mse_loss_d2, mse_loss_d3, mse_loss_d4]
plt.plot([1, 2, 3, 4], losses)
plt.xlabel('Degree')
plt.ylabel('Average L2 Loss')
plt.xticks([1,2,3,4])
plt.title('Cross Validation')
plt.show()
optimal_d = np.argmin(losses) + 1
print(optimal_d)
print("The optimal d is %d with an average mse loss of %f" % (optimal_d, losses[optimal_d - 1]))
training_X, training_y = CVData.get_all_X_y()
final_model = BayesianLinearRegression(training_X, training_y, optimal_d)
predicted_test_y = final_model.predict_df(test_X)
test_loss = get_mse_loss(test_y.values.flatten(), predicted_test_y)
print("The loss on the test set is " + str(test_loss))
print("Cross Validation Run Time (s) for d = 1: " + str(t2-t1))
print("Cross Validation Run Time (s) for d = 2: " + str(t3-t2))
print("Cross Validation Run Time (s) for d = 3: " + str(t4-t3))
print("Cross Validation Run Time (s) for d = 4: " + str(t5-t4))
The time complexity of Bayesian Linear Regression is cubic in the number of basis functions.
The run times above reflect this complexity: as the number of basis functions increases, we see a marked increase in the clock runtime.
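Concretely, the quantities computed in the class above (with zero prior mean, prior covariance $S_0 = I$, and noise standard deviation $\sigma = 1$) are
$$A = \sigma^{-2}\,\Phi^\top \Phi + S_0^{-1}, \qquad \bar{w} = \sigma^{-2}\, A^{-1} \Phi^\top y,$$
and inverting $A$, which is again $n \times n$ in the number of basis functions, is the $O(n^3)$ step.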
What are the similarities and differences between regularized generalized linear regression and Bayesian generalized linear regression?
The two methods are very similar in that they converge to the same result given a Bayesian prior that reflects the penalty term. The two methods are also similar in that they use basis functions to transform X from its input space to a new space where learning takes place.
The two methods differ in what they represent. Regularized generalized linear regression fits a single hyperplane (in the transformed space) to the points while penalizing the weights of that hyperplane. Bayesian generalized linear regression places a Gaussian prior over the weights, uses a Gaussian conditional distribution as the likelihood, and computes a posterior distribution over the weights to make predictions. Each component of the Bayesian model is very interpretable, can be visualized with contour plots (for up to two weights), and shows how observing data reduces our uncertainty over the space of hypotheses by shifting the posterior through the likelihood.
They also differ in how they fundamentally view statistics. Generalized linear regression takes a frequentist approach: we assume there is a single set of true weights and estimate it as well as we can, and that point estimate is all that is used to form predictions. In contrast, Bayesian linear regression maintains the entire space of weight hypotheses: we do not commit to a single $w$, but keep a probability distribution over all weights and use it to form predictions. In practice this means predictions use the expected value of $w$ under the posterior, rather than a single precomputed $w$.
It happens that when we use Gaussian distributions for the prior and the likelihood, the point prediction (the posterior mean of $w$ dotted with a new input) coincides with the prediction of regularized generalized linear regression for the corresponding choice of $\lambda$.
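As a quick sanity check of this equivalence for the settings used above ($\sigma = 1$, $S_0 = I$, $\lambda = 1$), the ridge weights and the Bayesian posterior mean should be numerically identical; here is an illustrative sketch reusing the classes defined earlier:
# Illustrative check: with lmbda = 1, sigma = 1, and S_0 = I, the ridge weights
# and the Bayesian posterior mean should coincide.
check_X, check_y = CVData.get_all_X_y()
ridge_model = GeneralizedLinearRegression(check_X, check_y, d=2, lmbda=1)
bayes_model = BayesianLinearRegression(check_X, check_y, d=2)
print(np.allclose(ridge_model.w, bayes_model.w_bar))  # expected: True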
def gram_matrix(X, kernel):
num_points = X.shape[0]
G = np.zeros((num_points, num_points))
for i in range(num_points):
for j in range(num_points):
G[i][j] = kernel(X[i], X[j])
return G
def identity_kernel(x, x_prime):
return x.dot(x_prime)
def gaussian_kernel(x, x_prime, sigma):
return np.exp(-(norm(x - x_prime)**2)/(2 * (sigma ** 2)))
def polynomial_kernel(x, x_prime, d):
return (x.dot(x_prime) + 1) ** d
class GaussianProcess(Model):
def __init__(self, train_X, train_y, kernel):
super().__init__(train_X, train_y)
X = train_X.to_numpy()
y = train_y.to_numpy()
std_dev = 1
# Gram Matrix
K = gram_matrix(X, kernel)
K_inv = np.linalg.inv(K + (std_dev ** 2) * np.identity(len(K)))  # (K + sigma^2 I)^{-1}
# Persist fields
self.X = X
self.y = y
self.K = K
self.K_inv = K_inv
self.kernel = kernel
def compute_kernel_dists(self, x_prime):
num_points = self.X.shape[0]
k = np.zeros(num_points)
for i in range(num_points):
k[i] = self.kernel(self.X[i], x_prime)
return k
def predict(self, x):
k = self.compute_kernel_dists(x)
y = k.dot(self.K_inv).dot(self.y)[0]
return y
training_X, training_y = CVData.get_all_X_y()
t1 = time.time()
identity_kernel_model = GaussianProcess(training_X, training_y, identity_kernel)
t2 = time.time()
predicted_test_y = identity_kernel_model.predict_df(test_X)
test_loss = get_mse_loss(test_y.values.flatten(), predicted_test_y)
print("The loss on the test set is " + str(test_loss))
print("Run Time (s): " + str(t2-t1))
def perform_GP_GK_CV():
np.random.seed(42)
average_losses = []
for sig in tqdm_notebook(SIGMAS):
# define kernel
def gaussian_kernel_fn(x, x_prime):
return gaussian_kernel(x, x_prime, sig)
# Perform CV
losses = []
for j in range(NUM_SUBSETS):
train_X, train_y, validation_X, validation_y = CVData.get_training_validation_X_y(j)
# Create hypothesis
model = GaussianProcess(train_X, train_y, gaussian_kernel_fn)
predicted_validation_y = model.predict_df(validation_X)
# Get accuracy
loss = get_mse_loss(validation_y.values.flatten(), predicted_validation_y.values.flatten())
losses.append(loss)
avg_loss = np.mean(losses)
average_losses.append(avg_loss)
return average_losses
average_losses = perform_GP_GK_CV()
optimal_sigma = SIGMAS[np.argmin(average_losses)]
print("The optimal sigma is %f with an average mse loss of %f" % (optimal_sigma, min(average_losses)))
plt.plot(SIGMAS, average_losses)
plt.xlabel('Sigma')
plt.ylabel('Average L2 Loss')
plt.title('Cross Validation')
plt.show()
training_X, training_y = CVData.get_all_X_y()
t1 = time.time()
final_model = GaussianProcess(training_X, training_y, lambda x, x_prime: gaussian_kernel(x, x_prime, optimal_sigma))
t2 = time.time()
predicted_test_y = final_model.predict_df(test_X)
test_loss = get_mse_loss(test_y.values.flatten(), predicted_test_y)
print("The loss on the test set is " + str(test_loss))
print("Run Time (s): " + str(t2-t1))
def perform_GP_PK_CV(d):
np.random.seed(42)
# define kernel
def polynomial_kernel_fn(x, x_prime):
return polynomial_kernel(x, x_prime, d)
# Perform CV
losses = []
for j in range(NUM_SUBSETS):
train_X, train_y, validation_X, validation_y = CVData.get_training_validation_X_y(j)
# Create hypothesis
model = GaussianProcess(train_X, train_y, polynomial_kernel_fn)
predicted_validation_y = model.predict_df(validation_X)
# Get accuracy
loss = get_mse_loss(validation_y.values.flatten(), predicted_validation_y.values.flatten())
losses.append(loss)
avg_loss = np.mean(losses)
return avg_loss
t1 = time.time()
mse_loss_d1 = perform_GP_PK_CV(1)
t2 = time.time()
mse_loss_d2 = perform_GP_PK_CV(2)
t3 = time.time()
mse_loss_d3 = perform_GP_PK_CV(3)
t4 = time.time()
mse_loss_d4 = perform_GP_PK_CV(4)
t5 = time.time()
losses = [mse_loss_d1, mse_loss_d2, mse_loss_d3, mse_loss_d4]
plt.plot([1, 2, 3, 4], losses)
plt.xlabel('Degree')
plt.ylabel('Average L2 Loss')
plt.xticks([1,2,3,4])
plt.title('Cross Validation')
plt.show()
optimal_d = np.argmin(losses) + 1
print("The optimal d is %d with an average mse loss of %f" % (optimal_d, losses[optimal_d - 1]))
training_X, training_y = CVData.get_all_X_y()
t_fit_start = time.time()
final_model = GaussianProcess(training_X, training_y, lambda x, x_prime: polynomial_kernel(x, x_prime, optimal_d))
t_fit_end = time.time()
predicted_test_y = final_model.predict_df(test_X)
test_loss = get_mse_loss(test_y.values.flatten(), predicted_test_y)
print("Run Time (s): " + str(t_fit_end - t_fit_start))
print("The loss on the test set is " + str(test_loss))
print("Cross Validation Run Time (s) for d = 1: " + str(t2-t1))
print("Cross Validation Run Time (s) for d = 2: " + str(t3-t2))
print("Cross Validation Run Time (s) for d = 3: " + str(t4-t3))
print("Cross Validation Run Time (s) for d = 4: " + str(t5-t4))
If we take a look at the clock runtimes above for the identity kernel, the Gaussian kernel, and the polynomial kernel, we see that the identity kernel is the fastest, followed by the polynomial kernel, followed by the Gaussian kernel.
The time complexity of training the Gaussian process model is cubic in the number of training points, because we form and invert the $m \times m$ matrix $K + \sigma^2 I$, where $m$ is the number of training points. This is in stark contrast to Bayesian linear regression and generalized linear regression. The kernel method uses the kernel to compute inner products between (implicitly) transformed training points, so we are never required to materialize the data in the transformed space.
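In the notation of the code, each prediction is then
$$\bar{y}(x_*) = k_*^\top (K + \sigma^2 I)^{-1} y, \qquad K_{ij} = k(x_i, x_j), \qquad (k_*)_i = k(x_i, x_*),$$
so only kernel evaluations are needed, never an explicit feature expansion.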
However, there is still a large difference in clock runtime between the kernels. This is because each kernel has its own per-evaluation cost, which sits on top of the cubic cost in the number of training points and is especially visible on a dataset this small.
For an $n$-dimensional input vector (here $n$ is the input dimension, not the number of training points), the Gaussian kernel performs at least $3n$ operations: the element-wise difference, the squared norm, and finally the exponentiation. The identity kernel performs about $2n$ operations, and the polynomial kernel about $2n + d$. Asymptotically all three kernels are $O(n)$, but at this small scale the different constant factors show up in the runtime.
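To make the kernel trick concrete, here is a small illustrative check (not part of the pipeline above) that the degree-2 polynomial kernel on 2-D inputs equals an ordinary dot product in an explicit 6-dimensional feature space:
# Illustrative check: (x . x' + 1)^2 equals phi(x) . phi(x') for an explicit
# 6-dimensional feature map, so the kernel never needs to build phi(x).
def phi_deg2(x):
    x1, x2 = x
    return np.array([1.0,
                     np.sqrt(2) * x1, np.sqrt(2) * x2,
                     x1 ** 2, x2 ** 2,
                     np.sqrt(2) * x1 * x2])
a = np.array([0.5, -1.0])
b = np.array([2.0, 3.0])
print(polynomial_kernel(a, b, 2), phi_deg2(a).dot(phi_deg2(b)))  # should match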
from keras.models import Sequential
from keras.layers import Dense
from keras import backend as KTF
def bilayer_network(hidden_nodes):
model = Sequential()
model.add(Dense(hidden_nodes, input_dim=2, activation="sigmoid", name="hidden"))
model.add(Dense(1, activation='linear', name='output'))
return model
def perform_NN_CV():
np.random.seed(42)
average_losses = []
time_per_iteration = []
for num_nodes in tqdm_notebook(HIDDEN_NODES):
t1 = time.time()
# Perform CV
losses = []
for j in range(NUM_SUBSETS):
train_X, train_y, validation_X, validation_y = CVData.get_training_validation_X_y(j)
# Create hypothesis
model = bilayer_network(num_nodes)
model.compile(optimizer='adam', loss='mse')
model.fit(train_X, train_y, epochs=150, verbose=0, batch_size=10)
predicted_validation_y = model.predict(validation_X)
# Get accuracy
loss = get_mse_loss(validation_y.values.flatten(),predicted_validation_y.flatten())
losses.append(loss)
t2 = time.time()
avg_loss = np.mean(losses)
average_losses.append(avg_loss)
time_per_iteration.append(t2-t1)
return average_losses, time_per_iteration
average_losses, time_per_iteration = perform_NN_CV()
optimal_nodes = HIDDEN_NODES[np.argmin(average_losses)]
print("The optimal number of hidden nodes is %s with an average mse loss of %f" % (optimal_nodes, min(average_losses)))
plt.plot(HIDDEN_NODES, average_losses)
plt.xlabel('Number of Hidden Nodes')
plt.ylabel('Average L2 Loss')
plt.title('Cross Validation')
plt.show()
for i, runtime in enumerate(time_per_iteration):
print("Cross Validation Run Time (s) for %s nodes : " % (i+1) + str(runtime))
For a neural network, we train using forward and backward propagation.
Each node in the model is used once in the forward pass and once in the backward pass (via automatic differentiation), and each node is used at least once for every training example in each epoch. So the time complexity of training the network can be interpreted as $O(w \cdot m \cdot e)$, where $w$ is the number of nodes in the network, $m$ is the number of training examples, and $e$ is the number of epochs.
Holding everything else fixed and varying only the number of hidden nodes, the runtime is therefore linear in the number of nodes, $O(w)$.
This matches the results above: the runtime increases by approximately 5-6 seconds per extra hidden node.
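As a rough cross-check on the linear-in-nodes claim, the parameter count of the bilayer network grows linearly in the number of hidden nodes $h$: the hidden layer has $2h$ weights plus $h$ biases and the output layer has $h$ weights plus one bias, for $4h + 1$ parameters in total. A small illustrative check using Keras's own count:
# Illustrative check: total trainable parameters should equal 4*h + 1 for a
# 2-dimensional input, h sigmoid hidden units, and one linear output.
for h in [1, 5, 10]:
    net = bilayer_network(h)
    print(h, net.count_params(), 4 * h + 1)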
KTF.clear_session()
training_X, training_y = CVData.get_all_X_y()
model = bilayer_network(optimal_nodes)
model.summary()
model.compile(optimizer='adam', loss='mse')
model.fit(training_X, training_y, epochs=200, verbose=0, batch_size=10)
predicted_test_y = model.predict(test_X)
test_loss = get_mse_loss(test_y.values.flatten(), predicted_test_y.flatten())
print("The loss on the test set is " + str(test_loss))