import pandas as pd
import numpy as np
from collections import Counter
from itertools import groupby
from operator import itemgetter
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from numpy.linalg import norm
from abc import ABC, abstractmethod
import time
class DataFetcher:
    """
    DataFetcher: Grabs all csv data as pandas dataframes.

    File layout (relative to the working directory):
    './<directory>/train<name><k>.csv' for the k-th training subset
    (1-based on disk) and './<directory>/test<name>.csv' for the test split.
    """
    def __init__(self, directory, X_name, y_name):
        self.directory = directory
        self.X_name = X_name
        self.y_name = y_name
    def _get_training_X_path(self, subset_num):
        # subset_num is 0-based (0 <= subset_num <= 9); filenames are 1-based.
        return './%s/train%s%d.csv' % (self.directory, self.X_name, subset_num + 1)
    def _get_training_y_path(self, subset_num):
        # subset_num is 0-based (0 <= subset_num <= 9); filenames are 1-based.
        return './%s/train%s%d.csv' % (self.directory, self.y_name, subset_num + 1)
    def get_all_training_X_y(self):
        """Load every training subset; returns (list of X frames, list of y frames)."""
        X_frames = []
        y_frames = []
        for subset in range(NUM_SUBSETS):
            X_frames.append(pd.read_csv(self._get_training_X_path(subset), header=None))
            y_frames.append(pd.read_csv(self._get_training_y_path(subset), header=None))
        return X_frames, y_frames
    def get_test_X_y(self):
        """Load the held-out test split; returns (test X frame, test y frame)."""
        X_path = './%s/test%s.csv' % (self.directory, self.X_name)
        y_path = './%s/test%s.csv' % (self.directory, self.y_name)
        return pd.read_csv(X_path, header=None), pd.read_csv(y_path, header=None)
class CrossValidationData:
    """
    CrossValidationData: Splits list of training dataframes
    into validation & training dataframes.

    Each of the stored subsets can play the role of the validation fold;
    the remaining subsets are concatenated into one training frame.
    """
    def __init__(self, X_dfs, y_dfs):
        assert(len(X_dfs) == len(y_dfs))
        self.X = X_dfs
        self.y = y_dfs
        self.num_subsets = len(X_dfs)
    def _split_training_validation(self, dfs, subset_num):
        # Hold out subset `subset_num`; merge every other subset into one frame.
        held_out = dfs[subset_num]
        remaining = [frame for idx, frame in enumerate(dfs) if idx != subset_num]
        merged = pd.concat(remaining, ignore_index=True)
        return [merged, held_out]
    def _get_training_validation_X(self, subset_num):
        assert(subset_num < self.num_subsets)
        return self._split_training_validation(self.X, subset_num)
    def _get_training_validation_y(self, subset_num):
        assert(subset_num < self.num_subsets)
        return self._split_training_validation(self.y, subset_num)
    def get_training_validation_X_y(self, subset_num):
        """Return [train_X, train_y, validation_X, validation_y] for one fold."""
        training_X, validation_X = self._get_training_validation_X(subset_num)
        training_y, validation_y = self._get_training_validation_y(subset_num)
        return [training_X, training_y, validation_X, validation_y]
    def get_all_X_y(self):
        """Concatenate every subset; returns [all_X, all_y] as two frames."""
        all_X = pd.concat(self.X, ignore_index=True)
        all_y = pd.concat(self.y, ignore_index=True)
        return [all_X, all_y]
def get_accuracy(true_labels, predicted_labels):
    """Return the fraction of positions where the two label sequences agree.

    Args:
        true_labels: sequence of ground-truth labels.
        predicted_labels: sequence of predicted labels, same length.

    Raises:
        ValueError: if the sequences differ in length, or are empty
            (the original `assert`/division would raise AssertionError or
            ZeroDivisionError, and the assert vanishes under `python -O`).
    """
    true_labels = list(true_labels)
    predicted_labels = list(predicted_labels)
    if len(true_labels) != len(predicted_labels):
        raise ValueError("label sequences must have the same length")
    if not true_labels:
        raise ValueError("cannot compute accuracy of empty label sequences")
    matches = sum(1 for y, y_hat in zip(true_labels, predicted_labels) if y == y_hat)
    return matches / len(true_labels)
class Model(ABC):
    """
    Model -> Abstract base class for the models we will implement, namely
    KNN and RidgeRegression.

    Subclasses must implement predict(); predict_df() maps it over a frame.
    """
    def __init__(self, train_X, train_y):
        self.train_X = train_X
        self.train_y = train_y
    @abstractmethod
    def predict(self, x):
        """Return the predicted label for a single feature vector x."""
        pass
    def predict_df(self, X_df):
        """Predict a label for every row of X_df; returns a pandas Series.

        raw=True hands each row to predict() as a numpy array rather than
        a Series, avoiding per-row Series construction.
        """
        return X_df.apply(self.predict, raw=True, axis=1)
# ---- Experiment configuration ----
DIR_NAME = 'knn-dataset'
X_NAME = 'Data'
Y_NAME = 'Labels'
NUM_SUBSETS = 10
POSITIVE_LABEL = 6
NEGATIVE_LABEL = 5
# Regularization strengths searched during cross validation (1000 .. 15000).
LAMBDAS = [1000 * k for k in range(1, 16)]
# ---- Load the dataset ----
DF = DataFetcher(DIR_NAME, X_NAME, Y_NAME)
test_X, test_y = DF.get_test_X_y()
test_X.head()
test_y.head()
training_X_dfs, training_y_dfs = DF.get_all_training_X_y()
CVData = CrossValidationData(training_X_dfs, training_y_dfs)
# Sanity-check the shapes of the first training/validation split.
first_split = CVData.get_training_validation_X_y(0)
print(f"Training X shape: {first_split[0].shape}")
print(f"Training y shape: {first_split[1].shape}")
print(f"Validation X shape: {first_split[2].shape}")
print(f"Validation y shape: {first_split[3].shape}")
class GaussianMixture(Model):
    """
    Gaussian class-conditional classifier with a shared covariance matrix.

    Fits, by maximum likelihood:
      pi   -- prior probability of the positive class
      mu_1 -- mean vector of the positive-class rows
      mu_2 -- mean vector of the negative-class rows
      cov  -- pooled (shared) covariance matrix
    and derives linear decision weights (w, w_0) so that
    P(positive | x) = sigmoid(w . x + w_0).
    """
    def __init__(self, train_X, train_y, positive_label, negative_label):
        """
        train_X: DataFrame of feature rows; train_y: single-column DataFrame
        of labels. Any label != positive_label is treated as negative.
        """
        super().__init__(train_X, train_y)
        self.positive_label = positive_label
        self.negative_label = negative_label
        # MLE estimate for the class prior pi.
        y_list = train_y.values.flatten().tolist()
        pi = sum(1 for y in y_list if y == positive_label) / len(y_list)
        # For mu_1 / mu_2, split train_X into positive / negative groups
        # and take the mean vector of each.
        positive_indices = [ind for ind, y in enumerate(y_list) if y == positive_label]
        negative_indices = [ind for ind, y in enumerate(y_list) if y != positive_label]
        n1 = len(positive_indices)
        n2 = len(negative_indices)
        N = n1 + n2
        positive_X_df = train_X.iloc[positive_indices]
        negative_X_df = train_X.iloc[negative_indices]
        mu_1 = positive_X_df.mean()
        mu_2 = negative_X_df.mean()
        # Center each group, then pool the two per-class MLE covariances,
        # weighted by class size. (The original also built full `positive_X`
        # / `negative_X` arrays that were never used; they are removed.)
        positive_X_dists = positive_X_df.to_numpy() - mu_1.to_numpy()
        negative_X_dists = negative_X_df.to_numpy() - mu_2.to_numpy()
        s1 = positive_X_dists.T.dot(positive_X_dists) / n1
        s2 = negative_X_dists.T.dot(negative_X_dists) / n2
        cov = (n1 / N) * s1 + (n2 / N) * s2
        cov_inv = np.linalg.inv(cov)
        # Closed-form weights of the posterior log-odds for equal-covariance
        # Gaussian class conditionals.
        w = cov_inv.dot(mu_1 - mu_2)
        w_0 = (-(1 / 2) * mu_1.T.dot(cov_inv).dot(mu_1)
               + (1 / 2) * mu_2.T.dot(cov_inv).dot(mu_2)
               + np.log(pi / (1 - pi)))
        self.pi = pi
        self.n1 = n1
        self.n2 = n2
        self.mu_1 = mu_1
        self.mu_2 = mu_2
        self.cov = cov
        self.w = w
        self.w_0 = w_0
    def predict_prob(self, x):
        """Posterior probability of the positive class for feature vector x."""
        logit_odds = self.w.dot(x) + self.w_0
        return 1 / (1 + np.exp(-logit_odds))
    def predict(self, x):
        """Hard label: positive_label when P(positive | x) > 0.5."""
        if self.predict_prob(x) > 0.5:
            return self.positive_label
        return self.negative_label
# Fit the Gaussian mixture classifier on the full training set, timing the fit.
train_X, train_y = CVData.get_all_X_y()
t1 = time.time()
model = GaussianMixture(train_X, train_y, POSITIVE_LABEL, NEGATIVE_LABEL)
t2 = time.time()
# Accuracy on the data the model was fit on.
predicted_train_y = model.predict_df(train_X)
train_accuracy = get_accuracy(train_y.values.flatten(), predicted_train_y.values.flatten())
# Accuracy on the held-out test split.
predicted_test_y = model.predict_df(test_X)
test_accuracy = get_accuracy(test_y.values.flatten(), predicted_test_y.values.flatten())
print(f"Train Accuracy: {train_accuracy:f}")
print(f"Test Accuracy: {test_accuracy:f}")
print(f"Training Time: {t2 - t1}")
# Inspect the learned parameters.
print(f"pi = {model.pi:f}")
print(f"mu_1 = {model.mu_1.to_numpy()}")
print(f"mu_2 = {model.mu_2.to_numpy()}")
print(f"diag(cov) = {np.diag(model.cov)}")
class LogisticRegression(Model):
    """
    L2-regularized logistic regression for two classes, fit with
    Newton-Raphson (iteratively reweighted least squares).

    Labels equal to positive_label are encoded as 1, everything else as 0.
    With train_intercept=True a column of ones is prepended to X so that
    w[0] acts as the intercept.
    """
    def __init__(self, train_X, train_y, positive_label, negative_label, lmbda=0, max_iters=10, threshold=0.01, train_intercept=True):
        """
        train_X: DataFrame of feature rows; train_y: single-column DataFrame
        of labels.
        lmbda: L2 penalty strength (applied to every weight, including the
            intercept, matching the original behavior).
        max_iters: cap on Newton iterations.
        threshold: stop early once the update step norm ||w_new - w|| falls
            below this value.
        """
        super().__init__(train_X, train_y)
        self.positive_label = positive_label
        self.negative_label = negative_label
        self.lmbda = lmbda
        self.max_iters = max_iters
        self.threshold = threshold
        self.train_intercept = train_intercept
        X = train_X.to_numpy()
        # Encode labels as 1 (positive) / 0 (negative).
        y = np.array([1 if y_i == positive_label else 0 for y_i in train_y.to_numpy()])
        # Prepend a column of ones so w[0] is the intercept.
        if train_intercept:
            X = np.insert(X, 0, np.ones(len(X)), axis=1)
        self.X = X
        self.y = y
        self.w = np.zeros(X.shape[1])
        self._train(self.X, self.w, self.y)
    def _sigmoid(self, odds):
        """Logistic function; already vectorized, accepts scalars or arrays.

        (The original wrapped this in np.apply_along_axis, which is a no-op
        for a 1-D input since axis-0 slicing of a vector yields the whole
        vector; calling it directly is equivalent and faster.)
        """
        return 1 / (1 + np.exp(-odds))
    def _gradient(self, X, w, y):
        """Gradient of the regularized negative log likelihood:
        X^T (sigma(Xw) - y) + lambda * w."""
        probabilities = self._sigmoid(X.dot(w))
        return X.T.dot(probabilities - y) + self.lmbda * w
    def _hessian(self, X, w):
        """Regularized Hessian X^T R X + lambda * I where
        R = diag(p_i * (1 - p_i)).

        Computed via broadcasting (X.T * r) @ X — elementwise identical to
        multiplying by the explicit m x m diagonal matrix the original
        built, but without the O(m^2) memory.
        """
        probabilities = self._sigmoid(X.dot(w))
        r = probabilities * (1 - probabilities)
        H = (X.T * r).dot(X)
        return H + self.lmbda * np.identity(len(H))
    def _train(self, X, w, y):
        """Newton-Raphson: w <- w - H^{-1} g, for at most max_iters
        iterations, stopping early when the step is below threshold."""
        for _ in range(self.max_iters):
            gradient = self._gradient(X, w, y)
            hessian_inv = np.linalg.inv(self._hessian(X, w))
            w_new = w - hessian_inv.dot(gradient)
            step_size = norm(w_new - w)
            w = w_new
            if step_size < self.threshold:
                break
        self.w = w
    def predict_prob(self, x):
        """P(positive | x) for a raw feature vector (intercept added here)."""
        if self.train_intercept:
            x = np.insert(x, 0, 1)
        return self._sigmoid(self.w.dot(x))
    def predict(self, x):
        """Hard label: positive_label when P(positive | x) > 0.5."""
        if self.predict_prob(x) > 0.5:
            return self.positive_label
        return self.negative_label
def perform_logreg_CV():
    """10-fold cross validation of LogisticRegression over LAMBDAS.

    Returns the mean validation accuracy for each lambda, in the same
    order as LAMBDAS.
    """
    average_accuracies = []
    for lmbda in tqdm_notebook(LAMBDAS):
        fold_accuracies = []
        for fold in range(NUM_SUBSETS):
            train_X, train_y, validation_X, validation_y = CVData.get_training_validation_X_y(fold)
            # Fit on the nine remaining subsets, score on the held-out one.
            hypothesis = LogisticRegression(train_X, train_y, POSITIVE_LABEL, NEGATIVE_LABEL, lmbda)
            predicted = hypothesis.predict_df(validation_X)
            fold_accuracy = get_accuracy(validation_y.values.flatten(), predicted.values.flatten())
            fold_accuracies.append(fold_accuracy)
        average_accuracies.append(np.mean(fold_accuracies))
    return average_accuracies
cv_accuracies = perform_logreg_CV()
# Index of the best-scoring lambda (last element of the accuracy argsort).
best_index = np.argsort(cv_accuracies)[-1]
optimal_lambda = LAMBDAS[best_index]
print(f"The optimal lambda is {optimal_lambda:f} with an average accuracy of {max(cv_accuracies):f}")
# Plot mean validation accuracy against lambda.
plt.plot(LAMBDAS, cv_accuracies)
plt.xlabel('Lambda')
plt.ylabel('Average Accuracy')
plt.title('Cross Validation')
plt.show()
# Refit logistic regression on all training data with the CV-selected lambda.
train_X, train_y = CVData.get_all_X_y()
t1 = time.time()
model = LogisticRegression(train_X, train_y, POSITIVE_LABEL, NEGATIVE_LABEL, optimal_lambda)
t2 = time.time()
# Accuracy on the training data.
predicted_train_y = model.predict_df(train_X)
train_accuracy = get_accuracy(train_y.values.flatten(), predicted_train_y.values.flatten())
# Accuracy on the held-out test split.
predicted_test_y = model.predict_df(test_X)
test_accuracy = get_accuracy(test_y.values.flatten(), predicted_test_y.values.flatten())
print(f"Train Accuracy: {train_accuracy:f}")
print(f"Test Accuracy: {test_accuracy:f}")
print(f"Training Time (s): {t2 - t1}")
# w[0] is the intercept; the remaining entries are per-feature weights.
print(f"w_0 = {model.w[0]}")
print(f"w = {model.w[1:]}")
The parameters of the Gaussian Mixture model are π, $μ_1$, $μ_2$, Σ.
For an input space in $R^n$, $μ_1$ and $μ_2$ are both n-dimensional, π is a scalar, and Σ is an n × n symmetric matrix, so only its upper triangle is free. The parameter counts are therefore: π contributes $1$, $μ_1$ and $μ_2$ contribute $n$ each, and Σ contributes $\frac{n(n+1)}{2}$.
The total number of parameters in the Gaussian Mixture model is $\frac{n^2}{2} + \frac{5n}{2} + 1$
For our data in the 64-dimensional digits dataset, this equals 2209 parameters.
The parameters of the Logistic Regression model is w and $w_0$ where w is n dimensional and $w_0$ is scalar.
Thus, the total number of parameters in the Logistic Regression model is n + 1.
For our data above, this equals 65 parameters.
The Gaussian Mixture model is heavily parameterized in comparison to the Logistic Regression model. Despite having far fewer parameters, the Logistic Regression model is the less restrictive of the two: it can represent the posterior arising from any class-conditional distribution $P(x|C_i)$ in the exponential family, whereas the Gaussian Mixture model commits to just one of these possibilities (the Gaussian conditional with shared covariance).
There are two areas where computation is involved: the complexity involved to learn the parameters, and the complexity involved to predict a value. We will consider both in our analysis as the computational complexity of learning, and the computational complexity of prediction.
Let's consider the complexity of learning as the asymptotic computational complexity of calculating the parameters as a function of the dimensionality of the data, n, and the number of training samples, m.
For the Gaussian Mixture model, we calculate π by finding the proportion of points that belongs to a class. This is an $O(m)$ operation. We calculate $μ_1$ by taking the average across each dimension for all the points in a class. This is an $O(mn)$ operation. WLOG, this also applies to $μ_2$. To calculate the covariance matrix, we subtract each mean from $X_{y=c_{1}}$ and $X_{y=c_{2}}$ respectively and take $X^TX$ for each, combining the two. Subtracting the mean can be done in $O(m)$ and calculating $X^TX$ can be done by taking a dot product of two m-dimensional vectors for each of the $n^2$ positions in the resulting matrix, an $O(mn^2)$ operation. So, the covariance can be calculated in $O(m) + O(mn^2) = O(mn^2)$.
Finally, combining the above complexities, $O(m) + O(mn) + O(mn^2) = O(mn^2)$ is the computational complexity of learning for the Gaussian Mixture model.
The computational complexity of prediction for the Gaussian Mixture model is simply to take the weights learned and dot them with the input vector and then apply a sigmoid transformation. This is done in $O(n)$.
Now, let's consider the complexity of learning for the logistic regression model. Let's assume that 10 iterations is sufficient for convergence.
In each iteration, we calculate the hessian, we calculate the gradient, we find the inverse of the hessian, dot the inverse by the gradient, and subtract the result from w to find the new w.
To calculate the hessian, we calculate $X^TRX + λI$ where R is a diagonal matrix with m non-zero elements that represent each training sample's sigmoid probability ($σ_i$) multiplied by its inverse probability ($1-σ_i$). R is calculated in $O(mn)$ since each of the m elements is calculated from the dot product of two n dimensional vectors. To calculate $X^TR$, each column is multiplied by a diagonal element of R. Thus, the computational complexity is $O(mn)$. Then to calculate $X^TRX$ from the previous step, it will take $O(mn^2)$. We know this from our results involving calculating the covariance matrix in the Gaussian Mixture model case. Finally, to add the regularization term is an $O(n)$ operation. In summary, calculating the hessian is an $O(mn) + O(mn^2) + O(n)= O(mn^2) $ operation.
To calculate the gradient, we take all the difference between sigmoid probabilities for each training sample and the y label and dot that with X and add the regularization term. This takes $O(mn) + O(mn) + O(n) = O(mn)$ time.
To calculate the inverse of the n x n hessian matrix, it takes $O(n^3)$. I'm using the result here for the time complexity of naive Gaussian elimination, documented here: http://mathforum.org/library/drmath/view/51908.html. I don't think the goal of this assignment is to discuss the time complexity of Gaussian elimination, so I will use this result freely.
The time complexity of learning for the Logistic Regression model is then $O(mn^2) + O(mn) + O(n^3) = O(mn^2) + O(n^3)$. If $m > n$, which should really be true if you want meaningful results, then the time complexity is simply $O(mn^2)$.
The time complexity of prediction for the Logistic Regression model is the same as the Gaussian result at $O(n)$.
In summary, the time complexity of learning for Gaussian Mixture models and Logistic Regression is $O(mn^2)$ and the time complexity of prediction for both is $O(n)$.
In wall clock time, the results for training both models are as follows:
Gaussian Mixture: 0.0048 seconds
Logistic Regression: 0.0176 seconds
These values may vary from the ones above, since I just took the ones I got on this specific run of the notebook.
The wall clock times make sense because the logistic regression does up to 10 more iterations, with each iteration in $O(mn^2)$. These finite iterations make no difference to the asymptotic complexity, but in wall clock time the difference is clear.
The Gaussian Mixture model gave us:
Train Accuracy: 0.884000 Test Accuracy: 0.890909
The Logistic Regression model gave us:
Train Accuracy: 0.894000 Test Accuracy: 0.900000
The logistic regression model fits the data slightly better.
K-Nearest Neighbours is able to express complex non-linear hypotheses as a function of the training data. In KNN, the input space is partitioned with various degrees of smoothing as we increase k. KNN's decision boundary can take on any form since it is non-parametric.
KNN is expected to do well if the training data is very representative of the true distribution. Although, if there are many training points, this may result in decreased speed when forming predictions since many more comparisons need to be made. KNN evaluates all features as equally important (has no concept of feature importance), so if this is not true, KNN may not be well suited to the problem. KNN is also not suited to the problem if there are linearities between features, since KNN does not capture these.
The Gaussian Mixture model is able to express hypotheses with a linear decision boundary between two classes of data that are conditionally normally distributed around $μ_1$ and $μ_2$ with the same covariance matrix of Σ.
The Gaussian Mixture model is expected to perform best if the conditional normality conditions are satisfied and both classes have the same covariance matrix. The model works best when the two classes can be linearly separated. The model is effective at capturing linearity between features. The model is also robust to irrelevant features since it can assign those features a weight close to zero if they are less important.
The Logistic Regression model is able to express hypotheses with a linear decision boundary between two classes of data that are conditionally distributed according to some unknown exponential-family distribution. This model is effective for data that is linearly separable and follows some exponential-family distribution. The model is effective at capturing linearities between features and is also robust to irrelevant features. Logistic Regression should be used when there is a strong belief that the features are dependent on the class, and the features of both classes are different from one another.
KNN gave us a test accuracy of 0.727 where both Gaussian Mixture and Logistic Regression gave us close to 0.9 test accuracy. The linear separators vastly outperformed KNN.
The reason for this can be understood by taking a look at what the data represents. Each feature represents the number of dark pixels found in a 4x4 patch. Each feature is spatially correlated to one another. For the digits 5 and 6, we would expect that some features in tandem could define a 5, where the presence of other features in tandem could represent a 6. We expect the features to have linearities between each other. KNN is unable to capture these linearities, where the linear separators are able to create a linear decision boundary defined between features.
Furthermore, we would expect some features to be more important than other features. For example, if we superimpose a 5 on top of a 6, we can tell that most of the shape may be identical except for the bottom left of the 6 and the 5. This may lead these specific features to be more important than the rest. Generally, we would also expect the features along the perimeter of the image to be less important.
We can verify this by taking a look at the weights we learned. In the logistic regression model, the 8th weight represents the weight for the feature in the top right corner of the image. This value is -9.17977578e-06. This is the lowest magnitude value from all the features. The linear separators are able to learn that this feature is less important where the KNN model gives this feature the same importance as the rest.
These factors contribute to the linear seperators outperforming KNN on this dataset.
Consider a set of data D that is linearly separable.
In a logistic regression, we minimize the negative log likelihood to form a linear decision boundary. For the data D, the weights, $w$, that would minimize the negative log likelihood form a decision boundary, $w^Tx = 0$, that linearly separates the data D perfectly, and whose weights tend to infinity.
The reason for this is as follows: Imagine that we have weights w for a linear boundary that accomplishes perfect classification on D. We can minimize the negative log likelihood further by multiplying all the weights by some c. This will keep the boundary in the same place but increase the magnitude of $w^Tx$ for any points. Thus, the sigmoid function will tend to a step function or threshold function with each iteration. With each iteration, we expect the magnitude of weights to increase.
Using this knowledge, we can conduct an experiment to see if the training data is linearly separable. We should train a logistic regression classifier on the training data and check to see if the model achieves 100% accuracy on the train set. If it does, we can confirm that the data is linearly separable. A training accuracy of 100% is enough to prove the data is linearly separable since the linear decision boundary classifies each point perfectly. For interest, I would also like to take a look at the weights themselves as I increase the number of iterations.
A natural question is what should our regularization parameter be?
For a linearly separable dataset, regularization should not matter since the magnitude of the weights only affects how the sigmoid function looks and should not alter the linear boundary from its optimum.
However, we want to avoid a hessian matrix with no inverse so we will use a small value for lambda in the logistic regression.
# Separability probe: raise the Newton iteration cap and watch whether the
# train accuracy reaches 100% and the weight norm keeps growing (both are
# expected if — and only if — the training data is linearly separable).
train_X, train_y = CVData.get_all_X_y()
NUM_ITERS = [10, 20, 30, 40]
weight_norms = []
train_accuracies = []
for num in tqdm_notebook(NUM_ITERS):
    # Small lambda (10) keeps the hessian invertible without materially
    # moving the decision boundary.
    model = LogisticRegression(train_X, train_y, POSITIVE_LABEL, NEGATIVE_LABEL, 10, max_iters=num, threshold=0.001)
    predicted_train_y = model.predict_df(train_X)
    train_accuracy = get_accuracy(train_y.values.flatten(), predicted_train_y.values.flatten())
    train_accuracies.append(train_accuracy)
    weight_norms.append(norm(model.w))
plt.plot(NUM_ITERS, train_accuracies)
plt.xlabel('Number of Iterations')
plt.ylabel('Accuracy')
# Fixed: this plots TRAIN accuracies but was titled 'Test Set Accuracy'.
plt.title('Train Set Accuracy')
plt.show()
plt.plot(NUM_ITERS, weight_norms)
plt.xlabel('Number of Iterations')
plt.ylabel('Magnitude')
plt.title('Weight Norm')
plt.show()
From the above graphs, it is clear that the training set is not linearly separable.
The accuracy does not reach 100% and the weights do not increase as we would expect them to; instead, the logistic regression converges to certain weights, and the train accuracy stays below 0.9.