defkNN(test, train, target, k=1, normalize=False): """ This is the k nearest neighbour algorithm :param test: N*D numpy array, each row record an example, each column represent a feature field :param train: N*D numpy array, each row record an example, each column represent a feature field :param target: 1*D numpy array, each element is the target value :param k: int, the best k near neighbour :param normalize: bool, determine whether to normalize `test` and `train` or not :return: 1*D numpy array, the predict value """ y_hat = np.zeros(test.shape[0]) if normalize: test = test / norm(test, axis=1)[:, np.newaxis] train = train / norm(train, axis=1)[:, np.newaxis] for i in range(test.shape[0]): #tmp = train - test[i] dist = distance(train, test[i], method="cosine") idx = dist.argsort()[:k] # given the weight of each neighbour w = np.array([1]) / dist[idx] w /= w.sum() y_hat[i] = (target[idx] * w).sum() return y_hat
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
defdistance(x, y, method="euclidean", axis=1): """ compute the distance between `x` and `y` by `method` :param x: 1*D numpy array or array like :param y: 1*D numpy array or array like :param method: the method using to compute the distance between `x` and `y` the method can only be "euclidean" or "cosine" :return: a float number represent the distance between `x` and `y` """ x = np.asarray(x) y = np.asarray(y) if method == "cosine": tmp = (x * y).sum(axis=axis) / (np.linalg.norm(x, axis=axis) * np.linalg.norm(y)) return np.array([1]) / (tmp + 1e-3) elif method == "euclidean": return np.linalg.norm(x - y, axis=axis) else: raiseNotImplemented("Not implement method" + method)