mit-netflix-collab-filtering/em.py at main · vanzeot/mit-netflix-collab-filtering · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""Mixture model for matrix completion"""
from typing import Tuple
import numpy as np
from scipy.special import logsumexp
from common import GaussianMixture


def estep(X: np.ndarray, mixture: GaussianMixture) -> Tuple[np.ndarray, float]:
    """E-step: Softly assigns each datapoint to a gaussian component

    Args:
        X: (n, d) array holding the data, with incomplete entries (set to 0)
        mixture: the current gaussian mixture

    Returns:
        np.ndarray: (n, K) array holding the soft counts
            for all components for all examples
        float: log-likelihood of the assignment

    """
    n = X.shape[0]
    K = mixture.mu.shape[0]

    # Initialization
    log_posterior = np.float64(np.zeros((n, K)))
    sum_log_likelihood = 0

    # For each point i and cluster j:
    for i in range(n):
        # Boolean "is different from zero" for i'th row:
        is_not_zero = (X[i, :] != 0)
        for j in range(K):
            # Simplifying variables:
            mu = mixture.mu[j, is_not_zero]
            var = mixture.var[j]
            X_observed = X[i, is_not_zero]
            d = len(X_observed)
            # Log Gaussian:
            # log_likelihood = (-d/2) * np.log(2 * np.pi * var) - (1/2) * ((X_observed - mu) ** 2).sum() / var
            log_likelihood = (-d / 2) * np.log(2 * np.pi * var) - (1 / 2) * (np.sum((X_observed - mu) ** 2)) / var
            log_posterior[i, j] = np.log(mixture.p[j] + 1e-16) + log_likelihood
        # Sum over all K for for each i
        total = logsumexp(log_posterior[i, :])
        log_posterior[i, :] = log_posterior[i, :] - total
        sum_log_likelihood += total
    # Exponentiate the logarithm to take the posterior
    posterior = np.exp(log_posterior)
    return posterior, sum_log_likelihood


def mstep(X: np.ndarray, post: np.ndarray, mixture: GaussianMixture,
          min_variance: float = .25) -> GaussianMixture:
    """M-step: Updates the gaussian mixture by maximizing the log-likelihood
    of the weighted dataset

    Args:
        X: (n, d) array holding the data, with incomplete entries (set to 0)
        post: (n, K) array holding the soft counts
            for all components for all examples
        mixture: the current gaussian mixture
        min_variance: the minimum variance for each gaussian

    Returns:
        GaussianMixture: the new gaussian mixture
    """
    raise NotImplementedError


def run(X: np.ndarray, mixture: GaussianMixture,
        post: np.ndarray) -> Tuple[GaussianMixture, np.ndarray, float]:
    """Runs the mixture model

    Args:
        X: (n, d) array holding the data
        post: (n, K) array holding the soft counts
            for all components for all examples

    Returns:
        GaussianMixture: the new gaussian mixture
        np.ndarray: (n, K) array holding the soft counts
            for all components for all examples
        float: log-likelihood of the current assignment
    """
    log_likelihood = 0
    previous_log_likelihood = 0
    epsilon = 1e-6

    while log_likelihood - previous_log_likelihood >= epsilon * np.abs(log_likelihood) or previous_log_likelihood == 0:
        # Update previous log:
        previous_log_likelihood = log_likelihood
        # Take steps E and M:
        post, log_likelihood = estep(X, mixture)
        mixture = mstep(X, post)

    return mixture, post, log_likelihood


def fill_matrix(X: np.ndarray, mixture: GaussianMixture) -> np.ndarray:
    """Fills an incomplete matrix according to a mixture model

    Args:
        X: (n, d) array of incomplete data (incomplete entries =0)
        mixture: a mixture of gaussians

    Returns
        np.ndarray: a (n, d) array with completed data
    """
    raise NotImplementedError