"""
Methods for performing matrix factorization.  See
graphlab.matrix_factorization.create for additional documentation.
"""

import graphlab.connect as _mt
import graphlab as _graphlab
from graphlab.toolkits.recommender.recommender import RecommenderModel
import logging

# Candidate values per tunable option: `n_factors` spans 2 through 24 and
# `regularization` spans powers of ten from 1e-3 to 1e3.
# NOTE(review): presumably the default grid for hyperparameter search --
# confirm against the callers that consume this constant.
DEFAULT_HYPER_PARAMETER_RANGE = dict(
    n_factors=range(2, 25),
    regularization=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
)

class MatrixFactorizationModel(RecommenderModel):
    r"""
    A Matrix Factorization recommender model learns latent factors for each
    user and item and uses them to make rating predictions.

    MatrixFactorizationModel [Koren_et_al]_ contains a number of options that
    tailor to a variety of datasets and evaluation metrics, making this one of
    the most powerful models in the GraphLab Create recommender toolkit. In
    terms of modeling capabilities, it is less powerful than
    :class:`~graphlab.recommender.FactorizationModel` but is often faster to
    train, making it a good trade-off between power and ease of use.

    **Side information**

    Side features may be provided via the `user_data` and `item_data` options
    when the model is created.

    Additionally, observation-specific information, such as the time of day when
    the user rated the item, can also be included. Any column in the
    `observation_data` SFrame that is not the user id, item id, or target is
    treated as an observation side feature. The same side feature columns must
    be present when calling :meth:`predict`.

    Side features may be numeric or categorical. User ids and item ids are
    treated as categorical variables. For the additional side features, the type
    of the :class:`~graphlab.SFrame` column determines how it's handled: strings
    are treated as categorical variables and integers and floats are treated as
    numeric variables. Dictionaries and numeric arrays are also supported.

    **Optimizing for ranking performance**

    By default, MatrixFactorizationModel optimizes for accuracy of target
    rating prediction. However, a modified version can be created to optimize
    the precision-recall performance of recommendations.

    When `ranking_regularization` is larger than zero, the model samples a small
    set of unobserved user-item pairs and attempts to drive their rating
    predictions below the value specified with `unobserved_rating_value`.
    This has the effect of improving the precision-recall performance of
    recommended items.

    **Creating a MatrixFactorizationModel**

    This model can be created using
    :func:`graphlab.recommender.create(..., method='matrix_factorization') <graphlab.recommender.create>`.
    Do NOT instantiate this model class directly.

    When creating a MatrixFactorizationModel, the following options may be
    specified:

    +---------------------------+---------+------------------------------------+
    |        Options            | Default |   Description                      |
    +===========================+=========+====================================+
    | n_factors                 | 8       | Number of latent factors.          |
    +---------------------------+---------+------------------------------------+
    | regularization            | 0.0001  | Regularization for interaction     |
    |                           |         | terms. (L2)                        |
    +---------------------------+---------+------------------------------------+
    | linear_regularization     | 0.0     | Regularization for linear term     |
    +---------------------------+---------+------------------------------------+
    | nmf                       | False   | Use nonnegative matrix             |
    |                           |         | factorization, which forces the    |
    |                           |         | factors to be nonnegative. Disables|
    |                           |         | linear and intercept terms.        |
    +---------------------------+---------+------------------------------------+
    | ranking_regularization    | 0       | If nonzero, then penalize the      |
    |                           |         | predicted value of user-item pairs |
    |                           |         | not in the training set. Larger    |
    |                           |         | values increase this penalization. |
    |                           |         | Suggested values: 0, 0.1, 0.5, 1.  |
    |                           |         | (Note: not compatible with         |
    |                           |         | `binary_targets=True`.)            |
    +---------------------------+---------+------------------------------------+
    | unobserved_rating_value   | None    | If ranking_regularization          |
    |                           |         | is greater than 0, then penalize   |
    |                           |         | unobserved items with a larger     |
    |                           |         | predicted score than this value.   |
    |                           |         | If None, the estimated 5% quantile |
    |                           |         | is used (mean - 1.96*std dev).     |
    +---------------------------+---------+------------------------------------+
    | binary_targets            | False   | Assume the target column is either |
    |                           |         | 0 or 1. If True, use logistic      |
    |                           |         | loss to fit the model.             |
    |                           |         | (Note: not compatible with         |
    |                           |         | `ranking_regularization > 0`.)     |
    +---------------------------+---------+------------------------------------+

    In addition, the following parameters govern model training:

    +--------------------------+---------+------------------------------------+
    |        Options           | Default |   Description                      |
    +==========================+=========+====================================+
    | max_iterations           | 50      | The training algorithm will make   |
    |                          |         | at most this many iterations       |
    |                          |         | through the observation data.      |
    +--------------------------+---------+------------------------------------+
    | sgd_step_size            | 0       | Step size for stochastic gradient  |
    |                          |         | descent. Smaller values generally  |
    |                          |         | lead to more accurate models that  |
    |                          |         | take more time to train. The       |
    |                          |         | default setting of 0 means that    |
    |                          |         | the step size is chosen by trying  |
    |                          |         | several options on a small subset  |
    |                          |         | of the data.                       |
    +--------------------------+---------+------------------------------------+
    | random_seed              | 0       | The random seed used to choose the |
    |                          |         | initial starting point for model   |
    |                          |         | training.                          |
    +--------------------------+---------+------------------------------------+

    **Model parameters**

    Trained model parameters may be accessed using
    `m.get('coefficients')` or equivalently `m['coefficients']`, where `m`
    is a MatrixFactorizationModel.

    See Also
    --------
    FactorizationModel, LinearRegressionModel

    Notes
    -----
    **Model Definition**

    Like :class:`FactorizationModel <graphlab.recommender.FactorizationModel>`
    and :class:`LinearRegressionModel <graphlab.recommender.LinearRegressionModel>`,
    `MatrixFactorizationModel` trains a model capable of predicting a score for
    each possible combination of users and items.  The internal coefficients of
    the model are learned from known scores of users and items.
    Recommendations are then based on these scores.

    In the two factorization models, users and items are represented by weights
    and factors.  These model coefficients are learned during training.
    Roughly speaking, the weights, or bias terms, account for a user or item's
    bias towards higher or lower ratings.  For example, an item that is
    consistently rated highly would have a higher weight coefficient associated
    with it.  Similarly, an item that consistently receives below average
    ratings would have a lower weight coefficient to account for this bias.

    The factor terms model interactions between users and items.  For example,
    if a user tends to love romance movies and hate action movies, the factor
    terms attempt to capture that, causing the model to predict lower scores
    for action movies and higher scores for romance movies.  Learning good
    weights and factors is controlled by several options outlined below.

    More formally, the predicted score for user :math:`i` on item :math:`j` is
    given by

    .. math::
       \operatorname{score}(i, j) =
          \mu + w_i + w_j
          + \mathbf{a}^T \mathbf{x}_i + \mathbf{b}^T \mathbf{y}_j
          + {\mathbf u}_i^T {\mathbf v}_j,

    where :math:`\mu` is a global bias term, :math:`w_i` is the weight term for
    user :math:`i`, :math:`w_j` is the weight term for item :math:`j`,
    :math:`\mathbf{x}_i` and :math:`\mathbf{y}_j` are respectively the user and
    item side feature vectors, and :math:`\mathbf{a}` and :math:`\mathbf{b}`
    are respectively the weight vectors for those side features.
    The latent factors, which are vectors of length ``n_factors``, are given
    by :math:`{\mathbf u}_i` and :math:`{\mathbf v}_j`.

    When `binary_targets=True`, the above score is passed through a logistic
    function:

    .. math::

       \operatorname{score}(i, j) = 1 / (1 + \exp(-z)),

    where :math:`z` is the original linear score.

    **Training the model**

    Formally, the objective function we are optimizing for is:

    .. math::
      \min_{\mathbf{w}, \mathbf{a}, \mathbf{b}, \mathbf{V}, \mathbf{U}}
      \frac{1}{|\mathcal{D}|} \sum_{(i,j,r_{ij}) \in \mathcal{D}}
      \mathcal{L}(\operatorname{score}(i, j), r_{ij})
      + \lambda_1 (\lVert {\mathbf w} \rVert^2_2 + \lVert {\mathbf a} \rVert^2_2 + \lVert {\mathbf b} \rVert^2_2 )
      + \lambda_2 \left(\lVert {\mathbf U} \rVert^2_2
                           + \lVert {\mathbf V} \rVert^2_2 \right)

    where :math:`\mathcal{D}` is the observation dataset, :math:`r_{ij}` is the
    rating that user :math:`i` gave to item :math:`j`,
    :math:`{\mathbf U} = ({\mathbf u}_1, {\mathbf u}_2, ...)` denotes the user
    latent factors and :math:`{\mathbf V} = ({\mathbf v}_1, {\mathbf v}_2, ...)`
    denotes the item latent factors.  The loss function
    :math:`\mathcal{L}(\hat{y}, y)` is :math:`(\hat{y} - y)^2` by default.
    :math:`\lambda_1` denotes the `linear_regularization` parameter and
    :math:`\lambda_2` the `regularization` parameter.

    When ``ranking_regularization`` is nonzero, then the equation
    above gets an additional term.  Let :math:`\lambda_{\text{rr}}` represent
    the value of `ranking_regularization`, and let
    :math:`v_{\text{ur}}` represent `unobserved_rating_value`.  Then the
    objective we attempt to minimize is:

    .. math::
      \min_{\mathbf{w}, \mathbf{a}, \mathbf{b}, \mathbf{V}, \mathbf{U}}
      \frac{1}{|\mathcal{D}|} \sum_{(i,j,r_{ij}) \in \mathcal{D}}
      \mathcal{L}(\operatorname{score}(i, j), r_{ij})
      + \lambda_1 (\lVert {\mathbf w} \rVert^2_2 + \lVert {\mathbf a} \rVert^2_2 + \lVert {\mathbf b} \rVert^2_2 )
      + \lambda_2 \left(\lVert {\mathbf U} \rVert^2_2
                           + \lVert {\mathbf V} \rVert^2_2 \right) \\
      + \frac{\lambda_{\text{rr}}}{\text{const} * |\mathcal{U}|}
      \sum_{(i,j) \in \mathcal{U}}
      \mathcal{L}\left(\operatorname{score}(i, j), v_{\text{ur}}\right),

    where :math:`\mathcal{U}` is a sample of unobserved user-item pairs. Note
    that `ranking_regularization` is incompatible with `binary_targets=True`
    in the current implementation.

    The model is trained using Stochastic Gradient Descent [sgd]_ with additional
    tricks [Bottou]_ to improve convergence. The optimization is done in parallel
    over multiple threads. This procedure is inherently random, so different
    calls to `create()` may return slightly different models, even with the
    same `random_seed`.

    References
    ----------
    .. [Koren_et_al] Koren, Yehuda, Robert Bell and Chris Volinsky. `"Matrix
        Factorization Techniques for Recommender Systems." <http://www2.research
        .att.com/~volinsky/papers/ieeecomputer.pdf?utm_source=twitterfeed&utm_me
        dium=twitter>`_ Computer Volume: 42, Issue: 8 (2009): 30-37.

    .. [sgd] `Wikipedia - Stochastic gradient descent
        <http://en.wikipedia.org/wiki/Stochastic_gradient_descent>`_

    .. [Bottou] Leon Bottou, `"Stochastic Gradient Tricks,"
        <http://research.microsoft.com/apps/pubs/default.aspx?id=192769>`_
        Neural Networks, Tricks of the Trade, Reloaded, 430--445, Lecture Notes
        in Computer Science (LNCS 7700), Springer, 2012.

    Examples
    --------
    **Basic usage**

    >>> sf = graphlab.SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"],
    ...                       'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"],
    ...                       'rating': [1, 3, 2, 5, 4, 1, 4, 3]})
    >>> m1 = graphlab.recommender.create(sf, target='rating')

    When a target column is present, :func:`~graphlab.recommender.create`
    defaults to creating a MatrixFactorizationModel.

    **Including side features**

    >>> user_info = graphlab.SFrame({'user_id': ["0", "1", "2"],
    ...                              'name': ["Alice", "Bob", "Charlie"],
    ...                              'numeric_feature': [0.1, 12, 22]})
    >>> item_info = graphlab.SFrame({'item_id': ["a", "b", "c", "d"],
    ...                              'name': ["item1", "item2", "item3", "item4"],
    ...                              'dict_feature': [{'a' : 23}, {'a' : 13},
    ...                                               {'b' : 1},
    ...                                               {'a' : 23, 'b' : 32}]})
    >>> m2 = graphlab.recommender.create(sf, target='rating',
    ...                                  user_data=user_info,
    ...                                  item_data=item_info,
    ...                                  method = 'matrix_factorization')

    **Optimizing for ranking performance**

    Create a model that pushes predicted ratings of unobserved user-item
    pairs toward 1 or below.

    >>> m3 = graphlab.recommender.create(sf, target='rating',
    ...                                  ranking_regularization = 0.1,
    ...                                  unobserved_rating_value = 1)
    """

    def __init__(self, model_proxy):
        '''__init__(self)

        Store ``model_proxy`` on the instance as ``__proxy__``.

        Users are told (see the class documentation) not to instantiate this
        class directly; use :func:`graphlab.recommender.create` instead.

        Parameters
        ----------
        model_proxy : object
            Handle to the underlying model.
            NOTE(review): presumably the backend proxy object produced by the
            toolkit -- its exact type is not visible in this file.
        '''
        self.__proxy__ = model_proxy

    def _get_wrapper(self):
        """
        Return a factory that wraps a model proxy in this class.

        Returns
        -------
        out : callable
            A one-argument callable, ``f(model_proxy)``, that returns a new
            :class:`MatrixFactorizationModel` built around ``model_proxy``.
        """
        return lambda model_proxy: MatrixFactorizationModel(model_proxy)
