"""
Methods for performing linear regression.  See
graphlab.linear_regression.create for additional documentation.
"""

import graphlab.connect as _mt
import graphlab as _graphlab
from graphlab.toolkits.recommender.recommender import RecommenderModel
from graphlab.data_structures.sframe import SFrame as _SFrame
from graphlab.deps import pandas as _pandas, HAS_PANDAS as _HAS_PANDAS
import logging

class LinearRegressionModel(RecommenderModel):
    r"""
    The Linear Regression recommender model approximates target rating values
    using a linear function of the user offset, item offset, and side features.

    Compared to
    :class:`~graphlab.recommender.MatrixFactorizationModel` and
    :class:`~graphlab.recommender.FactorizationModel`, LinearRegressionModel
    is less powerful, but easier and faster to train due to its simplicity.

    **Side information**

    Side features may be provided via the `user_data` and `item_data` options
    when the model is created.

    Additionally, observation-specific information, such as the time of day when
    the user rated the item, can also be included. Any column in the
    `observation_data` SFrame that is not the user id, item id, or target is
    treated as an observation side feature. The same side feature columns must
    be present when calling :meth:`predict`.

    Side features may be numeric or categorical. User ids and item ids are
    treated as categorical variables. For the additional side features, the type
    of the :class:`~graphlab.SFrame` column determines how it's handled: strings
    are treated as categorical variables and integers and floats are treated as
    numeric variables. Dictionaries and numeric arrays are also supported.

    **Creating a LinearRegressionModel**

    This model can be created using
    :func:`graphlab.recommender.create(..., method='linear_model') <graphlab.recommender.create>`.
    Do NOT instantiate this model class directly.

    When creating a LinearRegressionModel, the following options may be
    specified:

    +--------------------------+---------+------------------------------------+
    |        Options           | Default |   Description                      |
    +==========================+=========+====================================+
    | regularization           | 0.0001  | L2 regularization for user and     |
    |                          |         | item variables and side features.  |
    +--------------------------+---------+------------------------------------+
    | binary_targets           | False   | Assume the target column is either |
    |                          |         | 0 or 1. If True, use logistic      |
    |                          |         | regression.                        |
    +--------------------------+---------+------------------------------------+
    | max_iterations           | 50      | The training algorithm will make   |
    |                          |         | at most this many iterations       |
    |                          |         | through the observation data.      |
    +--------------------------+---------+------------------------------------+
    | sgd_step_size            | 0       | Step size for stochastic gradient  |
    |                          |         | descent. Smaller values generally  |
    |                          |         | lead to more accurate models that  |
    |                          |         | take more time to train. The       |
    |                          |         | default setting of 0 means that    |
    |                          |         | the step size is chosen by trying  |
    |                          |         | several options on a small subset  |
    |                          |         | of the data.                       |
    +--------------------------+---------+------------------------------------+
    | random_seed              | 0       | The random seed used to choose the |
    |                          |         | initial starting point for model   |
    |                          |         | training.                          |
    +--------------------------+---------+------------------------------------+

    **Model parameters**

    Trained model parameters may be accessed using
    `m.get('coefficients')` or equivalently `m['coefficients']`, where `m`
    is a LinearRegressionModel.

    See Also
    --------
    MatrixFactorizationModel
    FactorizationModel

    Examples
    --------
    Create a basic LinearRegressionModel:

    >>> sf = graphlab.SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"],
    ...                       'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"],
    ...                       'rating': [1, 3, 2, 5, 4, 1, 4, 3]})
    >>> m = graphlab.recommender.create(sf, target="rating",
    ...                                 method="linear_regression",
    ...                                 regularization=0.01)

    With this model object, one can make recommendations for the unique users in
    the training data ``sf``:

    >>> recs = m.recommend()

    The model can be saved to disk and loaded back in later with
    :meth:`graphlab.load_model`:

    >>> m.save("./my_linear_regression_model")

    Notes
    -----
    **Model definition**

    Formally, the predicted score for user :math:`i` on item :math:`j` is
    given by

    .. math::

       \operatorname{score}(i, j) =
        \mu + w_i + w_j + \mathbf{a}^T \mathbf{x}_i + \mathbf{b}^T \mathbf{y}_j,

    where :math:`\mu` is a global bias term, :math:`w_i` is the weight term for
    user :math:`i`, :math:`w_j` is the weight term for item :math:`j`,
    :math:`\mathbf{x}_i` and :math:`\mathbf{y}_j` are respectively the user and
    item side feature vectors, and :math:`\mathbf{a}` and :math:`\mathbf{b}`
    are respectively the weight vectors for those side features.

    When `binary_targets=True`, the above score is passed through a logistic
    function:

    .. math::

       \operatorname{score}(i, j) = 1 / (1 + \exp(-z)),

    where :math:`z` is the original linear score.

    **Training the model**

    The training procedure optimizes the following loss function:

    .. math::

        \min_{ \mathbf{w}, \mathbf{a}, \mathbf{b} }
        \frac{1}{|\mathcal{D}|} \sum_{(i,j,r_{ij}) \in \mathcal{D}}
        \mathcal{L}(\operatorname{score}(i, j), r_{ij})
        + \lambda ( || {\mathbf w} ||^2_2 + || {\mathbf a} ||^2_2 + || {\mathbf b} ||^2_2),

    where :math:`\mathcal{D}` is the observation dataset, :math:`r_{ij}` is the
    rating that user :math:`i` gave to item :math:`j`, :math:`\lambda` is the
    `regularization` parameter, and :math:`\mathcal{L}(\hat{y}, y)` is the
    squared loss function :math:`(\hat{y} - y)^2`.

    The model is trained using Stochastic Gradient Descent [sgd]_ with additional
    tricks [Bottou]_ to improve convergence. The optimization is done in parallel
    over multiple threads. This procedure is inherently random, so different
    calls to `create()` may return slightly different models, even with the
    same `random_seed`.
    """

    # NOTE(review): the class docstring names method='linear_model' in the
    # "Creating" section but method="linear_regression" in the example --
    # confirm which value graphlab.recommender.create accepts and align them.

    def __init__(self, model_proxy):
        """__init__(self)

        Keep a reference to the given model proxy on the instance.

        Parameters
        ----------
        model_proxy : object
            Proxy object for the underlying model; stored as
            ``self.__proxy__``.
        """
        # NOTE(review): the first docstring line is kept verbatim; it looks
        # like a Sphinx autodoc signature override that hides `model_proxy`
        # from the rendered docs -- confirm before changing it.
        self.__proxy__ = model_proxy

    def _get_wrapper(self):
        """
        Return a callable that wraps a model proxy in this class.

        Returns
        -------
        out : function
            A one-argument function taking ``model_proxy`` and returning a
            new ``LinearRegressionModel`` built from it.
        """
        def model_wrapper(model_proxy):
            wrapped_model = LinearRegressionModel(model_proxy)
            return wrapped_model

        return model_wrapper
