"""
Methods for creating and using a linear regression model.
"""
import graphlab.connect as _mt
import graphlab as _graphlab
from graphlab.toolkits.regression.regression import RegressionModel
from graphlab.data_structures.sframe import SFrame as _SFrame
from pandas import DataFrame as _DataFrame
from graphlab import vowpal_wabbit as _vw

# Fallback solver settings. ``create`` reads 'max_iterations' and 'step_size'
# from this mapping when the caller's ``solver_options`` omit them and the
# 'vw' solver is selected.
DEFAULT_SOLVER_OPTIONS = dict(
    convergence_threshold=1e-2,
    step_size=1.0,
    lbfgs_memory_level=3,
    mini_batch_size=1,
    max_iterations=10)


def create(dataset, target, features=None, L2_penalty=1e-2, L1_penalty=0.0,
           solver='auto', solver_options=None, verbose=True):
    """
    Create a :class:`~graphlab.linear_regression.LinearRegressionModel` to
    predict a scalar target variable as a linear function of one or more
    features. In addition to standard numeric and categorical types, features
    can also be extracted automatically from list- or dictionary-type SFrame
    columns.

    The linear regression module can be used for ridge regression, Lasso, and
    elastic net regression (see References for more detail on these methods). By
    default, this model has an L2 regularization weight of 0.01.

    Parameters
    ----------
    dataset : pandas.DataFrame/SFrame
        The dataset to use for training the model.

    target : string
        Name of the column containing the target variable.

    features : list[string], optional
        Names of the columns containing features. 'None' (the default) indicates
        that all columns except the target variable should be used as features.
        Any iterable of strings is accepted; it is copied into a list before
        use. Each feature column can be of one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Categorical*: values of type string.

        - *List*: list of numeric (integer or float) values. Each list element
          is treated as a separate feature in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key of a dictionary is treated as a categorical variable in the
          model.

    L2_penalty : float, optional
        Weight on the L2-regularizer of the model. The larger this weight, the
        more the model coefficients shrink toward 0. This introduces bias into
        the model but decreases variance, potentially leading to better
        predictions. The default value is 0.01; setting this parameter to 0
        corresponds to unregularized linear regression. See the ridge
        regression reference for more detail.

    L1_penalty : float, optional
        Weight on L1 regularization of the model. Like the L2 penalty, the
        higher the L1 penalty, the more the estimated coefficients shrink toward
        0. The L1 penalty, however, completely zeros out sufficiently small
        coefficients, automatically indicating features that are not useful for
        the model. The default weight of 0 prevents any features from being
        discarded. See the LASSO regression reference for more detail.

    solver : string, optional
        Solver to use for training the model. The name is matched
        case-insensitively. See the references for more detail on each solver.

        - *auto (default)*: automatically chooses the best solver for the data
          and model parameters.
        - *newton*: Newton-Raphson
        - *lbfgs*: limited memory BFGS
        - *gd*: gradient descent
        - *vw*: Vowpal Wabbit (``'vowpal-wabbit'`` is accepted as an alias)
        - *fista*: accelerated gradient descent
        - *sgd*: stochastic gradient descent

    solver_options : dict, optional
        Solver options. Keys are matched case-insensitively and the caller's
        dictionary is not modified. The options and their default values are
        as follows.

        +-----------------------+---------+------------------------------------------+
        |      Option           | Default |      Description                         |
        +=======================+=========+==========================================+
        | auto_tuning           |    True | Toggle step-size auto-tuner (sgd only)   |
        +-----------------------+---------+------------------------------------------+
        | convergence_threshold |    1e-2 | Desired training accuracy                |
        +-----------------------+---------+------------------------------------------+
        | lbfgs_memory_level    |     3   | Number of updates to store (lbfgs only)  |
        +-----------------------+---------+------------------------------------------+
        | max_iterations        |     10  | Max number of solver iterations          |
        +-----------------------+---------+------------------------------------------+
        | mini_batch_size       |     1   | Number of mini-batch examples (sgd only) |
        +-----------------------+---------+------------------------------------------+
        | step_size             |     1.0 | Initial solver step size                 |
        +-----------------------+---------+------------------------------------------+

    verbose : bool, optional (default True)
        If True, print progress updates.

    Returns
    -------
    out : LinearRegressionModel.
        A trained model of type
        :class:`~graphlab.linear_regression.LinearRegressionModel`.

    Raises
    ------
    TypeError
        If ``dataset`` is neither an SFrame nor a pandas DataFrame, if
        ``features`` is not an iterable of names, or if any feature name is
        not of type ``str``.

    Notes
    -----
    - Categorical variables are encoded by creating dummy variables. For a
      variable with :math:`K` categories, the encoding creates :math:`K-1` dummy
      variables, while the first category encountered in the data is used as the
      baseline.

    - For prediction and evaluation of linear regression models with
      categorical variables, test datasets cannot contain categories that were
      not present during train time.

    - For prediction and evaluation of linear regression models with sparse
      dictionary inputs, new keys/columns that were not seen during training
      are silently ignored.

    - L2 and L1 regularization typically work best if the features are
      standardized before model training by subtracting the mean of column and
      dividing by the standard deviation. The regression model does *not* do
      this automatically; use SFrame and SArray methods to transform the data
      prior to training.

    - If the 'vw' solver is used, the model returns only predictions; it does
      not allow inspection of estimated effect sizes.

    - Any 'None' values in the data will result in an error being thrown.

    - A constant term is automatically added for the model intercept. This term
      is not regularized.

    Examples
    --------
    *Training*

    Given an :class:`~graphlab.SFrame` ``sf`` with a list of columns
    [``feature_1`` ... ``feature_K``] denoting features and a target column
    ``target``, we can create a
    :class:`~graphlab.linear_regression.LinearRegressionModel` as follows:

    >>> m = linear_regression.create(sf, 'target')

    By default, all columns in the training data except the target are used as
    features. We can also select only a subset of columns in the SFrame to train
    the model:

    >>> m = linear_regression.create(sf, 'target', ['feature_1', 'feature_2'])

    For ridge regression, we can set the ``L2_penalty`` parameter higher (the
    default is 0.01). For Lasso regression, we set the L1_penalty higher, and
    for elastic net, we set both to be higher.

    >>> m_ridge = linear_regression.create(sf, 'target', L2_penalty=0.1)
    >>> m_lasso = linear_regression.create(sf, 'target', L2_penalty=0.,
    ...                                    L1_penalty=1.0)
    >>> m_enet = linear_regression.create(sf, 'target', L2_penalty=0.5,
    ...                                   L1_penalty=0.5)

    *Model Querying*

    Model attributes can be retrieved with either bracket syntax or the
    :func:`~graphlab.linear_regression.LinearRegressionModel.get` method. The
    set of queryable fields is described in the ``get`` documentation, and
    can be obtained programmatically with the
    :func:`~graphlab.linear_regression.LinearRegressionModel.list_fields`
    method. The model coefficients are stored in an SFrame.

    >>> list_of_fields = m.list_fields()
    >>> coef = m['coefficients']  # an SFrame
    >>> coef = m.get('coefficients')  # equivalent to previous line
    >>> rmse = m.get('train_rmse')

    The :func:`~graphlab.linear_regression.LinearRegressionModel.summary`
    and
    :func:`~graphlab.linear_regression.LinearRegressionModel.training_stats`
    methods provide shortcuts to many of the model attributes.

    >>> m.summary()
    >>> stats = m.training_stats()

    *Prediction and Evaluation*

    With the trained model we can make predictions for new data in SFrame
    ``sf_new`` and compare these predictions to actual target values, if these
    values are known. For both of these methods, the new SFrame must include
    columns with the same names as the training features; the ``evaluate``
    method requires ``sf_new`` to have the same ``target`` column name as well.

    >>> predictions = m.predict(sf_new)
    >>> results = m.evaluate(sf_new)

    *Saving and Loading*

    The model can be saved to disk for later use.

    >>> m.save("mymodel")
    >>> m = graphlab.load_model("mymodel")

    For more, see the documentation for
    :class:`~graphlab.linear_regression.LinearRegressionModel`.

    References
    ----------
    - Hoerl, A.E. and Kennard, R.W. (1970) `Ridge regression: Biased Estimation
      for Nonorthogonal Problems
      <http://amstat.tandfonline.com/doi/abs/10.1080/00401706.1970.10488634>`_.
      Technometrics 12(1) pp.55-67

    - Tibshirani, R. (1996) `Regression Shrinkage and Selection via the Lasso
      <http://www.jstor.org/discover/10.2307/2346178?uid=3739256&uid=2&uid=4&sid=21104169934983>`_.
      Journal of the Royal Statistical Society. Series B (Methodological)
      58(1) pp.267-288.

    - Zhu, C., et al. (1997) `Algorithm 778: L-BFGS-B: Fortran subroutines for
      large-scale bound-constrained optimization
      <http://dl.acm.org/citation.cfm?id=279236>`_. ACM Transactions on
      Mathematical Software 23(4) pp.550-560.

    - Barzilai, J. and Borwein, J. `Two-Point Step Size Gradient Methods
      <http://imajna.oxfordjournals.org/content/8/1/141.short>`_. IMA Journal of
      Numerical Analysis 8(1) pp.141-148.

    - Beck, A. and Teboulle, M. (2009) `A Fast Iterative Shrinkage-Thresholding
      Algorithm for Linear Inverse Problems
      <http://epubs.siam.org/doi/abs/10.1137/080716542>`_. SIAM Journal on
      Imaging Sciences 2(1) pp.183-202.

    - Zhang, T. (2004) `Solving large scale linear prediction problems using
      stochastic gradient descent algorithms
      <http://dl.acm.org/citation.cfm?id=1015332>`_. ICML '04: Proceedings of
      the twenty-first international conference on Machine learning p.116.

    - `Vowpal Wabbit website <http://hunch.net/~vw/>`_

    - `Vowpal Wabbit on Github
      <https://github.com/JohnLangford/vowpal_wabbit/wiki>`_
    """

    _mt._get_metric_tracker().track('toolkit.regression.linear_regression.create')

    if not isinstance(dataset, (_DataFrame, _SFrame)):
        raise TypeError('Input dataset must be an SFrame or a pandas dataframe.')

    # Everything below works on an SFrame, so convert pandas input up front.
    if not isinstance(dataset, _SFrame):
        dataset = _SFrame(dataset)

    # Regression model names.
    model_name = "regression_linear_regression"
    solver = solver.lower()

    # Lower-case the option keys into a fresh dictionary so the caller's
    # solver_options object is never modified.
    if solver_options is not None:
        _solver_options = {k.lower(): v for k, v in solver_options.items()}
    else:
        _solver_options = {}

    # Extract the target columns into an SFrame.
    target_sframe = dataset.select_columns([target])

    # If features is None, then select all other columns in the table.
    if features is None:
        features = [name for name in dataset.column_names() if name != target]

    # A bare string is a single name, not a collection of names; reject it
    # explicitly so the check does not depend on whether str defines __iter__.
    if isinstance(features, str) or not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be an iterable")

    # Materialise once: tuples and other iterables then behave like lists in
    # the concatenation further down, and one-shot iterators are not consumed
    # by the validation loop before they reach select_columns.
    features = list(features)

    # Report the first offending name itself (wrapped in a tuple so that a
    # tuple-valued entry cannot break the %-formatting).
    for feature in features:
        if not isinstance(feature, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str"
                % (feature,))
    features_sframe = dataset.select_columns(features)

    # Make sure all options are in a flat dictionary.
    opts = {}
    opts.update(_solver_options)
    opts.update({'target'     : target_sframe,
                'features'    : features_sframe,
                'model_name'  : model_name,
                'solver'      : solver,
                'l1_penalty'  : L1_penalty,
                'l2_penalty'  : L2_penalty})

    # Pre-training. The single-argument call form of print produces the same
    # output whether print is a statement or a function.
    print("Starting pre-training...")
    ret = _graphlab.toolkits.main.run("regression_train_init", opts)
    opts.update(ret)

    # Switching point for vw
    if solver == 'vw' or solver == 'vowpal-wabbit':

        # Make the SFrame into VW format
        required_columns = features + [target]
        sf = dataset.select_columns(required_columns)

        # Only these two solver options are forwarded to VW; fall back to the
        # module defaults when the caller did not supply them.
        vw_options = {}
        for key in ['max_iterations', 'step_size']:
            vw_options[key] = _solver_options.get(
                key, DEFAULT_SOLVER_OPTIONS[key])

        # Call VW
        vw_model = _vw.create(sf, target,
                              l1_penalty=L1_penalty,
                              l2_penalty=L2_penalty,
                              loss_function='squared',
                              step_size=vw_options['step_size'],
                              verbose=verbose,
                              max_iterations=vw_options['max_iterations'])

        # Save the 'solver' option so that the regression toolkit
        # can query for it when testing which solver is active.
        vw_model._set('solver', 'vw')

        # Use the proxy object for VW
        model = LinearRegressionModel(vw_model.__proxy__, use_vw=True)
        print("\n")

    # Call all our solvers!
    else:

        # Print some output for ipython notebook users
        print("Starting training...")
        ret = _graphlab.toolkits.main.run("regression_train", opts,
                verbose)

        model_proxy = ret['model']
        model = LinearRegressionModel(model_proxy)

    model.summary()
    return model


class LinearRegressionModel(RegressionModel):
    """
    Linear regression is an approach for modeling a scalar target :math:`y` as a
    linear function of one or more explanatory variables denoted
    :math:`X`.

    An instance of this model can be created using :func:`graphlab.linear_regression.create`.
    Do NOT construct the model directly.
    """
    def __init__(self, model_proxy, use_vw=False):
        """
        Wrap an existing model proxy.

        Parameters
        ----------
        model_proxy : object
            Proxy object for the underlying trained model.

        use_vw : bool, optional
            True when ``model_proxy`` belongs to a Vowpal Wabbit model; field
            queries made through ``get`` are then routed to the VW model.
        """
        self.__proxy__ = model_proxy
        self.__name__ = "regression_linear_regression"
        self.use_vw = use_vw

    def _get_wrapper(self):
        """
        Return a one-argument function that wraps a model proxy in a
        ``LinearRegressionModel`` of the same flavour (VW or native) as this
        model.
        """
        def vw_model_wrapper(model_proxy):
            return LinearRegressionModel(model_proxy, use_vw=True)

        def model_wrapper(model_proxy):
            return LinearRegressionModel(model_proxy)

        if self.get('solver') == 'vw':
            return vw_model_wrapper
        else:
            return model_wrapper

    def __str__(self):
        """
        Return a string description of the model, including a description of
        the training data, training statistics, and model hyper-parameters.

        Returns
        -------
        out : string
            A description of the model.
        """

        return self.__repr__()

    def __repr__(self):
        """
        Return a string description of the model, including a description of
        the training data, training statistics, and model hyper-parameters.

        Returns
        -------
        out : string
            A description of the model.
        """

        solver = self.get('solver')
        width = 24
        # "<label padded to `width`>: <value>"
        key_str = "{:<{}}: {}"
        model_fields = [
            ("L1 penalty", 'l1_penalty'),
            ("L2 penalty", 'l2_penalty'),
            ("Examples", 'num_examples'),
            ("Features", 'num_features'),
            ("Coefficients", 'num_coefficients')]

        solver_fields = [
            ("Solver", 'solver'),
            ("Solver iterations", 'train_iters'),
            ("Solver status", 'solver_status'),
            ("Training time (sec)", 'train_time')]

        train_fields = [
            ("Residual sum of squares", 'train_loss'),
            ("Training RMSE", 'train_rmse')]

        ret = []
        ret.append(key_str.format("Class", width, self.__class__.__name__))

        if solver == 'vw':
            # VW models describe themselves; reuse that representation.
            m = _vw.VWModel(self.__proxy__)
            ret.append(m.__repr__())

        else:
            for tranche_fields in [model_fields, solver_fields, train_fields]:
                for label, field in tranche_fields:
                    value = self.get(field)
                    if isinstance(value, float):
                        # Rounding is cosmetic only: if it fails, show the raw
                        # value rather than abort the whole description.
                        try:
                            value = round(value, 4)
                        except (TypeError, ValueError, OverflowError):
                            pass
                    ret.append(key_str.format(label, width, value))
                # Blank line between groups of fields.
                ret.append("")

        return '\n'.join(ret)

    def get(self, field):
        """
        Get the value of a given field. The list of all queryable fields is
        detailed below, and can be obtained programmatically using the
        :func:`~graphlab.linear_regression.LinearRegressionModel.list_fields`
        method.

        Each of these fields can be queried in one of two ways:

        >>> out = m['field']
        >>> out = m.get('field')  # equivalent to previous line

        +-----------------------+----------------------------------------------+
        |      Field            | Description                                  |
        +=======================+==============================================+
        | auto_tuning           | True if auto-tuning was used during training |
        +-----------------------+----------------------------------------------+
        | coefficients          | Regression coefficients (non-'vw' only)      |
        +-----------------------+----------------------------------------------+
        | convergence_threshold | Desired solver accuracy                      |
        +-----------------------+----------------------------------------------+
        | features              | Feature column names                         |
        +-----------------------+----------------------------------------------+
        | l1_penalty            | L1 regularization weight                     |
        +-----------------------+----------------------------------------------+
        | l2_penalty            | L2 regularization weight                     |
        +-----------------------+----------------------------------------------+
        | lbfgs_memory_level    | LBFGS memory level ('lbfgs only')            |
        +-----------------------+----------------------------------------------+
        | max_iterations        | Maximum number of solver iterations          |
        +-----------------------+----------------------------------------------+
        | mini_batch_size       | Size of mini-batches ('sgd' only)            |
        +-----------------------+----------------------------------------------+
        | num_coefficients      | Number of coefficients in the model          |
        +-----------------------+----------------------------------------------+
        | num_examples          | Number of examples used for training         |
        +-----------------------+----------------------------------------------+
        | num_features          | Number of dataset columns used for training  |
        +-----------------------+----------------------------------------------+
        | solver                | Type of solver                               |
        +-----------------------+----------------------------------------------+
        | solver_status         | Solver status after training                 |
        +-----------------------+----------------------------------------------+
        | step_size             | Initial step size for the solver             |
        +-----------------------+----------------------------------------------+
        | target                | Target column name                           |
        +-----------------------+----------------------------------------------+
        | train_iters           | Number of solver iterations                  |
        +-----------------------+----------------------------------------------+
        | train_loss            | Residual sum-of-squares training loss        |
        +-----------------------+----------------------------------------------+
        | train_rmse            | Training root-mean-squared-error (RMSE)      |
        +-----------------------+----------------------------------------------+
        | train_time            | Training time (excludes preprocessing)       |
        +-----------------------+----------------------------------------------+

        Parameters
        ----------
        field : string
            Name of the field to be retrieved.

        Returns
        -------
        out : [various]
            The current value of the requested field.

        Raises
        ------
        ValueError
            For models trained with the 'vw' solver, if ``field`` is
            'coefficients' or is not one of the fields exposed for VW models.
        """

        _mt._get_metric_tracker().track('toolkit.regression.linear_regression.get')
        # Note: For VW as the solver, map the options back to the options
        # stored in VW. This lets us have control over what the options
        # should look like when VW is the solver.
        if self.use_vw:
            if field == 'coefficients':
                raise ValueError("Models trained with 'vw' as the "
                                 "solver cannot provide coefficients.")
            # NOTE(review): 'train_iters' is answered from VW's
            # 'max_iterations' and 'train_loss' from VW's 'loss_function', so
            # in VW mode these presumably report configuration rather than
            # measured training statistics -- confirm against VWModel.get.
            vw_option_map = {'solver'     : 'solver',
                             'target'     : 'target_column',
                             'step_size'  : 'step_size',
                             'l1_penalty' : 'l1_penalty',
                             'l2_penalty' : 'l2_penalty',
                             'max_iterations'  : 'max_iterations',
                             'train_iters': 'max_iterations',
                             'train_time' : 'elapsed_time',
                             'train_loss' : 'loss_function',
                             'train_rmse' : 'train_rmse'}

            # Only return options in the map
            if field not in vw_option_map:
                raise ValueError(
                    "Key %s does not exist. Use "
                    "list_fields() for a list of keys that can be "
                    "queried." % (field,))
            return _vw.VWModel(self.__proxy__).get(vw_option_map[field])

        opts = {'model': self.__proxy__,
                'model_name': self.__name__,
                'field': field}
        response = _graphlab.toolkits.main.run('regression_get_value',
            opts)

        # Coefficients returns a unity SFrame. Cast to an SFrame.
        if field == 'coefficients':
            return _SFrame(None, _proxy=response['value'])
        else:
            return response['value']

    def summary(self):
        """
        Display a summary of the model, coefficients (if applicable),
        training options, and training statistics.  Use the
        :func:`~graphlab.linear_regression.LinearRegressionModel.get` method
        to retrieve these values programmatically, or to see more detail about
        the queryable fields.
        """
        _mt._get_metric_tracker().track('toolkit.regression.linear_regression.summary')
        return super(LinearRegressionModel, self).summary()

    def get_default_options(self):
        """
        A dictionary describing the default options for the model.

        Returns
        -------
        out : dict
             A dictionary with default option (name, value) pairs.
        """

        _mt._get_metric_tracker().track('toolkit.regression.linear_regression.get_default_options')
        return super(LinearRegressionModel, self).get_default_options()

    def get_options(self):
        """
        A dictionary describing the options requested during training.

        Returns
        -------
        out : dict
             A dictionary with option (name, value) pairs requested during
             train time.
        """

        _mt._get_metric_tracker().track('toolkit.regression.linear_regression.get_options')
        return super(LinearRegressionModel, self).get_options()

    def predict(self, dataset):
        """
        Return target value predictions for ``dataset``, using the trained
        linear regression model. This method can be used to get fitted values
        for the model by inputting the training dataset.

        Parameters
        ----------
        dataset : SFrame/pandas.Dataframe
            Dataset of new observations. Must include columns with the same
            names as the features used for model training, but does not require
            a target column. Additional columns are ignored.

        Returns
        -------
        out : SArray
            Predicted target value for each example (i.e. row) in the dataset.
        """

        _mt._get_metric_tracker().track(
            'toolkit.regression.linear_regression.predict')
        return super(LinearRegressionModel, self).predict(dataset)

    def evaluate(self, dataset):
        r"""Evaluate the model by making target value predictions and comparing
        to actual values.

        Two metrics are used to evaluate linear regression models.  The first
        is root-mean-squared error (RMSE) while the second is the absolute
        value of the maximum error between the actual and predicted values.
        Let :math:`y` and :math:`\hat{y}` denote vectors of length :math:`N`
        (number of examples) with actual and predicted values. The RMSE is
        defined as:

        .. math::

            RMSE = \sqrt{\frac{1}{N} \sum_{i=1}^N (\widehat{y}_i - y_i)^2}

        while the max-error is defined as

        .. math::

            max-error = \max_{i=1}^N \|\widehat{y}_i - y_i\|

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the target and features used for model training. Additional
            columns are ignored.

        Returns
        -------
        out : dict
            Results from  model evaluation procedure.

        References
        ----------
        - `Wikipedia - root-mean-square deviation
          <http://en.wikipedia.org/wiki/Root-mean-square_deviation>`_
        """

        _mt._get_metric_tracker().track('toolkit.regression.linear_regression.evaluate')
        return super(LinearRegressionModel, self).evaluate(dataset)

    def training_stats(self):
        """
        Return a dictionary of (name, value) pairs with statistics collected
        during training time.

        Returns
        -------
        out : dict
            Statistics about model training, e.g. runtime. For models trained
            with the 'vw' solver only ``train_iters``, ``train_rmse`` and
            ``train_time`` are returned. The following table provides details
            on each of the fields.

            +-----------------------+------------------------------------------------------+
            |      Field Name       | Description                                          |
            +=======================+======================================================+
            | solver_status         | Solver status during training.                       |
            +-----------------------+------------------------------------------------------+
            | step_size             | Initial step size requested during training.         |
            +-----------------------+------------------------------------------------------+
            | train_iters           | Total number of iterations required for training.    |
            +-----------------------+------------------------------------------------------+
            | train_loss            | Residual sum-of-squares training loss.               |
            +-----------------------+------------------------------------------------------+
            | train_rmse            | Training root-mean-squared-error (RMSE).             |
            +-----------------------+------------------------------------------------------+
            | train_time            | Time required for training (excludes preprocessing). |
            +-----------------------+------------------------------------------------------+
        """

        _mt._get_metric_tracker().track('toolkit.regression.linear_regression.training_stats')
        solver = self.get('solver')
        # Hard code the options exposed from the VW model proxy.
        if solver == 'vw':
            training_stats_fields = ['train_iters',
                    'train_rmse',
                    'train_time']
            return {name: self.get(name) for name in training_stats_fields}
        else:
            return super(LinearRegressionModel, self).training_stats()

    def list_fields(self):
        """
        List of fields stored in the model. Each of these fields can be queried
        using the ``get`` function. Note that the list of fields that can be
        queried by the model are different when the chosen solver is ``vw``.

        Returns
        -------
        out : list
            A list of fields that can be queried using the ``get`` method.
        """
        _mt._get_metric_tracker().track('toolkit.regression.linear_regression.list_fields')
        solver = self.get('solver')
        # Hard code the options exposed from the VW model proxy. These are the
        # same names that ``get`` accepts for VW-backed models.
        if solver == 'vw':
            return [
                    'train_iters',
                    'l1_penalty',
                    'l2_penalty',
                    'max_iterations',
                    'solver',
                    'step_size',
                    'target',
                    'train_loss',
                    'train_rmse',
                    'train_time',
                    ]
        else:
            opts = {'model': self.__proxy__,
                    'model_name': self.__name__}
            response = _graphlab.toolkits.main.run('regression_list_keys', opts)
            return sorted(response.keys())

