"""
This module contains the K-Means++ clustering algorithm, including the
KmeansModel class which provides methods for inspecting the returned cluster
information.
"""

import graphlab.connect as _mt
import graphlab as _graphlab
from graphlab.toolkits._model import Model as _Model
from graphlab.data_structures.sframe import SFrame as _SFrame
from graphlab.toolkits._internal_utils import _toolkit_repr_print
from graphlab.util import validate_feature_types as _validate_feature_types
from array import array as _array
import logging as _logging
import json as _json
from graphlab.toolkits._model import _get_default_options_wrapper

# Candidate values for the ``num_clusters`` hyperparameter: 2 through 20
# inclusive. Presumably consumed by hyperparameter-search utilities elsewhere
# in the package -- confirm against callers.
DEFAULT_HYPER_PARAMETER_RANGE = dict(num_clusters=range(2, 21))


class KmeansModel(_Model):
    """
    Result of running k-means clustering on a dataset.

    The model exposes queryable fields, among them the cluster assignment of
    each data point (``cluster_id``) and the cluster centers
    (``cluster_info``). Use ``list_fields`` to enumerate the fields and
    ``get`` to read one of them.

    An instance of this model can be created using
    :func:`graphlab.kmeans.create`. Do NOT construct the model directly.
    """

    def __init__(self, model):
        """__init__(self)

        Wrap the backend model handle ``model``; it is stored as the proxy
        that every toolkit call made by this object is sent with.
        """
        self.__name__ = 'kmeans'
        self.__proxy__ = model

    def get(self, field):
        """
        Return the value of a given field.

        The list of all queryable fields is detailed below, and can be obtained
        with the ``list_fields`` method.

        +-----------------------+----------------------------------------------+
        |      Field            | Description                                  |
        +=======================+==============================================+
        | cluster_id            | Cluster assignment for each data point and   |
        |                       | Euclidean distance to the cluster center     |
        +-----------------------+----------------------------------------------+
        | cluster_info          | Cluster centers, sum of Euclidean distance   |
        |                       | from each cluster member to the center, and  |
        |                       | the number of data points belonging to the   |
        |                       | cluster                                      |
        +-----------------------+----------------------------------------------+
        | features              | Names of feature columns                     |
        +-----------------------+----------------------------------------------+
        | max_iterations        | Maximum number of iterations to perform      |
        +-----------------------+----------------------------------------------+
        | num_clusters          | Number of clusters                           |
        +-----------------------+----------------------------------------------+
        | num_examples          | Number of examples in the dataset            |
        +-----------------------+----------------------------------------------+
        | num_features          | Number of feature columns used               |
        +-----------------------+----------------------------------------------+
        | num_unpacked_features | Number of features unpacked from the         |
        |                       | feature columns                              |
        +-----------------------+----------------------------------------------+
        | training_iterations   | Total number of iterations performed         |
        +-----------------------+----------------------------------------------+
        | training_time         | Total time taken to cluster the data         |
        +-----------------------+----------------------------------------------+
        | unpacked_features     | Names of features unpacked from the          |
        |                       | feature columns                              |
        +-----------------------+----------------------------------------------+
        | verbose               | True if model training should print progress |
        +-----------------------+----------------------------------------------+

        Parameters
        ----------
        field : str
            The name of the field to query.

        Returns
        -------
        out
            Value of the requested field. ``cluster_id`` and ``cluster_info``
            are returned as SFrames; every other field is returned exactly as
            the backend reports it.

        See Also
        --------
        list_fields

        Examples
        --------

        >>> model.get("cluster_info")
                d1        d2        d3        d4  __within_distance__  __size__
        0 -0.777484  1.048897  0.523926  0.487775             2.459470         4
        1  0.844906 -0.613151 -0.088785 -0.212908             3.651614         5
        2 -1.114592 -1.129836 -1.651781 -0.886557             0.000000         1

        [3 rows x 6 columns]
        """
        _mt._get_metric_tracker().track('toolkit.kmeans.get')

        request = {'model': self.__proxy__,
                   'model_name': self.__name__,
                   'field': field}
        reply = _graphlab.toolkits._main.run('kmeans_get_value', request)
        value = reply['value']

        # The two tabular fields come back as a unity SFrame proxy and have
        # to be wrapped in a Python SFrame before being handed to the caller.
        if field in ('cluster_id', 'cluster_info'):
            return _SFrame(None, _proxy=value)
        return value

    def list_fields(self):
        """
        List the fields stored in the model, including the number of iterations
        performed, total runtime for the clustering algorithm, and cluster data.

        Each field can be queried with the ``get`` method.

        Returns
        -------
        out : list
            Sorted list of fields queryable with the ``get`` method.

        See Also
        --------
        get
        """
        _mt._get_metric_tracker().track('toolkit.kmeans.list_fields')

        request = {'model': self.__proxy__, 'model_name': self.__name__}
        reply = _graphlab.toolkits._main.run('kmeans_list_keys', request)

        field_names = reply.keys()
        return sorted(field_names)

    def summary(self):
        """
        Display a summary of model training information.

        A short banner is printed, followed by the same text that
        ``__repr__`` produces. Use the ``get`` method to retrieve these values
        programmatically, or to see more detail about the queryable fields.

        See Also
        --------
        get, list_fields
        """
        _mt._get_metric_tracker().track('toolkit.kmeans.summary')

        banner = (
            "",
            "                    Model summary                       ",
            "--------------------------------------------------------",
        )
        # Single-argument, parenthesised print emits the same output as the
        # bare print statement. The banner goes out before the description is
        # computed, so it still appears even if building the description fails.
        for banner_line in banner:
            print(banner_line)
        print(self.__repr__())

    def _get_wrapper(self):
        """
        Return a one-argument callable that wraps a model proxy in a new
        ``KmeansModel``.
        """
        def model_wrapper(model_proxy):
            return KmeansModel(model_proxy)

        return model_wrapper

    def __str__(self):
        """
        Return a string description of the model to the ``print`` method.

        Returns
        -------
        out : string
            A description of the KmeansModel; identical to ``__repr__``.
        """
        # Call __repr__ directly so its result is passed through untouched.
        return self.__repr__()

    def __repr__(self):
        """
        Build the string description shown when the model name is entered in
        the terminal: a table of training statistics followed by a short
        listing of the SFrame-valued fields.
        """
        col_width = 30
        row_template = "{:<{}}: {}"

        # (label shown to the user, name of the model field to look up)
        summary_rows = [
            ('Total training time (seconds)', 'training_time'),
            ('Number of clusters', 'num_clusters'),
            ('Number of training iterations', 'training_iterations'),
            ('Number of examples', 'num_examples'),
            ('Number of feature columns', 'num_features'),
            ('Number of unpacked features', 'num_unpacked_features')]

        # (left-hand label, right-hand description) for the trailing section
        accessible_rows = [
            ("Accessible fields", ""),
            ("   cluster_id", "An SFrame containing the cluster assignments."),
            ("   cluster_info", "An SFrame containing the cluster centers."),
        ]

        lines = [_toolkit_repr_print(self, [summary_rows], width=col_width)]
        for label, description in accessible_rows:
            lines.append(row_template.format(label, col_width, description))

        return '\n'.join(lines)

    def get_current_options(self):
        """
        Return a dictionary with the options used to define and create the
        current KmeansModel instance.

        Returns
        -------
        out : dict
            Dictionary of options used to train the current instance of the
            KmeansModel.

        See Also
        --------
        get_default_options, list_fields, get

        Examples
        --------
        >>> sf = graphlab.SFrame({'a' : [0.1, 8, 3.5], 'b':[-3, 7.6, 3]})
        >>> model = graphlab.kmeans.create(sf, 2)
        >>> model.get_current_options()
        {'num_clusters': 2, 'max_iterations': 10}
        """
        _mt._get_metric_tracker().track('toolkit.kmeans.get_current_options')

        request = {'model': self.__proxy__, 'model_name': self.__name__}
        current_options = _graphlab.toolkits._main.run(
            'kmeans_get_current_options', request)
        return current_options


# Module-level ``get_default_options`` function, produced by the shared wrapper
# factory. NOTE(review): the three arguments are presumably the backend model
# name, the module name, and the Python model class name -- confirm against
# ``_get_default_options_wrapper``.
get_default_options = _get_default_options_wrapper(
    'kmeans', 'kmeans', 'KmeansModel')

def create(dataset, num_clusters, features=None, max_iterations=10, verbose=True):
    r"""
    Run the k-means++ clustering algorithm, returning a KmeansModel object
    that contains the cluster centers and the cluster assignment for
    each data point in the dataset.

    Given a number of clusters, k-means++ iteratively chooses the best cluster
    centers and assigns nearby points to the best cluster. If no points change
    cluster membership between iterations, the algorithm terminates.

    Parameters
    ----------
    dataset : SFrame
        Each row in the SFrame is an observation.

    num_clusters : int
        Number of clusters.  This is the 'k' in k-means. If it is larger than
        the number of rows in ``dataset``, a warning is logged and the number
        of rows is used instead.

    features : list[string], optional
        Names of feature columns to use in computing distances between
        observations and cluster centers. 'None' (the default) indicates that
        all columns should be used as features. Columns may be of the following
        types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: list of numeric (integer or float) values. Each list element
          is treated as a distinct feature in the model.

        - *Dict*: dictionary of keys mapped to numeric values. Each unique key
          is treated as a distinct feature in the model.

        Note that columns of type *list* are not supported. Convert them to
        array columns if all entries in the list are of numeric types.

    max_iterations : int, optional
        The maximum number of iterations to run. Prints a warning if the
        algorithm does not converge after max_iterations iterations.

    verbose : bool, optional
        If True, print model training progress to the screen.

    Returns
    -------
    out : KmeansModel
        A Model object containing a cluster id for each data point, and the
        centers of the clusters.

    Raises
    ------
    TypeError
        If ``dataset`` is not an SFrame.

    ValueError
        If ``dataset`` has no rows or no columns.

    See Also
    --------
    KmeansModel

    References
    ----------
    - `Wikipedia - k-means clustering
      <http://en.wikipedia.org/wiki/K-means_clustering>`_
    - Arthur, D. and Vassilvitskii, S. (2007) `k-means++: The Advantages of
      Careful Seeding <http://ilpubs.stanford.edu:8090/778/1/2006-13.pdf>`_. In
      Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete
      Algorithms. pp. 1027-1035.

    Examples
    --------

    >>> sf = graphlab.SFrame({
    ...     "d1": [ 0.46973508, 0.0063261, 0.14143399, 0.35025834,
    ...             0.83728709, 0.81438336, 0.74205833, 0.36273747,
    ...             0.00793858, 0.02298716],
    ...     "d2": [ 0.51050977, 0.82167952, 0.61451765, 0.51179513,
    ...             0.35223035, 0.59366481, 0.48848649, 0.90026032,
    ...             0.78798728, 0.40125452],
    ...     "d3": [ 0.71716265, 0.54163387, 0.55577274, 0.12619953,
    ...             0.80172228, 0.21519973, 0.21014113, 0.54207596,
    ...             0.65432528, 0.00754797],
    ...     "d4": [ 0.69486673, 0.92585721, 0.95461882, 0.72658554,
    ...             0.86590678, 0.18017175, 0.60361348, 0.89223113,
    ...             0.37992791, 0.44700959]
    ...     })

    It's important to standardize our columns to get the best results
    possible from the k-means algorithm.

    >>> for col in ['d1', 'd2', 'd3', 'd4']:
    ...     sf[col] = (sf[col] - sf[col].mean()) / sf[col].std()
    >>> model = graphlab.kmeans.create(sf, num_clusters=3)
    """

    _mt._get_metric_tracker().track('toolkit.kmeans.create')

    # Reject unusable input before doing any work on it.
    if not isinstance(dataset, _SFrame):
        raise TypeError("Input 'dataset' must be an SFrame")

    if dataset.num_rows() == 0 or dataset.num_cols() == 0:
        raise ValueError("Input 'dataset' has no data.")

    # Resolve the feature columns and restrict the data to them.
    supported_types = [_array, dict, int, float]
    features = _validate_feature_types(dataset, features, supported_types)
    feature_columns = dataset.select_columns(features)

    # Asking for more clusters than there are rows is clipped to the row count.
    row_count = dataset.num_rows()
    if num_clusters > row_count:
        _logging.warning("Clipping num_clusters to be the number of data points.\n")
        num_clusters = row_count

    train_opts = {'model_name': 'kmeans',
                  'num_clusters': num_clusters,
                  'max_iterations': max_iterations,
                  'verbose': verbose,
                  'features': feature_columns}

    trained = _graphlab.toolkits._main.run('kmeans_train', train_opts, verbose)
    return KmeansModel(trained['model'])
