import operator
import graphlab.connect as _mt
import graphlab.connect.main as glconnect
from graphlab.cython.cy_sketch import UnitySketchProxy
from graphlab.cython.context import debug_trace as cython_context
from graphlab.data_structures.sarray import SArray
from math import sqrt

__all__ = ['Sketch']


class Sketch(object):
    """
    The Sketch object contains a sketch of a single SArray (a column of an
    SFrame). Using a sketch representation of an SArray, many approximate and
    exact statistics can be computed very quickly.

    To construct a Sketch object, the following methods are equivalent:

    >>> sketch = graphlab.Sketch(my_sarray)
    >>> sketch = my_sarray.sketch_summary()

    Typically, the SArray is a column of an SFrame:

    >>> sketch = graphlab.Sketch(my_sframe['column1'])
    >>> sketch = my_sframe['column1'].sketch_summary()

    The sketch computation is fast, with complexity approximately linear in the
    length of the SArray. After the Sketch is computed, all queryable functions
    are performed nearly instantly.

    A sketch can compute the following information depending on whether it is a
    numeric array or a non-numeric (string) array.

    For numeric columns, the following information is provided exactly:
     - length (:func:`~graphlab.Sketch.size`)
     - number of missing values (:func:`~graphlab.Sketch.num_undefined`)
     - minimum value (:func:`~graphlab.Sketch.min`)
     - maximum value (:func:`~graphlab.Sketch.max`)
     - sum (:func:`~graphlab.Sketch.sum`)
     - mean (:func:`~graphlab.Sketch.mean`)
     - variance (:func:`~graphlab.Sketch.var`)
     - standard deviation (:func:`~graphlab.Sketch.std`)

    And the following information is provided approximately:
     - number of unique values (:func:`~graphlab.Sketch.num_unique`)
     - quantiles (:func:`~graphlab.Sketch.quantile`)
     - frequent items (:func:`~graphlab.Sketch.frequent_items`)
     - frequency count for any value (:func:`~graphlab.Sketch.frequency_count`)

    For non-numeric columns, the following information is provided exactly:
     - length (:func:`~graphlab.Sketch.size`)
     - number of missing values (:func:`~graphlab.Sketch.num_undefined`)

    And the following information is provided approximately:
     - number of unique values (:func:`~graphlab.Sketch.num_unique`)
     - frequent items (:func:`~graphlab.Sketch.frequent_items`)
     - frequency count of any value (:func:`~graphlab.Sketch.frequency_count`)

    Please see the individual function documentation for detail about each of
    these statistics.

    Parameters
    ----------
    array : SArray
        Array to sketch.

    References
    ----------
    - `Wikipedia. Streaming algorithms. <http://en.wikipedia.org/wiki/Streaming_algorithm>`_
    - Charikar, et al. (2002) `Finding frequent items in data streams. <https://www.cs.rutgers.edu/~farach/pubs/FrequentStream.pdf>`_
    - Cormode, G. and Muthukrishnan, S. (2004) `An Improved Data Stream Summary: The Count-Min Sketch and its Applications. <http://dimacs.rutgers.edu/~graham/pubs/papers/cm-latin.pdf>`_
    """

    def __init__(self, array, _proxy=None):
        """__init__(array)
        Construct a new Sketch from an SArray.

        Parameters
        ----------
        array : SArray
            Array to sketch.

        Raises
        ------
        TypeError
            If ``array`` is not an SArray and no ``_proxy`` was supplied.
        """
        _mt._get_metric_tracker().track('sketch.init')
        if _proxy:
            # Internal path: wrap an already-built proxy; ``array`` is unused.
            self.__proxy__ = _proxy
            return

        # Validate before asking for the client so that bad input fails fast
        # without first creating a proxy that would be thrown away.
        if not isinstance(array, SArray):
            raise TypeError("Sketch object can only be constructed from SArrays")

        self.__proxy__ = UnitySketchProxy(glconnect.get_client())
        self.__proxy__.construct_from_sarray(array.__proxy__)

    def __repr__(self):
        """
        Emits a brief summary of all the statistics as a string.

        The numeric-only sections (min/max/mean/variance/standard deviation
        and the quantile table) are best effort: if the corresponding query
        raises, the section is left out instead of failing the whole summary.
        """
        parts = [
            "Exact Values\n",
            "------------\n",
            "Length: " + str(self.size()) + "\n",
            "# Missing Values: " + str(self.num_undefined()) + "\n",
        ]
        try:
            # These may fail for non-numeric columns. Lines gathered before a
            # failure are kept.
            parts.append("Min: " + str(self.min()) + "\n")
            parts.append("Max: " + str(self.max()) + "\n")
            parts.append("Mean: " + str(self.mean()) + "\n")
            parts.append("Variance: " + str(self.var()) + "\n")
            parts.append("Standard Deviation: " + str(self.std()) + "\n")
        except Exception:
            # Deliberately ignored, but only ordinary errors: KeyboardInterrupt
            # and SystemExit must still propagate.
            pass

        parts.append("\n")
        parts.append("Approximate Values\n")
        parts.append("------------------\n")
        parts.append("# Unique Values: " + str(self.num_unique()) + "\n")

        # ``items()`` works on both Python 2 and Python 3 dictionaries.
        frequent = self.frequent_items()
        top_items = sorted(frequent.items(), key=operator.itemgetter(1),
                           reverse=True)[:10]
        parts.append("Most frequent items (item:count) :\n")
        parts.append("\t")
        if not top_items:
            parts.append(" -- All elements appear with less than 0.01% frequency -- ")
        else:
            for item, count in top_items:
                parts.append(str(item) + ":" + str(count) + "\t")
        parts.append("\n")

        try:
            # Compute the whole row up front so the section is emitted either
            # completely or not at all.
            quantile_row = "".join(
                str(self.quantile(q)) + "\t"
                for q in [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.00])
        except Exception:
            # Quantiles are not available (e.g. non-numeric column).
            quantile_row = None

        if quantile_row is not None:
            parts.append("Quantiles: \n")
            parts.append("\tMin\t1%\t5%\t25%\t50%\t75%\t95%\t99%\tMax\n")
            parts.append("\t" + quantile_row + "\n")

        return "".join(parts).expandtabs(8)

    def __str__(self):
        """
        Emits a brief summary of all the statistics as a string.
        """
        return self.__repr__()

    def size(self):
        """
        Returns the size of the input SArray.

        Returns
        -------
        out : int
            The number of elements of the input SArray.
        """
        with cython_context():
            return int(self.__proxy__.size())

    def max(self):
        """
        Returns the maximum value in the SArray. Returns *nan* on an empty
        array. Throws an exception if called on an SArray with non-numeric type.

        Raises
        ------
        Throws an exception if the SArray is a non-numeric type.

        Returns
        -------
        out : type of SArray
            Maximum value of SArray. Returns nan if the SArray is empty.
        """
        with cython_context():
            return self.__proxy__.max()

    def min(self):
        """
        Returns the minimum value in the SArray. Returns *nan* on an empty
        array. Throws an exception if called on an SArray with non-numeric type.

        Raises
        ------
        Throws an exception if the SArray is a non-numeric type.

        Returns
        -------
        out : type of SArray
            Minimum value of SArray. Returns nan if the SArray is empty.
        """
        with cython_context():
            return self.__proxy__.min()

    def sum(self):
        """
        Returns the sum of all the values in the SArray. Returns 0 on an empty
        array. Throws an exception if called on an SArray with non-numeric type.
        Will overflow without warning.

        Raises
        ------
        Throws an exception if the SArray is a non-numeric type.

        Returns
        -------
        out : type of SArray
            Sum of all values in SArray. Returns 0 if the SArray is empty.
        """
        with cython_context():
            return self.__proxy__.sum()

    def mean(self):
        """
        Returns the mean of the values in the SArray. Returns 0 on an empty
        array. Throws an exception if called on an SArray with non-numeric type.

        Raises
        ------
        Throws an exception if the SArray is a non-numeric type.

        Returns
        -------
        out : float
            Mean of all values in SArray. Returns 0 if the SArray is empty.
        """
        with cython_context():
            return self.__proxy__.mean()

    def std(self):
        """
        Returns the standard deviation of the values in the SArray, computed
        as the square root of :func:`~graphlab.Sketch.var`. Returns 0 on an
        empty array. Throws an exception if called on an SArray with
        non-numeric type.

        Returns
        -------
        out : float
            The standard deviation of all the values. Returns 0 if the SArray
            is empty.

        Raises
        ------
        Throws an exception if the SArray is a non-numeric type.
        """
        return sqrt(self.var())

    def var(self):
        """
        Returns the variance of the values in the SArray. Returns 0 on an empty
        array. Throws an exception if called on an SArray with non-numeric type.

        Raises
        ------
        Throws an exception if the SArray is a non-numeric type.

        Returns
        -------
        out : float
            The variance of all the values. Returns 0 if the SArray is empty.
        """
        with cython_context():
            return self.__proxy__.var()

    def num_undefined(self):
        """
        Returns the number of undefined elements in the SArray. Returns 0
        on an empty SArray.

        Returns
        -------
        out : int
            The number of missing values in the SArray.
        """
        with cython_context():
            return int(self.__proxy__.num_undefined())

    def num_unique(self):
        """
        Returns a sketched estimate of the number of unique values in the
        SArray based on the HyperLogLog sketch.

        Returns
        -------
        out : int
            An estimate of the number of unique values in the SArray.
        """
        _mt._get_metric_tracker().track('sketch.num_unique')

        with cython_context():
            return int(self.__proxy__.num_unique())

    def frequent_items(self):
        """
        Returns a sketched estimate of the most frequent elements in the SArray
        based on the SpaceSaving sketch. It is only guaranteed that all
        elements which appear in more than 0.01% rows of the array will
        appear in the set of returned elements. However, other elements may
        also appear in the result. The item counts are estimated using
        the CountSketch.

        If this function returns no elements, it means that all elements appear
        with less than 0.01% occurrence.

        Returns
        -------
        out : dict
            A dictionary mapping items to their estimated occurrence
            frequencies.
        """
        _mt._get_metric_tracker().track('sketch.frequent_items')

        with cython_context():
            return self.__proxy__.frequent_items()

    def quantile(self, quantile_val):
        """
        Returns a sketched estimate of the value at a particular quantile
        between 0.0 and 1.0. The quantile is guaranteed to be accurate within
        1%: meaning that if you ask for the 0.55 quantile, the returned value is
        guaranteed to be between the true 0.54 quantile and the true 0.56
        quantile. The quantiles are only defined for numeric arrays and this
        function will throw an exception if called on a sketch constructed for a
        non-numeric column.

        Parameters
        ----------
        quantile_val : float
            A value between 0.0 and 1.0 inclusive. Values below 0.0 will be
            interpreted as 0.0. Values above 1.0 will be interpreted as 1.0.

        Raises
        ------
        Throws an exception if the SArray is a non-numeric type.

        Returns
        -------
        out : float
            An estimate of the value at a quantile.
        """
        _mt._get_metric_tracker().track('sketch.quantile', {'quantile':str(quantile_val)})

        with cython_context():
            return self.__proxy__.get_quantile(quantile_val)

    def frequency_count(self, element):
        """
        Returns a sketched estimate of the number of occurrences of a given
        element. This estimate is based on the count sketch. The element type
        must be of the same type as the input SArray. Throws an exception if
        element is of the incorrect type.

        Parameters
        ----------
        element : val
            An element of the same type as the SArray.

        Raises
        ------
        Throws an exception if element is of the incorrect type.

        Returns
        -------
        out : int
            An estimate of the number of occurrences of the element.
        """
        _mt._get_metric_tracker().track('sketch.frequency_count')
        with cython_context():
            return int(self.__proxy__.frequency_count(element))
