"""
This module defines the SArray class which provides the
ability to create, access and manipulate a remote scalable array object.

SArray acts similarly to pandas.Series but without indexing.
The data is immutable, homogeneous, and is stored on the GraphLab Server side.
"""
import graphlab.connect as _mt
import graphlab.connect.main as glconnect
from graphlab.cython.cy_sarray import UnitySArrayProxy
from graphlab.cython.context import debug_trace as cython_context
from graphlab.util import make_internal_url
import graphlab as gl
import inspect
import numpy
import pandas
import time
import array

# Names exported by ``from <this module> import *``: only the SArray class.
__all__ = ['SArray']

class SArray(object):
    """
    SArray is an array object scaled to big data.
    The data in SArray is immutable and homogeneously typed.
    Each column in an :py:class:`graphlab.SFrame` is an SArray.

    SArray can be constructed in various ways:

    Construct an SArray from list.

    >>> from graphlab import SArray
    >>> sa = SArray(data=[1,2,3,4,5], dtype=int)

    Construct an SArray from numpy.ndarray.

    >>> sa = SArray(data=numpy.asarray([1,2,3,4,5]), dtype=int)
    or:
    >>> sa = SArray(numpy.asarray([1,2,3,4,5]), int)

    Construct an SArray from pandas.Series.

    >>> sa = SArray(data=pd.Series([1,2,3,4,5]), dtype=int)
    or:
    >>> sa = SArray(pd.Series([1,2,3,4,5]), int)

    If the type is not specified, automatic inference is attempted:

    >>> SArray(data=[1,2,3,4,5]).dtype()
    int
    >>> SArray(data=[1,2,3,4,5.0]).dtype()
    float

    The SArray supports standard datatypes such as: integer, float and string.
    It also supports three higher level datatypes: float arrays, dict
    and list (array of arbitrary types).

    Create an SArray from a list of strings:

    >>> sa = SArray(data=['a','b'])

    Create an SArray from a list of float arrays;

    >>> sa = SArray([[1,2,3], [3,4,5]])

    Create an SArray from a list of lists:

    >>> sa = SArray(data=[['a', 1, {'work': 3}], [2, 2.0]])

    Create an SArray from a list of dictionaries:

    >>> sa = SArray(data=[{'a':1, 'b': 2}, {'b':2, 'c': 1}])

    Construct an SArray from local text file. (Only works for local server).

    >>> sa = SArray('/tmp/a_to_z.txt.gz')

    Construct an SArray from a text file downloaded from a URL.

    >>> sa = SArray('http://s3-us-west-2.amazonaws.com/testdatasets/a_to_z.txt.gz')

    Construct an SArray from a text file stored on the server side.

    >>> sa = SArray('remote:///tmp/a_to_z.txt.gz')

    **Numeric Operators**

    SArrays support a large number of vectorized operations on numeric types.
    For instance:

    >>> sa = SArray([1,1,1,1,1])
    >>> sb = SArray([2,2,2,2,2])
    >>> sc = sa + sb
    >>> sc
    dtype: int
    Rows: 5
    [3, 3, 3, 3, 3]
    >>> sc + 2
    dtype: int
    Rows: 5
    [5, 5, 5, 5, 5]

    Operators which are supported include all numeric operators (+,-,*,/), as
    well as comparison operators (>, >=, <, <=), and logical operators (&, |).

    For instance:

    >>> sa = SArray([1,2,3,4,5])
    >>> (sa >= 2) & (sa <= 4)
    dtype: int
    Rows: 5
    [0, 1, 1, 1, 0]

    The numeric operators (+,-,*,/) also work on array types:

    >>> sa = SArray(data=[[1.0,1.0], [2.0,2.0]])
    >>> sa + 1
    dtype: list
    Rows: 2
    [array('f', [2.0, 2.0]), array('f', [3.0, 3.0])]
    >>> sa + sa
    dtype: list
    Rows: 2
    [array('f', [2.0, 2.0]), array('f', [4.0, 4.0])]

    The addition operator (+) can also be used for string concatenation:

    >>> sa = SArray(data=['a','b'])
    >>> sa + "x"
    dtype: str
    Rows: 2
    ['ax', 'bx']

    All comparison operations and boolean operators are supported and emit
    binary SArrays.

    >>> sa = SArray([1,2,3,4,5])
    >>> sa >= 2
    dtype: int
    Rows: 5
    [0, 1, 1, 1, 1]
    >>> (sa >= 2) & (sa <= 4)
    dtype: int
    Rows: 5
    [0, 1, 1, 1, 0]

    **Logical Filter**
    
    An SArray can be filtered using

    >>> array[binary_filter] 

    where array and binary_filter are SArrays of the same length. The result is
    a new SArray which contains only elements of 'array' where its matching row
    in the binary_filter is non zero.

    This permits the use of boolean operators that can be used to perform
    logical filtering operations.  For instance:

    >>> sa = SArray([1,2,3,4,5])
    >>> sa[(sa >= 2) & (sa <= 4)]
    dtype: int
    Rows: 3
    [2, 3, 4]

    This can also be used more generally to provide filtering capability which
    is otherwise not expressible with simple boolean functions. For instance:

    >>> sa = SArray([1,2,3,4,5])
    >>> sa[sa.apply(lambda x: math.log(x) <= 1)]
    dtype: int
    Rows: 2
    [1, 2]

    This is equivalent to

    >>> sa.filter(lambda x: math.log(x) <= 1)
    dtype: int
    Rows: 2
    [1, 2]

    **Iteration**
    
    The SArray is also iterable, but not efficiently since this involves a
    streaming transmission of data from the server to the client. This should
    not be used for large data.

    >>> sa = SArray([1,2,3,4,5])
    >>> [i + 1 for i in sa]
    [2, 3, 4, 5, 6]

    This can be used to convert an SArray to a list:

    >>> sa = SArray([1,2,3,4,5])
    >>> l = list(sa)
    >>> l
    [1, 2, 3, 4, 5]

    Parameters
    ----------
    data : list | numpy.ndarray | pandas.Series | string
        The input data. If this is a list, numpy.ndarray, or pandas.Series
        the data in the list is converted and stored in an SArray.
        Alternatively if this is a string, it is interpreted as a path (or
        url) to a text file.  Each line of the text file is loaded as a
        separate row. If data is a directory where an SArray was previously
        saved, this is loaded as a SArray reading directly out of that
        directory.

    dtype : type
        The data type of the SArray. Supported types are: {int,
        float, str, list, dict}.  If not specified (None), we attempt to infer
        it from the input.  If it is a numpy array, or a Pandas
        series, the dtype of the array/series is used. If it is a
        list (or an array/series of generic objects), the type is inferred
        from the types of its elements. If it is a URL or path to a text file,
        we default to strings.

    ignore_cast_failure : bool
        If true, ignores casting failures, but warns
        when the elements that cannot be casted into the dtype
        specified.

    Notes
    -----
    When working with the graphlab EC2 instance, e.g. :py:func:`graphlab.aws.launch_EC2()`,
    SArray cannot be constructed using local file path, because it involves
    potentially large amount of data transfer from client to server.
    However, it is still OK to use a remote file path.

    >>> graphlab.aws.launch_EC2('m1.large')
    >>> sa = SArray('~/mydata/foo.csv') # throws exception
    >>> sa = SArray('remote:///mydata/foo.csv') # works
    >>> sa = SArray("http://testdatasets.s3-website-us-west-2.amazonaws.com/users.csv.gz") # works
    >>> sa = SArray("s3://mybucket/foo.csv") # works
    >>> graphlab.aws.terminate_EC2()

    Similar restriction applies to :py:class:`graphlab.SGraph` and :py:class:`graphlab.SFrame`.
    """

    def __init__(self, data=[], dtype=None, ignore_cast_failure=False, _proxy=None):
        """__init__(data=list(), dtype=None, ignore_cast_failure=False)
        Build a new SArray from in-memory data or from a path/url string.

        Parameters
        ----------
        data : list | numpy.ndarray | pandas.Series | string
            The input data. A list, numpy.ndarray, or pandas.Series is loaded
            element by element into the SArray. A string is interpreted as a
            path (or url) to a text file, where each line of the file becomes
            a separate row. If the string is a directory where an SArray was
            previously saved, the SArray is read directly out of that
            directory.

        dtype : type
            The data type of the SArray. Supported types are: {int,
            float, str, list, dict}.  If not specified (None), it is inferred
            from the input: a list is inspected element by element; for a
            numpy array or a Pandas series the dtype of the array/series is
            used (falling back to element inspection when that dtype is the
            generic ``object``); a URL or path to a text file defaults to
            strings.

        ignore_cast_failure : bool
            If true, ignores casting failures, but warns
            when elements cannot be cast into the dtype specified.

        _proxy : None
            Internal, do not use.

        Raises
        ------
        TypeError
            If data is not a list, numpy.ndarray, pandas.Series or string.

        Notes
        -----
        If data is pandas.Series, the index will be ignored.
        """
        _mt._get_metric_tracker().track('sarray.init')

        # Internal construction path: adopt an existing proxy as-is.
        if _proxy:
            self.__proxy__ = _proxy
            return

        self.__proxy__ = UnitySArrayProxy(glconnect.get_client())

        # Classify the data source once; both steps below rely on it.
        is_list = isinstance(data, list)
        is_array_like = isinstance(data, (pandas.Series, numpy.ndarray))
        is_path = isinstance(data, str)

        # Step 1: type inference, only when the caller gave no dtype.
        if dtype is None:
            if is_list:
                # inspect the element types of the list
                dtype = self.__infer_dtype_of_list__(data)
            elif is_array_like:
                dtype = data.dtype
                if dtype == object:
                    # "object" is too coarse; look at the actual elements
                    dtype = self.__infer_dtype_of_list__(data)
            elif is_path:
                # files are read as strings by default
                dtype = str

        # Step 2: hand the data over to the proxy.
        if is_array_like or is_list:
            with cython_context():
                self.__proxy__.load_from_iterable(data, dtype, ignore_cast_failure)
        elif is_path:
            internal_url = make_internal_url(data)
            with cython_context():
                self.__proxy__.load_autodetect(internal_url, dtype)
        else:
            raise TypeError("Unexpected data source. "
                            "Possible data source types are: list, "
                            "numpy.ndarray, pandas.Series, and string(url)")

    def __infer_dtype_of_list__(self, data):
        """
        Infer the SArray element type from the types of the values in data.

        None entries are treated as missing values and never take part in
        the inference, both at the top level and (for a list of lists) when
        a whole row is None.

        Parameters
        ----------
        data : list | numpy.ndarray | pandas.Series
            The values to inspect. Must support len() and iteration.

        Returns
        -------
        out : type
            - float if data is empty or contains only None.
            - int / float if every value is of one integral / floating type.
            - float if exactly two distinct numeric types are mixed.
            - array.array if every value is a list whose items are all
              int, long or float; list for any other list of lists.
            - otherwise the single common type of the values (e.g. str, dict).

        Raises
        ------
        TypeError
            If the values are of different types that cannot be reconciled.
        """
        # default is float
        if len(data) == 0:
            return float

        # Python 2 has a separate arbitrary-precision ``long`` type; fall back
        # to int on interpreters that do not define it.
        try:
            long_type = long
        except NameError:
            long_type = int

        integral_types = set([int, long_type, numpy.bool_, numpy.int_, numpy.intc,
                              numpy.intp, numpy.int8, numpy.int16, numpy.int32,
                              numpy.int64, numpy.uint8, numpy.uint16,
                              numpy.uint32, numpy.uint64])
        # numpy.float_ is merely an alias of numpy.float64 (and is absent from
        # newer numpy releases), so it does not need to be listed separately.
        float_types = set([float, numpy.float16, numpy.float32, numpy.float64])
        numeric_types = integral_types.union(float_types)

        unique_types = set([type(x) for x in data if x is not None])
        if len(unique_types) == 0:
            return float

        if len(unique_types) == 2 and unique_types.issubset(numeric_types):
            # a mix of two numeric types (ints, longs, floats) is stored as float
            return float
        if len(unique_types) != 1:
            raise TypeError("Cannot infer Array type. Not all elements of array are the same type.")

        # Exactly one type: map the basic numeric types back to int or float.
        ret = unique_types.pop()
        if ret in integral_types:
            return int
        if ret in float_types:
            return float
        if ret != list:
            # Not a list of lists, so the type can be used as it is.
            return ret

        # A list of lists: if every item of every row is numeric the rows are
        # stored as vectors (array.array), otherwise as generic lists.
        value_types = set()
        for row in data:
            # None rows are missing values; skip them instead of iterating.
            if row is not None:
                value_types.update(type(x) for x in row)

        if value_types.issubset(set([int, long_type, float])):
            return array.array
        return list

    def save(self, targetfile):
        """
        Write the SArray out to the given location.

        Parameters
        ----------
        targetfile : string
            The directory to save the SArray. Either a local path or a remote
            URL. It is converted with make_internal_url before being passed
            to the proxy.
        """
        with cython_context():
            destination = make_internal_url(targetfile)
            self.__proxy__.save(destination)

    def __repr__(self):
        """
        Describe the array in three lines: the dtype name, the number of rows,
        and the string form of the array (see __str__).
        """
        type_line = "dtype: " + str(self.dtype().__name__)
        rows_line = "Rows: " + str(self.size())
        return "\n".join([type_line, rows_line, str(self)])


    def __str__(self):
        """
        Return the string form of the first 100 elements of the array.

        When the array holds more than 100 elements, the final character of
        that string (the closing bracket) is replaced by ", ... ]" to signal
        the truncation.
        """
        preview = str(self.head(100))
        if self.size() <= 100:
            return preview
        # drop the closing bracket and mark that more elements follow
        return preview[:-1] + ", ... ]"


    def __nonzero__(self):
        """
        Truth value of the array: False when its size is 0, True otherwise.
        """
        return not self.size() == 0

    def __len__(self):
        """
        Number of elements in the array (same value as size()).
        """
        row_count = self.size()
        return row_count

    def __iter__(self):
        """
        Return a generator over the elements of the array.

        Elements are pulled from the proxy in batches of 262144 and yielded
        one at a time. Nothing is requested from the proxy until the
        generator is first advanced.
        """
        def generator():
            batch_size = 262144
            self.__proxy__.begin_iterator()
            while True:
                batch = self.__proxy__.iterator_get_next(batch_size)
                for element in batch:
                    yield element
                # Stop once a batch comes back shorter than requested.
                if len(batch) != batch_size:
                    break

        return generator()

    def __add__(self, other):
        """
        Addition: ``self + other``.

        Parameters
        ----------
        other : SArray | scalar
            If other is an SArray (or an instance of an SArray subclass), an
            element-wise addition of the two arrays is requested through the
            proxy's vector_operator. Any other value is treated as a scalar
            and forwarded to the proxy's left_scalar_operator.

        Returns
        -------
        out : SArray
            A new SArray wrapping the proxy produced by the operation.
        """
        with cython_context():
            # isinstance rather than an exact type match, so that instances of
            # SArray subclasses are handled as arrays and not as scalars.
            if isinstance(other, SArray):
                result = self.__proxy__.vector_operator(other.__proxy__, '+')
            else:
                result = self.__proxy__.left_scalar_operator(other, '+')
            return SArray(_proxy=result)

    def __sub__(self, other):
        """
        Subtraction: ``self - other``.

        Parameters
        ----------
        other : SArray | scalar
            If other is an SArray (or an instance of an SArray subclass), an
            element-wise subtraction of the two arrays is requested through
            the proxy's vector_operator. Any other value is treated as a
            scalar and forwarded to the proxy's left_scalar_operator.

        Returns
        -------
        out : SArray
            A new SArray wrapping the proxy produced by the operation.
        """
        with cython_context():
            # isinstance rather than an exact type match, so that instances of
            # SArray subclasses are handled as arrays and not as scalars.
            if isinstance(other, SArray):
                result = self.__proxy__.vector_operator(other.__proxy__, '-')
            else:
                result = self.__proxy__.left_scalar_operator(other, '-')
            return SArray(_proxy=result)

    def __mul__(self, other):
        """
        Multiplication: ``self * other``.

        Parameters
        ----------
        other : SArray | scalar
            If other is an SArray (or an instance of an SArray subclass), an
            element-wise multiplication of the two arrays is requested through
            the proxy's vector_operator. Any other value is treated as a
            scalar and forwarded to the proxy's left_scalar_operator.

        Returns
        -------
        out : SArray
            A new SArray wrapping the proxy produced by the operation.
        """
        with cython_context():
            # isinstance rather than an exact type match, so that instances of
            # SArray subclasses are handled as arrays and not as scalars.
            if isinstance(other, SArray):
                result = self.__proxy__.vector_operator(other.__proxy__, '*')
            else:
                result = self.__proxy__.left_scalar_operator(other, '*')
            return SArray(_proxy=result)

    def __div__(self, other):
        """
        Division: ``self / other``.

        Parameters
        ----------
        other : SArray | scalar
            If other is an SArray (or an instance of an SArray subclass), an
            element-wise division of the two arrays is requested through the
            proxy's vector_operator. Any other value is treated as a scalar
            divisor and forwarded to the proxy's left_scalar_operator.

        Returns
        -------
        out : SArray
            A new SArray wrapping the proxy produced by the operation.
        """
        with cython_context():
            # isinstance rather than an exact type match, so that instances of
            # SArray subclasses are handled as arrays and not as scalars.
            if isinstance(other, SArray):
                result = self.__proxy__.vector_operator(other.__proxy__, '/')
            else:
                result = self.__proxy__.left_scalar_operator(other, '/')
            return SArray(_proxy=result)

    def __lt__(self, other):
        """
        Less-than comparison: ``self < other``.

        Parameters
        ----------
        other : SArray | scalar
            If other is an SArray (or an instance of an SArray subclass), an
            element-wise comparison of the two arrays is requested through
            the proxy's vector_operator. Any other value is treated as a
            scalar and forwarded to the proxy's left_scalar_operator.

        Returns
        -------
        out : SArray
            A new SArray wrapping the proxy produced by the comparison.
        """
        with cython_context():
            # isinstance rather than an exact type match, so that instances of
            # SArray subclasses are handled as arrays and not as scalars.
            if isinstance(other, SArray):
                result = self.__proxy__.vector_operator(other.__proxy__, '<')
            else:
                result = self.__proxy__.left_scalar_operator(other, '<')
            return SArray(_proxy=result)

    def __gt__(self, other):
        """
        Greater-than comparison: ``self > other``.

        Parameters
        ----------
        other : SArray | scalar
            If other is an SArray (or an instance of an SArray subclass), an
            element-wise comparison of the two arrays is requested through
            the proxy's vector_operator. Any other value is treated as a
            scalar and forwarded to the proxy's left_scalar_operator.

        Returns
        -------
        out : SArray
            A new SArray wrapping the proxy produced by the comparison.
        """
        with cython_context():
            # isinstance rather than an exact type match, so that instances of
            # SArray subclasses are handled as arrays and not as scalars.
            if isinstance(other, SArray):
                result = self.__proxy__.vector_operator(other.__proxy__, '>')
            else:
                result = self.__proxy__.left_scalar_operator(other, '>')
            return SArray(_proxy=result)


    def __le__(self, other):
        """
        Less-than-or-equal comparison: ``self <= other``.

        Parameters
        ----------
        other : SArray | scalar
            If other is an SArray (or an instance of an SArray subclass), an
            element-wise comparison of the two arrays is requested through
            the proxy's vector_operator. Any other value is treated as a
            scalar and forwarded to the proxy's left_scalar_operator.

        Returns
        -------
        out : SArray
            A new SArray wrapping the proxy produced by the comparison.
        """
        with cython_context():
            # isinstance rather than an exact type match, so that instances of
            # SArray subclasses are handled as arrays and not as scalars.
            if isinstance(other, SArray):
                result = self.__proxy__.vector_operator(other.__proxy__, '<=')
            else:
                result = self.__proxy__.left_scalar_operator(other, '<=')
            return SArray(_proxy=result)


    def __ge__(self, other):
        """
        Greater-than-or-equal comparison: ``self >= other``.

        Parameters
        ----------
        other : SArray | scalar
            If other is an SArray (or an instance of an SArray subclass), an
            element-wise comparison of the two arrays is requested through
            the proxy's vector_operator. Any other value is treated as a
            scalar and forwarded to the proxy's left_scalar_operator.

        Returns
        -------
        out : SArray
            A new SArray wrapping the proxy produced by the comparison.
        """
        with cython_context():
            # isinstance rather than an exact type match, so that instances of
            # SArray subclasses are handled as arrays and not as scalars.
            if isinstance(other, SArray):
                result = self.__proxy__.vector_operator(other.__proxy__, '>=')
            else:
                result = self.__proxy__.left_scalar_operator(other, '>=')
            return SArray(_proxy=result)


    def __radd__(self, other):
        """
        Reflected addition, used by Python for ``other + self`` when the left
        operand does not handle the operation itself.

        The value ``other`` is passed to the proxy's right_scalar_operator
        with '+', and the resulting proxy is wrapped in a new SArray.
        """
        with cython_context():
            result_proxy = self.__proxy__.right_scalar_operator(other, '+')
            return SArray(_proxy=result_proxy)


    def __rsub__(self, other):
        """
        Reflected subtraction, used by Python for ``other - self`` when the
        left operand does not handle the operation itself.

        The value ``other`` is passed to the proxy's right_scalar_operator
        with '-', and the resulting proxy is wrapped in a new SArray.
        """
        with cython_context():
            result_proxy = self.__proxy__.right_scalar_operator(other, '-')
            return SArray(_proxy=result_proxy)


    def __rmul__(self, other):
        """
        Reflected multiplication, used by Python for ``other * self`` when the
        left operand does not handle the operation itself.

        The value ``other`` is passed to the proxy's right_scalar_operator
        with '*', and the resulting proxy is wrapped in a new SArray.
        """
        with cython_context():
            result_proxy = self.__proxy__.right_scalar_operator(other, '*')
            return SArray(_proxy=result_proxy)


    def __rdiv__(self, other):
        """
        Reflected division, used by Python for ``other / self`` when the left
        operand does not handle the operation itself.

        The value ``other`` is passed to the proxy's right_scalar_operator
        with '/', and the resulting proxy is wrapped in a new SArray.
        """
        with cython_context():
            result_proxy = self.__proxy__.right_scalar_operator(other, '/')
            return SArray(_proxy=result_proxy)


    def __eq__(self, other):
        """
        Equality comparison: ``self == other``.

        Note that this returns an SArray, not a single boolean.

        Parameters
        ----------
        other : SArray | scalar
            If other is an SArray (or an instance of an SArray subclass), an
            element-wise comparison of the two arrays is requested through
            the proxy's vector_operator. Any other value is treated as a
            scalar and forwarded to the proxy's left_scalar_operator.

        Returns
        -------
        out : SArray
            A new SArray wrapping the proxy produced by the comparison.
        """
        with cython_context():
            # isinstance rather than an exact type match, so that instances of
            # SArray subclasses are handled as arrays and not as scalars.
            if isinstance(other, SArray):
                result = self.__proxy__.vector_operator(other.__proxy__, '==')
            else:
                result = self.__proxy__.left_scalar_operator(other, '==')
            return SArray(_proxy=result)


    def __ne__(self, other):
        """
        Inequality comparison: ``self != other``.

        Note that this returns an SArray, not a single boolean.

        Parameters
        ----------
        other : SArray | scalar
            If other is an SArray (or an instance of an SArray subclass), an
            element-wise comparison of the two arrays is requested through
            the proxy's vector_operator. Any other value is treated as a
            scalar and forwarded to the proxy's left_scalar_operator.

        Returns
        -------
        out : SArray
            A new SArray wrapping the proxy produced by the comparison.
        """
        with cython_context():
            # isinstance rather than an exact type match, so that instances of
            # SArray subclasses are handled as arrays and not as scalars.
            if isinstance(other, SArray):
                result = self.__proxy__.vector_operator(other.__proxy__, '!=')
            else:
                result = self.__proxy__.left_scalar_operator(other, '!=')
            return SArray(_proxy=result)


    def __and__(self, other):
        """
        Logical element-wise 'and' against another SArray: ``self & other``.

        Parameters
        ----------
        other : SArray
            The array to combine with. Instances of SArray subclasses are
            accepted as well.

        Returns
        -------
        out : SArray
            A new SArray wrapping the proxy returned by vector_operator('&').

        Raises
        ------
        TypeError
            If other is not an SArray.
        """
        # Guard clause; isinstance so that SArray subclasses are accepted too.
        if not isinstance(other, SArray):
            raise TypeError("SArray can only perform logical and against another SArray")
        with cython_context():
            return SArray(_proxy=self.__proxy__.vector_operator(other.__proxy__, '&'))


    def __or__(self, other):
        """
        Logical element-wise 'or' against another SArray: ``self | other``.

        Parameters
        ----------
        other : SArray
            The array to combine with. Instances of SArray subclasses are
            accepted as well.

        Returns
        -------
        out : SArray
            A new SArray wrapping the proxy returned by vector_operator('|').

        Raises
        ------
        TypeError
            If other is not an SArray.
        """
        # Guard clause; isinstance so that SArray subclasses are accepted too.
        if not isinstance(other, SArray):
            raise TypeError("SArray can only perform logical or against another SArray")
        with cython_context():
            return SArray(_proxy=self.__proxy__.vector_operator(other.__proxy__, '|'))


    def __getitem__(self, other):
        """
        Logical filter: ``self[other]``.

        If other is an SArray of identical length, this subselects the
        elements of this array where the corresponding value in other
        evaluates to true.

        Parameters
        ----------
        other : SArray
            The filter array; must have the same length as this array.
            Instances of SArray subclasses are accepted as well.

        Returns
        -------
        out : SArray
            A new SArray wrapping the proxy returned by logical_filter.

        Raises
        ------
        IndexError
            If other is not an SArray, or if the two lengths differ.
        """
        # isinstance so that instances of SArray subclasses can be used as
        # filters too; every other kind of index is rejected.
        if not isinstance(other, SArray):
            raise IndexError("Invalid type to use for indexing")
        if len(other) != len(self):
            raise IndexError("Cannot perform logical indexing on arrays of different length.")
        with cython_context():
            return SArray(_proxy=self.__proxy__.logical_filter(other.__proxy__))

    def __materialize__(self):
        """
        Ask the proxy to materialize this SArray.

        For a lazily evaluated SArray this forces it to be persisted to disk,
        e.g. to enable benchmarking or other usage.
        """
        with cython_context():
            proxy = self.__proxy__
            proxy.materialize()

    def __is_materialized__(self):
        """
        Report whether the SArray has been materialized, as answered by the
        proxy. A materialized SArray has all contents saved on disk.
        """
        proxy = self.__proxy__
        return proxy.is_materialized()

    def size(self):
        """
        Return the size of the SArray as reported by the proxy.
        """
        proxy = self.__proxy__
        return proxy.size()

    def dtype(self):
        """
        Return the data type of the SArray as reported by the proxy.

        Returns
        -------
        out : type
            The type of the elements of the SArray.
        """
        proxy = self.__proxy__
        return proxy.dtype()


    def head(self, n=10):
        """
        Fetch the first n elements of the SArray from the proxy.

        The result is built in client memory, so care must be taken when the
        requested number of elements is big.

        Parameters
        ----------
        n : int
            The number of elements to fetch.

        Returns
        -------
        out
            Whatever the proxy's head(n) returns. NOTE(review): presumably a
            list of the first n elements (__str__ relies on its string form
            ending with a closing bracket) -- confirm against the proxy.
        """
        proxy = self.__proxy__
        return proxy.head(n)

    def vector_slice(self, start, end=None):
        """
        If this SArray contains vectors or recursive types, this returns a new SArray
        containing each individual vector sliced, between start and end, exclusive.

        For instance, if g is a vector of floats:

        >>> g
        dtype: array.array
        Rows: 2
        [[1.0, 2.0, 3.0],
         [2.0, 3.0, 4.0]]

        >>> g.vector_slice(0) # extracts the first element of each vector
        dtype: float
        Rows: 2
        [1.0, 2.0]

        >>> g.vector_slice(0, 2) # extracts the first two elements of each vector
        dtype: array.array
        Rows: 2
        [[1.0, 2.0], [2.0, 3.0]]

        If a vector cannot be sliced, the result will be None.

        For instance:

        >>> g
        dtype: array.array
        Rows: 3
        [[1.0], [1.0, 2.0], [1.0, 2.0, 3.0]]

        >>> g.vector_slice(2)
        dtype: float
        Rows: 3
        [None, None, 3.0]

        >>> g.vector_slice(0,2)
        dtype: list
        Rows: 3
        [None, [1.0, 2.0], [1.0, 2.0]]

        If l is a vector of mixed types (float, int, str, array, list, etc.):

        >>> l
        dtype: list
        Rows: 2
        [['a', 1, 1.0],
         ['b', 2, 2.0]]

        >>> l.vector_slice(0) # extracts the first element of each vector
        dtype: list
        Rows: 2
        [['a'], ['b']]

        >>> l.vector_slice(0, 2) # extracts the first two elements of each vector
        dtype: list
        Rows: 2
        [['a', 1], ['b', 2]]


        Parameters
        ----------
        start : int
            The start position of the slice.

        end : int, optional
            The end position of the slice. Note that the end position is NOT
            included in the slice. Thus a g.vector_slice(1,3) will extract
            entries in position 1 and 2. When omitted, only the entry at
            position start is extracted (end defaults to start + 1).

        Returns
        -------
        out : SArray
            A new SArray wrapping the proxy returned by the slice operation.

        Raises
        ------
        RuntimeError
            If the dtype of this SArray is neither array.array nor list.
        """
        # Query the dtype once and reuse it for both checks.
        element_type = self.dtype()
        if element_type != array.array and element_type != list:
            raise RuntimeError("Only Vector type can be sliced")
        # Identity test: "== None" would dispatch to the operand's __eq__.
        if end is None:
            end = start + 1

        with cython_context():
            return SArray(_proxy=self.__proxy__.vector_slice(start, end))

    def count_words(self, to_lower=True):
        """
        Build a new SArray of dictionary type holding per-element word counts.
        Each output element maps every word that appears in the corresponding
        input element to its number of occurrences. Only an SArray of string
        type is supported.

        For example, if the input SArray contains the following element:

            "This is about animal and is also about plants"

        then the output SArray contains an element that is a dictionary:

            {"this":1, "is":2, "about":2, "and":1, "animal":1, "also":1, "plants":1}

        Parameters
        ----------
        to_lower: bool, optional
            If True, all words are converted to lower case before counting.
            The option sent to the proxy is the result of ``to_lower == True``.

        Returns
        -------
        out : SArray
            The SArray of dictionary type, where each element contains the
            word counts of the corresponding input element.

        Raises
        ------
        TypeError
            If the dtype of this SArray is not str.
        """
        if self.dtype() != str:
            raise TypeError("Only SArray of string type is supported for counting bag of words")

        _mt._get_metric_tracker().track('sarray.count_words')

        # option dictionary handed to the proxy; may grow over time
        options = {"to_lower": to_lower == True}

        with cython_context():
            word_counts = self.__proxy__.count_bag_of_words(options)
            return SArray(_proxy=word_counts)

    def dict_trim_by_keys(self, keys, exclude=True):
        """
        For an SArray of dict type, trim every dictionary by the given keys.
        With exclude=True (the default) all keys that appear in the input key
        list are removed from each dict. With exclude=False only the keys that
        appear in the input key list are retained.

        For example:

        >>> sa = SArray([{'this':1, "is":1, "dog":2}, {"this": 2, "are": 2, "cat": 1}])
        >>> sa.dict_trim_by_keys(["this", "is", "and", "are"], exclude=True)
        Out:
            dict
            [{"dog":2}, {"cat": 1}]

        >>> sa.dict_trim_by_keys(["dog", "cat", "animal"], exclude=False)
        Out:
            dict
            [{"dog":2}, {"cat": 1}]

        Parameters
        ----------
        keys: list
            A collection of keys to trim down the elements in the SArray.
            A single string, or any value without an ``__iter__`` attribute,
            is wrapped into a one-element list.

        exclude: bool, optional
            If True (default), all keys that are in the input key list are
            removed. If False, only keys that are in the input key list are
            retained.

        Returns
        -------
        out : SArray
            A SArray of dictionary type, with each dict element trimmed
            according to the input criteria.
        """
        # Accept a lone key: anything that is a string or is not iterable
        # gets wrapped so the proxy always receives a list of keys.
        if not (not isinstance(keys, str) and hasattr(keys, "__iter__")):
            keys = [keys]

        _mt._get_metric_tracker().track('sarray.dict_trim_by_keys')

        with cython_context():
            trimmed = self.__proxy__.dict_trim_by_keys(keys, exclude)
            return SArray(_proxy=trimmed)

    def dict_trim_by_values(self, lower=None, upper=None):
        """
        If SArray dtype is dict, filter out each dict by checking the value of
        the dictionary against the given lower and upper bound. Only the values
        that fall between the bounds are retained (both lower and upper bound
        are inclusive).
        This only works if the dictionary value is comparable to the given
        lower/upper values. If not comparable, the values are not trimmed.

        For example, if input SArray contains the following element:

        >>> sa = SArray([{'this':1, "is":5, "dog":7}, {"this": 2, "are": 1, "cat": 5}])
        >>> sa.dict_trim_by_values(2, 5)
        Out:
            dict
            [{"is":5}, {"this": 2, "cat": 5}]

        This is an example where only the lower bound is given:

        >>> sa = SArray([{'this':1, "is":5, "dog":7}, {"this": 2, "are": 1, "cat": 5}])
        >>> sa.dict_trim_by_values(2)
        Out:
            dict
            [{"is":5, "dog":7}, {"this": 2, "cat": 5}]

        This is an example where only the upper bound is given:

        >>> sa = SArray([{'this':1, "is":5, "dog":7}, {"this": 2, "are": 1, "cat": 5}])
        >>> sa.dict_trim_by_values(upper=5)
        Out:
            dict
            [{'this':1, "is":5}, {"this": 2, "are": 1, "cat": 5}]

        Parameters
        ----------
        lower: int, long, float
            The lowest dictionary value that would be retained in the result. If not
            given, lower bound is not applied.

        upper: int, long, float
            The highest dictionary value that would be retained in the result. If not
            given, upper bound is not applied.

        Returns
        -------
        out : SArray
            A SArray of dictionary type, with each dict element trimmed according to
            the input criteria

        Raises
        ------
        TypeError
            If a given bound is not a real numeric value.
        """
        # Local import: only needed for the bound validation below.
        import numbers

        # numbers.Real covers int, long and float; the numpy scalar bases are
        # listed explicitly so numpy integers and floats of any width are
        # accepted as well.
        numeric_types = (numbers.Real, numpy.integer, numpy.floating)

        if lower is not None and not isinstance(lower, numeric_types):
            raise TypeError("lower bound has to be a numeric value")

        if upper is not None and not isinstance(upper, numeric_types):
            raise TypeError("upper bound has to be a numeric value")

        _mt._get_metric_tracker().track('sarray.dict_trim_by_values')

        with cython_context():
            return SArray(_proxy=self.__proxy__.dict_trim_by_values(lower, upper))

    def dict_keys(self):
        """
        For an SArray of dict type, collect the keys of every dictionary
        element into a new SArray, one collection of keys per input element.

        For example:

        >>> sa = SArray([{'this':1, "is":5, "dog":7}, {"this": 2, "are": 1, "cat": 5}])
        >>> sa.dict_keys()
        Out:
            list
            [["this", "is", "dog"], ["this", "are", "cat"]]

        Returns
        -------
        out : SArray
            A SArray of list type, where each element is the collection of
            keys of the corresponding input element.
        """
        _mt._get_metric_tracker().track('sarray.dict_keys')

        with cython_context():
            keys_proxy = self.__proxy__.dict_keys()
            return SArray(_proxy=keys_proxy)

    def dict_values(self):
        """
        Collect the values of every dictionary element into a new SArray.

        Intended for an SArray whose dtype is dict: each output element holds
        the values of the corresponding input dictionary.

        Example:

        >>> sa = SArray([{'this':1, "is":5, "dog":7}, {"this": 2, "are": 1, "cat": 5}])
        >>> sa.dict_values()
        Out:
            list
            [[1,5,7], [2,1,5]]

        Returns
        -------
        out : SArray
            An SArray of list type; each element is the collection of values
            taken from the matching element of this SArray.
        """
        tracker = _mt._get_metric_tracker()
        tracker.track('sarray.dict_values')

        with cython_context():
            values_proxy = self.__proxy__.dict_values()
            return SArray(_proxy=values_proxy)

    def dict_has_any_keys(self, keys):
        """
        Flag each dictionary element that contains at least one of ``keys``.

        Intended for an SArray whose dtype is dict. The result is an SArray of
        integers where 1 means the element has any of the given keys and 0
        means it has none of them.

        Example:

        >>> sa = SArray([{'this':1, "is":5, "dog":7}, {"this": 2, "are": 1, "cat": 5}, {"animal":1}])
        >>> sa.dict_has_any_keys(["is", "this", "are"])
        Out:
            int
            [1, 1, 0]

        Parameters
        ----------
        keys: list
            The keys to look for. A single string, or any value without
            ``__iter__``, is treated as a one-element list.

        Returns
        -------
        out : SArray
            An SArray of int type; each element tells whether the matching
            input element contains any key from the input list.
        """
        # A lone key (string or non-iterable) is wrapped so the proxy always
        # receives a list of keys.
        is_single_key = isinstance(keys, str) or (not hasattr(keys, "__iter__"))
        if is_single_key:
            keys = [keys]

        tracker = _mt._get_metric_tracker()
        tracker.track('sarray.dict_has_any_keys')

        with cython_context():
            flags_proxy = self.__proxy__.dict_has_any_keys(keys)
            return SArray(_proxy=flags_proxy)

    def dict_has_all_keys(self, keys):
        """
        Flag each dictionary element that contains every one of ``keys``.

        Intended for an SArray whose dtype is dict. The result is an SArray of
        integers where 1 means the element has all of the given keys and 0
        means at least one key is absent.

        Example:

        >>> sa = SArray([{'this':1, "is":5, "dog":7}, {"this": 2, "are": 1, "cat": 5}])
        >>> sa.dict_has_all_keys(["is", "this"])
        Out:
            int
            [1, 0]

        Parameters
        ----------
        keys: list
            The keys to look for. A single string, or any value without
            ``__iter__``, is treated as a one-element list.

        Returns
        -------
        out : SArray
            An SArray of int type; each element tells whether the matching
            input element contains all keys from the input list.
        """
        # A lone key (string or non-iterable) is wrapped so the proxy always
        # receives a list of keys.
        is_single_key = isinstance(keys, str) or (not hasattr(keys, "__iter__"))
        if is_single_key:
            keys = [keys]

        tracker = _mt._get_metric_tracker()
        tracker.track('sarray.dict_has_all_keys')

        with cython_context():
            flags_proxy = self.__proxy__.dict_has_all_keys(keys)
            return SArray(_proxy=flags_proxy)

    def apply(self, fn, dtype=None, skip_undefined=True, seed=None):
        """
        Returns a new SArray of dtype where each element in this SArray is
        transformed by fn(x).
        The fn should return a value which can be cast into dtype.

        If dtype is not specified, the first 100 elements of the Array are
        used to make a guess of the target datatype.

        Parameters
        ----------
        fn : function
            The function to transform each element.

        dtype : dtype
            The dtype of the new SArray. If None, the first 100
            elements of the array are used to guess the target
            data type.

        skip_undefined : bool, optional
            If True, will not apply fn to any undefined values.

        seed : int, optional
            Used as the seed if a random number generator is included in fn.

        Returns
        -------
        out : SArray
            The SArray transformed by fn.  Each element of the SArray is of
            type ``dtype``
        """
        assert inspect.isfunction(fn), "Input must be a function"
        dryrun = [fn(i) for i in self.head(100)]
        if dtype == None:
            dtype = self.__infer_dtype_of_list__(dryrun)

        if not seed:
            seed = time.time()

        _mt._get_metric_tracker().track('sarray.apply')

        with cython_context():
            return SArray(_proxy=self.__proxy__.transform(fn, dtype, skip_undefined, seed))


    def filter(self, fn, skip_undefined=True, seed=None):
        """
        Returns a new SArray which is filtered by the given function.
        If the lambda evaluates an element to true, this element is copied to the
        new SArray.  If not, it isn't.  Throws an exception if the return type
        of the lambda is not castable to a boolean value.

        Parameters
        ----------
        fn : function
            Function that filters the SArray. Must evaluate to bool or int.

        skip_undefined : bool, optional
            If True, will not apply fn to any undefined values.

        seed : int, optional
            Used as the seed if a random number generator is included in fn.

        Returns
        -------
        out : SArray
        """
        assert inspect.isfunction(fn), "Input must be a function"
        if not seed:
            seed = time.time()

        _mt._get_metric_tracker().track('sarray.filter')

        with cython_context():
            return SArray(_proxy=self.__proxy__.filter(fn, skip_undefined, seed))


    def sample(self, fraction, seed=None):
        """
        Returns an SArray which contains a subsample of the current SArray.

        Parameters
        ----------
        fraction : float
            The fractionage of the rows to fetch. Must be between 0 and 1.

        seed : int
            The random seed for the random number generator.

        Returns
        -------
        out : SArray
            The new SArray which contains the subsampled rows.
        """
        if (fraction > 1 or fraction < 0):
            raise ValueError('Invalid sampling rate: ' + str(fraction))
        if (self.size() == 0):
            return SArray()
        if not seed:
            seed = time.time()

        _mt._get_metric_tracker().track('sarray.sample')

        with cython_context():
            return SArray(_proxy=self.__proxy__.sample(fraction, seed))

    def _save_as_text(self, url):
        """
        Save the SArray to disk as text file.
        """
        raise NotImplementedError



    def all(self):
        """
        Tell whether every element of the SArray evaluates to True.

        An empty array yields True.

        Returns
        -------
        out : bool
        """
        with cython_context():
            proxy = self.__proxy__
            return proxy.all()


    def any(self):
        """
        Tell whether at least one element of the SArray evaluates to True.

        An empty array yields False.

        Returns
        -------
        out : bool
        """
        with cython_context():
            proxy = self.__proxy__
            return proxy.any()


    def max(self):
        """
        Get the largest value held in the SArray.

        An empty array yields None. An exception is thrown when the SArray
        has a non-numeric type.

        Returns
        -------
        out : type of SArray
            Maximum value of SArray
        """
        with cython_context():
            proxy = self.__proxy__
            return proxy.max()


    def min(self):
        """
        Get the smallest value held in the SArray.

        An empty array yields None. An exception is thrown when the SArray
        has a non-numeric type.

        Returns
        -------
        out : type of SArray
            Minimum value of SArray
        """
        with cython_context():
            proxy = self.__proxy__
            return proxy.min()


    def sum(self):
        """
        Add up all the values in the SArray.

        An empty array yields None. Arrays of strings or dictionaries cause
        an exception. For arrays of vectors, the element-wise sum over all
        vectors is returned when every vector has the same length; vectors of
        differing lengths cause an exception.

        For large values, this may overflow without warning.

        Returns
        -------
        out : type of SArray
            Sum of all values in SArray
        """
        with cython_context():
            proxy = self.__proxy__
            return proxy.sum()


    def mean(self):
        """
        Compute the mean of all the values in the SArray.

        An empty array yields None. An exception is thrown when the SArray
        has a non-numeric type, e.g. strings or dictionaries.

        Returns
        -------
        out : float
            Mean of all values in SArray
        """
        with cython_context():
            proxy = self.__proxy__
            return proxy.mean()


    def std(self, ddof=0):
        """
        Compute the standard deviation of all the values in the SArray as a
        float. An empty array yields None.

        Arrays of strings or dictionaries cause an exception.

        Parameters
        ----------
        ddof : int
            "delta degrees of freedom" in the variance calculation.

        Raises
        ------
        Throws an exception if ddof >= sarray size or if the sarray is
        a non-numeric type.

        Returns
        -------
        out : float
            The standard deviation of all the values.
        """
        with cython_context():
            proxy = self.__proxy__
            return proxy.std(ddof)


    def var(self, ddof=0):
        """
        Compute the variance of all the values in the SArray as a float.
        An empty array yields None.

        Arrays of strings or dictionaries cause an exception.

        Parameters
        ----------
        ddof : int
            "delta degrees of freedom" in the variance calculation.

        Raises
        ------
            Throws an exception if ddof >= sarray size or if the sarray is a
            non-numeric type.

        Returns
        -------
        out : float
            Variance of all values in SArray.
        """
        with cython_context():
            proxy = self.__proxy__
            return proxy.var(ddof)

    def num_missing(self):
        """
        Count the missing elements in the SArray.

        An empty array yields 0.

        Returns
        -------
        out : int
            Number of missing values.
        """
        with cython_context():
            proxy = self.__proxy__
            return proxy.num_missing()

    def nnz(self):
        """
        Count the non-zero elements in the SArray.

        An empty array yields 0.

        Returns
        -------
        out : int
            Number of non-zero elements.
        """
        with cython_context():
            proxy = self.__proxy__
            return proxy.nnz()

    def astype(self, dtype, undefined_on_failure=False):
        """
        Cast every value of this SArray to ``dtype`` and return the result as
        a new SArray.

        Throws an exception if the types are not castable to the given type.

        Parameters
        ----------
        dtype : type
            The type to cast the elements to in SArray. Its ``__name__`` is
            used to label the usage metric.

        undefined_on_failure: bool
            If set to true, runtime cast failures will be emitted as
            missing values rather than failing.

        Returns
        -------
        out : SArray (of type dtype)
            The SArray converted to the dtype
        """
        type_name = str(dtype.__name__)
        tracker = _mt._get_metric_tracker()
        tracker.track('sarray.astype.%s' % type_name)

        with cython_context():
            cast_proxy = self.__proxy__.astype(dtype, undefined_on_failure)
            return SArray(_proxy=cast_proxy)

    def clip(self, lower=numpy.nan, upper=numpy.nan):
        """
        Build a new SArray in which every value lying outside the given
        bounds is replaced by the nearest bound.

        Works on numeric arrays and on vector arrays; for vector arrays each
        individual element of each vector is clipped.

        Passing numpy.nan for lower or upper means that bound does not exist.

        Parameters
        ----------
        lower : int
            The lower bound to clip to. Ignored if equal to numpy.nan

        upper : int
            The upper bound to clip to. Ignored if equal to numpy.nan

        Raises
        ------
            Throws an exception if the SArray is empty, the types are non-numeric,
            or if the upper bound is less than the lower bound

        Returns
        -------
        out : SArray
        """
        with cython_context():
            clipped_proxy = self.__proxy__.clip(lower, upper)
            return SArray(_proxy=clipped_proxy)

    def clip_lower(self, threshold):
        """
        Build a new SArray in which every value is clipped to the given
        lower bound.

        Works on numeric arrays and on vector arrays; for vector arrays each
        individual element of each vector is clipped.

        Parameters
        ----------
        threshold : float
            The lower bound to clip values to

        Raises
        ------
        Exception
            Throws an exception if the SArray is empty or the types are non-numeric

        Returns
        -------
        out : SArray
        """
        # numpy.nan as the upper argument means "no upper bound".
        no_upper_bound = numpy.nan
        with cython_context():
            clipped_proxy = self.__proxy__.clip(threshold, no_upper_bound)
            return SArray(_proxy=clipped_proxy)


    def clip_upper(self, threshold):
        """
        Build a new SArray in which every value is clipped to the given
        upper bound.

        Works on numeric arrays and on vector arrays; for vector arrays each
        individual element of each vector is clipped.

        Parameters
        ----------
        threshold : float
            The upper bound to clip values to

        Raises
        ------
        Exception
            Throws an exception if the SArray is empty or the types are non-numeric

        Returns
        -------
        out : SArray
        """
        # numpy.nan as the lower argument means "no lower bound".
        no_lower_bound = numpy.nan
        with cython_context():
            clipped_proxy = self.__proxy__.clip(no_lower_bound, threshold)
            return SArray(_proxy=clipped_proxy)

    def tail(self, n=10):
        """
        Fetch the last n elements of the SArray as a list.

        Does not throw. Be careful when the returned object would be big
        (larger than available memory).

        Parameters
        ----------
        n : int
            The number of elements to fetch

        Returns
        -------
        out : list
            The last n elements of the SArray
        """
        with cython_context():
            proxy = self.__proxy__
            return proxy.tail(n)


    def dropna(self):
        """
        Build a new SArray holding only the non-missing values of this array.
        The returned SArray is never larger than the original.

        A missing value shows up in an SArray as 'None'.

        Returns
        -------
        out : SArray
            The new SArray with missing values removed.
        """
        tracker = _mt._get_metric_tracker()
        tracker.track('sarray.dropna')

        with cython_context():
            kept_proxy = self.__proxy__.drop_missing_values()
            return SArray(_proxy=kept_proxy)


    def topk_index(self, topk=10, reverse=False):
        """
        Mark which elements belong to the topk elements of this SArray.

        The result is a new SArray of type 'int' with the same size as this
        one. An entry is '1' when the corresponding element of this SArray is
        among the topk elements (descending order by default) and '0' when it
        is not.

        Parameters
        ----------
        topk : int
            The number of elements to determine if 'top'

        reverse: bool
            If True, return the topk elements in ascending order

        Returns
        -------
        out : SArray (of type int)

        Notes
        -----
        This is mostly used internally by SFrame's topk function.
        """
        with cython_context():
            mask_proxy = self.__proxy__.topk_index(topk, reverse)
            return SArray(_proxy=mask_proxy)

    def sketch_summary(self, background=False):
        """
        Build a graphlab.Sketch object over this SArray. The sketch can be
        queried for many descriptive statistics, many of which are
        approximate. See the :class:`~graphlab.Sketch` documentation for more
        detail.

        Parameters
        ----------
        background : boolean
          If True, the sketch construction will return immediately and the
          sketch will be constructed in the background. While this is going on,
          the sketch can be queried incrementally, but at a performance penalty.
          Defaults to False.

        Returns
        -------
        out : Sketch
            Sketch object that contains descriptive statistics for this SArray.
            Many of the statistics are approximate.
        """
        # Sketch is imported at call time rather than at module level.
        from graphlab.data_structures.sketch import Sketch

        tracker = _mt._get_metric_tracker()
        tracker.track('sarray.sketch_summary')

        summary = Sketch(self, background)
        return summary

    def append(self, other):
        """
        Append the second SArray to current SArray. Returns a new SArray that
        contains rows from both SArrays.
        Both SArrays have to have the same type.

        Parameters
        ----------
        other : SArray
            Another SArray (or instance of an SArray subclass) whose rows are
            appended to current SArray

        Returns
        -------
        out_sf : SArray
            A new SArray that contains rows from both SArrays, with second SArray's rows after
            first SArray's rows

        Raises
        ------
        RuntimeError
            If other is not an SArray, or if the two SArrays do not have the
            same dtype.
        """
        _mt._get_metric_tracker().track('sarray.append')

        # isinstance instead of an exact type match, so that instances of
        # SArray subclasses are accepted as well.
        if not isinstance(other, SArray):
            raise RuntimeError("SArray append can only work with SArray")

        if self.dtype() != other.dtype():
            raise RuntimeError("Data types in both SArrays have to be the same")

        with cython_context():
            return SArray(_proxy=self.__proxy__.append(other.__proxy__))

    def unique(self):
        """
        Build an SArray that holds only the unique values of this SArray.
        The order of the values in this SArray is not necessarily preserved
        in the new SArray.

        Returns
        -------
        out : SArray
            A new SArray that contains the unique values of the given SArray.
        """
        _mt._get_metric_tracker().track('sarray.unique')

        # Put this array into a one-column SFrame and group by that column
        # with no aggregators; the grouped column is then wrapped as the result.
        column_name = 'X1'
        frame = gl.SFrame()
        frame.add_column(self, column_name)

        grouped = frame.groupby(column_name, {})
        unique_column = grouped[column_name]

        return SArray(_proxy=unique_column.__proxy__)
