"""
This module defines the SArray class which provides the
ability to create, access and manipulate a remote scalable array object.

SArray acts similarly to pandas.Series but without indexing.
The data is immutable, homogeneous, and is stored on the GraphLab Server side.
"""
import graphlab.connect as _mt
import graphlab.connect.main as glconnect
from graphlab.cython.cy_type_utils import pytype_from_dtype, infer_type_of_list, is_numeric_type
from graphlab.cython.cy_sarray import UnitySArrayProxy
from graphlab.cython.context import debug_trace as cython_context
from graphlab.util import make_internal_url
import graphlab as gl
import graphlab.canvas
import graphlab.canvas.inspect
import inspect
import math
from graphlab.deps import numpy, HAS_NUMPY
from graphlab.deps import pandas, HAS_PANDAS
import time
import array
import graphlab.meta as meta
import itertools

__all__ = ['SArray']

def _create_sequential_sarray(size, start=0, reverse=False):
    """
    Ask the unity server for an SArray holding a sequence of integers.

    Parameters
    ----------
    size : int
        Number of elements requested.

    start : int, optional
        Value forwarded to the server as the start of the sequence.

    reverse : bool, optional
        Forwarded to the server unchanged; presumably selects a descending
        sequence -- confirm against the server implementation.

    Returns
    -------
    out : SArray
        An SArray wrapping the proxy returned by the server.

    Raises
    ------
    TypeError
        If an argument is not exactly of the expected type.
    """
    # Exact type checks are kept (rather than isinstance): with isinstance a
    # bool would be accepted wherever an int is expected.
    if type(size) is not int:
        raise TypeError("size must be int")

    if type(start) is not int:
        raise TypeError("start must be int")

    if type(reverse) is not bool:
        raise TypeError("reverse must be bool")

    with cython_context():
        return SArray(_proxy=glconnect.get_unity().create_sequential_sarray(size, start, reverse))

class SArray(object):
    """
    SArray is an array object scaled to big data.  The data in SArray is
    immutable and homogeneously typed and with missing value support.  Each
    column in an :py:class:`graphlab.SFrame` is an SArray.

    SArray can be constructed in various ways:

    Construct an SArray from list.

    >>> from graphlab import SArray
    >>> sa = SArray(data=[1,2,3,4,5], dtype=int)

    Construct an SArray from numpy.ndarray.

    >>> sa = SArray(data=numpy.asarray([1,2,3,4,5]), dtype=int)
    or:
    >>> sa = SArray(numpy.asarray([1,2,3,4,5]), int)

    Construct an SArray from pandas.Series.

    >>> sa = SArray(data=pd.Series([1,2,3,4,5]), dtype=int)
    or:
    >>> sa = SArray(pd.Series([1,2,3,4,5]), int)

    If the type is not specified, automatic inference is attempted:

    >>> SArray(data=[1,2,3,4,5]).dtype()
    int
    >>> SArray(data=[1,2,3,4,5.0]).dtype()
    float

    The SArray supports standard datatypes such as: integer, float and string.
    It also supports three higher level datatypes: float arrays, dict
    and list (array of arbitrary types).

    Create an SArray from a list of strings:

    >>> sa = SArray(data=['a','b'])

    Create an SArray from a list of float arrays;

    >>> sa = SArray([[1,2,3], [3,4,5]])

    Create an SArray from a list of lists:

    >>> sa = SArray(data=[['a', 1, {'work': 3}], [2, 2.0]])

    Create an SArray from a list of dictionaries:

    >>> sa = SArray(data=[{'a':1, 'b': 2}, {'b':2, 'c': 1}])

    Construct an SArray from local text file. (Only works for local server).

    >>> sa = SArray('/tmp/a_to_z.txt.gz')

    Construct an SArray from a text file downloaded from a URL.

    >>> sa = SArray('http://s3-us-west-2.amazonaws.com/testdatasets/a_to_z.txt.gz')

    Construct an SArray from a text file stored on the server side.

    >>> sa = SArray('remote:///tmp/a_to_z.txt.gz')

    **Numeric Operators**

    SArrays support a large number of vectorized operations on numeric types.
    For instance:

    >>> sa = SArray([1,1,1,1,1])
    >>> sb = SArray([2,2,2,2,2])
    >>> sc = sa + sb
    >>> sc
    dtype: int
    Rows: 5
    [3, 3, 3, 3, 3]
    >>> sc + 2
    dtype: int
    Rows: 5
    [5, 5, 5, 5, 5]

    Operators which are supported include all numeric operators (+,-,*,/), as
    well as comparison operators (>, >=, <, <=), and logical operators (&, |).

    For instance:

    >>> sa = SArray([1,2,3,4,5])
    >>> (sa >= 2) & (sa <= 4)
    dtype: int
    Rows: 5
    [0, 1, 1, 1, 0]

    The numeric operators (+,-,*,/) also work on array types:

    >>> sa = SArray(data=[[1.0,1.0], [2.0,2.0]])
    >>> sa + 1
    dtype: list
    Rows: 2
    [array('f', [2.0, 2.0]), array('f', [3.0, 3.0])]
    >>> sa + sa
    dtype: list
    Rows: 2
    [array('f', [2.0, 2.0]), array('f', [4.0, 4.0])]

    The addition operator (+) can also be used for string concatenation:

    >>> sa = SArray(data=['a','b'])
    >>> sa + "x"
    dtype: str
    Rows: 2
    ['ax', 'bx']

    This can be useful for performing type interpretation of lists or
    dictionaries stored as strings:

    >>> sa = SArray(data=['a,b','c,d'])
    >>> ("[" + sa + "]").astype(list) # adding brackets make it look like a list
    dtype: list
    Rows: 2
    [['a', 'b'], ['c', 'd']]

    All comparison operations and boolean operators are supported and emit
    binary SArrays.

    >>> sa = SArray([1,2,3,4,5])
    >>> sa >= 2
    dtype: int
    Rows: 5
    [0, 1, 1, 1, 1]
    >>> (sa >= 2) & (sa <= 4)
    dtype: int
    Rows: 5
    [0, 1, 1, 1, 0]


    **Element Access and Slicing**
    SArrays can be accessed by integer keys just like a regular python list.
    Such operations may not be fast on large datasets so looping over an SArray
    should be avoided.

    >>> sa = SArray([1,2,3,4,5])
    >>> sa[0]
    1
    >>> sa[2]
    3
    >>> sa[5]
    IndexError: SFrame index out of range

    Negative indices can be used to access elements from the tail of the array
    >>> sa[-1] # returns the last element
    5
    >>> sa[-2] # returns the second to last element
    4

    The SArray also supports the full range of python slicing operators:

    >>> sa[1000:] # Returns an SArray containing rows 1000 to the end
    >>> sa[:1000] # Returns an SArray containing rows 0 to row 999 inclusive
    >>> sa[0:1000:2] # Returns an SArray containing rows 0 to row 1000 in steps of 2
    >>> sa[-100:] # Returns an SArray containing last 100 rows
    >>> sa[-100:len(sa):2] # Returns an SArray containing last 100 rows in steps of 2

    **Logical Filter**

    An SArray can be filtered using

    >>> array[binary_filter]

    where array and binary_filter are SArrays of the same length. The result is
    a new SArray which contains only elements of 'array' where its matching row
    in the binary_filter is non zero.

    This permits the use of boolean operators that can be used to perform
    logical filtering operations.  For instance:

    >>> sa = SArray([1,2,3,4,5])
    >>> sa[(sa >= 2) & (sa <= 4)]
    dtype: int
    Rows: 3
    [2, 3, 4]

    This can also be used more generally to provide filtering capability which
    is otherwise not expressible with simple boolean functions. For instance:

    >>> sa = SArray([1,2,3,4,5])
    >>> sa[sa.apply(lambda x: math.log(x) <= 1)]
    dtype: int
    Rows: 2
    [1, 2]

    This is equivalent to

    >>> sa.filter(lambda x: math.log(x) <= 1)
    dtype: int
    Rows: 2
    [1, 2]

    **Iteration**

    The SArray is also iterable, but not efficiently since this involves a
    streaming transmission of data from the server to the client. This should
    not be used for large data.

    >>> sa = SArray([1,2,3,4,5])
    >>> [i + 1 for i in sa]
    [2, 3, 4, 5, 6]

    This can be used to convert an SArray to a list:

    >>> sa = SArray([1,2,3,4,5])
    >>> l = list(sa)
    >>> l
    [1, 2, 3, 4, 5]

    Parameters
    ----------
    data : list | numpy.ndarray | pandas.Series | string
        The input data. If this is a list, numpy.ndarray, or pandas.Series
        the data in the list is converted and stored in an SArray.
        Alternatively if this is a string, it is interpreted as a path (or
        url) to a text file.  Each line of the text file is loaded as a
        separate row. If data is a directory where an SArray was previously
        saved, this is loaded as a SArray reading directly out of that
        directory.

    dtype : type
        The data type of the SArray. Supported types are: {int,
        float, str}.  If not specified (None), we attempt to evaluate
        it from the input.  If it is a numpy array, or a Pandas
        series, the dtype of the array/series is used. If it is a
        list, it is first converted to a Pandas series, and the
        dtype of that is used. If it is a URL or path to a text file,
        we default to strings.

    ignore_cast_failure : bool
        If true, ignores casting failures, but warns
        when the elements that cannot be casted into the dtype
        specified.

    Notes
    -----
    When working with the graphlab EC2 instance, e.g. :py:func:`graphlab.aws.launch_EC2()`,
    SArray cannot be constructed using local file path, because it involves
    potentially large amount of data transfer from client to server.
    However, it is still ok to use a remote file path.

    >>> graphlab.aws.launch_EC2('m1.large')
    >>> sa = SArray('~/mydata/foo.csv') # throws exception
    >>> sa = SArray('remote:///mydata/foo.csv') # works
    >>> sa = SArray("http://testdatasets.s3-website-us-west-2.amazonaws.com/users.csv.gz") # works
    >>> sa = SArray("s3://mybucket/foo.csv") # works
    >>> graphlab.aws.terminate_EC2()

    Similar restriction applies to :py:class:`graphlab.SGraph` and :py:class:`graphlab.SFrame`.
    """

    def __init__(self, data=[], dtype=None, ignore_cast_failure=False, _proxy=None):
        """__init__(data=list(), dtype=None, ignore_cast_failure=False)
        Build a new SArray from in-memory data, another SArray, or a path/url.

        Parameters
        ----------
        data : list | array.array | numpy.ndarray | pandas.Series | string | SArray
            The source of the values. Sequences, numpy arrays and pandas
            series are loaded element by element. A string is interpreted as
            a path (or url) and handed to the server for loading. An SArray
            shares the proxy of the given array (dtype is not applied).

        dtype : type
            The element type of the SArray, given as a Python type object
            (e.g. ``int``, not ``'int'``). When None it is inferred: from the
            elements for lists and array.array, from the numpy/pandas dtype
            for arrays and series (falling back to element inspection for
            object dtype), and ``str`` for paths. A 2-dimensional numpy array
            yields ``array.array`` rows for int/float data and ``list`` rows
            otherwise.

        ignore_cast_failure : bool
            Forwarded to the loader for in-memory data; if true, elements
            that cannot be cast to dtype are ignored (with a warning).

        _proxy : None
            Internal, do not use.

        Raises
        ------
        TypeError
            If dtype is not a type object, if a numpy array with more than
            two dimensions needs type inference, or if data is of an
            unsupported kind.

        Notes
        -----
        If data is pandas.Series, the index will be ignored.
        """
        _mt._get_metric_tracker().track('sarray.init')
        if dtype is not None:
            if type(dtype) != type:
                raise TypeError("dtype must be a type, e.g. use int rather than 'int'")

        # Cheap cases first: wrap an existing proxy, or share another SArray's.
        if (_proxy):
            self.__proxy__ = _proxy
            return
        if type(data) == SArray:
            self.__proxy__ = data.__proxy__
            return

        self.__proxy__ = UnitySArrayProxy(glconnect.get_client())

        # Classify the source once; the flags are reused for both the type
        # inference and the loading step below.
        from_sequence = isinstance(data, list) or isinstance(data, array.array)
        from_series = HAS_PANDAS and isinstance(data, pandas.Series)
        from_ndarray = HAS_NUMPY and isinstance(data, numpy.ndarray)
        from_path = isinstance(data, str)

        if dtype is None:
            if from_sequence:
                # inspect the elements to find a common type
                dtype = infer_type_of_list(data)
            elif from_series:
                dtype = pytype_from_dtype(data.dtype)
                if dtype == object:
                    # object dtype is too coarse, look at the elements
                    dtype = infer_type_of_list(data)
            elif from_ndarray:
                dtype = pytype_from_dtype(data.dtype)
                if dtype == object:
                    # object dtype is too coarse, look at the elements
                    dtype = infer_type_of_list(data)
                ndim = len(data.shape)
                if ndim > 2:
                    raise TypeError("Cannot convert Numpy arrays of greater than 2 dimensions")
                if ndim == 2:
                    # each row becomes a numeric array or a generic list
                    if dtype == float or dtype == int:
                        dtype = array.array
                    else:
                        dtype = list
            elif from_path:
                # text files are read as one string per line
                dtype = str

        if from_series:
            with cython_context():
                # only the values are loaded; the index is dropped
                self.__proxy__.load_from_iterable(data.values, dtype, ignore_cast_failure)
        elif from_ndarray or from_sequence:
            with cython_context():
                self.__proxy__.load_from_iterable(data, dtype, ignore_cast_failure)
        elif from_path:
            internal_url = make_internal_url(data)
            with cython_context():
                self.__proxy__.load_autodetect(internal_url, dtype)
        else:
            raise TypeError("Unexpected data source. "
                            "Possible data source types are: list, "
                            "numpy.ndarray, pandas.Series, and string(url)")

    @classmethod
    def from_const(cls, value, size):
        """
        Constructs an SArray of the given size where every element is value.

        Parameters
        ----------
        value : [int | float | str | array.array | list | dict]
          The value to fill the sarray
        size : int
          The size of the SArray; must be a positive int.

        Returns
        -------
        out : SArray
            A new SArray backed by a proxy loaded with the constant value.

        Raises
        ------
        AssertionError
            If size is not an int or is not positive.
        TypeError
            If value is not exactly one of the supported types.
        """
        # Raised explicitly instead of via an `assert` statement so that the
        # validation is not stripped when Python runs with -O. The exception
        # type is kept as AssertionError for existing callers.
        if type(size) is not int or size <= 0:
            raise AssertionError("size must be a positive int")
        if type(value) not in (int, float, str, array.array, list, dict):
            raise TypeError('Cannot create sarray of value type %s' % str(type(value)))
        proxy = UnitySArrayProxy(glconnect.get_client())
        proxy.load_from_const(value, size)
        return cls(_proxy=proxy)

    def __get_content_identifier__(self):
        """
        Returns the content identifier reported by the underlying proxy
        for this sarray.
        """
        with cython_context():
            identifier = self.__proxy__.get_content_identifier()
            return identifier

    def save(self, targetfile):
        """
        Saves the SArray to the given location.

        Parameters
        ----------
        targetfile : string
            The directory to save the SArray. Either a local path or a remote
            URL. It is converted to an internal url before being passed to
            the server.
        """
        with cython_context():
            destination = make_internal_url(targetfile)
            self.__proxy__.save(destination)

    def __repr__(self):
        """
        Returns a multi-line description of the array: the name of its
        dtype, the number of rows, and the string form of the array.
        """
        type_name = str(self.dtype().__name__)
        row_count = str(self.size())
        pieces = ["dtype: ", type_name, "\n",
                  "Rows: ", row_count, "\n",
                  str(self)]
        return "".join(pieces)

    def __str__(self):
        """
        Returns the list representation of the first 100 elements. When the
        array holds more than 100 elements the closing bracket is replaced
        by ", ... ]" to signal truncation.
        """
        preview = list(self.head(100))
        text = str(preview)
        if self.size() > 100:
            # drop the final "]" and append the ellipsis marker
            text = text[:-1] + ", ... ]"
        return text

    def __nonzero__(self):
        """
        Truth value of the array: True when it holds at least one element.
        """
        element_count = self.size()
        return element_count != 0

    def __len__(self):
        """
        Number of elements in the array (same value as size()).
        """
        element_count = self.size()
        return element_count

    def __iter__(self):
        """
        Returns a generator over the elements of the array. Elements are
        pulled from the proxy in fixed-size batches; a batch shorter than
        the requested size marks the end of the data.
        """
        def generator():
            batch_size = 262144
            self.__proxy__.begin_iterator()
            while True:
                batch = self.__proxy__.iterator_get_next(batch_size)
                for element in batch:
                    yield element
                # a short (or empty) batch means nothing is left to fetch
                if len(batch) != batch_size:
                    break

        return generator()

    def __add__(self, other):
        """
        Implements ``self + other``. When other is exactly an SArray the two
        arrays are added element-wise; any other value is treated as a
        scalar that is added to every element. Returns a new SArray.
        """
        with cython_context():
            if type(other) is not SArray:
                result = self.__proxy__.left_scalar_operator(other, '+')
            else:
                result = self.__proxy__.vector_operator(other.__proxy__, '+')
            return SArray(_proxy=result)

    def __sub__(self, other):
        """
        Implements ``self - other``. When other is exactly an SArray the
        subtraction is element-wise; any other value is treated as a scalar
        that is subtracted from every element. Returns a new SArray.
        """
        with cython_context():
            if type(other) is not SArray:
                result = self.__proxy__.left_scalar_operator(other, '-')
            else:
                result = self.__proxy__.vector_operator(other.__proxy__, '-')
            return SArray(_proxy=result)

    def __mul__(self, other):
        """
        Implements ``self * other``. When other is exactly an SArray the
        multiplication is element-wise; any other value is treated as a
        scalar that multiplies every element. Returns a new SArray.
        """
        with cython_context():
            if type(other) is not SArray:
                result = self.__proxy__.left_scalar_operator(other, '*')
            else:
                result = self.__proxy__.vector_operator(other.__proxy__, '*')
            return SArray(_proxy=result)

    def __div__(self, other):
        """
        Implements ``self / other``. When other is exactly an SArray the
        division is element-wise; any other value is treated as a scalar
        that divides every element. Returns a new SArray.
        """
        with cython_context():
            if type(other) is not SArray:
                result = self.__proxy__.left_scalar_operator(other, '/')
            else:
                result = self.__proxy__.vector_operator(other.__proxy__, '/')
            return SArray(_proxy=result)

    def __lt__(self, other):
        """
        Implements ``self < other``. When other is exactly an SArray the
        comparison is element-wise between the two arrays; any other value
        is treated as a scalar compared against every element. Returns a
        new SArray holding the comparison results.
        """
        with cython_context():
            if type(other) is not SArray:
                result = self.__proxy__.left_scalar_operator(other, '<')
            else:
                result = self.__proxy__.vector_operator(other.__proxy__, '<')
            return SArray(_proxy=result)

    def __gt__(self, other):
        """
        Implements ``self > other``. When other is exactly an SArray the
        comparison is element-wise between the two arrays; any other value
        is treated as a scalar compared against every element. Returns a
        new SArray holding the comparison results.
        """
        with cython_context():
            if type(other) is not SArray:
                result = self.__proxy__.left_scalar_operator(other, '>')
            else:
                result = self.__proxy__.vector_operator(other.__proxy__, '>')
            return SArray(_proxy=result)


    def __le__(self, other):
        """
        Implements ``self <= other``. When other is exactly an SArray the
        comparison is element-wise between the two arrays; any other value
        is treated as a scalar compared against every element. Returns a
        new SArray holding the comparison results.
        """
        with cython_context():
            if type(other) is not SArray:
                result = self.__proxy__.left_scalar_operator(other, '<=')
            else:
                result = self.__proxy__.vector_operator(other.__proxy__, '<=')
            return SArray(_proxy=result)


    def __ge__(self, other):
        """
        Implements ``self >= other``. When other is exactly an SArray the
        comparison is element-wise between the two arrays; any other value
        is treated as a scalar compared against every element. Returns a
        new SArray holding the comparison results.
        """
        with cython_context():
            if type(other) is not SArray:
                result = self.__proxy__.left_scalar_operator(other, '>=')
            else:
                result = self.__proxy__.vector_operator(other.__proxy__, '>=')
            return SArray(_proxy=result)


    def __radd__(self, other):
        """
        Reflected addition, invoked by Python for ``other + self`` when
        other does not handle the operation itself. other is passed to the
        proxy as the scalar operand; a new SArray is returned.
        """
        with cython_context():
            result = self.__proxy__.right_scalar_operator(other, '+')
            return SArray(_proxy=result)


    def __rsub__(self, other):
        """
        Reflected subtraction, invoked by Python for ``other - self`` when
        other does not handle the operation itself. other is passed to the
        proxy as the scalar operand; a new SArray is returned.
        """
        with cython_context():
            result = self.__proxy__.right_scalar_operator(other, '-')
            return SArray(_proxy=result)


    def __rmul__(self, other):
        """
        Reflected multiplication, invoked by Python for ``other * self``
        when other does not handle the operation itself. other is passed to
        the proxy as the scalar operand; a new SArray is returned.
        """
        with cython_context():
            result = self.__proxy__.right_scalar_operator(other, '*')
            return SArray(_proxy=result)


    def __rdiv__(self, other):
        """
        Reflected division, invoked by Python for ``other / self`` when
        other does not handle the operation itself. other is passed to the
        proxy as the scalar operand; a new SArray is returned.
        """
        with cython_context():
            result = self.__proxy__.right_scalar_operator(other, '/')
            return SArray(_proxy=result)


    def __eq__(self, other):
        """
        Implements ``self == other``. When other is exactly an SArray the
        comparison is element-wise between the two arrays; any other value
        is treated as a scalar compared against every element. Returns a
        new SArray holding the comparison results (not a bool).
        """
        with cython_context():
            if type(other) is not SArray:
                result = self.__proxy__.left_scalar_operator(other, '==')
            else:
                result = self.__proxy__.vector_operator(other.__proxy__, '==')
            return SArray(_proxy=result)


    def __ne__(self, other):
        """
        Implements ``self != other``. When other is exactly an SArray the
        comparison is element-wise between the two arrays; any other value
        is treated as a scalar compared against every element. Returns a
        new SArray holding the comparison results (not a bool).
        """
        with cython_context():
            if type(other) is not SArray:
                result = self.__proxy__.left_scalar_operator(other, '!=')
            else:
                result = self.__proxy__.vector_operator(other.__proxy__, '!=')
            return SArray(_proxy=result)


    def __and__(self, other):
        """
        Implements ``self & other``: an element-wise logical 'and' with
        another SArray. Raises TypeError if other is not exactly an SArray.
        """
        if type(other) is not SArray:
            raise TypeError("SArray can only perform logical and against another SArray")
        with cython_context():
            result = self.__proxy__.vector_operator(other.__proxy__, '&')
            return SArray(_proxy=result)


    def __or__(self, other):
        """
        Implements ``self | other``: an element-wise logical 'or' with
        another SArray. Raises TypeError if other is not exactly an SArray.
        """
        if type(other) is not SArray:
            raise TypeError("SArray can only perform logical or against another SArray")
        with cython_context():
            result = self.__proxy__.vector_operator(other.__proxy__, '|')
            return SArray(_proxy=result)


    def __getitem__(self, other):
        """
        Index, slice or filter the array depending on the type of the key.

        If the key is an SArray of identical length, this function performs a
        logical filter: i.e. it subselects all the elements in this array
        where the corresponding value in the other array evaluates to true.
        If the key is an integer this returns a single row of
        the SArray; negative integers count from the end of the array.
        If the key is a slice, this returns an SArray with the
        sliced rows.  See :py:class:`graphlab.SArray` for usage examples.

        Raises
        ------
        IndexError
            If a filter SArray has a different length, if an integer index
            falls outside the array (in either direction), or if the key is
            of an unsupported type.
        """
        if type(other) is SArray:
            if len(other) != len(self):
                raise IndexError("Cannot perform logical indexing on arrays of different length.")
            with cython_context():
                return SArray(_proxy = self.__proxy__.logical_filter(other.__proxy__))
        elif type(other) is int:
            # fetch the length once; every len() is a call into the proxy
            length = len(self)
            if other < 0:
                other = length + other
            # An index that is still negative after the adjustment pointed
            # before the first element and must be rejected as well.
            if other < 0 or other >= length:
                raise IndexError("SFrame index out of range")
            return list(SArray(_proxy = self.__proxy__.copy_range(other, 1, other+1)))[0]
        elif type(other) is slice:
            start = other.start
            stop = other.stop
            step = other.step
            if start is None:
                start = 0
            if stop is None:
                stop = len(self)
            if step is None:
                step = 1
            # handle negative indices
            if start < 0:
                start = len(self) + start
            if stop < 0:
                stop = len(self) + stop
            return SArray(_proxy = self.__proxy__.copy_range(start, step, stop))
        else:
            raise IndexError("Invalid type to use for indexing")

    def __materialize__(self):
        """
        Asks the proxy to materialize this sarray, i.e. to commit any
        lazily evaluated operations.
        """
        with cython_context():
            proxy = self.__proxy__
            proxy.materialize()

    def __is_materialized__(self):
        """
        Reports whether the proxy considers this sarray materialized.
        """
        materialized = self.__proxy__.is_materialized()
        return materialized

    def size(self):
        """
        Returns the number of elements in the sarray, as reported by the
        proxy.
        """
        element_count = self.__proxy__.size()
        return element_count

    def dtype(self):
        """
        Returns the element type of the sarray.

        Returns
        -------
        out : type
            The type of the elements, as reported by the proxy.
        """
        element_type = self.__proxy__.dtype()
        return element_type

    def head(self, n=10):
        """
        Returns the leading rows of the SArray as a new SArray.

        Parameters
        ----------
        n : int
            The number of rows to fetch. Defaults to 10.

        Returns
        -------
        out : SArray
            A new SArray which contains the first n rows of the current SArray.
        """
        leading_rows = self.__proxy__.head(n)
        return SArray(_proxy=leading_rows)

    def vector_slice(self, start, end=None):
        """
        If this SArray contains vectors or recursive types, this returns a new SArray
        containing each individual vector sliced, between start and end, exclusive.

        Examples
        ---------

        For instance, if g is a vector of floats:

        >>> g
        dtype: array.array
        Rows: 2
        [[1.0, 2.0, 3.0],
         [2.0, 3.0, 4.0]]

        >>> g.vector_slice(0) # extracts the first element of each vector
        dtype: float
        Rows: 2
        [1.0, 2.0]

        >>> g.vector_slice(0, 2) # extracts the first two elements of each vector
        dtype: array.array
        Rows: 2
        [[1.0, 2.0], [2.0, 3.0]]

        If a vector cannot be sliced, the result will be None.

        For instance:

        >>> g
        dtype: array.array
        Rows: 3
        [[1.0], [1.0, 2.0], [1.0, 2.0, 3.0]]

        >>> g.vector_slice(2)
        dtype: float
        Rows: 3
        [None, None, 3.0]

        >>> g.vector_slice(0,2)
        dtype: list
        Rows: 3
        [None, [1.0, 2.0], [1.0, 2.0]]

        If l is a vector of mixed types (float, int, str, array, list, etc.):

        >>> l
        dtype: list
        Rows: 2
        [['a', 1, 1.0],
         ['b', 2, 2.0]]

        >>> l.vector_slice(0) # extracts the first element of each vector
        dtype: list
        Rows: 2
        [['a'], ['b']]

        >>> l.vector_slice(0, 2) # extracts the first two elements of each vector
        dtype: list
        Rows: 2
        [['a', 1], ['b', 2]]


        Parameters
        ----------
        start : int
            The start position of the slice.

        end : int, optional.
            The end position of the slice. Note that the end position
            is NOT included in the slice. Thus a g.vector_slice(1,3) will extract
            entries in position 1 and 2. When omitted, only the element at
            position start is extracted.

        Returns
        -------
        out : SArray
            A new SArray holding the sliced value for each row.

        Raises
        ------
        RuntimeError
            If the SArray dtype is neither array.array nor list.
        """
        # query the proxy once instead of once per comparison
        element_type = self.dtype()
        if (element_type != array.array) and (element_type != list):
            raise RuntimeError("Only Vector type can be sliced")
        # Identity test: `end == None` would dispatch to an overloaded
        # __eq__ (SArray itself overloads it) instead of checking for None.
        if end is None:
            end = start + 1

        with cython_context():
            return SArray(_proxy=self.__proxy__.vector_slice(start, end))

    def count_words(self, to_lower=True):
        """
        Counts the words of every element of a string SArray.

        Returns a new SArray of dictionary type where each output element
        maps the words found in the corresponding input element to the
        number of times they appear. The words are split on all whitespace
        and punctuation characters. The input SArray must contain strings.

        For instance, the following sentence:

            "The quick brown fox jumps over the lazy dog."

        Is converted to:

        [{'brown': 1, 'lazy': 1, 'jumps': 1, 'fox': 1, 'dog': 1, 'quick': 1, 'the': 2, 'over': 1}]


        Parameters
        ----------
        to_lower: bool, optional
            If True, all words are converted to lower case before counting

        Returns
        -------
        out : SArray
            The SArray of dictionary type, where each element contains the
            word-count for each word that appears in the corresponding
            input element

        Raises
        ------
        TypeError
            If the SArray does not have dtype str.
        """
        element_type = self.dtype()
        if element_type != str:
            raise TypeError("Only SArray of string type is supported for counting bag of words")

        _mt._get_metric_tracker().track('sarray.count_words')

        # options dictionary handed to the server; may grow over time
        options = {"to_lower": to_lower == True}

        with cython_context():
            counted = self.__proxy__.count_bag_of_words(options)
            return SArray(_proxy=counted)

    def dict_trim_by_keys(self, keys, exclude=True):
        """
        Filters each dictionary of a dictionary-typed SArray by key.

        If exclude is True, then all keys that are in the input key list
        are removed.  If exclude is False, then only keys that are in the
        input key list are retained. Exclude is True by default.

        Parameters
        ----------
        keys: list
            A collection of keys to trim down the elements in the SArray.
            A single string, or any value without ``__iter__``, is wrapped
            into a one-element list.

        exclude: bool, optional
            If True, all keys that are in the input key list are removed
            If False, only keys that are in the input key list are retained.
            Defaults to True.

        Returns
        -------
        out : SArray
            A SArray of dictionary type, with each dict element trimmed
            according to the input criteria.

        Examples
        --------

        >>> sa = SArray([{'this':1, "is":1, "dog":2}, {"this": 2, "are": 2, "cat": 1}])
        >>> sa.dict_trim_by_keys(["this", "is", "and", "are"], exclude=True)
        Out:
            dict
            [{"dog":2}, {"cat": 1}]

        >>> sa.dict_trim_by_keys(["dog", "cat", "animal"], exclude=False)
        Out:
            dict
            [{"dog":2}, {"cat": 1}]

        """
        # a lone key (string or non-iterable) is promoted to a list
        is_single_key = isinstance(keys, str) or (not hasattr(keys, "__iter__"))
        if is_single_key:
            keys = [keys]

        _mt._get_metric_tracker().track('sarray.dict_trim_by_keys')

        with cython_context():
            trimmed = self.__proxy__.dict_trim_by_keys(keys, exclude)
            return SArray(_proxy=trimmed)

    def dict_trim_by_values(self, lower=None, upper=None):
        """
        If SArray datatype is a dictionary, filter each dictionary by checking
        the value of the dictionary against the given lower and upper bounds
        (inclusive).  Only the values that fall between the bounds are
        retained. Trimming is only performed on values which can be compared
        to the bound values.

        Parameters
        ----------
        lower: int, long, float, optional
            The lowest dictionary value that would be retained in the result.
            If not given (None), lower bound is not applied.

        upper: int, long, float, optional
            The highest dictionary value that would be retained in the result.
            If not given (None), upper bound is not applied.

        Returns
        -------
        out : SArray
            A SArray of dictionary type, with each dict element trimmed
            according to the input criteria

        Raises
        ------
        TypeError
            If ``lower`` or ``upper`` is given and is not of a numeric type.

        Examples
        --------

        >>> sa = SArray([{'this':1, "is":5, "dog":7}, {"this": 2, "are": 1, "cat": 5}])
        >>> sa.dict_trim_by_values(2, 5)
        Out:
            dict
            [{"is":5}, {"this": 2, "cat": 5}]

        This is an example where only the lower bound is given

        >>> sa = SArray([{'this':1, "is":5, "dog":7}, {"this": 2, "are": 1, "cat": 5}])
        >>> sa.dict_trim_by_values(2)
        Out:
            dict
            [{"is":5, "dog":7}, {"this": 2, "cat": 5}]

        This is an example where only the upper bound is given

        >>> sa = SArray([{'this':1, "is":5, "dog":7}, {"this": 2, "are": 1, "cat": 5}])
        >>> sa.dict_trim_by_values(upper=5)
        Out:
            dict
            [{'this':1, "is":5}, {"this": 2, "are": 1, "cat": 5}]

        """
        # Identity test against None: "None != x" would dispatch to an
        # overloaded __ne__ on x instead of simply asking "was a bound given?".
        if lower is not None and not is_numeric_type(type(lower)):
            raise TypeError("lower bound has to be a numeric value")

        if upper is not None and not is_numeric_type(type(upper)):
            raise TypeError("upper bound has to be a numeric value")

        _mt._get_metric_tracker().track('sarray.dict_trim_by_values')

        with cython_context():
            return SArray(_proxy=self.__proxy__.dict_trim_by_values(lower, upper))

    def dict_keys(self):
        """
        For an SArray of dictionary type, build a new SArray in which each
        element is the list of keys of the corresponding dictionary.

        Returns
        -------
        out : SArray
            A SArray of list type, where each element is a list of keys
            from the input SArray element.

        Examples
        --------

        >>> sa = SArray([{'this':1, "is":5, "dog":7}, {"this": 2, "are": 1, "cat": 5}])
        >>> sa.dict_keys()
        Out:
            list
            [["this", "is", "dog"], ["this", "are", "cat"]]

        """
        _mt._get_metric_tracker().track('sarray.dict_keys')

        with cython_context():
            keys_proxy = self.__proxy__.dict_keys()
            return SArray(_proxy=keys_proxy)

    def dict_values(self):
        """
        For an SArray of dictionary type, build a new SArray in which each
        element is the list of values of the corresponding dictionary.

        Returns
        -------
        out : SArray
            A SArray of list type, where each element is a list of values
            from the input SArray element.

        Examples
        --------

        >>> sa = SArray([{'this':1, "is":5, "dog":7}, {"this": 2, "are": 1, "cat": 5}])
        >>> sa.dict_values()
        Out:
            list
            [[1,5,7], [2,1,5]]

        """
        _mt._get_metric_tracker().track('sarray.dict_values')

        with cython_context():
            values_proxy = self.__proxy__.dict_values()
            return SArray(_proxy=values_proxy)

    def dict_has_any_keys(self, keys):
        """
        For an SArray of dictionary type, returns an SArray of integers where
        each element indicates whether the original SArray element has any
        key in the input key list.

        Also see :py:func:`graphlab.SArray.dict_has_all_keys()` for a version
        of this function which requires all keys to be matched to return True
        in the output SArray.

        Parameters
        ----------
        keys: list
            A list of key values to check the dictionary against.  A single
            string, or any value without an ``__iter__`` attribute, is
            wrapped into a one-element list.

        Returns
        -------
        out : SArray
            A SArray of int type, where each element indicates whether the
            input SArray element contains any key in the input list.

        Examples
        --------

        >>> sa = SArray([{'this':1, "is":5, "dog":7}, {"this": 2, "are": 1, "cat": 5}, {"animal":1}])
        >>> sa.dict_has_any_keys(["is", "this", "are"])
        Out:
            int
            [1, 1, 0]

        """
        # Accept a lone key as shorthand for a one-element key list.
        is_single_key = isinstance(keys, str) or not hasattr(keys, "__iter__")
        key_list = [keys] if is_single_key else keys

        _mt._get_metric_tracker().track('sarray.dict_has_any_keys')

        with cython_context():
            flags_proxy = self.__proxy__.dict_has_any_keys(key_list)
            return SArray(_proxy=flags_proxy)

    def dict_has_all_keys(self, keys):
        """
        For an SArray of dictionary type, returns an SArray of integers where
        each element indicates whether the original SArray element has all
        keys in the input key list.

        Also see :py:func:`graphlab.SArray.dict_has_any_keys()` which only
        requires any one key to match to return True in the output SArray.

        Parameters
        ----------
        keys: list
            A list of key values to check the dictionary against.  A single
            string, or any value without an ``__iter__`` attribute, is
            wrapped into a one-element list.

        Returns
        -------
        out : SArray
            A SArray of int type, where each element indicates whether the
            input SArray element contains all keys in the input list.

        Examples
        --------

        >>> sa = SArray([{'this':1, "is":5, "dog":7}, {"this": 2, "are": 1, "cat": 5}])
        >>> sa.dict_has_all_keys(["is", "this"])
        Out:
            int
            [1, 0]

        """
        # Accept a lone key as shorthand for a one-element key list.
        is_single_key = isinstance(keys, str) or not hasattr(keys, "__iter__")
        key_list = [keys] if is_single_key else keys

        _mt._get_metric_tracker().track('sarray.dict_has_all_keys')

        with cython_context():
            flags_proxy = self.__proxy__.dict_has_all_keys(key_list)
            return SArray(_proxy=flags_proxy)

    def apply(self, fn, dtype=None, skip_undefined=True, seed=None, __lua_translate__=False):
        """
        Returns a new SArray of dtype where each element in this SArray is
        transformed by fn(x).
        The fn should return a value which can be cast into dtype.

        If dtype is not specified, the first 100 elements of the Array are
        used to make a guess of the target datatype.  Note that fn is always
        run on the non-missing values among the first 100 elements before the
        transform is submitted, even when dtype is given.

        Parameters
        ----------
        fn : function or str
            The function to transform each element.  A string is treated as
            a Lua function: it is sent with a "LUA" prefix and requires
            dtype to be given.

        dtype : dtype
            The dtype of the new SArray. If None, the first 100
            elements of the array are used to guess the target
            data type.

        skip_undefined : bool, optional
            If True, will not apply fn to any undefined values.

        seed : int, optional
            Used as the seed if a random number generator is included in fn.
            If None, the current time (``time.time()``) is used.  An explicit
            seed of 0 is honoured.

        __lua_translate__ : bool, optional
            If True, try to translate the Python function to Lua before
            submitting it.  Progress and failures are printed; on failure
            the original Python function is used unchanged.

        Returns
        -------
        out : SArray
            The SArray transformed by fn.  Each element of the SArray is of
            type ``dtype``

        Raises
        ------
        TypeError
            If fn is a string and dtype is not given.
        AssertionError
            If fn is neither a string nor a Python function.
        """
        if isinstance(fn, str):
            fn = "LUA" + fn
            if dtype is None:
                raise TypeError("dtype must be specified for a lua function")
        else:
            # Explicit raise (rather than an assert statement) so that the
            # validation still runs under "python -O"; the exception type is
            # kept for callers that catch AssertionError.
            if not inspect.isfunction(fn):
                raise AssertionError("Input must be a function")

            dryrun = [fn(i) for i in self.head(100) if i is not None]
            if dtype is None:
                dtype = infer_type_of_list(dryrun)

            if __lua_translate__:
                import traceback
                try:
                    # try compilation
                    print("Attempting Lua Translation")
                    import graphlab.Lua_Translator
                    import ast
                    import StringIO

                    def isalambda(v):
                        return isinstance(v, type(lambda: None)) and v.__name__ == '<lambda>'

                    output = StringIO.StringIO()
                    translator = graphlab.Lua_Translator.translator_NodeVisitor(output)
                    ast_node = None

                    # First choice: parse the function's source text.  This
                    # is best effort; any failure falls through to the byte
                    # code decompiler below.
                    try:
                        if not isalambda(fn):
                            ast_node = ast.parse(inspect.getsource(fn))
                            translator.rename_function[fn.__name__] = "__lambda__transfer__"
                    except Exception:
                        pass

                    # Second choice: decompile the byte code (also used for
                    # lambdas, which skip the source parsing above).
                    try:
                        if ast_node is None:
                            print("Cannot translate. Trying again from byte code decompilation")
                            ast_node = meta.decompiler.decompile_func(fn)
                            translator.rename_function[""] = "__lambda__transfer__"
                    except Exception:
                        pass

                    if ast_node is None:
                        raise ValueError("Unable to get source of function")

                    # Tell the translator the input type of the function,
                    # derived from this SArray's dtype.
                    ftype = graphlab.Lua_Translator.FunctionType()
                    selftype = self.dtype()
                    if selftype == list:
                        ftype.input_type = tuple([[]])
                    elif selftype == dict:
                        ftype.input_type = tuple([{}])
                    elif selftype == array.array:
                        ftype.input_type = tuple([[float]])
                    else:
                        ftype.input_type = tuple([selftype])
                    translator.function_known_types["__lambda__transfer__"] = ftype
                    translator.translate_ast(ast_node)
                    print("Lua Translation Success")
                    print(output.getvalue())
                    fn = "LUA" + output.getvalue()
                except Exception as e:
                    # Translation is optional: report and continue with the
                    # untranslated Python function.  KeyboardInterrupt and
                    # SystemExit are deliberately not caught here.
                    print(traceback.format_exc())
                    print("Lua Translation Failed")
                    print(e)

        # "is None" rather than "not seed": 0 is a legitimate seed and must
        # not be silently replaced by the clock.
        if seed is None:
            seed = time.time()

        _mt._get_metric_tracker().track('sarray.apply')

        with cython_context():
            return SArray(_proxy=self.__proxy__.transform(fn, dtype, skip_undefined, seed))


    def filter(self, fn, skip_undefined=True, seed=None):
        """
        Returns a new SArray which is filtered by the given function.
        If the lambda evaluates an element to true, this element is copied to the
        new SArray.  If not, it isn't.  Throws an exception if the return type
        of the lambda is not castable to a boolean value.

        Parameters
        ----------
        fn : function
            Function that filters the SArray. Must evaluate to bool or int.

        skip_undefined : bool, optional
            If True, will not apply fn to any undefined values.

        seed : int, optional
            Used as the seed if a random number generator is included in fn.
            If None, the current time (``time.time()``) is used.  An explicit
            seed of 0 is honoured.

        Returns
        -------
        out : SArray

        Raises
        ------
        AssertionError
            If fn is not a Python function.
        """
        # Explicit raise (rather than an assert statement) so that the
        # validation still runs under "python -O"; the exception type is kept
        # for callers that catch AssertionError.
        if not inspect.isfunction(fn):
            raise AssertionError("Input must be a function")

        # "is None" rather than "not seed": 0 is a legitimate seed and must
        # not be silently replaced by the clock.
        if seed is None:
            seed = time.time()

        _mt._get_metric_tracker().track('sarray.filter')

        with cython_context():
            return SArray(_proxy=self.__proxy__.filter(fn, skip_undefined, seed))


    def sample(self, fraction, seed=None):
        """
        Returns an SArray which contains a subsample of the current SArray.

        Parameters
        ----------
        fraction : float
            The fraction of the rows to fetch. Must be between 0 and 1
            (inclusive).

        seed : int, optional
            The random seed for the random number generator.  If None, the
            current time (``time.time()``) is used.  An explicit seed of 0 is
            honoured.

        Returns
        -------
        out : SArray
            The new SArray which contains the subsampled rows.  An empty
            SArray is returned if the current SArray is empty.

        Raises
        ------
        ValueError
            If fraction is not within [0, 1] (this includes NaN).
        """
        # Written as a positive range test so that NaN, which compares False
        # against everything, is rejected instead of slipping through.
        if not (0 <= fraction <= 1):
            raise ValueError('Invalid sampling rate: ' + str(fraction))
        if self.size() == 0:
            return SArray()

        # "is None" rather than "not seed": 0 is a legitimate seed and must
        # not be silently replaced by the clock.
        if seed is None:
            seed = time.time()

        _mt._get_metric_tracker().track('sarray.sample')

        with cython_context():
            return SArray(_proxy=self.__proxy__.sample(fraction, seed))

    def _save_as_text(self, url):
        """
        Save the SArray to disk as text file.
        """
        raise NotImplementedError



    def all(self):
        """
        Returns True if every element of the SArray evaluates to True.

        For numeric SArrays, 0 or missing values (None) evaluate to False.
        All non-zero values evaluate to True.
        For string, list, or dictionary SArrays, empty values (0 length
        strings, lists or dictionaries) or missing values (None) evaluate to
        False. All other values evaluate to True.

        Returns True on an empty SArray.

        Example
        -------

        >>> gl.SArray([1, None]).all()
        False
        >>> gl.SArray([1, 0]).all()
        False
        >>> gl.SArray([1, 2]).all()
        True
        >>> gl.SArray(["hello", "world"]).all()
        True
        >>> gl.SArray(["hello", ""]).all()
        False
        >>> gl.SArray([]).all()
        True

        Returns
        -------
        out : bool
        """
        with cython_context():
            every_true = self.__proxy__.all()
            return every_true


    def any(self):
        """
        Returns True if at least one element of the SArray evaluates to True.

        For numeric SArrays, any non-zero value evaluates to True.
        For string, list, or dictionary SArrays, any element of non-zero
        length evaluates to True.

        Returns False on an empty SArray.

        Example
        -------

        >>> gl.SArray([1, None]).any()
        True
        >>> gl.SArray([1, 0]).any()
        True
        >>> gl.SArray([0, 0]).any()
        False
        >>> gl.SArray(["hello", "world"]).any()
        True
        >>> gl.SArray(["hello", ""]).any()
        True
        >>> gl.SArray(["", ""]).any()
        False
        >>> gl.SArray([]).any()
        False

        Returns
        -------
        out : bool
        """
        with cython_context():
            some_true = self.__proxy__.any()
            return some_true


    def max(self):
        """
        Returns the largest value in the SArray.  Returns None on an empty
        SArray.  Raises an exception if called on an SArray with non-numeric
        type.

        Returns
        -------
        out : type of SArray
            Maximum value of SArray
        """
        with cython_context():
            largest = self.__proxy__.max()
            return largest


    def min(self):
        """
        Returns the smallest value in the SArray. Returns None on an empty
        SArray.  Raises an exception if called on an SArray with non-numeric
        type.

        Returns
        -------
        out : type of SArray
            Minimum value of SArray
        """
        with cython_context():
            smallest = self.__proxy__.min()
            return smallest


    def sum(self):
        """
        Returns the sum of all values in the SArray.  Returns None on an empty
        SArray.  Raises an exception if called on an SArray of strings, lists,
        or dictionaries.

        If the SArray contains numeric arrays (array.array) and all the
        arrays are the same length, the sum over all the vectors is returned.
        If the vectors have different lengths, an exception is raised.

        For large values, this may overflow without warning.

        Returns
        -------
        out : type of SArray
            Sum of all values in SArray
        """
        with cython_context():
            total = self.__proxy__.sum()
            return total


    def mean(self):
        """
        Returns the mean of all the values in the SArray.  Returns None on an
        empty SArray. Raises an exception if called on an SArray with
        non-numeric type.

        Returns
        -------
        out : float
            Mean of all values in SArray
        """
        with cython_context():
            average = self.__proxy__.mean()
            return average


    def std(self, ddof=0):
        """
        Returns the standard deviation of all the values in the SArray.
        Returns None on an empty SArray. Raises an exception if called on an
        SArray with non-numeric type.

        Parameters
        ----------
        ddof : int
            "delta degrees of freedom" in the variance calculation.

        Raises
        ------
        RuntimeError
            If ddof >= sarray size or the sarray is a non-numeric type.

        Returns
        -------
        out : float
            The standard deviation of all the values.
        """
        with cython_context():
            deviation = self.__proxy__.std(ddof)
            return deviation


    def var(self, ddof=0):
        """
        Returns the variance of all the values in the SArray.
        Returns None on an empty SArray. Raises an exception if called on an
        SArray with non-numeric type.

        Parameters
        ----------
        ddof : int
            "delta degrees of freedom" in the variance calculation.

        Raises
        ------
        RuntimeError
            If ddof >= sarray size or the sarray is a non-numeric type.

        Returns
        -------
        out : float
            Variance of all values in SArray.
        """
        with cython_context():
            variance = self.__proxy__.var(ddof)
            return variance

    def num_missing(self):
        """
        Returns the count of missing elements in the SArray.  Returns 0 if
        the sarray is empty.

        Returns
        -------
        out : int
            Number of missing values.
        """
        with cython_context():
            missing_count = self.__proxy__.num_missing()
            return missing_count

    def nnz(self):
        """
        Returns the count of non-zero elements in the SArray.  Returns 0 if
        the sarray is empty.

        Returns
        -------
        out : int
            Number of non-zero elements.
        """
        with cython_context():
            nonzero_count = self.__proxy__.nnz()
            return nonzero_count

    def astype(self, dtype, undefined_on_failure=False):
        """
        Returns a new SArray with all of the current values cast to the given
        type.

        For instance:

        >>> sa = gl.SArray(['1','2','3','4'])
        >>> sa
        dtype: str
        Rows: 4
        ['1', '2', '3', '4']
        >>> sa.astype(int)
        dtype: int
        Rows: 4
        [1, 2, 3, 4]

        The string parsing techniques used to handle conversion to
        dictionary and list types are quite generic and permit a
        variety of interesting formats to be interpreted. For instance, a
        JSON string can usually be interpreted as a list or a dictionary type.

        For instance, given a simple SArray of strings, each a sequence of
        a:b terms:

        >>> sa
        dtype: str
        Rows: 2
        ['1:2 3:4', 'a:b c:d']

        We can add curly brackets to make each string look like a dict:

        >>> '{' + sa + '}'
        dtype: str
        Rows: 2
        ['{1:2 3:4}', '{a:b c:d}']

        And convert it to a dictionary type:

        >>> ('{' + sa + '}').astype(dict)
        dtype: dict
        Rows: 2
        [{1: 2, 3: 4}, {'a': 'b', 'c': 'd'}]

        Throws an exception if the types are not castable to the given type.

        Parameters
        ----------
        dtype : type
            The type to cast the elements to in SArray

        undefined_on_failure: bool
            If set to true, runtime cast failures will be emitted as
            missing values rather than failing.

        Returns
        -------
        out : SArray (of type dtype)
            The SArray converted to the dtype
        """
        # The metric name carries the name of the target type.
        tracker = _mt._get_metric_tracker()
        tracker.track('sarray.astype.%s' % str(dtype.__name__))

        with cython_context():
            converted_proxy = self.__proxy__.astype(dtype, undefined_on_failure)
            return SArray(_proxy=converted_proxy)

    def clip(self, lower=float('nan'), upper=float('nan')):
        """
        Returns a new SArray in which every value is clipped to lie within
        the bounds set by the lower and upper parameters: values below the
        lower bound are set to the lower bound, and values above the upper
        bound are set to the upper bound.

        This function can operate on numeric arrays, as well as vector arrays,
        in which case each individual element in each vector is clipped.

        If lower or upper are given float('nan') as the argument, the bound
        value is ignored.

        Parameters
        ----------
        lower : int
            The lower bound to clip to. Ignored if equal to float('nan')

        upper : int
            The upper bound to clip to. Ignored if equal to float('nan')

        Raises
        ------
            Throws an exception if the SArray is empty, the types are non-numeric,
            or if the upper bound is less than the lower bound

        Returns
        -------
        out : SArray
        """
        with cython_context():
            clipped_proxy = self.__proxy__.clip(lower, upper)
            return SArray(_proxy=clipped_proxy)

    def clip_lower(self, threshold):
        """
        Returns a new SArray in which every value is clipped to the given
        lower bound.

        This function can operate on numeric arrays, as well as vector arrays,
        in which case each individual element in each vector is clipped.

        Parameters
        ----------
        threshold : float
            The lower bound to clip values to

        Raises
        ------
        Exception
            Throws an exception if the SArray is empty or the types are non-numeric

        Returns
        -------
        out : SArray
        """
        # NaN is passed as the upper bound, i.e. no upper bound is requested.
        no_upper_bound = float('nan')
        with cython_context():
            clipped_proxy = self.__proxy__.clip(threshold, no_upper_bound)
            return SArray(_proxy=clipped_proxy)


    def clip_upper(self, threshold):
        """
        Returns a new SArray in which every value is clipped to the given
        upper bound.

        This function can operate on numeric arrays, as well as vector arrays,
        in which case each individual element in each vector is clipped.

        Parameters
        ----------
        threshold : float
            The upper bound to clip values to

        Raises
        ------
        Exception
            Throws an exception if the SArray is empty or the types are non-numeric

        Returns
        -------
        out : SArray
        """
        # NaN is passed as the lower bound, i.e. no lower bound is requested.
        no_lower_bound = float('nan')
        with cython_context():
            clipped_proxy = self.__proxy__.clip(no_lower_bound, threshold)
            return SArray(_proxy=clipped_proxy)

    def tail(self, n=10):
        """
        Returns an SArray holding the last n elements of this SArray.

        Parameters
        ----------
        n : int
            The number of elements to fetch.  Defaults to 10.

        Returns
        -------
        out : SArray
            A new SArray which contains the last n rows of the current SArray.
        """
        with cython_context():
            tail_proxy = self.__proxy__.tail(n)
            return SArray(_proxy=tail_proxy)


    def dropna(self):
        """
        Returns a new SArray holding only the non-missing values of this
        array.  The size of the returned SArray will be <= the size of the
        original.

        A missing value shows up in an SArray as 'None'.

        Returns
        -------
        out : SArray
            The new SArray with missing values removed.
        """
        _mt._get_metric_tracker().track('sarray.dropna')

        with cython_context():
            dense_proxy = self.__proxy__.drop_missing_values()
            return SArray(_proxy=dense_proxy)

    def fillna(self, value):
        """
        Returns a new SArray in which every missing value (None or NaN) is
        replaced by the given value.  The size of the new SArray will be the
        same as the original SArray.  If the given value is not the same type
        as the values in the SArray, fillna will attempt to convert the value
        to the original SArray's type.  If this fails, an error will be
        raised.

        Parameters
        ----------
        value : type convertible to SArray's type
            The value that all missing values will be replaced with

        Returns
        -------
        out : SArray
            A new SArray with all missing values filled
        """
        _mt._get_metric_tracker().track('sarray.fillna')

        with cython_context():
            filled_proxy = self.__proxy__.fill_missing_values(value)
            return SArray(_proxy=filled_proxy)

    def topk_index(self, topk=10, reverse=False):
        """
        Marks which elements of the SArray belong to its topk elements
        (largest first by default).  Returns a new SArray of type 'int' of the
        same size as the current SArray.  Entries are '1' if the corresponding
        element in the current SArray is a part of the topk elements, and '0'
        if that corresponding element is not.

        Parameters
        ----------
        topk : int
            The number of elements to determine if 'top'

        reverse: bool
            If True, return the topk elements in ascending order

        Returns
        -------
        out : SArray (of type int)

        Notes
        -----
        This is mostly used internally by SFrame's topk function.
        """
        with cython_context():
            membership_proxy = self.__proxy__.topk_index(topk, reverse)
            return SArray(_proxy=membership_proxy)

    def sketch_summary(self, background=False, sub_sketch_keys=None):
        """
        Returns a graphlab.Sketch object which can be further queried for many
        descriptive statistics over this SArray. Many of the statistics are
        approximate. See the :class:`~graphlab.Sketch` documentation for more
        detail.

        Returns
        -------
        out : Sketch
            Sketch object that contains descriptive statistics for this SArray.
            Many of the statistics are approximate.

        Parameters
        ----------
        background : boolean, optional
          If True, the sketch construction will return immediately and the
          sketch will be constructed in the background. While this is going on,
          the sketch can be queried incrementally, but at a performance penalty.
          Defaults to False.

        sub_sketch_keys: int | str | list of int | list of str, optional
            For SArray of dict type, also constructs sketches for a given set of keys,
            For SArray of array type, also constructs sketches for the given indexes.
            The sub sketches may be queried using:
                 :py:func:`~graphlab.Sketch.element_sub_sketch()`
            Defaults to None in which case no sub sketches will be constructed.

        Raises
        ------
        TypeError
            If background is not a bool, if sub_sketch_keys is given for an
            SArray that is not of dict or array type, or if the keys are not
            str (dict SArray) / int (array SArray).
        ValueError
            If the members of sub_sketch_keys are not all of one type.

        """
        if type(background) != bool:
            raise TypeError("'background' parameter has to be a boolean value")

        if sub_sketch_keys is not None:
            # Fetch the dtype once; every check below needs it.
            sa_dtype = self.dtype()
            if sa_dtype != dict and sa_dtype != array.array:
                raise TypeError("sub_sketch_keys is only supported SArray of dictionary or array type")

            # A lone key is shorthand for a one-element list.  The explicit
            # str test mirrors the key handling of the dict_* methods.
            if isinstance(sub_sketch_keys, str) or not hasattr(sub_sketch_keys, "__iter__"):
                sub_sketch_keys = [sub_sketch_keys]

            value_types = set([type(i) for i in sub_sketch_keys])
            if len(value_types) != 1:
                raise ValueError("sub_sketch_keys member values need to have the same type.")
            value_type = value_types.pop()
            if sa_dtype == dict and value_type != str:
                raise TypeError("only string value(s) can be passed to sub_sketch_keys for SArray of dictionary type")
            if sa_dtype == array.array and value_type != int:
                raise TypeError("only int value(s) can be passed to sub_sketch_keys for SArray of array type")
        else:
            sub_sketch_keys = list()

        _mt._get_metric_tracker().track('sarray.sketch_summary')

        # Imported here, inside the method as in the original, but only once
        # the arguments have been validated.
        from graphlab.data_structures.sketch import Sketch
        return Sketch(self, background, sub_sketch_keys = sub_sketch_keys)

    def append(self, other):
        """
        Concatenate another SArray onto this one.  Returns a new SArray that
        contains the rows from both SArrays.  Both SArrays must be of the
        same type.

        Parameters
        ----------
        other : SArray
            Another SArray whose rows are appended to current SArray

        Returns
        -------
        out : SArray
            A new SArray that contains rows from both SArrays, with second
            SArray's rows after first SArray's rows

        Raises
        ------
        RuntimeError
            If other is not an SArray, or if the two dtypes differ.
        """
        _mt._get_metric_tracker().track('sarray.append')

        if not (type(other) is SArray):
            raise RuntimeError("SArray append can only work with SArray")

        dtypes_differ = self.dtype() != other.dtype()
        if dtypes_differ:
            raise RuntimeError("Data types in both SArrays have to be the same")

        with cython_context():
            combined_proxy = self.__proxy__.append(other.__proxy__)
            return SArray(_proxy=combined_proxy)

    def unique(self):
        """
        Return an SArray containing only the unique values of the given SArray.
        Will not necessarily preserve the order of the given SArray in the new
        SArray.

        Raises a TypeError if the SArray is of dictionary type.

        Returns
        -------
        out : SArray
            A new SArray that contains the unique values of the given SArray.
        """
        _mt._get_metric_tracker().track('sarray.unique')

        # Put this array into a one-column SFrame and group by that column
        # with no aggregators; the grouped key column is the result.
        column_name = 'X1'
        holder = gl.SFrame()
        holder.add_column(self, column_name)
        grouped = holder.groupby(column_name, {})

        return SArray(_proxy=grouped[column_name].__proxy__)

    @graphlab.canvas.inspect.find_vars
    def show(self):
        """
        show()
        Launch or update GraphLab Canvas and generate the visualization view
        of this SArray.

        Returns
        -------
        view : graphlab.canvas.view.View
            An object representing the GraphLab Canvas view
        """
        canvas_view = graphlab.canvas._show(variable=self)
        return canvas_view

    def item_length(self):
        """
        For SArray that is of dict/arraylist type, retruns a new SArray that contains
        length for each item in the SArray. This function is equivalent to the following
        but more performant:

            sa_item_len =  sa.apply(lambda x: len(x) if x is not None else None)

        An exception is raised if the SArray is not of type dict, array.array
        or list.  If a given element is a missing value, then the output
        elements is also a missing value.

        Examples
        --------

        >>> sa = gl.SArray([
          {"is_restaurant": 1, "is_electronics": 0},
          {"is_restaurant": 1, "is_retail": 1, "is_electronics": 0},
          {"is_restaurant": 0, "is_retail": 1, "is_electronics": 0},
          {"is_restaurant": 0},
          {"is_restaurant": 1, "is_electronics": 1},
          None])
        >>> sa.item_length()
        Out:
            dtype: int
            Rows: 6
            [2, 3, 3, 1, 2, None]

        Returns
        -------
        out_sf : SArray
            A new SArray, each element in the SArray is the len of the corresponding
            items in original SArray.

        """
        if (self.dtype() not in [list, dict, array.array]):
            raise TypeError("item_length() is only applicable for SArray of type list, dict and array.")

        _mt._get_metric_tracker().track('sarray.item_length')

        with cython_context():
            return SArray(_proxy = self.__proxy__.item_length())

    def unpack(self, column_name_prefix = "X", column_types = None, na_value=None, limit=None):
        """
        Expands an SArray of list/array/dict type to multiple columns, return a
        new SFrame that contains expanded columns. For instance, a SArray of
        lists each of length 4, will be expanded into an SFrame of 4 columns, one
        for each list element. An SArray of dictionaries will be expanded into
        as many columns as there are keys. (The set of keys is inferred by
        inspecting the first 100 rows.)

        Column naming:
        When unpacking an SArray of list or array type, new columns are named:
        prefix.0, prefix.1, etc. If unpacking a column of dict
        type, unpacked columns are named prefix.key1, prefix.key2, etc.
        The prefix is set by the parameter "column_name_prefix" and defaults to
        'X'. If column_name_prefix is None or empty, then no prefix is used.

        Missing values:
        When unpacking an SArray of list or dictionary types, missing values in
        original list remain as missing value in the resultant columns.
        Furthermore, if the 'na_value' parameter is specified, all values that
        are equal to 'na_value' are replaced with missing values. In an SArray of
        array.array type, NaN is interpreted as a missing value.

        Controlling the set of keys to unpack:
        For dictionary SArrays, the 'limit' parameter is a list of values that
        are used to limit the subset of keys to unpack.
        For list/array SArray, 'limit' is a list of integer indices of the
        list/array to unpack.

        Parameters
        ----------
        column_name_prefix: str, optional
            If provided, unpacked column names would start with the given prefix.
            Defaults to "X".

        column_types: list[type], optional
            Column types for the unpacked columns.
            If not provided, column types are automatically inferred from first
            100 rows. Defaults to None.

        na_value: flexible_type, optional
            If provided, convert all values that are equal to "na_value" to
            missing value. Defaults to None.

        limit: list, optional
            Limits the set of list/array/dict keys to unpack.
            For list/array SArrays, 'limit' must contain integer indices.
            For dict SArray, 'limit' must contain dictionary keys.

        Returns
        -------
        out : SFrame
            A new SFrame that contains all unpacked columns

        Raises
        ------
        TypeError
            If the SArray is not of dict/list/array type, if
            'column_name_prefix' is not a string, if 'limit' or 'column_types'
            is not iterable, if 'limit' mixes value types (or holds
            non-integers for a list/array SArray), or if 'column_types'
            contains an unsupported type.
        ValueError
            If 'limit' contains duplicates, if 'limit' and 'column_types'
            differ in length, or if 'column_types' is given without 'limit'
            for a dict SArray.
        RuntimeError
            If the number of items cannot be inferred for a list/array SArray
            (no non-missing, non-empty rows among the first 100).

        Examples
        --------
        To unpack a dict SArray

        >>> sa = SArray(
            [{ 'word': 'a',     'count': 1},
             { 'word': 'cat',   'count': 2},
             { 'word': 'is',    'count': 3},
             { 'word': 'coming','count': 4}])

        >>> sa.unpack(column_name_prefix=None)
            Columns:
                count   int
                word    str
            Rows: 4
            Data:
            +-------+--------+
            | count |  word  |
            +-------+--------+
            |   1   |   a    |
            |   2   |  cat   |
            |   3   |   is   |
            |   4   | coming |
            +-------+--------+
            [4 rows x 2 columns]


        To unpack with column names using a given prefix:

        >>> sa.unpack(column_name_prefix="wc")
            Columns:
                wc.count    int
                wc.word str
            Rows: 4
            Data:
            +----------+---------+
            | wc.count | wc.word |
            +----------+---------+
            |    1     |    a    |
            |    2     |   cat   |
            |    3     |    is   |
            |    4     |  coming |
            +----------+---------+
            [4 rows x 2 columns]

        To unpack only keys with 'word':

        >>> sa.unpack(limit=['word'])
            Columns:
                X.word  str
            Rows: 4
            Data:
            +--------+
            | X.word |
            +--------+
            |   a    |
            |  cat   |
            |   is   |
            | coming |
            +--------+
            [4 rows x 1 columns]


        This is an example of using na_value. Suppose there is an SArray of array:

        >>> sa = SArray([
                       [1, 0, 1],
                       [1, 1, 1],
                       [0, 1, 1]])

        To unpack the sarray, also convert all zeros to missing value:

        >>> sa.unpack(column_types=[int, int, int], na_value=0)
            Columns:
                X.0 int
                X.1 int
                X.2 int
            Rows: 3
            Data:
            +------+------+-----+
            | X.0  | X.1  | X.2 |
            +------+------+-----+
            |  1   | None |  1  |
            |  1   |  1   |  1  |
            | None |  1   |  1  |
            +------+------+-----+
            [3 rows x 3 columns]

        To unpack only second and third value from the value:

        >>> sa.unpack(limit=[1,2])
            Columns:
                X.1 float
                X.2 float
            Rows: 3
            Data:
            +-----+-----+
            | X.1 | X.2 |
            +-----+-----+
            | 0.0 | 1.0 |
            | 1.0 | 1.0 |
            | 1.0 | 1.0 |
            +-----+-----+
            [3 rows x 2 columns]


        Notes
        -----
        Refer to :py:func:`graphlab.SFrame.pack_columns()` for reverse effect of unpack.
        """
        # Look the element type up once; it drives every branch below.
        dtype = self.dtype()
        if dtype not in [dict, array.array, list]:
            raise TypeError("Only SArray of dict/list/array type supports unpack")

        if column_name_prefix is not None and not isinstance(column_name_prefix, str):
            raise TypeError("'column_name_prefix' must be a string")

        # Validate 'limit'. Identity checks against None are used on purpose:
        # '!=' would invoke the argument's own comparison operator, which for
        # array-like inputs yields an element-wise result instead of a bool.
        if limit is not None:
            if not hasattr(limit, '__iter__'):
                raise TypeError("'limit' must be a list")

            name_types = set([type(i) for i in limit])
            if len(name_types) != 1:
                raise TypeError("'limit' contains values that are different types")

            # list/array elements are addressed by position, so keys must be ints
            if dtype != dict and name_types.pop() != int:
                raise TypeError("'limit' must contain integer values.")

            if len(set(limit)) != len(limit):
                raise ValueError("'limit' contains duplicate values")

        if column_types is not None:
            if not hasattr(column_types, '__iter__'):
                raise TypeError("column_types must be a list")

            for column_type in column_types:
                if column_type not in (int, float, str, list, dict, array.array):
                    raise TypeError("column_types contains unsupported types. Supported types are ['float', 'int', 'list', 'dict', 'str', 'array.array']")

            if limit is not None:
                if len(limit) != len(column_types):
                    raise ValueError("limit and column_types do not have the same length")
            elif dtype == dict:
                # without keys there is no way to map the types onto columns
                raise ValueError("if 'column_types' is given, 'limit' has to be provided to unpack dict type.")
            else:
                limit = list(range(len(column_types)))

        else:
            # infer column types and names if needed
            head_rows = self.head(100).dropna()
            if dtype == dict:
                if limit is None:
                    # Materialise the key set once so that the column names,
                    # the inferred types and the keys handed to the proxy all
                    # follow one and the same order.
                    limit = list(set(itertools.chain.from_iterable(head_rows.dict_keys())))
                column_types = []
                for name in limit:
                    sample = [(x[name] if ((x is not None) and (name in x)) else None) for x in head_rows]
                    column_types.append(infer_type_of_list(sample))
            else:
                lengths = [len(i) for i in head_rows]
                if len(lengths) == 0 or max(lengths) == 0:
                    raise RuntimeError("Cannot infer number of items from the SArray, SArray may be empty. please explicitly provide column types")
                if limit is None:
                    limit = list(range(max(lengths)))

                if dtype == array.array:
                    # array.array elements are always unpacked as floats
                    column_types = [float for _ in limit]
                else:
                    column_types = []
                    for i in limit:
                        sample = [(x[i] if ((x is not None) and len(x) > i) else None) for x in head_rows]
                        column_types.append(infer_type_of_list(sample))

        # generate column names
        if column_name_prefix:
            column_names = [column_name_prefix + "." + str(key) for key in limit]
        else:
            column_names = [str(key) for key in limit]

        _mt._get_metric_tracker().track('sarray.unpack')

        with cython_context():
            return gl.SFrame(_proxy=self.__proxy__.unpack(column_names, limit, column_types, na_value))

    def sort(self, ascending=True):
        """
        Return a new SArray holding the values of this one in sorted order.

        Only SArrays of type str, int and float can be sorted; any other
        type raises a TypeError.

        Parameters
        ----------
        ascending: boolean, optional
           When True (the default) values are sorted in ascending order,
           otherwise in descending order.

        Returns
        -------
        out: SArray

        Examples
        --------
          Sort an sarray in ascending order

          >>> sa.sort()

          Sort an sarray in descending order

          >>> sa.sort(ascending = False)

        """
        sortable_types = (int, float, str)
        if not (self.dtype() in sortable_types):
            raise TypeError("Only sarray with type (int, float, str) can be sorted")

        # Sorting is delegated to SFrame: wrap the values in a one-column
        # frame, sort by that column and pull the column back out.
        column = 'a'
        frame = gl.SFrame()
        frame[column] = self
        sorted_frame = frame.sort(column, ascending)
        return sorted_frame[column]


