"""
This module defines the SArray class which provides the
ability to create, access and manipulate a remote scalable array object.

SArray acts similarly to pandas.Series but without indexing.
The data is immutable, homogeneous, and is stored on the GraphLab Server side.
"""
import graphlab.connect.main as glconnect
from graphlab.cython.sarray import UnitySArrayProxy
from graphlab.util import make_internal_url, shallow_throw
import inspect
import pandas, numpy
import time
import random

__all__ = ['SArray']

class SArray(object):
    """
    SArray is an array object scaled to big data.
    The data in SArray is immutable, homogeneously typed, and is stored column wise.
    SArray is also used as column in the :py:class:`~graphlab.SFrame`.

    SArray can be constructed in various ways:

    Construct an SArray from list.

    >>> import graphlab as gl
    >>> from graphlab import SArray
    >>> sa = SArray(data=[1,2,3,4,5], dtype=int)

    Construct an SArray from numpy.ndarray.

    >>> import numpy
    >>> sa = gl.SArray(data=numpy.asarray([1,2,3,4,5]), dtype=int)

    Construct an SArray from pandas.Series.

    >>> import pandas as pd
    >>> sa = gl.SArray(data=pd.Series([1,2,3,4,5]), dtype=int)

    If the type is not specified, automatic inference is attempted via
    conversion to a Pandas Series

    >>> gl.SArray(data=pd.Series([1,2,3,4,5])).dtype()
    int
    >>> gl.SArray(data=pd.Series([1,2,3,4,5.0])).dtype()
    float

    Construct an SArray from local text file. (Only works for local server).

    >>> sa = SArray('/tmp/a_to_z.txt.gz')

    Construct an SArray from a text file downloaded from a URL.

    >>> sa = SArray('http://s3-us-west-2.amazonaws.com/testdatasets/a_to_z.txt.gz')

    Construct an SArray from a text file stored on the server side.

    >>> sa = SArray('remote:///tmp/a_to_z.txt.gz')

    Parameters
    ----------
    data : list | numpy.ndarray | pandas.Series | string
        The input data. If this is a list, numpy.ndarray, or pandas.Series
        the data in the list is converted and stored in an SArray.
        Alternatively if this is a string, it is interpreted as a path (or
        url) to a text file.  Each line of the text file is loaded as a
        separate row. If data is a file name ending with ".sidx", this is
        loaded as a SArray file. If omitted (None), an empty list is used.

    dtype : type
        The data type of the SArray. Supported types are: {int,
        float, str}.  If not specified (None), we attempt to evaluate
        it from the input.  If it is a numpy array, or a Pandas
        series, the dtype of the array/series is used. If it is a
        list, it is first converted to a Pandas series, and the
        dtype of that is used. If it is a URL or path to a text file,
        we default to strings.

    ignore_cast_failure : bool
        If True, ignores casting failures, but warns about the
        elements that cannot be cast into the specified dtype.

    Notes
    -----
    When working with the graphlab EC2 instance, e.g. :py:func:`graphlab.aws.launch_EC2()`,
    SArray cannot be constructed using local file path, because it involves
    potentially large amount of data transfer from client to server.
    However, it is still ok to use the remote file path.

    >>> graphlab.aws.launch_EC2('m1.large')
    >>> sa = SArray('~/mydata/foo.csv') # throws exception
    >>> sa = SArray('remote:///mydata/foo.csv') # works
    >>> sa = SArray("http://testdatasets.s3-website-us-west-2.amazonaws.com/users.csv.gz") # works
    >>> sa = SArray("s3://mybucket/foo.csv") # works
    >>> graphlab.aws.terminate_EC2()

    Similar restriction applies to :py:class:`~graphlab.Graph` and :py:class:`~graphlab.SFrame`.
    """

    @shallow_throw
    def __init__(self, data=None, dtype=None, ignore_cast_failure=False, _proxy=None):
        """__init__(data=None, dtype=None, ignore_cast_failure=False)
        Construct a new SArray. The source of data includes: list, numpy.ndarray, pandas.Series, and urls.

        Parameters
        ----------
        data : list | numpy.ndarray | pandas.Series | string
            The input data. If this is a list, numpy.ndarray, or pandas.Series
            the data in the list is converted and stored in an SArray.
            Alternatively if this is a string, it is interpreted as a path (or
            url) to a text file.  Each line of the text file is loaded as a
            separate row. If data is a file name ending with ".sidx", this is
            loaded as a SArray file. If omitted (None), an empty list is used.

        dtype : type
            The data type of the SArray. Supported types are: {int,
            float, str}.  If not specified (None), we attempt to evaluate
            it from the input.  If it is a numpy array, or a Pandas
            series, the dtype of the array/series is used. If it is a
            list, it is first converted to a Pandas series, and the
            dtype of that is used. If it is a URL or path to a text file,
            we default to strings.

        ignore_cast_failure : bool
            If True, ignores casting failures, but warns about the
            elements that cannot be cast into the specified dtype.

        _proxy : None
            Internal, do not use. When given, the new SArray simply wraps
            this proxy and every other argument is ignored.

        Raises
        ------
        TypeError
            If data is not a list, numpy.ndarray, pandas.Series or string.

        Notes
        -----
        If data is pandas.Series, the index will be ignored.
        """
        # Compare against None explicitly so that a proxy object which
        # happens to evaluate to False is still wrapped rather than discarded.
        if _proxy is not None:
            self.__proxy__ = _proxy
            return

        # None stands in for "no data" to avoid a shared mutable default.
        if data is None:
            data = []

        self.__proxy__ = UnitySArrayProxy(glconnect.get_client())

        from_iterable = isinstance(data, (pandas.Series, numpy.ndarray, list))
        from_path = isinstance(data, str)

        # we need to perform type inference
        if dtype is None:
            if isinstance(data, list):
                # a list carries no dtype: convert it to a pandas series
                # and take the dtype of that
                dtype = pandas.Series(data).dtype
            elif from_iterable:
                # pandas series / numpy array: use their own dtype
                dtype = data.dtype
            elif from_path:
                # text files default to string
                dtype = str

        if from_iterable:
            self.__proxy__.load_from_iterable(data, dtype, ignore_cast_failure)
        elif from_path:
            internal_url = make_internal_url(data)
            if data.endswith('.sidx'):
                # a saved SArray index; dtype is not passed for this source
                self.__proxy__.load_from_sarray_index(internal_url)
            else:
                self.__proxy__.load_from_url(internal_url, dtype)
        else:
            raise TypeError("Unexpected data source. " \
                            "Possible data source types are: list, " \
                            "numpy.ndarray, pandas.Series, and string(url)")

    @shallow_throw
    def save(self, targetfile):
        """
        Saves the SArray to file.

        Parameters
        ----------
        targetfile : string
            The location to save the SArray. Either a local path or a remote
            URL. The name must already end with '.sidx'; the suffix is not
            appended automatically.

        Raises
        ------
        ValueError
            If targetfile does not end with '.sidx'.
        """
        if not targetfile.endswith('.sidx'):
            raise ValueError("Save target must end with sidx")
        self.__proxy__.save(targetfile)

    def __repr__(self):
        """
        Returns a string description of the Array: its dtype, its number of
        rows and the same element preview produced by ``str()``.
        """
        parts = ["dtype: " + str(self.dtype().__name__),
                 "Rows: " + str(self.size()),
                 str(self)]
        return "\n".join(parts)

    def __str__(self):
        """
        Returns a string containing the first 100 elements of the array.
        """
        preview = str(self.head(100))
        if self.size() > 100:
            # cut the last close bracket and replace it with an ellipsis to
            # signal that more elements follow
            preview = preview[:-1] + ", ... ]"
        return preview

    def __nonzero__(self):
        """
        Returns true if the array is not empty.
        """
        return self.size() != 0

    # Python 3 spelling of the truth-value hook.
    __bool__ = __nonzero__

    def __len__(self):
        """
        Returns the length of the array
        """
        return self.size()

    def __iter__(self):
        """
        Provides an iterator to the contents of the array.

        Elements are fetched from the proxy in chunks and yielded one at a
        time. The iteration cursor is started through the proxy's
        ``begin_iterator`` when the first element is requested.
        NOTE(review): the cursor appears to live on the shared proxy, so two
        simultaneous iterations over the same SArray may interfere -- confirm.
        """
        def generator():
            chunk_size = 262144
            self.__proxy__.begin_iterator()
            while True:
                chunk = self.__proxy__.iterator_get_next(chunk_size)
                for elem in chunk:
                    yield elem
                # a short (or empty) chunk means the data is exhausted
                if len(chunk) != chunk_size:
                    break

        return generator()

    def _left_operator(self, other, op):
        """
        Applies the binary operator `op` with this array on the left hand
        side: element-wise when `other` is an SArray, otherwise against
        `other` treated as a scalar.
        """
        if isinstance(other, SArray):
            result = self.__proxy__.vector_operator(other.__proxy__, op)
        else:
            result = self.__proxy__.left_scalar_operator(other, op)
        return SArray(_proxy=result)

    def _right_operator(self, other, op):
        """
        Applies the binary operator `op` with the scalar `other` on the left
        hand side and this array on the right hand side.
        """
        return SArray(_proxy=self.__proxy__.right_scalar_operator(other, op))

    @shallow_throw
    def __add__(self, other):
        """
        If other is a scalar value, adds it to the current array, returning
        the new result. If other is an SArray, performs an element-wise
        addition of the two arrays.
        """
        return self._left_operator(other, '+')

    @shallow_throw
    def __sub__(self, other):
        """
        If other is a scalar value, subtracts it from the current array, returning
        the new result. If other is an SArray, performs an element-wise
        subtraction of the two arrays.
        """
        return self._left_operator(other, '-')

    @shallow_throw
    def __mul__(self, other):
        """
        If other is a scalar value, multiplies it to the current array, returning
        the new result. If other is an SArray, performs an element-wise
        multiplication of the two arrays.
        """
        return self._left_operator(other, '*')

    @shallow_throw
    def __div__(self, other):
        """
        If other is a scalar value, divides each element of the current array
        by the value, returning the result. If other is an SArray, performs
        an element-wise division of the two arrays.
        """
        return self._left_operator(other, '/')

    # The `/` operator dispatches here under true division
    # (``from __future__ import division`` or Python 3).
    __truediv__ = __div__

    @shallow_throw
    def __lt__(self, other):
        """
        If other is a scalar value, compares each element of the current array
        by the value, returning the result. If other is an SArray, performs
        an element-wise comparison of the two arrays.
        """
        return self._left_operator(other, '<')

    @shallow_throw
    def __gt__(self, other):
        """
        If other is a scalar value, compares each element of the current array
        by the value, returning the result. If other is an SArray, performs
        an element-wise comparison of the two arrays.
        """
        return self._left_operator(other, '>')

    @shallow_throw
    def __le__(self, other):
        """
        If other is a scalar value, compares each element of the current array
        by the value, returning the result. If other is an SArray, performs
        an element-wise comparison of the two arrays.
        """
        return self._left_operator(other, '<=')

    @shallow_throw
    def __ge__(self, other):
        """
        If other is a scalar value, compares each element of the current array
        by the value, returning the result. If other is an SArray, performs
        an element-wise comparison of the two arrays.
        """
        return self._left_operator(other, '>=')

    @shallow_throw
    def __radd__(self, other):
        """
        Adds a scalar value to the current array.
        Returned array has the same type as the array on the right hand side
        """
        return self._right_operator(other, '+')

    @shallow_throw
    def __rsub__(self, other):
        """
        Subtracts a scalar value from the current array.
        Returned array has the same type as the array on the right hand side
        """
        return self._right_operator(other, '-')

    @shallow_throw
    def __rmul__(self, other):
        """
        Multiplies a scalar value to the current array.
        Returned array has the same type as the array on the right hand side
        """
        return self._right_operator(other, '*')

    @shallow_throw
    def __rdiv__(self, other):
        """
        Divides a scalar value by each element in the array
        Returned array has the same type as the array on the right hand side
        """
        return self._right_operator(other, '/')

    # Reflected counterpart of __truediv__.
    __rtruediv__ = __rdiv__

    @shallow_throw
    def __eq__(self, other):
        """
        If other is a scalar value, compares each element of the current array
        by the value, returning the new result. If other is an SArray, performs
        an element-wise comparison of the two arrays.
        """
        return self._left_operator(other, '==')

    @shallow_throw
    def __ne__(self, other):
        """
        If other is a scalar value, compares each element of the current array
        by the value, returning the new result. If other is an SArray, performs
        an element-wise comparison of the two arrays.
        """
        return self._left_operator(other, '!=')

    @shallow_throw
    def __and__(self, other):
        """
        Perform a logical element-wise 'and' against another SArray.

        Raises
        ------
        TypeError
            If other is not an SArray.
        """
        if not isinstance(other, SArray):
            raise TypeError("SArray can only perform logical and against another SArray")
        return SArray(_proxy=self.__proxy__.vector_operator(other.__proxy__, '&'))

    @shallow_throw
    def __or__(self, other):
        """
        Perform a logical element-wise 'or' against another SArray.

        Raises
        ------
        TypeError
            If other is not an SArray.
        """
        if not isinstance(other, SArray):
            raise TypeError("SArray can only perform logical or against another SArray")
        return SArray(_proxy=self.__proxy__.vector_operator(other.__proxy__, '|'))

    @shallow_throw
    def __getitem__(self, other):
        """
        If other is an SArray of identical length, this function performs a
        logical filter: i.e. it subselects all the elements in this array
        where the corresponding value in the other array evaluates to true.

        Raises
        ------
        IndexError
            If other is not an SArray, or if its length differs from the
            length of this array.
        """
        if not isinstance(other, SArray):
            raise IndexError("Invalid type to use for indexing")
        if len(other) != len(self):
            raise IndexError("Cannot perform logical indexing on arrays of different length.")
        return SArray(_proxy=self.__proxy__.logical_filter(other.__proxy__))

    def size(self):
        """
        Returns the size of the sarray.
        """
        return self.__proxy__.size()

    def dtype(self):
        """
        Returns the data type of the sarray.

        Returns
        -------
        out : type
            The type of the sarray.
        """
        return self.__proxy__.dtype()

    @shallow_throw
    def head(self, n=10):
        """
        Returns the first n elements of the SArray.

        The elements are materialized in memory on the client. Care must be
        taken when size of the returned object is big.

        Parameters
        ----------
        n : int
            The number of elements to fetch.

        Returns
        -------
        out : list
            The first n elements of the SArray, as returned by the proxy
            (the rest of this class treats the result as a list).
        """
        return self.__proxy__.head(n)

    @shallow_throw
    def apply(self, fn, dtype=None, skip_undefined=True, seed=None):
        """
        Returns a new SArray of dtype where each element in this SArray is
        transformed by fn(x).
        The fn should return a value which can be cast into dtype.

        If dtype is not specified, the first 100 elements of the Array are
        used to make a guess of the target datatype.

        Parameters
        ----------
        fn : function
            The function to transform each element.

        dtype : dtype
            The dtype of the new SArray. If None, the first 100
            elements of the array are used to guess the target
            data type.

        skip_undefined : bool, optional
            If True, will not apply fn to any undefined values.

        seed : int, optional
            Used as the seed if a random number generator is included in fn.
            If None (the default), the current time at the moment of the call
            is used.

        Raises
        ------
        AssertionError
            If fn is not a function.
        RuntimeError
            If guessing the dtype from the first 100 elements fails.

        Returns
        -------
        out : SArray
            The SArray transformed by fn.  Each element of the SArray is of
            type ``dtype``
        """
        assert inspect.isfunction(fn), "Input must be a function"
        # Resolve the seed per call: a default of time.time() in the signature
        # would be evaluated only once, when the module is imported.
        if seed is None:
            seed = time.time()
        if dtype is None:
            try:
                dtype = pandas.Series(self.head(100)).apply(fn).dtype
            except Exception as e:
                # convert all exceptions to RuntimeError.
                # This provides uniformity of error types. (Since SArray
                # basically just raises runtime errors)
                raise RuntimeError(str(e))

        return SArray(_proxy=self.__proxy__.transform(fn, dtype, skip_undefined, seed))

    @shallow_throw
    def filter(self, fn, skip_undefined=True, seed=None):
        """
        Returns a new SArray which is filtered by the given function.
        If the lambda evaluates an element to true, this element is copied to the
        new SArray.  If not, it isn't.  Throws an exception if the return type
        of the lambda is not castable to a boolean value.

        Parameters
        ----------
        fn : function
            Function that filters the SArray. Must evaluate to bool or int.

        skip_undefined : bool, optional
            If True, will not apply fn to any undefined values.

        seed : int, optional
            Used as the seed if a random number generator is included in fn.
            If None (the default), the current time at the moment of the call
            is used.

        Raises
        ------
        AssertionError
            If fn is not a function.

        Returns
        -------
        out : SArray
        """
        assert inspect.isfunction(fn), "Input must be a function"
        # Resolve the seed per call rather than once at import time.
        if seed is None:
            seed = time.time()
        return SArray(_proxy=self.__proxy__.filter(fn, skip_undefined, seed))

    @shallow_throw
    def sample(self, fraction, seed=None):
        """
        Returns an SArray which contains a subsample of the current SArray.

        Parameters
        ----------
        fraction : float
            The fraction of the rows to fetch. Must be between 0 and 1.

        seed : int
            The random seed for the random number generator. If None (the
            default), the current time at the moment of the call is used.

        Raises
        ------
        ValueError
            If fraction is smaller than 0 or greater than 1.

        Returns
        -------
        out : SArray
            The new SArray which contains the subsampled rows.
        """
        if fraction > 1 or fraction < 0:
            raise ValueError('Invalid sampling rate: ' + str(fraction))
        if self.size() == 0:
            # nothing to sample from
            return SArray()
        # Resolve the seed per call rather than once at import time.
        if seed is None:
            seed = time.time()
        return SArray(_proxy=self.__proxy__.sample(fraction, seed))

    def _save_as_text(self, url):
        """
        Save the SArray to disk as text file. Not implemented yet.
        """
        raise NotImplementedError

    @shallow_throw
    def all(self):
        """
        Returns True if all of the elements evaluate to True.
        The empty array returns True.

        Returns
        -------
        out : bool
        """
        return self.__proxy__.all()

    @shallow_throw
    def any(self):
        """
        Returns True if any of the elements evaluate to True.
        The empty array returns False.

        Returns
        -------
        out : bool
        """
        return self.__proxy__.any()

    @shallow_throw
    def max(self):
        """
        Returns the maximum value in the SArray.  The empty array returns numpy.nan.
        Throws an exception if called on an SArray with non-numeric type.

        Returns
        -------
        out : type of SArray
            Maximum value of SArray
        """
        return self.__proxy__.max()

    @shallow_throw
    def min(self):
        """
        Returns the minimum value in the SArray.  The empty array returns numpy.nan.
        Throws an exception if called on an SArray with non-numeric type.

        Returns
        -------
        out : type of SArray
            Minimum value of SArray
        """
        return self.__proxy__.min()

    @shallow_throw
    def sum(self):
        """
        Returns the sum of all the values in the sarray.  The empty array
        returns numpy.nan.  Throws an exception if called on an sarray with
        non-numeric type.  Will overflow without warning.

        Returns
        -------
        out : type of SArray
            Sum of all values in SArray
        """
        return self.__proxy__.sum()

    @shallow_throw
    def mean(self):
        """
        Returns the mean of the values in the sarray.  The empty array returns
        numpy.nan.  Throws an exception if called on an sarray with non-numeric
        type.

        Returns
        -------
        out : float
            Mean of all values in SArray
        """
        return self.__proxy__.mean()

    @shallow_throw
    def std(self, ddof=0):
        """
        Returns the standard deviation of all the values in the sarray as
        a float.

        Parameters
        ----------
        ddof : int
            "delta degrees of freedom" in the variance calculation.

        Raises
        ------
            Throws an exception if ddof >= sarray size or if the sarray is
            a non-numeric type.

        Returns
        -------
        out : float
            The standard deviation of all the values.
            Returns numpy.nan if the sarray is empty.
        """
        return self.__proxy__.std(ddof)

    @shallow_throw
    def var(self, ddof=0):
        """
        Returns the variance of all the values in the sarray as
        a float.

        Parameters
        ----------
        ddof : int
            "delta degrees of freedom" in the variance calculation.

        Raises
        ------
            Throws an exception if ddof >= sarray size or if the sarray is a
            non-numeric type.

        Returns
        -------
        out : float
            Variance of all values in SArray. Returns numpy.nan if the sarray
            is empty.
        """
        return self.__proxy__.var(ddof)

    @shallow_throw
    def nnz(self):
        """
        Returns the number of non-zero elements in the SArray.  Returns 0 if the
        sarray is empty.

        Returns
        -------
        out : int
            Number of non-zero elements.
        """
        return self.__proxy__.nnz()

    @shallow_throw
    def astype(self, dtype):
        """
        Returns a new SArray with all of the current values cast to the given
        type.

        Throws an exception if the types are not castable to the given type.

        Parameters
        ----------
        dtype : type
            The type to cast the elements to in SArray

        Returns
        -------
        out : SArray (of type dtype)
            The SArray converted to the dtype
        """
        return SArray(_proxy=self.__proxy__.astype(dtype))

    @shallow_throw
    def clip(self, lower=numpy.nan, upper=numpy.nan):
        """
        Returns a new SArray with the clipped values of this SArray.
        This means to modify each value outside of the given bounds
        to be the bound.

        If lower or upper are given numpy.nan as the argument, this is interpreted
        as a non-existent bound.

        Parameters
        ----------
        lower : int
            The lower bound to clip to. Ignored if equal to numpy.nan

        upper : int
            The upper bound to clip to. Ignored if equal to numpy.nan

        Raises
        ------
            Throws an exception if the SArray is empty, the types are non-numeric,
            or if the upper bound is less than the lower bound

        Returns
        -------
        out : SArray
        """
        return SArray(_proxy=self.__proxy__.clip(lower, upper))

    @shallow_throw
    def clip_lower(self, threshold):
        """
        Returns a new SArray with all values clipped to the lower bound given.

        Parameters
        ----------
        threshold : float
            The lower bound to clip values to

        Raises
        ------
        Exception
            Throws an exception if the SArray is empty or the types are non-numeric

        Returns
        -------
        out : SArray
        """
        # numpy.nan as the upper bound means "no upper bound"
        return SArray(_proxy=self.__proxy__.clip(threshold, numpy.nan))

    @shallow_throw
    def clip_upper(self, threshold):
        """
        Returns a new SArray with all values clipped to the upper bound given.

        Parameters
        ----------
        threshold : float
            The upper bound to clip values to

        Raises
        ------
        Exception
            Throws an exception if the SArray is empty or the types are non-numeric

        Returns
        -------
        out : SArray
        """
        # numpy.nan as the lower bound means "no lower bound"
        return SArray(_proxy=self.__proxy__.clip(numpy.nan, threshold))

#    def nonzero(self):
#        """
#        Returns a list of indices in this SArray that evaluate to a nonzero
#
#        Throws an exception if the SArray has at least one nan value. Care must
#        be taken if the resulting list is big (larger than available memory).
#
#        Returns
#        -------
#        out : list
#            The list of indices that correspond to each nonzero element in
#            the SArray.
#        """
#        return self.__proxy__.nonzero()

    @shallow_throw
    def tail(self, n=10):
        """
        Returns a list of the last n elements in the SArray.

        Does not throw. Care must be taken when size of the returned object is
        big (larger than available memory).

        Parameters
        ----------
        n : int
            The number of elements to fetch


        Returns
        -------
        out : list
            The last n elements of the SArray
        """
        return self.__proxy__.tail(n)

    @shallow_throw
    def dropna(self):
        """
        Returns a new SArray containing only the non-missing values of the
        array.  The size of the returned SArray will be <= to the size of the
        original.

        A missing value shows up in an SArray as 'None'.

        Returns
        -------
        out : SArray
            The new SArray with missing values removed.
        """
        return SArray(_proxy=self.__proxy__.drop_missing_values())

    @shallow_throw
    def topk_index(self, topk=10, reverse=False):
        """
        Used to return the topk elements, sorted by those elements (descending
        by default).  Returns a new SArray of type 'int' of the same size as
        the current SArray.  Entries are '1' if the corresponding element in
        the current SArray is a part of the topk elements, and '0' if that
        corresponding element is not.

        Parameters
        ----------
        topk : int
            The number of elements to determine if 'top'

        reverse: bool
            If True, return the topk elements in ascending order

        Returns
        -------
        out : SArray (of type int)

        Notes
        -----
        This is mostly used internally by SFrame's topk function.
        """
        return SArray(_proxy=self.__proxy__.topk_index(topk, reverse))

