"""
This module defines the SFrame class which provides the
ability to create, access and manipulate a remote scalable dataframe object.

SFrame acts similarly to pandas.DataFrame, but the data is completely immutable
and is stored column wise on the GraphLab Server side.
"""
import graphlab.connect as _mt
import graphlab.connect.main as glconnect
from graphlab.cython.context import debug_trace as cython_context
from graphlab.cython.cy_sframe import UnitySFrameProxy
from graphlab.util import make_internal_url
from graphlab.data_structures.sarray import SArray
import graphlab.aggregate

import inspect
import pandas
import time

# matplotlib is an optional dependency. ``_has_matplotlib`` records whether the
# plotting names below were imported successfully.
_has_matplotlib = False
try:
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D
    from matplotlib import rcParams
    _has_matplotlib = True
except Exception:
    # Best effort: any failure while importing matplotlib simply leaves the
    # flag False. Unlike a bare ``except:``, this does not swallow
    # KeyboardInterrupt or SystemExit raised during the import.
    pass


__all__ = ['SFrame']


def load_sframe(filename):
    """
    Load an SFrame from a file.

    The format is determined automatically from the filename extension, which
    makes this function particularly handy for SFrames previously saved in
    binary format. For CSV imports, SFrame.read_csv offers greater control.

    Parameters
    ----------
    filename : string
        Location of the file to load. Can be a local path or a remote URL.

    Returns
    -------
    sf : SFrame
    """
    return SFrame(data=filename)


class SFrame(object):
    """
    SFrame is a dataframe object (as in pandas or R) that can scale to big data.
    The data in SFrame is stored column-wise on the GraphLab Server side,
    and is stored on disk to avoid being constrained by memory size.

    While the columns in the SFrame are immutable, SFrame itself is mutable in
    that columns can be added and subtracted from an SFrame with ease. Each
    column of an SFrame is actually an :py:class:`graphlab.SArray`, so an
    SFrame essentially acts as an ordered dict of SArrays.

    SFrame can be constructed in various ways. Currently, we support
    constructing an SFrame from:

    * pandas.DataFrame
    * csv file (comma separated, first line is header)
    * sframe file (ends with .frame_idx)
    * general text file (with csv parsing options, See :py:meth:`read_csv()`)

    >>> import graphlab
    >>> from graphlab import SFrame

    Construct an SFrame from a dataframe and transfers the dataframe object
    across the network.

    >>> df = pandas.DataFrame()
    >>> sf = SFrame(data=df)

    Construct an SFrame from a local csv file (only works for local server).

    >>> sf = SFrame(data='~/mydata/foo.csv')

    Construct an SFrame from a csv file on Amazon S3. This requires the
    environment variables: *AWS_ACCESS_KEY_ID* and *AWS_SECRET_ACCESS_KEY* to be
    set before the python session started. Alternatively, you can use
    :py:func:`graphlab.aws.set_credentials()` to set the credentials after
    python is started and :py:func:`graphlab.aws.get_credentials()` to verify
    these environment variables.

    >>> sf = SFrame(data='s3://mybucket/foo.csv')

    Construct an SFrame from a server side gzipped csv file.

    >>> sf = SFrame(data='remote:///mydata/foo.csv.gz')

    Construct an SFrame from a csv file downloaded from a URL using more csv
    parsing parameters.

    >>> sf = SFrame.read_csv('http://testdatasets.s3-website-us-west-2.amazonaws.com/users.csv.gz',
                             delimiter=',',
                             header=False,
                             comment_char="#",
                             column_type_hints={'user_id': int})

    Save and load the sframe in native format.

    >>> sf.save('remote:///mysframe.frame_idx')
    >>> sf2 = graphlab.load_sframe('remote:///mysframe.frame_idx')

    An SFrame is comprised of a collection of columns of SArrays, and individual
    SArrays can be extracted easily. For instance given an SFrame:

    >>> sf
    Columns:
        A	int
        B	int
    Rows: 3
    Data:
       A  B
    0  1  4
    1  2  5
    2  3  6

    The "A" column can be extracted using:

    >>> sf["A"]
    dtype: int
    Rows: 3
    [1, 2, 3]

    And can be deleted using:

    >>> del sf["A"]

    Parameters
    ----------
    data : Array | pandas.DataFrame | string
        The actual interpretation of this field is dependent on the "format"
        parameter. If data is an Array or a Pandas DataFrame, the contents are
        stored in the SFrame. If the contents is a string, it is interpreted as
        a file. Files can be read from local file system, or urls (local://,
        hdfs://, s3://, http://, or remote://)

    format : {'auto', 'array', 'dataframe', 'csv', 'sframe'}
        The format of the data. Default 'auto' will automatically infer the
        input data format. The inference rules are simple: If the data is an
        array/dict or a dataframe, it is associated with 'array' and 'dataframe'
        respectively. If the data is a string, it is interpreted as a file, and
        the file extension is used to infer the file format.


    Notes
    -----
    When working with the graphlab EC2 instance, e.g.
    :py:func:`graphlab.aws.launch_EC2()`, an SFrame cannot be constructed from
    a local file path, because that would involve a potentially large amount
    of data transfer from client to server. However, it is still ok to use a
    remote file path.

    >>> graphlab.aws.launch_EC2('m1.large')
    >>> sf = SFrame('~/mydata/foo.csv') # throws exception
    >>> sf = SFrame('remote:///mydata/foo.csv') # works
    >>> sf = SFrame('http://testdatasets.s3-website-us-west-2.amazonaws.com/users.csv.gz') # works
    >>> sf = SFrame('s3://mybucket/foo.csv') # works
    >>> graphlab.aws.terminate_EC2()

    Similar restriction applies to :py:class:`graphlab.Graph` and
    :py:class:`graphlab.SArray`.
    """

    __slots__ = ['shape', '__proxy__']


    def __init__(self, data=[],
                 format='auto',
                 _proxy=None):
        """__init__(data=list(), format='auto')
        Construct a new SFrame from a url or a pandas.DataFrame.

        Parameters
        ----------
        data : Array | pandas.DataFrame | string
            The actual interpretation of this field is dependent on the "format"
            parameter. If data is an Array or a Pandas DataFrame, the contents
            are stored in the SFrame. If the contents is a string, it is
            interpreted as a file. Files can be read from local file system, or
            urls (local://, hdfs://, s3://, http://, or remote://)

        format : {'auto', 'array', 'dataframe', 'csv', 'tsv', 'sframe'}
            The format of the data. Default 'auto' will automatically infer the
            input data format. The inference rules are simple: If the data is an
            array/dict or a dataframe, it is associated with 'array' and
            'dataframe' respectively. If the data is a string, it is interpreted
            as a file, and the file extension is used to infer the file format
            ('.csv'/'.txt' -> csv, '.tsv' -> tsv, anything else -> sframe).

        _proxy : None
            Internal, do not use. When given, the proxy is adopted as-is and
            ``data`` / ``format`` are ignored.

        Raises
        ------
        ValueError
            If the format cannot be inferred from ``data``, or if ``format`` is
            not one of the supported values.

        Notes
        -----
        For CSV files, the preferred constructor is SFrame.read_csv since
        that has a lot more options which can be used to control the parser.
        """
        # NOTE: the shared default ``data=[]`` is never mutated in place below.

        # emit metrics for num_rows, num_columns, and type (local://, s3, hdfs, http)
        tracker = _mt._get_metric_tracker()
        if _proxy is not None:
            self.__proxy__ = _proxy
        else:
            self.__proxy__ = UnitySFrameProxy(glconnect.get_client())
            _format = None
            if format == 'auto':
                if isinstance(data, pandas.DataFrame):
                    _format = 'dataframe'
                    tracker.track('sframe.location.dataframe', value=1)
                elif isinstance(data, (str, unicode)):
                    # The URL scheme (or 'local' when there is none) is
                    # reported as the data location.
                    if data.find('://') == -1:
                        suffix = 'local'
                    else:
                        suffix = data.split('://')[0]
                    tracker.track(('sframe.location.%s' % (suffix)), value=1)

                    if data.endswith(('.csv', '.csv.gz')):
                        _format = 'csv'
                    elif data.endswith(('.tsv', '.tsv.gz')):
                        _format = 'tsv'
                    elif data.endswith('.frame_idx'):
                        _format = 'sframe'
                    elif data.endswith(('.txt', '.txt.gz')):
                        print("Assuming file is csv. For other delimiters, " +
                              "please use `SFrame.read_csv`.")
                        _format = 'csv'
                    else:
                        # Unknown extensions are treated as a saved SFrame.
                        _format = 'sframe'

                elif hasattr(data, '__iter__'):
                    _format = 'array'
                    # NOTE(review): 'sframe.format.array' is tracked again by
                    # the generic format metric below - confirm whether the
                    # double count is intended.
                    tracker.track('sframe.format.array', value=1)
                else:
                    raise ValueError('Cannot infer input type for data ' + str(data))
            else:
                _format = format

            tracker.track(('sframe.format.%s' % _format), value=1)

            with cython_context():
                if _format == 'dataframe':
                    self.__proxy__.load_from_dataframe(data)
                elif _format == 'array':
                    pd = pandas.DataFrame(data)
                    # Array input has no column names; use X1, X2, ...
                    pd.columns = [('X%d' % (i + 1)) for i in xrange(len(pd.columns))]
                    self.__proxy__.load_from_dataframe(pd)
                elif _format in ('csv', 'tsv'):
                    url = make_internal_url(data)
                    csv_config = dict()
                    # Decide the delimiter from the resolved format, so that an
                    # explicit format='tsv' is parsed as tab separated too (it
                    # used to be set only during auto-inference).
                    if _format == 'tsv':
                        csv_config['delimiter'] = '\t'
                        csv_config['use_header'] = True
                    self.__proxy__.load_from_csv(url, csv_config, dict())
                elif _format == 'sframe':
                    if not data.endswith('.frame_idx'):
                        data += '.frame_idx'
                    url = make_internal_url(data)
                    self.__proxy__.load_from_sframe_index(url)
                else:
                    # str() keeps the intended ValueError even when ``format``
                    # is not a string.
                    raise ValueError('Unknown input type: ' + str(format))

        self.shape = (self.num_rows(), self.num_cols())
        tracker.track('sframe.row.size', value=self.num_rows())
        tracker.track('sframe.col.size', value=self.num_cols())

    @classmethod
    def read_csv(cls,
                 url,
                 delimiter=',',
                 header=True,
                 comment_char='',
                 escape_char='\\',
                 double_quote=True,
                 quote_char='\"',
                 skip_initial_space=True,
                 column_type_hints=str,
                 verbose=True):
        """
        Constructs an SFrame from a CSV file.

        Parameters
        ----------
        delimiter : string
            This describes the delimiter used for parsing csv files. Must be a
            single character.

        header : bool
            If true, uses the first row as the column names.
            Otherwise use the default column names:'X1, X2,...'.

        comment_char : string
            The character which denotes that the
            remainder of the line is a comment.

        escape_char : string
            Character which begins a C escape sequence

        double_quote : bool
            If two consecutive quotes in a string parses to
            to a single quote.

        skip_initial_space : bool
            If extra spaces at the start of a field is ignored

        column_type_hints : type, list[type], dict[string, type]
            This provides type hints for each column.
            Supported types are int, float, str
            - If a single type is provided, the type will be
            applied to all columns. For instance, column_type_hints=float will
            force all columns to be parsed as float.
            - If a list of types is provided, the types applies
            to each column in order, e.g.[int, float, str]
            will parse the first column as int, second as float and third as string.
            - If a dictionary of column name to type is provided,
            each type value in the dictonary is applied to the key it belongs to.
            For instance {'user':int} will hint that the column
            called "user" should be parsed as an integer, and the rest will default
            to string.

        verbose : bool
            If True, print the progress.
        """
        parsing_config = dict()
        parsing_config["delimiter"] = delimiter
        parsing_config["use_header"] = header
        parsing_config["comment_char"] = comment_char
        parsing_config["escape_char"] = escape_char
        parsing_config["double_quote"] = double_quote
        parsing_config["quote_char"] = quote_char
        parsing_config["skip_initial_space"] = skip_initial_space

        proxy = UnitySFrameProxy(glconnect.get_client())
        internal_url = make_internal_url(url)

        if (not verbose):
            glconnect.get_client().set_log_progress(False)

        if type(column_type_hints) is type:
            type_hints = {'__all_columns__': column_type_hints}
        elif type(column_type_hints) is list:
            type_hints = dict(zip(['__X%d__' % i for i in range(len(column_type_hints))], column_type_hints))
        elif type(column_type_hints) is dict:
            type_hints = column_type_hints


        _mt._get_metric_tracker().track('sframe.csv.parse')

        suffix=''
        if url.find('://') == -1:
            suffix = 'local'
        else:
            suffix = url.split('://')[0]

        _mt._get_metric_tracker().track(('sframe.location.%s' % (suffix)), value=1)

        with cython_context():
            proxy.load_from_csv(internal_url, parsing_config, type_hints)
        glconnect.get_client().set_log_progress(True)

        return cls(_proxy=proxy)

    def __repr__(self):
        """
        Returns a string description of the frame
        """
        colnames = self.column_names()
        coltypes = self.column_types()
        ret = "Columns:\n"
        if len(colnames) > 0:
            for i in range(len(colnames)):
                ret = ret + "\t" + colnames[i] + "\t" + coltypes[i].__name__ + "\n"
            ret = ret + "\n"
        else:
            ret = ret + "\tNone\n\n"
        ret = ret + "Rows: " + str(len(self)) + "\n\n"
        ret = ret + "Data:\n"
        if (len(self) > 0):
            ret = ret + str(self)
        else:
            ret = ret + "\t[]"
        return ret

    def __str__(self):
        """
        Returns a string containing the first 10 elements of the frame, along
        with a description of the frame.
        """
        headln = str(self.head(10))
        ## pandas dataframe displays a [#row * #col] at the end which causes confusion.
        ## replace the last line with the actual row and columns
        headln_lines = headln.split("\n")
        if (len(self) > 10):
            headln_lines[-1] = "...\n[%d rows x %d columns]\n" % self.shape
        else:
            headln_lines[-1] = "[%d rows x %d columns]\n" % self.shape
        headln = "\n".join(headln_lines)
        return headln

    def __nonzero__(self):
        """
        Returns true if the frame is not empty.
        """
        return self.num_rows() != 0

    def __len__(self):
        """
        Returns the number of rows of the array
        """
        return self.num_rows()

    def _row_selector(self, other):
        """
        Select the rows of this SFrame for which the matching entry of the
        selector is non-zero.

        ``other`` must be an SArray with the same length as this frame; an
        IndexError is raised when the lengths differ. Any other selector type
        is not handled and yields None.
        """
        if type(other) is not SArray:
            return None

        selector_len = len(other)
        if selector_len != len(self):
            raise IndexError("Cannot perform logical indexing on arrays of different length.")

        with cython_context():
            filtered = self.__proxy__.logical_filter(other.__proxy__)
            return SFrame(_proxy=filtered)

    def dtype(self):
        """
        Returns the column types. Same as :py:meth:`column_types`.

        Returns
        -------
        out : list[type]
            Column types of the SFrame.
        """

        return self.column_types()

    def num_rows(self):
        """
        Returns the number of rows.

        Returns
        -------
        out : int
            Number of rows in the SFrame.
        """
        return self.__proxy__.num_rows()

    def num_cols(self):
        """
        Returns the number of columns.

        Returns
        -------
        out : int
            Number of columns in the SFrame.
        """
        return self.__proxy__.num_columns()

    def column_names(self):
        """
        Returns the column names.

        Returns
        -------
        out : list[string]
            Column names of the SFrame.
        """
        return self.__proxy__.column_names()

    def column_types(self):
        """
        Returns the column types.

        Returns
        -------
        out : list[type]
            Column types of the SFrame.
        """
        return self.__proxy__.dtype()

    def head(self, n=10):
        """
        Returns a pandas.DataFrame which contains the first n rows of the
        SFrame.

        This operation will construct a pandas.DataFrame in memory. Care must
        be taken when size of the returned object is big.

        Parameters
        ----------
        n : int
            The number of rows to fetch.

        Returns
        -------
        out : pandas.DataFrame
            the dataframe which contains the first n rows of SFrame
        """
        return self.__proxy__.head(n)

    def to_dataframe(self):
        """
        Returns a pandas.DataFrame which contains the all rows of the
        SFrame.

        This operation will construct a pandas.DataFrame in memory. Care must
        be taken when size of the returned object is big.

        Returns
        -------
        out : pandas.DataFrame
            The dataframe which contains all rows of SFrame
        """
        return self.head(self.num_rows())

    def tail(self, n=10):
        """
        Returns a pandas.DataFrame which contains the last n rows of the
        SFrame.

        This operation will construct a pandas.DataFrame in memory. Care must
        be taken when size of the returned object is big.

        Parameters
        ----------
        n : int
            The number of rows to fetch.

        Returns
        -------
        out : pandas.DataFrame
            The dataframe which contains the last n rows of SFrame
        """
        return self.__proxy__.tail(n)

    def apply(self, fn, dtype=None, seed=None):
        """
        Returns a new SArray of dtype where each element in this SArray is
        transformed by fn(x) where x is a row in the sframe, as a dictionary.
        The fn should return a value which can be cast into dtype.

        If dtype is not specified, the first 100 rows of the SFrame are
        used to make a guess of the target datatype.

        Parameters
        ----------
        fn : function
            The function to transform each row of the sframe. The return
            type should be convertible to dtype if dtype is not None.

        dtype : dtype
            The dtype of the new SArray. If None, the first 100
            elements of the array are used to guess the target
            data type.

        seed : int, optional
            Used as the seed if a random number generator is included in fn.

        Returns
        -------
        out : SArray
            The SArray transformed by fn.  Each element of the SArray is of
            type ``dtype``


        Example
        -------

        >>> import graphlab
        >>> sf = graphlab.SFrame.read_csv('netflix.csv')

        The following code create a new SArray where each element is the
        string concatination of the 'user_id', 'movie_id' and 'rating' columns.

        >>> sa = sf.apply(lambda x: str(x['user_id']) + str(x['movie_id']) + str(x['rating']))

        """
        assert inspect.isfunction(fn), "Input must be a function"
        dryrun = [fn(dict(zip(self.column_names(), i))) for i in self.head(10).values]
        if dtype is None:
            dtype = SArray(dryrun).dtype()

        if not seed:
            seed = int(time.time())

        _mt._get_metric_tracker().track('sframe.apply')

        with cython_context():
            return SArray(_proxy=self.__proxy__.transform(fn, dtype, seed))

    def sample(self, fraction, seed=None):
        """
        Returns an SFrame which contains a subsample (row) of the current SFrame.

        Parameters
        ----------
        fraction : float
            The fractionage of the rows to fetch. Must be between 0 and 1.

        seed : int
            The random seed for the random number generator. Defaut uses
            current time.

        Returns
        -------
        out : SFrame
            An SFrame containing the subsampled rows of the SFrame.
        """
        if not seed:
            seed = int(time.time())

        if (fraction > 1 or fraction < 0):
            raise ValueError('Invalid sampling rate: ' + str(fraction))

        _mt._get_metric_tracker().track('sframe.sample')

        if (self.num_rows() == 0 or self.num_cols() == 0):
            return self
        else:
            with cython_context():
                return SFrame(_proxy=self.__proxy__.sample(fraction, seed))

    def random_split(self, fraction, seed=None):
        """
        Returns a pair of SFrames from random splitting the current one. The first
        sframe contains uniformly random "fraction" rows, and the second
        contains the rest 1-"fraction" rows.

        Parameters
        ----------
        fraction : float
            The fractionage of the rows to fetch. Must be between 0 and 1.

        seed : int
            The random seed for the random number generator.

        Returns
        -------
        out : pair of SFrames
        """
        if (fraction > 1 or fraction < 0):
            raise ValueError('Invalid sampling rate: ' + str(fraction))
        if (self.num_rows() == 0 or self.num_cols() == 0):
            return (SFrame(), SFrame())

        if not seed:
            seed = int(time.time())

        # The server side requires this to be an int, so cast if we can
        try:
            seed = int(seed)
        except ValueError:
            raise ValueError('The \'seed\' parameter must be of type int.')

        _mt._get_metric_tracker().track('sframe.random_split')

        with cython_context():
            proxy_pair = self.__proxy__.random_split(fraction, seed)
            return (SFrame(data=[], _proxy=proxy_pair[0]), SFrame(data=[], _proxy=proxy_pair[1]))


    def topk(self, column_name, k=10, reverse=False):
        """
        Returns the topk rows sorted by the column_name in descending order.

        Parameters
        ----------
        column_name : string
            The column to sort on

        k : int
            The number of rows to return

        reverse : bool
            If True, return the topk rows in ascending order.

        Returns
        -------
        out : pandas.DataFrame
            A pandas.DataFrame containing topk rows sorted by column_name.
        """
        if type(column_name) is not str:
            raise TypeError("column_name must be a string")

        _mt._get_metric_tracker().track('sframe.topk')

        df = self[self[column_name].topk_index(k, reverse)].to_dataframe()
        df = df.sort(column_name, ascending=reverse)
        df.index = range(len(df))
        return df


    def save(self, filename, format='binary'):
        """
        Save the SFrame to file.

        Parameters
        ----------
        filename : string
            The location to save the SFrame. Either a local path or a remote
            URL. A recognised extension ('.csv', '.csv.gz' or '.frame_idx')
            determines the format and overrides the ``format`` argument.
            Otherwise the extension matching ``format`` is appended
            ('.frame_idx' for 'binary', '.csv' for 'csv').

        format : {'binary', 'csv'}, optional
            Format in which to save the SFrame.

        Raises
        ------
        ValueError
            If the filename has no recognised extension and format is neither
            'binary' nor 'csv'.
        """

        _mt._get_metric_tracker().track('sframe.save', properties={'format':format})

        # NOTE: formats are compared with ``==``. The previous ``is``
        # comparisons tested object identity, which is not guaranteed for
        # equal strings (e.g. unicode or dynamically built values).
        if filename.endswith(('.csv', '.csv.gz')):
            format = 'csv'
        elif filename.endswith('.frame_idx'):
            format = 'binary'
        elif format == 'csv':
            filename = filename + '.csv'
        elif format == 'binary':
            filename = filename + '.frame_idx'
        else:
            raise ValueError("Invalid format: {}. Supported formats are 'csv' and 'binary'".format(format))

        ## Save the SFrame
        url = make_internal_url(filename)

        with cython_context():
            if format == 'binary':
                self.__proxy__.save(url)
            elif format == 'csv':
                self.__proxy__.save_as_csv(url)
            else:
                # Defensive: format has been normalised above.
                raise ValueError("Unsupported format: {}".format(format))

    def select_column(self, key):
        """
        Return the SArray with one column that corresponds to the key

        Throws an exception if the key is something other than a str or
        if the key is not found.

        Parameters
        ----------
        key : str
            The column name

        Returns
        -------
        out : graphlab.SArray
            The sarray that is referred by 'key'
        """
        if not isinstance(key, str):
            raise TypeError("Invalid key type: must be str")
        with cython_context():
            return SArray(data=[], _proxy=self.__proxy__.select_column(key))

    def select_columns(self, keylist):
        """
        Returns an SFrame with the columns listed in 'keylist'.

        Raises
        ------
        TypeError
            Raises an exception if ANY of the keys are not in this SFrame or
            if keylist is anything other than a list of strings.

        Parameters
        ----------
        keylist : list
            The list of column names

        Returns
        -------
        out : graphlab.SFrame
            A new SFrame that is made up of the columns
            referred to in 'keylist' in this current SFrame
        """
        if not hasattr(keylist, '__iter__'):
            raise TypeError("keylist must be an iterable")
        if not all([isinstance(x, str) for x in keylist]):
            raise TypeError("Invalid key type: must be str")
        with cython_context():
            return SFrame(data=[], _proxy=self.__proxy__.select_columns(keylist))

    def add_column(self, data, name=""):
        """
        Append a column to this SFrame. The column must have as many elements
        as every other column of the SFrame.

        Parameters
        ----------
        data : SArray
            The 'column' of data.

        name : string
            The name of the column. If no name is given, a default name is chosen.

        Raises
        ------
        TypeError
            If data is not an SArray or name is not a str.
        """
        # Check type for pandas dataframe or SArray?
        if not isinstance(data, SArray):
            raise TypeError("Must give column as SArray")
        if not isinstance(name, str):
            raise TypeError("Invalid column name: must be str")

        with cython_context():
            self.__proxy__.add_column(data.__proxy__, name)
            # Keep the cached shape in sync with the new column count.
            row_count = self.num_rows()
            col_count = self.num_cols()
            self.shape = (row_count, col_count)

    def add_columns(self, datalist, namelist):
        """
        Adds columns to the SFrame.  The number of elements in all columns must
        match every other column of the SFrame.

        Parameters
        ----------
        datalist : list of SArray
            A list of columns

        namelist : list of string
            A list of column names. All names must be specified.
        """
        if not hasattr(datalist, '__iter__'):
            raise TypeError("datalist must be an iterable")
        if not hasattr(namelist, '__iter__'):
            raise TypeError("namelist must be an iterable")
        if not all([isinstance(x, SArray) for x in datalist]):
            raise TypeError("Must give column as SArray")
        if not all([isinstance(x, str) for x in namelist]):
            raise TypeError("Invalid column name in list: must all be str")
        with cython_context():
            self.__proxy__.add_columns([x.__proxy__ for x in datalist], namelist)
            self.shape = (self.num_rows(), self.num_cols())

    def remove_column(self, name):
        """
        Removes the column with the given name from the SFrame.

        Parameters
        ----------
        name : string
            The name of the column to remove.
        """
        colid = self.column_names().index(name)
        with cython_context():
            self.__proxy__.remove_column(colid)
            self.shape = (self.num_rows(), self.num_cols())

    def swap_columns(self, column_1, column_2):
        """
        Swaps the columns with the given names.

        Parameters
        ----------
        column_1 : string
            Name of column to swap

        column_2 : string
            Name of other column to swap
        """
        colnames = self.column_names()
        colid_1 = colnames.index(column_1)
        colid_2 = colnames.index(column_2)
        with cython_context():
            self.__proxy__.swap_columns(colid_1, colid_2)

    def rename(self, names):
        """
        Rename the columns using the 'names' dict.  This changes the names of
        the columns given as the keys and replaces them with the names given as
        the values.

        Parameters
        ----------
        names : dict[string, string]
            Dictionary of [old_name, new_name]
        """
        if (type(names) is not dict):
            raise TypeError('names must be a dictionary: oldname -> newname')
        with cython_context():
            for k in names:
                colid = self.column_names().index(k)
                self.__proxy__.set_column_name(colid, names[k])

    def __getitem__(self, key):
        """
        Index the SFrame. The behaviour depends on the exact type of key:

        * SArray - rows are filtered through :py:meth:`_row_selector`;
        * list - the named columns are returned as an SFrame;
        * str - the named column is returned as an SArray.

        Any other key type raises TypeError.
        """
        key_type = type(key)
        if key_type is SArray:
            return self._row_selector(key)
        if key_type is list:
            return self.select_columns(key)
        if key_type is str:
            return self.select_column(key)
        raise TypeError("Invalid index type: must be SArray, list, or str")

    def __setitem__(self, key, value):
        """
        A wrapper around add_column(s).  Key can be either a list or a str.  If
        value is an SArray, it is added to the SFrame as a column.  If it is a
        constant value (int, str, or float), then a column is created where
        every entry is equal to the constant value.  Existing columns can also
        be replaced using this wrapper.
        """
        if type(key) is list:
            self.add_columns(value, key)
        elif type(key) is str:
            # set new column
            if not key in self.column_names():
                if (type(value) is SArray):
                    self.add_column(value, key)
                elif hasattr(value, '__iter__'):
                    self.add_column(SArray(value), key)
                else:
                    with cython_context():
                        self.__proxy__.add_column_from_const(key, value)
            else:
                # add the column to a unique column name.
                tmpname = '__' + '-'.join(self.column_names())
                if (type(value) is SArray):
                    self.add_column(value, tmpname)
                elif hasattr(value, '__iter__'):
                    self.add_column(SArray(value), tmpname)
                else:
                    with cython_context():
                        self.__proxy__.add_column_from_const(tmpname, value)
                # if add succeeded, remove the column name and rename tmpname->columnname.
                self.swap_columns(key, tmpname)
                self.remove_column(key)
                self.rename({tmpname: key})
        else:
            raise TypeError('Cannot set column with value type: ' + type(value))

    def __delitem__(self, key):
        """
        Support ``del sf[key]`` by delegating to remove_column.

        The key is passed to remove_column unchanged; no validation is done
        here, so any error for an unknown key comes from remove_column.
        """
        self.remove_column(key)

    def __iter__(self):
        """
        Return an iterator over the rows of the SFrame.

        Each row is yielded as a dict mapping column name to the value of
        that column in the row. Rows are fetched from the proxy in batches,
        and nothing is fetched until the iterator is first advanced.
        """

        _mt._get_metric_tracker().track('sframe.__iter__')

        def row_stream():
            batch_size = 262144
            self.__proxy__.begin_iterator()
            batch = self.__proxy__.iterator_get_next(batch_size)
            names = self.column_names()
            while True:
                for row in batch:
                    yield dict(zip(names, row))

                # A batch that is not exactly the requested size ends the
                # iteration; a full batch triggers another fetch.
                if len(batch) != batch_size:
                    return
                batch = self.__proxy__.iterator_get_next(batch_size)

        return row_stream()

    def group(self, column):
        """
        Build a new SFrame whose rows are grouped by the values of a single
        column.

        Parameters
        ----------
        column : string
            Name of the column to group on.

        Returns
        -------
        out_sf : SFrame
            A new SFrame having the same shape and data, with the rows
            grouped by the given column.

        Raises
        ------
        RuntimeError
            If the column does not exist in this SFrame.

        Notes
        -----
        Grouping does not sort on the selected column.
        """
        _mt._get_metric_tracker().track('sframe.group')
        known_columns = self.column_names()
        if column not in known_columns:
            raise RuntimeError("Column " + column + " does not exist in SFrame")
        with cython_context():
            grouped_proxy = self.__proxy__.group(column)
            return SFrame(_proxy=grouped_proxy)

    def groupby(self, key_columns, operation_dict, *args):
        """
        Perform a group on the key_columns followed by aggregations on the
        columns listed in operation_dict.

        The operation_dict parameter is a dictionary that indicates which
        aggregation operators to use and which columns to use them on. The
        available operators are SUM, MAX, MIN, COUNT, AVG, VAR, STDV, and
        QUANTILE. For convenience, aggregators MEAN, STD, and VARIANCE are
        available as synonyms for AVG, STDV, and VAR. See
        :mod:`~graphlab.aggregate` for more detail on the aggregators.

        The columns of the output SFrame are named "[operator] of
        [aggregation column name]". For example, if key_columns is 'user_id' and
        operation_dict is {'rating': [gl.aggregate.SUM, gl.aggregate.COUNT]},
        the output is an SFrame with three columns: 'user_id', 'Sum of rating',
        and 'Count of rating'.

        The COUNT aggregate is special and can be used without specifying the
        full operation_dict parameter.

        Parameters
        ----------
        key_columns : string | list[string]
            Column(s) to group by.

        operation_dict : dict | graphlab.aggregate.COUNT
            Dictionary of columns and aggregation operations. Each key is a
            column name and each value is an aggregator, or a list of
            aggregators, to be applied to that particular column. The bare
            COUNT aggregate may be passed instead of a dictionary.

        *args
            Additional aggregation specifiers, each in the same form as
            operation_dict. They are processed after operation_dict, in the
            order given.

        Returns
        -------
        out_sf : SFrame
            A new SFrame, with a column for each groupby column and each
            aggregation operation.

        Raises
        ------
        TypeError
            If a key column or aggregation column name is not a string, or
            if an aggregation operator is not a string.
        KeyError
            If a key column, or a column used by an aggregate other than
            COUNT, does not exist in the SFrame.
        RuntimeError
            If an aggregation specifier is neither a dictionary nor
            graphlab.aggregate.COUNT.

        Examples
        --------
        >>> import graphlab as gl
        >>> import graphlab.aggregate as agg
        >>> sf = gl.load_sframe('netflix.frame_idx')

        Compute the number of occurrences of each user.

        >>> user_count = sf.groupby(key_columns='user_id',
                                    operation_dict={'user_id': agg.COUNT})

        Compute the same thing with the special syntax for the COUNT aggregate.

        >>> user_count = sf.groupby(key_columns='user_id',
                                    operation_dict=agg.COUNT)

        Compute the mean and standard deviation of ratings per user.

        >>> user_rating_stats = sf.groupby(key_columns='user_id',
                                           operation_dict={'rating': [agg.MEAN, agg.STD]})

        Compute the count, mean, and standard deviation of ratings per (user,
        time).

        >>> user_rating_stats = sf.groupby(['user_id', 'time'], {'rating':
                                           [agg.COUNT, agg.AVG, agg.STDV]})

        The groupby function can take a variable length list of aggregation
        specifiers so if we want the count and the 0.25 and 0.75 quantiles of
        ratings:

        >>> user_rating_stats = sf.groupby(['user_id', 'time'], agg.COUNT,
                                           {'rating': agg.QUANTILE(0.25, 0.75)})
        """
        # A single column name is shorthand for a one-element list.
        if isinstance(key_columns, str):
            key_columns = [key_columns]

        # Every key column must be a string naming an existing column.
        my_column_names = self.column_names()
        key_columns_array = []
        for column in key_columns:
            if not isinstance(column, str):
                raise TypeError("Column name must be a string")
            if column not in my_column_names:
                raise KeyError("Column " + column + " does not exist in SFrame")
            key_columns_array.append(column)

        count_op = graphlab.aggregate.COUNT

        # Flatten all specifiers into two parallel lists: the column each
        # aggregate reads, and the aggregate operator itself.
        group_columns = []
        group_ops = []
        for operation in [operation_dict] + list(args):
            # Anything that is not a dict must be the bare COUNT aggregate.
            if not isinstance(operation, dict):
                if isinstance(operation, str) and operation == count_op:
                    # COUNT needs no source column, so use an empty name.
                    operation = {'': count_op}
                else:
                    raise RuntimeError("Groupby operations must be either a dictionary of key:operation, or graphlab.aggregate.COUNT")

            # Dict keys are hashable and so can never be lists; only the
            # value may be a list of several aggregates for the same column.
            for key, val in operation.items():
                ops = val if isinstance(val, list) else [val]
                group_columns.extend([key] * len(ops))
                group_ops.extend(ops)

        # Validate every (column, operator) pair before calling the server.
        for (col, op) in zip(group_columns, group_ops):
            if not isinstance(col, str):
                raise TypeError("Column name must be a string")
            if not isinstance(op, str):
                raise TypeError("Operation type not recognized.")
            _mt._get_metric_tracker().track('sframe.groupby', properties={'operator': op})
            # Compare by value: an equal COUNT string is not guaranteed to
            # be the very same object as graphlab.aggregate.COUNT.
            if op != count_op and col not in my_column_names:
                raise KeyError("Column " + col + " does not exist in SFrame")

        with cython_context():
            return SFrame(_proxy=self.__proxy__.groupby_aggregate(key_columns_array, group_columns, group_ops))
