"""
This module defines the SFrame class which provides the
ability to create, access and manipulate a remote scalable dataframe object.

SFrame acts similarly to pandas.DataFrame, but the data is completely immutable
and is stored column wise on the GraphLab Server side.
"""
import graphlab.connect as _mt
import graphlab.connect.main as glconnect
from graphlab.cython.cy_type_utils import infer_type_of_list
from graphlab.cython.context import debug_trace as cython_context
from graphlab.cython.cy_sframe import UnitySFrameProxy
from graphlab.util import make_internal_url, split_path_elements
from graphlab.data_structures.sarray import SArray, _create_sequential_sarray
import graphlab.aggregate
import graphlab.canvas
import graphlab.canvas.inspect
import array
from prettytable import PrettyTable
from textwrap import wrap

import inspect
from graphlab.deps import pandas, HAS_PANDAS
import time
import itertools


__all__ = ['SFrame']

def load_sframe(filename):
    """
    Load an SFrame from a file or directory.

    This is a thin convenience wrapper that hands `filename` to the SFrame
    constructor, which uses the filename extension to determine the format
    automatically. It is particularly useful for SFrames previously saved in
    binary format; in that case pass the directory that was created when the
    SFrame was saved. For CSV imports, SFrame.read_csv provides greater
    control over parsing.

    Parameters
    ----------
    filename : string
        Location of the file to load. Can be a local path or a remote URL.

    Returns
    -------
    out : SFrame
    """
    return SFrame(data=filename)


class SFrame(object):
    """
    SFrame is a dataframe object (as in pandas or R) that can scale to big data.
    The data in SFrame is stored column-wise on the GraphLab Server side,
    and is stored on disk to avoid being constrained by memory size.

    While the columns in the SFrame are immutable, SFrame itself is mutable in
    that columns can be added and subtracted from an SFrame with ease. Each
    column of an SFrame is actually an :py:class:`graphlab.SArray`, so an
    SFrame essentially acts as an ordered dict of SArrays.

    SFrame can be constructed in various ways. Currently, we support
    constructing an SFrame from:

    * pandas.DataFrame
    * csv file (comma separated, first line is header)
    * sframe directory archive (A directory where an sframe was saved previously)
    * general text file (with csv parsing options, See :py:meth:`read_csv()`)

    >>> import graphlab
    >>> from graphlab import SFrame

    **Construction**

    Construct an SFrame from a dataframe and transfers the dataframe object
    across the network.

    >>> df = pandas.DataFrame()
    >>> sf = SFrame(data=df)

    Construct an SFrame from a local csv file (only works for local server).

    >>> sf = SFrame(data='~/mydata/foo.csv')

    Construct an SFrame from a csv file on Amazon S3. This requires the
    environment variables: *AWS_ACCESS_KEY_ID* and *AWS_SECRET_ACCESS_KEY* to be
    set before the python session started. Alternatively, you can use
    :py:func:`graphlab.aws.set_credentials()` to set the credentials after
    python is started and :py:func:`graphlab.aws.get_credentials()` to verify
    these environment variables.

    >>> sf = SFrame(data='s3://mybucket/foo.csv')

    Construct an SFrame from a server side gzipped csv file.

    >>> sf = SFrame(data='remote:///mydata/foo.csv.gz')

    Construct an SFrame from a csv file downloaded from a URL using more csv
    parsing parameters.

    >>> sf = SFrame.read_csv('http://testdatasets.s3-website-us-west-2.amazonaws.com/users.csv.gz',
                             delimiter=',',
                             header=False,
                             comment_char="#",
                             column_type_hints={'user_id': int})

    An SFrame can be constructed from a dictionary of values or SArrays:

    >>> sf = gl.SFrame({'id':[1,2,3],'val':['A','B','C']})
    >>> sf
    Columns:
        id  int
        val str
    Rows: 3
    Data:
       id  val
    0  1   A
    1  2   B
    2  3   C

    Or equivalently:

    >>> ids = SArray([1,2,3])
    >>> vals = SArray(['A','B','C'])
    >>> sf = SFrame({'id':ids,'val':vals})

    It can also be constructed from an array of SArrays in which case column
    names are automatically assigned.

    >>> ids = SArray([1,2,3])
    >>> vals = SArray(['A','B','C'])
    >>> sf = SFrame([ids, vals])
    >>> sf
    Columns:
        X1 int
        X2 str
    Rows: 3
    Data:
       X1  X2
    0  1   A
    1  2   B
    2  3   C

    If the SFrame is constructed from a list of values, an SFrame of a single
    column is constructed.

    >>> sf = SFrame([1,2,3])
    >>> sf
    Columns:
        X1 int
    Rows: 3
    Data:
       X1
    0  1
    1  2
    2  3

    **Parsing**

    The :py:func:`graphlab.SFrame.read_csv()` method is quite powerful and can
    be used to import a variety of row-based formats.

    First, some simple cases:

    >>> !cat ratings.csv
    user_id,movie_id,rating
    10210,1,1
    10213,2,5
    10217,2,2
    10102,1,3
    10109,3,4
    10117,5,2
    10122,2,4
    10114,1,5
    10125,1,1
    >>> gl.SFrame.read_csv('ratings.csv')
    Columns:
      user_id	int
      movie_id	int
      rating	int
    Rows: 9
    Data:
    +---------+----------+--------+
    | user_id | movie_id | rating |
    +---------+----------+--------+
    |  10210  |    1     |   1    |
    |  10213  |    2     |   5    |
    |  10217  |    2     |   2    |
    |  10102  |    1     |   3    |
    |  10109  |    3     |   4    |
    |  10117  |    5     |   2    |
    |  10122  |    2     |   4    |
    |  10114  |    1     |   5    |
    |  10125  |    1     |   1    |
    +---------+----------+--------+
    [9 rows x 3 columns]


    Delimiters can be specified, if "," is not the delimiter, for instance
    space ' ' in this case. Only single character delimiters are supported.

    >>> !cat ratings.csv
    user_id movie_id rating
    10210 1 1
    10213 2 5
    10217 2 2
    10102 1 3
    10109 3 4
    10117 5 2
    10122 2 4
    10114 1 5
    10125 1 1
    >>> gl.SFrame.read_csv('ratings.csv', delimiter=' ')

    By default, "NA" or a missing element are interpreted as missing values.

    >>> !cat ratings2.csv
    user,movie,rating
    "tom",,1
    harry,5,
    jack,2,2
    bill,,
    >>> gl.SFrame.read_csv('ratings2.csv')
    Columns:
      user	str
      movie	int
      rating	int
    Rows: 4
    Data:
    +---------+-------+--------+
    |   user  | movie | rating |
    +---------+-------+--------+
    |   tom   |  None |   1    |
    |  harry  |   5   |  None  |
    |   jack  |   2   |   2    |
    |   bill  |  None |  None  |
    +---------+-------+--------+
    [4 rows x 3 columns]

    Furthermore, thanks to the dictionary and list types, read_csv can handle
    parsing of JSON-like formats.

    >>> !cat ratings3.csv
    business, categories, ratings
    "Restaurant 1", [1 4 9 10], {"funny":5, "cool":2}
    "Restaurant 2", [], {"happy":2, "sad":2}
    "Restaurant 3", [2, 11, 12], {}
    >>> gl.SFrame.read_csv('ratings3.csv')
    Columns:
    business	str
    categories	array
    ratings	dict
    Rows: 3
    Data:
    +--------------+--------------------------------+-------------------------+
    |   business   |           categories           |         ratings         |
    +--------------+--------------------------------+-------------------------+
    | Restaurant 1 | array('d', [1.0, 4.0, 9.0, ... | {'funny': 5, 'cool': 2} |
    | Restaurant 2 |           array('d')           |  {'sad': 2, 'happy': 2} |
    | Restaurant 3 | array('d', [2.0, 11.0, 12.0])  |            {}           |
    +--------------+--------------------------------+-------------------------+
    [3 rows x 3 columns]

    The list and dictionary parsers are quite flexible and can absorb a
    variety of poorly formatted inputs. Also, note that the list and dictionary
    types are recursive, allowing for arbitrary values to be contained.

    All these are valid lists:

    >>> !cat interesting_lists.csv
    list
    []
    [1,2,3]
    [1;2,3]
    [1 2 3]
    [{a:b}]
    ["c",d, e]
    [[a]]
    >>> gl.SFrame.read_csv('interesting_lists.csv')
    Columns:
      list	list
    Rows: 7
    Data:
    +-----------------+
    |       list      |
    +-----------------+
    |        []       |
    |    [1, 2, 3]    |
    |    [1, 2, 3]    |
    |    [1, 2, 3]    |
    |   [{'a': 'b'}]  |
    | ['c', 'd', 'e'] |
    |     [['a']]     |
    +-----------------+
    [7 rows x 1 columns]

    All these are valid dicts:

    >>> !cat interesting_dicts.csv
    dict
    {"classic":1,"dict":1}
    {space:1 seperated:1}
    {emptyvalue:}
    {}
    {:}
    {recursive1:[{a:b}]}
    {:[{:[a]}]}
    >>> gl.SFrame.read_csv('interesting_dicts.csv')
    Columns:
      dict	dict
    Rows: 7
    Data:
    +------------------------------+
    |             dict             |
    +------------------------------+
    |  {'dict': 1, 'classic': 1}   |
    | {'seperated': 1, 'space': 1} |
    |     {'emptyvalue': None}     |
    |              {}              |
    |         {None: None}         |
    | {'recursive1': [{'a': 'b'}]} |
    | {None: [{None: array('d')}]} |
    +------------------------------+
    [7 rows x 1 columns]

    **Saving**

    Save and load the sframe in native format.

    >>> sf.save('remote:///mysframedir')
    >>> sf2 = graphlab.load_sframe('remote:///mysframedir')

    **Column Manipulation**

    An SFrame is composed of a collection of columns of SArrays, and individual
    SArrays can be extracted easily. For instance given an SFrame:

    >>> sf = SFrame({'id':[1,2,3],'val':['A','B','C']})
    >>> sf
    Columns:
        id  int
        val str
    Rows: 3
    Data:
       id  val
    0  1   A
    1  2   B
    2  3   C

    The "id" column can be extracted using:

    >>> sf["id"]
    dtype: int
    Rows: 3
    [1, 2, 3]

    And can be deleted using:

    >>> del sf["id"]

    Multiple columns can be selected by passing a list of column names:

    >>> sf = SFrame({'id':[1,2,3],'val':['A','B','C'],'val2':[5,6,7]})
    >>> sf
    Columns:
        id   int
        val  str
        val2 int
    Rows: 3
    Data:
       id  val val2
    0  1   A   5
    1  2   B   6
    2  3   C   7
    >>> sf2 = sf[['id','val']]
    >>> sf2
    Columns:
        id  int
        val str
    Rows: 3
    Data:
       id  val
    0  1   A
    1  2   B
    2  3   C

    The same mechanism can be used to re-order columns:

    >>> sf = SFrame({'id':[1,2,3],'val':['A','B','C']})
    >>> sf
    Columns:
        id  int
        val str
    Rows: 3
    Data:
       id  val
    0  1   A
    1  2   B
    2  3   C
    >>> sf = sf[['val','id']]
    >>> sf
    Columns:
        val str
        id  int
    Rows: 3
    Data:
       val id
    0  A   1
    1  B   2
    2  C   3

    **Element Access and Slicing**
    SFrames can be accessed by integer keys just like a regular python list.
    Such operations may not be fast on large datasets so looping over an SFrame
    should be avoided.

    >>> sf = SFrame({'id':[1,2,3],'val':['A','B','C']})
    >>> sf[0]
    {'id': 1, 'val': 'A'}
    >>> sf[2]
    {'id': 3, 'val': 'C'}
    >>> sf[5]
    IndexError: SFrame index out of range

    Negative indices can be used to access elements from the tail of the array

    >>> sf[-1] # returns the last element
    {'id': 3, 'val': 'C'}
    >>> sf[-2] # returns the second to last element
    {'id': 2, 'val': 'B'}

    The SFrame also supports the full range of python slicing operators:

    >>> sf[1000:] # Returns an SFrame containing rows 1000 to the end
    >>> sf[:1000] # Returns an SFrame containing rows 0 to row 999 inclusive
    >>> sf[0:1000:2] # Returns an SFrame containing rows 0 to row 1000 in steps of 2
    >>> sf[-100:] # Returns an SFrame containing last 100 rows
    >>> sf[-100:len(sf):2] # Returns an SFrame containing last 100 rows in steps of 2

    **Logical Filter**

    An SFrame can be filtered using

    >>> sframe[binary_filter]

    where sframe is an SFrame and binary_filter is an SArray of the same length.
    The result is a new SFrame which contains only rows of the SFrame where its
    matching row in the binary_filter is non zero.

    This permits the use of boolean operators that can be used to perform
    logical filtering operations. For instance, given an SFrame

    >>> sf
    Columns:
        id  int
        val	str
    Rows: 3
    Data:
       id  val
    0  1   A
    1  2   B
    2  3   C

    >>> sf[(sf['id'] >= 1) & (sf['id'] <= 2)]
    Columns:
        id  int
        val	str
    Rows: 2
    Data:
       id  val
    0  1   A
    1  2   B

    See :class:`~graphlab.SArray` for more details on the use of the logical
    filter.

    This can also be used more generally to provide filtering capability which
    is otherwise not expressible with simple boolean functions. For instance:

    >>> sf[sf['id'].apply(lambda x: math.log(x) <= 1)]
    Columns:
        id  int
        val	str
    Rows: 2
    Data:
       id  val
    0  1   A
    1  2   B

    Or alternatively:

    >>> sf[sf.apply(lambda x: math.log(x['id']) <= 1)]

    Parameters
    ----------
    data : Array | pandas.DataFrame | string
        The actual interpretation of this field is dependent on the "format"
        parameter. If data is an Array or a Pandas DataFrame, the contents are
        stored in the SFrame. If the contents is a string, it is interpreted as
        a file. Files can be read from local file system, or urls (local://,
        hdfs://, s3://, http://, or remote://)

    format : {'auto', 'array', 'dataframe', 'csv', 'sframe'}
        The format of the data. Default 'auto' will automatically infer the
        input data format. The inference rules are simple: If the data is an
        array/dict or a dataframe, it is associated with 'array' and 'dataframe'
        respectively. If the data is a string, it is interpreted as a file, and
        the file extension is used to infer the file format.


    Notes
    -----
    When working with the graphlab EC2 instance, e.g.
    :py:func:`graphlab.aws.launch_EC2()`, SFrame cannot be constructed using
    a local file path, because it involves a potentially large amount of data
    transfer from client to server. However, it is still ok to use a remote
    file path.

    >>> graphlab.aws.launch_EC2('m1.large')
    >>> sf = SFrame('~/mydata/foo.csv') # throws exception
    >>> sf = SFrame('remote:///mydata/foo.csv') # works
    >>> sf = SFrame('http://testdatasets.s3-website-us-west-2.amazonaws.com/users.csv.gz') # works
    >>> sf = SFrame('s3://mybucket/foo.csv') # works
    >>> graphlab.aws.terminate_EC2()

    Similar restriction applies to :py:class:`graphlab.SGraph` and
    :py:class:`graphlab.SArray`.
    """

    # Declaring __slots__ limits SFrame instances to these attribute names.
    __slots__ = ['shape', '__proxy__', '_proxy']

    def __init__(self, data=None,
                 format='auto',
                 _proxy=None):
        """__init__(data=list(), format='auto')
        Construct a new SFrame from a url, a pandas.DataFrame or in-memory data.

        Parameters
        ----------
        data : Array | pandas.DataFrame | string
            The actual interpretation of this field is dependent on the "format"
            parameter. If data is an Array or a Pandas DataFrame, the contents
            are stored in the SFrame. If the contents is a string, it is
            interpreted as a file. Files can be read from local file system, or
            urls (local://, hdfs://, s3://, http://, or remote://)

        format : {'auto', 'array', 'dataframe', 'csv', 'sframe'}
            The format of the data. Default 'auto' will automatically infer the
            input data format. The inference rules are simple: If the data is an
            array/dict or a dataframe, it is associated with 'array' and
            'dataframe' respectively. If the data is a string, it is interpreted
            as a file, and the file extension is used to infer the file format
            ('.csv'/'.csv.gz' and '.txt'/'.txt.gz' are read as comma separated,
            '.tsv'/'.tsv.gz' as tab separated, anything else as a saved SFrame).
            An SArray becomes a single column, an SFrame has its columns
            re-used, and None produces an empty SFrame.

        _proxy : None
            Internal, do not use.

        Raises
        ------
        ValueError
            If `format` is 'auto' and the type of `data` cannot be inferred,
            if `format` names an unknown input type, or if an array mixes
            SArrays with regular values.

        Notes
        -----
        For CSV files, the preferred constructor is SFrame.read_csv since
        that has a lot more options which can be used to control the parser.
        """
        # emit metrics for num_rows, num_columns, and type (local://, s3, hdfs, http)
        tracker = _mt._get_metric_tracker()
        if (_proxy):
            # Wrap an already existing server side object; nothing to load.
            self.__proxy__ = _proxy
        else:
            self.__proxy__ = UnitySFrameProxy(glconnect.get_client())
            _format = None
            if (format == 'auto'):
                if (HAS_PANDAS and isinstance(data, pandas.DataFrame)):
                    _format = 'dataframe'
                    tracker.track('sframe.location.memory', value=1)
                elif isinstance(data, (str, unicode)):
                    # Strings are file locations. The protocol prefix (or
                    # 'local' when there is none) is only used for metrics.
                    if data.find('://') == -1:
                        suffix = 'local'
                    else:
                        suffix = data.split('://')[0]
                    tracker.track(('sframe.location.%s' % (suffix)), value=1)

                    if data.endswith(('.csv', '.csv.gz')):
                        _format = 'csv'
                    elif data.endswith(('.tsv', '.tsv.gz')):
                        _format = 'tsv'
                    elif data.endswith(('.txt', '.txt.gz')):
                        print("Assuming file is csv. For other delimiters, " +
                              "please use `SFrame.read_csv`.")
                        _format = 'csv'
                    else:
                        _format = 'sframe'
                elif isinstance(data, SArray):
                    _format = 'sarray'
                elif isinstance(data, SFrame):
                    _format = 'sframe_obj'
                elif (hasattr(data, 'iteritems')):
                    _format = 'dict'
                    tracker.track('sframe.location.memory', value=1)
                elif hasattr(data, '__iter__'):
                    _format = 'array'
                    tracker.track('sframe.location.memory', value=1)
                elif data is None:
                    _format = 'empty'
                else:
                    raise ValueError('Cannot infer input type for data ' + str(data))
            else:
                _format = format

            tracker.track(('sframe.format.%s' % _format), value=1)

            with cython_context():
                if (_format == 'dataframe'):
                    self.__proxy__.load_from_dataframe(data)
                elif (_format == 'sframe_obj'):
                    for col in data.column_names():
                        self.__proxy__.add_column(data[col].__proxy__, col)
                elif (_format == 'sarray'):
                    self.__proxy__.add_column(data.__proxy__, "")
                elif (_format == 'array'):
                    # Iterables without a length (e.g. generators) pass the
                    # '__iter__' check above but cannot be measured or walked
                    # twice, so materialize them once up front.
                    if not hasattr(data, '__len__'):
                        data = list(data)
                    if len(data) > 0:
                        unique_types = set(type(x) for x in data if x is not None)
                        if len(unique_types) == 1 and SArray in unique_types:
                            # A list of SArrays: one column per SArray.
                            for arr in data:
                                self.add_column(arr)
                        elif SArray in unique_types:
                            raise ValueError("Cannot create SFrame from mix of regular values and SArrays")
                        else:
                            # A list of plain values: a single column.
                            self.__proxy__.add_column(SArray(data).__proxy__, "")
                elif (_format == 'dict'):
                    # Columns are added in sorted key order.
                    for key, val in sorted(data.iteritems()):
                        if isinstance(val, SArray):
                            self.__proxy__.add_column(val.__proxy__, key)
                        else:
                            self.__proxy__.add_column(SArray(val).__proxy__, key)
                elif (_format == 'csv'):
                    url = make_internal_url(data)
                    tmpsf = SFrame.read_csv(url, delimiter=',', header=True)
                    self.__proxy__ = tmpsf.__proxy__
                elif (_format == 'tsv'):
                    url = make_internal_url(data)
                    tmpsf = SFrame.read_csv(url, delimiter='\t', header=True)
                    self.__proxy__ = tmpsf.__proxy__
                elif (_format == 'sframe'):
                    url = make_internal_url(data)
                    self.__proxy__.load_from_sframe_index(url)
                elif (_format == 'empty'):
                    pass
                else:
                    # str() keeps this a ValueError even when a non-string
                    # format was passed in.
                    raise ValueError('Unknown input type: ' + str(format))

        # -1 is reported when the number of rows is not yet known.
        sframe_size = -1
        if self.__has_size__():
            sframe_size = self.num_rows()
        tracker.track('sframe.row.size', value=sframe_size)
        tracker.track('sframe.col.size', value=self.num_cols())

    @staticmethod
    def _infer_column_types_from_lines(first_rows, delimiter, na_values):
        """
        Guess one type per column from a sample of raw, unsplit lines.

        Every line found in the first column of `first_rows` is rewritten as
        a bracketed, comma separated list and parsed by casting an SArray of
        those strings to list. The types of the parsed elements are then
        combined across lines, column by column.

        Parameters
        ----------
        first_rows : SFrame
            Sample whose first column holds one raw line per row.

        delimiter : string
            Delimiter separating the fields inside each raw line.

        na_values : list of str
            Strings which are treated as missing values.

        Returns
        -------
        out : list[type] | type
            One type per column. The bare type `str` (not a list) is returned
            when the sampled lines do not all split into the same number of
            fields.

        Raises
        ------
        RuntimeError
            If `first_rows` has no columns or no rows.
        """
        names = first_rows.column_names()
        if len(names) < 1:
            print("Insufficient number of columns to perform type inference")
            raise RuntimeError("Insufficient columns ")
        if len(first_rows) < 1:
            print("Insufficient number of rows to perform type inference")
            raise RuntimeError("Insufficient rows")

        raw_lines = [line.strip() for line in first_rows[names[0]]]
        # Space delimited input: collapse runs of whitespace into a single
        # separator before the generic delimiter replacement below.
        if delimiter == ' ':
            raw_lines = [','.join(line.split()) for line in raw_lines]

        # Turn every line into "[a,b,c]" so it can be parsed as a list.
        bracketed = ['[' + line.replace(delimiter, ",") + ']' for line in raw_lines]
        parsed_rows = graphlab.SArray(bracketed).astype(list)
        missing_markers = set(na_values)

        # Strings listed in na_values count as missing (None) for typing.
        cleaned_rows = [[None if type(cell) is str and cell in missing_markers else cell
                         for cell in row]
                        for row in parsed_rows]
        per_row_types = [[type(cell) for cell in row] for row in cleaned_rows]

        # Give up unless every sampled line has the same number of fields.
        if len(set(len(row_types) for row_types in per_row_types)) != 1:
            print("Unable to infer column types. Defaulting to str")
            return str

        none_type = type(None)

        # Fold the types of every following row into those of the first row.
        merged = per_row_types[0]
        for row_types in per_row_types[1:]:
            for col in range(len(merged)):
                seen, incoming = merged[col], row_types[col]
                if incoming is seen:
                    # easy case: both rows agree on the type
                    continue
                pair = (seen, incoming)
                if int in pair and float in pair:
                    # int mixed with float widens to float
                    merged[col] = float
                elif array.array in pair and list in pair:
                    # array mixed with list widens to list
                    merged[col] = list
                elif none_type in pair:
                    # a missing value never overrides a known type
                    if incoming is not none_type:
                        merged[col] = incoming
                else:
                    # anything else that disagrees falls back to str
                    merged[col] = str

        # Columns that only ever held missing values become str.
        merged = [str if hint is none_type else hint for hint in merged]

        # With a newline delimiter each line is exactly one string column.
        if delimiter == '\n' and len(merged) != 1:
            merged = [str]

        return merged

    @classmethod
    def _read_csv_impl(cls,
                       url,
                       delimiter=',',
                       header=True,
                       error_bad_lines=False,
                       comment_char='',
                       escape_char='\\',
                       double_quote=True,
                       quote_char='\"',
                       skip_initial_space=True,
                       column_type_hints=None,
                       na_values=["NA"],
                       nrows=None,
                       verbose=True,
                       store_errors=True):
        """
        Constructs an SFrame from a CSV file or a path to multiple CSVs, and
        returns a pair containing the SFrame and optionally
        (if store_errors=True) a dict of filenames to SArrays
        indicating for each file, what are the incorrectly parsed lines
        encountered.

        Parameters
        ----------
        url : string
            Location of the CSV file or directory to load. If URL is a directory
            or a "glob" pattern, all matching files will be loaded.

        delimiter : string
            This describes the delimiter used for parsing csv files. Must be a
            single character.

        header : bool
            If true, uses the first row as the column names.
            Otherwise use the default column names:'X1, X2,...'.

        error_bad_lines: bool
            If true, will fail upon encountering a bad line. If false, will
            continue parsing skipping lines which fail to parse correctly.
            A sample of the first 10 encountered bad lines will be printed.
            Defaults to False.

        comment_char : string
            The character which denotes that the
            remainder of the line is a comment.

        escape_char : string
            Character which begins a C escape sequence

        double_quote : bool
            If two consecutive quotes in a string parses
            to a single quote.

        skip_initial_space : bool
            If extra spaces at the start of a field is ignored

        column_type_hints : None, type, list[type], dict[string, type]
            This provides type hints for each column. By default, this method
            attempts to detect the type of each column automatically.

            Supported types are int, float, str.

            * If a single type is provided, the type will be
              applied to all columns. For instance, column_type_hints=float
              will force all columns to be parsed as float.
            * If a list of types is provided, the types applies
              to each column in order, e.g.[int, float, str]
              will parse the first column as int, second as float and third as
              string.
            * If a dictionary of column name to type is provided,
              each type value in the dictionary is applied to the key it
              belongs to.

        na_values: str or list of str
            A string, or a list of strings to be interpreted as missing values.

        nrows: integer
            If set, only this many rows will be read from the file.

        store_errors: bool
            If true, the output errors dict will be filled.

        verbose : bool
            If True, print the progress.

        Returns
        -------
        out : tuple
            A pair (SFrame, dict) where the dict maps each filename to an
            SArray built from the errors reported for that file.

        Raises
        ------
        TypeError
            If column_type_hints is not None, a type, a list or a dict.
        """
        parsing_config = {
            "delimiter": delimiter,
            "use_header": header,
            "continue_on_failure": not error_bad_lines,
            "comment_char": comment_char,
            "escape_char": escape_char,
            "double_quote": double_quote,
            "quote_char": quote_char,
            "skip_initial_space": skip_initial_space,
            "store_errors": store_errors,
        }
        # The default list is shared between calls; it is only ever read or
        # rebound here, never mutated.
        if isinstance(na_values, str):
            na_values = [na_values]
        if na_values is not None and len(na_values) > 0:
            parsing_config["na_values"] = na_values

        if nrows is not None:
            parsing_config["row_limit"] = nrows

        proxy = UnitySFrameProxy(glconnect.get_client())
        internal_url = make_internal_url(url)

        if (not verbose):
            glconnect.get_client().set_log_progress(False)

        # Everything below runs under try/finally so that progress logging is
        # switched back on even when parsing fails.
        try:
            # Attempt to automatically detect the column types. Either produce a
            # list of types; otherwise default to all str types.
            column_type_inference_was_used = False
            if column_type_hints is None:
                try:
                    # Get the first 100 rows (using all the desired arguments).
                    # Reading with a newline delimiter yields one raw line per row.
                    first_rows = graphlab.SFrame.read_csv(url, nrows=100,
                                     column_type_hints=str,
                                     header=header,
                                     delimiter='\n',
                                     comment_char=comment_char,
                                     escape_char=escape_char,
                                     double_quote=double_quote,
                                     quote_char=quote_char,
                                     skip_initial_space=skip_initial_space)
                    column_type_hints = SFrame._infer_column_types_from_lines(first_rows, delimiter, na_values)
                    # When inference gives up it returns the bare type `str`
                    # instead of a list; iterating it here raises and lands in
                    # the fallback below.
                    typelist = '[' + ','.join(t.__name__ for t in column_type_hints) + ']'
                    print("------------------------------------------------------")
                    print("Inferred types from first line of file as ")
                    print("column_type_hints=" + typelist)
                    print("If parsing fails due to incorrect types, you can correct")
                    print("the inferred type list above and pass it to read_csv in")
                    print("the column_type_hints argument")
                    print("------------------------------------------------------")
                    column_type_inference_was_used = True
                except Exception:
                    # If the above fails, default back to str for all columns.
                    # KeyboardInterrupt/SystemExit are deliberately not caught.
                    column_type_hints = str
                    print('Could not detect types. Using str for each column.')

            if type(column_type_hints) is type:
                type_hints = {'__all_columns__': column_type_hints}
            elif type(column_type_hints) is list:
                # Positional hints are keyed as '__X0__', '__X1__', ...
                type_hints = dict(zip(['__X%d__' % i for i in range(len(column_type_hints))], column_type_hints))
            elif type(column_type_hints) is dict:
                type_hints = column_type_hints
            else:
                raise TypeError("Invalid type for column_type_hints. Must be a dictionary, list or a single type.")

            _mt._get_metric_tracker().track('sframe.csv.parse')

            if url.find('://') == -1:
                suffix = 'local'
            else:
                suffix = url.split('://')[0]

            _mt._get_metric_tracker().track(('sframe.location.%s' % (suffix)), value=1)

            if (not verbose):
                # NOTE(review): the sampling read above may pass through this
                # same code path, which re-enables progress logging when it
                # finishes; re-assert the quiet setting before the real parse.
                glconnect.get_client().set_log_progress(False)

            try:
                with cython_context():
                    errors = proxy.load_from_csvs(internal_url, parsing_config, type_hints)
            except Exception:
                if not column_type_inference_was_used:
                    raise
                # The inferred types may simply have been wrong: try again
                # once, parsing every column as str.
                print("Unable to parse the file with automatic type inference.")
                print("Defaulting to column_type_hints=str")
                type_hints = {'__all_columns__': str}
                with cython_context():
                    errors = proxy.load_from_csvs(internal_url, parsing_config, type_hints)
        finally:
            glconnect.get_client().set_log_progress(True)

        return (cls(_proxy=proxy), { f: SArray(_proxy = es) for (f, es) in errors.iteritems() })

    @classmethod
    def read_csv_with_errors(cls,
                             url,
                             delimiter=',',
                             header=True,
                             comment_char='',
                             escape_char='\\',
                             double_quote=True,
                             quote_char='\"',
                             skip_initial_space=True,
                             column_type_hints=None,
                             na_values=["NA"],
                             nrows=None,
                             verbose=True):
        """
        Builds an SFrame from a CSV file (or a path matching several CSV
        files) and also reports the lines that could not be parsed.

        Lines that fail to parse never abort the load: they are collected and
        returned alongside the SFrame.

        Parameters
        ----------
        url : string
            Location of the CSV file or directory to load. If URL is a directory
            or a "glob" pattern, all matching files will be loaded.

        delimiter : string
            The delimiter used for parsing csv files. Must be a single
            character.

        header : bool
            If true, uses the first row as the column names.
            Otherwise use the default column names: 'X1, X2,...'.

        comment_char : string
            The character which denotes that the remainder of the line is a
            comment.

        escape_char : string
            Character which begins a C escape sequence.

        double_quote : bool
            If True, two consecutive quotes in a string are parsed to a single
            quote.

        quote_char : string
            The character used to quote fields. Forwarded unchanged to the
            CSV parser.

        skip_initial_space : bool
            If True, extra spaces at the start of a field are ignored.

        column_type_hints : None, type, list[type], dict[string, type]
            Type hints for each column. By default, this method attempts to
            detect the type of each column automatically.

            Supported types are int, float, str.

            * If a single type is provided, the type will be applied to all
              columns. For instance, column_type_hints=float will force all
              columns to be parsed as float.
            * If a list of types is provided, the types apply to each column
              in order, e.g. [int, float, str] will parse the first column as
              int, second as float and third as string.
            * If a dictionary of column name to type is provided, each type
              value in the dictionary is applied to the key it belongs to.
              For instance {'user':int} will hint that the column called
              "user" should be parsed as an integer, and the rest will default
              to string.

        na_values : str | list of str
            A string, or a list of strings to be interpreted as missing values.

        nrows : integer
            If set, only this many rows will be read from the file.

        verbose : bool
            If True, print the progress.

        Returns
        -------
        out : tuple
            A pair (SFrame, dict). The dict maps each filename to an SArray
            holding the incorrectly parsed lines encountered in that file.
        """
        parse_options = dict(
            delimiter=delimiter,
            header=header,
            # Errors are being stored, so parsing must not fail on bad lines.
            error_bad_lines=False,
            comment_char=comment_char,
            escape_char=escape_char,
            double_quote=double_quote,
            quote_char=quote_char,
            skip_initial_space=skip_initial_space,
            column_type_hints=column_type_hints,
            na_values=na_values,
            nrows=nrows,
            verbose=verbose,
            store_errors=True)
        return cls._read_csv_impl(url, **parse_options)
    @classmethod
    def read_csv(cls,
                 url,
                 delimiter=',',
                 header=True,
                 error_bad_lines=False,
                 comment_char='',
                 escape_char='\\',
                 double_quote=True,
                 quote_char='\"',
                 skip_initial_space=True,
                 column_type_hints=None,
                 na_values=["NA"],
                 nrows=None,
                 verbose=True):
        """
        Builds an SFrame from a CSV file or a path matching several CSV files.

        Parameters
        ----------
        url : string
            Location of the CSV file or directory to load. If URL is a directory
            or a "glob" pattern, all matching files will be loaded.

        delimiter : string
            The delimiter used for parsing csv files. Must be a single
            character.

        header : bool
            If true, uses the first row as the column names.
            Otherwise use the default column names: 'X1, X2,...'.

        error_bad_lines : bool
            If true, will fail upon encountering a bad line. If false, will
            continue parsing, skipping lines which fail to parse correctly.
            A sample of the first 10 encountered bad lines will be printed.
            Defaults to False.

        comment_char : string
            The character which denotes that the remainder of the line is a
            comment.

        escape_char : string
            Character which begins a C escape sequence.

        double_quote : bool
            If True, two consecutive quotes in a string are parsed to a single
            quote.

        quote_char : string
            The character used to quote fields. Forwarded unchanged to the
            CSV parser.

        skip_initial_space : bool
            If True, extra spaces at the start of a field are ignored.

        column_type_hints : None, type, list[type], dict[string, type]
            Type hints for each column. By default, this method attempts to
            detect the type of each column automatically.

            Supported types are int, float, str.

            * If a single type is provided, the type will be applied to all
              columns. For instance, column_type_hints=float will force all
              columns to be parsed as float.
            * If a list of types is provided, the types apply to each column
              in order, e.g. [int, float, str] will parse the first column as
              int, second as float and third as string.
            * If a dictionary of column name to type is provided, each type
              value in the dictionary is applied to the key it belongs to.
              For instance {'user':int} will hint that the column called
              "user" should be parsed as an integer, and the rest will default
              to string.

        na_values : str | list of str
            A string, or a list of strings to be interpreted as missing values.

        nrows : integer
            If set, only this many rows will be read from the file.

        verbose : bool
            If True, print the progress.

        Returns
        -------
        out : SFrame
            The first element of the pair produced by the shared CSV reader;
            the per-file error information is discarded.
        """
        parse_options = dict(
            delimiter=delimiter,
            header=header,
            error_bad_lines=error_bad_lines,
            comment_char=comment_char,
            escape_char=escape_char,
            double_quote=double_quote,
            quote_char=quote_char,
            skip_initial_space=skip_initial_space,
            column_type_hints=column_type_hints,
            na_values=na_values,
            nrows=nrows,
            verbose=verbose,
            store_errors=False)
        frame_and_errors = cls._read_csv_impl(url, **parse_options)
        return frame_and_errors[0]

    def __repr__(self):
        """
        Returns a string description of the frame: the column listing, the
        row count ("Unknown" when the size is not available yet) and the
        printed data, or "[]" when the head of the frame has no rows.
        """
        pieces = [self.__get_column_description__()]

        if self.__has_size__():
            pieces.append("Rows: " + str(len(self)) + "\n\n")
        else:
            pieces.append("Rows: Unknown" + "\n\n")

        pieces.append("Data:\n")
        has_rows = len(self.head()) > 0
        pieces.append(str(self) if has_rows else "\t[]")
        return "".join(pieces)

    def __get_column_description__(self):
        colnames = self.column_names()
        coltypes = self.column_types()
        ret = "Columns:\n"
        if len(colnames) > 0:
            for i in range(len(colnames)):
                ret = ret + "\t" + colnames[i] + "\t" + coltypes[i].__name__ + "\n"
            ret = ret + "\n"
        else:
            ret = ret + "\tNone\n\n"
        return ret

    def __get_pretty_tables__(self, wrap_text=False, max_row_width=80, max_column_width=30, max_columns=20, max_rows_to_display=60):
        """
        Returns a list of pretty print tables representing the current SFrame.
        The columns are spread over as many tables as needed so that each
        table stays within max_row_width. If the number of columns is larger
        than max_columns, the last pretty table will contain an extra column
        of "...". If the SFrame has more rows than are displayed, the last
        table gets an extra row of "...".

        Parameters
        ----------
        wrap_text: if True, long cell values are wrapped over at most two
            lines before being truncated; otherwise they are truncated on a
            single line
        max_row_width: max number of characters per table
        max_column_width: max number of characters per column
        max_columns: max number of columns displayed over all tables
        max_rows_to_display: number of rows taken from the head of the SFrame

        Returns
        -------
        out: list[PrettyTable]
        """
        headsf = self.head(max_rows_to_display)
        if headsf.shape == (0, 0):
            return [PrettyTable()]

        def _truncate_str(s, wrap_str=False):
            """
            Truncate and optionally wrap the input string as unicode, replace
            unconvertible character with a diamond ?.
            """
            s = repr(s)
            # repr adds the escape characters. but also adds quotes around
            # the string
            if (len(s) >= 2):
                s = s[1:-1]
            if len(s) <= max_column_width:
                return unicode(s, errors='replace')

            # textwrap.wrap returns an empty list for whitespace-only text;
            # in that case fall back to plain single-line truncation instead
            # of indexing into an empty list.
            wrapped_lines = wrap(s, max_column_width) if wrap_str else []
            if wrapped_lines:
                # wrap the text and take at most 2 rows
                ret = "\n".join(wrapped_lines[:2])
                last_line = wrapped_lines[:2][-1]
                if len(last_line) >= max_column_width or len(wrapped_lines) > 2:
                    # make room for the ' ...' marker on the last line
                    space_left = max_column_width - len(last_line)
                    space_truncate = max(0, 4 - space_left)
                    if space_truncate > 0:
                        ret = ret[:-space_truncate] + ' ...'
                    else:
                        ret = ret + ' ...'
            else:
                ret = s[:max_column_width]
                ret = ret[:-4] + ' ...'
            return unicode(ret, errors='replace')

        columns = self.column_names()[:max_columns]
        columns.reverse()  # reverse the order of columns and we will pop from the end

        num_column_of_last_table = 0
        row_of_tables = []
        # let's build a list of tables with max_columns
        # each table should satisfy, max_row_width, and max_column_width
        while len(columns) > 0:
            tbl = PrettyTable()
            table_width = 0
            num_column_of_last_table = 0
            while len(columns) > 0:
                col = columns.pop()
                # check the max length of element in the column
                if len(headsf) > 0:
                    col_width = min(max_column_width, max(len(str(x)) for x in headsf[col]))
                else:
                    col_width = max_column_width
                # The first column of a table is always accepted: pushing it
                # back would produce an empty table and the outer loop would
                # never make progress.
                if num_column_of_last_table == 0 or (table_width + col_width < max_row_width):
                    # truncate the header if necessary
                    header = _truncate_str(col, wrap_text)
                    tbl.add_column(header, [_truncate_str(str(x), wrap_text) for x in headsf[col]])
                    table_width = str(tbl).find('\n')
                    num_column_of_last_table += 1
                else:
                    # the column does not fit in the current table, push it back to columns
                    columns.append(col)
                    break
            tbl.align = 'c'
            row_of_tables.append(tbl)

        # add a column of all "..." if there are more columns than displayed
        if self.num_cols() > max_columns:
            if not row_of_tables:
                # no column was selected for display (e.g. max_columns is 0);
                # start a table so the "..." column has somewhere to go
                row_of_tables.append(PrettyTable())
            row_of_tables[-1].add_column('...', ['...'] * len(headsf))
            num_column_of_last_table += 1

        # add a row of all "..." if there are more rows than displayed
        if self.__has_size__() and self.num_rows() > headsf.num_rows():
            row_of_tables[-1].add_row(['...'] * num_column_of_last_table)
        return row_of_tables

    def print_rows(self, num_rows=10, num_columns=40):
        """
        Print the first M rows and N columns of the SFrame in human readable
        format, followed by a "[rows x columns]" footer.

        Parameters
        ----------
        num_rows : int
            Number of rows to display.

        num_columns : int
            Maximum number of columns to display.
        """
        tables = self.__get_pretty_tables__(wrap_text=False,
                                            max_rows_to_display=num_rows,
                                            max_columns=num_columns)
        footer = "[%d rows x %d columns]\n" % self.shape
        rendered = [str(tb) for tb in tables]
        print('\n'.join(rendered) + "\n" + footer)

    # Footer lines appended by __str__ and _repr_html_ when the SFrame has
    # more rows than the number of rows displayed.
    FOOTER_STRS = ['Note: Only the head of the SFrame is printed.',
                   'You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.']

    # Footer lines appended by __str__ and _repr_html_ when __has_size__()
    # is false, i.e. the number of rows is not known yet.
    LAZY_FOOTER_STRS = ['Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.',
                        'You can use len(sf) to force materialization.']

    def __str__(self, num_rows=10):
        """
        Returns a string containing the first num_rows (10 by default) rows of
        the frame as text tables, followed by a footer describing its shape.

        When the size of the frame is not known yet the row count is shown as
        "?" and the lazy-evaluation note is appended; otherwise the note about
        print_rows is appended only if the frame has more than num_rows rows.
        """
        tables = self.__get_pretty_tables__(wrap_text=False, max_rows_to_display=num_rows)

        if not self.__has_size__():
            footer = '[? rows x %d columns]\n' % self.num_columns()
            footer += '\n'.join(self.LAZY_FOOTER_STRS)
        else:
            footer = '[%d rows x %d columns]\n' % self.shape
            if self.num_rows() > num_rows:
                footer += '\n'.join(self.FOOTER_STRS)

        rendered = [str(tb) for tb in tables]
        return '\n'.join(rendered) + "\n" + footer

    def _repr_html_(self):
        """
        Returns an HTML rendering of the first 10 rows of the frame: the
        tables produced by __get_pretty_tables__ (with text wrapping enabled)
        followed by a shape footer, all wrapped in a scrollable <div>.
        """
        row_limit = 10
        tables = self.__get_pretty_tables__(wrap_text=True,
                                            max_row_width=120,
                                            max_columns=40,
                                            max_column_width=25,
                                            max_rows_to_display=row_limit)

        if not self.__has_size__():
            footer = '[? rows x %d columns]<br/>' % self.num_columns()
            footer += '<br/>'.join(self.LAZY_FOOTER_STRS)
        else:
            footer = '[%d rows x %d columns]<br/>' % self.shape
            if self.num_rows() > row_limit:
                footer += '<br/>'.join(self.FOOTER_STRS)

        opening = '<div style="max-height:1000px;max-width:1500px;overflow:auto;">'
        closing = '\n</div>'
        html_tables = [tb.get_html_string(format=True) for tb in tables]
        return opening + '\n'.join(html_tables) + "\n" + footer + closing

    def __nonzero__(self):
        """
        Returns true if the frame is not empty.
        """
        return self.num_rows() != 0

    def __len__(self):
        """
        Returns the number of rows of the sframe
        """
        return self.num_rows()

    def __copy__(self):
        """
        Returns a shallow copy of the sframe, built by selecting every
        column of the current frame.
        """
        all_columns = self.column_names()
        return self.select_columns(all_columns)

    def _row_selector(self, other):
        """
        Where other is an SArray of identical length as the current Frame,
        this returns a selection of a subset of rows in the current SFrame
        where the corresponding row in the selector is non-zero.

        Raises IndexError when the lengths differ. When other is not exactly
        an SArray, nothing is selected and None is returned.
        """
        if type(other) is not SArray:
            # Only SArray selectors are handled here.
            return None

        if len(other) != len(self):
            raise IndexError("Cannot perform logical indexing on arrays of different length.")

        with cython_context():
            filtered_proxy = self.__proxy__.logical_filter(other.__proxy__)
            return SFrame(_proxy=filtered_proxy)

    def dtype(self):
        """
        Returns the column types. Same as :py:meth:`column_types`.

        Returns
        -------
        out : list[type]
            Column types of the SFrame.
        """

        return self.column_types()

    def num_rows(self):
        """
        Returns the number of rows.

        Returns
        -------
        out : int
            Number of rows in the SFrame.
        """
        return self.__proxy__.num_rows()

    def num_cols(self):
        """
        Returns the number of columns.

        Returns
        -------
        out : int
            Number of columns in the SFrame.
        """
        return self.__proxy__.num_columns()

    def num_columns(self):
        """
        Returns the number of columns.

        Returns
        -------
        out : int
            Number of columns in the SFrame.
        """
        return self.__proxy__.num_columns()

    def column_names(self):
        """
        Returns the column names.

        Returns
        -------
        out : list[string]
            Column names of the SFrame.
        """
        return self.__proxy__.column_names()

    def column_types(self):
        """
        Returns the column types.

        Returns
        -------
        out : list[type]
            Column types of the SFrame.
        """
        return self.__proxy__.dtype()

    def head(self, n=10):
        """
        Returns a new SFrame holding the first n rows of this SFrame.

        Parameters
        ----------
        n : int
            The number of rows to fetch.

        Returns
        -------
        out : SFrame
            A new SFrame which contains the first n rows of the current SFrame
        """
        head_proxy = self.__proxy__.head(n)
        return SFrame(_proxy=head_proxy)

    def to_dataframe(self):
        """
        Returns a pandas.DataFrame which contains all rows of the SFrame

        This operation will construct a pandas.DataFrame in memory. Care must
        be taken when size of the returned object is big.

        Returns
        -------
        out : pandas.DataFrame
            The dataframe which contains all rows of SFrame

        Raises
        ------
        AssertionError
            If pandas is not available (HAS_PANDAS is false).
        """
        assert HAS_PANDAS, 'pandas is not installed.'
        df = pandas.DataFrame()
        # Fetch the schema once instead of asking the proxy again for every
        # column inside the loop.
        column_names = self.column_names()
        column_types = self.column_types()
        for column_name, column_type in zip(column_names, column_types):
            df[column_name] = list(self[column_name])
            # An empty list carries no type information, so cast the empty
            # column explicitly to the SFrame column type.
            if len(df[column_name]) == 0:
                df[column_name] = df[column_name].astype(column_type)
        return df

    def tail(self, n=10):
        """
        Returns a new SFrame holding the last n rows of this SFrame.

        Parameters
        ----------
        n : int
            The number of rows to fetch.

        Returns
        -------
        out : SFrame
            A new SFrame which contains the last n rows of the current SFrame
        """
        tail_proxy = self.__proxy__.tail(n)
        return SFrame(_proxy=tail_proxy)

    def apply(self, fn, dtype=None, seed=None):
        """
        Returns a new SArray of dtype where each element in this SArray is
        transformed by fn(x) where x is a row in the sframe, as a dictionary.
        The fn should return a value which can be cast into dtype.

        fn is always run locally on the first 10 rows of the SFrame before
        the full transformation. If dtype is not specified, the results of
        that dry run are used to make a guess of the target datatype.

        Parameters
        ----------
        fn : function
            The function to transform each row of the sframe. The return
            type should be convertible to dtype if dtype is not None.

        dtype : dtype
            The dtype of the new SArray. If None, the results of fn on the
            first 10 rows are used to guess the target data type.

        seed : int, optional
            Used as the seed if a random number generator is included in fn.
            If None, the current time is used. An explicit seed of 0 is
            honored.

        Returns
        -------
        out : SArray
            The SArray transformed by fn.  Each element of the SArray is of
            type ``dtype``

        Raises
        ------
        AssertionError
            If fn is not a function.

        Examples
        --------

        >>> import graphlab
        >>> sf = graphlab.SFrame.read_csv('netflix.csv')

        The following code creates a new SArray where each element is the
        string concatenation of the 'user_id', 'movie_id' and 'rating' columns.

        >>> sa = sf.apply(lambda x: str(x['user_id']) + str(x['movie_id']) + str(x['rating']))

        """
        assert inspect.isfunction(fn), "Input must be a function"
        # Dry run on a small prefix: surfaces errors in fn early and provides
        # sample outputs for type inference.
        test_sf = self[:10]
        dryrun = [fn(row) for row in test_sf]
        if dtype is None:
            dtype = SArray(dryrun).dtype()

        # Compare against None so that an explicit seed of 0 is not replaced
        # by a time-based seed.
        if seed is None:
            seed = int(time.time())

        _mt._get_metric_tracker().track('sframe.apply')

        with cython_context():
            return SArray(_proxy=self.__proxy__.transform(fn, dtype, seed))

    def flat_map(self, column_names, fn, column_types='auto', seed=None):
        """
        Map each row of the SFrame to multiple rows in the new sframe.
        The output of the lambda function must have type List[List[...]],
        and all inner elements should have the same length, and types.

        If column_types is not specified, the first 10 rows of the
        SFrame are used to determine the column types of the returned sframe.

        Parameters
        ----------
        column_names : list[str]
            The column names for the returned sframe.

        fn : function
            The function that maps each of the sframe row into
            multiple rows, returning List[List[...]].
            All rows must have the same length and types.

        column_types : list[type], optional.
            The column types of the output sframe. Default value
            will be automatically inferred by running `fn` on the first
            10 rows of the input.

        seed : int, optional
            Used as the seed if a random number generator is included in fn.
            If None, the current time is used. An explicit seed of 0 is
            honored.

        Returns
        -------
        out : SFrame
            A new SFrame containing the results of the flat_map of the
            original SFrame.

        Raises
        ------
        AssertionError
            If fn is not a function, if column_types is not a list, or if
            the number of column types differs from the number of column
            names.
        TypeError
            During type inference, if fn does not return a list of lists, or
            if the sampled output rows do not all share exactly one
            combination of length and types (this includes the case where the
            sample produces no output rows at all).

        Examples
        ---------

        First read wikipedia data and convert to word-count, each row is one document:

        >>> import graphlab
        >>> sf = graphlab.SFrame.read_csv('wikipedia.csv', header=False)
        >>> sf.rename({'X1': 'raw_text'})
        >>> # create a new column contain a sparse dictionary of {word: count} for each doc.
        >>> sf['wc_per_doc'] = sf['raw_text'].count_words()

        Use flat_map to get total count of words in the document:

        >>> new_sf = sf.flat_map(["word", "count"], lambda x: [list(p) for p in x['wc_per_doc'].iteritems()])
        >>> total_wc = new_sf.group_by(["word"], {'total_count':graphlab.aggregate.SUM("count")})

        """
        assert inspect.isfunction(fn), "Input must be a function"
        # Compare against None so that an explicit seed of 0 is not replaced
        # by a time-based seed.
        if seed is None:
            seed = int(time.time())

        _mt._get_metric_tracker().track('sframe.flat_map')

        # determine the column_types
        if column_types == 'auto':
            types = set()
            sample_rows = self[0:10]
            results = [fn(row) for row in sample_rows]
            for mapped_rows in results:
                if type(mapped_rows) is not list:
                    raise TypeError("Output type of the lambda function must be a list of lists")
                for mapped_row in mapped_rows:
                    if type(mapped_row) is not list:
                        raise TypeError("Output type of the lambda function must be a list of lists")
                    # the signature of a row is the tuple of its value types
                    types.add(tuple([type(v) for v in mapped_row]))
            if len(types) != 1:
                raise TypeError("Mapped rows must have the same length and types")

            column_types = list(types.pop())

        assert type(column_types) is list
        assert len(column_types) == len(column_names), "Number of output columns must match the size of column names"
        with cython_context():
            return SFrame(_proxy=self.__proxy__.flat_map(fn, column_names, column_types, seed))

    def sample(self, fraction, seed=None):
        """
        Return an SFrame with a sample of the current SFrame's rows.

        Parameters
        ----------
        fraction : float
            Approximate fraction of the rows to fetch. Must be between 0 and 1.
            The number of rows returned is approximately the fraction times the
            number of rows.

        seed : int, optional
            Seed for the random number generator. If None, the current time
            is used. An explicit seed of 0 is honored.

        Returns
        -------
        out : SFrame
            A new SFrame containing sampled rows of the current SFrame. If
            the current SFrame has no rows or no columns, the current SFrame
            itself is returned.

        Raises
        ------
        ValueError
            If fraction is greater than 1 or less than 0.

        Examples
        --------
        Suppose we have an SFrame called my_sf with 6,145 rows.

        >>> print my_sf.num_rows()
        6145

        Retrieve about 30% of the SFrame rows.

        >>> sub_sf = my_sf.sample(fraction=0.3, seed=None)
        >>> print sub_sf.num_rows()
        1886

        For prototyping and testing, it is important to be able to set the seed
        so the same subset of rows can be retrieved each time.

        >>> sub_sf = my_sf.sample(fraction=0.3, seed=5)

        Notes
        -----
        For more examples of SFrame.sample, see the `Introduction to SFrames
        <http://graphlab.com/learn/notebooks/introduction_to_sframes.html#Splitting-and-Sampling>`_
        notebook.
        """
        if (fraction > 1 or fraction < 0):
            raise ValueError('Invalid sampling rate: ' + str(fraction))

        # Compare against None so that an explicit seed of 0 is not replaced
        # by a time-based seed.
        if seed is None:
            seed = int(time.time())

        _mt._get_metric_tracker().track('sframe.sample')

        if (self.num_rows() == 0 or self.num_cols() == 0):
            return self
        else:
            with cython_context():
                return SFrame(_proxy=self.__proxy__.sample(fraction, seed))

    def random_split(self, fraction, seed=None):
        """
        Return a pair of SFrames by random splitting the current one. The first
        SFrame contains *M* rows, sampled uniformly randomly (without
        replacement) from the original SFrame. *M* is approximately the fraction
        times the original number of rows. The second SFrame contains the
        remaining rows of the original SFrame.

        Parameters
        ----------
        fraction : float
            Approximate fraction of the rows to fetch for the first returned
            SFrame. Must be between 0 and 1.

        seed : int, optional
            Seed for the random number generator. If None, the current time
            is used. An explicit seed of 0 is honored.

        Returns
        -------
        out : tuple [SFrame]
            Two new SFrames. If the current SFrame has no rows or no columns,
            two empty SFrames are returned.

        Raises
        ------
        ValueError
            If fraction is greater than 1 or less than 0, or if converting
            seed to int raises a ValueError.

        Examples
        --------
        Suppose we have an SFrame called my_sf with 6,145 rows.

        >>> print my_sf.num_rows()
        6145

        Randomly split my_sf into training and testing datasets with about a 70%/30%
        split.

        >>> sf_train, sf_test = my_sf.random_split(fraction=0.7, seed=None)
        >>> print sf_train.num_rows(), sf_test.num_rows()
        4294 1851

        For prototyping and testing, it is important to be able to set the seed
        so the same split can be reproduced in each call.

        >>> sf_train, sf_test = my_sf.random_split(fraction=0.7, seed=5)

        Notes
        -----
        For more examples of SFrame.random_split, see the `Introduction to SFrames
        <http://graphlab.com/learn/notebooks/introduction_to_sframes.html#Splitting-and-Sampling>`_
        notebook.
        """
        if (fraction > 1 or fraction < 0):
            raise ValueError('Invalid sampling rate: ' + str(fraction))
        if (self.num_rows() == 0 or self.num_cols() == 0):
            return (SFrame(), SFrame())

        # Compare against None so that an explicit seed of 0 is not replaced
        # by a time-based seed.
        if seed is None:
            seed = int(time.time())

        # The server side requires this to be an int, so cast if we can
        try:
            seed = int(seed)
        except ValueError:
            raise ValueError('The \'seed\' parameter must be of type int.')

        _mt._get_metric_tracker().track('sframe.random_split')

        with cython_context():
            proxy_pair = self.__proxy__.random_split(fraction, seed)
            return (SFrame(data=[], _proxy=proxy_pair[0]), SFrame(data=[], _proxy=proxy_pair[1]))

    def topk(self, column_name, k=10, reverse=False):
        """
        Returns the top k rows of the SFrame according to column_name, sorted
        by that column.

        Parameters
        ----------
        column_name : string
            The column to sort on

        k : int
            The number of rows to return

        reverse : bool
            If True, return the topk rows in ascending order, otherwise, in
            descending order. By default in descending order.

        Returns
        -------
        out : SFrame
            an SFrame containing topk rows sorted by column_name.

        Raises
        ------
        TypeError
            If column_name is not a str.
        """
        if type(column_name) is not str:
            raise TypeError("column_name must be a string")

        _mt._get_metric_tracker().track('sframe.topk')

        # Select the rows flagged by the column's top-k index, then order them
        # by the same column.
        selected_rows = self[self[column_name].topk_index(k, reverse)]
        return selected_rows.sort(column_name, ascending=reverse)

    def save(self, filename, format=None):
        """
        Save the SFrame to file/directory.

        Parameters
        ----------
        filename : string
            The location to save the SFrame. Either a local directory or a
            remote URL. If the format is 'binary', a directory will be created
            at the location which will contain the sframe.

        format : {'binary', 'csv'}, optional
            Format in which to save the SFrame. Binary saved sframes can be
            re-loaded much faster and without any format conversion losses.
            If not given, will try to infer the format from filename given. If file
            name ends with '.csv' or '.csv.gz', then save as 'csv' format, otherwise
            save as 'binary' format. When 'csv' is requested explicitly and the
            file name has neither extension, '.csv' is appended to it.

        Raises
        ------
        ValueError
            If format is given and is neither 'csv' nor 'binary'.

        Notes
        ------
        The binary saved SFrame may be loaded later on with :py:func:`graphlab.load_sframe()`
        or loaded through SFrame constructor:

            >>> sf = graphlab.SFrame('saved_sframe')
            >>> sf = graphlab.load_sframe('saved_sframe')

        """

        _mt._get_metric_tracker().track('sframe.save', properties={'format':format})
        # Format names are compared by value (==), not identity: `is` only
        # works for strings that happen to be interned.
        if format is None:
            if filename.endswith(('.csv', '.csv.gz')):
                format = 'csv'
            else:
                format = 'binary'
        elif format == 'csv':
            if not filename.endswith(('.csv', '.csv.gz')):
                filename = filename + '.csv'
        elif format != 'binary':
            raise ValueError("Invalid format: {}. Supported formats are 'csv' and 'binary'".format(format))

        ## Save the SFrame
        url = make_internal_url(filename)

        with cython_context():
            if format == 'binary':
                self.__proxy__.save(url)

            elif format == 'csv':
                assert filename.endswith(('.csv', '.csv.gz'))
                self.__proxy__.save_as_csv(url, {})
            else:
                raise ValueError("Unsupported format: {}".format(format))

    def select_column(self, key):
        """
        Return the SArray for the single column named by key.

        Parameters
        ----------
        key : str
            The column name

        Returns
        -------
        out : graphlab.SArray
            The sarray that is referred by 'key'

        Raises
        ------
        TypeError
            If key is not a str. Errors raised while looking up the column
            (for example when the key is not found) are propagated.
        """
        if isinstance(key, str):
            with cython_context():
                column_proxy = self.__proxy__.select_column(key)
                return SArray(data=[], _proxy=column_proxy)
        raise TypeError("Invalid key type: must be str")

    def select_columns(self, keylist):
        """
        Returns an SFrame made of the columns listed in 'keylist'.

        Parameters
        ----------
        keylist : list
            The list of column names

        Returns
        -------
        out : graphlab.SFrame
            A new SFrame that is made up of the columns
            referred to in 'keylist' in this current SFrame

        Raises
        ------
        TypeError
            If keylist is not iterable or contains anything other than str.
        ValueError
            If the same key appears more than once in keylist.
        """
        if not hasattr(keylist, '__iter__'):
            raise TypeError("keylist must be an iterable")
        non_str_found = not all([isinstance(x, str) for x in keylist])
        if non_str_found:
            raise TypeError("Invalid key type: must be str")

        unique_keys = set(keylist)
        if len(unique_keys) != len(keylist):
            # Fewer unique keys than keys: report a repeated key.
            repeated = [key for key in unique_keys if keylist.count(key) > 1]
            if repeated:
                raise ValueError("There are duplicate keys in key list: '" + repeated[0] + "'")

        with cython_context():
            selected_proxy = self.__proxy__.select_columns(keylist)
            return SFrame(data=[], _proxy=selected_proxy)

    def add_column(self, data, name=""):
        """
        Append one column to this SFrame, in place.

        Parameters
        ----------
        data : SArray
            The column to add.  Its length is expected to match the other
            columns of the SFrame.

        name : string
            Name for the new column.  If left empty, a default name is
            chosen.

        Returns
        -------
        out : SFrame
            The current SFrame (self), to allow chaining.

        Raises
        ------
        TypeError
            If 'data' is not an SArray or 'name' is not a str.
        """
        # Validate the column first, then its name; the first failure wins.
        problem = None
        if not isinstance(data, SArray):
            problem = "Must give column as SArray"
        elif not isinstance(name, str):
            problem = "Invalid column name: must be str"
        if problem is not None:
            raise TypeError(problem)

        with cython_context():
            self.__proxy__.add_column(data.__proxy__, name)
        return self

    def add_columns(self, data, namelist=None):
        """
        Append several columns to this SFrame, in place.

        Parameters
        ----------
        data : list of SArray | SFrame
            The columns to add, either as a list of SArrays or as another
            SFrame whose columns are all added.

        namelist : list of string
            One name per column in 'data'.  Ignored when 'data' is an SFrame,
            in which case that SFrame's own column names are used.

        Returns
        -------
        out : SFrame
            The current SFrame (self).

        Raises
        ------
        ValueError
            If 'data' is an SFrame and one of its column names already
            exists in this SFrame.
        TypeError
            If 'data' or 'namelist' is not iterable, or contains elements of
            the wrong type (SArray and str respectively).
        """
        if isinstance(data, SFrame):
            incoming_names = data.column_names()
            datalist = [data.select_column(col) for col in incoming_names]
            namelist = incoming_names

            # Refuse to shadow a column that is already present.
            taken = set(self.column_names())
            for col in namelist:
                if col in taken:
                    raise ValueError("Column '" + col + "' already exists in current SFrame")
        else:
            datalist = data
            if not hasattr(datalist, '__iter__'):
                raise TypeError("datalist must be an iterable")
            if not hasattr(namelist, '__iter__'):
                raise TypeError("namelist must be an iterable")

            columns_ok = all([isinstance(item, SArray) for item in datalist])
            if not columns_ok:
                raise TypeError("Must give column as SArray")
            names_ok = all([isinstance(item, str) for item in namelist])
            if not names_ok:
                raise TypeError("Invalid column name in list: must all be str")

        column_proxies = [column.__proxy__ for column in datalist]
        with cython_context():
            self.__proxy__.add_columns(column_proxies, namelist)
        return self

    def remove_column(self, name):
        """
        Drop the named column from this SFrame, in place.

        Parameters
        ----------
        name : string
            The name of the column to remove.

        Returns
        -------
        out : SFrame
            The current SFrame (self), now without the column.

        Raises
        ------
        KeyError
            If no column is called 'name'.
        """
        current_names = self.column_names()
        try:
            position = current_names.index(name)
        except ValueError:
            raise KeyError('Cannot find column %s' % name)
        # The proxy addresses columns by position rather than by name.
        with cython_context():
            self.__proxy__.remove_column(position)
        return self

    def swap_columns(self, column_1, column_2):
        """
        Exchange the positions of two columns, in place.

        Parameters
        ----------
        column_1 : string
            Name of the first column.

        column_2 : string
            Name of the second column.

        Returns
        -------
        out: SFrame
            The current SFrame (self).

        Raises
        ------
        ValueError
            If either name is not a column of this SFrame (raised by the
            list lookup).
        """
        names = self.column_names()
        # Resolve both names to positions before touching the proxy.
        first, second = [names.index(col) for col in (column_1, column_2)]
        with cython_context():
            self.__proxy__.swap_columns(first, second)
        return self

    def rename(self, names):
        """
        Rename the columns using the 'names' dict.  This changes the names of
        the columns given as the keys and replaces them with the names given as
        the values.

        This operation modifies the current SFrame in place and returns self.

        Parameters
        ----------
        names : dict[string, string]
            Dictionary of [old_name, new_name].  Any dict subclass (for
            example an ordered dictionary) is accepted.

        Returns
        -------
        out: SFrame
            The current SFrame.

        Raises
        ------
        TypeError
            If 'names' is not a dictionary.
        ValueError
            If a key of 'names' is not an existing column name.  All keys are
            checked before any column is renamed.

        Examples
        --------

        >>> sf.rename({'X1': 'name', 'X2':'address'})
        This renames column 'X1' to 'name' and column 'X2' to 'address'
        """
        # isinstance (rather than an exact type match) lets dict subclasses in.
        if not isinstance(names, dict):
            raise TypeError('names must be a dictionary: oldname -> newname')
        all_columns = set(self.column_names())
        for k in names:
            if k not in all_columns:
                raise ValueError('Cannot find column %s in the SFrame' % k)
        with cython_context():
            for k in names:
                # Column names are re-read on every iteration, so a rename
                # made earlier in this loop is visible to later lookups.
                colid = self.column_names().index(k)
                self.__proxy__.set_column_name(colid, names[k])
        return self

    def __getitem__(self, key):
        """
        Index into the SFrame.  The behaviour depends on the exact type of
        'key':

        - SArray: performs a logical filter (row selection).
        - list: same as select_columns, returns an SFrame.
        - str: same as select_column, returns an SArray.
        - int: returns a single row of the SFrame as a dictionary.  Negative
          values count from the end.
        - slice: returns an SFrame with the sliced rows.  Negative start/stop
          values count from the end and are clamped at the first row.

        See :py:class:`graphlab.SFrame` for usage examples.

        Raises
        ------
        IndexError
            If an integer key falls outside [-len(self), len(self)).
        TypeError
            If 'key' is of any other type.
        """
        if type(key) is SArray:
            return self._row_selector(key)
        elif type(key) is list:
            return self.select_columns(key)
        elif type(key) is str:
            return self.select_column(key)
        elif type(key) is int:
            num_rows = len(self)
            if key < 0:
                key = num_rows + key
            # key is still negative when the caller went past the first row;
            # that must be an error, not a negative offset sent to the proxy.
            if key < 0 or key >= num_rows:
                raise IndexError("SFrame index out of range")
            return list(SFrame(_proxy = self.__proxy__.copy_range(key, 1, key+1)))[0]
        elif type(key) is slice:
            start = key.start
            stop = key.stop
            step = key.step
            if start is None:
                start = 0
            if stop is None:
                stop = len(self)
            if step is None:
                step = 1
            # handle negative indices; like Python sequences, a bound that
            # reaches before the first row is clamped to 0.  len(self) is only
            # evaluated when a bound actually needs it.
            if start < 0:
                start = max(len(self) + start, 0)
            if stop < 0:
                stop = max(len(self) + stop, 0)
            return SFrame(_proxy = self.__proxy__.copy_range(start, step, stop))
        else:
            raise TypeError("Invalid index type: must be SArray, list, str, int, or slice")

    def __setitem__(self, key, value):
        """
        A wrapper around add_column(s).  Key can be either a list or a str.

        If key is a list, the call is forwarded to add_columns(value, key).

        If key is a str, value is first turned into an SArray: an SArray is
        used as is, anything exposing __iter__ is wrapped with SArray(value),
        and any other value becomes a constant column with num_rows()
        entries.  The column is then added under 'key'; if a column of that
        name already exists it is replaced.

        Raises
        ------
        TypeError
            If key is neither a list nor a str.
        """
        if type(key) is list:
            self.add_columns(value, key)
        elif type(key) is str:
            sa_value = None
            if (type(value) is SArray):
                sa_value = value
            elif hasattr(value, '__iter__'):  # wrap list, array... to sarray
                sa_value = SArray(value)
            else:  # create an sarray  of constant value
                sa_value = SArray.from_const(value, self.num_rows())

            # set new column
            if not key in self.column_names():
                with cython_context():
                    self.add_column(sa_value, key)
            else:
                # special case if replacing the only column.
                # server would fail the replacement if the new column has different
                # length than current one, which doesn't make sense if we are replacing
                # the only column. To support this, we first take out the only column
                # and then put it back if exception happens
                single_column = (self.num_cols() == 1)
                if (single_column):
                    tmpname = key
                    saved_column = self.select_column(key)
                    self.remove_column(key)
                else:
                    # add the column to a unique column name: the joined name
                    # is longer than any existing name, so it cannot clash.
                    tmpname = '__' + '-'.join(self.column_names())
                try:
                    self.add_column(sa_value, tmpname)
                except:
                    # roll back the removal done above, then re-raise unchanged
                    if (single_column):
                        self.add_column(saved_column, key)
                    raise

                if (not single_column):
                    # if add succeeded, remove the column name and rename tmpname->columnname.
                    # The swap first moves the new column into the old column's position.
                    self.swap_columns(key, tmpname)
                    self.remove_column(key)
                    self.rename({tmpname: key})

        else:
            raise TypeError('Cannot set column with key type ' + str(type(key)))

    def __delitem__(self, key):
        """
        Wrapper around remove_column: ``del sf[key]`` removes the column
        named 'key' from this SFrame in place.  Errors raised by
        remove_column (such as KeyError for an unknown name) propagate
        unchanged.
        """
        self.remove_column(key)

    def __materialize__(self):
        """
        For an SFrame that is lazily evaluated, force persist of the SFrame to
        disk, committing all lazily evaluated operations.

        Delegates to the proxy's materialize() call; returns nothing.
        """
        with cython_context():
            self.__proxy__.materialize()

    def __is_materialized__(self):
        """
        Returns whether or not the sframe has been materialized, as reported
        by the underlying proxy.
        """
        return self.__proxy__.is_materialized()

    def __has_size__(self):
        """
        Returns whether or not the size of the sframe is known, as reported
        by the underlying proxy.
        """
        return self.__proxy__.has_size()

    def __iter__(self):
        """
        Iterate over the rows of the sframe.

        Each row is produced as a dict mapping column name to that row's
        value.  Rows are pulled from the proxy in fixed-size batches; the
        proxy iterator is only started once the first row is requested.
        """
        _mt._get_metric_tracker().track('sframe.__iter__')

        batch_size = 262144

        def row_stream():
            self.__proxy__.begin_iterator()
            batch = self.__proxy__.iterator_get_next(batch_size)
            names = self.column_names()
            while True:
                for row in batch:
                    yield dict(zip(names, row))
                # Stop once a batch comes back shorter than requested.
                if len(batch) != batch_size:
                    return
                batch = self.__proxy__.iterator_get_next(batch_size)

        return row_stream()

    def append(self, other):
        """
        Stack the rows of another SFrame below the rows of this one and
        return the result as a new SFrame.

        Parameters
        ----------
        other : SFrame
            Another SFrame whose rows are appended to current SFrame

        Returns
        -------
        out_sf : SFrame
            A new SFrame holding the rows of both SFrames.  If exactly one of
            the two SFrames has no columns, the result is a column selection
            of the other one; if both have none, an empty SFrame.

        Raises
        ------
        RuntimeError
            If 'other' is not an SFrame, the column counts differ, a column
            of this SFrame is missing from 'other', or a column's type
            differs between the two.

        Notes
        -----
        Both SFrames have to have the same set of columns with the same
        column names and column types.  Column order may differ.
        """
        _mt._get_metric_tracker().track('sframe.append')
        if type(other) is not SFrame:
            raise RuntimeError("SFrame append can only work with SFrame")

        self_has_columns = len(self.column_names()) != 0
        other_has_columns = len(other.column_names()) != 0

        if not (self_has_columns or other_has_columns):
            return SFrame()

        if not (self_has_columns and other_has_columns):
            # Exactly one side carries data: hand back a selection of it.
            survivor = self if self_has_columns else other
            return survivor.select_columns(survivor.column_names())

        own_names = self.column_names()
        own_types = self.column_types()
        their_names = other.column_names()
        if len(own_names) != len(their_names):
            raise RuntimeError("Two SFrames have to have the same number of columns")

        # Column order may differ between the frames, so rebuild "other"
        # with its columns laid out in this frame's order.
        aligned = SFrame()
        for col_name, col_type in zip(own_names, own_types):
            if col_name not in their_names:
                raise RuntimeError("Column " + col_name + " does not exist in second SFrame")

            their_column = other.select_column(col_name)
            aligned.add_column(their_column, col_name)

            their_type = their_column.dtype()
            if col_type != their_type:
                raise RuntimeError("Column " + col_name + " type is not the same in two SFrames, one is " + str(col_type) + ", the other is " + str(their_type))

        with cython_context():
            return SFrame(_proxy=self.__proxy__.append(aligned.__proxy__))

    def groupby(self, key_columns, operations, *args):
        """
        Perform a group on the key_columns followed by aggregations on the
        columns listed in operations.

        The operations parameter is a dictionary that indicates which
        aggregation operators to use and which columns to use them on. The
        available operators are SUM, MAX, MIN, COUNT, AVG, VAR, STDV, CONCAT,
        SELECT_ONE, and QUANTILE.
        For convenience, aggregators MEAN, STD, and VARIANCE are available as
        synonyms for AVG, STDV, and VAR.
        See :mod:`~graphlab.aggregate` for more detail on the aggregators.

        Parameters
        ----------
        key_columns : string | list[string]
            Column(s) to group by. Key columns can be of any type other than
            dictionary.

        operations : dict, list
            Dictionary of columns and aggregation operations. Each key is a
            output column name and each value is an aggregator. This can also
            be a list of aggregators, in which case column names will be
            automatically assigned.

        *args :
            All other remaining arguments will be interpreted in the same
            way as the operations argument.

        Returns
        -------
        out_sf : SFrame
            A new SFrame, with a column for each groupby column and each
            aggregation operation.

        Raises
        ------
        TypeError
            If a key column name or an aggregated column name is not a
            string, if a key column is of dictionary type, or if an
            aggregator definition is neither a tuple nor the COUNT function.
        KeyError
            If a key column, or a column used by an aggregator other than
            COUNT, does not exist in this SFrame.

        Examples
        --------

        >>> import graphlab as gl
        >>> import graphlab.aggregate as agg
        >>> sf = gl.load_sframe('netflix')

        Compute the number of occurrences of each user.

        >>> user_count = sf.groupby(key_columns='user_id',
                                    operations={'count': agg.COUNT()})

        Compute the number of occurrences of each user, automatically assigning
        a column name.

        >>> user_count = sf.groupby(key_columns='user_id',
                                    operations=agg.COUNT())

        Compute the mean and standard deviation of ratings per user.

        >>> user_rating_stats = sf.groupby(key_columns='user_id',
                    operations={'mean_rating': agg.MEAN('rating'),
                                    'std_rating':agg.STD('rating')})

        Compute the count, mean, and standard deviation of ratings per (user,
        time), automatically assigning output column names.

        >>> user_rating_stats = sf.groupby(['user_id', 'time'],
                                           [agg.COUNT(),
                                             agg.AVG('rating'),
                                             agg.STDV('rating')])

        The groupby function can take a variable length list of aggregation
        specifiers so if we want the count and the 0.25 and 0.75 quantiles of
        ratings:

        >>> user_rating_stats = sf.groupby(['user_id', 'time'], agg.COUNT(),
                                           {'rating_quantiles': agg.QUANTILE('rating',[0.25, 0.75])})

        To put all items a user rated into one list value by their star rating:

        >>> user_rating_stats = sf.groupby(["user_id", "rating"],
                                           {"rated_items":agg.CONCAT("item")})

        To put all items and rating of a given user together into a dictionary value:

        >>> user_rating_stats = sf.groupby("user_id",
                                            {"item_rating":agg.CONCAT("item", "rating")})
        """
        # some basic checking first
        # make sure key_columns is a list
        if isinstance(key_columns, str):
            key_columns = [key_columns]
        # check that every column is a string, and is a valid column name
        my_column_names = self.column_names()
        key_columns_array = []
        for column in key_columns:
            if not isinstance(column, str):
                raise TypeError("Column name must be a string")
            if column not in my_column_names:
                raise KeyError("Column " + column + " does not exist in SFrame")
            if self[column].dtype() == dict:
                raise TypeError("Cannot group on a dictionary column.")
            key_columns_array.append(column)

        group_output_columns = []
        group_columns = []
        group_ops = []

        for op_entry in [operations] + list(args):
            # A dict names its output columns; a list (or a single bare
            # aggregator, treated as a one-element list) gets "" as the
            # output name so that the name is assigned automatically.
            if isinstance(op_entry, dict):
                named = True
                entries = [(key, op_entry[key]) for key in op_entry]
            else:
                named = False
                if not isinstance(op_entry, list):
                    op_entry = [op_entry]
                entries = [("", val) for val in op_entry]

            for (output_name, val) in entries:
                if type(val) is not tuple:
                    # Allow the COUNT function itself in place of COUNT().
                    if val == graphlab.aggregate.COUNT:
                        val = graphlab.aggregate.COUNT()
                    elif named:
                        raise TypeError("Unexpected type in aggregator definition of output column: " + output_name)
                    else:
                        raise TypeError("Unexpected type in aggregator definition.")
                (op, column) = val
                group_output_columns.append(output_name)
                group_columns.append(column)
                group_ops.append(op)

        # let's validate group_columns and group_ops are valid
        count_op = graphlab.aggregate.COUNT()[0]
        for (cols, op) in zip(group_columns, group_ops):
            for col in cols:
                if not isinstance(col, str):
                    raise TypeError("Column name must be a string")

            if not isinstance(op, str):
                raise TypeError("Operation type not recognized.")

            # Compare operator names by value: two equal strings are not
            # guaranteed to be the same object.  COUNT is exempt from the
            # column existence check.
            if op != count_op:
                for col in cols:
                    if col not in my_column_names:
                        raise KeyError("Column " + col + " does not exist in SFrame")

            _mt._get_metric_tracker().track('sframe.groupby', properties={'operator':op})

        with cython_context():
            return SFrame(_proxy=self.__proxy__.groupby_aggregate(key_columns_array, group_columns,
                                                                  group_output_columns, group_ops))

    def join(self, right, on=None, how='inner'):
        """
        Merges the current (left) SFrame with the given (right) SFrame using a
        SQL-style equi-join operation by columns.

        Parameters
        ----------
        right : SFrame
            The SFrame to join

        on : None, str, list, or dict
            The column name(s) representing the set of join keys.  Each row
            that has the same value in this set of columns will be merged
            together.

            - None: every column name that appears in both SFrames is used
              as a join key.

            - str: join on one column that carries this name in both
              SFrames.

            - list: join on one or more columns, each of which carries the
              same name in both SFrames.

            - dict: each key is a column name in the left SFrame and each
              value the column name in the right SFrame it is matched with,
              e.g. {'left_col_name':'right_col_name'}.

        how : str in {'left','right','outer','inner'}
            The type of join to perform.  'inner' is default.

            - inner: Equivalent to a SQL inner join.  Result consists of the
              rows from the two frames whose join key values match exactly,
              merged together into one SFrame.

            - left: Equivalent to a SQL left outer join. Result is the union
              between the result of an inner join and the rest of the rows from
              the left SFrame, merged with missing values.

            - right: Equivalent to a SQL right outer join.  Result is the union
              between the result of an inner join and the rest of the rows from
              the right SFrame, merged with missing values.

            - outer: Equivalent to a SQL full outer join. Result is
              the union between the result of a left outer join and a right
              outer join.

        Returns
        -------
        out : SFrame

        Raises
        ------
        TypeError
            If 'right' is not an SFrame, if 'on' is not None, a str, a list
            or a dict, or if a list given as 'on' holds a non-str element.
        ValueError
            If 'how' is not one of the supported join types.

        Examples
        --------
        Assume sf_left and sf_right are two SFrames, each with a column called 'itemid' and 'region'.

        An inner join on columns of the same name:

        >>> result = sf_left.join(sf_right, on=['itemid','region'])

        A full outer join on columns with (possibly) different names:

        >>> result = sf_left.join(sf_right, how='outer', on={'thingid':'stuffid','some_col':'some_col2', 'itemid':'itemid'})

        As a more concrete example, let's create some SFrames to join:

        >>> import graphlab as gl
        >>> animals = gl.SFrame({'id':[1,2,3,4], 'name':['dog','cat','sheep','cow']})
        >>> sounds = gl.SFrame({'id':[1,3,4,5], 'sound':['woof','baa','moo','oink']})

        And then use each available type of join and see what the data looks like:

        >>> inner_result = animals.join(sounds, on='id', how='inner')

        >>> print inner_result
        (...output truncated...)
        Rows: 3
        Data:
           id   name sound
        0   1    dog  woof
        1   3  sheep   baa
        2   4    cow   moo

        >>> left_result = animals.join(sounds, on='id', how='left')

        >>> print left_result
        (...output truncated...)
        Rows: 4
        Data:
           id   name sound
        0   3  sheep   baa
        1   1    dog  woof
        2   4    cow   moo
        3   2    cat  None

        >>> right_result = animals.join(sounds, on='id', how='right')

        >>> print right_result
        (...output truncated...)
        Rows: 4
        Data:
           id   name sound
        0   3  sheep   baa
        1   1    dog  woof
        2   4    cow   moo
        3   5   None  oink

        >>> outer_result = animals.join(sounds, on='id', how='outer')

        >>> print outer_result
        (...output truncated...)
        Rows: 5
        Data:
           id   name sound
        0   1    dog  woof
        1   3  sheep   baa
        2   5   None  oink
        3   4    cow   moo
        4   2    cat  None
        """
        _mt._get_metric_tracker().track('sframe.join', properties={'type':how})

        if type(right) is not SFrame:
            raise TypeError("Can only join two SFrames")

        if how not in ('left', 'right', 'outer', 'inner'):
            raise ValueError("Invalid join type")

        # Normalise every accepted form of 'on' into a mapping of
        # left column name -> right column name.
        if on is None:
            left_names = self.column_names()
            right_names = right.column_names()
            join_keys = dict((shared, shared) for shared in left_names
                             if shared in right_names)
        elif type(on) is str:
            join_keys = {on: on}
        elif type(on) is list:
            join_keys = {}
            for column in on:
                if type(column) is not str:
                    raise TypeError("Join keys must each be a str.")
                join_keys[column] = column
        elif type(on) is dict:
            # Already in left -> right form; passed through unchecked.
            join_keys = on
        else:
            raise TypeError("Must pass a str, list, or dict of join keys")

        with cython_context():
            joined_proxy = self.__proxy__.join(right.__proxy__, how, join_keys)
            return SFrame(_proxy=joined_proxy)

    def filter_by(self, values, column_name):
        """
        Filters an SFrame by values inside an SArray. Result is an SFrame that
        only includes the rows that have a column with the given 'column_name'
        which holds one of the values in the given 'values' SArray. If 'values'
        is not an SArray, we attempt to convert it to one before filtering.

        Parameters
        ----------
        values : SArray | list | numpy.ndarray | pandas.Series | str
            The values to use to filter the SFrame.  The resulting SFrame
            will only include rows that have one of these values.

        column_name : str
            The column of the SFrame to match with the given 'values'.

        Returns
        -------
        out : SFrame

        Raises
        ------
        TypeError
            If 'column_name' is not a str, or if the type of the given values
            does not match the type of the column.
        KeyError
            If 'column_name' is not a column of this SFrame.  This is checked
            before 'values' is converted.

        Examples
        --------
        Let's create an SFrame of animals I own:

        >>> import graphlab as gl
        >>> sf = gl.SFrame({'id':[1,2,3,4], 'animal_type':['dog','cat','cow','horse'], 'name':['bob','jim','jimbob','bobjim']})

        The filter_by function lets you filter an SFrame by the actual values in a
        column.  So, for example, I have a list of common household pets:

        >>> common_household_pets = ['cat','hamster','dog','fish','bird','snake']

        I can use this to filter out household pets from barnyard animals, perhaps:

        >>> household_sf = sf.filter_by(common_household_pets, 'animal_type')
        >>> household_sf
        Columns:
        animal_type     str
        id      int
        name    str
        Rows: 2
        Data:
          animal_type  id name
        0         cat   2  jim
        1         dog   1  bob
        [2 rows x 3 columns]
        """
        _mt._get_metric_tracker().track('sframe.filter_by')
        if type(column_name) is not str:
            raise TypeError("Must pass a str as column_name")

        # Fail fast: look the column up before building anything from 'values'.
        existing_columns = self.column_names()
        if column_name not in existing_columns:
            raise KeyError("Column '" + column_name + "' not in SFrame.")
        existing_type = self.column_types()[existing_columns.index(column_name)]

        if type(values) is not SArray:
            # If we were given a single element, try to put in list and convert
            # to SArray
            if not hasattr(values, '__iter__'):
                values = [values]
            values = SArray(values)

        value_sf = SFrame()
        value_sf.add_column(values, column_name)

        # Make sure the values list has unique values, or else join will not
        # filter.  Grouping with no aggregators leaves one row per value.
        value_sf = value_sf.groupby(column_name, {})

        given_type = value_sf.column_types()[0]
        if given_type != existing_type:
            raise TypeError("Type of given values does not match type of column '" +
                column_name + "' in SFrame.")

        # An inner join against the unique values keeps only matching rows.
        with cython_context():
            return SFrame(_proxy=self.__proxy__.join(value_sf.__proxy__,
                                                     'inner',
                                                     {column_name:column_name}))

    @graphlab.canvas.inspect.find_vars
    def show(self):
        """
        show()
        Launch or update GraphLab Canvas and generate specified visualization view of the SFrame.

        This simply forwards the SFrame to graphlab.canvas._show and returns
        whatever that call returns.

        Returns
        -------
        view : graphlab.canvas.view.View
            An object representing the GraphLab Canvas view.
        """
        return graphlab.canvas._show(variable=self)

    def pack_columns(self, columns = None, column_prefix=None, dtype = list, fill_na=None, remove_prefix=True, new_column_name=None):
        """
        Pack two or more columns of the current SFrame into one single column,
        return a new SFrame with rest of columns plus the newly created column.

        The list of columns that are packed is chosen through either "columns"
        or "column_prefix" parameter. Only one of the parameter is allowed to
        be provided. "columns" explicitly specified the list of columns to pack,
        "column_prefix" specifies all columns that have given prefix are to be
        packed.

        The resulting column type is decided by "dtype" paramter, allowed values
        are dict, array.arary and list.
         * dtype = dict: pack to a dictionary SArray where column name becomes
           dictionary key and column value becomes dictionary value
         * dtype = array.array: pack all values from the packing columns into an
           array
         * dtype=list: pack all values from the packing columns into a list

        Parameters
        ----------
        columns : list[str], optional
            A list of column names to be packed.
            There needs to have at least two columns to pack.
            If omitted and column_prefix is not specified, all columns from current
            SFrame are packed.
            This parameter is mutually exclusive with "column_prefix" parameter.

        column_prefix : str, optional
            Pack all columns with the given column_prefix.
            This parameter is mutually exclusive with "columns" parameter.

        remove_prefix : bool, optional
            If True and "column_prefix" is specified, the dictionary key will
            be constructed by removing the prefix from the column name.
            This option is only applicable when packing to dict type.
            If not given, default value is True

        dtype : dict | aray.array | list, optional
            The resulting packed column type.
            If not provided, dtype is list.

        fill_na : value, optional
          Value to fill into packed column if missing value is encountered.
          If pack to dictionary, 'fill_na' is only applicable to dictionary value,
          missing keys are not replaced.
          By default, do not replace any missing value.

        new_column_name : str, optional
            Packed column name.
            If not given, and column_prefix is given, then the prefix will be used
            as new column name, otherwise name is generated automatically.

        Returns
        -------
        out : SFrame
            An SFrame that contains columns that are not packed, plus the newly
            packed column.

        Examples
        --------
        Suppose 'sf' is an an SFrame that maintains business category information:

        >>> sf
        Columns:
            business    int
            category.retail int
            category.food   int
            category.service    int
            category.shop   int
        Rows: 4
        Data:
        +----------+-----------------+---------------+------------------+---------------+
        | business | category.retail | category.food | category.service | category.shop |
        +----------+-----------------+---------------+------------------+---------------+
        |    1     |        1        |       1       |       None       |       1       |
        |    2     |       None      |       1       |        1         |       1       |
        |    3     |        1        |      None     |        1         |      None     |
        |    4     |       None      |       1       |       None       |       1       |
        +----------+-----------------+---------------+------------------+---------------+


        To pack all category columns into a list:

        >>> sf.pack_columns(column_prefix='category')
        Columns:
            business    int
            X2  list
        Rows: 4
        Data:
        +----------+--------------------+
        | business |         X2         |
        +----------+--------------------+
        |    1     |  [1, 1, None, 1]   |
        |    2     |  [None, 1, 1, 1]   |
        |    3     | [1, None, 1, None] |
        |    4     | [None, 1, None, 1] |
        +----------+--------------------+

        To pack all category columns into a dictionary, with new column name:

        >>> sf.pack_columns(column_prefix='category', dtype=dict, new_column_name='category')
        Columns:
            business    int
            category    dict
        Rows: 4
        Data:
        +----------+--------------------------------+
        | business |            category            |
        +----------+--------------------------------+
        |    1     | {'food': 1, 'shop': 1, 're ... |
        |    2     | {'food': 1, 'shop': 1, 'se ... |
        |    3     |  {'retail': 1, 'service': 1}   |
        |    4     |     {'food': 1, 'shop': 1}     |
        +----------+--------------------------------+


        To keep column prefix in the resulting dict key:

        >>> sf.pack_columns(column_prefix='category', dtype=dict, remove_prefix=False)
        To explicitly give all column names to pack:
        Columns:
            business    int
            X2  dict
        Rows: 4
        Data:
        +----------+--------------------------------+
        | business |               X2               |
        +----------+--------------------------------+
        |    1     | {'category.retail': 1, 'ca ... |
        |    2     | {'category.food': 1, 'cate ... |
        |    3     | {'category.retail': 1, 'ca ... |
        |    4     | {'category.food': 1, 'cate ... |
        +----------+--------------------------------+

        To explicitly pack a set of columns:

        >>> sf.pack_columns(columns = ['business','category.retail','category.food','category.service','category.shop'])
        Columns:
            X1  list
        Rows: 4
        Data:
        +-----------------------+
        |           X1          |
        +-----------------------+
        |   [1, 1, 1, None, 1]  |
        |   [2, None, 1, 1, 1]  |
        | [3, 1, None, 1, None] |
        | [4, None, 1, None, 1] |
        +-----------------------+

        To pack all columns with name starting with 'category' into an array type,
        and with missing value replaced with 0:

        >>> sf.pack_columns(column_prefix="category", dtype=array.array, fill_na=0)
        Columns:
            business    int
            X2  array
        Rows: 4
        Data:
        +----------+--------------------------------+
        | business |               X2               |
        +----------+--------------------------------+
        |    1     | array('d', [1.0, 1.0, 0.0, ... |
        |    2     | array('d', [0.0, 1.0, 1.0, ... |
        |    3     | array('d', [1.0, 0.0, 1.0, ... |
        |    4     | array('d', [0.0, 1.0, 0.0, ... |
        +----------+--------------------------------+
        [4 rows x 2 columns]

        Notes
        -----
        Need at least two columns to pack
        Refer to :py:func:`graphlab.SFrame.unpack()` for opposite effect of pack.

        Missing value behavior:
        If pack to dictionary, missing key is always dropped. Missing value is
        dropped if fill_na is not provided, otherwise, missing value is replaced
        by 'fill_na'.
        If pack to list or array, missing value will be kept. If 'fill_na' is
        provided, the missing value is replaced with 'fill_na' value.


        """
        if columns != None and column_prefix != None:
            raise ValueError("'columns' and 'column_prefix' parameter cannot be given at the same time.")

        if new_column_name == None and column_prefix != None:
            new_column_name = column_prefix

        if column_prefix != None:
            if type(column_prefix) != str:
                raise TypeError("'column_prefix' must be a string")
            columns = [name for name in self.column_names() if name.startswith(column_prefix)]
            if len(columns) == 0:
                raise ValueError("There is no column starts with prefix '" + column_prefix + "'")
        elif columns == None:
            columns = self.column_names()
        else:
            if not hasattr(columns, '__iter__'):
                raise TypeError("columns must be an iterable type")

            column_names = set(self.column_names())
            for column in columns:
                if (column not in column_names):
                    raise ValueError("Current SFrame has no column called '" + str(column) + "'.")

            # check duplicate names
            if len(set(columns)) != len(columns):
                raise ValueError("There is duplicate column names in columns parameter")

        if (len(columns) <= 1):
            raise ValueError("Please provide at least two columns to pack")

        if (dtype not in (dict, list, array.array)):
            raise ValueError("Resulting dtype has to be one of dict/array.array/list type")

        # fill_na value for array needs to be numeric
        if dtype == array.array:
            if (fill_na != None) and (type(fill_na) not in (int, float)):
                raise ValueError("fill_na value for array needs to be numeric type")
            # all columns have to be numeric type
            for column in columns:
                if self[column].dtype() not in (int, float):
                    raise TypeError("Column '" + column + "' type is not numeric, cannot pack into array type")

        # generate dict key names if pack to dictionary
        # we try to be smart here
        # if all column names are like: a.b, a.c, a.d,...
        # we then use "b", "c", "d", etc as the dictionary key during packing
        if (dtype == dict) and (column_prefix != None) and (remove_prefix == True):
            size_prefix = len(column_prefix)
            first_char = set([c[size_prefix:size_prefix+1] for c in columns])
            if ((len(first_char) == 1) and first_char.pop() in ['.','-','_']):
                dict_keys = [name[size_prefix+1:] for name in columns]
            else:
                dict_keys = [name[size_prefix:] for name in columns]

        else:
            dict_keys = columns

        rest_columns = [name for name in self.column_names() if name not in columns]
        if new_column_name != None:
            if type(new_column_name) != str:
                raise TypeError("'new_column_name' has to be a string")
            if new_column_name in rest_columns:
                raise KeyError("Current SFrame already contains a column name " + new_column_name)
        else:
            new_column_name = ""

        _mt._get_metric_tracker().track('sframe.pack_columns')

        ret_sa = None
        with cython_context():
            ret_sa = SArray(_proxy=self.__proxy__.pack_columns(columns, dict_keys, dtype, fill_na))

        new_sf = self.select_columns(rest_columns)
        new_sf.add_column(ret_sa, new_column_name)
        return new_sf

    def unpack(self, unpack_column, new_column_name_prefix=None, column_types=None, na_value=None, limit=None):
        """
        Expand one column of this SFrame into multiple columns, with each
        value in a separate column. Returns a new SFrame with the unpacked
        column replaced by a list of new columns.
        The column must be of list/array/dict type.

        For more details regarding name generation, missing value handling and
        other behavior, refer to :py:func:`graphlab.SArray.unpack()`.

        Parameters
        ----------
        unpack_column : str
            Name of the column to unpack.

        new_column_name_prefix : str, optional
            If provided, unpacked column names start with the given prefix.
            If not provided, the name of the unpacked column is used.

        column_types : [type], optional
            Column types for the unpacked columns.
            If not provided, column types are automatically inferred from first
            100 rows. For array type, default column types are float.  If
            provided, column_types also restricts how many columns to unpack.

        na_value : flexible_type, optional
            If provided, convert all values that are equal to "na_value" to
            missing value (None).

        limit : list of str | list of int, optional
            Control unpacking only a subset of list/array/dict value.
            For dictionary SArray, 'limit' is a list of dictionary keys to
            restrict.
            For list/array SArray, 'limit' is a list of integers that are
            indexes into the list/array value.

        Returns
        -------
        out : SFrame
            A new SFrame that contains rest of columns from original SFrame
            with the given column replaced with a collection of unpacked
            columns.

        Raises
        ------
        KeyError
            If 'unpack_column' is not a column of this SFrame.

        Examples
        --------

        >>> sf
        Columns:
            id   int
            wc   dict
        Rows: 3
        Data:
            +----+------------------+
            | id |        wc        |
            +----+------------------+
            | 1  |     {'a': 1}     |
            | 2  |     {'b': 2}     |
            | 3  | {'a': 1, 'b': 2} |
            +----+------------------+

        >>> sf.unpack('wc')
        Columns:
            id  int
            wc.a    int
            wc.b    int
        Rows: 3
        Data:
        +----+------+------+
        | id | wc.a | wc.b |
        +----+------+------+
        | 1  |  1   | None |
        | 2  | None |  2   |
        | 3  |  1   |  2   |
        +----+------+------+


        To not have prefix in the generated column name:

        >>> sf.unpack('wc', new_column_name_prefix="")
        Columns:
            id  int
            a   int
            b   int
        Rows: 3
        Data:
        +----+------+------+
        | id |  a   |  b   |
        +----+------+------+
        | 1  |  1   | None |
        | 2  | None |  2   |
        | 3  |  1   |  2   |
        +----+------+------+

        To limit subset of keys to unpack:

        >>> sf.unpack('wc', limit=['b'])
        Columns:
            id  int
            wc.b    int
        Rows: 3
        Data:
        +----+------+
        | id | wc.b |
        +----+------+
        | 1  | None |
        | 2  |  2   |
        | 3  |  2   |
        +----+------+

        To unpack an array column:

        >>> sf
        Columns:
            id  int
            friends  array
        Rows: 3
        Data:
        +----+-----------------------------+
        | id |            friends          |
        +----+-----------------------------+
        | 1  | array('d', [1.0, 2.0, 3.0]) |
        | 2  | array('d', [2.0, 3.0, 4.0]) |
        | 3  | array('d', [3.0, 4.0, 5.0]) |
        +----+-----------------------------+
        >>> sf.unpack('friends')
        Columns:
            id  int
            friends.0   float
            friends.1   float
            friends.2   float
        Rows: 3
        Data:
        +----+-----------+-----------+-----------+
        | id | friends.0 | friends.1 | friends.2 |
        +----+-----------+-----------+-----------+
        | 1  |    1.0    |    2.0    |    3.0    |
        | 2  |    2.0    |    3.0    |    4.0    |
        | 3  |    3.0    |    4.0    |    5.0    |
        +----+-----------+-----------+-----------+

        To only unpack first and third friend and pick new column name prefix:

        >>> sf.unpack('friends', limit=[0,2], new_column_name_prefix="buddy")
        Columns:
            id  int
            buddy.0 float
            buddy.2 float
        Rows: 3
        Data:
        +----+---------+---------+
        | id | buddy.0 | buddy.2 |
        +----+---------+---------+
        | 1  |   1.0   |   3.0   |
        | 2  |   2.0   |   4.0   |
        | 3  |   3.0   |   5.0   |
        +----+---------+---------+

        Notes
        -----
        Refer to :py:func:`graphlab.SFrame.pack_columns()` for the opposite
        effect of unpack.

        If any generated column name collides with a remaining column of this
        SFrame, ".1" is appended to every generated name, repeatedly, until no
        collision remains.

        """
        if unpack_column not in self.column_names():
            # str() keeps this a KeyError even when a non-string name is given;
            # plain concatenation would fail with a TypeError instead.
            raise KeyError("column '" + str(unpack_column) + "' does not exist in current SFrame")

        if new_column_name_prefix is None:
            new_column_name_prefix = unpack_column

        new_sf = self[unpack_column].unpack(new_column_name_prefix, column_types, na_value, limit)

        # Build the returned SFrame, renaming unpacked columns on conflict.
        rest_columns = [name for name in self.column_names() if name != unpack_column]
        unpacked_names = new_sf.column_names()
        new_names = unpacked_names
        while set(new_names).intersection(rest_columns):
            new_names = [name + ".1" for name in new_names]
        new_sf.rename(dict(zip(unpacked_names, new_names)))

        _mt._get_metric_tracker().track('sframe.unpack')
        ret_sf = self.select_columns(rest_columns)
        ret_sf.add_columns(new_sf)
        return ret_sf

    def stack(self, column_name, new_column_name=None, drop_na=False):
        """
        Convert a "wide" column of current SFrame to one or two "tall" columns
        by stacking all values. Stack works only for columns of
        dict/list/array type.

        If the column is dict type, two new columns are created as a result of
        stacking: one column holds the key and another column holds the value.
        All remaining columns are repeated for each key/value pair.
        Returns a new SFrame with the two newly created columns, and all
        columns other than the column that is stacked.

        If the column is array or list type, one new column is created as
        a result of stacking. Each row holds one element of the array or list
        value, with the remaining columns from the same original row repeated.
        Returns a new SFrame with the newly created column, and all columns
        other than the column that is stacked.

        Parameters
        ----------
        column_name : str
            The column to stack. This column must be of dict/list/array type.

        new_column_name : str | list of str, optional
            The new column name(s).
            If original column is list/array type, new_column_name must be a
            string.
            If original column is dict type, new_column_name must be a list of
            two strings.
            If not given, column names are generated automatically.

        drop_na : boolean, optional
            If True, missing values and empty list/array/dict are all dropped
            from the resulting column(s). If False, missing values are
            maintained in stacked column(s).
            Default value is False.

        Returns
        -------
        out : SFrame
            A new SFrame that contains newly stacked column(s) plus columns in
            original SFrame other than the stacked column.

        Raises
        ------
        ValueError
            If 'column_name' is not a column of this SFrame, if a new column
            name conflicts with another existing column, or if the first 100
            rows hold no non-missing value to infer the new column type from.
        TypeError
            If the column is not of dict/list/array type, or if
            'new_column_name' has the wrong type or length for the column.

        Examples
        --------
        Suppose 'sf' is an SFrame that contains a column of dict type:

        >>> sf
        Columns:
          topic int
          words dict
        Rows: 4
        Data:
        +-------+----------------------+
        | topic |        words         |
        +-------+----------------------+
        |   1   |  {'a': 3, 'cat': 2}  |
        |   2   |  {'a': 1, 'the': 2}  |
        |   3   | {'the': 1, 'dog': 3} |
        |   4   |          {}          |
        +-------+----------------------+
        [4 rows x 2 columns]

        Stack would stack all keys in one column and all values in another
        column:

        >>> sf.stack('words', new_column_name=['word', 'count'])
        Columns:
          topic int
          word  str
          count int
        Rows: 7
        Data:
        +-------+------+-------+
        | topic | word | count |
        +-------+------+-------+
        |   1   |  a   |   3   |
        |   1   | cat  |   2   |
        |   2   |  a   |   1   |
        |   2   | the  |   2   |
        |   3   | the  |   1   |
        |   3   | dog  |   3   |
        |   4   | None |  None |
        +-------+------+-------+
        [7 rows x 3 columns]

        Observe that since topic 4 had no words, an empty row is inserted.
        To drop that row, set drop_na=True in the parameters to stack.

        Suppose 'sf' is an SFrame that contains a user and his/her friends,
        where the 'friends' column is a list type. Stack on 'friends' column
        would create one row for each user/friend pair:

        >>> sf
        Columns:
          user  int
          friends   list
        Rows: 3
        Data:
        +------+------------------+
        | user |     friends      |
        +------+------------------+
        |  1   |     [2, 3, 4]    |
        |  2   |      [5, 6]      |
        |  3   | [4, 5, 10, None] |
        +------+------------------+
        [3 rows x 2 columns]
        >>> sf.stack('friends', new_column_name='friend')
        Columns:
          user  int
          friend int
        Rows: 9
        Data:
        +------+--------+
        | user | friend |
        +------+--------+
        |  1   |  2     |
        |  1   |  3     |
        |  1   |  4     |
        |  2   |  5     |
        |  2   |  6     |
        |  3   |  4     |
        |  3   |  5     |
        |  3   |  10    |
        |  3   |  None  |
        +------+--------+
        [9 rows x 2 columns]


        To remove the missing value from resulting stacked column,
        set the drop_na=True parameter.

        Notes
        -----
        Refer to :py:func:`graphlab.SFrame.unstack()` for opposite effect of
        stack.
        """
        # validate column_name
        column_name = str(column_name)
        existing_columns = self.column_names()
        if column_name not in existing_columns:
            raise ValueError("Cannot find column '" + str(column_name) + "' in the SFrame.")

        stack_column_type = self[column_name].dtype()
        if stack_column_type not in (dict, array.array, list):
            raise TypeError("Stack is only supported for column of dict/list/array type.")

        if new_column_name is not None:
            if stack_column_type == dict:
                if type(new_column_name) is not list:
                    raise TypeError("new_column_name has to be a list to stack dict type")
                elif len(new_column_name) != 2:
                    raise TypeError("new_column_name must have length of two")
            else:
                if type(new_column_name) != str:
                    raise TypeError("new_column_name has to be a str")
                new_column_name = [new_column_name]

            # A new name may reuse the stacked column's own name (that column
            # is replaced), but must not collide with any other column.
            for name in new_column_name:
                if (name in existing_columns) and (name != column_name):
                    raise ValueError("Column with name '" + name + "' already exists, pick a new column name")
        else:
            # Empty names are handed to the proxy as-is; the docstring says
            # names are then generated automatically.
            if stack_column_type == dict:
                new_column_name = ["", ""]
            else:
                new_column_name = [""]

        # infer column types from the non-missing values among the first 100 rows
        head_row = SArray(self[column_name].head(100)).dropna()
        if len(head_row) == 0:
            raise ValueError("Cannot infer column type because there is not enough rows to infer value")
        if stack_column_type == dict:
            # infer key/value type
            keys = []
            values = []
            for row in head_row:
                for key in row:
                    keys.append(key)
                    if key is not None:
                        values.append(row[key])

            new_column_type = [
                infer_type_of_list(keys),
                infer_type_of_list(values)
            ]
        else:
            values = list(itertools.chain.from_iterable(head_row))
            new_column_type = [infer_type_of_list(values)]

        _mt._get_metric_tracker().track('sframe.stack')

        with cython_context():
            return SFrame(_proxy=self.__proxy__.stack(column_name, new_column_name, new_column_type, drop_na))

    def unstack(self, column, new_column_name=None):
        """
        Transforms an SFrame by concatenating values from one or two columns
        into one column, grouped by all other columns.
        Returns an SFrame that contains the concatenated column, in addition to
        remaining columns of original SFrame.

        The resulting column could be of type list, array or dictionary.
        If 'column' is a numeric column, resulting
        column would be array.array type; if 'column' is a non-numeric column,
        resulting columns would be list type. If 'column' is a list of two
        columns, resulting column would be dict type.

        'unstack' is a special version of :py:func:`graphlab.SFrame.groupby()`
        and :mod:`~graphlab.aggregate.CONCAT`.

        Missing values are maintained during unstack.

        Duplicate dict key handling:
        For one group, if there are more than one instance of a given key, an
        arbitrary value is selected.

        Parameters
        ----------
        column : str | [str, str]
            The column(s) that is(are) to be concatenated.
            If str, then collapsed column type is either array or list.
            If [str, str], then collapsed column type is dict

        new_column_name : str, optional
            New column name. If not given, a new column name is generated
            automatically.

        Returns
        -------
        out : SFrame
            A new SFrame containing the grouped columns as well as the new
            column.

        Raises
        ------
        TypeError
            If 'column' is neither a string nor a sequence of length two.

        Notes
        -----
        * Refer to :py:func:`graphlab.SFrame.stack()` for the inverse of
          unstack.
        * There is no guarantee the resulting SFrame maintains the same order
          as original SFrame after unstack.
        * Refer to :py:func:`graphlab.SFrame.groupby()` and
          :mod:`~graphlab.aggregate.CONCAT`. They give more powerful
          functionality than unstack, where you could unstack multiple columns
          at the same time, or combine with other aggregation operations.

        Examples
        --------
        Suppose 'sf' is an SFrame that captures topic, word and count
        information:

        >>> sf
        Columns:
          count int
          topic str
          word  str
        Rows: 6
        Data:
        +-------+----------+------+
        | count |  topic   | word |
        +-------+----------+------+
        |   4   |   cat    |  a   |
        |   2   |   cat    |  c   |
        |   1   |   dog    |  c   |
        |   1   | elephant |  a   |
        |   2   | elephant |  b   |
        |  None |   fish   | None |
        +-------+----------+------+
        [6 rows x 3 columns]

        Unstack on column pair ['word', 'count'] would create a new SFrame that
        groups the data by "topic" and concatenates "word" and "count" into a
        dictionary:

        >>> sf.unstack(column = ['word', 'count'], new_column_name = 'words')
        Columns:
          topic str
          words dict
        Rows: 4
        Data:
        +----------+------------------+
        |  topic   |      words       |
        +----------+------------------+
        | elephant | {'a': 1, 'b': 2} |
        |   dog    |     {'c': 1}     |
        |   cat    | {'a': 4, 'c': 2} |
        |   fish   |       None       |
        +----------+------------------+
        [4 rows x 2 columns]

        Suppose 'sf' is an SFrame that contains friend list for a user

        >>> sf
        Columns:
          friend    int
          user  int
        Rows: 10
        Data:
        +--------+------+
        | friend | user |
        +--------+------+
        |   2    |  1   |
        |   3    |  1   |
        |   4    |  1   |
        |   5    |  2   |
        |   6    |  2   |
        |   4    |  3   |
        |   5    |  3   |
        |   10   |  3   |
        |   2    |  4   |
        |   3    |  4   |
        +--------+------+
        [10 rows x 2 columns]

        To create a user-friends column by collapsing all friends from one
        user:

        >>> sf.unstack('friend', new_column_name='friends')
        Columns:
          user  int
          friends   array
        Rows: 4
        Data:
        +------+------------------------------+
        | user |           friends            |
        +------+------------------------------+
        |  2   |    array('d', [5.0, 6.0])    |
        |  4   |    array('d', [3.0, 2.0])    |
        |  3   | array('d', [5.0, 10.0, 4.0]) |
        |  1   | array('d', [2.0, 3.0, 4.0])  |
        +------+------------------------------+
        [4 rows x 2 columns]

        """
        is_single_column = isinstance(column, str)
        if not is_single_column:
            # Values without a length (e.g. None or an int) must produce the
            # same TypeError as a sequence of the wrong length, rather than an
            # unrelated "has no len()" failure.
            try:
                is_pair = len(column) == 2
            except TypeError:
                is_pair = False
            if not is_pair:
                raise TypeError("'column' parameter has to be either a string or a list of two strings.")

        _mt._get_metric_tracker().track('sframe.unstack')

        with cython_context():
            if is_single_column:
                key_columns = [name for name in self.column_names() if name != column]
                aggregator = graphlab.aggregate.CONCAT(column)
            else:
                key_columns = [name for name in self.column_names() if name not in column]
                aggregator = graphlab.aggregate.CONCAT(column[0], column[1])

            # A dict names the output column; a bare aggregator lets groupby
            # pick the name.
            if new_column_name is not None:
                return self.groupby(key_columns, {new_column_name: aggregator})
            return self.groupby(key_columns, aggregator)

    def unique(self):
        """
        Return an SFrame holding only the distinct rows of this SFrame.
        The row order of the original SFrame is not necessarily preserved.

        Raises a TypeError if any column in the SFrame is a dictionary.

        Returns
        -------
        out : SFrame
            A new SFrame that contains the unique values of the current SFrame.
        """
        # Grouping by every column with no aggregation leaves one row per
        # distinct combination of values.
        all_columns = self.column_names()
        no_aggregations = {}
        return self.groupby(all_columns, no_aggregations)

    def sort(self, sort_columns, ascending=True):
        """
        Sort current SFrame by the given columns, using the given sort order.

        Only columns that are type of str, int and float can be sorted.

        Parameters
        ----------
        sort_columns : str | list of str | list of (str, bool) pair
            The column name, or column names to be sorted.
            The result will be sorted first by first column, followed by second
            column, and so on. All columns are going to be sorted in the same
            order as governed by the "ascending" parameter.

            To control the sort ordering for each column individually,
            sort_columns must be a list of (str, bool) pairs. In which case
            the first value is the column name and the second value is a
            boolean indicating the sort order (True means ascending).

        ascending : boolean, optional
            Sort all columns in the given order, where True means ascending
            and False means descending. By default, the value is True.
            Ignored when sort_columns is a list of (str, bool) pairs.

        Returns
        -------
        out : SFrame
            A new SFrame that is sorted according to given sort criteria

        Raises
        ------
        ValueError
            If sort_columns is an empty list, mixes element types, or names a
            column that does not exist.
        TypeError
            If sort_columns (or one of its elements) has an unsupported type,
            if a tuple element is not a pair, or if a named column is not of
            type str, int or float.

        Examples
        --------
           Suppose 'sf' is an sframe that has three columns 'a', 'b', 'c'.
           To sort by column 'a', ascending

           >>> sf.sort('a')

           To sort by column 'a', descending

           >>> sf.sort('a', ascending = False)

           To sort by column 'a' and 'b', all ascending

           >>> sf.sort(['a', 'b'])

           To sort by column 'a' and 'b', all descending

           >>> sf.sort(['a', 'b'], ascending = False)

           To sort by column 'a' ascending, and then by column 'b' descending

           >>> sf.sort([('a', True), ('b', False)])

        """
        sort_column_names = []
        sort_column_orders = []

        # validate sort_columns
        if type(sort_columns) == str:
            sort_column_names = [sort_columns]
        elif type(sort_columns) == list:
            if len(sort_columns) == 0:
                raise ValueError("Please provide at least one column to sort")

            element_types = set(type(entry) for entry in sort_columns)
            if len(element_types) != 1:
                raise ValueError("sort_columns element are not of the same type")

            element_type = element_types.pop()
            if element_type == tuple:
                for entry in sort_columns:
                    # Reject malformed tuples up front instead of failing with
                    # an IndexError (too short) or silently ignoring extra
                    # items (too long).
                    if len(entry) != 2:
                        raise TypeError("Each tuple in sort_columns must be a (str, bool) pair")
                    sort_column_names.append(entry[0])
                    sort_column_orders.append(entry[1])
            elif element_type == str:
                sort_column_names = sort_columns
            else:
                raise TypeError("sort_columns type is not supported")
        else:
            raise TypeError("sort_columns type is not correct. Supported types are str, list of str or list of (str,bool) pair.")

        # fall back to the "ascending" flag when no per-column order was given
        if len(sort_column_orders) == 0:
            sort_column_orders = [ascending] * len(sort_column_names)

        # make sure every column exists and has a sortable type
        my_column_names = set(self.column_names())
        for column in sort_column_names:
            if type(column) != str:
                raise TypeError("Only string parameter can be passed in as column names")
            if column not in my_column_names:
                raise ValueError("SFrame has no column named: '" + str(column) + "'")
            if self[column].dtype() not in (str, int, float):
                raise TypeError("Only columns of type (str, int, float) can be sorted")

        _mt._get_metric_tracker().track('sframe.sort')

        with cython_context():
            return SFrame(_proxy=self.__proxy__.sort(sort_column_names, sort_column_orders))

    def dropna(self, columns=None, how='any'):
        """
        Returns a new SFrame with rows containing 'NA' values (None or NaN)
        removed.  If how='any', a row will be removed if any of the columns in
        the columns parameter contains at least one NA value.  If how='all', a
        row will be removed if all of the columns in the columns parameter are
        NA values.

        If the columns parameter is not specified, it defaults to all columns.
        An explicitly empty list is treated as "no columns to inspect": nothing
        is dropped, the server is not contacted, and an SFrame wrapping this
        SFrame's proxy is returned.

        See :py:func:`graphlab.SFrame.dropna_split()` for a version of this
        function which also returns the dropped rows.

        Examples
        --------
        >>> sf=gl.SFrame({'a':[1,None,None],'b':['a','b',None]})
        >>> sf
        Columns:
          a	int
          b	str
        Rows: 3
        Data:
        +------+------+
        |  a   |  b   |
        +------+------+
        |  1   |  a   |
        | None |  b   |
        | None | None |
        +------+------+
        [3 rows x 2 columns]

        Drop every row that has at least one missing value:

        >>> sf.dropna()
        Columns:
          a	int
          b	str
        Rows: 1
        Data:
        +---+---+
        | a | b |
        +---+---+
        | 1 | a |
        +---+---+
        [1 rows x 2 columns]

        Drop all rows where every value is missing:

        >>> sf.dropna(how="all")
        Columns:
          a	int
          b	str
        Rows: 2
        Data:
        +------+---+
        |  a   | b |
        +------+---+
        |  1   | a |
        | None | b |
        +------+---+
        [2 rows x 2 columns]

        Drop all rows where column 'a' has a missing value:

        >>> sf.dropna('a', how="all")
        Columns:
          a	int
          b	str
        Rows: 1
        Data:
        +---+---+
        | a | b |
        +---+---+
        | 1 | a |
        +---+---+
        [1 rows x 2 columns]

        Parameters
        ----------
        columns : list or str
            The columns to use when looking for NA values. By default, all
            columns are used.

        how : 'any' or 'all'
            Specifies whether a row should be dropped if at least one column
            has NA values, or if all columns have NA values.  'any' is
            default.

        Returns
        -------
        out : SFrame
            SFrame with NA values removed (according to the given rules).

        Raises
        ------
        TypeError
            If columns is not None, a str, or a list made up only of str.

        ValueError
            If how is neither 'any' nor 'all'.
        """
        _mt._get_metric_tracker().track('sframe.dropna')

        # An empty list must not reach the server: there an empty list is the
        # marker for "use every column", which is the opposite of what a
        # caller passing [] asked for.  Treat it as a NOOP locally instead.
        # Note that this shortcut returns before `how` is validated.
        if type(columns) is list and len(columns) == 0:
            return SFrame(_proxy=self.__proxy__)

        (columns, all_behavior) = self.__dropna_errchk(columns, how)

        # Final False: only the kept rows are requested, not the dropped ones.
        with cython_context():
            return SFrame(_proxy=self.__proxy__.drop_missing_values(columns, all_behavior, False))

    def dropna_split(self, columns=None, how='any'):
        """
        This function has the same functionality as dropna, but returns a tuple
        of two SFrames.  The first is the expected output from dropna, and the
        second contains all the rows filtered by the dropna algorithm.

        An explicitly empty list for columns is a no-op: the first element
        wraps this SFrame's proxy and the second is an empty SFrame; the
        server is not contacted.

        Examples
        --------
        >>> sf=gl.SFrame({'a':[1,None,None],'b':['a','b',None]})
        >>> sf
        Columns:
          a	int
          b	str
        Rows: 3
        Data:
        +------+------+
        |  a   |  b   |
        +------+------+
        |  1   |  a   |
        | None |  b   |
        | None | None |
        +------+------+
        [3 rows x 2 columns]

        We can split this into two SFrames one with the bad rows,
        one with the rest.

        >>> (good,bad) = sf.dropna_split()
        >>> good
        Columns:
          a	int
          b	str
        Rows: 1
        Data:
        +---+---+
        | a | b |
        +---+---+
        | 1 | a |
        +---+---+
        [1 rows x 2 columns]
        >>> bad
        Columns:
          a	int
          b	str
        Rows: 2
        Data:
        +------+------+
        |  a   |  b   |
        +------+------+
        | None |  b   |
        | None | None |
        +------+------+
        [2 rows x 2 columns]

        Parameters
        ----------
        columns : list or str
            The columns to use when looking for 'NA' values. By default, all
            columns are used.

        how : 'any' or 'all'
            Specifies whether a row should be dropped if at least one column
            has NA values, or if all columns have NA values.  'any' is
            default.

        Returns
        -------
        out : (SFrame, SFrame)
            (SFrame with 'NA' values removed, SFrame with the removed 'NA' values)
            according to the given rules

        Raises
        ------
        TypeError
            If columns is not None, a str, or a list made up only of str.

        ValueError
            If how is neither 'any' nor 'all'.

        RuntimeError
            If the server does not hand back exactly two results.
        """
        _mt._get_metric_tracker().track('sframe.dropna_split')

        # An empty list must not reach the server: there an empty list is the
        # marker for "use every column".  Treat it as a NOOP locally instead.
        # Note that this shortcut returns before `how` is validated.
        if type(columns) is list and len(columns) == 0:
            return (SFrame(_proxy=self.__proxy__), SFrame())

        (columns, all_behavior) = self.__dropna_errchk(columns, how)

        # Issue the server call under cython_context, like dropna and sort do,
        # so failures from the proxy are handled the same way everywhere.
        # Final True: ask for the dropped rows as well as the kept ones.
        with cython_context():
            sframe_tuple = self.__proxy__.drop_missing_values(columns, all_behavior, True)

        if len(sframe_tuple) != 2:
            raise RuntimeError("Did not return two SFrames!")

        with cython_context():
            return (SFrame(_proxy=sframe_tuple[0]), SFrame(_proxy=sframe_tuple[1]))

    def __dropna_errchk(self, columns, how):
        """
        Validate and normalize the arguments shared by dropna and
        dropna_split.

        Parameters
        ----------
        columns : None, str or list of str
            None becomes an empty list (the marker sent to the server for
            "consider every column", which avoids shipping all column names
            in the most common case).  A single str is wrapped in a list.
            A list is returned unchanged after checking its elements.

        how : 'any' or 'all'
            The drop rule requested by the caller.

        Returns
        -------
        out : (list, bool)
            The normalized column list and a flag that is True only when
            how is 'all'.

        Raises
        ------
        TypeError
            If columns is not None, a str, or a list; or if it is a list
            that is empty or holds anything whose type is not exactly str.

        ValueError
            If how is neither 'any' nor 'all'.
        """
        if columns is None:
            columns = []
        elif type(columns) is str:
            columns = [columns]
        elif type(columns) is list:
            # Every element must be exactly a str; an empty list fails too
            # (callers intercept the empty list before reaching this point).
            if not columns or not all(type(name) is str for name in columns):
                raise TypeError("All columns must be of 'str' type")
        else:
            raise TypeError("Must give columns as a list, str, or 'None'")

        if how not in ('any', 'all'):
            raise ValueError("Must specify 'any' or 'all'")

        all_behavior = bool(how == 'all')
        return (columns, all_behavior)

    def fillna(self, column, value):
        """
        Replace the missing values (None or NaN) of one column of this SFrame.

        The replacement column is produced by calling fillna(value) on the
        existing column and is then assigned back under the same name, so
        this SFrame is modified in place and nothing is returned.  Handling
        of a value whose type differs from the column's type (conversion, or
        an error if conversion fails) is delegated to the column's own fillna.

        Parameters
        ----------
        column : str
            The name of the column to modify

        value : type convertible to SArray's type
            The value that all missing values will be replaced with

        Raises
        ------
        TypeError
            If column is not a str.
        """
        # Only an exact str is accepted as a column name here.
        if type(column) is not str:
            raise TypeError("Must give column name as a str")

        filled_column = self[column].fillna(value)
        self[column] = filled_column

    def add_row_number(self, column_name='id', start=0):
        """
        Prepend a column of sequentially increasing row numbers to this
        SFrame.

        The numbering begins at `start` (0 by default; positive or negative
        values are accepted) and the new column is placed before all existing
        columns.  Uniqueness of `column_name` is not checked in this method;
        a clash with an existing column is left to add_column to report.

        This operation modifies the current SFrame in place and returns self.

        Note: The range of numbers is constrained by a signed 64-bit integer,
        so beware of overflow if you think the results in the row number column
        will be greater than 9 quintillion.

        Examples
        --------
        >>> sf=gl.SFrame({'a':[1,None,None],'b':['a','b',None]})
        >>> sf
        Columns:
          a	int
          b	str
        Rows: 3
        Data:
        +------+------+
        |  a   |  b   |
        +------+------+
        |  1   |  a   |
        | None |  b   |
        | None | None |
        +------+------+
        >>> sf.add_row_number()
        Columns:
          id	int
          a	int
          b	str
        Rows: 3
        Data:
        +----+------+------+
        | id |  a   |  b   |
        +----+------+------+
        | 0  |  1   |  a   |
        | 1  | None |  b   |
        | 2  | None | None |
        +----+------+------+
        [3 rows x 3 columns]

        Parameters
        ----------
        column_name : str
            The name of the new column that will hold the row numbers

        start : int
            The number to start the row number count from

        Returns
        -------
        out : SFrame
            This SFrame (self), now carrying the row number column.

        Raises
        ------
        TypeError
            If column_name is not a str or start is not an int.
        """
        _mt._get_metric_tracker().track('sframe.add_row_number')

        if type(column_name) is not str:
            raise TypeError("Must give column_name as str")

        if type(start) is not int:
            raise TypeError("Must give start as int")

        row_numbers = _create_sequential_sarray(self.num_rows(), start)

        # Build a fresh frame so the row numbers land in the first position,
        # then re-attach the existing columns in their current order.
        numbered = SFrame()
        numbered.add_column(row_numbers, column_name)
        for existing_name in self.column_names():
            numbered.add_column(self.select_column(existing_name), existing_name)

        # Adopt the new frame's proxy so the change is visible on this object.
        self.__proxy__ = numbered.__proxy__
        return self

    @property
    def shape(self):
        """ The shape -- number of rows and columns -- of the SFrame

        Examples
        --------

        >>> sf.shape
        Out: (3, 3)

        """
        return (self.num_rows(), self.num_cols())

    @property
    def __proxy__(self):
        """
        Read accessor for the proxy object backing this SFrame.

        Returns whatever is currently stored in ``self._proxy``.  The
        attribute name ends in two underscores, so Python's private-name
        mangling does not apply and it is reachable as ``obj.__proxy__``.
        """
        return self._proxy

    @__proxy__.setter
    def __proxy__(self, value):
        """
        Write accessor for the proxy object backing this SFrame.

        Stores ``value`` in ``self._proxy`` after asserting that its type is
        exactly UnitySFrameProxy.
        """
        # Identity comparison on the type: subclasses of UnitySFrameProxy are
        # rejected as well.  Being an assert, this check is skipped when
        # Python runs with -O.
        assert type(value) is UnitySFrameProxy
        self._proxy = value
