#!/usr/bin/env python
""" Creates a frequency distribution of values from columns of the input file
    and prints it out in columns - the first being the unique key and the last
    being the count of occurrences.

    Examples:
       $ gristle_freaker sample.csv -d '|'  -c 0
                             Creates two columns from the input - the first with
                             unique keys from column 0, the second with a count of
                             how many times each exists.
       $ gristle_freaker sample.csv -d '|'  -c 0 --sortcol 1 --sortorder forward --writelimit 25
                             In addition to what was described in the first example,
                             this example adds sorting of the output by count ascending
                             and just prints the first 25 entries.
       $ gristle_freaker sample.csv -d '|'  -c 0 --sampling_rate 3 --sampling_method interval
                             In addition to what was described in the first example,
                             this example adds a sampling in which it only references
                             every third record.
       $ gristle_freaker sample.csv -d '|'  -c 0,1
                             Creates three columns from the input - the first two
                             with unique key combinations from columns 0 & 1, the
                             third with the number of times each combination exists.

    To do:
    - add sampling method of random
    - improve error msg if no input provided - tell user about -h

    This source code is protected by the BSD license.  See the file "LICENSE"
    in the source code root directory for the full language or refer to it here:
       http://opensource.org/licenses/BSD-3-Clause
    Copyright 2011,2012,2013 Ken Farmer
"""

#--- standard modules ------------------
from __future__ import division
import sys
import optparse
import csv
from collections import defaultdict
import operator
import fileinput
import os

#--- gristle modules -------------------
# Put the project root on the import path so this script can run straight out
# of the project structure and be tested via tox.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


try:
    import gristle.file_type as file_type
except ImportError:
    # The gristle package is optional: without it the csv dialect is built
    # from command line options.  Bind the name anyway so that truthiness
    # checks on it take that manual path instead of raising NameError.
    file_type = None

#from pprint import pprint as pp      # just used for debugging



def main():
    """ Drive the whole frequency distribution run:
        - parse the cmdline options & work out the csv dialect
        - count the keys into the freq distribution dictionary
        - work out the display width of every key column
        - sort the distribution
        - write the sorted rows to the output

        Returns 1 if the distribution had to be truncated, otherwise 0 - the
        value is intended to be used as the process exit code.
    """

    #----- initialize -----
    parser      = CommandLineParser()
    opts, files = parser.opts, parser.files
    dialect     = get_dialect(opts, files)

    #----- build the frequency distribution dictionary -----
    field_freq, truncated = build_freq(files, dialect, opts.columns,
                                       opts.number, opts.sampling_method,
                                       opts.sampling_rate)

    #----- calculate field lengths -----
    col_len = ColumnLengthTracker()
    col_len.add_all_values(field_freq)
    col_len.trunc_all_col_lengths(opts.maxkeylen)

    #----- sort the field_freq ------
    descending  = (opts.sortorder == 'reverse')
    sorted_freq = freq_sorter(field_freq, opts.sortcol, descending)

    #----- write sorted list to output -----
    write_output(sorted_freq, opts.output, col_len, opts.writelimit)

    fileinput.close()

    return 1 if truncated else 0



def get_dialect(opts, files):
    """ Determine the csv dialect properties for the input file(s).
        If there's only a single input file and the gristle file_type module
        was imported - then use that to determine csv dialect.  Otherwise, rely
        on option input.

        Inputs:
          - opts   - parsed options; reads delimiter, recdelimiter, hasheader,
                     quoting and quotechar
          - files  - list of input file names (empty means stdin)
        Outputs:
          - a csv dialect carrying an extra 'hasheader' attribute.  When built
            from options it is a private subclass of csv.Dialect, so the
            shared csv.Dialect base class is never modified.
        Exits with status 1 if the file analysis reports an empty file.
    """
    # The gristle import is optional, so the module-level name may be unbound
    # or None - look it up defensively rather than risk a NameError.
    typer_module = globals().get('file_type')

    if len(files) == 1 and typer_module:
        my_file = typer_module.FileTyper(files[0],
                                         opts.delimiter,
                                         opts.recdelimiter,
                                         opts.hasheader)
        try:
            my_file.analyze_file()
        except typer_module.IOErrorEmptyFile:
            sys.exit(1)
        dialect           = my_file.dialect
        dialect.hasheader = opts.hasheader       # shouldn't be necessary
        return dialect

    # The --quoting flag is a boolean, but the csv module wants one of its
    # integer QUOTE_* constants and python 3 rejects bools outright.  Keep the
    # historical numeric mapping: False -> 0 (QUOTE_MINIMAL), True -> 1
    # (QUOTE_ALL).
    quoting_value = opts.quoting
    if isinstance(quoting_value, bool):
        quoting_value = int(quoting_value)

    # Dialect parameters are needed for stdin and for multiple files - since
    # the normal code can't analyze this data.
    class OptionDialect(csv.Dialect):
        """ csv dialect assembled purely from command line options. """
        delimiter      = opts.delimiter
        quoting        = quoting_value
        quotechar      = opts.quotechar
        hasheader      = opts.hasheader
        lineterminator = '\n'                    # naive assumption

    return OptionDialect



def freq_sorter(field_freq, sortcol, revorder):
    """ Sort a frequency distribution by key or by count.
        Inputs:
          - field_freq - a dictionary in which each key is a tuple of column
                         values and each value is that key's count.
          - sortcol    - indicates which column to sort - either 0 (key) or
                         1 (count)
          - revorder   - True indicates sorting in reverse order, False otherwise.
        Outputs:
          - a list of (key tuple, count) tuples
        Raises:
          - AssertionError if sortcol or revorder hold any other value
        Tested via test harness
    """
    assert revorder in [True, False]
    assert sortcol in [0, 1]
    # items() exists on both python 2 and 3, unlike iteritems(); sorted()
    # builds a full list regardless, so nothing is lost by using it.
    return sorted(field_freq.items(),
                  key=operator.itemgetter(sortcol),
                  reverse=revorder)



def create_key(fields, columns):
    """ Build the frequency-dictionary key for one record.
        input:
          - fields  - one record, as a list of field strings
          - columns - offsets of the fields that make up the key, in the
                      order they should appear in it
        output:
          - a tuple holding the selected fields with surrounding whitespace
            stripped.
        Tested via test-harness.
    """
    return tuple(fields[position].strip() for position in columns)



def build_freq(files, dialect, columns, number, sampling_method, sampling_rate):
    """ Read the input and count how often each key occurs.
        Inputs:
            - files           - list of files to open (fileinput falls back to
                                stdin when the list is empty)
            - dialect         - csv dialect object, which must also carry a
                                'hasheader' attribute
            - columns         - list of columns to put into key
            - number          - max number of entries in frequency distribution dictionary
            - sampling_method - non or interval
            - sampling_rate   - for interval method: only records whose
                                position is a multiple of this are counted
        Outputs:
            - freq_dict       - freq distribution dict
            - truncated       - True/False boolean that indicates if the dict was truncated
        Notes:
            - blank lines are skipped and do not count as records
            - only the very first record read is treated as the header
        Tested via test-harness
    """
    read_cnt   = 0
    field_freq = {}
    truncated  = False
    try:
        for fields in csv.reader(fileinput.input(files), dialect):
            if not fields:
                # A blank line is not the end of the data: stopping here would
                # silently drop every later record and every later file.
                continue
            read_cnt += 1
            if read_cnt == 1 and dialect.hasheader:
                continue
            if sampling_method == 'interval' and read_cnt % sampling_rate != 0:
                continue
            key = create_key(fields, columns)
            if key in field_freq:
                field_freq[key] += 1
                continue
            field_freq[key] = 1
            if len(field_freq) >= number:
                truncated = True
                print('WARNING: freq dict is too large - will truncate')
                break
    finally:
        # Always release the module-level fileinput state - otherwise a
        # failure here leaves it active and the next fileinput.input() fails.
        fileinput.close()

    return field_freq, truncated



class ColumnLengthTracker(object):
    """ Remembers, for every key column, the longest value seen so far, and
        can shrink those widths to fit a total budget.
        Tested via test harness.
    """
    def __init__(self):
        # column position -> widest value length recorded for that column
        self.max_dict   = defaultdict(int)

    def add_val(self, col_num, value):
        """ Record the length of one column value, keeping it only if it beats
            the widest length already stored for that column.
            Inputs:
                 - col_num:  position of the column within the key
                 - value:    the column value itself
            Outputs:
                 - none:     the internal length dictionary is updated
        """
        width = len(value)
        if self.max_dict[col_num] < width:
            self.max_dict[col_num] = width

    def add_all_values(self, freq_list):
        """ Walk a frequency dictionary whose keys are tuples of 1-many parts
            and feed every part of every key to add_val, so that the widest
            length per column ends up stored.
        """
        for key_parts in freq_list:
            for position, part in enumerate(key_parts):
                self.add_val(position, part)

    def trunc_all_col_lengths(self, maxkeylen):
        """ Shave one character at a time off whichever column is currently
            the widest until the combined width fits within maxkeylen.
        """
        while self._get_tot_col_len() > maxkeylen:
            widest = max(self.max_dict, key=lambda col: self.max_dict[col])
            self.max_dict[widest] -= 1

    def _get_tot_col_len(self):
        """ Returns the combined width of all columns - the sum of the widest
            length recorded for each one.
        """
        return sum(self.max_dict.values())




def write_output(freq, output, col_len, write_limit):
    """ Write the formatted frequency rows to a file or to stdout.
        freq input - is a list of tuples, each tuple must consist of
        two entries: key & count.   Each key is also a tuple.
                 [ (tup1),
                   (tup2) ]
        output - is the user-provided output filename, or '-' for stdout
        col_len - is an object that has the max lengths of all columns
        write_limit - max number of rows to write; 0 (or None) means no limit
    """
    to_stdout = (output == '-')
    outfile   = sys.stdout if to_stdout else open(output, 'w')

    try:
        for write_cnt, key_tup in enumerate(freq, 1):
            if write_limit and write_cnt > write_limit:
                break
            outfile.write(create_output_row(key_tup, col_len))
    finally:
        # Close the file even if formatting or writing fails - but never
        # close stdout, which this function does not own.
        if not to_stdout:
            outfile.close()


def create_output_row(freq_tup, col_len):
    """ Format one frequency entry as a line of output.
        input:
           - freq_tup - a two-entry tuple of key & count, where the key is
             itself a tuple of column values, for example:
                  (('key1a', 'key1b', 'key1c'), 5   )
           - col_len - is an object that has the max lengths of all columns
        output:
           - the complete row, newline included: every key part padded and
             clipped to its column width, followed by the count
        tested via test-harness
    """
    widths = col_len.max_dict
    # the same width is used as both minimum (padding) and maximum (clipping)
    cells  = [' %-*.*s  - ' % (widths[position], widths[position], part)
              for position, part in enumerate(freq_tup[0])]
    return '%s %s\n' % (''.join(cells), freq_tup[1])



class CommandLineParser(object):
    """ Manages all parsing, formatting and validating of command line options
        and args.

        After construction:
          - opts  - holds the parsed options, with opts.columns converted
                    into a list of integer column offsets
          - files - holds the list of positional args (input file names)
        Any invalid option ends the program through the parser's error().
        Tested via test-harness.
    """

    def __init__(self, argv=None):
        """ Parse and validate the command line.
            Inputs:
              - argv - optional list of arguments to parse instead of the
                       process's own command line (the default).
        """
        use = ('%prog is used to print a frequency distribution of one or more '
               'columns from the input file: \n'
               '\n'
               '   %prog [file] [misc options] '
               '\n')
        self.parser = optparse.OptionParser(usage = use)

        self._define_parsers()
        self.opts, self.files = self.parser.parse_args(argv)

        if self.opts.long_help:
            print(__doc__)
            sys.exit(0)

        # _validate_misc must run before _format_columns: it checks that the
        # raw columns string was provided at all.
        self._validate_sampling()
        self._validate_misc()
        self._format_columns()


    def _define_parsers(self):
        """ Define all cmdline parsers.
        """
        self.parser.add_option('--long-help',
               default=False,
               action='store_true',
               help='Print verbose help and exit.')

        self.parser.add_option('-c', '--columns',
               type='string',       # could be -c 1 or -c 3,10 or -c "15,20,21"
               help='one or many comma-separated column numbers to analyze')

        self.parser.add_option('-n', '--number',
               type=int,
               default=10000000,    # 10 million
               help='Specifies the max number of items in frequency dictionary.  '
                    'Default is 10 million - expressed as 10000000.  This results '
                    'in approx 300 MBytes of memory used to store 10 million 20 byte '
                    'keys.')

        self.parser.add_option('--maxkeylen',
               default=50,
               type=int,
               help='Specifies the maximum length of the key when written, not '
                    'when the data is collected.  The default is 50 characters.')

        self.parser.add_option('--writelimit',
               type=int,
               default=0,
               help='Specifies a limit to the number of rows written. '
                    'The default of 0 indicates no limit.')

        self.parser.add_option('--sortcol',
               type=int,
               default=1,
               #choices=[0,1],
               help='Specifies which column to sort by - 0 (key) or 1 (count). '
                    'The default is 1 (count). ')

        self.parser.add_option('--sortorder',
               default='reverse',
               choices=['forward','reverse'],
               help='Specifies whether to sort the column in forward or reverse'
                    ' order. The default is reverse')

        self.parser.add_option('--sampling_method',
               default='non',
               choices=['non','interval'],
               help='Specifies method of sampling to use.  Choices are non'
                    ' or interval.  Default is non.')

        self.parser.add_option('--sampling_rate',
               default=None,
               type='float',
               help='Specifies which to use in frequency distribution. If the '
                    'method is interval, then this number describes how many '
                    'records to skip between each one to use.')

        self.parser.add_option('-o', '--output',
               default='-',
               help="Specifies the output file, defaults to stdout indicated by '-'.")

        self.parser.add_option('-q', '--quiet',
               action='store_false',
               dest='verbose',
               default=True,
               help='provides less detail')

        self.parser.add_option('-v', '--verbose',
               action='store_true',
               dest='verbose',
               default=True,
               help='provides more detail')

        self.parser.add_option('-d', '--delimiter',
               help=('Specify a quoted single-column field delimiter. This may be '
                     'determined automatically by the program - unless you pipe the '
                     'data in.'))

        self.parser.add_option('--quoting',
               default=False,
               #choices=[True,False],
               action='store_true',
               help='Specify field quoting - generally only used for stdin data.'
                    '  The default is False.')

        self.parser.add_option('--quotechar',
               default='"',
               help='Specify field quoting character - generally only used for '
                    'stdin data.  Default is double-quote')

        self.parser.add_option('--recdelimiter',
               help='Specify quoted end-of-record delimiter.')

        self.parser.add_option('--hasheader',
               default=False,
               action='store_true',
               help='indicates that there is a header in the file.')

    def _format_columns(self):
        """ converts input comma-delimited string into a list of integers,
            replacing opts.columns with that list.
        """
        valid = []
        if self.opts.columns:
            # split() on a value without commas simply yields that one value
            for arg in self.opts.columns.split(','):
                try:
                    valid.append(int(arg))
                except ValueError:
                    self.parser.error('Columns must consist only of comma-separated integers'
                                     ' that refer to column offsets in source file')
        self.opts.columns   = valid

    def _validate_misc(self):
        """  Perform validations on all columns with simple needs.
        """
        if self.opts.number < 1000:
            self.parser.error('Please provided a value between 1001 and 1000000000 for number')
        if not self.opts.columns:
            self.parser.error('Please identify the columns to analyze')

        # The dialect can only be detected automatically for exactly one
        # input file - stdin (no files) and multiple files need a delimiter.
        if len(self.files) != 1 and not self.opts.delimiter:
            self.parser.error('Please provide delimiter when piping data into program '
                         'via stdin or reading multiple input files')

        if self.opts.sortcol not in [0,1]:
            self.parser.error('Please provide a sortcol value of 0 or 1')

        # Enforce the range the message promises.  The former test,
        # "1 > maxkeylen > 500", could never be true, so nothing was rejected.
        if not 0 < self.opts.maxkeylen < 500:
            self.parser.error('Please provide a maxkeylen > 0 and < 500')


    def _validate_sampling(self):
        """ Perform validation on the two sampling options: a rate is only
            allowed, and is then required, with the interval method - where it
            must be a whole number.
        """
        if self.opts.sampling_method == 'non':
            if self.opts.sampling_rate:
                self.parser.error('Sampling_rate must be left to default if sampling_method is "non"')
        elif self.opts.sampling_method == 'interval':
            if self.opts.sampling_rate:
                if self.opts.sampling_rate != int(self.opts.sampling_rate):
                    self.parser.error('Inverval sampling rate must be an integer')
            else:
                self.parser.error('Integer sampling_rate must be provided if sampling_method == "interval"')



# Script entry point: the process exit status is main()'s return value, which
# is 1 when the frequency dictionary had to be truncated and 0 otherwise.
if __name__ == '__main__':
    sys.exit(main())

