#!/usr/bin/env python
""" Creates a frequency distribution of values from columns of the input file
    and prints it out in columns - the first being the unique key and the last
    being the count of occurances.

    Examples:
       $ gristle_freaker sample.csv -d '|'  -c 0
                             Creates two columns from the input - the first with
                             unique keys from column 0, the second with a count
                             of how many times each exists.
       $ gristle_freaker sample.csv -d '|'  -c 0 --sortcol 1 --sortorder forward
         --writelimit 25
                             In addition to what was described in the first
                             example, this example adds sorting of the output
                             by count ascending and just prints the first 25
                             entries.
       $ gristle_freaker sample.csv -d '|'  -c 0 --sampling_rate 3
         --sampling_method interval
                             In addition to what was described in the first
                             example, this example adds a sampling in which it
                             only references every third record.
       $ gristle_freaker sample.csv -d '|'  -c 0,1
                             Creates three columns from the input - the first
                             two with unique key combinations from columns 0
                             & 1, the third with the number of times each
                             combination exists.
       $ gristle_freaker sample.csv -d '|'  -c -1
                             Creates two columns from the input - the first
                             with unique keys from the last column of the file
                             (negative numbers wrap), then a second with the
                             number of times each exists.
       $ gristle_freaker sample.csv -d '|'  --columntype all
                             Creates two columns from the input - all columns
                             combined into a key, then a second with the number
                             of times each combination exists.
       $ gristle_freaker sample.csv -d '|'  --columntype each
                              Unlike the other examples, this one performs a
                             separate analysis for every single column of the
                             file.  Each analysis produces three columns from
                             the input - the first is a column number, second
                             is a unique value from the column, and the third
                             is the number of times that value appeared.
                             This output is repeated for each column.
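
       Example output (approximate - key columns are padded to the longest
       value seen, capped by --maxkeylen, and followed by the count):
           Chicago  -  1234
           Denver   -  825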

    To do:
    - add sampling method of random
    - improve error msg if no input provided - tell user about -h

    This source code is protected by the BSD license.  See the file "LICENSE"
    in the source code root directory for the full license text or refer to it here:
       http://opensource.org/licenses/BSD-3-Clause
    Copyright 2011,2012,2013 Ken Farmer
"""

#--- standard modules ------------------
from __future__ import division
import sys
import optparse
import csv
from collections import defaultdict
import operator
import fileinput
import os
import errno
from pprint import pprint as pp      # just used for debugging

# Ignore SIG_PIPE and don't throw exceptions on it... 
# (http://docs.python.org/library/signal.html)
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE, SIG_DFL)


#--- gristle modules -------------------
# let's get the path set up for running code out of the project structure
# & for testing it via tox
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import gristle.file_type            as file_type




def main():
    """ Run the frequency distribution process in the following steps:
        - get cmdline options & determine dialect
        - build the freq distribution dictionary
        - calculate field lengths for write formatting
        - sort frequency distribution
        - write dictionary to output
    """

    #----- initialize -----
    cmd_parser   = CommandLineParser()
    opts         = cmd_parser.opts
    files        = cmd_parser.files
    dialect      = get_dialect(opts, files)
    return_code  = 0 # success

    # set up the list of column offsets to process.
    curr_col_set       = []
    if opts.col_type == 'each':
        curr_col_set.append(0)      # special op - multiple runs
    else:
        curr_col_set = opts.columns # normal op - single run

    # Run frequency processing in a loop - in case we have a columntype
    # of 'each' - which requires a separate run for every column starting
    # with the leftmost.  Other columntypes of 'specified' or 'all' will
    # only go through this loop once.
    while True:

        #----- build the frequency distribution dictionary -----
        col_freaker = ColFreaker(files,
                                 dialect,
                                 opts.col_type,
                                 opts.number,
                                 opts.sampling_method,
                                 opts.sampling_rate,
                                 opts.sortorder,
                                 opts.sortcol,
                                 opts.maxkeylen)
        col_freaker.create_freq_from_column_set(curr_col_set)

        #----- set up special column for 'each' columntype -----
        if opts.col_type == 'each':
            key_label = 'col:%03d - ' % curr_col_set[0]
        else:
            key_label = ''

        #----- write sorted list to output -----
        write_output(col_freaker.sort_freq,
                     opts.output,
                     col_freaker.col_len,
                     opts.writelimit,
                     key_label)


        #----- handle looping ----- 
        if col_freaker.truncated:
            return_code = errno.ENOMEM # too many unique values
            break
        elif opts.col_type != 'each' and col_freaker.read_cnt == 0:
            return_code = errno.ENODATA # empty file
            break
        elif col_freaker.sort_freq == []:
            break # columntype 'each' just ran out of cols
        elif opts.col_type == 'each':
            curr_col_set[0] += 1  # columntype each has data
        else:
            break # non-'each' columntype - only runs once

    return return_code



def get_dialect(opts, files):
    """ Determine the csv dialect properties for the input file(s).
        If there's only a single input file and the gristle_file_type module
        was imported - then use that to determine csv dialect.  Otherwise, rely
        on option input.
    """
    if (len(files) == 1 and file_type):
        my_file       = file_type.FileTyper(files[0],
                                            opts.delimiter,
                                            opts.recdelimiter,
                                            opts.hasheader)
        try:
            my_file.analyze_file()
        except file_type.IOErrorEmptyFile:
            # going to let this continue in order to handle messaging
            # thru main
            # all values are being set to defaults just to get csv reader to 
            # work - no actual data will be read
            dialect                = csv.Dialect
            dialect.quoting        = csv.QUOTE_NONE
            dialect.delimiter      = ','
            dialect.has_header     = False
            dialect.lineterminator = '\n'                 # naive assumption
        else:
            dialect                = my_file.dialect
    else:
        # dialect parameters needed for stdin - since the normal code can't
        # analyze this data.
        dialect                = csv.Dialect
        dialect.delimiter      = opts.delimiter
        dialect.quoting        = opts.quoting
        dialect.quotechar      = opts.quotechar
        dialect.has_header     = opts.hasheader
        dialect.lineterminator = '\n'                     # naive assumption
    return dialect






class ColFreaker(object):

    def __init__(self, files, dialect, col_type, number, sampling_method,
                 sampling_rate, sort_order, sort_col, max_key_len):
        self.files           = files
        self.dialect         = dialect
        self.col_type        = col_type
        self.number          = number
        self.sampling_method = sampling_method
        self.sampling_rate   = sampling_rate
        self.sort_order      = sort_order
        self.sort_col        = sort_col
        self.max_key_len     = max_key_len
        self.col_len         = None
        self.field_freq      = None
        self.sort_freq       = None
        self.truncated       = None
        self.read_cnt        = 0

    def create_freq_from_column_set(self, col_set):

        #----- build the frequency distribution -----
        self.build_freq(col_set)

        #----- sort the field_freq ------
        revorder = (self.sort_order == 'reverse')
        self.sort_freq = self._dict_sorter(self.field_freq, self.sort_col,
                                           revorder)

        #----- calculate field lengths for output formatting -----
        self.col_len = ColumnLengthTracker()
        self.col_len.add_all_values(self.sort_freq)
        self.col_len.trunc_all_col_lengths(self.max_key_len)


    def build_freq(self, columns):
        """ Inputs:
                - files           - list of files to open
                - dialect         - csv dialect object
                - columns         - list of columns to put into key
                - number          - max number of entries in frequency distribution dictionary
                - sampling_method - 'non' (no sampling) or 'interval'
                - sampling_rate   - for interval method
            Updates instance variables:
                - field_freq      - freq distribution dict - can be empty if the column
                                        doesn't exist in the file.  This happens when the
                                        user is running a freq on 'each' column.
                - truncated       - True/False boolean that indicates if the dict was truncated
                Tested via test-harness
        """

        self.read_cnt = 0
        freq_cnt   = 0
        self.field_freq = {}
        self.truncated  = False
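        # read every record, build a key from the requested columns, and count
        # occurrences - stopping early if the dict reaches self.number
        # distinct keys.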
        for record in csv.reader(fileinput.input(self.files), self.dialect):
            if not record:
                break
            self.read_cnt += 1
            if (self.read_cnt == 1 and self.dialect.has_header):
                continue
            if self.sampling_method == 'interval':
                if self.read_cnt % self.sampling_rate != 0:
                    continue
            key = self._create_key(record, columns)
            if key:    # skip records that yielded an empty key
                try:
                    self.field_freq[key] += 1
                except KeyError:
                    self.field_freq[key] = 1
                    freq_cnt += 1
                    if freq_cnt >= self.number:
                        self.truncated = True
                        # write the warning to stderr so it doesn't mix with data on stdout
                        print >> sys.stderr, 'WARNING: freq dict is too large - will truncate'
                        break
        fileinput.close()




    def _create_key(self, field_list, required_field_indexes):
        """ input:
                - field_list  - a single record in the form of a list of fields
                - required_field_indexes - identifies which fields to put into the output key
                output:
                - the required fields from field_list in tuple format.
                Tested via test-harness.
        """
        if self.col_type == 'all':
            return tuple(field_list)
        else:
            key = []
            for field_idx in required_field_indexes:
                try:
                    key.append(field_list[field_idx].strip())
                except IndexError:
                    break # probably ran past the last column with the 'each' columntype
            return tuple(key)


    def _dict_sorter(self, field_freq, sortcol, revorder):
        """ Inputs:
                - field_freq - a dictionary in which each key is a tuple of
                                column values and each value is a count.
                             - ex: {('8', 'A', 'B', 'C'): 1, ('30', 'A', 'B', 'C'): 1}
                - sortcol    - indicates which element to sort by - 0 (key) or 1 (count)
                - revorder   - True indicates sorting in reverse order, False otherwise.
            Output:
                - a list of tuples - in which each tuple contains 2 items:
                    - key - this is a tuple so it can handle multiple columns
                    - count - number of times the key occurs
                - ex: [(('8', 'A', 'B', 'C'), 1), (('30', 'A', 'B', 'C'), 1)]
            Tested via test harness
        """
        assert revorder in [True, False]
        assert sortcol in [0, 1]
        sort_freq = sorted(field_freq.iteritems(),
                        key=operator.itemgetter(sortcol),
                        reverse=revorder)
        return sort_freq




class ColumnLengthTracker(object):
    """ Tracks the maximum key length for each column.
        This is useful later when printing columns - in order to keep them
        of consistent widths for each record.
        Tested via test harness.
    """
    def __init__(self):
        self.max_dict   = defaultdict(int)

    def add_val(self, col_num, value):
        """ Update tracked max length for a given column if the new value
            is greater than the old value.
            Inputs:
                 - col_num:  is the position of the column within the key
                 - value:    is the actual column value
            Outputs:
                 - none:     it updates the internal object length dictionary
        """
        if len(value) > self.max_dict[col_num]:
            self.max_dict[col_num] = len(value)

    def add_all_values(self, freq_list):
        """ Step through a dictionary in which the key is a tuple of 1-many
            parts, and the value is the count.  For each part of the key call
            the add_val function to then store the max length.
        """
        for keyval_tup in freq_list:
            keys = keyval_tup[0]
            for index, key in enumerate(keys):
                self.add_val(index, key)

    def trunc_all_col_lengths(self, maxkeylen):
        """ Gradually reduces the length of columns, reducing the longest one
            before the shorter ones.
        """
        while (self._get_tot_col_len() - maxkeylen) > 0:
            max_key = max(self.max_dict, key=self.max_dict.get)
            self.max_dict[max_key] -= 1

    def _get_tot_col_len(self):
        """ Returns the total column length by adding together to the longest
            column value of each column.
        """
        return sum(self.max_dict.values())




def write_output(freq, output, col_len, write_limit, col_label=None):
    """ freq input - is a list of tuples, each tuple must consist of
        two entries: key & count.   Each key is also a tuple.
                 [ (tup1),
                   (tup2) ]
        output - is the user-provided output filename
        col_len - is an object that has the max lengths of all columns
    """
    if output == '-':
        outfile = sys.stdout
    else:
        outfile = open(output, 'a')

    write_cnt = 0
    for key_tup in freq:
        write_cnt += 1
        if write_limit and write_cnt > write_limit:
            break
        row = create_output_row(key_tup, col_len, col_label)
        outfile.write(row)

    # don't close stdout
    if output != '-':
        outfile.close()



def create_output_row(freq_tup, col_len, key_label):
    """ input:
           - freq_tup - is a tuple consisting of two entries: key & count.
             Each key is also a tuple.  So it looks like this:
                  (('key1a', 'key1b', 'key1c'), 5   )
           - col_len - is an object that has the max lengths of all columns
           - key_label - first column, only populated for the 'each' columntype
        output:
           - an entire row to be written as output
        tested via test-harness
    """
    key_part    = ''
    for colnum, key in enumerate(freq_tup[0]):
        key_string = ' %-*.*s  - ' % (col_len.max_dict[colnum],
                                      col_len.max_dict[colnum],
                                      key)
        key_part += key_string
    row = '%s%s %s\n' % (key_label, key_part, freq_tup[1])
    return row



class CommandLineParser(object):
    """ Manages all parsing, formating and validating of command line options
        and args.
        Tested via test-harness.
    """

    def __init__(self):
        use = ('%prog is used to print a frequency distribution of one or more '
               'columns from the input file: \n'
               '\n'
               '   %prog [file] [misc options] '
               '\n')
        self.parser = optparse.OptionParser(usage = use)

        self._define_parsers()
        self.opts, self.files = self.parser.parse_args()

        if self.opts.long_help:
            print __doc__
            sys.exit(0)

        self._validate_sampling()
        self._validate_misc()
        self._format_columns()


    def _define_parsers(self):
        """ Define all cmdline parsers.
        """
        self.parser.add_option('--long-help',
               default=False,
               action='store_true',
               help='Print verbose help and exit.')

        self.parser.add_option('-c', '--columns',
               type='string',       # could be -c 1 or -c 3,10 or -c "15,20,21"
               help="One or many comma-separated column numbers to analyze."
                    "Are only used with default columntype of 'specified'.")

        self.parser.add_option('--columntype',
               dest="col_type",
               #type='string',
               choices=['specified', 'all', 'each'],
               default='specified',
               help='Indicates the type of column processing:  '
                    'All == all columns are joined into a single concatenated '
                    'key.  '
                    'Each == each column is processed individually, and the '
                    'results are written to the same output file along with '
                    'a column number prefix.  '
                    'Specified == the typical use (and default) that indicates '
                    'that the user specified columns to operate upon.')

        self.parser.add_option('-n', '--number',
               type=int,
               default=10000000,    # 10 million
               help='Specifies the max number of items in frequency dictionary. '
                    'Default is 10 million - expressed as 10000000.  This '
                    'results in approx 300 MBytes of memory used to store 10 '
                    'million 20 byte keys.')

        self.parser.add_option('--maxkeylen',
               default=50,
               type=int,
               help='Specifies the maximum length of the key when written, not '
                    'when the data is collected.  The default is 50 chars.')

        self.parser.add_option('--writelimit',
               type=int,
               default=0,
               help='Specifies a limit to the number of rows written. '
                    'The default of 0 indicates no limit.')

        self.parser.add_option('--sortcol',
               type=int,
               default=1,
               #choices=[0,1],
               help='Specifies which column to sort by - 0 (key) or 1 (count). '
                    'The default is 1 (count). ')

        self.parser.add_option('--sortorder',
               default='reverse',
               choices=['forward','reverse'],
               help='Specifies whether to sort the column in forward or reverse'
                    ' order. The default is reverse')

        self.parser.add_option('--sampling_method',
               default='non',
               choices=['non','interval'],
               help='Specifies method of sampling to use.  Choices are non'
                    ' or interval.  Default is non.')

        self.parser.add_option('--sampling_rate',
               default=None,
               type='float',
               help='Specifies the sampling rate to use when building the '
                    'frequency distribution.  If the method is interval, then '
                    'only every Nth record is used, where N is this rate.')

        self.parser.add_option('-o', '--output',
               default='-',
               help="Specifies the output file, defaults to stdout indicated by"
                    " '-'.")

        self.parser.add_option('-q', '--quiet',
               action='store_false',
               dest='verbose',
               default=True,
               help='provides less detail')

        self.parser.add_option('-v', '--verbose',
               action='store_true',
               dest='verbose',
               default=True,
               help='provides more detail')

        self.parser.add_option('-d', '--delimiter',
               help=('Specify a quoted single-character field delimiter. This may '
                     'be determined automatically by the program - unless you '
                     'pipe the data in.'))

        self.parser.add_option('--quoting',
               default=False,
               #choices=[True,False],
               action='store_true',
               help='Specify field quoting - generally only used for stdin '
                    'data.  The default is False.')

        self.parser.add_option('--quotechar',
               default='"',
               help='Specify field quoting character - generally only used for '
                    'stdin data.  Default is double-quote')

        self.parser.add_option('--recdelimiter',
               help='Specify quoted end-of-record delimiter.')

        self.parser.add_option('--hasheader',
               dest='hasheader',
               default=None,
               action='store_true',
               help='indicates that there is a header in the file.')
        self.parser.add_option('--hasnoheader',
               dest='hasheader',
               default=None,
               action='store_false',
               help=('Indicates that there is no header in the file.  '
                     'Useful for overriding automatic csv dialect-guessing'
                     ' behavior.'))



    def _format_columns(self):
        """ Converts input commma-delimited string into a list of integers.
            Also validates columns vs columntype.
        """
        args  = []
        valid = []
        if self.opts.columns:
            if ',' in self.opts.columns:
                args = self.opts.columns.split(',')
            else:
                args.append(self.opts.columns)
        for arg in args:
            try:
                valid.append(int(arg))
            except ValueError:
                self.parser.error('Columns must consist only of comma-separated'
                                  ' integers that refer to column offsets in'
                                  ' source file')
        self.opts.columns   = valid


    def _validate_misc(self):
        """  Perform validations on all columns
        """
        if (self.opts.columns is not None
            and self.opts.col_type != 'specified'):
            self.parser.error("Columns can only be specified when columntype "
                              "is 'specified'.")
        elif (self.opts.columns is None
            and self.opts.col_type == 'specified'):
            self.parser.error("Columns must be provided for columntype of "
                              "'specified'.")

        if self.opts.number < 1000:
            self.parser.error('Please provide a value of at least 1000 for '
                              'number')

        if self.files:
            if len(self.files) > 1 and not self.opts.delimiter:
                self.parser.error('Please provide delimiter when piping data '
                                  'into program via stdin or reading multiple '
                                  'input files')
        else:   # stdin
            if not self.opts.delimiter:
                self.parser.error('Please provide delimiter when piping data '
                                  'into program via stdin or reading multiple '
                                  'input files')
        if self.opts.sortcol not in [0, 1]:
            self.parser.error('Please provide a sortcol value of 0 or 1')

        if not (0 < self.opts.maxkeylen < 500):
            self.parser.error('Please provide a maxkeylen > 0 and < 500')


    def _validate_sampling(self):
        """ Perform validation on the two sampling options.
        """
        if self.opts.sampling_method == 'non':
            if self.opts.sampling_rate:
                self.parser.error('Sampling_rate must be left to default if '
                                  'sampling_method is "non"')
        elif self.opts.sampling_method == 'interval':
            if self.opts.sampling_rate:
                if self.opts.sampling_rate != int(self.opts.sampling_rate):
                    self.parser.error('Interval sampling rate must be an int')
            else:
                self.parser.error('Integer sampling_rate must be provided if '
                                  'sampling_method == "interval"')




if __name__ == '__main__':
    sys.exit(main())

