#!/usr/bin/env python
"""
gristle_scalar is used to perform a single scalar operation on one column
within an input file.

Potential scalar operations include Sum, AVG, Min, Max, Freq, and CountDistinct

Usage:
   $ gristle_scalar -v -f [file] [misc options]

Example:
   $ gristle_scalar ../data/state_crime.csv -c 2 -t float -a avg
   $ 23045.79


Options:
  -h, --help            show this help message and exit
  --long-help           Print more verbose help
  -o OUTPUT, --output=OUTPUT
                        Specifies output file.  Default is stdout.
  -c COLUMN_NUMBER, --column=COLUMN_NUMBER
                        Specifies the column to perform the operation on -
                        based on a zero offset (ie, the first col is 0)
  -t COLUMN_TYPE, --column_type=COLUMN_TYPE
                        column type:  integer, float or string
  -a ACTION, --action=ACTION
                        scalar action to be performed:  min, max, avg, sum,
                        freq, countdistinct
  -d DELIMITER, --delimiter=DELIMITER
                        Specify a quoted single-column field delimiter. This
                        may be determined automatically by the program - unless
                        you pipe the data in. Default is comma.
  --quoting=QUOTING     Specify field quoting - generally only used for stdin
                        data.  The default is False.
  --quotechar=QUOTECHAR
                        Specify field quoting character - generally only used
                        for stdin data.  Default is double-quote
  --hasheader           indicates that there is a header in the file.
  --recdelimiter=RECDELIMITER


Limitations:
  - Can only handle csv files
  - Can only process a single column
  - Does not check on max size for countdistinct or freq operations - so
    very large files could run out of memory.
  - Can only process a single file

To do:
  - Eliminate col type arg or at least automate some of it
  - Add max dictionary checks for freq & countdistinct operations
  - Add actions: stddev & count
  - Add actions: countknown & countunknown
  - Add ability to process multiple columns simultaneously
  - Improve design of how actions run and how intermediate data is stored
  - Improve msg if user provides no args and tell about -h

This source code is protected by the BSD license.  See the file "LICENSE"
in the source code root directory for the full license text or refer to it here:
   http://opensource.org/licenses/BSD-3-Clause
Copyright 2011,2012,2013 Ken Farmer
"""

#--- standard modules ------------------
from __future__ import division
import sys
import optparse
import csv
import collections
import fileinput
import os
#from pprint import pprint as pp

#Ignore SIG_PIPE and don't throw exceptions on it... (http://docs.python.org/library/signal.html)
# Without this a downstream `head` or broken pipe would raise IOError.
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE,SIG_DFL)

#--- gristle modules -------------------
# lets get pathing set for running code out of project structure & testing it via tox
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import gristle.file_type           as file_type 

#--- global variables ------------------
# Intermediate results accumulated by process_value() and read by main():
#   temp_value - running min/max/sum for the min, max, sum and avg actions
#   temp_dict  - per-value occurrence counts for freq and countdistinct
temp_value = None
temp_dict  = collections.defaultdict(int)


def main():
    """ runs all processes:
            - gets opts & args
            - analyzes file to determine csv characteristics
            - runs each input record through process_value to get output
            - writes records

        Returns None (so the script exits with rc 0 on success); exits
        with rc 1 if the input file is empty.
    """
    (opts, files) = get_opts_and_args()

    if len(files) == 1:
        # single named input file - its csv dialect can be sniffed directly
        my_file       = file_type.FileTyper(files[0],
                                       opts.delimiter,
                                       opts.recdelimiter,
                                       opts.hasheader)
        try:
            my_file.analyze_file()
        except file_type.IOErrorEmptyFile:
            # empty input - nothing to process
            sys.exit(1)
        dialect       = my_file.dialect
    else:
        # dialect parameters needed for stdin - since the normal code can't
        # analyze this data.
        # NOTE(review): this mutates the csv.Dialect *class* rather than an
        # instance/subclass, and opts.quoting is a string (or False) rather
        # than a csv.QUOTE_* constant - confirm this works for stdin input.
        dialect                = csv.Dialect
        dialect.delimiter      = opts.delimiter
        dialect.quoting        = opts.quoting
        dialect.quotechar      = opts.quotechar
        dialect.lineterminator = '\n'                 # naive assumption

    if opts.output:
        outfile = open(opts.output, 'w')
    else:
        outfile = sys.stdout

    # accumulate results into the module-level globals via process_value()
    rec_cnt = 0
    for rec in csv.reader(fileinput.input(files), dialect):
        rec_cnt      += 1
        if (opts.hasheader 
            and rec_cnt == 1): 
            continue
        else:
            converted_column = type_converter(rec[opts.column_number], 
                                              opts.column_type)
            process_value(converted_column, opts.action)

    # don't count the header record as processed data
    if (opts.hasheader
    and rec_cnt > 0):
        process_cnt = rec_cnt - 1
    else:
        process_cnt = rec_cnt

    # write out the final scalar result (or per-key counts for freq)
    if opts.action in ['sum', 'min', 'max']:
        if temp_value is not None:
            outfile.write('%s\n' % str(temp_value))
    elif opts.action == 'avg':
        if temp_value is not None:
            # NOTE(review): process_cnt includes rows whose value failed
            # type conversion (None in process_value), so the denominator
            # may exceed the number of summed values - confirm intended.
            outfile.write('%s\n' % str(temp_value / process_cnt))
    elif opts.action == 'freq':
        if temp_dict:
            for key in temp_dict:
                outfile.write('%s - %d\n' % (key, temp_dict[key]))
    elif opts.action == 'countdistinct':
        if temp_dict:
            outfile.write('%s\n' % len(temp_dict))

    fileinput.close()
    if opts.output:
        outfile.close()

    return 


def type_converter(value, column_type):
    """ Converts a single value to the type indicated.

        Inputs:
            - value:       raw field value from the csv row (or None)
            - column_type: 'integer', 'float', or anything else (treated
                           as string - value is returned unchanged)
        Returns:  the converted value, or None if conversion fails.
    """
    if column_type == 'integer':
        try:
            return int(value)
        except TypeError:             # catch None input
            return None
        except ValueError:            # catch empty & non-numeric strings
            # (bug fix: int('abc') raises ValueError, not TypeError -
            # previously this crashed instead of returning None)
            return None
    elif column_type == 'float':
        try:
            return float(value)
        except TypeError:             # catch None input
            return None
        except ValueError:            # catch empty & non-numeric strings
            return None
    else:
        return value



def process_value(value, action):
    """ Runs scalar action on a single row's column value.  Intermediate
        results are stored in the module-level globals temp_value
        (min/max/sum/avg) and temp_dict (freq/countdistinct) for now.

        Inputs:
            - value:  the column value, already run through type_converter.
                      May be None if the conversion failed.
            - action: one of min, max, avg, sum, freq, countdistinct
        Returns:  nothing
    """
    global temp_value

    if action == 'sum':
        # explicit None check rather than truthiness - otherwise a column
        # of zeros would never produce a sum of 0
        if value is not None:
            if temp_value is None:
                temp_value = value
            else:
                temp_value += value
    elif action == 'avg':
        # same accumulation as sum; main() divides by the record count
        if value is not None:
            if temp_value is None:
                temp_value = value
            else:
                temp_value += value
    elif action == 'min':
        # skip None (failed conversions) so it can't become the minimum
        if (value is not None
            and (temp_value is None or value < temp_value)):
            temp_value = value
    elif action == 'max':
        # symmetric with min: explicit None seed instead of relying on
        # None comparing low (which breaks outright on python3)
        if (value is not None
            and (temp_value is None or value > temp_value)):
            temp_value = value
    elif action in ('freq', 'countdistinct'):
        # both actions just need occurrence counts; main() decides
        # whether to print the counts or only the number of keys
        temp_dict[value] += 1
        



def get_opts_and_args():
    """ gets opts & args and returns them
        Input:
            - command line args & options
        Output:
            - opts dictionary
            - files list
        Raises SystemExit (via parser.error) on invalid combinations.
    """
    use = ("%prog is used to perform a single scalar operation on one column "
           "within an input file. \n "
           "Potential scalar operations include Sum, AVG, Min, Max, Freq, and"
           " CountDistinct"
           "\n"
           "   %prog -v -f [file] [misc options]"
           "   example:  %prog ../data/state_crime.csv -c 2 -t float -a avg"
           "\n")
    parser = optparse.OptionParser(usage = use)

    parser.add_option('--long-help',
           default=False,
           action='store_true',
           help='Print more verbose help')

    parser.add_option('-o', '--output',
           help='Specifies output file.  Default is stdout.')
    parser.add_option('-c', '--column',
           type=int,
           dest='column_number',
           help=('Specifies the column to perform the operation on - based '
                 'on a zero offset (ie, the first col is 0)'))
    parser.add_option('-t', '--column_type',
           choices=['integer', 'float', 'string'],
           help='column type:  integer, float or string')
    parser.add_option('-a', '--action',
           choices=['min', 'max', 'avg', 'sum', 'freq', 'countdistinct'],
           help=('scalar action to be performed:  min, max, avg, sum, freq, '
                 'countdistinct'))

    # bug fix: the concatenated literals below previously lacked spaces and
    # rendered as "may bedetermined" / "pipe thedata in"
    parser.add_option('-d', '--delimiter',
           help=('Specify a quoted single-column field delimiter. This may '
                 'be determined automatically by the program - unless you '
                 'pipe the data in. Default is comma.'))
    parser.add_option('--quoting',
           default=False,
           help='Specify field quoting - generally only used for stdin data.'
                '  The default is False.')
    parser.add_option('--quotechar',
           default='"',
           help='Specify field quoting character - generally only used for '
                'stdin data.  Default is double-quote')
    parser.add_option('--hasheader',
           default=False,
           action='store_true',
           help='indicates that there is a header in the file.')
    parser.add_option('--recdelimiter')

    (opts, files) = parser.parse_args()

    if opts.long_help:
        # parenthesized form works identically on python2 and python3
        print(__doc__)
        sys.exit(0)

    # dialect auto-detection only works for a single named file - stdin or
    # multiple files require an explicit delimiter (de-duplicated check):
    if len(files) != 1 and not opts.delimiter:
        parser.error('Please provide delimiter when piping data into program '
                     'via stdin or reading multiple input files')

    # check required options first so the user gets the clearest message:
    if not opts.action:
        parser.error("action is a required option")
    if opts.column_number is None:
        parser.error("column is a required option")
    if not opts.column_type:
        parser.error("type is a required option")

    # numeric-only actions make no sense on string columns:
    if (opts.column_type == 'string'
    and opts.action not in ['min', 'max', 'freq', 'countdistinct']):
        parser.error('invalid action for string type')

    return opts, files



if __name__ == '__main__':
    # main() returns None, so sys.exit() yields rc 0 on success
    sys.exit(main())

