#!/usr/bin/env python
""" Used to identify characteristics of a file


    This source code is protected by the BSD license.  See the file "LICENSE"
    in the source code root directory for the full language or refer to it here:
    http://opensource.org/licenses/BSD-3-Clause
    Copyright 2011,2012,2013 Ken Farmer
"""

#--- standard modules ------------------
from __future__ import division
import sys
import os
import optparse
import errno

#--- gristle modules -------------------
# lets get pathing set for running code out of project structure & testing it via tox
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import gristle.file_type           as file_type
import gristle.field_determinator  as field_determinator


#import pprint as pp
#from pprint import pprint as pp
#pp(sys.path)


#------------------------------------------------------------------------------
# Command-line section 
#------------------------------------------------------------------------------
def main():
    """ Allows users to directly call determinator from command line

        Determines the structure of the single input file and writes those
        results.  Unless --brief was requested it then analyzes the fields
        and writes those results as well.

        Returns:
            0 on success, or errno.ENODATA if analyze_file() reports that
            the input file is empty.
        Raises:
            ValueError: if a --types override references a column number
            that does not exist in the file.
    """
    (opts, files) = get_opts_and_args()

    # turn off verbose if parsable output is going to stdout
    if opts.outputformat == 'parsable' and not opts.output:
        opts.verbose = False

    out_writer = OutputWriter(output_filename=opts.output,
                              output_format=opts.outputformat)

    # The writer may hold an open output file: make sure it is terminated on
    # every exit path (empty file, --brief, normal completion or an error).
    try:
        my_file = file_type.FileTyper(files[0],
                                      opts.delimiter,
                                      opts.recdelimiter,
                                      opts.hasheader)
        try:
            my_file.analyze_file()
        except file_type.IOErrorEmptyFile:
            return errno.ENODATA

        out_writer.write_file_results(my_file)

        if opts.brief:
            return 0

        # Get Analysis on ALL Fields:
        my_fields = field_determinator.FieldDeterminator(files[0],
                                      my_file.format_type,
                                      my_file.field_cnt,
                                      my_file.dialect.has_header,
                                      my_file.dialect,
                                      my_file.dialect.delimiter,
                                      opts.recdelimiter,
                                      opts.verbose)

        # An explicit check rather than an assert: asserts are removed when
        # python runs with -O, which would silently skip this validation.
        if (opts.column_type_overrides
        and max(opts.column_type_overrides) >= my_file.field_cnt):
            raise ValueError("ERROR: column_type_override references "
                             "non-existing column_number")

        my_fields.analyze_fields(opts.column_number,
                                 opts.column_type_overrides,
                                 opts.number)

        out_writer.write_field_results(my_fields, opts.column_number)

        return 0
    finally:
        out_writer.terminate()



class OutputWriter(object):
    """ Writes file-level and field-level analysis results to a file or to
        stdout, in one of two formats.

        Readable Format:
          Indented, left-aligned key/value lines separated by header lines.

        Parsable Format:
          One pipe-delimited record per value and no header lines.  Every
          element is wrapped in double quotes; the division, section,
          subsection and key are also lower-cased with spaces replaced by
          underscores:
            [division]|[section]|[subsection]|[key]|[value]
            "file_analysis_results"|"main"|"main"|"format_type"|"csv"
            "file_analysis_results"|"main"|"main"|"field_count"|"4"
            "field_analysis_results"|"field_0"|"main"|"name"|"station_id"
            "field_analysis_results"|"field_0"|"top_values"|"blue"|"57"

        In both formats keys are truncated to key_width characters and
        values to 30 characters.
    """

    def __init__(self, output_filename, output_format):
        """ Opens the output destination.

            output_filename - file to (over)write; if empty or None then
                              output goes to sys.stdout
            output_format   - 'readable' or 'parsable'
        """
        self.output_filename = output_filename
        self.output_format   = output_format
        assert(self.output_format in ['readable','parsable'])

        if self.output_filename:
            self.outfile       = open(self.output_filename, 'w')
            self._owns_outfile = True
        else:
            self.outfile       = sys.stdout
            self._owns_outfile = False   # stdout belongs to the process

        # current position within the parsable hierarchy; division is set by
        # write_file_results() / write_field_results() before anything is
        # written
        self.division   = None
        self.section    = None
        self.subsection = None

    def terminate(self):
        """ Releases the output destination.

            A file opened by this writer is closed.  sys.stdout is only
            flushed: closing it would break any later print or write made
            by the rest of the process.
        """
        if not self.outfile:
            return
        if self._owns_outfile:
            self.outfile.close()
        else:
            self.outfile.flush()

    def write_file_results(self, my_file):
        """ Writes the file-level results (format, counts & csv dialect).

            my_file - the analyzed file object; its format_type, field_cnt,
                      record_cnt, csv_quoting and dialect attributes are
                      read here.
            quotechar and lineterminator are only written in the readable
            format.
        """
        self.write_header()
        self.write_header('File Analysis Results:')
        self.division   = 'File Analysis Results'
        self.section    = 'main'
        self.subsection = 'main'

        self.write_string('format type',      my_file.format_type)
        self.write_string('field count',      my_file.field_cnt)
        self.write_string('record count',     my_file.record_cnt)
        self.write_string('hasheader',        my_file.dialect.has_header)

        # NOTE(review): strip() also removes tabs and other whitespace, so a
        # tab delimiter is reported as '[space]' as well - confirm whether
        # that is intended.
        stripped_delimiter = my_file.dialect.delimiter.strip()
        if stripped_delimiter == '':
            shown_delimiter = '[space]'
        elif stripped_delimiter == '|':
            # quoted so that it stands out from the parsable field separator
            shown_delimiter = "'|'"
        else:
            shown_delimiter = my_file.dialect.delimiter
        self.write_string('delimiter',        shown_delimiter)

        self.write_string('csv quoting',      my_file.csv_quoting)
        self.write_string('skipinitialspace', my_file.dialect.skipinitialspace)
        self.write_string('quoting',          file_type.get_quote_name(my_file.dialect.quoting))
        self.write_string('doublequote',      my_file.dialect.doublequote)
        if self.output_format == 'readable':
            self.write_string('quotechar',        my_file.dialect.quotechar)
            self.write_string('lineterminator',   my_file.dialect.lineterminator, use_repr=True)
        self.write_string('escapechar',       my_file.dialect.escapechar)
        self.write_header()


    def write_field_results(self, my_fields, column_number):
        """ Writes the analysis results for each field.

            my_fields     - the analyzed fields object
            column_number - if not None, only the field with this
                            zero-offset number is written
        """
        self.write_header()
        self.write_header('Field Analysis Results')
        self.division   = 'Field Analysis Results'
        for sub in range(my_fields.field_cnt):
            self.section    = 'field_%d' % sub
            self.subsection = 'main'
            if (column_number is not None
            and sub != column_number):
                continue
            self.write_header()
            self.write_header('------------------------------------------------------', indent=6)
            self.write_string('Name',             my_fields.field_names[sub], indent=4)
            self.write_string('Field Number',     sub, indent=4)
            self.write_string('Wrong Field Cnt',  my_fields.field_rows_invalid[sub], indent=4)
            if my_fields.field_trunc[sub]:
                # The message is passed as the key, so it needs a key_width
                # of its full length (40) - the default of 20 cut it off at
                # 'Data Truncated: anal'.
                self.write_string('Data Truncated: analysis will be partial',
                                  indent=4, key_width=40)

            self.write_string('Type',             my_fields.field_types[sub], indent=4)
            self.write_string('Min',              my_fields.field_min[sub], indent=4)
            self.write_string('Max',              my_fields.field_max[sub], indent=4)
            self.write_string('Unique Values',    len(my_fields.field_freqs[sub]), indent=4)
            self.write_string('Known Values',     len(my_fields.get_known_values(sub)), indent=4)

            if my_fields.field_types[sub] in ('integer','float'):
                self.write_string('Mean',     my_fields.field_mean[sub], indent=4)
                self.write_string('Median',   my_fields.field_median[sub], indent=4)
                self.write_string('Variance', my_fields.variance[sub], indent=4)
                self.write_string('Std Dev',  my_fields.stddev[sub], indent=4)
            elif my_fields.field_types[sub] == 'string':
                self.write_string('Case',        my_fields.field_case[sub], indent=4)
                self.write_string('Min Length',  my_fields.field_min_length[sub], indent=4)
                self.write_string('Max Length',  my_fields.field_max_length[sub], indent=4)
                self.write_string('Mean Length', my_fields.field_mean_length[sub], indent=4)

            self.write_field_freq(my_fields, sub)

    def write_field_freq(self, my_fields, col_no):
        """ Writes up to 10 of the top values of one field, with counts.

            Nothing is written if the field has no frequency data, or if
            no top values are returned for it.
        """
        key_sub = 0
        val_sub = 1
        self.subsection = 'top_values'
        if my_fields.field_freqs[col_no] is None:
            return

        sorted_list = my_fields.get_top_freq_values(col_no, limit=10)
        if not sorted_list:
            # no values at all: indexing the first entry would raise
            return

        # presumably the list is ordered most-frequent first, so a count of 1
        # on the first entry means every value is unique - verify against
        # get_top_freq_values()
        if sorted_list[0][val_sub] == 1:
            # only the first 20 characters ('Top Values not shown') are
            # written, because of the default key_width
            self.write_string('Top Values not shown - all values are unique', indent=4)
            return

        readable = (self.output_format == 'readable')
        if readable:
            self.write_string('Top Values', indent=4)
        for pair in sorted_list:
            if readable:
                count_text = 'x %d occurances' % pair[val_sub]
            else:
                count_text = '%d' % pair[val_sub]
            self.write_string(pair[key_sub], count_text, indent=8, key_width=30)


    def write_header(self, val='', indent=0):
        """ Writes val on its own line, preceded by indent spaces, in the
            readable format.  Writes nothing in the parsable format.
        """
        if self.output_format == 'readable':
            self.outfile.write('{1:<{2}}{0}\n'.format(val, '', indent))
        elif self.output_format == 'parsable':
            pass

    def write_string(self, key, value=' ', indent=0, key_width=20, use_repr=False):
        """ Writes a single key/value line in the current output format.

            key       - label; truncated to key_width characters
            value     - converted with str() and truncated to 30 characters
            indent    - extra leading spaces (readable format only)
            key_width - width the key is truncated and padded to
            use_repr  - readable format only: write repr() of the value,
                        which makes characters such as newlines visible
        """
        # NOTE(review): _parsify() wraps its result in double quotes, so it
        # can never equal the unquoted 'top_values' and this test is always
        # true for the parsable format - top-value keys are therefore
        # lower-cased and quoted like every other key.  Confirm whether
        # top-value keys were meant to be written unmodified.
        if (self.output_format == 'parsable'
        and self._parsify(self.subsection) != 'top_values'):
            trunc_key   = self._parsify(key[:key_width])
            trunc_value = '"%s"' % str(value)[:30]
        else:
            trunc_key   = key[:key_width]
            trunc_value = str(value)[:30]

        if self.output_format == 'readable':
            if use_repr:
                template = '  {0:<{1}}{2:<{3}}{4!r}\n'
            else:
                template = '  {0:<{1}}{2:<{3}}{4}\n'
            # the empty first argument is padded out to produce the indent
            self.outfile.write(template.format('', indent, trunc_key, key_width, trunc_value))
        elif self.output_format == 'parsable':
            record = (self._parsify(self.division),
                      self._parsify(self.section),
                      self._parsify(self.subsection),
                      trunc_key,
                      trunc_value)
            self.outfile.write('{0}|{1}|{2}|{3}|{4}\n'.format(*record))

    def _parsify(self, val):
        """ Returns val lower-cased, with spaces replaced by underscores,
            and wrapped in double quotes.
        """
        return '"%s"' % val.lower().replace(' ', '_')


# TODO list for the parsable output format (original author's notes):
#   - remove the "x 4 occurances" suffix from top-value lines
#   - lower case everything
#   - replace spaces with '_' in section, subsection
#   - print more freq dist results
#   - fix test harness!


def print_field_info(my_fields, column_number, outfile):
    """ Prints information about each field within the file.

        NOTE: this function currently does nothing - everything after this
        docstring is commented out, so a call returns None without writing
        anything to outfile.  OutputWriter.write_field_results() in this
        file writes the corresponding field-level output.
        NOTE(review): appears to be kept for reference only - confirm that
        nothing else calls it before removing it.
    """
    #w = outfile.write
    #write_header(outfile)
    #write_header(outfile, 'Fields Analysis Results:')
    #for sub in range(my_fields.field_cnt):
    #    if column_number is not None \
    #    and sub != column_number:
    #        continue

#        write_header(outfile)
#        w('      ------------------------------------------------------\n')
#        w('      Name:             %-20s \n' %  my_fields.field_names[sub])
#        w('      Field Number:     %-20s \n' %  sub)
#        w('      Wrong Field Cnt:  %-20s \n' %  my_fields.field_rows_invalid[sub])
#        if my_fields.field_trunc[sub]:
#            w('      Data Truncated: analysis will be partial\n')
#
#        w('      Type:             %-20s \n' %  my_fields.field_types[sub])
#        w('      Min:              %-20s \n' %  my_fields.field_min[sub])
#        w('      Max:              %-20s \n' %  my_fields.field_max[sub])
#        w('      Unique Values:    %-20d \n' %  len(my_fields.field_freqs[sub]))
#        w('      Known Values:     %-20d \n' %  len(my_fields.get_known_values(sub)))
#
#        if my_fields.field_types[sub] in ('integer','float'):
#            w('      Mean:             %-20s \n' % my_fields.field_mean[sub])
#            w('      Median:           %-20s \n' % my_fields.field_median[sub])
#            w('      Variance:         %-20s \n' % my_fields.variance[sub])
#            w('      Std Dev:          %-20s \n' % my_fields.stddev[sub])
#        elif my_fields.field_types[sub] == 'string':
#            w('      Case:             %-20s \n' %   my_fields.field_case[sub])
#            w('      Min Length:       %-20s \n' %   my_fields.field_min_length[sub])
#            w('      Max Length:       %-20s \n' %   my_fields.field_max_length[sub])
#            w('      Mean Length:      %-20.2f\n' %  my_fields.field_mean_length[sub])

#        key_sub = 0
#        val_sub = 1
#        if my_fields.field_freqs[sub] is not None:
#            sorted_list = my_fields.get_top_freq_values(sub, limit=10)
#            if sorted_list[key_sub][val_sub] == 1:
#                w('      Top Values not shown - all values are unique\n')
#            else:
#                w('      Top Values: \n')
#                for pair in sorted_list:
#                    w('         %-40s x %d occurrences\n' % \
#                          ( pair[key_sub], pair[val_sub]))



def get_opts_and_args():
    """ gets opts & args and returns them
        run program with -h or --help for command line args

        Returns:
            (opts, files) where opts is the optparse values object, with an
            added column_type_overrides dict of {column number: type name}
            built from the --types option, and files is a list holding the
            single input filename.
        Any invalid input is reported through parser.error(), which prints
        the message and exits the program.
    """
    use = ("%prog determines file structure then analyzes contents of each "
           "column.\n"
           "Once complete it then prints the results for the user\n"
           "\n"
           "Usage: %prog [file] [misc options]"
           "\n")
    parser = optparse.OptionParser(usage = use)
    parser.add_option('-o', '--output',
           help='Specify output file.  Default is stdout.')
    parser.add_option('--outputformat',
           choices=['readable','parsable'],
           default='readable',
           help='describe output format')
    # NOTE(review): with default=True and action='store_true' this flag has
    # no effect on its own; verbose is only ever turned off by main() when
    # parsable output goes to stdout - confirm that this is intended.
    parser.add_option('-v', '--verbose',
           action='store_true',
           dest='verbose',
           default=True,
           help='provides more detail')
    parser.add_option('-b', '--brief',
           action='store_true',
           dest='brief',
           default=False,
           help='skips field-level analysis')
    parser.add_option('-c', '--column',
           type=int,
           dest='column_number',
           help=('Restrict analysis to a single column (field number)'
                 ' - using a zero-offset'))
    parser.add_option('-n', '--number',
           type=int,
           help='Specify a maximum number of entries for freq dictionary. '
                 'This is applied separately to each column.  The default is'
                 ' set at approximately 1 million entries. ')
    parser.add_option('-d', '--delimiter',
           help=('Specify a quoted field delimiter. '
                 'This is essential for multi-column delimiters.'))
    parser.add_option('--recdelimiter',
           help='Specify a quoted end-of-record delimiter. ')
    parser.add_option('--hasheader',
           default=False,
           action='store_true',
           help='Indicates that there is a header in the file. ')
    parser.add_option('-T', '--types',
           type='string',
           dest='column_types',
           help=('Allows manual specification of field types: integer, float, '
                 'string, timestamp. Use format: "colno:type, colno:type, '
                 ' colno:type"'))

    (opts, files) = parser.parse_args()

    # validate opts
    if len(files) == 0:
        parser.error("no filename was provided")
    elif len(files) > 1:
        parser.error("multiple files not yet supported")
    elif not os.path.exists(files[0]):
        parser.error("filename %s could not be accessed" % files[0])

    # compare with None: column 0 is a valid column number but is falsy
    if opts.brief and opts.column_number is not None:
        parser.error('must not specify both brevity and column number')

    if (opts.number is not None
    and  opts.number < 1000):
        # the message now matches the check, which accepts 1000 itself
        parser.error('please specify a number of at least 1000')

    # set up column_type_overrides
    valid_types = ('integer', 'float', 'string', 'timestamp')
    opts.column_type_overrides = {}
    if opts.column_types:
        for col_type_pair in opts.column_types.split(','):
            parts = col_type_pair.split(':')
            if len(parts) != 2:
                parser.error('invalid format for types option')

            # The documented format puts a space after each comma, so
            # surrounding whitespace is tolerated on both parts.
            col_type = parts[1].strip()
            try:
                col_no = int(parts[0].strip())
            except ValueError:
                parser.error('invalid column number for types option')
            if col_no < 0:
                # column numbers use a zero-offset; negatives cannot exist
                parser.error('invalid column number for types option')
            if col_type not in valid_types:
                parser.error('invalid type for types option')

            opts.column_type_overrides[col_no] = col_type

    return opts, files



if __name__ == '__main__':
    # run as a script: hand main()'s return value to the shell as the
    # process exit status
    exit_status = main()
    sys.exit(exit_status)

