#!/usr/bin/env python
"""
Gristle_determinator analyzes a file structure then analyzes contents of each column
within the file.  Once complete it then prints the results for the user.


Usage: gristle_determinator [file] [misc options]

    Options:
    -h, --help            show this help message and exit
    --long-help           show more verbose help
    -o OUTPUT, --output=OUTPUT
                          Specify output file.  Default is stdout.
    --outputformat        Choices include: readable, parsable, none with a
                          default of readable.
    -q, --quiet           provides less detail (not currently implemented)
    -v, --verbose         provides more detail
    -s, --silent          performs operation with no output (not currently
                          implemented)
    -b, --brief           skips field-level analysis
    -c COLUMN_NUMBER, --column=COLUMN_NUMBER
                          Restrict analysis to a single column (field number) -
                          using a zero-offset
    -n NUMBER, --number=NUMBER
                          Specify a maximum number of entries for freq
                          dictionary. This is applied separately to each column.
                          The default is set at approximately 1 million entries.
    -d DELIMITER, --delimiter=DELIMITER
                          Specify a quoted field delimiter. This is essential
                          for multi-column delimiters.
    --recdelimiter=RECDELIMITER
                          Specify a quoted end-of-record delimiter.
    --hasheader           Indicates that there is a header in the file.
    --hasnoheader         Indicates that there is no header in the file. Useful
                          for overriding automatic csv-guessing behavior.
    -T COLUMN_TYPES, --types=COLUMN_TYPES
                          Allows manual specification of field types: integer,
                          float, string, timestamp. Use format: "colno:type,
                          colno:type,  colno:type"
    --metadata            Indicates whether or not to write results into
                          metadata database
    --schemaid=SCHEMA_ID  Used with metadata outputformat to identify structure
                          set being analyzed.
    --collectionid=COLLECTION_ID
                          Used with metadata outputformat to identify structure
                          being analyzed.

Example #1 - here's using it to look at a linux passwd file.  In this case we're not providing the delimiter and quoting info - but will let it figure these out itself.  There's no header - so it doesn't know the names of the fields, but otherwise is getting everything correct.

$ gristle_determinator /etc/passwd

File Structure:
  format type:       csv
  field cnt:         7
  record cnt:        42
  has header:        False
  delimiter:         :
  csv quoting:       False
  skipinitialspace:  False
  quoting:           QUOTE_NONE
  doublequote:       False
  quotechar:         "
  lineterminator:    '\n'
  escapechar:        None

Field Analysis Progress:
   Analyzing field: 0
   Analyzing field: 1
   Analyzing field: 2
   Analyzing field: 3
   Analyzing field: 4
   Analyzing field: 5
   Analyzing field: 6

Fields Analysis Results:

      ------------------------------------------------------
      Name:             field_0
      Field Number:     0
      Wrong Field Cnt:  0
      Type:             string
      Min:              avahi
      Max:              www-data
      Unique Values:    42
      Known Values:     42
      Case:             lower
      Min Length:       2
      Max Length:       17
      Mean Length:      6.55
      Top Values not shown - all values are unique

      ------------------------------------------------------
      Name:             field_1
      Field Number:     1
      Wrong Field Cnt:  0
      Type:             string
      Min:              x
      Max:              x
      Unique Values:    1
      Known Values:     1
      Case:             lower
      Min Length:       1
      Max Length:       1
      Mean Length:      1.00
      Top Values:
         x                                        x 42 occurrences

      ------------------------------------------------------
      Name:             field_2
      Field Number:     2
      Wrong Field Cnt:  0
      Type:             integer
      Min:              0
      Max:              65534
      Unique Values:    42
      Known Values:     42
      Mean:             1777.23809524
      Median:           103.5
      Variance:         99268886.3719
      Std Dev:          9963.37725733
      Top Values not shown - all values are unique

      ------------------------------------------------------
      Name:             field_3
      Field Number:     3
      Wrong Field Cnt:  0
      Type:             integer
      Min:              0
      Max:              65534
      Unique Values:    38
      Known Values:     38
      Mean:             6450.35714286
      Median:           104.0
      Variance:         367584156.087
      Std Dev:          19172.4843483
      Top Values:
         65534                                    x 4 occurrences
         7                                        x 2 occurrences
         1000                                     x 1 occurrences
         1001                                     x 1 occurrences
         1002                                     x 1 occurrences
         1003                                     x 1 occurrences
         1004                                     x 1 occurrences
         1005                                     x 1 occurrences
         1006                                     x 1 occurrences
         123                                      x 1 occurrences

      < remaining 3 fields of output truncated for brevity >


Example #2 - same as the first, but this time writing the output to the "parsable" output format.

$ ./gristle_determinator /etc/passwd  --outputformat parsable

"file_analysis_results"|"main"|"main"|"format_type"|"csv"
"file_analysis_results"|"main"|"main"|"field_count"|"7"
"file_analysis_results"|"main"|"main"|"record_count"|"42"
"file_analysis_results"|"main"|"main"|"hasheader"|"False"
"file_analysis_results"|"main"|"main"|"delimiter"|":"
"file_analysis_results"|"main"|"main"|"csv_quoting"|"False"
"file_analysis_results"|"main"|"main"|"skipinitialspace"|"False"
"file_analysis_results"|"main"|"main"|"quoting"|"QUOTE_NONE"
"file_analysis_results"|"main"|"main"|"doublequote"|"False"
"file_analysis_results"|"main"|"main"|"escapechar"|"None"
"field_analysis_results"|"field_0"|"main"|"name"|"field_0"
"field_analysis_results"|"field_0"|"main"|"field_number"|"0"
"field_analysis_results"|"field_0"|"main"|"wrong_field_cnt"|"0"
"field_analysis_results"|"field_0"|"main"|"type"|"string"
"field_analysis_results"|"field_0"|"main"|"min"|"avahi"
"field_analysis_results"|"field_0"|"main"|"max"|"www-data"
"field_analysis_results"|"field_0"|"main"|"unique_values"|"42"
"field_analysis_results"|"field_0"|"main"|"known_values"|"42"
"field_analysis_results"|"field_0"|"main"|"case"|"lower"
"field_analysis_results"|"field_0"|"main"|"min_length"|"2"
"field_analysis_results"|"field_0"|"main"|"max_length"|"17"
"field_analysis_results"|"field_0"|"main"|"mean_length"|"6.54761904762"
"field_analysis_results"|"field_0"|"top_values"|"top_values_not_shown"|" "
"field_analysis_results"|"field_1"|"main"|"name"|"field_1"
"field_analysis_results"|"field_1"|"main"|"field_number"|"1"
"field_analysis_results"|"field_1"|"main"|"wrong_field_cnt"|"0"
"field_analysis_results"|"field_1"|"main"|"type"|"string"
"field_analysis_results"|"field_1"|"main"|"min"|"x"
"field_analysis_results"|"field_1"|"main"|"max"|"x"
"field_analysis_results"|"field_1"|"main"|"unique_values"|"1"
"field_analysis_results"|"field_1"|"main"|"known_values"|"1"
"field_analysis_results"|"field_1"|"main"|"case"|"lower"
"field_analysis_results"|"field_1"|"main"|"min_length"|"1"
"field_analysis_results"|"field_1"|"main"|"max_length"|"1"
"field_analysis_results"|"field_1"|"main"|"mean_length"|"1.0"
"field_analysis_results"|"field_1"|"top_values"|"x"|"42"

< remaining 5 fields of output truncated for brevity >

This source code is protected by the BSD license.  See the file "LICENSE"
in the source code root directory for the full language or refer to it here:
http://opensource.org/licenses/BSD-3-Clause
Copyright 2011,2012,2013 Ken Farmer
"""

#--- standard modules ------------------
from __future__ import division
import sys
import os
import optparse
import errno
import csv
import sqlalchemy
from pprint import pprint as pp

#Ignore SIG_PIPE and don't throw exceptions on it... (http://docs.python.org/library/signal.html)
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE,SIG_DFL)

#--- gristle modules -------------------
# lets get pathing set for running code out of project structure & testing it via tox
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import gristle.file_type           as file_type
import gristle.field_determinator  as field_determinator
import gristle.metadata            as metadata

#Ignore SIG_PIPE and don't throw exceptions on it... (http://docs.python.org/library/signal.html)
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE,SIG_DFL) 

# pylint is confused on some inheritance issues
# pylint: disable=E1101 

#from pprint import pprint as pp


#------------------------------------------------------------------------------
# Command-line section
#------------------------------------------------------------------------------
def main():
    """ Allows users to directly call determinator from command line.

    Parses the command line, analyzes the structure of the single input
    file, writes those results, and - unless --brief was given - analyzes
    and writes the results for each field.  When --metadata was given the
    same results are also handed to a MetadataManager.

    Returns:
        0 on success, errno.ENODATA if the input file is empty, or 1 if a
        column type override refers to a column the file does not have.
    """
    (opts, files) = get_opts_and_args()
    md_man        = None

    # turn off verbose if parsable output is going to stdout
    if opts.outputformat == 'parsable' and not opts.output:
        opts.verbose = False  # don't mix data and info

    if opts.metadata:
        md_man    = MetadataManager(opts.schema_id, opts.collection_id)

    out_writer    = OutputWriter(output_filename=opts.output,
                                 output_format=opts.outputformat)

    # try/finally so the writer is terminated on every exit path, including
    # the early returns for empty files and --brief.
    try:
        my_file   = file_type.FileTyper(files[0],
                                        opts.delimiter,
                                        opts.recdelimiter,
                                        opts.hasheader,
                                        quoting=csv.QUOTE_MINIMAL)
        try:
            my_file.analyze_file()
        except file_type.IOErrorEmptyFile:
            return errno.ENODATA

        out_writer.write_file_results(my_file)
        if opts.metadata:
            md_man.write_file_results(my_file)

        if opts.brief:
            return 0

        # Get Analysis on ALL Fields:
        my_fields = field_determinator.FieldDeterminator(files[0],
                                      my_file.format_type,
                                      my_file.field_cnt,
                                      my_file.dialect.has_header,
                                      my_file.dialect,
                                      my_file.dialect.delimiter,
                                      opts.recdelimiter,
                                      opts.verbose)

        # An explicit check rather than an assert: this validates user input
        # and must still run when python is started with -O.
        if (opts.column_type_overrides
        and max(opts.column_type_overrides) >= my_file.field_cnt):
            sys.stderr.write("ERROR: column_type_override references "
                             "non-existing column_number\n")
            return 1

        my_fields.analyze_fields(opts.column_number,
                                 opts.column_type_overrides,
                                 opts.number)

        out_writer.write_field_results(my_fields, opts.column_number)
        if opts.metadata:
            md_man.write_field_results(my_fields, opts.column_number)

        return 0
    finally:
        out_writer.terminate()


class MetadataManager(object):
    """ Hands determinator results to gristle.metadata.GristleMetaData.

        Creating an instance checks that the collection can be found
        (exiting the program with status 1 if it cannot), looks up the
        instance and analysis-profile ids and creates an analysis record.
        write_file_results() must be called before write_field_results(),
        because the field records refer to the collection-analysis id that
        write_file_results() obtains.
    """

    def __init__(self, schema_id, collection_id):
        """ Looks up the ids for this run and creates the analysis record.

            Args:
                schema_id:     integer id, used for the instance lookup.
                collection_id: integer id of the collection being analyzed.
        """
        self.schema_id           = schema_id
        self.collection_id       = collection_id
        self.instance_id         = None
        self.analysis_profile_id = None
        self.ca_id               = None  # assigned by write_file_results()
        self.md                  = metadata.GristleMetaData()

        if self.md.collection_tools.getter(collection_id=collection_id) is None:
            # errors go to stderr so they cannot be mistaken for result data
            sys.stderr.write('ERROR: schema_id / collection_id (%d / %d) does not exist\n'
                             % (schema_id, collection_id))
            sys.exit(1)
        else:
            # NOTE(review): this informational line goes to stdout and will
            # be mixed with results written there - confirm it is wanted.
            print(self.md.collection_tools.getter(schema_id=schema_id,
                                                  collection_id=collection_id))

        self.instance_id = self.md.instance_tools.get_instance_id(self.schema_id)
        self.analysis_profile_id = self.md.analysis_profile_tools.get_analysis_profile_id(
                                   self.instance_id, self.collection_id)

        # blow away prior analysis, collection analysis, etc
        # create new analysis record
        self.analysis_id = self.md.analysis_tools.setter(instance_id=self.instance_id,
                           analysis_profile_id=self.analysis_profile_id,
                           analysis_tool='gristle_determinator')


    def write_file_results(self, filetype):
        """ Records the file-level results and remembers the returned id.

            Args:
                filetype: the analyzed file; its fqfn, record_cnt, field_cnt
                          and dialect attributes are read.
        """
        dialect    = filetype.dialect
        self.ca_id = self.md.collection_analysis_tools.setter(analysis_id=self.analysis_id,
                     collection_id=self.collection_id,
                     ca_name='unknown',
                     ca_location=filetype.fqfn,
                     ca_row_cnt=filetype.record_cnt,
                     ca_field_cnt=filetype.field_cnt,
                     ca_delimiter=dialect.delimiter,
                     ca_hasheader=dialect.has_header,
                     ca_quoting=file_type.get_quote_name(dialect.quoting).lower(),
                     ca_quote_char=dialect.quotechar)

    def write_field_results(self, field_analysis, col_number):
        """ Records per-field results and up to 100 top values per field.

            Args:
                field_analysis: the field analysis results to record.
                col_number:     if not None, only this field number is
                                recorded.

            Raises:
                RuntimeError: if write_file_results() has not been called.
        """
        if self.ca_id is None:
            raise RuntimeError('write_file_results() must be called before '
                               'write_field_results()')

        for sub in range(field_analysis.field_cnt):
            if (col_number is not None and sub != col_number):
                continue

            analyzed_type = field_analysis.field_types[sub]

            # the field record uses 'int' where the analysis says 'integer'
            if analyzed_type == 'integer':
                field_type = 'int'
            else:
                field_type = analyzed_type

            # a length is only supplied for string fields
            if field_type == 'string':
                field_len = field_analysis.field_max_length[sub]
            else:
                field_len = None

            field_id = self.md.field_tools.get_field_id(self.collection_id,
                                                        field_order=sub,
                                                        field_type=field_type,
                                                        field_len=field_len,
                                                        field_name=field_analysis.field_names[sub])

            # numeric statistics and string statistics are mutually
            # exclusive; whatever does not apply stays None
            fa_case      = None
            fa_min_len   = None
            fa_max_len   = None
            fa_mean_len  = None
            fa_mean      = None
            fa_median    = None
            fa_stddev    = None
            fa_variance  = None
            if analyzed_type in ('integer', 'float'):
                fa_mean      = field_analysis.field_mean[sub]
                fa_median    = field_analysis.field_median[sub]
                fa_stddev    = field_analysis.stddev[sub]
                fa_variance  = field_analysis.variance[sub]
            elif analyzed_type == 'string':
                fa_case      = field_analysis.field_case[sub]
                fa_min_len   = field_analysis.field_min_length[sub]
                fa_max_len   = field_analysis.field_max_length[sub]
                fa_mean_len  = field_analysis.field_mean_length[sub]

            fa_id    = self.md.field_analysis_tools.setter(ca_id=self.ca_id,
                        field_id=field_id,
                        fa_type=analyzed_type,
                        fa_unique_cnt=len(field_analysis.field_freqs[sub]),
                        fa_known_cnt=len(field_analysis.get_known_values(sub)),
                        fa_min=field_analysis.field_min[sub],
                        fa_max=field_analysis.field_max[sub],
                        fa_mean=fa_mean,
                        fa_median=fa_median,
                        fa_stddev=fa_stddev,
                        fa_variance=fa_variance,
                        fa_case=fa_case,
                        fa_min_len=fa_min_len,
                        fa_max_len=fa_max_len,
                        fa_mean_len=fa_mean_len)

            if field_analysis.field_freqs[sub] is not None:
                for pair in field_analysis.get_top_freq_values(sub, limit=100):
                    self.md.field_analysis_value_tools.setter(fa_id=fa_id,
                                                              fav_value=pair[0],
                                                              fav_count=pair[1])



class OutputWriter(object):
    """ Writes file and field analysis results to a file or to stdout.

        Readable format: headers plus indented, column-aligned key / value
        lines.

        Parsable Format:
          [division]             | [section]    | [subsection] | [key]       | [value]
          file_structure         | main         | main         | format_type | csv
          file_structure         | main         | main         | field_count | 4
          field_analysis_results | field_0      | main         | name        | station_id
          field_analysis_results | field_0      | topvalues    | blue        | 57

        In the parsable format every component is wrapped in double quotes,
        and everything except the value is lowercased with spaces replaced
        by underscores.

        Any other accepted format ('metadata', 'none') writes nothing.
    """

    def __init__(self, output_filename, output_format):
        """ Opens the destination.

            Args:
                output_filename: file to write to; a false value means stdout.
                output_format:   'readable', 'parsable', 'metadata' or 'none'.
        """
        self.output_filename = output_filename
        self.output_format   = output_format
        # 'none' is one of the choices offered by --outputformat, so it must
        # be accepted here as well.
        assert self.output_format in ['readable', 'parsable', 'metadata', 'none'], \
               'invalid output_format: %s' % self.output_format

        if self.output_filename:
            self.outfile   = open(self.output_filename, 'w')
        else:
            self.outfile   = sys.stdout

        self.division   = None
        self.section    = None
        self.subsection = None

    def terminate(self):
        """ Closes the output file; stdout is only flushed, never closed. """
        if not self.outfile:
            return
        if self.outfile is sys.stdout:
            self.outfile.flush()
        else:
            self.outfile.close()


    def write_file_results(self, my_file):
        """ Writes the file-structure section.

            Args:
                my_file: the analyzed file; its format_type, field_cnt,
                         record_cnt, csv_quoting and dialect are read.
        """
        self.write_header()
        self.write_header('File Analysis Results:')
        self.division   = 'File Analysis Results'
        self.section    = 'main'
        self.subsection = 'main'

        dialect = my_file.dialect
        self.write_string('format type',      my_file.format_type)
        self.write_string('field count',      my_file.field_cnt)
        self.write_string('record count',     my_file.record_cnt)
        self.write_string('hasheader',        dialect.has_header)

        # NOTE(review): any all-whitespace delimiter, a tab included, is
        # reported as '[space]' - confirm that is intended.
        stripped_delimiter = dialect.delimiter.strip()
        if stripped_delimiter == '':
            self.write_string('delimiter',        '[space]')
        elif stripped_delimiter == '|':
            # quoted so that it stands apart from the parsable separator
            self.write_string('delimiter',        "'|'")
        else:
            self.write_string('delimiter',        dialect.delimiter)

        self.write_string('csv quoting',      my_file.csv_quoting)
        self.write_string('skipinitialspace', dialect.skipinitialspace)
        self.write_string('quoting',          file_type.get_quote_name(dialect.quoting))
        self.write_string('doublequote',      dialect.doublequote)
        if self.output_format == 'readable':
            # these two are only shown in the readable format
            self.write_string('quotechar',        dialect.quotechar)
            self.write_string('lineterminator',   dialect.lineterminator, use_repr=True)
        self.write_string('escapechar',       dialect.escapechar)
        self.write_header()


    def write_field_results(self, my_fields, column_number):
        """ Writes the analysis of every field, or of a single field.

            Args:
                my_fields:     the field analysis results to write.
                column_number: if not None, only this field is written.
        """
        self.write_header()
        self.write_header('Field Analysis Results')
        self.division   = 'Field Analysis Results'
        for sub in range(my_fields.field_cnt):
            self.section    = 'field_%d' % sub
            self.subsection = 'main'
            if column_number is not None and sub != column_number:
                continue

            field_type = my_fields.field_types[sub]

            self.write_header()
            self.write_header('------------------------------------------------------', indent=6)
            self.write_string('Name',             my_fields.field_names[sub], indent=4)
            self.write_string('Field Number',     sub, indent=4)
            self.write_string('Wrong Field Cnt',  my_fields.field_rows_invalid[sub], indent=4)
            if my_fields.field_trunc[sub]:
                # split into key and value: a key is cut at key_width, which
                # would chop the message if it were passed as the key
                self.write_string('Data Truncated', 'analysis will be partial', indent=4)

            self.write_string('Type',             field_type, indent=4)
            self.write_string('Min',              my_fields.field_min[sub], indent=4)
            self.write_string('Max',              my_fields.field_max[sub], indent=4)
            self.write_string('Unique Values',    len(my_fields.field_freqs[sub]), indent=4)
            self.write_string('Known Values',     len(my_fields.get_known_values(sub)), indent=4)

            if field_type in ('integer', 'float'):
                self.write_string('Mean',     my_fields.field_mean[sub], indent=4)
                self.write_string('Median',   my_fields.field_median[sub], indent=4)
                self.write_string('Variance', my_fields.variance[sub], indent=4)
                self.write_string('Std Dev',  my_fields.stddev[sub], indent=4)
            elif field_type == 'string':
                self.write_string('Case',        my_fields.field_case[sub], indent=4)
                self.write_string('Min Length',  my_fields.field_min_length[sub], indent=4)
                self.write_string('Max Length',  my_fields.field_max_length[sub], indent=4)
                self.write_string('Mean Length', my_fields.field_mean_length[sub], indent=4)

            self.write_field_freq(my_fields, sub)


    def write_field_freq(self, my_fields, col_no):
        """ Writes out the top_values section for a single column.

            At most 10 (value, count) pairs are requested.  If the first pair
            has a count of 1 a single 'not shown' line is written instead of
            the list.
        """
        key_sub = 0
        val_sub = 1
        self.subsection = 'top_values'

        if my_fields.field_freqs[col_no] is None:  # wasn't originally included
            self.write_string('Top Values', 'no values found', indent=4)
            return

        sorted_list = my_fields.get_top_freq_values(col_no, limit=10)
        if not len(sorted_list):
            self.write_string('Top Values', 'no value found', indent=4)
            return

        if sorted_list[0][val_sub] == 1:
            self.write_string('Top Values', 'not shown - all are unique', indent=4)
            return

        if self.output_format == 'readable':
            self.write_string('Top Values', indent=4)
            count_template = 'x %d occurrences'
        else:
            count_template = '%d'
        for pair in sorted_list:
            self.write_string(pair[key_sub], count_template % pair[val_sub],
                              indent=8, key_width=30)


    def write_header(self, val='', indent=0):
        """ Writes val on a line of its own, indented; readable format only. """
        if self.output_format == 'readable':
            self.outfile.write('{1:<{2}}{0}\n'.format(val, '', indent))
        elif self.output_format == 'parsable':
            pass


    def write_string(self, key, value=' ', indent=0, key_width=20, use_repr=False):
        """ Writes one key / value line in the current output format.

            Args:
                key:       label; cut to key_width characters.
                value:     converted with str() and cut to 30 characters.
                indent:    extra leading spaces (readable format only).
                key_width: width the key is cut and padded to.
                use_repr:  write repr() of the value (readable format only).
        """
        if self.output_format == 'readable':
            template = '  {0:<{1}}{2:<{3}}{4!r}\n' if use_repr else '  {0:<{1}}{2:<{3}}{4}\n'
            self.outfile.write(template.format('', indent, key[:key_width],
                                               key_width, str(value)[:30]))
        elif self.output_format == 'parsable':
            # NOTE(review): keys in the top_values subsection are data
            # values, yet they are lowercased and underscored like every
            # other key - confirm whether they should be written verbatim.
            self.outfile.write('{0}|{1}|{2}|{3}|{4}\n'.format(
                               self._parsify(self.division),
                               self._parsify(self.section),
                               self._parsify(self.subsection),
                               self._parsify(key[:key_width]),
                               '"%s"' % str(value)[:30]))

    def _parsify(self, val):
        """ Returns val lowercased, spaces as underscores, in double quotes. """
        return '"%s"' % val.lower().replace(' ', '_')





def get_opts_and_args():
    """ Parses and validates the command line.

        Run the program with -h or --help for the command line args.  With
        --long-help the module docstring is printed and the program exits
        with status 0.  Any validation failure is reported through
        parser.error(), which exits the program.

        Returns:
            (opts, files): the parsed options and a one-item list holding
            the input filename.  opts.column_type_overrides is added as a
            dict of column number -> type name built from --types (empty if
            that option was not given).
    """
    use = ("%prog determines file structure then analyzes contents of each "
           "column.\n"
           "Once complete it then prints the results for the user\n"
           "\n"
           "Usage: %prog [file] [misc options]"
           "\n")
    parser = optparse.OptionParser(usage = use)
    parser.add_option('--long-help',
           default=False,
           action='store_true',
           help='Print more verbose help')
    parser.add_option('-o', '--output',
           help='Specify output file.  Default is stdout.')
    parser.add_option('--outputformat',
           choices=['readable', 'parsable', 'none'],
           default='readable',
           help='describe output format')
    # NOTE(review): verbose defaults to True, so passing -v changes nothing.
    parser.add_option('-v', '--verbose',
           action='store_true',
           dest='verbose',
           default=True,
           help='provides more detail')
    parser.add_option('-b', '--brief',
           action='store_true',
           dest='brief',
           default=False,
           help='skips field-level analysis')
    parser.add_option('-c', '--column',
           type=int,
           dest='column_number',
           help=('Restrict analysis to a single column (field number)'
                 ' - using a zero-offset'))
    parser.add_option('-n', '--number',
           type=int,
           help='Specify a maximum number of entries for freq dictionary. '
                 'This is applied separately to each column.  The default is'
                 ' set at approximately 1 million entries. ')
    parser.add_option('-d', '--delimiter',
           help=('Specify a quoted field delimiter. '
                 'This is essential for multi-column delimiters.'))
    parser.add_option('--recdelimiter',
           help='Specify a quoted end-of-record delimiter. ')
    # --hasheader and --hasnoheader share one destination; None means that
    # neither was given.
    parser.add_option('--hasheader',
           default=None,
           action='store_true',
           dest='hasheader',
           help='Indicates that there is a header in the file. ')
    parser.add_option('--hasnoheader',
           default=None,
           action='store_false',
           dest='hasheader',
           help=('Indicates that there is no header in the file. Useful for '
                 'overriding automatic csv-guessing behavior.'))
    parser.add_option('-T', '--types',
           type='string',
           dest='column_types',
           help=('Allows manual specification of field types: integer, float, '
                 'string, timestamp. Use format: "colno:type, colno:type, '
                 ' colno:type"'))
    parser.add_option('--metadata',
           default=False,
           action='store_true',
           help=('Indicates whether or not to write results into metadata database'))
    parser.add_option('--schemaid',
           type='int',
           dest='schema_id',
           help=('Used with metadata outputformat to identify structure '
                 'set being analyzed.'))
    parser.add_option('--collectionid',
           type='int',
           dest='collection_id',
           help=('Used with metadata outputformat to identify structure '
                 'being analyzed.'))

    (opts, files) = parser.parse_args()

    if opts.long_help:
        print(__doc__)
        sys.exit(0)

    # validate opts
    if len(files) == 0:
        parser.error("no filename was provided")
    elif len(files) > 1:
        parser.error("multiple files not yet supported")
    elif not os.path.exists(files[0]):
        parser.error("filename %s could not be accessed" % files[0])

    # "is not None" throughout: 0 is a valid column number and a possible id
    if opts.brief and opts.column_number is not None:
        parser.error('must not specify both brevity and column number')

    if (opts.number is not None
    and  opts.number < 10):
        parser.error('please specify a number between 10 and 1000000000')

    ids_given = (opts.schema_id is not None or opts.collection_id is not None)
    if ids_given and not opts.metadata:
        parser.error('schemaid and collectionid are only for metadata')

    if ((opts.schema_id is None or opts.collection_id is None) and opts.metadata):
        parser.error('metadata requires schemaid and collectionid')

    # set up column_type_overrides
    opts.column_type_overrides = {}
    if opts.column_types:
        for col_type_pair in opts.column_types.split(','):
            parts = col_type_pair.split(':')
            if len(parts) != 2:
                parser.error('invalid format for types option')
            # the documented format puts spaces after the commas, so
            # surrounding whitespace is tolerated on both halves
            col_no   = parts[0].strip()
            col_type = parts[1].strip()
            try:
                col_index = int(col_no)
            except ValueError:
                parser.error('invalid column number for types option')
            if col_index < 0:
                parser.error('invalid column number for types option')
            if col_type not in ['integer', 'float', 'string', 'timestamp']:
                parser.error('invalid type for types option')
            opts.column_type_overrides[col_index] = col_type

    return opts, files



# Run as a script: whatever main() returns becomes the process exit status.
if __name__ == '__main__':
    sys.exit(main())

