#!/usr/bin/env python
"""
Compares two input files and prints a summary of the differences.  Unlike
the diff utility it is file structure-aware and is more accurate.

Usage: gristle_differ can be used to compare two files based on a key in
column 0, comparing only columns 1 through 5, and writing the verbose results
to stdout:
   $ gristle_differ -1 file_one.dat -2 file_two.dat -k 0 -c 1,2,3,4,5 -v

     In file1 only:           5
     In file1 only:           7
     In file2 only:           45
     In both but different:   44
     file1 data: Barack Obamahttp://en.wikipedia.org/wiki/Barack_Obama20/01/2009IncumbentIllinois
     file2 data: Barack Obamahttp://en.wikipedia.org/wiki/Barack_Obama20/01/200920/01/2017Illinois

     Counts:
        File1 records:        45
        File2 records:        43
        In file1 only:        3
        In file2 only:        1
        Same:                 40
        Changed:              1

Note that in the above example, the Counts section, as well as the
'file1 data' and 'file2 data' lines, were only printed due to the verbose
option.


Options:
  -h, --help            Show this help message and exit.
  --long-help           Print more verbose help.
  -v, --verbose         Prints far more details on comparisons and stats.
                        The default is False (off).
  -1 FILE1, --file1=FILE1
                        First file to compare.
  -2 FILE2, --file2=FILE2
                        Second file to compare.
  -k KEYCOL, --keycol=KEYCOL
                        Specify the columns that constitute a unique row with
                        a comma-delimited list of numbers using a 0-offset.
                        This is a required option.
  -c COMPARECOL, --comparecol=COMPARECOL
                        Specify the columns to be compared with a comma-
                        delimited list of numbers using a 0-offset.  This is a
                        required option.
  -d DELIMITER, --delimiter=DELIMITER
                        Specify a quoted field delimiter.  If you don't provide
                        this, gristle_differ will attempt to determine it.
  -s, --sorted          Large files must be sorted by key - TBD
  --hasheader           indicates that there is a header in the file.
  --recdelimiter=RECDELIMITER
  --maxsize=MAXSIZE     can override max number of rows to collect.  Default is
                        10000 rows.
  --casecompare         Turns case-aware compares on and off.  Default is True
                        (on)

Currently, gristle_differ only provides a single method of performing the
delta: it puts the two input files into memory then compares them row by row.
This works very well with small, unsorted files.

Future planned enhancements include:
        - Support for large, sorted files.
        - Ability to write entire records to separate output files.
        - multi-char delimiters
        - compare column exceptions (compare everything except col x)
        - print only counts - or only 1 count (ex:  print only total number of
          differences)
        - better structured printing of differences
        - option to print 'same' records
        - simple delta-oriented transforms on writes - such as add a flag to
          deleted records, intelligent handling of from/to timestamps, etc.
        - use a tuple for the key - so that it can be printed as separated cols

This source code is protected by the BSD license.  See the file "LICENSE"
in the source code root directory for the full language or refer to it here:
    http://opensource.org/licenses/BSD-3-Clause
Copyright 2011,2012,2013,2014 Ken Farmer

"""


#--- standard modules ------------------
from __future__ import division
import sys
import os
import optparse
import csv
#from pprint import pprint as pp

#Ignore SIG_PIPE and don't throw exceptions on it... (http://docs.python.org/library/signal.html)
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE,SIG_DFL)


#--- gristle modules -------------------
# let's get pathing set up for running code out of the project structure & testing it via tox
sys.path.append('../')
sys.path.append('../../')
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import gristle.file_type           as file_type

#Ignore SIG_PIPE and don't throw exceptions on it... (http://docs.python.org/library/signal.html)
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE,SIG_DFL)


def main():
    """ Runs all processes:
            - gets opts & args
            - analyzes a file to determine csv characteristics
            - compares files
            - writes counts (only when the verbose option is on)

        The csv characteristics are taken from file1, unless analyzing file1
        raises IOErrorEmptyFile - in which case file2 is analyzed instead.

        Returns:
            - 0 on success
            - 1 if analyzing both files raised IOErrorEmptyFile
    """
    (opts, dummy) = get_opts_and_args()

    # either file may be empty - but at least one must have data in it.
    my_file = file_type.FileTyper(opts.file1,
                                  opts.delimiter,
                                  opts.recdelimiter,
                                  opts.hasheader)
    try:
        my_file.analyze_file()
    except file_type.IOErrorEmptyFile:
        try:
            my_file = file_type.FileTyper(opts.file2,
                                          opts.delimiter,
                                          opts.recdelimiter,
                                          opts.hasheader)
            # the fallback file must be analyzed as well - otherwise its
            # dialect is never determined and an empty file2 goes undetected.
            my_file.analyze_file()
        except file_type.IOErrorEmptyFile:
            return 1

    if opts.sorted:
        # file_diff produces no counts, so there is nothing to report after it
        file_diff(opts)
        return 0

    (f1_cnt, f2_cnt, f1only_cnt, f2only_cnt, same_cnt, chg_cnt) = \
        dict_diff(opts,
                  my_file.dialect,
                  opts.maxsize,
                  opts.verbose,
                  opts.casecompare)

    if opts.verbose:
        print('')
        print('Counts: ')
        print('   File1 records:        %d' % f1_cnt)
        print('   File2 records:        %d' % f2_cnt)
        print('   In file1 only:        %d' % f1only_cnt)
        print('   In file2 only:        %d' % f2only_cnt)
        print('   Same:                 %d' % same_cnt)
        print('   Changed:              %d' % chg_cnt)

    return 0



def file_diff(opts):
    """ Placeholder for a low-memory delta operation: it is meant to walk
        through both input files in synch, assuming that both are sorted in
        ascending order by the key.

        Not supported yet - it currently ignores opts, does nothing and
        returns None.
    """
    return None


def dict_diff(opts, dialect, maxsize, verbose, casecompare):
    """ This delta operation is useful for small & unsorted files since it
        uses a dictionary-based delta operation that does not require sorting - but
        is otherwise slow and memory-intensive.

        Differences are printed to stdout as they are found: first the keys
        found only in file1, then the keys found only in file2, and finally
        the keys found in both files whose comparison columns differ.

        Inputs:
          - opts - for file names, key & comparison columns
          - dialect - csv dialect used to read both files
          - maxsize - for maximum number of rows to store in memory per file
          - verbose - to determine how much to print: when on, the compared
            data of changed records is printed as well
          - casecompare - to turn case-aware compares on & off

        Outputs:
          - a tuple of counts: (file1 records, file2 records, in file1 only,
            in file2 only, same, changed)

        Limitations:
          - identical records are only counted, not printed
          - requires a unique key
          - requires both files to have keys & comparison columns in identical
            locations

        To do:
          - should probably turn this into a class
    """

    # maxsize is passed through as given rather than re-read from opts, so
    # that the caller's argument is actually honored.
    f1kv = KeyValManager(opts.file1, dialect, opts.keycol, opts.comparecol,
                         maxsize, casecompare)
    f1kv.reader()

    f2kv = KeyValManager(opts.file2, dialect, opts.keycol, opts.comparecol,
                         maxsize, casecompare)
    f2kv.reader()

    f1only_cnt = 0
    f2only_cnt = 0
    same_cnt   = 0
    chg_cnt    = 0

    # Three separate passes keep the output grouped by kind of difference.
    for key in f1kv.kv:
        if key not in f2kv.kv:
            print('In file1 only:           %s' % key)
            f1only_cnt += 1

    for key in f2kv.kv:
        if key not in f1kv.kv:
            print('In file2 only:           %s' % key)
            f2only_cnt += 1

    for key in f1kv.kv:
        if key not in f2kv.kv:
            continue
        if f1kv.kv[key] == f2kv.kv[key]:
            same_cnt += 1
            continue
        chg_cnt += 1
        print('In both but different:   %s' % key)
        if verbose:
            print('      file1 data: %s' % f1kv.kv[key])
            print('      file2 data: %s' % f2kv.kv[key])

    return f1kv.rec_cnt, f2kv.rec_cnt, f1only_cnt, f2only_cnt, same_cnt, chg_cnt



class KeyValManager(object):
    """ Builds a dictionary of keys and values from source file, then
        provides access to keys & values via the kv attribute.

        Attributes:
          - kv        - dictionary of concatenated key -> concatenated value
          - rec_cnt   - number of records read so far (includes the header)
          - truncated - True if reading stopped because kv reached max_size

        Note that key columns (and comparison columns) are concatenated
        without any separator, so distinct multi-column keys such as
        ('ab', 'c') and ('a', 'bc') end up as the same key.
    """

    def __init__(self, filename, dialect, key_cols, comp_cols, max_size,
                 casecompare=True):
        """ Inputs:
              - filename    - path of the csv file to load
              - dialect     - csv dialect; must also have a has_header attribute
              - key_cols    - comma-delimited string of 0-offset key columns
              - comp_cols   - comma-delimited string of 0-offset compare columns
              - max_size    - maximum number of entries to hold in kv; an int
                              or a string of digits
              - casecompare - False to lower-case keys & values before storing

            Raises:
              - ValueError or TypeError if max_size cannot be converted to int
        """
        self.filename    = filename
        self.dialect     = dialect
        self.key_cols    = key_cols
        self.comp_cols   = comp_cols
        # An option value that comes straight off the command line is a
        # string, and comparing an int to a string never triggers the
        # truncation check - so force max_size to be an int.
        self.max_size    = int(max_size)
        self.casecompare = casecompare
        self.kv          = {}
        self.rec_cnt     = 0
        self.truncated   = False

    def reader(self):
        """ Populates the self.kv dictionary from self.filename.

            The first record is skipped if the dialect has a header.  Reading
            stops, with a warning printed to stdout, as soon as the number of
            entries in self.kv reaches self.max_size.
        """
        with open(self.filename, 'r') as f:
            for fields in csv.reader(f, dialect=self.dialect):
                self.rec_cnt += 1
                if self.rec_cnt == 1 and self.dialect.has_header:
                    continue
                self.kv[self._get_key(fields)] = self._get_value(fields)
                if len(self.kv) >= self.max_size:
                    print('      WARNING: kv dict is too large - will truncate')
                    self.truncated = True
                    break

    def _caserule(self, field):
        """ Translates case if necessary: returns the field as-is for
            case-aware compares, otherwise returns it lower-cased.
        """
        if self.casecompare:
            return field
        return field.lower()

    def _concat_cols(self, fields, cols):
        """ Returns the stripped contents of the given columns of a single
            record, concatenated in the order listed, with the case rule
            applied.

            Inputs:
              - fields - list of the fields of a single record
              - cols   - comma-delimited string of 0-offset column numbers
        """
        joined = ''.join(fields[int(col)].strip() for col in cols.split(','))
        return self._caserule(joined)

    def _get_key(self, fields):
        """ Returns a string of concatenated keys based on the input fields of a
            single record as well as the previously-provided key list

            to do:
               - support key-ranges (ex:  5:9)
        """
        return self._concat_cols(fields, self.key_cols)

    def _get_value(self, fields):
        """ Returns a string of concatenated values, or fields to be compared,
            based on the input fields of a single record as well as the
            previously-provided comparison-column list

            to do:
               - support col-ranges (ex:  5:9)
        """
        return self._concat_cols(fields, self.comp_cols)



def get_opts_and_args():
    """ Gets opts & args from the command line, validates them, and returns
        them.

        Input:
            - command line args & options
        Output:
            - opts - optparse values object with one attribute per option
            - args - list of leftover positional arguments

        Prints the module docstring and exits with status 0 if --long-help
        was provided.  Exits through parser.error() if a file name is missing
        or the file does not exist, if the (unimplemented) sorted option is
        requested, or if keycol or comparecol are missing.
    """
    use = ("%prog is used to compare two files and writes the differences to "
           "stdout: \n"
           "\n"
           "   example:  %prog -1 filea.dat -2 fileb.dat -k 0 -c 1,2 -v")


    parser = optparse.OptionParser(usage = use)

    parser.add_option('-v', '--verbose',
           default=False,
           action='store_true')
    parser.add_option('--long-help',
           default=False,
           action='store_true',
           help='Print more verbose help')

    parser.add_option('-1', '--file1',
           help='file1')
    parser.add_option('-2', '--file2',
           help='file2')

    parser.add_option('-k', '--keycol',
           help='Specify the columns that constitute a unique row with a comma-'
                'delimited list of numbers using a 0-offset.  This is a '
                'required option.')
    parser.add_option('-c', '--comparecol',
           help='Specify the columns to be compared with a comma-delimited list'
                ' of numbers using a 0-offset.  This is a required option.')
    parser.add_option('-d', '--delimiter',
           help='Specify a quoted field delimiter. ')
    parser.add_option('-s', '--sorted',
           default=False,
           action='store_true',
           help='Large files must be sorted by key - TBD')
    parser.add_option('--hasheader',
           default=False,
           action='store_true',
           help='indicates that there is a header in the file.')
    parser.add_option('--recdelimiter')
    # without an explicit type a user-provided maxsize would be a string,
    # which can never be compared meaningfully against a row count.
    parser.add_option('--maxsize',
           type='int',
           default=10000,
           help='can override max number of rows to collect')
    parser.add_option('--casecompare',
           default=True,
           action='store_false',
           help=('Turns case-aware compares on and off.'
                 '  Default is True (on)'))

    (opts, args) = parser.parse_args()

    if opts.long_help:
        print(__doc__)
        sys.exit(0)


    if opts.file1 is None:
        parser.error("no filename was provided")
    elif not os.path.exists(opts.file1):
        parser.error("file1 %s could not be accessed" % opts.file1)

    # file2 is opened just like file1, so it must be validated the same way
    if opts.file2 is None:
        parser.error("no filename was provided for file2")
    elif not os.path.exists(opts.file2):
        parser.error("file2 %s could not be accessed" % opts.file2)

    if opts.sorted:
        parser.error("the sorted option has not been implemented yet")

    if not opts.keycol:
        parser.error("the keycol must be provided")
    if not opts.comparecol:
        parser.error("the comparecol must be provided")


    return opts, args



# Script entry point: the process exit status is whatever main() returns.
if __name__ == '__main__':
    sys.exit(main())

