#!/usr/bin/env python
""" Compares two input files and prints the differences.  Unlike the diff 
    utility it is file structure-aware and is both more accurate and more 
    detailed.

    Two ways of performing the comparisons will be supported:
       - unsorted input files:  for smaller files - all processing is performed
         in memory
       - sorted input files:  for larger files - minimal memory used - not
         implemented yet

    Features that should be added include:
       - sorted input files + sorting the files from within this program
       - multi-char delimiters
       - compare column exceptions (compare everything except col x)
       - print only counts - or only 1 count (ex:  print only total number of 
         differences)
       - better structured printing of differences
       - option to print 'same' records
       - options to transform data prior to comparison: to change case, trim
         spaces, etc
       - improved msg if no args provided - tell about -h
       - use a tuple for the key - so that it can be printed as separated cols
 
    See the file "LICENSE" for the full license governing this code. 
    Copyright 2011 Ken Farmer
"""

#--- standard modules ------------------
from __future__ import division
import sys
import os
import optparse
import csv
#from pprint import pprint as pp

#--- gristle modules -------------------
sys.path.append('../')     # allows running out of project structure
sys.path.append('../../')  # allows running out of project structure test dir

import gristle.file_type           as file_type 


def main():
    """ runs all processes:
            - gets opts & args
            - analyzes a file to determine csv characteristics
            - compares files
            - writes counts (only if verbose)

        Returns:
            - 0 on success
            - 1 if neither input file could be analyzed because both are empty
    """
    (opts, dummy) = get_opts_and_args()

    # Either file may be empty - but at least one must have data in it.
    # The csv dialect is taken from the first file that analyzes cleanly.
    my_file = None
    for filename in (opts.file1, opts.file2):
        try:
            candidate = file_type.FileTyper(filename,
                                            opts.delimiter,
                                            opts.recdelimiter,
                                            opts.hasheader)
            # The analysis must be run on the fallback file as well,
            # otherwise its dialect is never determined.
            candidate.analyze_file()
        except file_type.IOErrorEmptyFile:
            continue
        my_file = candidate
        break

    if my_file is None:
        return 1

    if opts.sorted:
        # file_diff() produces no counts, so there is nothing to report.
        file_diff(opts)
        return 0

    (f1_cnt, f2_cnt, f1only_cnt, f2only_cnt, same_cnt, chg_cnt) = \
       dict_diff(opts,
                 my_file.dialect,
                 opts.maxsize,
                 opts.verbose,
                 opts.casecompare)

    if opts.verbose:
        # Single-argument parenthesised prints behave identically as a
        # Python 2 print statement and as a Python 3 print function call.
        print('')
        print('Counts: ')
        print('   File1 records:        %d' % f1_cnt)
        print('   File2 records:        %d' % f2_cnt)
        print('   In file1 only:        %d' % f1only_cnt)
        print('   In file2 only:        %d' % f2only_cnt)
        print('   Same:                 %d' % same_cnt)
        print('   Changed:              %d' % chg_cnt)

    return 0



def file_diff(opts):
    """ Placeholder for the sorted-file comparison.

        The intended design is to walk two files, both sorted ascending by
        key, in step with each other so that very little memory is needed.

        Not supported yet: the opts argument is ignored, nothing is compared
        and None is returned.
    """
    return None


def dict_diff(opts, dialect, maxsize, verbose, casecompare):
    """ This delta operation is useful for small & unsorted files since it
        uses a dictionary-based delta operation that does not require sorting -
        but is otherwise slow and memory-intensive.

        Inputs:
          - opts - for file names (file1, file2), key columns (keycol) &
            comparison columns (comparecol)
          - dialect - csv dialect used to read both files
          - maxsize - maximum number of rows to store in memory per file
          - verbose - to determine how much to print
          - casecompare - to turn case-aware compares on & off

        Outputs:
          - tuple of counts: (file1 records, file2 records, in file1 only,
            in file2 only, same, changed).  The record counts are whatever
            KeyValManager.rec_cnt reports for each file.

        Side effects:
          - prints one line per key found in only one file or found in both
            with different comparison values; with verbose the differing
            values are printed as well

        Limitations:
          - requires a unique key
          - requires both files to have keys & comparison columns in identical
            locations

        To do:
          - should probably turn this into a class
    """

    # Honor the maxsize argument rather than reaching back into opts.
    f1kv = KeyValManager(opts.file1, dialect, opts.keycol, opts.comparecol,
                         maxsize, casecompare)
    f1kv.reader()

    f2kv = KeyValManager(opts.file2, dialect, opts.keycol, opts.comparecol,
                         maxsize, casecompare)
    f2kv.reader()

    f1only_cnt = 0
    f2only_cnt = 0
    same_cnt   = 0
    chg_cnt    = 0

    # Three separate passes keep the output grouped: everything unique to
    # file1, then everything unique to file2, then the changed records.
    for key in f1kv.kv:
        if key not in f2kv.kv:
            print('In file1 only:           %s' % key)
            f1only_cnt += 1

    for key in f2kv.kv:
        if key not in f1kv.kv:
            print('In file2 only:           %s' % key)
            f2only_cnt += 1

    for key in f1kv.kv:
        if key not in f2kv.kv:
            continue
        if f1kv.kv[key] == f2kv.kv[key]:
            same_cnt += 1
            continue
        chg_cnt += 1
        print('In both but different:   %s' % key)
        if verbose:
            print('      file1 data: %s' % f1kv.kv[key])
            print('      file2 data: %s' % f2kv.kv[key])

    return f1kv.rec_cnt, f2kv.rec_cnt, f1only_cnt, f2only_cnt, same_cnt, chg_cnt



class KeyValManager(object):
    """ Builds a dictionary of keys and values from a source file, then
        exposes the results through attributes.

        Attributes:
          - kv        - dict mapping each record's key string to its value
                        string (a repeated key keeps the last record read)
          - rec_cnt   - number of non-blank rows read, including a header row
          - truncated - True if reading stopped because kv reached max_size

        Note: key and value columns are concatenated without a separator, so
        different column contents can produce the same string (ex: 'ab' + 'c'
        and 'a' + 'bc').
    """

    def __init__(self, filename, dialect, key_cols, comp_cols, max_size,
                 casecompare=True):
        """ Inputs:
              - filename    - path of the file to read
              - dialect     - csv dialect; must also carry a has_header
                              attribute
              - key_cols    - comma-delimited string of 0-offset key columns
              - comp_cols   - comma-delimited string of 0-offset columns to
                              compare
              - max_size    - maximum number of entries to hold in kv; an int
                              or a string of digits
              - casecompare - False lower-cases keys & values before storing
        """
        self.filename    = filename
        self.dialect     = dialect
        self.key_cols    = key_cols
        self.comp_cols   = comp_cols
        # max_size may arrive as a string (ex: straight from the command
        # line).  Comparing an int with a str is never a reliable limit
        # check, so normalize it once here.
        self.max_size    = int(max_size)
        self.casecompare = casecompare
        self.kv          = {}
        self.rec_cnt     = 0
        self.truncated   = False

    def reader(self):
        """ populates self.kv dictionary from self.filename

            Blank lines are skipped.  The first row is skipped as a header
            if the dialect says so.  Reading stops, and self.truncated is
            set, once self.kv holds max_size entries.
        """

        with open(self.filename, 'r') as f:
            for fields in csv.reader(f, dialect=self.dialect):
                if not fields:
                    # csv.reader returns an empty list for a blank line,
                    # which has no columns to index into.
                    continue
                self.rec_cnt += 1
                if self.rec_cnt == 1 and self.dialect.has_header:
                    continue
                self.kv[self._get_key(fields)] = self._get_value(fields)
                if len(self.kv) >= self.max_size:
                    # Single-argument parenthesised print behaves the same
                    # under Python 2 and Python 3.
                    print('      WARNING: kv dict is too large - will truncate')
                    self.truncated = True
                    break

    def _caserule(self, field):
        """ translate case if necessary: returns field unchanged for
            case-aware compares, otherwise lower-cased
        """
        if self.casecompare:
            return field
        return field.lower()

    def _concat_cols(self, fields, col_spec):
        """ Returns the stripped contents of the columns named in col_spec
            (comma-delimited, 0-offset) joined into one string, with the
            case rule applied.
        """
        joined = ''.join(fields[int(col)].strip()
                         for col in col_spec.split(','))
        return self._caserule(joined)

    def _get_key(self, fields):
        """ Returns a string of concatenated keys based on the input fields of a
            single record as well as the previously-provided key list

            to do:
               - support key-ranges (ex:  5:9)
        """
        return self._concat_cols(fields, self.key_cols)

    def _get_value(self, fields):
        """ Returns a string of concatenated values, or fields to be compared,
            based on the input fields of a single record as well as the
            previously-provided comparison-column list

            to do:
               - support col-ranges (ex:  5:9)
        """
        return self._concat_cols(fields, self.comp_cols)



def get_opts_and_args():
    """ gets opts & args and returns them
        Input:
            - command line args & options
        Output:
            - opts - optparse values object with one attribute per option
            - args - list of leftover positional arguments

        Exits through parser.error() if a file is missing or cannot be
        found, if the sorted option is requested, or if keycol or comparecol
        is not provided.
    """
    use = ("%prog is used to compare two files and writes the differences to "
           "stdout: \n"
           "\n"
           "   example:  %prog -1 filea.dat -2 fileb.dat -k 0 -c 1,2 -v")


    parser = optparse.OptionParser(usage = use)

    parser.add_option('-v', '--verbose',
           default=False,
           action='store_true')

    parser.add_option('-1', '--file1',
           help='file1')
    parser.add_option('-2', '--file2',
           help='file2')

    parser.add_option('-k', '--keycol',
           help='Specify the columns that constitute a unique row with a comma-'
                'delimited list of numbers using a 0-offset.  This is a '
                'required option.')
    parser.add_option('-c', '--comparecol',
           help='Specify the columns to be compared with a comma-delimited list '
                'of numbers using a 0-offset.  This is a required option.')
    parser.add_option('-d', '--delimiter',
           help='Specify a quoted field delimiter. ')
    parser.add_option('-s', '--sorted',
           default=False,
           action='store_true',
           help='Large files must be sorted by key - TBD')
    parser.add_option('--hasheader',
           default=False,
           action='store_true',
           help='indicates that there is a header in the file.')
    parser.add_option('--recdelimiter')
    # Without an explicit type a value given on the command line would be
    # kept as a string and could not be compared against a row count.
    parser.add_option('--maxsize',
           type='int',
           default=10000,
           help='can override max number of rows to collect')
    parser.add_option('--casecompare',
           default=True,
           action='store_false',
           help=('Turns case-aware compares off.'
                 '  Default is True (on)'))

    (opts, args) = parser.parse_args()

    if opts.file1 is None:
        parser.error("no filename was provided")
    elif not os.path.exists(opts.file1):
        parser.error("file1 %s could not be accessed" % opts.file1)

    # file2 is read just like file1, so fail here with a clear message.
    if opts.file2 is None:
        parser.error("no filename was provided for file2")
    elif not os.path.exists(opts.file2):
        parser.error("file2 %s could not be accessed" % opts.file2)

    if opts.sorted:
        parser.error("the sorted option has not been implemented yet")

    if not opts.keycol:
        parser.error("the keycol must be provided")
    if not opts.comparecol:
        parser.error("the comparecol must be provided")


    return opts, args



if __name__ == '__main__':
    # Run as a script: hand main()'s return value to the shell as exit status.
    exit_status = main()
    sys.exit(exit_status)

