#!/usr/bin/env python
""" Converts file csv dialect - field delimiter and record delimiter.
    It can also be used to convert between multi-column and single-column
    field delimiters.

    Example Usage:
    $ ./gristle_file_converter.py colors.csv -d ',' -D '|' -o /tmp/colors.out

    To do:
    1.  dialect validation in get_dialect()
    2.  improve msg if no args provided, tell user about -h
    3.  support multiple input files

    This source code is protected by the BSD license.  See the file "LICENSE"
    in the source code root directory for the full language or refer to it here:
        http://opensource.org/licenses/BSD-3-Clause
    Copyright 2011,2012,2013 Ken Farmer
"""

#--- standard modules ------------------
from __future__ import division
import sys
import optparse
import csv
import os

# Restore the default SIGPIPE action (SIG_DFL) at import time instead of
# Python's own handling, so a closed downstream pipe (e.g. `| head`) is not
# reported as an exception.  (http://docs.python.org/library/signal.html)
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE,SIG_DFL)


#--- gristle modules -------------------
# Put the parent of this script's directory at the front of sys.path so the
# gristle package can be imported when the code is run from within the
# project structure or tested via tox.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


import gristle.file_type           as file_type 


#------------------------------------------------------------------------------
# Command-line section 
#------------------------------------------------------------------------------
def main():
    """ Converts one input csv file into the dialect requested on the
        command line.

        The input dialect is established first, then records are read one
        at a time and written to the output.  The first record read is
        dropped when the input dialect has a header and the output dialect
        was not asked to have one.

        Returns 1 if get_in_dialect() raises EOFError, otherwise 0 once all
        records have been written and the output has been terminated.
    """
    cli = CommandLineParser()
    opts, files = cli.opts, cli.files

    try:
        in_dialect = get_in_dialect(opts, files, cli)
    except EOFError:
        return 1

    out_dialect = get_out_dialect(opts)
    in_file = InFileHandler(files, in_dialect, opts.stripfields)
    out_file = OutFileHandler(opts.output, out_dialect)

    record = in_file.readline()
    while record:
        # Only the very first record can be a header, and it is only
        # suppressed when the output should not carry one.
        drop_header = (in_file.read_cnt == 1
                       and in_dialect.has_header
                       and not out_dialect.has_header)
        if not drop_header:
            out_file.writeline(record)
        record = in_file.readline()

    out_file.terminate()
    return 0


class InFileHandler(object):
    """ Handles the opening, reading and closing of a single input file.
        The file must be a csv, but it can be either a single-column delimiter
        or a multi-column delimiter.

        Attributes:
        - file        - name of the input file (first entry of file_names)
        - dialect     - object providing auto, delimiter, quote_type,
                        lineterminator and recdelimiter
        - stripfields - if true, whitespace is stripped from every field
        - read_cnt    - number of physical records read so far, including
                        blank and rejected ones
        - bad_cnt     - number of records rejected by _strip_recdelimiter()
        - eof         - True once the end of the file has been reached
    """
    def __init__(self, file_names, dialect, stripfields):
        """ Opens the first file in file_names for reading.  A csv module
            reader is only created when dialect.auto is true; otherwise
            records are split manually by _readrec().
        """
        self.file        = file_names[0]
        self.dialect     = dialect
        self.stripfields = stripfields
        self.read_cnt    = 0
        self.bad_cnt     = 0
        self.eof         = False
        self.reader      = None   # only used when dialect.auto is true
        self.in_file     = open(self.file, 'r')
        if self.dialect.auto:
            self.reader  = csv.reader(self.in_file,
                                      delimiter=self.dialect.delimiter,
                                      quoting=self.dialect.quote_type)

    def readline(self):
        """ Public method that handles reading.
            It will return None if EOF is hit, otherwise will return a good
            record consisting of a list of fields - even if this means that
            it will have to skip multiple blank, bad or partial records.
            Note that it will use the CSV module if dialect.auto is true,
            otherwise it manually removes the line terminator and record
            delimiter and splits on the field delimiter.  The manual path
            does not remove quotes from fields.
        """
        while True:
            rec = self._readrec()
            if rec:
                return rec
            if self.eof:
                return None
            # otherwise: blank or bad record, go get another

    def _readrec(self):
        """ Private method responsible for getting the next record, setting the
            eof flag, closing the input file at eof, and parsing the record.
            Parsing the record and splitting it into a list of fields is handled
            by the CSV module when dialect.auto is true, or manually otherwise.

            Returns a list of fields, or None when the record was blank or
            bad, or when eof has been reached (check self.eof to tell the
            difference).
        """
        if self.eof:
            return None   # file already closed - nothing more to read

        if self.dialect.auto:
            try:
                fields = next(self.reader)
            except StopIteration:
                return self._at_eof()
            self.read_cnt += 1
            # The csv module returns an empty list for a blank line; that
            # is a record to skip, not the end of the file.
            if not fields:
                return None
        else:
            raw_rec = self.in_file.readline()
            if not raw_rec:     # readline() only returns '' at eof
                return self._at_eof()
            self.read_cnt += 1
            fields = self._split_manually(raw_rec)
            if not fields:
                return None     # not a valid rec

        if self.stripfields:
            return [field.strip() for field in fields]
        return fields

    def _at_eof(self):
        """ Records that eof was reached, closes the input file and returns
            None so callers can simply return the result.
        """
        self.eof = True
        self.in_file.close()
        return None

    def _split_manually(self, raw_rec):
        """ Splits a raw input line into a list of fields without the csv
            module.  Returns None if nothing is left once the line
            terminator and record delimiter have been removed.
        """
        rec = self._strip_lineterminator(raw_rec)
        if not rec:
            return None
        rec = self._strip_recdelimiter(rec)
        if not rec:
            return None
        return rec.split(self.dialect.delimiter)

    def _strip_lineterminator(self, rec):
        """ Removes the dialect-defined lineterminator from the end of the
            record if it exists.
            Returns the results.
        """
        terminator = self.dialect.lineterminator
        # An empty terminator must be skipped: rec[:-0] would be ''.
        if terminator and rec.endswith(terminator):
            return rec[:-len(terminator)]
        return rec

    def _strip_recdelimiter(self, rec):
        """ For input files with a dialect that defines a record delimiter it
            will ensure that exactly one delimiter exists, then return all
            data prior to it.  Records with any other number of delimiters
            are reported on stderr, counted in bad_cnt, and None is returned.
            For input files without a dialect-defined record delimiter it
            simply returns the record.
        """
        recdelimiter = self.dialect.recdelimiter
        if not recdelimiter:
            return rec
        if rec.count(recdelimiter) != 1:
            # Diagnostics go to stderr: stdout may be carrying the output data.
            sys.stderr.write('bad rec found with len of %s and looks like %s\n'
                             % (len(rec), rec))
            self.bad_cnt += 1
            return None
        return rec.split(recdelimiter)[0]



class OutFileHandler(object):
    """ Handles the transformation as well as writing of data to a single output
        file.
        The file must be a csv, but it can be either a single-column delimiter
        or a multi-column delimiter.
        Opening & closing is also handled.

        Attributes:
        - dialect    - object providing delimiter, quote_type, quoting,
                       quotechar, recdelimiter and lineterminator
        - write_cnt  - number of records written so far
        - csv_writer - csv module writer, or None when writing manually
        - out_file   - the open output file, or sys.stdout
    """

    def __init__(self, output, dialect):
        """ Sets up environment to write files.  Will write to stdout if a file
            name was not provided.  Will use csv module if the delimiter is
            just a single character.
        """
        self.dialect        = dialect
        self.write_cnt      = 0
        self.csv_writer     = None  # only used by single-char files with csv module
        if output:
            self.out_file   = open(output, 'w')
        else:
            self.out_file   = sys.stdout
        # NOTE(review): the csv writer is not given dialect.lineterminator or
        # dialect.recdelimiter, so those only affect manually-written output.
        if len(self.dialect.delimiter) == 1:
            self.csv_writer = csv.writer(self.out_file,
                                         delimiter=self.dialect.delimiter,
                                         quoting=self.dialect.quote_type)

    def writeline(self, fields):
        """ writes a single record (a list of fields) to the output.
            This is handled either through the csv module writer or manually.
        """
        if self.csv_writer:
            self.csv_writer.writerow(fields)
        else:
            quoted = self._add_quotes(fields)
            rec    = self.dialect.delimiter.join(quoted)
            rec    = self._add_recdelimiter(rec)
            rec    = self._add_lineterminator(rec)
            self.out_file.write(rec)
        self.write_cnt += 1

    def _add_recdelimiter(self, rec):
        """ Add a record delimiter - if it was defined within the dialect.
            Note that this only applies to manually-written files, not writes
            handled by csv module.
        """
        if self.dialect.recdelimiter:
            return '%s%s' % (rec, self.dialect.recdelimiter)
        return rec

    def _add_lineterminator(self, rec):
        """ Appends the dialect's lineterminator to the record.
            Note that this only applies to manually-written files, not writes
            handled by csv module.
        """
        return '%s%s' % (rec, self.dialect.lineterminator)

    def _add_quotes(self, fields):
        """ Add quotes around each field if the dialect indicates the fields
            are quoted.  It will use the quotechar defined in the dialect, or
            a double-quote if the dialect does not define one.
            NOTE: that this only applies to manually-written files, not writes
            handled by csv module.
            NOTE: it quotes every single field - not just strings! """
        if not self.dialect.quoting:
            return fields
        # A dialect without a quotechar would otherwise put the text 'None'
        # around every field.
        quotechar = getattr(self.dialect, 'quotechar', None) or '"'
        return ['%s%s%s' % (quotechar, field, quotechar) for field in fields]

    def terminate(self):
        """ Handles final housekeeping for the object.  Specifically, that
            includes closing the output file.  When writing to stdout the
            stream is only flushed - it was not opened here, so it is not
            closed here either.
        """
        if self.out_file is sys.stdout:
            self.out_file.flush()
        else:
            self.out_file.close()



def get_in_dialect(opts, files, parser):
    """ Tries to use gristle to analyze an input file and automatically
        determine a csv dialect.  If this is not possible it requires
        cmdline options to manually assemble one.  Either way it returns
        this dialect.

        Automatic analysis is used when exactly one file was given and the
        delimiter option is either absent or a single character.  The
        returned dialect has auto=True in that case (the csv module will
        be used to read), and auto=False when assembled manually.

        Inputs:
        - opts   - parsed command line options
        - files  - list of input file names
        - parser - object whose .parser.error() is called to report
                   missing options for the manual case
        Outputs:
        - a csv dialect
        Raises:
        - EOFError if file analysis raises file_type.IOErrorEmptyFile
    """
    quote_type = csv.QUOTE_ALL if opts.quoting else csv.QUOTE_NONE
    single_char_delim = opts.delimiter is None or len(opts.delimiter) == 1

    if len(files) == 1 and single_char_delim:
        # automatically determine input dialect:
        typer = file_type.FileTyper(files[0],
                                    opts.delimiter,
                                    opts.recdelimiter,
                                    opts.hasheader)
        try:
            typer.analyze_file()
        except file_type.IOErrorEmptyFile:
            raise EOFError
        in_dialect                = typer.dialect
        in_dialect.auto           = True                 # will use csv mod
        in_dialect.recdelimiter   = opts.recdelimiter
        in_dialect.lineterminator = '\n'                 # prefer unix default
        in_dialect.quote_type     = quote_type
        return in_dialect

    # manually configure input dialect:
    # NOTE(review): quoting and hasheader are store_true flags, so these
    # checks force both to be switched on for manual parsing - confirm that
    # this is intended.
    if not opts.delimiter:
        parser.parser.error("Manual csv parsing required - but missing delimiter option")
    if not opts.quoting:
        parser.parser.error("Manual csv parsing required - but missing quoting option")
    if not opts.quotechar:
        parser.parser.error("Manual csv parsing required - but missing quotechar option")
    if not opts.hasheader:
        parser.parser.error("Manual csv parsing required - but missing hasheader option")

    # Must be an instance: assigning to the CustomCSV class itself would
    # leak these settings into every other CustomCSV dialect.
    in_dialect                = CustomCSV()
    in_dialect.delimiter      = opts.delimiter
    in_dialect.quoting        = opts.quoting
    in_dialect.quotechar      = opts.quotechar
    in_dialect.quote_type     = quote_type
    in_dialect.recdelimiter   = opts.recdelimiter    # custom field
    in_dialect.has_header     = opts.hasheader
    in_dialect.lineterminator = '\n'                 # naive assumption
    in_dialect.auto           = False                # will not use csv mod
    return in_dialect


def get_out_dialect(opts):
    """ Determines then returns output dialect from command line options.

        Inputs:
        - opts - parsed command line options; out_delimiter must be set
        Outputs:
        - a CustomCSV dialect with auto=True when the output delimiter is
          a single character (the csv module will write it), else False
    """
    out_dialect                = CustomCSV()
    out_dialect.delimiter      = opts.out_delimiter
    out_dialect.recdelimiter   = opts.out_recdelimiter
    out_dialect.has_header     = opts.out_hasheader
    out_dialect.quoting        = opts.out_quoting
    # There is no separate output quotechar option, so reuse the quotechar
    # option and fall back to a double-quote.  Left unset, manually-written
    # quoted output would use whatever the dialect class happens to carry.
    out_dialect.quotechar      = getattr(opts, 'quotechar', None) or '"'
    out_dialect.lineterminator = '\n'                    # naive assumption
    if opts.out_quoting:                                 # naive assumption
        out_dialect.quote_type = csv.QUOTE_ALL
    else:
        out_dialect.quote_type = csv.QUOTE_NONE
    # single-char delimiters are written by the csv module, others manually
    out_dialect.auto           = (len(opts.out_delimiter) == 1)

    return out_dialect


class CommandLineParser(object):
    """ Manages all parsing, formating and validating of command line options
        and args.
        Tested via test-harness.

        Attributes:
        - parser - the optparse.OptionParser doing the work
        - opts   - parsed option values
        - files  - list of positional arguments (input file names)
    """
    def __init__(self):
        """ Defines the options, parses the command line (sys.argv) and
            validates the result.  Invalid input is reported through
            parser.error().
        """
        use = ('%prog is used to convert a single file from one format to another.\n'
               '\n'
               '   %prog [input file] [misc options] '
               '\n')
        self.parser = optparse.OptionParser(usage = use)
        self._define_options()
        self.opts, self.files = self.parser.parse_args()
        self._validate_misc()

    def _define_options(self):
        """ Defines all command line options.
        """
        add = self.parser.add_option

        add('-o', '--output',
            help='Specifies output file. Default is stdout.')
        # -q and -v share one destination; verbose stays True by default
        add('-q', '--quiet',
            action='store_false',
            dest='verbose',
            default=True,
            help='provides less detail')
        add('-v', '--verbose',
            action='store_true',
            dest='verbose',
            default=True,
            help='provides more detail')
        add('-d', '--delimiter',
            help=('Specify the input file delimiter within quotes. '
                  'This is especially useful for multi-col delimiters.'))
        add('-D', '--outdelimiter',
            dest='out_delimiter',
            help=('Specify the output file delimiter within quotes. '
                  'This is especially useful for multi-col delimiters.'))
        add('-r', '--recdelimiter',
            help='Specify a quoted end-of-record delimiter within input file. ')
        add('-R', '--outrecdelimiter',
            dest='out_recdelimiter',
            help='Specify a quoted end-of-record delimiter in output file. ')
        add('--quoting',
            default=False,
            action='store_true',
            help='Specify field quoting - generally only used for stdin data.'
                 '  The default is False.')
        add('-Q', '--outquoting',
            dest='out_quoting',
            default=False,
            action='store_true',
            help='Specify output field quoting')
        add('--quotechar',
            default='"',
            help='Specify field quoting character - generally only used for '
                 'stdin data.  Default is double-quote')
        add('--hasheader',
            default=False,
            action='store_true',
            help='Indicates the existence of a header in the input file.')
        add('-H', '--outhasheader',
            default=False,
            action='store_true',
            dest='out_hasheader',
            help='Specify that a header within the input file will be retained in the output file')
        add('--stripfields',
            default=False,
            action='store_true',
            dest='stripfields',
            help='Strip leading and trailing spaces from each field')

    def _validate_misc(self):
        """ Handles simple option validation: an output delimiter and
            exactly one input file are required.  Each failure is reported
            through parser.error().
        """
        if not self.opts.out_delimiter:
            self.parser.error('Please provide output delimiter')

        if not self.files:
            self.parser.error('Input file argument is missing')

        if len(self.files) > 1:
            if not self.opts.delimiter:
                self.parser.error('Please provide delimiter when piping data into '
                                  'program via stdin or reading multiple input files')
            self.parser.error('Too many file names provided')


# pylint: disable=W0231
class CustomCSV(csv.Dialect):
    """ A csv.Dialect subclass with validation shut off, so that multi-char
        delimiters can be stored within the normal dialect variables.
    """
    # custom field added to complement lineterminator; unset by default
    recdelimiter = None

    def __init__(self):
        """ Deliberately does not call the parent constructor, so that no
            validation process is run and multi-char delimiters can be kept.
        """
        return
# pylint: enable=W0231


# Only run the conversion when executed as a script; the value returned by
# main() becomes the process exit status.
if __name__ == '__main__':
    sys.exit(main())

