#!/usr/bin/env python
"""
Parse log files and send them to the NetLogger information broker or a file.
"""
__rcsid__ = "$Id$"
__author__ = "Dan Gunter"

# System
import glob
import imp
import logging
import os
import re
import socket
import sys
import time
import types

# Local
from netlogger import class_info
from netlogger import nlapi
from netlogger import nldate
from netlogger import info_broker
from netlogger import util, module_util
from netlogger.nllog import OptionParser, get_logger, DoesLogging
from netlogger.amqp.connection import NlProduce, Connect, ConnectionException
from netlogger.amqp.scriptutil import AMQPOptionParser

# Default network ports, used when the user passes port < 0 ("use default"):
# the NetLogger info-broker fan-out port and the standard AMQP broker port.
_P = {
    'broker_port' : info_broker.FANOUT_PORT,
    'amqp_port' : 5672,
}

#
# Signal handlers
#

# Stop things that are in a loop.
# Module-level flag polled by run() and connect_tcp(); set by on_kill().
g_stop = False

def on_kill(signo, frame):
    """Handle a fatal signal by requesting a graceful shutdown.

    Sets the module-level g_stop flag, which the main loops poll so
    they can finish the current iteration and clean up.
    """
    global g_stop
    get_logger(__file__).warn("killed", signo=signo)
    g_stop = True

def on_hup(signo, frame):
    """Ignore SIGHUP so the process survives terminal hangups."""
    pass

#
# List of input files
#

class OffsetFileException(Exception):
    """Raised when the offset file cannot be created, read, or parsed."""

class InputFileList(DoesLogging):
    def __init__(self, filenames, refresh=60, offset_file=None):
        """Constructor.

        Parameters:
        
          filenames - (list/str) Glob patterns or filenames
          refresh - (int) Period in seconds for re-scan of filenames, zero meaning never
          offset_file - (str) Load/save offsets from this filename
          
        Exceptions:
        
          OffsetFileException - offset_file error
          
        """
        DoesLogging.__init__(self)
        self._pat = filenames
        # Time of last scan
        self._scan_time = 0
        self._manual_scan = refresh <= 0
        # Last known set of files
        self._files = [ ]
        # Dictionary whose keys are files and values are file sizes.
        self._file_sizes = { }
        # Init offsets
        if offset_file:
            self._init_offsets(offset_file)
        else:
            self._offs_file = None

    def _init_offsets(self, offset_file):
        """Initialize offset file, which provides for more durable operation.

        Exceptions:
        
          OffsetFileException - offset_file error

        """
        self._offs_file = offset_file
        self._saved_offs = { } # offsets of 'finished' files
        if not os.path.exists(offset_file):
            # initialize as empty file
            try:
                open(self._offs_file, "w")
            except IOError, err:
                raise OffsetFileException("Cannot create: %s" % err)
        else:
            # Read existing offsets and set up files appropriately
            for filename, offset in self._read_offsets().items():
                try:
                    fileobj = open(filename, "r")
                    fileobj.seek(offset)
                    self._files.append(fileobj)
                    self._file_sizes[fileobj] = offset
                except IOError, err:
                    # it's ok if these offsets don't exist any more
                    self.log.warn("get_offsets.file_not_found",
                                  file=filename, action="ignore")
            # immediately write out offset file corrected for reality
            self._write_offsets()

    def close(self):
        """Finalize and clean up.
        """
        self.log.debug("close.start")
        if self._offs_file:
            self._write_offsets()
        self.log.debug("close.end", status=0)
            
    def get_files(self):
        """Get a list of current readable files.

        Return: list of file objects
        """
        if not self._manual_scan:
            if time.time() - self._scan_time >= refresh:
                self.rescan()
        return self._files

    def rescan(self):
        """Scan all patterns and update file list.
        """
        if self._dbg:
            self.log.debug("rescan.start", num_files=len(self._files))
        # Record time of this scan.
        self._scan_time = time.time()
        # Init loop vars.
        scan_files, cur_file_dict, scan_sizes = [ ], { }, { }
        for f in self._files: # use this in loop below
            cur_file_dict[f.name] = f
        # Loop over all file patterns.
        for pattern in self._pat:
            if pattern == '-':
                scan_files.append(sys.stdin)
                continue
            # Loop over each file matched by pattern.
            for filename in glob.glob(pattern):
                old_fileobj = cur_file_dict.get(filename, None)
                # Try to open the file.
                try:
                    fileobj = open(filename, mode='r')
                    filesz = os.stat(filename).st_size
                except IOError, err:
                    # Only report IO errors if the file was previously open.
                    if old_fileobj:
                        self.log.warn("rescan.open.error", file=filename, msg=err)
                    continue
                # Check if file was truncated (always False for new files).
                is_truncated = filesz < self._file_sizes.get(old_fileobj, 0)
                if is_truncated:
                    self.log.info("rescan.file_truncated", file=filename,
                                  when=nldate.utcFormatISO(
                                      os.stat(filename).st_mtime))
                # Factor truncation into new-file logic.
                is_new_file = (not old_fileobj) or is_truncated
                # For new files, remove the saved offset if any.
                if is_new_file:
                    if self._offs_file and filename in self._saved_offs:
                        del self._saved_offs[filename]
                # For existing files, seek to current offset.
                # Failure to tell() or seek() will log errors, but not reject
                # the file for reading.
                else:
                    offs = -1 # flag that the tell() failed
                    try:
                        offs = old_fileobj.tell()
                    except IOError, err:
                        self.log.error("rescan.tell.error",
                                      file=filename, msg=err)
                    if offs >= 0:
                        try:
                            fileobj.seek(offs)
                        except IOError, err:
                            self.log.error("rescan.seek.error", offset=offs,
                                           file=filename, msg=err)
                # Add the new file object to the list of files found on this scan.
                scan_files.append(fileobj)
                # Put the file size into the dictionary
                scan_sizes[fileobj] = filesz
        # Replace old list of files with those from this scan.
        self._files = scan_files
        # Replace file size dictionary with new values from this scan.
        self._file_sizes = scan_sizes
        # If we're persisting offsets, do that now.
        if self._offs_file:
            self._write_offsets()
        if self._dbg:
            self.log.debug("rescan.end", num_files=len(self._files), status=0)
            if self._trace:
                self.log.trace("rescan.stats", files=self._files,
                               sizes=self._file_sizes)

    def save_offset(self, fileobj):
        """Alert this class the file is closing, so its offset
        can be saved (only needed when offset_file is not None).
        """
        if self._offs_file:
            self._saved_offs[fileobj.name] = fileobj.tell()
            
    def _read_offsets(self):
        """Read offsets from the offset file.

        Returns: dict of { filename : offset }
        """
        offsets = { }
        try:
            f = open(self._offs_file, "r")
        except IOError, err:
            raise OffsetFileException(str(err))
        for i, line in enumerate(f):
            line = line.strip()
            try:
                offset, filename = line.split(None,1)
                offsets[filename] = int(offset)
            except ValueError:
                raise OffsetFileException("Bad offset value at line %d of '%s': '%s'" % (
                    i+1, self._offs_file, line))
        return offsets

    def _write_offsets(self):
        """Write current offsets to the offset file.
        """
        if self._dbg:
            self.log.debug("offsets.write.start", num_files=self._files)
        try:
            f = open(self._offs_file, "w")
        except IOError, err:
            raise OffsetFileException(str(err))
        # write current 'live' files
        for fileobj in filter(lambda f: not f.closed, self._files):
            name = fileobj.name
            offset = fileobj.tell()
            f.write("%d %s\n" % (offset, name))
        # write saved (closed) offsets as well
        for name, offset in self._saved_offs.items():
            if os.path.exists(name): # maybe someone removed it?
                f.write("%d %s\n" % (offset, name))
        f.close()
        if self._dbg:
            self.log.debug("offsets.write.end", status=0)
                
def load_class(module_name, class_name):
    """Load class 'class_name', for specified module, which is specified
    as a dotted name following 'netlogger', e.g. "parsers.modules.bp".
    If the name starts with a leading '.', then "netlogger" is NOT
    prepended.

    Raises ImportError if there was a problem loading the module
    or instantiating the appropriate class.
    """
    clazz = None
    mod_name = "netlogger.parsers.modules.%s" % module_name
    try:
        mod = module_util.load_module(mod_name)
        clazz = vars(mod)[class_name]
    except module_util.ModuleLoadError, err:
        raise ImportError(err)
    return clazz

def init_class(clazz, filename, init_kw):
    """Create new instance of parser module, class 'clazz'.

    Parameters:
      clazz - Parser class to instantiate
      filename - (str) Input filename, passed as first constructor argument
      init_kw - (dict) Extra keyword arguments for the constructor

    Return new class instance or None on error
    """
    # Bug fix: 'log' was referenced without being defined in this scope,
    # raising NameError instead of logging; obtain a logger as the other
    # module functions do.
    log = get_logger(__file__)
    log.debug("init.class.begin", name=clazz, keywords=init_kw)
    try:
        parser = clazz(filename, **init_kw)
        log.debug("init.class.end", status=0)
    except Exception, error:
        # include traceback only at debug verbosity to keep logs compact
        if log.isEnabledFor(logging.DEBUG):
            log.error("init.class.end", status=-1, msg=error, 
                      traceback=util.traceback())
        else:
            log.error("init.class.end", status=-1, msg=error)
        parser = None
    return parser

def parser_flush(parser, log_out):
    """Finalize 'parser' and write its remaining buffered events to 'log_out'.

    Return: (int) number of events written to the output
    """
    log = get_logger(__file__)
    log.debug("parser.flush.start")
    parser.finalize()
    num_events = 0
    for event in parser.flush():
        log_out.write(**event)
        num_events += 1
    log.debug("parser.flush.end", count=num_events, status=0)
    return num_events

def connect_tcp(url, reconnect=1):
    """Connect to 'url', retrying until success or shutdown.

    If reconnect is 0, give up after the first failure. Otherwise retry
    forever, sleeping 'reconnect' seconds between attempts. A pending
    kill signal (g_stop) also terminates the retry loop.

    Return: nlapi.Log instance, or None
    """
    global g_stop
    log = get_logger(__file__)

    connection = None
    while not g_stop:
        try:
            connection = nlapi.Log(url, level=nlapi.Level.ALL)
        except nlapi.Log.OpenError, error:
            if reconnect <= 0:
                # user asked for no retries: flag shutdown and give up
                log.warn("run.error", type="socket", 
                         msg="reconnect set to 0 - not retrying")
                g_stop = True
            else:
                log.warn("run.error", type="socket", msg=error)
                time.sleep(reconnect)
        else:
            log.debug("run", value="Connection successful")
            break
    return connection
#
# Run function
#

def run(parser_class, init_kw, 
        output_file=None, host=None, port=None, reconnect=None, 
        input_patterns=[ ], tail=None, rescan=None, offset_file=None,
        flush_sec=None, amqp_host=None, amqp_options={ },
        amqp_disconnect=False, progress=False):
    """Main processing loop: scan inputs, parse them, write events to output.

    Parameters:
      parser_class - Parser class, instantiated once per input file
      init_kw - (dict) Keyword args for the parser constructor
      output_file - (str) Output URL/filename, or None
      host, port - NetLogger broker address; port < 0 selects the default
      reconnect - (int) Seconds between broker reconnect tries, 0 = no retry
      input_patterns - (list) Glob patterns / filenames; '-' means stdin
      tail - (bool) Keep following files at EOF instead of stopping
      rescan - (int) Seconds between re-scans of the input patterns
      offset_file - (str) File persisting per-input offsets, or None
      flush_sec - (int) Flush output after this many seconds of inactivity
      amqp_host - (str) AMQP broker host, or None
      amqp_options - (dict) Extra AMQP connection/producer options
      amqp_disconnect - (bool) Send AMQP disconnect message on exit
      progress - (bool) Show a progress meter on stderr

    Return: 0 on success, -1 on output connection error
    """
    global g_stop
    _fname = lambda f: f.name # for more easily print lists of fileobj
    log = get_logger(__file__)
    log.info("run.start")
    if progress:
        progress_meter = util.ProgressMeter(sys.stderr, units="line")
    else:
        progress_meter = util.NullProgressMeter()        
    # refresh=0: this loop triggers rescans itself (see 'rescan' below)
    ifl = InputFileList(input_patterns, refresh=0, offset_file=offset_file)
    parsers = { } # map of input files to parser instances
    last_scan = 0 # time of last scan of files
    max_per_parser = 100 # max events per parser in one loop
    log_stream = None
    # Choose the output URL: explicit file > broker host/port > stdout.
    if output_file is not None:
        url = output_file
    elif (host is not None) and (port is not None):
        # Python 2 idiom: pick the default port when the user gave port < 0
        p = (port, _P['broker_port'])[port < 0]
        url = "x-netlog://%s:%d" % (host, p)
    else:
        url = "-" # stdout
    log.debug("open.url", value=url)
    #log_out = nlapi.Log(url, level=nlapi.Level.ALL)
    log_out = None
    # if the url is a network connection, honor the -r/reconnect flag
    # when trying to connect
    if isinstance(url,types.StringType) and url.startswith('x-netlog'):
        log_out = connect_tcp(url, reconnect=reconnect)
    elif amqp_host is not None:
        if not NlProduce:
            log.error('run.error', msg='py-amqplib support not enabled')
            return -1
        p = (port, _P['amqp_port'])[port < 0]
        try:
            conn = Connect(amqp_host, p, **amqp_options)
            log_out = NlProduce(connection=conn, **amqp_options)
        except ConnectionException, e:
            log.error('run.error', msg=e)
            return -1
    else:
        log_out = nlapi.Log(url, level=nlapi.Level.ALL)
    
    # Files that finished (non-tail mode) and must not be re-parsed.
    # NOTE(review): this holds file *objects*, but rescan() opens fresh
    # objects for the same path on every scan, so membership may never
    # match after a rescan — confirm whether paths were intended here.
    blacklist = set([])
    # Main loop
    total_count = 0
    # these two variables are for periodic output flushing
    first_write, is_flushed = 0, True
    while not g_stop:
        log.debug("run.loop.start")
        # Periodically re-scan input files
        if time.time() - last_scan > rescan:
            last_scan = time.time()
            num_before = len(parsers)
            log.debug("scan.start", num_files=num_before)
            ifl.rescan()
            input_files = frozenset(ifl.get_files())
            if log.isEnabledFor(nlapi.Level.TRACE):            
                log.trace("scan.input_files", value=map(_fname, input_files))
            # Remove old parsers for removed files
            parser_files = frozenset(parsers.keys())
            for to_remove in parser_files - input_files:
                parsers[to_remove].close()
                del parsers[to_remove]
            # Add new parsers for new files, that are not on blacklist
            parser_files = frozenset(parsers.keys())
            for to_add in input_files - parser_files - blacklist:
                p = parser_class(to_add, **init_kw)
                if p is None:
                    raise IOError("Cannot init parser for file '%s'" %
                                  to_add)
                parsers[to_add] = p
            log.debug("scan.end", num_files=len(parsers))
            if len(parsers) != num_before:
                if len(parsers) > num_before:
                    log.info("scan.added_files", num=len(parsers) - num_before)
                else:
                    log.info("scan.dropped_files", num=num_before - len(parsers))
        # If not tailing files, stop when no more
        if not tail and not parsers:
            log.debug("run.loop.end", msg="no more parsers")
            g_stop = True
            break
        # Get data from all parsers, up to max_per_parser events each
        count , eof_files, hit_eof = 0, [ ], 0
        for fileobj, parser in parsers.items():
            for i in xrange(max_per_parser):
                datum = None
                try:
                    datum = parser.next()
                except StopIteration:
                    if tail:
                        # tailing: remember EOF but keep the parser around
                        hit_eof += 1
                        eof_files.append(fileobj)
                    else:
                        # Tell parser EOF, see if any more events come back
                        parser.done()
                        try:
                            datum = parser.next()
                        except StopIteration:
                            # Nope, this really is EOF
                            eof_files.append(fileobj)
                if datum is None:
                    break
                count += 1
                total_count += 1                
                log_out.write(**datum)
                progress_meter.advance(total_count)
                # if this is first write since flush, reset time
                if is_flushed:
                    log.debug("first_write")
                    first_write = time.time()
                    is_flushed = False
        # Remove files at EOF
        if eof_files:
            log.debug("parsers.remove.start", num=len(eof_files), total=len(parsers))
            for fileobj in eof_files:
                # if not tailing, remove the file forever
                if not tail:
                    parser = parsers[fileobj]
                    # drain any final buffered events before closing
                    total_count += parser_flush(parser, log_out)
                    ifl.save_offset(fileobj) # remember where we were..
                    parser.close()
                    del parsers[fileobj]
                    blacklist.add(fileobj)                    
            log.debug("parsers.remove.end", remaining=len(parsers))
        # Sleep if all non-removed parsers hit eof
        if hit_eof >= len(parsers):
            time.sleep(0.2)
            # flush if enough time has elapsed
            if not is_flushed and time.time() - first_write > flush_sec:
                log.debug("flush.start")
                log_out.flush()
                first_write, is_flushed = 0, True
                log.debug("flush.end", status=0)
        if log.isEnabledFor(logging.DEBUG):
            log.debug("run.loop.end", total=total_count, count=count,
                      hit_eof=hit_eof, num_files=len(parsers))
    # cleanup
    log.info("run.cleanup.start", num_parsers=len(parsers))
    # Flush parsers, then save offsets, then close parsers.
    for parser in parsers.values():
        total_count += parser_flush(parser, log_out)
    ifl.close()
    for parser in parsers.values():
        parser.close()
    # Disconnect from AMQP if using it.
    if amqp_disconnect:
        if hasattr(log_out, 'send_disconnect'):
            log_out.send_disconnect()
    # Close output
    if hasattr(log_out, 'close'):
        log_out.close()
    log.info("run.cleanup.end", status=0)
    log.info("run.end", status=0, count=total_count)
    return 0

# usage: nl_parse [OPTIONS] MODULE [name=value ...] [FILES..]
#
# Arguments:
#
def main():
    """Program entry point.

    Parses command-line options, loads the requested parser module, sets
    up signal handlers, and delegates to run().

    Return: (int) exit status; 0 on success, negative on error.
    """
    usage = """%prog [options] module [params..] [files..]
Details:
    options - Program options.
    module  - Name of the parsing module, found in netlogger/parsers/modules/.
              For example, module "bp" would refer to netlogger/parsers/modules/bp.py
    params  - Module arguments, in the form "keyword=value", passed to the
              constructor of the module's Parser class. Non-string arguments,
              e.g. files, can be passed by surrounding expressions to create them in
              curly braces, e.g.  ostrm={file('foo.out','w')}.
    files   - Extra arguments that are NOT in the form keyword=value are interpreted
              as input files. In order to get a running nl_parse to find "new" files 
              that match a pattern such as /var/log/my*.log, you must quote the pattern
              e.g., "/var/log/my*.log".
"""
    # Use the module docstring, collapsed to one line, as the description.
    desc = ' '.join(__doc__.split())
    parser = AMQPOptionParser(usage=usage, description=desc)
    parser.add_option('-c', '--broker', dest='host', metavar='HOST', default=None,
                      help='Write parsed data to NetLogger broker at HOST' +
                      '(default port=%d)' % _P['broker_port'])
    parser.add_option('-d', '--amqp_disconnect', action="store_true", default=False,
                      dest="amqp_disconnect",
                      help="send disconnect message to AMQP server when done. " +
                      "No effect if not used with -a.")
    parser.add_option('-f', '--flush', dest='flush', action='store',
                      type='int', default=1, metavar='INTERVAL',
                      help='Flush output file after INTERVAL seconds of '
                      'inactivity (default=%default)')
    parser.add_option("-g", "--progress", action="store_true",
                      dest="progress", default=False,
                      help="report progress to stderr")
    parser.add_option('-i', '--info', dest='info', action='store_true',
                      default=False,
                      help="Print information on selected module")
    parser.add_option('-l', '--list', dest='listmod', action='store_true',
                      help="List available modules")
    parser.add_option('-o', '--output', dest='filename', metavar='FILE',
                      default=None,
                      help='Write NetLogger logs to FILE (default=stdout)')
    parser.add_option('-O', '--offset-file', dest='offset_file', metavar='FILE',
                      default=None, help="Load/maintain file offsets in FILE, so that "
                      "subsequent runs don't process duplicate data (default=%default)")
    parser.add_option('-p', '--port', dest='port', type='int', metavar='PORT',
                      default=-1,
                      help="For info_broker or amqp server, the port to connect to"
                      " (default=info_broker %d, amqp broker %d)" \
                      % (_P['broker_port'], _P['amqp_port']))
    parser.add_option('-r', '--reconnect', dest='reconnect', type='int', metavar='SEC',
                      default=10,
                      help="If connection to broker at HOST fails, "
                      "try again every SEC seconds (default=%default). "
                      "0=don't retry")
    parser.add_option('-s', '--rescan', dest="rescan", type='int', metavar='SEC',
                      default=10,
                      help="Rescan directory for files matching the "
                      "input patterns every SEC seconds (default=%default)")
    parser.add_option('-t', '--tail', dest='tail', action='store_true', default=False,
                      help="Tail input files instead of stopping at EOF")
    options, args = parser.parse_args(sys.argv[1:])
    log = get_logger(__file__)  # Must come after parsing args
    log.debug("parse.args.start")
    # Check for 'list' mode
    if options.listmod:
        print 'Available modules:'
        avail = [ ]
        try:
            avail = module_util.list_modules('parsers','modules')
        except ImportError, error:
            parser.error('No module found: %s' % error)
        print ', '.join(avail)
        return 0
    # Load the class
    if len(args) == 0:
        parser.error("A module name is required")
    module_name = args[0]
    try:
        clazz = load_class(module_name, 'Parser')
    except ImportError, err:
        log.error("import.error", module=module_name, msg=err)
        return -1
    # Check for 'info' mode:
    if options.info:
        print(module_util.module_info('parser', module_name, clazz))
        return 0
    # Parse keywords: "key=value" args become parser constructor keywords;
    # anything else is treated as an input file/pattern.
    init_kw, input_files = { }, [ ]
    for init_arg in args[1:]:
        parts = init_arg.split('=', 1)
        if len(parts) != 2:
            input_files.append(init_arg)
            continue
        key, value = parts[0], parts[1]
        init_kw[key] = value
    # By default read from stdin
    if not input_files:
        input_files = ['-']
    log.debug("parse.args.end", status=0)
    # Set up signal handlers
    log.debug("init.signals.start")
    util.handleSignals(
        (on_kill, ('SIGTERM', 'SIGINT', 'SIGUSR2')),
        (on_hup, ('SIGHUP',)) )
    log.debug("init.signals.end", status=0)
    # NetLogger broker and AMQP outputs are mutually exclusive.
    if options.host and options.amqp_host:
        parser.error("Cannot write to both amqp and info-broker")
    # parse AMQP options, if appropriate
    # NOTE(review): amqp_host/amqp_option appear to be defined by
    # AMQPOptionParser — confirm against netlogger.amqp.scriptutil.
    if options.amqp_host or options.amqp_option:
        amqp_options = parser.get_amqp_options(options)
        options.ensure_value('amqp_host', 'localhost')
    else:
        amqp_options = { }
    # Run
    try:
        status = run(clazz, init_kw, output_file=options.filename,
                     host=options.host, port=options.port,
                     reconnect=options.reconnect, input_patterns=input_files,
                     tail=options.tail, rescan=options.rescan, offset_file=options.offset_file,
                     flush_sec=options.flush, amqp_host=options.amqp_host,
                     amqp_options=amqp_options,
                     amqp_disconnect=options.amqp_disconnect,
                     progress=options.progress)
    except OffsetFileException, err:
        # offset-file problems are fatal but get a distinct status code
        log.critical("run.error", msg=err)
        status = -2
    except Exception, error:
        log.critical("run.error", msg=util.traceback())
        status = -1
    return status

# Script entry point: process exit status is main()'s return value.
if __name__ == '__main__':
    sys.exit(main())
