#!/usr/bin/env python
"""
Read NetLogger logs, output the interval between
the .start and .end events.

The output is NetLogger log format, a simple CSV with the event name, key,
and interval, or an ASCII histogram of the intervals
"""
__author__ = "Dan Gunter <dkgunter@lbl.gov>"
__rcsid__ = "$Id: nl_interval 28609 2011-10-07 12:23:22Z dang $"

import csv
import math
import re
import sys
#
from netlogger.analysis.startend import StartEndMatcher, BeginEndMatcher
from netlogger.nllog import DoesLogging
from netlogger.nlapi import Level, Log
from netlogger.nlapi import EVENT_FIELD, TS_FIELD, LEVEL_FIELD
from netlogger.nllog import get_logger, OptionParser
from netlogger.parsers.base import NLSimpleParser
from netlogger.util import ProgressMeter, NullProgressMeter, handleSignals

## Signal handlers

def on_kill(signo, frame):
    """Signal handler: log the abort (with the triggering signal number)
    and terminate with exit status 1.
    """
    log = get_logger(__file__)
    log.warn("abort", signal=signo)
    sys.exit(1)

## Constants and globals

# Field name under which the computed start-to-end interval is written
# when producing NetLogger "log" output (see Interval._log_handler).
INTVL_FIELD = 'nl.intvl'

## Classes

class Interval(DoesLogging):
    """Calculate and output the interval between input start/end events.

    One of three output handlers is selected by ``output_type``:
      - "csv":  one CSV row per matched pair (event, key, interval, plus
                optional timestamp and user-selected extra columns)
      - "log":  one NetLogger-format record per pair, with the interval
                stored in the INTVL_FIELD field
      - "hist": intervals are accumulated per event and printed as
                histograms by finalize()
    """
    def __init__(self, matchers=None, output_type="csv", output_args=(),
                 progress_meter=None, add_ts=True,
                 drop_duplicate_events=True, ordered=True,
                 save_unfinished_file=None,
                 csv_col=None):
        """Initialize output handlers and store parameters.

        Args:
          - matchers: EventMatcherList used to pair start/end events
          - output_type (str): "csv", "log" or "hist"
          - output_args (tuple): handler-specific; (csv.writer,) for "csv",
            (nlapi.Log,) for "log", (number_of_bins,) for "hist"
          - progress_meter: object with an advance(n) method
          - add_ts (bool): for "csv", include the start timestamp column
          - drop_duplicate_events, ordered: accepted for interface
            compatibility; not used directly here — NOTE(review): matching
            policy appears to be configured on the matchers; confirm
          - save_unfinished_file (str): if given, unmatched start events
            are written to this file at the end of process_file()
          - csv_col (list): extra attribute names to copy into CSV rows

        Raises:
          - ValueError: if output_type is not one of the known types
        """
        DoesLogging.__init__(self)
        self._matchers = matchers
        if output_type == "csv":
            self._handler = self._csv_handler
            self._handler_kw = { 'writer': output_args[0],
                                 'add_ts' : add_ts }
            self._hdr = True  # emit the header row on first output
            # Guard against None so _csv_handler can always iterate/concat it
            self._columns = csv_col if csv_col is not None else []
        elif output_type == "log":
            self._handler = self._log_handler
            self._handler_kw = { 'log' : output_args[0] }
            self._req_fields = (TS_FIELD, EVENT_FIELD, LEVEL_FIELD)
        elif output_type == "hist":
            self._handler = self._histogram_handler
            self._handler_kw = {'event_intervals' : { }}
            self._nbins = output_args[0]
        else:
            # Previously an unknown type left self._handler unset and
            # failed later with AttributeError; fail fast instead.
            raise ValueError("Unknown output_type: '%s'" % output_type)
        if save_unfinished_file is not None:
            self._ulog = Log(None)
            self._ufile = open(save_unfinished_file, 'w')
        else:
            self._ulog, self._ufile = None, None
        self._progress_meter = progress_meter

    def _advance(self, n):
        """Report progress (n lines so far) to the progress meter."""
        self._progress_meter.advance(n)

    def process_file(self, infile=None):
        """Process one file.

        Parses each line, feeds recognized events to the appropriate
        start/end matcher, and invokes the configured output handler for
        each completed (start, end) pair. If a save-unfinished file was
        configured, unmatched start events are flushed to it at the end.

        Args:
          - infile: iterable of NetLogger log lines (an open file)
        """
        parser = NLSimpleParser()
        used_matchers = set()
        i = 0
        for line in infile:
            i += 1
            d = parser.parseLine(line)
            self._advance(i)
            # Skip input without an event name (i.e., junk)
            if EVENT_FIELD not in d:
                continue
            # Find appropriate start/end matcher for this event
            matcher = self._matchers.get_matcher(d[EVENT_FIELD])
            if matcher is None:
                continue
            # Add event, invoke appropriate handler for results
            matcher.add(d)
            used_matchers.add(matcher)
            for result in matcher.getResults():
                start, end, key = result
                if start and end:
                    self._handler(start, end, '/'.join(key),
                                  end['ts'] - start['ts'],
                                  **self._handler_kw)
        # Optionally write out any unmatched start events to a separate file
        if self._ulog:
            for matcher in used_matchers:
                matcher.flush() # force unfinished events into result set
                for start in filter(None,
                                    [r[0] for r in matcher.getResults()]):
                    buf = self._ulog.write(**start)
                    self._ufile.write(buf)

    def finalize(self):
        """Perform any final output.

        Only the histogram handler buffers data; for it, print one
        histogram per event name. CSV and log output is written as events
        are processed, so nothing is needed here for those types.
        """
        if self._handler == self._histogram_handler:
            for event_name, values in \
                    self._handler_kw['event_intervals'].items():
                print("Event: %s" % event_name)
                print_hist(values, self._nbins)

    def _csv_handler(self, start, end, key, interval, writer=None, add_ts=True):
        """Handle one event, writing it out as CSV.

        On first call also writes the header row. Extra user-requested
        columns are taken from the end event when present in both (a
        warning is logged on mismatch), otherwise from the start event,
        otherwise left empty.
        """
        if self._hdr:
            if add_ts:
                row = ['ts','event', 'key', 'interval_sec']
            else:
                row = ['event', 'key', 'interval_sec']
            row = row + self._columns # add user-defined columns
            writer.writerow(row)
            self._hdr = False
        # Strip the ".start"/".begin" suffix (6 chars) from the event name
        event_base = start[EVENT_FIELD][:-6]
        interval_str = '%lf' % interval
        if add_ts:
            row = [start[TS_FIELD], event_base, key, interval_str]
        else:
            row = [event_base, key, interval_str]
        for extra in self._columns:
            # 'in' replaces deprecated dict.has_key (removed in Python 3)
            if extra in start:
                if extra in end:
                    value = end[extra]
                    v2 = start[extra]
                    if value != v2:
                        self.log.warn("attr.mismatch", attr=extra,
                                      start__value=v2,
                                      end__value=value, msg="use end value")
                else:
                    value = start[extra]
            else:
                value = ""
            row.append(value)
        writer.writerow(row)

    def _log_handler(self, start, end, key, interval, log=None):
        """Handle one event, writing it out as a NetLogger log record.

        The output record is the start event merged with the end event:
        required fields (ts/event/level) are regenerated, attributes that
        differ between start and end are split into '<k>.start'/'<k>.end',
        and the interval is stored under INTVL_FIELD.
        """
        intvl = start.copy()
        event_base = start[EVENT_FIELD][:-6]
        for k, v in end.items():
            if k in self._req_fields:
                del intvl[k]
            elif k in intvl:
                if v != intvl[k]:
                    intvl[k + '.start'] = intvl[k]
                    intvl[k + '.end'] = v
                    del intvl[k]
            else:
                intvl[k] = v
        intvl[TS_FIELD] = start[TS_FIELD]
        intvl[LEVEL_FIELD] = Level.INFO
        intvl[INTVL_FIELD] = interval
        log.write(event_base + '.intvl', **intvl)

    def _histogram_handler(self, start, end, key, interval,
                           event_intervals=None):
        """Handle one event, saving its interval for a later histogram.

        Intervals are grouped by base event name (suffix stripped) in the
        shared event_intervals dict; finalize() prints them.
        """
        event_base = start[EVENT_FIELD][:-6]
        bucket = event_intervals.get(event_base, None)
        if bucket is None:
            bucket = event_intervals[event_base] = [ ]
        bucket.append(interval)

class EventMatcherList:
    """Map event-name regular expressions to matcher instances."""

    def __init__(self, patterns=None, matcher_class=None, **matcher_kw):
        """Encapsulate list of event-id matcher objects with patterns of
        event names to which they are assigned.

        Kwargs:
          - patterns (list): List of strings of the form:
              ["foo:bar,baz", "elmo:room,theme"].
            An entry without a ':' supplies the default field list, used
            for events not matched by any explicit pattern.
          - matcher_class: class invoked as matcher_class(fields, **matcher_kw)
            to build one matcher per pattern

        Raises:
          - ValueError: If an event pattern occurs more than once
        """
        # None sentinel avoids the shared mutable-default-argument pitfall
        if patterns is None:
            patterns = []
        self._expr, all_fields, self._matchers = [ ], None, [ ]
        seen_regex = set()
        # Loop over input patterns
        for pat in patterns:
            if ':' not in pat:
                if all_fields:
                    raise ValueError("Duplicate empty event pattern")
                all_fields = pat.split(',')
            else:
                regex, fields = pat.split(':', 1)
                if regex in seen_regex:
                    raise ValueError("Duplicate event pattern: '%s'" % regex)
                seen_regex.add(regex)
                self._expr.append(re.compile(regex))
                field_list = fields.split(',')
                matcher = matcher_class(field_list, **matcher_kw)
                self._matchers.append(matcher)
        # Add default pattern last, if there is one, so explicit patterns
        # always take precedence in get_matcher()
        if all_fields:
            self._expr.append(None)
            matcher = matcher_class(all_fields, **matcher_kw)
            self._matchers.append(matcher)

    def get_matcher(self, event_name):
        """Find and return the matcher instance for this event.

        Returns the matcher for the first expression that matches
        `event_name` (a None expression is the catch-all default), or
        None if nothing matches.
        """
        result = None
        for i, expr in enumerate(self._expr):
            if expr is None or expr.match(event_name):
                result = self._matchers[i]
                break
        return result

## Functions
        
def print_hist(values, nbins, stream=None):
    """Print an ASCII histogram of `values`.

    Args:
      - values: non-empty sequence of numbers
      - nbins (int): number of bins; values <= 0 select a default of 10
      - stream: writable file-like object; defaults to sys.stdout
        (new optional parameter, backward-compatible)

    Each bin prints as "<lo>-<hi> <count>: <bar>", with bars scaled so
    the fullest bin spans the available display width.

    Fixes vs. previous version:
      - bin-edge labels were wrongly offset by min(bins) (a *count*)
      - ZeroDivisionError when every bin had the same count
      - bars could overflow the display width (now normalized by the
        maximum bin count)
      - integer division of the value range under Python 2
    """
    if stream is None:
        stream = sys.stdout
    if nbins <= 0:
        nbins = 10  # fallback default bin count
    vmin, vmax = min(values), max(values)
    vrange = vmax - vmin
    if vrange == 0:
        stream.write("Range of data is 0\n")
        return
    binwidth = float(vrange) / nbins  # float() guards Py2 integer division
    bins = [0] * nbins
    for v in values:
        bin_num = int((v - vmin) / binwidth)
        if bin_num == nbins:  # vmax itself lands in the last bin
            bin_num = nbins - 1
        bins[bin_num] += 1
    # Normalize bar lengths by the largest count: always > 0 here (values
    # is non-empty), so no division by zero, and bars never overflow.
    bmax = max(bins)
    display_width = 60
    hdr_col = 2 * len("%.3lf" % vmax) + 6
    display_avail = display_width - hdr_col
    for i in range(nbins):
        count = bins[i]
        # Bin edges are vmin + i*binwidth .. vmin + (i+1)*binwidth
        s = "%6.6lf-%6.6lf %4d: " % (
            vmin + i * binwidth, vmin + (i + 1) * binwidth, count)
        num_symbols = int((1.0 * count / bmax) * display_avail)
        if count > 0 and num_symbols == 0:
            symbols = "|" # non-zero count, but less than one symbol wide
        else:
            symbols = "#" * num_symbols
        stream.write(s + symbols + "\n")


def main(cmdline=None):
    """Command-line entry point.

    Parses options, installs signal handlers, wires up the requested
    output format, then runs an Interval processor over each input file.

    Args:
      - cmdline (list): argument list; defaults to sys.argv[1:]

    Returns:
      None on success (so `sys.exit(main())` exits with status 0);
      may exit early via parser.error() or the signal handler.
    """
    usage = "%prog [options] [files..]"
    # Help description is taken from the 2nd-3rd lines of the module docstring
    desc = ' '.join(__doc__.split('\n')[1:3])
    parser = OptionParser(usage=usage, description=desc)
    parser.add_option("-b", "--begin", action="store_true", dest="use_begin",
                      help="Use .begin as the suffix for first event in pair, "
                      "instead of .start") 
    parser.add_option("-c", "--columns", action="store", dest="columns",
                      help="For type 'csv', comma-separated list of "
                      "additional columns that should "
                      "be in the output", default="")
    parser.add_option('-d', '--duplicates', action="store_true", dest="dup",
                      help="Allow duplicate start events without "
                      "end events, or end events without a start, and "
                      "match them in FIFO order. Default is to drop "
                      "old .start or .end events when new ones come in")
    parser.add_option("-g", "--progress", action="store_true",
                      dest="progress",
                      default=False, help="report progress to stderr")
    parser.add_option('-i', '--ids',
                      action='append', default=[], dest='ids',
                      help="Set of identifying fields for a given "
                      "event pattern, " +
                      "using the syntax: " +
                      "[EVENT_REGEX:]FIELD1,..,FIELDN "
                      "(default=.*:event,guid). "
                      "May be repeated.")
    parser.add_option("-n", "--nbins", action="store", type="int",
                      dest="nbins",
                      help="For --type=hist, number of histogram bins. "
                      "The default is to automatically choose the number "
                      "of bins using the standard 'Scott' formula")
    parser.add_option('-r', '--ordered', action="store_true", dest="ordered",
                      help="Process data in file order: drop duplicate ends, "
                      "replace duplicated starts")
    parser.add_option('-s', '--save-file', dest='save_unfinished_file',
                      metavar="FILE",default=None,
                      help="Write unfinished events to FILE "
                      "(default=drop them)")
    parser.add_option('-t', '--type',
                      action="store", type="choice",
                      choices=('csv', 'log', 'hist'), dest='fmt',
                      default='csv',
                      help="Output type (default=%default). "
                      "Other choices are: csv=Comma-separated values, "
                      "log=NetLogger log format, hist=Histogram")
    if cmdline is None:
        cmdline = sys.argv[1:]
    options, args = parser.parse_args(cmdline)
    log = get_logger(__file__)  # Should be first done, just after parsing args
    # Set up signal handlers for a clean, logged exit (see on_kill)
    handleSignals((on_kill, ('SIGTERM', 'SIGINT', 'SIGUSR1',
                             'SIGUSR2', 'SIGHUP')))
    # Parse event matcher patterns; -b switches the start-suffix convention
    if options.use_begin:
        matcher_class = BeginEndMatcher
    else:
        matcher_class = StartEndMatcher
    try:
        matchers = EventMatcherList(patterns=options.ids, matcher_class=matcher_class)
    except ValueError, err:
        parser.error("option -i/--ids: %s" % err)
    # Set output format and the handler-specific constructor arguments
    outp_args, csv_col = None, None
    if options.fmt == 'csv':
        outp_type = "csv"
        outp_args = (csv.writer(sys.stdout),)
        if options.columns:
            csv_col = options.columns.split(",")
        else:
            csv_col = []
    elif options.fmt == 'log':
        outp_type = "log"
        olog = Log(logfile=sys.stdout)
        olog.setLevel(999) # everything!
        outp_args = (olog,)
    elif options.fmt == 'hist':
        outp_type = "hist"
        outp_args = (options.nbins,)
    # Note: final 'else:' not needed, optparse should catch illegal values
    # Init progress meter
    if options.progress:
        pm = ProgressMeter(sys.stderr)
    else:
        pm = NullProgressMeter()
    # Set input files; sys.stdin.name ('<stdin>') marks standard input
    if args:
        infile_names = args
    else:
        infile_names = [sys.stdin.name]
    # Run
    log.info("run.start", infiles=infile_names)
    intvl = Interval(matchers=matchers, output_type=outp_type,
                     output_args=outp_args,
                     progress_meter=pm,
                     drop_duplicate_events=(not options.dup),
                     ordered=options.ordered,
                     save_unfinished_file=options.save_unfinished_file,
                     csv_col=csv_col)
    for filename in infile_names:
        if filename == sys.stdin.name:
            infile = sys.stdin
        else:
            try:
                # NOTE(review): file() is Python 2 only; open() is the
                # portable spelling if this is ever ported to Python 3
                infile = file(filename)
            except IOError, E:
                log.error("run.end", status=-1, msg=E, file=filename)
                parser.error("Bad input file: %s" % E)
        intvl.process_file(infile)
    intvl.finalize()
    log.info("run.end", status=0)

# Script entry point: process exit status is main()'s return value
# (None is treated by sys.exit as success / status 0).
if __name__ == "__main__":
    sys.exit(main())
