#!/usr/bin/env python
## Copyright (c) 2004, The Regents of the University of California, through
## Lawrence Berkeley National Laboratory (subject to receipt of any required
## approvals from the U.S. Dept. of Energy).  All rights reserved.
"""
Check for duplicate lines in NetLogger logs or, really, any
line-oriented log file.  Default input is stdin.

"""
__author__ = "Dan Gunter dkgunter@lbl.gov"
__rcsid__ = "$Id: nl_date 1230 2008-10-25 04:11:03Z dang $"

import hashlib
import sys
from netlogger import util
from netlogger.nllog import OptionParser, get_logger

# Number of input lines processed so far; incremented by run().
g_num = 0
# Occurrence table: digest of a line -> number of times that line was seen.
g_tbl = dict()
# Hash constructor used to fingerprint each line (called as hasher(line)).
hasher = hashlib.md5

def killHandler(signo, frame):
    """Handle a termination signal: print a partial report and exit.

    Writes a notice to stderr, calls report() to print the counts
    gathered so far, then exits the process with status 1.

    signo, frame -- handler arguments; neither is used here.
    """
    notice = "Exiting on signal. Partial report:\n"
    sys.stderr.write(notice)
    report()
    # Same effect as sys.exit(1), which raises SystemExit(1).
    raise SystemExit(1)

def run(infile, progress, unique_file=None):
    """Count unique and duplicate lines read from infile, then report.

    Each line is fingerprinted with hasher() and the digest is used as
    the key in the module-level table g_tbl, so the lines themselves are
    not kept in memory.  Two lines with the same digest are counted as
    duplicates of each other.

    infile      -- object with a readline() method; read until it
                   returns an empty value.
    progress    -- if true, write a running line count to stdout before
                   every 1000th line is counted (starting at 0).
    unique_file -- optional object with a write() method; the first
                   occurrence of every distinct line is written to it.

    Updates the globals g_tbl and g_num, and calls report() once the
    input is exhausted.
    """
    global g_num
    while True:
        line = infile.readline()
        if not line:
            # readline() returns an empty string only at end of input.
            break
        key = hasher(line).digest()
        # One lookup replaces the deprecated has_key() test + index.
        seen = g_tbl.get(key, 0)
        g_tbl[key] = seen + 1
        if seen == 0 and unique_file is not None:
            # First time this line has appeared.
            unique_file.write(line)
        if progress and g_num % 1000 == 0:
            # '\r' returns to column 0 so the next update overwrites
            # this one in place.
            sys.stdout.write("%-6d lines\r" % g_num)
            sys.stdout.flush()
        g_num += 1
    report()

def report():
    """Write the summary line to stdout.

    The line gives the number of distinct lines (entries in g_tbl), the
    total number of lines counted (g_num), and their difference, which
    is the number of duplicates.
    """
    n_unique = len(g_tbl)
    n_duplicates = g_num - n_unique
    summary = "%d unique lines out of %d (%d duplicates)" % (
        n_unique, g_num, n_duplicates)
    sys.stdout.write(summary + "\n")

def main():
    """Parse the command line, open the files, and run the duplicate check.

    Command line: an optional input file name (stdin is used when it is
    omitted), -g to show progress, and -o FILE to write every distinct
    line to FILE.

    Returns -1 if a file cannot be opened (after logging the error),
    0 otherwise.  Files opened here are closed before returning, also
    when the run is cut short by an exception such as SystemExit.
    """
    # The module docstring, collapsed to one line, is the help text.
    desc = ' '.join(__doc__.split())
    parser = OptionParser(usage="%prog [file]", description=desc)
    parser.add_option('-g',  dest='progress', action='store_true',
                      default=False,
                      help="Show a progress bar")
    parser.add_option('-o', dest="unique", action="store",
                      metavar="FILE", default=None,
                      help="Write unique lines to FILE (default=no)")
    (options, args) = parser.parse_args()
    log = get_logger(__file__)  # Should be first done, just after parsing args

    infile = sys.stdin
    unique_file = None
    try:
        # Open files
        try:
            if args:
                infile = open(args[0])
            if options.unique:
                unique_file = open(options.unique, 'w')
        except IOError:
            # sys.exc_info() instead of an "except ..., e" binding, so
            # this parses on both Python 2 and Python 3.
            e = sys.exc_info()[1]
            log.exc("open.error", e, status=-1)
            return -1

        # Set up signal handlers
        # NOTE(review): presumably killHandler ends up installed for each
        # named signal; its SystemExit would then unwind through the
        # 'finally' below -- confirm against netlogger.util.
        util.handleSignals((killHandler, ('SIGTERM', 'SIGINT', 'SIGUSR1',
                                          'SIGUSR2', 'SIGHUP')))

        log.debug("run.start", file=infile.name)
        run(infile, options.progress, unique_file=unique_file)
        log.debug("run.end", status=0)
        return 0
    finally:
        # Close (and thereby flush) what was opened here; stdin is left
        # alone because this function did not open it.
        if unique_file is not None:
            unique_file.close()
        if infile is not sys.stdin:
            infile.close()

# Script entry point: run main() and use its return value as the
# process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
