#!/usr/bin/env python
import sys
import logging
import chardet


from optparse import OptionParser

from boilerpipy import (Extractor, isvalidhtml,
                        compat_urllib_request)

def main(argv=None):
    """Fetch a URL and print the content extracted from it as UTF-8.

    Command-line options:
        -u/--url URL  page to fetch; when it is missing the help text is
                      printed and the process exits with status 1.
        -d            hand ``logging.DEBUG`` (instead of ``logging.INFO``)
                      to ``Extractor`` as its ``loglevel``.

    Positional arguments are accepted by the parser but are not used.

    Args:
        argv: optional list of command-line arguments, without the program
            name.  ``None`` (the default) lets optparse read
            ``sys.argv[1:]``, which is what the script entry point relies on.

    Exits with status 255 when ``isvalidhtml`` rejects the URL.  Errors
    raised while reading, extracting or writing the result are reported on
    stdout instead of being propagated; the opened URL is always closed.
    """
    parser = OptionParser(usage="%prog: [options] [file]")
    parser.add_option('-u', '--url', help="use URL instead of a local file")
    parser.add_option('-d', help="enable debug", action="store_true",
                      default=False, dest="debug")
    options, _unused_args = parser.parse_args(argv)

    if not options.url:
        parser.print_help()
        sys.exit(1)

    loglevel = logging.DEBUG if options.debug else logging.INFO

    if not isvalidhtml(options.url):
        # Single-argument print(...) behaves the same as the Python 2 print
        # statement and is also valid Python 3 syntax.
        print("Unrecognized URL, please provide a content-type of text/html")
        sys.exit(255)

    url = compat_urllib_request.urlopen(options.url)
    try:
        content = url.read()
        try:
            enc = chardet.detect(content)['encoding']
            # chardet may report no encoding at all; keep the raw payload.
            if enc:
                content = content.decode(enc)
        except (LookupError, TypeError, ValueError):
            # Decoding is best effort: on an unknown codec or undecodable
            # bytes the undecoded content is handed to the extractor.
            # (UnicodeDecodeError is a ValueError, KeyError a LookupError.)
            pass

        out = Extractor(content, loglevel=loglevel).extracted()
        if out is None:
            # A bare ``raise`` here had no exception to re-raise; raise an
            # explicit error so the report below says what went wrong.
            raise ValueError("no content could be extracted")

        encoded = out.encode('utf-8', 'ignore')
        # Python 3 text streams reject bytes, so write to the underlying
        # binary buffer when there is one; Python 2 stdout takes bytes as is.
        stream = getattr(sys.stdout, 'buffer', sys.stdout)
        stream.write(encoded + b'\n')

    except Exception as err:
        print("Error in printing the extracted html () %s" % err)

    finally:
        url.close()

# Run the command-line interface only when executed as a script,
# not when the module is imported.
if "__main__" == __name__:
    main()
