#! /usr/bin/env python2


# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


"""
HTML5Tidy
=========

Simple wrapper around html5lib & lxml.etree to "tidy" html in the wild to
well-formed xml/html


Usage
-----

    html5tidy [-h] [-f] [--inputencoding INPUTENCODING] [--inputnometa]
              [--inputnochardet] [-m METHOD] [--prettyprint]
              [--outputencoding OUTPUTENCODING] [--xmldeclaration]
              [source]

Copyright
---------

- 2011, 2012 [The active archives contributors](http://activearchives.org/)
- 2011, 2012 Michael Murtaugh

All rights reserved.

This software is released under the GPL3 license. See gpl-3.0.txt for details.
"""


if __name__ == "__main__":
    # Command-line entry point: read HTML from a path, a URL or stdin, pass it
    # through html5tidy.tidy() and write the serialized result to stdout.
    import argparse
    import urllib2
    import sys
    import urlparse
    from html5tidy import tidy

    argparser = argparse.ArgumentParser(description='Tidy HTML input using html5lib (input) + lxml (output)')

    # input
    argparser.add_argument('source', nargs="?", default=None, help='path or URL to read, stdin if not given')
    argparser.add_argument('-f', '--fragment', action='store_true', help='parse as fragment (no header etc added)')
    argparser.add_argument('--inputencoding', default=None, help='force INPUTENCODING on input, default is meta/auto detection')
    argparser.add_argument('--inputnometa', dest="meta", action='store_false', help='do not parse the meta tag on input')
    argparser.add_argument('--inputnochardet', dest="chardet", action='store_false', help='do not attempt to auto detect character set on input')

    # output
    argparser.add_argument('-m', '--method', default='xml', help="'xml', 'html', plain 'text' (text content without tags) or 'c14n'. Default is 'xml'.")
    argparser.add_argument('--prettyprint', action='store_true', help="enables (minimally) formatted XML")
    argparser.add_argument('--outputencoding', default="utf-8", help="output encoding. Use ascii to force character entity escaping of non-ascii. Default is utf-8.")
    argparser.add_argument('--xmldeclaration', action='store_true', help="force xml declaration, (default is only if not utf-8/ascii)")

    args = argparser.parse_args()

    # Pass None unless the declaration is explicitly forced, so that tidy()
    # applies its own default (see the --xmldeclaration help text).
    xml_declaration = True if args.xmldeclaration else None

    if not args.source:
        f = sys.stdin
    elif len(urlparse.urlparse(args.source).scheme) > 1:
        # A one-letter "scheme" is a Windows drive letter (e.g. C:\page.html),
        # not a URL, so only longer schemes are fetched over the network.
        f = urllib2.urlopen(args.source)
    else:
        # Binary mode: hand the raw bytes to the parser so that encoding
        # detection is not disturbed by platform newline translation.
        f = open(args.source, "rb")

    try:
        output = tidy(f,
            encoding=args.inputencoding,
            parseMeta=args.meta,
            useChardet=args.chardet,
            method=args.method,
            pretty_print=args.prettyprint,
            xml_declaration=xml_declaration,
            output_encoding=args.outputencoding
        )
    finally:
        # Release the file / HTTP response we opened; stdin is not ours to close.
        if f is not sys.stdin:
            f.close()

    sys.stdout.write(output)
