# -*- coding: utf-8 -*-
# Copyright (C) 2010  Michał Masłowski  <mtjm@mtjm.eu>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


"""
Functions for XML and HTML parsing.
"""


from __future__ import unicode_literals

import lxml.etree as etree
from lxml.html import HTMLParser, parse
from urlreader.document import Document

from getmediumurl.compat import StringIO


__all__ = ("read_xml", "get_encoding", "read_html", "unescape")


def read_xml(string):
    """Parse the XML document `string` and return an ``ElementTree``.

    Leading and trailing whitespace is removed from `string` before
    parsing.  The text is wrapped in the file-like ``StringIO`` class
    from `getmediumurl.compat`; if that class rejects it with a
    `TypeError`, the text is encoded to UTF-8 bytes and wrapped again.
    """
    # Some sites emit additional whitespace before the XML declaration,
    # so drop it before handing the document to the parser.
    document = string.strip()
    try:
        source = StringIO(document)
    except TypeError:
        # NOTE(review): presumably the compat ``StringIO`` is bytes-only
        # on Python 3 -- confirm against getmediumurl.compat.
        source = StringIO(bytes(document, "utf-8"))
    return etree.parse(source)


def get_encoding(content_type):
    """Get ``charset`` of MIME ``Content-Type`` header or `None`.

    `content_type` is the header value, either as text or as a byte
    string (byte strings are decoded as Latin-1).  `None` is accepted
    and yields `None`.

    The ``charset`` parameter name is matched case-insensitively and
    quotes surrounding the value are removed, so
    ``text/html; Charset="UTF-8"`` yields ``UTF-8``.  The value itself
    is returned with its original letter case.  An empty value is
    treated as if the parameter were absent.
    """
    if content_type is None:
        return None
    try:
        content_type = content_type.decode("latin1")
    except AttributeError:
        # No ``decode`` method: the value is already a text string.
        pass
    # Split on whitespace first and then on ";" so that both
    # "text/html; charset=x" and "text/html;charset=x" are handled.
    for chunk in content_type.split():
        for parameter in chunk.split(";"):
            name, separator, value = parameter.partition("=")
            if not separator or name.lower() != "charset":
                continue
            # Parameter values may be sent as quoted strings; the
            # quotes are not part of the encoding name.
            value = value.strip("\"'")
            if value:
                return value
    return None


def read_html(string, encoding=None):
    """Parse the HTML document `string` into an ``ElementTree`` object.

    `string` is either the document text or an instance of
    `urlreader.document.Document`.  For a `Document`, the content is
    taken from its ``content`` attribute and the encoding from the
    ``charset`` of its ``content_type``.

    `encoding`, when given, is passed to the HTML parser.  With a
    `Document` it must equal the encoding found in the content type,
    otherwise `ValueError` is raised.
    """
    if isinstance(string, Document):
        declared = get_encoding(string.content_type)
        mismatch = encoding is not None and encoding != declared
        if mismatch:
            raise ValueError("specified encoding of %s "
                             "but Document says %s"
                             % (encoding, declared))
        # From here on work with the raw content and the encoding
        # declared by the document itself.
        string, encoding = string.content, declared
    try:
        stream = StringIO(string)
    except TypeError:
        # The file-like class rejected the text; retry with UTF-8 bytes.
        stream = StringIO(bytes(string, "utf-8"))
    return parse(stream, parser=HTMLParser(encoding=encoding))


def unescape(string):
    """Return `string` with HTML entities replaced by their characters.

    Only ``&quot;`` is handled; every occurrence becomes a double
    quote.  All other entities are left untouched.
    """
    entity, character = "&quot;", '"'
    return string.replace(entity, character)
