#!/usr/bin/env python3

# Copyright 2013-4, Sean B. Palmer
# Source: http://inamidst.com/saxo/

import html.entities
import os
import re
import saxo

# Whole-string match for one http:// or https:// URL, optionally followed by
# a single trailing "," or "." that is kept out of group 1. The URL part is
# matched lazily: its character class only excludes <, >, space, double
# quote and \x01, so a greedy match would swallow the trailing punctuation
# and the optional [,.] would never get to consume it.
regex_link = re.compile(r"^(http[s]?://[^<> \"\x01]+?)[,.]?$")
# Contents (group 1) of the first <title> element. Case-insensitive, and "."
# also matches newlines so a title may span several lines.
regex_title = re.compile(r"(?ims)<title[^>]*>(.*?)</title>")
# A complete <script ...>...</script> element, matched non-greedily.
regex_script = re.compile(r"(?ims)<script(.*?)</script>")
# Any single <...> run, i.e. an opening or closing tag.
regex_tag = re.compile(r"<[^>]+>")
# A character reference such as &amp; or &#x41;. Group 1 is the text between
# the "&" and the ";", which may not contain whitespace.
regex_entity = re.compile(r"&([^;\s]+);")

def longest(input, sep):
    """Return the longest piece of *input* after splitting it on *sep*.

    When several pieces share the greatest length, the earliest one is
    returned. If *sep* does not occur, *input* itself comes back unchanged.
    """
    # str.split always yields at least one piece, so max() never receives an
    # empty sequence, and max() keeps the first of equally long candidates.
    return max(input.split(sep), key=len)

# Substrings that make title() give up before fetching anything: it returns
# nothing for any URL that contains one of these.
blacklist = (
    "swhack.com",
    "translate.google.com",
    "tumbolia.appspot.com",
    "wikia.com",
    "wikipedia.org"
)

def decode_entities(hypertext):
    """Replace HTML character references in *hypertext* with their characters.

    Hexadecimal (``&#x41;``) and decimal (``&#65;``) numeric references are
    decoded, as are named entities listed in
    ``html.entities.name2codepoint``. A reference that cannot be decoded --
    an unknown name, or a numeric reference that is malformed or that
    ``chr()`` rejects -- is replaced by its lower-cased name in square
    brackets, e.g. ``&bogus;`` becomes ``[bogus]``.
    """
    def entity(match):
        raw = match.group(1)
        name = raw.lower()

        if name.startswith("#"):
            # "#x..." is hexadecimal, a plain "#..." is decimal.
            if name.startswith("#x"):
                digits, base = name[2:], 16
            else:
                digits, base = name[1:], 10
            try:
                return chr(int(digits, base))
            except (ValueError, OverflowError):
                # Bad digits or a code point chr() refuses. The markup comes
                # from arbitrary pages, so fall back to the bracketed form
                # instead of letting the exception escape from re.sub.
                return "[" + name + "]"

        # Entity names are case-sensitive ("Eacute" and "eacute" are
        # different characters), so try the exact spelling first and only
        # fall back to lower case for markup such as "&AMP;".
        for candidate in (raw, name):
            if candidate in html.entities.name2codepoint:
                return chr(html.entities.name2codepoint[candidate])
        return "[" + name + "]"

    return regex_entity.sub(entity, hypertext)

@saxo.pipe
def title(url):
    """Return a short, cleaned-up title for the web page at *url*.

    If *url* is empty, the SAXO_URL environment variable is used instead;
    if that is empty too, an apology string is returned. A URL without an
    http:// or https:// prefix gets "http://" prepended, and any fragment
    ("#...") is dropped.

    Returns:
        - None when the URL contains a blacklisted substring.
        - For nytimes.com URLs, the last path segment with a trailing
          ".html" part removed; the page is not fetched.
        - Otherwise the contents of the page's <title> element with tags
          stripped, entities decoded, line breaks removed, site-name
          prefixes/suffixes trimmed and double quotes turned into single
          quotes.
        - "No title found" when the page has no <title> element.
    """
    if not url:
        url = os.environ.get("SAXO_URL")
    if not url:
        return "Sorry, no link found to title"

    # Require a real scheme prefix, compared case-insensitively. A bare
    # startswith("http") would treat a scheme-less host such as
    # "httpbin.org" as already complete, and would prepend a second scheme
    # to "HTTP://example.com".
    if not url.lower().startswith(("http://", "https://")):
        url = "http://" + url

    # Drop the fragment; split() leaves the URL untouched when there is none.
    url = url.split("#", 1)[0]

    if any(blacklisted in url for blacklisted in blacklist):
        return

    if "nytimes.com" in url:
        # Answer from the URL itself rather than requesting the page.
        return url.rsplit(".html", 1)[0].rsplit("/").pop()

    # NOTE(review): limit=262144 is presumably a cap on how much of the
    # response is read, and the result is assumed to carry an "html" key --
    # confirm against saxo.request.
    page = saxo.request(url, limit=262144, follow=True)
    # Script elements are removed first, so title markup inside a script
    # cannot be picked up by the search.
    text = regex_script.sub("", page["html"])
    search = regex_title.search(text)
    if not search:
        return "No title found"

    heading = regex_tag.sub("", search.group(1))
    heading = decode_entities(heading)
    heading = heading.replace("\r", "").replace("\n", "")

    # Titles often look like "Site | Article" or "Article : Site"; keep the
    # longest piece for each separator in turn.
    for separator in (" : ", " | ", "| ", " — "):
        heading = longest(heading, separator)

    if "youtube.com" not in url:
        heading = longest(heading, " - ")
    elif heading.endswith(" - YouTube"):
        # For YouTube only the site suffix is cut off; the title is not
        # split on " - ".
        heading = heading[:-len(" - YouTube")]

    return heading.replace('"', "'").strip()
