# -*- test-case-name: pycabulary.tests.test_dicios -*-
# Copyright (c) 2010 Marco Giusti
# See LICENSE.txt for details.

from __future__ import absolute_import

import types
import re
import HTMLParser

from twisted.internet import defer
from twisted.python import log

from . import utils


class DiciosError(Exception):
    """Base class for the Dicios-specific errors defined in this module."""


class InvalidLanguageError(DiciosError):
    """Error signalling a language code missing from ``Dicios.languages``."""


# Matches strings made up solely of whitespace, the empty string included.
# Raw string so that ``\s`` reaches the regex engine untouched.
EMPTY = re.compile(r'^\s*$')


def notEmptyStr(s):
    """Return True if ``s`` holds at least one non-whitespace character.

    The empty string is reported as empty (False), like any string made
    only of whitespace.
    """
    return not EMPTY.match(s)


class DiciosHTMLParser(HTMLParser.HTMLParser):
    """Extract (word, translation) pairs from a Dicios result page.

    Tag events are dispatched to ``start_<tag>`` / ``end_<tag>`` methods.
    Only the text found inside ``<a>`` elements placed in a ``colspan="2"``
    cell of the ``PGtbltrad`` table is collected; rows whose id is
    ``PGgRbrand`` are skipped. Collected texts are paired in document
    order: the first one is kept as the word, the following one as its
    translation.
    """

    # Exception type that users of this parser treat as a parse failure.
    error = HTMLParser.HTMLParseError

    def reset(self):
        """Clear the parser state and every collected result."""
        HTMLParser.HTMLParser.reset(self)
        self.brand = False        # inside the PGgRbrand row
        self.inTable = False      # inside the PGtbltrad table
        self.grabLink = False     # inside a colspan="2" cell worth reading
        self.getWord = False      # inside an <a> whose text must be kept
        self.traslation = False   # next collected text is a translation
        self.word = None          # last word seen, waiting for its translation
        self.terms = []
        self.encoding = 'utf8'

    def handle_starttag(self, tag, attrs):
        """Dispatch an opening tag to ``start_<tag>`` if it is defined."""
        handler = getattr(self, 'start_' + tag, self.defaultStart)
        handler(dict(attrs))

    def handle_endtag(self, tag):
        """Dispatch a closing tag to ``end_<tag>`` if it is defined."""
        handler = getattr(self, 'end_' + tag, self.defaultEnd)
        handler()

    def defaultStart(self, attrs):
        """Ignore opening tags without a dedicated handler."""
        pass

    def defaultEnd(self):
        """Ignore closing tags without a dedicated handler."""
        pass

    def start_table(self, attrs):
        if attrs.get('id') == 'PGtbltrad':
            self.inTable = True

    def end_table(self):
        # NOTE: any </table> ends the capture, so a table nested inside
        # PGtbltrad would stop it early; nesting depth is not tracked.
        self.inTable = False

    def start_tr(self, attrs):
        if attrs.get('id') == 'PGgRbrand':
            self.brand = True

    def end_tr(self):
        self.brand = False

    def start_td(self, attrs):
        if self.inTable and not self.brand and attrs.get('colspan') == '2':
            self.grabLink = True

    def end_td(self):
        self.grabLink = False

    def start_a(self, attrs):
        if self.grabLink:
            self.getWord = True

    def end_a(self):
        self.getWord = False

    def start_meta(self, attrs):
        """Pick the charset out of a ``Content-Type`` meta declaration."""
        # Attribute values are None for valueless attributes; the value of
        # http-equiv is matched case-insensitively ("Content-Type" is the
        # usual spelling).
        httpEquiv = attrs.get('http-equiv') or ''
        content = attrs.get('content') or ''
        if httpEquiv.lower() != 'content-type':
            return
        # Parameters are separated by ';' with or without a following
        # space, so split on both.
        for token in content.replace(';', ' ').split():
            if token.lower().startswith('charset='):
                self.encoding = token[len('charset='):]

    def handle_data(self, data):
        """Store link texts, alternating between word and translation."""
        if not self.getWord:
            return
        if self.traslation:
            self.terms.append((self.word, data))
            self.traslation = False
        else:
            self.word = data
            self.traslation = True

    def getTerms(self):
        """Return the list of (word, translation) tuples collected so far."""
        return self.terms

# Default parser factory used by Dicios: the stdlib-based parser. The name is
# rebound to LxmlParser below when lxml can be imported.
Parser = DiciosHTMLParser


try:
    from lxml import html

    class LxmlParser(object):
        """lxml-based extractor of (word, translation) pairs.

        Offers the same ``reset`` / ``feed`` / ``getTerms`` methods and the
        same ``error`` / ``encoding`` / ``terms`` attributes that ``Dicios``
        relies on.
        """

        # Any exception raised while feeding is treated as a parse failure.
        error = Exception

        def __init__(self):
            # Start from a clean state so the instance is usable even if
            # the caller never invokes reset() explicitly.
            self.reset()

        def reset(self):
            """Forget the detected encoding and every collected term."""
            self.encoding = 'utf8'
            self.terms = []

        def feed(self, data):
            """Parse a whole page and append the pairs found to ``terms``."""
            tree = html.fromstring(data)

            # Only the first Content-Type declaration is considered; the
            # value of http-equiv is compared case-insensitively.
            for meta in tree.xpath('/html/head/meta[@http-equiv and @content]'):
                if meta.attrib['http-equiv'].lower() != 'content-type':
                    continue
                for attr in meta.attrib['content'].split():
                    attr = attr.strip(' ;')
                    if attr.startswith('charset='):
                        self.encoding = attr[8:]
                break

            for tr in tree.xpath('//table[@id="PGtbltrad"]//tr[@title]'):
                if tr.attrib.get('id') == 'PGgRbrand':
                    continue
                texts = [text for text in tr.xpath('./td//text()')
                         if notEmptyStr(text)]
                # A usable row holds exactly a term and its translation;
                # anything else is skipped.
                if len(texts) == 2:
                    self.terms.append((texts[0], texts[1]))

        def getTerms(self):
            """Return the list of (word, translation) tuples collected."""
            return self.terms


    Parser = LxmlParser
except ImportError:
    # lxml is optional: keep whatever Parser was bound to before.
    pass


class Dicios(object):
    """Look words up on the Dicios online dictionary.

    Pages are fetched with ``client.getPage`` and handed to the parser
    built by ``parserFactory``, which must provide ``reset()``,
    ``feed(page)``, ``getTerms()`` and an ``error`` exception type.
    """

    url = 'http://it.dicios.com'
    # Supported language codes and their display names.
    languages = {'it': 'Italiano',
                 'en': 'English',
                 'de': 'Deutsch',
                 'fr': 'Francais'}

    def __init__(self, client, parserFactory=Parser):
        self.parser = parserFactory()
        self.client = client

    def translate(self, word, frm, to, li=None, timeout=30):
        """Fetch the translations of ``word`` from ``frm`` to ``to``.

        ``word`` must be a unicode string; accents are stripped and the
        remaining non-ASCII characters are dropped before building the URL.
        ``li`` is an optional list the (word, translation) tuples are
        prepended to; a new list is used when it is omitted.

        Return a Deferred firing with ``li``. It fails with
        ``InvalidLanguageError`` when either language code is not a key of
        ``languages``.
        """
        assert isinstance(word, types.UnicodeType), \
            'word param must be unicode'
        if frm not in self.languages or to not in self.languages:
            # Fail with an exception *instance*: given the class itself the
            # Failure would not be recognised as an InvalidLanguageError
            # by trap() / check().
            return defer.fail(InvalidLanguageError(
                'unsupported language pair: %r -> %r' % (frm, to)))

        word = utils.stripAccents(word).encode('ascii', 'ignore')
        if li is None:
            li = []
        # NOTE(review): word is interpolated without URL quoting; words
        # holding spaces or URL-reserved characters yield a malformed URL.
        url = '%s/%s/%s' % (self.url, frm+to, word)
        d = self.client.getPage(url, timeout=timeout)
        d.addCallback(self._parse, li)
        d.addErrback(self._handleParseError, li)
        d.addCallback(self._extends)
        return d

    def _parse(self, page, li):
        """Feed ``page`` to the parser and pass ``li`` down the chain."""
        # Reset here rather than in translate(): the parser is shared, so
        # with overlapping requests a reset done at call time would let the
        # terms of one page pile up on top of another's.
        self.parser.reset()
        self.parser.feed(page)
        return li

    def _extends(self, li):
        """Prepend the parsed terms to ``li``, keeping their order."""
        li[:0] = [(w, tr) for w, tr in self.parser.getTerms()]
        return li

    def _handleParseError(self, failure, li):
        """Log the failure; swallow it only if it is a parser error.

        Failures of any other type are re-raised by ``trap`` and keep
        travelling down the errback chain.
        """
        log.err(failure)
        failure.trap(self.parser.error)
        return li
