﻿# coding: utf-8
#
# Copyright (c) 2012-2013, Niklas Rosenstein
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in
#    the documentation and/or other materials provided with the
#    distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and
# documentation are those of the authors and should not be interpreted
# as representing official policies,  either expressed or implied, of
# the FreeBSD Project.

from scan.scanner import Scanner

import abc
import string
import itertools

class LexError(Exception):
    r"""
    Root of the exception hierarchy of this module. Catching this class
    handles every error type that derives from it.
    """

class TokenizationError(LexError):
    r"""
    Signals that the lexer failed to produce a token.

    The *message* passed at construction-time is a ``%``-format string
    which is interpolated with *cursor* to build the exception text.

    .. attribute:: cursor
        The *cursor* object passed at construction-time.
    """

    def __init__(self, message, cursor):
        formatted = message % cursor
        super(TokenizationError, self).__init__(formatted)
        self.cursor = cursor

class UnexpectedTokenError(LexError):
    r"""
    Convenience exception raised when an unexpected token appears while
    parsing. The error message is generated lazily from the parameters
    passed at construction-time.

    .. attribute:: token
        The offending token. May be an invalid token or None when a
        token was required but none was available.

    .. attribute:: expected
        A sequence of the token-types (instances or classes) that would
        have been accepted.
    """

    def __init__(self, token, expected):
        super(UnexpectedTokenError, self).__init__(token, expected)
        self.token = token
        self.expected = expected

    @staticmethod
    def _describe(tokentype):
        r"""
        Return a human readable name for an expected token-type, which
        may be a registered instance, an unnamed instance or a class.
        """
        name = getattr(tokentype, 'name', None)
        if name:
            return name
        # A class has no registered name, show its own name rather than
        # the name of its metaclass.
        if isinstance(tokentype, type):
            return tokentype.__name__
        return tokentype.__class__.__name__

    def __str__(self):
        token = self.token
        names = [self._describe(t) for t in self.expected]

        if not names:
            return 'Expected no token at all, found %s.' % (token,)

        if len(names) == 1:
            appendum = 'type is "%s".' % names[0]
        else:
            appendum = 'types are [%s].' % ', '.join(names)

        # The appendum is passed as a format argument instead of being
        # concatenated into the template, a '%' in a type name would
        # otherwise corrupt the format string.
        if token and not getattr(token, 'invalid', False):
            return 'Unexpected token %s, expected %s' % (token, appendum)

        # The token is missing or invalid. It does not necessarily carry a
        # cursor (or is None), so the lookup must not fail while the
        # error message is being built.
        cursor = getattr(token, 'cursor', None)
        return 'Require token at %s, expected %s' % (cursor, appendum)

class TokenType(object):
    r"""
    Abstract base for everything that can recognise a token. A subclass
    implements :meth:`parse_token`, which reads from the scanner and
    returns the matching :class:`Token` instance.

    .. attribute:: name
        The name under which the token-type was registered to a
        :class:`TokenSet`. It is None until :meth:`~TokenSet.add`
        assigns it.

    .. attribute:: skip
        When True, the :class:`Lexer` does not hand out tokens of this
        type but continues with reading the next token.
    """

    __metaclass__ = abc.ABCMeta

    # Filled in by TokenSet.add().
    name = None

    # Whether tokens of this type are skipped by the lexer.
    skip = False

    @abc.abstractmethod
    def parse_token(self, lexer, scanner):
        r"""
        Try to read a token of this type from *scanner*. The lexer treats
        a false return value (such as None) as "no match".
        """
        return None

    def __eq__(self, other):
        # A token-type equals its own class as well as every instance of
        # exactly that class.
        own_class = self.__class__
        return bool(own_class == other) or own_class is other.__class__

    def __ne__(self, other):
        is_equal = self == other
        return not is_equal

    def token(self, value, length):
        r"""
        Shortcut for creating a :class:`Token` of this type from *value*
        and *length*.
        """
        return Token(self, value, length)

    def tostr(self, token):
        r"""
        Build the short string representation of *token*, which must be
        a :class:`Token` of this token-type.
        """
        position = token.cursor
        label = self.name or '<unnamed>'
        return '<%r at line: %d, col: %d, type: %s>' % (
            token.value, position.line, position.column, label)

    def torepr(self, token):
        r"""
        Build the verbose string representation of *token*, which must
        be a :class:`Token` of this token-type.
        """
        label = self.name or '<unnamed>'
        parts = (self.__class__.__name__, token.__class__.__name__, label,
                 token.cursor, token.value)
        return '<%s(%s) %r at %s: %r>' % parts

    def compare(self, t1, t2):
        r"""
        Return True when the tokens *t1* and *t2* carry equal values.
        """
        left, right = t1.value, t2.value
        return left == right

class Token(object):
    r"""
    This class represents a token from a string. A token's type is an
    instance of the :class:`TokenType` class. A token's value is stored in
    the :attr:`value` attribute.

    .. attribute:: value
        The token's value. Can be of any type.

    .. attribute:: type
        The token's type. An instance of :class:`TokenType`.

    .. attribute:: length
        The token's value's length. Required, so that the lexer can choose
        what token of the generated tokens should be used (the longest
        token is used). This attribute must explicitly be set since the
        token-value *might* be different from the value read in, depending
        on the tokentype implementation. This is *not* encouraged, however!

    .. attribute:: cursor
        The position the token was read from. None until it is assigned
        by the lexer.

    .. attribute:: invalid
        When this is set to True, the token is considered invalid and
        ``bool(token)`` is False. An invalid token is created and assigned
        to the lexer at the end of the input stream.
    """

    # Class-level defaults. They make sure that tokens created via
    # invalid_token(), which bypasses __init__(), still expose every
    # attribute.
    value = None
    type = None
    length = None
    cursor = None
    invalid = False

    @classmethod
    def invalid_token(cls):
        r"""
        Returns an invalid :class:`Token`. The instance is created without
        calling ``__init__()``, its attributes keep the class-level
        defaults.
        """
        instance = cls.__new__(cls)
        instance.invalid = True
        return instance

    def __init__(self, type, value, length):
        super(Token, self).__init__()
        assert isinstance(type, TokenType)
        self.type = type
        self.value = value
        self.length = length
        self.cursor = None

    def __str__(self):
        if self.invalid:
            return '<Invalid Token>'
        return self.type.tostr(self)

    def __repr__(self):
        # Must be free of side effects, repr() is called implicitly by
        # debuggers, loggers and string formatting.
        if self.invalid:
            return '<Invalid Token>'
        return self.type.torepr(self)

    def __eq__(self, other):
        r"""
        Two tokens are equal when both are invalid, or when both are
        valid, of the same type and the type's
        :meth:`~TokenType.compare` considers them equal. Comparing with
        an object that is not a :class:`Token` is never equal.
        """
        if not isinstance(other, Token):
            return NotImplemented
        # Invalid tokens have no type that could carry out the comparison.
        if self.invalid or other.invalid:
            return self.invalid and other.invalid
        if not self.sametype(other):
            return False
        return self.type.compare(self, other)

    def __ne__(self, other):
        return not self == other

    def __nonzero__(self):
        return not self.invalid

    # Python 3 spelling of the truth-value hook.
    __bool__ = __nonzero__

    def sametype(self, other):
        r"""
        Returns True when the type of the :class:`Token` *other* is equal
        to the type of *self*.

        :raise TypeError: When *other* is not a :class:`Token`.
        """
        if not isinstance(other, Token):
            raise TypeError('expected scan.lexer.Token instance.')
        return self.type == other.type

class _TokenTypeLookupError(AttributeError, KeyError):
    r"""
    Raised by :meth:`TokenSet.__getattr__` for unknown names. It is an
    :class:`AttributeError` so that ``hasattr()`` and three-argument
    ``getattr()`` behave as usual, and still a :class:`KeyError` for
    callers that catch the exception type raised historically.
    """

class TokenSet(object):
    r"""
    A set of :class:`TokenType` instances, can be exchanged quickly
    for a lexer. Registered token-types are accessible as attributes
    by the name they were added with.

    .. attribute:: tokentypes
        Dictionary mapping the registered name to a tuple of
        ``(priority, tokentype)``.
    """

    def __init__(self):
        super(TokenSet, self).__init__()
        self.tokentypes = {}
        self.on_init()

    def __getattr__(self, name):
        r"""
        Return the token-type registered with *name*.

        :raise KeyError: (which is also an :class:`AttributeError`) when
                no token-type is registered with *name*.
        """
        # Only called when the normal lookup failed. The registry is read
        # from the instance dictionary directly, because accessing
        # ``self.tokentypes`` while it is not set yet would end up in
        # this method again and recurse infinitely.
        registry = self.__dict__.get('tokentypes')
        if registry is None or name not in registry:
            raise _TokenTypeLookupError(
                'No tokentype registered with name %s.' % name)
        return registry[name][1]

    def on_init(self):
        r"""
        Hook called at the end of the constructor. Does nothing by
        default, intended to be overridden for registering token-types.
        """
        pass

    def on_attach(self, lexer):
        r"""
        Hook called with the :class:`Lexer` the set was assigned to.
        Does nothing by default.
        """
        pass

    def add(self, name, priority, tokentype):
        r"""
        Register the :class:`TokenType` instance *tokentype* under *name*
        with the specified *priority* and assign *name* to the
        :attr:`~TokenType.name` attribute of the token-type.

        :raise KeyError: When *name* is already registered.
        """
        assert isinstance(tokentype, TokenType)
        if name in self.tokentypes:
            raise KeyError('Tokentype with name %s already registered.' % name)
        self.tokentypes[name] = (priority, tokentype)
        tokentype.name = name

    def iter_tokentypes(self):
        r"""
        Returns the registered token-types as an iterable of tuples with
        two elements each. The first is the priority and the second is
        the tokentype passed to :meth:`add`.
        """
        return self.tokentypes.values()

class Lexer(object):
    r"""
    This class implements splitting the input of a scanner into a stream
    of tokens. The token-types that can be recognised are provided by a
    :class:`TokenSet`. The most recently read token is stored in the
    :attr:`token` attribute.

    .. attribute:: skip_characters
        A sequence of characters that are intended to be simply skipped
        by the lexer. Defaults to :attr:`string.whitespace`.

    .. attribute:: tokentype_prefix
        The prefix for the tokentypes of the registered tokenset. The
        tokentypes may be accessed via ``<tokentype_prefix><tokentype_name>``.
        This attribute defaults to `'t_'`.

        .. code-block:: python

            lexer.tokenset.add('kw', 100, Keyword('foobar', ignore_case=True))
            token = lexer.read_token()
            if token.type == lexer.t_kw:
                print("foobar keyword reached!")

    .. attribute:: token
        The current token. An invalid token until the first token was
        read and again when the end of the input was reached.
    """

    @classmethod
    def from_string(cls, string, tokenset=None):
        r"""
        Create a lexer reading from *string*. The first character is
        read into the scanner before the lexer is constructed.
        """
        scanner = Scanner.from_string(string)
        scanner.read()
        return cls(scanner, tokenset)

    @classmethod
    def from_filename(cls, filename, tokenset=None):
        r"""
        Create a lexer reading from the file at *filename*. The first
        character is read into the scanner before the lexer is
        constructed.
        """
        scanner = Scanner.from_filename(filename)
        scanner.read()
        return cls(scanner, tokenset)

    @classmethod
    def from_file(cls, file_object, tokenset=None):
        r"""
        Create a lexer reading from *file_object*. The first character is
        read into the scanner before the lexer is constructed.
        """
        scanner = Scanner(file_object)
        scanner.read()
        return cls(scanner, tokenset)


    skip_characters = string.whitespace
    tokentype_prefix = 't_'

    def __init__(self, scanner, tokenset=None):
        super(Lexer, self).__init__()
        self.scanner = scanner
        self.tokenset = tokenset or TokenSet()
        self.token = Token.invalid_token()

    def __getattr__(self, name):
        r"""
        Resolve ``<tokentype_prefix><name>`` to the attribute *name* of
        the tokenset. Errors raised by the tokenset for unknown names
        are propagated.
        """
        prefix = self.tokentype_prefix
        if name.startswith(prefix):
            return getattr(self._tokenset, name[len(prefix):])
        # Raises the regular AttributeError for the unknown name.
        return super(Lexer, self).__getattribute__(name)

    def read_token(self, skip_unknown_characters=False):
        r"""
        Read the next token from the scanner passed on construction and
        stores it in the :attr:`token` attribute. At the end of the input
        an invalid token is stored and returned. Tokens whose type has
        :attr:`~TokenType.skip` set are read over.

        :param skip_unknown_characters: When True, the lexer will skip
                characters it could not generate a token from. The flag
                stays in effect while tokens are skipped.
        :raise TokenizationError:
                - When no token could be generated and
                  ``skip_unknown_characters`` is not True.
                - When two or more of the longest tokens that could be
                  generated have the same priority (the lexer can not
                  decide which one to use!)
        :return: The token that was stored in :attr:`token`.
        """

        # Implemented as a loop rather than by recursion, a long run of
        # unknown characters or skipped tokens would otherwise exceed
        # the recursion limit.
        while True:
            scanner = self.scanner
            if self.skip_characters:
                scanner.skip_set(self.skip_characters)

            cursor = scanner.cursor

            # When there is no character, we assume to be at the end of
            # the input stream.
            if not scanner.char:
                self.token = Token.invalid_token()
                self.token.cursor = cursor
                return self.token

            # Let every tokentype try to read a token from the current
            # position. The scanner is reset after each attempt, the
            # position after the token is remembered for the winner.
            candidates = []
            for priority, tokentype in self._tokenset.iter_tokentypes():
                token = tokentype.parse_token(self, scanner)
                if token:
                    token.cursor = cursor
                    candidates.append((token, priority, scanner.cursor))
                scanner.seek_to(cursor)

            # No tokens, no luck. Throw an exception if
            # skip_unknown_characters is not True. Otherwise, skip the
            # current character and try again.
            if not candidates:
                if not skip_unknown_characters:
                    raise TokenizationError(
                            'No token could be extracted at %s.', cursor)
                scanner.read()
                continue

            # Only the longest tokens are taken into account. Among
            # these, the highest priority wins. If the two highest
            # priorities are equal, it is not defined what token to use.
            max_length = max(item[0].length for item in candidates)
            longest = [item for item in candidates
                       if item[0].length == max_length]
            longest.sort(key=lambda item: item[1], reverse=True)
            if len(longest) >= 2 and longest[0][1] == longest[1][1]:
                raise TokenizationError('Imprecisely defined tokentypes. '
                        'Found more than one token with the same priority '
                        'and length at %s.', cursor)

            token, _, end_cursor = longest[0]
            scanner.seek_to(end_cursor)
            self.token = token

            # Check if this token is actually needed to be skipped.
            if not token.type.skip:
                return token

    def assert_type(self, *types):
        r"""
        Ensure that the current token of the lexer is of one of the
        passed token-types. Raises :class:`UnexpectedTokenError` if
        the current token is not of one of the passed types.
        """

        current = self.token.type
        if not any(current == t for t in types):
            raise UnexpectedTokenError(self.token, types)

    def tokenlist(self, tokens=None, exc_handler=None):
        r"""
        Append the current and all following tokens up to the end of the
        input to *tokens* (a new list if None) and return it. See
        :meth:`tokengen` for the meaning of *exc_handler*.
        """
        if tokens is None:
            tokens = []
        for token in self.tokengen(exc_handler):
            tokens.append(token)
        return tokens

    def tokengen(self, exc_handler=None):
        r"""
        Generator yielding the current token and all following tokens
        until the current token is invalid.

        :param exc_handler: Optional callable invoked with the
                :class:`TokenizationError` raised by :meth:`read_token`.
                Without a handler, the exception is propagated.

        .. note:: The generator continues with the lexer's current token
                  after the handler returned. The handler is responsible
                  for recovering (or raising), the generator itself does
                  not move past the offending input.
        """
        while self.token:
            yield self.token
            try:
                self.read_token()
            except TokenizationError as exc:
                if not exc_handler:
                    raise
                exc_handler(exc)

    @property
    def tokenset(self):
        r"""
        The :class:`TokenSet` used by the lexer. Assigning a new set
        invokes its :meth:`~TokenSet.on_attach` hook with the lexer.

        :raise ValueError: On assigning an object that is not a
                :class:`TokenSet`.
        """
        return self._tokenset

    @tokenset.setter
    def tokenset(self, tokenset):
        if not isinstance(tokenset, TokenSet):
            raise ValueError('expected TokenSet instance.')
        self._tokenset = tokenset
        self._tokenset.on_attach(self)

# ====== Token Types ==========================================================

class CharacterSet(TokenType):
    r"""
    This TokenType class implements reading a token generalized to a set
    of characters. The different tokentypes generated by this class are
    distinguished by their character sequences.

    Each charset but the last one matches exactly one character, in the
    order the charsets were passed. The last charset matches all
    following characters.

    A CharacterSet matching an identifier like a python-variable would be:

    .. code-block::
        letters = string.letters + '_'
        digits = string.digits
        CharacterSet(letters, letters + digits)

    We're passing ``letters`` as the first argument since the first
    character of an identifier may *not* contain digits. All following
    characters however may contain digits.

    .. attribute:: charsets
        Tuple of the charsets passed at construction-time.

    .. attribute:: max_length
        Keyword-only. When greater than zero, character sequences longer
        than this value do not generate a token. Defaults to False
        (no limit).
    """

    def __init__(self, charset, *charsets, **kwargs):
        super(CharacterSet, self).__init__()
        self.charsets = (charset,) + charsets
        self.max_length = kwargs.pop('max_length', False)

        # Everything that is left was not a supported keyword argument.
        for k in kwargs:
            raise TypeError('__init__() got unexpected keyword argument %s' %
                            k)

    def __eq__(self, other):
        r"""
        Two instances are equal when they are of the same class and have
        equal charsets and maximum length.
        """
        if not super(CharacterSet, self).__eq__(other):
            return False
        elif not isinstance(other, CharacterSet):
            return False

        return (self.charsets == other.charsets and
                self.max_length == other.max_length)

    def parse_token(self, lexer, scanner):
        r"""
        Read one character for each leading charset and all following
        characters contained in the last charset. Returns the token
        of the characters read or None if they did not match.
        """
        # The leading charsets are positional. They must be processed in
        # the order they were passed and must not be collapsed, which is
        # why they are not put into a set.
        leading = self.charsets[:-1]
        trailing = set(self.charsets[-1])

        chars = []
        for charset in leading:
            char = scanner.char
            if not char or char not in charset:
                return None
            chars.append(char)
            scanner.read()

        value = ''.join(chars) + scanner.read_set(trailing)
        if not value:
            return None

        limit = self.max_length
        if limit and limit > 0 and len(value) > limit:
            return None
        return self.token(value, len(value))

class Keyword(TokenType):
    r"""
    This TokenType represents a keyword. Make sure to register this
    tokentype with a higher priority than other tokentypes that *may*
    also match the keyword (like a :class:`CharacterSet`).

    .. code-block::
        set_ = TokenSet()
        set_.add('identifier', 0, CharacterSet(string.letters))
        set_.add('kw_python', 100, Keyword('python', ignore_case=True))

    In the code above, if the :class:`Keyword` would not be registered
    with a higher priority than the :class:`CharacterSet`, it is
    ambiguous, since both tokentypes can tokenize the word ``"python"``. If
    they would have the same priority, the :meth:`Lexer.read_token` would
    raise a :class:`TokenizationError`.

    .. attribute:: keyword
        The keyword to match.

    .. attribute:: ignore_case
        When True, the keyword is matched case-insensitive.
    """

    def __init__(self, keyword, ignore_case=False):
        super(Keyword, self).__init__()
        self.keyword = keyword
        self.ignore_case = ignore_case

    def __eq__(self, other):
        r"""
        Two instances are equal when they are of the same class and
        have the same keyword. :attr:`ignore_case` is not taken into
        account.
        """
        if not super(Keyword, self).__eq__(other):
            return False
        elif not isinstance(other, Keyword):
            return False

        return self.keyword == other.keyword

    def parse_token(self, lexer, scanner):
        r"""
        Match the keyword via ``scanner.match()`` and return a token of
        the matched word, or None when the scanner reported no match.
        """
        keyword = self.keyword
        if self.ignore_case:
            keyword = keyword.lower()
            # Call lower() on the character itself. The unbound
            # ``str.lower`` does not accept characters of another string
            # type (eg. ``unicode``).
            charprocessor = lambda char: char.lower()
        else:
            charprocessor = lambda char: char

        word = scanner.match(keyword, charprocessor=charprocessor)
        if word:
            return self.token(word, len(word))
        return None

class HashComment(TokenType):
    r"""
    Tokentype for comments introduced by a ``#`` (hash-sign). The value
    of the token is the text returned by ``scanner.read_line()``.
    """

    def __init__(self, skip=False):
        r"""
        When *skip* is True, the lexer skips the tokens of this
        tokentype.

        .. note:: The value is stored on the instance and therefore
                  overrides the class-level :attr:`skip` attribute.
        """
        super(HashComment, self).__init__()
        self.skip = skip

    def parse_token(self, lexer, scanner):
        r"""
        Return a token of the line read from *scanner* when the current
        character is a hash-sign, None otherwise.
        """
        if scanner.char != '#':
            return None
        line = scanner.read_line()
        return self.token(line, len(line))


