#!/usr/bin/env python3

"""
Directory : mistool/parse_use
Name      : parse_use
Version   : 2013.10
Author    : Christophe BAL
Mail      : projetmbc@gmail.com

This module contains some basic tools for tokenizing a text.
"""

from collections import OrderedDict

from mistool.config.token import SEPARATORS
from mistool.parse_use.group import Groups, FindGroups


# ------------------------- #
# -- FOR ERRORS TO RAISE -- #
# ------------------------- #

class ParseUseError(ValueError):
    """
:::::::::::::::::
Small description
:::::::::::::::::

Common exception raised by the tools of the ``parse_use`` module of the package
``mistool``. It derives from ``ValueError`` so that it can also be caught as a
standard value error.
    """


# --------------- #
# -- TOKENIZER -- #
# --------------- #

class Tokens:
    r"""
:::::::::::::::::
Small description
:::::::::::::::::

----------------
Default settings
----------------

The aim of this class is to tokenize a text, that means to split a text into
meaningful pieces. Here is a basic example.

python::
    from mistool import parse_use

    text = '''
    One simple with

    back returns...'''

    myTokens = parse_use.token.Tokens(text)

    for oneType, oneTok in myTokens:
        print(oneType, repr(oneTok),sep = " --> ")


Launched in a terminal, you will see that the class ``Tokens`` splits the text
into pieces and gives them a type.

terminal::
    back --> '\n'
    word --> 'One'
    space --> ' '
    word --> 'simple'
    space --> ' '
    word --> 'with'
    back --> '\n'
    back --> '\n'
    word --> 'back'
    space --> ' '
    word --> 'returns...'


By default, ``Tokens`` uses the dictionary ``SEPARATORS["minimal"]`` given below
to split the text. Anything that is not in this dictionary is seen as a word.
We'll talk of this dictionary later.

python::
    SEPARATORS["minimal"] = {
        'space': [" ", "\t"],
        'back' : "\n"
    }


info::
    Internally, the class builds a list available through the attribute
    ``tokens``. You can also have the size of this list by using the attribute
    ``size``. For our first example, we have ``myTokens.size = 11`` and the
    following list.

    python::
        myTokens.tokens = [
            ('back', '\n'),
            ('word', 'One'),
            ('space', ' '),
            ('word', 'simple'),
            ('space', ' '),
            ('word', 'with'),
            ('back', '\n'),
            ('back', '\n'),
            ('word', 'back'),
            ('space', ' '),
            ('word', 'returns...')
        ]


---------------------
Cleaning extra spaces
---------------------

You can clean the extra spaces and back returns if you want. In the following
example, we just want to have single spaces (this can be useful for minimal
printing of a text). To do that we use the argument ``rules``.

python::
    from mistool import parse_use

    text = '''   One test with   extra  spaces cleaned

    but back returns are kept !'''

    myTokens = parse_use.token.Tokens(
        text  = text,
        rules = "singleSpace"
    )

    for oneType, oneTok in myTokens:
        print(oneType, repr(oneTok),sep = " --> ")


The output in a terminal is the following one.

terminal::
    space --> ' '
    word --> 'One'
    space --> ' '
    word --> 'test'
    space --> ' '
    word --> 'with'
    space --> ' '
    word --> 'extra'
    space --> ' '
    word --> 'spaces'
    space --> ' '
    word --> 'cleaned'
    back --> '\n'
    back --> '\n'
    word --> 'but'
    space --> ' '
    word --> 'back'
    space --> ' '
    word --> 'returns'
    space --> ' '
    word --> 'are'
    space --> ' '
    word --> 'kept'
    space --> ' '
    word --> '!'


With the argument ``rules``, you can use one of the well named strings contained
in the class attribute ``ALL_RULES``. For example, if we use ``"noEmpty"`` in the
preceding code, we will obtain the output below.

terminal::
    word --> 'One'
    word --> 'test'
    word --> 'with'
    word --> 'extra'
    word --> 'spaces'
    word --> 'cleaned'
    word --> 'but'
    word --> 'back'
    word --> 'returns'
    word --> 'are'
    word --> 'kept'
    word --> '!'


Here is the meaning of the prefixes used in the names of the rules where
``Xxx`` is either ``Space`` or ``Back``.

    * ``noXxx`` removes every separator of the kind ``xxx``.

    * ``singleXxx`` removes a separator of the kind ``xxx`` when it directly
    follows another separator of the same kind.

    * ``gatherXxx`` merges consecutive separators of the kind ``xxx`` into one
    single token.

    * ``noEmpty`` is a shortcut for ``noSpace`` and ``noBack`` used together.

info::
    If several rules are given for the same kind, ``gather`` wins over ``no``
    which wins over ``single``.


--------------------------------
Protect groups like quoted texts
--------------------------------

The class ``Tokens`` can work with an instance of the class ``Groups`` of the
module ``parse_use.group``. Here is an example showing how to keep a quoted text
as a single token.

python::
    from mistool import parse_use

    myGroups = parse_use.group.Groups(
        groups = '"',
        untoken = '"'
    )

    text = 'One "test with quoted text", just for see'

    myTokens = parse_use.token.Tokens(
        text   = text,
        groups = myGroups,
        rules = "noEmpty"
    )

    for oneType, oneTok in myTokens:
        print(oneType, repr(oneTok),sep = " --> ")


The output is the one expected.

terminal::
    word --> 'One'
    group --> '"test with quoted text"'
    word --> ','
    word --> 'just'
    word --> 'for'
    word --> 'see'


--------------------
More kinds of tokens
--------------------

For the moment, we have always used the splitting made by default, except in the
previous section, but let's consider the code below.

python::
    from mistool import parse_use

    text = "One test with ponctuation... Separation?"

    myTokens = parse_use.token.Tokens(
        text  = text,
        rules = "noEmpty"
    )

    for oneType, oneTok in myTokens:
        print(oneType, repr(oneTok),sep = " --> ")


This code will produce the following output.

terminal::
    word --> 'One'
    word --> 'test'
    word --> 'with'
    word --> 'ponctuation...'
    word --> 'Separation?'


We see that the punctuation characters "..." and "?" are not seen as special
tokens. This is normal because the dictionary of separators used by default is
the following one.

python::
    SEPARATORS["minimal"] = {
        'space': [" ", "\t"],
        'back' : "\n"
    }


To add new separators, you just have to give to the argument ``sep`` a dictionary
with keys that are the kinds of the separators, and the values the corresponding
lists of separators. Here is an example.

python::
    from mistool import parse_use

    text = "One test with ponctuation... Separation?"

    myTokens = parse_use.token.Tokens(
        text  = text,
        rules = "noEmpty",
        sep   = parse_use.token.SEPARATORS["natural"],
    )

    for oneType, oneTok in myTokens:
        print(oneType, repr(oneTok),sep = " --> ")


With these lines of code, we obtain the wanted list of tokens.

terminal::
    word --> 'One'
    word --> 'test'
    word --> 'with'
    word --> 'ponctuation'
    ponct --> '...'
    word --> 'Separation'
    ponct --> '?'


In the code, we have used ``parse_use.token.SEPARATORS["natural"]`` which is the
dictionary below.

python::
    {
        'space': [" ", "\t"],
        'back' : "\n",
        'ponct': [",", ";", ".", "...", "?", "!"]
    }


info::
    When several separators match at the same position, the longest one wins.
    This is why ``"..."`` is found instead of three ``"."``.


The use of personal keys allows to be near a real tokenization like in the
following fictive example.

python::
    from mistool import parse_use

    text = "Zozor is a dog and Zaza a cat"

    # We work on a copy so as to not modify the shared dictionary.
    myPonctuations = dict(parse_use.token.SEPARATORS["natural"])
    myPonctuations['PET'] = ["Zozor", "Zaza"]

    myTokens = parse_use.token.Tokens(
        text  = text,
        rules = "noEmpty",
        sep   = myPonctuations,
    )

    for oneType, oneTok in myTokens:
        print(oneType, repr(oneTok),sep = " --> ")


With this, we obtain the tokens below.

terminal::
    PET --> 'Zozor'
    word --> 'is'
    word --> 'a'
    word --> 'dog'
    word --> 'and'
    PET --> 'Zaza'
    word --> 'a'
    word --> 'cat'


:::::::::::::
The arguments
:::::::::::::

The instantiation of this class uses the following variables.

    1) ``text`` is the text to tokenize.

    2) ``groups`` is an optional argument that must be an instance of the class
    ``Groups``. The default value is ``None`` which stands for ``Groups()``.

    info::
        The class ``Groups`` from the module ``parse_use.group`` is imported in
        this module so as to be directly usable from the module ``token``.

    3) ``sep`` is an optional dictionary defining the separators delimiting the
    words. Each separator must be a non empty string. The default value is
    ``None`` which stands for ``SEPARATORS["minimal"]``.

    info::
        The dictionary ``SEPARATORS`` from the module ``config.token`` is
        imported in this module so as to be directly usable from the module
        ``token``.

    4) ``rules`` is an optional set which can contain one or several of the well
    named values contained in the class attribute ``ALL_RULES``. The default
    value is ``None`` which stands for an empty set.

    info::
        If you only need one special rule, you can just use a string.
    """
    ALL_RULES = {
        "gatherSpace",
        "singleSpace",
        "noSpace",
        "singleBack",
        "noBack",
        "noEmpty",
    }

    def __init__(
        self,
        text   = "",
        groups = None,
        sep    = None,
        rules  = None
    ):
        # The default values are built at each call, and not once at import
        # time, so that nothing is shared between different instances.
        if groups is None:
            groups = Groups()

        if sep is None:
            sep = SEPARATORS["minimal"]

        if rules is None:
            rules = set()

        # Validations and buildings of internal constants
        self.__internalRules(rules)
        self.__internalSep(sep)

        # Arguments
        if not isinstance(groups, Groups):
            raise ParseUseError("``groups`` must be an instance of ``Groups``.")

        self.groups = groups

        # Internal constants and working variables used by ``self.split``
        # and ``self.__addNewSep``.
        self.__untoken = self.groups.untoken
        self.__tokens  = []

        self._answer = None
        self._oneSep = None

        self._kind     = None
        self._prevKind = None

        # Update of the text : the assignment to ``self.text`` calls the setter
        # which does the tokenization.
        self.__lastText = None
        self._text      = text
        self.text       = text


# -- THE PROPERTY ``text`` -- #

    @property
    def text(self):
        """
:::::::::::::::::
Small description
:::::::::::::::::

This property gives the text that has been tokenized. Giving a new value
different from the last one launches a new tokenization which updates the
property ``tokens``.
        """
        return self._text

    @text.setter
    def text(self, text):
        # Nothing to do if the text has not changed.
        if text == self.__lastText:
            return

        self.__lastText = self._text = text

        groupView = FindGroups(
            text   = text,
            groups = self.groups
        ).groupView

        self.__tokens = self.tokenize(groupView)


# -- INTERNAL VERSIONS OF THE RULES -- #

    def __internalRules(
        self,
        rules
    ):
        """
:::::::::::::::::
Small description
:::::::::::::::::

This hidden method builds the internal version of the special rules, that is the
dictionary ``self.rules`` which associates to the kinds ``"space"`` and
``"back"`` one of the prefixes ``"single"``, ``"no"`` or ``"gather"``.

The rule ``"noEmpty"`` is a shortcut for ``"noSpace"`` and ``"noBack"``.


:::::::::::::
The arguments
:::::::::::::

This method uses the following variable.

    1) ``rules`` is either a single string, or a set (or a frozenset) of strings
    that must all belong to ``ALL_RULES``.

An error ``ParseUseError`` is raised if ``rules`` has not a good type, or if it
contains a name that is not in ``ALL_RULES``.
        """
        # Set version ?
        if isinstance(rules, str):
            rules = {rules}

        elif not isinstance(rules, (set, frozenset)):
            raise ParseUseError('``rules`` must be a set.')

        # Illegal keys ? We sort them so as to have a reproducible message.
        unknownRules = rules - self.ALL_RULES

        if unknownRules:
            raise ParseUseError(
                "Unknown key(s) in the set ``rules``.\n"
                + ' , '.join(
                    '<< {} >>'.format(x)
                    for x in sorted(unknownRules, key = str)
                )
            )

        # Everything is good. The order of the prefixes gives the priority :
        # a later prefix overwrites an earlier one for the same kind.
        self.rules = {}

        for prefix in ("single", "no", "gather"):
            for oneRule in rules:
                if not oneRule.startswith(prefix):
                    continue

                kind = oneRule[len(prefix):].lower()

                if kind == "empty":
                    self.rules["space"] = self.rules["back"] = prefix

                else:
                    self.rules[kind] = prefix

    def __internalSep(
        self,
        sep
    ):
        """
:::::::::::::::::
Small description
:::::::::::::::::

This hidden method builds the internal version of the infos about the
separators, that is the ordered dictionary ``self.sep`` which associates each
separator to its kind. The longest separators come first so as to be tested
first by the method ``self.split``.


:::::::::::::
The arguments
:::::::::::::

This method uses the following variable.

    1) ``sep`` is a dictionary associating a kind to either one single
    separator, or to an iterable of separators.

An error ``ParseUseError`` is raised if ``sep`` is not a dictionary, if one
separator is not a non empty string, or if a separator is used for two different
kinds.
        """
        if not isinstance(sep, dict):
            raise ParseUseError('``sep`` must be a dict.')

        kindOfSep = {}

        for kind, someSeps in sep.items():
            if isinstance(someSeps, str):
                someSeps = [someSeps]

            for oneSep in set(someSeps):
                # An empty separator matches everywhere without consuming any
                # character : ``self.split`` would never end.
                if not isinstance(oneSep, str) or not oneSep:
                    raise ParseUseError(
                        "Each separator must be a non empty string.\n"
                        "See << {0} >>.".format(kind)
                    )

                if oneSep in kindOfSep:
                    raise ParseUseError(
                        "A separator is used in two differents contexts.\n"
                        "See << {0} >> and << {1} >>.".format(
                            kindOfSep[oneSep],
                            kind
                        )
                    )

                kindOfSep[oneSep] = kind

        # The bigger separators first.
        self.sep = OrderedDict(
            (oneSep, kindOfSep[oneSep])
            for oneSep in sorted(kindOfSep, key = len, reverse = True)
        )


# -- TOKENIZING FACTORY -- #

    @property
    def tokens(self):
        """
:::::::::::::::::
Small description
:::::::::::::::::

This property like method returns the list of the tokens found. Each token is a
tuple ``(kind, text)``.
        """
        return self.__tokens

    def tokenize(
        self,
        groupView
    ):
        """
:::::::::::::::::
Small description
:::::::::::::::::

This method is the manager of the tokenization. It returns a list of tokens
``(kind, text)``.

    * A piece of kind ``'text'`` is split by the method ``self.split``.

    * A piece of kind ``'group'`` whose opening delimiter is in the
    "untoken" delimiters of ``self.groups`` gives one single token of kind
    ``'group'`` made of the delimiters and of the whole content.

    * Any other piece of kind ``'group'`` gives one token ``'open'``, then the
    tokens of its content, and finally one token ``'close'``.

Pieces of any other kind are ignored.


:::::::::::::
The arguments
:::::::::::::

This method uses the following variable.

    1) ``groupView`` is an iterable of triples ``(kind, delim, content)``. For a
    group, ``delim`` is the couple of the opening and closing delimiters, and
    ``content`` is the group view of what is inside the group.
        """
        tokens = []

        for kind, delim, content in groupView:
            if kind == 'text':
                tokens.extend(self.split(content))

            elif kind == 'group':
                opening, closing = delim

                if opening in self.__untoken:
                    tokens.append((
                        kind,
                        "{0}{2}{1}".format(
                            opening, closing,
                            self.textify(content)
                        )
                    ))

                else:
                    tokens.append(("open", opening))
                    tokens.extend(self.tokenize(content))
                    tokens.append(("close", closing))

        return tokens

    def textify(
        self,
        groupView
    ):
        """
:::::::::::::::::
Small description
:::::::::::::::::

This method builds the whole text version of a group and its content. The
sub-groups are rebuilt recursively with their delimiters.


:::::::::::::
The arguments
:::::::::::::

This method uses the following variable.

    1) ``groupView`` is an iterable of triples ``(kind, delim, content)`` like
    the one used by the method ``self.tokenize``.
        """
        pieces = []

        for kind, delim, content in groupView:
            if kind == 'text':
                pieces.append(content)

            elif kind == 'group':
                pieces.append(
                    "{0[0]}{1}{0[1]}".format(
                        delim,
                        self.textify(content)
                    )
                )

        return "".join(pieces)

    def split(
        self,
        text
    ):
        """
:::::::::::::::::
Small description
:::::::::::::::::

This method splits the text regarding to the value of ``self.sep`` which contains
the separators where to split the text. It returns the list of the tokens
``(kind, text)`` found. Anything between two separators is a token of kind
``'word'``. The separators are kept, removed or merged regarding to the rules
stored in ``self.rules``.

info::
    The memory of the previous kind of token is reset at each call, so the
    rules ``single`` and ``gather`` never look at a separator met during a
    previous call.


:::::::::::::
The arguments
:::::::::::::

This method uses the following variable.

    1) ``text`` is the string to split.
        """
        self._answer = []

        # Each call starts from a clean state. Without this, a separator met at
        # the end of a previous call would be seen as the neighbour of the
        # first separator of this text.
        self._prevKind = None

        iMax  = len(text)
        i     = 0
        iLast = 0

        while i < iMax:
            # The bigger is the winner : ``self.sep`` is sorted from the longest
            # separator to the shortest one.
            for oneSep, kind in self.sep.items():
                # No slicing here so as to not copy the end of the text.
                if not text.startswith(oneSep, i):
                    continue

                self._oneSep, self._kind = oneSep, kind

                lastPiece = text[iLast: i]

                if lastPiece:
                    self._answer.append(('word', lastPiece))
                    self._prevKind = 'word'

                self.__addNewSep()

                self._prevKind = kind

                # We jump after the separator.
                i    += len(oneSep)
                iLast = i

                break

            else:
                # No separator starts here.
                i += 1

        endText = text[iLast:]

        if endText:
            self._answer.append(('word', endText))

        return self._answer

    def __addNewSep(self):
        """
:::::::::::::::::
Small description
:::::::::::::::::

This hidden method adds, or not, a separator regarding to some specific rules.
It is used by ``self.split``.

The separator is ``self._oneSep``, its kind is ``self._kind`` and the kind of the
previous token is ``self._prevKind``. Only the kinds ``"space"`` and ``"back"``
can have a rule.

    * With the rule ``"no"``, the separator is not added.

    * With the rule ``"single"``, the separator is not added if the previous
    token has the same kind.

    * With the rule ``"gather"``, the separator is appended to the text of the
    previous token if this one has the same kind.

In any other case, the token ``(self._kind, self._oneSep)`` is added to
``self._answer``.
        """
        kind = self._kind

        # The spaces and the back returns are the only ones with rules.
        if kind in ("space", "back"):
            rule = self.rules.get(kind)

        else:
            rule = None

        isRepeated = (self._prevKind == kind)

        if rule == "no" or (rule == "single" and isRepeated):
            return

        # The test of ``self._answer`` is just a safety belt to never index an
        # empty list.
        if rule == "gather" and isRepeated and self._answer:
            prevKind, prevText = self._answer[-1]
            self._answer[-1] = (prevKind, prevText + self._oneSep)

            return

        self._answer.append((kind, self._oneSep))

# -- ITERATING METHODS -- #

    @property
    def size(self):
        """
:::::::::::::::::
Small description
:::::::::::::::::

This property like method returns the number of tokens found.
        """
        return len(self.tokens)

    def __iter__(self):
        # We iterate over the tokens ``(kind, text)`` in the order of the text.
        yield from self.tokens

    def __getitem__(self,key):
        # Same indexing and slicing as the list ``self.tokens``.
        return self.tokens[key]
