#!/usr/bin/env python
# -- coding: utf-8

#       Copyright 2007 Manuel Vazquez Acosta <mva.led@gmail.com>
#
#       This program is free software; you can redistribute it and/or modify
#       it under the terms of the GNU General Public License as published by
#       the Free Software Foundation; either version 2 of the License, or
#       (at your option) any later version.
#
#       This program is distributed in the hope that it will be useful,
#       but WITHOUT ANY WARRANTY; without even the implied warranty of
#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#       GNU General Public License for more details.
#
#       You should have received a copy of the GNU General Public License
#       along with this program; if not, write to the Free Software
#       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#       MA 02110-1301, USA.

"""Knowledge Base System for both Guglebot and
Guglestash

Guglebot is the robot (spider) Gugle uses
to fetch and discover updates and new pages
in the Web.

Guglestash is the indexer Gugle uses to
index its KB.

Guglesearch is the Search System Gugle
provides to its users"""


import datetime, types, re

from threading import RLock
from urlparse import urlparse
from Queue import Empty

from BTrees.OOBTree import OOBTree
from BTrees.IOBTree import IOBTree

from ZODB import DB, FileStorage

from persistent import Persistent
from persistent.dict import PersistentDict
from persistent.list import PersistentList

import transaction

# We use this variable to construct the path of the DB file.
# We include the revision number in order to guard against
# changes in the structure of the database that may be out of step
# with the current data structures.
# If you change any of the persistent structures in this file, it is best
# to remove your old DB file, test, and then commit your changes.
# Subversion keyword string; its layout is "$Rev: <number> $".
GugleRevision = "$Rev: 478 $"
# Keep what follows the first space, then drop the trailing " $".
GugleRevisionNumber = GugleRevision[GugleRevision.find(" ") + 1:][:-2]


class BasicKBEntry(Persistent):
    """A single record stashed in the Basic Knowledge Base.

    $Rev: 478 $

    Each entry describes one URL: its address plus whatever was given
    about the HTTP response (status code, content type, charset, size)
    and a timestamp.

    Design note: the entry should stay within 4kb for efficient file
    mapping (open question: how does this relate to ZODB?).
    """

    def __init__(self, url, contenttype="text/html", httpResponseCode = 200,
                 charset=None, timestamp=None, size=None):
        """Store the given values on the entry without any validation.

        url              -- address of the page this entry describes
        contenttype      -- stored as `contentType`
        httpResponseCode -- stored as `httpResponseCode`
        charset          -- stored as `charset`
        timestamp        -- stored as `timestamp`
        size             -- stored as `size`
        """
        # Identity of the entry.
        self.url = url

        # Response metadata.
        self.httpResponseCode = httpResponseCode
        self.contentType = contenttype
        self.charset = charset
        self.size = size

        # Time information supplied by the caller.
        self.timestamp = timestamp

        # Deliberately not stored for now:
        #   * title / summary -- unclear whether the BKB really needs them
        #   * cluster (host:port taken from the parsed url) -- currently unused
        #   * dirty flag -- was only meant for debugging

class WebGraph(PersistentDict):
    """
    Link relationships between the different nodes of the web.

    $Rev: 478 $

    The mapping goes from a referrer to a PersistentList of the
    targets that referrer links to.
    """

    def __len__(self):
        """Return the number of (referrer, target) relations, not of keys."""
        return sum(1 for _ in self.generateGugleMappings())

    def __repr__(self):
        return "<WebGraph: %d relations>" % len(self)

    def registerGugleLink(self, referrer, target):
        """
        Record that `referrer` links to `target`.

        A relation that is already registered is not added twice.

        Warning: This method is not thread-safe.
        """
        # First link seen for this referrer: start its list of targets.
        if referrer not in self:
            self[referrer] = PersistentList()

        known_targets = self[referrer]
        if target in known_targets:
            return

        known_targets.append(target)

    def generateGugleMappings(self):
        """Generate pairs of (referrer, target) urls for the whole web."""
        for referrer in self:
            targets = self[referrer]
            for target in targets:
                yield (referrer, target)

class BKB(Persistent):
    """Persistent object holding basic information about collected urls.

    Entries are reachable in two ways: by key through `__Entries__`
    and by insertion position through `__Index__`, which maps a
    running integer to the key stored at that position.

    `__head__` starts at 0 and is not touched by any method of this
    class; BasicKnowledgeBase reads and writes it from outside.
    """

    def __init__(self):
        # Bookkeeping counters.
        self.__head__ = 0
        self.__Count__ = 0

        # position -> key, and key -> entry.
        self.__Index__ = IOBTree()
        self.__Entries__ = OOBTree()

    def __repr__(self):
        summary = (self.__Count__, self.__head__)
        return "<BKB: Total URL: %s; Head at: %s>" % summary

    def put(self, key, entry):
        """Append an entry to the BKB.

        A key that is already stored is left untouched: the new entry
        is silently dropped.
        """
        if self.__Entries__.has_key(key):
            return

        position = self.__Count__
        self.__Index__[position] = key
        self.__Entries__[key] = entry
        self.__Count__ = position + 1

    def __len__(self):
        return self.__Count__

    def has_key(self, key):
        """Check whether a given key is stored in the BKB."""
        return self.__Entries__.has_key(key)

    def __getitem__(self, key):
        #TODO: Is it best to have self.__Entries__[self.__Index__[key]]?
        return self.__Entries__[key]

    def __setitem__(self, key, value):
        # Same semantics as put(): an existing key is never overwritten.
        self.put(key, value)

    def get(self, index):
        """Return the entry stored at insertion position `index`.

        Raises KeyError when nothing is stored at that position.
        """
        if not self.__Index__.has_key(index):
            raise KeyError('Index %d out of bounds' % index)

        return self.__Entries__[self.__Index__[index]]

class BasicKnowledgeBase:
    """Gugle's Basic Knowledge Base System

    $Rev: 478 $

    Front end to three persistent objects kept in one ZODB file
    storage: the queue of accepted urls ('BKB'), the black list
    ('BlackList') and the link graph ('WebGraph').

    Every method that reads or writes those objects holds a re-entrant
    lock while doing so. The URL filter list (addURLFilter /
    isURLallowed) is accessed without taking the lock.

    BKB design goals:
    * Key search (has_key)
    * Simple addition (dic[key] = obj)
    * Iteration (dict iteritems() does not
      work in a multi-threading env)
    * Thread-safe
    * Fast key search (dict-like)
    * Simple addition (dict-like)
    * Simple dequeueing (Queue.get()-like)
    """

    def __init__(self, fs = 'var/gugle-%s.db' % GugleRevisionNumber, *args, **kargs):
        """Open the storage file `fs` and bind the persistent root objects.

        Missing root objects ('BKB', 'BlackList', 'WebGraph') are
        created and committed. The read position is restored from the
        head stored in the persistent BKB.

        Extra positional and keyword arguments are accepted and ignored.
        """
        self.__storage__ = FileStorage.FileStorage(fs)
        self.__db__ = DB(self.__storage__, pool_size=10)
        self.__connection__ = self.__db__.open()
        self.__lock = RLock()
        self.__head__ = 0
        self.__urlRegExps = []
        self.__starttime = datetime.datetime.now()

        self.__lock.acquire()
        try:
            self.__Gugle__ = self.__connection__.root()

            # Create whichever root objects do not exist yet.
            for name, factory in (('BKB', BKB),
                                  ('BlackList', BKB),
                                  ('WebGraph', WebGraph)):
                if name not in self.__Gugle__:
                    self.__Gugle__[name] = factory()

            transaction.commit()
            self.__BKB__ = self.__Gugle__['BKB']
            self.__BlackList__ = self.__Gugle__['BlackList']
            self.__WebGraph__ = self.__Gugle__['WebGraph']

            # Resume from the position saved by a previous get().
            self.__head__ = self.__BKB__.__head__
        finally:
            self.__lock.release()

    def __del__(self):
        """Close connection, database and storage, in that order.

        Each resource is closed on its own so that a failure (or an
        attribute missing because __init__ did not finish) does not
        prevent the remaining ones from being closed.
        """
        for name in ('__connection__', '__db__', '__storage__'):
            resource = getattr(self, name, None)
            if resource is None:
                continue
            try:
                resource.close()
            except Exception:
                # Best effort only: a finalizer has nowhere to report to.
                pass

    def isURLallowed(self, url):
        """
        Checks whether url is allowed to be in BKB.

        BKB may hold a list of URL regular expressions.
        If any is given, the first one (in registration order)
        that matches the url decides: the url is allowed only
        when the policy of that filter is `allow`. A url matched
        by no filter is rejected.

        If no URL filters are provided this method always
        returns True.

        Errors raised by a filter's `search` are not swallowed.
        """
        filters = self.__urlRegExps
        for policy, which in filters:
            if which.search(url):
                return policy == "allow"

        # No filter matched: allowed only when there are no filters at all.
        return len(filters) == 0

    def addURLFilter(self, which, policy):
        """Append a URL filter.

        which  -- a pattern string (str or unicode; trailing whitespace
                  is stripped before compiling) or an object offering a
                  `search` method, such as a compiled regular expression
        policy -- isURLallowed() treats the string "allow" as allowing
                  and any other value as rejecting

        Raises TypeError when `which` is neither of the above.
        """
        if isinstance(which, basestring):
            whichObj = re.compile(which.rstrip())
        elif hasattr(which, "search"):
            # Duck typing on the method isURLallowed() actually calls.
            whichObj = which
        else:
            raise TypeError("String or Reg Exp Object expected")

        self.__urlRegExps.append((policy, whichObj))

    def registerFailure(self, url, referrer = None):
        """Put `url` in the black list and commit.

        The stored entry has no content type and HTTP response code 404.
        When `referrer` is given, the link referrer -> url is recorded
        in the web graph as well.
        """
        entry = BasicKBEntry(url, contenttype=None, httpResponseCode = 404)
        self.__lock.acquire()
        try:
            self.__BlackList__[url] = entry
            if referrer:
                self.__WebGraph__.registerGugleLink(referrer, entry.url)

            transaction.commit()
        finally:
            self.__lock.release()

    def put(self, entry, referrer = None):
        """Store `entry` and commit.

        The entry goes to the BKB when its url passes the filters and
        to the black list otherwise. When `referrer` is given, the link
        referrer -> entry.url is recorded in the web graph.
        """
        assert isinstance(entry, BasicKBEntry)

        if self.isURLallowed(entry.url):
            print(u"BKB: URL added %s" % entry.url)
            queue = self.__BKB__
        else:
            print(u"BKB: URL rejected by a filter %s" % entry.url)
            queue = self.__BlackList__

        self.__lock.acquire()
        try:
            queue.put(entry.url, entry)
            if referrer:
                self.__WebGraph__.registerGugleLink(referrer, entry.url)

            transaction.commit()
        finally:
            self.__lock.release()

    def get(self, timeout=None):
        """Return the next entry not yet handed out and advance the head.

        The new head position is stored in the persistent BKB and
        committed. `timeout` is accepted but not used: the call never
        blocks.

        Raises Queue.Empty when every stored entry was already returned.
        """
        self.__lock.acquire()
        try:
            if self.__head__ >= len(self.__BKB__):
                raise Empty()

            entry = self.__BKB__.get(self.__head__)
            self.__head__ = self.__head__ + 1

            self.__BKB__.__head__ = self.__head__
            transaction.commit()

            # The finally suite still runs after this return.
            return entry
        finally:
            self.__lock.release()

    def reset(self):
        """Move the in-memory read position back to the first entry.

        The head stored in the persistent BKB is not modified here.
        """
        self.__lock.acquire()
        try:
            self.__head__ = 0
        finally:
            self.__lock.release()

    def dump(self):
        """Print how many items were collected and inspected so far."""
        self.__lock.acquire()
        try:
            diff = datetime.datetime.now() - self.__starttime
            # timedelta.seconds alone leaves out whole days.
            elapsed = diff.days * 86400 + diff.seconds

            print("BKB: Collected %d items in %d secs" % (len(self.__BKB__), elapsed))
            print("BKB: Inspected %d items" % (self.__head__))
        finally:
            self.__lock.release()

    def has_key(self, key):
        """Check whether `key` is stored in the BKB or in the black list."""
        self.__lock.acquire()
        try:
            return self.__BKB__.has_key(key) or self.__BlackList__.has_key(key)
        finally:
            self.__lock.release()
