# -*- coding: utf-8 -*-

import time
from urlparse import urljoin, urlparse

import requests
from bunch import Bunch
from bs4 import BeautifulSoup


# Python 2/3 string-type shim.
# On Python 2, rebind the module-level name ``str`` to ``basestring`` so that
# ``isinstance(x, str)`` checks in this module accept both byte strings and
# unicode objects. The probe must reference ``basestring`` (which is missing
# on Python 3) rather than ``str`` (which exists everywhere); otherwise the
# fallback can never trigger.
# NOTE: on Python 2 ``str`` is therefore only usable for isinstance checks
# inside this module, not for constructing strings.
try:
    str = basestring
except NameError:
    # Python 3: the builtin ``str`` already covers all text strings.
    pass


class Crawler(object):
    """Sequential crawler that visits a (possibly growing) list of urls.

    Subclasses hook into the crawl by overriding :meth:`initialize`,
    :meth:`process` and :meth:`terminate`. New urls can be queued while
    crawling with :meth:`add`, or automatically from ``<a>`` tags when
    ``auto_add_links`` is enabled.
    """

    def __init__(self, urls=None, agent=None, request_limit=0, auto_add_links=False,
                 wait=0, stay_on_domain=False, skip_errors=True):
        """Configure the crawler.

        :param urls: a single url string or an iterable of url strings to
            start from. The iterable is copied, so the caller's object is
            never modified. ``None`` means "no start urls".
        :param agent: value sent as the ``User-Agent`` request header.
        :param request_limit: stop after this many requests; ``0`` means
            no limit.
        :param auto_add_links: queue the ``href`` of every ``<a>`` tag found
            on a processed page.
        :param wait: seconds to sleep between two requests; negative values
            are treated as ``0``.
        :param stay_on_domain: with ``auto_add_links``, only queue links
            whose host equals, or is a subdomain of, a start url's host.
        :param skip_errors: silently skip urls that cannot be fetched or do
            not answer with status 200, instead of passing them on.
        """
        # A fresh list per instance: a mutable default (or the caller's own
        # list) would otherwise be shared and mutated by add().
        if urls is None:
            urls = []
        elif isinstance(urls, str):
            urls = [urls]
        else:
            urls = list(urls)
        self.urls = urls
        self.agent = agent
        self.request_limit = request_limit
        self.auto_add_links = auto_add_links
        self.wait = max(0, wait)
        self.stay_on_domain = stay_on_domain
        self.skip_errors = skip_errors
        self.__current_url = None
        self.__start_urls = []

    def run(self):
        """Run the crawl: initialize, visit every queued url, terminate."""
        self.initialize()
        self.__fix_urls()
        # Snapshot the start urls. self.urls keeps growing during the crawl
        # and must not widen the set of domains considered "starting" ones.
        self.__start_urls = list(self.urls)
        self.__run()
        self.terminate()

    def initialize(self):
        """Hook that is executed once before the crawler starts."""
        pass

    def process(self, url, tree):
        """Hook that is executed for each visited url.

        ``tree`` is the parsed page, or ``None`` when the response status
        was not 200 and ``skip_errors`` is disabled.
        """
        pass

    def terminate(self):
        """Hook that is executed once after all urls have been visited."""
        pass

    def add(self, href):
        """Queue ``href`` (resolved against the current url) unless it is
        already in the list."""
        url = self.fix(href)
        if url not in self.urls:
            self.urls.append(url)

    def fix(self, href):
        """Return ``href`` joined with the url that is currently visited."""
        return urljoin(self.__current_url, href)

    def __run(self):
        """Visit the urls in order, honouring ``wait`` and ``request_limit``."""
        visits = 0

        # Iterating the list directly is intentional: urls appended by add()
        # during the crawl are picked up by this same loop.
        for url in self.urls:
            # Sleep only between requests, not after the final one.
            if visits:
                time.sleep(self.wait)

            self.__current_url = url
            self.__visit(url)

            visits += 1
            # With request_limit == 0 this never matches, i.e. no limit.
            if visits == self.request_limit:
                break

    def __visit(self, url):
        """Fetch ``url``, parse it and hand the result to :meth:`process`."""
        try:
            r = requests.get(url, headers=self.__get_headers())
        except requests.RequestException:
            # Unreachable hosts or unsupported schemes (e.g. a harvested
            # "mailto:" link) must not abort the crawl when errors are
            # meant to be skipped.
            if not self.skip_errors:
                raise
            return

        tree = BeautifulSoup(r.text) if r.status_code == 200 else None

        if self.skip_errors and tree is None:
            return

        self.process(url, tree)

        if self.auto_add_links:
            self.__add_links_from_tree(tree)

    def __get_headers(self):
        """Construct the dictionary with the headers for a request."""
        return {
            'User-Agent': self.agent,
        }

    def __fix_urls(self):
        """Add the 'http' scheme to the urls that lack one (in place)."""
        for i, url in enumerate(self.urls):
            # Joining a scheme-less url onto 'http:' can yield 'http:///host';
            # collapse the triple slash to get a regular 'http://host'.
            self.urls[i] = urljoin('http:', url).replace('///', '//')

    def __add_links_from_tree(self, tree):
        """Queue the target of every ``<a>`` tag found in ``tree``."""
        if tree is None:
            return

        for link in tree.find_all('a'):
            href = link.get('href')
            if not href:
                # An anchor without a target resolves to the current url,
                # which is already queued.
                continue
            if not self.stay_on_domain or self.__url_on_same_domain(href):
                self.add(href)

    def __url_on_same_domain(self, href):
        """Return whether ``href`` points to the host (or a subdomain of the
        host) of one of the starting urls."""
        netloc = urlparse(self.fix(href)).netloc
        for start in self.__start_urls:
            domain = urlparse(start).netloc
            if netloc == domain or netloc.endswith('.' + domain):
                return True
        return False


class Tractor(Crawler):
    """Crawler whose per-page and end-of-crawl behaviour is registered with
    decorators instead of by subclassing.

    Functions decorated with :meth:`harvest` are called with every element
    matching their CSS selector(s); functions decorated with :attr:`done`
    are called once when the crawl has finished.
    """

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as :class:`Crawler`."""
        super(Tractor, self).__init__(*args, **kwargs)

        self._terminate_functions = []
        self._harvest_functions = []

    def harvest(self, *selectors):
        """Return a decorator registering a function for each of the given
        CSS ``selectors``.

        The decorated function is returned unchanged, so it can be stacked
        or still be called directly.
        """
        def decorator(f):
            # One entry per selector, so a function registered for several
            # selectors is called for the matches of each of them.
            for selector in selectors:
                self._harvest_functions.append(Bunch(
                    selector=selector,
                    function=f,
                ))
            return f
        return decorator

    @property
    def done(self):
        """Decorator (used without parentheses) registering a function that
        is called, without arguments, when the crawl terminates."""
        def decorator(f):
            self._terminate_functions.append(f)
            return f
        return decorator

    def process(self, url, tree):
        """Call every harvest function with each element of ``tree`` that
        matches its selector."""
        # The crawler passes tree=None for failed responses when
        # skip_errors is disabled; there is nothing to select from then.
        if tree is None:
            return

        for bit in self._harvest_functions:
            for element in tree.select(bit.selector):
                bit.function(element)

    def terminate(self):
        """Call the registered ``done`` functions in registration order."""
        for function in self._terminate_functions:
            function()
