#!/usr/bin/env python
# -*- coding: utf-8 -*-
from goose import Goose 
import newspaper

import lauteur
from siegfried import (
  prepare_url, urls_from_html, get_domain, is_article_url
  )

from util import (
  get_html, get_title, node_to_string, imgs_from_html, strip_tags
  )

# Default User-Agent string, used by `Particle` when the caller does not
# supply a `user_agent` keyword argument.
# NOTE(review): the "(" before "Mozilla" is never closed -- this looks like a
# typo, but the value is sent as-is, so confirm before changing it.
USER_AGENT = (
  "NewsLynx | (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) "
  "AppleWebKit/534.48.3 (KHTML, like Gecko) "
  "Version/5.1 Safari/534.48.3"
)

# Default request timeout, used by `Particle` when the caller does not supply
# a `timeout` keyword argument (presumably seconds -- confirm against
# `util.get_html`).
TIMEOUT = 10

class ParticleInitError(Exception):
  """Error type reserved for failures while setting up a `Particle`."""

class Particle:
  """
  Multi-method article extraction framework for NewsLynx.

  Our strategy will be as follows:
  1. Get the page html
  2. Extract the author using `lauteur`
  3. Extract the meta title tag.
  4. TK: Apply multiple extraction methods, returning only the article_html.
  5. TK: Come up with a method of scoring results and picking best option.
    * For now we'll just choose newspaper / goose because it seemed to do the
      best of all open-source tools in this comparison:
      -  http://readwrite.com/2011/06/10/head-to-head-comparison-of-tex#awesm=~oDemRcIbyN4lYB
  6. TK: Apply image / url / movie / metadata extraction to raw page html.
    * For now we'll just go with newspaper / goose's built-in methods
  7. Return a dictionary that is identical to `SupeRSS`'s
  """
  def __init__(self, **kwargs):
    """
    Configure the extractor.

    Recognised keyword arguments:
      user_agent: User-Agent string passed to `get_html`
                  (defaults to the module-level `USER_AGENT`).
      timeout:    timeout passed to `get_html`
                  (defaults to the module-level `TIMEOUT`).
    """
    self.user_agent = kwargs.get('user_agent', USER_AGENT)
    self.timeout = kwargs.get('timeout', TIMEOUT)
    # a single Goose instance is created up front and reused by `extract_goose`
    self.g = Goose()

  def extract_goose(self, html):
    """
    Extract an article with goose.

    `html` is handed to goose as `raw_html`; the goose article
    object is returned unchanged.
    """
    return self.g.extract(raw_html = html)

  def extract_newspaper(self, url, html=None):
    """
    Extract an article with newspaper.

    Downloads and parses `url` and returns the resulting
    `newspaper.Article`, or None if newspaper cannot be imported.

    `html` is accepted but currently unused: the page is always
    downloaded again from `url`.

    NOTE: This is probably redundant since
    newspaper extends python-goose's source code.
    """
    try:
      import newspaper
    except ImportError:
      return None

    np_article = newspaper.Article(url = url)
    np_article.download()
    np_article.parse()
    return np_article

  def extract_boilerpipe(self, html):
    """
    Extract an article with Boilerpipe.

    Returns the value of the extractor's `getHTML()`, or None
    if boilerpipe cannot be imported.

    NOTE: This is an optional method as
    boilerpipe is dependency-heavy and
    will be potentially cumbersome
    to run on manta.
    """
    try:
      from boilerpipe.extract import Extractor
    except ImportError:
      return None

    bp_extract = Extractor(html=html)
    return bp_extract.getHTML()

  def extract_readability(self, html):
    """
    Extract an article with Readability.

    Returns the value of the document's `summary()`, or None
    if readability cannot be imported.
    """
    try:
      from readability.readability import Document
    except ImportError:
      return None

    rdb_extract = Document(html)
    # the original returned `rdb.summary()`, an undefined name that raised
    # NameError whenever readability was actually installed
    return rdb_extract.summary()

  def get_best_candidate(self, html):
    """
    TK: determine best extraction candidate.

    Not implemented yet; always returns None.
    """
    return None

  def extract(self, url, **kwargs):
    """
    Primary method for extraction; for now
    we're basically just wrapping newspaper's
    download-and-parse step.

    Fetches the raw page html with `get_html`, extracts the
    article with newspaper, and returns a dictionary shaped
    like `SupeRSS`'s with the keys: url, article_html, text,
    title, tags, authors, datetime, img_urls, article_links
    and article_imgs.

    Returns None when newspaper is unavailable or when no
    article body (`top_node`) was found.

    Extra keyword arguments are accepted but currently unused.
    """
    source_url = "http://%s" % get_domain(url)

    # get raw html
    raw_html = get_html(url, user_agent=self.user_agent, timeout=self.timeout)

    # extract with newspaper (this downloads the page a second time)
    article = self.extract_newspaper(url)

    # `extract_newspaper` returns None when newspaper is missing; treat that,
    # and an article with no extracted body, as "nothing found"
    if article is None or article.top_node is None:
      return None

    # get the extracted html
    article_html = node_to_string(article.top_node)

    # get article urls on the page, normalised against the source
    # domain and de-duplicated
    article_links = [
      prepare_url(u, source_url = source_url)
      for u in urls_from_html(raw_html)
    ]
    article_links = list(set([a for a in article_links if is_article_url(a)]))

    # map data identically to `SupeRSS`
    return {
      'url':           prepare_url(url),
      'article_html':  article_html,
      'text':          strip_tags(article_html),
      'title':         article.title,
      'tags':          article.meta_keywords,
      'authors':       lauteur.from_html(raw_html),
      'datetime':      None,
      'img_urls':      [article.top_image],
      'article_links': article_links,
      'article_imgs':  list(set(article.imgs))
    }
