import requests
from cookielib import CookieJar as cj
import lxml.html
import lxml.etree
from HTMLParser import HTMLParser
import re
from bs4 import BeautifulSoup

# html stripping
class MLStripper(HTMLParser):
  """
  HTML parser that throws away the markup and keeps only the text.

  Feed it markup with `feed()`; every text chunk the parser reports is
  collected, and `get_data()` returns them concatenated.
  """
  def __init__(self):
    # Run the base-class initialiser instead of calling reset() directly.
    # On Python 2 the two are equivalent, but newer HTMLParser versions set
    # up extra state in __init__ (e.g. `convert_charrefs`) that feed() relies
    # on. HTMLParser is an old-style class on Python 2, so super() cannot be
    # used here.
    HTMLParser.__init__(self)
    self.fed = []  # text chunks, in the order the parser reported them

  def handle_data(self, d):
    """Parser callback: remember one chunk of text found between tags."""
    self.fed.append(d)

  def get_data(self):
    """Return all collected text chunks joined into a single string."""
    return ''.join(self.fed)


def strip_tags(html):
  """
  Remove the tags from `html` and return the remaining text, tidied up.

  The markup is run through `MLStripper`; in the text that comes back every
  run of whitespace is collapsed to a single space and leading/trailing
  whitespace is removed.
  """
  stripper = MLStripper()
  stripper.feed(html)
  # `\s+` already matches newlines, tabs and carriage returns, so one pass
  # collapses every whitespace run. The raw string keeps the backslash from
  # being treated as a (invalid) string escape.
  return re.sub(r'\s+', ' ', stripper.get_data()).strip()


def node_to_string(node):
  """
  Serialize an lxml node to HTML markup with `lxml.html.tostring`.

  Note that the result is the markup of the node itself (its own tag
  included), not only of its children.
  """
  serialized = lxml.html.tostring(node)
  return serialized


def format_title(string):
  """
  Reduce a page title to its leading part.

  Titles are often written as "Headline - Site" or "Headline | Site". If
  `string` contains " - " the text before its first occurrence is returned;
  otherwise the same is tried with " | ". A title with neither delimiter is
  returned whole. The result is always stripped of surrounding whitespace.
  """
  # Order matters: " - " takes precedence over " | ".
  for delimiter in (' - ', ' | '):
    if delimiter in string:
      return string.split(delimiter)[0].strip()
  # Strip here as well so delimited and undelimited titles are cleaned alike.
  return string.strip()


def get_title(html):
  """
  Return the cleaned-up text of the page's <title> tag.

  The markup is parsed with BeautifulSoup and the title text is passed
  through `format_title`. An empty unicode string is returned when the
  document has no <title> element.
  """
  soup = BeautifulSoup(html)
  title_tag = soup.title
  if title_tag is None:
    # No <title> in the document; `.text` on None would raise AttributeError.
    return u''
  return format_title(title_tag.text)


def imgs_from_html(html):
  """
  Collect the `src` attribute of every `img` tag in `html`.

  Returns a list of the distinct, non-empty `src` values in the order in
  which they first appear in the document.
  """
  soup = BeautifulSoup(html)
  seen = set()
  imgs = []
  for el in soup.find_all('img'):
    src = el.attrs.get('src', None)
    # Skip tags without a usable src, and sources already collected. Tracking
    # them in `seen` removes duplicates while keeping a stable order, which
    # converting through a plain set does not.
    if src and src not in seen:
      seen.add(src)
      imgs.append(src)

  return imgs


# helpers for getting static page
def get_request_kwargs(timeout, useragent):
  """
  Assemble the keyword arguments used for one HTTP request.

  The dict is built anew on every call because part of it has to be
  created per request: each call instantiates its own cookie jar.
  """
  request_kwargs = dict(
    headers={'User-Agent': useragent},
    cookies=cj(),
    timeout=timeout,
    allow_redirects=True,
  )
  return request_kwargs


def _html_from_response(response):
  """
  Pick the page body out of a response object.

  Returns `response.text` unless the response's encoding is ISO-8859-1, in
  which case the undecoded `response.content` is returned instead. A body of
  None is replaced by an empty unicode string.
  """
  FAIL_ENCODING = 'ISO-8859-1'

  if response.encoding != FAIL_ENCODING:
    html = response.text
  else:
    html = response.content # not unicode, fix later

  if html is None:
    return u''
  return html


def get_html(url, response=None, **kwargs):
  """
  Retrieves the html for either a url or a response object. All html
  extractions MUST come from this method due to some intricacies in the
  requests module. To get the encoding, requests only uses the HTTP header
  encoding declaration requests.utils.get_encoding_from_headers() and reverts
  to ISO-8859-1 if it doesn't find one. This results in incorrect character
  encoding in a lot of cases, so when the reported encoding is ISO-8859-1 the
  undecoded body is returned rather than the decoded text.

  Parameters:
    url: address to fetch; only requested when `response` is None.
    response: an already-fetched response object; when given, no request is
      made and the html is taken from it.
    useragent (keyword, optional): User-Agent header for the request.
    timeout (keyword, optional): request timeout, 10 by default.

  Returns the html, or an empty unicode string if the request fails. Any
  exception raised while fetching is printed and swallowed.
  """
  if response is not None:
    return _html_from_response(response)

  useragent = kwargs.get('useragent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0')
  timeout = kwargs.get('timeout', 10)

  try:
    response = requests.get(url=url, **get_request_kwargs(timeout, useragent))
    return _html_from_response(response)

  # Deliberately broad: fetching is best-effort and must never propagate an
  # error to the caller. `as` and the parenthesised print behave the same on
  # Python 2.6+ and are also valid Python 3.
  except Exception as e:
    print('%s on %s' % (e, url))
    return u''

