#!/usr/bin/env python
"""
ADS to BibDesk -- frictionless import of ADS publications into BibDesk
Copyright (C) 2012  Rui Pereira <rui.pereira@gmail.com> and
                    Jonathan Sick <jonathansick@mac.com>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

Based on ADS to Bibdesk automator action by
Jonathan Sick, jonathansick@mac.com, August 2007

Input may be one of the following:
- ADS abstract page URL
- ADS bibcode
- arXiv abstract page
- arXiv identifier
"""
import sys
import os
import fnmatch
import glob
import re
import time
import optparse
import tempfile
import socket
import binascii
import zlib
import subprocess
import logging

import cgi
import urllib2
import urlparse

import subprocess as sp
from HTMLParser import HTMLParser, HTMLParseError
from htmlentitydefs import name2codepoint

# default timeout for url calls
socket.setdefaulttimeout(30)


def main():
    """Parse options and launch main loop"""
    usage = "Usage: %prog [options] [article_token or pdf_directory]"
    version = "3.0.5"
    description = """adsbibdesk helps you add astrophysics articles listed
on NASA/ADS and arXiv.org to your BibDesk database. There are two modes
in this command line interface:

1. Article mode, for adding single papers to BibDesk given tokens.
2. PDF Ingest mode, where PDFs in a directory are analyzed and added to
   BibDesk with ADS meta data.
In article mode, adsbibdesk accepts many kinds of article tokens:
The URL of an ADS or arXiv article page,
The ADS bibcode of an article (e.g. 1998ApJ...500..525S), or
the arXiv identifier of an article (e.g. 0911.4956).
(Example: `adsbibdesk 1998ApJ...500..525S`)
In PDF Ingest mode, you specify a directory containing PDFs instead of
an article token (Example: `adsbibdesk -p pdfs` will ingest PDFs from
the pdfs/ directory).
"""
    epilog = "For more information, visit www.jonathansick.ca/adsbibdesk" \
             + " email jonathansick at mac.com or tweet @jonathansick"
    parser = optparse.OptionParser(usage=usage, version=version,
            description=description, epilog=epilog)
    parser.add_option('-d', '--debug',
            dest="debug", default=False, action="store_true",
            help="Debug mode; prints extra statements")
    pdfIngestGroup = optparse.OptionGroup(parser, "PDF Ingest Mode",
            description=None)
    pdfIngestGroup.add_option('-p', '--ingest_pdfs',
            dest="ingestPdfs", default=False, action="store_true",
            help="Ingest a folder of PDFs."
                 " Positional argument should be directory"
                 " containing PDFs."
                 " e.g., `adsbibdesk -p .` for the current directory")
    pdfIngestGroup.add_option('-r', '--recursive',
            dest='recursive', default=True, action="store_false",
            help="Search for PDFs recursively in the directory tree.")
    parser.add_option_group(pdfIngestGroup)
    options, args = parser.parse_args()

    # Get preferences from (optional) config file
    prefs = Preferences()
    if options.debug:
        prefs['debug'] = True

    # Logging saves to log file on when in DEBUG mode
    # Always prints to STDOUT as well
    logging.basicConfig(level=logging.DEBUG,
        format='%(asctime)s %(name)s %(levelname)s %(message)s',
        filename=prefs['log_path'])
    if not prefs['debug']:
        logging.getLogger('').setLevel(logging.INFO)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    logging.getLogger('').addHandler(ch)

    logging.info("Starting ADS to BibDesk")
    logging.debug("ADS to BibDesk version %s", version)
    logging.debug("Python: %s", sys.version)

    if options.ingestPdfs:
        ingest_pdfs(options, args, prefs)
    else:
        process_articles(options, args, prefs)


def process_articles(options, args, prefs):
    """Workflow for processing article tokens"""
    if len(args) == 1:
        articleTokens = list(args)
    else:
        # Try to use standard input
        articleTokens = map(lambda s: s.strip(), sys.stdin.readlines())

    # Make the embedder script
    insertScript = EmbeddedInsertionScript()
    insertScript.install()

    for articleToken in articleTokens:
        process_token(articleToken, prefs, insertScript)
        if len(articleTokens) > 1: time.sleep(15)


def process_token(articleToken, prefs, insertScript):
    """Process a single article token from the user.
    :param articleToken: Any user-supplied `str` token.
    :param prefs: A `Preferences` instance.
    :param insertScript: An `EmbeddedInsertionScript` instance.
    """
    # Determine what we're dealing with. The goal is to get a URL into ADS
    # adsURL = parseURL(articleID[0], prefs)
    logging.debug("process_token found article token %s", articleToken)
    connector = ADSConnector(articleToken, prefs)
    logging.debug("process_token derived url %s", connector.adsURL)
    if connector.adsRead is None:
        logging.debug("process_token skipping %s", articleToken)
        return

    # parse the ADS HTML file
    ads = ADSHTMLParser(prefs=prefs)
    ads.parse(connector.adsRead)
    # pdf local file, title, first author, abstract, bibtex code
    # UTF-8 encoded
    output = ''.join(map(lambda x: x.encode('utf-8'), [ads.getPDF(), '|||',
                                                    ads.title, '|||',
                                                    ads.author[0], '|||',
                                                    ads.abstract, '|||',
                                                    ads.bibtex.__str__()]))
    # Escpe backslashes
    # output = output.replace('\\', '\\\\')
    # Escape double quotes
    # output = output.replace('"', '\\"')
    f = tempfile.NamedTemporaryFile(delete=False, suffix=".txt")
    logging.debug("process_token payload in tmp file: %s", f.name)
    f.write(output)
    f.close()
    cmd = 'osascript %s "%s"' % (insertScript.compiledPath, f.name)
    logging.debug("process_token osacript cmd: %s", cmd)
    subprocess.call(cmd, shell=True)
    os.remove(f.name)


def ingest_pdfs(options, args, prefs):
    """Workflow for attempting to ingest a directory of PDFs into BibDesk.
    
    This workflow attempts to scape DOIs from the PDF text, which are then
    added to BibDesk using the usual `process_token` function.
    """
    assert len(args) == 1, "Please pass a path to a directory"
    pdfDir = args[0]
    assert os.path.exists(pdfDir) is True, "%s does not exist" % pdfDir
    print "Searching", pdfDir
   
    if options.recursive:
        # Recursive glob solution from
        # http://stackoverflow.com/questions/2186525/use-a-glob-to-find-files-recursively-in-python
        pdfPaths = []
        for root, dirnames, filenames in os.walk(pdfDir):
            for filename in fnmatch.filter(filenames, '*.pdf'):
                pdfPaths.append(os.path.join(root, filename))
    else:
        pdfPaths = glob.glob(os.path.join(pdfDir, "*.pdf"))

    # Make the embedder script
    insertScript = EmbeddedInsertionScript()
    insertScript.install()

    # Process each PDF, looking for a DOI
    grabber = PDFDOIGrabber()
    for i, pdfPath in enumerate(pdfPaths):
        print "%i of %i" % (i + 1, len(pdfPaths))
        dois = grabber.search(pdfPath)
        if len(dois) == 0:
            print "No DOIs for", pdfPath
        else:
            for doi in dois:
                print os.path.basename(pdfPath), "=", doi
                process_token(doi, prefs, insertScript)
        # Pacing so ADS won't treat us like a reckless 'bot!
        if len(dois) > 1: time.sleep(15.)


class ADSConnector(object):
    """Receives input (token), derives an ADS url, and attempts to connect
    to the corresponding ADS abstract page with urllib2.urlopen().
    
    Tokens are tested in order of:
    
    - arxiv identifiers
    - bibcodes / digital object identifier (DOI)
    - ADS urls
    - arxiv urls
    """
    def __init__(self, token, prefs):
        super(ADSConnector, self).__init__()
        self.token = str(token)
        self.prefs = prefs
        self.adsURL = None # string URL to ADS
        self.adsRead = None # a urllib2.urlopen connection to ADS
        self.urlParts = urlparse.urlsplit(token) # supposing it is a URL

        # An arXiv identifier?
        if self._is_arxiv():
            logging.debug("ADSConnector found arXiv ID %s", self.token)
        # A bibcode from ADS?
        elif not self.urlParts.scheme and self._is_bibcode():
            logging.debug("ADSConnector found bibcode/DOI %s", self.token)
        else:
            # If the path lacks http://, tack it on because the token *must* be a URL now
            if not self.token.startswith("http://"):
                self.token = 'http://' + self.token
            self.urlParts = urlparse.urlsplit(self.token) # supposing it is a URL

            # An abstract page at any ADS mirror site?
            if self.urlParts.netloc in self.prefs.adsmirrors and self._is_ads_page():
                logging.debug("ADSConnector found ADS page %s", self.token)
            elif "arxiv" in self.urlParts.netloc and self._is_arxiv_page():
                logging.debug("ADSConnector found arXiv page %s", self.token)

    def _is_arxiv(self):
        """Try to classify the token as an arxiv article, either:
        - new style (YYMM.NNNN), or
        - old style (astro-ph/YYMMNNN)
        :return: True if ADS page is recovered
        """
        arxivPattern = re.compile('(\d{4,6}.\d{4,6}|astro\-ph/\d{7})')
        arxivMatches = arxivPattern.findall(self.token)
        if len(arxivMatches) == 1:
            arxivID = arxivMatches[0]
            self.adsURL = urlparse.urlunsplit(('http', self.prefs['ads_mirror'],
                                               'cgi-bin/bib_query', 'arXiv:%s' % arxivID, ''))
            # Try to open the ADS page
            return self._read(self.adsURL)
        else:
            return False
    
    def _is_bibcode(self):
        """Test if the token corresponds to an ADS bibcode or DOI"""

        self.adsURL = urlparse.urlunsplit(('http', self.prefs['ads_mirror'],
                                           'doi/%s' % self.token, '', ''))
        read =  self._read(self.adsURL)
        if read: 
            return read
        else:
            self.adsURL = urlparse.urlunsplit(('http', self.prefs['ads_mirror'],
                                               'abs/%s' % self.token, '', ''))
            read =  self._read(self.adsURL)
            return read

    def _is_ads_page(self):
        """Test if the token is a url to an ADS abstract page"""
        # use our ADS mirror
        url = self.urlParts
        self.adsURL = urlparse.urlunsplit((url.scheme, self.prefs['ads_mirror'],
                                           url.path, url.query, url.fragment))
        return self._read(self.adsURL)
    
    def _is_arxiv_page(self):
        """Test if the token is a url to an arxiv abstract page."""
        # get paper identifier from URL and inject into ADS query
        url = self.urlParts
        arxivid = '/'.join(url.path.split('/')[2:]),
        self.adsURL = urlparse.urlunsplit(('http', self.prefs['ads_mirror'],
                                           'cgi-bin/bib_query', 'arXiv:%s' % arxivid, ''))
        return self._read(self.adsURL)
    
    def _read(self, adsURL):
        """Attempt a connection to adsURL, saving the read to
        self.adsread.
        :return: True if successful, False otherwise
        """
        try:
            self.adsRead = urllib2.urlopen(adsURL).read()
            return True
        except urllib2.HTTPError:
            return False


class Preferences(object):
    """Manages the preferences on disk and in memory. Preferences are accessed
    with by a dictionary-like interface.
    """

    def __init__(self):
        self.prefsPath = os.path.join(os.getenv('HOME'), '.adsbibdesk')
        self._adsmirrors = ['adsabs.harvard.edu',
                            'cdsads.u-strasbg.fr',
                            'ukads.nottingham.ac.uk',
                            'esoads.eso.org',
                            'ads.ari.uni-heidelberg.de',
                            'ads.inasan.ru',
                            'ads.mao.kiev.ua',
                            'ads.astro.puc.cl',
                            'ads.on.br',
                            'ads.nao.ac.jp',
                            'ads.bao.ac.cn',
                            'ads.iucaa.ernet.in',
                            'www.ads.lipi.go.id']

        self.prefs = self._getDefaultPrefs() # Hard coded defaults dictionary
        newPrefs = self._getPrefs() # load user prefs from disk
        self.prefs.update(newPrefs) # override defaults with user prefs
        self._keys = self.prefs.keys()
        self._iterIndex = -1

    def __getitem__(self, key):
        return self.prefs[key]

    def __setitem__(self, key, value):
        self.prefs[key] = value
        self._keys = self.prefs.keys()

    def __iter__(self):
        return self

    def next(self):
        if self._iterIndex == len(self._keys)-1:
            self._iterIndex = -1
            raise StopIteration
        self._iterIndex += 1
        return self._keys[self._iterIndex]

    def _getDefaultPrefs(self):
        """:return: a dictionary of the full set of default preferences. This
        is done in case the user's preference file is missing a key-value pair.
        """
        return {"ads_mirror": "adsabs.harvard.edu",
                "arxiv_mirror": None,
                "download_pdf": True,
                "ssh_user": None,
                "ssh_server": None,
                "debug": False,
                "log_path": os.path.expanduser("~/.adsbibdesk.log")}

    def _getPrefs(self):
        """Read preferences files from `self.prefsPath`, creates one otherwise."""
        prefs = {}
        # create a default preference file if non existing
        if not os.path.exists(self.prefsPath):
            self._writeDefaultPrefs()

        for l in open(self.prefsPath):
            if l.strip() and not l.strip().startswith('#'):
                if '=' not in l:
                    # badly formed setting
                    continue
                k, v = l.strip().split('=')
                if not v:
                    v = None
                elif v.strip().lower() in ('true', 'yes'):
                    v = True
                elif v.strip().lower() in ('false', 'no'):
                    v = False
                elif v.strip().lower() == 'none':
                    v = None
                prefs[k] = v

        return prefs

    def _writeDefaultPrefs(self):
        """
        Set a default preferences file (~/.adsbibdesk)
        """
        prefs = open(self.prefsPath, 'w')
        print >> prefs, """# ADS mirror
ads_mirror=%s

# arXiv mirror
# (leave it unset to use the arXiv mirror pointed by your ADS mirror)
arxiv_mirror=%s

# download PDFs?
download_pdf=%s

# set these to use your account on a remote machine for fetching
# (refereed) PDF's you have no access locally
ssh_user=%s
ssh_server=%s""" % (self.prefs['ads_mirror'], self.prefs['arxiv_mirror'],
                    self.prefs['download_pdf'], self.prefs['ssh_user'],
                    self.prefs['ssh_server'])

        prefs.close()

    @property
    def adsmirrors(self):
        return self._adsmirrors


class BibTex:

    def __init__(self, url):
        """
        Create BibTex instance from ADS BibTex URL
        """
        bibtex = urllib2.urlopen(url).readlines()
        bibtex = ' '.join([l.strip() for l in bibtex]).strip()
        bibtex = bibtex[re.search('@[A-Z]+\{', bibtex).start():]
        self.type, self.bibcode, self.info = self.parsebib(bibtex)

    def __str__(self):
        return ','.join(['@' + self.type + '{' + self.bibcode] + ['%s=%s' % (i, j) for i, j in self.info.items()]) + '}'

    def parsebib(self, bibtex):
        """
        Parse bibtex code into dictionary
        """
        r = re.search('(?<=^@)(?P<type>[A-Z]+){(?P<bibcode>\S+)(?P<info>,.+)}$', bibtex)
        s = re.split('(,\s\w+\s=\s)', r.group('info'))
        info = dict([(i[1:].replace('=', '').strip(), j.strip()) for i, j in zip(s[1::2], s[2::2])])
        return r.group('type'), r.group('bibcode'), info


class ADSException(Exception):
    pass


class ADSHTMLParser(HTMLParser):

    def __init__(self, *args, **kwargs):
        HTMLParser.__init__(self)
        self.links = {}
        self.tag = ''
        self.get_abs = False
        self.entities = {}

        self.bibtex = None
        self.abstract = None
        self.title = ''
        self.author = []

        self.prefs = kwargs.get('prefs', Preferences()).prefs

    def mathml(self):
        """
        Generate dictionary with MathML -> unicode conversion from
        http://www.w3.org/Math/characters/byalpha.html
        """
        w3 = 'http://www.w3.org/Math/characters/byalpha.html'
        mathml = re.search('(?<=<pre>).+(?=</pre>)', urllib2.urlopen(w3).read(), re.DOTALL).group()
        entities = {}
        for l in mathml[:-1].split('\n'):
            s = l.split(',')
            #ignore double hex values like 'U02266-00338'
            if '-' not in s[1]:
                #hexadecimal -> int values, for unichr
                entities[s[0].strip()] = int(s[1].strip()[1:], 16)
        return entities

    def parse_at_url(self, url):
        """Helper method to read data from URL, and passes on to parse()."""
        try:
            htmlData = urllib2.urlopen(url).read()
        except urllib2.URLError, err:
            logging.debug("ADSHTMLParser timed out on URL: %s", url)
            raise ADSException(err)
        self.parse(htmlData)
    
    def parse(self, htmlData):
        """
        Feed url into our own HTMLParser and parse found bibtex

        htmlData is a string containing HTML data from ADS page.
        """
        cleanContent = self._preprocess_html(htmlData)
        self.feed(cleanContent)

        logging.debug("ADSHTMLParser found links: %s", str(self.links))

        if 'bibtex' in self.links:
            self.bibtex = BibTex(self.links['bibtex'])
            self.title = re.search('(?<={).+(?=})', self.bibtex.info['title']).group().replace('{', '').replace('}', '')
            self.author = [a.strip() for a in
                           re.search('(?<={).+(?=})', self.bibtex.info['author']).group().split(' and ')]

    def _preprocess_html(self, htmlData):
        """Cleans ADS HTML for compatibility with HTMLParser.
        
        ADS inserts bibliographic data into the HTML header as meta data,
        but does not escape these fields. The HTMLParser in Python 2.7.1
        chokes on such bad HTML. Here we simply delete these metadata fields.
        """
        cleanData = "\n".join([line for line in htmlData.split("\n")
            if not line.startswith("<meta name")])
        return cleanData
    
    def handle_starttag(self, tag, attrs):
        #abstract
        if tag.lower() == 'hr' and self.get_abs:
            self.abstract = self.tag.strip().decode('utf-8')
            self.get_abs = False
            self.tag = ''
        #handle old scanned articles abstracts
        elif tag.lower() == 'img' and self.get_abs:
            self.tag += dict(attrs)['src'].replace('&#38;', unichr(38))
        #links
        elif tag.lower() == 'a':
            if 'href' in dict(attrs):
                href = dict(attrs)['href'].replace('&#38;', unichr(38))
                query = cgi.parse_qs(urlparse.urlsplit(href).query)
                if 'bibcode' in query:
                    if 'link_type' in query:
                        self.links[query['link_type'][0].lower()] = href
                    elif 'data_type' in query:
                        self.links[query['data_type'][0].lower()] = href

    def handle_data(self, data):
        if self.get_abs:
            self.tag += data.replace('\n', ' ')

        #beginning of abstract found
        if data.strip() == 'Abstract':
            self.get_abs = True

    #handle html entities
    def handle_entityref(self, name):
        if self.get_abs:
            if name in name2codepoint:
                c = name2codepoint[name]
                self.tag += unichr(c).encode('utf-8')
            else:
                #fetch mathml
                if not self.entities:
                    #cache dict
                    self.entities = self.mathml()
                if name in self.entities:
                    c = self.entities[name]
                    self.tag += unichr(c).encode('utf-8')
                else:
                    #nothing worked, leave it as-is
                    self.tag += '&' + name + ';'

    #handle unicode chars in utf-8
    def handle_charref(self, name):
        if self.get_abs:
            self.tag += unichr(int(name)).encode('utf-8')

    def getPDF(self):
        """
        Fetch PDF and save it locally in a temporary file.
        Tries by order:
        - refereed article
        - refereed article using another machine (set ssh_user & ssh_server)
        - arXiv preprint
        - electronic journal link
        """
        if not self.links:
            return 'failed'
        elif 'download_pdf' in self.prefs and not self.prefs['download_pdf']:
            return 'not downloaded'

        def filetype(filename):
            return sp.Popen('file %s' % filename, shell=True,
                            stdout=sp.PIPE,
                            stderr=sp.PIPE).stdout.read()

        # refereed
        if 'article' in self.links:
            url = self.links['article']
            if "MNRAS" in url: # Special case for MNRAS URLs to deal with iframe
                parser = MNRASParser(self.prefs)
                try:
                    parser.parse(url)
                except MNRASException:
                    # this probably means we have a PDF directly from ADS, just continue
                    pass
                if parser.pdfURL is not None:
                    url = parser.pdfURL

            # try locally
            pdf = tempfile.mktemp() + '.pdf'
            # test for HTTP auth need
            try:
                open(pdf, 'wb').write(urllib2.urlopen(url).read())
            except urllib2.HTTPError:
                # dummy file
                open(pdf, 'w').write('dummy')

            if 'PDF document' in filetype(pdf):
                return pdf

            # try in remote server
            # you need to set SSH public key authentication for this to work!
            elif 'ssh_user' in self.prefs and self.prefs['ssh_user'] is not None:
                pdf = tempfile.mktemp() + '.pdf'
                cmd = 'ssh %s@%s \"touch adsbibdesk.pdf; wget -O adsbibdesk.pdf \\"%s\\"\"' % (self.prefs['ssh_user'], self.prefs['ssh_server'], url)
                cmd2 = 'scp -q %s@%s:adsbibdesk.pdf %s' % (self.prefs['ssh_user'], self.prefs['ssh_server'], pdf)
                sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE).communicate()
                sp.Popen(cmd2, shell=True, stdout=sp.PIPE, stderr=sp.PIPE).communicate()
                if 'PDF document' in filetype(pdf):
                    return pdf

        # arXiv
        if 'preprint' in self.links:
            # arXiv page
            url = self.links['preprint']
            mirror = None
            for line in urllib2.urlopen(url):
                if '<h1><a href="/">' in line:
                    mirror = re.search('<h1><a href="/">(.*ar[xX]iv.org)', line)
                elif 'dc:identifier' in line:
                    begin = re.search('dc:identifier="', line).end()
                    url = urlparse.urlsplit(line[begin:-2].replace('&#38;', unichr(38)).lower())
                    # use automatic mirror chosen by the ADS mirror
                    if ('arxiv_mirror' not in self.prefs or not self.prefs['arxiv_mirror']) and mirror is not None:
                        url = urlparse.urlunsplit((url.scheme, mirror.group(1), url.path, url.query, url.fragment))
                    elif self.prefs['arxiv_mirror']:
                        url = urlparse.urlunsplit((url.scheme, self.prefs['arxiv_mirror'], url.path, url.query, url.fragment))
                    # get arXiv PDF
                    pdf = tempfile.mktemp() + '.pdf'
                    open(pdf, 'wb').write(urllib2.urlopen(url.replace('abs', 'pdf')).read())
                    if 'PDF document' in filetype(pdf):
                        return pdf
                    else:
                        return url

        #electronic journal
        if 'ejournal' in self.links:
            return self.links['ejournal']

        return 'failed'

    
def test_mnras():
    prefs = Preferences()
    prefs['debug'] = True
    
    data = '<iframe id="pdfDocument" src="http://onlinelibrary.wiley.com/store/10.1111/j.1365-2966.2010.18174.x/asset/j.1365-2966.2010.18174.x.pdf?v=1&amp;t=gp75eg4q&amp;s=c7ec3f26d269f5f4187799ff6faf44ebe01bbb01" width="100%" height="100%"></iframe>'
    parser = MNRASParser(prefs)
    # parser.parse(mnrasURL)
    parser.feed(data)
    print parser.pdfURL


class MNRASException(Exception):
    pass


class MNRASParser(HTMLParser):
    """Handle MNRAS refereed article PDFs.
    
    Unlike other journals, the ADS "Full Refereed Journal Article" URL for a
    MNRAS article points to a PDF embedded in an iframe. This class extracts
    the PDF url given the ADS link.
    """
    def __init__(self, prefs):
        HTMLParser.__init__(self)
        self.prefs = prefs
        self.pdfURL = None
    
    def parse(self, url):
        """Parse URL to MNRAS PDF page"""
        try:
            self.feed(urllib2.urlopen(url).read())
        except urllib2.URLError, err: # HTTP timeout
            logging.debug("MNRASParser timed out: %s", url)
            raise MNRASException(err)
        except HTMLParseError, err:
            raise MNRASException(err)
    
    def handle_starttag(self, tag, attrs):
        """
        def get_mnras_pdf(url):
           soup = BeautifulSoup(urllib2.urlopen(url))
           pdfurl = soup.find('iframe')['src']
           open('mnras.pdf', 'wb').write(urllib2.urlopen(pdfurl).read())
        """
        if tag.lower() == "iframe":
            attrDict = dict(attrs)
            self.pdfURL = attrDict['src']


class EmbeddedInsertionScript(object):
    """Manages the bibdesk insertion applescript.
    
    In order to make adsbibdesk.py a single-file installation (and Automator
    action) we need this Python script to spawn the AppleScript interface to
    BibDesk itself. The `build.py` script is responsible for embedding
    adsbibdesk.applescript into this Python script.
    """
    def __init__(self):
        super(EmbeddedInsertionScript, self).__init__()
        self._txtData = "eJylWW1v28gR/mz9igU/JBIqWXYORRv3LndKbCduU8ewHVyDujBW5FJiTHLZXdKK4Pi/95nZ5ZslX+LUCBCSOzM788zrriaT2eGFKLV4ncwPlb0Rk4mITRKWic5TZa1IskKbUuhYEGFRzdMklLSKpbzlG0wmb3SxNsliWYrhm5F4sbf3UojzKhFnyqjESPGzqZLdwr38tshkku6GOnsFTvy7XCZWFEYvjMwEHmOjlLA6LlfSqAOx1pUIZS6MihJbmmRelUokpZB5NNVGZDpK4jXE4FOVR8qIcqlEqUxmSXN6eXv6UbxVuTIyFWdshXifhCq3Skjr7LJLFYk5iSGGY9LgwmsgjjXkst1joRKsG3GrjMW7+KnewssbC20gYyhLUtsIXRDbCLquRSrLlnN3u+mthREgZslLXcCcJSTCwFWSpmKuRGVVXKVjSACt+P3k8t2Hj5didvpJ/D47P5+dXn76G2jLpcaqulVOEtyZJhAMo4zMyzV0h4B/Hp2/eQeO2euT9yeXn2CAOD65PD26uBDHH87FTJzNzi9P3nx8PzsXZx/Pzz5cHO0KcaFIKQX+PwA3ZvcAwUiVcLn1Rn+CQy1USyOxlLcKjg1VcgvFpAgRR9/2GmTIVOcLNhG0LYbQ7CQWuS7HwkLDn5dlWRxMp6vVaneRV7vaLKapk2Gnr7w6QryWFrvr/IDfZkWRqovQJEVJyRHqHD4rRaRXeaplBErEfam+UAAKOYfDZAjn5LA24yghLtkklcpLs95lyX/XuYQnc3GRhDdj8dm/Wrz9lsmQUmIsZtWisiXl0F8G4PIWxFqndqXNDYAriHAqSU3Lak5B/Nfp3p+nMp90Pk+MKlIZqgmULSeZQjhE04GmTOLvl/g8jJM8GtdfgFo1/6zCcjTYsaoEqur28uTQkkUkBCGoMngzTTI8GU6wDlyO6TsISR5t7Bj8nr1NmMMvDJ4k15uyIToIxLNG4lME1iBADaPKyuSNFIUA6GBJ7hLeX/N1anZzVU5/gnP296d7L7sOmyB0JgujVyn7g59OdZnEvr4Oy6QEbWnGUM2GeIBDIJxD3acOBYNLgLfEztHIT+9UikoLIwY7pUK5oH29XBFcrC0ZfISaUNqAzCMkrlkDfltSxc9lpggJlA6zptwKqR2slhpJHRolSyR2uUZdgjrB2/Pz82Cwk8RiGKJScr9wAkfildijXM4HO4y4ttI2uAYDCIamKEQy7ZlvxYEQd1dBnUJ5Z+0quG/5VC7nqYp+iLeGGPRXHBvZupcYAb6ORXB1xf/X1CMQYqUV4/3zfVJqZ3ohgw3/OBlISmN9aHbAdEwCf0Yt0CjQTdC++uze7KuAMO0ZbzdRjlQsq7R8QLYNVN6VydYu5jhCHkPZodUiHPmwJxVrvLpq94WBvzCJNgnQ3Wu6mC1RJdeccoTaIPAhhcadpm2yBm2ITZQgMP9baeqmVJ0JzHb9GS2/EtNI3U7zCmg9I5nJIsfW6C1dBY2yhaamAYKdSLtNhZfT0QDLpGAtY+DeYtZ6I8m5XIRLFd5wozw7PEYOA0jvBkRYVIXQvLKkzhmqUKJW1J5nodFzWRI7+8JqwOdUyuQCbRJCBMaDFEEO/FVENWYp7ayVPkRSHiepQllBd/JQhlnEENIIki/sNvQ8G2P3FSapQojJkXj+RuclVZR/i//c7Y3376+uhs8JTQ7wTBFsXHGX6kNVFhW76iGO2N7jxw7egdLKGMxTW3iDwGHLyvuq3BKgKhEEICKavuWM+iKhmUiiB8GWISCNdYrJcSS8KCkwEZbU+jG35ORJat88lRSSRg6N5ISLMB2i3kUJJhiUxPXYkdESh3Qdu/SBt0JFwOzBgyTKdUvdX+y0ERpcZKrCzzLUc4uxkTr/Ss1TvZj+Wvzy4sVL8i30sOqahAypQ/CT7+A+xa87HQ7Kdzrcc7u1D3K7QUl3SLF+QFV6oLimC4qTgiftJMeMFhwECEUqSdyFCUiW4lkYEeltdS/swEpt4MjNJJWdEtjaRSJcgI7EL7xnp7982646eHxI1ULxdejGjn18NbpaLMXkxaM7j9rnJsefosBBMGgVcNZfw3pWozMCPV2XRigD3BMnJvvfJwMuqxsDMiPXbWBSCKisQF3e2K+l8QAncUcRn0goKbs/4q/dnsN6W1FKcqS0zLqzc5dvE5JNhBvC/9PDm2nXlKm7B04ft5uOHxh4z/Wrze4BJbuVsTpUqSop2RUORj7Tb1RRoofw9nf3fvaLdFhllFvs+94k6LttU6WdMKrUCACZruTaQiYKPAvNpLlBH0AZdU0QlR9jYH0Ac00BB9ZhmuRER9rWAyNOjCf/cnWzjYNrlnQdcBHoUMBg66QGu0UUByPajO0jKGBD18wOX9ubiMEP5tDQ2WNUpnHQJL4fVz7SyvUVb8VTjQCsjOdchjdV4dYvbpKsBZWThiKq/dxtuk1bo7zEnPmgpW9FY1Sn287OltPAcUIXJoFbrrvsRRXHyRfCd7+zcBefQfJYxKccqPHRl/KeaKBHr/3UY8WON5mOeegbuc4n6guaKs0yHgGfoDuNR/wL7+doTn3O8q40dTSQP+uo+szjXPMnfAUghkPWmZYPiKEVORKsi3Xo0EspNlXoYdG+/KnBpR9mzt4QB/qF6nS5YT0j892NOzekdEiIFUpA6K6yqlLT3UGIEX096qBeH8Rck/JxAG1aUzrE38yQx/Fo9ZcpejjGKvYtJQ1V2F1LARn77PLb0bczPw31BTfeYr7aLfBKw+Lg78TmFnN78r/p/+5Odb2un9086Q2MuHa6qlYPaG0GqtSq78sWL6frmS07tqr0gqVD4+XU5bdd8f2idif3grb4cy8wFXpyjpG3N+wd/tCsl/vJ2U/FHDN0BZkV084gLIJCrukajM7EiIxMlTKSpeSROG/vVpw4Hqv4qZ013Hoc0yJyIefDjwz5hsGFq5s2W7P8jm6sxP/glc6SJw1dX79+9WPXHYF9eEwuYysu6aDKT7MKAWHco7/Z4xfk8BHd491vXlJ57QY7YUoNw1sS/8i40Dpu8KT+DfdZJU24tA7MlGBauyQj5s7NvWs76gsMo+MhPGi5xPKBndqYu3qQjIOfnh5OFp2rnqHbl7etgRz1rn2gnI9wHAOEztWWBozA5/67TVjdF+sNnWZ1X6Txx21Hurflo7WhJWynj8bRnQL00FD0tW2TFv1hafPGLjisnHNUD243eUR8BxT4qsWGudscPxr8QZXAZ+dhGcFAtepKb0/GZB7fqXbvjYb+95vY6KwbxKPNkc+LESF4AQNX/wXfvdPpv/namYkdT7Z+g7V/OIaazM+Q5G8byjynm32D5pa6Gssngya9IBCL9bjkj71B65hM3ii22w9nH8/fO1quOsOupM7JUDbdsGXjzZv6XlvcXOC7Hr8pbNDzD080Kf20UeXRr605vpxsjH6tIeS/YafAtUwj7uhqkeRcYVuleRB1zBV717eY+ibYD4SuYo8F/RQEh9HvUzIni2uDRV/Heo6MJd6ijorktBxlgVTFVMJjlDj8cML60D24pWpCmIPGKFHfDs/lvFNM2pS9lWnlE1IhGIJIJwEd2oPuVPptB9d6f49/H6RS4zQyiQLS37B1we0WI18BqBo1fXeb8/wie66vSCP2kX7fPHHX5ToyeKSinPaznWx4vJIQOu7+2WfkaOB+n6jywf8AT9PcMA=="
        self.version = 2  # serializes the current version of this script
        self.compiledPath = os.path.expanduser(
                "~/.adsbibdesk_injector_%i.scpt" % self.version)

    def install(self):
        """Install the compiled script"""
        self._clean_old_scripts()
        if os.path.exists(self.compiledPath):
            return
        # Write the embedded applescript to a temp file
        scptTxt = zlib.decompress(binascii.a2b_base64(self._txtData))
        txtPath = tempfile.mktemp() + '.applescript'
        tmpFile = open(txtPath, 'w')
        tmpFile.write(scptTxt)
        tmpFile.close()
        cmd = "osacompile -o %s %s" % (self.compiledPath, txtPath)
        # print "Compiling applescript via:"
        # print cmd
        subprocess.call(cmd, shell=True)

    def _clean_old_scripts(self):
        """Find all injector scripts, and make sure that only the current one
        exists
        """
        paths = glob.glob(os.path.expanduser("~/.adsbibdesk_injector*.scpt"))
        for p in paths:
            if p == self.compiledPath:
                continue
            else:
                os.remove(p)


class PDFDOIGrabber(object):
    """Converts PDFs to text (via pdf2json) and attempts to match all DOIs
    in that text.
    """
    def __init__(self):
        super(PDFDOIGrabber, self).__init__()
        regstr = r'\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'<>])\S)+)\b'
        self.pattern = re.compile(regstr)

    def search(self, pdfPath):
        """Return a list of DOIs in the text of the PDF at `pdfPath`"""
        jsonPath = os.path.splitext(pdfPath)[0] + ".json"
        if os.path.exists(jsonPath): os.remove(jsonPath)
        subprocess.call("pdf2json -q %s %s" % (pdfPath, jsonPath), shell=True)
        f = open(jsonPath, 'r')
        data = f.read()
        f.close()
        doiMatches = self.pattern.findall(data)
        if os.path.exists(jsonPath): os.remove(jsonPath)
        return doiMatches


if __name__ == '__main__':
    main()