#!/usr/bin/env python

import argparse
from eulxml.xmlmap import load_xmlobject_from_file
from eulxml.xmlmap.eadmap import EncodedArchivalDescription as EAD
from eulxml.xmlmap.teimap import Tei, TEI_NAMESPACE
from requests.exceptions import HTTPError
import sys

from namedropper import spotlight, util


class LowerCaseAction(argparse.Action):
    """argparse action that stores the supplied option value lower-cased."""

    def __call__(self, parser, namespace, value, option_string=None):
        # normalize case first, then store under this action's destination name
        lowered = value.lower()
        setattr(namespace, self.dest, lowered)


class LookupNames(object):
    """Command-line script wrapper for named-entity lookup.

    Instantiating the class runs the whole script: command-line arguments
    are parsed, a DBpedia Spotlight client is created, the input file
    (plain text, EAD or TEI) is processed, recognized names are printed to
    stdout, and a short summary of API activity is written to stderr.
    Fatal errors are reported on stderr and end the process with exit
    status -1.
    """

    # Text strings already submitted for annotation, used to skip exactly
    # repeated text. Class-level default only; __init__ gives every instance
    # its own set so that state is not shared between instances.
    _queried_text = set()

    # NOTE: the attributes below are not referenced anywhere in this class;
    # they are retained in case other code relies on them.
    _unique_names = set()
    _viafids = {}
    _dbpedia_viaf_lookups = []

    def __init__(self):
        """Parse command-line arguments, init spotlight client and run the script."""
        parser = self._build_parser()
        self.args = parser.parse_args()

        # per-instance record of text that has already been looked up
        self._queried_text = set()

        # auto-detect input type if not specified
        if not self.args.input:
            self.args.input = util.autodetect_file_type(self.args.filename)
            # exit if we still don't have an input type
            if not self.args.input:
                self._fail('Could not determine document input type; please specify with --input')

        # --input is lower-cased by LowerCaseAction; normalize here as well so
        # that an auto-detected type is compared consistently everywhere
        self.args.input = self.args.input.lower()

        if self.args.input == 'tei' and not self.args.tei_xpath:
            sys.stderr.write('Error! --tei-xpath is required when input document is TEI\n\n')
            parser.print_help()
            sys.exit(-1)

        # for now, script only has a single mode: output a list of recognized names and URIs
        self.sc = spotlight.SpotlightClient(
            base_url=self.args.dbpedia_url,
            confidence=self.args.confidence,
            support=self.args.support,
            types=self.args.types)

        # TODO: check up front that the file exists and is readable
        # (file type autodetection may still fail messily on a missing file)

        if self.args.input == 'text':
            self._process_text()
        else:
            # TEI or EAD
            self._process_xml()

        self._report_api_usage()

    def _build_parser(self):
        """Return the argparse parser defining all command-line options."""
        parser = argparse.ArgumentParser(description='Look up named entities in a file.')
        parser.add_argument('filename', metavar='INPUT_FILE', type=str,
            help='name of the file to be processed')
        parser.add_argument('--input', metavar='INPUT_TYPE', type=str,
            help='type of file to be processed (%(choices)s)',
            action=LowerCaseAction,
            choices=['EAD', 'ead', 'tei', 'TEI', 'text'])
        parser.add_argument('--unique', action='store_true',
            help='only list unique names found anywhere in the content')
        parser.add_argument('--viaf', action='store_true', dest='viaf_lookup', default=False,
            help='look up VIAF identifiers for recognized Person entities')

        # tei option (required when input is TEI; enforced after parsing)
        tei_opts = parser.add_argument_group('TEI options')
        tei_opts.add_argument('--tei-xpath', type=str, metavar='XPATH',
            help='xpath for TEI section-level content to be processed (use t: for tei namespace)')

        # dbpedia-specific options
        spotlight_opts = parser.add_argument_group('DBpedia Spotlight options')
        spotlight_opts.add_argument('--url', '-u', metavar='URL', type=str,
            default=spotlight.SpotlightClient.default_url, dest='dbpedia_url',
            help='URL for DBpedia Spotlight service (default: %(default)s )')
        spotlight_opts.add_argument('--confidence', '-c', metavar='N', type=float, default=0.4,
            help='minimum confidence score (default: %(default)s)')
        spotlight_opts.add_argument('--support', '-s', metavar='N', type=int, default=20,
            help='minimum support score (default: %(default)s)')
        spotlight_opts.add_argument('--scores', default=False, action='store_true',
            help='Display similarity score and support numbers (ignored if --unique is used)')
        spotlight_opts.add_argument('--types', '-t', metavar='TYPES', type=str, default='',  # Person,Place,Organisation',
            help='restrict to specific types of resources, e.g. Person,Place,Organization')  # (default: %(default)s)')
        # NOTE! restricting to person/place/org leaves out literary prizes, which are otherwise being
        # recognized; check if these can be tagged/identified in EAD for inclusion
        # - probably do want to exclude dates (don't seem to be recognized in a useful way...)
        return parser

    def _fail(self, message):
        """Write a fatal error message to stderr and exit with status -1."""
        sys.stderr.write('%s\n' % message)
        sys.exit(-1)

    def _process_text(self):
        """Look up names in a plain-text input file."""
        # NOTE: would probably need to be read / processed in chunks
        # if we want to handle text files of any significant size
        try:
            with open(self.args.filename) as txtfile:
                text = txtfile.read()
        except IOError as err:
            self._fail('Error reading %s: %s' % (self.args.filename, err))

        if self.args.unique:
            self.unique_names(text)
        else:
            self.list_names(text)

    def _process_xml(self):
        """Look up names section by section in an EAD or TEI input file."""
        xmlobj = self.init_xml_object()

        if self.args.unique:
            # NOTE: some duplication here of unique_names method,
            # to allow aggregating results from multiple calls
            names = set()
            for _label, text_list in self.get_xml_sections(xmlobj):
                for txt in text_list:
                    # skip look-ups on exactly repeated text
                    if txt in self._queried_text:
                        continue
                    names.update((r['surfaceForm'], r['URI'])
                                 for r in self.get_names(txt))

            for name, uri in sorted(names):
                self.print_name(name, uri)
            return

        print('Looking up names by section')
        for label, text_list in self.get_xml_sections(xmlobj):
            print('\n%s' % label)
            for txt in text_list:
                # for EAD only, skip look-ups on exactly repeated text
                # (e.g. "undated correspondence" or "miscellaneous invitations")
                # and print the text so the user can compare original and
                # recognized names
                if self.args.input == 'ead':
                    if txt in self._queried_text:
                        continue
                    print(txt)

                # not printing text or checking for repeated text in TEI mode,
                # since content is likely to be much longer and less repetitive
                self.list_names(txt)

    def _report_api_usage(self):
        """Write a brief summary of API call activity to stderr."""
        calls = self.sc.total_api_calls
        sys.stderr.write('\nMade %d API call%s in %s\n' % (
            calls, 's' if calls != 1 else '', self.sc.total_api_duration))

    def init_xml_object(self):
        """Load the input file as an xmlobject matching the selected input type.

        Returns the loaded EAD or TEI xmlobject. Exits the script with an
        error message if the input type is not an XML type or if the file
        cannot be loaded.
        """
        if self.args.input == 'ead':
            xmlobj_class = EAD
        elif self.args.input == 'tei':
            xmlobj_class = Tei
        else:
            # previously fell through with xmlobj_class unbound
            self._fail('Unsupported XML input type: %s' % self.args.input)

        try:
            return load_xmlobject_from_file(self.args.filename, xmlobj_class)
        except Exception as err:
            # top-level boundary: any load/parse failure is reported and fatal
            self._fail('Error loading %s as XML: %s' % (self.args.filename, err))

    def get_names(self, text):
        """Run spotlight annotation on a text string.

        Records the text as queried and returns the list stored under the
        'Resources' key of the annotation result, or an empty list when
        there is no result or no such key. Exits the script on HTTPError.
        """
        self._queried_text.add(text)

        try:
            results = self.sc.annotate(text)
        except HTTPError as err:
            # for now, assume any error means the service is unavailable
            # and exit the script (could check status for more fine-grained response)
            self._fail('Error accessing DBpedia Spotlight -- %s' % err)

        if not results or 'Resources' not in results:
            return []
        return results['Resources']

    def list_names(self, text):
        """Annotate a text string and print every identified resource.

        Prints 'No resources identified' when nothing is found; when VIAF
        lookup is enabled, prints any VIAF id found under the resource.
        """
        results = self.get_names(text)

        if not results:
            print('No resources identified')
            return

        # NOTE: dbpedia annotate result is per offset within the text, so
        # may include duplicates - e.g., different "surfaceForm" text variants
        # for the same URI, or same exact text and URI
        for resource in results:
            self.print_name(resource['surfaceForm'], resource['URI'],
                resource['similarityScore'], resource['support'])
            if self.args.viaf_lookup:
                viafid = util.get_viafid(resource)
                if viafid:
                    print('  %s' % viafid)

    def unique_names(self, text):
        """Annotate a text string and print each distinct (name, URI) pair, sorted."""
        names = sorted(set((r['surfaceForm'], r['URI']) for r in self.get_names(text)))
        for name, uri in names:
            self.print_name(name, uri)

    def print_name(self, name, uri, similarity_score=None, support=None):
        """Print a name and its URI, optionally followed by its scores.

        Scores are included only when args.scores is set and both values
        were supplied. The check is against None rather than truthiness so
        that a legitimate value of 0 is still displayed.
        """
        has_scores = similarity_score is not None and support is not None
        if self.args.scores and has_scores:
            print('%s  %s (%.2f, %s)' % (name.ljust(40), uri, float(similarity_score), support))
        else:
            print('%s  %s' % (name.ljust(40), uri))

    def get_xml_sections(self, xmlobj):
        """Return an iterable of (label, list of text) for the selected input type.

        Dispatches to the EAD or TEI section generator; returns an empty
        list for any other input type so callers can always iterate.
        """
        input_type = self.args.input.lower()
        if input_type == 'ead':
            return self.get_ead_sections(xmlobj)
        if input_type == 'tei':
            return self.get_tei_sections(xmlobj)
        return []

    def get_ead_sections(self, ead):
        """Generator: yield tuples of section label, list of strings for an EAD."""
        # biographical statement
        # NOTE(review): assumes the xmlobject field is None when the element
        # is absent -- confirm against eulxml; skipped in that case
        bioghist = ead.archdesc.biography_history
        if bioghist is not None:
            yield (unicode(bioghist.head),
                [unicode(p) for p in bioghist.content])
        # note: beware that using unicode on xmlmap elements normalizes whitespace
        # (good for lookup, bad for annotating original xml)

        dsc = ead.dsc
        if dsc is None:
            # no description of subordinate components: nothing more to return
            return

        # return sections for series/subseries
        if dsc.c and dsc.c[0].c:
            for c01 in dsc.c:
                for section in self.get_ead_component_sections(c01):
                    yield section
        # return elements for findingaid with a single container list
        else:
            yield ('Container List',
                [unicode(c.did.unittitle) for c in dsc.c])

    def get_ead_component_sections(self, cseries):
        """Recursive generator for c01/c02 series/subseries elements.

        Yields the scope and content note (if any), then recurses into
        subseries, or yields the unit titles of the direct child components
        when there are no subseries.
        """
        series_title = unicode(cseries.did.unittitle)
        if cseries.scope_content:
            yield ('%s : %s' % (series_title, unicode(cseries.scope_content.head)),
                [unicode(p) for p in cseries.scope_content.content])
        if cseries.hasSubseries():
            for subseries in cseries.c:
                for section in self.get_ead_component_sections(subseries):
                    yield section
        else:
            yield ('%s: item descriptions' % series_title,
                [unicode(c.did.unittitle) for c in cseries.c])

    def get_tei_sections(self, tei):
        """Generator: yield tuples of heading and list of text for TEI sections.

        Sections are selected with the user-specified xpath (t: prefix bound
        to the TEI namespace). Exits the script if the xpath cannot be
        evaluated.
        """
        tei_ns = {'t': TEI_NAMESPACE}
        try:
            sections = tei.node.xpath(self.args.tei_xpath, namespaces=tei_ns)
        except Exception:
            # not a bare except: SystemExit / KeyboardInterrupt must propagate
            self._fail("Error evaluating XPath '%s'" % self.args.tei_xpath)

        # find the first heading or title
        # for now, only looking in TEI namespace; may want to revise to support
        # un-namespaced TEI xml
        headings = ['docTitle', 'head']
        xpath_headings = '|'.join('.//t:%s' % tag for tag in headings)
        # same expression for every section, so build it once
        head_xpath = 'normalize-space((%s)[1])' % xpath_headings

        for section in sections:
            head = section.xpath(head_xpath, namespaces=tei_ns)
            # NOTE: needs to be a list of text to match ead sections, even though
            # for now there is only one text element
            yield (head, [section.xpath('normalize-space(.)')])


if __name__ == '__main__':
    # instantiating LookupNames parses the command-line arguments and runs
    # the whole lookup; the instance itself is not needed afterwards
    LookupNames()
