#!/usr/bin/env python

import argparse
from eulxml.xmlmap import load_xmlobject_from_file
from eulxml.xmlmap.eadmap import EncodedArchivalDescription as EAD
from eulxml.xmlmap.teimap import Tei, TEI_NAMESPACE
from requests.exceptions import HTTPError
import sys
import unicodecsv

from namedropper import spotlight, util


class LowerCaseAction(argparse.Action):
    """Argparse action that stores the supplied argument value lower-cased."""

    def __call__(self, parser, namespace, value, option_string=None):
        # normalize the case first, then store it under the option's
        # destination name on the parsed-arguments namespace
        lowered = value.lower()
        setattr(namespace, self.dest, lowered)


class LookupNames(object):
    """Command-line driver that looks up named entities in a file.

    Instantiating the class runs the whole script: it parses ``sys.argv``,
    sends the requested content (plain text, EAD or TEI) to DBpedia
    Spotlight, prints the recognized names and, when requested, writes a
    CSV report and/or an updated XML document.

    Prints use the single-argument ``print(...)`` form and
    ``sys.stderr.write`` so the output is unchanged under Python 2.
    """

    # text strings already sent to Spotlight; used to skip look-ups on
    # exactly repeated text (re-bound per instance in __init__ so that
    # instances do not share state through the class attribute)
    _queried_text = set()

    # not referenced elsewhere in this class; kept for backward compatibility
    _unique_names = set()
    _viafids = {}
    _dbpedia_viaf_lookups = []

    # number of characters on either side of an entity in the
    # text to include in the context column of the CSV file
    __context_pad = 100

    def __init__(self):
        """Parse command-line arguments and run the requested look-ups.

        Exits the process with status -1 when the input type cannot be
        determined, when TEI input is given without ``--tei-xpath``, or
        when the CSV output file cannot be opened.
        """
        parser = self._build_parser()
        self.args = parser.parse_args()

        # per-instance state
        self._queried_text = set()
        self.last_results = None
        self.csv_writer = None

        # auto-detect input type if not specified
        if not self.args.input:
            self.args.input = util.autodetect_file_type(self.args.filename)
            # exit if we still don't have an input type
            if not self.args.input:
                sys.stderr.write('Could not determine document input type; please specify with --input\n')
                sys.exit(-1)

        if self.args.input == 'tei' and not self.args.tei_xpath:
            sys.stderr.write('Error! --tei-xpath is required when input document is TEI\n\n')
            parser.print_help(sys.stderr)
            sys.exit(-1)

        # there is no XML document to write out for plain-text input;
        # say so up front instead of failing after all look-ups are done
        if self.args.output_file and self.args.input == 'text':
            sys.stderr.write('Warning: --output is only supported for EAD or TEI input; ignoring\n')
            self.args.output_file = None

        csv_handle = None
        try:
            if self.args.csv_file:
                try:
                    csv_handle = open(self.args.csv_file, 'wb')
                except (IOError, OSError) as err:
                    sys.stderr.write('CSV output file could not be created: %s\n' % err)
                    sys.exit(-1)
                self.csv_writer = unicodecsv.writer(csv_handle, encoding="utf-8")
                # write the header row once here, because rows may be
                # added by more than one call to populate_csv_from_results
                self.csv_writer.writerow(["Name", "URI", "Similarity Score", "Support Score", "Context"])

            self._run()
        finally:
            # make sure buffered CSV rows reach the disk, even on early exit
            if csv_handle is not None:
                csv_handle.close()

    def _build_parser(self):
        """Return the argparse parser describing the script's options."""
        parser = argparse.ArgumentParser(description='Look up named entities in a file.')
        parser.add_argument('filename', metavar='INPUT_FILE', type=str,
            help='name of the file to be processed')
        parser.add_argument('--input', metavar='INPUT_TYPE', type=str,
            help='type of file to be processed (%(choices)s)',
            action=LowerCaseAction,
            choices=['EAD', 'ead', 'tei', 'TEI', 'text'])
        parser.add_argument('--unique', action='store_true',
            help='only list unique names found anywhere in the content')
        parser.add_argument('--viaf', action='store_true', dest='viaf_lookup', default=False,
            help='look up VIAF identifiers for recognized Person entities')

        parser.add_argument('--output', '-o', dest='output_file',
            help='''Generate an updated XML file with recognized names as the
            specified filename; it is recommended to restrict types to
            Person, Place, and/or Organisation (semi-EXPERIMENTAL)''')
        parser.add_argument('--csv', dest='csv_file',
            help=''' Generate a CSV file with recognized names, uri, similarity score, support score, and context''')

        # tei option (enforced in __init__: required when input is tei)
        tei_opts = parser.add_argument_group('TEI options')
        tei_opts.add_argument('--tei-xpath', type=str, metavar='XPATH',
            help='xpath for TEI section-level content to be processed (use t: for tei namespace)')

        # dbpedia-specific options
        spotlight_opts = parser.add_argument_group('DBpedia Spotlight options')
        spotlight_opts.add_argument('--url', '-u', metavar='URL', type=str,
            default=spotlight.SpotlightClient.default_url, dest='dbpedia_url',
            help='URL for DBpedia Spotlight service (default: %(default)s )')
        spotlight_opts.add_argument('--confidence', '-c', metavar='N', type=float, default=0.4,
            help='minimum confidence score (default: %(default)s)')
        spotlight_opts.add_argument('--support', '-s', metavar='N', type=int, default=20,
            help='minimum support score (default: %(default)s)')
        spotlight_opts.add_argument('--scores', default=False, action='store_true',
            help='Display similarity score and support numbers (ignored if --unique is used)')
        # NOTE! restricting to person/place/org leaves out literary prizes,
        # which are otherwise being recognized; check if these can be
        # tagged/identified in EAD for inclusion
        # - probably do want to exclude dates (don't seem to be recognized
        #   in a useful way...)
        spotlight_opts.add_argument('--types', '-t', metavar='TYPES', type=str, default='',
            help='restrict to specific types of resources, e.g. Person,Place,Organization')
        return parser

    def _run(self):
        """Create the Spotlight client, process the input, report and save."""
        # for now, script only has a single mode: output a list of
        # recognized names and URIs
        self.sc = spotlight.SpotlightClient(
            base_url=self.args.dbpedia_url,
            confidence=self.args.confidence,
            support=self.args.support,
            types=self.args.types)

        # TODO: first check that file exists and is readable! otherwise, messy error...
        xmlobj = None
        if self.args.input == 'text':
            self._process_text()
        else:
            # TEI or EAD
            xmlobj = self.init_xml_object()
            self._process_xml(xmlobj)

        # brief summary of API call activity
        api_calls = self.sc.total_api_calls
        sys.stderr.write('\nMade %d API call%s in %s\n' % (
            api_calls, 's' if api_calls != 1 else '',
            self.sc.total_api_duration))

        # if output file was specified, write out the result
        if self.args.output_file and xmlobj is not None:
            with open(self.args.output_file, 'w') as outfile:
                xmlobj.serializeDocument(stream=outfile, pretty=True)

    def _process_text(self):
        """Look up names in a plain-text input file, read in one piece."""
        # NOTE: would probably need to be read / processed in chunks
        # if we want to handle text files of any significant size
        with open(self.args.filename) as txtfile:
            text = txtfile.read()

        if self.args.unique:
            self.unique_names(text)
        else:
            self.list_names(text)

    def _process_xml(self, xmlobj):
        """Look up names section by section in an EAD or TEI document."""
        if self.args.unique:
            # NOTE: some duplication here of unique_names method,
            # to allow aggregating results from multiple calls
            names = set()
            for label, node_list in self.get_xml_sections(xmlobj):
                for node in node_list:
                    # sections contain lxml nodes; Spotlight needs their text
                    text = node.xpath('normalize-space(.)')
                    if text in self._queried_text:
                        continue
                    names.update((r['surfaceForm'], r['URI'])
                                 for r in self.get_names(text))

            for name, uri in sorted(names):
                self.print_name(name, uri)
            return

        print('Looking up names by section')
        for label, node_list in self.get_xml_sections(xmlobj):
            print("\n%s" % label)
            for node in node_list:
                txt = node.xpath('normalize-space(.)')
                # for EAD only, skip look-ups on exactly repeated text
                # (e.g. "undated correspondence" or "miscellaneous
                # invitations") and print the text so the user can compare
                # original and recognized names
                if self.args.input == 'ead':
                    if txt in self._queried_text:
                        continue
                    print(txt)

                # not printing text or checking for repeated text in TEI
                # mode, since content is likely to be much longer and less
                # repetitive
                self.list_names(txt)

                # NOTE: currently only works for TEI, and only supports
                # simple xml.  If an output file was requested and we have
                # results, attempt to update the lxml node with the
                # identified resources.
                if self.args.output_file and self.last_results \
                        and 'Resources' in self.last_results:
                    util.annotate_xml(node, self.last_results, self.args.input)

    def init_xml_object(self):
        """Load the input file as an EAD or TEI xmlobject.

        The class is chosen from ``self.args.input``.  Exits with status -1
        when the input type is neither ``ead`` nor ``tei`` or when the file
        cannot be loaded.
        """
        if self.args.input == 'ead':
            xmlobj_class = EAD
        elif self.args.input == 'tei':
            xmlobj_class = Tei
        else:
            sys.stderr.write("Unsupported XML input type '%s'\n" % self.args.input)
            sys.exit(-1)

        try:
            return load_xmlobject_from_file(self.args.filename, xmlobj_class)
        except Exception as err:
            sys.stderr.write('Error loading %s as XML: %s\n' % (self.args.filename, err))
            sys.exit(-1)

    def get_names(self, text):
        """Run Spotlight annotation on ``text`` and return its resources.

        Records ``text`` as queried, stores the raw response in
        ``self.last_results`` and, when a CSV file was requested, appends
        the resources to it.  Returns the ``'Resources'`` list of the
        response, or an empty list when there is none.  Exits with status
        -1 on an HTTP error.
        """
        self._queried_text.add(text)
        # never leave the results of an earlier call visible to callers
        self.last_results = None

        try:
            results = self.sc.annotate(text)
        except HTTPError as err:
            # for now, assume any error means the service is unavailable
            # and exit the script (could check status for more
            # fine-grained response)
            sys.stderr.write('Error accessing DBpedia Spotlight -- %s\n' % err)
            sys.exit(-1)

        self.last_results = results
        if not results or 'Resources' not in results:
            return []

        if self.args.csv_file:
            self.populate_csv_from_results(results)
        return results['Resources']

    def populate_csv_from_results(self, results):
        """Write one CSV row per resource in a Spotlight response.

        Each row holds the surface form, URI, similarity score, support
        and the surrounding text (up to the configured number of context
        characters on either side, with whitespace normalized).
        """
        text = results['text']
        pad = self.__context_pad
        for result in results['Resources']:
            # start and end indexes for grabbing context around the entity
            offset = int(result['offset'])
            sub_start = max(0, offset - pad)
            sub_end = min(len(text), offset + len(result['surfaceForm']) + pad)
            # normalizing whitespace for plain-text input
            context = util.normalize_whitespace(text[sub_start:sub_end])

            self.csv_writer.writerow([
                result['surfaceForm'],
                result['URI'],
                result['similarityScore'],
                result['support'],
                context,
            ])

    def list_names(self, text):
        """Annotate ``text`` and print every resource Spotlight identified.

        NOTE: dbpedia annotate result is per offset within the text, so
        the listing may include duplicates - e.g., different "surfaceForm"
        text variants for the same URI, or same exact text and URI.
        """
        resources = self.get_names(text)
        if not resources:
            print('No resources identified')
            return

        for resource in resources:
            self.print_name(resource['surfaceForm'], resource['URI'],
                resource['similarityScore'], resource['support'])
            if self.args.viaf_lookup:
                viafid = util.get_viafid(resource)
                if viafid:
                    print('  %s' % viafid)

    def unique_names(self, text):
        """Annotate ``text`` and print each distinct (name, URI) pair, sorted."""
        names = sorted(set((r['surfaceForm'], r['URI'])
                           for r in self.get_names(text)))
        for name, uri in names:
            self.print_name(name, uri)

    def print_name(self, name, uri, similarity_score=None, support=None):
        """Print a name and its URI, optionally followed by the scores.

        Scores are included when ``--scores`` was requested and both values
        were supplied; a legitimate value of zero still counts as supplied.
        """
        show_scores = (self.args.scores and similarity_score is not None
                       and support is not None)
        if show_scores:
            print('%s  %s (%.2f, %s)' % (name.ljust(40), uri,
                                         float(similarity_score), support))
        else:
            print('%s  %s' % (name.ljust(40), uri))

    def get_xml_sections(self, xmlobj):
        """Return the (label, list of nodes) sections for the input type.

        Dispatches on ``self.args.input`` to the EAD or TEI generator;
        returns an empty list for any other input type so the result can
        always be iterated.
        """
        input_type = self.args.input.lower()
        if input_type == 'ead':
            return self.get_ead_sections(xmlobj)
        if input_type == 'tei':
            return self.get_tei_sections(xmlobj)
        return []

    def get_ead_sections(self, ead):
        """Generate (section label, list of lxml nodes) tuples for an EAD."""
        # biographical statement, when the finding aid has one
        # (same truthiness check as used for scope_content below)
        biography = ead.archdesc.biography_history
        if biography:
            yield (unicode(biography.head),
                [p.node for p in biography.content])
        # note: beware that using unicode on xmlmap elements normalizes
        # whitespace (good for lookup, bad for annotating original xml)

        # return sections for series/subseries
        if ead.dsc.c and ead.dsc.c[0].c:
            for c01 in ead.dsc.c:
                for section in self.get_ead_component_sections(c01):
                    yield section
        # return elements for findingaid with a single container list
        else:
            yield ('Container List',
                [c.did.unittitle.node for c in ead.dsc.c])

    def get_ead_component_sections(self, cseries):
        """Recursively generate sections for c01/c02 series and subseries.

        Yields the scope and content paragraphs when present, then either
        recurses into the subseries or yields the item-level unit titles.
        """
        series_title = unicode(cseries.did.unittitle)
        if cseries.scope_content:
            yield ('%s : %s' % (series_title, unicode(cseries.scope_content.head)),
                [p.node for p in cseries.scope_content.content])
        if cseries.hasSubseries():
            for subseries in cseries.c:
                for section in self.get_ead_component_sections(subseries):
                    yield section
        else:
            yield ('%s: item descriptions' % series_title,
                [c.did.unittitle.node for c in cseries.c])

    def get_tei_sections(self, tei):
        """Generate (heading, [section node]) tuples for a TEI document.

        Sections are selected with the user-specified ``--tei-xpath``; the
        heading is the normalized text of the first ``docTitle`` or
        ``head`` descendant in the TEI namespace.  Exits with status -1
        when the XPath cannot be evaluated.
        """
        tei_ns = {'t': TEI_NAMESPACE}
        try:
            sections = tei.node.xpath(self.args.tei_xpath, namespaces=tei_ns)
        except Exception:
            sys.stderr.write("Error evaluating XPath '%s'\n" % self.args.tei_xpath)
            sys.exit(-1)

        # find the first heading or title; for now, only looking in the TEI
        # namespace (may want to revise to support un-namespaced TEI xml).
        # The expression is the same for every section, so build it once.
        headings = ['docTitle', 'head']
        xpath_headings = '|'.join(['.//t:%s' % tag for tag in headings])
        heading_expr = 'normalize-space((%s)[1])' % xpath_headings

        for section in sections:
            head = section.xpath(heading_expr, namespaces=tei_ns)
            # NOTE: needs to be a list to match ead sections, even though
            # for now there is only one element
            yield (head, [section])


# run the script when executed directly: instantiating LookupNames parses
# the command-line arguments and performs the look-ups
if __name__ == '__main__':
    LookupNames()
