#!/usr/bin/env python

import argparse
from eulxml.xmlmap.teimap import TEI_NAMESPACE
import logging
from logging import config as logging_config
import os
from requests.exceptions import HTTPError
import signal
import sys
import unicodecsv

from namedropper import spotlight, util, scripts, LOGGING_CONFIG




class LowerCaseAction(argparse.Action):
    '''Argparse action that stores the argument value in lower case.

    Works for single values as well as for options that collect several
    values (``nargs``); a missing optional value (``None``) is stored
    unchanged instead of raising an error.
    '''

    def __call__(self, parser, namespace, value, option_string=None):
        if isinstance(value, (list, tuple)):
            # options declared with nargs deliver a sequence of strings
            value = [item.lower() for item in value]
        elif value is not None:
            value = value.lower()
        setattr(namespace, self.dest, value)


class LookupNames(scripts.ScriptBase):
    'Look up named entities in a file.'

    _queried_text = set()
    _unique_names = set()

    # number of characters on either side of an entity in the
    # text to include in the context of the csv_file
    __context_pad = 100

    total = 0
    # total number of resources identified

    # interrupt flag to exit the main processing loop when a signal is caught
    interrupted = False

    def init_parser(self):
        '''Add the options specific to this script to the parser built by
        the base class.

        The resulting parser is stored on ``self.parser`` (so the help
        text can be printed later) and is also returned.
        '''
        # the base parser already supplies the filename and input type
        argp = super(LookupNames, self).init_parser()

        # flags controlling what is listed and which identifiers are looked up
        argp.add_argument('--unique', action='store_true',
            help='only list unique names found anywhere in the content')
        argp.add_argument('--viaf', dest='viaf_lookup', default=False,
            action='store_true',
            help='look up VIAF identifiers for recognized Person entities')
        argp.add_argument('--geonames', dest='geonames_lookup', default=False,
            action='store_true',
            help='look up GeoNames identifiers for recognized Place entities')

        log_levels = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
        argp.add_argument('--loglevel', dest='loglevel', choices=log_levels,
            default='WARNING',
            help='Set output log level (default: %(default)s)')

        # generated output: annotated xml and/or csv
        argp.add_argument('--output', '-o', dest='output_file',
            help='''Generate an updated XML file with recognized names as the
            specified filename; it is recommended to restrict types to
            Person, Place, and/or Organisation (semi-EXPERIMENTAL)''')
        argp.add_argument('--oxygen-track-changes', dest='track_changes',
            default=False, action='store_true',
            help='Use Oxygen track changes for changes made to generated XML')
        argp.add_argument('--csv', dest='csv_file',
            help=''' Generate a CSV file with recognized names, uri, similarity score, support score, and context''')

        # TEI-specific option
        # (TODO: probably should be required if input is tei)
        tei_group = argp.add_argument_group('TEI options')
        tei_group.add_argument('--tei-xpath', metavar='XPATH', type=str,
            help='xpath for TEI section-level content to be processed (use t: for tei namespace)')

        # options passed on to the DBpedia Spotlight client
        dbpedia_group = argp.add_argument_group('DBpedia Spotlight options')
        dbpedia_group.add_argument('--url', '-u', dest='dbpedia_url',
            metavar='URL', type=str,
            default=spotlight.SpotlightClient.default_url,
            help='URL for DBpedia Spotlight service (default: %(default)s )')
        dbpedia_group.add_argument('--confidence', '-c', metavar='N',
            type=float, default=0.4,
            help='minimum confidence score (default: %(default)s)')
        dbpedia_group.add_argument('--support', '-s', metavar='N',
            type=int, default=20,
            help='minimum support score (default: %(default)s)')
        dbpedia_group.add_argument('--scores', action='store_true',
            default=False,
            help='Display similarity score and support numbers (ignored if --unique is used)')
        # NOTE: types are unrestricted by default; restricting to
        # person/place/org leaves out literary prizes, which are otherwise
        # being recognized (check if these should be tagged/identified in
        # EAD for inclusion); dates probably should be excluded, since they
        # don't seem to be recognized in a useful way
        dbpedia_group.add_argument('--types', '-t', metavar='TYPES',
            type=str, default='',
            help='restrict to specific types of resources, e.g. Person,Place,Organization')

        self.parser = argp
        return argp

    def run(self):
        # bind a handler for interrupt signal
        signal.signal(signal.SIGINT, self.interrupt_handler)
        # input type auto-detection handled in base class
        if self.args.input == 'tei' and not self.args.tei_xpath:
            print 'Error! --tei-xpath is required when input document is TEI\n'
            self.parser.print_help()
            exit(-1)

        self.csv_writer = None
        if self.args.csv_file:
            self.csv_writer = unicodecsv.writer(open(self.args.csv_file, 'wb'),
                                                encoding="utf-8")

            # early exit if we don't have csv writer
            if self.args.csv_file and not self.csv_writer:
                print >> sys.stderr, "CSV output file could not be created"
                exit(-1)

            # write the header row of the CSV file
            # done here because in certain modes it will call
            # populate_csv_from_results more than once
            self.csv_writer.writerow(["Name", "URI", "Similarity Score",
                                      "Support Score", "Type", "Context"])

        spotlight_args = {
            'base_url': self.args.dbpedia_url,
            'confidence': self.args.confidence,
            'support': self.args.support,
            'types': self.args.types
        }

        # configur logging based on --loglevel option
        log_cfg = LOGGING_CONFIG.copy()
        log_cfg['handlers']['console']['level'] = self.args.loglevel
        logging_config.dictConfig(log_cfg)

        # check and warn if no proxy is set
        if 'HTTP_PROXY' not in os.environ:

            # if an output file is requested, this an error because
            # output requires validation which currently requires
            # an http proxy
            if self.args.output_file:
                error = '''
ERROR: A web proxy is currently required in order to generate and validate
an annotated XML output file.  You should configure a proxy via the
HTTP_PROXY environment variable.
'''
                print >> sys.stderr, error
                exit(-1)

            warning = '''
WARNING: It is recommended to use a web proxy to improve performance
and reduce the load on external services.  If possible, you should
configure your proxy via the HTTP_PROXY environment variable.
'''
            print >> sys.stderr, warning

        # for now, script only has a single mode: output a list of recognized names and URIs

        self.sc = spotlight.SpotlightClient(**spotlight_args)

        # TODO: first check that file exists and is readable! otherwise, messy error...

        if self.args.input == 'text':
            # NOTE: would probably need to be read / processed in chunks
            # if we want to handle text files of any significant size
            with open(self.args.filename) as txtfile:
                text = txtfile.read()

            if self.args.unique:
                self.unique_names(text)
            else:
                self.list_names(text)
            # not checking interrupt flag here because not looping
            # and making multiple annotate requests

        # TEI or EAD
        else:

            xmlobj = self.init_xml_object()

            if self.args.unique:
                # NOTE: some duplication here of unique_names method,
                # to allow aggregating results from multiple calls
                names = set()
                for label, text_list in self.get_xml_sections(xmlobj):
                    for txt in text_list:
                        # if interrupt flag is set, bail out
                        if self.interrupted:
                            break

                        if txt in self._queried_text:
                                continue
                        names.update(set([(r['surfaceForm'], r['URI']) for r in self.get_names(txt)]))

                names = sorted(names)
                for name, uri in names:
                    self.print_name(name, uri)

            else:
                print 'Looking up names by section'
                for label, node_list in self.get_xml_sections(xmlobj):
                    print "\n%s" % label
                    for node in node_list:
                        # if interrupt flag is set, bail out
                        if self.interrupted:
                            break

                        txt = node.xpath('normalize-space(.)')
                        # for EAD only, skip look-ups on exactly repeated text
                        # (e.g. "undated correspondence" or "miscellaneous invitations")
                        # and print the text user can compare original and recognized names
                        if self.args.input == 'ead':
                            if txt in self._queried_text:
                                continue
                            print txt

                        # not printing text or checking for repeated text in TEI mode,
                        # since content is likely to be much longer and less repetitive
                        self.list_names(txt)

                        # if output file was requested and we have results, attempt
                        # to update the lxml node with the identified resources.
                        # NOTE: not guaranteed to work for all cases of complicated xml
                        if self.args.output_file and self.last_results \
                                and 'Resources' in self.last_results:

                            # FIXME: ideally, init annotater once on entire document
                            annotater = util.AnnotateXml(
                                self.args.input,
                                track_changes=self.args.track_changes,
                                viaf=self.args.viaf_lookup,
                                geonames=self.args.geonames_lookup,
                                xml_object=xmlobj)
                            annotater.annotate(node, self.last_results)
                            # util.annotate_xml(node, self.last_results, self.args.input,
                            #     self.args.track_changes)

                # if generating an xml file with track changes enabled,
                # add a flag to tell Oxygen to turn on track changes for this document
                if self.args.output_file and self.args.track_changes:
                    util.enable_oxygen_track_changes(xmlobj.node)

        # Brief summary of API call activity
        # NOTE: this only reports spotlight annotation calls
        # TODO: consider reporting viaf api calls, dbpedia data lookups, etc
        print >> sys.stderr, '\nMade %d DBpedia Spotlight API call%s in %s' % \
            (self.sc.total_api_calls,
             's' if self.sc.total_api_calls != 1 else '',
             self.sc.total_api_duration)

        if self.args.csv_file and self.total == 0:
            # close csv writer and remove the created file (headers only)
            del(self.csv_writer)
            os.remove(self.args.csv_file)
            print >> sys.stderr, 'CSV file (%s) not created ' % self.args.csv_file \
                + 'because no resources were identified'

        # if output file was specified, write out the result
        # - don't generate output if exiting via interrupt
        if self.args.output_file and not self.interrupted:
            if self.total != 0:
                with open(self.args.output_file, 'w') as outfile:
                    xmlobj.serializeDocument(stream=outfile, pretty=True)
            else:
                print >> sys.stderr, 'Annotated XML file (%s) ' % self.args.output_file \
                    + 'not created because no resources were identified'

    def get_names(self, text):
        '''Run DBpedia Spotlight annotation on a text string.

        The text is recorded in ``_queried_text`` and the complete response
        is kept in ``self.last_results``.  When a CSV file was requested,
        the identified resources are also written to it.

        :param text: text to be annotated
        :returns: list of resources from the Spotlight response; an empty
            list if the response is empty or contains no resources

        Exits the script with status -1 if the request to the Spotlight
        service fails.
        '''
        # imported here so this method does not rely on a change to the
        # module-level imports; HTTPError is a subclass of RequestException
        from requests.exceptions import RequestException

        self._queried_text.add(text)
        # reset first so results from an earlier call are never mistaken
        # for the results of this one
        self.last_results = None

        try:
            results = self.sc.annotate(text)
        except RequestException as err:
            # for now, assume any error means the service is unavailable
            # and exit the script (could check status for more fine-grained response)
            print 'Error accessing DBpedia Spotlight -- %s' % err
            sys.exit(-1)

        self.last_results = results

        if not results or 'Resources' not in results:
            return []

        if self.args.csv_file:
            self.populate_csv_from_results(results)

        return results['Resources']

    def populate_csv_from_results(self, results):
        '''Add one row to the CSV file for every resource in a Spotlight
        annotation result: name, URI, similarity score, support score,
        type, and the text surrounding the name.
        '''
        pad = self.__context_pad
        # checked in this order; the first matching type wins
        type_checks = (
            ('is_person', 'person'),
            ('is_org', 'organization'),
            ('is_place', 'place'),
        )

        for entity in results['Resources']:
            # boundaries of the text surrounding the entity, clipped to
            # the beginning and end of the annotated text
            offset = int(entity['offset'])
            begin = max(0, offset - pad)
            finish = min(len(results['text']),
                         offset + len(entity['surfaceForm']) + pad)
            # normalize whitespace in the excerpt (for plain-text input)
            snippet = util.normalize_whitespace(results['text'][begin:finish])

            # let the DBpediaResource class determine the type of entity
            dbres = spotlight.DBpediaResource(entity['URI'],
                                              spotlight_info=entity)
            kind = ''
            for attr, label in type_checks:
                if getattr(dbres, attr):
                    kind = label
                    break

            row = [
                entity['surfaceForm'],
                entity['URI'],
                entity['similarityScore'],
                entity['support'],
                kind,
                snippet,
            ]
            self.csv_writer.writerow(row)

    def list_names(self, text):
        # run spotlight annotation on a text string and print out identified
        # resources
        results = self.get_names(text)
        self.total += len(results)

        if not results:
            print 'No resources identified'
            return
        else:
            for resource in results:
                self.print_name(resource['surfaceForm'], resource['URI'],
                                resource['similarityScore'], resource['support'])
                dbres = spotlight.DBpediaResource(resource['URI'],
                                                  spotlight_info=resource)
                if self.args.viaf_lookup and dbres.is_person:
                    #viafid = util.get_viafid(resource)
                    if dbres.viaf_uri:
                        print '  %s' % dbres.viaf_uri

                if self.args.geonames_lookup and dbres.is_place:
                    if dbres.geonames_uri:
                        print '  %s' % dbres.geonames_uri

        # NOTE: dbpedia annotate result is per offset within the text, so
        # may include duplicates - e.g., different "surfaceForm" text variants
        # for the same URI, or same exact text and URI

    def unique_names(self, text):
        '''Look up the names in a text string and print each unique
        name/URI pair once, in sorted order.

        The number of identified resources is added to ``self.total`` so
        that a CSV file populated during the lookup is not discarded as
        containing no resources.
        '''
        resources = self.get_names(text)
        self.total += len(resources)

        names = sorted(set((r['surfaceForm'], r['URI']) for r in resources))
        for name, uri in names:
            self.print_name(name, uri)

    def print_name(self, name, uri, similarity_score=None, support=None):
        # if args.scores is set and values are availble, include similarity/support
        if self.args.scores and similarity_score and support:
            print '%s  %s (%.2f, %s)' % (name.ljust(40), uri, float(similarity_score), support)
        else:
            print '%s  %s' % (name.ljust(40), uri)

    def get_xml_sections(self, xmlobj):
        '''Return the section headings and content for an xml document,
        using the section method that matches the selected input type
        (EAD or TEI).  Nothing is returned for any other input type.
        '''
        section_methods = {
            'ead': self.get_ead_sections,
            'tei': self.get_tei_sections,
        }
        find_sections = section_methods.get(self.args.input.lower())
        if find_sections is not None:
            return find_sections(xmlobj)

    def get_ead_sections(self, ead):
        '''Generator for the sections of an EAD document; yields tuples of
        section label and list of xml NODES.

        Covers the collection-level biography/history and then either the
        series/subseries or a single container list.  A warning is printed
        to stderr for each of these parts that is missing.
        '''
        # collection-level biographical statement
        if not (ead.archdesc and ead.archdesc.biography_history):
            print >> sys.stderr, '  Warning: collection level biography/history (archdesc/bioghist) not found'
        else:
            bioghist = ead.archdesc.biography_history
            # note: beware that using unicode on xmlmap elements normalizes whitespace
            # (good for lookup, bad for annotating original xml)
            yield (unicode(bioghist.head),
                [para.node for para in bioghist.content])

        if not ead.dsc:
            print >> sys.stderr, '  Warning: Description of Subordinate Components (dsc) not found'
            return

        has_series = ead.dsc.c and ead.dsc.c[0].c
        if not has_series:
            # findingaid with a single container list
            yield ('Container List',
                [item.did.unittitle.node for item in ead.dsc.c])
            return

        # sections for series/subseries
        for series in ead.dsc.c:
            for section in self.get_ead_component_sections(series):
                yield section

    def get_ead_component_sections(self, cseries):
        '''Recursive generator for series/subseries components; yields
        tuples of section label and list of xml nodes.

        Yields the scope and content paragraphs when present, followed by
        the sections of each subseries or, for a component without
        subseries, the unit titles of its items.
        '''
        title = unicode(cseries.did.unittitle)

        scope = cseries.scope_content
        if scope:
            label = '%s : %s' % (title, unicode(scope.head))
            yield (label, [para.node for para in scope.content])

        if not cseries.hasSubseries():
            # lowest level: the item descriptions themselves
            yield ('%s: item descriptions' % title,
                [item.did.unittitle.node for item in cseries.c])
            return

        for child in cseries.c:
            for section in self.get_ead_component_sections(child):
                yield section

    def get_tei_sections(self, tei):
        '''Generator for the sections of a TEI document selected by the
        user-specified xpath; yields tuples of heading text and a list
        containing the section node.

        The heading is the normalized text of the first ``docTitle`` or
        ``head`` element (TEI namespace) found within the section.

        Exits the script with status -1 if the xpath cannot be evaluated.
        '''
        namespaces = {'t': TEI_NAMESPACE}
        try:
            sections = tei.node.xpath(self.args.tei_xpath,
                namespaces=namespaces)
        except Exception:
            # catching Exception rather than everything, so that an
            # interrupt or exit request is not reported as an xpath error
            print "Error evaluating XPath '%s'" % self.args.tei_xpath
            sys.exit(-1)

        # xpath to find the first heading or title; identical for every
        # section, so it is built once
        # for now, only looking in TEI namespace; may want to revise to support
        # un-namespaced TEI xml
        headings = ['docTitle', 'head']
        xpath_headings = '|'.join('.//t:%s' % tag for tag in headings)
        first_heading = 'normalize-space((%s)[1])' % xpath_headings

        for section in sections:
            head = section.xpath(first_heading, namespaces=namespaces)
            # NOTE: yielded as a list of nodes to match ead sections, even
            # though for now there is only one node per section
            yield (head, [section])

    _viafids = {}
    _dbpedia_viaf_lookups = []

    def interrupt_handler(self, signum, frame):
        '''Signal handler for a graceful exit on SIGINT.  Flags the script
        as interrupted so the main loop can stop cleanly and puts the
        default SIGINT handler back in place, which lets a second
        interrupt stop the script immediately.  Other signals are ignored.
        '''
        if signum != signal.SIGINT:
            return

        # a second SIGINT should get the default behavior and quit
        signal.signal(signal.SIGINT, signal.SIG_DFL)
        # main loop checks this flag and quits at a reasonable time
        self.interrupted = True
        # let the user know the script has not stopped yet
        print >> sys.stderr, '\nScript will exit after processing the current request.'
        print >> sys.stderr, '(Ctrl-C / Interrupt again to quit immediately)\n'


# Script entry point: only runs when the file is executed directly.
if __name__ == '__main__':
    # NOTE(review): instantiating the script class is presumably what
    # parses the command line and calls run() (handled by
    # scripts.ScriptBase, which is not shown here) -- confirm
    LookupNames()
