#!/usr/bin/env python3
#
# Maintain a MEDLINE/PubMed repository
#
"""
parse:   Medline XML files into raw table files for DB dumping; ==
insert:  PubMed XML files or a list of PMIDs (contacting EUtils) into the DB
         (slower than using "parse" and a DB dump); ==
update:  existing records or add new records from PubMed XML files or a list
         of PMIDs (slow!); ==
write:   records in various formats for a given list of PMIDs (implicitly sets
         --pmid-lists); ==
delete:  records from the DB for a given list of PMIDs (implicitly sets
         --pmid-lists)
"""
import logging
import os
import sys

from sqlalchemy.exc import OperationalError

__author__ = 'Florian Leitner'
__version__ = '2.0.2'

# Base URLs for cross-referenced databanks, keyed by databank name.
# WriteHTML appends a record's accession to the base URL to build the link;
# for a "#" entry the result is therefore only an in-page fragment link.
# Databank names missing from this table are logged as errors by WriteHTML.
DATABANK_LINK = {
    'GDB': "#",
    'GENBANK': "http://www.ncbi.nlm.nih.gov/nucgss/",
    'OMIM': "http://omim.org/entry/",
    'PDB': "http://www.rcsb.org/pdb/explore/explore.do?structureId=",
    'PIR': "#",
    'RefSeq': "http://www.ncbi.nlm.nih.gov/nuccore/",
    'SWISSPROT': "http://www.uniprot.org/uniprot/",
    'ClinicalTrials.gov': "#",
    'ISRCTN': "#",
    'GEO': "#",
    'PubChem-Substance': "http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?sid=",
    'PubChem-Compound': "http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=",
    'PubChem-BioAssay': "http://pubchem.ncbi.nlm.nih.gov/assay/assay.cgi?aid=",
}

# Section names whose content counts as abstract text in WriteTabular,
# WriteHTML, and WriteMedlineRecord.
ABSTRACT = frozenset({
    'Methods', 'Conclusions', 'Abstract', 'Results', 'Unassigned',
    'Background', 'Objective', 'Unlabelled'
})


def Main(command, files_or_pmids, session, unique=True):
    """
    Run one database command by delegating to the matching ``medic.crud`` function.

    :param command: one of 'insert', 'write', 'update', or 'delete'
    :param files_or_pmids: the list of files or PMIDs to process; for 'write'
                           and 'delete' every item is converted to an integer
    :param session: the DB session
    :param unique: flag to skip versioned records if VersionID != "1"
                   (only passed on for 'insert' and 'update')
    :return: the result of the CRUD function, or ``None`` for any other command
    """
    from medic.crud import insert, select, update, delete

    if command in ('insert', 'update'):
        # these two commands take files or PMIDs as they are
        handler = insert if command == 'insert' else update
        return handler(session, files_or_pmids, unique)

    if command in ('write', 'delete'):
        # these two commands only work on integer PMIDs
        handler = select if command == 'write' else delete
        pmids = [int(pmid) for pmid in files_or_pmids]
        return handler(session, pmids)

    return None


def WriteTabular(query, output_file: str):
    """
    Write one tab-separated "PMID, title, abstract" line per record.

    :param query: iterable of records; each provides ``pmid`` and ``sections``
                  (items with a ``name`` and a ``content``)
    :param output_file: path of the file to write, or "." to print to STDOUT
    """
    to_stdout = output_file == '.'
    # newlines and tabs inside the content would break the three-column layout
    separators = str.maketrans('\n\t', '  ')

    def joined(contents):
        return ' '.join(text.translate(separators) for text in contents)

    logging.debug("writing to file %s", output_file)
    out = sys.stdout if to_stdout else open(output_file, 'wt')

    try:
        for rec in query:
            logging.debug("writing PMID %i as TSV", rec.pmid)
            title = joined(sec.content for sec in rec.sections if sec.name == 'Title')
            abstract = joined(sec.content for sec in rec.sections if sec.name in ABSTRACT)
            print(rec.pmid, title, abstract, sep='\t', file=out)
    finally:
        # never close STDOUT, only files opened here
        if not to_stdout:
            out.close()


def WriteHTML(query, output_file: str):
    """
    Write all records into one HTML document with toggleable metadata lists.

    Every piece of text taken from a record (citation, title, abstract
    sections, author names, MeSH terms, keywords, chemicals, databank
    references, and article identifiers) is HTML-escaped before it is written,
    so characters such as ``<`` or ``&`` cannot break or inject markup.

    :param query: iterable of records to write
    :param output_file: path of the HTML file (written as UTF-8), or "." to
                        print to STDOUT
    """
    from html import escape

    def text(value):
        # str() first: like str.format, this also renders None and numbers
        return escape(str(value))

    def MakeLabel(label):
        if label is None:
            return ""
        return '{}<br/>'.format(text(label.upper()))

    def href(pmid):
        return "http://www.ncbi.nlm.nih.gov/pubmed/{}".format(pmid)

    def button(pmid, target, title):
        return '<button onclick="toggleVisibility({}, \'{}\')">{}</button>'.format(
            pmid, target, title)

    def doi(value):
        return '<a href="http://dx.doi.org/{0}">{0}</a>'.format(text(value))

    def pmc(value):
        return '<a href="http://www.ncbi.nlm.nih.gov/pmc/articles/{0}">{0}</a>'.format(
            text(value))

    def formatMajor(e):
        name = text(e.name)
        return '<b>{}</b>'.format(name) if e.major else name

    logging.debug("writing to file %s", output_file)
    file = open(output_file, 'wt', encoding='utf-8') if output_file != '.' else sys.stdout

    def p(txt):
        print(txt, file=file)

    try:
        # the header is written inside the try block so that the file gets
        # closed even if this first write fails
        p("""<!doctype html>
<html><head>
  <meta charset="UTF-8"/>
  <title>PubMed Articles</title>
  <script>
function toggle(e) {
    if (e.style.display == 'none')
        e.style.display = 'block'
    else
        e.style.display = 'none'
}
function toggleAll(items) {
    for (var i = items.length; i--; i != 0) {
      toggle(items.item(i))
    }
}
function toggleVisibility(pmid, target) {
    var metadata = document.getElementById(pmid).getElementsByTagName('div').item(0)
    var elements = metadata.childNodes;
    for (i = elements.length; i--; i != 0) {
      var e = elements.item(i)
      if (typeof(e.getAttribute) != 'undefined' && e.getAttribute("class") == target) {
        toggle(e)
      }
    }
}
  </script>
</head><body>""")

        for rec in query:
            logging.debug('writing PMID %i as HTML', rec.pmid)
            citation = text('{}, {}'.format(rec.journal, rec.citation()))
            p('<article id={}>'.format(rec.pmid))
            link = None

            # link the citation to the article itself, preferring the DOI
            if 'doi' in rec.identifiers:
                link = '<a href="http://dx.doi.org/{}">'.format(
                    text(rec.identifiers['doi'].value))
            elif 'pmc' in rec.identifiers:
                link = '<a href="http://www.ncbi.nlm.nih.gov/pmc/articles/{}">'.format(
                    text(rec.identifiers['pmc'].value))

            if link:
                p('<p class="citation">{}{}</a></p>'.format(link, citation))
            else:
                p('<p class="citation">{}</p>'.format(citation))

            copyright_notice = None

            for sec in rec.sections:
                if sec.name == 'Title':
                    p('  <h1><a href="{}">{}</a></h1>\n  <ol>'.format(
                        href(rec.pmid), text(sec.content)))

                    for author in rec.authors:
                        p('    <li>{}</li>'.format(text(author.fullName())))

                    p('  </ol>')
                elif sec.name == 'Copyright':
                    # held back so it is printed after all abstract sections
                    copyright_notice = sec.content
                elif sec.name in ABSTRACT:
                    p('  <section title="{}"><p>{}{}</p></section>'.format(
                        text(sec.name), MakeLabel(sec.label), text(sec.content)))
                else:
                    logging.debug('dropping section %s', sec.name)

            if copyright_notice:
                p('  <p class="copyright">{}</p>'.format(text(copyright_notice)))

            p('  <div title="Metadata">')

            # one toggle button per metadata list that is present
            if rec.descriptors:
                p('    {}'.format(button(rec.pmid, "mesh", "MeSH Terms")))
            if rec.keywords:
                p('    {}'.format(button(rec.pmid, "kwds", "Keywords")))
            if rec.chemicals:
                p('    {}'.format(button(rec.pmid, "chem", "Chemicals")))
            if rec.databases:
                p('    {}'.format(button(rec.pmid, "xref", "DB Links")))
            if rec.identifiers:
                p('    {}'.format(button(rec.pmid, "ids", "Article IDs")))

            if rec.descriptors:
                p('    <dl class="mesh">')

                for desc in rec.descriptors:
                    p('      <dt>{}</dt><dd><ol>'.format(formatMajor(desc)))

                    if desc.qualifiers:
                        for qual in desc.qualifiers:
                            p('        <li>{}</li>'.format(formatMajor(qual)))

                    p('      </ol></dd>')
                p('    </dl>')

            if rec.chemicals:
                p('    <ul class="chem">')

                for chem in rec.chemicals:
                    p('      <li>{}{}</li>'.format(
                        text(chem.name),
                        "" if chem.uid is None else " ({})".format(text(chem.uid))))

                p('    </ul>')

            if rec.keywords:
                p('    <dl class="kwds">')
                owner = None

                for kwd in rec.keywords:
                    # start a new group whenever the keyword owner changes
                    if kwd.owner != owner:
                        if owner is not None:
                            p('        </dd>')

                        p('      <dt>{}</dt><dd>'.format(text(kwd.owner)))
                        owner = kwd.owner

                    p('      <li>{}</li>'.format(formatMajor(kwd)))

                p('    </dd></dl>')

            if rec.databases:
                p('    <ul class="xref">')

                for xref in rec.databases:
                    try:
                        base_url = DATABANK_LINK[xref.name]
                    except KeyError:
                        logging.error('unknown DB name: "%s"', xref.name)
                        continue

                    accession = text(xref.accession)
                    p('      <li>{} <a href="{}{}">{}</a></li>'.format(
                        text(xref.name), base_url, accession, accession))

                p('    </ul>')

            if rec.identifiers:
                p('    <ul class="ids">')

                for ns, i in rec.identifiers.items():
                    if ns == 'doi':
                        p('      <li>{}</li>'.format(doi(i.value)))
                    elif ns == 'pmc':
                        p('      <li>{}</li>'.format(pmc(i.value)))
                    else:
                        p('      <li>{}:{}</li>'.format(text(ns), text(i.value)))

                p('    </ul>')

            p('  </div>\n</article><hr/>')
        p("""  <script>
window.onload = function() {
    toggleAll(document.getElementsByTagName("ul"))
    toggleAll(document.getElementsByTagName("dl"))
}
  </script>
</body></html>""")
    finally:
        # never close STDOUT, only files opened here
        if output_file != '.':
            file.close()


def WriteTIAB(query, output_dir: str):
    """
    Write the title and abstract of each record to its own plain-text file.

    The file for a record is named "<PMID>.txt" and holds the title section
    first, followed by the abstract sections in a fixed order.

    :param query: iterable of records to write
    :param output_dir: directory that receives the files
    """
    section_order = (
        'Title', 'Abstract', 'Background', 'Objective', 'Methods', 'Results',
        'Conclusions', 'Unlabelled',
    )
    logging.debug("writing to directory %s", output_dir)

    for rec in query:
        logging.debug("writing PMID %i as TIAB", rec.pmid)
        path = os.path.join(output_dir, "{}.txt".format(rec.pmid))

        with open(path, 'wt') as out:
            for name in section_order:
                WriteSection(out, rec, name)


def WriteSection(file, rec, sec):
    """
    Print every section of a record that carries the given name.

    A section's label, if it has one, is printed on its own line before the
    content; the content is followed by an empty line.

    :param file: writable text stream
    :param rec: record whose ``sections`` are searched
    :param sec: name of the sections to print
    """
    matching = (part for part in rec.sections if part.name == sec)

    for part in matching:
        if part.label is not None:
            print(part.label, file=file)

        print(part.content, end='\n\n', file=file)


def WriteMedline(query, output_dir: str):
    """
    Write each record in MEDLINE format to its own "<PMID>.txt" file.

    :param query: iterable of records to write
    :param output_dir: directory that receives the files
    """
    logging.debug("writing to directory %s", output_dir)

    for rec in query:
        logging.debug("writing PMID %i as MEDLINE", rec.pmid)
        path = os.path.join(output_dir, "{}.txt".format(rec.pmid))

        with open(path, 'wt') as out:
            WriteMedlineRecord(rec, out)


def WriteMedlineRecord(rec, file):
    """
    Print one record to a stream as MEDLINE-style "TAG - value" lines.

    Each value ends up on a single line: backslashes are doubled and newlines
    are written as a literal backslash followed by "n".

    :param rec: the record to write
    :param file: writable text stream
    """
    def emit(tag: str, value):
        # escape backslashes first, so the escaped newlines stay unambiguous
        escaped = str(value).replace('\\', '\\\\').replace('\n', '\\n')
        print("{:<4}- {}".format(tag[:4], escaped), file=file)

    def compact(date):
        # drop the dashes of the date's string form
        return str(date).replace('-', '')

    def spaced(value):
        return ' {}'.format(value) if value else ''

    def starred(item):
        # major topics are marked with a leading asterisk
        return '*{}'.format(item.name) if item.major else item.name

    emit('PMID', rec.pmid)
    emit('STAT', rec.status)
    emit('TA', rec.journal)

    if rec.issue:
        emit('VI', rec.issue)

    if rec.pagination:
        emit('PG', rec.pagination)

    emit('DP', rec.pub_date)
    emit('DA', compact(rec.created))

    if rec.revised:
        emit('LR', compact(rec.revised))

    if rec.completed:
        emit('DCOM', compact(rec.completed))

    # sections that map straight to a tag are written as they are encountered;
    # abstract text is collected and written as one item afterwards
    direct_tags = {'Copyright': 'CI', 'Title': 'TI', 'Vernacular': 'TT'}
    abstract_parts = []
    other_parts = []

    for sec in rec.sections:
        tag = direct_tags.get(sec.name)

        if tag is not None:
            emit(tag, sec.content)
        elif sec.name in ABSTRACT:
            abstract_parts.append(sec.content)
        elif sec.name.endswith('Copyright'):
            emit('OCI', sec.content)
        else:
            other_parts.append(sec.content)

    if abstract_parts:
        emit('AB', ' '.join(abstract_parts))

    for author in rec.authors:
        if author.initials == '':  # corporate authors
            emit('CN', author.name)
            continue

        short_name = '{}{}{}'.format(
            author.name, spaced(author.initials), spaced(author.suffix))
        emit('AU', short_name)

        if author.forename:
            full_name = '{}, {}{}'.format(
                author.name, author.forename, spaced(author.suffix))
            emit('FAU', full_name)

    if other_parts:
        emit('OAB', ' '.join(other_parts))

    for pub_type in rec.publication_types:
        emit('PT', pub_type.value)

    for desc in rec.descriptors:
        terms = [starred(desc)]
        terms.extend(starred(qual) for qual in desc.qualifiers)
        emit('MH', '/'.join(terms))

    current_owner = None

    for kwd in rec.keywords:
        # announce the owner once per run of keywords sharing it
        if kwd.owner != current_owner:
            emit('OTO', kwd.owner)
            current_owner = kwd.owner

        emit('OT', starred(kwd))

    for chem in rec.chemicals:
        substance = '{} ({})'.format(chem.uid, chem.name) if chem.uid else chem.name
        emit('RN', substance)

    for xref in rec.databases:
        emit('SI', '{}/{}'.format(xref.name, xref.accession))

    for ns, uid in rec.identifiers.items():
        if ns == 'pmc':
            emit('PMC', uid.value)
        else:
            emit('AID', "{} [{}]".format(uid.value, ns))


# Command-line entry point: parse the arguments, set up logging, expand PMID
# list files, run the requested command, and exit with status 0 on a truthy
# result (status 1 otherwise).
if __name__ == '__main__':
    from argparse import ArgumentParser
    from medic.orm import InitDb, Session

    epilog = 'system (default) encoding: {}'.format(sys.getdefaultencoding())

    parser = ArgumentParser(
        usage='%(prog)s [options] CMD FILE/PMID ...',
        description=__doc__, epilog=epilog,
        prog=os.path.basename(sys.argv[0])
    )

    parser.set_defaults(loglevel=logging.WARNING)

    # argparse itself rejects any command that is not listed in the choices
    parser.add_argument(
        'command', metavar='CMD', choices=['parse', 'insert', 'write', 'update', 'delete'],
        help='one of {parse,insert,write,update,delete}; see above'
    )
    parser.add_argument(
        'files', metavar='FILE/PMID', nargs='+',
        help='MEDLINE XML file, PMID (integer), or PMID list file'
    )
    parser.add_argument('--version', action='version', version=__version__)
    parser.add_argument(
        '--url', metavar='URL', help='a database URL string [postgresql://localhost/medline]',
        default='postgresql://localhost/medline'
    )
    # "medline" has to be a valid choice, too: argparse does not check the
    # default against the choices, but it does check an explicitly given
    # value; "full" remains accepted and produces MEDLINE output as before
    parser.add_argument(
        '--format', choices=['medline', 'full', 'html', 'tiab', 'tsv'], default='medline',
        help='medline: [default] write all content to one MEDLINE file per PMID ' +
             '("full" is accepted as an alias); ' +
             'tiab: write only title and abstract plain-text to files; ' +
             'html: write all content to one large HTML file; ' +
             'tsv: only write per line PMID-title-abstract to one single .tsv table'
    )
    parser.add_argument(
        '--all', action='store_true',
        help='parse records with VersionID != "1"'
    )
    parser.add_argument(
        '--update', action='store_true',
        help='parsing MEDLINE update files: list all updated PMIDs in delete.sql'
    )
    parser.add_argument(
        '--pmid-lists', action='store_true',
        help='assume input files are lists of PMIDs, not XML files ' +
             '(implicitly set for delete and write operations)'
    )
    parser.add_argument(
        '--output', metavar='DIR', default=os.path.curdir,
        help='dump/write to a specific directory (or file for format "tsv" or "html")'
    )
    parser.add_argument(
        '--error', action='store_const', const=logging.ERROR,
        dest='loglevel', help='error log level only [warn]'
    )
    parser.add_argument(
        '--info', action='store_const', const=logging.INFO,
        dest='loglevel', help='info log level [warn]'
    )
    parser.add_argument(
        '--debug', action='store_const', const=logging.DEBUG,
        dest='loglevel', help='debug log level [warn]'
    )
    parser.add_argument('--logfile', metavar='FILE', help='log to file, not STDERR')

    args = parser.parse_args()
    logging.basicConfig(
        filename=args.logfile, level=args.loglevel,
        format='%(asctime)s %(name)s %(levelname)s: %(message)s'
    )

    if args.command in ('write', 'delete'):
        args.pmid_lists = True

    def ParseListOrYield(file):
        """Yield the PMIDs listed in a file, or the argument itself if it is no file."""
        if os.path.isfile(file):
            with open(file) as lines:
                for line in lines:
                    pmid = line.strip()

                    # blank lines are no PMIDs (and would fail the int() conversion)
                    if pmid:
                        yield pmid
        else:
            yield file

    if args.pmid_lists:
        args.files = [
            pmid for f in args.files for pmid in ParseListOrYield(f)
        ]

    if args.command == 'parse':
        from medic.crud import dump

        result = dump(args.files, args.output, not args.all, args.update)
    else:
        try:
            InitDb(args.url)
        except OperationalError as e:
            parser.error(str(e))

        result = Main(args.command, args.files, Session(), not args.all)

        if args.command == 'write':
            if args.format == 'tsv':
                WriteTabular(result, args.output)
            elif args.format == 'html':
                WriteHTML(result, args.output)
            elif args.format == 'tiab':
                WriteTIAB(result, args.output)
            else:
                WriteMedline(result, args.output)
            result = True

    sys.exit(0 if result else 1)
