#!/usr/bin/env python

# Created on Mon Nov 03 19:32:33 2014

# Author: XiaoTao Wang
# Organization: HuaZhong Agricultural University

## Required modules
from __future__ import division
import argparse, sys, logging, logging.handlers

def getargs():
    """Build the CALFEA command-line parser and parse ``sys.argv[1:]``.

    When the command line is empty, ``-h`` is substituted so that the help
    text is displayed.  After parsing, option combinations that argparse
    cannot express on its own are checked by hand; on the first violation
    the help text is printed and the process exits with status 1.

    Returns
    -------
    args : argparse.Namespace
        Parsed options.  ``args.Format`` is always upper case ('TXT' or
        'NPZ'), whatever case was typed on the command line.
    commands : list of str
        The raw argument list that was parsed (``sys.argv[1:]``, or
        ``['-h']`` if that was empty).
    """
    ## Construct an ArgumentParser object for command-line arguments
    parser = argparse.ArgumentParser(usage = '%(prog)s <-f filename> <--callTAD | --loadTAD> '
                                     '[options]\n\nCALFEA -- CALculate FEAture for TADs',
                                    description = '''The calculation is implemented by identifying
                                    long-range interactions for each TAD and looking for the
                                    aggregation patterns thereof. For more details, please
                                    refer to online TADLib documentation. Our algorithm is
                                    basically parameter-free, i.e., all you need to input are
                                    the necessary data. As is implied by the usage line, you
                                    should specify a flag --callTAD/--loadTAD to tell calfea
                                    whether to identify TADs itself or just load from an
                                    existing file.''',
                                    formatter_class = argparse.ArgumentDefaultsHelpFormatter)

    # Version
    parser.add_argument('-v', '--version', action = 'version', version = '%(prog)s 0.1.1',
                        help = 'Print version number and exit')

    # Output
    parser.add_argument('-f', '--filename', type = argparse.FileType('w'),
                        default = sys.stdout, help = 'Output file name.')

    ## Mutual Groups: exactly one of the two modes must be chosen
    mg = parser.add_mutually_exclusive_group(required = True)
    mg.add_argument('--loadTAD', action = 'store_true', help = 'Load external TAD data')
    mg.add_argument('--callTAD', action = 'store_true', help = 'Identify TADs from loaded Hi-C data')

    # The sole parameter used in interaction analysis
    parser.add_argument('--offset', type = int, default = 2, help = 'Offset from the main '
                        'diagonal of TAD interaction matrix. All entries with offset less than'
                        ' this value will be ignored during our analysis.')

    # About Logging
    parser.add_argument('--verbose', type = int, default = 2, choices = [0, 1, 2, 3],
                        help = 'Set logging level. 0: only show error messages, 1: also report '
                               'warnings, 2: show process information, 3: debug. '
                               'Note: We log all messages of all levels to a disk file'
                               '(calfea.log), while simultaneously logging process '
                               'information or above to the console.')

    ## Argument Groups
    ## First one, about Hi-C data
    group_1 = parser.add_argument_group(title = 'Relate to Hi-C data:')
    group_1.add_argument('-p', '--path', default = '.',
                         help = 'Path to the folder with Hi-C data. Support both absolute'
                                ' and relative path.')
    # ``type = str.upper`` is applied BEFORE the ``choices`` check, so
    # "-F npz" is accepted and normalized to "NPZ".  Upper-casing after
    # parsing would come too late: argparse would already have rejected
    # the lower-case value.
    group_1.add_argument('-F', '--Format', default = 'TXT', type = str.upper,
                         choices = ['TXT', 'NPZ'],
                         help = 'Format of source data file')
    group_1.add_argument('-R', '--resolution', default = 10000, type = int,
                         help = 'Resolution of the binned data')
    group_1.add_argument('-T', '--template', default = 'chr%s_chr%s.int',
                         help = 'Template for matching file names using regular expression.'
                         ' Needed when "--Format" is set to be "TXT". Note only within-chromosome'
                         ' data are allowed, and don\'t place inter-chromosome data under the '
                         'folder.')
    group_1.add_argument('-C', '--chroms', nargs = '*', default = ['#', 'X'],
                         help = 'Which chromosomes to read. Specially, "#" stands'
                         ' for chromosomes with numerical labels. "--chroms" with zero argument'
                         ' will generate an empty list, in which case all chromosome data will'
                         ' be loaded.')
    group_1.add_argument('-c', '--cols', nargs = 3, type = int,
                         help = 'Which 3 columns to read, with 0 being the first. For example,'
                         ' "--cols 1 3 4" will extract the 2nd, 4th and 5th columns. Only '
                         'required when "--Format=TXT".')
    group_1.add_argument('--immortal', action = 'store_true',
                         help = 'When specified, a Numpy .npz file will be generated under the same '
                         'folder. This operation will greatly speed up data loading process next'
                         ' time.')
    group_1.add_argument('-P', '--prefix', help = 'Prefix of input .npz file name, path not'
                         ' included. Required when "--Format=NPZ".')
    group_1.add_argument('-S', '--saveto', help = 'Prefix of output .npz file name, path '
                         'not included. Required with "--immortal".')

    ## Second one, load TAD data
    group_2 = parser.add_argument_group(title = 'TAD Loading:', description = 'With "--loadTAD" '
                                        'flag, you need to provide TAD data yourself. Your file '
                                        'should contain 3 columns indicating "chromosome name",'
                                        ' "start site" and "end site", respectively.')
    group_2.add_argument('-s', '--source', help = 'Complete source file name. Both absolute '
                     'and relative path are supported.')
    group_2.add_argument('--chromname', help = 'Template of chromosome names')
    ## Third one, calculate TAD
    group_3 = parser.add_argument_group(title = 'TAD Calculating:', description = 'With "--callTAD"'
                                        ' flag, a constrained hierarchical clustering algorithm '
                                        'called CONISS will be used to identify TADs according to'
                                        ' loaded Hi-C data.')
    group_3.add_argument('-O', '--output', help = 'Prefix of the generated TAD file, which is '
                         'created under the folder of Hi-C data.')
    group_3.add_argument('-w', '--window', type = int, default = 2000,
                         help = 'Window size used in TAD identification, with RESOLUTION as '
                         'the unit. For example, "-w 2000" and "-R 10000" generate a 20Mb '
                         'window. Then this window slides along each chromosome, and at each time,'
                         ' only a region of WINDOW size is taken into account and subjected to'
                         ' clustering.')

    ## Parse the command-line arguments; an empty command line shows help
    commands = sys.argv[1:]
    if not commands:
        commands.append('-h')
    args = parser.parse_args(commands)

    ## Conflicts: (violated?, message) pairs, checked in order.  Only the
    ## first violated rule is reported because parser.exit() terminates.
    conflicts = [
        (args.Format == 'TXT' and not args.cols,
         'You have to specify "--cols" with "--Format TXT"!'),
        (args.Format == 'NPZ' and not args.prefix,
         'You have to specify "--prefix" with "--Format NPZ"!'),
        (args.immortal and not args.saveto,
         'You have to specify "--saveto" with "--immortal" flag!'),
        (args.loadTAD and not args.source,
         '"--source" is required with "--loadTAD" flag!'),
        (args.callTAD and not args.output,
         'You have to specify "--output" with "--callTAD" flag!'),
    ]
    for violated, message in conflicts:
        if violated:
            parser.print_help()
            # Trailing newline keeps the shell prompt off the message line
            parser.exit(1, message + '\n')

    return args, commands

## Pipeline
def pipe():
    """The Main pipeline for CALFEA.

    Steps, in order:

    1. Parse the command line with ``getargs``.
    2. Configure the root logger with a console handler and a rotating
       ``calfea.log`` file handler, and log the effective arguments.
    3. Obtain Hi-C data and TADs, either by calling TADs through
       ``tadlib.coniss`` (``--callTAD``) or by loading an external TAD
       file through ``tadlib.analyze`` (``--loadTAD``).
    4. For every TAD on every chromosome, write one tab-separated line
       ``ChromID, Start, End, Density, Gap-Ratio`` to the output stream.
       Density is written as ``-1`` when the gap-free matrix is smaller
       than ``offset + 4``.

    Exits with status 2 when ``--callTAD`` is given but ``tadlib.coniss``
    cannot be imported.
    """
    # Parse Arguments
    args, commands = getargs()
    # Skip all the heavy work for pure help/version requests
    if commands[0] in ('-h', '-v', '--help', '--version'):
        return

    out = args.filename
    # Human-readable name of the output stream ('<stdout>' for sys.stdout)
    out_name = getattr(out, 'name', out)

    ## Root Logger Configuration
    logger = logging.getLogger()
    # Logger Level: verbose 0..3 maps to ERROR(40)..DEBUG(10).  The root
    # level is applied before either handler sees a record.
    logger.setLevel((4 - args.verbose) * 10)
    console = logging.StreamHandler()
    filehandler = logging.handlers.RotatingFileHandler('calfea.log',
                                                       maxBytes = 15000,
                                                       backupCount = 5)
    # Set level for Handlers
    console.setLevel(logging.INFO)
    filehandler.setLevel(logging.DEBUG)
    # Customizing Formatter
    formatter = logging.Formatter(fmt = '%(name)-14s %(levelname)-7s @ %(asctime)s: %(message)s',
                                  datefmt = '%m/%d/%y %H:%M:%S')
    ## Unified Formatter
    console.setFormatter(formatter)
    filehandler.setFormatter(formatter)
    # Add Handlers
    logger.addHandler(console)
    logger.addHandler(filehandler)

    ## Logging for argument setting
    arglist = ['# ARGUMENT LIST:',
               '# output file name = %s' % out_name,
               '# data folder = %s' % args.path,
               '# Hi-C format = %s' % args.Format,
               '# chromosomes = %s' % args.chroms,
               '# data resolution = %s' % args.resolution
               ]
    if args.Format == 'TXT':
        arglist.extend(['# Hi-C file template = %s' % args.template,
                        '# which columns = %s' % args.cols])
    if args.Format == 'NPZ':
        arglist.append('# input NPZ prefix = %s' % args.prefix)
    if args.immortal:
        arglist.append('# output NPZ prefix = %s' % args.saveto)
    if args.callTAD:
        arglist.extend(['# window size = %s' % args.window,
                        '# generated TAD prefix = %s' % args.output])
    if args.loadTAD:
        # Report --chromname here, not the source file a second time
        arglist.extend(['# TAD source file = %s' % args.source,
                        '# chromosome name template = %s' % args.chromname])
    arglist.append('# offset = %s' % args.offset)
    logger.info('\n' + '\n'.join(arglist))

    # Interface for Inters and Queue Object
    dicts = {'path': args.path, 'Format': args.Format, 'resolution': args.resolution,
             'template': args.template, 'chroms': args.chroms, 'cols': args.cols,
             'prefix': args.prefix, 'immortal': args.immortal, 'saveto': args.saveto}

    # Imported here so that help/version requests never pay for it
    from tadlib import analyze
    # Two Cases
    if args.callTAD:
        # Package Dependencies
        try:
            from tadlib import coniss
        except ImportError:
            logger.error('R and rpy2 must be installed with "--callTAD" flag!')
            sys.exit(2)
        # Queue Object for Data Loading and TAD Identification
        logger.info('Read Hi-C data and identify TADs thereof ...')
        workQueue = coniss.Queue(output = args.output, window = args.window,
                                 **dicts)
        logger.info('Done!')
        # Interaction Data
        Idata = workQueue.data
        # Chromosome Labels
        chroms = workQueue.labels
        # TAD Data
        Tdata = workQueue.TAD
    else:
        logger.info('Read Hi-C data ...')
        # Inters Object, Load Hi-C Data
        workInters = analyze.Inters(**dicts)
        logger.info('Done!')
        # TAD Object, Load External TAD File, Columns 0,1,2
        logger.info('Read external TAD data ...')
        workTAD = analyze.TAD(source = args.source)
        logger.info('Done!')
        # Interaction Data
        Idata = workInters.data
        # Chromosome Labels
        chroms = workInters.labels
        # TAD Data
        Tdata = workTAD.data

    try:
        # Header
        H = ['ChromID', 'Start', 'End', 'Density', 'Gap-Ratio']
        out.write('\t'.join(H) + '\n')
        logger.info('Calculate feature for each TAD ...')
        # Loop for Each Chromosome and Each TAD
        for c in chroms:
            logger.info('Chromosome %s', c)
            # Extract Data of One Chromosome
            i_data = Idata[c]
            d_data = Tdata[Tdata['chr'] == c]
            for d in d_data:
                ## Boundary of Current TAD, in units of bins
                left = d['start'] // args.resolution
                right = d['end'] // args.resolution
                # Interaction Matrix
                matrix = analyze.getmatrix(i_data, left, right)
                # Core Object for Interaction Analysis
                workCore = analyze.Core(matrix, start = args.offset)
                # Ratio of Gaps (Vacant Rows or Columns); relies on true
                # division (from __future__ import division)
                gaps = 1 - len(workCore.newM) / len(matrix)
                # Logical Condition: enough rows left after gap removal
                if len(workCore.newM) >= (args.offset + 4):
                    # Extract Long-Range Interactions
                    workCore.longrange()
                    # Natural Clusters
                    workCore.DBSCAN()
                    # Feature (read back from workCore.gden below)
                    workCore.gdensity()
                    density = str(workCore.gden)
                else:
                    # Bad Domain!
                    density = '-1'
                # Line by Line
                wL = [c, str(d['start']), str(d['end']), density, str(gaps)]
                out.write('\t'.join(wL) + '\n')
        logger.info('Done!')
        logger.info('Write results to %s ...', out_name)
        out.flush()
    finally:
        # Release the output file even on failure, but never close the
        # interpreter's own stdout (the default when -f is not given).
        if out is not sys.stdout:
            out.close()
    logger.info('Done!\n')

# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    pipe()
