#!/usr/bin/env python

# Created on Tue Dec 16 10:22:41 2014

# Author: XiaoTao Wang
# Organization: HuaZhong Agricultural University

## Required Modules
import os, sys, argparse, logging, logging.handlers, glob

try:
    import numpy as np
except ImportError:
    pass

def getargs():
    """
    Build the command-line interface and parse the arguments in sys.argv.

    A top-level parser carries the version option; every sub-command
    (mapping, merge, filtering, binning, correcting, pileup) inherits the
    options shared by all stages from a common parent parser. "pileup"
    inherits everything from "mapping" and adds the binning options.

    When the program is called without any argument, or with a bare
    sub-command name only, '-h' is appended so that the relevant help message
    is printed instead of an argparse error.

    Returns a tuple (args, commands): the parsed argparse.Namespace and the
    list of command-line tokens that was actually parsed.
    """
    ## Construct an ArgumentParser object for command-line arguments
    parser = argparse.ArgumentParser(description = '''This software is based on hiclib
                                    (https://bitbucket.org/mirnylab/hiclib), a comprehensive
                                    Python package for Hi-C data analysis. Before running this
                                    program, you should: 1.Install all required software or
                                    libraries; 2.Re-organize your directory arrangements; (A
                                    data folder with all genome and sequencing data placed
                                    here, and a separate working directory); 3.Place genome
                                    data under the data folder, each named after the corresponding
                                    genome name. Genome sequences should be stored chromosome
                                    by chromosome in FASTA format. The gap file is also needed,
                                    but if it is not provided, we will generate a dummy one;
                                    4.Construct a metadata file describing your sequencing data
                                    under the working directory. Four columns are required: prefix
                                    of SRA file name, cell line name, biological replicate label,
                                    and restriction enzyme name. An example file is distributed
                                    along with this software, please check it.''',
                                    formatter_class = argparse.ArgumentDefaultsHelpFormatter)

    # Version
    parser.add_argument('-v', '--version', action = 'version', version = '%(prog)s 0.1.8',
                        help = 'Print version number and exit')

    ## One for all
    # add_help is disabled here because every sub-parser using this parent
    # already defines its own -h/--help option
    common = argparse.ArgumentParser(add_help = False)
    common.add_argument('-p', '--dataFolder', default = '.',
                        help = '''Root directory of original data. We recommend placing sequencing
                        and genome data here.''')
    common.add_argument('-g', '--genomeName',
                        help = '''Genome folder name. This folder must be placed under dataFolder.
                        Genome sequences should be stored chromosome by chromosome in FASTA format.
                        If gap file is not contained, we will generate a dummy one.''')
    common.add_argument('-C', '--chroms', nargs = '*', default = ['#', 'X'],
                        help = '''Which chromosomes will be involved. Specially, "#" stands for
                        chromosomes with numerical labels. "--chroms" with zero argument will
                        generate an empty list, in which case all chromosome data will be loaded.''')
    common.add_argument('-T', '--template', default = 'chr%s.fa',
                        help = '''Template of FASTA file names''')
    common.add_argument('-G', '--gapFile', default = 'gap.txt', help = '''Gap file name.''')
    common.add_argument('-m', '--metadata', default = 'datasets.tsv',
                        help = '''Metadata file describing each SRA file. You should place
                        it under current working directory. Four columns are required: prefix
                        of SRA file name, cell line name, biological replicate label, and
                        restriction enzyme name. An example file is distributed along with
                        this software, please check it.''')
    common.add_argument('--logFile', default = 'runHiC.log',
                        help = '''Logging file name.''')

    ## Sub-commands
    subparser = parser.add_subparsers(title = 'sub-commands',
                                      description = '''Read pair mapping, filtering, binning
                                      and iterative correction are contained. You can perform
                                      each stage of the analysis separately, or streamline the
                                      pipeline using "pileup" subcommand.''',
                                      dest = 'subcommand')
    ## Iterative Mapping
    iterM = subparser.add_parser('mapping',
                                 parents = [common],
                                 help = '''Map raw pair-end sequencing data to a supplied
                                 genome. Both SRA and FASTQ format are admissible.''',
                                 description = '''An iterative mapping schema is used. The
                                 minimum length is always 25, then the step will be calculated
                                 automatically based on the sequence length. The bowtie2 mapping
                                 software and a fastq-dump tool from SRA toolkit are required.
                                 At least, you should specify --fastqDir, --genomeName,
                                 --bowtiePath, --dataFolder and --metadata yourself.''',
                                 epilog = '''After this command, a BAM folder containing BAM
                                 files for each side of Hi-C molecules and a HDF5 folder containing
                                 hdf5 (dict-like structure format) files for library of matched
                                 Hi-C reads are created under current working directory.''',
                                 formatter_class = argparse.ArgumentDefaultsHelpFormatter)
    iterM.add_argument('-f', '--fastqDir', help = 'Sequencing data folder. Relative path to dataFolder')
    iterM.add_argument('-F', '--Format', default = 'SRA', choices = ['SRA', 'FASTQ'],
                       help = 'Format of the sequencing data.')
    iterM.add_argument('-b', '--bowtiePath', help = 'Path to bowtie2 executable program file.')
    iterM.add_argument('-t', '--threads', type = int, default = 4, help = 'Number of bowtie2 threads.')
    iterM.add_argument('-i', '--bowtieIndex',
                       help = '''Path to the bowtie2 genome index. Since the index consists of
                       several files with the different suffixes (e.g., hg19.1.bt2, hg19.2.bt2),
                       provide only the common part. For example, if your genome data hg19.fa
                       and corresponding index files are stored in ~/data/hg19, you need to
                       specify --bowtieIndex as this "--bowtieIndex ~/data/hg19/hg19". When not
                       specified, we will generate one under the genome folder.''')
    iterM.add_argument('--cache', default = '/tmp',
                       help = '''Set the cache folder. Absolute path is needed.''')
    iterM.set_defaults(func = mapping)

    ## Merge files from the same experiment
    multiF = subparser.add_parser('merge',
                                  parents = [common],
                                  help = '''Merge files corresponding to the same experiment
                                  together.''',
                                  description = '''This command is useful when you want to merge
                                  several HDF5 files belonging to the same experiment. (Metadata
                                  file is used again)''',
                                  epilog = '''A folder with one or more merged hdf5 files are
                                  generated under current working directory after this command
                                  is called.''',
                                  formatter_class = argparse.ArgumentDefaultsHelpFormatter)
    multiF.add_argument('--HDF5',
                        help = '''Path to the folder with hdf5 files which are generated by
                        mapping command.''')
    multiF.add_argument('-l', '--level', type = int, default = 2, choices = [1, 2],
                        help = '''Set merging level. 1: hdf5 files from the same biological
                        replicate will be merged, 2: hdf5 files from the same cell line will be
                        merged.''')
    multiF.set_defaults(func = merge)

    ## Fragment-level filtering
    removeNoise = subparser.add_parser('filtering',
                                       parents = [common],
                                       help = '''Filtering at the level of aligned read pairs
                                       and restriction fragments.''',
                                       description = '''PCR duplications, self-ligation products,
                                       unligated "dangling end" products, random breaks, too
                                       large and too small fragments, and fragments with high
                                       cis-to-trans ratio are all taken into account.''',
                                       epilog = '''A folder with corresponding filtered hdf5
                                       files are generated under current working directory after
                                       calling this command.''',
                                       formatter_class = argparse.ArgumentDefaultsHelpFormatter)
    removeNoise.add_argument('-u', '--mergedDir',
                             help = '''Path to the merged HDF5 files generated by merge command.
                             If the path points to one file, filtering will only be performed
                             on that file. If the path points to a folder, filtering will be
                             performed on all files contained in that folder.''')
    removeNoise.add_argument('--duplicates', action = 'store_true',
                             help = '''Remove read pairs resulting from PCR amplification.''')
    removeNoise.add_argument('--sameFragments', action = 'store_true',
                             help = '''Remove read pairs which are located in the same restriction
                             fragments. Two cases are included: self-ligation products and unligated
                             "dangling end" products.''')
    removeNoise.add_argument('--startNearRsite', action = 'store_true',
                             help = '''Remove reads that start within 5 bp near a restriction site.''')
    removeNoise.add_argument('--RandomBreaks', action = 'store_true',
                             help = '''Remove "random breaks" in which corresponding fragments
                             did not arise from normal restriction digestion.''')
    removeNoise.add_argument('--extremeFragments', action = 'store_true',
                             help = '''Remove too large and too small fragments.''')
    removeNoise.add_argument('--cistotrans', action = 'store_true',
                             help = '''Remove certain fraction of fragments with the greatest
                             number of reads.''')
    removeNoise.set_defaults(func = filtering)

    ## Binning
    binReads = subparser.add_parser('binning',
                                    parents = [common],
                                    help = '''Bin filtered reads at certain resolution.''',
                                    description = '''For varying resolutions, three modes are
                                    provided, just choose a proper one.''',
                                    epilog = '''After calling this command, a folder with
                                    created HeatMaps (in HDF5 format) is created under current
                                    working directory.''',
                                    formatter_class = argparse.ArgumentDefaultsHelpFormatter)
    binReads.add_argument('-f', '--filteredDir',
                          help = '''Path to the filtered HDF5 files generated by filtering
                          command. If the path points to one file, we will only create a HeatMap
                          for that file. If the path points to a folder, we will construct a
                          HeatMap for each file in that folder.''')
    binReads.add_argument('-M', '--mode', default = 'wholeGenome',
                          choices = ['wholeGenome', 'byChromosome', 'withOverlaps'],
                          help = '''Memory usage: withOverlaps > byChromosome > wholeGenome.
                          Resolution capacity (take human genome for example):
                          withOverlaps (10kb) > byChromosome (40kb) > wholeGenome (200kb).''')
    binReads.add_argument('-R', '--resolution', type = int, default = 200000,
                          help = 'Resolution of a heatmap. Unit: bp')
    binReads.set_defaults(func = binning)

    ## Iterative Correction
    iterC = subparser.add_parser('correcting',
                                 parents = [common],
                                 help = '''Perform iterative corrections on the original HeatMap.''',
                                 description = '''Two modes are provided for different resolutions.
                                 The program will choose a better one for you according to the data
                                 format.''',
                                 epilog = '''After calling this command, a folder with corrected
                                 HeatMaps (in HDF5 format) is created under current working
                                 directory.''',
                                 formatter_class = argparse.ArgumentDefaultsHelpFormatter)
    iterC.add_argument('-H', '--HeatMap',
                       help = '''Path to the HeatMap files generated by binning command. If the
                       path points to one file, we only correct for that HeatMap. If the path
                       points to a folder, we will perform iterative corrections for all HeatMaps
                       in that folder.''')
    iterC.set_defaults(func = correcting)

    ## Pile Up
    # The "mapping" parser is used as parent, so its -h/--help option is
    # inherited; add_help must be False to avoid defining it twice.
    streamline = subparser.add_parser('pileup',
                                      parents = [iterM],
                                      help = '''Perform the entire analysis from sequencing
                                      data to corrected HeatMaps.''',
                                      description = '''A more convenient but less flexible
                                      command for Hi-C data processing.''',
                                      formatter_class = argparse.ArgumentDefaultsHelpFormatter,
                                      add_help = False)
    streamline.add_argument('-M', '--mode', default = 'wholeGenome',
                            choices = ['wholeGenome', 'byChromosome', 'withOverlaps'],
                            help = '''Memory usage: withOverlaps > byChromosome > wholeGenome.
                            Resolution capacity (take human genome for example):
                            withOverlaps (10kb) > byChromosome (40kb) > wholeGenome (200kb).''')
    streamline.add_argument('-R', '--resolution', type = int, default = 200000,
                            help = 'Resolution of a heatmap. Unit: bp')
    # Overrides the "func" default inherited from the mapping parser
    streamline.set_defaults(func = pileup)

    ## Parse the command-line arguments
    commands = sys.argv[1:]
    # No argument at all, or only a sub-command name: show the matching help.
    # The names are taken from the registered sub-parsers so that this test
    # can not drift away from the definitions above.
    bareSubcommand = (len(commands) == 1) and (commands[0] in subparser.choices)
    if (not commands) or bareSubcommand:
        commands.append('-h')
    args = parser.parse_args(commands)

    return args, commands


def run():
    """
    Program entry point.

    Parses the command line, attaches a rotating file handler to the root
    logger, writes the effective settings of the chosen sub-command to the
    log, and finally calls the handler stored in ``args.func`` with
    ``(args, commands)``.
    """
    # Parse Arguments
    args, commands = getargs()
    # Improve the performance if you don't want to run it
    if commands[-1] in ['-h', '-v', '--help', '--version']:
        return

    # Define a special level name; 21 sits just above logging.INFO (20), so
    # messages of this program are kept while INFO records are dropped
    logging.addLevelName(21, 'main')
    ## Root Logger Configuration
    logger = logging.getLogger()
    # Logger Level
    logger.setLevel(21)
    filehandler = logging.handlers.RotatingFileHandler(args.logFile,
                                                       maxBytes = 50000,
                                                       backupCount = 5)
    # Set level for Handlers
    filehandler.setLevel(21)
    # Customizing Formatter
    formatter = logging.Formatter(fmt = '%(name)-20s %(levelname)-7s @ %(asctime)s: %(message)s',
                                  datefmt = '%m/%d/%y %H:%M:%S')
    ## Unified Formatter
    filehandler.setFormatter(formatter)
    # Add Handlers
    logger.addHandler(filehandler)

    # Name of the chosen sub-command as recorded by argparse (dest of the
    # sub-parsers), which does not depend on the position of the token
    subcommand = args.subcommand

    ## Logging for argument setting
    arglist = ['# ARGUMENT LIST:',
               '# Sub-Command Name = %s' % subcommand,
               '# Data Root Directory = %s' % args.dataFolder,
               '# MetaData = %s' % args.metadata,
               '# Genome Name = %s' % args.genomeName,
               '# Chromosomes = %s' % args.chroms,
               '# FASTA template = %s' % args.template,
               '# Gap File = %s' % args.gapFile
               ]
    if subcommand in ('mapping', 'pileup'):
        arglist.extend(['# Sequencing data = %s' % args.fastqDir,
                        '# Sequencing Format = %s' % args.Format,
                        '# Bowtie2 Path = %s' % args.bowtiePath,
                        '# Bowtie2 Threads = %s' % args.threads,
                        ])
        # Check the parsed value rather than the raw tokens, so that "-i",
        # "--bowtieIndex=PATH" and abbreviated spellings are recognized too
        if args.bowtieIndex is not None:
            arglist.append('# Bowtie2 Genome Index = %s' % args.bowtieIndex)
        arglist.append('# Cache Folder = %s' % args.cache)
    if subcommand == 'merge':
        arglist.extend(['# Source Files = %s' % args.HDF5,
                        '# Merging Level = %s' % args.level])
    if subcommand == 'filtering':
        arglist.extend(['# Source Files = %s' % args.mergedDir,
                        '# Remove PCR Duplicates = %s' % args.duplicates,
                        '# Remove Same Fragments = %s' % args.sameFragments,
                        '# Remove Random Breaks = %s' % args.RandomBreaks,
                        '# Remove Extreme Fragments = %s' % args.extremeFragments,
                        '# Remove startNearRsite = %s' % args.startNearRsite,
                        '# Remove cistotrans = %s' % args.cistotrans])
    if subcommand == 'binning':
        arglist.extend(['# Source Files = %s' % args.filteredDir,
                        '# HeatMap Mode = %s' % args.mode,
                        '# HeatMap Resolution = %s' % args.resolution])
    if subcommand == 'correcting':
        arglist.extend(['# Source HeatMap = %s' % args.HeatMap])

    if subcommand == 'pileup':
        # The merging level and the filters are reported as fixed values for
        # "pileup"; its parser offers no option to change them
        arglist.extend(['# Merging Level = 2',
                        '# Remove PCR Duplicates = True',
                        '# Remove Same Fragments = True',
                        '# Remove Random Breaks = True',
                        '# Remove Extreme Fragments = True',
                        '# Remove startNearRsite = True',
                        '# Remove cistotrans = True',
                        '# HeatMap Mode = %s' % args.mode,
                        '# HeatMap Resolution = %s' % args.resolution])

    argtxt = '\n'.join(arglist)
    logging.log(21, '\n' + argtxt)

    # Subcommand
    args.func(args, commands)

def initialize(args):
    """
    Validate the data and genome folders and create the genome object.

    Reads ``args.dataFolder``, ``args.genomeName``, ``args.gapFile`` and
    ``args.chroms``. If the gap file is missing from the genome folder, a
    one-line dummy gap file is written there.

    Any invalid setting is reported through logging and terminates the
    program with exit status 1.

    Returns a tuple (dataLocation, genomeFolder, genome_db): the absolute
    data folder, the genome folder below it, and the mirnylib Genome object
    built from that folder.
    """
    ## Validity of arguments
    dataLocation = os.path.abspath(os.path.expanduser(args.dataFolder))
    if not os.path.exists(dataLocation):
        logging.error('There is no folder named %s on your system!',
                      dataLocation)
        sys.exit(1)
    # --genomeName has no default; report its absence instead of letting
    # os.path.join fail on None with a traceback
    if args.genomeName is None:
        logging.error('Genome name is not specified, please set --genomeName!')
        sys.exit(1)
    genomeFolder = os.path.join(dataLocation, args.genomeName)
    if not os.path.exists(genomeFolder):
        logging.error('%s can not be found at %s', args.genomeName,
                      dataLocation)
        sys.exit(1)

    ## Necessary Modules
    # Imported only after the arguments are known to be valid, but before
    # anything is written to the genome folder
    from mirnylib import genome

    ## Generate a dummy gap file under genome folder if there's no one yet
    gapFile = os.path.join(genomeFolder, args.gapFile)
    if not os.path.exists(gapFile):
        logging.log(21, 'No gap file can be found at %s, generating a dummy one ...',
                    genomeFolder)
        with open(gapFile, 'w') as handle:
            handle.write('0\tNA1000\t0\t0\t0\tN\t0\tcentromere\tno\n')
        logging.log(21, 'Done!')

    # Python Genome Object
    genome_db = genome.Genome(genomeFolder, readChrms = args.chroms)

    return dataLocation, genomeFolder, genome_db
    

def mapping(args, commands):
    """
    Map paired-end reads to the genome and parse the alignments into HDF5.

    Every sample that is found under the sequencing folder and is listed in
    the metadata file is processed: both sides of the read pairs are aligned
    with hiclib's iterative mapping (BAM files under ``bams-<genome>``), and
    the alignments are parsed into ``hdf5-<genome>/<sample>.hdf5``. A
    ``<sample>.lock`` file marks a sample as being worked on; samples that
    are locked or already have an hdf5 file are skipped.

    ``args`` is the parsed namespace of the "mapping" (or "pileup")
    sub-command. ``args.HDF5`` is set to the hdf5 output folder so that the
    merging step can find it. ``commands`` (the raw command-line tokens) is
    accepted for interface compatibility and is not used.

    Invalid settings are reported through logging and terminate the program
    with exit status 1.
    """
    ## Import necessary modules
    import atexit
    import shutil
    import hiclib.mapping as iterM
    from mirnylib import h5dict

    # Initialization
    dataLocation, genomeFolder, genome_db = initialize(args)

    # A Local Function
    def cleanFile(filename):
        """
        Remove a file if it exists.
        """
        if os.path.exists(filename):
            os.remove(filename)

    # Construct bowtie2 genome index
    def buildIndex(genomeFolder):
        """
        Build bowtie2 index files under the provided genome folder.

        The chromosome FASTA files matching the template are concatenated
        into <genomeName>.fa (unless that file already exists), which is
        then passed to bowtie2-build. Returns the index basename.
        """
        # glob already returns paths that include genomeFolder; sorting
        # makes the concatenation order reproducible
        fastaNames = sorted(glob.glob(os.path.join(genomeFolder,
                                                   args.template % ('*',))))
        wholeGenome = os.path.join(genomeFolder,
                                   '.'.join([args.genomeName, 'fa']))
        # NOTE(review): the shell commands below are built by plain string
        # joining, so paths containing spaces or shell metacharacters break
        if not os.path.exists(wholeGenome):
            os.system('cat ' + ' '.join(fastaNames) + ' > ' + wholeGenome)
        bowtieIndex = os.path.join(genomeFolder, args.genomeName)
        buildCmd = ['bowtie2-build', '--quiet', wholeGenome, bowtieIndex]
        os.system(' '.join(buildCmd))

        return bowtieIndex

    def calculateStep(length, minlen, approxStep=10, maxSteps=4):
        """
        Returns minimum length and step based on the length of sequence and
        proposed minimum length.
        """
        actualDif = length - minlen
        if actualDif < approxStep * 0.6:
            return length, 100

        numIter = int(np.around(actualDif / float(approxStep)))
        numIter = min(max(numIter, 1), maxSteps)
        # Floor division keeps the step an integer on Python 3 as well
        actualStep = actualDif // numIter

        minlen = length - actualStep * numIter

        return minlen, actualStep

    ## Validity of arguments
    # --bowtiePath and --fastqDir have no defaults; report their absence
    # instead of failing on None inside os.path
    if args.bowtiePath is None:
        logging.error('Path to bowtie2 is not specified, please set --bowtiePath!')
        sys.exit(1)
    bowtiePath = os.path.abspath(os.path.expanduser(args.bowtiePath))
    if not os.path.exists(bowtiePath):
        logging.error('Bowtie2 can not be found at %s', bowtiePath)
        sys.exit(1)
    if args.fastqDir is None:
        logging.error('Sequencing data folder is not specified, please set --fastqDir!')
        sys.exit(1)
    fastqDir = os.path.join(dataLocation, args.fastqDir)
    if not os.path.exists(fastqDir):
        logging.error('%s should be placed under %s', args.fastqDir, dataLocation)
        sys.exit(1)
    mFile = args.metadata
    if not os.path.exists(mFile):
        logging.error('Metadata file %s can not be found at current working directory!',
                      mFile)
        sys.exit(1)
    cache = os.path.abspath(os.path.expanduser(args.cache))
    if not os.path.exists(cache):
        logging.warning('%s does not exist on your system, trying to create one',
                        cache)
        os.makedirs(cache)

    ## Construct bowtie2 genome index if there's no one yet
    # Test the parsed value, not the raw tokens: "-i PATH" and
    # "--bowtieIndex=PATH" must be honoured as well
    if args.bowtieIndex is not None:
        bowtieIndex = os.path.abspath(os.path.expanduser(args.bowtieIndex))
    else:
        logging.log(21, 'You haven\'t specify the Bowtie2 Genome Index Files.')
        logging.log(21, 'Try to find them at %s ...', genomeFolder)
        icheck = glob.glob(os.path.join(genomeFolder, '%s*.bt2' % args.genomeName))
        if icheck:
            logging.log(21, 'Index files are found at %s', genomeFolder)
            bowtieIndex = os.path.join(genomeFolder, args.genomeName)
            logging.log(21, 'Set --bowtieIndex to %s', bowtieIndex)
        else:
            logging.log(21, 'Index files can not be found. Generating them under the'
                        ' genome folder ...')
            bowtieIndex = buildIndex(genomeFolder)
            logging.log(21, 'Done!')

    logging.log(21, 'Now, extract read pairs from %s files and map them to %s',
                args.Format, args.genomeName)
    ## Sequencing Data Format
    Format = args.Format.lower()
    # Full paths of all files carrying the ".sra" or ".fastq" extension
    sraNames = glob.glob(os.path.join(fastqDir, '%s.%s' % ('*', Format)))
    ## Sequencing Length
    # For each sample, the byte count (newline included) of the first
    # sequence line is written to <fastqDir>/lengths/<sample>
    lengths = os.path.join(fastqDir, 'lengths')
    if not os.path.exists(lengths):
        os.mkdir(lengths)
    if Format == 'sra':
        # Strip ".sra"
        Set = set(os.path.basename(i)[:-4] for i in sraNames)
        for i in sraNames:
            calLength = ['fastq-dump', '-Z', i, '|', 'head', '-n', '2',
                         '|', 'tail', '-n', '1', '|', 'wc', '-c', '>',
                         os.path.join(lengths, os.path.basename(i)[:-4])]
            os.system(' '.join(calLength))
    else:
        # Strip "_1.fastq" / "_2.fastq"
        Set = set(os.path.basename(i)[:-8] for i in sraNames)
        for i in Set:
            leftSide = os.path.join(fastqDir, i + '_1.fastq')
            # "tail" must be piped into "wc"; without the pipe the length
            # file stays empty
            calLength = ['head', '-n', '2', leftSide, '|', 'tail', '-n', '1',
                         '|', 'wc', '-c', '>', os.path.join(lengths, i)]
            os.system(' '.join(calLength))

    ## Output Folders
    bamFolder = 'bams-%s' % args.genomeName
    hdf5F = 'hdf5-%s' % args.genomeName
    args.HDF5 = hdf5F # To communicate with next processing step (merge)
    if not os.path.exists(bamFolder):
        os.mkdir(bamFolder)
    if not os.path.exists(hdf5F):
        os.mkdir(hdf5F)

    logging.log(21, 'Bowtie2 alignment results will be saved in bam format under %s',
                bamFolder)
    logging.log(21, 'Bam files will be parsed into hdf5 format under %s', hdf5F)

    # Read Metadata
    with open(mFile) as source:
        metadata = [l.rstrip().split() for l in source]
    # Sample prefix (first column) -> restriction enzyme (last column);
    # blank lines are ignored
    database = dict((i[0], i[-1]) for i in metadata if i)
    for i in sorted(Set):
        # Only samples described in the metadata file are processed
        if i not in database:
            continue

        logging.log(21, 'Current %s file: %s', args.Format, i)

        finalFile = os.path.join(hdf5F, '%s.hdf5' % i)
        lockFile = os.path.join(hdf5F, '%s.lock' % i)

        if os.path.exists(finalFile) and not os.path.exists(lockFile):
            logging.log(21, '%s already exists, skipping', finalFile)
            continue

        if os.path.exists(lockFile):
            logging.log(21, 'Someone is working on %s, skipping', finalFile)
            continue

        # Parameters used in iterative mapping
        lengthFile = os.path.join(lengths, i)
        try:
            with open(lengthFile) as source:
                charCount = int(source.readline())
        except (IOError, ValueError):
            logging.error('Sequence length of %s can not be read from %s',
                          i, lengthFile)
            sys.exit(1)
        # "wc -c" counts the trailing newline too. The SRA dump holds both
        # sides of a pair on one line, a FASTQ file only one side.
        if Format == 'sra':
            length = (charCount - 1) // 2
        else:
            length = charCount - 1
        if length <= 0:
            logging.error('Invalid sequence length (%s) extracted for %s',
                          length, i)
            sys.exit(1)
        logging.log(21, 'Extract sequence length ... %s', length)
        logging.log(21, 'Determining parameters for iterative mapping ...')
        minlen, step = calculateStep(length, 25)
        logging.log(21, 'minlen = %s, step = %s', minlen, step)

        logging.log(21, 'Create %s to ensure process safety ...', lockFile)
        lock = open(lockFile, 'w')
        lock.close()
        logging.log(21, '%s will be removed if the program terminates normally.', lockFile)

        # Make sure the lock disappears even if the program exits early
        atexit.register(cleanFile, lockFile)

        # Remove leftovers of a previous run of this sample. The pattern is
        # restricted to the two BAM basenames written below, so that files
        # of other samples sharing the same name prefix are left alone.
        for stale in glob.glob(os.path.join(bamFolder, '%s_[12].bam*' % i)):
            if os.path.isdir(stale):
                shutil.rmtree(stale, ignore_errors = True)
            else:
                cleanFile(stale)

        ## Iterative Mapping
        # Common Parameters
        Parameters = {'bowtie_path': bowtiePath, 'bowtie_index_path': bowtieIndex,
                      'min_seq_len': minlen, 'len_step': step, 'nthreads': args.threads,
                      'temp_dir': cache, 'bowtie_flags': '--very-sensitive'}
        firstBam = '%s/%s_1.bam' % (bamFolder, i)
        secondBam = '%s/%s_2.bam' % (bamFolder, i)
        if Format == 'sra':
            sourceFile = os.path.join(fastqDir, i + '.sra')
            # The First Side
            logging.log(21, 'Mapping first side of the reads ...')
            iterM.iterative_mapping(fastq_path = sourceFile,
                                    out_sam_path = firstBam,
                                    seq_start = 0,
                                    seq_end = length,
                                    bash_reader = 'fastq-dump -Z',
                                    **Parameters)
            logging.log(21, 'Done!')
            logging.log(21, 'Mapping second side of the reads ...')
            # The Second Side
            iterM.iterative_mapping(fastq_path = sourceFile,
                                    out_sam_path = secondBam,
                                    seq_start = length,
                                    seq_end = 2 * length,
                                    bash_reader = 'fastq-dump -Z',
                                    **Parameters)
            logging.log(21, 'Done!')
        else:
            logging.log(21, 'Mapping first side of the reads ...')
            iterM.iterative_mapping(fastq_path = os.path.join(fastqDir, i + '_1.fastq'),
                                    out_sam_path = firstBam,
                                    **Parameters)
            logging.log(21, 'Done!')
            logging.log(21, 'Mapping second side of the reads ...')
            iterM.iterative_mapping(fastq_path = os.path.join(fastqDir, i + '_2.fastq'),
                                    out_sam_path = secondBam,
                                    **Parameters)
            logging.log(21, 'Done!')

        logging.log(21, 'Parsing mapped sequences ...')
        ## Parse the mapped sequences into a Python data structure
        ## Assign the ultra-sonic fragments to restriction fragments
        lib = h5dict.h5dict(finalFile)
        iterM.parse_sam(sam_basename1 = firstBam,
                        sam_basename2 = secondBam,
                        out_dict = lib,
                        genome_db = genome_db,
                        enzyme_name = database[i],
                        save_seqs = False)
        logging.log(21, 'Done!')

        os.remove(lockFile)

def merge(args, commands):
    """
    Merge mapped hdf5 files following the hierarchy in the metadata file.
    
    Every non-blank line of the metadata file is split on whitespace and its
    first four columns are used as: source file prefix, cell line name,
    biological replicate label and restriction enzyme name.
    
    Level 1 (always): files sharing (cell line, enzyme, replicate) are merged
    into '<cell>-<enzyme>-<replicate>-merged.hdf5'.
    
    Level 2 (only when ``args.level == 2``): the level-1 results sharing
    (cell line, enzyme) are merged into '<cell>-<enzyme>-allReps-merged.hdf5'.
    
    All outputs are written to 'merged-<genomeName>' under the current
    working directory, and ``args.mergedDir`` is set to that folder so the
    filtering step can find them.
    
    Parameters
    ----------
    args : namespace of command-line options
        ``HDF5``, ``metadata``, ``genomeName`` and ``level`` are read here;
        the object is also passed on to ``initialize``.
    commands : not used by this function.
    
    The program exits with status 1 when the source folder or the metadata
    file is missing, or when a metadata line has fewer than four columns.
    
    """
    # Necessary Modules
    from hiclib.fragmentHiC import HiCdataset
    
    ## Validity of arguments
    Sources = os.path.abspath(os.path.expanduser(args.HDF5))
    if not os.path.exists(Sources):
        logging.error('There is no folder named %s on your system!', Sources)
        sys.exit(1)
    mFile = args.metadata
    if not os.path.exists(mFile):
        logging.error('%s can not be found under current working directory!', mFile)
        sys.exit(1)
    
    logging.log(21, 'According to %s, merge hdf5 files under %s', args.metadata, Sources)
    # Output Folder
    mergedFolder = 'merged-%s' % args.genomeName
    if not os.path.exists(mergedFolder):
        os.mkdir(mergedFolder)
    
    logging.log(21, 'Merged files will be saved under %s', mergedFolder)
    
    args.mergedDir = mergedFolder # To communicate with next processing step (filtering)
    ## Now, start merging
    # Read the metadata inside a "with" block so the handle is always closed.
    # Blank lines are skipped and short lines are reported explicitly rather
    # than surfacing later as an unexplained IndexError.
    metadata = []
    with open(mFile) as handle:
        for lineNo, line in enumerate(handle, 1):
            fields = line.split()
            if not fields:
                continue
            if len(fields) < 4:
                logging.error('Line %d of %s must contain at least 4 columns!',
                              lineNo, mFile)
                sys.exit(1)
            metadata.append(fields)
    ## Hierarchical merging structures
    # Keys are ordered (cell line, enzyme, replicate) to match the output
    # file name pattern used below.
    bioReps = set((i[1], i[3], i[2]) for i in metadata)
    cellLines = set((i[1], i[3]) for i in metadata)
    ## The First level, biological replicates
    logging.log(21, 'Merging data from the same biological replicate ...')
    queueL1 = []
    for rep in bioReps:
        filenames = [os.path.join(Sources, '%s.hdf5' % i[0]) for i in metadata
                     if ((i[1], i[3], i[2]) == rep)]
        outfile = os.path.join(mergedFolder, '%s-%s-%s-merged.hdf5' % rep)
        enzyme = rep[1]
        queueL1.append((filenames, outfile, enzyme))
    for filenames, outfile, enzyme in queueL1:
        # Initialize a Genome Object
        dataLocation, genomeFolder, genome_db = initialize(args)
        genome_db.setEnzyme(enzyme)
        ## Parsing individual files, no filtering processes are applied.
        lanePools = []
        for source in filenames:
            parseName = os.path.join(mergedFolder, '%s-parsed.hdf5' % os.path.basename(source).replace('.hdf5', ''))
            parseF = HiCdataset(filename = parseName,
                                genome = genome_db,
                                mode = 'w')
            parseF.parseInputData(source, noFiltering = True)
            lanePools.append(parseName)
        ## Merge files altogether
        fragments = HiCdataset(filename = outfile, genome = genome_db, mode = 'w')
        fragments.merge(lanePools)
        # Clean up parsed individual files
        for delFile in lanePools:
            os.remove(delFile)
    logging.log(21, 'Done!')
    
    if args.level == 2:
        ## The Second level, cell lines, optional
        logging.log(21, 'Merging data of the same cell line using the same restriction enzyme ...')
        
        queueL2 = []
        for cell in cellLines:
            filenames = [os.path.join(mergedFolder, '%s-%s-%s-merged.hdf5' % i) for i in bioReps
                         if ((i[0], i[1]) == cell)]
            outfile = os.path.join(mergedFolder, '%s-%s-allReps-merged.hdf5' % cell)
            enzyme = cell[-1]
            queueL2.append((filenames, outfile, enzyme))
        
        for filenames, outfile, enzyme in queueL2:
            # Initialize a Genome Object
            dataLocation, genomeFolder, genome_db = initialize(args)
            genome_db.setEnzyme(enzyme)
            fragments = HiCdataset(filename = outfile, genome = genome_db, mode = 'w')
            fragments.merge(filenames)
        # Report completion of this level only when it actually ran, so that
        # 'Done!' is not logged twice for a level-1-only merge.
        logging.log(21, 'Done!')

def filtering(args, commands):
    """
    Apply the filtering procedures to merged hdf5 files.
    
    ``args.mergedDir`` may point either to a folder, in which case every
    '*-merged.hdf5' file inside it is processed, or to a single file with
    that suffix. Each result is written to 'filtered-<genomeName>' under the
    current working directory, with 'merged' replaced by 'filtered' in the
    file name. ``args.filteredDir`` is set to that folder for the binning
    step.
    
    ``commands`` is not used by this function.
    
    The program exits with status 1 when the source does not exist, when a
    folder holds no matching file, or when a single file has the wrong suffix.
    
    """
    # Necessary Modules
    from runHiC.utilities import cHiCdataset
    
    def filterOne(mergedPath, options):
        """Filter one merged file and store the result in ``outDir``."""
        baseName = os.path.basename(mergedPath)
        # The output keeps the source name, with 'merged' turned into 'filtered'
        target = os.path.join(outDir, baseName.replace('merged', 'filtered'))
        # The enzyme name is the second '-'-separated field of the file name
        enzymeName = baseName.split('-')[1]
        # Initialize a Genome Object
        dataLocation, genomeFolder, genome_db = initialize(options)
        genome_db.setEnzyme(enzymeName)
        ## Create a cHiCdataset object
        dataset = cHiCdataset(target, genome = genome_db, mode = 'w')
        ## Self-Circles, Dangling-Ends, and Random-Breaks may be filtered
        dataset.parseInputData(mergedPath, commandArgs = options)
        ## Additional Filtering, each step guarded by its own option.
        ## Options are looked up lazily, one at a time, in this fixed order.
        optionalSteps = (
            ('duplicates', lambda: dataset.filterDuplicates()),
            ('startNearRsite', lambda: dataset.filterRsiteStart(offset=5)),
            ('extremeFragments', lambda: dataset.filterLarge()),
            ('cistotrans', lambda: dataset.filterExtreme(cutH=0.005, cutL=0)),
        )
        for optionName, runStep in optionalSteps:
            if getattr(options, optionName):
                runStep()
    
    ## Validity of arguments
    mergedSrc = os.path.abspath(os.path.expanduser(args.mergedDir))
    if not os.path.exists(mergedSrc):
        logging.error('%s does not exists on your system!', mergedSrc)
        sys.exit(1)
    
    # Output Folder
    outDir = 'filtered-%s' % args.genomeName
    if not os.path.exists(outDir):
        os.mkdir(outDir)
    # To communicate with next processing step (binning)
    args.filteredDir = outDir
    
    logging.log(21, 'Filtered files will be saved under %s', outDir)
    
    ## Case 1: a single file
    if not os.path.isdir(mergedSrc):
        if not mergedSrc.endswith('-merged.hdf5'):
            logging.error('Invalid file name: suffix "-merged.hdf5" can not be found!')
            sys.exit(1)
        logging.log(21, 'Perform filtering process on %s ...', mergedSrc)
        filterOne(mergedSrc, args)
        return
    
    ## Case 2: a directory
    logging.log(21, 'Perform filtering process on all merged hdf5 files under %s ...', mergedSrc)
    pattern = os.path.join(mergedSrc, '*-merged.hdf5')
    # glob already yields paths under the absolute mergedSrc, so joining them
    # with mergedSrc again returns them unchanged.
    candidates = [os.path.join(mergedSrc, hit) for hit in glob.glob(pattern)]
    if not candidates:
        logging.error('No proper files can be found at %s!', mergedSrc)
        sys.exit(1)
    for mergedFile in candidates:
        logging.log(21, 'Current file: %s', mergedFile)
        filterOne(mergedFile, args)

def binning(args, commands):
    """
    Generate HeatMaps from filtered hdf5 files.
    
    ``args.filteredDir`` may point either to a folder, in which case every
    '*-filtered.hdf5' file inside it is processed, or to a single file with
    that suffix. HeatMaps are written to 'Heatmaps-<genomeName>' under the
    current working directory; each output name is the source name with
    '.hdf5' replaced by '-<resolution label>.hm' (e.g. '-200K.hm').
    ``args.HeatMap`` is set to the output folder for the correcting step.
    
    Parameters
    ----------
    args : namespace of command-line options
        ``filteredDir``, ``genomeName``, ``resolution`` and ``mode`` are read
        here; the object is also passed on to ``initialize``. ``mode`` selects
        the saving method: 'wholeGenome', 'byChromosome' or 'withOverlaps'.
        Any other value results in no HeatMap being written.
    commands : not used by this function.
    
    The program exits with status 1 when the source does not exist, when a
    folder holds no matching file, or when a single file has the wrong suffix.
    
    """
    # Necessary Modules
    from runHiC.utilities import cHiCdataset
    ## Validity of arguments
    Sources = os.path.abspath(os.path.expanduser(args.filteredDir))
    if not os.path.exists(Sources):
        logging.error('%s does not exists on your system!', Sources)
        sys.exit(1)
    
    # Output Dir
    hFolder = 'Heatmaps-%s' % args.genomeName
    if not os.path.exists(hFolder):
        os.mkdir(hFolder)
    # To communicate with next processing step (correcting)
    args.HeatMap = hFolder
    
    logging.log(21, 'HeatMaps will be saved in hdf5 format under %s', hFolder)
    
    ## Generate HeatMaps
    if os.path.isdir(Sources):
        logging.log(21, 'Generate HeatMaps based on filtered hdf5 files under %s', Sources)
        # glob already returns paths prefixed with Sources
        queue = glob.glob(os.path.join(Sources, '*-filtered.hdf5'))
        if not queue:
            logging.error('No proper files can be found at %s!', Sources)
            sys.exit(1)
    else:
        if not Sources.endswith('-filtered.hdf5'):
            logging.error('Invalid file name: suffix "-filtered.hdf5" can not be found!')
            sys.exit(1)
        logging.log(21, 'HeatMap will be generated using data from %s', Sources)
        queue = [Sources]
    
    # Appropriate Units
    unit, denominator = ('K', 1000) if (args.resolution < 1000000) else ('M', 1000000)
    # Floor division keeps the label an integer ('200K') on every Python
    # version; true division would produce '200.0K' under Python 3.
    nLabel = str(args.resolution // denominator) + unit
    for f in queue:
        logging.log(21, 'Current source file: %s', f)
        hFile = os.path.join(hFolder, os.path.basename(f).replace('.hdf5', '-%s.hm' % nLabel))
        # Parse restriction enzyme name from the file name
        enzyme = os.path.basename(f).split('-')[1]
        # Initialize a Genome Object
        dataLocation, genomeFolder, genome_db = initialize(args)
        genome_db.setEnzyme(enzyme)
        fragments = cHiCdataset(f, genome = genome_db, mode = 'r')
        ## Different Modes (mutually exclusive)
        if args.mode == 'wholeGenome':
            fragments.saveHeatmap(hFile, resolution = args.resolution)
        elif args.mode == 'byChromosome':
            fragments.saveByChromosomeHeatmap(hFile, resolution = args.resolution)
        elif args.mode == 'withOverlaps':
            fragments.saveHiResHeatmapWithOverlaps(hFile, resolution = args.resolution)
    
    logging.log(21, 'Done!')

def correcting(args, commands):
    """
    Perform iterative correction on raw HeatMaps.
    
    ``args.HeatMap`` may point either to a folder, in which case every '*.hm'
    file inside it is processed, or to a single '.hm' file. Corrected
    HeatMaps are written to 'Corrected-<genomeName>' under the current
    working directory, with '.hm' replaced by '_c.hm' in the file name.
    
    A source file containing a 'heatmap' entry is handled by ``Lcore``
    (whole-genome matrix); any other file is handled by ``Hcore``
    (one matrix per key). A ValueError raised by ``Hcore`` is logged as a
    warning and that file is skipped.
    
    Parameters
    ----------
    args : namespace of command-line options
        ``HeatMap`` and ``genomeName`` are read here; the object is also
        passed on to ``initialize``.
    commands : not used by this function.
    
    The program exits with status 1 when the source does not exist, when a
    folder holds no '.hm' file, or when a single file has the wrong suffix.
    
    """
    ## Necessary Modules
    from mirnylib import h5dict
    
    # Initialization
    dataLocation, genomeFolder, genome_db = initialize(args)
    
    ## Two modes
    def Lcore(filename, resolution):
        """Correct a file that stores a single 'heatmap' entry."""
        # Necessary Modules
        from hiclib import binnedData
        # Output file
        cFile = os.path.join(cFolder, os.path.basename(filename).replace('.hm', '_c.hm'))
        # Create a binnedData object, load the data.
        BD = binnedData.binnedData(resolution, genome_db)
        # The dataset name is made of the first three '-'-separated fields
        # of the file name.
        name = '-'.join(os.path.basename(filename).split('-')[:3])
        BD.simpleLoad(filename, name)
        ## Perform ICE
        # Remove the contacts between loci located within the same bin.
        BD.removeDiagonal()
        # Remove bins with less than half of a bin sequenced.
        BD.removeBySequencedCount(0.5)
        # Remove 0.5% bins with the lowest number of records
        BD.removePoorRegions(cutoff = 0.5, coverage = True)
        BD.removePoorRegions(cutoff = 0.5, coverage = False)
        # Truncate top 0.05% of inter-chromosomal counts (possibly, PCR blowouts).
        BD.truncTrans(high = 0.0005)
        # Perform iterative correction.
        BD.iterativeCorrectWithoutSS()
        # Save the iteratively corrected heatmap.
        BD.export(name, cFile)

    def Hcore(filename, resolution):
        """Correct every cis HeatMap of a file that stores one matrix per key."""
        # Necessary Modules
        from mirnylib.numutils import completeIC
        # Output file
        cFile = os.path.join(cFolder, os.path.basename(filename).replace('.hm', '_c.hm'))
        # Create a hdf5 file manually
        mydict = h5dict.h5dict(cFile)
        # Raw HeatMap: open the file named by the parameter rather than
        # relying on the loop variable of the enclosing function.
        raw = h5dict.h5dict(filename, mode = 'r')
        ## Perform ICE for each chromosome
        # A cis key repeats one label (all whitespace-separated tokens are
        # equal). The 'resolution' entry also consists of one distinct token,
        # but it is scalar metadata (the caller reads it with int()), not a
        # matrix, so it must be excluded.
        cisKeys = [key for key in raw.keys()
                   if key != 'resolution' and len(set(key.split())) == 1]
        for key in cisKeys:
            # Extract Cis HeatMap
            rawHeatMap = raw[key]
            # Only work for cis-heatmaps; the returned bias is not needed.
            cHeatMap, _bias = completeIC(rawHeatMap, returnBias=True)
            mydict[key] = cHeatMap
        mydict['resolution'] = resolution
        
    ## Validity of arguments
    Sources = os.path.abspath(os.path.expanduser(args.HeatMap))
    if not os.path.exists(Sources):
        logging.error('%s does not exists on your system!', Sources)
        sys.exit(1)
    
    ## Output Dir
    cFolder = 'Corrected-%s' % args.genomeName
    if not os.path.exists(cFolder):
        os.mkdir(cFolder)
    
    logging.log(21, 'Corrected HeatMaps will be generated under %s', cFolder)
    
    ## Corrections start
    if os.path.isdir(Sources):
        logging.log(21, 'Correct all HeatMaps under %s', Sources)
        # glob already returns paths prefixed with Sources
        queue = glob.glob(os.path.join(Sources, '*.hm'))
        if not queue:
            logging.error('No proper files can be found at %s!', Sources)
            sys.exit(1)
    else:
        if not Sources.endswith('.hm'):
            logging.error('Invalid file name: suffix ".hm" can not be found!')
            sys.exit(1)
        logging.log(21, 'Only %s will be corrected', Sources)
        queue = [Sources]
    
    for f in queue:
        logging.log(21, 'Current source file: %s', f)
        # Raw Data
        raw = h5dict.h5dict(f, mode = 'r')
        Keys = raw.keys()
        resolution = int(raw['resolution'])
        if 'heatmap' in Keys: # Low resolution case
            Lcore(f, resolution)
        else: # High resolution case
            try:
                Hcore(f, resolution)
            except ValueError:
                logging.warning('Iterative correction will remove more than a half of the matrix')
                logging.warning('Skipping ...')
                continue
    
    logging.log(21, 'Done!')
            
def pileup(args, commands):
    """
    Run the complete pipeline in one go: mapping, merging, filtering,
    binning and correcting, in that order.
    
    Before merging, ``args.level`` is forced to 2. Before filtering, every
    filtering switch listed below is turned on. ``commands`` is handed to
    each stage unchanged.
    
    """
    mapping(args, commands)
    
    # Merge at both levels
    setattr(args, 'level', 2)
    merge(args, commands)
    
    # Enable all filtering switches
    for switch in ('duplicates', 'sameFragments', 'startNearRsite',
                   'extremeFragments', 'RandomBreaks', 'cistotrans'):
        setattr(args, switch, True)
    
    # Remaining stages consume the outputs of the previous ones
    for stage in (filtering, binning, correcting):
        stage(args, commands)
    
    

# Script entry point: run() is called only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    run()