#!/usr/bin/env python

import sys
from optparse import OptionParser
import errno


import pysam
from pysam import Samfile, Fastafile
import pysamstats


if __name__ == '__main__':
    # Command-line driver: parse options, validate them, then dispatch to the
    # matching pysamstats.write_<type> function, which prints rows to stdout.

    # All statistics types accepted by --type.
    stats_types = ('coverage',
                   'coverage_strand',
                   'coverage_ext',
                   'coverage_ext_strand',
                   'coverage_gc',
                   'variation',
                   'variation_strand',
                   'tlen',
                   'tlen_strand',
                   'mapq',
                   'mapq_strand',
                   'baseq',
                   'baseq_strand',
                   'baseq_ext',
                   'baseq_ext_strand',
                   'coverage_binned',
                   'coverage_ext_binned',
                   'mapq_binned',
                   'alignment_binned',
                   'tlen_binned')
    # Subset of stats_types for which --fasta is mandatory; for these the
    # reference file is passed to the writer as an extra positional argument.
    stats_types_requiring_fasta = ('variation',
                                   'variation_strand',
                                   'baseq_ext',
                                   'baseq_ext_strand',
                                   'coverage_gc',
                                   'coverage_binned',
                                   'coverage_ext_binned')
    usage = 'usage: %prog [options] FILE'
    description = "Calculate statistics against genome positions based on sequence alignments from a SAM or BAM file and print them to stdout."
    epilog = """
Pileup-based statistics types (each row has statistics over reads in a pileup column):

    * coverage            - Number of reads aligned to each genome position
                            (total and properly paired).
    * coverage_strand     - As coverage but with forward/reverse strand counts.
    * coverage_ext        - Various additional coverage metrics, including
                            coverage for reads not properly paired (mate 
                            unmapped, mate on other chromosome, ...).
    * coverage_ext_strand - As coverage_ext but with forward/reverse strand counts.
    * coverage_gc         - As coverage but also includes a column for %GC.
    * variation           - Numbers of matches, mismatches, deletions,
                            insertions, etc.
    * variation_strand    - As variation but with forward/reverse strand counts.
    * tlen                - Insert size statistics.
    * tlen_strand         - As tlen but with statistics by forward/reverse strand.
    * mapq                - Mapping quality statistics.
    * mapq_strand         - As mapq but with statistics by forward/reverse strand.
    * baseq               - Base quality statistics.
    * baseq_strand        - As baseq but with statistics by forward/reverse strand.
    * baseq_ext           - Extended base quality statistics, including qualities
                            of bases matching and mismatching reference.
    * baseq_ext_strand    - As baseq_ext but with statistics by forward/reverse strand.

Binned statistics types (each row has statistics over reads aligned starting within a genome window):

    * coverage_binned     - As coverage but binned.
    * coverage_ext_binned - As coverage_ext but binned.
    * mapq_binned         - Similar to mapq but binned.
    * alignment_binned    - Aggregated counts from cigar strings.
    * tlen_binned         - As tlen but binned.

Examples:

    pysamstats --type coverage example.bam > example.coverage.txt
    pysamstats --type coverage --chromosome Pf3D7_v3_01 --start 100000 --end 200000 example.bam > example.coverage.txt

Version: {version} (pysam {pysamversion})

""".format(version=pysamstats.__version__, pysamversion=pysam.__version__)

    class _RawEpilogOptionParser(OptionParser):
        """OptionParser that prints the epilog verbatim.

        The stock implementation re-wraps the epilog, which would destroy the
        hand-formatted table above. Subclassing keeps the override local
        instead of patching the stdlib class for the whole process.
        """

        def format_epilog(self, formatter):
            return self.epilog

    parser = _RawEpilogOptionParser(usage=usage, description=description,
                                    epilog=epilog)
    parser.add_option('-t', '--type', dest='type', default='coverage',
                      help='Type of statistics to print, one of: %s.' % ', '.join(stats_types))
    parser.add_option('-c', '--chromosome', dest='chromosome', default=None,
                      help='Chromosome name.')
    parser.add_option('-s', '--start', dest='start', type='int', default=None,
                      help='Start position (1-based).')
    parser.add_option('-e', '--end', dest='end', type='int', default=None,
                      help='End position (1-based).')
    parser.add_option('-z', '--zero-based', dest='zero_based',
                      action='store_true', default=False,
                      help='Use zero-based coordinates (default is false, i.e., use one-based coords).')
    parser.add_option('-u', '--truncate', dest='truncate',
                      action='store_true', default=False,
                      help='Truncate pileup-based stats so no records are emitted outside the specified position range.')
    parser.add_option('-d', '--pad', dest='pad',
                      action='store_true', default=False,
                      help='Pad pileup-based stats so a record is emitted for every position (default is only covered positions).')
    parser.add_option('-D', '--max-depth', dest='max_depth', type='int',
                      default=8000,
                      help='Maximum read depth permitted in pileup-based statistics. The default limit is 8000.')
    parser.add_option('-f', '--fasta', dest='fasta', default=None,
                      help='Reference sequence file, only required for some statistics.')
    parser.add_option('-o', '--omit-header', dest='omit_header',
                      action='store_true',
                      help='Omit header row from output.')
    parser.add_option('-p', '--progress', dest='progress', type='int',
                      metavar='N', default=None,
                      help='Report progress every N rows.')
    parser.add_option('--window-size', dest='window_size', type='int',
                      metavar='N', default=300,
                      help='Size of window for binned statistics (default is 300).')
    parser.add_option('--window-offset', dest='window_offset', type='int',
                      metavar='N', default=None,
                      help='Window offset to use for deciding which genome position to report binned statistics against. The default is 150, i.e., the middle of 300bp window.')
    options, args = parser.parse_args()

    # Validate everything that can be checked from the command line alone
    # BEFORE opening any file, so usage errors never leave handles open.
    # parser.error() prints the message and exits the process.
    if len(args) != 1:
        parser.error('missing file operand\n\nTry "pysamstats --help" for more information.')

    if options.type not in stats_types:
        parser.error('unsupported statistics type: "%s"\nTry one of %s or "pysamstats --help" for more information.' % (options.type, stats_types))

    needs_fasta = options.type in stats_types_requiring_fasta
    if needs_fasta and options.fasta is None:
        parser.error('missing --fasta option\n\nTry "pysamstats --help" for more information.')

    # Every statistics type maps onto a pysamstats function named write_<type>.
    write_stats = getattr(pysamstats, 'write_' + options.type)

    # The same keyword arguments are passed for every statistics type.
    writer_kwargs = dict(
        write_header=not options.omit_header,
        chrom=options.chromosome,
        start=options.start,
        end=options.end,
        one_based=not options.zero_based,
        truncate=options.truncate,
        pad=options.pad,
        max_depth=options.max_depth,
        progress=options.progress,
        window_size=options.window_size,
        window_offset=options.window_offset,
    )

    samfile = Samfile(args[0])
    fafile = None
    try:
        writer_args = [sys.stdout, samfile]
        if needs_fasta:
            fafile = Fastafile(options.fasta)
            writer_args.append(fafile)
        write_stats(*writer_args, **writer_kwargs)

    except IOError as e:
        if e.errno != errno.EPIPE:
            raise
        # Broken pipe (e.g. output piped into `head`) is not an error for us.
        # Point stdout at devnull so the interpreter's flush at exit cannot
        # fail a second time and print a spurious error message.
        import os
        try:
            devnull = os.open(os.devnull, os.O_WRONLY)
            os.dup2(devnull, sys.stdout.fileno())
        except (OSError, ValueError, AttributeError):
            pass  # best effort only; stdout may not be a real file descriptor

    finally:
        # Release the underlying file handles on every exit path.
        if fafile is not None:
            fafile.close()
        samfile.close()
