#!/usr/bin/env python


import errno
import os
import sys
from optparse import OptionParser


import pysam
from pysam import Samfile, Fastafile
import pysamstats


if __name__ == '__main__':
    # Command-line entry point: parse the options, validate them, then
    # dispatch to the ``pysamstats.write_<type>`` function matching the
    # requested statistics type, writing the result to stdout.

    # Statistics types that can be computed from the alignments alone.
    stats_types_noref = ('coverage',
                         'coverage_strand',
                         'coverage_ext',
                         'coverage_ext_strand',
                         'tlen',
                         'tlen_strand',
                         'mapq',
                         'mapq_strand',
                         'baseq',
                         'baseq_strand',
                         'mapq_binned',
                         'alignment_binned',
                         'tlen_binned')

    # Statistics types for which this script insists on a --fasta reference.
    stats_types_withref = ('variation',
                           'variation_strand',
                           'baseq_ext',
                           'baseq_ext_strand',
                           'coverage_gc',
                           'coverage_binned',
                           'coverage_ext_binned')

    stats_types = sorted(stats_types_noref + stats_types_withref)

    usage = 'usage: %prog [options] FILE'
    description = "Calculate statistics against genome positions based on " \
                  "sequence alignments from a SAM or BAM file and print them " \
                  "to stdout."
    epilog = """
Pileup-based statistics types (each row has statistics over reads in a pileup column):

    * coverage            - Number of reads aligned to each genome position
                            (total and properly paired).
    * coverage_strand     - As coverage but with forward/reverse strand counts.
    * coverage_ext        - Various additional coverage metrics, including
                            coverage for reads not properly paired (mate 
                            unmapped, mate on other chromosome, ...).
    * coverage_ext_strand - As coverage_ext but with forward/reverse strand counts.
    * coverage_gc         - As coverage but also includes a column for %GC.
    * variation           - Numbers of matches, mismatches, deletions,
                            insertions, etc.
    * variation_strand    - As variation but with forward/reverse strand counts.
    * tlen                - Insert size statistics.
    * tlen_strand         - As tlen but with statistics by forward/reverse strand.
    * mapq                - Mapping quality statistics.
    * mapq_strand         - As mapq but with statistics by forward/reverse strand.
    * baseq               - Base quality statistics.
    * baseq_strand        - As baseq but with statistics by forward/reverse strand.
    * baseq_ext           - Extended base quality statistics, including qualities
                            of bases matching and mismatching reference.
    * baseq_ext_strand    - As baseq_ext but with statistics by forward/reverse strand.

Binned statistics types (each row has statistics over reads aligned starting within a genome window):

    * coverage_binned     - As coverage but binned.
    * coverage_ext_binned - As coverage_ext but binned.
    * mapq_binned         - Similar to mapq but binned.
    * alignment_binned    - Aggregated counts from cigar strings.
    * tlen_binned         - As tlen but binned.

Examples:

    pysamstats --type coverage example.bam > example.coverage.txt
    pysamstats --type coverage --chromosome Pf3D7_v3_01 --start 100000 --end 200000 example.bam > example.coverage.txt

Version: {version} (pysam {pysamversion})

""".format(version=pysamstats.__version__, pysamversion=pysam.__version__)

    class RawEpilogOptionParser(OptionParser):
        """OptionParser that emits its epilog verbatim.

        The stock implementation re-wraps the epilog, which would destroy
        the hand-formatted table above. Overriding the method on a subclass
        keeps the change local instead of patching ``OptionParser`` itself.
        """

        def format_epilog(self, formatter):
            return self.epilog

    parser = RawEpilogOptionParser(usage=usage, description=description,
                                   epilog=epilog)

    parser.add_option('-t', '--type', dest='type', default='coverage',
                      help='Type of statistics to print, one of: %s.' % ', '.join(stats_types))

    parser.add_option('-c', '--chromosome', dest='chromosome', default=None,
                      help='Chromosome name.')

    parser.add_option('-s', '--start', dest='start', type='int', default=None,
                      help='Start position (1-based).')

    parser.add_option('-e', '--end', dest='end', type='int', default=None,
                      help='End position (1-based).')

    parser.add_option('-z', '--zero-based', dest='zero_based',
                      action='store_true', default=False,
                      help='Use zero-based coordinates (default is false, i.e.,'
                           ' use one-based coords).')

    parser.add_option('-u', '--truncate', dest='truncate', action='store_true',
                      default=False,
                      help='Truncate pileup-based stats so no records are '
                           'emitted outside the specified position range.')

    parser.add_option('-d', '--pad', dest='pad', action='store_true',
                      default=False,
                      help='Pad pileup-based stats so a record is emitted for '
                           'every position (default is only covered positions).')

    parser.add_option('-D', '--max-depth', dest='max_depth', type='int',
                      default=8000,
                      help='Maximum read depth permitted in pileup-based '
                           'statistics. The default limit is 8000.')

    parser.add_option('-f', '--fasta', dest='fasta', default=None,
                      help='Reference sequence file, only required for some '
                           'statistics.')

    parser.add_option('-o', '--omit-header', dest='omit_header', default=False,
                      action='store_true', help='Omit header row from output.')

    parser.add_option('-p', '--progress', dest='progress', type='int',
                      metavar='N', default=None,
                      help='Report progress every N rows.')

    parser.add_option('--window-size', dest='window_size', type='int',
                      metavar='N', default=300,
                      help='Size of window for binned statistics (default is '
                           '300).')

    # NOTE(review): the default passed on is None, not 150 as the help text
    # says; presumably the library derives the offset from the window size
    # when it receives None -- confirm against pysamstats.
    parser.add_option('--window-offset', dest='window_offset', type='int',
                      default=None, metavar='N',
                      help='Window offset to use for deciding which genome '
                           'position to report binned statistics against. The '
                           'default is 150, i.e., the middle of 300bp window.')

    options, args = parser.parse_args()

    # parser.error() prints the message to stderr and exits with status 2,
    # so each check below acts as a guard clause.
    if not args:
        parser.error('missing SAM or BAM file operand\n\nTry "pysamstats '
                     '--help" for more information.')

    if len(args) > 1:
        parser.error('too many arguments, expected a single SAM or BAM file '
                     'operand\n\nTry "pysamstats --help" for more '
                     'information.')

    if options.type not in stats_types:
        parser.error('unsupported statistics type: "%s"\nTry one of %s or '
                     '"pysamstats --help" for more information.'
                     % (options.type, stats_types))

    if options.type in stats_types_withref and options.fasta is None:
        parser.error('missing --fasta option\n\nTry "pysamstats --help"'
                     ' for more information.')

    samfile = args[0]
    one_based = not options.zero_based
    write_header = not options.omit_header

    # Every supported statistics type is served by a writer function named
    # ``write_<type>`` in the pysamstats module.
    write_stats = getattr(pysamstats, 'write_' + options.type)

    try:
        write_stats(sys.stdout, samfile,
                    fafile=options.fasta,
                    write_header=write_header,
                    chrom=options.chromosome,
                    start=options.start,
                    end=options.end,
                    one_based=one_based,
                    truncate=options.truncate,
                    pad=options.pad,
                    max_depth=options.max_depth,
                    progress=options.progress,
                    window_size=options.window_size,
                    window_offset=options.window_offset)

    except IOError as e:
        if e.errno != errno.EPIPE:
            raise
        # The reader of our stdout went away (e.g. ``pysamstats ... | head``),
        # which is not an error. Python still flushes stdout at interpreter
        # exit, though, and that flush would fail again on the dead pipe, so
        # point the stdout descriptor at the null device to let the exit
        # proceed silently.
        devnull = os.open(os.devnull, os.O_WRONLY)
        try:
            os.dup2(devnull, sys.stdout.fileno())
        finally:
            os.close(devnull)
