#!/usr/bin/env python


import sys
from optparse import OptionParser
import errno


import pysam
import pysamstats


if __name__ == '__main__':
    
    stats_types_noref = ('coverage',
                         'coverage_strand',
                         'coverage_ext',
                         'coverage_ext_strand',
                         'tlen',
                         'tlen_strand',
                         'mapq',
                         'mapq_strand',
                         'baseq',
                         'baseq_strand',
                         'mapq_binned',
                         'alignment_binned',
                         'tlen_binned')

    stats_types_withref = ('variation',
                           'variation_strand',
                           'baseq_ext',
                           'baseq_ext_strand',
                           'coverage_gc',
                           'coverage_binned',
                           'coverage_ext_binned')

    stats_types = sorted(stats_types_noref + stats_types_withref)

    usage = 'usage: %prog [options] FILE'
    description = "Calculate statistics against genome positions based on " \
                  "sequence alignments from a SAM or BAM file and print them " \
                  "to stdout."
    epilog = """
Pileup-based statistics types (each row has statistics over reads in a pileup column):

    * coverage            - Number of reads aligned to each genome position
                            (total and properly paired).
    * coverage_strand     - As coverage but with forward/reverse strand counts.
    * coverage_ext        - Various additional coverage metrics, including
                            coverage for reads not properly paired (mate 
                            unmapped, mate on other chromosome, ...).
    * coverage_ext_strand - As coverage_ext but with forward/reverse strand counts.
    * coverage_gc         - As coverage but also includes a column for %GC.
    * variation           - Numbers of matches, mismatches, deletions,
                            insertions, etc.
    * variation_strand    - As variation but with forward/reverse strand counts.
    * tlen                - Insert size statistics.
    * tlen_strand         - As tlen but with statistics by forward/reverse strand.
    * mapq                - Mapping quality statistics.
    * mapq_strand         - As mapq but with statistics by forward/reverse strand.
    * baseq               - Base quality statistics.
    * baseq_strand        - As baseq but with statistics by forward/reverse strand.
    * baseq_ext           - Extended base quality statistics, including qualities
                            of bases matching and mismatching reference.
    * baseq_ext_strand    - As baseq_ext but with statistics by forward/reverse strand.

Binned statistics types (each row has statistics over reads aligned starting within a genome window):

    * coverage_binned     - As coverage but binned.
    * coverage_ext_binned - As coverage_ext but binned.
    * mapq_binned         - Similar to mapq but binned.
    * alignment_binned    - Aggregated counts from cigar strings.
    * tlen_binned         - As tlen but binned.

Examples:

    pysamstats --type coverage example.bam > example.coverage.txt
    pysamstats --type coverage --chromosome Pf3D7_v3_01 --start 100000 --end 200000 example.bam > example.coverage.txt

Version: {version} (pysam {pysamversion})

""".format(version=pysamstats.__version__, pysamversion=pysam.__version__)

    OptionParser.format_epilog = lambda self, formatter: self.epilog
    parser = OptionParser(usage=usage, description=description, epilog=epilog)

    parser.add_option('-t', '--type', dest='type', default='coverage',
                      help='Type of statistics to print, one of: %s.' % ', '.join(stats_types))

    parser.add_option('-c', '--chromosome', dest='chromosome', default=None,
                      help='Chromosome name.')

    parser.add_option('-s', '--start', dest='start', type='int', default=None,
                      help='Start position (1-based).')

    parser.add_option('-e', '--end', dest='end', type='int', default=None,
                      help='End position (1-based).')

    parser.add_option('-z', '--zero-based', dest='zero_based',
                      action='store_true', default=False,
                      help='Use zero-based coordinates (default is false, i.e.,'
                           ' use one-based coords).')

    parser.add_option('-u', '--truncate', dest='truncate', action='store_true',
                      default=False,
                      help='Truncate pileup-based stats so no records are '
                           'emitted outside the specified position range.')

    parser.add_option('-d', '--pad', dest='pad', action='store_true',
                      default=False,
                      help='Pad pileup-based stats so a record is emitted for '
                           'every position (default is only covered positions).')

    parser.add_option('-D', '--max-depth', dest='max_depth', type=int,
                      default=8000,
                      help='Maximum read depth permitted in pileup-based '
                           'statistics. The default limit is 8000.')

    parser.add_option('-f', '--fasta', dest='fasta', default=None,
                      help='Reference sequence file, only required for some '
                           'statistics.')

    parser.add_option('-o', '--omit-header', dest='omit_header', default=False,
                      action='store_true', help='Omit header row from output.')

    parser.add_option('-p', '--progress', dest='progress', type='int',
                      metavar='N', default=None,
                      help='Report progress every N rows.')

    parser.add_option('--window-size', dest='window_size', type='int',
                      metavar='N', default=300,
                      help='Size of window for binned statistics (default is '
                           '300).')

    parser.add_option('--window-offset', dest='window_offset', type=int,
                      default=None, metavar='N',
                      help='Window offset to use for deciding which genome '
                           'position to report binned statistics against. The '
                           'default is 150, i.e., the middle of 300bp window.')

    parser.add_option('--format', dest='format', default='tsv',
                      help='Output format, one of {tsv, csv, hdf5} (defaults '
                           'to tsv). N.B., hdf5 requires PyTables to be '
                           'installed.')

    parser.add_option('--output', dest='output',
                      help='Path to output file. If not provided, write to '
                           'stdout.')

    parser.add_option('--fields', dest='fields', default=None,
                      help='Comma-separated list of fields to output ('
                           'defaults to all fields).')

    parser.add_option('--hdf5-group', dest='hdf5_group', default='/',
                      help='Name of HDF5 group to write to (defaults to the '
                           'root group).')

    parser.add_option('--hdf5-dataset', dest='hdf5_dataset', default='data',
                      help='Name of HDF5 dataset to create (defaults to '
                           '"data").')

    parser.add_option('--hdf5-complib', dest='hdf5_complib', default='zlib',
                      help='HDF5 compression library (defaults to zlib).')

    parser.add_option('--hdf5-complevel', dest='hdf5_complevel', type=int,
                      default=5,
                      help='HDF5 compression level (defaults to 5).')

    parser.add_option('--hdf5-chunksize', dest='hdf5_chunksize', type=int,
                      default=2**17,
                      help='Size of chunks in number of bytes (defaults to '
                           '2**17).')

    options, args = parser.parse_args()
    
    if len(args) != 1:
        parser.error('missing SAM or BAM file operand\n\nTry "pysamstats '
                     '--help" for more information.')
    
    samfile = args[0]
    one_based = not options.zero_based
    write_header = not options.omit_header
    if options.fields:
        fields = options.fields.split(',')
    else:
        fields = None
    
    try:

        if options.type not in stats_types:
            parser.error('unsupported statistics type: "%s"\nTry one of %s or '
                         '"pysamstats --help" for more information.'
                         % (options.type, stats_types))

        elif options.type in stats_types_withref \
                and options.fasta is None:
            parser.error('missing --fasta option\n\nTry "pysamstats --help"'
                         ' for more information.')

        else:

            if options.format.lower() in ['tsv', 'csv']:

                # setup
                dialect = {'tsv': 'excel-tab', 'csv': 'excel'}[options.format]
                if options.output is None:
                    output = sys.stdout
                    needs_closing = False
                else:
                    output = open(options.output, 'wb')
                    needs_closing = True

                try:
                    pysamstats.write_csv(
                        options.type,
                        output,
                        samfile,
                        fafile=options.fasta,
                        write_header=write_header,
                        dialect=dialect,
                        chrom=options.chromosome,
                        start=options.start,
                        end=options.end,
                        one_based=one_based,
                        truncate=options.truncate,
                        pad=options.pad,
                        max_depth=options.max_depth,
                        window_size=options.window_size,
                        window_offset=options.window_offset,
                        progress=options.progress,
                        fields=fields,
                    )
                finally:
                    if needs_closing:
                        output.close()

            elif options.format.lower() == 'hdf5':

                assert options.output is not None, '--output must be specified'

                pysamstats.write_hdf5(
                    options.type,
                    options.output,
                    samfile,
                    fafile=options.fasta,
                    chrom=options.chromosome,
                    start=options.start,
                    end=options.end,
                    one_based=one_based,
                    truncate=options.truncate,
                    pad=options.pad,
                    max_depth=options.max_depth,
                    window_size=options.window_size,
                    window_offset=options.window_offset,
                    progress=options.progress,
                    hdf5_group=options.hdf5_group,
                    hdf5_dataset=options.hdf5_dataset,
                    hdf5_complib=options.hdf5_complib,
                    hdf5_complevel=options.hdf5_complevel,
                    hdf5_chunksize=options.hdf5_chunksize,
                    fields=fields,
                )

    except IOError as e:
        if e.errno == errno.EPIPE:
            pass  # ignore broken pipe
        else:
            raise
