#!python

"""Determines and plots correlation between preferences or differential preferences.

Written by Jesse Bloom."""


import sys
import os
import time
import scipy.stats
import dms_tools.utils
import dms_tools.parsearguments
import dms_tools.file_io
import dms_tools.plot



def _per_site_values(sites, data, summarize):
    """Returns list giving *summarize* applied to the values of *data[r]* for each *r* in *sites*."""
    return [summarize(data[r].values()) for r in sites]


def _per_character_values(sites, characters, data1, data2, wts=None):
    """Returns paired lists of per-site, per-character values from *data1* and *data2*.

    The returned tuple is *(xvalues, yvalues)*, where the entries are
    *data1[r][x]* and *data2[r][x]* for each *r* in *sites* and each
    *x* in *characters*, in that nesting order.

    If *wts* is not *None*, then at each site *r* the character equal
    to *wts[r]* is skipped."""
    xvalues = []
    yvalues = []
    for r in sites:
        for x in characters:
            if wts is not None and x == wts[r]:
                continue
            xvalues.append(data1[r][x])
            yvalues.append(data2[r][x])
    return (xvalues, yvalues)


def main():
    """Main body of script.

    Parses the command line arguments, reads the two input files, and
    computes the Pearson correlation between them. Depending on the
    options, the correlated values are the preferences, the site
    entropies, the enrichment ratios, the differential preferences, or
    the RMS differential preferences.

    The correlation is written to ``<outfileprefix>.txt`` and, unless
    the *noplot* option is set, plotted to ``<outfileprefix>.pdf``.
    Existing files with these names are removed first.

    Raises *ValueError* if mutually incompatible options are given, if
    *outfileprefix* names a non-existent directory, or if the two files
    do not both specify preferences or both specify differential
    preferences."""

    # Parse command line arguments
    parser = dms_tools.parsearguments.CorrelateParser()
    args = vars(parser.parse_args())
    prog = parser.prog

    # print initial information
    versionstring = dms_tools.file_io.Versions()
    print('Beginning execution of %s in directory %s\n' % (prog, os.getcwd()))
    print(versionstring)
    print('Parsed the following arguments:\n%s\n' % '\n'.join(['\t%s = %s' % tup for tup in args.items()]))

    # Option conflicts are raised explicitly (not asserted) so that the
    # checks still run when Python is invoked with -O.
    if args['pref_entropy'] and args['enrichment']:
        raise ValueError("The --pref_entropy and --enrichment options are mutually exclusive.")

    # remove output files if they exist
    corrfile = '%s.txt' % args['outfileprefix']
    plotfile = '%s.pdf' % args['outfileprefix']
    outdir = os.path.dirname(corrfile)
    if outdir and not os.path.isdir(outdir):
        raise ValueError('outfileprefix specifies non-existent directory as part of name:\n%s' % args['outfileprefix'])
    print("Correlation will be written to %s" % corrfile)
    if os.path.isfile(corrfile):
        os.remove(corrfile)
        print("Removing existing file %s" % corrfile)
    if not args['noplot']:
        print("Correlation will be plotted to %s" % plotfile)
        if os.path.isfile(plotfile):
            os.remove(plotfile)
            print("Removing existing file %s" % plotfile)

    # a title of 'none' (any case, surrounding whitespace ignored) means no title
    if args['plot_title'] is not None and args['plot_title'].upper().strip() == 'NONE':
        args['plot_title'] = None

    # default plot formatting, may be changed depending on options below
    logx = logy = fixaxes = False

    # read data
    (sites, characters, wts, databyfile) = dms_tools.file_io.ReadMultiPrefOrDiffPref([args['file1'], args['file2']], args['excludestop'])
    (data1, data2) = (databyfile[args['file1']], databyfile[args['file2']])

    # classify each file once rather than re-testing in every branch
    kind1 = dms_tools.utils.Pref_or_DiffPref(data1)
    kind2 = dms_tools.utils.Pref_or_DiffPref(data2)

    if kind1 == kind2 == 'preferences':
        print("%s and %s both specify preferences." % (args['file1'], args['file2']))
        if args['rms_dpi']:
            raise ValueError('rms_dpi argument cannot be used if files specify preferences; it is only valid for differential preferences')
        if args['pref_entropy']:
            print("Computing site entropies from preferences; the correlations will be for these entropies rather than the preferences themselves.")
            xvalues = _per_site_values(sites, data1, dms_tools.utils.SiteEntropy)
            yvalues = _per_site_values(sites, data2, dms_tools.utils.SiteEntropy)
        elif args['enrichment']:
            print("The correlations will be for enrichment ratios calculated from the preferences.")
            data1 = dms_tools.utils.PrefsToEnrichments(wts, data1, include_wt=False)
            data2 = dms_tools.utils.PrefsToEnrichments(wts, data2, include_wt=False)
            # the wildtype character is skipped at each site
            (xvalues, yvalues) = _per_character_values(sites, characters, data1, data2, wts=wts)
            logx = logy = True
        else:
            print("The correlations will be for the preferences.")
            (xvalues, yvalues) = _per_character_values(sites, characters, data1, data2)
            fixaxes = True
    elif kind1 == kind2 == 'diffprefs':
        print("%s and %s both specify differential preferences." % (args['file1'], args['file2']))
        if args['pref_entropy']:
            raise ValueError('pref_entropy argument cannot be used if files specify differential preferences; it is only valid for preferences')
        if args['enrichment']:
            raise ValueError('enrichment option cannot be used if files specify differential preferences; it is only valid for preferences')
        if args['rms_dpi']:
            print("Computing RMS differential preferences; the correlations will be for these RMS values rather than the differential preferences themselves.")
            xvalues = _per_site_values(sites, data1, dms_tools.utils.RMS)
            yvalues = _per_site_values(sites, data2, dms_tools.utils.RMS)
        else:
            print("The correlations will be for the differential preferences.")
            (xvalues, yvalues) = _per_character_values(sites, characters, data1, data2)
    else:
        raise ValueError("file1 and file2 do not both specify either valid preferences or valid differential preferences")

    # compute and plot correlation
    (corr_r, corr_p) = scipy.stats.pearsonr(xvalues, yvalues)
    n = len(xvalues)
    print("\nThe Pearson correlation is %g (P = %g, N = %d).\nWriting this correlation to %s\n" % (corr_r, corr_p, n, corrfile))
    with open(corrfile, 'w') as f:
        f.write('R = %g\nP = %g\nN = %d' % (corr_r, corr_p, n))
    if not args['noplot']:
        print("\nNow plotting the data in scatterplot form to %s" % plotfile)
        if args['corr_on_plot']:
            corr = (corr_r, corr_p)
        else:
            corr = None
        dms_tools.plot.PlotCorrelation(xvalues, yvalues, plotfile, args['name1'], args['name2'], alpha=args['alpha'], title=args['plot_title'], corr=corr, symmetrize=True, fixaxes=fixaxes, r2=args['r2'], logx=logx, logy=logy)

    print('Successfully completed %s at %s' % (prog, time.asctime()))



if __name__ == "__main__":
    # Execute the script body only when run directly, not when imported.
    main()
