import graphlab.connect as _mt
from _job import Job as _Job
from graphlab.deploy import Task as _Task, _Pipeline, _default_session
import environment as _environment
import _executionenvironment as _env
import graphlab as _gl

import logging as _logging
import time as _time

# Module-level logger, named after this module; used by the functions below
# for progress, validation and binding messages.
__LOGGER__ = _logging.getLogger(__name__)


def create(tasks, name=None, environment=None):
    """
    Run a list of tasks, with optional bindings, in the specified environment.

    By default, this method will kick off asynchronous work, and return a Job
    object to monitor/manage that work.

    Parameters
    ----------
    tasks : list [Task | Pipeline | str | tuple [ str, dict ] | list] | Task | str
        List of tasks to run, or names of artifacts to run. A value that is
        not a list is wrapped in a single-element list. An entry may itself be
        a list of steps, and any step may be a ``(task, bindings)`` tuple.

    name : str, optional
        Name for this execution (names the returned Job). When omitted or
        empty, the name is built as
        ``'job-<first task name>-<environment name>-<timestamp>'``.

    environment : :class:`~graphlab.deploy.environment.EC2` |
                  :class:`~graphlab.deploy.environment.Hadoop` |
                  :class:`~graphlab.deploy.environment.LocalAsync` | str, optional
        Optional environment for execution, or the name of a saved
        environment. This would commonly hold access keys, launch locations
        etc.  Also included in the environment object is a dictionary for
        associated metadata to pass to the execution. Default is the saved
        environment named 'async'; if none exists a 'LocalAsync' environment
        with that name is created and saved, which will have the execution
        occur in the background locally.

    Returns
    -------
    job : :py:class:`~graphlab.deploy._job.Job`
        Used for monitoring and managing the execution of the Job.

    Raises
    ------
    TypeError
        If ``name`` is not a string, if a named environment cannot be loaded,
        if a task cannot be found, or if the environment type is not
        supported for output validation.
    RuntimeError
        If a job with the same name already exists, or if any task output is
        not bound to a location valid for the execution environment.
    Exception
        If ``environment`` is neither a string nor an Environment, or no
        execution environment matches its type.

    Notes
    -----
    - When this method is invoked, each Task specified is cloned and a snapshot
      of it is used for execution. This snapshot can then be queried by
      inspecting the Job object returned.
    - Pipeline entries are cloned without bindings and their outputs are not
      checked against the environment.
    - On EC2, unless a Pipeline is among the tasks, the job runs on a copy of
      the environment with ``num_hosts`` forced to 1.

    Examples
    --------
    Each entry in the tasks list could be a pair with a dictionary of bindings
    for that entry. For example:

        >>> tasks = [('task1', {'input':'s3://big-file'}),
        >>>          ('task2', {'input':'/localfile'})]
        >>> graphlab.deploy.job.create(tasks, name='with-bindings')

    """
    _session = _gl.deploy._default_session

    environment = _resolve_environment(environment)
    __LOGGER__.info("Preparing using environment: %s", environment.name)

    if name is not None and not isinstance(name, str):
        raise TypeError("The name you gave for this job is not a string.")

    # Normalize a single task / task name into a one-element list.
    if not isinstance(tasks, list):
        tasks = [tasks]

    if name is None or name == '':
        name = _default_job_name(tasks, environment)

    __LOGGER__.info("Validating job: '%s'", name)
    validation_msgs = []

    # Job names must be unique; refuse to shadow an existing job.
    if _gl.deploy.jobs[name] is not None:
        raise RuntimeError("Validation Error: Job already exists with the name '%s', please rename or delete the existing job." % name)

    # Resolve artifacts from their names where necessary, clone every
    # artifact and apply any bindings to the clones.
    cloned_artifacts = []
    using_pipeline = False
    for steps in tasks:
        # A Pipeline is cloned whole and appended directly (not wrapped in a
        # step list); no bindings or output validation apply to it here.
        if isinstance(steps, _Pipeline):
            using_pipeline = True
            cloned_artifacts.append(steps._clone(steps.name, session_aware=False))
            continue

        if not isinstance(steps, list):
            steps = [steps]

        cloned_step = []
        for step in steps:
            clone = _clone_with_bindings(_session, step)

            # Collect (rather than raise) a message for every output whose
            # location does not suit a non-local environment, so that all
            # problems are reported together further below.
            _validate_output_to_environment(clone, environment, validation_msgs)

            cloned_step.append(clone)

        cloned_artifacts.append(cloned_step)

    num_tasks = len(cloned_artifacts)
    tracker = _mt._get_metric_tracker()
    # The order of these checks is kept deliberately: the first matching
    # environment type decides both the metric and the execution backend.
    if isinstance(environment, _environment.Local):
        tracker.track('graphlab.deploy.job.create.local', value=1, properties={'num_tasks': num_tasks})
        env = _env.LocalExecutionEnvironment()
    elif isinstance(environment, _environment.LocalAsync):
        tracker.track('graphlab.deploy.job.create.localasync', value=1, properties={'num_tasks': num_tasks})
        env = _env.LocalAsynchronousEnvironment()
    elif isinstance(environment, _environment.EC2):
        tracker.track('graphlab.deploy.job.create.ec2', value=1, properties={'num_tasks': num_tasks})
        if not using_pipeline and environment.num_hosts > 1:
            __LOGGER__.warning("Currently jobs other than graphlab.toolkits.model_parameter_search cannot use more than one EC2 host.")
            # Work on a shallow copy so the environment object held by the
            # caller keeps its own num_hosts value.
            import copy
            environment = copy.copy(environment)
            environment.num_hosts = 1
        env = _env.Ec2ExecutionEnvironment()
    elif isinstance(environment, _environment.Hadoop):
        tracker.track('graphlab.deploy.job.create.hadoop', value=1, properties={'num_tasks': num_tasks})
        env = _env.HadoopExecutionEnvironment()
    else:
        raise Exception("Unknown execution environment")

    if validation_msgs:
        for msg in validation_msgs:
            __LOGGER__.error(msg)
        raise RuntimeError("Validation Failed: output(s) not set to appropriate location for execution environment. See logs for more details.")

    __LOGGER__.info("Validation complete. Job: '%s' ready for execution", name)
    job = env.run(_session, cloned_artifacts, name, environment)
    _session.register(job)
    job.save()  # save the job once prior to returning.
    return job


def _resolve_environment(environment):
    """
    Turn the ``environment`` argument of ``create`` into an environment object.

    ``None`` selects the saved environment named 'async', creating and saving
    a LocalAsync environment under that name when none exists; that default
    object is returned as-is. A string is looked up in
    ``graphlab.deploy.environments``. Any explicitly supplied environment
    (by name or by object) is returned as a clone.

    Raises TypeError when a named environment cannot be loaded and Exception
    when the argument is neither a string nor an Environment.
    """
    if environment is None:
        environment = _gl.deploy.environments['async']
        if environment is None:
            environment = _environment.LocalAsync('async')
            environment.save()
        return environment

    if isinstance(environment, str):
        # Keep the requested name: the lookup result replaces `environment`
        # and may be None, and the error below must report the real name.
        requested_name = environment
        __LOGGER__.debug("Loading environment: %s", requested_name)
        environment = _gl.deploy.environments[requested_name]
        if environment is None:
            raise TypeError("Environment cannot be loaded correctly with name '%s', please confirm this environment exists by calling graphlab.deploy.environments." % requested_name)
    elif not isinstance(environment, _environment.Environment):
        raise Exception("Unknown type of environment")

    return environment.clone()


def _default_job_name(tasks, environment):
    """
    Build the default job name from the first task, the environment name and
    the current time: 'job-<task name>-<environment name>-<timestamp>'.
    """
    first = tasks[0]
    # The first entry may be a list of steps; name the job after its first step.
    if isinstance(first, list):
        first = first[0]
    # A (task, bindings) pair: only the task part contributes to the name.
    if isinstance(first, tuple):
        first = first[0]
    task_name = first if isinstance(first, str) else first.name
    return 'job-%s-%s-%d' % (task_name, environment.name, _time.time())


def _clone_with_bindings(session, step):
    """
    Resolve one step (a Task/Pipeline, a name, or a ``(task, bindings)`` pair)
    and return a clone of it with the bindings, if any, applied.

    Raises TypeError when the step cannot be resolved to an artifact.
    """
    binding = None
    if isinstance(step, tuple):
        (cur_artifact, binding) = step
    else:
        cur_artifact = step

    if not isinstance(cur_artifact, (_Task, _Pipeline)):
        # Not an artifact object: ask the session to open it as a Task
        # (presumably a lookup by name; the session is defined elsewhere).
        cur_artifact = session._open(cur_artifact, {}, check_cache=True, typename='Task')

    if cur_artifact is None:
        raise TypeError('Unable to find Task to try to run')

    clone = cur_artifact._clone(cur_artifact.name, session_aware=False)

    # Bindings are applied to the clone only; the original artifact is untouched here.
    if binding is not None:
        _apply_binding_to_task(clone, binding)
    return clone

def _apply_binding_to_task(task, binding, allow_object=False):
    """
    Helper method to apply bindings to a given task. This function modifies
    the given task object in place and returns None.

    When specifying bindings, there is no qualification for a binding being an
    input, output, or param, so each name is tried against all three kinds of
    slot. A TypeError raised by a setter is swallowed on purpose: it is the
    expected signal that the name belongs to a different kind of slot (ex.
    set_input('foo') called, then later set_param('foo','bar') throws a
    TypeError). A warning is logged for a name that could not be bound to any
    slot.

    Parameters
    ----------
    task : Task
        Task to modify. Must provide get_inputs/get_outputs/get_params and
        set_inputs/set_outputs/set_params.

    binding : dict
        Mapping of slot name to the value to bind.

    allow_object : bool, optional
        When exactly True, an input value accepted by
        ``_Task._is_valid_data_structure`` is stored in
        ``task._local_binding`` instead of being passed to ``set_inputs``.
    """
    # .items() rather than .iteritems() so the helper runs on Python 2 and 3.
    for param_name, param_value in binding.items():
        bound = False

        if param_name in task.get_inputs():
            try:
                # special-case when task.run() is specifying input binding with object
                # this way when _realize_input is called it will find the object dependency
                if allow_object is True and _Task._is_valid_data_structure(param_value):
                    if not hasattr(task, '_local_binding') or task._local_binding is None:
                        task._local_binding = {}
                    task._local_binding[param_name] = param_value
                else:
                    task.set_inputs({param_name: param_value})
                bound = True
                # Lazy logging arguments: the bound value is only rendered to
                # text when debug logging is actually enabled.
                __LOGGER__.debug("Applied binding named: '%s' as input, with value: '%s'", param_name, param_value)
            except TypeError:
                pass

        if param_name in task.get_outputs():
            try:
                task.set_outputs({param_name: param_value})
                bound = True
                __LOGGER__.debug("Applied binding named: '%s' as output, with value: '%s'", param_name, param_value)
            except TypeError:
                pass

        if param_name in task.get_params():
            try:
                task.set_params({param_name: param_value})
                bound = True
                __LOGGER__.debug("Applied binding named: '%s' as param, with value: '%s'", param_name, param_value)
            except TypeError:
                pass

        if not bound:
            __LOGGER__.warning("Binding not applied since not found in input, output, or params. Name: '%s', value: '%s'", param_name, param_value)

def _validate_output_to_environment(task, environment, validation_msgs):
    """
    Check that the task's outputs are bound to locations that suit the
    execution environment.

    Local and LocalAsync environments are not checked at all. For Hadoop the
    outputs are expected under 'hdfs://', for EC2 under 's3://'; any mismatch
    is reported by appending a message to ``validation_msgs``. Any other
    environment type raises TypeError.
    """
    local_kinds = (_environment.LocalAsync, _environment.Local)
    if isinstance(environment, local_kinds):
        # Local paths may well be valid when running locally, so skip the check.
        return

    if isinstance(environment, _environment.Hadoop):
        expected_prefix = 'hdfs://'
    elif isinstance(environment, _environment.EC2):
        expected_prefix = 's3://'
    else:
        raise TypeError("Invalid environment encountered, unable to create Job, Supported environments are of type: HadoopEnvironment, EC2Environment, LocalEnvironment, or LocalAsynchronous")

    __validate_path_prefixes(task, expected_prefix, validation_msgs)

def __validate_path_prefixes(task, prefix, validation_msgs):
    """
    Check that every bound task output begins with the specified prefix.

    Nothing is raised and nothing is returned: for each output whose value is
    not None and does not start with ``prefix`` (compared after stripping
    surrounding whitespace and lower-casing the value), a message is appended
    to ``validation_msgs``. Outputs whose value is None are skipped.
    """
    # .items() rather than .iteritems() so the helper runs on Python 2 and 3.
    for name, value in task.get_outputs().items():
        if value is not None and not str(value).strip().lower().startswith(prefix):
            validation_msgs.append("Validation error: Task: '%s', output named: '%s' not being saved to expected prefix: '%s', being saved to: '%s'" % (task.name, name, prefix, value))

