#!/usr/bin/python

# Copyright 2012-2013 Erik-Jan van Baaren (erikjan@gmail.com)
# This tool is released as part of the python package ESClient which can be found on PyPI.org
# or on github at https://github.com/eriky/ESClient

import esclient
import json
import argparse
import sys


parser = argparse.ArgumentParser(description="Dump one or more ElasticSearch" +
" indexes to stdout. This tool will dump all the _source fields. If you chose"+
" not to store the _source field, you can not make backups of your index(es)"+
" with this tool.")

parser.add_argument('--url', '-u', required=True, help="The full URL to the ElasticSearch server, including port")
parser.add_argument('--file', '-f', required=False, help="The output file to dump to. By default esdump will dump to stdout.")
parser.add_argument('--indexes', '-i', nargs='+', help="One or more index names to dump, may also be aliases. If none specified, ALL indexes are dumped.")
parser.add_argument('--gzip', '-z', action="store_true", help="Use gzip to compress the output")
parser.add_argument('--bzip2', '-b', action="store_true", help="Use bzip2 to compress the output")
parser.add_argument('--stored-fields', '-s', nargs='+', help="A list of fields that you want to include in the backup (_source, _id, _parent and _routing are included automatically")

arguments = parser.parse_args()

if not arguments.indexes:
    indexes = ['_all']
else:
    indexes = arguments.indexes

if arguments.bzip2 and arguments.gzip:
    sys.stderr.write("Invalid combination of options: I can write either bzip2 or gzip, not both.\n")
    sys.exit(1)
 
es = esclient.ESClient(arguments.url)

# TODO: check cluster state before continuing

# Open a file to write to, based on the arguments given 
if arguments.bzip2 and arguments.file:
    import bz2
    f = bz2.BZ2File(arguments.file, 'wb')
elif arguments.gzip and arguments.file:
    import gzip
    f = gzip.open(arguments.file, 'wb')
elif arguments.file:
    f = open(arguments.file, "w")
else:
    if arguments.bzip2 or arguments.gzip:
        sys.stderr.write("This tool will not write compressed output to stdout. You can however pipe the output through gzip or bzip2 to compress the data.\n")
        sys.exit(1)
    else:    
        # use stdout as a file
        f = sys.stdout
    

# Get the fields that the user wants to backup in addition to the system fields as listed below
fields = set(["_parent", "_routing", "_source"])
if arguments.stored_fields:
    for field in arguments.stored_fields:
        fields.add(field)
fields = list(fields)

query_body = { "query": { "match_all": {} }, "fields": fields }
scroll_id = es.scan(query_body = query_body, indexes = indexes)

indexes = set()

while True:
    scrollres = es.scroll(scroll_id)
    # get next scroll_id
    scroll_id = scrollres["_scroll_id"]
    
    hits = scrollres["hits"]["hits"]

    num_results = 0
    for hit in scrollres["hits"]["hits"]:
        # Delete this field, since it is useless for restore purposes
        del(hit["_score"])
        if hit["_index"] not in indexes:
            indexes.add(hit["_index"])

        f.write(json.dumps(hit))
        f.write('\n')
        num_results += 1

    # See if we reached the end of the data
    if num_results == 0:
        break
# TODO
# write_mappings(indexes)

f.close()
