/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.process;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.objectbank.XMLBeginEndIterator;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.process.WordTokenFactory;
import edu.stanford.nlp.util.Function;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;

public class DocumentPreprocessor
implements Iterable<List<HasWord>> {
    // Source of text for the iterators; iterator() fills it from inputPath when it is null.
    private Reader inputReader = null;
    // Path or URL string handed to getReaderFromPath() when no Reader is available.
    private String inputPath = null;
    // Selects which iterator implementation iterator() returns.
    private DocType docType = DocType.Plain;
    // Builds the tokenizer for each PlainTextIterator; when null a WhitespaceTokenizer is used instead.
    private TokenizerFactory<? extends HasWord> tokenizerFactory = PTBTokenizer.factory();
    // Charset name consulted by getReaderFromPath(); stays null until setEncoding() accepts a name.
    private String encoding = null;
    // Tokens treated as sentence-final when sentenceDelimiter is null.
    private String[] sentenceFinalPuncWords = new String[]{".", "?", "!"};
    // Optional transformation applied to each sentence before it is handed out; null disables it.
    private Function<List<HasWord>, List<HasWord>> escaper = null;
    // When non-null, replaces sentenceFinalPuncWords as the sentence boundary token
    // ("*NL*" is added as well if this string is entirely whitespace).
    private String sentenceDelimiter = null;
    // When non-null, inserted into a regex that splits each token at the last match into word and tag.
    private String tagDelimiter = null;
    // Passed to XMLBeginEndIterator as its second constructor argument by XMLIterator.
    private String elementDelimiter = ".*";
    // Tokens that may follow a sentence-final token and still be kept in the same sentence
    // (only consulted when sentenceDelimiter is null).
    private final String[] sentenceFinalFollowers = new String[]{")", "]", "\"", "'", "''", "-RRB-", "-RSB-", "-RCB-"};

    /**
     * Creates a preprocessor over {@code input}, treating it as {@link DocType#Plain}.
     *
     * @param input reader to consume; forwarded to the two-argument constructor, which
     *              rejects {@code null}
     */
    public DocumentPreprocessor(Reader input) {
        this(input, DocType.Plain);
    }

    /**
     * Creates a preprocessor that reads from an already-open reader.
     *
     * @param input reader to consume; must not be {@code null}
     * @param t     document type that decides which iterator {@link #iterator()} returns
     * @throws IllegalArgumentException if {@code input} is {@code null}
     */
    public DocumentPreprocessor(Reader input, DocType t) {
        if (input == null) {
            // IllegalArgumentException is a RuntimeException, so existing callers that
            // catch RuntimeException keep working.
            throw new IllegalArgumentException("Cannot read from null object!");
        }
        this.docType = t;
        this.inputReader = input;
    }

    /**
     * Creates a preprocessor for the document at {@code docPath}, treating it as
     * {@link DocType#Plain}.
     *
     * @param docPath path or URL string; forwarded to the two-argument constructor, which
     *                rejects {@code null}
     */
    public DocumentPreprocessor(String docPath) {
        this(docPath, DocType.Plain);
    }

    /**
     * Creates a preprocessor for a document identified by a path or URL string. The
     * document is not opened here; {@link #iterator()} opens it on demand.
     *
     * @param docPath path or URL string of the document; must not be {@code null}
     * @param t       document type that decides which iterator {@link #iterator()} returns
     * @throws IllegalArgumentException if {@code docPath} is {@code null}
     */
    public DocumentPreprocessor(String docPath, DocType t) {
        if (docPath == null) {
            // IllegalArgumentException is a RuntimeException, so existing callers that
            // catch RuntimeException keep working.
            throw new IllegalArgumentException("Cannot open null document path!");
        }
        this.docType = t;
        this.inputPath = docPath;
    }

    /**
     * Records the charset name to use when opening the input, provided
     * {@link Charset#isSupported(String)} accepts it. An unsupported name leaves the
     * current setting untouched.
     *
     * @param encoding charset name to adopt
     * @throws IllegalCharsetNameException propagated from {@link Charset#isSupported(String)}
     */
    public void setEncoding(String encoding) throws IllegalCharsetNameException {
        boolean usable = Charset.isSupported(encoding);
        if (!usable) {
            return;
        }
        this.encoding = encoding;
    }

    /**
     * Replaces the tokens that end a sentence when no sentence delimiter is set.
     * The array reference is stored as given (no copy is made).
     *
     * @param sentenceFinalPuncWords new sentence-final tokens; PlainTextIterator treats
     *                               {@code null} as "no sentence-final tokens"
     */
    public void setSentenceFinalPuncWords(String[] sentenceFinalPuncWords) {
        this.sentenceFinalPuncWords = sentenceFinalPuncWords;
    }

    /**
     * Installs the factory used to build a tokenizer for each plain-text iterator.
     *
     * @param newTokenizerFactory factory to use; {@code null} makes PlainTextIterator fall
     *                            back to a WhitespaceTokenizer
     */
    public void setTokenizerFactory(TokenizerFactory<? extends HasWord> newTokenizerFactory) {
        tokenizerFactory = newTokenizerFactory;
    }

    /**
     * Installs a function that PlainTextIterator applies to every sentence before
     * returning it.
     *
     * @param e sentence transformation, or {@code null} to return sentences unchanged
     */
    public void setEscaper(Function<List<HasWord>, List<HasWord>> e) {
        escaper = e;
    }

    /**
     * Sets the token that marks a sentence boundary. When non-null it replaces the
     * sentence-final punctuation words and disables the sentence-final followers.
     *
     * @param s boundary token, or {@code null} to use the punctuation words instead
     */
    public void setSentenceDelimiter(String s) {
        sentenceDelimiter = s;
    }

    /**
     * Sets the delimiter that separates a word from its tag inside a token. The value is
     * inserted verbatim into a regular expression by PlainTextIterator.
     *
     * @param s delimiter, or {@code null} to leave tokens unsplit
     */
    public void setTagDelimiter(String s) {
        tagDelimiter = s;
    }

    /**
     * Sets the element selector that XMLIterator passes to XMLBeginEndIterator.
     *
     * @param s element selector string
     */
    public void setElementDelimiter(String s) {
        elementDelimiter = s;
    }

    /**
     * Returns an iterator over the sentences of the document.
     *
     * <p>If no reader is available yet, one is opened from the configured path. A
     * failure to open it is reported on {@code System.err} and an empty iterator is
     * returned. An empty iterator is also returned when the document type is neither
     * {@link DocType#Plain} nor {@link DocType#XML}.
     *
     * @return a sentence iterator; never {@code null}
     */
    @Override
    public Iterator<List<HasWord>> iterator() {
        try {
            if (inputReader == null) {
                inputReader = getReaderFromPath(inputPath);
            }
            if (docType == DocType.Plain) {
                return new PlainTextIterator();
            }
            if (docType == DocType.XML) {
                return new XMLIterator();
            }
        }
        catch (IOException e) {
            // Best effort: report why the path could not be opened and fall through to
            // the empty iterator instead of failing the caller's for-each loop.
            System.err.printf("%s: Could not open path %s (%s)\n", getClass().getName(), inputPath, e.getMessage());
        }
        return new Iterator<List<HasWord>>(){

            @Override
            public boolean hasNext() {
                return false;
            }

            @Override
            public List<HasWord> next() {
                throw new NoSuchElementException();
            }

            @Override
            public void remove() {
                // Same behaviour as the plain-text and XML iterators: removal is unsupported.
                throw new UnsupportedOperationException();
            }
        };
    }

    /**
     * Opens {@code path} for reading. The string is first tried as a URL; if it is not a
     * well-formed URL it is treated as a local file name.
     *
     * <p>The configured encoding, when set, is applied to both URL and file input.
     * Without one the platform default charset is used.
     *
     * @param path URL or file name to open
     * @return a buffered reader over the resource
     * @throws IOException if {@code path} is {@code null}, the file does not exist, or
     *                     the resource cannot be opened or decoded
     */
    private Reader getReaderFromPath(String path) throws IOException {
        if (path == null) {
            // new File(null) would throw NullPointerException, which iterator() does not handle.
            throw new IOException("Unable to open null path");
        }
        InputStream rawStream;
        try {
            URL url = new URL(path);
            URLConnection connection = url.openConnection();
            rawStream = connection.getInputStream();
        }
        catch (MalformedURLException e) {
            // Not a URL: fall back to the local file system.
            File file = new File(path);
            if (!file.exists()) {
                throw new IOException("Unable to open " + path);
            }
            rawStream = new FileInputStream(file);
        }
        boolean wrapped = false;
        try {
            Reader decoder = this.encoding == null
                    ? new InputStreamReader(rawStream)
                    : new InputStreamReader(rawStream, this.encoding);
            Reader buffered = new BufferedReader(decoder);
            wrapped = true;
            return buffered;
        }
        finally {
            if (!wrapped) {
                // The decoder could not be created; do not leak the underlying stream.
                try {
                    rawStream.close();
                }
                catch (IOException ignored) {
                    // The failure that brought us here is the one worth reporting.
                }
            }
        }
    }

    /**
     * Command-line driver: splits a document into sentences and prints one sentence per
     * line, tokens separated by single spaces.
     *
     * <p>Usage: {@code DocumentPreprocessor filename [OPTS]}, where OPTS are
     * <ul>
     *   <li>{@code -xml ELEMENT} treat the input as XML and read the given element(s)</li>
     *   <li>{@code -encoding NAME} charset for input and output (default {@code utf-8})</li>
     *   <li>{@code -printSentenceLengths} print each sentence length to stderr</li>
     *   <li>{@code -suppressEscaping} tokenize with {@code ptb3Escaping=false}</li>
     *   <li>{@code -tokenizerOptions OPTS} pass OPTS to the PTB tokenizer factory</li>
     *   <li>{@code -noTokenization} no tokenizer factory; one sentence per line</li>
     *   <li>{@code -tag DELIM} split tokens into word and tag at DELIM</li>
     * </ul>
     * Options apply cumulatively regardless of order. An option that needs a value but
     * is the last argument is reported as unknown.
     *
     * @param args file name followed by options
     * @throws IOException if the output writer cannot be created for the chosen encoding
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println("usage: DocumentPreprocessor filename [OPTS]");
            return;
        }
        DocumentPreprocessor docPreprocessor = new DocumentPreprocessor(args[0]);
        String encoding = "utf-8";
        boolean printSentenceLengths = false;
        for (int i = 1; i < args.length; ++i) {
            String option = args[i];
            boolean hasValue = i + 1 < args.length;
            if (option.equals("-xml") && hasValue) {
                // Switch the existing instance to XML so options given before -xml
                // are not thrown away with a freshly constructed preprocessor.
                docPreprocessor.docType = DocType.XML;
                docPreprocessor.setElementDelimiter(args[++i]);
            } else if (option.equals("-encoding") && hasValue) {
                encoding = args[++i];
            } else if (option.equals("-printSentenceLengths")) {
                printSentenceLengths = true;
            } else if (option.equals("-suppressEscaping")) {
                docPreprocessor.setTokenizerFactory(PTBTokenizer.factory(new WordTokenFactory(), "ptb3Escaping=false"));
            } else if (option.equals("-tokenizerOptions") && hasValue) {
                docPreprocessor.setTokenizerFactory(PTBTokenizer.factory(new WordTokenFactory(), args[++i]));
            } else if (option.equals("-noTokenization")) {
                docPreprocessor.setTokenizerFactory(null);
                docPreprocessor.setSentenceDelimiter(System.getProperty("line.separator"));
            } else if (option.equals("-tag") && hasValue) {
                docPreprocessor.setTagDelimiter(args[++i]);
            } else {
                System.err.println("Unknown option: " + option);
            }
        }
        docPreprocessor.setEncoding(encoding);
        int numSents = 0;
        PrintWriter pw = new PrintWriter((Writer)new OutputStreamWriter((OutputStream)System.out, encoding), true);
        try {
            for (List<HasWord> sentence : docPreprocessor) {
                ++numSents;
                if (printSentenceLengths) {
                    System.err.println("Length:\t" + sentence.size());
                }
                boolean printSpace = false;
                for (HasWord word : sentence) {
                    if (printSpace) {
                        pw.print(" ");
                    }
                    printSpace = true;
                    pw.print(word.word());
                }
                pw.println();
            }
        }
        finally {
            // Flush and release the writer even if iteration fails part-way through.
            pw.close();
        }
        System.err.println("Read in " + numSents + " sentences.");
    }

    /**
     * Sentence iterator for XML input. Each String produced by the XMLBeginEndIterator
     * is wrapped in a fresh reader and split into sentences by a PlainTextIterator.
     * The next sentence is always computed one step ahead.
     */
    private class XMLIterator
    implements Iterator<List<HasWord>> {
        // Supplies the text blocks taken from the original document reader.
        private final XMLBeginEndIterator<String> xmlItr;
        // The document reader as it was at construction; closed once all blocks are consumed.
        private final Reader originalDocReader;
        // Sentence splitter for the block currently being processed; null before the first block.
        private PlainTextIterator plainItr;
        // Look-ahead sentence; null once the input is exhausted.
        private List<HasWord> nextSent;

        public XMLIterator() {
            originalDocReader = inputReader;
            xmlItr = new XMLBeginEndIterator<String>(originalDocReader, elementDelimiter);
            primeNext();
        }

        /**
         * Advances the look-ahead: first drains the current block, then moves on to
         * further blocks until one yields a sentence or no blocks remain.
         */
        private void primeNext() {
            while (true) {
                if (plainItr != null && plainItr.hasNext()) {
                    nextSent = plainItr.next();
                } else if (xmlItr.hasNext()) {
                    String block = xmlItr.next();
                    // PlainTextIterator reads from the enclosing instance's inputReader,
                    // so point it at this block before constructing the splitter.
                    inputReader = new BufferedReader(new StringReader(block));
                    plainItr = new PlainTextIterator();
                    nextSent = plainItr.hasNext() ? plainItr.next() : null;
                } else {
                    IOUtils.closeIgnoringExceptions(originalDocReader);
                    nextSent = null;
                    return;
                }
                if (nextSent != null) {
                    return;
                }
            }
        }

        @Override
        public boolean hasNext() {
            return nextSent != null;
        }

        @Override
        public List<HasWord> next() {
            List<HasWord> current = nextSent;
            if (current == null) {
                throw new NoSuchElementException();
            }
            primeNext();
            return current;
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    private class PlainTextIterator
    implements Iterator<List<HasWord>> {
        private Tokenizer<? extends HasWord> tokenizer;
        private Set<String> sentDelims;
        private Set<String> delimFollowers;
        private Function<String, String[]> splitTag;
        private List<HasWord> nextSent;
        private List<HasWord> nextSentCarryover;

        public PlainTextIterator() {
            this.delimFollowers = new HashSet<String>(Arrays.asList(DocumentPreprocessor.this.sentenceFinalFollowers));
            this.nextSent = null;
            this.nextSentCarryover = new ArrayList<HasWord>();
            this.sentDelims = new HashSet<String>();
            boolean eolIsSignificant = false;
            if (DocumentPreprocessor.this.sentenceDelimiter == null) {
                if (DocumentPreprocessor.this.sentenceFinalPuncWords != null) {
                    this.sentDelims = new HashSet<String>(Arrays.asList(DocumentPreprocessor.this.sentenceFinalPuncWords));
                }
            } else {
                this.sentDelims.add(DocumentPreprocessor.this.sentenceDelimiter);
                this.delimFollowers = new HashSet<String>();
                eolIsSignificant = DocumentPreprocessor.this.sentenceDelimiter.matches("\\s+");
                if (eolIsSignificant) {
                    this.sentDelims.add("*NL*");
                }
            }
            this.tokenizer = DocumentPreprocessor.this.tokenizerFactory == null ? WhitespaceTokenizer.newWordWhitespaceTokenizer(DocumentPreprocessor.this.inputReader, eolIsSignificant) : (eolIsSignificant ? DocumentPreprocessor.this.tokenizerFactory.getTokenizer(DocumentPreprocessor.this.inputReader, "tokenizeNLs") : DocumentPreprocessor.this.tokenizerFactory.getTokenizer(DocumentPreprocessor.this.inputReader));
            if (DocumentPreprocessor.this.tagDelimiter != null) {
                this.splitTag = new Function<String, String[]>(){
                    private final String splitRegex;
                    {
                        this.splitRegex = String.format("%s(?!.*%s)", DocumentPreprocessor.this.tagDelimiter, DocumentPreprocessor.this.tagDelimiter);
                    }

                    @Override
                    public String[] apply(String in) {
                        String[] splits = in.trim().split(this.splitRegex);
                        if (splits.length == 2) {
                            return splits;
                        }
                        String[] oldStr = new String[]{in};
                        return oldStr;
                    }
                };
            }
        }

        private void primeNext() {
            this.nextSent = new ArrayList<HasWord>(this.nextSentCarryover);
            this.nextSentCarryover.clear();
            boolean seenBoundary = false;
            while (this.tokenizer.hasNext()) {
                HasWord token = this.tokenizer.next();
                if (this.splitTag != null) {
                    String[] toks = this.splitTag.apply(token.word());
                    token.setWord(toks[0]);
                    if (token instanceof Label) {
                        ((Label)((Object)token)).setValue(toks[0]);
                    }
                    if (toks.length == 2 && token instanceof HasTag) {
                        ((HasTag)((Object)token)).setTag(toks[1]);
                    }
                }
                if (this.sentDelims.contains(token.word())) {
                    seenBoundary = true;
                } else if (seenBoundary && !this.delimFollowers.contains(token.word())) {
                    this.nextSentCarryover.add(token);
                    break;
                }
                if (!token.word().matches("\\s+") && !token.word().equals("*NL*")) {
                    this.nextSent.add(token);
                }
                if (!seenBoundary || this.delimFollowers.size() != 0) continue;
                if (this.nextSent.size() > 0) break;
                seenBoundary = false;
            }
            if (this.nextSent.size() == 0 && this.nextSentCarryover.size() == 0) {
                IOUtils.closeIgnoringExceptions(DocumentPreprocessor.this.inputReader);
                DocumentPreprocessor.this.inputReader = null;
                this.nextSent = null;
            } else if (DocumentPreprocessor.this.escaper != null) {
                this.nextSent = (List)DocumentPreprocessor.this.escaper.apply(this.nextSent);
            }
        }

        @Override
        public boolean hasNext() {
            if (this.nextSent == null) {
                this.primeNext();
            }
            return this.nextSent != null;
        }

        @Override
        public List<HasWord> next() {
            if (this.nextSent == null) {
                this.primeNext();
            }
            if (this.nextSent == null) {
                throw new NoSuchElementException();
            }
            List<HasWord> thisIteration = this.nextSent;
            this.nextSent = null;
            return thisIteration;
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /** Kind of input document; iterator() chooses its iterator implementation from this value. */
    public static enum DocType {
        /** Input handled by PlainTextIterator. */
        Plain,
        /** Input handled by XMLIterator. */
        XML;

    }
}

