/*
 * Decompiled with CFR 0.152.
 */
import java.net.URL;
import java.util.List;
import net.htmlparser.jericho.CharacterReference;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.MasonTagTypes;
import net.htmlparser.jericho.MicrosoftConditionalCommentTagTypes;
import net.htmlparser.jericho.PHPTagTypes;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.TextExtractor;

/**
 * Command-line sample that loads an HTML document with the Jericho HTML parser and
 * prints its title, its "description" and "keywords" meta values, its hyperlinks,
 * and two renderings of its extracted text.
 *
 * <p>Usage: {@code java ExtractText [urlOrPath]}. When no argument is given the
 * default {@code data/test.html} is used. An argument without a {@code ':'} is
 * turned into a {@code file:} URL.
 */
public class ExtractText {
    /** Placeholder printed when a requested piece of metadata is absent. */
    private static final String NONE = "(none)";

    /**
     * Program entry point.
     *
     * @param args optional single argument: the URL (or local path) of the document to process
     * @throws Exception if the URL is malformed or the document cannot be read or parsed
     */
    public static void main(String[] args) throws Exception {
        String sourceUrlString = "data/test.html";
        if (args.length == 0) {
            System.err.println("Using default argument of \"" + sourceUrlString + '"');
        } else {
            sourceUrlString = args[0];
        }
        // No ':' means no URL scheme was supplied, so treat the argument as a local file path.
        if (sourceUrlString.indexOf(':') == -1) {
            sourceUrlString = "file:" + sourceUrlString;
        }

        // Register the additional tag types before the Source is created and parsed.
        MicrosoftConditionalCommentTagTypes.register();
        PHPTagTypes.register();
        // The PHP short tag type is explicitly deregistered again after the bulk registration.
        PHPTagTypes.PHP_SHORT.deregister();
        MasonTagTypes.register();

        Source source = new Source(new URL(sourceUrlString));
        source.fullSequentialParse();

        System.out.println("Document title:");
        System.out.println(orNone(getTitle(source)));

        System.out.println("\nDocument description:");
        System.out.println(orNone(getMetaValue(source, "description")));

        System.out.println("\nDocument keywords:");
        System.out.println(orNone(getMetaValue(source, "keywords")));

        System.out.println("\nLinks to other documents:");
        printLinks(source);

        System.out.println("\nAll text from file (exluding content inside SCRIPT and STYLE elements):\n");
        System.out.println(source.getTextExtractor().setIncludeAttributes(true).toString());

        System.out.println("\nSame again but this time extend the TextExtractor class to also exclude text from P elements and any elements with class=\"control\":\n");
        System.out.println(newFilteringExtractor(source).setIncludeAttributes(true).toString());
    }

    /**
     * Returns {@code value}, or the {@code "(none)"} placeholder when it is {@code null}.
     *
     * @param value the value to display, possibly {@code null}
     * @return {@code value} if non-null, otherwise {@code "(none)"}
     */
    private static String orNone(String value) {
        return value == null ? NONE : value;
    }

    /**
     * Prints one line per {@code <a>} element that has an {@code href} attribute value,
     * in the form {@code label <href>}, where the label is the text extracted from the
     * element's content. Anchors without an {@code href} value are skipped.
     *
     * @param source the parsed document
     */
    private static void printLinks(Source source) {
        List<Element> linkElements = source.getAllElements("a");
        for (Element linkElement : linkElements) {
            String href = linkElement.getAttributeValue("href");
            if (href == null) {
                continue;
            }
            String label = linkElement.getContent().getTextExtractor().toString();
            System.out.println(label + " <" + href + '>');
        }
    }

    /**
     * Creates a {@link TextExtractor} over {@code source} whose {@code excludeElement}
     * hook additionally rejects {@code p} elements and any element whose {@code class}
     * attribute equals {@code "control"} (ignoring case).
     *
     * @param source the parsed document
     * @return the customised extractor
     */
    private static TextExtractor newFilteringExtractor(Source source) {
        return new TextExtractor(source) {

            @Override
            public boolean excludeElement(StartTag startTag) {
                // Compare by value: reference equality only works if the library hands
                // back the very same interned String instance as the literal.
                // NOTE(review): assumes getName() yields the lower-case tag name - confirm against Jericho docs.
                return "p".equals(startTag.getName())
                        || "control".equalsIgnoreCase(startTag.getAttributeValue("class"));
            }
        };
    }

    /**
     * Returns the document title, taken from the first {@code title} element.
     *
     * @param source the parsed document
     * @return the result of {@code CharacterReference.decodeCollapseWhiteSpace} applied to
     *         the title element's content, or {@code null} if there is no {@code title} element
     */
    private static String getTitle(Source source) {
        Element titleElement = source.getFirstElement("title");
        if (titleElement == null) {
            return null;
        }
        return CharacterReference.decodeCollapseWhiteSpace(titleElement.getContent());
    }

    /**
     * Looks up the {@code content} attribute of the first {@code meta} start tag whose
     * {@code name} attribute matches {@code key}.
     *
     * <p>Start tags that carry a matching {@code name} attribute but are not {@code meta}
     * tags are skipped, and the search resumes after them.
     *
     * @param source the parsed document
     * @param key    the value of the {@code name} attribute to look for; the final
     *               {@code false} argument passed to {@code getNextStartTag} presumably
     *               makes this match case-insensitive - verify against the Jericho API
     * @return the tag's {@code content} attribute value (may itself be {@code null} if the
     *         attribute is missing), or {@code null} if no matching {@code meta} tag exists
     */
    private static String getMetaValue(Source source, String key) {
        int pos = 0;
        while (pos < source.length()) {
            StartTag startTag = source.getNextStartTag(pos, "name", key, false);
            if (startTag == null) {
                return null;
            }
            // Value comparison instead of '==' so the check does not depend on string interning.
            if ("meta".equals(startTag.getName())) {
                return startTag.getAttributeValue("content");
            }
            // Matching name attribute on a non-meta tag: continue searching after it.
            pos = startTag.getEnd();
        }
        return null;
    }
}

