Using a Lexer to Extract Links

If you only need the raw link targets (the href attribute values of the anchor tags), you can use a Lexer to access the links:

import java.io.IOException;
import java.net.URL;
import java.net.URLConnection;
import org.htmlparser.Node;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.nodes.TagNode;
import org.htmlparser.util.ParserException;

/**
 * Demonstrates extracting raw link targets from an HTML page with a Lexer.
 *
 * The page is read node by node; for every opening <code>A</code> tag that
 * carries an <code>href</code> attribute, the attribute value is printed to
 * standard output, one per line.
 */
public class LinkDemo
{
    /** Page that is read when no URL is supplied on the command line. */
    private static final String DEFAULT_URL = "http://urlIWantToParse.com";

    /**
     * Prints the <code>href</code> value of every opening anchor tag found
     * on a page.
     *
     * @param args optional; if at least one argument is given, the first one
     *             is used as the URL of the page to read, otherwise
     *             {@link #DEFAULT_URL} is used
     * @throws ParserException if the Lexer reports a problem while being
     *                         constructed or while reading nodes
     * @throws IOException if the URL is malformed or the connection cannot
     *                     be opened
     */
    public static void main (String[] args) throws ParserException, IOException
    {
        // Let the caller choose the page; fall back to the built-in default
        // so running without arguments behaves as before.
        String address = (null != args && 0 < args.length) ? args[0] : DEFAULT_URL;

        URL url = new URL (address);
        URLConnection connection = url.openConnection ();
        Lexer lexer = new Lexer (connection);

        Node node;
        while (null != (node = lexer.nextNode ()))
        {
            // Only tags can carry an href; skip every other kind of node.
            if (!(node instanceof TagNode))
                continue;

            TagNode tag = (TagNode)node;

            // Constant-first comparison: safe even if the tag name is null.
            // End tags are skipped so only opening <A ...> tags are examined.
            if ("A".equals (tag.getTagName ()) && !tag.isEndTag ())
            {
                String href = tag.getAttribute ("href");
                if (null != href)
                    System.out.println (href);
            }
        }
    }
}




Last edited on Thursday, January 8, 2004 4:06:57 am.