Email Extraction

Email extraction is very similar to link extraction: you extract the links from a page and keep only those that are email addresses. The LinkTag class has a method, isMailLink(), that checks whether the HREF starts with "mailto:". The following adapts the NodeFilter example, using an anonymous inner class as the filter:

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Demonstrates email extraction: collects every mail link
 * (a link tag for which <code>isMailLink()</code> is true) from a page
 * and prints its link text and address.
 */
public class EmailLinkDemo
{
    /**
     * Parse a page and print each mail link as <code>"text" => link</code>.
     * @param args Optional. If a first argument is given it is used as the
     * URL to parse; otherwise the placeholder URL is used.
     * @exception ParserException Propagated from the parser calls.
     */
    public static void main (String[] args) throws ParserException
    {
        // Allow the page to be chosen on the command line so the demo can
        // be run without editing the source; the placeholder stays the default.
        String url = "http://urlIWantToParse.com";
        if ((null != args) && (0 < args.length))
        {
            url = args[0];
        }
        Parser parser = new Parser (url);
        NodeFilter filter = new NodeFilter ()
        {
            /**
             * Accept nodes that are mail links.
             * @param node The node to check.
             * @return <code>true</code> if the node is a link tag and
             * its <code>isMailLink()</code> is true,
             * <code>false</code> otherwise.
             */
            public boolean accept (Node node)
            {
                // instanceof is the idiomatic type test and is false for null.
                return ((node instanceof LinkTag)
                    && ((LinkTag)node).isMailLink ());
            }
        };
        // Gather the nodes accepted by the filter from each top-level node.
        NodeList links = new NodeList ();
        for (NodeIterator e = parser.elements (); e.hasMoreNodes (); )
        {
            e.nextNode ().collectInto (links, filter);
        }
        for (int i = 0; i < links.size (); i++)
        {
            // The filter only accepts LinkTag nodes, so the cast is safe.
            LinkTag linkTag = (LinkTag)links.elementAt (i);
            System.out.print ("\"" + linkTag.getLinkText () + "\" => ");
            System.out.println (linkTag.getLink ());
        }
    }
}




Last edited on Wednesday, January 7, 2004 5:26:12 pm.