Over a million developers have joined DZone.
{{announcement.body}}
{{announcement.title}}

Webcrawler

DZone's Guide to

Webcrawler

·
Free Resource
The core classes from my webcrawler implementation.

Again, not really working code as there are a bunch of dependencies missing. This is really for demonstration purposes.


/**
 * Crawls outwards from a set of seed URLs and exposes the pages it finds as an
 * {@link Iterable}.
 *
 * <p>The seed URLs are turned into pages through {@code processor.page(url)} in
 * the constructor; everything reachable from them is discovered on demand while
 * {@link #pageIterator} is being consumed.
 *
 * @param <T> the concrete page type produced by the {@link PageProcessor}
 */
public class WebCrawler<T extends WebPage> implements Iterable<T>
{
    /** Pages that have already been handed out by {@link #pageIterator}. */
    private final HashSet<T> visitedPages = new HashSet<T>();

    /**
     * Pending work. Holds both individual pages (the seeds) and iterators over
     * linked pages (added in {@code onNext} below), hence the {@code Object}
     * element type.
     */
    private final LinkedList<Object> workQueue = new LinkedList<Object>();

    private final PageProcessor<T> processor;

    // Map of URLs to pages.
    private final Map<String, T> pages = new HashMap<String, T>();

    /** Accepts only objects that are not yet contained in {@link #visitedPages}. */
    private final Predicate<Object> unvisited = new Predicate<Object>() {
        public boolean satisfies(Object page) {
            return !WebCrawler.this.visitedPages.contains(page);
        }
    };

    /**
     * Creates a crawler seeded with the given URLs.
     *
     * @param processor turns URLs into pages and enumerates a page's links
     * @param urls      seed URLs; each one is resolved via {@code processor.page}
     *                  and queued
     */
    public WebCrawler(PageProcessor<T> processor, String... urls) {
        this.processor = processor;

        for (String url : urls) {
            this.workQueue.add(processor.page(url));
        }
    }

    /*
     * Iterator which iterates over all WebPages that haven't yet been visited.
     * It is thoroughly lazy and a web page will never be visited until it turns
     * up in this iterator.
     *
     * Whenever the listening iterator reports a WebPage through onNext, that
     * page is recorded as visited and an iterator over its not-yet-visited
     * links is appended to the work queue. The helper iterators are project
     * classes used as raw types, so the cast to Iterator<T> cannot be checked
     * by the compiler; the instanceof test in onNext is the only runtime guard.
     */
    @SuppressWarnings("unchecked")
    public final Iterator<T> pageIterator = (Iterator<T>)
        new FlatteningIterator(
            new ListeningIterator<Object>(
                new FilterIterator(
                    unvisited,
                    new PoppingIterator(this.workQueue))) {
                @Override
                public void onNext(Object next) {
                    if (next instanceof WebPage) {
                        WebCrawler.this.visitedPages.add((T) next);
                        WebCrawler.this.workQueue.add(
                            new FilterIterator(
                                unvisited,
                                WebCrawler.this.processor.linkedPages((WebPage) next)));
                    }
                }
            });

    /**
     * Returns an iterator that links the iterator over the already visited
     * pages with {@link #pageIterator}, using {@code IteratorUtils.link}.
     */
    public Iterator<T> iterator() {
        return IteratorUtils.link(this.visitedPages.iterator(), this.pageIterator);
    }
}

/**
 * Abstract class representing a mechanism for processing URLs into pages. Contains
 * utility methods and a caching strategy.
 *
 * @param <T> the type of page this processor produces
 * @author david
 */
public abstract class PageProcessor<T extends WebPage> implements Transformer<String, T>
{
    private final PageCache<T> cache;

    /** Applies {@link #transform(String)} to every element of a URL iterator. */
    private final IteratorTransformer<String, T> iteratorTransformer =
        new IteratorTransformer<String, T>(this);

    /** Decides which link URLs are followed by {@link #linkedPages(WebPage)}. */
    private final Predicate<String> domain;

    /**
     * @param domain predicate applied to link URLs; only matching URLs are followed
     * @param cache  store for pages that have already been processed
     */
    public PageProcessor(Predicate<String> domain, PageCache<T> cache) {
        this.domain = domain;
        this.cache = cache;
    }

    /**
     * Convenience constructor that builds the domain predicate from
     * {@code StringUtils.startsWith(domainPrefix)}.
     *
     * @param domainPrefix prefix handed to {@code StringUtils.startsWith}
     * @param cache        store for pages that have already been processed
     */
    public PageProcessor(String domainPrefix, PageCache<T> cache) {
        this(StringUtils.startsWith(domainPrefix), cache);
    }

    /**
     * Take the URL and return a WebPage corresponding to it.
     */
    protected abstract T process(String url);

    /** {@link Transformer} entry point; delegates to {@link #page(String)}. */
    public T transform(String url) {
        return this.page(url);
    }

    /**
     * If the page has previously been processed, retrieve it from the internal cache.
     * Else process it and put it in the internal cache.
     *
     * @param url the URL of the wanted page
     * @return the cached page, or the freshly processed one on a cache miss
     */
    public T page(String url) {
        T page = cache.getCachedPage(url);

        // A null result from the cache is treated as "not processed yet".
        if (page == null) {
            page = this.process(url);
            cache.cachePage(page);
        }

        return page;
    }

    /**
     * Returns an iterator over all pages linked to by this page. The page's link
     * URLs are first filtered through the domain predicate and then transformed
     * into pages by the iterator transformer.
     */
    public Iterator<T> linkedPages(WebPage page) {
        return iteratorTransformer.transform(new FilterIterator(domain, page.getLinkUrls()));
    }
}

/**
 * A very simple PageProcessor
                              
                                implementation based on the HTMLParser library
 * which uses a MapBackedPageCache.
 *
 * @author david
 */
public class HtmlParserPageProcessor extends PageProcessor
                               
                                
{       
    private static NodeFilter ALLOWED_TAGS = new NodeFilter(){
        public boolean accept(Node node){ 
            return (node instanceof LinkTag) || (node instanceof TitleTag);}};
    
    public HtmlParserPageProcessor(Predicate
                                
                                  domain){
        super(domain,  new MapBackedPageCache
                                 
                                  ());}
            
    public HtmlParserPageProcessor(String domain){
        super(domain,  new MapBackedPageCache
                                  
                                   ());}
                   
    /**
     * Fetches the resource represented by the URL, parses the HTML and extracts
     * the title element and all the links and uses them to build a WebPage object.
     */
    public WebPage process(String url){
        try{
            Parser parser = new Parser(url);
            NodeIterator iterator = parser.parse(ALLOWED_TAGS).elements();
        
            String title = "";
            List
                                   
                                     links = new ArrayList
                                    
                                     ();
        
            while (iterator.hasMoreNodes()){
                Node node = iterator.nextNode();
                
                if (node instanceof TitleTag) title = ((TitleTag)node).getTitle();
                else if (node instanceof LinkTag) links.add(((LinkTag)node).extractLink());}
            return new WebPage(url, title, links);}               
        catch (Exception e){ throw new RuntimeException(e); }}
}


                                    
                                   
                                  
                                 
                                
                               
                              
                             
                            
                           
                          
                         
                        
                       
                      
                     
                    
                   
               
              
             
          
         
        
     
    
   
  
Topics:

Opinions expressed by DZone contributors are their own.

{{ parent.title || parent.header.title}}

{{ parent.tldr }}

{{ parent.urlSource.name }}