欢迎访问悦橙教程(wld5.com),关注java教程。悦橙教程  java问答|  每日更新
页面导航 : > > 文章正文

java搜索引擎爬虫,抓取url示例,java爬虫url示例,package cn.o

来源: javaer 分享于  点击 7949 次 点评:181

java搜索引擎爬虫,抓取url示例,java爬虫url示例,package cn.o


package cn.outofmemory.robot;import java.io.IOException;import java.util.LinkedList;import java.util.List;import java.util.Queue;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;public class Robot {    // robot url    private List<String> urlList;    // cache url    private Queue<String> urlQueue;    // define Host    public final static String HOST = "debugs.tk";    // constructor    public Robot() {        super();        // initialization robot's member        setUrlList(new LinkedList<String>());        setUrlQueue(new LinkedList<String>());    }    // url    public List<String> getUrlList() {        return urlList;    }    public void setUrlList(List<String> urlList) {        this.urlList = urlList;    }    // cache    public Queue<String> getUrlQueue() {        return urlQueue;    }    public void setUrlQueue(Queue<String> urlQueue) {        this.urlQueue = urlQueue;    }    // Legal link    private boolean isURL(String url) {        try {            // judge url            Pattern pattern = Pattern.compile("^[a-zA-z]+://[^\\s]*");            Matcher matcher = pattern.matcher(url);            if (matcher.matches()) {                return true;            } else {                return false;            }        } catch (Exception e) {            e.printStackTrace();            return false;        }    }    // whether the url is belong to host    public static boolean isHost(String url) {        return url.contains(HOST);    }    // travel all url    public void traverse(String seed) {        for (this.getUrlQueue().add(seed); !this.getUrlQueue().isEmpty();) {            boolean flag = true;            Document document = null;            try {                document = Jsoup.connect(seed).timeout(5000).get();            } catch (IOException e) {                e.printStackTrace();                // whether connect success                flag = false;            }            // whether connect success,then select a tag            // add these aTag into queue            if (flag) {        // get url                Elements elements = document.select("a[href]");                for (Element e : elements) {                    String s = e.attr("abs:href");                    // Legal link and belong host                    // and url not in list                    // then add it                    if (isURL(s) && s.contains(HOST)                            && (!getUrlQueue().contains(s))                            && (!getUrlList().contains(s))) {                        this.getUrlQueue().add(s);                    }                }            }            // get head of queue            // and set it seed            // travel seed it again            seed = this.getUrlQueue().poll();            this.getUrlList().add(seed);            // show information            // System.out.println("SIZE:"             // + this.getUrlQueue().size() + "---"            // + seed + " connect!");        }    }}
相关栏目:

用户点评