A traditional crawler starts from the URLs of one or more seed pages and collects the URLs found on those pages. As it crawls, it keeps extracting new URLs from the current page and adding them to a queue until some stop condition of the system is met. For vertical search, a focused crawler, that is, a crawler that only fetches pages related to a specific topic, is the better fit.
The following is the core of a simple crawler implemented in Java (the excerpt omits the class declaration and imports; it relies on java.util, java.util.regex, java.io, java.net, and Apache HttpClient 4.x):
public void crawl() throws Throwable {
    while (continueCrawling()) {
        CrawlerUrl url = getNextUrl(); // get the next URL to crawl from the queue
        if (url != null) {
            printCrawlInfo();
            String content = getContent(url); // fetch the text content of the URL
            // A focused crawler only keeps pages related to its topic; here a
            // simple regular-expression match is used as the relevance test.
            if (isContentRelevant(content, this.regexpSearchPattern)) {
                saveContent(url, content); // save the page locally
                // extract the links in the page and queue them for crawling
                List<String> urlStrings = extractUrls(content, url);
                addUrlsToUrlQueue(url, urlStrings);
            } else {
                System.out.println(url + " is not relevant, ignoring ...");
            }
            // pause between requests to avoid being blocked by the target site
            Thread.sleep(this.delayBetweenUrls);
        }
    }
    closeOutputStream();
}
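// The helpers called above (continueCrawling, printCrawlInfo, saveContent,
// addUrlsToUrlQueue, closeOutputStream) are not part of this excerpt. As a rough
// illustration only, assuming hypothetical fields numberItemsSaved, maxNumberUrls
// and visitedUrlMap, two of them could look like this:
private boolean continueCrawling() {
    // stop when the queue is empty or the crawl budget (maxNumberUrls) is used up
    return !urlQueue.isEmpty() && (numberItemsSaved < maxNumberUrls);
}

private void addUrlsToUrlQueue(CrawlerUrl url, List<String> urlStrings) {
    int depth = url.getDepth() + 1; // children sit one level deeper than their parent
    for (String urlString : urlStrings) {
        if (!visitedUrlMap.containsKey(urlString)) {
            urlQueue.add(new CrawlerUrl(urlString, depth));
        }
    }
}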
private CrawlerUrl getNextUrl() throws Throwable {
    CrawlerUrl nextUrl = null;
    while ((nextUrl == null) && (!urlQueue.isEmpty())) {
        CrawlerUrl crawlerUrl = this.urlQueue.remove();
        // doWeHavePermissionToVisit: may we access this URL? A polite crawler
        // follows the rules the site publishes in its robots.txt file.
        // isUrlAlreadyVisited: has this URL been fetched before? Large search engines
        // often use a BloomFilter to de-duplicate URLs; a simple HashMap is enough here.
        // isDepthAcceptable: has the configured depth limit been reached? Crawlers
        // usually work breadth-first; some sites build crawler traps (automatically
        // generated invalid links that pull a crawler into an endless loop), and a
        // depth limit is one way to avoid them. (A sketch of these helpers follows
        // this method.)
        if (doWeHavePermissionToVisit(crawlerUrl)
                && (!isUrlAlreadyVisited(crawlerUrl))
                && isDepthAcceptable(crawlerUrl)) {
            nextUrl = crawlerUrl;
            // System.out.println("The next url to visit is " + nextUrl);
        }
    }
    return nextUrl;
}
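// None of the checks used in getNextUrl, nor markUrlAsVisited used by getContent
// below, is defined in this excerpt. A minimal sketch, assuming hypothetical fields
// visitedUrlMap (a HashMap) and maxDepth, could be:
private boolean isUrlAlreadyVisited(CrawlerUrl url) {
    // a HashMap is enough for a small crawl; large crawlers use a BloomFilter instead
    return visitedUrlMap.containsKey(url.getUrlString());
}

private void markUrlAsVisited(CrawlerUrl url) {
    visitedUrlMap.put(url.getUrlString(), url);
}

private boolean isDepthAcceptable(CrawlerUrl url) {
    // stop following links once the depth limit is reached; this also limits
    // the damage a crawler trap can do
    return url.getDepth() <= maxDepth;
}

private boolean doWeHavePermissionToVisit(CrawlerUrl url) {
    // a polite crawler would download the site's robots.txt and honour its
    // Disallow rules here; this sketch simply allows every URL
    return true;
}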
private String getContent(CrawlerUrl url) throws Throwable {
    // HttpClient 4.1 is invoked differently from earlier versions
    HttpClient client = new DefaultHttpClient();
    HttpGet httpGet = new HttpGet(url.getUrlString());
    StringBuffer strBuf = new StringBuffer();
    HttpResponse response = client.execute(httpGet);
    if (HttpStatus.SC_OK == response.getStatusLine().getStatusCode()) {
        HttpEntity entity = response.getEntity();
        if (entity != null) {
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(entity.getContent(), "UTF-8"));
            String line = null;
            if (entity.getContentLength() > 0) {
                strBuf = new StringBuffer((int) entity.getContentLength());
                while ((line = reader.readLine()) != null) {
                    strBuf.append(line);
                }
            }
        }
        if (entity != null) {
            entity.consumeContent(); // release the underlying connection
        }
    }
    // mark the URL as visited so it is not fetched again
    markUrlAsVisited(url);
    return strBuf.toString();
}
public static boolean isContentRelevant(String content,
        Pattern regexpPattern) {
    boolean retValue = false;
    if (content != null) {
        // does the page content match the topic's regular expression?
        Matcher m = regexpPattern.matcher(content.toLowerCase());
        retValue = m.find();
    }
    return retValue;
}
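// Usage sketch (hypothetical): a crawler focused on Java-related pages could
// compile its topic pattern once and reuse it for every fetched page.
private static boolean looksLikeJavaPage(String content) {
    Pattern topicPattern = Pattern.compile("\\bjava\\b"); // illustrative keyword only
    return isContentRelevant(content, topicPattern);
}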
public List<String> extractUrls(String text, CrawlerUrl crawlerUrl) {
    Map<String, String> urlMap = new HashMap<String, String>();
    extractHttpUrls(urlMap, text);
    extractRelativeUrls(urlMap, text, crawlerUrl);
    return new ArrayList<String>(urlMap.keySet());
}
private void extractHttpUrls(Map<String, String> urlMap, String text) {
    // httpRegexp: pattern matching absolute links (its definition is not part of
    // this excerpt; see the sketch after extractRelativeUrls)
    Matcher m = httpRegexp.matcher(text);
    while (m.find()) {
        String url = m.group();
        String[] terms = url.split("a href=\"");
        for (String term : terms) {
            // System.out.println("Term = " + term);
            if (term.startsWith("http")) {
                int index = term.indexOf("\"");
                if (index > 0) {
                    term = term.substring(0, index);
                }
                urlMap.put(term, term);
                System.out.println("Hyperlink: " + term);
            }
        }
    }
}
private void extractRelativeUrls(Map<String, String> urlMap, String text,
        CrawlerUrl crawlerUrl) {
    Matcher m = relativeRegexp.matcher(text);
    URL textURL = crawlerUrl.getURL();
    String host = textURL.getHost();
    while (m.find()) {
        String url = m.group();
        String[] terms = url.split("a href=\"");
        for (String term : terms) {
            if (term.startsWith("/")) {
                int index = term.indexOf("\"");
                if (index > 0) {
                    term = term.substring(0, index);
                }
                // turn the relative link into an absolute one by prefixing the host
                String s = "http://" + host + term;
                urlMap.put(s, s);
                System.out.println("Relative url: " + s);
            }
        }
    }
}
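// The httpRegexp and relativeRegexp fields used by the two extract methods above
// are not shown in this excerpt. Patterns along the following lines would fit the
// splitting logic above; the exact expressions are an assumption, not the originals:
private static Pattern httpRegexp = Pattern.compile("<a href=\"http(.)*\">");
private static Pattern relativeRegexp = Pattern.compile("<a href=\"(.)*\">");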
public static void main(String[] args) {
    try {
        String url = "http://www.example.com"; // seed URL (placeholder; the original value is not shown in this excerpt)
        Queue<CrawlerUrl> urlQueue = new LinkedList<CrawlerUrl>();
        String regexp = "java"; // topic keyword used as the relevance pattern
        urlQueue.add(new CrawlerUrl(url, 0));
        NaiveCrawler crawler = new NaiveCrawler(urlQueue, 100, 5, 1000L,
                regexp); // the numeric arguments set the crawl limits and the delay between requests
        // boolean allowCrawl = crawler.areWeAllowedToVisit(url);
        // System.out.println("Crawling allowed: " + url + " " + allowCrawl);
        crawler.crawl();
    } catch (Throwable t) {
        System.out.println(t.toString());
        t.printStackTrace();
    }
}
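The CrawlerUrl class used throughout the listing is also not reproduced here. A minimal sketch, with field and method names inferred from the calls in the listing and the sketches above (getUrlString, getDepth, getURL), might look like this:
import java.net.MalformedURLException;
import java.net.URL;

public class CrawlerUrl {
    private final String urlString;
    private final int depth;

    public CrawlerUrl(String urlString, int depth) {
        this.urlString = urlString;
        this.depth = depth;
    }

    public String getUrlString() { return urlString; }

    public int getDepth() { return depth; }

    // used by extractRelativeUrls to read the host of the current page
    public URL getURL() {
        try {
            return new URL(urlString);
        } catch (MalformedURLException e) {
            return null; // an invalid URL yields no host
        }
    }

    @Override
    public String toString() { return urlString; }
}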