手上的項(xiàng)目需要到網(wǎng)絡(luò)上去爬取wsdl文件,來充實(shí)服務(wù)網(wǎng)絡(luò)。一開始的想法是自己采用網(wǎng)絡(luò)爬蟲的想法,可以自己編寫一個(gè)只針對(duì)網(wǎng)頁上的超鏈接進(jìn)行遍歷的爬蟲;或者可以對(duì)現(xiàn)有的爬蟲器進(jìn)行改進(jìn),使其只關(guān)注網(wǎng)頁的鏈接部分,至于網(wǎng)頁的內(nèi)容不去理會(huì),這樣可以避免爬蟲的效率和數(shù)據(jù)流量過大的問題。后來發(fā)現(xiàn)Google的高級(jí)搜索可以對(duì)文件的類型進(jìn)行約束,所以決定利用Google的搜索引擎來實(shí)現(xiàn)該目標(biāo)。利用Google搜索引擎進(jìn)行檢索,能夠有很高的查詢效率和很高的查全率與查準(zhǔn)率,同時(shí)還可以通過關(guān)鍵字進(jìn)行限制查詢。
利用Google進(jìn)行檢索主要可以通過以下兩種方法實(shí)現(xiàn):
1.使用Google API,可以直接設(shè)定查詢的條件,并且得到結(jié)構(gòu)簡(jiǎn)單的查詢結(jié)果。但是Google API目前好像已經(jīng)停止使用了,非常遺憾。
2.通過向Google發(fā)送頁面請(qǐng)求消息來實(shí)現(xiàn)查詢。這就涉及到兩個(gè)問題:首先要對(duì)Google的查詢請(qǐng)求的格式有一定了解,如何把查詢的約束條件信息包含在請(qǐng)求消息中;其次,通過頁面請(qǐng)求查詢,得到的是html頁面代碼,所以要對(duì)頁面中的內(nèi)容進(jìn)行解析、提取,過濾出有用的超鏈接結(jié)果。
這里主要介紹一下后一種方法,但從理論上講前一種方法更加科學(xué)和簡(jiǎn)便。
一、Google的查詢頁面請(qǐng)求消息格式
http://www.google.com/search?num=50&hl=en&lr=&newwindow=1&as_qdr=all&q=a++filetype:wsdl&start=50&sa=N
其中 num為每頁的結(jié)果個(gè)數(shù);hl表示語言;q=后面為關(guān)鍵字;filetype為文件格式;start為開始的結(jié)果標(biāo)號(hào)。
二、發(fā)送頁面請(qǐng)求
/**
 * Fetches one Google result page as raw HTML.
 *
 * Fixes over the original: the reader and connection are closed in a
 * finally block (the original leaked both), the page is accumulated in a
 * StringBuilder instead of repeated String concatenation, and the charset
 * is pinned to UTF-8 instead of the platform default.
 *
 * @param keyString space-separated search keywords (may be null)
 * @param filetype  file extension to restrict results to (e.g. "wsdl")
 * @param start     zero-based index of the first result on the page
 * @return the page's HTML with line breaks removed
 * @throws IOException if the HTTP request fails
 */
private String searchPage(String keyString, String filetype, int start) throws IOException
{
    URL url = buildurl(keyString, filetype, start);
    HttpURLConnection http = (HttpURLConnection) url.openConnection();
    // Google rejects requests without a browser-like User-Agent.
    http.setRequestProperty("User-Agent", "Mozilla/5.0");
    http.connect();
    StringBuilder html = new StringBuilder();
    BufferedReader bufReader = new BufferedReader(
            new InputStreamReader(http.getInputStream(), "UTF-8"));
    try
    {
        String currentLine;
        while ((currentLine = bufReader.readLine()) != null)
            html.append(currentLine);
    }
    finally
    {
        bufReader.close();
        http.disconnect();
    }
    return html.toString();
}
三、對(duì)返回頁面內(nèi)容進(jìn)行解析
這里使用了HTML Parser進(jìn)行解析,代碼如下:
/**
 * Extracts from a result page all link targets ending in "." + filetype,
 * skipping Google's "Similar pages" self-links.
 *
 * Fixes over the original: when parsing fails, nodeList stayed null and the
 * loop below threw a NullPointerException; we now return the empty list.
 *
 * @param html     raw HTML of a Google result page
 * @param filetype file extension the links must end with
 * @return matching link URLs, in page order (possibly empty, never null)
 */
private List<String> listURL(String html, String filetype)
{
    List<String> urlList = new ArrayList<String>();
    Parser parser = Parser.createParser(html, "UTF-8");
    NodeList nodeList = null;
    try
    {
        nodeList = parser.extractAllNodesThatMatch(new NodeFilter()
        {
            private static final long serialVersionUID = 0L;

            public boolean accept(Node node)
            {
                // Keep only anchor (<a>) tags.
                return node instanceof LinkTag;
            }
        });
    }
    catch (ParserException e)
    {
        e.printStackTrace();
    }
    if (nodeList == null)
        return urlList; // parse failed: nothing to extract
    for (int i = 0; i < nodeList.size(); i++)
    {
        LinkTag link = (LinkTag) nodeList.elementAt(i);
        String target = link.extractLink();
        if (target.endsWith("." + filetype)
                && !link.getStringText().equalsIgnoreCase("Similar pages"))
            urlList.add(target);
    }
    return urlList;
}
四、附完整程序代碼
package cn.edu.tju.cs.ikse.sn.spider;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;


/**
 * Scrapes Google search results for links to files of a given type
 * (e.g. "wsdl", "owl") by sending plain HTTP search requests and
 * extracting anchor tags from the result pages with HTML Parser.
 *
 * Fixes over the original: searchPage closes its stream and uses UTF-8;
 * listURL no longer NPEs when parsing fails; search no longer NPEs after
 * an IOException; buildurl uses a StringBuilder; the hand-computed
 * intmax constant is replaced by Integer.MAX_VALUE.
 */
public class GoogleSearch
{
    /** Base query URL; num=100 asks Google for 100 results per page. */
    private static final String BASE_URL =
            "http://www.google.com/search?num=100&hl=en&lr=&newwindow=1&as_qdr=all&q=";

    /** Number of results requested per page; must match num= in BASE_URL. */
    private static final int RESULTS_PER_PAGE = 100;

    /**
     * Splits a keyword string on spaces, dropping empty tokens.
     *
     * @param keyString space-separated keywords
     * @return the non-empty tokens, or null when keyString is null
     */
    private List<String> splitString(String keyString)
    {
        if (keyString == null)
            return null;
        List<String> keyList = new ArrayList<String>();
        for (String word : keyString.split(" "))
            if (word.length() != 0)
                keyList.add(word);
        return keyList;
    }

    /**
     * Builds the Google query URL for the given keywords, file type and
     * result offset.
     *
     * @return the query URL, or null when it is malformed
     */
    private URL buildurl(String keyString, String filetype, int start)
    {
        StringBuilder urlString = new StringBuilder(BASE_URL);
        List<String> keys = splitString(keyString);
        if (keys != null)
            for (String key : keys)
                urlString.append(key).append('+');
        urlString.append("filetype:").append(filetype)
                .append("&start=").append(start).append("&sa=N");
        try
        {
            return new URL(urlString.toString());
        }
        catch (MalformedURLException e)
        {
            System.out.println("String to URL Errors!");
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Fetches one Google result page as raw HTML. The reader and connection
     * are closed in a finally block, and the charset is pinned to UTF-8.
     *
     * @param start zero-based index of the first result on the page
     * @return the page's HTML with line breaks removed
     * @throws IOException if the HTTP request fails
     */
    private String searchPage(String keyString, String filetype, int start) throws IOException
    {
        URL url = buildurl(keyString, filetype, start);
        HttpURLConnection http = (HttpURLConnection) url.openConnection();
        // Google rejects requests without a browser-like User-Agent.
        http.setRequestProperty("User-Agent", "Mozilla/5.0");
        http.connect();
        StringBuilder html = new StringBuilder();
        BufferedReader bufReader = new BufferedReader(
                new InputStreamReader(http.getInputStream(), "UTF-8"));
        try
        {
            String currentLine;
            while ((currentLine = bufReader.readLine()) != null)
                html.append(currentLine);
        }
        finally
        {
            bufReader.close();
            http.disconnect();
        }
        return html.toString();
    }

    /**
     * Extracts from a result page all link targets ending in "." + filetype,
     * skipping Google's "Similar pages" self-links. Returns the empty list
     * when parsing fails (the original dereferenced a null NodeList).
     */
    private List<String> listURL(String html, String filetype)
    {
        List<String> urlList = new ArrayList<String>();
        Parser parser = Parser.createParser(html, "UTF-8");
        NodeList nodeList = null;
        try
        {
            nodeList = parser.extractAllNodesThatMatch(new NodeFilter()
            {
                private static final long serialVersionUID = 0L;

                public boolean accept(Node node)
                {
                    // Keep only anchor (<a>) tags.
                    return node instanceof LinkTag;
                }
            });
        }
        catch (ParserException e)
        {
            e.printStackTrace();
        }
        if (nodeList == null)
            return urlList; // parse failed: nothing to extract
        for (int i = 0; i < nodeList.size(); i++)
        {
            LinkTag link = (LinkTag) nodeList.elementAt(i);
            String target = link.extractLink();
            if (target.endsWith("." + filetype)
                    && !link.getStringText().equalsIgnoreCase("Similar pages"))
                urlList.add(target);
        }
        return urlList;
    }

    /**
     * Collects up to num matching file links, paging through results until
     * enough are gathered, a page comes back empty, or a request fails.
     *
     * @param keyString space-separated search keywords (may be null)
     * @param filetype  file extension to restrict results to
     * @param num       maximum number of links to return
     * @return at most num link URLs (possibly fewer, never null)
     */
    public List<String> search(String keyString, String filetype, int num)
    {
        List<String> urlList = new ArrayList<String>();
        int start = 0;
        while (urlList.size() < num)
        {
            String html;
            try
            {
                html = searchPage(keyString, filetype, start);
                start += RESULTS_PER_PAGE;
            }
            catch (IOException e)
            {
                // Original fell through with html == null and NPE'd in
                // listURL; stop paging and return what we have.
                e.printStackTrace();
                break;
            }
            List<String> urlListOfPage = listURL(html, filetype);
            if (urlListOfPage.size() == 0)
            {
                System.out.println("The maximum number of the results is " + urlList.size());
                return urlList;
            }
            urlList.addAll(urlListOfPage);
        }
        // Trim any overshoot from the last page down to exactly num.
        while (urlList.size() > num)
            urlList.remove(urlList.size() - 1);
        return urlList;
    }

    /**
     * Collects every available matching file link (no upper bound).
     */
    public List<String> search(String keyString, String filetype)
    {
        return search(keyString, filetype, Integer.MAX_VALUE);
    }

    /** Demo entry point: fetch up to 1000 OWL file links for "book". */
    public static void main(String[] args)
    {
        GoogleSearch googleSearch = new GoogleSearch();
        List<String> re = googleSearch.search("book", "owl", 1000);
        System.out.println(re.size());
        for (int i = 0; i < re.size(); i++)
            System.out.println(re.get(i));
    }
}



posted on 2008-08-01 20:37
胖胖泡泡 閱讀(167)
評(píng)論(0) 編輯 收藏