
    A Multi-threaded Java Web Crawler

      Below is a Java crawler. Starting from a given home page, it fetches pages under the same site domain down to a specified depth and maintains a simple index of the saved files.

      Parameters:

      private int webDepth = 2; // crawl depth. The home page has depth 1; pages beyond the configured depth are not fetched.

      private int intThreadNum = 10; // the number of worker threads to start.

      While running, the crawler also writes a report.txt file in the program's working directory to log its progress, and when the crawl finishes it writes a fileindex.txt file that indexes the saved page files.
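      For reference, each entry the code below writes to fileindex.txt has this shape (the URL shown is a hypothetical placeholder):

          Web depth:1 Filepath: web/web0.htm url:http://www.example.com/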

      The program exercises multithreading (shared state and synchronization), generics, file operations, the URL class and URL connections, the Hashtable associative array, and regular expressions with their related classes.

      The program is driven by command-line arguments. The first argument must be a valid URL string beginning with http:// and serves as the crawler's home page. The optional second argument is a string convertible to an int (i.e., one that the static method Integer.parseInt(String s) can parse, such as 3) and sets the crawl depth; if omitted, the depth defaults to 2.
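      For example, a crawl of a hypothetical site to depth 3 would be compiled and launched like this (example.com is a placeholder):

          javac GetWeb.java
          java GetWeb http://www.example.com 3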

      The program's main shortcomings: it only handles the three patterns href=, href=', and href=" followed by an absolute URL (URL syntax in page sources is messy, so extraction can still go wrong), while relative URLs and window.open(' links are not handled at all; exception handling is also only rudimentary. If you have improvements, please post your source code; I would be grateful. A sketch of how relative URLs could be resolved follows this paragraph.
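      As an illustration of one possible fix, a relative href could be resolved against the URL of the page it appears on with the two-argument java.net.URL constructor before being domain-checked and queued. This is a minimal standalone sketch under that assumption, not part of the original program, and the URLs in it are hypothetical:

          import java.net.URL;

          public class ResolveDemo {
              public static void main(String[] args) throws Exception {
                  // Hypothetical page URL and a relative href found in its source.
                  URL base = new URL("http://www.example.com/dir/page.htm");
                  URL absolute = new URL(base, "../images/a.htm"); // RFC 2396 resolution
                  System.out.println(absolute); // http://www.example.com/images/a.htm
              }
          }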

      The full source code follows (save it as GetWeb.java):

      import java.io.BufferedReader;
      import java.io.File;
      import java.io.FileOutputStream;
      import java.io.InputStream;
      import java.io.InputStreamReader;
      import java.io.OutputStreamWriter;
      import java.io.PrintWriter;
      import java.net.URL;
      import java.net.URLConnection;
      import java.util.ArrayList;
      import java.util.Hashtable;
      import java.util.regex.Matcher;
      import java.util.regex.Pattern;

      public class GetWeb {

          private int webDepth = 2; // crawl depth
          private int intThreadNum = 10; // number of worker threads
          private String strHomePage = ""; // home page URL
          private String myDomain; // site domain
          private String fPath = "web"; // directory for the saved page files
          private ArrayList<String> arrUrls = new ArrayList<String>(); // URLs not yet processed
          private ArrayList<String> arrUrl = new ArrayList<String>(); // all URLs, kept for building the index
          private Hashtable<String, Integer> allUrls = new Hashtable<String, Integer>(); // URL -> page file number
          private Hashtable<String, Integer> deepUrls = new Hashtable<String, Integer>(); // URL -> crawl depth
          private int intWebIndex = 0; // file number of each page, starting from 0
          private String charset = "GB2312";
          private String report = "";
          private long startTime;
          private int webSuccessed = 0;
          private int webFailed = 0;

          public GetWeb(String s) {
              this.strHomePage = s;
          }

          public GetWeb(String s, int i) {
              this.strHomePage = s;
              this.webDepth = i;
          }

          public synchronized void addWebSuccessed() {
              webSuccessed++;
          }

          public synchronized void addWebFailed() {
              webFailed++;
          }

          // Appends to the in-memory report and rewrites report.txt from scratch
          // on every call.
          public synchronized void addReport(String s) {
              try {
                  report += s;
                  PrintWriter pwReport = new PrintWriter(new FileOutputStream("report.txt"));
                  pwReport.println(report);
                  pwReport.close();
              } catch (Exception e) {
                  System.out.println("Failed to write the report file!");
              }
          }

          // Removes and returns the head of the pending-URL queue. Callers must
          // check arrUrls.isEmpty() first; see Processer.run().
          public synchronized String getAUrl() {
              String tmpAUrl = arrUrls.get(0);
              arrUrls.remove(0);
              return tmpAUrl;
          }

          public synchronized String getUrl() {
              String tmpUrl = arrUrl.get(0);
              arrUrl.remove(0);
              return tmpUrl;
          }

          public synchronized Integer getIntWebIndex() {
              intWebIndex++;
              return intWebIndex;
          }

          /**
           * @param args
           */
          public static void main(String[] args) {
              if (args.length == 0 || args[0].equals("")) {
                  System.out.println("No input!");
                  System.exit(1);
              } else if (args.length == 1) {
                  GetWeb gw = new GetWeb(args[0]);
                  gw.getWebByHomePage();
              } else {
                  GetWeb gw = new GetWeb(args[0], Integer.parseInt(args[1]));
                  gw.getWebByHomePage();
              }
          }

          public void getWebByHomePage() {
              startTime = System.currentTimeMillis();
              this.myDomain = getDomain();
              if (myDomain == null) {
                  System.out.println("Wrong input!");
                  // System.exit(1);
                  return;
              }
              System.out.println("Homepage = " + strHomePage);
              addReport("Homepage = " + strHomePage + "!\n");
              System.out.println("Domain = " + myDomain);
              addReport("Domain = " + myDomain + "!\n");
              arrUrls.add(strHomePage);
              arrUrl.add(strHomePage);
              allUrls.put(strHomePage, 0);
              deepUrls.put(strHomePage, 1);
              File fDir = new File(fPath);
              if (!fDir.exists()) {
                  fDir.mkdir();
              }
              System.out.println("Start!");
              this.addReport("Start!\n");
              // Fetch the home page first so the workers have URLs to consume.
              String tmp = getAUrl();
              this.getWebByUrl(tmp, charset, allUrls.get(tmp) + "");
              for (int i = 0; i < intThreadNum; i++) {
                  // The original line ended with a full-width period ("。start()"),
                  // which does not compile; it must be an ASCII '.'.
                  new Thread(new Processer(this)).start();
              }
              // Busy-wait until the queue is empty and only the main thread remains.
              while (true) {
                  if (arrUrls.isEmpty() && Thread.activeCount() == 1) {
                      long finishTime = System.currentTimeMillis();
                      long costTime = finishTime - startTime;
                      System.out.println("\n\n\n\n\nFinished!");
                      addReport("\n\n\n\n\nFinished!\n");
                      System.out.println("Start time = " + startTime + " "
                              + "Finish time = " + finishTime + " "
                              + "Cost time = " + costTime + "ms");
                      addReport("Start time = " + startTime + " "
                              + "Finish time = " + finishTime + " "
                              + "Cost time = " + costTime + "ms" + "\n");
                      System.out.println("Total url number = "
                              + (webSuccessed + webFailed) + " Succeeded: "
                              + webSuccessed + " Failed: " + webFailed);
                      addReport("Total url number = " + (webSuccessed + webFailed)
                              + " Succeeded: " + webSuccessed + " Failed: "
                              + webFailed + "\n");
                      String strIndex = "";
                      String tmpUrl = "";
                      while (!arrUrl.isEmpty()) {
                          tmpUrl = getUrl();
                          strIndex += "Web depth:" + deepUrls.get(tmpUrl)
                                  + " Filepath: " + fPath + "/web"
                                  + allUrls.get(tmpUrl) + ".htm" + " url:" + tmpUrl
                                  + "\n\n";
                      }
                      System.out.println(strIndex);
                      try {
                          PrintWriter pwIndex = new PrintWriter(new FileOutputStream("fileindex.txt"));
                          pwIndex.println(strIndex);
                          pwIndex.close();
                      } catch (Exception e) {
                          System.out.println("Failed to write the index file!");
                      }
                      break;
                  }
              }
          }

          // Downloads one page to fPath/web<fileIndex>.htm. Note that the charset
          // parameter is accepted but never applied: both the reader and the writer
          // fall back to the platform default encoding.
          public void getWebByUrl(String strUrl, String charset, String fileIndex) {
              try {
                  // if (charset == null || "".equals(charset)) charset = "utf-8";
                  System.out.println("Getting web by url: " + strUrl);
                  addReport("Getting web by url: " + strUrl + "\n");
                  URL url = new URL(strUrl);
                  URLConnection conn = url.openConnection();
                  conn.setDoOutput(true);
                  InputStream is = url.openStream();
                  String filePath = fPath + "/web" + fileIndex + ".htm";
                  FileOutputStream fos = new FileOutputStream(filePath);
                  OutputStreamWriter writer = new OutputStreamWriter(fos);
                  PrintWriter pw = new PrintWriter(writer);
                  BufferedReader bReader = new BufferedReader(new InputStreamReader(is));
                  StringBuffer sb = new StringBuffer();
                  String rLine = null;
                  String tmp_rLine = null;
                  while ((rLine = bReader.readLine()) != null) {
                      tmp_rLine = rLine;
                      int str_len = tmp_rLine.length();
                      if (str_len > 0) {
                          sb.append("\n" + tmp_rLine);
                          pw.println(tmp_rLine);
                          pw.flush();
                          // Only harvest links from pages above the depth limit.
                          if (deepUrls.get(strUrl) < webDepth)
                              getUrlByString(tmp_rLine, strUrl);
                      }
                      tmp_rLine = null;
                  }
                  is.close();
                  pw.close();
                  System.out.println("Get web successfully! " + strUrl);
                  addReport("Get web successfully! " + strUrl + "\n");
                  addWebSuccessed();
              } catch (Exception e) {
                  System.out.println("Get web failed! " + strUrl);
                  addReport("Get web failed! " + strUrl + "\n");
                  addWebFailed();
              }
          }

          // Extracts the second-level domain (e.g. example.com) from the home page URL.
          public String getDomain() {
              String reg = "(?<=http\\://[a-zA-Z0-9]{0,100}[.]{0,1})[^.\\s]*?\\.(com|cn|net|org|biz|info|cc|tv)";
              Pattern p = Pattern.compile(reg, Pattern.CASE_INSENSITIVE);
              Matcher m = p.matcher(strHomePage);
              if (m.find()) {
                  return m.group(0);
              }
              return null;
          }

          // Scans one line of page source for absolute same-domain links after
          // href=, href=' or href=". The original regex used the character class
          // [http://], which matches a single character from that set; the literal
          // http:// below is what was intended.
          public void getUrlByString(String inputArgs, String strUrl) {
              String tmpStr = inputArgs;
              String regUrl = "(?<=(href=)[\"]?[\']?)http://[^\\s\"\'\\?]*("
                      + myDomain + ")[^\\s\"\'>]*";
              Pattern p = Pattern.compile(regUrl, Pattern.CASE_INSENSITIVE);
              Matcher m = p.matcher(tmpStr);
              boolean blnp = m.find();
              while (blnp) {
                  if (!allUrls.containsKey(m.group(0))) {
                      System.out.println("Find a new url,depth:"
                              + (deepUrls.get(strUrl) + 1) + " " + m.group(0));
                      addReport("Find a new url,depth:" + (deepUrls.get(strUrl) + 1)
                              + " " + m.group(0) + "\n");
                      arrUrls.add(m.group(0));
                      arrUrl.add(m.group(0));
                      allUrls.put(m.group(0), getIntWebIndex());
                      deepUrls.put(m.group(0), (deepUrls.get(strUrl) + 1));
                  }
                  tmpStr = tmpStr.substring(m.end(), tmpStr.length());
                  m = p.matcher(tmpStr);
                  blnp = m.find();
              }
          }

          class Processer implements Runnable {
              GetWeb gw;

              public Processer(GetWeb g) {
                  this.gw = g;
              }

              public void run() {
                  // isEmpty() and getAUrl() are two separate synchronized calls,
                  // so another worker can empty the queue between them.
                  while (!arrUrls.isEmpty()) {
                      String tmp = getAUrl();
                      getWebByUrl(tmp, charset, allUrls.get(tmp) + "");
                  }
              }
          }
      }
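      One caveat worth flagging in the listing above: Processer.run() calls arrUrls.isEmpty() and getAUrl() as two separate synchronized operations, so when a single URL remains two workers can both pass the emptiness check and the slower one will throw IndexOutOfBoundsException from get(0). A minimal sketch of an atomic alternative, using a hypothetical pollAUrl method that is not in the original code:

          // Check and removal happen under one lock, closing the race window.
          public synchronized String pollAUrl() {
              return arrUrls.isEmpty() ? null : arrUrls.remove(0);
          }

          // Processer.run() would then become:
          public void run() {
              String tmp;
              while ((tmp = pollAUrl()) != null) {
                  getWebByUrl(tmp, charset, allUrls.get(tmp) + "");
              }
          }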

    posted on 2013-10-12 17:38 by 好不容易

