
A Multi-threaded Java Web Crawler

Below is a Java crawler that starts from a given home page, fetches pages under that site's domain down to a specified depth, and maintains a simple index of the saved files.

Parameters: private int webDepth = 2; // crawl depth. The home page has depth 1; pages beyond the configured depth are not fetched. private int intThreadNum = 10; // the number of worker threads to start.

While running, the program writes a report.txt file in the working directory recording the crawl's progress, and when the crawl finishes it writes a fileindex.txt file indexing the saved pages.
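For illustration, each fileindex.txt entry produced by the code below has roughly this shape (the URL shown is only a placeholder):

    Web depth:1 Filepath: web/web0.htm url:http://www.example.com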

The program exercises multithreading (shared state guarded by synchronized methods), generics, file I/O, the URL and URLConnection classes, the Hashtable associative array, and regular expressions with their supporting classes.

The program takes command-line arguments. The first argument must be a valid URL string beginning with http:// and serves as the crawler's home page; the optional second argument is a string convertible to int (one that the static method Integer.parseInt(String s) accepts, e.g. 3) and sets the crawl depth. If it is omitted, the depth defaults to 2.
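For example, assuming a standard JDK on the path (the URL below is only a placeholder), the crawler compiles and runs like this:

    javac GetWeb.java
    java GetWeb http://www.example.com 3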

The program's main shortcoming is that it only handles the three forms href=, href=', and href=" followed by an absolute URL (URL usage in page source is messy, so extraction can still go wrong), while relative URLs and window.open(' links are not handled at all; exception handling is likewise rudimentary. If you have improvements, please post your source code; much appreciated.
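One way to close the relative-URL gap is to resolve each href against the URL of the page it appears on; java.net.URL can do that resolution itself. A minimal sketch, with UrlResolver and resolveHref as hypothetical names that are not part of the program:

    import java.net.MalformedURLException;
    import java.net.URL;

    // Hypothetical helper, not part of GetWeb: resolve a possibly
    // relative href against the URL of the page it was found on.
    class UrlResolver {
        static String resolveHref(String pageUrl, String href) {
            try {
                // new URL(context, spec) applies standard URL resolution,
                // so "../a.htm" or "/b.htm" become absolute under pageUrl.
                return new URL(new URL(pageUrl), href).toString();
            } catch (MalformedURLException e) {
                return null; // skip hrefs that cannot be resolved
            }
        }
    }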

The source code follows (save it as GetWeb.java):

    import java.io.File;
    import java.io.BufferedReader;
    import java.io.FileOutputStream;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.io.OutputStreamWriter;
    import java.io.PrintWriter;
    import java.net.URL;
    import java.net.URLConnection;
    import java.util.ArrayList;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    import java.util.Hashtable;

    public class GetWeb {

        private int webDepth = 2;          // crawl depth
        private int intThreadNum = 10;     // number of worker threads
        private String strHomePage = "";   // home page URL
        private String myDomain;           // domain name
        private String fPath = "web";      // directory for saved page files
        private ArrayList<String> arrUrls = new ArrayList<String>();  // URLs not yet processed
        private ArrayList<String> arrUrl = new ArrayList<String>();   // all URLs, for building the index
        private Hashtable<String, Integer> allUrls = new Hashtable<String, Integer>();   // page number of every URL
        private Hashtable<String, Integer> deepUrls = new Hashtable<String, Integer>();  // depth of every URL
        private int intWebIndex = 0;       // file index of a page, starting from 0
        private String charset = "GB2312";
        private String report = "";
        private long startTime;
        private int webSuccessed = 0;
        private int webFailed = 0;

        public GetWeb(String s) {
            this.strHomePage = s;
        }

        public GetWeb(String s, int i) {
            this.strHomePage = s;
            this.webDepth = i;
        }

        public synchronized void addWebSuccessed() {
            webSuccessed++;
        }

        public synchronized void addWebFailed() {
            webFailed++;
        }

        public synchronized void addReport(String s) {
            try {
                report += s;
                PrintWriter pwReport = new PrintWriter(new FileOutputStream("report.txt"));
                pwReport.println(report);
                pwReport.close();
            } catch (Exception e) {
                System.out.println("Failed to write report file!");
            }
        }

        public synchronized String getAUrl() {
            String tmpAUrl = arrUrls.get(0);
            arrUrls.remove(0);
            return tmpAUrl;
        }

        public synchronized String getUrl() {
            String tmpUrl = arrUrl.get(0);
            arrUrl.remove(0);
            return tmpUrl;
        }

        public synchronized Integer getIntWebIndex() {
            intWebIndex++;
            return intWebIndex;
        }

        /**
         * @param args
         */
        public static void main(String[] args) {
            if (args.length == 0 || args[0].equals("")) {
                System.out.println("No input!");
                System.exit(1);
            } else if (args.length == 1) {
                GetWeb gw = new GetWeb(args[0]);
                gw.getWebByHomePage();
            } else {
                GetWeb gw = new GetWeb(args[0], Integer.parseInt(args[1]));
                gw.getWebByHomePage();
            }
        }

        public void getWebByHomePage() {
            startTime = System.currentTimeMillis();
            this.myDomain = getDomain();
            if (myDomain == null) {
                System.out.println("Wrong input!");
                // System.exit(1);
                return;
            }
            System.out.println("Homepage = " + strHomePage);
            addReport("Homepage = " + strHomePage + "!\n");
            System.out.println("Domain = " + myDomain);
            addReport("Domain = " + myDomain + "!\n");
            arrUrls.add(strHomePage);
            arrUrl.add(strHomePage);
            allUrls.put(strHomePage, 0);
            deepUrls.put(strHomePage, 1);
            File fDir = new File(fPath);
            if (!fDir.exists()) {
                fDir.mkdir();
            }
            System.out.println("Start!");
            this.addReport("Start!\n");
            // Fetch the home page first so the workers have URLs to pick up.
            String tmp = getAUrl();
            this.getWebByUrl(tmp, charset, allUrls.get(tmp) + "");
            for (int i = 0; i < intThreadNum; i++) {
                new Thread(new Processer(this)).start();
            }
            while (true) {
                // All workers have exited and no URLs remain: write the summary.
                if (arrUrls.isEmpty() && Thread.activeCount() == 1) {
                    long finishTime = System.currentTimeMillis();
                    long costTime = finishTime - startTime;
                    System.out.println("\n\n\n\n\nFinished!");
                    addReport("\n\n\n\n\nFinished!\n");
                    System.out.println("Start time = " + startTime + " "
                            + "Finish time = " + finishTime + " "
                            + "Cost time = " + costTime + "ms");
                    addReport("Start time = " + startTime + " "
                            + "Finish time = " + finishTime + " "
                            + "Cost time = " + costTime + "ms" + "\n");
                    System.out.println("Total url number = "
                            + (webSuccessed + webFailed) + " Successed: "
                            + webSuccessed + " Failed: " + webFailed);
                    addReport("Total url number = " + (webSuccessed + webFailed)
                            + " Successed: " + webSuccessed + " Failed: "
                            + webFailed + "\n");
                    String strIndex = "";
                    String tmpUrl = "";
                    while (!arrUrl.isEmpty()) {
                        tmpUrl = getUrl();
                        strIndex += "Web depth:" + deepUrls.get(tmpUrl)
                                + " Filepath: " + fPath + "/web"
                                + allUrls.get(tmpUrl) + ".htm" + " url:" + tmpUrl
                                + "\n\n";
                    }
                    System.out.println(strIndex);
                    try {
                        PrintWriter pwIndex = new PrintWriter(new FileOutputStream("fileindex.txt"));
                        pwIndex.println(strIndex);
                        pwIndex.close();
                    } catch (Exception e) {
                        System.out.println("Failed to write index file!");
                    }
                    break;
                }
            }
        }

        public void getWebByUrl(String strUrl, String charset, String fileIndex) {
            try {
                // if (charset == null || "".equals(charset)) charset = "utf-8";
                System.out.println("Getting web by url: " + strUrl);
                addReport("Getting web by url: " + strUrl + "\n");
                URL url = new URL(strUrl);
                // This connection is opened but never used; the page is
                // actually read through url.openStream() below.
                URLConnection conn = url.openConnection();
                conn.setDoOutput(true);
                InputStream is = url.openStream();
                String filePath = fPath + "/web" + fileIndex + ".htm";
                FileOutputStream fos = new FileOutputStream(filePath);
                OutputStreamWriter writer = new OutputStreamWriter(fos);
                PrintWriter pw = new PrintWriter(writer);
                BufferedReader bReader = new BufferedReader(new InputStreamReader(is));
                StringBuffer sb = new StringBuffer();
                String rLine = null;
                String tmp_rLine = null;
                while ((rLine = bReader.readLine()) != null) {
                    tmp_rLine = rLine;
                    int str_len = tmp_rLine.length();
                    if (str_len > 0) {
                        sb.append("\n" + tmp_rLine);
                        pw.println(tmp_rLine);
                        pw.flush();
                        // Scan this line for new links only while above the depth limit.
                        if (deepUrls.get(strUrl) < webDepth)
                            getUrlByString(tmp_rLine, strUrl);
                    }
                    tmp_rLine = null;
                }
                is.close();
                pw.close();
                System.out.println("Get web successfully! " + strUrl);
                addReport("Get web successfully! " + strUrl + "\n");
                addWebSuccessed();
            } catch (Exception e) {
                System.out.println("Get web failed! " + strUrl);
                addReport("Get web failed! " + strUrl + "\n");
                addWebFailed();
            }
        }

        public String getDomain() {
            String reg = "(?<=http\\://[a-zA-Z0-9]{0,100}[.]{0,1})[^.\\s]*?\\.(com|cn|net|org|biz|info|cc|tv)";
            Pattern p = Pattern.compile(reg, Pattern.CASE_INSENSITIVE);
            Matcher m = p.matcher(strHomePage);
            if (m.find()) {
                return m.group(0);
            }
            return null;
        }

        public void getUrlByString(String inputArgs, String strUrl) {
            String tmpStr = inputArgs;
            // Match absolute http:// URLs inside this domain that follow
            // href=, href=' or href=".
            String regUrl = "(?<=(href=)[\"]?[\']?)http://[^\\s\"\'\\?]*("
                    + myDomain + ")[^\\s\"\'>]*";
            Pattern p = Pattern.compile(regUrl, Pattern.CASE_INSENSITIVE);
            Matcher m = p.matcher(tmpStr);
            while (m.find()) {
                if (!allUrls.containsKey(m.group(0))) {
                    System.out.println("Find a new url,depth:"
                            + (deepUrls.get(strUrl) + 1) + " " + m.group(0));
                    addReport("Find a new url,depth:" + (deepUrls.get(strUrl) + 1)
                            + " " + m.group(0) + "\n");
                    arrUrls.add(m.group(0));
                    arrUrl.add(m.group(0));
                    allUrls.put(m.group(0), getIntWebIndex());
                    deepUrls.put(m.group(0), (deepUrls.get(strUrl) + 1));
                }
                tmpStr = tmpStr.substring(m.end());
                m = p.matcher(tmpStr);
            }
        }

        class Processer implements Runnable {
            GetWeb gw;

            public Processer(GetWeb g) {
                this.gw = g;
            }

            public void run() {
                // Thread.sleep(5000);
                // Note: isEmpty() and getAUrl() are separate synchronized calls,
                // not one atomic step, so this check-then-act can race.
                while (!arrUrls.isEmpty()) {
                    String tmp = getAUrl();
                    getWebByUrl(tmp, charset, allUrls.get(tmp) + "");
                }
            }
        }
    }
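As a further possible refinement: getDomain() only recognizes a fixed list of top-level domains, while java.net.URL can parse the host out of any URL. A sketch of a drop-in method for GetWeb (getDomainByHost is a hypothetical name; note it returns the full host, e.g. www.example.com rather than example.com, so the link-matching regular expression would need adjusting to match):

    // Hypothetical alternative to getDomain(): let java.net.URL parse the host.
    public String getDomainByHost() {
        try {
            return new URL(strHomePage).getHost(); // e.g. "www.example.com"
        } catch (java.net.MalformedURLException e) {
            return null; // same contract as getDomain(): null signals bad input
        }
    }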

posted on 2013-10-12 17:38 by 好不容易


    主站蜘蛛池模板: 亚洲人成亚洲人成在线观看 | 嫩草在线视频www免费观看| 国产在线国偷精品免费看| 99久久99热精品免费观看国产| 日韩免费视频播播| 亚洲第一视频网站| 又粗又长又爽又长黄免费视频 | 一级毛片aaaaaa视频免费看| 1000部禁片黄的免费看| 亚洲午夜激情视频| 久久精品亚洲AV久久久无码| 中文字幕av无码不卡免费| 亚洲男人的天堂一区二区| 亚洲日本成本人观看| 波多野结衣免费在线| 久久亚洲国产午夜精品理论片| 国产精品亚洲精品久久精品| 亚洲精品免费网站| 亚洲黄色在线视频| 十八禁在线观看视频播放免费| 免费h黄肉动漫在线观看| 亚洲 日韩 色 图网站| 青青青免费国产在线视频小草| 久久亚洲精品国产亚洲老地址| 永久免费毛片在线播放 | 亚洲第一永久在线观看| 2022免费国产精品福利在线| 日本高清免费不卡在线| 特级av毛片免费观看| 国产乱子伦精品免费女| 亚洲无人区码一二三码区别图片| 亚洲人成在线免费观看| 亚洲av丰满熟妇在线播放| 亚洲一区二区免费视频| 亚洲Av无码国产一区二区| 午夜视频在线在免费| 亚洲日韩精品无码专区加勒比☆| 国产精品酒店视频免费看| 国产精品亚洲一区二区在线观看 | 天黑黑影院在线观看视频高清免费| 四虎精品亚洲一区二区三区|