锘??xml version="1.0" encoding="utf-8" standalone="yes"?>亚洲精品久久无码,亚洲韩国—中文字幕,亚洲自偷自偷在线成人网站传媒http://www.tkk7.com/human2008/category/28773.htmlzh-cnFri, 11 Jan 2008 10:27:55 GMTFri, 11 Jan 2008 10:27:55 GMT60鍒嗘瀽/瑙f瀽Html欏甸潰:HTML Parser鐨勮瘯鐢?http://www.tkk7.com/human2008/archive/2008/01/11/174505.html鐏?鐏?Fri, 11 Jan 2008 01:45:00 GMThttp://www.tkk7.com/human2008/archive/2008/01/11/174505.htmlhttp://www.tkk7.com/human2008/comments/174505.htmlhttp://www.tkk7.com/human2008/archive/2008/01/11/174505.html#Feedback0http://www.tkk7.com/human2008/comments/commentRss/174505.htmlhttp://www.tkk7.com/human2008/services/trackbacks/174505.html鏈榪戝湪鐮旂┒lucene鐨勫叏鏂囨绱?鍦ㄥ緢澶氬湴鏂歸渶瑕佽В鏋愭垨鑰呰鍒嗘瀽Html鍐呭鎴栬匟tml欏甸潰,Lucene鏈韓鐨勬紨紺虹▼搴忎腑涔熸彁渚涗簡涓涓狧tml Parser,浣嗘槸涓嶆槸綰疛ava鐨勮В鍐蟲柟妗?浜庢槸鍒板鎼滅儲,鍦ㄧ綉涓婃壘鍒頒簡涓涓?HTMLParser".

网址是: http://htmlparser.sourceforge.net ,当前版本为1.5.

下载下来,试用一番,感觉不错,完全能满足lucene解析Html的需求.

过几天贴出lucene进行全文检索的代码.(检索本站的文章等).

试用代码如下,供大家参考:

package com.jscud.test;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import org.htmlparser.visitors.TextExtractingVisitor;

import com.jscud.util.LogMan; // a logging helper class

/**
 * 婕旂ず浜咹tml Parse鐨勫簲鐢?
 *
 * @author scud http://www.jscud.com
 */

public class ParseHtmlTest
{

    public static void main(String[] args) throws Exception
    {
        String aFile = "e:/jscud/temp/test.htm";

        String content = readTextFile(aFile, "GBK");

        test1(content);
        System.out.println("====================================");

        test2(content);
        System.out.println("====================================");

        test3(content);
        System.out.println("====================================");

        test4(content);
        System.out.println("====================================");

        test5(aFile);
        System.out.println("====================================");

        //璁塊棶澶栭儴璧勬簮,鐩稿鎱?br />         test5("         System.out.println("====================================");

    }

    /**
     * 璇誨彇鏂囦歡鐨勬柟寮忔潵鍒嗘瀽鍐呭.
     * filePath涔熷彲浠ユ槸涓涓猆rl.
     *
     * @param resource 鏂囦歡/Url
     */
    public static void test5(String resource) throws Exception
    {
        Parser myParser = new Parser(resource);

        //璁劇疆緙栫爜
        myParser.setEncoding("GBK");

        HtmlPage visitor = new HtmlPage(myParser);

        myParser.visitAllNodesWith(visitor);

        String textInPage = visitor.getTitle();

        System.out.println(textInPage);
    }

    /**
     * 鎸夐〉闈㈡柟寮忓鐞?瀵逛竴涓爣鍑嗙殑Html欏甸潰,鎺ㄨ崘浣跨敤姝ょ鏂瑰紡.
     */
    public static void test4(String content) throws Exception
    {
        Parser myParser;
        myParser = Parser.createParser(content, "GBK");

        HtmlPage visitor = new HtmlPage(myParser);

        myParser.visitAllNodesWith(visitor);

        String textInPage = visitor.getTitle();

        System.out.println(textInPage);
    }

    /**
     * 鍒╃敤Visitor妯″紡瑙f瀽html欏甸潰.
     *
     * 灝忎紭鐐?緲昏瘧浜?lt;>絳夌鍙?
     * 緙虹偣:濂藉絀烘牸,鏃犳硶鎻愬彇link
     *  
     */
    public static void test3(String content) throws Exception
    {
        Parser myParser;
        myParser = Parser.createParser(content, "GBK");

        TextExtractingVisitor visitor = new TextExtractingVisitor();

        myParser.visitAllNodesWith(visitor);

        String textInPage = visitor.getExtractedText();

        System.out.println(textInPage);
    }

    /**
     * 寰楀埌鏅氭枃鏈拰閾炬帴鐨勫唴瀹?
     *
     * 浣跨敤浜嗚繃婊ゆ潯浠?
     */
    public static void test2(String content) throws ParserException
    {
        Parser myParser;
        NodeList nodeList = null;

        myParser = Parser.createParser(content, "GBK");

        NodeFilter textFilter = new NodeClassFilter(TextNode.class);
        NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);

        //鏆傛椂涓嶅鐞?meta
        //NodeFilter metaFilter = new NodeClassFilter(MetaTag.class);

        OrFilter lastFilter = new OrFilter();
        lastFilter.setPredicates(new NodeFilter[] { textFilter, linkFilter });

        nodeList = myParser.parse(lastFilter);

        Node[] nodes = nodeList.toNodeArray();

        for (int i = 0; i < nodes.length; i++)
        {
            Node anode = (Node) nodes[i];

            String line = "";
            if (anode instanceof TextNode)
            {
                TextNode textnode = (TextNode) anode;
                //line = textnode.toPlainTextString().trim();
                line = textnode.getText();
            }
            else if (anode instanceof LinkTag)
            {
                LinkTag linknode = (LinkTag) anode;

                line = linknode.getLink();
               
//@todo 榪囨護(hù)jsp鏍囩:鍙互鑷繁瀹炵幇榪欎釜鍑芥暟
                //line = StringFunc.replace(line, "<%.*%>", "");
            }

            if (isTrimEmpty(line))
                continue;

            System.out.println(line);
        }
    }

    /**
     * 瑙f瀽鏅氭枃鏈妭鐐?
     *
     * @param content
     * @throws ParserException
     */
    public static void test1(String content) throws ParserException
    {
        Parser myParser;
        Node[] nodes = null;

        myParser = Parser.createParser(content, null);

        nodes = myParser.extractAllNodesThatAre(TextNode.class); //exception could be thrown here

        for (int i = 0; i < nodes.length; i++)
        {
            TextNode textnode = (TextNode) nodes[i];
            String line = textnode.toPlainTextString().trim();
            if (line.equals(""))
                continue;
            System.out.println(line);
        }

    }

    /**
     * 璇誨彇涓涓枃浠跺埌瀛楃涓查噷.
     *
     * @param sFileName  鏂囦歡鍚?br />      * @param sEncode   String
     * @return 鏂囦歡鍐呭
     */
    public static String readTextFile(String sFileName, String sEncode)
    {
        StringBuffer sbStr = new StringBuffer();

        try
        {
            File ff = new File(sFileName);
            InputStreamReader read = new InputStreamReader(new FileInputStream(ff),
                    sEncode);
            BufferedReader ins = new BufferedReader(read);

            String dataLine = "";
            while (null != (dataLine = ins.readLine()))
            {
                sbStr.append(dataLine);
                sbStr.append("\r\n");
            }

            ins.close();
        }
        catch (Exception e)
        {
            LogMan.error("read Text File Error", e);
        }

        return sbStr.toString();
    }

    /**
     * 鍘繪帀宸﹀彸絀烘牸鍚庡瓧絎︿覆鏄惁涓虹┖
     * @param astr String
     * @return boolean
     */
    public static boolean isTrimEmpty(String astr)
    {
        if ((null == astr) || (astr.length() == 0))
        {
            return true;
        }
        if (isBlank(astr.trim()))
        {
            return true;
        }
        return false;
    }

    /**
     * 瀛楃涓叉槸鍚︿負(fù)絀?null鎴栬呴暱搴︿負(fù)0.
     * @param astr 婧愬瓧絎︿覆.
     * @return boolean
     */
    public static boolean isBlank(String astr)
    {
        if ((null == astr) || (astr.length() == 0))
        {
            return true;
        }
        else
        {
            return false;
        }
    }

}

 




鐏? 2008-01-11 09:45 鍙戣〃璇勮
]]>
主站蜘蛛池模板: 区久久AAA片69亚洲| 免费人成在线观看播放国产| 在线观看国产区亚洲一区成人 | 成人浮力影院免费看| 亚洲AV乱码久久精品蜜桃| 亚洲午夜在线播放| 无遮免费网站在线入口| 亚洲精品字幕在线观看| 亚洲综合一区二区精品久久| 久久国产精品萌白酱免费| 日本免费人成视频播放| 韩国亚洲伊人久久综合影院| 国内一级一级毛片a免费| 日本亚洲中午字幕乱码| 亚洲人成色7777在线观看不卡 | 久久久久亚洲Av无码专| 无套内射无矿码免费看黄| 亚洲国产成人乱码精品女人久久久不卡 | 中文字幕亚洲综合小综合在线| 成人嫩草影院免费观看| 在线观看无码AV网站永久免费| 久久精品国产亚洲AV麻豆王友容| 亚洲爆乳无码专区www| 免费女人高潮流视频在线观看| 亚洲国产精品无码成人片久久| 亚洲中文字幕无码久久2020| 成人性生交大片免费看中文| 亚洲成熟xxxxx电影| 成人爽A毛片免费看| 美女一级毛片免费观看 | 99精品免费观看| 亚洲精品免费网站| 永久在线免费观看| 久久人午夜亚洲精品无码区| 五月婷婷综合免费| 亚洲天堂电影在线观看| 免费国产成人高清视频网站| 永久免费av无码入口国语片| 亚洲人成综合网站7777香蕉| 亚洲А∨精品天堂在线| 久久亚洲中文字幕无码|