<rt id="bn8ez"></rt>
<label id="bn8ez"></label>

  • <span id="bn8ez"></span>

    <label id="bn8ez"><meter id="bn8ez"></meter></label>

    posts - 495,comments - 227,trackbacks - 0

    為了支持全文檢索,有必要將HTML格式的文章轉化為純文本格式,因此我設計了一個基本的WebFormatter類,提供一個簡單的public static String html2text(String html),將HTML格式轉化為Text:

    /*
     * File: WebFormatter.java
     * Created on 2005-6-24
     * Author: Liao Xuefeng, asklxf@163.com
     * Copyright (C) 2005, Liao Xuefeng.
     */
    package com.mboker.blog.web.util;

    import java.util.*;
    import java.text.SimpleDateFormat;

    /**
    ?* Do some format on web display.
    ?*
    ?* @author Xuefeng
    ?*/
    public class WebFormatter {

    ??? public static String html2text(String html) {
    ??????? StringBuffer sb = new StringBuffer(html.length());
    ??????? char[] data = html.toCharArray();
    ??????? int start = 0;
    ??????? boolean previousIsPre = false;
    ??????? Token token = null;
    ??????? for(;;) {
    ??????????? token = parse(data, start, previousIsPre);
    ??????????? if(token==null)
    ??????????????? break;
    ??????????? previousIsPre = token.isPreTag();
    ??????????? sb = sb.append(token.getText());
    ??????????? start += token.getLength();
    ??????? }
    ??????? return sb.toString();
    ??? }

    ??? private static Token parse(char[] data, int start, boolean previousIsPre) {
    ??????? if(start>=data.length)
    ??????????? return null;
    ??????? // try to read next char:
    ??????? char c = data[start];
    ??????? if(c=='<') {
    ??????????? // this is a tag or comment or script:
    ??????????? int end_index = indexOf(data, start+1, '>');
    ??????????? if(end_index==(-1)) {
    ??????????????? // the left is all text!
    ??????????????? return new Token(Token.TOKEN_TEXT, data, start, data.length, previousIsPre);
    ??????????? }
    ??????????? String s = new String(data, start, end_index-start+1);
    ??????????? // now we got s="<...>":
    ??????????? if(s.startsWith("<!--")) { // this is a comment!
    ??????????????? int end_comment_index = indexOf(data, start+1, "-->");
    ??????????????? if(end_comment_index==(-1)) {
    ??????????????????? // illegal end, but treat as comment:
    ??????????????????? return new Token(Token.TOKEN_COMMENT, data, start, data.length, previousIsPre);
    ??????????????? }
    ??????????????? else
    ??????????????????? return new Token(Token.TOKEN_COMMENT, data, start, end_comment_index+3, previousIsPre);
    ??????????? }
    ??????????? String s_lowerCase = s.toLowerCase();
    ??????????? if(s_lowerCase.startsWith("<script")) { // this is a script:
    ??????????????? int end_script_index = indexOf(data, start+1, "</script>");
    ??????????????? if(end_script_index==(-1))
    ??????????????????? // illegal end, but treat as script:
    ??????????????????? return new Token(Token.TOKEN_SCRIPT, data, start, data.length, previousIsPre);
    ??????????????? else
    ??????????????????? return new Token(Token.TOKEN_SCRIPT, data, start, end_script_index+9, previousIsPre);
    ??????????? }
    ??????????? else { // this is a tag:
    ??????????????? return new Token(Token.TOKEN_TAG, data, start, start+s.length(), previousIsPre);
    ??????????? }
    ??????? }
    ??????? // this is a text:
    ??????? int next_tag_index = indexOf(data, start+1, '<');
    ??????? if(next_tag_index==(-1))
    ??????????? return new Token(Token.TOKEN_TEXT, data, start, data.length, previousIsPre);
    ??????? return new Token(Token.TOKEN_TEXT, data, start, next_tag_index, previousIsPre);
    ??? }

    ??? private static int indexOf(char[] data, int start, String s) {
    ??????? char[] ss = s.toCharArray();
    ??????? // TODO: performance can improve!
    ??????? for(int i=start; i<(data.length-ss.length); i++) {
    ??????????? // compare from data[i] with ss[0]:
    ??????????? boolean match = true;
    ??????????? for(int j=0; j<ss.length; j++) {
    ??????????????? if(data[i+j]!=ss[j]) {
    ??????????????????? match = false;
    ??????????????????? break;
    ??????????????? }
    ??????????? }
    ??????????? if(match)
    ??????????????? return i;
    ??????? }
    ??????? return (-1);
    ??? }

    ??? private static int indexOf(char[] data, int start, char c) {
    ??????? for(int i=start; i<data.length; i++) {
    ??????????? if(data[i]==c)
    ??????????????? return i;
    ??????? }
    ??????? return (-1);
    ??? }

    }

    /**
     * One lexical unit of an HTML document (a text run, a tag, a comment or a
     * script) together with its plain-text rendering.
     *
     * Tags render as line breaks, bullets, rulers or tabs where listed in
     * <code>parseText</code> and as nothing otherwise; comments and scripts
     * render as nothing; text runs have white space normalised and character
     * references decoded.
     */
    class Token {

        public static final int TOKEN_TEXT    = 0; // html text.
        public static final int TOKEN_COMMENT = 1; // comment like <!-- comments... -->
        public static final int TOKEN_TAG     = 2; // tag like <pre>, <font>, etc.
        public static final int TOKEN_SCRIPT  = 3; // a script element including its body

        private static final char[] TAG_BR  = "<br".toCharArray();
        private static final char[] TAG_P   = "<p".toCharArray();
        private static final char[] TAG_LI  = "<li".toCharArray();
        private static final char[] TAG_PRE = "<pre".toCharArray();
        private static final char[] TAG_HR  = "<hr".toCharArray();

        private static final char[] END_TAG_TD = "</td>".toCharArray();
        private static final char[] END_TAG_TR = "</tr>".toCharArray();
        private static final char[] END_TAG_LI = "</li>".toCharArray();

        // longest character reference that is looked for, '&' and ';' included
        private static final int MAX_ENTITY_LENGTH = 10;

        // named character reference (String) -> replacement text (String)
        private static final Map SPECIAL_CHARS = new HashMap();

        private int type;
        private String html;           // original html
        private String text = null;    // plain-text rendering, null means "nothing"
        private int length = 0;        // html length
        private boolean isPre = false; // is this token a <pre> tag?

        static {
            SPECIAL_CHARS.put("&quot;", "\"");
            SPECIAL_CHARS.put("&lt;",   "<");
            SPECIAL_CHARS.put("&gt;",   ">");
            SPECIAL_CHARS.put("&amp;",  "&");
            SPECIAL_CHARS.put("&reg;",  "(r)");
            SPECIAL_CHARS.put("&copy;", "(c)");
            SPECIAL_CHARS.put("&nbsp;", " ");
            // the pound sign, written as an escape so that the source file stays
            // pure ASCII (the literal had presumably been damaged into "?"):
            SPECIAL_CHARS.put("&pound;", "\u00a3");
        }

        /**
         * Creates the token covering <code>data[start..end)</code>.
         *
         * @param type          one of the <code>TOKEN_*</code> constants
         * @param data          the whole document
         * @param start         index of the first char of the token
         * @param end           index one past the last char of the token
         * @param previousIsPre whether the preceding token was a &lt;pre&gt; tag;
         *                      if so a text token keeps its spaces and line breaks
         */
        public Token(int type, char[] data, int start, int end, boolean previousIsPre) {
            this.type = type;
            this.length = end - start;
            this.html = new String(data, start, length);
            parseText(previousIsPre);
        }

        /** Returns the number of chars of the original html covered by this token. */
        public int getLength() {
            return length;
        }

        /** Returns true if this token is a &lt;pre&gt; start tag. */
        public boolean isPreTag() {
            return isPre;
        }

        /** Returns the plain-text rendering, "" if the token renders as nothing. */
        public String getText() {
            return text==null ? "" : text;
        }

        /** Returns the original html of this token. */
        public String toString() {
            return html;
        }

        // computes 'text' and 'isPre' from 'type' and 'html'
        private void parseText(boolean previousIsPre) {
            if(type==TOKEN_TAG) {
                char[] cs = html.toCharArray();
                if(compareTag(TAG_BR, cs) || compareTag(TAG_P, cs))
                    text = "\n";
                else if(compareTag(TAG_LI, cs))
                    text = "\n* ";
                else if(compareTag(TAG_PRE, cs))
                    isPre = true;
                else if(compareTag(TAG_HR, cs))
                    text = "\n--------\n";
                else if(compareString(END_TAG_TD, cs))
                    text = "\t";
                else if(compareString(END_TAG_TR, cs) || compareString(END_TAG_LI, cs))
                    text = "\n";
            }
            else if(type==TOKEN_TEXT) {
                text = toText(html, previousIsPre);
            }
            // comments and scripts keep text==null
        }

        /**
         * Renders a text run. Outside of pre-formatted text runs of spaces
         * collapse to one space and line breaks are dropped; inside, spaces are
         * kept and every line break becomes '\n'. Character references are
         * decoded in both modes.
         */
        private String toText(String html, final boolean isPre) {
            char[] cs = html.toCharArray();
            StringBuffer buffer = new StringBuffer(cs.length);
            boolean continueSpace = false; // previous char was a space
            int pos = 0;
            while(pos<cs.length) {
                char current = cs[pos];
                if(current==' ') {
                    if(isPre || !continueSpace)
                        buffer.append(' ');
                    continueSpace = true;
                    pos++;
                    continue;
                }
                if(current=='\r' || current=='\n') {
                    if(isPre)
                        buffer.append('\n');
                    // "\r\n" counts as a single line break:
                    boolean crlf = current=='\r' && pos+1<cs.length && cs[pos+1]=='\n';
                    pos += crlf ? 2 : 1;
                    continue;
                }
                continueSpace = false;
                if(current=='&') {
                    pos += appendReference(cs, pos, buffer);
                    continue;
                }
                buffer.append(current);
                pos++;
            }
            return buffer.toString();
        }

        /**
         * Decodes the character reference that may start at <code>cs[start]</code>
         * (which is '&amp;') and appends the result to <code>buffer</code>. If no
         * known reference starts there, only the '&amp;' itself is appended.
         *
         * @return the number of chars of <code>cs</code> that were consumed
         */
        private int appendReference(char[] cs, int start, StringBuffer buffer) {
            int length = readUtil(cs, start, ';', MAX_ENTITY_LENGTH);
            if(length!=(-1)) {
                String specChar = (String)SPECIAL_CHARS.get(new String(cs, start, length));
                if(specChar!=null) {
                    buffer.append(specChar);
                    return length;
                }
                // numeric reference: "&#" digits ";" or "&#x" hex digits ";"
                if(length>3 && cs[start+1]=='#') {
                    int code = parseCharCode(cs, start+2, length-3);
                    if(code>0 && code<65536) {
                        buffer.append((char)code);
                        return length; // skip the whole reference, not just '&'
                    }
                }
            }
            // just a plain '&':
            buffer.append('&');
            return 1;
        }

        /**
         * Parses <code>count</code> chars starting at <code>cs[offset]</code> as a
         * decimal number, or as a hexadecimal one when prefixed with 'x' or 'X'.
         *
         * @return the value, or -1 if it is malformed or not below 65536
         */
        private static int parseCharCode(char[] cs, int offset, int count) {
            int radix = 10;
            if(count>0 && (cs[offset]=='x' || cs[offset]=='X')) {
                radix = 16;
                offset++;
                count--;
            }
            if(count<=0)
                return (-1);
            int code = 0;
            for(int i=0; i<count; i++) {
                int digit = Character.digit(cs[offset+i], radix);
                if(digit<0)
                    return (-1);
                code = code*radix + digit;
                if(code>=65536)
                    return (-1);
            }
            return code;
        }

        // read from cs[start] until the specified char 'util' is met, looking at
        // no more than maxLength chars and never beyond the end of cs; returns the
        // number of chars read including 'util', or -1 if it was not found:
        private int readUtil(final char[] cs, final int start, final char util, final int maxLength) {
            int end = Math.min(cs.length, start+maxLength);
            for(int i=start; i<end; i++) {
                if(cs[i]==util)
                    return i-start+1;
            }
            return (-1);
        }

        // compare standard tag "<input" with tag "<INPUT value=aa>": the name
        // must match ignoring case and must not be followed by another letter,
        // so "<p" matches "<P>" and "<p class=x>" but not "<pre>":
        private boolean compareTag(final char[] ori_tag, char[] tag) {
            if(tag.length<=ori_tag.length || !compareString(ori_tag, tag))
                return false;
            char c = Character.toLowerCase(tag[ori_tag.length]);
            return c<'a' || c>'z';
        }

        // true if 'comp' starts with the lower-case 'ori', ignoring case:
        private boolean compareString(final char[] ori, char[] comp) {
            if(ori.length>comp.length)
                return false;
            for(int i=0; i<ori.length; i++) {
                if(Character.toLowerCase(comp[i])!=ori[i])
                    return false;
            }
            return true;
        }
    }

    注意,請先將html中的<body>...</body>部分提取出來,再交給WebFormatter處理,因為html->text轉換實質是刪除所有標簽(某些標簽如<br>被轉化為'\n')、Script和注釋,對于JavaScript生成的動態內容(例如document.write)無能為力。

    posted on 2006-04-07 16:33 SIMONE 閱讀(808) 評論(0)  編輯  收藏 所屬分類: JAVA
    主站蜘蛛池模板: 国产99在线|亚洲| a级毛片在线视频免费观看| 久久精品国产精品亚洲人人| 四虎影视无码永久免费| 亚洲成a人片在线观看播放| 国产午夜免费福利红片| 本道天堂成在人线av无码免费| 久久久亚洲欧洲日产国码二区| 日韩免费观看的一级毛片| 精品四虎免费观看国产高清午夜| 亚洲六月丁香婷婷综合| 国产亚洲精久久久久久无码77777 国产亚洲精品成人AA片新蒲金 | 亚洲国产综合专区电影在线| 美女黄网站人色视频免费国产| 两个人看的www免费| 亚洲AV无码专区在线观看成人| 亚洲AV午夜成人影院老师机影院| 妞干网免费观看视频| 一个人免费视频观看在线www| 亚洲AV无码片一区二区三区| 亚洲日本一区二区三区| 亚洲精品视频在线观看你懂的| 97性无码区免费| 你是我的城池营垒免费观看完整版| 亚洲一卡一卡二新区无人区| 亚洲AV成人一区二区三区AV| 亚洲成A∨人片天堂网无码| 丁香花免费完整高清观看| 国产在线观看免费视频软件| 免费观看亚洲人成网站| 亚洲男人的天堂久久精品| 久久亚洲国产精品| 亚洲中文字幕丝袜制服一区| 日韩免费在线观看| 人成午夜免费视频在线观看| 污视频在线免费观看| 好湿好大好紧好爽免费视频 | 国产福利视精品永久免费| 久久免费高清视频| 中文字幕不卡免费高清视频| 国产大陆亚洲精品国产|