需要解決的問題是:根據一個輸入流讀取一段XML內容,然後對其進行過濾和截取,最後寫回輸出流中。具體說明如下:
1. 對XML根據特定需求過濾標籤(如SCRIPT、FRAME等非標準HTML標籤)和屬性(如onclick、onblur等)。
2. 對XML進行長度截取,具體做法如下:
(1) 對start標籤的處理:若加上start標籤長度後超過最大允許長度,則去除該標籤,同時去除後面和該標籤同一等級的所有標籤。
(2) 對text內容的處理:若加上text內容的長度後超過最大允許長度,則只截取text中能容納的部分,並加上省略號(......)。
(3) 對end標籤的處理:不做長度截取,且要做到自動補齊end標籤。
有關SAX的詳細介紹,請查看最好的參考資料
http://www.saxproject.org/ 。其中有一個很重要的類 DefaultHandler, 該類中的startElement, endElement, characters 3個方法尤為重要。 為解決上述問題,需要設計2個類:HTMLWriter, HTMLFilter, 其中HTMLFilter是HTMLWriter的子類,HTMLWriter繼承了DefaultHandler,其中最為關鍵的是要重寫上述3個關鍵方法。
一.HTMLWriter類的代碼:
這個類主要用於寫操作,最重要的是理解變量strippedElementLevel的用法。上面問題的具體業務邏輯處理(標籤的過濾和長度截取)將在子類HTMLFilter中解決。
package org.util.sax.html;

import openxml.parser.HTMLdtd;
import openxml.parser.HTMLSAXParser;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;
import org.xml.sax.ErrorHandler;
import org.xml.sax.ext.LexicalHandler;
import org.xml.sax.helpers.DefaultHandler;

import java.io.*;
import java.util.Locale;




public class HTMLWriter extends DefaultHandler implements LexicalHandler
{

private ErrorHandler errorHandler;

private Writer out;

private int strippedElementLevel = 0; //用來作為start標簽和end標簽成對出現的標記(極為重要),具體算法思路類似于堆棧
private boolean inRawElement;



public void filter(String htmlContent) throws IOException, SAXException
{
filter(new StringReader(htmlContent));
}


public void filter(Reader in) throws IOException, SAXException
{
filter(new InputSource(in));
}


public void filter(InputSource in) throws IOException, SAXException
{
HTMLSAXParser parser = new HTMLSAXParser(errorHandler, false);
parser.setLexicalHandler(this);

XMLReader htmlReader = new HTMLParserAdapter(parser);
htmlReader.setFeature("http://xml.org/sax/features/namespaces", false);
htmlReader.setContentHandler(this);

prepare();
htmlReader.parse(in);
}



protected void prepare()
{

if (out == null)
{
out = new StringWriter();
}
}



public void setErrorHandler(ErrorHandler errorHandler)
{
this.errorHandler = errorHandler;
}


public void setOut(Writer out)
{
this.out = out;
}


public Writer getOut()
{
return out;
}


public String getResultAsString()
{

if (out instanceof StringWriter)
{
return out.toString();
}
throw new IllegalStateException("Not a buffered target");
}


@Override

public void startDocument() throws SAXException
{
prepare();
}


@Override
public final void startElement(String namespaceURI,
String localName,
String qName,

Attributes attrs) throws SAXException
{

if (strippedElementLevel > 0)
{
strippedElementLevel++;
return;
}

// features/namespace is false

if (!startTag(qName, attrs))
{
strippedElementLevel = 1;
}
}


@Override
public final void endElement(String namespaceURI,
String localName,

String qName) throws SAXException
{

if (strippedElementLevel > 0)
{
strippedElementLevel--;
return;
}

// features/namespace is false
endTag(qName);
}



protected boolean startTag(String tagName, Attributes attrs) throws SAXException
{

String tagUpper = tagName.toUpperCase();

inRawElement = "SCRIPT".equals(tagUpper) || "STYLE".equals(tagUpper);

write('<');
write(tagName);

for (int i = 0; i < attrs.getLength(); i++)
{
// features/namespace is false
String attrName = attrs.getQName(i);
attribute(tagUpper, attrName.toLowerCase(), attrName, attrs.getValue(i));
}
write('>');

return true;
}



protected void endTag(String tagName) throws SAXException
{
inRawElement = false;

if (!isEmptyTag(tagName.toUpperCase()))
{
write("</");
write(tagName);
write('>');
}
}


@Override

public void characters(char[] ch, int start, int length) throws SAXException
{

if (strippedElementLevel != 0)
{
return;
}


if (inRawElement)
{
write(ch, start, length);
return;
}

text(ch, start, length);
}



protected void text(char[] ch, int start, int length) throws SAXException
{
writeText(ch, start, length);
}



public void startDTD(String tagName, String publicId, String systemId) throws SAXException
{
write("<!DOCTYPE ");
write(tagName);
write(" PUBLIC ");
write('"');
write(publicId);
write('"');
write('>');
}



public void endDTD()
{}

public void startEntity(String name)
{}

public void endEntity(String name)
{}

public void startCDATA()
{}

public void endCDATA()
{}


public void comment(char ch[], int start, int length) throws SAXException
{

/**//*
if (strippedElementLevel == 0) {
write("<!--");
write(ch, start, length);
write("-->");
}
*/
}


@Override

public void ignorableWhitespace(char ch[], int start, int length) throws SAXException
{

if (strippedElementLevel == 0)
{
write(ch, start, length);
}
}


protected void attribute(final String tagUpper, // 規范化的 TAG 名稱 - 使用大寫字母
final String attrLower, // 規范化的 屬性 名稱 - 使用小寫字母
String attrName,

String attrValue) throws SAXException
{
write(' ');
write(attrName);

if (!isBoolean(attrLower, tagUpper))
{
write('=');
write('"');

for (int i = 0; i < attrValue.length(); i++)
{
writeEncoded(attrValue.charAt(i), true);
}
write('"');
}
}



protected final void writeText(char[] ch, int start, int length) throws SAXException
{
writeTextWithEnd(ch, start, start + length);
}



protected final void writeTextWithEnd(char[] ch, int begin, int end) throws SAXException
{

for (int i = begin; i < end; i++)
{
writeEncoded(ch[i], false);
}
}



protected void writeEncoded(char c, boolean isAttr) throws SAXException
{

switch (c)
{
case '<':
write("<");
break;
case '>':
write(">");
break;
case '&':
write("&");
break;
case 0xa0: // NBSP
// 暫時只特殊處理特殊字符 NBSP
// 當組信 NBSP 在轉換到純文本時可變成空格
// 但其它特殊字符沒有簡單的Ascii字符可替代, 因而這里也不執行替代
write(" ");
break;
case '"':

if (isAttr)
{
write(""");
break;
}
default:
write(c);
}
}


protected void write(char c) throws SAXException
{

try
{
out.write(c);

} catch (IOException e)
{
throw new SAXException(e);
}
}



protected void write(char ch[], int start, int length) throws SAXException
{

try
{
out.write(ch, start, length);

} catch (IOException e)
{
throw new SAXException(e);
}
}


protected void write(String s) throws SAXException
{

try
{
out.write(s);

} catch (IOException e)
{
throw new SAXException(e);
}
}



private static boolean isBoolean(String attrLower, String tagUpper)
{
return HTMLdtd.isBoolean(attrLower, tagUpper);
}


private static boolean isEmptyTag(String tagUpper)
{
return HTMLdtd.isEmptyTag(tagUpper);
}

}

二. HTMLFilter 類的代碼:
主要解決標籤過濾(即哪些標籤和屬性需要過濾)以及長度截取(即斷點出現在startTag、text、endTag時應該如何處理)的問題。
主要理解重寫父類HTMLWriter的幾個方法:startTag()、characters()、comment()、attribute()。另外需要一個成員變量currentLen記錄當前已寫入的長度,在執行write()方法時要對currentLen進行累加。
package org.util.sax.html;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

import java.util.Map;
import java.io.Writer;
import java.io.CharArrayWriter;
import java.io.IOException;


public class HTMLFilter extends HTMLWriter
{

ConfigManager conf = CM.getConfig();
Map<String, String> cidMap; //cid 和 正文內容圖片 filename的 映射
private int currentLen; //當前已經寫入out的長度
private int maxLen; //允許push的最大長度
private boolean ignore=false; //當出現要截取時,就設為 true ,意味著如果ignore 為true時, 就以后的內容都要忽略。


public HTMLFilter(Map<String,String> map,int allowMessage_BodyLen)
{
//super.setAllowContentLen(allowMessage_BodyLen);
this.maxLen=allowMessage_BodyLen;
this.cidMap=map;
}

@Override

protected boolean startTag(String tagName, Attributes attrs) throws SAXException
{

if (!isTagAllowed(tagName, attrs))
{
return false;
}


if (ignore)
{
return false;
}

Writer originalOutput = getOut();
int remainChars = getRemainChars();


if(remainChars == 0)
{
ignore = true;
write("
");
return false;
}

CharArrayWriter capturedOutput = new CharArrayWriter();
setOut(capturedOutput);


try
{

if (super.startTag(tagName, attrs))
{

if (capturedOutput.toCharArray().length < remainChars)
{

try
{
originalOutput.write(capturedOutput.toCharArray());
return true;

} catch (IOException e)
{
throw new SAXException(e);
}
}
}

} finally
{
setOut(originalOutput);
}

ignore = true;
write("
");
return false;
}


@Override

public void characters(char[] ch, int start, int length) throws SAXException
{

if (ignore)
{ //如果長度已經超出限制,則不寫
return;
}
int remainChars = getRemainChars();


if (remainChars == 0)
{
ignore = true;
write("
");
return;
}


if (remainChars < length)
{ //當將要寫入的 text 長度 大于 remainChars 時, 就寫入所能夠寫入的字符,然后添加省略號

ignore = true;
super.characters(ch, start, remainChars);
write("
");

} else
{
super.characters(ch, start, length);
}
}

@Override

protected void endTag(String tagName) throws SAXException
{
super.endTag(tagName);
}


public void comment(char ch[], int start, int length) throws SAXException
{

if(ignore)
{
return;
}
int remainChars = getRemainChars();


if (remainChars == 0)
{
ignore = true;
write("
");
return;
}

if (remainChars < length)
{
ignore=true;
super.comment(ch, start, remainChars);

} else
{
super.comment(ch, start, length);
}

}

@Override
protected void attribute(final String tagUpper,
final String attrLower,
final String attrName,

String attrValue) throws SAXException
{


if (attrLower.startsWith("on"))
{
return;
}

if (tagUpper.equalsIgnoreCase("IMG") && attrLower.equalsIgnoreCase("src") && attrValue.trim().indexOf("cid:") != -1)
{
attrValue=attrValue.trim();
int cid_idx = attrValue.indexOf("cid:");
String cid = attrValue.substring(cid_idx + 4);
// System.out.println("cid is: "+ cid);
String photoName = cidMap.get(cid);
// System.out.println("photoName is: "+ photoName);

if (photoName != null)
{
super.attribute(tagUpper, attrLower, attrName, "#{" + photoName + "}");

} else
{
super.attribute(tagUpper, attrLower, attrName, "#{" + " " + "}");
}



} else
{
attrValue = transformScript(attrValue);
super.attribute(tagUpper, attrLower, attrName, attrValue);
}
}


private String transformScript(final String data)
{

if (true)
{
final String trimedData = data.trim();
final String scriptData = mySubstringAfterIgnoreCase(trimedData, "javascript:");

if (scriptData != null)
{
return "";
}
}
return data;
}


protected boolean isTagAllowed(String tagName, Attributes attrs)
{

if (tagName.equalsIgnoreCase("SCRIPT"))
{
return false;
}

if(tagName.equalsIgnoreCase("A"))
{ //超鏈接標簽不push
return false;
}

if (tagName.equalsIgnoreCase("PARAM"))
{
String name = getAttrIgnoreCase(attrs, "name");

if ("movie".equalsIgnoreCase(name) || "src".equalsIgnoreCase(name))
{
return false;
}
}

/**//*
if (tagName.equalsIgnoreCase("STYLE")) {
return false;
}
*/
if (tagName.equalsIgnoreCase("LINK") &&

"stylesheet".equalsIgnoreCase(getAttrIgnoreCase(attrs, "rel")))
{
return false;
}

if (tagName.equals("FRAME") || tagName.equals("FRAMESET"))
{
return false;
}
return true;
}



private static String getAttrIgnoreCase(Attributes attrs, String name)
{

for (int i = 0, len = attrs.getLength(); i < len; i++)
{

if (name.equalsIgnoreCase(attrs.getQName(i)))
{
return attrs.getValue(i);
}
}
return null;
}



/** *//**
* 忽略控制字符后, 判斷是否以某字符串開始, 并返回匹配后的截取部分.
* <p/>
* <p/>
* 注: 忽略控制字符是為了對付IE的安全漏洞
*
* @param source 源字符串
* @param prefix 要匹配的前綴字符串
* @return 如果測試成功, 返回截取后的字符串; 否則, 返回 null;
*/

static String mySubstringAfterIgnoreCase(String source, String prefix)
{
int sourceLength = source.length();
int targetLength = prefix.length();


if (sourceLength < targetLength)
{
return null;
}

int sourceOffset = 0;
int targetOffset = 0;
char targetChar = Character.toUpperCase(prefix.charAt(targetOffset));


for (; sourceOffset < sourceLength; sourceOffset++)
{
char c = source.charAt(sourceOffset);

if (c < ' ')
{
// 忽略控制字符
continue;
}


if (Character.toUpperCase(c) != targetChar)
{
break;
}

targetOffset++;

if (targetOffset == targetLength)
{
return source.substring(sourceOffset + 1);
}

targetChar = Character.toUpperCase(prefix.charAt(targetOffset));
}

return null;
}


protected void write(char c) throws SAXException
{
super.write(c);
currentLen++;
}


protected void write(char ch[], int start, int length) throws SAXException
{
super.write(ch, start, length);
currentLen += length;
}


protected void write(String s) throws SAXException
{
super.write(s);
currentLen += s.length();
}


protected int getRemainChars()
{ //求出還剩多少個字符可以寫入
return (maxLen - currentLen);
}


}