Any segmenter you write yourself must extend Analyzer, whose source code looks like this:
package org.apache.lucene.analysis;

import java.io.Reader;

public abstract class Analyzer
{
    public abstract TokenStream tokenStream(String string, Reader reader);

    public int getPositionIncrementGap(String fieldName) {
        return 0;
    }
}
The abstract tokenStream() method must be implemented, and its return type is TokenStream. TokenStream is itself an abstract class; here is its source:
package org.apache.lucene.analysis;

import java.io.IOException;

public abstract class TokenStream
{
    public abstract Token next() throws IOException;

    public void close() throws IOException {
        /* empty */
    }
}
So tokenStream() should return an instance of a concrete subclass of this abstract class. Inside TokenStream there is one abstract method, next(), that has to be implemented; it returns a Token. Token is yet another class, whose source is:
package org.apache.lucene.analysis;

public final class Token
{
    String termText;
    int startOffset;
    int endOffset;
    String type = "word";
    private int positionIncrement = 1;

    public Token(String text, int start, int end) {
        termText = text;
        startOffset = start;
        endOffset = end;
    }

    public Token(String text, int start, int end, String typ) {
        termText = text;
        startOffset = start;
        endOffset = end;
        type = typ;
    }

    // ...

    public final String toString() {
        StringBuffer sb = new StringBuffer();
        sb.append("(" + termText + "," + startOffset + "," + endOffset);
        if (!type.equals("word"))
            sb.append(",type=" + type);
        if (positionIncrement != 1)
            sb.append(",posIncr=" + positionIncrement);
        sb.append(")");
        return sb.toString();
    }
}
The four constructor parameters define what a Token looks like. Token format: (word, start, end, type).
So the Tokens we produce in our own next() method must have exactly this form.
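As a quick illustration (the text, offsets and type here are made up), constructing a Token with those four arguments and printing it gives:

Token t = new Token("hello", 0, 5, "single"); // text, startOffset, endOffset, type
System.out.println(t); // prints (hello,0,5,type=single) because type != "word" and posIncr is 1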
That is enough analysis; let's look at a real implementation.
First, a class that extends Analyzer:
import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;

public class ChineseAnalyzer extends Analyzer {
    // stop words to drop from the token stream
    public final static String[] STOP_WORDS = {"的", "和"};
    private Set stopTable;

    public ChineseAnalyzer() {
        stopTable = StopFilter.makeStopSet(STOP_WORDS);
    }

    public TokenStream tokenStream(String fieldName, Reader reader) {
        // wrap the raw tokenizer with a stop-word filter
        return new StopFilter(new ChineseTokenizer(reader), stopTable);
    }
}
StopFilter extends TokenFilter, and TokenFilter extends TokenStream,
so StopFilter is itself a TokenStream.
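To see what that means in practice, here is a simplified sketch of a stop-word TokenFilter. This is not Lucene's actual StopFilter source; it assumes the old 1.x/2.x-era API, where TokenFilter exposes the wrapped stream as the protected field input and Token has a termText() accessor, and the class name SimpleStopFilter is made up:

import java.io.IOException;
import java.util.Set;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

public class SimpleStopFilter extends TokenFilter {
    private final Set stopWords;

    public SimpleStopFilter(TokenStream in, Set stopWords) {
        super(in); // TokenFilter keeps the wrapped stream in the field "input"
        this.stopWords = stopWords;
    }

    public Token next() throws IOException {
        // pull tokens from the wrapped stream and drop the ones in the stop set
        for (Token t = input.next(); t != null; t = input.next()) {
            if (!stopWords.contains(t.termText())) {
                return t;
            }
        }
        return null; // wrapped stream is exhausted
    }
}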
The most important piece is ChineseTokenizer(reader).
It too is a TokenStream: ChineseTokenizer extends Tokenizer, and Tokenizer extends TokenStream, so it must also override the next() method.
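Before walking through loadWords() and next(), it helps to see the class skeleton and fields they rely on. The field names come from the code below; the buffer size and the WORD_MAX_LENGTH value are placeholder assumptions:

import java.io.Reader;
import java.util.TreeMap;
import org.apache.lucene.analysis.Tokenizer;

public class ChineseTokenizer extends Tokenizer {
    private static final int WORD_MAX_LENGTH = 10;      // assumed limit on dictionary word length
    private static TreeMap<String, String> dictionary;  // shared dictionary: "1" = full word, "2" = prefix

    private final char[] ioBuffer = new char[4096];     // read buffer (size assumed)
    private int bufferIndex = 0;                         // current position inside ioBuffer
    private int dataLength = 0;                          // number of chars actually read into ioBuffer
    private int offset = 0;                              // absolute offset in the input
    private String tokenType = "word";                   // type of the token being built

    public ChineseTokenizer(Reader reader) {
        super(reader); // Tokenizer stores the reader in its protected field "input"
    }

    // loadWords() and next() are shown below
}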
Segmentation here uses forward maximum matching, which needs a dictionary.
The dictionary is loaded into a TreeMap.
The TreeMap class implements the Map interface with a tree: it stores key/value pairs in sorted key order while still allowing fast retrieval, and unlike a hash map it guarantees that its entries are kept in ascending key order.
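A tiny, self-contained illustration of that sorted behavior (the entries are arbitrary):

TreeMap<String, String> dict = new TreeMap<String, String>();
dict.put("中华", "2");
dict.put("中华人民共和国", "1");
dict.put("大家", "1");
System.out.println(dict.firstKey());          // smallest key in sort order
System.out.println(dict.containsKey("中华")); // O(log n) lookup -> true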
The dictionary-loading code:
public void loadWords() {
    if (dictionary == null) { // load only once; the loaded dictionary is kept in a shared field
        dictionary = new TreeMap<String, String>();
        InputStream is = null;
        InputStreamReader isr = null;
        BufferedReader br = null;
        try {
            is = new FileInputStream("c:/dictionary.txt"); // path to the dictionary file
            isr = new InputStreamReader(is, "UTF-8");
            br = new BufferedReader(isr);
            String word = null;
            while ((word = br.readLine()) != null) {
                int wordLength = word.length();
                // lines containing "#" are treated as dictionary comments and skipped
                if ((word.indexOf("#") == -1)
                        && (wordLength <= WORD_MAX_LENGTH)) {
                    dictionary.put(word.intern(), "1"); // "1" marks a complete word
                    int i = wordLength - 1;
                    while (i >= 2) {
                        String temp = word.substring(0, i).intern();
                        if (!dictionary.containsKey(temp)) {
                            dictionary.put(temp, "2"); // "2" marks a prefix of a word
                        }
                        i--;
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (br != null) {
                    br.close();
                }
                if (isr != null) {
                    isr.close();
                }
                if (is != null) {
                    is.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    System.out.println(dictionary.size());
}
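To make the "1"/"2" marking concrete, suppose the dictionary file contains the single (made-up) entry 中华人民 and WORD_MAX_LENGTH is large enough to accept it. After loadWords() the map holds:

// "中华人民" -> "1"   the complete word itself
// "中华人"   -> "2"   prefix of length 3
// "中华"     -> "2"   prefix of length 2
// Single-character prefixes are not stored (the inner loop stops at i >= 2),
// and an entry already marked "1" is never overwritten with "2".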
Below is the next() method; it is the one that returns Tokens in the (word, start, end, type) format described above.
public Token next() throws IOException {
    System.out.println("load dictionary");
    // load the dictionary (only the first call actually reads the file)
    loadWords();
    System.out.println("load dictionary over");
    StringBuffer word = new StringBuffer();
    while (true) {
        char c;                                      // current character
        char nextChar;                               // following character
        Character.UnicodeBlock cUnicodeBlock;        // Unicode block of the current character
        Character.UnicodeBlock nextCharUnicodeBlock; // Unicode block of the following character
        offset++;                                    // absolute offset in the input
        if (bufferIndex >= dataLength) { // buffer exhausted: refill from input and reset the index
            dataLength = input.read(ioBuffer);
            bufferIndex = 0;
        }
        if (dataLength == -1) { // end of input
            if (word.length() == 0) {
                return null;
            } else {
                break;
            }
        }
        c = ioBuffer[bufferIndex++];                  // get the current character
        cUnicodeBlock = Character.UnicodeBlock.of(c); // its Unicode block
        nextChar = ioBuffer[bufferIndex];             // get the following character
        nextCharUnicodeBlock = Character.UnicodeBlock.of(nextChar);
        // do the two characters belong to the same Unicode block?
        boolean isSameUnicodeBlock = cUnicodeBlock.toString()
                .equalsIgnoreCase(nextCharUnicodeBlock.toString());
        // the current character is a CJK character
        if (cUnicodeBlock == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
            tokenType = "double"; // double-byte token
            if (word.length() == 0) {
                word.append(c);
                // enhancement -- start: the Unicode block changes, so exit the loop (redundant code)
                if (word.length() != 0 && (!isSameUnicodeBlock)) {
                    break;
                }
                // enhancement -- end
            } else {
                // combine with the next character; if the combination is in the dictionary, append it to word
                String temp = (word.toString() + c).intern();
                if (dictionary.containsKey(temp)) {
                    word.append(c);
                    // enhancement -- start
                    if (word.length() != 0 && (!isSameUnicodeBlock)) {
                        break;
                    }
                    // enhancement -- end
                } else {
                    // not in the dictionary: push the character back and emit the current token
                    bufferIndex--;
                    offset--;
                    break;
                }
            }
        } else if (cUnicodeBlock == Character.UnicodeBlock.BASIC_LATIN) {
            tokenType = "single"; // single-byte token
            if (Character.isWhitespace(c)) {
                if (word.length() != 0)
                    break;
            } else {
                word.append(c);
                // enhancement -- start
                if (word.length() != 0 && (!isSameUnicodeBlock)) {
                    break;
                }
                // enhancement -- end
            }
        }
        System.out.println("word=" + word);
    }
    // build and return the Token
    Token token = new Token(word.toString(), offset - word.length(),
            offset, tokenType);
    // clear word
    word.setLength(0);
    System.out.println(token);
    return token;
}
The while loop above is the heart of the whole tokenizer. (END)
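Finally, a minimal driver sketch, assuming the classes above compile against an old (1.x/2.x-era) Lucene, that c:/dictionary.txt exists, and that the field name "content" and class name AnalyzerDemo are arbitrary choices:

import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public class AnalyzerDemo {
    public static void main(String[] args) throws Exception {
        Reader reader = new StringReader("我和大家");
        TokenStream ts = new ChineseAnalyzer().tokenStream("content", reader);
        // iterate the stream until next() returns null
        for (Token t = ts.next(); t != null; t = ts.next()) {
            System.out.println(t); // exact output depends on the dictionary contents
        }
        ts.close();
    }
}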