網絡上有很多lucene的分詞介紹,但沒有注釋,看的云里霧里,自己看了點源代碼,做了點注釋。
自己寫的分詞都必須繼承Analyzer,而這個Analyzer的源代碼是這樣的:
package org.apache.lucene.analysis;
import java.io.Reader;
// Base class for all analyzers: builds the TokenStream used to index a field.
public abstract class Analyzer
{
// Subclasses must implement this: create the token pipeline for the given field.
public abstract TokenStream tokenStream(String string, Reader reader);
// Position gap inserted between multiple values of the same field; 0 by default.
public int getPositionIncrementGap(String fieldName) {
return 0;
}
}
上面的抽象方法 tokenStream 需要實現,返回的類型是TokenStream,而TokenStream是個抽象類,看源代碼:
package org.apache.lucene.analysis;
import java.io.IOException;
// A stream of tokens: next() yields one Token at a time, null at end of input.
public abstract class TokenStream
{
// Returns the next Token, or null when the stream is exhausted.
public abstract Token next() throws IOException;
// Releases any resources; the default implementation does nothing.
public void close() throws IOException {
/* empty */
}
}
所以返回的應該是這個抽象類的實現類的實例。在這個抽象類中,有個抽象方法 next() 需要具體實現,返回Token,Token又是一個類,源代碼是:
package org.apache.lucene.analysis;
// One token produced by analysis: its text, character offsets, and a type label.
public final class Token
{
String termText;          // the token's text
int startOffset;          // start character offset in the source text
int endOffset;            // end character offset in the source text
String type = "word";     // lexical type; "word" by default
private int positionIncrement = 1;  // positions advanced relative to the previous token
// Construct a token with the default type "word".
public Token(String text, int start, int end) {
termText = text;
startOffset = start;
endOffset = end;
}
// Construct a token with an explicit type label.
public Token(String text, int start, int end, String typ) {
termText = text;
startOffset = start;
endOffset = end;
type = typ;
}
// ... remaining members elided in the original article ...
。。。
// Debug representation: (text,start,end[,type=...][,posIncr=...]).
public final String toString() {
StringBuffer sb = new StringBuffer();
sb.append("(" + termText + "," + startOffset + "," + endOffset);
if (!type.equals("word"))
sb.append(",type=" + type);
if (positionIncrement != 1)
sb.append(",posIncr=" + positionIncrement);
sb.append(")");
return sb.toString();
}
}
四個基本參數構造了它的樣子,Token 格式:(word,開始,結束,類型)
所以我們要在next()方法中得到這樣的Token。
分析到此為止,看個實在的:
首先有個類要繼承Analyzer
public class ChineseAnalyzer extends Analyzer {
public final static String[] STOP_WORDS = {"的","和"};
private Set stopTable;
public MMChineseAnalyzer() {
stopTable = StopFilter.makeStopSet(STOP_WORDS);
}
public TokenStream tokenStream(String fieldName, Reader reader) {
return new StopFilter(new ChineseTokenizer(reader), stopTable);
}
}
StopFilter 聲明為 StopFilter extends TokenFilter,而 TokenFilter 聲明為 TokenFilter extends TokenStream
所以StopFilter也是個TokenStream。
最主要的是ChineseTokenizer(reader)
它也是個TokenStream,它繼承 ChineseTokenizer extends Tokenizer,而 Tokenizer extends TokenStream,所以它也要重寫next()方法;
這里采用前向最大匹配,用到字典;
字典加載用TreeMap保存
TreeMap類通過使用樹來實現Map接口.TreeMap提供了按排序順序存儲關鍵字/值對的有效手段, 同時允許快速檢索。不像散列映射,樹映射保證它的元素按照關鍵字升序排序。
字典加載代碼
// Load the word dictionary from disk into the dictionary field (a TreeMap).
// Each full word is stored with value "1"; every prefix of length >= 2 is
// stored with value "2" so the forward-maximum-match scan can keep extending.
public void loadWords() {
    // Lazy-load guard: build the dictionary only once, keep it in the field.
    if (dictionary == null) {
        dictionary = new TreeMap<String, String>();
        InputStream stream = null;
        InputStreamReader streamReader = null;
        BufferedReader lineReader = null;
        try {
            stream = new FileInputStream("c:/dictionary.txt");// dictionary file path
            streamReader = new InputStreamReader(stream, "UTF-8");
            lineReader = new BufferedReader(streamReader);
            for (String entry = lineReader.readLine(); entry != null; entry = lineReader.readLine()) {
                int entryLength = entry.length();
                // Lines containing '#' are comments; overly long words are skipped.
                if ((entry.indexOf("#") == -1)
                        && (entryLength <= WORD_MAX_LENGTH)) {
                    dictionary.put(entry.intern(), "1");// "1" marks a complete word
                    // Register each proper prefix (length >= 2) without
                    // overwriting entries that are already full words.
                    for (int len = entryLength - 1; len >= 2; len--) {
                        String prefix = entry.substring(0, len).intern();
                        if (!dictionary.containsKey(prefix)) {
                            dictionary.put(prefix, "2");
                        }
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close readers innermost-first; each may be null if setup failed early.
            try {
                if (lineReader != null) {
                    lineReader.close();
                }
                if (streamReader != null) {
                    streamReader.close();
                }
                if (stream != null) {
                    stream.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    System.out.println(dictionary.size());
}
下面就是next()方法
// Returns the next Token in the form (word, start, end, type), using forward
// maximum matching against the loaded dictionary; returns null at end of input.
public Token next() throws IOException {
System.out.println("load dictory");
// Load the dictionary (no-op after the first call, but these printlns still
// run on every token — NOTE(review): noisy, and "dictory" is a typo in the
// log string; cannot be changed here without altering runtime output).
loadWords();
System.out.println("load dictory over");
StringBuffer word = new StringBuffer();
while (true) {
char c;// current character
char nextChar;// look-ahead character
Character.UnicodeBlock cUnicodeBlock;// Unicode block of the current character
Character.UnicodeBlock nextCharUnicodeBlock;// Unicode block of the look-ahead character
offset++;// advance the global offset into the input
if (bufferIndex >= dataLength) {// buffer exhausted: refill from input, reset pointer
dataLength = input.read(ioBuffer);
bufferIndex = 0;
}
if (dataLength == -1) {// end of input: flush any pending word, else signal end
if (word.length() == 0) {
return null;
} else {
break;
}
}
c = ioBuffer[bufferIndex++];// consume the current character
cUnicodeBlock = Character.UnicodeBlock.of(c);// NOTE(review): of() returns null for unassigned code points -> NPE on toString() below
nextChar = ioBuffer[bufferIndex];// NOTE(review): unchecked look-ahead — when bufferIndex == dataLength this reads stale data beyond the valid region (or past the array end)
nextCharUnicodeBlock = Character.UnicodeBlock.of(nextChar);
// do the current and look-ahead characters belong to the same Unicode block?
boolean isSameUnicodeBlock = cUnicodeBlock.toString()
.equalsIgnoreCase(nextCharUnicodeBlock.toString());
// current character is a CJK ideograph
if (cUnicodeBlock == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
// mark the token as double-byte
tokenType = "double";
if (word.length() == 0) {
word.append(c);
// enhancement (original author): stop the token at a Unicode-block
// boundary. NOTE(review): word.length() != 0 is always true right
// after append — the original marked this check as redundant.
if (word.length() != 0 && (!isSameUnicodeBlock)) {
break;
}
// end enhancement
} else {
// try to extend: is (word + c) still a dictionary word or prefix?
String temp = (word.toString() + c).intern();
if (dictionary.containsKey(temp)) {
word.append(c);
// enhancement: same Unicode-block boundary check as above
if (word.length() != 0 && (!isSameUnicodeBlock)) {
break;
}
// end enhancement
} else {
// cannot extend: push the character back and emit the current word
bufferIndex--;
offset--;
break;
}
}
} else if (cUnicodeBlock == Character.UnicodeBlock.BASIC_LATIN) {
tokenType = "single";// single-byte token type
if (Character.isWhitespace(c)) {
// whitespace terminates a pending Latin word; leading whitespace is skipped
if (word.length() != 0)
break;
} else {
word.append(c);
// enhancement: Unicode-block boundary check (always-true length test, see above)
if (word.length() != 0 && (!isSameUnicodeBlock)) {
break;
}
// end enhancement
}
}
System.out.println("word="+word);
}
// build the Token: text, start = offset - length, end = offset, and type
Token token = new Token(word.toString(), offset - word.length(),
offset, tokenType);
// clear the buffer for the next call
word.setLength(0);
System.out.println(token);
return token;
}
整個while循環就是最主要的了.(END)