Many of the articles online about Lucene tokenization only scratch the surface, and their code is dated. To understand the tokenization process thoroughly and systematically, this post builds a custom analyzer and walks through it; the actual word segmentation is done through the ICTCLAS4j API. The overall structure is the chain ICTCLAS4jAnalyzer → ICTCLAS4jTokenizer → ICTCLAS4jFilter → LowerCaseFilter → StopFilter.

To implement the custom ICTCLAS4jAnalyzer, extend Analyzer and override createComponents. Straight to the code: as you can see, it is copied directly from StandardAnalyzer; swapping in the ICTCLAS4j-backed ICTCLAS4jTokenizer is all it takes.
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    // Replace StandardTokenizer with the ICTCLAS4j-backed tokenizer
    final ICTCLAS4jTokenizer src = new ICTCLAS4jTokenizer(reader);
    // src.setMaxTokenLength(maxTokenLength);
    TokenStream tok = new ICTCLAS4jFilter(matchVersion, src);
    tok = new LowerCaseFilter(matchVersion, tok);            // lower-case Latin tokens
    tok = new StopFilter(matchVersion, tok, STOP_WORDS_SET); // drop stop words
    return new TokenStreamComponents(src, tok) {
        @Override
        protected void setReader(final Reader reader) throws IOException {
            // src.setMaxTokenLength(ICTCLAS4jAnalyzer.this.maxTokenLength);
            super.setReader(reader);
        }
    };
}
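Only createComponents is shown above. Since the class is lifted from StandardAnalyzer, the surrounding skeleton presumably looks roughly like the sketch below; it assumes the same StopwordAnalyzerBase base class and stop-word set as StandardAnalyzer, and omits the anonymous setReader override for brevity.

package com.zhy.analysis.ictclas4j;

import java.io.Reader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;

// Sketch of the class around createComponents, modeled on StandardAnalyzer.
public final class ICTCLAS4jAnalyzer extends StopwordAnalyzerBase {

    // Assumption: reuse StandardAnalyzer's English stop-word set.
    public static final CharArraySet STOP_WORDS_SET = StandardAnalyzer.STOP_WORDS_SET;

    public ICTCLAS4jAnalyzer(Version matchVersion) {
        super(matchVersion, STOP_WORDS_SET);
    }

    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
        final ICTCLAS4jTokenizer src = new ICTCLAS4jTokenizer(reader);
        TokenStream tok = new ICTCLAS4jFilter(matchVersion, src);
        tok = new LowerCaseFilter(matchVersion, tok);
        tok = new StopFilter(matchVersion, tok, STOP_WORDS_SET);
        return new TokenStreamComponents(src, tok);
    }
}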
ICTCLAS4jTokenizer must override incrementToken and declare a CharTermAttribute (holding the term text) and an OffsetAttribute (holding the term's offsets). The constructor reads in the string to be segmented, ICTCLAS4j returns the word list, and incrementToken then steps through that list to emit tokens one at a time. The code is as follows:
package com.zhy.analysis.ictclas4j;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.ArrayList;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.ictclas4j.bean.SegResult;
import org.ictclas4j.segment.SegTag;

/**
 * @author brockhong
 */
public class ICTCLAS4jTokenizer extends Tokenizer {

    private static SegTag segment; // shared ICTCLAS4j segmenter, initialized once

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    private StringBuilder sb = new StringBuilder();            // full input text
    private ArrayList<String> words = new ArrayList<String>(); // segmentation result
    private int startOffset = 0; // start offset of the current word
    private int wordIdx = 0;     // index of the next word to emit

    public ICTCLAS4jTokenizer(Reader input) {
        super(input);
        // Read the entire input up front; ICTCLAS4j segments whole strings.
        // Note: consuming the reader in the constructor means this Tokenizer
        // does not support Analyzer reuse via setReader()/reset().
        char[] buf = new char[8192];
        int d = -1;
        try {
            while ((d = input.read(buf)) != -1) {
                sb.append(buf, 0, d);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        SegResult sr = seg().split(sb.toString());
        words = sr.getWords();
    }

    private static SegTag seg() {
        try {
            if (segment == null) {
                // Dictionary and context files shipped with ICTCLAS4j
                final InputStream coreDictIn = new FileInputStream("data/coreDict.dct");
                final InputStream bigramDictIn = new FileInputStream("data/BigramDict.dct");
                final InputStream personTaggerDctIn = new FileInputStream("data/nr.dct");
                final InputStream personTaggerCtxIn = new FileInputStream("data/nr.ctx");
                final InputStream transPersonTaggerDctIn = new FileInputStream("data/tr.dct");
                final InputStream transPersonTaggerCtxIn = new FileInputStream("data/tr.ctx");
                final InputStream placeTaggerDctIn = new FileInputStream("data/ns.dct");
                final InputStream placeTaggerCtxIn = new FileInputStream("data/ns.ctx");
                final InputStream lexTaggerCtxIn = new FileInputStream("data/lexical.ctx");
                segment = new SegTag(1, coreDictIn, bigramDictIn,
                        personTaggerDctIn, personTaggerCtxIn,
                        transPersonTaggerDctIn, transPersonTaggerCtxIn,
                        placeTaggerDctIn, placeTaggerCtxIn, lexTaggerCtxIn);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return segment;
    }

    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        if (wordIdx < words.size()) {
            String word = words.get(wordIdx);
            int length = word.length();
            termAtt.copyBuffer(word.toCharArray(), 0, length);
            // The end offset must advance by the word length; the original
            // code left length at 0, so every token reported a zero-width span.
            offsetAtt.setOffset(correctOffset(startOffset),
                    correctOffset(startOffset + length));
            wordIdx++;
            startOffset += length;
            return true;
        }
        return false;
    }

    @Override
    public void end() throws IOException {
        super.end();
        // Report the final offset after the last token, as the contract requires.
        int finalOffset = correctOffset(sb.length());
        offsetAtt.setOffset(finalOffset, finalOffset);
    }
}
ICTCLAS4jFilter, the token filter, simply reuses StandardAnalyzer's filter as our custom filter (a sketch of what that amounts to follows).
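The post doesn't show the filter's source. Since StandardFilter in Lucene 4.x is effectively a pass-through for current match versions, a minimal sketch of an equivalent ICTCLAS4jFilter might look like this (the body is an assumption, not the author's actual code):

package com.zhy.analysis.ictclas4j;

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;

// Minimal sketch: like StandardFilter on modern match versions, it simply
// forwards tokens unchanged; per-token normalization could be added here later.
public class ICTCLAS4jFilter extends TokenFilter {

    public ICTCLAS4jFilter(Version matchVersion, TokenStream in) {
        super(in);
    }

    @Override
    public final boolean incrementToken() throws IOException {
        return input.incrementToken();
    }
}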
The ICTCLAS4j modifications themselves were adapted from code found online: SegTag's outputResult was changed so that the segmented words are also collected into a list (which is where SegResult.getWords() above comes from), and a bug was fixed where ICTCLAS4j threw an error when segmentation produced no result.
A test class for the analyzer is attached below:
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

import com.zhy.analysis.ictclas4j.ICTCLAS4jAnalyzer;

/**
 * @author brockhong
 */
public class Ictclas4janalyzer {

    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new ICTCLAS4jAnalyzer(Version.LUCENE_45);
        Reader r = new StringReader("張萌萌是勤奮地漂亮的姑娘,/用一塊錢打造經(jīng)濟的航空領域中的航空母艦地點在深圳。ABCD.#$% Hello World!\n又一段文本123輛 !3.0");
        TokenStream ts = analyzer.tokenStream("fff", r);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset(); // must reset before the first incrementToken()
        while (ts.incrementToken()) {
            System.out.println(term.toString());
        }
        ts.end();
        ts.close();
    }
}
And a Lucene index-writing test class:
import java.io.File;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import com.zhy.analysis.ictclas4j.ICTCLAS4jAnalyzer;

/** @author brockhong */
public class Testictclas4j {

    public static void main(String[] args) throws Exception {
        // Open the index directory (FSDirectory is one of several Directory implementations)
        Directory d = FSDirectory.open(new File("D:/luceneTest2"));
        // Use our custom analyzer rather than StandardAnalyzer
        // (which would split Chinese sentences into single characters)
        Analyzer analyzer = new ICTCLAS4jAnalyzer(Version.LUCENE_45);
        // Configure the index writer
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, analyzer);
        IndexWriter indexwriter = new IndexWriter(d, config);

        Document doc = new Document();
        doc.add(new StringField("id", "1", Store.YES));
        doc.add(new StringField("name", "brockhong", Store.YES));
        doc.add(new TextField("content",
                "張萌萌是勤奮地漂亮的姑娘,/用一塊錢打造經(jīng)濟的航空領域中的航空母艦地點在深圳。ABCD.#$% Hello World!\n又一段文本123輛 !3.0", Store.YES));

        // Add the document and commit
        indexwriter.addDocument(doc);
        indexwriter.commit();
        indexwriter.close(); // release the index lock
    }
}
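To check the index end to end, a query-side test can follow the same pattern. This class is not from the original post; it is a sketch that assumes the same index directory and field names as above and the classic QueryParser from the lucene-queryparser module. Note that the same analyzer is used at query time, so query terms get segmented exactly like the indexed text.

import java.io.File;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import com.zhy.analysis.ictclas4j.ICTCLAS4jAnalyzer;

// Hypothetical query test; class name and query string are illustrative.
public class Searchictclas4j {

    public static void main(String[] args) throws Exception {
        Directory d = FSDirectory.open(new File("D:/luceneTest2"));
        DirectoryReader reader = DirectoryReader.open(d);
        IndexSearcher searcher = new IndexSearcher(reader);

        // Same analyzer as at index time, so the query is segmented identically
        Analyzer analyzer = new ICTCLAS4jAnalyzer(Version.LUCENE_45);
        QueryParser parser = new QueryParser(Version.LUCENE_45, "content", analyzer);
        Query query = parser.parse("航空母艦");

        for (ScoreDoc sd : searcher.search(query, 10).scoreDocs) {
            Document doc = searcher.doc(sd.doc);
            System.out.println(doc.get("id") + " : " + doc.get("content"));
        }
        reader.close();
    }
}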

Jar download: /Files/brock/ictclas4j.7z