最近要做一個站內的全文檢索功能,主要是針對clob字段的,于是去網上找了點lucene的資料。現在新版本是2.0.0,網上的例子多是1.4.3的,有些方法已經廢棄了,搞了很久終于把2.0.0的功能實現了。下面把實現的代碼貼出來,實現了索引的創建、檢索和刪除功能,并可以根據檢索結果去查詢數據庫。
// 創建索引

????public?void?indexFiles()?
{
????????//?創建索引文件存放路徑
????????File?indexDir?=?new?File("E:\\lucene_Learning\\lucene-2.0.0src\\src\\demo\\index");


????????try?
{
????????????Date?start?=?new?Date();
????????????//?創建分析器,主要用于從文本中抽取那些需要建立索引的內容,把不需要參與建索引的文本內容去掉.
????????????//?比如去掉一些a?the之類的常用詞,還有決定是否大小寫敏感.
????????????StandardAnalyzer?standardAnalyzer?=?new?StandardAnalyzer();
????????????//?參數true用于確定是否覆蓋原有索引的
????????????IndexWriter?indexWriter?=?new?IndexWriter(indexDir,?standardAnalyzer,?true);
????????????indexWriter.setMergeFactor(100);
????????????indexWriter.setMaxBufferedDocs(100);
????????????//?只索引這個Field的前5000個字,默認為10000
????????????indexWriter.setMaxFieldLength(5000);
????????????//?從數據庫取出所有紀錄
????????????List?articleList?=?articleManager.getArticles(null);

????????????for?(int?i?=?0;?i?<?articleList.size();?i++)?
{
????????????????Article?article?=?(Article)?articleList.get(i);
????????????????//?在Document方法是創建索引的具體代碼
????????????????Document?doc?=?Document(article);
????????????????indexWriter.addDocument(doc);
????????????}
????????????//?Optimize的過程就是要減少剩下的Segment的數量,盡量讓它們處于一個文件中.
????????????indexWriter.optimize();
????????????indexWriter.close();
????????????Date?end?=?new?Date();
????????????System.out.println("create?index:?"?+?(end.getTime()?-?start.getTime())?+?"?total?milliseconds");

????????}?catch?(IOException?e)?
{
????????????System.out.println("?caught?a?"?+?e.getClass()?+?"\n?with?message:?"?+?e.getMessage());
????????}
????}
????public?static?Document?Document(Article?article)

????????????throws?java.io.IOException?
{
????????Document?doc?=?new?Document();
????????//?為article表的主健創建索引,關于Field的幾個參數下面有詳細解釋
????????Field?fieldId?=?new?Field("uid",?article.getArticleId(),?Field.Store.YES,?Field.Index.UN_TOKENIZED,?Field.TermVector.YES);
????????//?為detail字段創建索引,detail在DB中是clob字段,內容為html文本
????????String?contentHtml?=?article.getDetail();
????????Reader?read?=?new?StringReader(contentHtml);
????????//?用HTMLParser把detail字段中的HTML分析成文本在索引
????????//?HTMLParser這個類可以在lucene的demo中找到
????????HTMLParser?htmlParser?=?new?HTMLParser(read);
????????BufferedReader?breader?=?new?BufferedReader(htmlParser.getReader());
????????String?htmlContent?="";
????????String?tempContent?=?breader.readLine();

????????while?(tempContent?!=?null?&&?tempContent.length()?>?0)?
{
????????????htmlContent?=?htmlContent?+?tempContent;
????????????tempContent?=?breader.readLine();
????????}
????????Field?fieldContents?=?new?Field("content",?htmlContent,
????????????????Field.Store.COMPRESS,?Field.Index.TOKENIZED,Field.TermVector.YES);
????????//?db中的每條紀錄對應一個doc,每個字段對應一個field
????????doc.add(fieldId);
????????doc.add(fieldContents);
????????return?doc;
????}
// 搜索文件,keyword是你在頁面上輸入的查找關鍵字,這裡查找的是detail字段

????public?List?searchFiles(String?keyword)
{
????????String?index?=?"E:\\lucene_Learning\\lucene-2.0.0src\\src\\demo\\index";
????????//?hitsList用來保存db的紀錄,這些紀錄可以通過查詢結果取到
????????List?hitsList?=?new?ArrayList();

????????try?
{
????????????Date?start?=?new?Date();
????????????IndexReader?reader?=?IndexReader.open(index);
????????????Searcher?searcher?=?new?IndexSearcher(reader);
????????????Analyzer?analyzer?=?new?StandardAnalyzer();
????????????QueryParser?parser?=?new?QueryParser("content",?analyzer);
????????????//?解析查詢關鍵字,比如輸入的是以空格等分開的多個查詢關鍵字,這里解析后,可以多條件查詢
????????????Query?query?=?parser.parse(keyword);
????????????//?hits用來保存查詢結果,這里的hits相當于sql中的result
????????????Hits?hits?=?searcher.search(query);

????????????for?(int?i?=?0;?i?<?hits.length();?i++)?
{
????????????????Document?doc?=?hits.doc(i);
????????????????//?獲得article表的主健
????????????????String?id?=?doc.get("uid");
????????????????//?根據主健去db中取紀錄,返回到hitsList中

????????????????try?
{
????????????????????Article?article?=?articleManager.getArticle(id);

????????????????}?catch?(ObjectRetrievalFailureException?e)?
{
????????????????????article?=?null;
????????????????}
??????????????????????//?如果沒有找到該紀錄,表示該紀錄已經不存在,不必添加到hitsList中
????????????????if(article!=null)?hitsList.add(article);
????????????}
????????????searcher.close();
????????????reader.close();
????????????Date?end?=?new?Date();
????????????System.out.println("search?files:?"?+?(end.getTime()?-?start.getTime())?+?"?total?milliseconds");

????????}?catch?(IOException?e)?
{
????????????System.out.println("?caught?a?"?+?e.getClass()?+?"\n?with?message:?"?+?e.getMessage());

????????}?catch?(ParseException?e)?
{
????????????System.out.println("?caught?a?"?+?e.getClass()?+?"\n?with?message:?"?+?e.getMessage());
????????}
????????return?hitsList;

// 刪除索引

????public?void?deleteIndex()
{
????????String?index?=?"E:\\lucene_Learning\\lucene-2.0.0src\\src\\demo\\index";

????????try?
{
????????????Date?start?=?new?Date();
????????????IndexReader?reader?=?IndexReader.open(index);
????????????int?numFiles?=?reader.numDocs();

????????????for?(int?i?=?0;?i?<?numFiles;?i++)?
{
????????????????//?這里的刪除只是給文檔做一個刪除標記,你可以看到執行deleteDocument后會產生一個del后綴的文件,
????????????????//?用來記錄這些標記過的文件
????????????????reader.deleteDocument(i);
????????????}
????????????reader.close();
????????????Date?end?=?new?Date();
????????????System.out.println("delete?index:?"?+?(end.getTime()?-?start.getTime())?+?"?total?milliseconds");

????????}?catch?(IOException?e)?
{
????????????System.out.println("?caught?a?"?+?e.getClass()?+?"\n?with?message:?"?+?e.getMessage());
????????}

????}
// 恢復已刪除的索引

????public?void?unDeleteIndex()
{
????????String?index?=?"E:\\lucene_Learning\\lucene-2.0.0src\\src\\demo\\index";

????????try?
{
????????????IndexReader?reader?=?IndexReader.open(index);
????????????reader.undeleteAll();
????????????reader.close();

????????}?catch?(IOException?e)?
{
????????????System.out.println("?caught?a?"?+?e.getClass()?+?"\n?with?message:?"?+?e.getMessage());
????????}

}??? Field就像我們學過的數據庫中的字段,簡單的說,就是一個名值對。這個域有三種屬性,分別是isStored - 是否被存儲
isIndexed - 是否被索引
isTokenized - 是否分詞

這些屬性的組合又構成了四種不同類型的Field,而且各有用途:

類型            Stored  Indexed  Tokenized
Keyword         Y       Y        N
UnIndexed       Y       N        N
UnStored        N       Y        Y
Text: String    Y       Y        Y
Text: Reader    N       Y        Y
關于Field,2.0.0版本和1.4.3版本方法相比改動比較大,具體見下表。1.4.3版本中的下面方法都被Field(String name, String value, Store store, Index index, TermVector termVector)取代:

Keyword(String name, String value) // only version 1.4.3
存儲、索引、不分詞,用于URI(比如MSN聊天記錄的日期域、比如MP3文件的文件全路徑等等)
Field(String name, String value, Field.Store.YES, Field.Index.UN_TOKENIZED) // version 2.0.0

UnIndexed(String name, String value) // only version 1.4.3
存儲、不索引、不分詞,比如文件的全路徑
Field(String name, String value, Field.Store.YES, Field.Index.NO) // version 2.0.0

UnStored(String name, String value) // only version 1.4.3
不存儲、索引、分詞,比如HTML的正文、Word的內容等等,這部分內容是要被索引的,但是由于具體內容通常很大,沒有必要再進行存儲,可以到時候根據URI再來挖取。所以,這部分只分詞、索引,而不存儲。
Field(String name, String value, Field.Store.NO, Field.Index.TOKENIZED) // version 2.0.0

Text(String name, String value) // only version 1.4.3
存儲、索引、分詞,比如文件的各種屬性,比如MP3文件的歌手、專輯等等。
Field(String name, String value, Field.Store.YES, Field.Index.TOKENIZED) // version 2.0.0

Text(String name, Reader value) // only version 1.4.3
Field(String name, Reader reader) // version 2.0.0
不存儲、索引、分詞。
摘自:
http://hi.baidu.com/nju918
http://hi.baidu.com/nju918/blog/item/3970aaec40ad5c2763d09f0a.html