<?xml version="1.0" encoding="utf-8" standalone="yes"?>
apache"~10
jakarta" to be more relevant boost it using the ^ symbol along with the boost factor next to the term. You would type:
^4 apache jakarta apache"^4 " lucene"
apache" apache" OR
apache" AND " lucene"
jakarta
apache" NOT " lucene" apache"
apache" -" lucene"
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
/**
 * Incremental Lucene indexer for rows of the {@code newitem} database table.
 *
 * <p>The ID of the last indexed row is kept in a small text file. Each run reads that ID,
 * selects only the rows with a larger ID, adds them to the index and then stores the ID of
 * the last row that was indexed.
 */
public class Index {

    /**
     * Runs one incremental indexing pass and prints the ID stored afterwards.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            Index index = new Index();
            String path = "d:\\index"; // directory holding the index files
            String storeIdPath = "d:\\storeId.txt"; // file holding the last indexed ID
            String storeId = getStoreId(storeIdPath);
            ResultSet rs = index.getResult(storeId);
            try {
                index.indexBuilding(path, storeIdPath, rs);
            } finally {
                // getResult leaves the connection open so the rows can be read; release it here.
                closeResult(rs);
            }
            storeId = getStoreId(storeIdPath);
            System.out.println(storeId); // print the ID stored by this run
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Selects all rows of {@code newitem} whose id is greater than {@code storeId}, ordered by id.
     *
     * <p>The returned result set keeps its statement and connection open; the caller must close
     * them once the rows have been consumed.
     *
     * @param storeId ID of the last row that has already been indexed
     * @return the matching rows
     * @throws Exception if the driver cannot be loaded or the query fails
     */
    public ResultSet getResult(String storeId) throws Exception {
        Class.forName("com.mysql.jdbc.Driver").newInstance();
        String url = "jdbc:mysql://localhost:3306/ding";
        String userName = "root";
        String password = "ding";
        Connection conn = DriverManager.getConnection(url, userName, password);
        try {
            // Bind the ID as a parameter instead of concatenating it into the SQL text.
            PreparedStatement stmt =
                    conn.prepareStatement("select * from newitem where id > ? order by id");
            stmt.setString(1, storeId);
            return stmt.executeQuery();
        } catch (SQLException e) {
            conn.close();
            throw e;
        }
    }

    /**
     * Adds every row of {@code rs} to the index at {@code path} and records the last indexed ID.
     *
     * <p>The index is created from scratch when the ID file is empty and appended to otherwise.
     * The same approach works if the {@code ResultSet} is replaced by a {@code List}.
     *
     * @param path directory of the index
     * @param storeIdPath file in which the last indexed ID is stored
     * @param rs rows to index; must expose the columns {@code id} and {@code title}
     * @return {@code true} on success, {@code false} if any exception occurred
     */
    public boolean indexBuilding(String path, String storeIdPath, ResultSet rs) {
        try {
            Analyzer luceneAnalyzer = new StandardAnalyzer();
            // An empty ID file means nothing has been indexed yet: rebuild instead of appending.
            boolean isEmpty = isStoreIdFileEmpty(storeIdPath);
            // create == false makes the writer append to the existing index (incremental mode).
            IndexWriter writer = new IndexWriter(path, luceneAnalyzer, isEmpty);
            String storeId = "";
            boolean indexFlag = false;
            try {
                while (rs.next()) {
                    String id = rs.getString("id");
                    String title = rs.getString("title");
                    writer.addDocument(Document(id, title));
                    // Remember the ID of the last row added; simplistic, but enough for the demo.
                    storeId = id;
                    indexFlag = true;
                }
                writer.optimize();
            } finally {
                // Always close the writer so the index is not left locked after a failure.
                writer.close();
                if (indexFlag) {
                    // Persist the last ID that made it into the index, even after a partial
                    // run, so those rows are not indexed a second time.
                    writeStoreId(storeIdPath, storeId);
                }
            }
            return true;
        } catch (Exception e) {
            e.printStackTrace();
            // NOTE(review): these literals look damaged by HTML extraction; kept unchanged.
            System.out.println("出错?/span>" + e.getClass() + "\n 错误信息? "
                    + e.getMessage());
            return false;
        }
    }

    /**
     * Builds the Lucene document for one row.
     *
     * @param id value stored and tokenized in the {@code ID} field
     * @param title value stored and tokenized in the {@code TITLE} field
     * @return the new document
     */
    public static Document Document(String id, String title) {
        Document doc = new Document();
        doc.add(new Field("ID", id, Field.Store.YES, Field.Index.TOKENIZED));
        doc.add(new Field("TITLE", title, Field.Store.YES, Field.Index.TOKENIZED));
        return doc;
    }

    /**
     * Reads the stored ID from disk, creating the file if it does not exist.
     *
     * @param path file holding the ID
     * @return the first line of the file, or {@code "0"} if the file is empty or unreadable
     */
    public static String getStoreId(String path) {
        String storeId = "0";
        BufferedReader br = null;
        try {
            File file = new File(path);
            if (!file.exists()) {
                file.createNewFile();
            }
            br = new BufferedReader(new FileReader(path));
            String line = br.readLine();
            // Compare by content; the previous "== \"\"" check compared references only.
            if (line != null && line.length() > 0) {
                storeId = line;
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(br);
        }
        return storeId;
    }

    /**
     * Writes the ID to disk, replacing any previous content of the file.
     *
     * @param path file holding the ID
     * @param storeId ID to store
     * @return {@code true} if the ID was written, {@code false} if an I/O error occurred
     */
    public static boolean writeStoreId(String path, String storeId) {
        boolean written = false;
        FileWriter fw = null;
        try {
            // Write through FileWriter directly: PrintWriter hides I/O errors, which made the
            // method report success even when nothing was written.
            fw = new FileWriter(path);
            fw.write(storeId);
            fw.flush();
            written = true;
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(fw);
        }
        return written;
    }

    /** Returns whether the ID file has no first line, creating the file if it is missing. */
    private static boolean isStoreIdFileEmpty(String storeIdPath) {
        boolean isEmpty = true;
        BufferedReader br = null;
        try {
            File file = new File(storeIdPath);
            if (!file.exists()) {
                file.createNewFile();
            }
            br = new BufferedReader(new FileReader(storeIdPath));
            if (br.readLine() != null) {
                isEmpty = false;
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(br);
        }
        return isEmpty;
    }

    /** Closes the result set together with the statement and connection that produced it. */
    private static void closeResult(ResultSet rs) {
        if (rs == null) {
            return;
        }
        try {
            Statement stmt = rs.getStatement();
            Connection conn = (stmt == null) ? null : stmt.getConnection();
            rs.close();
            if (stmt != null) {
                stmt.close();
            }
            if (conn != null) {
                conn.close();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Closes the resource if it is non-null, reporting but not propagating close failures. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
这里代码写的比较简单，很多需要改进的地方，自己改进就行了，这里只是说明了增量索引的原理，望指正。<br />
]]>
public static void main(String[] args) throws Exception {
/* 指明要烦(ch)引文件夹的位|?q里是d盘的s文g夹下 */
File fileDir = new File("d:\\s");
/* q里攄(ch)引文件的位置 */
File indexDir = new File("d:\\index");
Analyzer luceneAnalyzer = new StandardAnalyzer();
IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer,
true);
File[] textFiles = fileDir.listFiles();
long startTime = new Date().getTime();
//增加document到烦(ch)引去
System.out.println("File正在被烦(ch)?img src="http://www.tkk7.com/Images/dot.gif" alt="" />.");
/*
* 注意要变的就是这里,路径和读取文件的Ҏ(gu)
* */
String path ="d:\\s\\2.doc";
String temp = ReadFile.readWord(path);
// String path ="d:\\s\\index.htm";
// String temp = ReadFile.readHtml(path);
Document document = new Document();
Field FieldPath = new Field("path",path,
Field.Store.YES, Field.Index.NO);
Field FieldBody = new Field("body", temp, Field.Store.YES,
Field.Index.TOKENIZED,
Field.TermVector.WITH_POSITIONS_OFFSETS);
document.add(FieldPath);
document.add(FieldBody);
indexWriter.addDocument(document);
//optimize()Ҏ(gu)是对索引q行优化
indexWriter.optimize();
indexWriter.close();
//试一下烦(ch)引的旉
long endTime = new Date().getTime();
System.out
.println("q花费了(jin)"
+ (endTime - startTime)
+ " 毫秒来把文档增加到烦(ch)引里面去!"
+ fileDir.getPath());
}
}
上面已经注释了要换的地方，我们要做的是换文件的路径和读取文件的方法。</span>
下面来具体看下读取文件的方法
1.首先来看WORD文档：</span>
我这里用的是poi，相关jar包自己去下载，然后加到工程中（以下所要用的jar包也是，不再重复说）。<br />
来看相关代码：<br />
// Reads a Word document with POI and returns the concatenated text of all its paragraphs,
// trimmed. Returns whatever was read so far (possibly "") if reading fails.
StringBuffer content = new StringBuffer("");// document content
FileInputStream in = null;
try {
    in = new FileInputStream(path);
    HWPFDocument doc = new HWPFDocument(in);
    Range range = doc.getRange();
    int paragraphCount = range.numParagraphs();// number of paragraphs
    for (int i = 0; i < paragraphCount; i++) {// append the text of every paragraph
        Paragraph pp = range.getParagraph(i);
        content.append(pp.text());
    }
} catch (Exception e) {
    // Report the failure instead of swallowing it silently, like the other readers do.
    e.printStackTrace();
} finally {
    // The stream was previously never closed.
    if (in != null) {
        try {
            in.close();
        } catch (java.io.IOException ignored) {
            // nothing useful can be done if closing fails
        }
    }
}
return content.toString().trim();
}
2.PDF文件用的是PDFbox：<br />
// Extracts the text of a PDF file with PDFBox and returns it trimmed.
StringBuffer content = new StringBuffer("");// document content
FileInputStream fis = new FileInputStream(path);
try {
    PDFParser p = new PDFParser(fis);
    p.parse();
    PDFTextStripper ts = new PDFTextStripper();
    // NOTE(review): the document returned by getPDDocument() is never closed — confirm
    // against the PDFBox version in use whether it needs an explicit close().
    content.append(ts.getText(p.getPDDocument()));
} finally {
    // Close the stream even when parsing or text extraction throws.
    fis.close();
}
return content.toString().trim();
}
3.html文件：<br />
// Reads an HTML file as UTF-8 text and returns its lines joined with "\n".
// Returns whatever was read so far (possibly "") if reading fails.
StringBuffer content = new StringBuffer("");
File file = new File(urlString);
FileInputStream fis = null;
BufferedReader reader = null;
try {
    fis = new FileInputStream(file);
    // Read the page. The charset must match the one declared in the HTML header,
    // otherwise the text comes out garbled.
    reader = new BufferedReader(new InputStreamReader(fis, "utf-8"));
    String line = null;
    while ((line = reader.readLine()) != null) {
        content.append(line).append("\n");
    }
} catch (Exception e) {
    e.printStackTrace();
} finally {
    // Release the file even when reading fails; closing the reader also closes the stream.
    try {
        if (reader != null) {
            reader.close();
        } else if (fis != null) {
            fis.close();
        }
    } catch (java.io.IOException ignored) {
        // nothing useful can be done if closing fails
    }
}
String contentString = content.toString();
return contentString;
}
4.txt文件：</span>
// Reads a plain-text file line by line and returns its content trimmed.
// Returns whatever was read so far (possibly "") if reading fails.
StringBuffer content = new StringBuffer("");// document content
BufferedReader br = null;
try {
    br = new BufferedReader(new FileReader(path));
    String s1 = null;
    while ((s1 = br.readLine()) != null) {
        // NOTE(review): lines are rejoined with "\r" rather than "\n" — confirm this is intended.
        content.append(s1).append("\r");
    }
} catch (IOException e) {
    e.printStackTrace();
} finally {
    // Close the reader (and the underlying FileReader) even when reading fails.
    if (br != null) {
        try {
            br.close();
        } catch (IOException ignored) {
            // nothing useful can be done if closing fails
        }
    }
}
return content.toString().trim();
}
接下来是搜索代码：</span>
/**
 * Searches the {@code body} field of the index in {@code d:\index} for a fixed query string
 * and prints the stored body of every hit.
 *
 * @param args unused
 * @throws IOException if the index cannot be opened or read
 * @throws ParseException declared for compatibility; parse failures are reported on stdout
 */
public static void main(String[] args) throws IOException, ParseException {
    // Text to search for; change as needed.
    String queryString = "Ҏ(gu)国务院的军_";
    // Must point at the directory where the index was written.
    IndexSearcher searcher = new IndexSearcher("d:\\index");
    try {
        Analyzer analyzer = new StandardAnalyzer();
        Query query;
        try {
            /*
             * The content was indexed under the field name "body"
             * (new Field("body", temp, ...)), so the search must target the same
             * field: the name passed to QueryParser has to match it.
             */
            QueryParser qp = new QueryParser("body", analyzer);
            query = qp.parse(queryString);
        } catch (ParseException e) {
            System.out.println("异常");
            // Without a parsed query there is nothing to search for; previously the code
            // went on to call search(null).
            return;
        }
        Hits hits = searcher.search(query);
        if (hits.length() > 0) {
            System.out.println("扑ֈ:" + hits.length() + " 个结?");
            for (int i = 0; i < hits.length(); i++) {// print every hit
                Document document = hits.doc(i);
                // document.get("body") returns the full stored body text;
                // use document.get("path") to print the file path instead.
                System.out.println("contentsQ?/span>" + document.get("body"));
            }
        } else {
            System.out.println("0个结?");
        }
    } finally {
        // Release the index files held by the searcher.
        searcher.close();
    }
}
]]>
]]>
Apache LuceneJavaLuceneLucenAPI LuceneapacheLucene2. LuceneLuceneLuceneLucenewebWordHTMLPDFLuceneLucene+George +Rice –eat –pudding, Apple –pie +Tiger, animal:monkey AND food:bananaLuceneemailWiki……3. Lucene1Lucene82Lucene4Token5Lucene6搜烦(ch)q程优化?/span>LuceneDocument100TopDocsID7Lucene4. Analyzerof“the(1) "text"test"te?t
0test, tests testertest*
te*t
*?(2) LuceneLevenshtein DistanceEdit Distance"~""roam"roam~
foamroamsboost factor0.2.
(3) LuceneAND, "+", OR, NOT "-"(4) Lucene+ - && || ! ( ) { } [ ] ^ " ~ * ? : "
",(1+1):2"(1"+1")":2
5. (1) OR AND TOlucene(2) (3) tmplock(4) luceneyyMMddHHmmssyy-MM-dd HH:mm:sslucene(5) lucenedisk(6) lucenelucene(7) jiangxi strong jiangstronjiangxistrong
]]>
在eclipse下建立web工程luceneTest
jar包加载到你的web工程里面
新建类Index.java,代码如下：</span>
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
/*
* Create Date:2007-10-26 下午02:52:53
*
* Author:dingkm
*
* Version: V1.0
*
* DescriptionQ对q行修改的功能进行描q?br />
*
*
*/
public class Index {
/**
* @Description Ҏ(gu)实现功能描述
* @param args
* void
* @throws 抛出异常说明
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
try {
new Index().index();
System.out.println("create index success!!!");
} catch (CorruptIndexException e) {
e.printStackTrace();
} catch (LockObtainFailedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public void index() throws CorruptIndexException, LockObtainFailedException, IOException{
long start = System.currentTimeMillis();
// 建立索引的\?br />
String path = "c:\\index2";
Document doc1 = new Document();
doc1.add( new Field("name", "中华人民共和?,Field.Store.YES,Field.Index.TOKENIZED));
doc1.add( new Field("content", "标题或正文包?,Field.Store.YES,Field.Index.TOKENIZED));
doc1.add( new Field("time", "20080715",Field.Store.YES,Field.Index.TOKENIZED));
Document doc2 = new Document();
doc2.add(new Field("name", "大中国中?,Field.Store.YES,Field.Index.TOKENIZED));
IndexWriter writer = new IndexWriter(FSDirectory.getDirectory(path, true), new StandardAnalyzer(), true);
writer.setMaxMergeDocs(10);
writer.setMaxFieldLength(3);
writer.addDocument(doc1);
writer.setMaxFieldLength(3);
writer.addDocument(doc2);
writer.close();
System.out.println("=========================");
System.out.print(System.currentTimeMillis() - start);
System.out.println("total milliseconds");
System.out.println("=========================");
}
}
执行这个类，可以看到结果：<br />
=========================
375total milliseconds
=========================
create index success!!!
可以看到索引创建成功。<br />
下面我们来创建搜索类：Search.java
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
/*
* Create Date:2007-10-26 下午02:56:12
*
* Author:dingkm
*
* Version: V1.0
*
* DescriptionQ对q行修改的功能进行描q?
*
*
*/
public class Search {
/**
* @Description Ҏ(gu)实现功能描述
* @param args
* void
* @throws 抛出异常说明
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
String path = "c:\\index2";
try {
new Search().search(path);
} catch (CorruptIndexException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (ParseException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public void search(String path) throws CorruptIndexException, IOException, ParseException{
IndexSearcher searcher = new IndexSearcher(path);
Hits hits = null;
Query query = null;
QueryParser qp = new QueryParser("name",new StandardAnalyzer());
query = qp.parse("?);
hits = searcher.search(query);
java.text.NumberFormat format = java.text.NumberFormat.getNumberInstance();
System.out.println("查找到共" + hits.length() + "个结?);
for (int i = 0; i < hits.length(); i++) {
//开始输出查询结?nbsp;
Document doc = hits.doc(i);
System.out.println(doc.get("name"));
System.out.println("content="+doc.get("content"));
System.out.println("time="+doc.get("time"));
System.out.println("准确度ؓ(f)Q? + format.format(hits.score(i) * 100.0) + "%");
// System.out.println(doc.get("CONTENT"));
}
}
}
执行它，会得到以下结果：
查找到共2个结?br />
中华人民共和?br />
content=标题或正文包?br />
time=20080715
准确度ؓ(f)Q?9.727%
大中国中?br />
content=null
time=null
准确度ؓ(f)Q?9.727%
这样就完成了我们的程序<br />
这是我第一次发表文章<br />
说的比较简单，可能很多地方说的不清楚<br />
希望大家多多支持
有什么不明白的欢迎留言。</span>