package org.itat.index;
import java.io.File;import java.io.IOException;import java.text.ParseException;import java.text.SimpleDateFormat;import java.util.Date;import java.util.HashMap;import java.util.Map;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.NumericField;import org.apache.lucene.index.CorruptIndexException;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.index.Term;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.search.TermQuery;import org.apache.lucene.search.TopDocs;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.store.LockObtainFailedException;import org.apache.lucene.store.RAMDirectory;import org.apache.lucene.util.Version;public class CreateIndex { private static IndexWriter writer = null; private static String[] ids = { "1", "2", "3", "4", "5", "6" }; private static String[] emails = { "aa@itat.org", "bb@itat.org", "cc@cc.org", "dd@sina.org", "ee@zttc.edu", "ff@itat.org" }; private static String[] contents = { "welcome to visited the space,I like book", "hello boy, I like pingpeng ball", "my name is cc I like game", "I like football", "I like football and I like basketball too", "I like movie and swim" }; private static Date[] dates = null; private static int[] attachs = { 2, 3, 1, 4, 5, 5 }; private static String[] names = { "zhangsan", "lisi", "john", "jetty", "mike", "jake" }; private static Directory directory = null; private static Map<String, Float> scores = new HashMap<String, Float>(); private static IndexReader reader = null; static { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); try { // 初始化date数组 dates = new Date[ids.length]; dates[0] = sdf.parse("2010-02-19"); dates[1] = sdf.parse("2012-01-11"); dates[2] = sdf.parse("2011-09-19"); dates[3] = sdf.parse("2010-12-22"); dates[4] = sdf.parse("2012-01-01"); dates[5] = sdf.parse("2011-05-19"); // 设置搜索邮箱名的优先级 scores.put("itat.org", 2.0f); scores.put("zttc.edu", 1.5f); directory = FSDirectory.open(new File("d:/lucene/index02"));//打开磁盘作为索引的存储 //directory = new RAMDirectory();// 索引建在内存中 } catch (ParseException e) { e.printStackTrace(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } public CreateIndex() throws Exception, Exception{//利用索引器创建索引 //实例化了一个索引器 //writer= new IndexWriter("D:/index/", new PanGuAnalyzer(), true); //第一个参数path;第二个参数分词器;第三个参数 create:它是一个boolean型变量,如果为true, //表示要重写指定的存放索引目录下的索引文件;如果为false,表示在指定存放索引目录下已经存在的 //索引文件的基础上,向其中继续追加新的索引文件。 writer = new IndexWriter(directory, new IndexWriterConfig( Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35))); Document doc = null; for(int i=0;i<ids.length;i++) { doc = new Document(); //ANALYZED进行分词和索引,适用于标题、内容等不适合精确操作 doc.add(new Field("id",ids[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS)); //进行分词但是不存储norms信息,这个norms中包括了创建索引的时间和权值等信息 doc.add(new Field("email",emails[i],Field.Store.YES,Field.Index.NOT_ANALYZED)); doc.add(new Field("email","test"+i+"@test.com",Field.Store.YES,Field.Index.NOT_ANALYZED)); doc.add(new Field("content",contents[i],Field.Store.NO,Field.Index.ANALYZED)); doc.add(new Field("name",names[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS)); //存储数字在索引中 doc.add(new NumericField("attach",Field.Store.YES,true).setIntValue(attachs[i])); //存储日期在索引中 doc.add(new NumericField("date",Field.Store.YES,true).setLongValue(dates[i].getTime())); String et = emails[i].substring(emails[i].lastIndexOf("@")+1); System.out.println(et); if(scores.containsKey(et)) { doc.setBoost(scores.get(et));//设置搜索的优先级 } else { doc.setBoost(0.5f); } writer.addDocument(doc); writer.optimize(); //添加完所有document,我们对索引进行优化, //优化主要是将多个索引文件合并到一个,有利于提高索引速度。 } writer.close();//一定要关闭,不然会出现Lock obtain timed out: NativeFSLock@D:\lucene\index02\write.lock } public void undelete() throws Exception, Exception{ /*lucene中文档的删除是推迟到IndexReader实例关闭的时候才进行的,所以lucene允许在文档被标记删除但还 * 没有执行最后的删除操作之前,恢复被标记为删除的文档。可以通过调用undelete方法移除索引目录中的.del文件 * 。随后关闭IndexReader实例。这样就保存了所有标记为删除的文档。如果用IndexReader标记了删除文档,那么 * 只有同一个IndexReader实例的deleteAll()方法才能在最初的位置恢复各个被标记为删除的文档。也就是说IndexReader实例只能处理自身标记的删除文档,无法恢复其他实例的文档。 lucene恢复被删除的文档 * */ //使用IndexReader进行恢复 //恢复时,必须把IndexReader的只读(readOnly)设置为false IndexReader reader = IndexReader.open(directory,false); reader.undeleteAll(); reader.close(); } public void forceDelete() throws Exception { /** * 假如你想要强制删除回收站的信息可以调用writer.forceMergeDeletes()这个方法, * 但是这个方法不推荐使用,比较消耗内存,lucene会自动根据容量的大小删除所删除的文件 */ Directory directory1 = FSDirectory.open(new File("d:/lucene/index02"));// IndexWriter writer = null; try { writer = new IndexWriter(directory1, new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35))); writer.forceMergeDeletes(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (LockObtainFailedException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { try { if(writer!=null) writer.close(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } } public void merge() throws Exception, Exception, Exception { Directory directory1 = FSDirectory.open(new File("d:/lucene/index01"));// writer = new IndexWriter(directory1, new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35))); //会将索引合并为2段,这两 段中的被删除的数据会被清空 //特别注意:此处Lucene在3.5之后不建议使用,因为会消耗大量的开销, //Lucene会根据情况自动处理的 writer.commit(); writer.forceMerge(2); writer.close(); } public void delete() { IndexWriter writer = null; try { writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35))); //参数是一个选项,可以是一个Query,也可以是一个term,term是一个精确查找的值 //此时删除的文档并不会被完全删除,而是存储在一个回收站中的,可以恢复 writer.deleteDocuments(new Term("id","1")); writer.commit(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (LockObtainFailedException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { try { if(writer!=null) writer.close(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } } public void delete02() { try { reader = IndexReader.open(directory,false); reader.deleteDocuments(new Term("id","1")); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (LockObtainFailedException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } public void update() { IndexWriter writer = null; try { writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35))); /* * Lucene并没有提供更新,这里的更新操作其实是如下两个操作的合集 * 先删除之后再添加 */ Document doc = new Document(); doc.add(new Field("id","11",Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS)); doc.add(new Field("email",emails[0],Field.Store.YES,Field.Index.NOT_ANALYZED)); doc.add(new Field("content",contents[0],Field.Store.NO,Field.Index.ANALYZED)); doc.add(new Field("name",names[0],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS)); writer.updateDocument(new Term("id","1"), doc); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (LockObtainFailedException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { try { if(writer!=null) writer.close(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } } public void query() { try { IndexReader reader = IndexReader.open(directory); //通过reader可以有效的获取到文档的数量 System.out.println("numDocs:"+reader.numDocs()); System.out.println("maxDocs:"+reader.maxDoc()); System.out.println("deleteDocs:"+reader.numDeletedDocs()); reader.close(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } public void search01() { try { IndexReader reader = IndexReader.open(directory); IndexSearcher searcher = new IndexSearcher(reader); TermQuery query = new TermQuery(new Term("email","test0@test.com")); TopDocs tds = searcher.search(query, 10); for(ScoreDoc sd:tds.scoreDocs) { Document doc = searcher.doc(sd.doc); System.out.println("("+sd.doc+"-"+doc.getBoost()+"-"+sd.score+")"+ doc.get("name")+"["+doc.get("email")+"]-->"+doc.get("id")+","+ doc.get("attach")+","+doc.get("date")+","+doc.getValues("email")[1]); } reader.close(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } public void search02() { try { if(reader==null) { reader = IndexReader.open(directory,false); } else { IndexReader tr = IndexReader.openIfChanged(reader); if(tr!=null) { reader.close(); reader = tr; } } IndexSearcher searcher=new IndexSearcher(reader); TermQuery query = new TermQuery(new Term("content","like")); TopDocs tds = searcher.search(query, 10); for(ScoreDoc sd:tds.scoreDocs) { Document doc = searcher.doc(sd.doc); System.out.println(doc.get("id")+"---->"+ doc.get("name")+"["+doc.get("email")+"]-->"+doc.get("id")+","+ doc.get("attach")+","+doc.get("date")+","+doc.getValues("email")[1]); } searcher.close(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } }}