`
zhaohaolin
  • 浏览: 983973 次
  • 性别: Icon_minigender_1
  • 来自: 杭州
社区版块
存档分类
最新评论

Lucene 3.6 中文分词、分页查询、高亮显示等

    博客分类:
  • JAVA
 
阅读更多

1、准备工作

下载lucene 3.6.1 : http://lucene.apache.org/ 

下载中文分词IK Analyzer: http://code.google.com/p/ik-analyzer/downloads/list (注意下载的是IK Analyzer 2012_u5_source.zip,其他版本有bug) 

下载solr 3.6.1:  http://lucene.apache.org/solr/(编译IK Analyzer时需引用包) 

OK,将lucene 、solr 相关包(lucene-core-3.6.1.jar、lucene-highlighter-3.6.1.jar、lucene-analyzers-3.6.1.jar、apache-solr-core-3.6.1.jar、apache-solr-solrj-3.6.1.jar)拷贝到项目lib下,IK源码置于项目src下。

2、从Oracle数据库中取数据创建索引(使用IK分词)

001 package lucene.util;
002  
003 import org.apache.lucene.index.IndexWriter;
004 import org.apache.lucene.index.IndexWriterConfig;
005 import org.apache.lucene.index.CorruptIndexException;
006 import org.apache.lucene.store.FSDirectory;
007 import org.apache.lucene.store.Directory;
008 import org.apache.lucene.analysis.Analyzer;
009 import org.apache.lucene.analysis.standard.StandardAnalyzer;
010 import org.apache.lucene.util.Version;
011 import org.apache.lucene.document.Document;
012 import org.apache.lucene.document.Field;
013 import org.wltea.analyzer.lucene.IKAnalyzer;
014  
015 import java.sql.Connection;
016 import java.io.File;
017 import java.io.IOException;
018 import java.util.ArrayList;
019 import java.util.Date;
020  
021 import modules.gk.Gk_info;
022 import modules.gk.Gk_infoSub;
023 import web.sys.Globals;
024 import web.db.DBConnector;
025 import web.db.ObjectCtl;
026 import web.util.StringUtil;
027 //Wizzer.cn
028 public class LuceneIndex {
029     IndexWriter writer = null;
030     FSDirectory dir = null;
031     boolean create = true;
032  
033     public void init() {
034         long a1 = System.currentTimeMillis();
035         System.out.println("[Lucene 开始执行:" new Date() + "]");
036         Connection con = DBConnector.getconecttion(); //取得一个数据库连接
037         try {
038             final File docDir = newFile(Globals.SYS_COM_CONFIG.get("sys.index.path").toString());//E:\lucene
039             if (!docDir.exists()) {
040                 docDir.mkdirs();
041             }
042             String cr = Globals.SYS_COM_CONFIG.get("sys.index.create").toString();//true or false
043             if ("false".equals(cr.toLowerCase())) {
044                 create = false;
045             }
046             Directory dir = FSDirectory.open(docDir);
047 //            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
048             Analyzer analyzer = new IKAnalyzer(true);
049             IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer);
050             if (create) {
051                 // Create a new index in the directory, removing any
052                 // previously indexed documents:
053                 iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
054             else {
055                 // Add new documents to an existing index:
056                 iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
057             }
058             IndexWriter writer = new IndexWriter(dir, iwc);
059             String sql = "SELECT indexno,title,describes,pdate,keywords FROM TABLEA WHERE STATE=1 AND SSTAG<>1 ";
060             int rowCount = ObjectCtl.getRowCount(con, sql);
061             int pageSize = StringUtil.StringToInt(Globals.SYS_COM_CONFIG.get("sys.index.size").toString());   //每页记录数
062             int pages = (rowCount - 1) / pageSize + 1//计算总页数
063             ArrayList list = null;
064             Gk_infoSub gk = null;
065             for (int i = 1; i < pages+1; i++) {
066                 long a = System.currentTimeMillis();
067                 list = ObjectCtl.listPage(con, sql, i, pageSize, new Gk_infoSub());
068                 for (int j = 0; j < list.size(); j++) {
069                     gk = (Gk_infoSub) list.get(j);
070                     Document doc = new Document();
071                     doc.add(new Field("indexno", StringUtil.null2String(gk.getIndexno()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));//主键不分词
072                     doc.add(new Field("title", StringUtil.null2String(gk.getTitle()), Field.Store.YES, Field.Index.ANALYZED));
073                     doc.add(new Field("describes", StringUtil.null2String(gk.getDescribes()), Field.Store.YES, Field.Index.ANALYZED));
074                     doc.add(new Field("pdate", StringUtil.null2String(gk.getPdate()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));//日期不分词
075                     doc.add(new Field("keywords", StringUtil.null2String(gk.getKeywords()), Field.Store.YES, Field.Index.ANALYZED));
076                     writer.addDocument(doc);
077                     ObjectCtl.executeUpdateBySql(con,"UPDATE TABLEA SET SSTAG=1 WHERE indexno='"+gk.getIndexno()+"'");//更新已索引状态
078                 }
079  
080                 long b = System.currentTimeMillis();
081                 long c = b - a;
082                 System.out.println("[Lucene " + rowCount + "条," + pages + "页,第" + i + "页花费时间:" + c + "毫秒]");
083             }
084             writer.commit();
085  
086         catch (Exception e) {
087             e.printStackTrace();
088         finally {
089             DBConnector.freecon(con); //释放数据库连接
090             try {
091                 if (writer != null) {
092                     writer.close();
093                 }
094             catch (CorruptIndexException e) {
095                 e.printStackTrace();
096             catch (IOException e) {
097                 e.printStackTrace();
098             finally {
099                 try {
100                     if (dir != null && IndexWriter.isLocked(dir)) {
101                         IndexWriter.unlock(dir);//注意解锁
102                     }
103                 catch (IOException e) {
104                     e.printStackTrace();
105                 }
106             }
107         }
108         long b1 = System.currentTimeMillis();
109         long c1 = b1 - a1;
110         System.out.println("[Lucene 执行完毕,花费时间:" + c1 + "毫秒,完成时间:" newDate() + "]");
111     }
112 }
 

3、单字段查询以及多字段分页查询高亮显示

 
001 package lucene.util;
002  
003 import org.apache.lucene.store.FSDirectory;
004 import org.apache.lucene.store.Directory;
005 import org.apache.lucene.search.*;
006 import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
007 import org.apache.lucene.search.highlight.Highlighter;
008 import org.apache.lucene.search.highlight.SimpleFragmenter;
009 import org.apache.lucene.search.highlight.QueryScorer;
010 import org.apache.lucene.queryParser.QueryParser;
011 import org.apache.lucene.queryParser.MultiFieldQueryParser;
012 import org.apache.lucene.analysis.TokenStream;
013 import org.apache.lucene.analysis.Analyzer;
014 import org.apache.lucene.analysis.KeywordAnalyzer;
015 import org.apache.lucene.document.Document;
016 import org.apache.lucene.index.IndexReader;
017 import org.apache.lucene.index.Term;
018 import org.apache.lucene.util.Version;
019 import modules.gk.Gk_infoSub;
020  
021 import java.util.ArrayList;
022 import java.io.File;
023 import java.io.StringReader;
024 import java.lang.reflect.Constructor;
025  
026 import web.util.StringUtil;
027 import web.sys.Globals;
028 import org.wltea.analyzer.lucene.IKAnalyzer;
029 //Wizzer.cn
030 public class LuceneQuery {
031     private static String indexPath;// 索引生成的目录
032     private int rowCount;// 记录数
033     private int pages;// 总页数
034     private int currentPage;// 当前页数
035     private int pageSize;   //每页记录数
036  
037     public LuceneQuery() {
038         this.indexPath = Globals.SYS_COM_CONFIG.get("sys.index.path").toString();
039     }
040  
041     public int getRowCount() {
042         return rowCount;
043     }
044  
045     public int getPages() {
046         return pages;
047     }
048  
049     public int getPageSize() {
050         return pageSize;
051     }
052  
053     public int getCurrentPage() {
054         return currentPage;
055     }
056  
057     /**
058      * 函数功能:根据字段查询索引
059      */
060     public ArrayList queryIndexTitle(String keyWord, int curpage, int pageSize) {
061         ArrayList list = new ArrayList();
062         try {
063             if (curpage <= 0) {
064                 curpage = 1;
065             }
066             if (pageSize <= 0) {
067                 pageSize = 20;
068             }
069             this.pageSize = pageSize;   //每页记录数
070             this.currentPage = curpage;   //当前页
071             int start = (curpage - 1) * pageSize;
072             Directory dir = FSDirectory.open(new File(indexPath));
073             IndexReader reader = IndexReader.open(dir);
074             IndexSearcher searcher = new IndexSearcher(reader);
075             Analyzer analyzer = new IKAnalyzer(true);
076             QueryParser queryParser = new QueryParser(Version.LUCENE_36, "title", analyzer);
077             queryParser.setDefaultOperator(QueryParser.AND_OPERATOR);
078             Query query = queryParser.parse(keyWord);
079             int hm = start + pageSize;
080             TopScoreDocCollector res = TopScoreDocCollector.create(hm, false);
081             searcher.search(query, res);
082  
083             SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>""</span>");
084             Highlighter highlighter = new Highlighter(simpleHTMLFormatter, newQueryScorer(query));
085             this.rowCount = res.getTotalHits();
086             this.pages = (rowCount - 1) / pageSize + 1//计算总页数
087             TopDocs tds = res.topDocs(start, pageSize);
088             ScoreDoc[] sd = tds.scoreDocs;
089             for (int i = 0; i < sd.length; i++) {
090                 Document hitDoc = reader.document(sd[i].doc);
091                 list.add(createObj(hitDoc, analyzer, highlighter));
092             }
093  
094         catch (Exception e) {
095             e.printStackTrace();
096         }
097  
098         return list;
099  
100     }
101     /**
102      * 函数功能:根据字段查询索引
103      */
104     public ArrayList queryIndexFields(String allkeyword, String onekeyword, String nokeyword, int curpage, int pageSize) {
105         ArrayList list = new ArrayList();
106         try {
107             if (curpage <= 0) {
108                 curpage = 1;
109             }
110             if (pageSize <= 0) {
111                 pageSize = 20;
112             }
113             this.pageSize = pageSize;   //每页记录数
114             this.currentPage = curpage;   //当前页
115             int start = (curpage - 1) * pageSize;
116             Directory dir = FSDirectory.open(new File(indexPath));
117             IndexReader reader = IndexReader.open(dir);
118             IndexSearcher searcher = new IndexSearcher(reader);
119             BooleanQuery bQuery = new BooleanQuery();  //组合查询
120             if (!"".equals(allkeyword)) {//包含全部关键词
121                 KeywordAnalyzer analyzer = new KeywordAnalyzer();
122                 BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};//AND
123                 Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, allkeyword, new String[]{"title""describes""keywords"}, flags, analyzer);
124                 bQuery.add(query, BooleanClause.Occur.MUST);  //AND
125             }
126             if (!"".equals(onekeyword)) { //包含任意关键词
127                 Analyzer analyzer = new IKAnalyzer(true);
128                 BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};//OR
129                 Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, onekeyword, new String[]{"title""describes""keywords"}, flags, analyzer);
130                 bQuery.add(query, BooleanClause.Occur.MUST);  //AND
131             }
132             if (!"".equals(nokeyword)) { //排除关键词
133                 Analyzer analyzer = new IKAnalyzer(true);
134                 BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};//NOT
135                 Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, nokeyword, new String[]{"title""describes""keywords"}, flags, analyzer);
136                 bQuery.add(query, BooleanClause.Occur.MUST_NOT);  //AND
137  
138             }
139             int hm = start + pageSize;
140             TopScoreDocCollector res = TopScoreDocCollector.create(hm, false);
141             searcher.search(bQuery, res);
142             SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>""</span>");
143             Highlighter highlighter = new Highlighter(simpleHTMLFormatter, newQueryScorer(bQuery));
144             this.rowCount = res.getTotalHits();
145             this.pages = (rowCount - 1) / pageSize + 1//计算总页数
146             System.out.println("rowCount:" + rowCount);
147             TopDocs tds = res.topDocs(start, pageSize);
148             ScoreDoc[] sd = tds.scoreDocs;
149             Analyzer analyzer = new IKAnalyzer();
150             for (int i = 0; i < sd.length; i++) {
151                 Document hitDoc = reader.document(sd[i].doc);
152                 list.add(createObj(hitDoc, analyzer, highlighter));
153             }
154  
155         catch (Exception e) {
156             e.printStackTrace();
157         }
158  
159         return list;
160  
161     }
162  
163     /**
164      * 创建返回对象(高亮)
165      */
166  
167     private synchronized static Object createObj(Document doc, Analyzer analyzer, Highlighter highlighter) {
168  
169         Gk_infoSub gk = new Gk_infoSub();
170         try {
171  
172             if (doc != null) {
173                 gk.setIndexno(StringUtil.null2String(doc.get("indexno")));
174                 gk.setPdate(StringUtil.null2String(doc.get("pdate")));
175                 String title = StringUtil.null2String(doc.get("title"));
176                 gk.setTitle(title);
177                 if (!"".equals(title)) {
178                     highlighter.setTextFragmenter(newSimpleFragmenter(title.length()));
179                     TokenStream tk = analyzer.tokenStream("title"newStringReader(title));
180                     String htext = StringUtil.null2String(highlighter.getBestFragment(tk, title));
181                     if (!"".equals(htext)) {
182                         gk.setTitle(htext);
183                     }
184                 }
185                 String keywords = StringUtil.null2String(doc.get("keywords"));
186                 gk.setKeywords(keywords);
187                 if (!"".equals(keywords)) {
188                     highlighter.setTextFragmenter(newSimpleFragmenter(keywords.length()));
189                     TokenStream tk = analyzer.tokenStream("keywords"newStringReader(keywords));
190                     String htext = StringUtil.null2String(highlighter.getBestFragment(tk, keywords));
191                     if (!"".equals(htext)) {
192                         gk.setKeywords(htext);
193                     }
194                 }
195                 String describes = StringUtil.null2String(doc.get("describes"));
196                 gk.setDescribes(describes);
197                 if (!"".equals(describes)) {
198                     highlighter.setTextFragmenter(newSimpleFragmenter(describes.length()));
199                     TokenStream tk = analyzer.tokenStream("keywords"newStringReader(describes));
200                     String htext = StringUtil.null2String(highlighter.getBestFragment(tk, describes));
201                     if (!"".equals(htext)) {
202                         gk.setDescribes(htext);
203                     }
204                 }
205  
206             }
207             return gk;
208         }
209         catch (Exception e) {
210  
211             e.printStackTrace();
212             return null;
213         }
214         finally {
215             gk = null;
216         }
217  
218     }
219  
220     private synchronized static Object createObj(Document doc) {
221  
222         Gk_infoSub gk = new Gk_infoSub();
223         try {
224  
225             if (doc != null) {
226                 gk.setIndexno(StringUtil.null2String(doc.get("indexno")));
227                 gk.setPdate(StringUtil.null2String(doc.get("pdate")));
228                 gk.setTitle(StringUtil.null2String(doc.get("title")));
229                 gk.setKeywords(StringUtil.null2String(doc.get("keywords")));
230                 gk.setDescribes(StringUtil.null2String(doc.get("describes")));
231             }
232             return gk;
233         }
234         catch (Exception e) {
235  
236             e.printStackTrace();
237             return null;
238         }
239         finally {
240             gk = null;
241         }
242  
243     }
244 }
  单字段查询:
01 long a = System.currentTimeMillis();
02 try {
03     int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get("curpage")));
04     int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get("pagesize")));
05     String title = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("title")));
06     LuceneQuery lu = new LuceneQuery();
07     form.addResult("list", lu.queryIndexTitle(title, curpage, pagesize));
08     form.addResult("curPage", lu.getCurrentPage());
09     form.addResult("pageSize", lu.getPageSize());
10     form.addResult("rowCount", lu.getRowCount());
11     form.addResult("pageCount", lu.getPages());
12 catch (Exception e) {
13     e.printStackTrace();
14 }
15 long b = System.currentTimeMillis();
16 long c = b - a;
17 System.out.println("[搜索信息花费时间:" + c + "毫秒]");
多字段查询:
01 long a = System.currentTimeMillis();
02 try {
03     int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get("curpage")));
04     int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get("pagesize")));
05     String allkeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("allkeyword")));
06     String onekeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("onekeyword")));
07     String nokeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("nokeyword")));
08     LuceneQuery lu = new LuceneQuery();
09     form.addResult("list", lu.queryIndexFields(allkeyword,onekeyword,nokeyword, curpage, pagesize));
10     form.addResult("curPage", lu.getCurrentPage());
11     form.addResult("pageSize", lu.getPageSize());
12     form.addResult("rowCount", lu.getRowCount());
13     form.addResult("pageCount", lu.getPages());
14 catch (Exception e) {
15     e.printStackTrace();
16 }
17 long b = System.currentTimeMillis();
18 long c = b - a;
19 System.out.println("[高级检索花费时间:" + c + "毫秒]");

4、Lucene通配符查询

1 BooleanQuery bQuery = new BooleanQuery();  //组合查询
2 if (!"".equals(title)) {
3     WildcardQuery w1 = new WildcardQuery(new Term("title", title+ "*"));
4  
5     bQuery.add(w1, BooleanClause.Occur.MUST);  //AND
6 }
7 int hm = start + pageSize;
8 TopScoreDocCollector res = TopScoreDocCollector.create(hm, false);
9 searcher.search(bQuery, res);
 

5、Lucene嵌套查询

实现SQL:(unitid like 'unitid%'  and idml like 'id2%') or (tounitid like 'unitid%' and tomlid like 'id2%' and tostate=1)
01 BooleanQuery bQuery = new BooleanQuery();
02 BooleanQuery b1 = new BooleanQuery();
03 WildcardQuery w1 = new WildcardQuery(new Term("unitid", unitid + "*"));
04 WildcardQuery w2 = new WildcardQuery(new Term("idml", id2 + "*"));
05 b1.add(w1, BooleanClause.Occur.MUST);//AND
06 b1.add(w2, BooleanClause.Occur.MUST);//AND
07 bQuery.add(b1, BooleanClause.Occur.SHOULD);//OR
08 BooleanQuery b2 = new BooleanQuery();
09 WildcardQuery w3 = new WildcardQuery(new Term("tounitid", unitid + "*"));
10 WildcardQuery w4 = new WildcardQuery(new Term("tomlid", id2 + "*"));
11 WildcardQuery w5 = new WildcardQuery(new Term("tostate""1"));
12 b2.add(w3, BooleanClause.Occur.MUST);//AND
13 b2.add(w4, BooleanClause.Occur.MUST);//AND
14 b2.add(w5, BooleanClause.Occur.MUST);//AND
15 bQuery.add(b2, BooleanClause.Occur.SHOULD);//OR

6、Lucene先根据时间排序后分页

01 int hm = start + pageSize;
02 Sort sort = new Sort(new SortField("pdate", SortField.STRING, true));
03 TopScoreDocCollector res = TopScoreDocCollector.create(pageSize, false);
04 searcher.search(bQuery, res);
05 this.rowCount = res.getTotalHits();
06 this.pages = (rowCount - 1) / pageSize + 1//计算总页数
07 TopDocs tds =searcher.search(bQuery,rowCount,sort);// res.topDocs(start, pageSize);
08 ScoreDoc[] sd = tds.scoreDocs;
09 System.out.println("rowCount:" + rowCount);
10 int i=0;
11 for (ScoreDoc scoreDoc : sd) {
12     i++;
13     if(i<start){
14         continue;
15     }
16     if(i>hm){
17         break;
18     }
19     Document doc = searcher.doc(scoreDoc.doc);
20     list.add(createObj(doc));
21 }
这个效率不高,正常的做法是创建索引的时候进行排序,之后使用分页方法,不要这样进行2次查询。
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics