package com.lucene;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.sql.DataSource;
import org.apache.ibatis.io.Resources;
import org.apache.ibatis.mapping.Environment;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.ibatis.session.SqlSessionFactoryBuilder;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.wltea.analyzer.lucene.IKQueryParser;
import org.wltea.analyzer.lucene.IKSimilarity;
import com.dao.core.SqlSessionSingle;
import com.dao.reptile.WriteAlreadyUrlDao;
import com.reptile.util.GlobalContains;
/**
 * Builds and queries a Lucene (3.x) full-text index over pages previously
 * collected by the crawler. Page metadata (path/title/url) comes from the
 * t_url table via MyBatis; Chinese tokenisation uses the IK analyzer.
 */
public class Lucene {

	/**
	 * Entry point: bootstraps MyBatis, then (re)builds the index at
	 * {@code GlobalContains.index_path}.
	 *
	 * @throws SQLException if the connectivity smoke test in {@link #loadMybatis()} fails
	 * @throws InvalidTokenOffsetsException declared for {@link #indexSearch} callers
	 */
	public static void main(String[] args) throws SQLException, InvalidTokenOffsetsException {
		Lucene lucene = new Lucene();
		loadMybatis();
		lucene.createIndex(GlobalContains.index_path);
		// Example search usage:
		// lucene.indexSearch(GlobalContains.index_path, "title", "111"); // or "content"
	}

	/**
	 * Indexes every crawled page whose {@code is_index} flag is still "0".
	 * For each row: stores title and url as analyzed fields and streams the
	 * page file contents into an unstored "content" field.
	 *
	 * @param indexFile filesystem directory in which the Lucene index is written
	 */
	private void createIndex(String indexFile) {
		Analyzer analyzer = new IKAnalyzer();
		Directory d = null;
		IndexWriter indexWriter = null;
		try {
			WriteAlreadyUrlDao alreadyDao = new WriteAlreadyUrlDao();
			// Raw Map/List retained deliberately: the DAO API is untyped —
			// TODO confirm its signature before introducing generics.
			Map paramMap = new HashMap();
			paramMap.put("is_index", "0");
			List list = alreadyDao.queryList(paramMap);
			if (list != null && !list.isEmpty()) {
				d = FSDirectory.open(new File(indexFile));
				IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_33,
						analyzer);
				indexWriter = new IndexWriter(d, conf);
				for (int i = 0; i < list.size(); i++) {
					Map map = (Map) list.get(i);
					String path = map.get("path").toString();
					File ff = new File(path);
					// `new File(...)` never returns null; only existence matters.
					if (!ff.exists()) {
						System.out.println("文件:" + path + "不存在。");
						continue;
					}
					Document doc = new Document();
					doc.add(new Field("title", map.get("title").toString(), Store.YES,
							Index.ANALYZED));
					doc.add(new Field("url", map.get("url").toString(), Store.YES,
							Index.ANALYZED));
					// Reader-backed field: Lucene consumes the reader while
					// inverting the document, so it is not closed here.
					doc.add(new Field("content", new FileReader(ff)));
					indexWriter.addDocument(doc);
					System.out.println(map.get("url").toString() + "\tcount:" + "\t当前:" + (i + 1) + ",总共:" + list.size());
				}
			} else {
				System.out.println("没有任何数据需要被索引。");
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// Close writer and directory even when a document fails mid-loop;
			// otherwise the index write lock is left behind on disk.
			if (indexWriter != null) {
				try {
					indexWriter.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if (d != null) {
				try {
					d.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Searches the index for {@code keywork} in field {@code key}, printing and
	 * collecting up to 1000 hits with the matched title highlighted in red.
	 *
	 * @param indexFile filesystem directory holding the Lucene index
	 * @param key       field name to search ("title" or "content")
	 * @param keywork   query text
	 * @return list of {@code LuceneResultBean} (url + highlighted title), or
	 *         {@code null} when nothing matched or an I/O error occurred
	 * @throws InvalidTokenOffsetsException propagated from the highlighter
	 */
	public List indexSearch(String indexFile, String key, String keywork) throws InvalidTokenOffsetsException {
		Analyzer analyzer = new IKAnalyzer();
		Directory d = null;
		List resultList = null;
		IndexSearcher isearcher = null;
		try {
			d = FSDirectory.open(new File(indexFile));
			isearcher = new IndexSearcher(d);
			// Use the IK similarity so scoring matches the IK analyzer's tokens.
			isearcher.setSimilarity(new IKSimilarity());
			Query query = IKQueryParser.parse(key, keywork);
			// Fetch at most the 1000 highest-scoring hits.
			TopDocs topDocs = isearcher.search(query, 1000);
			System.out.println("命中:" + topDocs.totalHits);
			ScoreDoc[] result = topDocs.scoreDocs;
			if (result.length > 0) {
				resultList = new ArrayList();
				for (int i = 0; i < result.length; i++) {
					Document document = isearcher.doc(result[i].doc);
					System.out.println("找到:" + document.get("url") + "\t" +
							document.get("title"));
					String text = document.get("title");
					System.out.println("key:" + text);
					if (text != null) {
						// Wrap matched terms in a red <font> tag for display.
						SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
						Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
						// Fragment size = full title, so the whole title is returned.
						highlighter.setTextFragmenter(new SimpleFragmenter(text.length()));
						TokenStream tokenStream = analyzer.tokenStream(key, new StringReader(text));
						String highlighterText = highlighter.getBestFragment(tokenStream, text);
						System.out.println("【高亮显示第】" + (i + 1) + "条,检索结果如下:" + highlighterText);
						LuceneResultBean luceneResultBean = new LuceneResultBean();
						luceneResultBean.setUrl(document.get("url"));
						luceneResultBean.setTitle(highlighterText);
						resultList.add(luceneResultBean);
					}
				}
			}
			return resultList;
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (isearcher != null) {
				try {
					isearcher.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			// The original leaked the Directory handle; release it as well.
			if (d != null) {
				try {
					d.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		return null;
	}

	/**
	 * Manually loads the MyBatis configuration (no Spring container here),
	 * smoke-tests a JDBC connection, and publishes the factory via
	 * {@code SqlSessionSingle.sqlSession}.
	 *
	 * @throws SQLException if obtaining the test connection fails
	 * @throws IllegalStateException if {@code myBatis3.xml} cannot be read
	 */
	private static void loadMybatis() throws SQLException {
		Reader reader = null;
		SqlSessionFactory sqlSession;
		try {
			reader = Resources.getResourceAsReader("myBatis3.xml");
			sqlSession = new SqlSessionFactoryBuilder().build(reader);
		} catch (IOException e) {
			// Previously this fell through and NPE'd inside build(); fail fast
			// with the root cause preserved instead.
			throw new IllegalStateException("myBatis3.xml could not be loaded", e);
		} finally {
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		Environment en = sqlSession.getConfiguration().getEnvironment();
		DataSource ds = en.getDataSource();
		// Connectivity smoke test; close the connection (the original leaked it).
		java.sql.Connection conn = ds.getConnection();
		try {
			System.out.println("连接:" + conn.toString());
		} finally {
			conn.close();
		}
		SqlSessionSingle.sqlSession = sqlSession;
		System.out.println(sqlSession);
	}
}
/* --- Non-code residue below: blog footer and "related downloads" listing
   that was scraped together with this source file. Preserved for reference
   but commented out so the file compiles. ---

分享到:
相关推荐
Lucene5.21+IkAnalyzer2012_V5入门案例,看不懂你来打我。
Lucene4.7+IK Analyzer中文分词入门教程
支持lucene5的 IKAnalyzer中文分词器 IKAnalyzer5.jar
solr的IK分词器JAR及配置文件 jar包和配置文件的放置位置不一样,详情可搜索 IK Analyzer 是一个开源的,基于java语言开发的轻量级的中文分词工具包。...org.wltea.analyzer.lucene.IKAnalyzer jar
此版本是基于IK-Analyzer-2012FF修改而来,专门适用于Lucene 5.2.1。 IK Analyzer 是一个开源的,基于java语言开发的轻量级的中文分词工具包。从2006年12月推出1.0版开始, IKAnalyzer已经推出了4个大版本。最初,它...
lucene4.3.0+IK Analyzer2012FF
这个是lucene5和IKAnalyzer5的jar包 相匹配,导入到lib可以使用,本人从网上下载测试可以使用的jar包
该jar包之前只支持Lucene4.7.2,因为我自己的项目用到的是Lucene5.3.1,所以我自己重写了IKAnalyzer.java以及IKTokenizer.java,并且重新编译之后替换了之前的.class文件,现在可以适用于Lucene5.3.1
lucene5.4 + IKAnalyzer支持同义词、停用词、扩展词,IKAnalyzer是中同义词是自己改的,就没打包了,如果还有其它需求可以自己改改.
lucene3.5 IKAnalyzer3.2.5 实例中文分词通过,目前在网上找的lucene 和IKAnalyzer 的最新版本测试通过。内含:示例代码,以及最新jar包。 lucene lucene3.5 IKAnalyzer IKAnalyzer3.2.5 jar 中文 分词
lucene3.6+IKAnalyzer2012FF_u1,配套的JAR包,google code关了好不容易才下载回来的
Lucene IK Analyzer 3.0 Lucene的IK Analyzer 3.0 中文分词器 Lucene IK Analyzer 3.0 Lucene的IK Analyzer 3.0 中文分词器Lucene IK Analyzer 3.0 Lucene的IK Analyzer 3.0 中文分词器
由于林良益先生在2012之后未对IKAnalyzer进行更新,后续lucene分词接口发生变化,导致不可使用,所以此jar包支持lucene6.0以上版本
导入: import net.teamhot.lucene.ThesaurusAnalyzer; import org.apache.lucene.analysis.Analyzer; 实例化: Analyzer analyzer = new ThesaurusAnalyzer();
lucene中有扩展词库和停用词的概念,利用StopFilter加入违禁词的概念,可以返回检索到的违禁词。
提示:IKAnalyzer中文分词器支持Lucene6.0以上,IKAnalyzer中文分词器支持Lucene6.0以上。
使用lucene-3.5和IKAnalyzer2012,实现基础的全文检索实现
关于lucene的IKAnalyzer分词器以及与lucene4.3共同使用时发生的问题解决包
IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer,true); for (int i = 0; i (); i++) { LuceneVO vo = (LuceneVO)list.get(i); Document doc = new Document(); Field FieldId = ...
学习用非常不错,可以抓取网易头条,加入Lucene的检索。直接导入eclipse+Tomcat就可以使用。
*/