Lucene---全文检索(数据库)

wu_quanyin

浏览: 205424 次
性别:
来自: 福建省

最近访客更多访客>>

wu_quanyin1011

lsj20040708

yangbo126

892848153

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Search Engine

全文检索 lucene quartz SQL Office

一、Lucene基础知识的介绍可参考: http://lym6520.iteye.com/category/82172

二、以下对最近使用Lucene的情况进行总结:

为了使Lucene创建的索引文件能够及时与数据库保持同步,使用了quartz进行任务调度,详情可查看:

http://wuquanyin1011.iteye.com/admin/blogs/745382

下面是一个通过任务调度执行Lucene创建索引的例子:

以下给出重建索引的大致实现,索引是根据建模端的配置来创建的:

package com.fdauto.bws.business.module.lucene.index.job;

import java.io.File;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.quartz.Job;
import org.quartz.JobDataMap;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;

import com.fdauto.bws.business.datasource.DataSource;
import com.fdauto.bws.business.datasource.DataSourceHelper;
import com.fdauto.bws.common.logger.SystemLogsHelper;
import com.fdauto.bws.service.config.BWSConfigHelper;

/**
 * Quartz job that rebuilds a Lucene index from the rows returned by a SQL
 * query.
 * <p>
 * The job reads three required entries from its {@link JobDataMap}:
 * {@code sql} (the query to run), {@code indexFileDir} (index directory,
 * relative to the configured parent directory) and {@code dataSource} (name of
 * the data source to query). Every remaining entry is treated as a column
 * configuration: the key is the column name and the value is a comma-separated
 * list of three or four settings handed to {@code ColumnIndexStrategy.add};
 * the optional fourth setting is an integer boost (1 when omitted).
 * 
 * @author wu_quanyin(09817)
 * @version 1.0
 * @date 2010-7-16 10:54:01 AM
 */

public class LuceneIndexJob implements Job {

	/** Parent directory of all index directories, taken from the "indexDir" property. */
	private String indexParentDir = BWSConfigHelper.getBWSConfig()
			.getProperties().getProperty("indexDir");

	/**
	 * Runs the configured query and writes one Lucene document per row.
	 * <p>
	 * Only columns that have an entry in the job data map are indexed; null or
	 * blank values are skipped. Failures while indexing are logged and not
	 * rethrown, so the scheduler never sees an exception from this job.
	 * 
	 * @param context
	 *            Quartz execution context whose job data map carries the
	 *            settings described on the class
	 * @throws JobExecutionException
	 *             declared by the {@link Job} contract; not thrown by this
	 *             implementation
	 */
	@SuppressWarnings("unchecked")
	public void execute(JobExecutionContext context)
			throws JobExecutionException {
		JobDataMap jobDataMap = context.getJobDetail().getJobDataMap();
		String sql = jobDataMap.getString("sql");
		String indexDir = jobDataMap.getString("indexFileDir");
		String dataSourceName = jobDataMap.getString("dataSource");

		if (sql == null || indexDir == null || dataSourceName == null) {
			SystemLogsHelper.error("索引数据集中:sql语句-->" + sql + "\n" + "索引文件:-->"
					+ indexDir + "\n" + "数据源-->" + dataSourceName + "都不能为空!");
			// Without all three settings there is nothing meaningful to index;
			// stop here instead of creating a ".../null" directory and failing
			// later on a null SQL statement.
			return;
		}

		// Resolve the index directory below the configured parent directory.
		indexDir = indexParentDir + "/" + indexDir;

		File indexFile = new File(indexDir);
		if (!indexFile.exists()) {
			indexFile.mkdirs();
		}

		// Per-column indexing strategy, filled from the remaining map entries.
		ColumnIndexStrategy columnIndexStrategy = new ColumnIndexStrategy();

		// Drop the three control entries so that only column settings remain.
		jobDataMap.remove("sql");
		jobDataMap.remove("dataSource");
		jobDataMap.remove("indexFileDir");

		// Column names are compared in upper case everywhere below, so the
		// configured names are remembered in upper case as well.
		Set<String> configuredColumns = new HashSet<String>();
		Set<Map.Entry<String, String>> columnSet = jobDataMap.entrySet();
		for (Map.Entry<String, String> columnEntry : columnSet) {
			String columnKey = columnEntry.getKey().toUpperCase();
			configuredColumns.add(columnKey);

			String[] strategys = columnEntry.getValue().split(",");
			if (strategys.length == 3) {
				// No boost configured: fall back to 1.
				columnIndexStrategy.add(columnKey, strategys[0].toUpperCase(),
						strategys[1].toUpperCase(), strategys[2], 1);
			} else if (strategys.length == 4) {
				columnIndexStrategy.add(columnKey, strategys[0].toUpperCase(),
						strategys[1].toUpperCase(), strategys[2],
						Integer.parseInt(strategys[3]));
			}
		}

		IndexWriter indexWriter = null;
		Connection conn = null;
		PreparedStatement ps = null;
		ResultSet rs = null;

		try {
			DataSource dataSource = DataSourceHelper
					.getDataSource(dataSourceName);
			conn = dataSource.getConnection();
			LuceneIndex luceneIndex = LuceneIndexFactory.getLuceneIndex();
			indexWriter = luceneIndex.getIndexWriter(indexDir, true);
			ps = conn.prepareStatement(sql);
			rs = ps.executeQuery();
			ResultSetMetaData rsmd = rs.getMetaData();
			int columnCount = rsmd.getColumnCount();

			while (rs.next()) {
				Document doc = new Document();
				for (int i = 1; i <= columnCount; i++) {
					String columnName = rsmd.getColumnName(i).toUpperCase();
					if (!configuredColumns.contains(columnName)) {
						continue;
					}
					// NOTE(review): the SQLDataType listing published next to
					// this class only shows a three-argument
					// requireValueByColumnType - confirm that the
					// four-argument overload exists.
					String columnValue = SQLDataType.requireValueByColumnType(
							rs, rsmd.getColumnType(i), i, false);
					if (columnValue == null || columnValue.trim().length() == 0) {
						continue;
					}
					// Strip markup from HTML columns, unless the column is not
					// indexed at all.
					if ("HTML".equalsIgnoreCase(columnIndexStrategy
							.getFieldContentType(columnName))
							&& columnIndexStrategy.getFieldIndex(columnName) != Index.NO) {
						columnValue = filterHtmlLable(columnValue);
					}

					// Values indexed without analysis are stored in lower case.
					if (columnIndexStrategy.getFieldIndex(columnName) == Index.NOT_ANALYZED) {
						columnValue = columnValue.toLowerCase();
					}

					// Each column is stored/indexed according to its own
					// strategy.
					Field f = new Field(columnName, columnValue,
							columnIndexStrategy.getFieldStore(columnName),
							columnIndexStrategy.getFieldIndex(columnName));
					int boost = columnIndexStrategy.getBoost(columnName);
					if (boost > 0) {
						f.setBoost(boost);
					}
					doc.add(f);
				}

				indexWriter.addDocument(doc);

			}
			// In debug mode, send the writer's diagnostic output to the console.
			if (SystemLogsHelper.isDebugger()) {
				System.out.println("index infos－－－－－－－－－------------>");
				indexWriter.setInfoStream(System.out);
			}

			indexWriter.optimize();
			indexWriter.commit();

		} catch (Exception e) {
			// NOTE(review): the writer is still closed in the finally block
			// after a failure, which presumably persists the documents added
			// so far - confirm whether a rollback is wanted instead.
			SystemLogsHelper.trace("Could not rebuild Lucene index in "
					+ indexDir, e);
		} finally {
			try {
				if (indexWriter != null)
					indexWriter.close();
			} catch (Exception e) {
				SystemLogsHelper.trace("Could not close Lucene IndexWriter", e);
			}

			try {
				if (rs != null)
					rs.close();
			} catch (Exception e) {
				SystemLogsHelper.trace("Could not close JDBC ResultSet", e);
			}

			try {
				if (ps != null)
					ps.close();
			} catch (Exception e) {
				SystemLogsHelper.trace("Could not close JDBC Statement", e);
			}

			try {
				if (conn != null)
					conn.close();
			} catch (Exception e) {
				SystemLogsHelper.trace("Could not close JDBC Connection", e);
			}

		}
	}

	/**
	 * Extracts the plain text from a value that contains HTML markup so that
	 * only the text is indexed.
	 * <p>
	 * The text nodes found by the HTML parser are joined with single spaces
	 * (each one is preceded by a space); any tag-like fragments left in the
	 * joined text are then removed. If parsing fails, the failure is logged and
	 * whatever text was collected up to that point is used.
	 * 
	 * @param field
	 *            value containing HTML tags
	 * @return the text content of {@code field} without markup
	 */
	private static String filterHtmlLable(String field) {
		StringBuilder result = new StringBuilder();
		try {
			Parser nodesParser = Parser.createParser(field, "UTF-8");
			NodeFilter textFilter = new NodeClassFilter(TextNode.class);
			NodeList nodeList = nodesParser.parse(textFilter);
			Node[] nodes = nodeList.toNodeArray();

			for (int i = 0; i < nodes.length; i++) {
				String content = "";
				if (nodes[i] instanceof TextNode) {
					content = ((TextNode) nodes[i]).getText();
				}
				result.append(" ");
				result.append(content);
			}
		} catch (Exception e) {
			SystemLogsHelper.trace("Could not parse HTML field content", e);
		}
		field = result.toString();
		if (StringHelper.isEmpty(field))
			return field;
		// Remove whatever still looks like a tag, including an unterminated
		// one at the end of the text.
		field = field.replaceAll("<[^>]+>?", "");
		return field;
	}

	/**
	 * Manual smoke test: prints the text extracted from a sample HTML fragment.
	 * 
	 * @param args
	 *            ignored
	 */
	public static void main(String[] args) {
		System.out
				.println(LuceneIndexJob
						.filterHtmlLable("<p class=MsoNormal style=\"MARGIN: 0cm 0cm 0pt; TEXT-INDENT: 28pt; TEXT-ALIGN: left; mso-layout-grid-align: none\" align=left><span style=\"FONT-SIZE: 9pt; FONT-FAMILY: 宋体; mso-ascii-font-family: 'Times New Roman'; mso-hansi-font-family: 'Times New Roman'\">在主仓库内备货，按库存金额的</span><span lang=EN-US style=\"FONT-SIZE: 9pt\"><font face=\"Times New Roman\">0.1</font></span><span style=\"FONT-SIZE: 9pt; FONT-FAMILY: 宋体; mso-ascii-font-family: 'Times New Roman'; mso-hansi-font-family: 'Times New Roman'\">‰每天计算占库费用，在其他仓库则需按仓库库存金额的</span><span lang=EN-US style=\"FONT-SIZE: 9pt\"><font face=\"Times New Roman\">0.12</font></span><span style=\"FONT-SIZE: 9pt; FONT-FAMILY: 宋体; mso-ascii-font-family: 'Times New Roman'; mso-hansi-font-family: 'Times New Roman'\">‰计算占库费。</span><span lang=EN-US style=\"FONT-SIZE: 9pt\"><?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" /></span></p>"));
	}

}

数据库类型判断:

/**
 * Converts JDBC column values into the text that is written to the index.
 */
public class SQLDataType {

	/**
	 * Reads one column of the current row and returns it as text, choosing the
	 * JDBC accessor by the column's SQL type.
	 * <ul>
	 * <li>BOOLEAN: "TRUE" or "FALSE" (a SQL NULL reads as "FALSE", because
	 * {@code getBoolean} reports NULL as {@code false})</li>
	 * <li>TIMESTAMP: {@code Timestamp.toString()}</li>
	 * <li>BLOB / CLOB: the content is Base64-decoded and turned into a string
	 * using the platform default charset</li>
	 * <li>anything else: {@code getString}</li>
	 * </ul>
	 * 
	 * @param rs
	 *            result set positioned on the row to read
	 * @param columnType
	 *            SQL type of the column, one of the {@link Types} constants
	 * @param columnIndex
	 *            1-based index of the column
	 * @return the column value as text, or {@code null} when a TIMESTAMP,
	 *         BLOB, CLOB or other string-read column is SQL NULL
	 * @throws SQLException
	 *             if the column cannot be read
	 */
	public static String requireValueByColumnType(ResultSet rs, int columnType,
			int columnIndex) throws SQLException {
		String returnValue;

		switch (columnType) {
		case Types.BOOLEAN:
			returnValue = rs.getBoolean(columnIndex) ? "TRUE" : "FALSE";
			break;
		case Types.TIMESTAMP:
			java.sql.Timestamp timestamp = rs.getTimestamp(columnIndex);
			returnValue = timestamp == null ? null : timestamp.toString();
			break;
		case Types.BLOB:
			Blob b = rs.getBlob(columnIndex);
			if (b == null) {
				returnValue = null;
				break;
			}
			long blobLength = b.length();
			if (blobLength == 0) {
				returnValue = "";
				break;
			}
			// Blob positions are 1-based: the first byte is at position 1.
			byte[] blobs = b.getBytes(1, (int) blobLength);
			returnValue = new String(Base64.decodeBase64(blobs));
			break;
		case Types.CLOB:
			String clobText = rs.getString(columnIndex);
			returnValue = clobText == null ? null : new String(
					Base64.decodeBase64(clobText.getBytes()));
			break;
		default:
			returnValue = rs.getString(columnIndex);
		}

		return returnValue;

	}
}

Lucene进行查询:

/**
 * Search implementation that builds a Lucene query from the request
 * parameters and the field, match-rule and sort-rule configuration of a
 * {@code LuceneStore}.
 */
public class DatabaseSearch extends SearchSupport {
	/** Name of the request parameter that carries the free-text search value. */
	private static final String __SEARCHTEXT = "__q";
	// "=" requests an exact match; any other value (initially "like") requests
	// a fuzzy match. Overwritten by the match rule named "__q", if present.
	private String operator = "like";

	// Copy of the query taken before duplicate filtering, for highlighting.
	private Query useHighLightQuery;

	/**
	 * Builds and runs the search.
	 * <p>
	 * Every non-empty parameter that names a configured field becomes a
	 * required sub-query on that field. The free-text parameter {@code __q},
	 * if non-empty, becomes a required sub-query across all configured fields
	 * except the score field. Results are sorted by the store's sort rules when
	 * any of them names a configured field, and are wrapped in a duplicate
	 * filter on the first field marked unique.
	 * 
	 * @param pageSize
	 *            hits per page; when not positive, 100 hits are requested
	 * @param pageIndex
	 *            page number; when positive, {@code pageSize * pageIndex} hits
	 *            are requested (presumably 1-based - confirm against callers)
	 * @param ps
	 *            request parameters; missing match-rule parameters are filled
	 *            in from the rules' right-hand-side expressions
	 * @param luceneStore
	 *            configuration of fields, match rules and sort rules
	 * @return the top hits for the built query
	 * @throws NoKeywordsException
	 *             if no parameter yields a sub-query
	 * @throws Exception
	 *             if query construction or the search itself fails
	 */
	@Override
	public TopDocs execSearch(int pageSize, int pageIndex, Parameters ps,
			LuceneStore luceneStore) throws Exception {
		// First field flagged as unique; used for duplicate filtering.
		String uniqueField = "";
		FieldSet fieldSet = luceneStore.getFields();

		// Searchable fields and their clauses are collected in lists so that
		// the position of the skipped score field cannot leave a gap in, or
		// run past the end of, the resulting arrays.
		ArrayList<String> indexFieldList = new ArrayList<String>();
		ArrayList<BooleanClause.Occur> clauseList = new ArrayList<BooleanClause.Occur>();
		for (int i = 0; i < fieldSet.fieldSize(); i++) {
			LuceneField luceneField = (LuceneField) fieldSet.getField(i);
			String fieldName = luceneField.getName();
			if (fieldName.equalsIgnoreCase("LuceneScore"))
				continue;

			// Fall back to the "or" strategy when the field declares none.
			String searchType = luceneField.getSearchType();
			if (searchType == null)
				searchType = "SEARCH_OR";

			indexFieldList.add(fieldName);
			clauseList.add(ColumnSearchStrategy.getClause(searchType
					.toUpperCase()));

			// Remember the first field marked unique.
			if (luceneField.isUnique() && uniqueField.equals("")) {
				uniqueField = fieldName;
			}
		}
		String[] indexFields = indexFieldList
				.toArray(new String[indexFieldList.size()]);
		BooleanClause.Occur[] clauses = clauseList
				.toArray(new BooleanClause.Occur[clauseList.size()]);

		ArrayList<MatchRule> matchRules = luceneStore.getMatchRules();
		DefaultExprResolver der = new DefaultExprResolver();
		for (Iterator<MatchRule> itr = matchRules.iterator(); itr.hasNext();) {
			MatchRule matchRule = itr.next();
			if (!(matchRule instanceof SQLMatchRule))
				continue;
			SQLMatchRule sqlMatchRule = (SQLMatchRule) matchRule;
			String ruleName = sqlMatchRule.getName();
			if (__SEARCHTEXT.equalsIgnoreCase(ruleName)) {
				this.operator = sqlMatchRule.getOperator().trim();
			}
			// Only rules whose parameter is absent, and which name a
			// configured field, get a value computed from the rule.
			if (ps.exists(ruleName))
				continue;
			if (StringHelper.isNotEmpty(ps.getString(ruleName)))
				continue;
			if (luceneStore.getFields().getField(ruleName) == null)
				continue;
			ps.setDataType(ruleName, sqlMatchRule.getDataType());
			ps.setValue(ruleName, QLExpressHelper.getRunner().execute(
					(String) der.evaluate(null, sqlMatchRule.getRightSide()),
					null, null, false, false));
		}

		boolean addQuery = false;
		BooleanQuery booleanQuery = new BooleanQuery();
		Query query = booleanQuery;
		for (int i = 0, ilen = ps.count(); i < ilen; ++i) {
			String name = ps.indexToName(i);
			if (name.equalsIgnoreCase(__SEARCHTEXT))
				continue;
			if (luceneStore.getFields().getField(name) == null)
				continue;
			String value = ps.getString(i);
			if (StringHelper.isEmpty(value))
				continue;
			Query q1 = BWSQueryParser.parseMultiField(new String[] { name },
					value,
					new BooleanClause.Occur[] { BooleanClause.Occur.SHOULD },
					false);
			if (q1 != null) {
				booleanQuery.add(q1, BooleanClause.Occur.MUST);
				addQuery = true;
			}
		}

		// The free-text value to search for.
		String queryValue = ps.getString(__SEARCHTEXT);

		if (!StringHelper.isEmpty(queryValue)) {
			// Passed to the parser as its exact-match flag.
			boolean exactMatch = operator.equalsIgnoreCase("=");
			if (!exactMatch) {
				// Fuzzy search: append the whole input as a quoted phrase
				// (runs of whitespace joined by "-") to raise its weight.
				// The character class also matches a literal '|'.
				StringBuilder buffers = new StringBuilder();
				buffers.append(queryValue);
				buffers.append(" ");
				buffers.append("\"");
				buffers.append(queryValue.replaceAll("[\\s|　]+", "-"));
				buffers.append("\"");
				queryValue = buffers.toString();
			}

			Query q2 = BWSQueryParser.parseMultiField(indexFields, queryValue,
					clauses, exactMatch);
			// Same guard as for the per-field queries above.
			if (q2 != null) {
				booleanQuery.add(q2, BooleanClause.Occur.MUST);
				addQuery = true;
			}
		}
		if (!addQuery)
			throw new NoKeywordsException("查询关键字不能为空！");

		int topSize = 100;
		if (pageSize > 0) {
			topSize = pageIndex > 0 ? pageSize * pageIndex : pageSize;
		}

		Sort sort = null;
		ArrayList<SortField> sortFields = new ArrayList<SortField>();
		for (Iterator<SortRule> itr = luceneStore.getSortRules().iterator(); itr
				.hasNext();) {
			SortRule sortRule = itr.next();
			Field f = luceneStore.getFields().getField(sortRule.getFieldName());
			if (f != null) {
				sortFields.add(new SortField(f.getName(), SortField.STRING,
						sortRule.isDescent()));
			}
		}
		if (sortFields.size() > 0) {
			SortField[] sortArray = new SortField[sortFields.size()];
			sort = new Sort(sortFields.toArray(sortArray));
		}

		// The duplicate-filtering wrapper below is not suitable for
		// highlighting, so keep a copy of the plain query first.
		setUseHighLightQuery((Query) query.clone());

		if (StringHelper.isNotEmpty(uniqueField)) {
			// The filter only works together with the wrapping query.
			DuplicateExtendFilter filter = new DuplicateExtendFilter(
					uniqueField);
			query = new DuplicateQuery(query, filter);
		}
		if (sort == null)
			return getIndexSearcher().search(query, null, topSize);
		else
			return getIndexSearcher().search(query, null, topSize, sort);
	}

	/**
	 * @return the query saved by the last {@code execSearch} call for
	 *         highlighting, or {@code null} if none has been saved
	 */
	public Query getUseHighLightQuery() {
		return useHighLightQuery;
	}

	/**
	 * @param useHighLightQuery
	 *            the query to use for highlighting
	 */
	public void setUseHighLightQuery(Query useHighLightQuery) {
		this.useHighLightQuery = useHighLightQuery;
	}


}

将以上两个类与公司的建模端结合使用:建模端配置的参数不同,就会创建不同的任务(Job),并按不同的策略建立索引。

分享到：

Web 2.0应用客户端性能问题十大根源 | 任务调度---Quartz

2010-08-31 10:01
浏览 2826
评论(1)
分类:编程语言
查看更多

1 楼 chhbwf 2011-09-05

很好的文章！！！

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

Lucene---全文检索(数据库)

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

Lucene---全文检索(数据库)

评论

发表评论

相关推荐

Solr研究

Lucene---全文检索(处理一对多去重问题 )

Lucene---全文检索(问题分析)

Lucene---全文检索(文档pdf/txt/office/html)

最近访客更多访客>>