`
DavyJones2010
  • 浏览: 148056 次
  • 性别: Icon_minigender_1
  • 来自: 杭州
社区版块
存档分类
最新评论

Lucene: Introduction to Lucene (Part I)

阅读更多

1. Why do we use Lucene?

    1) If we want to execute the query like this:

        (content like '%DataStructure%') or (content like '%XMU%') in a DB, the database has to scan the whole content from start to end, which is very inefficient.

        Lucene solves this by building an index over the whole content. To execute a query like the one above, we only have to search the index files rather than the real content, which is much more efficient.

    2) If we want to search the content inside attachments, it would be impossible using DB technology.

 

2. The versions of Lucene

    1) 2.9-Core

    2) 3.0-Core --> There is a big difference from 2.9

    3) 3.5-Core --> There are some big differences from 3.0

 

3. All full-text indexing tools consist of three parts:

    1) Index part ---> What kind of information should be stored in index files?

                         ---> Eg. (I am a boy.) Should 'a' be stored in index files?

    2) Tokenization (word segmentation) part ---> How should the sentence be broken into terms?

    3) Search part---> How should the sentence be searched in index file?

 

4. An example of creating an index using Lucene

    1. Core function

package edu.xmu.lucene.Lucene_ModuleOne;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;

/**
 * Minimal Lucene 3.5 example: builds a file-system index over a directory of
 * text files.
 */
public class App
{
	/**
	 * Indexes every regular file located directly under {@code E:/LuceneData}
	 * and writes the index to {@code E:/LuceneIndex}.
	 * <p>
	 * For each file three fields are added to its {@link Document}:
	 * {@code content} (built from a {@link FileReader} over the file),
	 * {@code name} and {@code path} (both stored, not analyzed).
	 * <p>
	 * The writer and the directory are always closed, even when indexing
	 * fails part-way, so the index write lock is not left behind.
	 * 
	 * @throws CorruptIndexException
	 *             if the existing index is corrupt
	 * @throws LockObtainFailedException
	 *             if another writer holds the index write lock
	 * @throws IOException
	 *             if the data directory cannot be listed or any low-level
	 *             I/O error occurs
	 */
	public void buildIndex() throws CorruptIndexException,
			LockObtainFailedException, IOException
	{
		// Resolve the input files first so that a missing data directory is
		// reported before the index is opened (and locked).
		// Each file in this directory becomes one Document; its name, path
		// and content become the Fields of that Document.
		File dataDir = new File("E:/LuceneData");
		File[] files = dataDir.listFiles();
		if (files == null)
		{
			// listFiles() returns null when the path is not a directory or
			// cannot be read; fail with a clear message instead of an NPE.
			throw new IOException("Cannot list files in data directory: "
					+ dataDir.getAbsolutePath());
		}

		// 1. Create Directory
		// --> Where should the index be stored? Memory or hard disk?
		// Directory dir = new RAMDirectory(); --> index kept in memory
		Directory dir = FSDirectory.open(new File("E:/LuceneIndex"));
		try
		{
			// 2. Create IndexWriter
			// --> It is used to write data into the index files.
			// Before 3.5 the (now deprecated) way to create a writer was:
			// new IndexWriter(Directory d, Analyzer a, boolean c,
			// MaxFieldLength mfl);
			// d: Directory, a: Analyzer, c: should a new index be created
			// each time, mfl: the max length of the field to be indexed.
			IndexWriterConfig config = new IndexWriterConfig(
					Version.LUCENE_35,
					new StandardAnalyzer(Version.LUCENE_35));
			IndexWriter writer = new IndexWriter(dir, config);
			try
			{
				for (File file : files)
				{
					// Sub-directories cannot be opened with FileReader.
					if (!file.isFile())
					{
						continue;
					}

					// 3. Create Document
					// --> The target we want to search may be a file or a
					// row of a DB table; all of its information goes into
					// one Document.
					Document doc = new Document();

					// 4. Each item of the Document is called a Field.
					// --> Document and Field relate like table row and cell.
					doc.add(new Field("content", new FileReader(file)));
					// Field.Store.YES --> the value is stored in the index
					// Field.Index.ANALYZED --> the value would be tokenized;
					// NOT_ANALYZED indexes the value as a single term.
					doc.add(new Field("name", file.getName(),
							Field.Store.YES, Field.Index.NOT_ANALYZED));
					doc.add(new Field("path", file.getAbsolutePath(),
							Field.Store.YES, Field.Index.NOT_ANALYZED));

					// 5. Write the Document into the index.
					writer.addDocument(doc);
				}
			}
			finally
			{
				// 6. Close the IndexWriter (also on failure).
				writer.close();
			}
		}
		finally
		{
			dir.close();
		}
	}
}

   2. Test Case

package edu.xmu.lucene.Lucene_ModuleOne;

import java.io.IOException;

import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.LockObtainFailedException;
import org.junit.Test;

/**
 * Unit test for {@link App}.
 */
public class AppTest
{
	/**
	 * Runs {@link App#buildIndex()} and fails if it throws.
	 * <p>
	 * Exceptions are deliberately not caught here: catching them and only
	 * printing the stack trace would let the test pass even though indexing
	 * failed.
	 */
	@Test
	public void buildIndex() throws CorruptIndexException,
			LockObtainFailedException, IOException
	{
		App app = new App();
		app.buildIndex();
	}
}

 

5. An Example of Query Using Index Files

    1. Core Function of Query

package edu.xmu.lucene.Lucene_ModuleOne;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;

/**
 * Minimal Lucene 3.5 example: builds a file-system index over a directory of
 * text files and runs a keyword query against it.
 */
public class App
{
	/**
	 * Indexes every regular file located directly under {@code E:/LuceneData}
	 * and writes the index to {@code E:/LuceneIndex}.
	 * <p>
	 * For each file three fields are added to its {@link Document}:
	 * {@code content} (built from a {@link FileReader} over the file),
	 * {@code name} and {@code path} (both stored, not analyzed).
	 * <p>
	 * The writer and the directory are always closed, even when indexing
	 * fails part-way, so the index write lock is not left behind.
	 * 
	 * @throws CorruptIndexException
	 *             if the existing index is corrupt
	 * @throws LockObtainFailedException
	 *             if another writer holds the index write lock
	 * @throws IOException
	 *             if the data directory cannot be listed or any low-level
	 *             I/O error occurs
	 */
	public void buildIndex() throws CorruptIndexException,
			LockObtainFailedException, IOException
	{
		// Resolve the input files first so that a missing data directory is
		// reported before the index is opened (and locked).
		// Each file in this directory becomes one Document; its name, path
		// and content become the Fields of that Document.
		File dataDir = new File("E:/LuceneData");
		File[] files = dataDir.listFiles();
		if (files == null)
		{
			// listFiles() returns null when the path is not a directory or
			// cannot be read; fail with a clear message instead of an NPE.
			throw new IOException("Cannot list files in data directory: "
					+ dataDir.getAbsolutePath());
		}

		// 1. Create Directory
		// --> Where should the index be stored? Memory or hard disk?
		// Directory dir = new RAMDirectory(); --> index kept in memory
		Directory dir = FSDirectory.open(new File("E:/LuceneIndex"));
		try
		{
			// 2. Create IndexWriter
			// --> It is used to write data into the index files.
			// Before 3.5 the (now deprecated) way to create a writer was:
			// new IndexWriter(Directory d, Analyzer a, boolean c,
			// MaxFieldLength mfl);
			// d: Directory, a: Analyzer, c: should a new index be created
			// each time, mfl: the max length of the field to be indexed.
			IndexWriterConfig config = new IndexWriterConfig(
					Version.LUCENE_35,
					new StandardAnalyzer(Version.LUCENE_35));
			IndexWriter writer = new IndexWriter(dir, config);
			try
			{
				for (File file : files)
				{
					// Sub-directories cannot be opened with FileReader.
					if (!file.isFile())
					{
						continue;
					}

					// 3. Create Document
					// --> The target we want to search may be a file or a
					// row of a DB table; all of its information goes into
					// one Document.
					Document doc = new Document();

					// 4. Each item of the Document is called a Field.
					// --> Document and Field relate like table row and cell.
					doc.add(new Field("content", new FileReader(file)));
					// Field.Store.YES --> the value is stored in the index
					// Field.Index.ANALYZED --> the value would be tokenized;
					// NOT_ANALYZED indexes the value as a single term.
					doc.add(new Field("name", file.getName(),
							Field.Store.YES, Field.Index.NOT_ANALYZED));
					doc.add(new Field("path", file.getAbsolutePath(),
							Field.Store.YES, Field.Index.NOT_ANALYZED));

					// 5. Write the Document into the index.
					writer.addDocument(doc);
				}
			}
			finally
			{
				// 6. Close the IndexWriter (also on failure).
				writer.close();
			}
		}
		finally
		{
			dir.close();
		}
	}

	/**
	 * Searches the index at {@code E:/LuceneIndex} for documents whose
	 * {@code content} field matches the query string {@code java}, and prints
	 * the number of returned hits (at most 10) followed by the stored
	 * {@code path} of each hit to standard output.
	 * <p>
	 * The reader and the directory are closed even if parsing or searching
	 * fails.
	 * 
	 * @throws IOException
	 *             if the index cannot be opened or read
	 * @throws ParseException
	 *             if the query string cannot be parsed
	 */
	public void search() throws IOException, ParseException
	{
		// 1. Create Directory
		Directory dir = FSDirectory.open(new File("E:/LuceneIndex"));
		try
		{
			// 2. Create IndexReader
			IndexReader reader = IndexReader.open(dir);
			try
			{
				// 3. Create IndexSearcher using IndexReader
				IndexSearcher searcher = new IndexSearcher(reader);

				// 4. Create query for search
				// Search the documents whose content contains the key word
				// 'java'. The same analyzer is used as at indexing time.
				QueryParser parser = new QueryParser(Version.LUCENE_35,
						"content", new StandardAnalyzer(Version.LUCENE_35));
				Query query = parser.parse("java");

				// 5. Execute query and return TopDocs
				// param1: the query to be executed
				// param2: the maximum number of result items
				TopDocs topDocs = searcher.search(query, 10);

				// 6. Get ScoreDocs from TopDocs
				ScoreDoc[] docs = topDocs.scoreDocs;
				System.out.println("Hits: " + docs.length);
				for (ScoreDoc scoreDoc : docs)
				{
					// 7. Load the Document by its internal document id
					Document d = searcher.doc(scoreDoc.doc);

					// 8. Read the stored "path" field of the Document
					System.out.println("File Name : " + d.get("path"));
				}
			}
			finally
			{
				// 9. Close the reader (also on failure).
				reader.close();
			}
		}
		finally
		{
			dir.close();
		}
	}
}

     2. Test Case

package edu.xmu.lucene.Lucene_ModuleOne;

import java.io.IOException;

import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.store.LockObtainFailedException;
import org.junit.Test;

/**
 * Unit tests for {@link App}.
 * <p>
 * Exceptions are deliberately not caught in the tests: catching them and only
 * printing the stack trace would let a test pass even though the operation
 * under test failed.
 */
public class AppTest
{
	/**
	 * Runs {@link App#buildIndex()} and fails if it throws.
	 */
	@Test
	public void buildIndex() throws CorruptIndexException,
			LockObtainFailedException, IOException
	{
		App app = new App();
		app.buildIndex();
	}

	/**
	 * Runs {@link App#search()} and fails if it throws.
	 */
	@Test
	public void search() throws IOException, ParseException
	{
		App app = new App();
		app.search();
	}
}

 

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics