retrography
1/26/2013 - 4:53 AM

Lucene 4.1 querying sample

Lucene 4.1 querying sample

import static org.junit.Assert.*;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Locale;
import java.util.Properties;

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.junit.Before;
import org.junit.Test;

public class QueryTest {
	
	@Test
	public void testTermQuery() throws IOException {
		Directory dir = new RAMDirectory();
		new BookDataIndexer(dir, "bookdata").indexBookData();
		IndexReader reader = DirectoryReader.open(dir);
		IndexSearcher searcher = new IndexSearcher(reader);
		
		// isbn が "0854402624" に一致するドキュメントを検索
		Term t = new Term("isbn", "0854402624");
		Query query = new TermQuery(t);
		TopDocs docs = searcher.search(query, 10);
		assertEquals(1, docs.totalHits);
	}
	
	@Test
	public void testTermRangeQuery() throws IOException {
		Directory dir = new RAMDirectory();
		new BookDataIndexer(dir, "bookdata").indexBookData();
		IndexReader reader = DirectoryReader.open(dir);
		IndexSearcher searcher = new IndexSearcher(reader);

		// title2 が(辞書順で) "d" より大きく、"j" より小さいドキュメントを検索
		// 上限値と下限値は範囲に含まれる
		BytesRef lower = new BytesRef("d");
		BytesRef upper = new BytesRef("j");
		TermRangeQuery query = new TermRangeQuery("title2", lower, upper, true, true);
		TopDocs docs = searcher.search(query, 10);
		assertEquals(3, docs.totalHits);
	}
	
	@Test
	public void testNumericRangeQuery() throws IOException {
		Directory dir = new RAMDirectory();
		new BookDataIndexer(dir, "bookdata").indexBookData();
		IndexReader reader = DirectoryReader.open(dir);
		IndexSearcher searcher = new IndexSearcher(reader);

		// pubmonth が 200406 より大きく、 200409 より小さいドキュメントを検索
		// 上限値と下限値は範囲に含まれる
		NumericRangeQuery<Integer> query = 
				NumericRangeQuery.newIntRange("pubmonth", 200406, 200409, true, true);
		TopDocs docs = searcher.search(query, 10);
		assertEquals(1, docs.totalHits);
	}
	@Test
	public void testPrefixQueryTest() throws IOException {
		Directory dir = new RAMDirectory();
		new BookDataIndexer(dir, "bookdata").indexBookData();
		IndexReader reader = DirectoryReader.open(dir);
		IndexSearcher searcher = new IndexSearcher(reader);

		Term t = new Term("category", "/technology/computers/programming");
		
		// category が "/technology/computers/programming" から始まるドキュメントを検索
		PrefixQuery query = new PrefixQuery(t);
		TopDocs docs = searcher.search(query, 10);
		int programmingAndBelow = docs.totalHits;
		
		// category が "/technology/computers/programming" に一致するドキュメントを検索
		docs = searcher.search(new TermQuery(t), 10);
		int justProgramming = docs.totalHits;
		
		// ヒット数が違う
		assertTrue(programmingAndBelow > justProgramming);
	}
	
	@Test
	public void testBooleanQueryAnd() throws IOException {
		Directory dir = new RAMDirectory();
		new BookDataIndexer(dir, "bookdata").indexBookData();
		IndexReader reader = DirectoryReader.open(dir);
		IndexSearcher searcher = new IndexSearcher(reader);

		// AND 検索
		TermQuery searchingBooks =
				new TermQuery(new Term("subject", "search"));
		Query books2004 =
				NumericRangeQuery.newIntRange("pubmonth", 200401, 200412, true, true);
		BooleanQuery query = new BooleanQuery();
		// Occur.MUST を指定
		query.add(searchingBooks, Occur.MUST); 
		query.add(books2004, Occur.MUST);
		TopDocs docs = searcher.search(query, 10);
		assertEquals(1, docs.totalHits);
		assertTrue(hitsIncludeTitle(searcher, docs, "Lucene in Action"));
	}
	
	@Test
	public void testBooleanQueryOr() throws IOException {
		Directory dir = new RAMDirectory();
		new BookDataIndexer(dir, "bookdata").indexBookData();
		IndexReader reader = DirectoryReader.open(dir);
		IndexSearcher searcher = new IndexSearcher(reader);

		// OR 検索
		TermQuery methodologyBooks = 
				new TermQuery(new Term("category", "/technology/computers/programming/methodology"));
		TermQuery easternPhilosophyBooks =
				new TermQuery(new Term("category", "/philosophy/eastern"));
		BooleanQuery query = new BooleanQuery();
		// Occur.SHOULD を指定
		query.add(methodologyBooks, Occur.SHOULD);
		query.add(easternPhilosophyBooks, Occur.SHOULD);
		TopDocs docs = searcher.search(query, 10);
		assertTrue(hitsIncludeTitle(searcher, docs, "Extreme Programming Explained"));
		assertTrue(hitsIncludeTitle(searcher, docs, "Tao Te Ching 道德經"));
	}

	private boolean hitsIncludeTitle(IndexSearcher searcher, TopDocs hits, String title) 
			throws IOException {
		for (ScoreDoc match : hits.scoreDocs) {
			Document doc = searcher.doc(match.doc);
			if (title.equals(doc.get("title"))) {
				return true;
			}
		}
		return false;
	}
	
	@Test
	public void testPhraseQuery() throws IOException {
		Directory dir = new RAMDirectory();
		IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_41, 
				new WhitespaceAnalyzer(Version.LUCENE_41));
		IndexWriter writer = new IndexWriter(dir, config);
		Document doc = new Document();
		doc.add(new TextField("field", 
				"the quick brown fox jumped over the lazy dog",
				Store.YES));
		writer.addDocument(doc);
		writer.close();
		IndexReader reader = DirectoryReader.open(dir);
		IndexSearcher searcher = new IndexSearcher(reader);
		
		// "quick" のすぐ後に "fox" が出現するドキュメントを検索
		assertFalse(matched(searcher, new String[]{"quick", "fox"}, 0));
		// "quick" と "fox" が距離1語以内で出現するドキュメントを検索
		assertTrue(matched(searcher, new String[]{"quick", "fox"}, 1));

		// "quick" と "jumped" と "lazy" が距離3語以内で出現するドキュメントを検索
		assertFalse(matched(searcher, new String[]{"quick", "jumped", "lazy"}, 3));
		// "quick" と "jumped" と "lazy" が距離4語以内で出現するドキュメントを検索
		assertTrue(matched(searcher, new String[]{"quick", "jumped", "lazy"}, 4));
	}

	private boolean matched(IndexSearcher searcher, String[] phrase, int slop) throws IOException {
		PhraseQuery query = new PhraseQuery();
		query.setSlop(slop);
		for (String word : phrase) {
			query.add(new Term("field", word));
		}
		TopDocs docs = searcher.search(query, 10);
		return docs.totalHits > 0;
	}
	
	@Test
	public void testWildcardQuery() throws IOException {
		Directory dir = new RAMDirectory();
		IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_41, 
				new WhitespaceAnalyzer(Version.LUCENE_41));
		IndexWriter writer = new IndexWriter(dir, config);
		String[] fields = new String[]{"wild", "child", "mild", "mildew"};
		for (String field : fields) {
			Document doc = new Document();
			doc.add(new TextField("contents", field, Store.YES));
			writer.addDocument(doc);
		}
		writer.close();
		
		IndexReader reader = DirectoryReader.open(dir);
		IndexSearcher searcher = new IndexSearcher(reader);
		// パターン "?ild*" にマッチするドキュメントを検索
		WildcardQuery query = new WildcardQuery(new Term("contents", "?ild*"));
		TopDocs docs = searcher.search(query, 10);
		assertEquals(3, docs.totalHits);
		assertEquals(docs.scoreDocs[0].score, docs.scoreDocs[1].score, 0.0);
		assertEquals(docs.scoreDocs[1].score, docs.scoreDocs[2].score, 0.0);
	}
	
	@Test
	public void testFuzzyQuery() throws IOException {
		Directory dir = new RAMDirectory();
		IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_41, 
				new WhitespaceAnalyzer(Version.LUCENE_41));
		IndexWriter writer = new IndexWriter(dir, config);
		String[] fields = new String[]{"fuzzy", "wuzzy"};
		for (String field : fields) {
			Document doc = new Document();
			doc.add(new TextField("contents", field, Store.YES));
			writer.addDocument(doc);
		}
		writer.close();
		
		IndexReader reader = DirectoryReader.open(dir);
		IndexSearcher searcher = new IndexSearcher(reader);
		// "wuzza" と編集距離が近いドキュメントを検索
		FuzzyQuery query = new FuzzyQuery(new Term("contents", "wuzza"));
		TopDocs docs = searcher.search(query, 10);
		assertEquals(2, docs.totalHits);
		assertTrue(docs.scoreDocs[0].score != docs.scoreDocs[1].score);
		// "wuzzy" がより "wuzza" に近いのでスコアが高い
		Document doc = searcher.doc(docs.scoreDocs[0].doc);
		assertEquals("wuzzy", doc.get("contents"));
	}
}

/**
 * Builds the index used by the tests from the sample book data.
 *
 * <p>The data can be downloaded from the "Lucene in Action" support site:
 * http://www.manning.com/hatcher3/
 *
 * <p>Every non-directory file found below the base directory is loaded as a
 * {@link Properties} file and becomes one document. The document's "category"
 * is the path of the file's parent directory relative to the base directory,
 * written with '/' as the separator.
 */
class BookDataIndexer {

	private final Directory directory;
	private final String baseDir;

	/**
	 * @param directory Lucene directory the index is written to
	 * @param baseDir   path of the directory that contains the book data files
	 */
	public BookDataIndexer(Directory directory, String baseDir) {
		this.directory = directory;
		this.baseDir = baseDir;
	}

	/**
	 * Indexes every file below the base directory and closes the writer.
	 *
	 * @throws IOException if the base directory does not exist or is not a
	 *                     directory, or if reading or indexing a file fails
	 */
	public void indexBookData() throws IOException {
		File root = new File(baseDir);
		if (!root.isDirectory()) {
			throw new IOException("Book data directory not found: " + root.getAbsolutePath());
		}
		IndexWriter writer = getWriter();
		try {
			// Measure the normalised path (not the raw baseDir string) so that a
			// trailing separator in baseDir does not eat the category's leading '/'.
			indexBookData(writer, root, root.getPath().length());
		} finally {
			// Always close the writer, even if indexing fails part-way.
			writer.close();
		}
	}

	/**
	 * Recursively indexes {@code file}: directories are descended into, any
	 * other file is converted to a document and added to the writer.
	 *
	 * @param writer         writer receiving the documents
	 * @param file           file or directory to process
	 * @param rootPathLength length of the base directory's path, stripped from
	 *                       the parent path to obtain the category
	 * @throws IOException if a directory cannot be listed or a file cannot be read
	 */
	private void indexBookData(IndexWriter writer, File file, int rootPathLength) throws IOException {
		if (file.isDirectory()) {
			File[] children = file.listFiles();
			if (children == null) {
				// listFiles() returns null when the directory cannot be read.
				throw new IOException("Cannot list directory: " + file);
			}
			for (File child : children) {
				indexBookData(writer, child, rootPathLength);
			}
		} else {
			String category = file.getParent().substring(rootPathLength)
					.replace(File.separatorChar, '/');
			writer.addDocument(getDocument(category, file));
		}
	}

	/**
	 * Creates a writer on the target directory that uses {@link StandardAnalyzer}.
	 *
	 * @return a new writer; the caller must close it
	 * @throws IOException if the writer cannot be opened
	 */
	private IndexWriter getWriter() throws IOException {
		IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_41,
				new StandardAnalyzer(Version.LUCENE_41));
		return new IndexWriter(directory, config);
	}

	/**
	 * Loads one properties file and maps it to a Lucene document.
	 *
	 * @param category category assigned to the document
	 * @param file     properties file with the keys isbn, title, author, url,
	 *                 subject and pubmonth
	 * @return the document built from the file
	 * @throws IOException if the file cannot be read
	 */
	private Document getDocument(String category, File file) throws IOException {
		Properties props = new Properties();
		FileInputStream in = new FileInputStream(file);
		try {
			props.load(in);
		} finally {
			// Properties.load does not close the stream it reads from.
			in.close();
		}
		String isbn = props.getProperty("isbn");
		String title = props.getProperty("title");
		String author = props.getProperty("author");
		String url = props.getProperty("url");
		String subject = props.getProperty("subject");
		String pubmonth = props.getProperty("pubmonth");

		Document doc = new Document();
		doc.add(new StringField("isbn", isbn, Store.YES));
		doc.add(new StringField("category", category, Store.YES));
		doc.add(new StringField("title", title, Store.YES));
		// Lower-cased, unstored copy of the title. Locale.ROOT keeps the result
		// independent of the JVM's default locale.
		doc.add(new StringField("title2", title.toLowerCase(Locale.ROOT), Store.NO));
		// One "author" field instance per comma-separated author.
		for (String val : author.split(",")) {
			doc.add(new StringField("author", val, Store.YES));
		}
		doc.add(new StoredField("url", url));
		doc.add(new TextField("subject", subject, Store.NO));
		doc.add(new IntField("pubmonth", Integer.parseInt(pubmonth), Store.YES));
		// Catch-all field combining title, subject and author.
		String contents = title + " " + subject + " " + author;
		doc.add(new TextField("contents", contents, Store.NO));
		return doc;
	}
}