发新话题

研究lucene几天的一点心得

首先感谢老紫竹同志给我一个机会让我正式地第一次发帖(呵呵,我在csdn上经常潜水,不是我不想回答,而是别人回答的已经很好了),快毕业了,老师分任务的时候让我做网站的搜索引擎,当时只听了一个单词叫lucene,只知道要用这个技术,得,回来就开始研究,找资料,问了好多人,都说没整过这玩意,五一节就是在这种郁闷中过的,后来跟老紫竹同志交谈,敢情人家都不用lucene,用数据库自带的,不知道自己是不是白研究了,不过感觉研究这个东西还是很光荣的,好多人都不知道(小人得志),其实我的想法是面试的时候就跟别人说我会lucene,希望能增加别人的好感,说实话,不知道自己达到什么水平才能在外面很顺利的找到事情,也不知道该去哪个地方闯,我现在是在武汉,反正就是不想留在这,说了很多废话,希望路过的大哥大姐们给我点意见,好了,我就开始说lucene 吧;

数据库users表
id integer primary key
name varchar(50)

---------------------
java代码
连接类
package test.lucene;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
/**
 * Holds one lazily opened JDBC connection to the local MySQL {@code test}
 * database and offers a helper for releasing JDBC resources.
 */
public class DBAccess {
  private static final String DRIVER_CLASS = "com.mysql.jdbc.Driver";
  private static final String JDBC_URL = "jdbc:mysql://localhost:3306/test";

  /** The shared connection; {@code null} until the first successful connect. */
  private static Connection conn = null;
  private static DBAccess db = null;

  /** Connects with the default account. */
  private DBAccess() throws ClassNotFoundException, SQLException {
    this("root", "admin");
  }

  /** Loads the driver and opens the shared connection with the given account. */
  private DBAccess(String userName, String pwd) throws ClassNotFoundException, SQLException {
    Class.forName(DRIVER_CLASS);
    conn = DriverManager.getConnection(JDBC_URL, userName, pwd);
  }

  /**
   * Returns the shared connection, opening it on first use and re-opening it
   * when the previous one has been closed.
   *
   * <p>The method is synchronized so that two threads cannot both run the lazy
   * initialisation at the same time.
   *
   * @return the shared connection, or {@code null} if no connection has ever
   *         been established (the failure is printed to standard error)
   */
  public static synchronized Connection getConnection() {
    try {
      // Reconnect after close(...) was called on the shared connection;
      // otherwise every later caller would receive an unusable connection.
      if (db == null || conn == null || conn.isClosed()) {
        db = new DBAccess();
      }
    } catch (ClassNotFoundException e) {
      e.printStackTrace();
    } catch (SQLException e) {
      e.printStackTrace();
    }
    return conn;
  }

  /**
   * Closes the given resources in the order result set, prepared statement,
   * statement, connection. Every argument may be {@code null}.
   *
   * <p>Each resource is closed independently: a failure while closing one of
   * them is printed and does not prevent the remaining ones from being closed.
   *
   * @param rs   result set to close, or {@code null}
   * @param ps   prepared statement to close, or {@code null}
   * @param st   statement to close, or {@code null}
   * @param conn connection to close, or {@code null}
   */
  public static void close(ResultSet rs, PreparedStatement ps, Statement st, Connection conn) {
    if (rs != null) {
      try {
        rs.close();
      } catch (SQLException e) {
        e.printStackTrace();
      }
    }
    if (ps != null) {
      try {
        ps.close();
      } catch (SQLException e) {
        e.printStackTrace();
      }
    }
    if (st != null) {
      try {
        st.close();
      } catch (SQLException e) {
        e.printStackTrace();
      }
    }
    if (conn != null) {
      try {
        conn.close();
      } catch (SQLException e) {
        e.printStackTrace();
      }
    }
  }

  /** Smoke test: prints the shared connection (or {@code null} on failure). */
  public static void main(String[] args) {
    System.out.println(DBAccess.getConnection());
  }
}
测试类
package test.lucene;
import java.io.IOException;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Date;
import java.util.Timer;
import java.util.TimerTask;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.FSDirectory;
/**
 * Demo that rebuilds a Lucene index from the {@code users} table once a
 * minute and answers wildcard searches against the {@code name} field.
 */
public final class LuceneTest {
  final String path = "d:/index/";// physical location of the index files

  /**
   * Runs {@code sql} on the shared {@link DBAccess} connection.
   *
   * @param sql the query to execute
   * @return the open result set, or {@code null} when no connection is
   *         available or the query fails; the caller is responsible for
   *         closing the result set and its statement
   */
  public synchronized ResultSet getResult(String sql) {
    Connection conn = DBAccess.getConnection();
    if (conn == null) {
      // DBAccess already reported why the connection could not be opened.
      return null;
    }
    Statement stmt = null;
    ResultSet rs = null;
    try {
      stmt = conn.createStatement();
      rs = stmt.executeQuery(sql);
    } catch (SQLException e) {
      e.printStackTrace();
      // No result set is handed out, so nobody else could close the statement.
      DBAccess.close(null, null, stmt, null);
    }
    return rs;
  }

  /**
   * Rebuilds the index at {@link #path} from scratch using the rows of
   * {@code rs}. Each row becomes one document with an {@code id} keyword
   * field and a tokenized {@code name} field.
   *
   * <p>The result set is read but not closed here. I/O and SQL failures are
   * printed to standard output, as before.
   *
   * @param rs rows with the columns {@code id} and {@code name}; when
   *           {@code null}, nothing is indexed
   */
  public synchronized void Index(ResultSet rs) {
    if (rs == null) {
      System.out.println("结果集为空,跳过本次索引重建");
      return;
    }
    FSDirectory fsDir = null;
    IndexWriter writer = null;
    try {
      fsDir = FSDirectory.getDirectory(path, true);
      writer = new IndexWriter(fsDir, getAnalyzer(), true);
      Date start = new Date();
      while (rs.next()) {
        Document doc = new Document();// one document corresponds to one table row
        // The id field stores the database id. Unlike a table column, a
        // Lucene field of one document may hold several values.
        doc.add(Field.Keyword("id", rs.getString("id")));
        // The users table declares the column as "name" (not "userName").
        String name = rs.getString("name");
        // name is nullable in the table; index an empty string rather than
        // passing null as a field value.
        doc.add(Field.Text("name", name == null ? "" : name));
        writer.addDocument(doc);
      }
      writer.optimize();
      // Closing the writer writes the buffered documents to the index files.
      writer.close();
      writer = null;// closed successfully, nothing left for the finally block
      Date end = new Date();
      System.out.println("索引建立成功!!!!" + "用时" + (end.getTime() - start.getTime()) + "毫秒");
    } catch (IOException e) {
      System.out.println(e);
    } catch (SQLException e) {
      System.out.println(e);
    } finally {
      closeIndex(writer, fsDir);
    }
  }

  /** Closes the writer first and the directory afterwards; either may be {@code null}. */
  private void closeIndex(IndexWriter writer, FSDirectory fsDir) {
    try {
      try {
        if (writer != null) {
          writer.close();
        }
      } finally {
        if (fsDir != null) {
          fsDir.close();
        }
      }
    } catch (IOException e) {
      System.out.println(e);
    }
  }

  /** Returns the analyzer used when building the index. */
  public synchronized Analyzer getAnalyzer() {
    return new StandardAnalyzer();
  }

  /**
   * Searches the {@code name} field for terms containing {@code queryString}
   * (wildcard query {@code *queryString*}).
   *
   * @param queryString the keyword to look for
   * @return the hits, or {@code null} if the search failed (the failure is
   *         printed to standard output)
   */
  public Hits seacher(String queryString) {
    Hits hits = null;
    Searcher searcher = null;
    try {
      Query query = new WildcardQuery(new Term("name", "*" + queryString + "*"));
      // Search the same directory that Index(...) writes to.
      searcher = new IndexSearcher(path);
      hits = searcher.search(query);
    } catch (Exception e) {
      System.out.print(e);
    } finally {
      // NOTE(review): the returned Hits is read by the caller after this
      // searcher has been closed; confirm against the Lucene version in use
      // that Hits.doc(i) still works on a closed searcher.
      if (searcher != null) {
        try {
          searcher.close();
        } catch (Exception e) {
          System.out.print(e);
        }
      }
    }
    return hits;
  }

  /** Closes {@code rs} together with the statement that produced it; {@code rs} may be {@code null}. */
  private static void releaseResult(ResultSet rs) {
    if (rs == null) {
      return;
    }
    Statement stmt = null;
    try {
      stmt = rs.getStatement();
    } catch (SQLException e) {
      e.printStackTrace();
    }
    // The shared connection is deliberately left open for the next run.
    DBAccess.close(rs, null, stmt, null);
  }

  /**
   * Starts a timer that rebuilds the index immediately and then every minute,
   * because the data in the database keeps changing.
   */
  public static void main(String[] args) {
    final Timer timer = new Timer();
    timer.schedule(new TimerTask() {
          public synchronized void run() {
            LuceneTest lt = new LuceneTest();
            ResultSet rs = lt.getResult("select * from users");
            try {
              lt.Index(rs);
            } finally {
              // Without this, every run would leave a statement and a result set open.
              releaseResult(rs);
            }
          }
        }, 0, 60 * 1000);
  }
}
页面部分
<%@ page language="java" import="java.util.*" pageEncoding="gbk"%>


<%@ page import="test.lucene.LuceneTest"%>
<%@ page import="org.apache.lucene.search.*,org.apache.lucene.document.*" %>
<%!
  /**
   * Escapes the characters that are significant in HTML text and attribute
   * values so that indexed data cannot inject markup into the page.
   * A null argument yields an empty string.
   */
  private static String escapeHtml(String text) {
    if (text == null) {
      return "";
    }
    StringBuffer out = new StringBuffer(text.length() + 16);
    for (int i = 0; i < text.length(); i++) {
      char c = text.charAt(i);
      switch (c) {
        case '&': out.append("&amp;"); break;
        case '<': out.append("&lt;"); break;
        case '>': out.append("&gt;"); break;
        case '"': out.append("&quot;"); break;
        case '\'': out.append("&#39;"); break;
        default: out.append(c);
      }
    }
    return out.toString();
  }
%>
<%
String path = request.getContextPath();
String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
%>

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
  <head>
    <base href="<%=escapeHtml(basePath)%>">

    <title>helloworld lucene </title>

  </head>

  <body>
   <%-- Run the search only when the form was submitted. --%>
   <%
   LuceneTest lucent=new LuceneTest();
   Hits hits=null;
   String rawQuery=request.getParameter("queryString");
   if(rawQuery!=null){
     // The form is submitted in the page encoding (gbk); the parameter is
     // re-decoded from ISO-8859-1 bytes with that same charset. Decoding it
     // as gb2312 would lose the characters that only gbk contains.
     hits =lucent.seacher(new String(rawQuery.getBytes("iso-8859-1"),"gbk"));
   } %> <br>

    <form action="TestLucene.jsp">
  <input  type="text" name="queryString"/>
  <input type="submit" value="搜索"/>
</form>

<table>
  <%if(hits!=null){%>
  <tr>
    <td>作者号</td>
    <td>作者名</td>
  </tr>

<% for(int i=0;i<hits.length();i++){
    Document doc=hits.doc(i);
   %>
    <tr>
    <td><%=escapeHtml(doc.get("id")) %></td>
    <td><%=escapeHtml(doc.get("name")) %></td>
  </tr>
<% }}%>
</table>
  </body>
</html>
不过我有个问题,就是document的数据源是直接用结果集里面的数据,还是把结果集的数据解析为xml后再从xml中读取;
欢迎大家和我讨论,我qq:729215495



编辑 回复 快速回复 TOP
感觉lz应该先较详细的讲述下lucene是什么东东 然后再举例子 那样就容易理解了

说实话,光看到一堆代码, 就看到里面调用了那个lucene的类 最后还没明白这个lucene是什么 到底有什么好处~
编辑 回复 快速回复 TOP
其实简单全文检索,建个数据库表就可以实现了
id key relationid
字段key保存关键字用于检索,很多数据库也自带全文检索功能,但是数据库自带分词不是很好,很多时候还是得自己搞定。

下面是我自己写的一个通用lucene类,indexwriter和indexreader都使用单例,提高性能

public class LucenTemplate {

/**
 * Class-wide logger. (Original header note: 2008-6-1 Administrator.)
 */
private static Log log = LogFactory.getLog(LucenTemplate.class);

// Monitor held while calling queryparser.parse(...) in the search methods.
private final Object lock = new Object();

// Shared writer; created by initIndexWriter().
private IndexWriter indexwriter;

// Shared reader; created by initIndexReader() and replaced by refreashIndexReader().
private IndexReader indexreader;

// Path of the index directory on disk.
private String dirPath;

// Analyzer handed to the IndexWriter when it is created.
private Analyzer analyzer;

private WhitespaceAnalyzer seachAnalyzer;// Note: query strings are split on whitespace.

private QueryParser queryparser;

// Sort by score, descending; when scores are equal, documents indexed later come first.
private Sort sort = new Sort(new SortField[] { SortField.FIELD_SCORE, new SortField(null, SortField.DOC, true) });

/**
 * Creates a template for the index stored in {@code indexPath} and tries to
 * open the shared writer and reader right away. A failure to open them is
 * only logged; the getters attempt the initialisation again later.
 *
 * @param indexPath path of the index directory
 */
LucenTemplate(String indexPath) {
    this.dirPath = indexPath;
    this.analyzer = new MIK_CAnalyzer();
    this.seachAnalyzer = new WhitespaceAnalyzer();
    // Queries without an explicit field name are parsed against "id".
    this.queryparser = new QueryParser("id", this.seachAnalyzer);
    try {
        this.initIndexWriter();
        this.initIndexReader();
    } catch (IOException initFailure) {
        log.error("索引初始化失败", initFailure);
    }
}

/**
 * Creates the shared IndexWriter if it does not exist yet.
 *
 * @throws FileNotFoundException if {@code dirPath} is not an existing directory
 * @throws IOException if the index directory is locked or the writer cannot be created
 */
public synchronized void initIndexWriter() throws IOException {
    if (indexwriter != null) {
        return;
    }
    File indexDir = new File(dirPath);
    boolean usableDirectory = indexDir.isDirectory() && indexDir.exists();
    if (!usableDirectory) {
        throw new FileNotFoundException("索引目录不存在或者无效");
    }
    if (IndexReader.isLocked(dirPath)) {
        throw new IOException("索引目录已被锁定,无法初始化");
    }

    indexwriter = new IndexWriter(indexDir, analyzer);
    // Compound-file mode merges the files of a segment into a single .cfs
    // file, which reduces the number of index files and of files open at once.
    indexwriter.setUseCompoundFile(true);
}

/**
 * Opens the shared IndexReader if it does not exist yet.
 *
 * @throws FileNotFoundException if {@code dirPath} is not an existing directory
 * @throws IOException if the index cannot be opened
 */
public synchronized void initIndexReader() throws IOException {
    if (indexreader != null) {
        return;
    }
    File indexDir = new File(dirPath);
    boolean usableDirectory = indexDir.isDirectory() && indexDir.exists();
    if (!usableDirectory) {
        throw new FileNotFoundException("索引目录不存在或者无效");
    }
    indexreader = IndexReader.open(indexDir);
}

/**
 * Replaces the shared reader with a reopened one and closes the old reader
 * when {@code reopen()} returned a different instance.
 *
 * <p>If the reader was never opened (for example because the initialisation
 * in the constructor failed), it is opened now instead of failing with a
 * NullPointerException. The method is synchronized like the init methods, so
 * that the reader field is not swapped while it is being initialised.
 *
 * @throws IOException if the reader cannot be opened, reopened or closed
 */
public synchronized void refreashIndexReader() throws IOException {
    if (indexreader == null) {
        // Nothing to reopen yet; a reader opened now needs no refresh.
        initIndexReader();
        return;
    }
    IndexReader reopened = indexreader.reopen();
    if (reopened != indexreader) {
        indexreader.close();
    }
    indexreader = reopened;
}

/**
 * Returns the shared IndexWriter, creating it first when necessary.
 *
 * @throws IOException if the writer has to be created and that fails
 */
public IndexWriter getIndexWriter() throws IOException {
    boolean missing = (indexwriter == null);
    if (missing) {
        initIndexWriter();
    }
    return indexwriter;
}

/**
 * Returns the shared IndexReader, opening it first when necessary.
 *
 * @throws IOException if the reader has to be opened and that fails
 */
public IndexReader getIndexReader() throws IOException {
    boolean missing = (indexreader == null);
    if (missing) {
        initIndexReader();
    }
    return indexreader;
}

/**
 * Creates a new IndexSearcher on top of the shared reader.
 * Callers must remember to close the searcher when they are done with it.
 *
 * @throws IOException if the shared reader cannot be obtained
 */
public IndexSearcher getIndexSearcher() throws IOException {
    return new IndexSearcher(getIndexReader());
}

/**
 * Optimizes the index through the shared writer. A failure is logged and
 * not propagated.
 *
 * <p>The writer is obtained through {@code getIndexWriter()}, so it is
 * created on demand instead of causing a NullPointerException when the
 * initialisation in the constructor failed.
 *
 * NOTE(review): the original comment said that write operations need to be
 * synchronized, but no lock is taken here; confirm whether IndexWriter's own
 * thread-safety is sufficient for the callers.
 */
public void optimize() {
    try {
        getIndexWriter().optimize();
    } catch (IOException e) {
        log.error("索引优化失败", e);
    }
}

// ///////////////////////////////////////////
// 业务常用方法
/**
 * Adds {@code doc} to the index through the shared writer.
 *
 * <p>The writer is obtained through {@code getIndexWriter()}, so it is
 * created on demand instead of causing a NullPointerException when the
 * initialisation in the constructor failed.
 *
 * @param doc the document to add
 * @throws LuceenAccessException if the writer cannot be created or the
 *         document cannot be written
 */
public void addDocument(Document doc) throws LuceenAccessException {
    try {
        getIndexWriter().addDocument(doc);
    } catch (IOException e) {
        throw new LuceenAccessException(e);
    }
}

/**
 * Searches with several conditions that are OR-ed together: every
 * SearchField contributes a {@code key:value} clause, and the clauses are
 * separated by spaces.
 *
 * @param fields list of SearchField conditions
 * @return the matching documents, as returned by {@code searchDocument}
 * @throws LuceenAccessException if the search fails
 */
public List searchDocumentByOR(List fields) throws LuceenAccessException {
    StringBuilder clauses = new StringBuilder();
    Iterator cursor = fields.iterator();
    while (cursor.hasNext()) {
        SearchField condition = (SearchField) cursor.next();
        clauses.append(condition.getKey()).append(":").append(condition.getValue()).append(' ');
    }
    return searchDocument(clauses.toString());
}

/**
 * Runs a caller-supplied query string (intended for complex queries) and
 * returns all matching documents in the order defined by {@code sort}.
 * Each returned document has its boost set to the score of its hit.
 *
 * <p>TODO (from the original author): a paging variant should wrap this.
 *
 * @param queryStr query in QueryParser syntax
 * @return list of matching Document objects, possibly empty
 * @throws LuceenAccessException if the query cannot be parsed, the search
 *         fails, or the searcher cannot be closed
 */
public List searchDocument(String queryStr) throws LuceenAccessException {
    List result = new ArrayList();
    Query query = null;
    IndexSearcher searcher = null;
    try {
        synchronized (lock) {
            // The shared queryparser must not be used by several threads at once.
            query = queryparser.parse(queryStr);
        }
        searcher = getIndexSearcher();

        Hits hits = searcher.search(query, sort);
        for (int i = 0; i < hits.length(); i++) {
            Document document = hits.doc(i);
            // The boost is used to carry the hit's score to the caller.
            document.setBoost(hits.score(i));
            result.add(document);
        }
        return result;
    } catch (ParseException e) {
        throw new LuceenAccessException(e);
    } catch (IOException e) {
        throw new LuceenAccessException(e);
    } finally {
        // The searcher is still null when parsing or opening it failed;
        // closing it unconditionally would replace the real error with a
        // NullPointerException.
        if (searcher != null) {
            try {
                searcher.close();
            } catch (IOException e) {
                throw new LuceenAccessException("IndexSearch关闭失败", e);
            }
        }
    }
}

/**
 * Paged search: returns the hits with the positions {@code n} to {@code m}
 * (both inclusive, counted from 0) in the order defined by {@code sort}.
 * Each returned document has its boost set to the score of its hit.
 *
 * <p>When there are fewer hits than {@code m + 1}, only the existing hits
 * from position {@code n} on are returned (possibly none), so a short last
 * page no longer fails with an out-of-range hit number.
 *
 * @param queryStr query in QueryParser syntax
 * @param n position of the first hit to return, starting at 0
 * @param m position of the last hit to return
 * @return list of matching Document objects, possibly empty
 * @throws IllegalArgumentException if {@code n > m} or either value is negative
 * @throws LuceenAccessException if the query cannot be parsed, the search
 *         fails, or the searcher cannot be closed
 */
public List searchDocumentNToMByQueryStr(String queryStr, int n, int m) throws LuceenAccessException {
    if (n > m || n < 0 || m < 0)
        throw new IllegalArgumentException("N或者M参数无效");
    List result = new ArrayList();
    Query query = null;
    IndexSearcher searcher = null;
    try {
        synchronized (lock) {
            // The shared queryparser must not be used by several threads at once.
            query = queryparser.parse(queryStr);
        }
        searcher = getIndexSearcher();

        Hits hits = searcher.search(query, sort);
        // Never read past the last existing hit.
        int last = Math.min(m, hits.length() - 1);
        for (int i = n; i <= last; i++) {
            Document document = hits.doc(i);
            // The boost is used to carry the hit's score to the caller.
            document.setBoost(hits.score(i));
            result.add(document);
        }
        return result;
    } catch (ParseException e) {
        throw new LuceenAccessException(e);
    } catch (IOException e) {
        throw new LuceenAccessException(e);
    } finally {
        // The searcher is still null when parsing or opening it failed;
        // closing it unconditionally would replace the real error with a
        // NullPointerException.
        if (searcher != null) {
            try {
                searcher.close();
            } catch (IOException e) {
                throw new LuceenAccessException("IndexSearch关闭失败", e);
            }
        }
    }
}

/**
 * Range search over several conditions: builds one {@code key:value} clause
 * per SearchField, separated by spaces, and returns the hits from position
 * {@code n} to {@code m}. Note that {@code n} starts at 0.
 *
 * @param fields list of SearchField conditions
 * @param n position of the first hit to return, starting at 0
 * @param m position of the last hit to return
 * @return the documents returned by {@code searchDocumentNToMByQueryStr}
 * @throws LuceenAccessException if the search fails
 */
public List searchDocumentNToM(List fields, int n, int m) throws LuceenAccessException {
    StringBuilder clauses = new StringBuilder();
    Iterator cursor = fields.iterator();
    while (cursor.hasNext()) {
        SearchField condition = (SearchField) cursor.next();
        clauses.append(condition.getKey()).append(":").append(condition.getValue()).append(' ');
    }
    return searchDocumentNToMByQueryStr(clauses.toString(), n, m);
}

/**
 * Paged search: translates a 1-based page number and a page size into hit
 * positions and delegates to {@code searchDocumentNToM}.
 *
 * @param fields list of SearchField conditions
 * @param pageSize number of hits per page, at least 1
 * @param page page number, starting at 1
 * @return the documents of the requested page
 * @throws IllegalArgumentException if {@code pageSize} or {@code page} is below 1
 * @throws LuceenAccessException if the search fails
 */
public List searchDocumentPaged(List fields, int pageSize, int page) throws LuceenAccessException {
    boolean validArguments = pageSize >= 1 && page >= 1;
    if (!validArguments) {
        throw new IllegalArgumentException("pageSize或者page参数无效");
    }
    int firstHit = (page - 1) * pageSize;
    int lastHit = firstHit + pageSize - 1;
    return searchDocumentNToM(fields, firstHit, lastHit);
}

/**
 * Closes the shared writer and the shared reader. Failures are logged and
 * not propagated.
 *
 * <p>The two resources are closed independently, so a failure while closing
 * the writer no longer leaves the reader open.
 */
public void close() {
    try {
        if (indexwriter != null)
            indexwriter.close();
    } catch (IOException e) {
        log.error("关闭writer和reader失败", e);
    }

    try {
        if (indexreader != null)
            indexreader.close();
    } catch (IOException e) {
        log.error("关闭writer和reader失败", e);
    }
}

/** Returns the path of the index directory. */
public String getDirPath() {
    return this.dirPath;
}

/**
 * Stores a new index directory path. Only the field is updated here.
 *
 * @param dirPath the new path of the index directory
 */
public void setDirPath(String dirPath) {
    final String newPath = dirPath;
    this.dirPath = newPath;
}



/**
 * One search condition: the name of a field ({@code key}) and the text to
 * look for in it ({@code value}).
 */
public static class SearchField {
    private String key;
    private String value;

    /**
     * @param key name of the field to search
     * @param value text to search for
     */
    SearchField(String key, String value) {
        this.value = value;
        this.key = key;
    }

    /** Returns the field name. */
    public String getKey() {
        return this.key;
    }

    /** Returns the search text. */
    public String getValue() {
        return this.value;
    }

    /** Replaces the field name. */
    public void setKey(String key) {
        this.key = key;
    }

    /** Replaces the search text. */
    public void setValue(String value) {
        this.value = value;
    }
}
编辑 回复 快速回复 TOP
发新话题