比较常用的几种英文分析器,它们之间的区别见程序中的注释。
SimpleAnalyzer
StandardAnalyzer
WhitespaceAnalyzer
StopAnalyzer
 
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
/**
 * Small demo that runs the same input string through several Lucene analyzers
 * and prints the tokens each one produces, so their behavior can be compared.
 *
 * <p>Banners and descriptions go to {@code System.err}; the tokens themselves
 * go to {@code System.out}, one per line.
 */
public class TestAnalyzer {
  /** Plain English sample sentence; swap it in from {@link #main} to try it. */
  private static final String testString1 = "The quick brown fox jumped over the lazy dogs";
  /** Sample containing punctuation and an e-mail address; used by {@link #main}. */
  private static final String testString2 = "xy&z mail is - xyz@sohu.com";

  /**
   * Prints the tokens produced by {@code WhitespaceAnalyzer} for the given text.
   *
   * @param testString text to analyze
   * @throws Exception if the token stream fails while reading or closing
   */
  public static void testWhitespace(String testString) throws Exception {
    printTokens(
        new WhitespaceAnalyzer(),
        "=====Whitespace analyzer====",
        "分析方法:空格分割",
        testString);
  }

  /**
   * Prints the tokens produced by {@code SimpleAnalyzer} for the given text.
   *
   * @param testString text to analyze
   * @throws Exception if the token stream fails while reading or closing
   */
  public static void testSimple(String testString) throws Exception {
    printTokens(
        new SimpleAnalyzer(),
        "=====Simple analyzer====",
        "分析方法:空格及各种符号分割",
        testString);
  }

  /**
   * Prints the tokens produced by {@code StopAnalyzer} for the given text.
   *
   * @param testString text to analyze
   * @throws Exception if the token stream fails while reading or closing
   */
  public static void testStop(String testString) throws Exception {
    printTokens(
        new StopAnalyzer(),
        "=====stop analyzer====",
        "分析方法:空格及各种符号分割,去掉停止词,停止词包括 is,are,in,on,the等无实际意义的词",
        testString);
  }

  /**
   * Prints the tokens produced by {@code StandardAnalyzer} for the given text.
   *
   * @param testString text to analyze
   * @throws Exception if the token stream fails while reading or closing
   */
  public static void testStandard(String testString) throws Exception {
    printTokens(
        new StandardAnalyzer(),
        "=====standard analyzer====",
        "分析方法:混合分割,包括了去掉停止词,支持汉语",
        testString);
  }

  /**
   * Runs every analyzer demo against the second sample string.
   *
   * @param args ignored
   * @throws Exception if any analyzer's token stream fails
   */
  public static void main(String[] args) throws Exception {
    // Use testString1 here instead to see how a plain sentence is tokenized.
    String testString = testString2;
    System.out.println(testString);
    testWhitespace(testString);
    testSimple(testString);
    testStop(testString);
    testStandard(testString);
  }

  /**
   * Tokenizes {@code text} with {@code analyzer}, printing the banner and
   * description to stderr and then each token's text to stdout.
   *
   * <p>The stream is held as a plain {@code TokenStream}: only {@code next()}
   * and {@code close()} are needed, and downcasting to a concrete tokenizer or
   * filter class would tie this demo to each analyzer's internal filter chain.
   */
  private static void printTokens(
      Analyzer analyzer, String banner, String description, String text) throws Exception {
    Reader reader = new StringReader(text);
    TokenStream stream = analyzer.tokenStream("", reader);
    try {
      System.err.println(banner);
      System.err.println(description);
      Token token;
      while ((token = stream.next()) != null) {
        System.out.println(token.termText());
      }
    } finally {
      // Release the stream (and the reader it wraps) even if iteration fails.
      stream.close();
    }
  }
}


快乐度过每一天,减肥坚持每一天