发新话题
移动帖子 加入精华 加入置顶 加入收藏 关注此帖

HTMLParser 解析类库的使用例子



HTMLParser 解析类库的使用例子

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.visitors.HtmlPage;

/**
 * Sample program demonstrating several ways to use the HTML Parser library
 * (org.htmlparser): it downloads one page and then prints its content using
 * four different parsing strategies.
 *
 * @author 赵学庆, Java世纪网 (java2000.net)
 */
public class T {

  /** Charset name used to decode the download and passed to every parser. */
  private static final String CHARSET = "UTF-8";

  /**
   * Downloads the sample page and runs every demonstration on it, in the
   * order: all nodes, text + links, page visitor, text + title.
   *
   * @param args ignored
   * @throws Exception if the page cannot be downloaded or parsed
   */
  public static void main(String[] args) throws Exception {
    String result = fetchPage("http://www.java2000.net");
    readAll(result);
    readTextAndLink(result);
    readByHtml(result);
    readTextAndTitle(result);
  }

  /**
   * Reads the resource at the given URL line by line and returns it as one
   * string, with every line terminated by "\r\n".
   *
   * @param path URL of the page to download
   * @return the downloaded text
   * @throws Exception if the URL is malformed or reading fails
   */
  private static String fetchPage(String path) throws Exception {
    StringBuilder page = new StringBuilder();
    BufferedReader reader = new BufferedReader(
        new InputStreamReader(new URL(path).openStream(), CHARSET));
    try {
      String temp;
      while ((temp = reader.readLine()) != null) {
        page.append(temp).append("\r\n");
      }
    } finally {
      // Close the stream even when readLine() fails, so the connection is not leaked.
      reader.close();
    }
    return page.toString();
  }

  /**
   * Extracts every node of the document and prints the trimmed plain text of
   * each node whose text is not empty.
   *
   * @param result the HTML source to parse
   * @throws Exception if parsing fails
   */
  public static void readAll(String result) throws Exception {
    Parser parser = Parser.createParser(result, CHARSET);
    NodeFilter filter = new NodeClassFilter(Node.class);
    NodeList list = parser.extractAllNodesThatMatch(filter);
    for (int i = 0; i < list.size(); i++) {
      Node node = (Node) list.elementAt(i);
      String line = node.toPlainTextString().trim();
      if (line.length() > 0) {
        System.out.println(line);
      }
    }
  }

  /**
   * Processes the input as a standard HTML page using an {@code HtmlPage}
   * visitor, then prints the page title followed by the trimmed text of the
   * body nodes.
   *
   * @param content the HTML source to parse
   * @throws Exception if parsing fails
   */
  public static void readByHtml(String content) throws Exception {
    Parser myParser = Parser.createParser(content, CHARSET);
    HtmlPage visitor = new HtmlPage(myParser);
    myParser.visitAllNodesWith(visitor);

    System.out.println(visitor.getTitle());
    NodeList body = visitor.getBody();
    // println (not print) so that output written afterwards starts on its own line.
    System.out.println(body.asString().trim());
  }

  /**
   * Prints the text of each text node and the title of each title tag
   * returned by the parser, skipping values that are blank.
   *
   * @param result the HTML source to parse
   * @throws Exception if parsing fails
   */
  public static void readTextAndTitle(String result) throws Exception {
    Node[] nodes = parseTextOr(result, new NodeClassFilter(TitleTag.class));
    for (int i = 0; i < nodes.length; i++) {
      Node node = nodes[i];
      // Reset per node so a value from the previous iteration is never re-printed.
      String line = null;
      if (node instanceof TextNode) {
        line = ((TextNode) node).getText();
      } else if (node instanceof TitleTag) {
        line = ((TitleTag) node).getTitle();
      }
      printIfNotBlank(line);
    }
  }

  /**
   * Prints the text of each text node and the link target of each link tag
   * returned by the parser, skipping values that are blank.
   *
   * @param result the HTML source to parse
   * @throws Exception if parsing fails
   */
  public static void readTextAndLink(String result) throws Exception {
    Node[] nodes = parseTextOr(result, new NodeClassFilter(LinkTag.class));
    for (int i = 0; i < nodes.length; i++) {
      Node node = nodes[i];
      // Reset per node so a value from the previous iteration is never re-printed.
      String line = null;
      if (node instanceof TextNode) {
        line = ((TextNode) node).getText();
      } else if (node instanceof LinkTag) {
        line = ((LinkTag) node).getLink();
      }
      printIfNotBlank(line);
    }
  }

  /**
   * Parses the HTML and returns the nodes accepted by "text node OR tagFilter".
   *
   * NOTE(review): Parser.parse(filter) may only test top-level nodes rather
   * than searching the whole tree - confirm against the HTML Parser docs;
   * extractAllNodesThatMatch (as used in readAll) is the alternative.
   *
   * @param html the HTML source to parse
   * @param tagFilter filter for the additional node type to accept
   * @return the accepted nodes as an array
   * @throws Exception if parsing fails
   */
  private static Node[] parseTextOr(String html, NodeFilter tagFilter) throws Exception {
    Parser parser = Parser.createParser(html, CHARSET);
    NodeFilter textFilter = new NodeClassFilter(TextNode.class);
    OrFilter lastFilter = new OrFilter();
    lastFilter.setPredicates(new NodeFilter[] { textFilter, tagFilter });
    return parser.parse(lastFilter).toNodeArray();
  }

  /** Prints the line to standard output unless it is null, empty or whitespace-only. */
  private static void printIfNotBlank(String line) {
    if (!isTrimEmpty(line)) {
      System.out.println(line);
    }
  }

  /**
   * Tells whether the string is empty once leading and trailing whitespace
   * has been trimmed.
   *
   * @param astr the string to test, may be null
   * @return true if astr is null or its trimmed form has length 0
   */
  public static boolean isTrimEmpty(String astr) {
    return astr == null || astr.trim().length() == 0;
  }

  /**
   * Tells whether the string is null or has length 0. Whitespace is NOT
   * trimmed here; see {@link #isTrimEmpty(String)} for that.
   *
   * @param astr the string to test, may be null
   * @return true if astr is null or has length 0
   */
  public static boolean isBlank(String astr) {
    return astr == null || astr.length() == 0;
  }

}
快乐度过每一天,减肥坚持每一天
编辑 回复 快速回复 TOP

Re:HTMLParser 解析类库的使用例子

好。楼主辛苦了。
htmlparser 在实际应用中有哪些用途呢?
编辑 回复 快速回复 TOP
发新话题