draco1023
5/3/2018 - 1:48 AM

Jsoup html to text with line breaks

/**
 * Converts an HTML fragment to plain text, turning block-level boundaries and
 * {@code <br>} tags into line breaks.
 *
 * <p>The fragment is parsed with jsoup and its body is walked in document order.
 * Text nodes are appended through {@code appendNormalisedText}; a {@code '\n'} is
 * emitted when a block element or {@code <br>} starts, and also when a block element
 * ends and is directly followed by inline content, so that {@code <p>a</p>b} yields
 * two lines rather than {@code "ab"}.
 *
 * @param html the HTML fragment to convert
 * @return the extracted text with leading and trailing whitespace trimmed
 */
public static String htmlToText(String html) {
  Element body = Jsoup.parseBodyFragment(html).body();

  // Modelled on the implementation of Element#text(), but the separator that is
  // appended between blocks is a line break instead of a space.
  StringBuilder sb = new StringBuilder();
  NodeTraversor.traverse(new NodeVisitor() {
      @Override
      public void head(Node node, int depth) {
          if (node instanceof TextNode) {
              TextNode textNode = (TextNode)node;
              appendNormalisedText(sb, textNode);
          } else if (node instanceof Element) {
              Element element = (Element)node;
              // Nothing to separate from while the buffer is still empty.
              if (sb.length() > 0 && (element.isBlock() || element.tagName().equals("br"))) {
                  sb.append('\n');
              }
          }
      }

      @Override
      public void tail(Node node, int depth) {
          if (!(node instanceof Element) || !((Element)node).isBlock()) {
              return;
          }
          // Skip when there is nothing to terminate or the line is already broken.
          if (sb.length() == 0 || sb.charAt(sb.length() - 1) == '\n') {
              return;
          }

          // Only inline content needs a break here: a following block element or <br>
          // adds its own line break in head(), and a blank text node between two
          // blocks must not introduce an extra empty line.
          Node next = node.nextSibling();
          boolean inlineFollows = false;
          if (next instanceof TextNode) {
              inlineFollows = !((TextNode)next).isBlank();
          } else if (next instanceof Element) {
              Element nextElement = (Element)next;
              inlineFollows = !nextElement.isBlock() && !nextElement.tagName().equals("br");
          }
          if (inlineFollows) {
              sb.append('\n');
          }
      }
  }, body);
  return sb.toString().trim();
}

/**
 * Appends the text of a text node to the buffer, modelled on jsoup's
 * {@code Element#appendNormalisedText(StringBuilder, TextNode)}.
 *
 * <p>When the node's parent preserves whitespace (see {@code preserveWhitespace}),
 * the whole text is appended verbatim. Otherwise the text is appended with its
 * whitespace normalised, and leading whitespace is dropped if the buffer already
 * ends with a space or a line break, so a new line never starts with a stray space.
 *
 * @param sb the buffer to append to
 * @param textNode the text node whose content is appended
 */
private static void appendNormalisedText(StringBuilder sb, TextNode textNode) {
    String text = textNode.getWholeText();
    if (preserveWhitespace(textNode.parentNode())) {
        sb.append(text);
        return;
    }

    // A trailing '\n' counts as whitespace here; lastCharIsWhitespace only checks ' '.
    int length = sb.length();
    boolean endsWithLineBreak = length > 0 && sb.charAt(length - 1) == '\n';
    boolean stripLeading = endsWithLineBreak || lastCharIsWhitespace(sb);
    StringUtil.appendNormalisedWhitespace(sb, text, stripLeading);
}

/**
 * Tells whether whitespace must be kept as-is for content under the given node,
 * modelled on jsoup's {@code Element#preserveWhitespace(Node)}.
 *
 * <p>Only the node itself and at most five of its ancestors are inspected.
 *
 * @param node the node to start from; anything that is not an {@code Element}
 *             (including {@code null}) yields {@code false}
 * @return {@code true} if one of the inspected elements has a tag that preserves
 *         whitespace, {@code false} otherwise
 */
private static boolean preserveWhitespace(Node node) {
    if (!(node instanceof Element)) {
        return false;
    }

    // Number of elements examined: the starting element plus five levels up.
    final int maxLevels = 6;
    Element current = (Element) node;
    for (int level = 0; level < maxLevels && current != null; level++) {
        if (current.tag().preserveWhitespace()) {
            return true;
        }
        current = current.parent();
    }
    return false;
}

/**
 * Checks whether the last character of a sequence is a space.
 *
 * <p>Only the plain space character {@code ' '} counts; tabs and line breaks do not.
 *
 * @param sequence the character sequence to inspect
 * @return {@code true} if the sequence is non-empty and ends with {@code ' '}
 */
public static boolean lastCharIsWhitespace(CharSequence sequence) {
    final int length = sequence.length();
    if (length == 0) {
        return false;
    }
    char last = sequence.charAt(length - 1);
    return last == ' ';
}