/**
 * Converts an HTML fragment to plain text, putting line breaks where block
 * elements and {@code <br>} tags start.
 *
 * <p>The input is parsed with {@code Jsoup.parseBodyFragment} and the resulting
 * body is walked with a {@code NodeVisitor}. The approach is modelled on jsoup's
 * {@code Element#text()}, except that a newline is appended instead of a space.
 * Text nodes are appended through {@code appendNormalisedText}. A {@code '\n'} is
 * appended for every block element or {@code br} element encountered once the
 * buffer is non-empty. The collected text is trimmed before it is returned.
 *
 * @param html the HTML fragment to convert
 * @return the trimmed text content of the fragment
 */
public static String htmlToText(String html) {
    final StringBuilder text = new StringBuilder();
    final NodeVisitor collector = new NodeVisitor() {
        @Override
        public void head(Node node, int depth) {
            if (node instanceof TextNode) {
                appendNormalisedText(text, (TextNode) node);
                return;
            }
            // Only elements can introduce a break, and never before any text exists.
            if (!(node instanceof Element) || text.length() == 0) {
                return;
            }
            Element current = (Element) node;
            if (current.isBlock() || "br".equals(current.tagName())) {
                text.append('\n');
            }
        }

        @Override
        public void tail(Node node, int depth) {
            // Nothing to do when leaving a node.
        }
    };
    NodeTraversor.traverse(collector, Jsoup.parseBodyFragment(html).body());
    return text.toString().trim();
}
/**
 * Appends the text of a text node to the buffer; modelled on jsoup's
 * {@code Element#appendNormalisedText(StringBuilder, TextNode)}.
 *
 * <p>If {@code preserveWhitespace} reports {@code true} for the node's parent,
 * the whole text is appended unchanged. Otherwise it is appended through
 * {@code StringUtil.appendNormalisedWhitespace}, passing whether the buffer
 * currently ends with a space as the third argument.
 *
 * @param sb       the buffer to append to
 * @param textNode the text node whose text is appended
 */
private static void appendNormalisedText(StringBuilder sb, TextNode textNode) {
    final String wholeText = textNode.getWholeText();
    if (!preserveWhitespace(textNode.parentNode())) {
        StringUtil.appendNormalisedWhitespace(sb, wholeText, lastCharIsWhitespace(sb));
        return;
    }
    sb.append(wholeText);
}
/**
 * Tells whether whitespace should be preserved for the given node; modelled on
 * jsoup's {@code Element#preserveWhitespace(Node)}.
 *
 * <p>Only {@code Element} nodes are considered. The element itself and up to five
 * of its ancestors (six elements in total) are checked, stopping early when the
 * parent chain ends.
 *
 * @param node the node to inspect; may be {@code null}
 * @return {@code true} if one of the inspected elements has a tag whose
 *         {@code preserveWhitespace()} is {@code true}; {@code false} otherwise,
 *         including when {@code node} is {@code null} or not an {@code Element}
 */
private static boolean preserveWhitespace(Node node) {
    if (!(node instanceof Element)) {
        return false;
    }
    Element current = (Element) node;
    for (int level = 0; level < 6 && current != null; level++) {
        if (current.tag().preserveWhitespace()) {
            return true;
        }
        current = current.parent();
    }
    return false;
}
/**
 * Checks whether the last character of a sequence is a space.
 *
 * <p>Only the space character {@code ' '} counts; other whitespace such as tabs
 * or newlines does not.
 *
 * @param sequence the character sequence to inspect
 * @return {@code true} if the sequence is non-empty and ends with {@code ' '}
 */
public static boolean lastCharIsWhitespace(CharSequence sequence) {
    final int length = sequence.length();
    if (length == 0) {
        return false;
    }
    return ' ' == sequence.charAt(length - 1);
}