JAVA截取HTML部分内容
超级晕死,昨天弄了一整天截取HTML内容,从昨天下午2:00左右一直弄到凌晨4:00。在网上搜了一些相关的资料,方法大概是以下几种:
1.将截取出来的不正规的HTML内容,经过递归过滤。把未闭合的HTML标签添加上。
2.用现有成熟的开源项目:Html Parser来解析处理HTML。
3.将HTML标签全部过滤掉(<br/>标签除外),将<br />转成' ',转换之后再截取长度。
下面介绍我在网上看到的解决代码:
1.Javascript解决方案
/**
 * Builds a brief (leading excerpt) of an HTML string without leaving a
 * recognised element open.
 *
 * The first `length` characters of `text` are taken, the recognised tags in
 * that prefix are scanned with a stack, and if any recognised element is still
 * open at the end of the prefix the result is cut just before the outermost
 * unclosed opening tag.
 *
 * @param {string} text   HTML source to shorten.
 * @param {number} length Maximum number of characters taken from `text`.
 * @returns {string} `text` itself when it is shorter than `length`; otherwise
 *                   the prefix, cut before the outermost unclosed tag if any.
 */
function Generate_Brief(text,length){
    if(text.length < length) return text;
    var Foremost = text.substr(0,length);
    // The \b after the name list is required: without it the alternation
    // stops at the first prefix that fits, so <link> was captured as LI,
    // <param>/<pre> as P and <area> as A.
    var re = /<(\/?)(BODY|STYLE|SCRIPT|P|DIV|H1|H2|H3|H4|H5|H6|ADDRESS|PRE|TABLE|TR|TD|TH|INPUT|SELECT|TEXTAREA|OBJECT|A|UL|OL|LI|BASE|META|LINK|HR|BR|PARAM|IMG|AREA|INPUT|SPAN)\b[^>]*(>?)/ig;
    // Anchored so that only an exact name counts as a void element
    // (unanchored, TEXTAREA matched AREA and was never tracked).
    var Singlable = /^(?:BASE|META|LINK|HR|BR|PARAM|IMG|AREA|INPUT)$/i;
    // Stack holds the names of the open elements, posStack the index of the
    // corresponding opening tag inside Foremost.
    var Stack = [], posStack = [];
    var newone;
    while((newone = re.exec(Foremost)) !== null){
        // newone[1]: "/" for a closing tag, "" for an opening tag
        // newone[2]: element name
        // newone[3]: ">" when the tag is complete, "" when it was truncated
        var Elem = newone[2].toUpperCase();
        if(newone[1] == ""){
            // A complete void element never needs a closing tag.
            if(Singlable.test(Elem) && newone[3] != ""){
                continue;
            }
            Stack.push(Elem);
            posStack.push(newone.index);
            // A truncated opening tag runs to the end of the prefix.
            if(newone[3] == "") break;
        }else if(Stack[Stack.length-1] == Elem){
            Stack.pop();
            posStack.pop();
            // The closing tag lost its ">" to the cut; restore it.
            if(newone[3] == ""){
                Foremost = Foremost+">";
            }
        }
    }
    // Nothing left open: the prefix is already balanced.
    if(posStack.length === 0) return Foremost;
    // Otherwise drop everything from the outermost unclosed tag onwards.
    return Foremost.substring(0, posStack[0]);
}
2.PHP解决方案
/**
 * Builds a brief (leading excerpt) of an HTML string without leaving a
 * recognised element open.
 *
 * The first $length characters of $text are taken, the recognised tags in
 * that prefix are scanned with a stack, and if any recognised element is still
 * open at the end of the prefix the result is cut just before the outermost
 * unclosed opening tag.
 *
 * @param string $text   HTML source to shorten (handled as UTF-8).
 * @param int    $length Maximum number of characters taken from $text;
 *                       defaults to the BRIEF_LENGTH constant.
 * @return string $text itself when it is not longer than $length; otherwise
 *                the prefix, cut before the outermost unclosed tag if any.
 */
function Generate_Brief($text, $length = BRIEF_LENGTH){
    mb_regex_encoding("UTF-8");
    if(mb_strlen($text, "UTF-8") <= $length) return $text;
    $Foremost = mb_substr($text, 0, $length, "UTF-8");
    // The \b after the name list is required: without it the alternation
    // stops at the first prefix that fits, so <pre>/<param> were captured
    // as P, <link> as LI and <area> as A.
    $re = '<(/?)(P|DIV|H1|H2|H3|H4|H5|H6|ADDRESS|PRE|TABLE|TR|TD|TH|INPUT|SELECT|TEXTAREA|OBJECT|A|UL|OL|LI|BASE|META|LINK|HR|BR|PARAM|IMG|AREA|INPUT|SPAN)\b[^>]*(>?)';
    // Void elements, compared by exact name. The previous "/.../i" pattern was
    // handed to mb_eregi, which takes no delimiters, and was not anchored.
    $Single = array("BASE", "META", "LINK", "HR", "BR", "PARAM", "IMG", "AREA", "INPUT");
    // $Stack holds the names of the open elements, $posStack the offset of the
    // corresponding opening tag inside $Foremost.
    $Stack = array(); $posStack = array();
    mb_ereg_search_init($Foremost, $re, 'i');
    while($pos = mb_ereg_search_pos()){
        $match = mb_ereg_search_getregs();
        /* Sub-matches:
           $match[1] : "/" when the tag is a closing tag, empty otherwise
           $match[2] : element name
           $match[3] : the closing ">" of the tag, empty when it was truncated
        */
        if($match[1]==""){
            $Elem = mb_strtoupper($match[2]);
            // A complete void element never needs a closing tag.
            if(in_array($Elem, $Single, true) && $match[3] !=""){
                continue;
            }
            array_push($Stack, $Elem);
            array_push($posStack, $pos[0]);
        }else{
            $End = mb_strtoupper($match[2]);
            // Guard the empty stack: a stray closing tag has nothing to close.
            if(count($Stack) > 0 && strcasecmp($Stack[count($Stack)-1], $End)==0){
                array_pop($Stack);
                array_pop($posStack);
                // The closing tag lost its ">" to the cut; restore it.
                if($match[3] ==""){
                    $Foremost = $Foremost.">";
                }
            }
        }
    }
    // Nothing left open: the prefix is already balanced.
    if(count($posStack) == 0) return $Foremost;
    // mb_ereg_search_pos() reports BYTE offsets, so cut with the byte-based
    // substr(); the offset points at an ASCII "<", so no multibyte character
    // can be split.
    return substr($Foremost, 0, $posStack[0]);
};
3.JAVA解决方案
package cn.kgnews.util;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.htmlparser.*;
import org.htmlparser.tags.CompositeTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
/**
 * Functions for HTML: truncates an HTML string to a maximum amount of text
 * while still emitting the end tags of the elements it has opened.
 *
 * @author Scud http://www.javascud.org Date: Nov 3, 2006 10:22:20 AM
 */
public class HtmlSubstring
{
    private static final Log log = LogFactory.getLog(HtmlSubstring.class);

    /**
     * Creates the parser used by {@link #substring(String, int)}.
     *
     * The parser gets a {@code PrototypicalNodeFactory} on which the tag
     * prototypes declared in this file (STRONG, B, I, U, CENTER, FONT) are
     * registered; per the original author these are tags HTML Parser does
     * not provide itself.
     *
     * @return a new parser with the extra tag prototypes registered
     */
    public static Parser getMyParser()
    {
        Parser parser = new Parser();
        PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
        // register tags which htmlParser does not have
        factory.registerTag(new StrongTag());
        factory.registerTag(new BoldTag());
        factory.registerTag(new ItalicTag());
        factory.registerTag(new UnderlineTag());
        factory.registerTag(new CenterTag());
        factory.registerTag(new FontTag());
        parser.setNodeFactory(factory);
        return parser;
    }

    /**
     * Returns the leading part of an HTML string, limited by text length.
     *
     * Only the text of text nodes counts towards {@code maxlength}; tag
     * markup is copied to the result without being counted. Once the limit
     * is reached the traversal stops, but the end tags of the tags that are
     * currently being processed are still appended. Remark nodes are
     * dropped. Any exception raised while parsing is logged and the output
     * produced so far is returned.
     *
     * @param htmlString
     *            Html string; {@code null} yields an empty result
     * @param maxlength
     *            maximum number of text characters to keep; negative values
     *            are treated as 0
     * @return the truncated HTML
     */
    public static String substring(String htmlString, int maxlength)
    {
        if (htmlString == null)
        {
            return "";
        }
        // A negative limit used to raise StringIndexOutOfBoundsException in
        // dealText, which aborted the traversal after start tags had been
        // written but before their end tags were.
        int limit = Math.max(maxlength, 0);
        StringBuilder htmlOut = new StringBuilder();
        StringBuilder stringOut = new StringBuilder();
        try
        {
            Parser parser = getMyParser();
            parser.setInputHTML(htmlString);
            NodeIterator nit = parser.elements();
            while (nit.hasMoreNodes())
            {
                if (dealNode(nit.nextNode(), stringOut, htmlOut, limit))
                {
                    break;
                }
            }
        }
        catch (Exception e)
        {
            log.error("Error occured when parse Html String", e);
        }
        return htmlOut.toString();
    }

    /**
     * Dispatches one node to the text or tag handler.
     *
     * @return {@code true} when the text limit has been reached; remarks and
     *         any other node kind are skipped and yield {@code false}
     */
    private static boolean dealNode(Node node, StringBuilder stringOut,
            StringBuilder htmlOut, int maxlength) throws Exception
    {
        if (node instanceof Text)
        {
            return dealText(node, stringOut, htmlOut, maxlength);
        }
        if (node instanceof Tag)
        {
            return dealTag((Tag) node, stringOut, htmlOut, maxlength);
        }
        // Remark (comment) nodes and anything else: nothing to do
        return false;
    }

    /**
     * Appends the text of a text node to both buffers, cutting it when the
     * accumulated text would reach {@code maxlength}.
     *
     * @param stringOut
     *            text collected so far; its length is the running count
     * @param htmlOut
     *            HTML output collected so far
     * @return {@code true} when the limit has been reached
     */
    private static boolean dealText(Node node, StringBuilder stringOut,
            StringBuilder htmlOut, int maxlength)
    {
        String currentText = node.getText();
        int previousLength = stringOut.length();
        if (previousLength + currentText.length() >= maxlength)
        {
            int end = maxlength - previousLength;
            // Do not cut between the two halves of a surrogate pair.
            if (end > 0 && end < currentText.length()
                    && Character.isHighSurrogate(currentText.charAt(end - 1))
                    && Character.isLowSurrogate(currentText.charAt(end)))
            {
                end--;
            }
            String cutString = currentText.substring(0, end);
            stringOut.append(cutString);
            htmlOut.append(cutString);
            log.debug(cutString);
            return true;
        }
        stringOut.append(currentText);
        htmlOut.append(currentText);
        log.debug(currentText);
        return false;
    }

    /**
     * Writes a tag: its start tag, then its children until the limit is
     * reached, then its end tag when the tag has one.
     *
     * The end tag is appended even when the traversal of the children was
     * stopped early, which is what closes the opened tags in the output.
     *
     * @return {@code true} when the text limit has been reached inside the tag
     */
    private static boolean dealTag(Tag aTag, StringBuilder stringOut,
            StringBuilder htmlOut, int maxlength) throws Exception
    {
        String startTag = getStartTagString(aTag);
        log.debug(startTag);
        htmlOut.append(startTag);
        boolean breaked = false;
        NodeList list = aTag.getChildren();
        if (list != null)
        {
            NodeIterator it = list.elements();
            while (!breaked && it.hasMoreNodes())
            {
                breaked = dealNode(it.nextNode(), stringOut, htmlOut,
                        maxlength);
            }
        }
        Tag endTag = aTag.getEndTag();
        if (endTag != null)
        {
            String endHtml = endTag.toHtml();
            htmlOut.append(endHtml);
            log.debug(endHtml);
        }
        return breaked;
    }

    /**
     * Rebuilds the text of a tag by concatenating the string form of every
     * entry of {@code getAttributesEx()} between {@code <} and {@code >}.
     */
    private static String getStartTagString(Tag aTag)
    {
        StringBuilder start = new StringBuilder("<");
        for (Object o : aTag.getAttributesEx())
        {
            Attribute ab = (Attribute) o;
            start.append(ab.toString());
        }
        start.append(">");
        return start.toString();
    }
}
/**
 * Composite tag prototype for the STRONG element, registered by
 * HtmlSubstring.getMyParser().
 */
class StrongTag extends CompositeTag
{
    /** Tag names handled by this prototype. */
    private static final String[] TAG_NAMES = { "STRONG" };

    /** Creates the prototype; no state to initialise. */
    public StrongTag()
    {
    }

    /** Returns the tag names this prototype is registered under. */
    public String[] getIds()
    {
        return TAG_NAMES;
    }

    /**
     * Returns the same names as {@link #getIds()}.
     * NOTE(review): presumably the tags whose start ends this tag - confirm
     * against the HTML Parser documentation.
     */
    public String[] getEnders()
    {
        return TAG_NAMES;
    }

    /** Returns a new empty array on every call. */
    public String[] getEndTagEnders()
    {
        return new String[] {};
    }
}
/**
 * Composite tag prototype for the B element, registered by
 * HtmlSubstring.getMyParser().
 */
class BoldTag extends CompositeTag
{
    /** Tag names handled by this prototype. */
    private static final String[] TAG_NAMES = { "B" };

    /** Creates the prototype; no state to initialise. */
    public BoldTag()
    {
    }

    /** Returns the tag names this prototype is registered under. */
    public String[] getIds()
    {
        return TAG_NAMES;
    }

    /**
     * Returns the same names as {@link #getIds()}.
     * NOTE(review): presumably the tags whose start ends this tag - confirm
     * against the HTML Parser documentation.
     */
    public String[] getEnders()
    {
        return TAG_NAMES;
    }

    /** Returns a new empty array on every call. */
    public String[] getEndTagEnders()
    {
        return new String[] {};
    }
}
/**
 * Composite tag prototype for the I element, registered by
 * HtmlSubstring.getMyParser().
 */
class ItalicTag extends CompositeTag
{
    /** Tag names handled by this prototype. */
    private static final String[] TAG_NAMES = { "I" };

    /** Creates the prototype; no state to initialise. */
    public ItalicTag()
    {
    }

    /** Returns the tag names this prototype is registered under. */
    public String[] getIds()
    {
        return TAG_NAMES;
    }

    /**
     * Returns the same names as {@link #getIds()}.
     * NOTE(review): presumably the tags whose start ends this tag - confirm
     * against the HTML Parser documentation.
     */
    public String[] getEnders()
    {
        return TAG_NAMES;
    }

    /** Returns a new empty array on every call. */
    public String[] getEndTagEnders()
    {
        return new String[] {};
    }
}
/**
 * Composite tag prototype for the U element, registered by
 * HtmlSubstring.getMyParser().
 */
class UnderlineTag extends CompositeTag
{
    /** Tag names handled by this prototype. */
    private static final String[] TAG_NAMES = { "U" };

    /** Creates the prototype; no state to initialise. */
    public UnderlineTag()
    {
    }

    /** Returns the tag names this prototype is registered under. */
    public String[] getIds()
    {
        return TAG_NAMES;
    }

    /**
     * Returns the same names as {@link #getIds()}.
     * NOTE(review): presumably the tags whose start ends this tag - confirm
     * against the HTML Parser documentation.
     */
    public String[] getEnders()
    {
        return TAG_NAMES;
    }

    /** Returns a new empty array on every call. */
    public String[] getEndTagEnders()
    {
        return new String[] {};
    }
}
/**
 * Composite tag prototype for the CENTER element, registered by
 * HtmlSubstring.getMyParser().
 */
class CenterTag extends CompositeTag
{
    /** Tag names handled by this prototype. */
    private static final String[] TAG_NAMES = { "CENTER" };

    /** Creates the prototype; no state to initialise. */
    public CenterTag()
    {
    }

    /** Returns the tag names this prototype is registered under. */
    public String[] getIds()
    {
        return TAG_NAMES;
    }

    /**
     * Returns the same names as {@link #getIds()}.
     * NOTE(review): presumably the tags whose start ends this tag - confirm
     * against the HTML Parser documentation.
     */
    public String[] getEnders()
    {
        return TAG_NAMES;
    }

    /** Returns a new empty array on every call. */
    public String[] getEndTagEnders()
    {
        return new String[] {};
    }
}
/**
 * Composite tag prototype for the FONT element, registered by
 * HtmlSubstring.getMyParser().
 */
class FontTag extends CompositeTag
{
    /** Tag names handled by this prototype. */
    private static final String[] TAG_NAMES = { "FONT" };

    /** Creates the prototype; no state to initialise. */
    public FontTag()
    {
    }

    /** Returns the tag names this prototype is registered under. */
    public String[] getIds()
    {
        return TAG_NAMES;
    }

    /**
     * Returns the same names as {@link #getIds()}.
     * NOTE(review): presumably the tags whose start ends this tag - confirm
     * against the HTML Parser documentation.
     */
    public String[] getEnders()
    {
        return TAG_NAMES;
    }

    /** Returns a new empty array on every call. */
    public String[] getEndTagEnders()
    {
        return new String[] {};
    }
}