当前位置:首页 > JavaServer Page > 正文内容

JAVA截取HTML部分内容

canca18年前 (2007-08-18)JavaServer Page556

超级晕死,昨天花了一整天弄截取HTML内容的功能,从中午2:00左右一直弄到凌晨4:00。在网上搜了一些相关的资料,方法大概有以下几种:

1.将截取出来的不完整的HTML内容经过递归过滤,把未闭合的HTML标签补上。

2.用现有成熟的开源项目:Html Parser来解释处理HTML。

3.将HTML标签全部过滤掉(<br/>标签除外),并将<br />转成' ',转换过后再截取长度。

现在介绍自己在网上看到的解决代码:

1.Javascript解决方案

/**
 * Build a brief (excerpt) of an HTML string that never ends inside an
 * unclosed element.
 *
 * The text is first truncated to `length` UTF-16 code units. The recognised
 * tags in the truncated text are then scanned with a stack; if any element is
 * still open at the end (or its opening tag was itself cut in half), the
 * excerpt is shortened to the position of the OUTERMOST such element.
 *
 * @param {string} text   HTML source.
 * @param {number} length Maximum number of characters to keep.
 * @returns {string} `text` itself when it already fits, otherwise a prefix of
 *                   it (possibly with one ">" appended to complete a closing
 *                   tag that was cut in half).
 */
function Generate_Brief(text,length){
    // Already short enough: return the markup untouched.
    if(text.length <= length) return text;
    var Foremost = text.substring(0,length);

    // [1] "/" for a closing tag, [2] element name, [3] ">" when the tag is complete.
    // The look-ahead forces the WHOLE name to match; without it "<link" is read
    // as LI, "<pre"/"<param" as P and "<area"/"<article" as A.
    var re = /<(\/?)(BODY|STYLE|SCRIPT|P|DIV|H1|H2|H3|H4|H5|H6|ADDRESS|PRE|TABLE|TR|TD|TH|INPUT|SELECT|TEXTAREA|OBJECT|A|UL|OL|LI|BASE|META|LINK|HR|BR|PARAM|IMG|AREA|INPUT|SPAN)(?=[\s\/>]|$)[^>]*(>?)/ig;

    // Void elements. Anchored so that TEXTAREA is not mistaken for AREA.
    var Singlable = /^(?:BASE|META|LINK|HR|BR|PARAM|IMG|AREA|INPUT)$/i;

    var Stack = [], posStack = [];   // open element names / their start offsets
    var newone;
    while((newone = re.exec(Foremost)) !== null){
        var Elem = newone[2].toUpperCase();
        var complete = newone[3] != "";

        if(newone[1] == ""){
            // A complete void element needs no closing tag: nothing to track.
            if(complete && Singlable.test(Elem)) continue;

            Stack.push(Elem);
            posStack.push(newone.index);

            // An opening tag without ">" can only be the truncated tail.
            if(!complete) break;
        }else if(Stack[Stack.length-1] == Elem){
            Stack.pop();
            posStack.pop();
            // The closing tag was cut before its ">": complete it.
            if(!complete) Foremost = Foremost + ">";
        }
        // A closing tag that does not match the top of the stack is ignored.
    }

    // Something is still open: cut right before the outermost open element.
    if(posStack.length > 0) return Foremost.substring(0, posStack[0]);

    // Everything is balanced; drop a tag fragment cut off at the very end
    // (e.g. "<pr" left over from "<pre>"), which the tag regex cannot see.
    var dangling = Foremost.search(/<\/?[a-z][^<>]*$/i);
    return dangling < 0 ? Foremost : Foremost.substring(0, dangling);
}

2.PHP解决方案

/**
 * Build a brief (excerpt) of a UTF-8 HTML string that never ends inside an
 * unclosed element.
 *
 * The text is truncated to BRIEF_LENGTH characters (a constant that must be
 * defined by the caller). The recognised tags of the truncated text are then
 * scanned with a stack; if an element is still open at the end, the excerpt
 * is shortened to the position of the OUTERMOST open element.
 *
 * @param string $text HTML source, UTF-8 encoded.
 * @return string $text itself when it already fits, otherwise a prefix of it
 *                (possibly with one ">" appended to complete a closing tag
 *                that was cut in half).
 */
function Generate_Brief($text){
    mb_regex_encoding("UTF-8");
    if(mb_strlen($text, "UTF-8") <= BRIEF_LENGTH) return $text;
    $Foremost = mb_substr($text, 0, BRIEF_LENGTH, "UTF-8");

    /*  [Child-matching formulation]:

        $match[1] : a "/" character when the current "<...>" fragment is a closing tag
        $match[2] : element name
        $match[3] : the right ">" of the "<...>" fragment (empty when the tag was cut off)

        The look-ahead forces the WHOLE name to match; without it "<link" is
        read as LI, "<pre"/"<param" as P and "<area" as A.
    */
    $re = '<(/?)(P|DIV|H1|H2|H3|H4|H5|H6|ADDRESS|PRE|TABLE|TR|TD|TH|INPUT|SELECT|TEXTAREA|OBJECT|A|UL|OL|LI|BASE|META|LINK|HR|BR|PARAM|IMG|AREA|INPUT|SPAN)(?=[\s/>]|\z)[^>]*(>?)';

    // Void elements, compared by exact name (TEXTAREA must not count as AREA).
    $Single = array('BASE', 'META', 'LINK', 'HR', 'BR', 'PARAM', 'IMG', 'AREA', 'INPUT');

    $Stack = array(); $posStack = array();

    mb_ereg_search_init($Foremost, $re, 'i');

    while($pos = mb_ereg_search_pos()){
        $match = mb_ereg_search_getregs();
        $Elem = mb_strtoupper($match[2]);
        $complete = ($match[3] != "");

        if($match[1] == ""){
            // A complete void element needs no closing tag: nothing to track.
            if($complete && in_array($Elem, $Single, true)){
                continue;
            }
            array_push($Stack, $Elem);
            array_push($posStack, $pos[0]);
        }else if(count($Stack) > 0 && $Stack[count($Stack)-1] === $Elem){
            array_pop($Stack);
            array_pop($posStack);
            // The closing tag was cut before its ">": complete it.
            if(!$complete){
                $Foremost = $Foremost.">";
            }
        }
        // A closing tag that does not match the top of the stack is ignored.
    }

    // Something is still open: cut right before the outermost open element.
    // mb_ereg_search_pos() reports BYTE offsets, so the byte-based substr()
    // is used; "<" is ASCII, so the cut never splits a UTF-8 sequence.
    if(count($posStack) > 0){
        return substr($Foremost, 0, $posStack[0]);
    }

    // Everything is balanced; drop a tag fragment cut off at the very end
    // (e.g. "<pr" left over from "<pre>"), which the tag regex cannot see.
    if(preg_match('/<\/?[a-z][^<>]*\z/i', $Foremost, $dangling, PREG_OFFSET_CAPTURE)){
        return substr($Foremost, 0, $dangling[0][1]);
    }
    return $Foremost;
}

3.JAVA解决方案

package cn.kgnews.util;

import org.apache.commons.logging.Log;

import org.apache.commons.logging.LogFactory;

import org.htmlparser.*;

import org.htmlparser.tags.CompositeTag;

import org.htmlparser.util.NodeIterator;

import org.htmlparser.util.NodeList;

/**
 *
 * Functions for HTML.
 *
 *
 *
 * @author Scud http://www.javascud.org Date: Nov 3, 2006 10:22:20 AM
 *
 */

public class HtmlSubstring

{

 private static Log log = LogFactory.getLog(HtmlSubstring.class);

 /**
  *
  * get parser for substring.
  *
  * @return Parser
  *
  */

 public static Parser getMyParser()

 {

  Parser parser = new Parser();

  PrototypicalNodeFactory factory = new PrototypicalNodeFactory();

  // register tags which htmlParser not have

  factory.registerTag(new StrongTag());

  factory.registerTag(new BoldTag());

  factory.registerTag(new ItalicTag());

  factory.registerTag(new UnderlineTag());

  factory.registerTag(new CenterTag());

  factory.registerTag(new FontTag());
  
  parser.setNodeFactory(factory);

  return parser;

 }

 /**
  *
  * Substring for Html String.
  *
  *
  *
  * @param htmlString
  *            Html string
  *
  * @param maxlength
  *            maxlength
  *
  * @return String
  *
  */

 public static String substring(String htmlString, int maxlength)

 {

  StringBuffer htmlOut = new StringBuffer();

  StringBuffer stringOut = new StringBuffer();

  try

  {

   Parser parser = getMyParser();

   parser.setInputHTML(htmlString);

   NodeIterator nit = parser.elements();

   boolean breaked = false;

   while (nit.hasMoreNodes())

   {

    Node node = nit.nextNode();


     if (node instanceof Text)

     {

      breaked = dealText(node, stringOut, htmlOut, maxlength);

     }

     else if (node instanceof Tag)

     {

      Tag tag = (Tag) node;

      breaked = dealTag(tag, stringOut, htmlOut, maxlength);

     }

     else if (node instanceof Remark)

     {

      // nothing to do

     }

    if (breaked)

    {

     break;

    }

   }

  }

  catch (Exception e)

  {

   log.error("Error occured when parse Html String", e);

  }

  return htmlOut.toString();

 }

 private static boolean dealText(Node node, StringBuffer stringOut,
   StringBuffer htmlOut, int maxlength)

 {

  String currentText = node.getText();

  int previousLength = stringOut.length();

  if (previousLength + currentText.length() >= maxlength)

  {

   String cutString = currentText.substring(0, maxlength
     - previousLength);

   stringOut.append(cutString);

   htmlOut.append(cutString);

   log.debug(cutString);

   return true;

  }

  else

  {

   stringOut.append(node.getText());

   htmlOut.append(node.getText());

   log.debug(node.getText());

  }

  return false;

 }

 private static boolean dealTag(Tag aTag, StringBuffer stringOut,
   StringBuffer htmlOut, int maxlength) throws Exception

 {

  NodeList list = aTag.getChildren();

  log.debug(getStartTagString(aTag));

  htmlOut.append(getStartTagString(aTag));

  boolean breaked = false;

  if (list != null)

  {

   NodeIterator it = list.elements();

   while (it.hasMoreNodes())

   {

    Node node = it.nextNode();

    if (node instanceof Text)

    {

     breaked = dealText(node, stringOut, htmlOut, maxlength);

    }

    else if (node instanceof Tag)

    {

     Tag tag = (Tag) node;

     breaked = dealTag(tag, stringOut, htmlOut, maxlength);

    }

    else if (node instanceof Remark)

    {

     // nothing to do

    }

    if (breaked)

    {

     break;

    }

   }

  }

  Tag endTag = aTag.getEndTag();

  if (endTag != null)

  {

   htmlOut.append(aTag.getEndTag().toHtml());

   log.debug(aTag.getEndTag().toHtml());

  }

  return breaked;

 }

 private static String getStartTagString(Tag aTag)

 {

  StringBuffer start = new StringBuffer("<");

  for (Object o : aTag.getAttributesEx())

  {

   Attribute ab = (Attribute) o;

   start.append(ab.toString());

  }

  start.append(">");

  return start.toString();

 }

}

class StrongTag extends CompositeTag

{

 private static final String[] mIds = new String[] { "STRONG" };

 public StrongTag()

 {

 }

 public String[] getIds()

 {

  return (mIds);

 }

 public String[] getEnders()

 {

  return (mIds);

 }

 public String[] getEndTagEnders()

 {

  return (new String[0]);

 }

}

class BoldTag extends CompositeTag

{

 private static final String[] mIds = new String[] { "B" };

 public BoldTag()

 {

 }

 public String[] getIds()

 {

  return (mIds);

 }

 public String[] getEnders()

 {

  return (mIds);

 }

 public String[] getEndTagEnders()

 {

  return (new String[0]);

 }

}

class ItalicTag extends CompositeTag

{

 private static final String[] mIds = new String[] { "I" };

 public ItalicTag()

 {

 }

 public String[] getIds()

 {

  return (mIds);

 }

 public String[] getEnders()

 {

  return (mIds);

 }

 public String[] getEndTagEnders()

 {

  return (new String[0]);

 }

}

class UnderlineTag extends CompositeTag

{

 private static final String[] mIds = new String[] { "U" };

 public UnderlineTag()

 {

 }

 public String[] getIds()

 {

  return (mIds);

 }

 public String[] getEnders()

 {

  return (mIds);

 }

 public String[] getEndTagEnders()

 {

  return (new String[0]);

 }

}

class CenterTag extends CompositeTag

{

 private static final String[] mIds = new String[] { "CENTER" };

 public CenterTag()

 {

 }

 public String[] getIds()

 {

  return (mIds);

 }

 public String[] getEnders()

 {

  return (mIds);

 }

 public String[] getEndTagEnders()

 {

  return (new String[0]);

 }

}

class FontTag extends CompositeTag

{

 private static final String[] mIds = new String[] { "FONT" };

 public FontTag()

 {

 }

 public String[] getIds()

 {

  return (mIds);

 }

 public String[] getEnders()

 {

  return (mIds);

 }

 public String[] getEndTagEnders()

 {

  return (new String[0]);

 }

}

 

扫描二维码推送至手机访问。

版权声明:本文由Ant.Master's Blog发布,如需转载请注明出处。

本文链接:https://iant.work/post/565.html

标签: JavaServer Page
分享给朋友:

“JAVA截取HTML部分内容” 的相关文章

JBoss,Tomcat 中文URL支持方法

JBOSS 找到jboss4的deploy\jbossweb-tomcat50.sar\server.xml,编辑该文件,在下面的XML节点中增加红色的字<Connector port="8080" address="${jboss.bind.address}"  &nbs...

在web.xml不认<taglib>解决办法

在web.xml不认<taglib>解决办法: 如果是头是这样的<!DOCTYPE web-app PUBLIC "-//Sun Microsystems, Inc.//DTD Web Application&n...

struts,ajax乱码解决方案

乱码问题好像跟我们中国程序员特别有缘,一直困扰着我们,从开始的JSP乱码问题,STRUTS乱码问题,到现在的AJAX乱码问题,无一不是搞得许多程序员焦头烂额的,整天骂XXX产品对中文支持不了,UTF-8无法使用中文啊什么的,其实这里面被骂的产品中其实99%以上是对中文支持非常好的,而出现乱码的原...

浏览网页时的错误代号

① 客户方错误    100  继续    101  交换协议  ② 成功    200  OK    201  已创建 &nbs...

FCKeditor的秘密

       哈哈。。由于项目的需要,这几天一直在搞FCKeditor。其实,FCKeditor配置很简单。但不知道怎么样。在我的项目里FCKeditor总不能在FireFox里显示。开始我还以为是我的配置有问题。但我从头到尾检查了配置文件...

发表评论

访客

◎欢迎参与讨论,请在这里发表您的看法和观点。