`
zhangyaou
  • 浏览: 5958 次
  • 性别: Icon_minigender_1
  • 来自: 上海
文章分类
社区版块
存档分类
最新评论

大白菜

阅读更多
package com.inetpsa.mau.util;

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;

import au.id.jericho.lib.html.Element;
import au.id.jericho.lib.html.EndTag;
import au.id.jericho.lib.html.Source;
import au.id.jericho.lib.html.StartTag;
import au.id.jericho.lib.html.Tag;

public class HTML2WordML {
/**
 * Converts an HTML fragment into WordML markup (needs jericho-html-2.4.jar).
 *
 * <p>Input that does not start with {@code <p} is wrapped in a {@code <p>} element, nested
 * {@code <p>} tags are flattened by {@code getOneLevelPTag}, and every depth-0 element is
 * rendered through {@link #walkElement}. The rendered text is then patched in two passes:
 * a {@code <w:p>} is inserted where a {@code </w:p>} (optionally followed by
 * {@code <w:br />} tags) is directly followed by a run, and a {@code </w:p>} is inserted in
 * front of every {@code <w:p>} that directly follows a run.
 *
 * @param src HTML source; {@code null} or empty yields an empty string
 * @return the WordML markup; for non-empty input it always ends with {@code </w:p>}
 */
@SuppressWarnings("unchecked")
public static String convert(String src) {
    if (StringUtils.isEmpty(src)) {
        return "";
    }
    if (!src.startsWith("<p")) {
        src = String.format("<p>%s</p>", src);
    }
    src = getOneLevelPTag(src);

    Source source = new Source(src);
    source.fullSequentialParse();
    List<Element> elementList = source.findAllElements();
    StringBuilder out = new StringBuilder();
    for (Element element : elementList) {
        // Children are reached through the recursion in walkElement.
        if (element.getDepth() == 0) {
            out.append(walkElement(src, element, new ArrayList<String>(), new ArrayList<String>(), -1));
        }
    }

    String outStr = out.toString();
    if (!outStr.startsWith("<w:p")) {
        outStr = String.format("<w:p>%s</w:p>", outStr);
    }

    // Pass 1: a run that directly follows a closed paragraph gets its own <w:p>.
    // 6 == "</w:p>".length(), 8 == "<w:br />".length()
    int index = outStr.indexOf("</w:p>");
    while (index != -1) {
        if (outStr.startsWith("<w:br />", index + 6)) {
            // Step over the break and re-test whatever follows it.
            index += 8;
            continue;
        }
        if (outStr.startsWith("<w:r>", index + 6)) {
            outStr = outStr.substring(0, index + 6) + "<w:p>" + outStr.substring(index + 6);
        }
        index = outStr.indexOf("</w:p>", index + 6);
    }

    // Pass 2: a <w:p> that directly follows a run closes the paragraph still open before it.
    index = outStr.indexOf("<w:p>");
    while (index != -1) {
        // startsWith with an offset is simply false near the start of the string, where the
        // former substring(index - 6, index) would have thrown.
        if (index != 0 && outStr.startsWith("w:r>", index - 4)) {
            outStr = outStr.substring(0, index) + "</w:p>" + outStr.substring(index);
        }
        index = outStr.indexOf("<w:p>", index + 5);
    }

    if (!outStr.endsWith("</w:p>")) {
        outStr += "</w:p>";
    }
    return outStr;
}

/**
 * Renders one element and, recursively, its children as WordML.
 *
 * @param source  the HTML text the element was parsed from; text lying between child
 *                elements is cut out of it by offset
 * @param element the element to render
 * @param pStyle  paragraph-level style fragments of the enclosing paragraph; a fresh list is
 *                used instead when the element is itself a {@code p} or {@code li}
 * @param context run-level style fragments inherited from the ancestors; it is copied and
 *                never modified
 * @param level   list nesting level, incremented for every {@code ul}
 *                ({@code convert} starts at -1)
 * @return a {@code <w:p>} paragraph for {@code p}/{@code li} elements, otherwise only the
 *         runs produced for the element's content
 */
public static String walkElement(String source, Element element, List<String> pStyle, List<String> context, int level) {
    StringBuffer out = new StringBuffer();
    StringBuilder paragraphContent = new StringBuilder();
    List<String> styles = new ArrayList<String>(context);
    String tagName = element.getStartTag().getName();
    boolean isParagraph = tagName.equalsIgnoreCase("p") || tagName.equalsIgnoreCase("li");
    boolean pTagClose = false;

    // Open paragraph
    // ----------------------------
    if (isParagraph) {
        // Special case: nested ul>li need the enclosing paragraph closed early, because they
        // get their own paragraph with a different level.
        if (level > 0) {
            out.append("</w:p>");
            pTagClose = true;
        }
        pStyle = new ArrayList<String>();
    }
    if (tagName.equalsIgnoreCase("ul")) {
        level++;
    }

    // Apply styles
    // ----------------------------
    applyStyle(out, element, pStyle, styles, level);

    // Recurse into child elements, writing the text found between them
    // ----------------------------
    int begin = element.getContent().getBegin();
    @SuppressWarnings("unchecked")
    List<Element> children = (List<Element>) element.getChildElements();
    for (Element child : children) {
        int childStart = child.getStartTag().getBegin();
        if (childStart > begin) {
            paragraphContent.append(write(source.substring(begin, childStart), styles, true));
        }
        // Always continue after the child. Previously a child with neither an end tag nor
        // "/>" (e.g. <br>) left 'begin' untouched, so its markup and the text before it
        // were written again as plain text.
        begin = child.getEnd();

        if (child.getStartTag().getName().equalsIgnoreCase("br")) {
            paragraphContent.append("<w:br />");
            continue;
        }
        paragraphContent.append(walkElement(source, child, pStyle, styles, level));
    }

    // Trailing text after the last child (or the whole content when there are no children)
    int end = element.getContent().getEnd();
    if (end > begin) {
        String hrefStr = tagName.equalsIgnoreCase("a") ? element.getAttributeValue("href") : null;
        if (hrefStr != null) {
            paragraphContent.append("<w:r>");
            paragraphContent.append(" <w:rPr>\n");
            for (String style : styles) {
                // Styles belong inside this run's <w:rPr>; they used to be appended to 'out',
                // which is discarded for non-paragraph elements.
                paragraphContent.append(String.format("\t%s\n", style));
            }
            paragraphContent.append(" </w:rPr>\n");
            paragraphContent.append(" <w:t></w:t>\n");
            paragraphContent.append("</w:r>");
            String wLink = "<w:hlink w:dest='%s'><w:r><w:rPr><w:rStyle w:val='Hyperlink'/><w:lang w:val='EN-US'/></w:rPr><w:t>%s</w:t></w:r></w:hlink>";
            // The href goes into an attribute and into element text: escape &, <, >, " and '.
            String href = StringEscapeUtils.escapeXml(hrefStr);
            paragraphContent.append("    " + String.format(wLink, href, href));
        } else {
            // Plain text, including anchors without an href (formerly rendered as "null").
            paragraphContent.append(write(source.substring(begin, end), styles, true));
        }
    }

    // Close paragraph
    // ----------------------------
    if (isParagraph) {
        out.append("<w:p><w:pPr><w:jc w:val='left'/></w:pPr>\n");
        // Add paragraph styles
        if (pStyle.size() > 0) {
            out.append("<w:pPr>\n");
            for (String style : pStyle) {
                out.append(style + "\n");
            }
            out.append("</w:pPr>\n");
        }
        // Write paragraph content
        out.append(paragraphContent.toString());
        // When the enclosing paragraph was closed early above, this paragraph is left open
        // here; convert() appends the missing </w:p> afterwards.
        if (!pTagClose) {
            out.append("</w:p>");
        }
        return out.toString();
    }
    return paragraphContent.toString();
}

/**
 * Translates an element's tag name and inline {@code style} attribute into WordML style
 * fragments.
 *
 * @param out            not used; kept so the signature stays unchanged for callers
 * @param element        element whose tag name and {@code style} attribute are inspected
 * @param paragraphStyle receives paragraph-level fragments (list properties, alignment)
 * @param styles         receives run-level fragments (bold, italic, underline, colours, size)
 * @param level          list nesting level written into {@code w:ilvl} for {@code li}
 */
private static void applyStyle(StringBuffer out, Element element, List<String> paragraphStyle, List<String> styles, int level) {
    String tagName = element.getStartTag().getName();
    if (tagName.equalsIgnoreCase("li")) {
        paragraphStyle.add(" <w:listPr>");
        paragraphStyle.add(String.format(" <w:ilvl w:val='%s'/>", level));
        paragraphStyle.add(" <w:ilfo w:val='12'/>");
        paragraphStyle.add(" </w:listPr>");
    }
    if (tagName.equalsIgnoreCase("b") || tagName.equalsIgnoreCase("strong")) {
        styles.add(" <w:b />");
    }
    if (tagName.equalsIgnoreCase("i") || tagName.equalsIgnoreCase("em")) {
        styles.add(" <w:i />");
    }

    // NOTE(review): getAttributes() presumably returns null for tag types that cannot carry
    // attributes (e.g. comments) - the null check is defensive; confirm against Jericho docs.
    if (element.getAttributes() == null || element.getAttributes().size() == 0) {
        return;
    }
    String styleAttribute = element.getAttributeValue("style");
    if (styleAttribute == null) {
        return;
    }
    // Locale.ROOT keeps the property names searchable in locales such as Turkish, where
    // "FONT-SIZE".toLowerCase() would not produce "font-size".
    String lowerStyle = styleAttribute.toLowerCase(java.util.Locale.ROOT);

    // Underlined text
    if (lowerStyle.indexOf("underline") >= 0) {
        styles.add(" <w:u w:val='single'/>");
    }

    // Text colour
    int colorIndex = indexOfTextColorProperty(lowerStyle);
    if (colorIndex >= 0) {
        String color = extractHexColor(styleAttribute, colorIndex);
        if (color != null) {
            styles.add(String.format(" <w:color w:val='%s'/>", color));
        }
    }

    // Background colour
    int backgroundIndex = lowerStyle.indexOf("background-color");
    if (backgroundIndex >= 0) {
        String fill = extractHexColor(styleAttribute, backgroundIndex);
        if (fill != null) {
            styles.add(String.format(" <w:shd w:val='clear' w:color='auto' w:fill='%s'/>",
                    fill.toUpperCase(java.util.Locale.ROOT)));
        }
    }

    // Font size
    int fontSizeIndex = lowerStyle.indexOf("font-size");
    if (fontSizeIndex >= 0) {
        String size = "20";
        String[] parts = lowerStyle.substring(fontSizeIndex).split(":");
        if (parts.length > 1) {
            String digits = leadingDigits(parts[1]);
            // More than four digits is not a plausible size and could overflow parseInt.
            if (digits.length() > 0 && digits.length() <= 4) {
                // The numeric value is doubled, as before, for w:sz.
                size = String.valueOf(Integer.parseInt(digits) * 2);
            }
        }
        // Keyword sizes: the longer keywords contain the shorter ones, so they are tested
        // from least to most specific and the last match wins.
        if (lowerStyle.indexOf("small") > 0) size = "20";
        if (lowerStyle.indexOf("x-small") > 0) size = "16.6";
        if (lowerStyle.indexOf("xx-small") > 0) size = "13.8";
        if (lowerStyle.indexOf("medium") > 0) size = "24";
        if (lowerStyle.indexOf("large") > 0) size = "28.8";
        if (lowerStyle.indexOf("x-large") > 0) size = "34.56";
        if (lowerStyle.indexOf("xx-large") > 0) size = "41.4";
        styles.add(String.format(" <w:sz w:val='%s'/><w:sz-cs w:val='%s'/>", size, size));
    }

    // Alignment
    if (lowerStyle.indexOf("text-align") > -1) {
        String align = "both";
        if (lowerStyle.indexOf("center") > 0) align = "center";
        if (lowerStyle.indexOf("right") > 0) align = "right";
        paragraphStyle.add(String.format(" <w:jc w:val='%s'/>", align));
    }
}

/**
 * Finds the first "color" in the lower-cased style that is not the tail of a hyphenated
 * property such as background-color; returns -1 when there is none.
 */
private static int indexOfTextColorProperty(String lowerStyle) {
    int index = lowerStyle.indexOf("color");
    while (index >= 0) {
        if (index == 0 || lowerStyle.charAt(index - 1) != '-') {
            return index;
        }
        index = lowerStyle.indexOf("color", index + 1);
    }
    return -1;
}

/**
 * Returns the "#" plus hex digits found in the declaration that starts at propertyIndex
 * (i.e. before the next ';'), or null when that declaration holds no such value.
 */
private static String extractHexColor(String style, int propertyIndex) {
    int declarationEnd = style.indexOf(';', propertyIndex);
    if (declarationEnd < 0) {
        declarationEnd = style.length();
    }
    int hash = style.indexOf('#', propertyIndex);
    if (hash < 0 || hash >= declarationEnd) {
        return null;
    }
    // Keep only hex digits so nothing else from the style ends up inside the XML attribute.
    int end = hash + 1;
    while (end < declarationEnd && Character.digit(style.charAt(end), 16) >= 0) {
        end++;
    }
    return end > hash + 1 ? style.substring(hash, end) : null;
}

/** Returns the run of ASCII digits at the start of value, after any leading whitespace. */
private static String leadingDigits(String value) {
    int start = 0;
    while (start < value.length() && Character.isWhitespace(value.charAt(start))) {
        start++;
    }
    int end = start;
    while (end < value.length() && value.charAt(end) >= '0' && value.charAt(end) <= '9') {
        end++;
    }
    return value.substring(start, end);
}

/**
 * Builds one WordML run ({@code <w:r>}) containing the given text and run styles.
 * HTML entities in the text are decoded before it is written.
 *
 * @param word   HTML text to write; {@code null} or empty yields an empty string
 * @param styles run-property fragments, written one per line inside {@code <w:rPr>}
 * @param escape {@code true} to wrap the text in a CDATA section, {@code false} to write it
 *               as is
 * @return the run markup, or an empty string when there is no text
 */
public static String write(String word, List<String> styles, boolean escape) {
    if (StringUtils.isEmpty(word)) {
        return "";
    }
    StringBuilder out = new StringBuilder();
    out.append("<w:r>");
    out.append(" <w:rPr>\n");
    for (String style : styles) {
        out.append('\t').append(style).append('\n');
    }
    out.append(" </w:rPr>\n");

    String text = StringEscapeUtils.unescapeHtml(word);
    if (escape) {
        // A literal "]]>" (e.g. decoded from "]]&gt;") would end the CDATA section early and
        // corrupt the document, so it is split across two adjacent sections.
        text = text.replace("]]>", "]]]]><![CDATA[>");
        out.append(" <w:t><![CDATA[").append(text).append("]]></w:t>\n");
    } else {
        out.append(" <w:t>").append(text).append("</w:t>\n");
    }
    out.append("</w:r>");

    return out.toString();
}

/**
 * Counts how often {@code tagName} occurs in {@code source}. The search is a plain,
 * case-sensitive substring search; overlapping occurrences are each counted.
 *
 * @param tagName text to look for, e.g. {@code "<p"}; {@code null} or empty counts as 0
 * @param source  text to search; {@code null} counts as 0
 * @return the number of occurrences
 */
private static int getTagCount(String tagName, String source) {
    // An empty needle matches at every index and indexOf never reports -1 for it, which
    // used to make the loop below spin forever.
    if (tagName == null || source == null || tagName.length() == 0) {
        return 0;
    }
    int count = 0;
    for (int at = source.indexOf(tagName); at != -1; at = source.indexOf(tagName, at + 1)) {
        count++;
    }
    return count;
}

/**
* @param args
*/
private static String getOneLevelPTag(String strHtml) {
Source source = new Source(strHtml);
source.fullSequentialParse();
List tagsList = source.findAllTags();
ArrayList<Integer> startReplaceList = new ArrayList<Integer>();
ArrayList<Integer> endReplaceList = new ArrayList<Integer>();
for(int i=0; i<tagsList.size(); i++) {
if( tagsList.get(i) instanceof Tag) {
Tag tempTag = (Tag)tagsList.get(i);
if( tempTag.getName().equals("p")) {
int beginIndex = tempTag.getBegin();
String startStr = strHtml.substring(0,beginIndex);
int startCount = getTagCount("<p",startStr);
int endCount = getTagCount("</p>",startStr);
if( tempTag instanceof StartTag) {
if( startCount != endCount) {
startReplaceList.add(new Integer(beginIndex));
}
}  else if( tempTag instanceof EndTag) {
if( startCount != endCount + 1) {
endReplaceList.add(new Integer(beginIndex));
}
}
}
}
}
int size = startReplaceList.size();
for(int i=0; i<size; i++) {
int begin = startReplaceList.get(i).intValue();
strHtml = strHtml.substring(0,begin) + "<x" + strHtml.substring(begin+2);
}
size = endReplaceList.size();
for(int i=0; i<size; i++) {
int begin = endReplaceList.get(i).intValue();
strHtml = strHtml.substring(0,begin) + "</x>" + strHtml.substring(begin+4);
}
strHtml = strHtml.replaceAll("<x", "<span");
strHtml = strHtml.replaceAll("</x>", "</span>");

return strHtml;
}
}
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics