Browse Source
# Conflicts: # fine-itext-old/src/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java # fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.javarelease/10.0
Harrison
4 years ago
2 changed files with 1568 additions and 0 deletions
@ -0,0 +1,788 @@ |
|||||||
|
/* |
||||||
|
* Copyright 2004 Paulo Soares |
||||||
|
* |
||||||
|
* The contents of this file are subject to the Mozilla Public License Version 1.1 |
||||||
|
* (the "License"); you may not use this file except in compliance with the License. |
||||||
|
* You may obtain a copy of the License at http://www.mozilla.org/MPL/
|
||||||
|
* |
||||||
|
* Software distributed under the License is distributed on an "AS IS" basis, |
||||||
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
||||||
|
* for the specific language governing rights and limitations under the License. |
||||||
|
* |
||||||
|
* The Original Code is 'iText, a free JAVA-PDF library'. |
||||||
|
* |
||||||
|
* The Initial Developer of the Original Code is Bruno Lowagie. Portions created by |
||||||
|
* the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie. |
||||||
|
* All Rights Reserved. |
||||||
|
* Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer |
||||||
|
* are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved. |
||||||
|
* |
||||||
|
* Contributor(s): all the names of the contributors are added in the source code |
||||||
|
* where applicable. |
||||||
|
* |
||||||
|
* Alternatively, the contents of this file may be used under the terms of the |
||||||
|
* LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the |
||||||
|
* provisions of LGPL are applicable instead of those above. If you wish to |
||||||
|
* allow use of your version of this file only under the terms of the LGPL |
||||||
|
* License and not to allow others to use your version of this file under |
||||||
|
* the MPL, indicate your decision by deleting the provisions above and |
||||||
|
* replace them with the notice and other provisions required by the LGPL. |
||||||
|
* If you do not delete the provisions above, a recipient may use your version |
||||||
|
* of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE. |
||||||
|
* |
||||||
|
* This library is free software; you can redistribute it and/or modify it |
||||||
|
* under the terms of the MPL as stated above or under the terms of the GNU |
||||||
|
* Library General Public License as published by the Free Software Foundation; |
||||||
|
* either version 2 of the License, or any later version. |
||||||
|
* |
||||||
|
* This library is distributed in the hope that it will be useful, but WITHOUT |
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
||||||
|
* FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more |
||||||
|
* details. |
||||||
|
* |
||||||
|
* Contributions by: |
||||||
|
* Lubos Strapko |
||||||
|
* |
||||||
|
* If you didn't download this code from the following link, you should check if |
||||||
|
* you aren't using an obsolete version: |
||||||
|
* http://www.lowagie.com/iText/
|
||||||
|
*/ |
||||||
|
|
||||||
|
package com.fr.third.com.lowagie.text.html.simpleparser; |
||||||
|
|
||||||
|
import com.fr.third.com.lowagie.text.Chunk; |
||||||
|
import com.fr.third.com.lowagie.text.DocListener; |
||||||
|
import com.fr.third.com.lowagie.text.DocumentException; |
||||||
|
import com.fr.third.com.lowagie.text.Element; |
||||||
|
import com.fr.third.com.lowagie.text.ElementTags; |
||||||
|
import com.fr.third.com.lowagie.text.ExceptionConverter; |
||||||
|
import com.fr.third.com.lowagie.text.FontFactoryImp; |
||||||
|
import com.fr.third.com.lowagie.text.HeaderFooter; |
||||||
|
import com.fr.third.com.lowagie.text.Image; |
||||||
|
import com.fr.third.com.lowagie.text.List; |
||||||
|
import com.fr.third.com.lowagie.text.ListItem; |
||||||
|
import com.fr.third.com.lowagie.text.Paragraph; |
||||||
|
import com.fr.third.com.lowagie.text.Phrase; |
||||||
|
import com.fr.third.com.lowagie.text.Rectangle; |
||||||
|
import com.fr.third.com.lowagie.text.TextElementArray; |
||||||
|
import com.fr.third.com.lowagie.text.html.CSSUtils; |
||||||
|
import com.fr.third.com.lowagie.text.html.HtmlTags; |
||||||
|
import com.fr.third.com.lowagie.text.html.Markup; |
||||||
|
import com.fr.third.com.lowagie.text.pdf.PdfPTable; |
||||||
|
import com.fr.third.com.lowagie.text.pdf.draw.LineSeparator; |
||||||
|
import com.fr.third.com.lowagie.text.xml.simpleparser.SimpleXMLDocHandler; |
||||||
|
import com.fr.third.com.lowagie.text.xml.simpleparser.SimpleXMLParser; |
||||||
|
import com.fr.third.sun.misc.BASE64Decoder; |
||||||
|
|
||||||
|
import java.io.File; |
||||||
|
import java.io.IOException; |
||||||
|
import java.io.Reader; |
||||||
|
import java.util.ArrayList; |
||||||
|
import java.util.HashMap; |
||||||
|
import java.util.Stack; |
||||||
|
import java.util.StringTokenizer; |
||||||
|
|
||||||
|
public class HTMLWorker implements SimpleXMLDocHandler, DocListener { |
||||||
|
|
||||||
|
protected ArrayList objectList; |
||||||
|
|
||||||
|
protected DocListener document; |
||||||
|
|
||||||
|
private Paragraph currentParagraph; |
||||||
|
|
||||||
|
private ChainedProperties cprops = new ChainedProperties(); |
||||||
|
|
||||||
|
private Stack stack = new Stack(); |
||||||
|
|
||||||
|
private boolean pendingTR = false; |
||||||
|
|
||||||
|
private boolean pendingTD = false; |
||||||
|
|
||||||
|
private boolean pendingLI = false; |
||||||
|
|
||||||
|
private StyleSheet style = new StyleSheet(); |
||||||
|
|
||||||
|
private boolean isPRE = false; |
||||||
|
|
||||||
|
private Stack tableState = new Stack(); |
||||||
|
|
||||||
|
private boolean skipText = false; |
||||||
|
|
||||||
|
private HashMap interfaceProps; |
||||||
|
|
||||||
|
private FactoryProperties factoryProperties = new FactoryProperties(); |
||||||
|
|
||||||
|
/**
 * Creates a worker that forwards the elements it builds to the given listener.
 *
 * @param document the {@code DocListener} that receives the elements; may be
 *                 {@code null} if it is assigned before parsing starts
 */
public HTMLWorker(DocListener document) {
    this.document = document;
}
||||||
|
|
||||||
|
public void setStyleSheet(StyleSheet style) { |
||||||
|
this.style = style; |
||||||
|
} |
||||||
|
|
||||||
|
public StyleSheet getStyleSheet() { |
||||||
|
return style; |
||||||
|
} |
||||||
|
|
||||||
|
public void setInterfaceProps(HashMap interfaceProps) { |
||||||
|
this.interfaceProps = interfaceProps; |
||||||
|
FontFactoryImp ff = null; |
||||||
|
if (interfaceProps != null) |
||||||
|
ff = (FontFactoryImp) interfaceProps.get("font_factory"); |
||||||
|
if (ff != null) |
||||||
|
factoryProperties.setFontImp(ff); |
||||||
|
} |
||||||
|
|
||||||
|
public HashMap getInterfaceProps() { |
||||||
|
return interfaceProps; |
||||||
|
} |
||||||
|
|
||||||
|
public void parse(Reader reader) throws IOException { |
||||||
|
SimpleXMLParser.parse(this, null, reader, true); |
||||||
|
} |
||||||
|
|
||||||
|
public static ArrayList parseToList(Reader reader, StyleSheet style) |
||||||
|
throws IOException { |
||||||
|
return parseToList(reader, style, null); |
||||||
|
} |
||||||
|
|
||||||
|
public static ArrayList parseToList(Reader reader, StyleSheet style, |
||||||
|
HashMap interfaceProps) throws IOException { |
||||||
|
HTMLWorker worker = new HTMLWorker(null); |
||||||
|
if (style != null) |
||||||
|
worker.style = style; |
||||||
|
worker.document = worker; |
||||||
|
worker.setInterfaceProps(interfaceProps); |
||||||
|
worker.objectList = new ArrayList(); |
||||||
|
worker.parse(reader); |
||||||
|
return worker.objectList; |
||||||
|
} |
||||||
|
|
||||||
|
public void endDocument() { |
||||||
|
try { |
||||||
|
for (int k = 0; k < stack.size(); ++k) |
||||||
|
document.add((Element) stack.elementAt(k)); |
||||||
|
if (currentParagraph != null) |
||||||
|
document.add(currentParagraph); |
||||||
|
currentParagraph = null; |
||||||
|
} catch (Exception e) { |
||||||
|
throw new ExceptionConverter(e); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
public void startDocument() { |
||||||
|
HashMap h = new HashMap(); |
||||||
|
style.applyStyle("body", h); |
||||||
|
cprops.addToChain("body", h); |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
public void startElement(String tag, HashMap h) { |
||||||
|
if (!tagsSupported.containsKey(tag)) |
||||||
|
return; |
||||||
|
try { |
||||||
|
style.applyStyle(tag, h); |
||||||
|
if(tag.equals("p")){ |
||||||
|
h.put(Markup.CSS_KEY_MARGINTOP, "16px"); |
||||||
|
h.put(Markup.CSS_KEY_MARGINBOTTOM, "16px"); |
||||||
|
} |
||||||
|
String follow = (String) FactoryProperties.followTags.get(tag); |
||||||
|
if (follow != null) { |
||||||
|
HashMap prop = new HashMap(); |
||||||
|
prop.put(follow, null); |
||||||
|
FactoryProperties.insertStyle(h, this.cprops); |
||||||
|
prop.putAll(h); |
||||||
|
|
||||||
|
cprops.addToChain(follow, prop); |
||||||
|
return; |
||||||
|
} |
||||||
|
FactoryProperties.insertStyle(h, cprops); |
||||||
|
if (tag.equals(HtmlTags.ANCHOR)) { |
||||||
|
cprops.addToChain(tag, h); |
||||||
|
if (currentParagraph == null) { |
||||||
|
currentParagraph = new Paragraph(); |
||||||
|
} |
||||||
|
stack.push(currentParagraph); |
||||||
|
currentParagraph = new Paragraph(); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals(HtmlTags.NEWLINE)) { |
||||||
|
if (currentParagraph == null) { |
||||||
|
currentParagraph = new Paragraph(); |
||||||
|
} |
||||||
|
currentParagraph.add(factoryProperties |
||||||
|
.createChunk("\n", cprops)); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals(HtmlTags.HORIZONTALRULE)) { |
||||||
|
// Attempting to duplicate the behavior seen on Firefox with
|
||||||
|
// http://www.w3schools.com/tags/tryit.asp?filename=tryhtml_hr_test
|
||||||
|
// where an initial break is only inserted when the preceding element doesn't
|
||||||
|
// end with a break, but a trailing break is always inserted.
|
||||||
|
boolean addLeadingBreak = true; |
||||||
|
if (currentParagraph == null) { |
||||||
|
currentParagraph = new Paragraph(); |
||||||
|
addLeadingBreak = false; |
||||||
|
} |
||||||
|
if (addLeadingBreak) { // Not a new paragraph
|
||||||
|
int numChunks = currentParagraph.getChunks().size(); |
||||||
|
if (numChunks == 0 || |
||||||
|
((Chunk)(currentParagraph.getChunks().get(numChunks - 1))).getContent().endsWith("\n")) |
||||||
|
addLeadingBreak = false; |
||||||
|
} |
||||||
|
String align = (String) h.get("align"); |
||||||
|
int hrAlign = Element.ALIGN_CENTER; |
||||||
|
if (align != null) { |
||||||
|
if (align.equalsIgnoreCase("left")) |
||||||
|
hrAlign = Element.ALIGN_LEFT; |
||||||
|
if (align.equalsIgnoreCase("right")) |
||||||
|
hrAlign = Element.ALIGN_RIGHT; |
||||||
|
} |
||||||
|
String width = (String) h.get("width"); |
||||||
|
float hrWidth = 1; |
||||||
|
if (width != null) { |
||||||
|
float tmpWidth = Markup.parseLength(width, Markup.DEFAULT_FONT_SIZE); |
||||||
|
if (tmpWidth > 0) hrWidth = tmpWidth; |
||||||
|
if (!width.endsWith("%")) |
||||||
|
hrWidth = 100; // Treat a pixel width as 100% for now.
|
||||||
|
} |
||||||
|
String size = (String) h.get("size"); |
||||||
|
float hrSize = 1; |
||||||
|
if (size != null) { |
||||||
|
float tmpSize = Markup.parseLength(size, Markup.DEFAULT_FONT_SIZE); |
||||||
|
if (tmpSize > 0) |
||||||
|
hrSize = tmpSize; |
||||||
|
} |
||||||
|
if (addLeadingBreak) |
||||||
|
currentParagraph.add(Chunk.NEWLINE); |
||||||
|
currentParagraph.add(new LineSeparator(hrSize, hrWidth, null, hrAlign, currentParagraph.getLeading()/2)); |
||||||
|
currentParagraph.add(Chunk.NEWLINE); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals(HtmlTags.CHUNK) || tag.equals(HtmlTags.SPAN)) { |
||||||
|
cprops.addToChain(tag, h); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals(HtmlTags.IMAGE)) { |
||||||
|
String src = (String) h.get(ElementTags.SRC); |
||||||
|
if (src == null) |
||||||
|
return; |
||||||
|
cprops.addToChain(tag, h); |
||||||
|
Image img = null; |
||||||
|
if (interfaceProps != null) { |
||||||
|
ImageProvider ip = (ImageProvider) interfaceProps |
||||||
|
.get("img_provider"); |
||||||
|
if (ip != null) |
||||||
|
img = ip.getImage(src, h, cprops, document); |
||||||
|
if (img == null) { |
||||||
|
HashMap images = (HashMap) interfaceProps |
||||||
|
.get("img_static"); |
||||||
|
if (images != null) { |
||||||
|
Image tim = (Image) images.get(src); |
||||||
|
if (tim != null) |
||||||
|
img = Image.getInstance(tim); |
||||||
|
} else { |
||||||
|
if (!src.startsWith("http")) { // relative src references only
|
||||||
|
String baseurl = (String) interfaceProps |
||||||
|
.get("img_baseurl"); |
||||||
|
if (baseurl != null) { |
||||||
|
src = baseurl + src; |
||||||
|
img = Image.getInstance(src); |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
//处理base64编码图片
|
||||||
|
if(src.startsWith("data")){ |
||||||
|
BASE64Decoder decoder = new BASE64Decoder(); |
||||||
|
String[] srcArray = src.split(","); |
||||||
|
String base64string = srcArray[srcArray.length -1]; |
||||||
|
byte[] bytes = decoder.decodeBuffer(base64string); |
||||||
|
try { |
||||||
|
img = Image.getInstance(bytes); |
||||||
|
}catch (Exception e){ |
||||||
|
|
||||||
|
} |
||||||
|
|
||||||
|
} |
||||||
|
if (img == null) { |
||||||
|
if (!src.startsWith("http")) { |
||||||
|
String path = cprops.getProperty("image_path"); |
||||||
|
if (path == null) |
||||||
|
path = ""; |
||||||
|
src = new File(path, src).getPath(); |
||||||
|
} |
||||||
|
img = Image.getInstance(src); |
||||||
|
} |
||||||
|
if(img == null){ |
||||||
|
return; |
||||||
|
} |
||||||
|
img.setSrcString(src); |
||||||
|
String align = (String) h.get("align"); |
||||||
|
String width = (String) h.get("width"); |
||||||
|
String height = (String) h.get("height"); |
||||||
|
String before = cprops.getProperty("before"); |
||||||
|
String after = cprops.getProperty("after"); |
||||||
|
if (before != null) |
||||||
|
img.setSpacingBefore(Float.parseFloat(before)); |
||||||
|
if (after != null) |
||||||
|
img.setSpacingAfter(Float.parseFloat(after)); |
||||||
|
float actualFontSize = Markup.parseLength(cprops |
||||||
|
.getProperty(ElementTags.SIZE), |
||||||
|
Markup.DEFAULT_FONT_SIZE); |
||||||
|
if (actualFontSize <= 0f) |
||||||
|
actualFontSize = Markup.DEFAULT_FONT_SIZE; |
||||||
|
float widthInPoints = Markup.parseLength(width, actualFontSize); |
||||||
|
float heightInPoints = Markup.parseLength(height, |
||||||
|
actualFontSize); |
||||||
|
if (widthInPoints > 0 && heightInPoints > 0) { |
||||||
|
img.scaleAbsolute(widthInPoints, heightInPoints); |
||||||
|
} else if (widthInPoints > 0) { |
||||||
|
heightInPoints = img.getHeight() * widthInPoints |
||||||
|
/ img.getWidth(); |
||||||
|
img.scaleAbsolute(widthInPoints, heightInPoints); |
||||||
|
} else if (heightInPoints > 0) { |
||||||
|
widthInPoints = img.getWidth() * heightInPoints |
||||||
|
/ img.getHeight(); |
||||||
|
img.scaleAbsolute(widthInPoints, heightInPoints); |
||||||
|
} |
||||||
|
img.setWidthPercentage(0); |
||||||
|
if (align != null) { |
||||||
|
endElement("p"); |
||||||
|
int ralign = Image.MIDDLE; |
||||||
|
if (align.equalsIgnoreCase("left")) |
||||||
|
ralign = Image.LEFT; |
||||||
|
else if (align.equalsIgnoreCase("right")) |
||||||
|
ralign = Image.RIGHT; |
||||||
|
img.setAlignment(ralign); |
||||||
|
Img i = null; |
||||||
|
boolean skip = false; |
||||||
|
if (interfaceProps != null) { |
||||||
|
i = (Img) interfaceProps.get("img_interface"); |
||||||
|
if (i != null) |
||||||
|
skip = i.process(img, h, cprops, document); |
||||||
|
} |
||||||
|
if (!skip) |
||||||
|
document.add(img); |
||||||
|
cprops.removeChain(tag); |
||||||
|
} else { |
||||||
|
Chunk ck = new Chunk(img, 0, 0); |
||||||
|
if(cprops.hasPropertyInChain("img", "padding-left")){ |
||||||
|
String ss = cprops.getPropertyFromChain("img", "padding-left"); |
||||||
|
ck.setAttribute("padding-left", Float.toString(Markup.parseLength(ss))); |
||||||
|
} |
||||||
|
if(cprops.hasPropertyInChain("img", "padding-right")){ |
||||||
|
String ss = cprops.getPropertyFromChain("img", "padding-right"); |
||||||
|
ck.setAttribute("padding-right", Float.toString(Markup.parseLength(ss))); |
||||||
|
} |
||||||
|
cprops.removeChain(tag); |
||||||
|
if (currentParagraph == null) { |
||||||
|
currentParagraph = FactoryProperties |
||||||
|
.createParagraph(cprops); |
||||||
|
} |
||||||
|
|
||||||
|
currentParagraph.add(ck); |
||||||
|
} |
||||||
|
return; |
||||||
|
} |
||||||
|
endElement("p"); |
||||||
|
if (tag.equals("h1") || tag.equals("h2") || tag.equals("h3") |
||||||
|
|| tag.equals("h4") || tag.equals("h5") || tag.equals("h6")) { |
||||||
|
if (!h.containsKey(ElementTags.SIZE)) { |
||||||
|
int v = 7 - Integer.parseInt(tag.substring(1)); |
||||||
|
h.put(ElementTags.SIZE, Integer.toString(v)); |
||||||
|
} |
||||||
|
cprops.addToChain(tag, h); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals(HtmlTags.UNORDEREDLIST)) { |
||||||
|
if (pendingLI) |
||||||
|
endElement(HtmlTags.LISTITEM); |
||||||
|
skipText = true; |
||||||
|
cprops.addToChain(tag, h); |
||||||
|
List list = new List(false); |
||||||
|
try{ |
||||||
|
list.setIndentationLeft(new Float(cprops.getProperty("indent")).floatValue()); |
||||||
|
}catch (Exception e) { |
||||||
|
list.setAutoindent(true); |
||||||
|
} |
||||||
|
list.setListSymbol("\u2022"); |
||||||
|
stack.push(list); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals(HtmlTags.ORDEREDLIST)) { |
||||||
|
if (pendingLI) |
||||||
|
endElement(HtmlTags.LISTITEM); |
||||||
|
skipText = true; |
||||||
|
cprops.addToChain(tag, h); |
||||||
|
List list = new List(true); |
||||||
|
try{ |
||||||
|
list.setIndentationLeft(new Float(cprops.getProperty("indent")).floatValue()); |
||||||
|
}catch (Exception e) { |
||||||
|
list.setAutoindent(true); |
||||||
|
} |
||||||
|
stack.push(list); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals(HtmlTags.LISTITEM)) { |
||||||
|
if (pendingLI) |
||||||
|
endElement(HtmlTags.LISTITEM); |
||||||
|
skipText = false; |
||||||
|
pendingLI = true; |
||||||
|
cprops.addToChain(tag, h); |
||||||
|
ListItem item = FactoryProperties.createListItem(cprops); |
||||||
|
stack.push(item); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals(HtmlTags.DIV) || tag.equals(HtmlTags.BODY) || tag.equals("p")) { |
||||||
|
cprops.addToChain(tag, h); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals(HtmlTags.PRE)) { |
||||||
|
if (!h.containsKey(ElementTags.FACE)) { |
||||||
|
h.put(ElementTags.FACE, "Courier"); |
||||||
|
} |
||||||
|
cprops.addToChain(tag, h); |
||||||
|
isPRE = true; |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals("tr")) { |
||||||
|
if (pendingTR) |
||||||
|
endElement("tr"); |
||||||
|
skipText = true; |
||||||
|
pendingTR = true; |
||||||
|
cprops.addToChain("tr", h); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals("td") || tag.equals("th")) { |
||||||
|
if (pendingTD) |
||||||
|
endElement(tag); |
||||||
|
skipText = false; |
||||||
|
pendingTD = true; |
||||||
|
cprops.addToChain("td", h); |
||||||
|
stack.push(new IncCell(tag, cprops)); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals("table")) { |
||||||
|
cprops.addToChain("table", h); |
||||||
|
IncTable table = new IncTable(h); |
||||||
|
stack.push(table); |
||||||
|
tableState.push(new boolean[] { pendingTR, pendingTD }); |
||||||
|
pendingTR = pendingTD = false; |
||||||
|
skipText = true; |
||||||
|
return; |
||||||
|
} |
||||||
|
} catch (Exception e) { |
||||||
|
throw new ExceptionConverter(e); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
public void endElement(String tag) { |
||||||
|
if (!tagsSupported.containsKey(tag)) |
||||||
|
return; |
||||||
|
try { |
||||||
|
String follow = (String) FactoryProperties.followTags.get(tag); |
||||||
|
if (follow != null) { |
||||||
|
cprops.removeChain(follow); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals("font") || tag.equals("span")) { |
||||||
|
cprops.removeChain(tag); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals("a")) { |
||||||
|
if (currentParagraph == null) { |
||||||
|
currentParagraph = new Paragraph(); |
||||||
|
} |
||||||
|
boolean skip = false; |
||||||
|
if (interfaceProps != null) { |
||||||
|
ALink i = (ALink) interfaceProps.get("alink_interface"); |
||||||
|
if (i != null) |
||||||
|
skip = i.process(currentParagraph, cprops); |
||||||
|
} |
||||||
|
if (!skip) { |
||||||
|
String href = cprops.getProperty("href"); |
||||||
|
if (href != null) { |
||||||
|
ArrayList chunks = currentParagraph.getChunks(); |
||||||
|
int size = chunks.size(); |
||||||
|
for (int k = 0; k < size; ++k) { |
||||||
|
Chunk ck = (Chunk) chunks.get(k); |
||||||
|
ck.setAnchor(href); |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
Paragraph tmp = (Paragraph) stack.pop(); |
||||||
|
Phrase tmp2 = new Phrase(); |
||||||
|
tmp2.add(currentParagraph); |
||||||
|
tmp.add(tmp2); |
||||||
|
currentParagraph = tmp; |
||||||
|
cprops.removeChain("a"); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals("br")) { |
||||||
|
return; |
||||||
|
} |
||||||
|
if (currentParagraph != null) { |
||||||
|
if (stack.empty()) |
||||||
|
document.add(currentParagraph); |
||||||
|
else { |
||||||
|
Object obj = stack.pop(); |
||||||
|
if (obj instanceof TextElementArray) { |
||||||
|
TextElementArray current = (TextElementArray) obj; |
||||||
|
current.add(currentParagraph); |
||||||
|
} |
||||||
|
stack.push(obj); |
||||||
|
} |
||||||
|
} |
||||||
|
currentParagraph = null; |
||||||
|
if (tag.equals(HtmlTags.UNORDEREDLIST) |
||||||
|
|| tag.equals(HtmlTags.ORDEREDLIST)) { |
||||||
|
if (pendingLI) |
||||||
|
endElement(HtmlTags.LISTITEM); |
||||||
|
skipText = false; |
||||||
|
cprops.removeChain(tag); |
||||||
|
if (stack.empty()) |
||||||
|
return; |
||||||
|
Object obj = stack.pop(); |
||||||
|
if (!(obj instanceof List)) { |
||||||
|
stack.push(obj); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (stack.empty()) |
||||||
|
document.add((Element) obj); |
||||||
|
else |
||||||
|
((TextElementArray) stack.peek()).add(obj); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals(HtmlTags.LISTITEM)) { |
||||||
|
pendingLI = false; |
||||||
|
skipText = true; |
||||||
|
cprops.removeChain(tag); |
||||||
|
if (stack.empty()) |
||||||
|
return; |
||||||
|
Object obj = stack.pop(); |
||||||
|
if (!(obj instanceof ListItem)) { |
||||||
|
stack.push(obj); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (stack.empty()) { |
||||||
|
document.add((Element) obj); |
||||||
|
return; |
||||||
|
} |
||||||
|
Object list = stack.pop(); |
||||||
|
if (!(list instanceof List)) { |
||||||
|
stack.push(list); |
||||||
|
return; |
||||||
|
} |
||||||
|
ListItem item = (ListItem) obj; |
||||||
|
((List) list).add(item); |
||||||
|
ArrayList cks = item.getChunks(); |
||||||
|
if (!cks.isEmpty()) |
||||||
|
item.getListSymbol() |
||||||
|
.setFont(((Chunk) cks.get(0)).getFont()); |
||||||
|
stack.push(list); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals("div") || tag.equals("body")) { |
||||||
|
cprops.removeChain(tag); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals(HtmlTags.PRE)) { |
||||||
|
cprops.removeChain(tag); |
||||||
|
isPRE = false; |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals("p")) { |
||||||
|
cprops.removeChain(tag); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals("h1") || tag.equals("h2") || tag.equals("h3") |
||||||
|
|| tag.equals("h4") || tag.equals("h5") || tag.equals("h6")) { |
||||||
|
cprops.removeChain(tag); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals("table")) { |
||||||
|
if (pendingTR) |
||||||
|
endElement("tr"); |
||||||
|
cprops.removeChain("table"); |
||||||
|
IncTable table = (IncTable) stack.pop(); |
||||||
|
PdfPTable tb = table.buildTable(); |
||||||
|
tb.setSplitRows(true); |
||||||
|
if (stack.empty()) |
||||||
|
document.add(tb); |
||||||
|
else |
||||||
|
((TextElementArray) stack.peek()).add(tb); |
||||||
|
boolean state[] = (boolean[]) tableState.pop(); |
||||||
|
pendingTR = state[0]; |
||||||
|
pendingTD = state[1]; |
||||||
|
skipText = false; |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals("tr")) { |
||||||
|
if (pendingTD) |
||||||
|
endElement("td"); |
||||||
|
pendingTR = false; |
||||||
|
String rowHeightPx = cprops.getLastChainProperty("height"); |
||||||
|
|
||||||
|
cprops.removeChain("tr"); |
||||||
|
ArrayList cells = new ArrayList(); |
||||||
|
IncTable table = null; |
||||||
|
while (true) { |
||||||
|
Object obj = stack.pop(); |
||||||
|
if (obj instanceof IncCell) { |
||||||
|
cells.add(((IncCell) obj).getCell()); |
||||||
|
} |
||||||
|
if (obj instanceof IncTable) { |
||||||
|
table = (IncTable) obj; |
||||||
|
break; |
||||||
|
} |
||||||
|
} |
||||||
|
float rowHeight = 0.0f; |
||||||
|
if(rowHeightPx!=null){ |
||||||
|
rowHeight = CSSUtils.parseFloat(rowHeightPx); |
||||||
|
} |
||||||
|
table.addCols(cells); |
||||||
|
table.endRow(rowHeight); |
||||||
|
|
||||||
|
stack.push(table); |
||||||
|
skipText = true; |
||||||
|
return; |
||||||
|
} |
||||||
|
if (tag.equals("td") || tag.equals("th")) { |
||||||
|
pendingTD = false; |
||||||
|
cprops.removeChain("td"); |
||||||
|
skipText = true; |
||||||
|
return; |
||||||
|
} |
||||||
|
} catch (Exception e) { |
||||||
|
throw new ExceptionConverter(e); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
public void text(String str) { |
||||||
|
if (skipText) |
||||||
|
return; |
||||||
|
String content = str; |
||||||
|
if (isPRE) { |
||||||
|
if (currentParagraph == null) { |
||||||
|
currentParagraph = FactoryProperties.createParagraph(cprops); |
||||||
|
} |
||||||
|
Chunk chunk = factoryProperties.createChunk(content, cprops); |
||||||
|
currentParagraph.add(chunk); |
||||||
|
return; |
||||||
|
} |
||||||
|
if (content.trim().length() == 0 && content.indexOf(' ') < 0) { |
||||||
|
return; |
||||||
|
} |
||||||
|
|
||||||
|
StringBuffer buf = new StringBuffer(); |
||||||
|
int len = content.length(); |
||||||
|
char character; |
||||||
|
boolean newline = false; |
||||||
|
for (int i = 0; i < len; i++) { |
||||||
|
switch (character = content.charAt(i)) { |
||||||
|
case ' ': |
||||||
|
if (!newline) { |
||||||
|
buf.append(character); |
||||||
|
} |
||||||
|
break; |
||||||
|
case '\n': |
||||||
|
if (i > 0) { |
||||||
|
newline = true; |
||||||
|
buf.append(' '); |
||||||
|
} |
||||||
|
break; |
||||||
|
case '\r': |
||||||
|
break; |
||||||
|
case '\t': |
||||||
|
break; |
||||||
|
default: |
||||||
|
newline = false; |
||||||
|
buf.append(character); |
||||||
|
} |
||||||
|
} |
||||||
|
if (currentParagraph == null) { |
||||||
|
currentParagraph = FactoryProperties.createParagraph(cprops); |
||||||
|
} |
||||||
|
Chunk chunk = factoryProperties.createChunk(buf.toString(), cprops); |
||||||
|
currentParagraph.add(chunk); |
||||||
|
} |
||||||
|
|
||||||
|
public boolean add(Element element) throws DocumentException { |
||||||
|
objectList.add(element); |
||||||
|
return true; |
||||||
|
} |
||||||
|
|
||||||
|
public void clearTextWrap() throws DocumentException { |
||||||
|
} |
||||||
|
|
||||||
|
public void close() { |
||||||
|
} |
||||||
|
|
||||||
|
public boolean newPage() { |
||||||
|
return true; |
||||||
|
} |
||||||
|
|
||||||
|
public void open() { |
||||||
|
} |
||||||
|
|
||||||
|
public void resetFooter() { |
||||||
|
} |
||||||
|
|
||||||
|
public void resetHeader() { |
||||||
|
} |
||||||
|
|
||||||
|
public void resetPageCount() { |
||||||
|
} |
||||||
|
|
||||||
|
public void setFooter(HeaderFooter footer) { |
||||||
|
} |
||||||
|
|
||||||
|
public void setHeader(HeaderFooter header) { |
||||||
|
} |
||||||
|
|
||||||
|
public boolean setMarginMirroring(boolean marginMirroring) { |
||||||
|
return false; |
||||||
|
} |
||||||
|
|
||||||
|
/** |
||||||
|
* @see DocListener#setMarginMirroring(boolean) |
||||||
|
* @since 2.1.6 |
||||||
|
*/ |
||||||
|
public boolean setMarginMirroringTopBottom(boolean marginMirroring) { |
||||||
|
return false; |
||||||
|
} |
||||||
|
|
||||||
|
public boolean setMargins(float marginLeft, float marginRight, |
||||||
|
float marginTop, float marginBottom) { |
||||||
|
return true; |
||||||
|
} |
||||||
|
|
||||||
|
public void setPageCount(int pageN) { |
||||||
|
} |
||||||
|
|
||||||
|
public boolean setPageSize(Rectangle pageSize) { |
||||||
|
return true; |
||||||
|
} |
||||||
|
|
||||||
|
/** Space-separated names of all tags this worker reacts to. */
public static final String tagsSupportedString = "ol ul li a pre font span br p div body table td th tr i b u sub sup em strong s strike"
        + " h1 h2 h3 h4 h5 h6 img hr";

/** Set-like map whose keys are the supported tag names; every value is {@code null}. */
public static final HashMap tagsSupported = new HashMap();

/** Set-like map whose keys are the first characters of the supported tag names; every value is {@code null}. */
public static final HashMap tagsPrefixSupported = new HashMap();

static {
    // The list is separated by single spaces only, so a plain split yields the same tokens.
    String[] names = tagsSupportedString.split(" ");
    for (int idx = 0; idx < names.length; idx++) {
        String name = names[idx];
        tagsSupported.put(name, null);
        tagsPrefixSupported.put(Character.valueOf(name.charAt(0)), null);
    }
}
||||||
|
} |
@ -0,0 +1,780 @@ |
|||||||
|
/* |
||||||
|
* Copyright 2003 Paulo Soares |
||||||
|
* |
||||||
|
* The contents of this file are subject to the Mozilla Public License Version 1.1 |
||||||
|
* (the "License"); you may not use this file except in compliance with the License. |
||||||
|
* You may obtain a copy of the License at http://www.mozilla.org/MPL/
|
||||||
|
* |
||||||
|
* Software distributed under the License is distributed on an "AS IS" basis, |
||||||
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
||||||
|
* for the specific language governing rights and limitations under the License. |
||||||
|
* |
||||||
|
* The Original Code is 'iText, a free JAVA-PDF library'. |
||||||
|
* |
||||||
|
* The Initial Developer of the Original Code is Bruno Lowagie. Portions created by |
||||||
|
* the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie. |
||||||
|
* All Rights Reserved. |
||||||
|
* Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer |
||||||
|
* are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved. |
||||||
|
* |
||||||
|
* Contributor(s): all the names of the contributors are added in the source code |
||||||
|
* where applicable. |
||||||
|
* |
||||||
|
* Alternatively, the contents of this file may be used under the terms of the |
||||||
|
* LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the |
||||||
|
* provisions of LGPL are applicable instead of those above. If you wish to |
||||||
|
* allow use of your version of this file only under the terms of the LGPL |
||||||
|
* License and not to allow others to use your version of this file under |
||||||
|
* the MPL, indicate your decision by deleting the provisions above and |
||||||
|
* replace them with the notice and other provisions required by the LGPL. |
||||||
|
* If you do not delete the provisions above, a recipient may use your version |
||||||
|
* of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE. |
||||||
|
* |
||||||
|
* This library is free software; you can redistribute it and/or modify it |
||||||
|
* under the terms of the MPL as stated above or under the terms of the GNU |
||||||
|
* Library General Public License as published by the Free Software Foundation; |
||||||
|
* either version 2 of the License, or any later version. |
||||||
|
* |
||||||
|
* This library is distributed in the hope that it will be useful, but WITHOUT |
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
||||||
|
* FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more |
||||||
|
* details. |
||||||
|
* |
||||||
|
* If you didn't download this code from the following link, you should check if |
||||||
|
* you aren't using an obsolete version: |
||||||
|
* http://www.lowagie.com/iText/
|
||||||
|
* |
||||||
|
* The code to recognize the encoding in this class and in the convenience class IanaEncodings was taken from Apache Xerces published under the following license: |
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||||
|
* contributor license agreements. See the NOTICE file distributed with |
||||||
|
* this work for additional information regarding copyright ownership. |
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||||
|
* (the "License"); you may not use this file except in compliance with |
||||||
|
* the License. You may obtain a copy of the License at |
||||||
|
* |
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* |
||||||
|
* Unless required by applicable law or agreed to in writing, software |
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
* See the License for the specific language governing permissions and |
||||||
|
* limitations under the License. |
||||||
|
* |
||||||
|
* Part of this code is based on the Quick-and-Dirty XML parser by Steven Brandt. |
||||||
|
* The code for the Quick-and-Dirty parser was published in JavaWorld (java tip 128). |
||||||
|
* Steven Brandt and JavaWorld gave permission to use the code for free. |
||||||
|
* (Bruno Lowagie and Paulo Soares chose to use it under the MPL/LGPL in |
||||||
|
* conformance with the rest of the code). |
||||||
|
* The original code can be found on this url: <A HREF="http://www.javaworld.com/javatips/jw-javatip128_p.html">http://www.javaworld.com/javatips/jw-javatip128_p.html</A>.
|
||||||
|
* It was substantially refactored by Bruno Lowagie. |
||||||
|
* |
||||||
|
* The method 'private static String getEncodingName(byte[] b4)' was found |
||||||
|
* in org.apache.xerces.impl.XMLEntityManager, originally published by the |
||||||
|
* Apache Software Foundation under the Apache Software License; now being |
||||||
|
* used in iText under the MPL. |
||||||
|
*/ |
||||||
|
package com.fr.third.com.lowagie.text.xml.simpleparser; |
||||||
|
|
||||||
|
import com.fr.third.com.lowagie.text.html.simpleparser.HTMLWorker; |
||||||
|
import java.io.BufferedReader; |
||||||
|
import java.io.ByteArrayOutputStream; |
||||||
|
import java.io.IOException; |
||||||
|
import java.io.InputStream; |
||||||
|
import java.io.InputStreamReader; |
||||||
|
import java.io.Reader; |
||||||
|
import java.util.HashMap; |
||||||
|
import java.util.Stack; |
||||||
|
|
||||||
|
/** |
||||||
|
* A simple XML and HTML parser. This parser is, like the SAX parser, |
||||||
|
* an event based parser, but with much less functionality. |
||||||
|
* <p> |
||||||
|
* The parser can: |
||||||
|
* <p> |
||||||
|
* <ul> |
||||||
|
* <li>It recognizes the encoding used |
||||||
|
* <li>It recognizes all the elements' start tags and end tags |
||||||
|
* <li>It lists attributes, where attribute values can be enclosed in single or double quotes |
||||||
|
* <li>It recognizes the <code>&lt;![CDATA[ ... ]]&gt;</code> construct |
||||||
|
* <li>It recognizes the standard entities: &amp;, &lt;, &gt;, &quot;, and &apos;, as well as numeric entities |
||||||
|
* <li>It maps lines ending in <code>\r\n</code> and <code>\r</code> to <code>\n</code> on input, in accordance with the XML Specification, Section 2.11 |
||||||
|
* </ul> |
||||||
|
* <p> |
||||||
|
*/ |
||||||
|
public final class SimpleXMLParser { |
||||||
|
/** possible states */ |
||||||
|
private final static int UNKNOWN = 0; |
||||||
|
private final static int TEXT = 1; |
||||||
|
private final static int TAG_ENCOUNTERED = 2; |
||||||
|
private final static int EXAMIN_TAG = 3; |
||||||
|
private final static int TAG_EXAMINED = 4; |
||||||
|
private final static int IN_CLOSETAG = 5; |
||||||
|
private final static int SINGLE_TAG = 6; |
||||||
|
private final static int CDATA = 7; |
||||||
|
private final static int COMMENT = 8; |
||||||
|
private final static int PI = 9; |
||||||
|
private final static int ENTITY = 10; |
||||||
|
private final static int QUOTE = 11; |
||||||
|
private final static int ATTRIBUTE_KEY = 12; |
||||||
|
private final static int ATTRIBUTE_EQUAL = 13; |
||||||
|
private final static int ATTRIBUTE_VALUE = 14; |
||||||
|
|
||||||
|
/** the state stack */ |
||||||
|
Stack stack; |
||||||
|
/** The current character. */ |
||||||
|
int character = 0; |
||||||
|
/** The previous character. */ |
||||||
|
int previousCharacter = -1; |
||||||
|
/** the line we are currently reading */ |
||||||
|
int lines = 1; |
||||||
|
/** the column where the current character occurs */ |
||||||
|
int columns = 0; |
||||||
|
/** was the last character equivalent to a newline? */ |
||||||
|
boolean eol = false; |
||||||
|
/** |
||||||
|
* A boolean indicating if the next character should be taken into account |
||||||
|
* if it's a space character. When nowhite is false, the previous character |
||||||
|
* wasn't whitespace. |
||||||
|
* @since 2.1.5 |
||||||
|
*/ |
||||||
|
boolean nowhite = false; |
||||||
|
/** the current state */ |
||||||
|
int state; |
||||||
|
/** Are we parsing HTML? */ |
||||||
|
boolean html; |
||||||
|
/** current text (whatever is encountered between tags) */ |
||||||
|
StringBuffer text = new StringBuffer(); |
||||||
|
/** current entity (whatever is encountered between & and ;) */ |
||||||
|
StringBuffer entity = new StringBuffer(); |
||||||
|
/** current tagname */ |
||||||
|
String tag = null; |
||||||
|
/** current attributes */ |
||||||
|
HashMap attributes = null; |
||||||
|
/** The handler to which we are going to forward document content */ |
||||||
|
SimpleXMLDocHandler doc; |
||||||
|
/** The handler to which we are going to forward comments. */ |
||||||
|
SimpleXMLDocHandlerComment comment; |
||||||
|
/** Keeps track of the number of tags that are open. */ |
||||||
|
int nested = 0; |
||||||
|
/** the quote character that was used to open the quote. */ |
||||||
|
int quoteCharacter = '"'; |
||||||
|
/** the attribute key. */ |
||||||
|
String attributekey = null; |
||||||
|
/** the attribute value. */ |
||||||
|
String attributevalue = null; |
||||||
|
|
||||||
|
/**
 * Builds a parser bound to the given handlers. The instance is meant to be
 * driven right away through {@code go(Reader)}.
 *
 * @param doc     handler that receives the document events
 * @param comment handler that receives comments; {@code flush()} skips comments when it is {@code null}
 * @param html    {@code true} to parse as HTML; the parser then starts in the TEXT state
 *                instead of the UNKNOWN state
 */
private SimpleXMLParser(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, boolean html) {
    this.html = html;
    this.comment = comment;
    this.doc = doc;
    this.stack = new Stack();
    if (html) {
        this.state = TEXT;
    } else {
        this.state = UNKNOWN;
    }
}
||||||
|
|
||||||
|
/** |
||||||
|
* Does the actual parsing. Perform this immediately |
||||||
|
* after creating the parser object. |
||||||
|
*/ |
||||||
|
private void go(Reader r) throws IOException { |
||||||
|
BufferedReader reader; |
||||||
|
if (r instanceof BufferedReader) |
||||||
|
reader = (BufferedReader)r; |
||||||
|
else |
||||||
|
reader = new BufferedReader(r); |
||||||
|
doc.startDocument(); |
||||||
|
while(true) { |
||||||
|
// read a new character
|
||||||
|
if (previousCharacter == -1) { |
||||||
|
character = reader.read(); |
||||||
|
} |
||||||
|
// or re-examine the previous character
|
||||||
|
else { |
||||||
|
character = previousCharacter; |
||||||
|
previousCharacter = -1; |
||||||
|
} |
||||||
|
|
||||||
|
// the end of the file was reached
|
||||||
|
if (character == -1) { |
||||||
|
if (html) { |
||||||
|
if (html && state == TEXT) |
||||||
|
flush(); |
||||||
|
doc.endDocument(); |
||||||
|
} else { |
||||||
|
throwException("Missing end tag"); |
||||||
|
} |
||||||
|
return; |
||||||
|
} |
||||||
|
|
||||||
|
// dealing with \n and \r
|
||||||
|
if (character == '\n' && eol) { |
||||||
|
eol = false; |
||||||
|
continue; |
||||||
|
} else if (eol) { |
||||||
|
eol = false; |
||||||
|
} else if (character == '\n') { |
||||||
|
lines++; |
||||||
|
columns = 0; |
||||||
|
} else if (character == '\r') { |
||||||
|
eol = true; |
||||||
|
character = '\n'; |
||||||
|
lines++; |
||||||
|
columns = 0; |
||||||
|
} else { |
||||||
|
columns++; |
||||||
|
} |
||||||
|
|
||||||
|
switch(state) { |
||||||
|
// we are in an unknown state before there's actual content
|
||||||
|
case UNKNOWN: |
||||||
|
if(character == '<') { |
||||||
|
beginnOfTag((char) reader.read(), UNKNOWN); |
||||||
|
} |
||||||
|
break; |
||||||
|
// we can encounter any content
|
||||||
|
case TEXT: |
||||||
|
if(character == '<') { |
||||||
|
beginnOfTag((char) reader.read(), TEXT); |
||||||
|
} else if(character == '&') { |
||||||
|
saveState(state); |
||||||
|
entity.setLength(0); |
||||||
|
state = ENTITY; |
||||||
|
} else if (Character.isWhitespace((char)character) && character != 12288) { |
||||||
|
if (nowhite) |
||||||
|
text.append((char)character); |
||||||
|
nowhite = false; |
||||||
|
} else { |
||||||
|
text.append((char)character); |
||||||
|
nowhite = true; |
||||||
|
} |
||||||
|
break; |
||||||
|
// we have just seen a < and are wondering what we are looking at
|
||||||
|
// <foo>, </foo>, <!-- ... --->, etc.
|
||||||
|
case TAG_ENCOUNTERED: |
||||||
|
initTag(); |
||||||
|
if(character == '/') { |
||||||
|
state = IN_CLOSETAG; |
||||||
|
} else if (character == '?') { |
||||||
|
restoreState(); |
||||||
|
state = PI; |
||||||
|
} else { |
||||||
|
text.append((char)character); |
||||||
|
state = EXAMIN_TAG; |
||||||
|
} |
||||||
|
break; |
||||||
|
// we are processing something like this <foo ... >.
|
||||||
|
// It could still be a <!-- ... --> or something.
|
||||||
|
case EXAMIN_TAG: |
||||||
|
if(character == '>') { |
||||||
|
doTag(); |
||||||
|
processTag(true); |
||||||
|
initTag(); |
||||||
|
state = restoreState(); |
||||||
|
} else if(character == '/') { |
||||||
|
state = SINGLE_TAG; |
||||||
|
} else if(character == '-' && text.toString().equals("!-")) { |
||||||
|
flush(); |
||||||
|
state = COMMENT; |
||||||
|
} else if(character == '[' && text.toString().equals("![CDATA")) { |
||||||
|
flush(); |
||||||
|
state = CDATA; |
||||||
|
} else if(character == 'E' && text.toString().equals("!DOCTYP")) { |
||||||
|
flush(); |
||||||
|
state = PI; |
||||||
|
} else if(Character.isWhitespace((char)character)) { |
||||||
|
doTag(); |
||||||
|
state = TAG_EXAMINED; |
||||||
|
} else { |
||||||
|
text.append((char)character); |
||||||
|
} |
||||||
|
break; |
||||||
|
// we know the name of the tag now.
|
||||||
|
case TAG_EXAMINED: |
||||||
|
if(character == '>') { |
||||||
|
processTag(true); |
||||||
|
initTag(); |
||||||
|
state = restoreState(); |
||||||
|
} else if(character == '/') { |
||||||
|
state = SINGLE_TAG; |
||||||
|
} else if(Character.isWhitespace((char)character)) { |
||||||
|
// empty
|
||||||
|
} else { |
||||||
|
text.append((char)character); |
||||||
|
state = ATTRIBUTE_KEY; |
||||||
|
} |
||||||
|
break; |
||||||
|
|
||||||
|
// we are processing a closing tag: e.g. </foo>
|
||||||
|
case IN_CLOSETAG: |
||||||
|
if(character == '>') { |
||||||
|
doTag(); |
||||||
|
processTag(false); |
||||||
|
if(!html && nested==0) return; |
||||||
|
state = restoreState(); |
||||||
|
} else { |
||||||
|
if (!Character.isWhitespace((char)character)) |
||||||
|
text.append((char)character); |
||||||
|
} |
||||||
|
break; |
||||||
|
|
||||||
|
// we have just seen something like this: <foo a="b"/
|
||||||
|
// and are looking for the final >.
|
||||||
|
case SINGLE_TAG: |
||||||
|
if(character != '>') |
||||||
|
throwException("Expected > for tag: <"+tag+"/>"); |
||||||
|
doTag(); |
||||||
|
processTag(true); |
||||||
|
processTag(false); |
||||||
|
initTag(); |
||||||
|
if(!html && nested==0) { |
||||||
|
doc.endDocument(); |
||||||
|
return; |
||||||
|
} |
||||||
|
state = restoreState(); |
||||||
|
break; |
||||||
|
|
||||||
|
// we are processing CDATA
|
||||||
|
case CDATA: |
||||||
|
if(character == '>' |
||||||
|
&& text.toString().endsWith("]]")) { |
||||||
|
text.setLength(text.length()-2); |
||||||
|
flush(); |
||||||
|
state = restoreState(); |
||||||
|
} else |
||||||
|
text.append((char)character); |
||||||
|
break; |
||||||
|
|
||||||
|
// we are processing a comment. We are inside
|
||||||
|
// the <!-- .... --> looking for the -->.
|
||||||
|
case COMMENT: |
||||||
|
if(character == '>' |
||||||
|
&& text.toString().endsWith("--")) { |
||||||
|
text.setLength(text.length() - 2); |
||||||
|
flush(); |
||||||
|
state = restoreState(); |
||||||
|
} else |
||||||
|
text.append((char)character); |
||||||
|
break; |
||||||
|
|
||||||
|
// We are inside one of these <? ... ?> or one of these <!DOCTYPE ... >
|
||||||
|
case PI: |
||||||
|
if(character == '>') { |
||||||
|
state = restoreState(); |
||||||
|
if(state == TEXT) state = UNKNOWN; |
||||||
|
} |
||||||
|
break; |
||||||
|
|
||||||
|
// we are processing an entity, e.g. <, », etc.
|
||||||
|
case ENTITY: |
||||||
|
if(character == ';') { |
||||||
|
state = restoreState(); |
||||||
|
String cent = entity.toString(); |
||||||
|
entity.setLength(0); |
||||||
|
char ce = EntitiesToUnicode.decodeEntity(cent); |
||||||
|
if (ce == '\0') |
||||||
|
text.append('&').append(cent).append(';'); |
||||||
|
else |
||||||
|
text.append(ce); |
||||||
|
} else if ((character != '#' && (character < '0' || character > '9') && (character < 'a' || character > 'z') |
||||||
|
&& (character < 'A' || character > 'Z')) || entity.length() >= 7) { |
||||||
|
state = restoreState(); |
||||||
|
previousCharacter = character; |
||||||
|
text.append('&').append(entity.toString()); |
||||||
|
entity.setLength(0); |
||||||
|
} |
||||||
|
else { |
||||||
|
entity.append((char)character); |
||||||
|
} |
||||||
|
break; |
||||||
|
// We are processing the quoted right-hand side of an element's attribute.
|
||||||
|
case QUOTE: |
||||||
|
if (html && quoteCharacter == ' ' && character == '>') { |
||||||
|
flush(); |
||||||
|
processTag(true); |
||||||
|
initTag(); |
||||||
|
state = restoreState(); |
||||||
|
} |
||||||
|
else if (html && quoteCharacter == ' ' && Character.isWhitespace((char)character)) { |
||||||
|
flush(); |
||||||
|
state = TAG_EXAMINED; |
||||||
|
} |
||||||
|
else if (html && quoteCharacter == ' ') { |
||||||
|
text.append((char)character); |
||||||
|
} |
||||||
|
else if(character == quoteCharacter) { |
||||||
|
flush(); |
||||||
|
state = TAG_EXAMINED; |
||||||
|
} else if(" \r\n\u0009".indexOf(character)>=0) { |
||||||
|
text.append(' '); |
||||||
|
} else if(character == '&') { |
||||||
|
saveState(state); |
||||||
|
state = ENTITY; |
||||||
|
entity.setLength(0); |
||||||
|
} else { |
||||||
|
text.append((char)character); |
||||||
|
} |
||||||
|
break; |
||||||
|
|
||||||
|
case ATTRIBUTE_KEY: |
||||||
|
if(Character.isWhitespace((char)character)) { |
||||||
|
flush(); |
||||||
|
state = ATTRIBUTE_EQUAL; |
||||||
|
} else if(character == '=') { |
||||||
|
flush(); |
||||||
|
state = ATTRIBUTE_VALUE; |
||||||
|
} else if (html && character == '>') { |
||||||
|
text.setLength(0); |
||||||
|
processTag(true); |
||||||
|
initTag(); |
||||||
|
state = restoreState(); |
||||||
|
} else { |
||||||
|
text.append((char)character); |
||||||
|
} |
||||||
|
break; |
||||||
|
|
||||||
|
case ATTRIBUTE_EQUAL: |
||||||
|
if(character == '=') { |
||||||
|
state = ATTRIBUTE_VALUE; |
||||||
|
} else if(Character.isWhitespace((char)character)) { |
||||||
|
// empty
|
||||||
|
} else if (html && character == '>') { |
||||||
|
text.setLength(0); |
||||||
|
processTag(true); |
||||||
|
initTag(); |
||||||
|
state = restoreState(); |
||||||
|
} else if (html && character == '/') { |
||||||
|
flush(); |
||||||
|
state = SINGLE_TAG; |
||||||
|
} else if (html) { |
||||||
|
flush(); |
||||||
|
text.append((char)character); |
||||||
|
state = ATTRIBUTE_KEY; |
||||||
|
} else { |
||||||
|
throwException("Error in attribute processing."); |
||||||
|
} |
||||||
|
break; |
||||||
|
|
||||||
|
case ATTRIBUTE_VALUE: |
||||||
|
if(character == '"' || character == '\'') { |
||||||
|
quoteCharacter = character; |
||||||
|
state = QUOTE; |
||||||
|
} else if(Character.isWhitespace((char)character)) { |
||||||
|
// empty
|
||||||
|
} else if (html && character == '>') { |
||||||
|
flush(); |
||||||
|
processTag(true); |
||||||
|
initTag(); |
||||||
|
state = restoreState(); |
||||||
|
} else if (html) { |
||||||
|
text.append((char)character); |
||||||
|
quoteCharacter = ' '; |
||||||
|
state = QUOTE; |
||||||
|
} else { |
||||||
|
throwException("Error in attribute processing"); |
||||||
|
} |
||||||
|
break; |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
/** |
||||||
|
* Gets a state from the stack |
||||||
|
* @return the previous state |
||||||
|
*/ |
||||||
|
private int restoreState() { |
||||||
|
if(!stack.empty()) |
||||||
|
return ((Integer)stack.pop()).intValue(); |
||||||
|
else |
||||||
|
return UNKNOWN; |
||||||
|
} |
||||||
|
/** |
||||||
|
* Adds a state to the stack. |
||||||
|
* @param s a state to add to the stack |
||||||
|
*/ |
||||||
|
private void saveState(int s) { |
||||||
|
stack.push(new Integer(s)); |
||||||
|
} |
||||||
|
|
||||||
|
/** |
||||||
|
* 处理标签的开头,若不在支持标签范围内,将<符号作为文本处理,例:<1111 (仿造浏览器的处理方式) |
||||||
|
*/ |
||||||
|
public void beginnOfTag(char c, int type) { |
||||||
|
previousCharacter = c; |
||||||
|
if (c == -1) { |
||||||
|
return; |
||||||
|
} |
||||||
|
if (c == '/' || HTMLWorker.tagsPrefixSupported.containsKey(c)) { |
||||||
|
if (type == TEXT) { |
||||||
|
flush(); |
||||||
|
} |
||||||
|
saveState(TEXT); |
||||||
|
state = TAG_ENCOUNTERED; |
||||||
|
return; |
||||||
|
} |
||||||
|
text.append((char) character); |
||||||
|
nowhite = true; |
||||||
|
} |
||||||
|
|
||||||
|
/** |
||||||
|
* Flushes the text that is currently in the buffer. |
||||||
|
* The text can be ignored, added to the document |
||||||
|
* as content or as comment,... depending on the current state. |
||||||
|
*/ |
||||||
|
private void flush() { |
||||||
|
switch(state){ |
||||||
|
case TEXT: |
||||||
|
case CDATA: |
||||||
|
if(text.length() > 0) { |
||||||
|
doc.text(text.toString()); |
||||||
|
} |
||||||
|
break; |
||||||
|
case COMMENT: |
||||||
|
if (comment != null) { |
||||||
|
comment.comment(text.toString()); |
||||||
|
} |
||||||
|
break; |
||||||
|
case ATTRIBUTE_KEY: |
||||||
|
attributekey = text.toString(); |
||||||
|
if (html) |
||||||
|
attributekey = attributekey.toLowerCase(); |
||||||
|
break; |
||||||
|
case QUOTE: |
||||||
|
case ATTRIBUTE_VALUE: |
||||||
|
attributevalue = text.toString(); |
||||||
|
attributes.put(attributekey,attributevalue); |
||||||
|
break; |
||||||
|
default: |
||||||
|
// do nothing
|
||||||
|
} |
||||||
|
text.setLength(0); |
||||||
|
} |
||||||
|
/** |
||||||
|
* Initialized the tag name and attributes. |
||||||
|
*/ |
||||||
|
private void initTag() { |
||||||
|
tag = null; |
||||||
|
attributes = new HashMap(); |
||||||
|
} |
||||||
|
/** Sets the name of the tag. */ |
||||||
|
private void doTag() { |
||||||
|
if(tag == null) |
||||||
|
tag = text.toString(); |
||||||
|
if (html) |
||||||
|
tag = tag.toLowerCase(); |
||||||
|
text.setLength(0); |
||||||
|
} |
||||||
|
/** |
||||||
|
* processes the tag. |
||||||
|
* @param start if true we are dealing with a tag that has just been opened; if false we are closing a tag. |
||||||
|
*/ |
||||||
|
private void processTag(boolean start) { |
||||||
|
if (start) { |
||||||
|
nested++; |
||||||
|
doc.startElement(tag,attributes); |
||||||
|
} |
||||||
|
else { |
||||||
|
nested--; |
||||||
|
doc.endElement(tag); |
||||||
|
} |
||||||
|
} |
||||||
|
/** Throws an exception */ |
||||||
|
private void throwException(String s) throws IOException { |
||||||
|
throw new IOException(s+" near line " + lines + ", column " + columns); |
||||||
|
} |
||||||
|
|
||||||
|
/** |
||||||
|
* Parses the XML document firing the events to the handler. |
||||||
|
* @param doc the document handler |
||||||
|
* @param r the document. The encoding is already resolved. The reader is not closed |
||||||
|
* @throws IOException on error |
||||||
|
*/ |
||||||
|
public static void parse(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, Reader r, boolean html) throws IOException { |
||||||
|
SimpleXMLParser parser = new SimpleXMLParser(doc, comment, html); |
||||||
|
parser.go(r); |
||||||
|
} |
||||||
|
|
||||||
|
/** |
||||||
|
* Parses the XML document firing the events to the handler. |
||||||
|
* @param doc the document handler |
||||||
|
* @param in the document. The encoding is deduced from the stream. The stream is not closed |
||||||
|
* @throws IOException on error |
||||||
|
*/ |
||||||
|
public static void parse(SimpleXMLDocHandler doc, InputStream in) throws IOException { |
||||||
|
byte b4[] = new byte[4]; |
||||||
|
int count = in.read(b4); |
||||||
|
if (count != 4) |
||||||
|
throw new IOException("Insufficient length."); |
||||||
|
String encoding = getEncodingName(b4); |
||||||
|
String decl = null; |
||||||
|
if (encoding.equals("UTF-8")) { |
||||||
|
StringBuffer sb = new StringBuffer(); |
||||||
|
int c; |
||||||
|
while ((c = in.read()) != -1) { |
||||||
|
if (c == '>') |
||||||
|
break; |
||||||
|
sb.append((char)c); |
||||||
|
} |
||||||
|
decl = sb.toString(); |
||||||
|
} |
||||||
|
else if (encoding.equals("CP037")) { |
||||||
|
ByteArrayOutputStream bi = new ByteArrayOutputStream(); |
||||||
|
int c; |
||||||
|
while ((c = in.read()) != -1) { |
||||||
|
if (c == 0x6e) // that's '>' in ebcdic
|
||||||
|
break; |
||||||
|
bi.write(c); |
||||||
|
} |
||||||
|
decl = new String(bi.toByteArray(), "CP037"); |
||||||
|
} |
||||||
|
if (decl != null) { |
||||||
|
decl = getDeclaredEncoding(decl); |
||||||
|
if (decl != null) |
||||||
|
encoding = decl; |
||||||
|
} |
||||||
|
parse(doc, new InputStreamReader(in, IanaEncodings.getJavaEncoding(encoding))); |
||||||
|
} |
||||||
|
|
||||||
|
private static String getDeclaredEncoding(String decl) { |
||||||
|
if (decl == null) |
||||||
|
return null; |
||||||
|
int idx = decl.indexOf("encoding"); |
||||||
|
if (idx < 0) |
||||||
|
return null; |
||||||
|
int idx1 = decl.indexOf('"', idx); |
||||||
|
int idx2 = decl.indexOf('\'', idx); |
||||||
|
if (idx1 == idx2) |
||||||
|
return null; |
||||||
|
if ((idx1 < 0 && idx2 > 0) || (idx2 > 0 && idx2 < idx1)) { |
||||||
|
int idx3 = decl.indexOf('\'', idx2 + 1); |
||||||
|
if (idx3 < 0) |
||||||
|
return null; |
||||||
|
return decl.substring(idx2 + 1, idx3); |
||||||
|
} |
||||||
|
if ((idx2 < 0 && idx1 > 0) || (idx1 > 0 && idx1 < idx2)) { |
||||||
|
int idx3 = decl.indexOf('"', idx1 + 1); |
||||||
|
if (idx3 < 0) |
||||||
|
return null; |
||||||
|
return decl.substring(idx1 + 1, idx3); |
||||||
|
} |
||||||
|
return null; |
||||||
|
} |
||||||
|
|
||||||
|
public static void parse(SimpleXMLDocHandler doc,Reader r) throws IOException { |
||||||
|
parse(doc, null, r, false); |
||||||
|
} |
||||||
|
|
||||||
|
/** |
||||||
|
* Escapes a string with the appropriated XML codes. |
||||||
|
* @param s the string to be escaped |
||||||
|
* @param onlyASCII codes above 127 will always be escaped with &#nn; if <CODE>true</CODE> |
||||||
|
* @return the escaped string |
||||||
|
*/ |
||||||
|
public static String escapeXML(String s, boolean onlyASCII) { |
||||||
|
char cc[] = s.toCharArray(); |
||||||
|
int len = cc.length; |
||||||
|
StringBuffer sb = new StringBuffer(); |
||||||
|
for (int k = 0; k < len; ++k) { |
||||||
|
int c = cc[k]; |
||||||
|
switch (c) { |
||||||
|
case '<': |
||||||
|
sb.append("<"); |
||||||
|
break; |
||||||
|
case '>': |
||||||
|
sb.append(">"); |
||||||
|
break; |
||||||
|
case '&': |
||||||
|
sb.append("&"); |
||||||
|
break; |
||||||
|
case '"': |
||||||
|
sb.append("""); |
||||||
|
break; |
||||||
|
case '\'': |
||||||
|
sb.append("'"); |
||||||
|
break; |
||||||
|
default: |
||||||
|
if ((c == 0x9) || (c == 0xA) || (c == 0xD) |
||||||
|
|| ((c >= 0x20) && (c <= 0xD7FF)) |
||||||
|
|| ((c >= 0xE000) && (c <= 0xFFFD)) |
||||||
|
|| ((c >= 0x10000) && (c <= 0x10FFFF))) { |
||||||
|
if (onlyASCII && c > 127) |
||||||
|
sb.append("&#").append(c).append(';'); |
||||||
|
else |
||||||
|
sb.append((char)c); |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
return sb.toString(); |
||||||
|
} |
||||||
|
/** |
||||||
|
* Returns the IANA encoding name that is auto-detected from |
||||||
|
* the bytes specified, with the endian-ness of that encoding where appropriate. |
||||||
|
* (method found in org.apache.xerces.impl.XMLEntityManager, originally published |
||||||
|
* by the Apache Software Foundation under the Apache Software License; now being |
||||||
|
* used in iText under the MPL) |
||||||
|
* @param b4 The first four bytes of the input. |
||||||
|
* @return an IANA-encoding string |
||||||
|
*/ |
||||||
|
private static String getEncodingName(byte[] b4) { |
||||||
|
|
||||||
|
// UTF-16, with BOM
|
||||||
|
int b0 = b4[0] & 0xFF; |
||||||
|
int b1 = b4[1] & 0xFF; |
||||||
|
if (b0 == 0xFE && b1 == 0xFF) { |
||||||
|
// UTF-16, big-endian
|
||||||
|
return "UTF-16BE"; |
||||||
|
} |
||||||
|
if (b0 == 0xFF && b1 == 0xFE) { |
||||||
|
// UTF-16, little-endian
|
||||||
|
return "UTF-16LE"; |
||||||
|
} |
||||||
|
|
||||||
|
// UTF-8 with a BOM
|
||||||
|
int b2 = b4[2] & 0xFF; |
||||||
|
if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { |
||||||
|
return "UTF-8"; |
||||||
|
} |
||||||
|
|
||||||
|
// other encodings
|
||||||
|
int b3 = b4[3] & 0xFF; |
||||||
|
if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) { |
||||||
|
// UCS-4, big endian (1234)
|
||||||
|
return "ISO-10646-UCS-4"; |
||||||
|
} |
||||||
|
if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) { |
||||||
|
// UCS-4, little endian (4321)
|
||||||
|
return "ISO-10646-UCS-4"; |
||||||
|
} |
||||||
|
if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) { |
||||||
|
// UCS-4, unusual octet order (2143)
|
||||||
|
// REVISIT: What should this be?
|
||||||
|
return "ISO-10646-UCS-4"; |
||||||
|
} |
||||||
|
if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) { |
||||||
|
// UCS-4, unusual octet order (3412)
|
||||||
|
// REVISIT: What should this be?
|
||||||
|
return "ISO-10646-UCS-4"; |
||||||
|
} |
||||||
|
if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { |
||||||
|
// UTF-16, big-endian, no BOM
|
||||||
|
// (or could turn out to be UCS-2...
|
||||||
|
// REVISIT: What should this be?
|
||||||
|
return "UTF-16BE"; |
||||||
|
} |
||||||
|
if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { |
||||||
|
// UTF-16, little-endian, no BOM
|
||||||
|
// (or could turn out to be UCS-2...
|
||||||
|
return "UTF-16LE"; |
||||||
|
} |
||||||
|
if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) { |
||||||
|
// EBCDIC
|
||||||
|
// a la xerces1, return CP037 instead of EBCDIC here
|
||||||
|
return "CP037"; |
||||||
|
} |
||||||
|
|
||||||
|
// default encoding
|
||||||
|
return "UTF-8"; |
||||||
|
} |
||||||
|
} |
Loading…
Reference in new issue