Harrison
5 years ago
4 changed files with 34 additions and 1580 deletions
@ -1,788 +0,0 @@ |
|||||||
/* |
|
||||||
* Copyright 2004 Paulo Soares |
|
||||||
* |
|
||||||
* The contents of this file are subject to the Mozilla Public License Version 1.1 |
|
||||||
* (the "License"); you may not use this file except in compliance with the License. |
|
||||||
* You may obtain a copy of the License at http://www.mozilla.org/MPL/
|
|
||||||
* |
|
||||||
* Software distributed under the License is distributed on an "AS IS" basis, |
|
||||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
|
||||||
* for the specific language governing rights and limitations under the License. |
|
||||||
* |
|
||||||
* The Original Code is 'iText, a free JAVA-PDF library'. |
|
||||||
* |
|
||||||
* The Initial Developer of the Original Code is Bruno Lowagie. Portions created by |
|
||||||
* the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie. |
|
||||||
* All Rights Reserved. |
|
||||||
* Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer |
|
||||||
* are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved. |
|
||||||
* |
|
||||||
* Contributor(s): all the names of the contributors are added in the source code |
|
||||||
* where applicable. |
|
||||||
* |
|
||||||
* Alternatively, the contents of this file may be used under the terms of the |
|
||||||
* LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the |
|
||||||
* provisions of LGPL are applicable instead of those above. If you wish to |
|
||||||
* allow use of your version of this file only under the terms of the LGPL |
|
||||||
* License and not to allow others to use your version of this file under |
|
||||||
* the MPL, indicate your decision by deleting the provisions above and |
|
||||||
* replace them with the notice and other provisions required by the LGPL. |
|
||||||
* If you do not delete the provisions above, a recipient may use your version |
|
||||||
* of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE. |
|
||||||
* |
|
||||||
* This library is free software; you can redistribute it and/or modify it |
|
||||||
* under the terms of the MPL as stated above or under the terms of the GNU |
|
||||||
* Library General Public License as published by the Free Software Foundation; |
|
||||||
* either version 2 of the License, or any later version. |
|
||||||
* |
|
||||||
* This library is distributed in the hope that it will be useful, but WITHOUT |
|
||||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
|
||||||
* FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more |
|
||||||
* details. |
|
||||||
* |
|
||||||
* Contributions by: |
|
||||||
* Lubos Strapko |
|
||||||
* |
|
||||||
* If you didn't download this code from the following link, you should check if |
|
||||||
* you aren't using an obsolete version: |
|
||||||
* http://www.lowagie.com/iText/
|
|
||||||
*/ |
|
||||||
|
|
||||||
package com.fr.third.com.lowagie.text.html.simpleparser; |
|
||||||
|
|
||||||
import com.fr.third.com.lowagie.text.Chunk; |
|
||||||
import com.fr.third.com.lowagie.text.DocListener; |
|
||||||
import com.fr.third.com.lowagie.text.DocumentException; |
|
||||||
import com.fr.third.com.lowagie.text.Element; |
|
||||||
import com.fr.third.com.lowagie.text.ElementTags; |
|
||||||
import com.fr.third.com.lowagie.text.ExceptionConverter; |
|
||||||
import com.fr.third.com.lowagie.text.FontFactoryImp; |
|
||||||
import com.fr.third.com.lowagie.text.HeaderFooter; |
|
||||||
import com.fr.third.com.lowagie.text.Image; |
|
||||||
import com.fr.third.com.lowagie.text.List; |
|
||||||
import com.fr.third.com.lowagie.text.ListItem; |
|
||||||
import com.fr.third.com.lowagie.text.Paragraph; |
|
||||||
import com.fr.third.com.lowagie.text.Phrase; |
|
||||||
import com.fr.third.com.lowagie.text.Rectangle; |
|
||||||
import com.fr.third.com.lowagie.text.TextElementArray; |
|
||||||
import com.fr.third.com.lowagie.text.html.CSSUtils; |
|
||||||
import com.fr.third.com.lowagie.text.html.HtmlTags; |
|
||||||
import com.fr.third.com.lowagie.text.html.Markup; |
|
||||||
import com.fr.third.com.lowagie.text.pdf.PdfPTable; |
|
||||||
import com.fr.third.com.lowagie.text.pdf.draw.LineSeparator; |
|
||||||
import com.fr.third.com.lowagie.text.xml.simpleparser.SimpleXMLDocHandler; |
|
||||||
import com.fr.third.com.lowagie.text.xml.simpleparser.SimpleXMLParser; |
|
||||||
import com.fr.third.sun.misc.BASE64Decoder; |
|
||||||
|
|
||||||
import java.io.File; |
|
||||||
import java.io.IOException; |
|
||||||
import java.io.Reader; |
|
||||||
import java.util.ArrayList; |
|
||||||
import java.util.HashMap; |
|
||||||
import java.util.Stack; |
|
||||||
import java.util.StringTokenizer; |
|
||||||
|
|
||||||
public class HTMLWorker implements SimpleXMLDocHandler, DocListener { |
|
||||||
|
|
||||||
protected ArrayList objectList; |
|
||||||
|
|
||||||
protected DocListener document; |
|
||||||
|
|
||||||
private Paragraph currentParagraph; |
|
||||||
|
|
||||||
private ChainedProperties cprops = new ChainedProperties(); |
|
||||||
|
|
||||||
private Stack stack = new Stack(); |
|
||||||
|
|
||||||
private boolean pendingTR = false; |
|
||||||
|
|
||||||
private boolean pendingTD = false; |
|
||||||
|
|
||||||
private boolean pendingLI = false; |
|
||||||
|
|
||||||
private StyleSheet style = new StyleSheet(); |
|
||||||
|
|
||||||
private boolean isPRE = false; |
|
||||||
|
|
||||||
private Stack tableState = new Stack(); |
|
||||||
|
|
||||||
private boolean skipText = false; |
|
||||||
|
|
||||||
private HashMap interfaceProps; |
|
||||||
|
|
||||||
private FactoryProperties factoryProperties = new FactoryProperties(); |
|
||||||
|
|
||||||
/**
 * Creates a worker that hands every element it builds to the given listener.
 *
 * @param document a class that implements <CODE>DocListener</CODE>
 */
public HTMLWorker(DocListener document) {
    this.document = document;
}
|
||||||
|
|
||||||
public void setStyleSheet(StyleSheet style) { |
|
||||||
this.style = style; |
|
||||||
} |
|
||||||
|
|
||||||
public StyleSheet getStyleSheet() { |
|
||||||
return style; |
|
||||||
} |
|
||||||
|
|
||||||
public void setInterfaceProps(HashMap interfaceProps) { |
|
||||||
this.interfaceProps = interfaceProps; |
|
||||||
FontFactoryImp ff = null; |
|
||||||
if (interfaceProps != null) |
|
||||||
ff = (FontFactoryImp) interfaceProps.get("font_factory"); |
|
||||||
if (ff != null) |
|
||||||
factoryProperties.setFontImp(ff); |
|
||||||
} |
|
||||||
|
|
||||||
public HashMap getInterfaceProps() { |
|
||||||
return interfaceProps; |
|
||||||
} |
|
||||||
|
|
||||||
public void parse(Reader reader) throws IOException { |
|
||||||
SimpleXMLParser.parse(this, null, reader, true); |
|
||||||
} |
|
||||||
|
|
||||||
public static ArrayList parseToList(Reader reader, StyleSheet style) |
|
||||||
throws IOException { |
|
||||||
return parseToList(reader, style, null); |
|
||||||
} |
|
||||||
|
|
||||||
public static ArrayList parseToList(Reader reader, StyleSheet style, |
|
||||||
HashMap interfaceProps) throws IOException { |
|
||||||
HTMLWorker worker = new HTMLWorker(null); |
|
||||||
if (style != null) |
|
||||||
worker.style = style; |
|
||||||
worker.document = worker; |
|
||||||
worker.setInterfaceProps(interfaceProps); |
|
||||||
worker.objectList = new ArrayList(); |
|
||||||
worker.parse(reader); |
|
||||||
return worker.objectList; |
|
||||||
} |
|
||||||
|
|
||||||
public void endDocument() { |
|
||||||
try { |
|
||||||
for (int k = 0; k < stack.size(); ++k) |
|
||||||
document.add((Element) stack.elementAt(k)); |
|
||||||
if (currentParagraph != null) |
|
||||||
document.add(currentParagraph); |
|
||||||
currentParagraph = null; |
|
||||||
} catch (Exception e) { |
|
||||||
throw new ExceptionConverter(e); |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
public void startDocument() { |
|
||||||
HashMap h = new HashMap(); |
|
||||||
style.applyStyle("body", h); |
|
||||||
cprops.addToChain("body", h); |
|
||||||
} |
|
||||||
|
|
||||||
|
|
||||||
public void startElement(String tag, HashMap h) { |
|
||||||
if (!tagsSupported.containsKey(tag)) |
|
||||||
return; |
|
||||||
try { |
|
||||||
style.applyStyle(tag, h); |
|
||||||
if(tag.equals("p")){ |
|
||||||
h.put(Markup.CSS_KEY_MARGINTOP, "16px"); |
|
||||||
h.put(Markup.CSS_KEY_MARGINBOTTOM, "16px"); |
|
||||||
} |
|
||||||
String follow = (String) FactoryProperties.followTags.get(tag); |
|
||||||
if (follow != null) { |
|
||||||
HashMap prop = new HashMap(); |
|
||||||
prop.put(follow, null); |
|
||||||
FactoryProperties.insertStyle(h, this.cprops); |
|
||||||
prop.putAll(h); |
|
||||||
|
|
||||||
cprops.addToChain(follow, prop); |
|
||||||
return; |
|
||||||
} |
|
||||||
FactoryProperties.insertStyle(h, cprops); |
|
||||||
if (tag.equals(HtmlTags.ANCHOR)) { |
|
||||||
cprops.addToChain(tag, h); |
|
||||||
if (currentParagraph == null) { |
|
||||||
currentParagraph = new Paragraph(); |
|
||||||
} |
|
||||||
stack.push(currentParagraph); |
|
||||||
currentParagraph = new Paragraph(); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals(HtmlTags.NEWLINE)) { |
|
||||||
if (currentParagraph == null) { |
|
||||||
currentParagraph = new Paragraph(); |
|
||||||
} |
|
||||||
currentParagraph.add(factoryProperties |
|
||||||
.createChunk("\n", cprops)); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals(HtmlTags.HORIZONTALRULE)) { |
|
||||||
// Attempting to duplicate the behavior seen on Firefox with
|
|
||||||
// http://www.w3schools.com/tags/tryit.asp?filename=tryhtml_hr_test
|
|
||||||
// where an initial break is only inserted when the preceding element doesn't
|
|
||||||
// end with a break, but a trailing break is always inserted.
|
|
||||||
boolean addLeadingBreak = true; |
|
||||||
if (currentParagraph == null) { |
|
||||||
currentParagraph = new Paragraph(); |
|
||||||
addLeadingBreak = false; |
|
||||||
} |
|
||||||
if (addLeadingBreak) { // Not a new paragraph
|
|
||||||
int numChunks = currentParagraph.getChunks().size(); |
|
||||||
if (numChunks == 0 || |
|
||||||
((Chunk)(currentParagraph.getChunks().get(numChunks - 1))).getContent().endsWith("\n")) |
|
||||||
addLeadingBreak = false; |
|
||||||
} |
|
||||||
String align = (String) h.get("align"); |
|
||||||
int hrAlign = Element.ALIGN_CENTER; |
|
||||||
if (align != null) { |
|
||||||
if (align.equalsIgnoreCase("left")) |
|
||||||
hrAlign = Element.ALIGN_LEFT; |
|
||||||
if (align.equalsIgnoreCase("right")) |
|
||||||
hrAlign = Element.ALIGN_RIGHT; |
|
||||||
} |
|
||||||
String width = (String) h.get("width"); |
|
||||||
float hrWidth = 1; |
|
||||||
if (width != null) { |
|
||||||
float tmpWidth = Markup.parseLength(width, Markup.DEFAULT_FONT_SIZE); |
|
||||||
if (tmpWidth > 0) hrWidth = tmpWidth; |
|
||||||
if (!width.endsWith("%")) |
|
||||||
hrWidth = 100; // Treat a pixel width as 100% for now.
|
|
||||||
} |
|
||||||
String size = (String) h.get("size"); |
|
||||||
float hrSize = 1; |
|
||||||
if (size != null) { |
|
||||||
float tmpSize = Markup.parseLength(size, Markup.DEFAULT_FONT_SIZE); |
|
||||||
if (tmpSize > 0) |
|
||||||
hrSize = tmpSize; |
|
||||||
} |
|
||||||
if (addLeadingBreak) |
|
||||||
currentParagraph.add(Chunk.NEWLINE); |
|
||||||
currentParagraph.add(new LineSeparator(hrSize, hrWidth, null, hrAlign, currentParagraph.getLeading()/2)); |
|
||||||
currentParagraph.add(Chunk.NEWLINE); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals(HtmlTags.CHUNK) || tag.equals(HtmlTags.SPAN)) { |
|
||||||
cprops.addToChain(tag, h); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals(HtmlTags.IMAGE)) { |
|
||||||
String src = (String) h.get(ElementTags.SRC); |
|
||||||
if (src == null) |
|
||||||
return; |
|
||||||
cprops.addToChain(tag, h); |
|
||||||
Image img = null; |
|
||||||
if (interfaceProps != null) { |
|
||||||
ImageProvider ip = (ImageProvider) interfaceProps |
|
||||||
.get("img_provider"); |
|
||||||
if (ip != null) |
|
||||||
img = ip.getImage(src, h, cprops, document); |
|
||||||
if (img == null) { |
|
||||||
HashMap images = (HashMap) interfaceProps |
|
||||||
.get("img_static"); |
|
||||||
if (images != null) { |
|
||||||
Image tim = (Image) images.get(src); |
|
||||||
if (tim != null) |
|
||||||
img = Image.getInstance(tim); |
|
||||||
} else { |
|
||||||
if (!src.startsWith("http")) { // relative src references only
|
|
||||||
String baseurl = (String) interfaceProps |
|
||||||
.get("img_baseurl"); |
|
||||||
if (baseurl != null) { |
|
||||||
src = baseurl + src; |
|
||||||
img = Image.getInstance(src); |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
//处理base64编码图片
|
|
||||||
if(src.startsWith("data")){ |
|
||||||
BASE64Decoder decoder = new BASE64Decoder(); |
|
||||||
String[] srcArray = src.split(","); |
|
||||||
String base64string = srcArray[srcArray.length -1]; |
|
||||||
byte[] bytes = decoder.decodeBuffer(base64string); |
|
||||||
try { |
|
||||||
img = Image.getInstance(bytes); |
|
||||||
}catch (Exception e){ |
|
||||||
|
|
||||||
} |
|
||||||
|
|
||||||
} |
|
||||||
if (img == null) { |
|
||||||
if (!src.startsWith("http")) { |
|
||||||
String path = cprops.getProperty("image_path"); |
|
||||||
if (path == null) |
|
||||||
path = ""; |
|
||||||
src = new File(path, src).getPath(); |
|
||||||
} |
|
||||||
img = Image.getInstance(src); |
|
||||||
} |
|
||||||
if(img == null){ |
|
||||||
return; |
|
||||||
} |
|
||||||
img.setSrcString(src); |
|
||||||
String align = (String) h.get("align"); |
|
||||||
String width = (String) h.get("width"); |
|
||||||
String height = (String) h.get("height"); |
|
||||||
String before = cprops.getProperty("before"); |
|
||||||
String after = cprops.getProperty("after"); |
|
||||||
if (before != null) |
|
||||||
img.setSpacingBefore(Float.parseFloat(before)); |
|
||||||
if (after != null) |
|
||||||
img.setSpacingAfter(Float.parseFloat(after)); |
|
||||||
float actualFontSize = Markup.parseLength(cprops |
|
||||||
.getProperty(ElementTags.SIZE), |
|
||||||
Markup.DEFAULT_FONT_SIZE); |
|
||||||
if (actualFontSize <= 0f) |
|
||||||
actualFontSize = Markup.DEFAULT_FONT_SIZE; |
|
||||||
float widthInPoints = Markup.parseLength(width, actualFontSize); |
|
||||||
float heightInPoints = Markup.parseLength(height, |
|
||||||
actualFontSize); |
|
||||||
if (widthInPoints > 0 && heightInPoints > 0) { |
|
||||||
img.scaleAbsolute(widthInPoints, heightInPoints); |
|
||||||
} else if (widthInPoints > 0) { |
|
||||||
heightInPoints = img.getHeight() * widthInPoints |
|
||||||
/ img.getWidth(); |
|
||||||
img.scaleAbsolute(widthInPoints, heightInPoints); |
|
||||||
} else if (heightInPoints > 0) { |
|
||||||
widthInPoints = img.getWidth() * heightInPoints |
|
||||||
/ img.getHeight(); |
|
||||||
img.scaleAbsolute(widthInPoints, heightInPoints); |
|
||||||
} |
|
||||||
img.setWidthPercentage(0); |
|
||||||
if (align != null) { |
|
||||||
endElement("p"); |
|
||||||
int ralign = Image.MIDDLE; |
|
||||||
if (align.equalsIgnoreCase("left")) |
|
||||||
ralign = Image.LEFT; |
|
||||||
else if (align.equalsIgnoreCase("right")) |
|
||||||
ralign = Image.RIGHT; |
|
||||||
img.setAlignment(ralign); |
|
||||||
Img i = null; |
|
||||||
boolean skip = false; |
|
||||||
if (interfaceProps != null) { |
|
||||||
i = (Img) interfaceProps.get("img_interface"); |
|
||||||
if (i != null) |
|
||||||
skip = i.process(img, h, cprops, document); |
|
||||||
} |
|
||||||
if (!skip) |
|
||||||
document.add(img); |
|
||||||
cprops.removeChain(tag); |
|
||||||
} else { |
|
||||||
Chunk ck = new Chunk(img, 0, 0); |
|
||||||
if(cprops.hasPropertyInChain("img", "padding-left")){ |
|
||||||
String ss = cprops.getPropertyFromChain("img", "padding-left"); |
|
||||||
ck.setAttribute("padding-left", Float.toString(Markup.parseLength(ss))); |
|
||||||
} |
|
||||||
if(cprops.hasPropertyInChain("img", "padding-right")){ |
|
||||||
String ss = cprops.getPropertyFromChain("img", "padding-right"); |
|
||||||
ck.setAttribute("padding-right", Float.toString(Markup.parseLength(ss))); |
|
||||||
} |
|
||||||
cprops.removeChain(tag); |
|
||||||
if (currentParagraph == null) { |
|
||||||
currentParagraph = FactoryProperties |
|
||||||
.createParagraph(cprops); |
|
||||||
} |
|
||||||
|
|
||||||
currentParagraph.add(ck); |
|
||||||
} |
|
||||||
return; |
|
||||||
} |
|
||||||
endElement("p"); |
|
||||||
if (tag.equals("h1") || tag.equals("h2") || tag.equals("h3") |
|
||||||
|| tag.equals("h4") || tag.equals("h5") || tag.equals("h6")) { |
|
||||||
if (!h.containsKey(ElementTags.SIZE)) { |
|
||||||
int v = 7 - Integer.parseInt(tag.substring(1)); |
|
||||||
h.put(ElementTags.SIZE, Integer.toString(v)); |
|
||||||
} |
|
||||||
cprops.addToChain(tag, h); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals(HtmlTags.UNORDEREDLIST)) { |
|
||||||
if (pendingLI) |
|
||||||
endElement(HtmlTags.LISTITEM); |
|
||||||
skipText = true; |
|
||||||
cprops.addToChain(tag, h); |
|
||||||
List list = new List(false); |
|
||||||
try{ |
|
||||||
list.setIndentationLeft(new Float(cprops.getProperty("indent")).floatValue()); |
|
||||||
}catch (Exception e) { |
|
||||||
list.setAutoindent(true); |
|
||||||
} |
|
||||||
list.setListSymbol("\u2022"); |
|
||||||
stack.push(list); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals(HtmlTags.ORDEREDLIST)) { |
|
||||||
if (pendingLI) |
|
||||||
endElement(HtmlTags.LISTITEM); |
|
||||||
skipText = true; |
|
||||||
cprops.addToChain(tag, h); |
|
||||||
List list = new List(true); |
|
||||||
try{ |
|
||||||
list.setIndentationLeft(new Float(cprops.getProperty("indent")).floatValue()); |
|
||||||
}catch (Exception e) { |
|
||||||
list.setAutoindent(true); |
|
||||||
} |
|
||||||
stack.push(list); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals(HtmlTags.LISTITEM)) { |
|
||||||
if (pendingLI) |
|
||||||
endElement(HtmlTags.LISTITEM); |
|
||||||
skipText = false; |
|
||||||
pendingLI = true; |
|
||||||
cprops.addToChain(tag, h); |
|
||||||
ListItem item = FactoryProperties.createListItem(cprops); |
|
||||||
stack.push(item); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals(HtmlTags.DIV) || tag.equals(HtmlTags.BODY) || tag.equals("p")) { |
|
||||||
cprops.addToChain(tag, h); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals(HtmlTags.PRE)) { |
|
||||||
if (!h.containsKey(ElementTags.FACE)) { |
|
||||||
h.put(ElementTags.FACE, "Courier"); |
|
||||||
} |
|
||||||
cprops.addToChain(tag, h); |
|
||||||
isPRE = true; |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals("tr")) { |
|
||||||
if (pendingTR) |
|
||||||
endElement("tr"); |
|
||||||
skipText = true; |
|
||||||
pendingTR = true; |
|
||||||
cprops.addToChain("tr", h); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals("td") || tag.equals("th")) { |
|
||||||
if (pendingTD) |
|
||||||
endElement(tag); |
|
||||||
skipText = false; |
|
||||||
pendingTD = true; |
|
||||||
cprops.addToChain("td", h); |
|
||||||
stack.push(new IncCell(tag, cprops)); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals("table")) { |
|
||||||
cprops.addToChain("table", h); |
|
||||||
IncTable table = new IncTable(h); |
|
||||||
stack.push(table); |
|
||||||
tableState.push(new boolean[] { pendingTR, pendingTD }); |
|
||||||
pendingTR = pendingTD = false; |
|
||||||
skipText = true; |
|
||||||
return; |
|
||||||
} |
|
||||||
} catch (Exception e) { |
|
||||||
throw new ExceptionConverter(e); |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public void endElement(String tag) { |
|
||||||
if (!tagsSupported.containsKey(tag)) |
|
||||||
return; |
|
||||||
try { |
|
||||||
String follow = (String) FactoryProperties.followTags.get(tag); |
|
||||||
if (follow != null) { |
|
||||||
cprops.removeChain(follow); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals("font") || tag.equals("span")) { |
|
||||||
cprops.removeChain(tag); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals("a")) { |
|
||||||
if (currentParagraph == null) { |
|
||||||
currentParagraph = new Paragraph(); |
|
||||||
} |
|
||||||
boolean skip = false; |
|
||||||
if (interfaceProps != null) { |
|
||||||
ALink i = (ALink) interfaceProps.get("alink_interface"); |
|
||||||
if (i != null) |
|
||||||
skip = i.process(currentParagraph, cprops); |
|
||||||
} |
|
||||||
if (!skip) { |
|
||||||
String href = cprops.getProperty("href"); |
|
||||||
if (href != null) { |
|
||||||
ArrayList chunks = currentParagraph.getChunks(); |
|
||||||
int size = chunks.size(); |
|
||||||
for (int k = 0; k < size; ++k) { |
|
||||||
Chunk ck = (Chunk) chunks.get(k); |
|
||||||
ck.setAnchor(href); |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
Paragraph tmp = (Paragraph) stack.pop(); |
|
||||||
Phrase tmp2 = new Phrase(); |
|
||||||
tmp2.add(currentParagraph); |
|
||||||
tmp.add(tmp2); |
|
||||||
currentParagraph = tmp; |
|
||||||
cprops.removeChain("a"); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals("br")) { |
|
||||||
return; |
|
||||||
} |
|
||||||
if (currentParagraph != null) { |
|
||||||
if (stack.empty()) |
|
||||||
document.add(currentParagraph); |
|
||||||
else { |
|
||||||
Object obj = stack.pop(); |
|
||||||
if (obj instanceof TextElementArray) { |
|
||||||
TextElementArray current = (TextElementArray) obj; |
|
||||||
current.add(currentParagraph); |
|
||||||
} |
|
||||||
stack.push(obj); |
|
||||||
} |
|
||||||
} |
|
||||||
currentParagraph = null; |
|
||||||
if (tag.equals(HtmlTags.UNORDEREDLIST) |
|
||||||
|| tag.equals(HtmlTags.ORDEREDLIST)) { |
|
||||||
if (pendingLI) |
|
||||||
endElement(HtmlTags.LISTITEM); |
|
||||||
skipText = false; |
|
||||||
cprops.removeChain(tag); |
|
||||||
if (stack.empty()) |
|
||||||
return; |
|
||||||
Object obj = stack.pop(); |
|
||||||
if (!(obj instanceof List)) { |
|
||||||
stack.push(obj); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (stack.empty()) |
|
||||||
document.add((Element) obj); |
|
||||||
else |
|
||||||
((TextElementArray) stack.peek()).add(obj); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals(HtmlTags.LISTITEM)) { |
|
||||||
pendingLI = false; |
|
||||||
skipText = true; |
|
||||||
cprops.removeChain(tag); |
|
||||||
if (stack.empty()) |
|
||||||
return; |
|
||||||
Object obj = stack.pop(); |
|
||||||
if (!(obj instanceof ListItem)) { |
|
||||||
stack.push(obj); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (stack.empty()) { |
|
||||||
document.add((Element) obj); |
|
||||||
return; |
|
||||||
} |
|
||||||
Object list = stack.pop(); |
|
||||||
if (!(list instanceof List)) { |
|
||||||
stack.push(list); |
|
||||||
return; |
|
||||||
} |
|
||||||
ListItem item = (ListItem) obj; |
|
||||||
((List) list).add(item); |
|
||||||
ArrayList cks = item.getChunks(); |
|
||||||
if (!cks.isEmpty()) |
|
||||||
item.getListSymbol() |
|
||||||
.setFont(((Chunk) cks.get(0)).getFont()); |
|
||||||
stack.push(list); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals("div") || tag.equals("body")) { |
|
||||||
cprops.removeChain(tag); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals(HtmlTags.PRE)) { |
|
||||||
cprops.removeChain(tag); |
|
||||||
isPRE = false; |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals("p")) { |
|
||||||
cprops.removeChain(tag); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals("h1") || tag.equals("h2") || tag.equals("h3") |
|
||||||
|| tag.equals("h4") || tag.equals("h5") || tag.equals("h6")) { |
|
||||||
cprops.removeChain(tag); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals("table")) { |
|
||||||
if (pendingTR) |
|
||||||
endElement("tr"); |
|
||||||
cprops.removeChain("table"); |
|
||||||
IncTable table = (IncTable) stack.pop(); |
|
||||||
PdfPTable tb = table.buildTable(); |
|
||||||
tb.setSplitRows(true); |
|
||||||
if (stack.empty()) |
|
||||||
document.add(tb); |
|
||||||
else |
|
||||||
((TextElementArray) stack.peek()).add(tb); |
|
||||||
boolean state[] = (boolean[]) tableState.pop(); |
|
||||||
pendingTR = state[0]; |
|
||||||
pendingTD = state[1]; |
|
||||||
skipText = false; |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals("tr")) { |
|
||||||
if (pendingTD) |
|
||||||
endElement("td"); |
|
||||||
pendingTR = false; |
|
||||||
String rowHeightPx = cprops.getLastChainProperty("height"); |
|
||||||
|
|
||||||
cprops.removeChain("tr"); |
|
||||||
ArrayList cells = new ArrayList(); |
|
||||||
IncTable table = null; |
|
||||||
while (true) { |
|
||||||
Object obj = stack.pop(); |
|
||||||
if (obj instanceof IncCell) { |
|
||||||
cells.add(((IncCell) obj).getCell()); |
|
||||||
} |
|
||||||
if (obj instanceof IncTable) { |
|
||||||
table = (IncTable) obj; |
|
||||||
break; |
|
||||||
} |
|
||||||
} |
|
||||||
float rowHeight = 0.0f; |
|
||||||
if(rowHeightPx!=null){ |
|
||||||
rowHeight = CSSUtils.parseFloat(rowHeightPx); |
|
||||||
} |
|
||||||
table.addCols(cells); |
|
||||||
table.endRow(rowHeight); |
|
||||||
|
|
||||||
stack.push(table); |
|
||||||
skipText = true; |
|
||||||
return; |
|
||||||
} |
|
||||||
if (tag.equals("td") || tag.equals("th")) { |
|
||||||
pendingTD = false; |
|
||||||
cprops.removeChain("td"); |
|
||||||
skipText = true; |
|
||||||
return; |
|
||||||
} |
|
||||||
} catch (Exception e) { |
|
||||||
throw new ExceptionConverter(e); |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
public void text(String str) { |
|
||||||
if (skipText) |
|
||||||
return; |
|
||||||
String content = str; |
|
||||||
if (isPRE) { |
|
||||||
if (currentParagraph == null) { |
|
||||||
currentParagraph = FactoryProperties.createParagraph(cprops); |
|
||||||
} |
|
||||||
Chunk chunk = factoryProperties.createChunk(content, cprops); |
|
||||||
currentParagraph.add(chunk); |
|
||||||
return; |
|
||||||
} |
|
||||||
if (content.trim().length() == 0 && content.indexOf(' ') < 0) { |
|
||||||
return; |
|
||||||
} |
|
||||||
|
|
||||||
StringBuffer buf = new StringBuffer(); |
|
||||||
int len = content.length(); |
|
||||||
char character; |
|
||||||
boolean newline = false; |
|
||||||
for (int i = 0; i < len; i++) { |
|
||||||
switch (character = content.charAt(i)) { |
|
||||||
case ' ': |
|
||||||
if (!newline) { |
|
||||||
buf.append(character); |
|
||||||
} |
|
||||||
break; |
|
||||||
case '\n': |
|
||||||
if (i > 0) { |
|
||||||
newline = true; |
|
||||||
buf.append(' '); |
|
||||||
} |
|
||||||
break; |
|
||||||
case '\r': |
|
||||||
break; |
|
||||||
case '\t': |
|
||||||
break; |
|
||||||
default: |
|
||||||
newline = false; |
|
||||||
buf.append(character); |
|
||||||
} |
|
||||||
} |
|
||||||
if (currentParagraph == null) { |
|
||||||
currentParagraph = FactoryProperties.createParagraph(cprops); |
|
||||||
} |
|
||||||
Chunk chunk = factoryProperties.createChunk(buf.toString(), cprops); |
|
||||||
currentParagraph.add(chunk); |
|
||||||
} |
|
||||||
|
|
||||||
public boolean add(Element element) throws DocumentException { |
|
||||||
objectList.add(element); |
|
||||||
return true; |
|
||||||
} |
|
||||||
|
|
||||||
public void clearTextWrap() throws DocumentException { |
|
||||||
} |
|
||||||
|
|
||||||
public void close() { |
|
||||||
} |
|
||||||
|
|
||||||
public boolean newPage() { |
|
||||||
return true; |
|
||||||
} |
|
||||||
|
|
||||||
public void open() { |
|
||||||
} |
|
||||||
|
|
||||||
public void resetFooter() { |
|
||||||
} |
|
||||||
|
|
||||||
public void resetHeader() { |
|
||||||
} |
|
||||||
|
|
||||||
public void resetPageCount() { |
|
||||||
} |
|
||||||
|
|
||||||
public void setFooter(HeaderFooter footer) { |
|
||||||
} |
|
||||||
|
|
||||||
public void setHeader(HeaderFooter header) { |
|
||||||
} |
|
||||||
|
|
||||||
public boolean setMarginMirroring(boolean marginMirroring) { |
|
||||||
return false; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* @see DocListener#setMarginMirroring(boolean) |
|
||||||
* @since 2.1.6 |
|
||||||
*/ |
|
||||||
public boolean setMarginMirroringTopBottom(boolean marginMirroring) { |
|
||||||
return false; |
|
||||||
} |
|
||||||
|
|
||||||
public boolean setMargins(float marginLeft, float marginRight, |
|
||||||
float marginTop, float marginBottom) { |
|
||||||
return true; |
|
||||||
} |
|
||||||
|
|
||||||
public void setPageCount(int pageN) { |
|
||||||
} |
|
||||||
|
|
||||||
public boolean setPageSize(Rectangle pageSize) { |
|
||||||
return true; |
|
||||||
} |
|
||||||
|
|
||||||
/** Space-separated names of the tags this worker handles. */
public static final String tagsSupportedString = "ol ul li a pre font span br p div body table td th tr i b u sub sup em strong s strike"
        + " h1 h2 h3 h4 h5 h6 img hr";

/** Lookup table whose keys are the supported tag names; the values are null. */
public static final HashMap tagsSupported = new HashMap();

/** Lookup table whose keys are the first characters of the supported tag names; the values are null. */
public static final HashMap tagsPrefixSupported = new HashMap();

static {
    final String[] names = tagsSupportedString.split(" ");
    for (int k = 0; k < names.length; ++k) {
        final String name = names[k];
        tagsSupported.put(name, null);
        tagsPrefixSupported.put(Character.valueOf(name.charAt(0)), null);
    }
}
|
||||||
} |
|
@ -1,780 +0,0 @@ |
|||||||
/* |
|
||||||
* Copyright 2003 Paulo Soares |
|
||||||
* |
|
||||||
* The contents of this file are subject to the Mozilla Public License Version 1.1 |
|
||||||
* (the "License"); you may not use this file except in compliance with the License. |
|
||||||
* You may obtain a copy of the License at http://www.mozilla.org/MPL/
|
|
||||||
* |
|
||||||
* Software distributed under the License is distributed on an "AS IS" basis, |
|
||||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
|
||||||
* for the specific language governing rights and limitations under the License. |
|
||||||
* |
|
||||||
* The Original Code is 'iText, a free JAVA-PDF library'. |
|
||||||
* |
|
||||||
* The Initial Developer of the Original Code is Bruno Lowagie. Portions created by |
|
||||||
* the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie. |
|
||||||
* All Rights Reserved. |
|
||||||
* Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer |
|
||||||
* are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved. |
|
||||||
* |
|
||||||
* Contributor(s): all the names of the contributors are added in the source code |
|
||||||
* where applicable. |
|
||||||
* |
|
||||||
* Alternatively, the contents of this file may be used under the terms of the |
|
||||||
* LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the |
|
||||||
* provisions of LGPL are applicable instead of those above. If you wish to |
|
||||||
* allow use of your version of this file only under the terms of the LGPL |
|
||||||
* License and not to allow others to use your version of this file under |
|
||||||
* the MPL, indicate your decision by deleting the provisions above and |
|
||||||
* replace them with the notice and other provisions required by the LGPL. |
|
||||||
* If you do not delete the provisions above, a recipient may use your version |
|
||||||
* of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE. |
|
||||||
* |
|
||||||
* This library is free software; you can redistribute it and/or modify it |
|
||||||
* under the terms of the MPL as stated above or under the terms of the GNU |
|
||||||
* Library General Public License as published by the Free Software Foundation; |
|
||||||
* either version 2 of the License, or any later version. |
|
||||||
* |
|
||||||
* This library is distributed in the hope that it will be useful, but WITHOUT |
|
||||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
|
||||||
* FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more |
|
||||||
* details. |
|
||||||
* |
|
||||||
* If you didn't download this code from the following link, you should check if |
|
||||||
* you aren't using an obsolete version: |
|
||||||
* http://www.lowagie.com/iText/
|
|
||||||
* |
|
||||||
* The code to recognize the encoding in this class and in the convenience class IanaEncodings was taken from Apache Xerces published under the following license: |
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
|
||||||
* contributor license agreements. See the NOTICE file distributed with |
|
||||||
* this work for additional information regarding copyright ownership. |
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
|
||||||
* (the "License"); you may not use this file except in compliance with |
|
||||||
* the License. You may obtain a copy of the License at |
|
||||||
* |
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
* |
|
||||||
* Unless required by applicable law or agreed to in writing, software |
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS, |
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
||||||
* See the License for the specific language governing permissions and |
|
||||||
* limitations under the License. |
|
||||||
* |
|
||||||
* Part of this code is based on the Quick-and-Dirty XML parser by Steven Brandt. |
|
||||||
* The code for the Quick-and-Dirty parser was published in JavaWorld (java tip 128). |
|
||||||
* Steven Brandt and JavaWorld gave permission to use the code for free. |
|
||||||
* (Bruno Lowagie and Paulo Soares chose to use it under the MPL/LGPL in |
|
||||||
* conformance with the rest of the code). |
|
||||||
* The original code can be found on this url: <A HREF="http://www.javaworld.com/javatips/jw-javatip128_p.html">http://www.javaworld.com/javatips/jw-javatip128_p.html</A>.
|
|
||||||
* It was substantially refactored by Bruno Lowagie. |
|
||||||
* |
|
||||||
* The method 'private static String getEncodingName(byte[] b4)' was found |
|
||||||
* in org.apache.xerces.impl.XMLEntityManager, originally published by the |
|
||||||
* Apache Software Foundation under the Apache Software License; now being |
|
||||||
* used in iText under the MPL. |
|
||||||
*/ |
|
||||||
package com.fr.third.com.lowagie.text.xml.simpleparser; |
|
||||||
|
|
||||||
import com.fr.third.com.lowagie.text.html.simpleparser.HTMLWorker; |
|
||||||
import java.io.BufferedReader; |
|
||||||
import java.io.ByteArrayOutputStream; |
|
||||||
import java.io.IOException; |
|
||||||
import java.io.InputStream; |
|
||||||
import java.io.InputStreamReader; |
|
||||||
import java.io.Reader; |
|
||||||
import java.util.HashMap; |
|
||||||
import java.util.Stack; |
|
||||||
|
|
||||||
/** |
|
||||||
* A simple XML and HTML parser. This parser is, like the SAX parser, |
|
||||||
* an event based parser, but with much less functionality. |
|
||||||
* <p> |
|
||||||
* The parser can: |
|
||||||
* <p> |
|
||||||
* <ul> |
|
||||||
* <li>It recognizes the encoding used |
|
||||||
* <li>It recognizes all the elements' start tags and end tags |
|
||||||
* <li>It lists attributes, where attribute values can be enclosed in single or double quotes |
|
||||||
* <li>It recognizes the <code>&lt;![CDATA[ ... ]]&gt;</code> construct
|
||||||
* <li>It recognizes the standard entities: &amp;, &lt;, &gt;, &quot;, and &apos;, as well as numeric entities |
|
||||||
* <li>It maps lines ending in <code>\r\n</code> and <code>\r</code> to <code>\n</code> on input, in accordance with the XML Specification, Section 2.11 |
|
||||||
* </ul> |
|
||||||
* <p> |
|
||||||
*/ |
|
||||||
public final class SimpleXMLParser { |
|
||||||
/** possible states */ |
|
||||||
private final static int UNKNOWN = 0; |
|
||||||
private final static int TEXT = 1; |
|
||||||
private final static int TAG_ENCOUNTERED = 2; |
|
||||||
private final static int EXAMIN_TAG = 3; |
|
||||||
private final static int TAG_EXAMINED = 4; |
|
||||||
private final static int IN_CLOSETAG = 5; |
|
||||||
private final static int SINGLE_TAG = 6; |
|
||||||
private final static int CDATA = 7; |
|
||||||
private final static int COMMENT = 8; |
|
||||||
private final static int PI = 9; |
|
||||||
private final static int ENTITY = 10; |
|
||||||
private final static int QUOTE = 11; |
|
||||||
private final static int ATTRIBUTE_KEY = 12; |
|
||||||
private final static int ATTRIBUTE_EQUAL = 13; |
|
||||||
private final static int ATTRIBUTE_VALUE = 14; |
|
||||||
|
|
||||||
/** the state stack */ |
|
||||||
Stack stack; |
|
||||||
/** The current character. */ |
|
||||||
int character = 0; |
|
||||||
/** The previous character. */ |
|
||||||
int previousCharacter = -1; |
|
||||||
/** the line we are currently reading */ |
|
||||||
int lines = 1; |
|
||||||
/** the column where the current character occurs */ |
|
||||||
int columns = 0; |
|
||||||
/** was the last character equivalent to a newline? */ |
|
||||||
boolean eol = false; |
|
||||||
/** |
|
||||||
* A boolean indicating if the next character should be taken into account |
|
||||||
* if it's a space character. When nospace is false, the previous character |
|
||||||
* wasn't whitespace. |
|
||||||
* @since 2.1.5 |
|
||||||
*/ |
|
||||||
boolean nowhite = false; |
|
||||||
/** the current state */ |
|
||||||
int state; |
|
||||||
/** Are we parsing HTML? */ |
|
||||||
boolean html; |
|
||||||
/** current text (whatever is encountered between tags) */ |
|
||||||
StringBuffer text = new StringBuffer(); |
|
||||||
/** current entity (whatever is encountered between & and ;) */ |
|
||||||
StringBuffer entity = new StringBuffer(); |
|
||||||
/** current tagname */ |
|
||||||
String tag = null; |
|
||||||
/** current attributes */ |
|
||||||
HashMap attributes = null; |
|
||||||
/** The handler to which we are going to forward document content */ |
|
||||||
SimpleXMLDocHandler doc; |
|
||||||
/** The handler to which we are going to forward comments. */ |
|
||||||
SimpleXMLDocHandlerComment comment; |
|
||||||
/** Keeps track of the number of tags that are open. */ |
|
||||||
int nested = 0; |
|
||||||
/** the quote character that was used to open the quote. */ |
|
||||||
int quoteCharacter = '"'; |
|
||||||
/** the attribute key. */ |
|
||||||
String attributekey = null; |
|
||||||
/** the attribute value. */ |
|
||||||
String attributevalue = null; |
|
||||||
|
|
||||||
/**
 * Creates a simple XML parser object.
 * Call go(Reader) immediately after creation.
 * @param doc the handler that receives the document content
 * @param comment the handler that receives the comments
 * @param html true to parse HTML; the parser then starts in the TEXT state
 *             instead of the UNKNOWN state
 */
private SimpleXMLParser(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, boolean html) {
    this.html = html;
    this.comment = comment;
    this.doc = doc;
    this.stack = new Stack();
    if (html) {
        this.state = TEXT;
    } else {
        this.state = UNKNOWN;
    }
}
|
||||||
|
|
||||||
/** |
|
||||||
* Does the actual parsing. Perform this immediately |
|
||||||
* after creating the parser object. |
|
||||||
*/ |
|
||||||
private void go(Reader r) throws IOException { |
|
||||||
BufferedReader reader; |
|
||||||
if (r instanceof BufferedReader) |
|
||||||
reader = (BufferedReader)r; |
|
||||||
else |
|
||||||
reader = new BufferedReader(r); |
|
||||||
doc.startDocument(); |
|
||||||
while(true) { |
|
||||||
// read a new character
|
|
||||||
if (previousCharacter == -1) { |
|
||||||
character = reader.read(); |
|
||||||
} |
|
||||||
// or re-examine the previous character
|
|
||||||
else { |
|
||||||
character = previousCharacter; |
|
||||||
previousCharacter = -1; |
|
||||||
} |
|
||||||
|
|
||||||
// the end of the file was reached
|
|
||||||
if (character == -1) { |
|
||||||
if (html) { |
|
||||||
if (html && state == TEXT) |
|
||||||
flush(); |
|
||||||
doc.endDocument(); |
|
||||||
} else { |
|
||||||
throwException("Missing end tag"); |
|
||||||
} |
|
||||||
return; |
|
||||||
} |
|
||||||
|
|
||||||
// dealing with \n and \r
|
|
||||||
if (character == '\n' && eol) { |
|
||||||
eol = false; |
|
||||||
continue; |
|
||||||
} else if (eol) { |
|
||||||
eol = false; |
|
||||||
} else if (character == '\n') { |
|
||||||
lines++; |
|
||||||
columns = 0; |
|
||||||
} else if (character == '\r') { |
|
||||||
eol = true; |
|
||||||
character = '\n'; |
|
||||||
lines++; |
|
||||||
columns = 0; |
|
||||||
} else { |
|
||||||
columns++; |
|
||||||
} |
|
||||||
|
|
||||||
switch(state) { |
|
||||||
// we are in an unknown state before there's actual content
|
|
||||||
case UNKNOWN: |
|
||||||
if(character == '<') { |
|
||||||
beginnOfTag((char) reader.read(), UNKNOWN); |
|
||||||
} |
|
||||||
break; |
|
||||||
// we can encounter any content
|
|
||||||
case TEXT: |
|
||||||
if(character == '<') { |
|
||||||
beginnOfTag((char) reader.read(), TEXT); |
|
||||||
} else if(character == '&') { |
|
||||||
saveState(state); |
|
||||||
entity.setLength(0); |
|
||||||
state = ENTITY; |
|
||||||
} else if (Character.isWhitespace((char)character) && character != 12288) { |
|
||||||
if (nowhite) |
|
||||||
text.append((char)character); |
|
||||||
nowhite = false; |
|
||||||
} else { |
|
||||||
text.append((char)character); |
|
||||||
nowhite = true; |
|
||||||
} |
|
||||||
break; |
|
||||||
// we have just seen a < and are wondering what we are looking at
|
|
||||||
// <foo>, </foo>, <!-- ... --->, etc.
|
|
||||||
case TAG_ENCOUNTERED: |
|
||||||
initTag(); |
|
||||||
if(character == '/') { |
|
||||||
state = IN_CLOSETAG; |
|
||||||
} else if (character == '?') { |
|
||||||
restoreState(); |
|
||||||
state = PI; |
|
||||||
} else { |
|
||||||
text.append((char)character); |
|
||||||
state = EXAMIN_TAG; |
|
||||||
} |
|
||||||
break; |
|
||||||
// we are processing something like this <foo ... >.
|
|
||||||
// It could still be a <!-- ... --> or something.
|
|
||||||
case EXAMIN_TAG: |
|
||||||
if(character == '>') { |
|
||||||
doTag(); |
|
||||||
processTag(true); |
|
||||||
initTag(); |
|
||||||
state = restoreState(); |
|
||||||
} else if(character == '/') { |
|
||||||
state = SINGLE_TAG; |
|
||||||
} else if(character == '-' && text.toString().equals("!-")) { |
|
||||||
flush(); |
|
||||||
state = COMMENT; |
|
||||||
} else if(character == '[' && text.toString().equals("![CDATA")) { |
|
||||||
flush(); |
|
||||||
state = CDATA; |
|
||||||
} else if(character == 'E' && text.toString().equals("!DOCTYP")) { |
|
||||||
flush(); |
|
||||||
state = PI; |
|
||||||
} else if(Character.isWhitespace((char)character)) { |
|
||||||
doTag(); |
|
||||||
state = TAG_EXAMINED; |
|
||||||
} else { |
|
||||||
text.append((char)character); |
|
||||||
} |
|
||||||
break; |
|
||||||
// we know the name of the tag now.
|
|
||||||
case TAG_EXAMINED: |
|
||||||
if(character == '>') { |
|
||||||
processTag(true); |
|
||||||
initTag(); |
|
||||||
state = restoreState(); |
|
||||||
} else if(character == '/') { |
|
||||||
state = SINGLE_TAG; |
|
||||||
} else if(Character.isWhitespace((char)character)) { |
|
||||||
// empty
|
|
||||||
} else { |
|
||||||
text.append((char)character); |
|
||||||
state = ATTRIBUTE_KEY; |
|
||||||
} |
|
||||||
break; |
|
||||||
|
|
||||||
// we are processing a closing tag: e.g. </foo>
|
|
||||||
case IN_CLOSETAG: |
|
||||||
if(character == '>') { |
|
||||||
doTag(); |
|
||||||
processTag(false); |
|
||||||
if(!html && nested==0) return; |
|
||||||
state = restoreState(); |
|
||||||
} else { |
|
||||||
if (!Character.isWhitespace((char)character)) |
|
||||||
text.append((char)character); |
|
||||||
} |
|
||||||
break; |
|
||||||
|
|
||||||
// we have just seen something like this: <foo a="b"/
|
|
||||||
// and are looking for the final >.
|
|
||||||
case SINGLE_TAG: |
|
||||||
if(character != '>') |
|
||||||
throwException("Expected > for tag: <"+tag+"/>"); |
|
||||||
doTag(); |
|
||||||
processTag(true); |
|
||||||
processTag(false); |
|
||||||
initTag(); |
|
||||||
if(!html && nested==0) { |
|
||||||
doc.endDocument(); |
|
||||||
return; |
|
||||||
} |
|
||||||
state = restoreState(); |
|
||||||
break; |
|
||||||
|
|
||||||
// we are processing CDATA
|
|
||||||
case CDATA: |
|
||||||
if(character == '>' |
|
||||||
&& text.toString().endsWith("]]")) { |
|
||||||
text.setLength(text.length()-2); |
|
||||||
flush(); |
|
||||||
state = restoreState(); |
|
||||||
} else |
|
||||||
text.append((char)character); |
|
||||||
break; |
|
||||||
|
|
||||||
// we are processing a comment. We are inside
|
|
||||||
// the <!-- .... --> looking for the -->.
|
|
||||||
case COMMENT: |
|
||||||
if(character == '>' |
|
||||||
&& text.toString().endsWith("--")) { |
|
||||||
text.setLength(text.length() - 2); |
|
||||||
flush(); |
|
||||||
state = restoreState(); |
|
||||||
} else |
|
||||||
text.append((char)character); |
|
||||||
break; |
|
||||||
|
|
||||||
// We are inside one of these <? ... ?> or one of these <!DOCTYPE ... >
|
|
||||||
case PI: |
|
||||||
if(character == '>') { |
|
||||||
state = restoreState(); |
|
||||||
if(state == TEXT) state = UNKNOWN; |
|
||||||
} |
|
||||||
break; |
|
||||||
|
|
||||||
// we are processing an entity, e.g. <, », etc.
|
|
||||||
case ENTITY: |
|
||||||
if(character == ';') { |
|
||||||
state = restoreState(); |
|
||||||
String cent = entity.toString(); |
|
||||||
entity.setLength(0); |
|
||||||
char ce = EntitiesToUnicode.decodeEntity(cent); |
|
||||||
if (ce == '\0') |
|
||||||
text.append('&').append(cent).append(';'); |
|
||||||
else |
|
||||||
text.append(ce); |
|
||||||
} else if ((character != '#' && (character < '0' || character > '9') && (character < 'a' || character > 'z') |
|
||||||
&& (character < 'A' || character > 'Z')) || entity.length() >= 7) { |
|
||||||
state = restoreState(); |
|
||||||
previousCharacter = character; |
|
||||||
text.append('&').append(entity.toString()); |
|
||||||
entity.setLength(0); |
|
||||||
} |
|
||||||
else { |
|
||||||
entity.append((char)character); |
|
||||||
} |
|
||||||
break; |
|
||||||
// We are processing the quoted right-hand side of an element's attribute.
|
|
||||||
case QUOTE: |
|
||||||
if (html && quoteCharacter == ' ' && character == '>') { |
|
||||||
flush(); |
|
||||||
processTag(true); |
|
||||||
initTag(); |
|
||||||
state = restoreState(); |
|
||||||
} |
|
||||||
else if (html && quoteCharacter == ' ' && Character.isWhitespace((char)character)) { |
|
||||||
flush(); |
|
||||||
state = TAG_EXAMINED; |
|
||||||
} |
|
||||||
else if (html && quoteCharacter == ' ') { |
|
||||||
text.append((char)character); |
|
||||||
} |
|
||||||
else if(character == quoteCharacter) { |
|
||||||
flush(); |
|
||||||
state = TAG_EXAMINED; |
|
||||||
} else if(" \r\n\u0009".indexOf(character)>=0) { |
|
||||||
text.append(' '); |
|
||||||
} else if(character == '&') { |
|
||||||
saveState(state); |
|
||||||
state = ENTITY; |
|
||||||
entity.setLength(0); |
|
||||||
} else { |
|
||||||
text.append((char)character); |
|
||||||
} |
|
||||||
break; |
|
||||||
|
|
||||||
case ATTRIBUTE_KEY: |
|
||||||
if(Character.isWhitespace((char)character)) { |
|
||||||
flush(); |
|
||||||
state = ATTRIBUTE_EQUAL; |
|
||||||
} else if(character == '=') { |
|
||||||
flush(); |
|
||||||
state = ATTRIBUTE_VALUE; |
|
||||||
} else if (html && character == '>') { |
|
||||||
text.setLength(0); |
|
||||||
processTag(true); |
|
||||||
initTag(); |
|
||||||
state = restoreState(); |
|
||||||
} else { |
|
||||||
text.append((char)character); |
|
||||||
} |
|
||||||
break; |
|
||||||
|
|
||||||
case ATTRIBUTE_EQUAL: |
|
||||||
if(character == '=') { |
|
||||||
state = ATTRIBUTE_VALUE; |
|
||||||
} else if(Character.isWhitespace((char)character)) { |
|
||||||
// empty
|
|
||||||
} else if (html && character == '>') { |
|
||||||
text.setLength(0); |
|
||||||
processTag(true); |
|
||||||
initTag(); |
|
||||||
state = restoreState(); |
|
||||||
} else if (html && character == '/') { |
|
||||||
flush(); |
|
||||||
state = SINGLE_TAG; |
|
||||||
} else if (html) { |
|
||||||
flush(); |
|
||||||
text.append((char)character); |
|
||||||
state = ATTRIBUTE_KEY; |
|
||||||
} else { |
|
||||||
throwException("Error in attribute processing."); |
|
||||||
} |
|
||||||
break; |
|
||||||
|
|
||||||
case ATTRIBUTE_VALUE: |
|
||||||
if(character == '"' || character == '\'') { |
|
||||||
quoteCharacter = character; |
|
||||||
state = QUOTE; |
|
||||||
} else if(Character.isWhitespace((char)character)) { |
|
||||||
// empty
|
|
||||||
} else if (html && character == '>') { |
|
||||||
flush(); |
|
||||||
processTag(true); |
|
||||||
initTag(); |
|
||||||
state = restoreState(); |
|
||||||
} else if (html) { |
|
||||||
text.append((char)character); |
|
||||||
quoteCharacter = ' '; |
|
||||||
state = QUOTE; |
|
||||||
} else { |
|
||||||
throwException("Error in attribute processing"); |
|
||||||
} |
|
||||||
break; |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Gets a state from the stack |
|
||||||
* @return the previous state |
|
||||||
*/ |
|
||||||
private int restoreState() { |
|
||||||
if(!stack.empty()) |
|
||||||
return ((Integer)stack.pop()).intValue(); |
|
||||||
else |
|
||||||
return UNKNOWN; |
|
||||||
} |
|
||||||
/** |
|
||||||
* Adds a state to the stack. |
|
||||||
* @param s a state to add to the stack |
|
||||||
*/ |
|
||||||
private void saveState(int s) { |
|
||||||
stack.push(new Integer(s)); |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* 处理标签的开头,若不在支持标签范围内,将<符号作为文本处理,例:<1111 (仿造浏览器的处理方式) |
|
||||||
*/ |
|
||||||
public void beginnOfTag(char c, int type) { |
|
||||||
previousCharacter = c; |
|
||||||
if (c == -1) { |
|
||||||
return; |
|
||||||
} |
|
||||||
if (c == '/' || HTMLWorker.tagsPrefixSupported.containsKey(c)) { |
|
||||||
if (type == TEXT) { |
|
||||||
flush(); |
|
||||||
} |
|
||||||
saveState(TEXT); |
|
||||||
state = TAG_ENCOUNTERED; |
|
||||||
return; |
|
||||||
} |
|
||||||
text.append((char) character); |
|
||||||
nowhite = true; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Flushes the text that is currently in the buffer. |
|
||||||
* The text can be ignored, added to the document |
|
||||||
* as content or as comment,... depending on the current state. |
|
||||||
*/ |
|
||||||
private void flush() { |
|
||||||
switch(state){ |
|
||||||
case TEXT: |
|
||||||
case CDATA: |
|
||||||
if(text.length() > 0) { |
|
||||||
doc.text(text.toString()); |
|
||||||
} |
|
||||||
break; |
|
||||||
case COMMENT: |
|
||||||
if (comment != null) { |
|
||||||
comment.comment(text.toString()); |
|
||||||
} |
|
||||||
break; |
|
||||||
case ATTRIBUTE_KEY: |
|
||||||
attributekey = text.toString(); |
|
||||||
if (html) |
|
||||||
attributekey = attributekey.toLowerCase(); |
|
||||||
break; |
|
||||||
case QUOTE: |
|
||||||
case ATTRIBUTE_VALUE: |
|
||||||
attributevalue = text.toString(); |
|
||||||
attributes.put(attributekey,attributevalue); |
|
||||||
break; |
|
||||||
default: |
|
||||||
// do nothing
|
|
||||||
} |
|
||||||
text.setLength(0); |
|
||||||
} |
|
||||||
/** |
|
||||||
* Initialized the tag name and attributes. |
|
||||||
*/ |
|
||||||
private void initTag() { |
|
||||||
tag = null; |
|
||||||
attributes = new HashMap(); |
|
||||||
} |
|
||||||
/** Sets the name of the tag. */ |
|
||||||
private void doTag() { |
|
||||||
if(tag == null) |
|
||||||
tag = text.toString(); |
|
||||||
if (html) |
|
||||||
tag = tag.toLowerCase(); |
|
||||||
text.setLength(0); |
|
||||||
} |
|
||||||
/** |
|
||||||
* processes the tag. |
|
||||||
* @param start if true we are dealing with a tag that has just been opened; if false we are closing a tag. |
|
||||||
*/ |
|
||||||
private void processTag(boolean start) { |
|
||||||
if (start) { |
|
||||||
nested++; |
|
||||||
doc.startElement(tag,attributes); |
|
||||||
} |
|
||||||
else { |
|
||||||
nested--; |
|
||||||
doc.endElement(tag); |
|
||||||
} |
|
||||||
} |
|
||||||
/** Throws an exception */ |
|
||||||
private void throwException(String s) throws IOException { |
|
||||||
throw new IOException(s+" near line " + lines + ", column " + columns); |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Parses the XML document firing the events to the handler. |
|
||||||
* @param doc the document handler |
|
||||||
* @param r the document. The encoding is already resolved. The reader is not closed |
|
||||||
* @throws IOException on error |
|
||||||
*/ |
|
||||||
public static void parse(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, Reader r, boolean html) throws IOException { |
|
||||||
SimpleXMLParser parser = new SimpleXMLParser(doc, comment, html); |
|
||||||
parser.go(r); |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Parses the XML document firing the events to the handler. |
|
||||||
* @param doc the document handler |
|
||||||
* @param in the document. The encoding is deduced from the stream. The stream is not closed |
|
||||||
* @throws IOException on error |
|
||||||
*/ |
|
||||||
public static void parse(SimpleXMLDocHandler doc, InputStream in) throws IOException { |
|
||||||
byte b4[] = new byte[4]; |
|
||||||
int count = in.read(b4); |
|
||||||
if (count != 4) |
|
||||||
throw new IOException("Insufficient length."); |
|
||||||
String encoding = getEncodingName(b4); |
|
||||||
String decl = null; |
|
||||||
if (encoding.equals("UTF-8")) { |
|
||||||
StringBuffer sb = new StringBuffer(); |
|
||||||
int c; |
|
||||||
while ((c = in.read()) != -1) { |
|
||||||
if (c == '>') |
|
||||||
break; |
|
||||||
sb.append((char)c); |
|
||||||
} |
|
||||||
decl = sb.toString(); |
|
||||||
} |
|
||||||
else if (encoding.equals("CP037")) { |
|
||||||
ByteArrayOutputStream bi = new ByteArrayOutputStream(); |
|
||||||
int c; |
|
||||||
while ((c = in.read()) != -1) { |
|
||||||
if (c == 0x6e) // that's '>' in ebcdic
|
|
||||||
break; |
|
||||||
bi.write(c); |
|
||||||
} |
|
||||||
decl = new String(bi.toByteArray(), "CP037"); |
|
||||||
} |
|
||||||
if (decl != null) { |
|
||||||
decl = getDeclaredEncoding(decl); |
|
||||||
if (decl != null) |
|
||||||
encoding = decl; |
|
||||||
} |
|
||||||
parse(doc, new InputStreamReader(in, IanaEncodings.getJavaEncoding(encoding))); |
|
||||||
} |
|
||||||
|
|
||||||
private static String getDeclaredEncoding(String decl) { |
|
||||||
if (decl == null) |
|
||||||
return null; |
|
||||||
int idx = decl.indexOf("encoding"); |
|
||||||
if (idx < 0) |
|
||||||
return null; |
|
||||||
int idx1 = decl.indexOf('"', idx); |
|
||||||
int idx2 = decl.indexOf('\'', idx); |
|
||||||
if (idx1 == idx2) |
|
||||||
return null; |
|
||||||
if ((idx1 < 0 && idx2 > 0) || (idx2 > 0 && idx2 < idx1)) { |
|
||||||
int idx3 = decl.indexOf('\'', idx2 + 1); |
|
||||||
if (idx3 < 0) |
|
||||||
return null; |
|
||||||
return decl.substring(idx2 + 1, idx3); |
|
||||||
} |
|
||||||
if ((idx2 < 0 && idx1 > 0) || (idx1 > 0 && idx1 < idx2)) { |
|
||||||
int idx3 = decl.indexOf('"', idx1 + 1); |
|
||||||
if (idx3 < 0) |
|
||||||
return null; |
|
||||||
return decl.substring(idx1 + 1, idx3); |
|
||||||
} |
|
||||||
return null; |
|
||||||
} |
|
||||||
|
|
||||||
public static void parse(SimpleXMLDocHandler doc,Reader r) throws IOException { |
|
||||||
parse(doc, null, r, false); |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Escapes a string with the appropriated XML codes. |
|
||||||
* @param s the string to be escaped |
|
||||||
* @param onlyASCII codes above 127 will always be escaped with &#nn; if <CODE>true</CODE> |
|
||||||
* @return the escaped string |
|
||||||
*/ |
|
||||||
public static String escapeXML(String s, boolean onlyASCII) { |
|
||||||
char cc[] = s.toCharArray(); |
|
||||||
int len = cc.length; |
|
||||||
StringBuffer sb = new StringBuffer(); |
|
||||||
for (int k = 0; k < len; ++k) { |
|
||||||
int c = cc[k]; |
|
||||||
switch (c) { |
|
||||||
case '<': |
|
||||||
sb.append("<"); |
|
||||||
break; |
|
||||||
case '>': |
|
||||||
sb.append(">"); |
|
||||||
break; |
|
||||||
case '&': |
|
||||||
sb.append("&"); |
|
||||||
break; |
|
||||||
case '"': |
|
||||||
sb.append("""); |
|
||||||
break; |
|
||||||
case '\'': |
|
||||||
sb.append("'"); |
|
||||||
break; |
|
||||||
default: |
|
||||||
if ((c == 0x9) || (c == 0xA) || (c == 0xD) |
|
||||||
|| ((c >= 0x20) && (c <= 0xD7FF)) |
|
||||||
|| ((c >= 0xE000) && (c <= 0xFFFD)) |
|
||||||
|| ((c >= 0x10000) && (c <= 0x10FFFF))) { |
|
||||||
if (onlyASCII && c > 127) |
|
||||||
sb.append("&#").append(c).append(';'); |
|
||||||
else |
|
||||||
sb.append((char)c); |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
return sb.toString(); |
|
||||||
} |
|
||||||
/** |
|
||||||
* Returns the IANA encoding name that is auto-detected from |
|
||||||
* the bytes specified, with the endian-ness of that encoding where appropriate. |
|
||||||
* (method found in org.apache.xerces.impl.XMLEntityManager, originally published |
|
||||||
* by the Apache Software Foundation under the Apache Software License; now being |
|
||||||
* used in iText under the MPL) |
|
||||||
* @param b4 The first four bytes of the input. |
|
||||||
* @return an IANA-encoding string |
|
||||||
*/ |
|
||||||
private static String getEncodingName(byte[] b4) { |
|
||||||
|
|
||||||
// UTF-16, with BOM
|
|
||||||
int b0 = b4[0] & 0xFF; |
|
||||||
int b1 = b4[1] & 0xFF; |
|
||||||
if (b0 == 0xFE && b1 == 0xFF) { |
|
||||||
// UTF-16, big-endian
|
|
||||||
return "UTF-16BE"; |
|
||||||
} |
|
||||||
if (b0 == 0xFF && b1 == 0xFE) { |
|
||||||
// UTF-16, little-endian
|
|
||||||
return "UTF-16LE"; |
|
||||||
} |
|
||||||
|
|
||||||
// UTF-8 with a BOM
|
|
||||||
int b2 = b4[2] & 0xFF; |
|
||||||
if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { |
|
||||||
return "UTF-8"; |
|
||||||
} |
|
||||||
|
|
||||||
// other encodings
|
|
||||||
int b3 = b4[3] & 0xFF; |
|
||||||
if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) { |
|
||||||
// UCS-4, big endian (1234)
|
|
||||||
return "ISO-10646-UCS-4"; |
|
||||||
} |
|
||||||
if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) { |
|
||||||
// UCS-4, little endian (4321)
|
|
||||||
return "ISO-10646-UCS-4"; |
|
||||||
} |
|
||||||
if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) { |
|
||||||
// UCS-4, unusual octet order (2143)
|
|
||||||
// REVISIT: What should this be?
|
|
||||||
return "ISO-10646-UCS-4"; |
|
||||||
} |
|
||||||
if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) { |
|
||||||
// UCS-4, unusual octet order (3412)
|
|
||||||
// REVISIT: What should this be?
|
|
||||||
return "ISO-10646-UCS-4"; |
|
||||||
} |
|
||||||
if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { |
|
||||||
// UTF-16, big-endian, no BOM
|
|
||||||
// (or could turn out to be UCS-2...
|
|
||||||
// REVISIT: What should this be?
|
|
||||||
return "UTF-16BE"; |
|
||||||
} |
|
||||||
if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { |
|
||||||
// UTF-16, little-endian, no BOM
|
|
||||||
// (or could turn out to be UCS-2...
|
|
||||||
return "UTF-16LE"; |
|
||||||
} |
|
||||||
if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) { |
|
||||||
// EBCDIC
|
|
||||||
// a la xerces1, return CP037 instead of EBCDIC here
|
|
||||||
return "CP037"; |
|
||||||
} |
|
||||||
|
|
||||||
// default encoding
|
|
||||||
return "UTF-8"; |
|
||||||
} |
|
||||||
} |
|
Loading…
Reference in new issue