Browse Source

Merge branch 'bugfix/10.0' of http://code.fineres.com/scm/~harrison/base-third into release/10.0

# Conflicts:
#	fine-itext-old/src/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java
#	fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java
release/10.0
Harrison 4 years ago
parent
commit
137f6be183
  1. 788
      fine-itext-old/src/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java
  2. 780
      fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java

788
fine-itext-old/src/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java

@ -0,0 +1,788 @@
/*
* Copyright 2004 Paulo Soares
*
* The contents of this file are subject to the Mozilla Public License Version 1.1
* (the "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the License.
*
* The Original Code is 'iText, a free JAVA-PDF library'.
*
* The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
* the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie.
* All Rights Reserved.
* Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
* are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved.
*
* Contributor(s): all the names of the contributors are added in the source code
* where applicable.
*
* Alternatively, the contents of this file may be used under the terms of the
* LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
* provisions of LGPL are applicable instead of those above. If you wish to
* allow use of your version of this file only under the terms of the LGPL
* License and not to allow others to use your version of this file under
* the MPL, indicate your decision by deleting the provisions above and
* replace them with the notice and other provisions required by the LGPL.
* If you do not delete the provisions above, a recipient may use your version
* of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the MPL as stated above or under the terms of the GNU
* Library General Public License as published by the Free Software Foundation;
* either version 2 of the License, or any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
* details.
*
* Contributions by:
* Lubos Strapko
*
* If you didn't download this code from the following link, you should check if
* you aren't using an obsolete version:
* http://www.lowagie.com/iText/
*/
package com.fr.third.com.lowagie.text.html.simpleparser;
import com.fr.third.com.lowagie.text.Chunk;
import com.fr.third.com.lowagie.text.DocListener;
import com.fr.third.com.lowagie.text.DocumentException;
import com.fr.third.com.lowagie.text.Element;
import com.fr.third.com.lowagie.text.ElementTags;
import com.fr.third.com.lowagie.text.ExceptionConverter;
import com.fr.third.com.lowagie.text.FontFactoryImp;
import com.fr.third.com.lowagie.text.HeaderFooter;
import com.fr.third.com.lowagie.text.Image;
import com.fr.third.com.lowagie.text.List;
import com.fr.third.com.lowagie.text.ListItem;
import com.fr.third.com.lowagie.text.Paragraph;
import com.fr.third.com.lowagie.text.Phrase;
import com.fr.third.com.lowagie.text.Rectangle;
import com.fr.third.com.lowagie.text.TextElementArray;
import com.fr.third.com.lowagie.text.html.CSSUtils;
import com.fr.third.com.lowagie.text.html.HtmlTags;
import com.fr.third.com.lowagie.text.html.Markup;
import com.fr.third.com.lowagie.text.pdf.PdfPTable;
import com.fr.third.com.lowagie.text.pdf.draw.LineSeparator;
import com.fr.third.com.lowagie.text.xml.simpleparser.SimpleXMLDocHandler;
import com.fr.third.com.lowagie.text.xml.simpleparser.SimpleXMLParser;
import com.fr.third.sun.misc.BASE64Decoder;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Stack;
import java.util.StringTokenizer;
public class HTMLWorker implements SimpleXMLDocHandler, DocListener {
/** Elements collected by {@link #add(Element)}; within this class it is only assigned by parseToList. */
protected ArrayList objectList;
/** The listener that receives the finished elements (paragraphs, lists, tables, images). */
protected DocListener document;
/** The paragraph currently being filled with chunks, or <CODE>null</CODE> when none is open. */
private Paragraph currentParagraph;
/** The chain of tag properties; entries are added in startElement and removed in endElement. */
private ChainedProperties cprops = new ChainedProperties();
/** Open container objects: saved paragraphs (for anchors), lists, list items, IncCell and IncTable instances. */
private Stack stack = new Stack();
/** <CODE>true</CODE> while a table row has been opened and not yet closed. */
private boolean pendingTR = false;
/** <CODE>true</CODE> while a table cell has been opened and not yet closed. */
private boolean pendingTD = false;
/** <CODE>true</CODE> while a list item has been opened and not yet closed. */
private boolean pendingLI = false;
/** The style sheet applied to the attributes of every tag and of the body. */
private StyleSheet style = new StyleSheet();
/** <CODE>true</CODE> while inside a pre tag; text is then added without whitespace folding. */
private boolean isPRE = false;
/** Saved {pendingTR, pendingTD} pairs, one per table that is currently open. */
private Stack tableState = new Stack();
/** When <CODE>true</CODE>, {@link #text(String)} discards its input. */
private boolean skipText = false;
/**
* Optional hooks supplied by the caller. Keys read by this class: "font_factory",
* "img_provider", "img_static", "img_baseurl", "img_interface" and "alink_interface".
*/
private HashMap interfaceProps;
/** Creates the chunks for this worker; receives the font factory from the interface properties. */
private FactoryProperties factoryProperties = new FactoryProperties();
/** Creates a new instance of HTMLWorker.
* The elements produced while parsing are added to the given listener.
* @param document A class that implements <CODE>DocListener</CODE>
* */
public HTMLWorker(DocListener document) {
this.document = document;
}
/**
* Replaces the style sheet that is applied to the attributes of each tag.
* @param style the new style sheet
*/
public void setStyleSheet(StyleSheet style) {
this.style = style;
}
/**
* Gets the style sheet that is applied to the attributes of each tag.
* @return the current style sheet
*/
public StyleSheet getStyleSheet() {
return style;
}
/**
* Stores the map of optional hooks used while parsing. When the map holds a
* font factory under the key "font_factory", that factory is handed to the
* factory properties of this worker.
* @param interfaceProps the hook map, may be <CODE>null</CODE>
*/
public void setInterfaceProps(HashMap interfaceProps) {
    this.interfaceProps = interfaceProps;
    if (interfaceProps == null) {
        return;
    }
    FontFactoryImp fontImp = (FontFactoryImp) interfaceProps.get("font_factory");
    if (fontImp != null) {
        factoryProperties.setFontImp(fontImp);
    }
}
/**
* Gets the map of optional hooks set with setInterfaceProps.
* @return the hook map, or <CODE>null</CODE> if none was set
*/
public HashMap getInterfaceProps() {
return interfaceProps;
}
/**
* Parses the content supplied by the reader, with this worker registered as
* the document handler.
* NOTE(review): the remaining arguments (null, true) presumably mean "no
* comment handler" and "HTML mode" - confirm against SimpleXMLParser.parse.
* @param reader the source of the HTML
* @throws IOException on error
*/
public void parse(Reader reader) throws IOException {
SimpleXMLParser.parse(this, null, reader, true);
}
/**
* Parses HTML into a list of elements, without any interface properties.
* @param reader the source of the HTML
* @param style the style sheet to use, or <CODE>null</CODE> for an empty one
* @return the list of elements that were created
* @throws IOException on error
*/
public static ArrayList parseToList(Reader reader, StyleSheet style)
        throws IOException {
    return parseToList(reader, style, null);
}
/**
* Parses HTML into a list of elements. A temporary worker acts as its own
* document listener, so every finished element ends up in the returned list.
* @param reader the source of the HTML
* @param style the style sheet to use, or <CODE>null</CODE> for an empty one
* @param interfaceProps optional hooks, may be <CODE>null</CODE>
* @return the list of elements that were created
* @throws IOException on error
*/
public static ArrayList parseToList(Reader reader, StyleSheet style,
        HashMap interfaceProps) throws IOException {
    HTMLWorker self = new HTMLWorker(null);
    // The worker listens to itself: add(Element) fills objectList.
    self.document = self;
    self.objectList = new ArrayList();
    if (style != null) {
        self.style = style;
    }
    self.setInterfaceProps(interfaceProps);
    self.parse(reader);
    return self.objectList;
}
/**
* Finishes the document: every object still on the stack is added to the
* listener, bottom first, followed by the paragraph that is still open.
* Any failure is rethrown wrapped in an <CODE>ExceptionConverter</CODE>.
*/
public void endDocument() {
    try {
        int index = 0;
        while (index < stack.size()) {
            Element pending = (Element) stack.elementAt(index);
            document.add(pending);
            index++;
        }
        if (currentParagraph != null) {
            document.add(currentParagraph);
            currentParagraph = null;
        }
    } catch (Exception e) {
        throw new ExceptionConverter(e);
    }
}
/**
* Starts the document by putting the style of the "body" tag at the bottom
* of the property chain.
*/
public void startDocument() {
    HashMap bodyProps = new HashMap();
    style.applyStyle("body", bodyProps);
    cprops.addToChain("body", bodyProps);
}
/**
 * Handles an opening tag reported by the parser.
 * <p>
 * The style sheet is applied to the attributes first. Tags listed in
 * <CODE>FactoryProperties.followTags</CODE>, anchors, line breaks, horizontal
 * rules, font/span and images are handled without closing the paragraph that
 * is currently open. Every other tag first closes the open paragraph and
 * then updates the property chain, the element stack and the pending flags.
 *
 * @param tag the tag name; tags that are not in <CODE>tagsSupported</CODE> are ignored
 * @param h the attributes of the tag; the map is modified (styles and defaults are merged into it)
 * @throws ExceptionConverter wrapping any exception raised while handling the tag
 */
public void startElement(String tag, HashMap h) {
    if (!tagsSupported.containsKey(tag))
        return;
    try {
        style.applyStyle(tag, h);
        if (tag.equals("p")) {
            // Paragraphs always get a fixed 16px top and bottom margin.
            h.put(Markup.CSS_KEY_MARGINTOP, "16px");
            h.put(Markup.CSS_KEY_MARGINBOTTOM, "16px");
        }
        String follow = (String) FactoryProperties.followTags.get(tag);
        if (follow != null) {
            HashMap prop = new HashMap();
            prop.put(follow, null);
            FactoryProperties.insertStyle(h, this.cprops);
            prop.putAll(h);
            cprops.addToChain(follow, prop);
            return;
        }
        FactoryProperties.insertStyle(h, cprops);
        if (tag.equals(HtmlTags.ANCHOR)) {
            cprops.addToChain(tag, h);
            if (currentParagraph == null) {
                currentParagraph = new Paragraph();
            }
            // The surrounding paragraph is parked on the stack; endElement("a") pops it again.
            stack.push(currentParagraph);
            currentParagraph = new Paragraph();
            return;
        }
        if (tag.equals(HtmlTags.NEWLINE)) {
            if (currentParagraph == null) {
                currentParagraph = new Paragraph();
            }
            currentParagraph.add(factoryProperties.createChunk("\n", cprops));
            return;
        }
        if (tag.equals(HtmlTags.HORIZONTALRULE)) {
            // Attempting to duplicate the behavior seen on Firefox with
            // http://www.w3schools.com/tags/tryit.asp?filename=tryhtml_hr_test
            // where an initial break is only inserted when the preceding element doesn't
            // end with a break, but a trailing break is always inserted.
            boolean addLeadingBreak = true;
            if (currentParagraph == null) {
                currentParagraph = new Paragraph();
                addLeadingBreak = false;
            }
            if (addLeadingBreak) { // Not a new paragraph
                int numChunks = currentParagraph.getChunks().size();
                if (numChunks == 0 ||
                        ((Chunk) (currentParagraph.getChunks().get(numChunks - 1))).getContent().endsWith("\n"))
                    addLeadingBreak = false;
            }
            String align = (String) h.get("align");
            int hrAlign = Element.ALIGN_CENTER;
            if (align != null) {
                if (align.equalsIgnoreCase("left"))
                    hrAlign = Element.ALIGN_LEFT;
                if (align.equalsIgnoreCase("right"))
                    hrAlign = Element.ALIGN_RIGHT;
            }
            String width = (String) h.get("width");
            float hrWidth = 1;
            if (width != null) {
                float tmpWidth = Markup.parseLength(width, Markup.DEFAULT_FONT_SIZE);
                if (tmpWidth > 0) hrWidth = tmpWidth;
                if (!width.endsWith("%"))
                    hrWidth = 100; // Treat a pixel width as 100% for now.
            }
            String size = (String) h.get("size");
            float hrSize = 1;
            if (size != null) {
                float tmpSize = Markup.parseLength(size, Markup.DEFAULT_FONT_SIZE);
                if (tmpSize > 0)
                    hrSize = tmpSize;
            }
            if (addLeadingBreak)
                currentParagraph.add(Chunk.NEWLINE);
            currentParagraph.add(new LineSeparator(hrSize, hrWidth, null, hrAlign, currentParagraph.getLeading()/2));
            currentParagraph.add(Chunk.NEWLINE);
            return;
        }
        if (tag.equals(HtmlTags.CHUNK) || tag.equals(HtmlTags.SPAN)) {
            cprops.addToChain(tag, h);
            return;
        }
        if (tag.equals(HtmlTags.IMAGE)) {
            startImage(tag, h);
            return;
        }
        // Every tag handled below starts a new block, so the open paragraph is closed first.
        endElement("p");
        if (tag.equals("h1") || tag.equals("h2") || tag.equals("h3")
                || tag.equals("h4") || tag.equals("h5") || tag.equals("h6")) {
            if (!h.containsKey(ElementTags.SIZE)) {
                // h1 maps to size 6, h6 to size 1.
                int v = 7 - Integer.parseInt(tag.substring(1));
                h.put(ElementTags.SIZE, Integer.toString(v));
            }
            cprops.addToChain(tag, h);
            return;
        }
        if (tag.equals(HtmlTags.UNORDEREDLIST)) {
            if (pendingLI)
                endElement(HtmlTags.LISTITEM);
            skipText = true;
            cprops.addToChain(tag, h);
            List list = new List(false);
            applyListIndent(list);
            list.setListSymbol("\u2022");
            stack.push(list);
            return;
        }
        if (tag.equals(HtmlTags.ORDEREDLIST)) {
            if (pendingLI)
                endElement(HtmlTags.LISTITEM);
            skipText = true;
            cprops.addToChain(tag, h);
            List list = new List(true);
            applyListIndent(list);
            stack.push(list);
            return;
        }
        if (tag.equals(HtmlTags.LISTITEM)) {
            if (pendingLI)
                endElement(HtmlTags.LISTITEM);
            skipText = false;
            pendingLI = true;
            cprops.addToChain(tag, h);
            ListItem item = FactoryProperties.createListItem(cprops);
            stack.push(item);
            return;
        }
        if (tag.equals(HtmlTags.DIV) || tag.equals(HtmlTags.BODY) || tag.equals("p")) {
            cprops.addToChain(tag, h);
            return;
        }
        if (tag.equals(HtmlTags.PRE)) {
            if (!h.containsKey(ElementTags.FACE)) {
                h.put(ElementTags.FACE, "Courier");
            }
            cprops.addToChain(tag, h);
            isPRE = true;
            return;
        }
        if (tag.equals("tr")) {
            if (pendingTR)
                endElement("tr");
            skipText = true;
            pendingTR = true;
            cprops.addToChain("tr", h);
            return;
        }
        if (tag.equals("td") || tag.equals("th")) {
            if (pendingTD)
                endElement(tag);
            skipText = false;
            pendingTD = true;
            // Header cells share the "td" chain entry with ordinary cells.
            cprops.addToChain("td", h);
            stack.push(new IncCell(tag, cprops));
            return;
        }
        if (tag.equals("table")) {
            cprops.addToChain("table", h);
            IncTable table = new IncTable(h);
            stack.push(table);
            // Remember the row/cell state of the enclosing table; endElement("table") restores it.
            tableState.push(new boolean[] { pendingTR, pendingTD });
            pendingTR = pendingTD = false;
            skipText = true;
            return;
        }
    } catch (Exception e) {
        throw new ExceptionConverter(e);
    }
}

/**
 * Handles an img tag: resolves the image, scales it and adds it either to
 * the document (when an align attribute is present) or, as a chunk, to the
 * current paragraph. An image whose data URI cannot be decoded is skipped.
 */
private void startImage(String tag, HashMap h) throws Exception {
    String src = (String) h.get(ElementTags.SRC);
    if (src == null)
        return;
    cprops.addToChain(tag, h);
    // Only a real "data:" URI is inline data; a relative path such as "data/logo.png" is not.
    boolean dataUri = src.startsWith("data:");
    Image img = null;
    if (interfaceProps != null) {
        ImageProvider ip = (ImageProvider) interfaceProps.get("img_provider");
        if (ip != null)
            img = ip.getImage(src, h, cprops, document);
        if (img == null) {
            HashMap images = (HashMap) interfaceProps.get("img_static");
            if (images != null) {
                Image tim = (Image) images.get(src);
                if (tim != null)
                    img = Image.getInstance(tim);
            } else {
                if (!dataUri && !src.startsWith("http")) { // relative src references only
                    String baseurl = (String) interfaceProps.get("img_baseurl");
                    if (baseurl != null) {
                        src = baseurl + src;
                        img = Image.getInstance(src);
                    }
                }
            }
        }
    }
    // Handle base64 encoded images (data URIs). A successfully decoded image
    // replaces whatever the hooks above returned.
    if (dataUri) {
        Image decoded = decodeDataUri(src);
        if (decoded != null)
            img = decoded;
    }
    if (img == null) {
        if (dataUri) {
            // A data URI is neither a file nor a URL, so there is nothing left to try.
            cprops.removeChain(tag);
            return;
        }
        if (!src.startsWith("http")) {
            String path = cprops.getProperty("image_path");
            if (path == null)
                path = "";
            src = new File(path, src).getPath();
        }
        img = Image.getInstance(src);
    }
    if (img == null) {
        cprops.removeChain(tag);
        return;
    }
    img.setSrcString(src);
    String align = (String) h.get("align");
    String width = (String) h.get("width");
    String height = (String) h.get("height");
    String before = cprops.getProperty("before");
    String after = cprops.getProperty("after");
    if (before != null)
        img.setSpacingBefore(Float.parseFloat(before));
    if (after != null)
        img.setSpacingAfter(Float.parseFloat(after));
    float actualFontSize = Markup.parseLength(cprops.getProperty(ElementTags.SIZE),
            Markup.DEFAULT_FONT_SIZE);
    if (actualFontSize <= 0f)
        actualFontSize = Markup.DEFAULT_FONT_SIZE;
    float widthInPoints = Markup.parseLength(width, actualFontSize);
    float heightInPoints = Markup.parseLength(height, actualFontSize);
    if (widthInPoints > 0 && heightInPoints > 0) {
        img.scaleAbsolute(widthInPoints, heightInPoints);
    } else if (widthInPoints > 0) {
        // Only the width is given: keep the aspect ratio.
        heightInPoints = img.getHeight() * widthInPoints / img.getWidth();
        img.scaleAbsolute(widthInPoints, heightInPoints);
    } else if (heightInPoints > 0) {
        // Only the height is given: keep the aspect ratio.
        widthInPoints = img.getWidth() * heightInPoints / img.getHeight();
        img.scaleAbsolute(widthInPoints, heightInPoints);
    }
    img.setWidthPercentage(0);
    if (align != null) {
        // An aligned image is a block of its own: close the paragraph and add it to the document.
        endElement("p");
        int ralign = Image.MIDDLE;
        if (align.equalsIgnoreCase("left"))
            ralign = Image.LEFT;
        else if (align.equalsIgnoreCase("right"))
            ralign = Image.RIGHT;
        img.setAlignment(ralign);
        Img i = null;
        boolean skip = false;
        if (interfaceProps != null) {
            i = (Img) interfaceProps.get("img_interface");
            if (i != null)
                skip = i.process(img, h, cprops, document);
        }
        if (!skip)
            document.add(img);
        cprops.removeChain(tag);
    } else {
        Chunk ck = new Chunk(img, 0, 0);
        if (cprops.hasPropertyInChain("img", "padding-left")) {
            String ss = cprops.getPropertyFromChain("img", "padding-left");
            ck.setAttribute("padding-left", Float.toString(Markup.parseLength(ss)));
        }
        if (cprops.hasPropertyInChain("img", "padding-right")) {
            String ss = cprops.getPropertyFromChain("img", "padding-right");
            ck.setAttribute("padding-right", Float.toString(Markup.parseLength(ss)));
        }
        cprops.removeChain(tag);
        if (currentParagraph == null) {
            currentParagraph = FactoryProperties.createParagraph(cprops);
        }
        currentParagraph.add(ck);
    }
}

/**
 * Decodes the base64 payload (the part after the last comma) of a data URI
 * into an image. Returns <CODE>null</CODE> when the payload cannot be
 * decoded or is not a recognized image.
 */
private static Image decodeDataUri(String src) {
    try {
        String[] srcArray = src.split(",");
        String base64string = srcArray[srcArray.length - 1];
        byte[] bytes = new BASE64Decoder().decodeBuffer(base64string);
        return Image.getInstance(bytes);
    } catch (Exception ignored) {
        // Best effort: a broken inline image must not abort the whole document.
        return null;
    }
}

/**
 * Sets the left indentation of a list from the "indent" property of the
 * chain; falls back to automatic indentation when the property is missing
 * or not a number.
 */
private void applyListIndent(List list) {
    String indent = cprops.getProperty("indent");
    if (indent != null) {
        try {
            list.setIndentationLeft(Float.parseFloat(indent));
            return;
        } catch (NumberFormatException ignored) {
            // Not a number: use automatic indentation below.
        }
    }
    list.setAutoindent(true);
}
/**
* Handles a closing tag reported by the parser. The method is also called by
* this class itself to close elements that are still pending (for instance
* endElement("p") before a new block starts).
* <p>
* Follow tags, font/span, anchors and line breaks are handled without
* touching the open paragraph. For every other tag the open paragraph is
* first flushed to the document or to the object on top of the stack.
* @param tag the tag name; tags that are not in <CODE>tagsSupported</CODE> are ignored
* @throws ExceptionConverter wrapping any exception raised while handling the tag
*/
public void endElement(String tag) {
if (!tagsSupported.containsKey(tag))
return;
try {
// Follow tags are stored in the chain under their follow name.
String follow = (String) FactoryProperties.followTags.get(tag);
if (follow != null) {
cprops.removeChain(follow);
return;
}
if (tag.equals("font") || tag.equals("span")) {
cprops.removeChain(tag);
return;
}
if (tag.equals("a")) {
if (currentParagraph == null) {
currentParagraph = new Paragraph();
}
// An optional "alink_interface" hook may take over the handling of the link.
boolean skip = false;
if (interfaceProps != null) {
ALink i = (ALink) interfaceProps.get("alink_interface");
if (i != null)
skip = i.process(currentParagraph, cprops);
}
if (!skip) {
// Turn every chunk collected since the opening tag into a link to href.
String href = cprops.getProperty("href");
if (href != null) {
ArrayList chunks = currentParagraph.getChunks();
int size = chunks.size();
for (int k = 0; k < size; ++k) {
Chunk ck = (Chunk) chunks.get(k);
ck.setAnchor(href);
}
}
}
// Pop the paragraph that startElement parked on the stack and append the
// link content to it, wrapped in a phrase.
Paragraph tmp = (Paragraph) stack.pop();
Phrase tmp2 = new Phrase();
tmp2.add(currentParagraph);
tmp.add(tmp2);
currentParagraph = tmp;
cprops.removeChain("a");
return;
}
// The line break was already added by startElement.
if (tag.equals("br")) {
return;
}
// Flush the open paragraph: to the document when the stack is empty,
// otherwise to the object on top of the stack if it can hold text elements.
if (currentParagraph != null) {
if (stack.empty())
document.add(currentParagraph);
else {
Object obj = stack.pop();
if (obj instanceof TextElementArray) {
TextElementArray current = (TextElementArray) obj;
current.add(currentParagraph);
}
stack.push(obj);
}
}
currentParagraph = null;
if (tag.equals(HtmlTags.UNORDEREDLIST)
|| tag.equals(HtmlTags.ORDEREDLIST)) {
if (pendingLI)
endElement(HtmlTags.LISTITEM);
skipText = false;
cprops.removeChain(tag);
if (stack.empty())
return;
Object obj = stack.pop();
// Only a list is taken off the stack; anything else is put back untouched.
if (!(obj instanceof List)) {
stack.push(obj);
return;
}
// A top-level list goes to the document, a nested one to its parent.
if (stack.empty())
document.add((Element) obj);
else
((TextElementArray) stack.peek()).add(obj);
return;
}
if (tag.equals(HtmlTags.LISTITEM)) {
pendingLI = false;
skipText = true;
cprops.removeChain(tag);
if (stack.empty())
return;
Object obj = stack.pop();
if (!(obj instanceof ListItem)) {
stack.push(obj);
return;
}
// A list item without a surrounding list is added to the document directly.
if (stack.empty()) {
document.add((Element) obj);
return;
}
// NOTE(review): when the object below the item is not a list, the item
// popped above is not pushed back and is therefore dropped.
Object list = stack.pop();
if (!(list instanceof List)) {
stack.push(list);
return;
}
ListItem item = (ListItem) obj;
((List) list).add(item);
// The list symbol takes the font of the first chunk of the item.
ArrayList cks = item.getChunks();
if (!cks.isEmpty())
item.getListSymbol()
.setFont(((Chunk) cks.get(0)).getFont());
stack.push(list);
return;
}
if (tag.equals("div") || tag.equals("body")) {
cprops.removeChain(tag);
return;
}
if (tag.equals(HtmlTags.PRE)) {
cprops.removeChain(tag);
isPRE = false;
return;
}
if (tag.equals("p")) {
cprops.removeChain(tag);
return;
}
if (tag.equals("h1") || tag.equals("h2") || tag.equals("h3")
|| tag.equals("h4") || tag.equals("h5") || tag.equals("h6")) {
cprops.removeChain(tag);
return;
}
if (tag.equals("table")) {
if (pendingTR)
endElement("tr");
cprops.removeChain("table");
IncTable table = (IncTable) stack.pop();
PdfPTable tb = table.buildTable();
tb.setSplitRows(true);
// A top-level table goes to the document, a nested one to its parent.
if (stack.empty())
document.add(tb);
else
((TextElementArray) stack.peek()).add(tb);
// Restore the row/cell state that was saved when this table was opened.
boolean state[] = (boolean[]) tableState.pop();
pendingTR = state[0];
pendingTD = state[1];
skipText = false;
return;
}
if (tag.equals("tr")) {
if (pendingTD)
endElement("td");
pendingTR = false;
// The row height has to be read before the "tr" entry leaves the chain.
String rowHeightPx = cprops.getLastChainProperty("height");
cprops.removeChain("tr");
ArrayList cells = new ArrayList();
IncTable table = null;
// Pop the cells of this row until the table itself is reached; the cells are
// collected in pop order, i.e. the most recently opened cell comes first.
while (true) {
Object obj = stack.pop();
if (obj instanceof IncCell) {
cells.add(((IncCell) obj).getCell());
}
if (obj instanceof IncTable) {
table = (IncTable) obj;
break;
}
}
float rowHeight = 0.0f;
if(rowHeightPx!=null){
rowHeight = CSSUtils.parseFloat(rowHeightPx);
}
table.addCols(cells);
table.endRow(rowHeight);
stack.push(table);
skipText = true;
return;
}
if (tag.equals("td") || tag.equals("th")) {
// The IncCell stays on the stack; it is collected when the row ends.
pendingTD = false;
cprops.removeChain("td");
skipText = true;
return;
}
} catch (Exception e) {
throw new ExceptionConverter(e);
}
}
/**
* Receives character data found between tags and appends it, as a chunk, to
* the current paragraph (creating the paragraph when none is open).
* Nothing happens while text is being skipped. Inside a pre tag the text is
* used as is; otherwise whitespace is folded, and text that trims to nothing
* and contains no space character is dropped.
* @param str the character data
*/
public void text(String str) {
    if (skipText) {
        return;
    }
    String rendered;
    if (isPRE) {
        rendered = str;
    } else {
        boolean blank = str.trim().length() == 0;
        if (blank && str.indexOf(' ') < 0) {
            return;
        }
        rendered = collapseWhitespace(str);
    }
    if (currentParagraph == null) {
        currentParagraph = FactoryProperties.createParagraph(cprops);
    }
    currentParagraph.add(factoryProperties.createChunk(rendered, cprops));
}

/**
* Folds the whitespace of a piece of text: carriage returns and tabs are
* removed, a newline that is not the first character becomes a single space,
* and spaces that follow such a newline are dropped until the next other
* character.
*/
private static String collapseWhitespace(String raw) {
    StringBuffer folded = new StringBuffer();
    boolean afterNewline = false;
    for (int pos = 0, total = raw.length(); pos < total; pos++) {
        char c = raw.charAt(pos);
        if (c == ' ') {
            if (!afterNewline) {
                folded.append(c);
            }
        } else if (c == '\n') {
            if (pos > 0) {
                afterNewline = true;
                folded.append(' ');
            }
        } else if (c == '\r' || c == '\t') {
            // dropped without touching the newline flag
        } else {
            afterNewline = false;
            folded.append(c);
        }
    }
    return folded.toString();
}
/**
* Collects an element in <CODE>objectList</CODE>. This is the listener side
* used by parseToList, where the worker acts as its own document.
* NOTE(review): within this class objectList is only assigned by parseToList;
* calling this method on a worker created otherwise would fail on a null list.
* @param element the element to collect
* @return always <CODE>true</CODE>
* @throws DocumentException never thrown by this implementation
*/
public boolean add(Element element) throws DocumentException {
objectList.add(element);
return true;
}
// The methods below complete the DocListener implementation. They have no
// effect in this class: each one either does nothing or returns a constant.

/** Does nothing. */
public void clearTextWrap() throws DocumentException {
}
/** Does nothing. */
public void close() {
}
/**
* Does nothing.
* @return always <CODE>true</CODE>
*/
public boolean newPage() {
return true;
}
/** Does nothing. */
public void open() {
}
/** Does nothing. */
public void resetFooter() {
}
/** Does nothing. */
public void resetHeader() {
}
/** Does nothing. */
public void resetPageCount() {
}
/** Does nothing; the footer is ignored. */
public void setFooter(HeaderFooter footer) {
}
/** Does nothing; the header is ignored. */
public void setHeader(HeaderFooter header) {
}
/**
* Does nothing; the setting is ignored.
* @return always <CODE>false</CODE>
*/
public boolean setMarginMirroring(boolean marginMirroring) {
return false;
}
/**
* Does nothing; the setting is ignored.
* @return always <CODE>false</CODE>
* @see DocListener#setMarginMirroring(boolean)
* @since 2.1.6
*/
public boolean setMarginMirroringTopBottom(boolean marginMirroring) {
return false;
}
/**
* Does nothing; the margins are ignored.
* @return always <CODE>true</CODE>
*/
public boolean setMargins(float marginLeft, float marginRight,
float marginTop, float marginBottom) {
return true;
}
/** Does nothing; the page count is ignored. */
public void setPageCount(int pageN) {
}
/**
* Does nothing; the page size is ignored.
* @return always <CODE>true</CODE>
*/
public boolean setPageSize(Rectangle pageSize) {
return true;
}
/** The names of all supported tags, separated by single spaces. */
public static final String tagsSupportedString = "ol ul li a pre font span br p div body table td th tr i b u sub sup em strong s strike"
        + " h1 h2 h3 h4 h5 h6 img hr";
/** Has one key (with a <CODE>null</CODE> value) for every supported tag name. */
public static final HashMap tagsSupported = new HashMap();
/** Has one key (with a <CODE>null</CODE> value) for the first character of every supported tag name. */
public static final HashMap tagsPrefixSupported = new HashMap();
static {
    String[] names = tagsSupportedString.split(" ");
    for (int i = 0; i < names.length; i++) {
        String name = names[i];
        tagsSupported.put(name, null);
        tagsPrefixSupported.put(Character.valueOf(name.charAt(0)), null);
    }
}
}

780
fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java

@ -0,0 +1,780 @@
/*
* Copyright 2003 Paulo Soares
*
* The contents of this file are subject to the Mozilla Public License Version 1.1
* (the "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the License.
*
* The Original Code is 'iText, a free JAVA-PDF library'.
*
* The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
* the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie.
* All Rights Reserved.
* Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
* are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved.
*
* Contributor(s): all the names of the contributors are added in the source code
* where applicable.
*
* Alternatively, the contents of this file may be used under the terms of the
* LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
* provisions of LGPL are applicable instead of those above. If you wish to
* allow use of your version of this file only under the terms of the LGPL
* License and not to allow others to use your version of this file under
* the MPL, indicate your decision by deleting the provisions above and
* replace them with the notice and other provisions required by the LGPL.
* If you do not delete the provisions above, a recipient may use your version
* of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the MPL as stated above or under the terms of the GNU
* Library General Public License as published by the Free Software Foundation;
* either version 2 of the License, or any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
* details.
*
* If you didn't download this code from the following link, you should check if
* you aren't using an obsolete version:
* http://www.lowagie.com/iText/
*
* The code to recognize the encoding in this class and in the convenience class IanaEncodings was taken from Apache Xerces published under the following license:
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Part of this code is based on the Quick-and-Dirty XML parser by Steven Brandt.
* The code for the Quick-and-Dirty parser was published in JavaWorld (java tip 128).
* Steven Brandt and JavaWorld gave permission to use the code for free.
* (Bruno Lowagie and Paulo Soares chose to use it under the MPL/LGPL in
* conformance with the rest of the code).
* The original code can be found on this url: <A HREF="http://www.javaworld.com/javatips/jw-javatip128_p.html">http://www.javaworld.com/javatips/jw-javatip128_p.html</A>.
* It was substantially refactored by Bruno Lowagie.
*
* The method 'private static String getEncodingName(byte[] b4)' was found
* in org.apache.xerces.impl.XMLEntityManager, originaly published by the
* Apache Software Foundation under the Apache Software License; now being
* used in iText under the MPL.
*/
package com.fr.third.com.lowagie.text.xml.simpleparser;
import com.fr.third.com.lowagie.text.html.simpleparser.HTMLWorker;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.Stack;
/**
* A simple XML and HTML parser. This parser is, like the SAX parser,
* an event based parser, but with much less functionality.
* <p>
* The parser can:
* <p>
* <ul>
* <li>It recognizes the encoding used
* <li>It recognizes all the elements' start tags and end tags
* <li>It lists attributes, where attribute values can be enclosed in single or double quotes
* <li>It recognizes the <code>&lt;[CDATA[ ... ]]&gt;</code> construct
* <li>It recognizes the standard entities: &amp;amp;, &amp;lt;, &amp;gt;, &amp;quot;, and &amp;apos;, as well as numeric entities
* <li>It maps lines ending in <code>\r\n</code> and <code>\r</code> to <code>\n</code> on input, in accordance with the XML Specification, Section 2.11
* </ul>
* <p>
*/
public final class SimpleXMLParser {
/** possible states */
private final static int UNKNOWN = 0;
private final static int TEXT = 1;
private final static int TAG_ENCOUNTERED = 2;
private final static int EXAMIN_TAG = 3;
private final static int TAG_EXAMINED = 4;
private final static int IN_CLOSETAG = 5;
private final static int SINGLE_TAG = 6;
private final static int CDATA = 7;
private final static int COMMENT = 8;
private final static int PI = 9;
private final static int ENTITY = 10;
private final static int QUOTE = 11;
private final static int ATTRIBUTE_KEY = 12;
private final static int ATTRIBUTE_EQUAL = 13;
private final static int ATTRIBUTE_VALUE = 14;
/** the state stack */
Stack stack;
/** The current character. */
int character = 0;
/** The previous character. */
int previousCharacter = -1;
/** the line we are currently reading */
int lines = 1;
/** the column where the current character occurs */
int columns = 0;
/** was the last character equivalent to a newline? */
boolean eol = false;
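/**
* A boolean indicating if the next character should be taken into account
* if it's a space character. When nowhite is false, the previous character
* was whitespace or no text has been collected yet, so a whitespace
* character is dropped; when it is true, the previous character wasn't
* whitespace.
* @since 2.1.5
*/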
boolean nowhite = false;
/** the current state */
int state;
/** Are we parsing HTML? */
boolean html;
/** current text (whatever is encountered between tags) */
StringBuffer text = new StringBuffer();
/** current entity (whatever is encountered between & and ;) */
StringBuffer entity = new StringBuffer();
/** current tagname */
String tag = null;
/** current attributes */
HashMap attributes = null;
/** The handler to which we are going to forward document content */
SimpleXMLDocHandler doc;
/** The handler to which we are going to forward comments. */
SimpleXMLDocHandlerComment comment;
/** Keeps track of the number of tags that are open. */
int nested = 0;
/** the quote character that was used to open the quote. */
int quoteCharacter = '"';
/** the attribute key. */
String attributekey = null;
/** the attribute value. */
String attributevalue = null;
/**
* Creates a Simple XML parser object.
* Call go(Reader) immediately after creation.
* An HTML parser starts in the TEXT state, an XML parser in the UNKNOWN state.
* @param doc the handler that receives the document content
* @param comment the handler that receives the comments
* @param html <CODE>true</CODE> to parse HTML, <CODE>false</CODE> to parse XML
*/
private SimpleXMLParser(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, boolean html) {
    this.html = html;
    this.doc = doc;
    this.comment = comment;
    this.stack = new Stack();
    if (html) {
        this.state = TEXT;
    } else {
        this.state = UNKNOWN;
    }
}
/**
* Does the actual parsing. Perform this immediately
* after creating the parser object.
*/
private void go(Reader r) throws IOException {
BufferedReader reader;
if (r instanceof BufferedReader)
reader = (BufferedReader)r;
else
reader = new BufferedReader(r);
doc.startDocument();
while(true) {
// read a new character
if (previousCharacter == -1) {
character = reader.read();
}
// or re-examine the previous character
else {
character = previousCharacter;
previousCharacter = -1;
}
// the end of the file was reached
if (character == -1) {
if (html) {
if (html && state == TEXT)
flush();
doc.endDocument();
} else {
throwException("Missing end tag");
}
return;
}
// dealing with \n and \r
if (character == '\n' && eol) {
eol = false;
continue;
} else if (eol) {
eol = false;
} else if (character == '\n') {
lines++;
columns = 0;
} else if (character == '\r') {
eol = true;
character = '\n';
lines++;
columns = 0;
} else {
columns++;
}
switch(state) {
// we are in an unknown state before there's actual content
case UNKNOWN:
if(character == '<') {
beginnOfTag((char) reader.read(), UNKNOWN);
}
break;
// we can encounter any content
case TEXT:
if(character == '<') {
beginnOfTag((char) reader.read(), TEXT);
} else if(character == '&') {
saveState(state);
entity.setLength(0);
state = ENTITY;
} else if (Character.isWhitespace((char)character) && character != 12288) {
if (nowhite)
text.append((char)character);
nowhite = false;
} else {
text.append((char)character);
nowhite = true;
}
break;
// we have just seen a < and are wondering what we are looking at
// <foo>, </foo>, <!-- ... --->, etc.
case TAG_ENCOUNTERED:
initTag();
if(character == '/') {
state = IN_CLOSETAG;
} else if (character == '?') {
restoreState();
state = PI;
} else {
text.append((char)character);
state = EXAMIN_TAG;
}
break;
// we are processing something like this <foo ... >.
// It could still be a <!-- ... --> or something.
case EXAMIN_TAG:
if(character == '>') {
doTag();
processTag(true);
initTag();
state = restoreState();
} else if(character == '/') {
state = SINGLE_TAG;
} else if(character == '-' && text.toString().equals("!-")) {
flush();
state = COMMENT;
} else if(character == '[' && text.toString().equals("![CDATA")) {
flush();
state = CDATA;
} else if(character == 'E' && text.toString().equals("!DOCTYP")) {
flush();
state = PI;
} else if(Character.isWhitespace((char)character)) {
doTag();
state = TAG_EXAMINED;
} else {
text.append((char)character);
}
break;
// we know the name of the tag now.
case TAG_EXAMINED:
if(character == '>') {
processTag(true);
initTag();
state = restoreState();
} else if(character == '/') {
state = SINGLE_TAG;
} else if(Character.isWhitespace((char)character)) {
// empty
} else {
text.append((char)character);
state = ATTRIBUTE_KEY;
}
break;
// we are processing a closing tag: e.g. </foo>
case IN_CLOSETAG:
if(character == '>') {
doTag();
processTag(false);
if(!html && nested==0) return;
state = restoreState();
} else {
if (!Character.isWhitespace((char)character))
text.append((char)character);
}
break;
// we have just seen something like this: <foo a="b"/
// and are looking for the final >.
case SINGLE_TAG:
if(character != '>')
throwException("Expected > for tag: <"+tag+"/>");
doTag();
processTag(true);
processTag(false);
initTag();
if(!html && nested==0) {
doc.endDocument();
return;
}
state = restoreState();
break;
// we are processing CDATA
case CDATA:
if(character == '>'
&& text.toString().endsWith("]]")) {
text.setLength(text.length()-2);
flush();
state = restoreState();
} else
text.append((char)character);
break;
// we are processing a comment. We are inside
// the <!-- .... --> looking for the -->.
case COMMENT:
if(character == '>'
&& text.toString().endsWith("--")) {
text.setLength(text.length() - 2);
flush();
state = restoreState();
} else
text.append((char)character);
break;
// We are inside one of these <? ... ?> or one of these <!DOCTYPE ... >
case PI:
if(character == '>') {
state = restoreState();
if(state == TEXT) state = UNKNOWN;
}
break;
// we are processing an entity, e.g. &lt;, &#187;, etc.
case ENTITY:
if(character == ';') {
state = restoreState();
String cent = entity.toString();
entity.setLength(0);
char ce = EntitiesToUnicode.decodeEntity(cent);
if (ce == '\0')
text.append('&').append(cent).append(';');
else
text.append(ce);
} else if ((character != '#' && (character < '0' || character > '9') && (character < 'a' || character > 'z')
&& (character < 'A' || character > 'Z')) || entity.length() >= 7) {
state = restoreState();
previousCharacter = character;
text.append('&').append(entity.toString());
entity.setLength(0);
}
else {
entity.append((char)character);
}
break;
// We are processing the quoted right-hand side of an element's attribute.
case QUOTE:
if (html && quoteCharacter == ' ' && character == '>') {
flush();
processTag(true);
initTag();
state = restoreState();
}
else if (html && quoteCharacter == ' ' && Character.isWhitespace((char)character)) {
flush();
state = TAG_EXAMINED;
}
else if (html && quoteCharacter == ' ') {
text.append((char)character);
}
else if(character == quoteCharacter) {
flush();
state = TAG_EXAMINED;
} else if(" \r\n\u0009".indexOf(character)>=0) {
text.append(' ');
} else if(character == '&') {
saveState(state);
state = ENTITY;
entity.setLength(0);
} else {
text.append((char)character);
}
break;
case ATTRIBUTE_KEY:
if(Character.isWhitespace((char)character)) {
flush();
state = ATTRIBUTE_EQUAL;
} else if(character == '=') {
flush();
state = ATTRIBUTE_VALUE;
} else if (html && character == '>') {
text.setLength(0);
processTag(true);
initTag();
state = restoreState();
} else {
text.append((char)character);
}
break;
case ATTRIBUTE_EQUAL:
if(character == '=') {
state = ATTRIBUTE_VALUE;
} else if(Character.isWhitespace((char)character)) {
// empty
} else if (html && character == '>') {
text.setLength(0);
processTag(true);
initTag();
state = restoreState();
} else if (html && character == '/') {
flush();
state = SINGLE_TAG;
} else if (html) {
flush();
text.append((char)character);
state = ATTRIBUTE_KEY;
} else {
throwException("Error in attribute processing.");
}
break;
case ATTRIBUTE_VALUE:
if(character == '"' || character == '\'') {
quoteCharacter = character;
state = QUOTE;
} else if(Character.isWhitespace((char)character)) {
// empty
} else if (html && character == '>') {
flush();
processTag(true);
initTag();
state = restoreState();
} else if (html) {
text.append((char)character);
quoteCharacter = ' ';
state = QUOTE;
} else {
throwException("Error in attribute processing");
}
break;
}
}
}
/**
 * Pops the most recently saved parser state off the state stack.
 * @return the state that was on top of the stack, or <CODE>UNKNOWN</CODE>
 *         when the stack holds no saved state
 */
private int restoreState() {
    if (stack.empty()) {
        return UNKNOWN;
    }
    final Integer saved = (Integer) stack.pop();
    return saved.intValue();
}
/**
 * Pushes a state onto the state stack so that it can later be
 * recovered with <CODE>restoreState()</CODE>.
 * @param s the state to remember
 */
private void saveState(int s) {
    // Integer.valueOf replaces the deprecated boxing constructor and may
    // hand back a cached instance instead of allocating a new one.
    stack.push(Integer.valueOf(s));
}
/**
 * Handles the character that follows a tag opener. When that character is
 * '/' or one of the prefixes listed in <CODE>HTMLWorker.tagsPrefixSupported</CODE>,
 * the parser switches to tag processing; otherwise the opener is kept as
 * plain text, e.g. <CODE>&lt;1111</CODE> (this imitates what browsers do).
 * @param c the character that follows the tag opener; <CODE>(char) -1</CODE>
 *          is treated as end of input
 * @param type the state the parser was in; buffered text is flushed only
 *             when this is <CODE>TEXT</CODE>
 */
public void beginnOfTag(char c, int type) {
    // A char is unsigned, so it can never compare equal to the int -1. The
    // end-of-input marker has to be matched in its narrowed form instead.
    // NOTE(review): assumes the caller narrows Reader.read() to char and that
    // -1 in previousCharacter means "nothing pushed back" - confirm in go().
    if (c == (char) -1) {
        previousCharacter = -1;
        return;
    }
    // Push the look-ahead character back so it is examined again.
    previousCharacter = c;
    if (c == '/' || HTMLWorker.tagsPrefixSupported.containsKey(c)) {
        if (type == TEXT) {
            flush();
        }
        saveState(TEXT);
        state = TAG_ENCOUNTERED;
        return;
    }
    // Not a supported tag: keep the parser's current character as text
    // (presumably the tag opener itself - confirm against the caller).
    text.append((char) character);
    nowhite = true;
}
/**
 * Flushes the text that is currently in the buffer and empties the buffer.
 * What happens to the text depends on the current state:
 * in <CODE>TEXT</CODE> and <CODE>CDATA</CODE> non-empty text is sent to the
 * document handler; in <CODE>COMMENT</CODE> it is sent to the comment handler
 * when one is set; in <CODE>ATTRIBUTE_KEY</CODE> it becomes the pending
 * attribute name; in <CODE>QUOTE</CODE> and <CODE>ATTRIBUTE_VALUE</CODE> it is
 * stored as the value of the pending attribute. In every other state the
 * text is discarded.
 */
private void flush() {
    switch (state) {
        case TEXT:
        case CDATA:
            if (text.length() > 0) {
                doc.text(text.toString());
            }
            break;
        case COMMENT:
            if (comment != null) {
                comment.comment(text.toString());
            }
            break;
        case ATTRIBUTE_KEY:
            attributekey = text.toString();
            if (html) {
                // Lower-case independently of the default locale; with a
                // Turkish locale "WIDTH" would otherwise not become "width".
                attributekey = attributekey.toLowerCase(java.util.Locale.ROOT);
            }
            break;
        case QUOTE:
        case ATTRIBUTE_VALUE:
            attributevalue = text.toString();
            attributes.put(attributekey, attributevalue);
            break;
        default:
            // nothing to hand over in the remaining states
            break;
    }
    text.setLength(0);
}
/**
 * Resets the per-tag bookkeeping: a fresh, empty attribute map is
 * installed and the tag name is cleared.
 */
private void initTag() {
    attributes = new HashMap();
    tag = null;
}
/**
 * Sets the name of the tag from the buffered text, unless a name is
 * already known, and empties the buffer. In HTML mode the name is
 * converted to lower case.
 */
private void doTag() {
    if (tag == null) {
        tag = text.toString();
    }
    if (html) {
        // Lower-case independently of the default locale; with a Turkish
        // locale "TITLE" would otherwise not become "title".
        tag = tag.toLowerCase(java.util.Locale.ROOT);
    }
    text.setLength(0);
}
/**
 * Reports the current tag to the document handler and keeps the nesting
 * counter in step.
 * @param start <CODE>true</CODE> for a tag that has just been opened,
 *              <CODE>false</CODE> for a tag that is being closed
 */
private void processTag(boolean start) {
    if (!start) {
        // closing tag: one level up, then notify the handler
        nested--;
        doc.endElement(tag);
        return;
    }
    // opening tag: one level down, then notify the handler
    nested++;
    doc.startElement(tag, attributes);
}
/**
 * Raises a parse error whose message is completed with the current line
 * and column counters.
 * @param s the description of the problem
 * @throws IOException always
 */
private void throwException(String s) throws IOException {
    final String message = s + " near line " + lines + ", column " + columns;
    throw new IOException(message);
}
/**
 * Parses the document supplied by a reader, firing the events to the handlers.
 * @param doc the document handler
 * @param comment the handler passed on to the parser for comments;
 *                the overload without this parameter passes <CODE>null</CODE>
 * @param r the document. The encoding is already resolved. The reader is not closed
 * @param html the HTML flag handed to the parser
 * @throws IOException on error
 */
public static void parse(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, Reader r, boolean html) throws IOException {
    new SimpleXMLParser(doc, comment, html).go(r);
}
/**
 * Parses the XML document firing the events to the handler.
 * The encoding is guessed from the first four bytes and, for UTF-8 and
 * CP037 input, refined with the <CODE>encoding</CODE> value found in the
 * bytes up to the first '&gt;'. All bytes read for this detection are
 * consumed; the parser only sees what follows them in the stream.
 * @param doc the document handler
 * @param in the document. The encoding is deduced from the stream. The stream is not closed
 * @throws IOException on error, or when the stream holds fewer than four bytes
 */
public static void parse(SimpleXMLDocHandler doc, InputStream in) throws IOException {
    // A single read(byte[]) may return fewer bytes than requested although
    // more will follow, so keep reading until the signature is complete or
    // the stream really ends.
    byte[] b4 = new byte[4];
    int count = 0;
    while (count < b4.length) {
        int n = in.read(b4, count, b4.length - count);
        if (n < 0) {
            break;
        }
        count += n;
    }
    if (count != b4.length) {
        throw new IOException("Insufficient length.");
    }
    String encoding = getEncodingName(b4);
    String decl = null;
    if (encoding.equals("UTF-8")) {
        // Collect the remainder of the first markup construct byte by byte.
        StringBuilder sb = new StringBuilder();
        int c;
        while ((c = in.read()) != -1) {
            if (c == '>') {
                break;
            }
            sb.append((char) c);
        }
        decl = sb.toString();
    }
    else if (encoding.equals("CP037")) {
        ByteArrayOutputStream bi = new ByteArrayOutputStream();
        int c;
        while ((c = in.read()) != -1) {
            if (c == 0x6e) { // that's '>' in ebcdic
                break;
            }
            bi.write(c);
        }
        decl = new String(bi.toByteArray(), "CP037");
    }
    if (decl != null) {
        // An explicitly declared encoding wins over the guessed one.
        decl = getDeclaredEncoding(decl);
        if (decl != null) {
            encoding = decl;
        }
    }
    parse(doc, new InputStreamReader(in, IanaEncodings.getJavaEncoding(encoding)));
}
/**
 * Extracts the quoted value that follows the word <CODE>encoding</CODE>
 * in a declaration.
 * @param decl the declaration text, may be <CODE>null</CODE>
 * @return the text between the first quote character found after
 *         <CODE>encoding</CODE> and the next quote of the same kind, or
 *         <CODE>null</CODE> when the word, the opening quote or the
 *         closing quote is missing
 */
private static String getDeclaredEncoding(String decl) {
    if (decl == null) {
        return null;
    }
    final int keyword = decl.indexOf("encoding");
    if (keyword < 0) {
        return null;
    }
    final int doubleQuote = decl.indexOf('"', keyword);
    final int singleQuote = decl.indexOf('\'', keyword);

    // Whichever quote character shows up first delimits the value.
    final char quote;
    final int open;
    if (singleQuote > 0 && (doubleQuote < 0 || singleQuote < doubleQuote)) {
        quote = '\'';
        open = singleQuote;
    } else if (doubleQuote > 0 && (singleQuote < 0 || doubleQuote < singleQuote)) {
        quote = '"';
        open = doubleQuote;
    } else {
        return null;
    }

    final int close = decl.indexOf(quote, open + 1);
    return close < 0 ? null : decl.substring(open + 1, close);
}
/**
 * Parses the XML document supplied by a reader, firing the events to the
 * handler. No comment handler is used and the HTML flag is off.
 * @param doc the document handler
 * @param r the document. The reader is not closed
 * @throws IOException on error
 */
public static void parse(SimpleXMLDocHandler doc,Reader r) throws IOException {
    final boolean htmlMode = false;
    parse(doc, null, r, htmlMode);
}
/**
 * Escapes a string with the appropriated XML codes.
 * The five markup characters are replaced by their named entities.
 * Every other character is kept only when it lies in one of the ranges
 * tested below; anything else, including unpaired surrogates, is dropped.
 * The string is walked by code point so that characters above U+FFFF,
 * which are stored as surrogate pairs, are kept instead of being discarded.
 * @param s the string to be escaped
 * @param onlyASCII codes above 127 will always be escaped with &amp;#nn; if <CODE>true</CODE>
 * @return the escaped string
 */
public static String escapeXML(String s, boolean onlyASCII) {
    final int len = s.length();
    StringBuilder sb = new StringBuilder(len);
    int k = 0;
    while (k < len) {
        // A code point may span two chars; advance by its real width.
        final int c = s.codePointAt(k);
        k += Character.charCount(c);
        switch (c) {
            case '<':
                sb.append("&lt;");
                break;
            case '>':
                sb.append("&gt;");
                break;
            case '&':
                sb.append("&amp;");
                break;
            case '"':
                sb.append("&quot;");
                break;
            case '\'':
                sb.append("&apos;");
                break;
            default:
                if ((c == 0x9) || (c == 0xA) || (c == 0xD)
                        || ((c >= 0x20) && (c <= 0xD7FF))
                        || ((c >= 0xE000) && (c <= 0xFFFD))
                        || ((c >= 0x10000) && (c <= 0x10FFFF))) {
                    if (onlyASCII && c > 127) {
                        // numeric character reference, decimal code point
                        sb.append("&#").append(c).append(';');
                    } else {
                        sb.appendCodePoint(c);
                    }
                }
                break;
        }
    }
    return sb.toString();
}
/**
 * Guesses the IANA encoding name, including endian-ness where it matters,
 * from the first bytes of a document.
 * (method found in org.apache.xerces.impl.XMLEntityManager, originally published
 * by the Apache Software Foundation under the Apache Software License; now being
 * used in iText under the MPL)
 * @param b4 The first four bytes of the input.
 * @return an IANA-encoding string; <CODE>"UTF-8"</CODE> when nothing else matches
 */
private static String getEncodingName(byte[] b4) {
    // Byte order marks first. The bytes are fetched only as far as each
    // check needs them.
    final int lead = ((b4[0] & 0xFF) << 8) | (b4[1] & 0xFF);
    if (lead == 0xFEFF) {
        return "UTF-16BE"; // UTF-16 BOM, big-endian
    }
    if (lead == 0xFFFE) {
        return "UTF-16LE"; // UTF-16 BOM, little-endian
    }
    final int third = b4[2] & 0xFF;
    if (lead == 0xEFBB && third == 0xBF) {
        return "UTF-8"; // UTF-8 BOM
    }

    // No BOM: compare all four bytes against the known renderings of the
    // first characters of a document.
    final int signature = (lead << 16) | (third << 8) | (b4[3] & 0xFF);
    switch (signature) {
        case 0x0000003C: // UCS-4, big endian (1234)
        case 0x3C000000: // UCS-4, little endian (4321)
        case 0x00003C00: // UCS-4, unusual octet order (2143)
        case 0x003C0000: // UCS-4, unusual octet order (3412)
            return "ISO-10646-UCS-4";
        case 0x003C003F: // UTF-16, big-endian, no BOM (or could be UCS-2)
            return "UTF-16BE";
        case 0x3C003F00: // UTF-16, little-endian, no BOM (or could be UCS-2)
            return "UTF-16LE";
        case 0x4C6FA794: // EBCDIC; a la xerces1, CP037 is returned
            return "CP037";
        default:
            return "UTF-8";
    }
}
}
Loading…
Cancel
Save