diff --git a/fine-itext-old/src/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java b/fine-itext-old/src/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java new file mode 100644 index 000000000..11e918722 --- /dev/null +++ b/fine-itext-old/src/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java @@ -0,0 +1,788 @@ +/* + * Copyright 2004 Paulo Soares + * + * The contents of this file are subject to the Mozilla Public License Version 1.1 + * (the "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the License. + * + * The Original Code is 'iText, a free JAVA-PDF library'. + * + * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by + * the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie. + * All Rights Reserved. + * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer + * are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved. + * + * Contributor(s): all the names of the contributors are added in the source code + * where applicable. + * + * Alternatively, the contents of this file may be used under the terms of the + * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the + * provisions of LGPL are applicable instead of those above. If you wish to + * allow use of your version of this file only under the terms of the LGPL + * License and not to allow others to use your version of this file under + * the MPL, indicate your decision by deleting the provisions above and + * replace them with the notice and other provisions required by the LGPL. 
+ * If you do not delete the provisions above, a recipient may use your version + * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE. + * + * This library is free software; you can redistribute it and/or modify it + * under the terms of the MPL as stated above or under the terms of the GNU + * Library General Public License as published by the Free Software Foundation; + * either version 2 of the License, or any later version. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more + * details. + * + * Contributions by: + * Lubos Strapko + * + * If you didn't download this code from the following link, you should check if + * you aren't using an obsolete version: + * http://www.lowagie.com/iText/ + */ + +package com.fr.third.com.lowagie.text.html.simpleparser; + +import com.fr.third.com.lowagie.text.Chunk; +import com.fr.third.com.lowagie.text.DocListener; +import com.fr.third.com.lowagie.text.DocumentException; +import com.fr.third.com.lowagie.text.Element; +import com.fr.third.com.lowagie.text.ElementTags; +import com.fr.third.com.lowagie.text.ExceptionConverter; +import com.fr.third.com.lowagie.text.FontFactoryImp; +import com.fr.third.com.lowagie.text.HeaderFooter; +import com.fr.third.com.lowagie.text.Image; +import com.fr.third.com.lowagie.text.List; +import com.fr.third.com.lowagie.text.ListItem; +import com.fr.third.com.lowagie.text.Paragraph; +import com.fr.third.com.lowagie.text.Phrase; +import com.fr.third.com.lowagie.text.Rectangle; +import com.fr.third.com.lowagie.text.TextElementArray; +import com.fr.third.com.lowagie.text.html.CSSUtils; +import com.fr.third.com.lowagie.text.html.HtmlTags; +import com.fr.third.com.lowagie.text.html.Markup; +import com.fr.third.com.lowagie.text.pdf.PdfPTable; +import 
com.fr.third.com.lowagie.text.pdf.draw.LineSeparator; +import com.fr.third.com.lowagie.text.xml.simpleparser.SimpleXMLDocHandler; +import com.fr.third.com.lowagie.text.xml.simpleparser.SimpleXMLParser; +import com.fr.third.sun.misc.BASE64Decoder; + +import java.io.File; +import java.io.IOException; +import java.io.Reader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Stack; +import java.util.StringTokenizer; + +public class HTMLWorker implements SimpleXMLDocHandler, DocListener { + + protected ArrayList objectList; + + protected DocListener document; + + private Paragraph currentParagraph; + + private ChainedProperties cprops = new ChainedProperties(); + + private Stack stack = new Stack(); + + private boolean pendingTR = false; + + private boolean pendingTD = false; + + private boolean pendingLI = false; + + private StyleSheet style = new StyleSheet(); + + private boolean isPRE = false; + + private Stack tableState = new Stack(); + + private boolean skipText = false; + + private HashMap interfaceProps; + + private FactoryProperties factoryProperties = new FactoryProperties(); + + /** Creates a new instance of HTMLWorker + * @param document A class that implements DocListener + * */ + public HTMLWorker(DocListener document) { + this.document = document; + } + + public void setStyleSheet(StyleSheet style) { + this.style = style; + } + + public StyleSheet getStyleSheet() { + return style; + } + + public void setInterfaceProps(HashMap interfaceProps) { + this.interfaceProps = interfaceProps; + FontFactoryImp ff = null; + if (interfaceProps != null) + ff = (FontFactoryImp) interfaceProps.get("font_factory"); + if (ff != null) + factoryProperties.setFontImp(ff); + } + + public HashMap getInterfaceProps() { + return interfaceProps; + } + + public void parse(Reader reader) throws IOException { + SimpleXMLParser.parse(this, null, reader, true); + } + + public static ArrayList parseToList(Reader reader, StyleSheet style) + throws IOException { + 
return parseToList(reader, style, null); + } + + public static ArrayList parseToList(Reader reader, StyleSheet style, + HashMap interfaceProps) throws IOException { + HTMLWorker worker = new HTMLWorker(null); + if (style != null) + worker.style = style; + worker.document = worker; + worker.setInterfaceProps(interfaceProps); + worker.objectList = new ArrayList(); + worker.parse(reader); + return worker.objectList; + } + + public void endDocument() { + try { + for (int k = 0; k < stack.size(); ++k) + document.add((Element) stack.elementAt(k)); + if (currentParagraph != null) + document.add(currentParagraph); + currentParagraph = null; + } catch (Exception e) { + throw new ExceptionConverter(e); + } + } + + public void startDocument() { + HashMap h = new HashMap(); + style.applyStyle("body", h); + cprops.addToChain("body", h); + } + + + public void startElement(String tag, HashMap h) { + if (!tagsSupported.containsKey(tag)) + return; + try { + style.applyStyle(tag, h); + if(tag.equals("p")){ + h.put(Markup.CSS_KEY_MARGINTOP, "16px"); + h.put(Markup.CSS_KEY_MARGINBOTTOM, "16px"); + } + String follow = (String) FactoryProperties.followTags.get(tag); + if (follow != null) { + HashMap prop = new HashMap(); + prop.put(follow, null); + FactoryProperties.insertStyle(h, this.cprops); + prop.putAll(h); + + cprops.addToChain(follow, prop); + return; + } + FactoryProperties.insertStyle(h, cprops); + if (tag.equals(HtmlTags.ANCHOR)) { + cprops.addToChain(tag, h); + if (currentParagraph == null) { + currentParagraph = new Paragraph(); + } + stack.push(currentParagraph); + currentParagraph = new Paragraph(); + return; + } + if (tag.equals(HtmlTags.NEWLINE)) { + if (currentParagraph == null) { + currentParagraph = new Paragraph(); + } + currentParagraph.add(factoryProperties + .createChunk("\n", cprops)); + return; + } + if (tag.equals(HtmlTags.HORIZONTALRULE)) { + // Attempting to duplicate the behavior seen on Firefox with + // 
http://www.w3schools.com/tags/tryit.asp?filename=tryhtml_hr_test + // where an initial break is only inserted when the preceding element doesn't + // end with a break, but a trailing break is always inserted. + boolean addLeadingBreak = true; + if (currentParagraph == null) { + currentParagraph = new Paragraph(); + addLeadingBreak = false; + } + if (addLeadingBreak) { // Not a new paragraph + int numChunks = currentParagraph.getChunks().size(); + if (numChunks == 0 || + ((Chunk)(currentParagraph.getChunks().get(numChunks - 1))).getContent().endsWith("\n")) + addLeadingBreak = false; + } + String align = (String) h.get("align"); + int hrAlign = Element.ALIGN_CENTER; + if (align != null) { + if (align.equalsIgnoreCase("left")) + hrAlign = Element.ALIGN_LEFT; + if (align.equalsIgnoreCase("right")) + hrAlign = Element.ALIGN_RIGHT; + } + String width = (String) h.get("width"); + float hrWidth = 1; + if (width != null) { + float tmpWidth = Markup.parseLength(width, Markup.DEFAULT_FONT_SIZE); + if (tmpWidth > 0) hrWidth = tmpWidth; + if (!width.endsWith("%")) + hrWidth = 100; // Treat a pixel width as 100% for now. 
+ } + String size = (String) h.get("size"); + float hrSize = 1; + if (size != null) { + float tmpSize = Markup.parseLength(size, Markup.DEFAULT_FONT_SIZE); + if (tmpSize > 0) + hrSize = tmpSize; + } + if (addLeadingBreak) + currentParagraph.add(Chunk.NEWLINE); + currentParagraph.add(new LineSeparator(hrSize, hrWidth, null, hrAlign, currentParagraph.getLeading()/2)); + currentParagraph.add(Chunk.NEWLINE); + return; + } + if (tag.equals(HtmlTags.CHUNK) || tag.equals(HtmlTags.SPAN)) { + cprops.addToChain(tag, h); + return; + } + if (tag.equals(HtmlTags.IMAGE)) { + String src = (String) h.get(ElementTags.SRC); + if (src == null) + return; + cprops.addToChain(tag, h); + Image img = null; + if (interfaceProps != null) { + ImageProvider ip = (ImageProvider) interfaceProps + .get("img_provider"); + if (ip != null) + img = ip.getImage(src, h, cprops, document); + if (img == null) { + HashMap images = (HashMap) interfaceProps + .get("img_static"); + if (images != null) { + Image tim = (Image) images.get(src); + if (tim != null) + img = Image.getInstance(tim); + } else { + if (!src.startsWith("http")) { // relative src references only + String baseurl = (String) interfaceProps + .get("img_baseurl"); + if (baseurl != null) { + src = baseurl + src; + img = Image.getInstance(src); + } + } + } + } + } + //处理base64编码图片 + if(src.startsWith("data")){ + BASE64Decoder decoder = new BASE64Decoder(); + String[] srcArray = src.split(","); + String base64string = srcArray[srcArray.length -1]; + byte[] bytes = decoder.decodeBuffer(base64string); + try { + img = Image.getInstance(bytes); + }catch (Exception e){ + + } + + } + if (img == null) { + if (!src.startsWith("http")) { + String path = cprops.getProperty("image_path"); + if (path == null) + path = ""; + src = new File(path, src).getPath(); + } + img = Image.getInstance(src); + } + if(img == null){ + return; + } + img.setSrcString(src); + String align = (String) h.get("align"); + String width = (String) h.get("width"); + String height 
= (String) h.get("height"); + String before = cprops.getProperty("before"); + String after = cprops.getProperty("after"); + if (before != null) + img.setSpacingBefore(Float.parseFloat(before)); + if (after != null) + img.setSpacingAfter(Float.parseFloat(after)); + float actualFontSize = Markup.parseLength(cprops + .getProperty(ElementTags.SIZE), + Markup.DEFAULT_FONT_SIZE); + if (actualFontSize <= 0f) + actualFontSize = Markup.DEFAULT_FONT_SIZE; + float widthInPoints = Markup.parseLength(width, actualFontSize); + float heightInPoints = Markup.parseLength(height, + actualFontSize); + if (widthInPoints > 0 && heightInPoints > 0) { + img.scaleAbsolute(widthInPoints, heightInPoints); + } else if (widthInPoints > 0) { + heightInPoints = img.getHeight() * widthInPoints + / img.getWidth(); + img.scaleAbsolute(widthInPoints, heightInPoints); + } else if (heightInPoints > 0) { + widthInPoints = img.getWidth() * heightInPoints + / img.getHeight(); + img.scaleAbsolute(widthInPoints, heightInPoints); + } + img.setWidthPercentage(0); + if (align != null) { + endElement("p"); + int ralign = Image.MIDDLE; + if (align.equalsIgnoreCase("left")) + ralign = Image.LEFT; + else if (align.equalsIgnoreCase("right")) + ralign = Image.RIGHT; + img.setAlignment(ralign); + Img i = null; + boolean skip = false; + if (interfaceProps != null) { + i = (Img) interfaceProps.get("img_interface"); + if (i != null) + skip = i.process(img, h, cprops, document); + } + if (!skip) + document.add(img); + cprops.removeChain(tag); + } else { + Chunk ck = new Chunk(img, 0, 0); + if(cprops.hasPropertyInChain("img", "padding-left")){ + String ss = cprops.getPropertyFromChain("img", "padding-left"); + ck.setAttribute("padding-left", Float.toString(Markup.parseLength(ss))); + } + if(cprops.hasPropertyInChain("img", "padding-right")){ + String ss = cprops.getPropertyFromChain("img", "padding-right"); + ck.setAttribute("padding-right", Float.toString(Markup.parseLength(ss))); + } + cprops.removeChain(tag); + if 
(currentParagraph == null) { + currentParagraph = FactoryProperties + .createParagraph(cprops); + } + + currentParagraph.add(ck); + } + return; + } + endElement("p"); + if (tag.equals("h1") || tag.equals("h2") || tag.equals("h3") + || tag.equals("h4") || tag.equals("h5") || tag.equals("h6")) { + if (!h.containsKey(ElementTags.SIZE)) { + int v = 7 - Integer.parseInt(tag.substring(1)); + h.put(ElementTags.SIZE, Integer.toString(v)); + } + cprops.addToChain(tag, h); + return; + } + if (tag.equals(HtmlTags.UNORDEREDLIST)) { + if (pendingLI) + endElement(HtmlTags.LISTITEM); + skipText = true; + cprops.addToChain(tag, h); + List list = new List(false); + try{ + list.setIndentationLeft(new Float(cprops.getProperty("indent")).floatValue()); + }catch (Exception e) { + list.setAutoindent(true); + } + list.setListSymbol("\u2022"); + stack.push(list); + return; + } + if (tag.equals(HtmlTags.ORDEREDLIST)) { + if (pendingLI) + endElement(HtmlTags.LISTITEM); + skipText = true; + cprops.addToChain(tag, h); + List list = new List(true); + try{ + list.setIndentationLeft(new Float(cprops.getProperty("indent")).floatValue()); + }catch (Exception e) { + list.setAutoindent(true); + } + stack.push(list); + return; + } + if (tag.equals(HtmlTags.LISTITEM)) { + if (pendingLI) + endElement(HtmlTags.LISTITEM); + skipText = false; + pendingLI = true; + cprops.addToChain(tag, h); + ListItem item = FactoryProperties.createListItem(cprops); + stack.push(item); + return; + } + if (tag.equals(HtmlTags.DIV) || tag.equals(HtmlTags.BODY) || tag.equals("p")) { + cprops.addToChain(tag, h); + return; + } + if (tag.equals(HtmlTags.PRE)) { + if (!h.containsKey(ElementTags.FACE)) { + h.put(ElementTags.FACE, "Courier"); + } + cprops.addToChain(tag, h); + isPRE = true; + return; + } + if (tag.equals("tr")) { + if (pendingTR) + endElement("tr"); + skipText = true; + pendingTR = true; + cprops.addToChain("tr", h); + return; + } + if (tag.equals("td") || tag.equals("th")) { + if (pendingTD) + endElement(tag); + 
skipText = false; + pendingTD = true; + cprops.addToChain("td", h); + stack.push(new IncCell(tag, cprops)); + return; + } + if (tag.equals("table")) { + cprops.addToChain("table", h); + IncTable table = new IncTable(h); + stack.push(table); + tableState.push(new boolean[] { pendingTR, pendingTD }); + pendingTR = pendingTD = false; + skipText = true; + return; + } + } catch (Exception e) { + throw new ExceptionConverter(e); + } + } + + + + public void endElement(String tag) { + if (!tagsSupported.containsKey(tag)) + return; + try { + String follow = (String) FactoryProperties.followTags.get(tag); + if (follow != null) { + cprops.removeChain(follow); + return; + } + if (tag.equals("font") || tag.equals("span")) { + cprops.removeChain(tag); + return; + } + if (tag.equals("a")) { + if (currentParagraph == null) { + currentParagraph = new Paragraph(); + } + boolean skip = false; + if (interfaceProps != null) { + ALink i = (ALink) interfaceProps.get("alink_interface"); + if (i != null) + skip = i.process(currentParagraph, cprops); + } + if (!skip) { + String href = cprops.getProperty("href"); + if (href != null) { + ArrayList chunks = currentParagraph.getChunks(); + int size = chunks.size(); + for (int k = 0; k < size; ++k) { + Chunk ck = (Chunk) chunks.get(k); + ck.setAnchor(href); + } + } + } + Paragraph tmp = (Paragraph) stack.pop(); + Phrase tmp2 = new Phrase(); + tmp2.add(currentParagraph); + tmp.add(tmp2); + currentParagraph = tmp; + cprops.removeChain("a"); + return; + } + if (tag.equals("br")) { + return; + } + if (currentParagraph != null) { + if (stack.empty()) + document.add(currentParagraph); + else { + Object obj = stack.pop(); + if (obj instanceof TextElementArray) { + TextElementArray current = (TextElementArray) obj; + current.add(currentParagraph); + } + stack.push(obj); + } + } + currentParagraph = null; + if (tag.equals(HtmlTags.UNORDEREDLIST) + || tag.equals(HtmlTags.ORDEREDLIST)) { + if (pendingLI) + endElement(HtmlTags.LISTITEM); + skipText = false; 
+ cprops.removeChain(tag); + if (stack.empty()) + return; + Object obj = stack.pop(); + if (!(obj instanceof List)) { + stack.push(obj); + return; + } + if (stack.empty()) + document.add((Element) obj); + else + ((TextElementArray) stack.peek()).add(obj); + return; + } + if (tag.equals(HtmlTags.LISTITEM)) { + pendingLI = false; + skipText = true; + cprops.removeChain(tag); + if (stack.empty()) + return; + Object obj = stack.pop(); + if (!(obj instanceof ListItem)) { + stack.push(obj); + return; + } + if (stack.empty()) { + document.add((Element) obj); + return; + } + Object list = stack.pop(); + if (!(list instanceof List)) { + stack.push(list); + return; + } + ListItem item = (ListItem) obj; + ((List) list).add(item); + ArrayList cks = item.getChunks(); + if (!cks.isEmpty()) + item.getListSymbol() + .setFont(((Chunk) cks.get(0)).getFont()); + stack.push(list); + return; + } + if (tag.equals("div") || tag.equals("body")) { + cprops.removeChain(tag); + return; + } + if (tag.equals(HtmlTags.PRE)) { + cprops.removeChain(tag); + isPRE = false; + return; + } + if (tag.equals("p")) { + cprops.removeChain(tag); + return; + } + if (tag.equals("h1") || tag.equals("h2") || tag.equals("h3") + || tag.equals("h4") || tag.equals("h5") || tag.equals("h6")) { + cprops.removeChain(tag); + return; + } + if (tag.equals("table")) { + if (pendingTR) + endElement("tr"); + cprops.removeChain("table"); + IncTable table = (IncTable) stack.pop(); + PdfPTable tb = table.buildTable(); + tb.setSplitRows(true); + if (stack.empty()) + document.add(tb); + else + ((TextElementArray) stack.peek()).add(tb); + boolean state[] = (boolean[]) tableState.pop(); + pendingTR = state[0]; + pendingTD = state[1]; + skipText = false; + return; + } + if (tag.equals("tr")) { + if (pendingTD) + endElement("td"); + pendingTR = false; + String rowHeightPx = cprops.getLastChainProperty("height"); + + cprops.removeChain("tr"); + ArrayList cells = new ArrayList(); + IncTable table = null; + while (true) { + Object obj 
= stack.pop(); + if (obj instanceof IncCell) { + cells.add(((IncCell) obj).getCell()); + } + if (obj instanceof IncTable) { + table = (IncTable) obj; + break; + } + } + float rowHeight = 0.0f; + if(rowHeightPx!=null){ + rowHeight = CSSUtils.parseFloat(rowHeightPx); + } + table.addCols(cells); + table.endRow(rowHeight); + + stack.push(table); + skipText = true; + return; + } + if (tag.equals("td") || tag.equals("th")) { + pendingTD = false; + cprops.removeChain("td"); + skipText = true; + return; + } + } catch (Exception e) { + throw new ExceptionConverter(e); + } + } + + public void text(String str) { + if (skipText) + return; + String content = str; + if (isPRE) { + if (currentParagraph == null) { + currentParagraph = FactoryProperties.createParagraph(cprops); + } + Chunk chunk = factoryProperties.createChunk(content, cprops); + currentParagraph.add(chunk); + return; + } + if (content.trim().length() == 0 && content.indexOf(' ') < 0) { + return; + } + + StringBuffer buf = new StringBuffer(); + int len = content.length(); + char character; + boolean newline = false; + for (int i = 0; i < len; i++) { + switch (character = content.charAt(i)) { + case ' ': + if (!newline) { + buf.append(character); + } + break; + case '\n': + if (i > 0) { + newline = true; + buf.append(' '); + } + break; + case '\r': + break; + case '\t': + break; + default: + newline = false; + buf.append(character); + } + } + if (currentParagraph == null) { + currentParagraph = FactoryProperties.createParagraph(cprops); + } + Chunk chunk = factoryProperties.createChunk(buf.toString(), cprops); + currentParagraph.add(chunk); + } + + public boolean add(Element element) throws DocumentException { + objectList.add(element); + return true; + } + + public void clearTextWrap() throws DocumentException { + } + + public void close() { + } + + public boolean newPage() { + return true; + } + + public void open() { + } + + public void resetFooter() { + } + + public void resetHeader() { + } + + public void 
resetPageCount() { + } + + public void setFooter(HeaderFooter footer) { + } + + public void setHeader(HeaderFooter header) { + } + + public boolean setMarginMirroring(boolean marginMirroring) { + return false; + } + + /** + * @see DocListener#setMarginMirroring(boolean) + * @since 2.1.6 + */ + public boolean setMarginMirroringTopBottom(boolean marginMirroring) { + return false; + } + + public boolean setMargins(float marginLeft, float marginRight, + float marginTop, float marginBottom) { + return true; + } + + public void setPageCount(int pageN) { + } + + public boolean setPageSize(Rectangle pageSize) { + return true; + } + + public static final String tagsSupportedString = "ol ul li a pre font span br p div body table td th tr i b u sub sup em strong s strike" + + " h1 h2 h3 h4 h5 h6 img hr"; + + public static final HashMap tagsSupported = new HashMap(); + public static final HashMap tagsPrefixSupported = new HashMap(); + + static { + StringTokenizer tok = new StringTokenizer(tagsSupportedString); + while (tok.hasMoreTokens()) { + String s = tok.nextToken(); + tagsSupported.put(s, null); + tagsPrefixSupported.put(s.charAt(0), null); + } + } +} diff --git a/fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java b/fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java new file mode 100755 index 000000000..d61de9b2e --- /dev/null +++ b/fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java @@ -0,0 +1,780 @@ +/* + * Copyright 2003 Paulo Soares + * + * The contents of this file are subject to the Mozilla Public License Version 1.1 + * (the "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the License + * for the specific language governing rights and limitations under the License. + * + * The Original Code is 'iText, a free JAVA-PDF library'. + * + * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by + * the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie. + * All Rights Reserved. + * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer + * are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved. + * + * Contributor(s): all the names of the contributors are added in the source code + * where applicable. + * + * Alternatively, the contents of this file may be used under the terms of the + * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the + * provisions of LGPL are applicable instead of those above. If you wish to + * allow use of your version of this file only under the terms of the LGPL + * License and not to allow others to use your version of this file under + * the MPL, indicate your decision by deleting the provisions above and + * replace them with the notice and other provisions required by the LGPL. + * If you do not delete the provisions above, a recipient may use your version + * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE. + * + * This library is free software; you can redistribute it and/or modify it + * under the terms of the MPL as stated above or under the terms of the GNU + * Library General Public License as published by the Free Software Foundation; + * either version 2 of the License, or any later version. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more + * details. 
+ * + * If you didn't download this code from the following link, you should check if + * you aren't using an obsolete version: + * http://www.lowagie.com/iText/ + * + * The code to recognize the encoding in this class and in the convenience class IanaEncodings was taken from Apache Xerces published under the following license: + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Part of this code is based on the Quick-and-Dirty XML parser by Steven Brandt. + * The code for the Quick-and-Dirty parser was published in JavaWorld (java tip 128). + * Steven Brandt and JavaWorld gave permission to use the code for free. + * (Bruno Lowagie and Paulo Soares chose to use it under the MPL/LGPL in + * conformance with the rest of the code). + * The original code can be found on this url: http://www.javaworld.com/javatips/jw-javatip128_p.html. + * It was substantially refactored by Bruno Lowagie. + * + * The method 'private static String getEncodingName(byte[] b4)' was found + * in org.apache.xerces.impl.XMLEntityManager, originaly published by the + * Apache Software Foundation under the Apache Software License; now being + * used in iText under the MPL. 
+ */ +package com.fr.third.com.lowagie.text.xml.simpleparser; + +import com.fr.third.com.lowagie.text.html.simpleparser.HTMLWorker; +import java.io.BufferedReader; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.HashMap; +import java.util.Stack; + +/** + * A simple XML and HTML parser. This parser is, like the SAX parser, + * an event based parser, but with much less functionality. + *

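+ * The parser can:
+ * <ul>
+ * <li>detect the encoding of the input;</li>
+ * <li>recognize the start tags and end tags of elements, including empty-element tags;</li>
+ * <li>list attributes, whose values may be enclosed in single or double quotes (in HTML mode unquoted values are accepted too);</li>
+ * <li>recognize CDATA sections, comments, processing instructions and DOCTYPE declarations;</li>
+ * <li>decode character entities such as <code>&amp;lt;</code>;</li>
+ * <li>map lines ending in <code>\r\n</code> or <code>\r</code> to <code>\n</code> on input.</li>
+ * </ul>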
+ */ +public final class SimpleXMLParser { + /** possible states */ + private final static int UNKNOWN = 0; + private final static int TEXT = 1; + private final static int TAG_ENCOUNTERED = 2; + private final static int EXAMIN_TAG = 3; + private final static int TAG_EXAMINED = 4; + private final static int IN_CLOSETAG = 5; + private final static int SINGLE_TAG = 6; + private final static int CDATA = 7; + private final static int COMMENT = 8; + private final static int PI = 9; + private final static int ENTITY = 10; + private final static int QUOTE = 11; + private final static int ATTRIBUTE_KEY = 12; + private final static int ATTRIBUTE_EQUAL = 13; + private final static int ATTRIBUTE_VALUE = 14; + + /** the state stack */ + Stack stack; + /** The current character. */ + int character = 0; + /** The previous character. */ + int previousCharacter = -1; + /** the line we are currently reading */ + int lines = 1; + /** the column where the current character occurs */ + int columns = 0; + /** was the last character equivalent to a newline? */ + boolean eol = false; + /** + * A boolean indicating if the next character should be taken into account + * if it's a space character. When nospace is false, the previous character + * wasn't whitespace. + * @since 2.1.5 + */ + boolean nowhite = false; + /** the current state */ + int state; + /** Are we parsing HTML? */ + boolean html; + /** current text (whatever is encountered between tags) */ + StringBuffer text = new StringBuffer(); + /** current entity (whatever is encountered between & and ;) */ + StringBuffer entity = new StringBuffer(); + /** current tagname */ + String tag = null; + /** current attributes */ + HashMap attributes = null; + /** The handler to which we are going to forward document content */ + SimpleXMLDocHandler doc; + /** The handler to which we are going to forward comments. */ + SimpleXMLDocHandlerComment comment; + /** Keeps track of the number of tags that are open. 
*/ + int nested = 0; + /** the quote character that was used to open the quote. */ + int quoteCharacter = '"'; + /** the attribute key. */ + String attributekey = null; + /** the attribute value. */ + String attributevalue = null; + + /** + * Creates a Simple XML parser object. + * Call go(BufferedReader) immediately after creation. + */ + private SimpleXMLParser(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, boolean html) { + this.doc = doc; + this.comment = comment; + this.html = html; + stack = new Stack(); + state = html ? TEXT : UNKNOWN; + } + + /** + * Does the actual parsing. Perform this immediately + * after creating the parser object. + */ + private void go(Reader r) throws IOException { + BufferedReader reader; + if (r instanceof BufferedReader) + reader = (BufferedReader)r; + else + reader = new BufferedReader(r); + doc.startDocument(); + while(true) { + // read a new character + if (previousCharacter == -1) { + character = reader.read(); + } + // or re-examine the previous character + else { + character = previousCharacter; + previousCharacter = -1; + } + + // the end of the file was reached + if (character == -1) { + if (html) { + if (html && state == TEXT) + flush(); + doc.endDocument(); + } else { + throwException("Missing end tag"); + } + return; + } + + // dealing with \n and \r + if (character == '\n' && eol) { + eol = false; + continue; + } else if (eol) { + eol = false; + } else if (character == '\n') { + lines++; + columns = 0; + } else if (character == '\r') { + eol = true; + character = '\n'; + lines++; + columns = 0; + } else { + columns++; + } + + switch(state) { + // we are in an unknown state before there's actual content + case UNKNOWN: + if(character == '<') { + beginnOfTag((char) reader.read(), UNKNOWN); + } + break; + // we can encounter any content + case TEXT: + if(character == '<') { + beginnOfTag((char) reader.read(), TEXT); + } else if(character == '&') { + saveState(state); + entity.setLength(0); + state = ENTITY; 
+ } else if (Character.isWhitespace((char)character) && character != 12288) { + if (nowhite) + text.append((char)character); + nowhite = false; + } else { + text.append((char)character); + nowhite = true; + } + break; + // we have just seen a < and are wondering what we are looking at + // , , , etc. + case TAG_ENCOUNTERED: + initTag(); + if(character == '/') { + state = IN_CLOSETAG; + } else if (character == '?') { + restoreState(); + state = PI; + } else { + text.append((char)character); + state = EXAMIN_TAG; + } + break; + // we are processing something like this . + // It could still be a or something. + case EXAMIN_TAG: + if(character == '>') { + doTag(); + processTag(true); + initTag(); + state = restoreState(); + } else if(character == '/') { + state = SINGLE_TAG; + } else if(character == '-' && text.toString().equals("!-")) { + flush(); + state = COMMENT; + } else if(character == '[' && text.toString().equals("![CDATA")) { + flush(); + state = CDATA; + } else if(character == 'E' && text.toString().equals("!DOCTYP")) { + flush(); + state = PI; + } else if(Character.isWhitespace((char)character)) { + doTag(); + state = TAG_EXAMINED; + } else { + text.append((char)character); + } + break; + // we know the name of the tag now. + case TAG_EXAMINED: + if(character == '>') { + processTag(true); + initTag(); + state = restoreState(); + } else if(character == '/') { + state = SINGLE_TAG; + } else if(Character.isWhitespace((char)character)) { + // empty + } else { + text.append((char)character); + state = ATTRIBUTE_KEY; + } + break; + + // we are processing a closing tag: e.g. + case IN_CLOSETAG: + if(character == '>') { + doTag(); + processTag(false); + if(!html && nested==0) return; + state = restoreState(); + } else { + if (!Character.isWhitespace((char)character)) + text.append((char)character); + } + break; + + // we have just seen something like this: . 
+ case SINGLE_TAG: + if(character != '>') + throwException("Expected > for tag: <"+tag+"/>"); + doTag(); + processTag(true); + processTag(false); + initTag(); + if(!html && nested==0) { + doc.endDocument(); + return; + } + state = restoreState(); + break; + + // we are processing CDATA + case CDATA: + if(character == '>' + && text.toString().endsWith("]]")) { + text.setLength(text.length()-2); + flush(); + state = restoreState(); + } else + text.append((char)character); + break; + + // we are processing a comment. We are inside + // the looking for the -->. + case COMMENT: + if(character == '>' + && text.toString().endsWith("--")) { + text.setLength(text.length() - 2); + flush(); + state = restoreState(); + } else + text.append((char)character); + break; + + // We are inside one of these or one of these + case PI: + if(character == '>') { + state = restoreState(); + if(state == TEXT) state = UNKNOWN; + } + break; + + // we are processing an entity, e.g. <, », etc. + case ENTITY: + if(character == ';') { + state = restoreState(); + String cent = entity.toString(); + entity.setLength(0); + char ce = EntitiesToUnicode.decodeEntity(cent); + if (ce == '\0') + text.append('&').append(cent).append(';'); + else + text.append(ce); + } else if ((character != '#' && (character < '0' || character > '9') && (character < 'a' || character > 'z') + && (character < 'A' || character > 'Z')) || entity.length() >= 7) { + state = restoreState(); + previousCharacter = character; + text.append('&').append(entity.toString()); + entity.setLength(0); + } + else { + entity.append((char)character); + } + break; + // We are processing the quoted right-hand side of an element's attribute. 
+ case QUOTE: + if (html && quoteCharacter == ' ' && character == '>') { + flush(); + processTag(true); + initTag(); + state = restoreState(); + } + else if (html && quoteCharacter == ' ' && Character.isWhitespace((char)character)) { + flush(); + state = TAG_EXAMINED; + } + else if (html && quoteCharacter == ' ') { + text.append((char)character); + } + else if(character == quoteCharacter) { + flush(); + state = TAG_EXAMINED; + } else if(" \r\n\u0009".indexOf(character)>=0) { + text.append(' '); + } else if(character == '&') { + saveState(state); + state = ENTITY; + entity.setLength(0); + } else { + text.append((char)character); + } + break; + + case ATTRIBUTE_KEY: + if(Character.isWhitespace((char)character)) { + flush(); + state = ATTRIBUTE_EQUAL; + } else if(character == '=') { + flush(); + state = ATTRIBUTE_VALUE; + } else if (html && character == '>') { + text.setLength(0); + processTag(true); + initTag(); + state = restoreState(); + } else { + text.append((char)character); + } + break; + + case ATTRIBUTE_EQUAL: + if(character == '=') { + state = ATTRIBUTE_VALUE; + } else if(Character.isWhitespace((char)character)) { + // empty + } else if (html && character == '>') { + text.setLength(0); + processTag(true); + initTag(); + state = restoreState(); + } else if (html && character == '/') { + flush(); + state = SINGLE_TAG; + } else if (html) { + flush(); + text.append((char)character); + state = ATTRIBUTE_KEY; + } else { + throwException("Error in attribute processing."); + } + break; + + case ATTRIBUTE_VALUE: + if(character == '"' || character == '\'') { + quoteCharacter = character; + state = QUOTE; + } else if(Character.isWhitespace((char)character)) { + // empty + } else if (html && character == '>') { + flush(); + processTag(true); + initTag(); + state = restoreState(); + } else if (html) { + text.append((char)character); + quoteCharacter = ' '; + state = QUOTE; + } else { + throwException("Error in attribute processing"); + } + break; + } + } + } + + /** + * 
Gets a state from the stack + * @return the previous state + */ + private int restoreState() { + if(!stack.empty()) + return ((Integer)stack.pop()).intValue(); + else + return UNKNOWN; + } + /** + * Adds a state to the stack. + * @param s a state to add to the stack + */ + private void saveState(int s) { + stack.push(new Integer(s)); + } + + /** + * 处理标签的开头,若不在支持标签范围内,将<符号作为文本处理,例:<1111 (仿造浏览器的处理方式) + */ + public void beginnOfTag(char c, int type) { + previousCharacter = c; + if (c == -1) { + return; + } + if (c == '/' || HTMLWorker.tagsPrefixSupported.containsKey(c)) { + if (type == TEXT) { + flush(); + } + saveState(TEXT); + state = TAG_ENCOUNTERED; + return; + } + text.append((char) character); + nowhite = true; + } + + /** + * Flushes the text that is currently in the buffer. + * The text can be ignored, added to the document + * as content or as comment,... depending on the current state. + */ + private void flush() { + switch(state){ + case TEXT: + case CDATA: + if(text.length() > 0) { + doc.text(text.toString()); + } + break; + case COMMENT: + if (comment != null) { + comment.comment(text.toString()); + } + break; + case ATTRIBUTE_KEY: + attributekey = text.toString(); + if (html) + attributekey = attributekey.toLowerCase(); + break; + case QUOTE: + case ATTRIBUTE_VALUE: + attributevalue = text.toString(); + attributes.put(attributekey,attributevalue); + break; + default: + // do nothing + } + text.setLength(0); + } + /** + * Initialized the tag name and attributes. + */ + private void initTag() { + tag = null; + attributes = new HashMap(); + } + /** Sets the name of the tag. */ + private void doTag() { + if(tag == null) + tag = text.toString(); + if (html) + tag = tag.toLowerCase(); + text.setLength(0); + } + /** + * processes the tag. + * @param start if true we are dealing with a tag that has just been opened; if false we are closing a tag. 
+ */ + private void processTag(boolean start) { + if (start) { + nested++; + doc.startElement(tag,attributes); + } + else { + nested--; + doc.endElement(tag); + } + } + /** Throws an exception */ + private void throwException(String s) throws IOException { + throw new IOException(s+" near line " + lines + ", column " + columns); + } + + /** + * Parses the XML document firing the events to the handler. + * @param doc the document handler + * @param r the document. The encoding is already resolved. The reader is not closed + * @throws IOException on error + */ + public static void parse(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, Reader r, boolean html) throws IOException { + SimpleXMLParser parser = new SimpleXMLParser(doc, comment, html); + parser.go(r); + } + + /** + * Parses the XML document firing the events to the handler. + * @param doc the document handler + * @param in the document. The encoding is deduced from the stream. The stream is not closed + * @throws IOException on error + */ + public static void parse(SimpleXMLDocHandler doc, InputStream in) throws IOException { + byte b4[] = new byte[4]; + int count = in.read(b4); + if (count != 4) + throw new IOException("Insufficient length."); + String encoding = getEncodingName(b4); + String decl = null; + if (encoding.equals("UTF-8")) { + StringBuffer sb = new StringBuffer(); + int c; + while ((c = in.read()) != -1) { + if (c == '>') + break; + sb.append((char)c); + } + decl = sb.toString(); + } + else if (encoding.equals("CP037")) { + ByteArrayOutputStream bi = new ByteArrayOutputStream(); + int c; + while ((c = in.read()) != -1) { + if (c == 0x6e) // that's '>' in ebcdic + break; + bi.write(c); + } + decl = new String(bi.toByteArray(), "CP037"); + } + if (decl != null) { + decl = getDeclaredEncoding(decl); + if (decl != null) + encoding = decl; + } + parse(doc, new InputStreamReader(in, IanaEncodings.getJavaEncoding(encoding))); + } + + private static String getDeclaredEncoding(String decl) 
{ + if (decl == null) + return null; + int idx = decl.indexOf("encoding"); + if (idx < 0) + return null; + int idx1 = decl.indexOf('"', idx); + int idx2 = decl.indexOf('\'', idx); + if (idx1 == idx2) + return null; + if ((idx1 < 0 && idx2 > 0) || (idx2 > 0 && idx2 < idx1)) { + int idx3 = decl.indexOf('\'', idx2 + 1); + if (idx3 < 0) + return null; + return decl.substring(idx2 + 1, idx3); + } + if ((idx2 < 0 && idx1 > 0) || (idx1 > 0 && idx1 < idx2)) { + int idx3 = decl.indexOf('"', idx1 + 1); + if (idx3 < 0) + return null; + return decl.substring(idx1 + 1, idx3); + } + return null; + } + + public static void parse(SimpleXMLDocHandler doc,Reader r) throws IOException { + parse(doc, null, r, false); + } + + /** + * Escapes a string with the appropriated XML codes. + * @param s the string to be escaped + * @param onlyASCII codes above 127 will always be escaped with &#nn; if true + * @return the escaped string + */ + public static String escapeXML(String s, boolean onlyASCII) { + char cc[] = s.toCharArray(); + int len = cc.length; + StringBuffer sb = new StringBuffer(); + for (int k = 0; k < len; ++k) { + int c = cc[k]; + switch (c) { + case '<': + sb.append("<"); + break; + case '>': + sb.append(">"); + break; + case '&': + sb.append("&"); + break; + case '"': + sb.append("""); + break; + case '\'': + sb.append("'"); + break; + default: + if ((c == 0x9) || (c == 0xA) || (c == 0xD) + || ((c >= 0x20) && (c <= 0xD7FF)) + || ((c >= 0xE000) && (c <= 0xFFFD)) + || ((c >= 0x10000) && (c <= 0x10FFFF))) { + if (onlyASCII && c > 127) + sb.append("&#").append(c).append(';'); + else + sb.append((char)c); + } + } + } + return sb.toString(); + } + /** + * Returns the IANA encoding name that is auto-detected from + * the bytes specified, with the endian-ness of that encoding where appropriate. 
+ * (method found in org.apache.xerces.impl.XMLEntityManager, originally published + * by the Apache Software Foundation under the Apache Software License; now being + * used in iText under the MPL) + * @param b4 The first four bytes of the input. + * @return an IANA-encoding string + */ + private static String getEncodingName(byte[] b4) { + + // UTF-16, with BOM + int b0 = b4[0] & 0xFF; + int b1 = b4[1] & 0xFF; + if (b0 == 0xFE && b1 == 0xFF) { + // UTF-16, big-endian + return "UTF-16BE"; + } + if (b0 == 0xFF && b1 == 0xFE) { + // UTF-16, little-endian + return "UTF-16LE"; + } + + // UTF-8 with a BOM + int b2 = b4[2] & 0xFF; + if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { + return "UTF-8"; + } + + // other encodings + int b3 = b4[3] & 0xFF; + if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) { + // UCS-4, big endian (1234) + return "ISO-10646-UCS-4"; + } + if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) { + // UCS-4, little endian (4321) + return "ISO-10646-UCS-4"; + } + if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) { + // UCS-4, unusual octet order (2143) + // REVISIT: What should this be? + return "ISO-10646-UCS-4"; + } + if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) { + // UCS-4, unusual octet order (3412) + // REVISIT: What should this be? + return "ISO-10646-UCS-4"; + } + if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { + // UTF-16, big-endian, no BOM + // (or could turn out to be UCS-2... + // REVISIT: What should this be? + return "UTF-16BE"; + } + if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { + // UTF-16, little-endian, no BOM + // (or could turn out to be UCS-2... + return "UTF-16LE"; + } + if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) { + // EBCDIC + // a la xerces1, return CP037 instead of EBCDIC here + return "CP037"; + } + + // default encoding + return "UTF-8"; + } +} \ No newline at end of file