From 2b0774489afbf24f55859cfb95b40dcea1511909 Mon Sep 17 00:00:00 2001 From: "Hugh.C" Date: Fri, 15 May 2020 13:48:37 +0800 Subject: [PATCH 1/3] =?UTF-8?q?REPORT-31492=20word=E5=AF=BC=E5=87=BA?= =?UTF-8?q?=E4=B8=ADHtml=E6=97=A0=E6=B3=95=E8=A7=A3=E6=9E=90=E5=B0=8F?= =?UTF-8?q?=E4=BA=8E=E5=8F=B7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../text/html/simpleparser/HTMLWorker.java | 9 ++++-- .../xml/simpleparser/SimpleXMLParser.java | 29 +++++++++++++++---- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/fine-itext-old/src/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java b/fine-itext-old/src/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java index f1ea48571..adb73e007 100644 --- a/fine-itext-old/src/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java +++ b/fine-itext-old/src/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java @@ -775,11 +775,14 @@ public class HTMLWorker implements SimpleXMLDocHandler, DocListener { + " h1 h2 h3 h4 h5 h6 img hr"; public static final HashMap tagsSupported = new HashMap(); + public static final HashMap tagsPrefixSupported = new HashMap(); static { StringTokenizer tok = new StringTokenizer(tagsSupportedString); - while (tok.hasMoreTokens()) - tagsSupported.put(tok.nextToken(), null); + while (tok.hasMoreTokens()) { + String s = tok.nextToken(); + tagsSupported.put(s, null); + tagsPrefixSupported.put(s.charAt(0), null); + } } - } diff --git a/fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java b/fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java index fe9c80e03..0d7209ece 100755 --- a/fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java +++ b/fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java @@ -75,6 +75,7 @@ */ package com.fr.third.com.lowagie.text.xml.simpleparser; +import 
com.fr.third.com.lowagie.text.html.simpleparser.HTMLWorker; import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.IOException; @@ -230,16 +231,13 @@ public final class SimpleXMLParser { // we are in an unknown state before there's actual content case UNKNOWN: if(character == '<') { - saveState(TEXT); - state = TAG_ENCOUNTERED; + beginnOfTag((char) reader.read(), UNKNOWN); } break; // we can encounter any content case TEXT: if(character == '<') { - flush(); - saveState(state); - state = TAG_ENCOUNTERED; + beginnOfTag((char) reader.read(), UNKNOWN); } else if(character == '&') { saveState(state); entity.setLength(0); @@ -499,6 +497,27 @@ public final class SimpleXMLParser { private void saveState(int s) { stack.push(new Integer(s)); } + + /** + * 处理标签的开头,若不在支持标签范围内,将<符号作为文本处理,例:<1111 (仿造浏览器的处理方式) + */ + public void beginnOfTag(char c, int type) { + previousCharacter = c; + if (c == -1) { + return; + } + if (c == '/' || HTMLWorker.tagsPrefixSupported.containsKey(c)) { + if (type == TEXT) { + flush(); + } + saveState(TEXT); + state = TAG_ENCOUNTERED; + return; + } + text.append((char) character); + nowhite = true; + } + /** * Flushes the text that is currently in the buffer. 
* The text can be ignored, added to the document From 8d1c235eb7a15eeec582591d5e1dbd21e24bc16c Mon Sep 17 00:00:00 2001 From: "Hugh.C" Date: Wed, 20 May 2020 19:48:07 +0800 Subject: [PATCH 2/3] =?UTF-8?q?REPORT-31492=20word=E5=AF=BC=E5=87=BA?= =?UTF-8?q?=E4=B8=ADHtml=E6=97=A0=E6=B3=95=E8=A7=A3=E6=9E=90=E5=B0=8F?= =?UTF-8?q?=E4=BA=8E=E5=8F=B7=EF=BC=88=E6=BC=8F=E6=8F=90=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../com/lowagie/text/xml/simpleparser/SimpleXMLParser.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java b/fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java index 0d7209ece..7a5339a2b 100755 --- a/fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java +++ b/fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java @@ -237,7 +237,7 @@ public final class SimpleXMLParser { // we can encounter any content case TEXT: if(character == '<') { - beginnOfTag((char) reader.read(), UNKNOWN); + beginnOfTag((char) reader.read(), TEXT); } else if(character == '&') { saveState(state); entity.setLength(0); From aae332b0fa5e530d02e2bacfbaab688aeaab36fd Mon Sep 17 00:00:00 2001 From: Harrison Date: Wed, 10 Jun 2020 17:15:01 +0800 Subject: [PATCH 3/3] =?UTF-8?q?=E6=97=A0=20jira=20=E4=BB=BB=E5=8A=A1?= =?UTF-8?q?=EF=BC=8C=20bugfix=20->=20release=20=E8=A7=A3=E5=86=B3=E5=86=B2?= =?UTF-8?q?=E7=AA=81=EF=BC=8C=20=E6=9B=B4=E6=96=B0=E7=9B=AE=E5=BD=95?= =?UTF-8?q?=E7=BB=93=E6=9E=84=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../text/html/simpleparser/HTMLWorker.java | 788 ------------------ .../xml/simpleparser/SimpleXMLParser.java | 780 ----------------- .../text/html/simpleparser/HTMLWorker.java | 9 +- .../xml/simpleparser/SimpleXMLParser.java | 37 +- 4 files 
changed, 34 insertions(+), 1580 deletions(-) delete mode 100644 fine-itext-old/src/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java delete mode 100755 fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java diff --git a/fine-itext-old/src/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java b/fine-itext-old/src/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java deleted file mode 100644 index 11e918722..000000000 --- a/fine-itext-old/src/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java +++ /dev/null @@ -1,788 +0,0 @@ -/* - * Copyright 2004 Paulo Soares - * - * The contents of this file are subject to the Mozilla Public License Version 1.1 - * (the "License"); you may not use this file except in compliance with the License. - * You may obtain a copy of the License at http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" basis, - * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License - * for the specific language governing rights and limitations under the License. - * - * The Original Code is 'iText, a free JAVA-PDF library'. - * - * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by - * the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie. - * All Rights Reserved. - * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer - * are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved. - * - * Contributor(s): all the names of the contributors are added in the source code - * where applicable. - * - * Alternatively, the contents of this file may be used under the terms of the - * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the - * provisions of LGPL are applicable instead of those above. 
If you wish to - * allow use of your version of this file only under the terms of the LGPL - * License and not to allow others to use your version of this file under - * the MPL, indicate your decision by deleting the provisions above and - * replace them with the notice and other provisions required by the LGPL. - * If you do not delete the provisions above, a recipient may use your version - * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE. - * - * This library is free software; you can redistribute it and/or modify it - * under the terms of the MPL as stated above or under the terms of the GNU - * Library General Public License as published by the Free Software Foundation; - * either version 2 of the License, or any later version. - * - * This library is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more - * details. 
- * - * Contributions by: - * Lubos Strapko - * - * If you didn't download this code from the following link, you should check if - * you aren't using an obsolete version: - * http://www.lowagie.com/iText/ - */ - -package com.fr.third.com.lowagie.text.html.simpleparser; - -import com.fr.third.com.lowagie.text.Chunk; -import com.fr.third.com.lowagie.text.DocListener; -import com.fr.third.com.lowagie.text.DocumentException; -import com.fr.third.com.lowagie.text.Element; -import com.fr.third.com.lowagie.text.ElementTags; -import com.fr.third.com.lowagie.text.ExceptionConverter; -import com.fr.third.com.lowagie.text.FontFactoryImp; -import com.fr.third.com.lowagie.text.HeaderFooter; -import com.fr.third.com.lowagie.text.Image; -import com.fr.third.com.lowagie.text.List; -import com.fr.third.com.lowagie.text.ListItem; -import com.fr.third.com.lowagie.text.Paragraph; -import com.fr.third.com.lowagie.text.Phrase; -import com.fr.third.com.lowagie.text.Rectangle; -import com.fr.third.com.lowagie.text.TextElementArray; -import com.fr.third.com.lowagie.text.html.CSSUtils; -import com.fr.third.com.lowagie.text.html.HtmlTags; -import com.fr.third.com.lowagie.text.html.Markup; -import com.fr.third.com.lowagie.text.pdf.PdfPTable; -import com.fr.third.com.lowagie.text.pdf.draw.LineSeparator; -import com.fr.third.com.lowagie.text.xml.simpleparser.SimpleXMLDocHandler; -import com.fr.third.com.lowagie.text.xml.simpleparser.SimpleXMLParser; -import com.fr.third.sun.misc.BASE64Decoder; - -import java.io.File; -import java.io.IOException; -import java.io.Reader; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Stack; -import java.util.StringTokenizer; - -public class HTMLWorker implements SimpleXMLDocHandler, DocListener { - - protected ArrayList objectList; - - protected DocListener document; - - private Paragraph currentParagraph; - - private ChainedProperties cprops = new ChainedProperties(); - - private Stack stack = new Stack(); - - private boolean 
pendingTR = false; - - private boolean pendingTD = false; - - private boolean pendingLI = false; - - private StyleSheet style = new StyleSheet(); - - private boolean isPRE = false; - - private Stack tableState = new Stack(); - - private boolean skipText = false; - - private HashMap interfaceProps; - - private FactoryProperties factoryProperties = new FactoryProperties(); - - /** Creates a new instance of HTMLWorker - * @param document A class that implements DocListener - * */ - public HTMLWorker(DocListener document) { - this.document = document; - } - - public void setStyleSheet(StyleSheet style) { - this.style = style; - } - - public StyleSheet getStyleSheet() { - return style; - } - - public void setInterfaceProps(HashMap interfaceProps) { - this.interfaceProps = interfaceProps; - FontFactoryImp ff = null; - if (interfaceProps != null) - ff = (FontFactoryImp) interfaceProps.get("font_factory"); - if (ff != null) - factoryProperties.setFontImp(ff); - } - - public HashMap getInterfaceProps() { - return interfaceProps; - } - - public void parse(Reader reader) throws IOException { - SimpleXMLParser.parse(this, null, reader, true); - } - - public static ArrayList parseToList(Reader reader, StyleSheet style) - throws IOException { - return parseToList(reader, style, null); - } - - public static ArrayList parseToList(Reader reader, StyleSheet style, - HashMap interfaceProps) throws IOException { - HTMLWorker worker = new HTMLWorker(null); - if (style != null) - worker.style = style; - worker.document = worker; - worker.setInterfaceProps(interfaceProps); - worker.objectList = new ArrayList(); - worker.parse(reader); - return worker.objectList; - } - - public void endDocument() { - try { - for (int k = 0; k < stack.size(); ++k) - document.add((Element) stack.elementAt(k)); - if (currentParagraph != null) - document.add(currentParagraph); - currentParagraph = null; - } catch (Exception e) { - throw new ExceptionConverter(e); - } - } - - public void startDocument() { - 
HashMap h = new HashMap(); - style.applyStyle("body", h); - cprops.addToChain("body", h); - } - - - public void startElement(String tag, HashMap h) { - if (!tagsSupported.containsKey(tag)) - return; - try { - style.applyStyle(tag, h); - if(tag.equals("p")){ - h.put(Markup.CSS_KEY_MARGINTOP, "16px"); - h.put(Markup.CSS_KEY_MARGINBOTTOM, "16px"); - } - String follow = (String) FactoryProperties.followTags.get(tag); - if (follow != null) { - HashMap prop = new HashMap(); - prop.put(follow, null); - FactoryProperties.insertStyle(h, this.cprops); - prop.putAll(h); - - cprops.addToChain(follow, prop); - return; - } - FactoryProperties.insertStyle(h, cprops); - if (tag.equals(HtmlTags.ANCHOR)) { - cprops.addToChain(tag, h); - if (currentParagraph == null) { - currentParagraph = new Paragraph(); - } - stack.push(currentParagraph); - currentParagraph = new Paragraph(); - return; - } - if (tag.equals(HtmlTags.NEWLINE)) { - if (currentParagraph == null) { - currentParagraph = new Paragraph(); - } - currentParagraph.add(factoryProperties - .createChunk("\n", cprops)); - return; - } - if (tag.equals(HtmlTags.HORIZONTALRULE)) { - // Attempting to duplicate the behavior seen on Firefox with - // http://www.w3schools.com/tags/tryit.asp?filename=tryhtml_hr_test - // where an initial break is only inserted when the preceding element doesn't - // end with a break, but a trailing break is always inserted. 
- boolean addLeadingBreak = true; - if (currentParagraph == null) { - currentParagraph = new Paragraph(); - addLeadingBreak = false; - } - if (addLeadingBreak) { // Not a new paragraph - int numChunks = currentParagraph.getChunks().size(); - if (numChunks == 0 || - ((Chunk)(currentParagraph.getChunks().get(numChunks - 1))).getContent().endsWith("\n")) - addLeadingBreak = false; - } - String align = (String) h.get("align"); - int hrAlign = Element.ALIGN_CENTER; - if (align != null) { - if (align.equalsIgnoreCase("left")) - hrAlign = Element.ALIGN_LEFT; - if (align.equalsIgnoreCase("right")) - hrAlign = Element.ALIGN_RIGHT; - } - String width = (String) h.get("width"); - float hrWidth = 1; - if (width != null) { - float tmpWidth = Markup.parseLength(width, Markup.DEFAULT_FONT_SIZE); - if (tmpWidth > 0) hrWidth = tmpWidth; - if (!width.endsWith("%")) - hrWidth = 100; // Treat a pixel width as 100% for now. - } - String size = (String) h.get("size"); - float hrSize = 1; - if (size != null) { - float tmpSize = Markup.parseLength(size, Markup.DEFAULT_FONT_SIZE); - if (tmpSize > 0) - hrSize = tmpSize; - } - if (addLeadingBreak) - currentParagraph.add(Chunk.NEWLINE); - currentParagraph.add(new LineSeparator(hrSize, hrWidth, null, hrAlign, currentParagraph.getLeading()/2)); - currentParagraph.add(Chunk.NEWLINE); - return; - } - if (tag.equals(HtmlTags.CHUNK) || tag.equals(HtmlTags.SPAN)) { - cprops.addToChain(tag, h); - return; - } - if (tag.equals(HtmlTags.IMAGE)) { - String src = (String) h.get(ElementTags.SRC); - if (src == null) - return; - cprops.addToChain(tag, h); - Image img = null; - if (interfaceProps != null) { - ImageProvider ip = (ImageProvider) interfaceProps - .get("img_provider"); - if (ip != null) - img = ip.getImage(src, h, cprops, document); - if (img == null) { - HashMap images = (HashMap) interfaceProps - .get("img_static"); - if (images != null) { - Image tim = (Image) images.get(src); - if (tim != null) - img = Image.getInstance(tim); - } else { - if 
(!src.startsWith("http")) { // relative src references only - String baseurl = (String) interfaceProps - .get("img_baseurl"); - if (baseurl != null) { - src = baseurl + src; - img = Image.getInstance(src); - } - } - } - } - } - //处理base64编码图片 - if(src.startsWith("data")){ - BASE64Decoder decoder = new BASE64Decoder(); - String[] srcArray = src.split(","); - String base64string = srcArray[srcArray.length -1]; - byte[] bytes = decoder.decodeBuffer(base64string); - try { - img = Image.getInstance(bytes); - }catch (Exception e){ - - } - - } - if (img == null) { - if (!src.startsWith("http")) { - String path = cprops.getProperty("image_path"); - if (path == null) - path = ""; - src = new File(path, src).getPath(); - } - img = Image.getInstance(src); - } - if(img == null){ - return; - } - img.setSrcString(src); - String align = (String) h.get("align"); - String width = (String) h.get("width"); - String height = (String) h.get("height"); - String before = cprops.getProperty("before"); - String after = cprops.getProperty("after"); - if (before != null) - img.setSpacingBefore(Float.parseFloat(before)); - if (after != null) - img.setSpacingAfter(Float.parseFloat(after)); - float actualFontSize = Markup.parseLength(cprops - .getProperty(ElementTags.SIZE), - Markup.DEFAULT_FONT_SIZE); - if (actualFontSize <= 0f) - actualFontSize = Markup.DEFAULT_FONT_SIZE; - float widthInPoints = Markup.parseLength(width, actualFontSize); - float heightInPoints = Markup.parseLength(height, - actualFontSize); - if (widthInPoints > 0 && heightInPoints > 0) { - img.scaleAbsolute(widthInPoints, heightInPoints); - } else if (widthInPoints > 0) { - heightInPoints = img.getHeight() * widthInPoints - / img.getWidth(); - img.scaleAbsolute(widthInPoints, heightInPoints); - } else if (heightInPoints > 0) { - widthInPoints = img.getWidth() * heightInPoints - / img.getHeight(); - img.scaleAbsolute(widthInPoints, heightInPoints); - } - img.setWidthPercentage(0); - if (align != null) { - endElement("p"); - 
int ralign = Image.MIDDLE; - if (align.equalsIgnoreCase("left")) - ralign = Image.LEFT; - else if (align.equalsIgnoreCase("right")) - ralign = Image.RIGHT; - img.setAlignment(ralign); - Img i = null; - boolean skip = false; - if (interfaceProps != null) { - i = (Img) interfaceProps.get("img_interface"); - if (i != null) - skip = i.process(img, h, cprops, document); - } - if (!skip) - document.add(img); - cprops.removeChain(tag); - } else { - Chunk ck = new Chunk(img, 0, 0); - if(cprops.hasPropertyInChain("img", "padding-left")){ - String ss = cprops.getPropertyFromChain("img", "padding-left"); - ck.setAttribute("padding-left", Float.toString(Markup.parseLength(ss))); - } - if(cprops.hasPropertyInChain("img", "padding-right")){ - String ss = cprops.getPropertyFromChain("img", "padding-right"); - ck.setAttribute("padding-right", Float.toString(Markup.parseLength(ss))); - } - cprops.removeChain(tag); - if (currentParagraph == null) { - currentParagraph = FactoryProperties - .createParagraph(cprops); - } - - currentParagraph.add(ck); - } - return; - } - endElement("p"); - if (tag.equals("h1") || tag.equals("h2") || tag.equals("h3") - || tag.equals("h4") || tag.equals("h5") || tag.equals("h6")) { - if (!h.containsKey(ElementTags.SIZE)) { - int v = 7 - Integer.parseInt(tag.substring(1)); - h.put(ElementTags.SIZE, Integer.toString(v)); - } - cprops.addToChain(tag, h); - return; - } - if (tag.equals(HtmlTags.UNORDEREDLIST)) { - if (pendingLI) - endElement(HtmlTags.LISTITEM); - skipText = true; - cprops.addToChain(tag, h); - List list = new List(false); - try{ - list.setIndentationLeft(new Float(cprops.getProperty("indent")).floatValue()); - }catch (Exception e) { - list.setAutoindent(true); - } - list.setListSymbol("\u2022"); - stack.push(list); - return; - } - if (tag.equals(HtmlTags.ORDEREDLIST)) { - if (pendingLI) - endElement(HtmlTags.LISTITEM); - skipText = true; - cprops.addToChain(tag, h); - List list = new List(true); - try{ - list.setIndentationLeft(new 
Float(cprops.getProperty("indent")).floatValue()); - }catch (Exception e) { - list.setAutoindent(true); - } - stack.push(list); - return; - } - if (tag.equals(HtmlTags.LISTITEM)) { - if (pendingLI) - endElement(HtmlTags.LISTITEM); - skipText = false; - pendingLI = true; - cprops.addToChain(tag, h); - ListItem item = FactoryProperties.createListItem(cprops); - stack.push(item); - return; - } - if (tag.equals(HtmlTags.DIV) || tag.equals(HtmlTags.BODY) || tag.equals("p")) { - cprops.addToChain(tag, h); - return; - } - if (tag.equals(HtmlTags.PRE)) { - if (!h.containsKey(ElementTags.FACE)) { - h.put(ElementTags.FACE, "Courier"); - } - cprops.addToChain(tag, h); - isPRE = true; - return; - } - if (tag.equals("tr")) { - if (pendingTR) - endElement("tr"); - skipText = true; - pendingTR = true; - cprops.addToChain("tr", h); - return; - } - if (tag.equals("td") || tag.equals("th")) { - if (pendingTD) - endElement(tag); - skipText = false; - pendingTD = true; - cprops.addToChain("td", h); - stack.push(new IncCell(tag, cprops)); - return; - } - if (tag.equals("table")) { - cprops.addToChain("table", h); - IncTable table = new IncTable(h); - stack.push(table); - tableState.push(new boolean[] { pendingTR, pendingTD }); - pendingTR = pendingTD = false; - skipText = true; - return; - } - } catch (Exception e) { - throw new ExceptionConverter(e); - } - } - - - - public void endElement(String tag) { - if (!tagsSupported.containsKey(tag)) - return; - try { - String follow = (String) FactoryProperties.followTags.get(tag); - if (follow != null) { - cprops.removeChain(follow); - return; - } - if (tag.equals("font") || tag.equals("span")) { - cprops.removeChain(tag); - return; - } - if (tag.equals("a")) { - if (currentParagraph == null) { - currentParagraph = new Paragraph(); - } - boolean skip = false; - if (interfaceProps != null) { - ALink i = (ALink) interfaceProps.get("alink_interface"); - if (i != null) - skip = i.process(currentParagraph, cprops); - } - if (!skip) { - String href 
= cprops.getProperty("href"); - if (href != null) { - ArrayList chunks = currentParagraph.getChunks(); - int size = chunks.size(); - for (int k = 0; k < size; ++k) { - Chunk ck = (Chunk) chunks.get(k); - ck.setAnchor(href); - } - } - } - Paragraph tmp = (Paragraph) stack.pop(); - Phrase tmp2 = new Phrase(); - tmp2.add(currentParagraph); - tmp.add(tmp2); - currentParagraph = tmp; - cprops.removeChain("a"); - return; - } - if (tag.equals("br")) { - return; - } - if (currentParagraph != null) { - if (stack.empty()) - document.add(currentParagraph); - else { - Object obj = stack.pop(); - if (obj instanceof TextElementArray) { - TextElementArray current = (TextElementArray) obj; - current.add(currentParagraph); - } - stack.push(obj); - } - } - currentParagraph = null; - if (tag.equals(HtmlTags.UNORDEREDLIST) - || tag.equals(HtmlTags.ORDEREDLIST)) { - if (pendingLI) - endElement(HtmlTags.LISTITEM); - skipText = false; - cprops.removeChain(tag); - if (stack.empty()) - return; - Object obj = stack.pop(); - if (!(obj instanceof List)) { - stack.push(obj); - return; - } - if (stack.empty()) - document.add((Element) obj); - else - ((TextElementArray) stack.peek()).add(obj); - return; - } - if (tag.equals(HtmlTags.LISTITEM)) { - pendingLI = false; - skipText = true; - cprops.removeChain(tag); - if (stack.empty()) - return; - Object obj = stack.pop(); - if (!(obj instanceof ListItem)) { - stack.push(obj); - return; - } - if (stack.empty()) { - document.add((Element) obj); - return; - } - Object list = stack.pop(); - if (!(list instanceof List)) { - stack.push(list); - return; - } - ListItem item = (ListItem) obj; - ((List) list).add(item); - ArrayList cks = item.getChunks(); - if (!cks.isEmpty()) - item.getListSymbol() - .setFont(((Chunk) cks.get(0)).getFont()); - stack.push(list); - return; - } - if (tag.equals("div") || tag.equals("body")) { - cprops.removeChain(tag); - return; - } - if (tag.equals(HtmlTags.PRE)) { - cprops.removeChain(tag); - isPRE = false; - return; - } - 
if (tag.equals("p")) { - cprops.removeChain(tag); - return; - } - if (tag.equals("h1") || tag.equals("h2") || tag.equals("h3") - || tag.equals("h4") || tag.equals("h5") || tag.equals("h6")) { - cprops.removeChain(tag); - return; - } - if (tag.equals("table")) { - if (pendingTR) - endElement("tr"); - cprops.removeChain("table"); - IncTable table = (IncTable) stack.pop(); - PdfPTable tb = table.buildTable(); - tb.setSplitRows(true); - if (stack.empty()) - document.add(tb); - else - ((TextElementArray) stack.peek()).add(tb); - boolean state[] = (boolean[]) tableState.pop(); - pendingTR = state[0]; - pendingTD = state[1]; - skipText = false; - return; - } - if (tag.equals("tr")) { - if (pendingTD) - endElement("td"); - pendingTR = false; - String rowHeightPx = cprops.getLastChainProperty("height"); - - cprops.removeChain("tr"); - ArrayList cells = new ArrayList(); - IncTable table = null; - while (true) { - Object obj = stack.pop(); - if (obj instanceof IncCell) { - cells.add(((IncCell) obj).getCell()); - } - if (obj instanceof IncTable) { - table = (IncTable) obj; - break; - } - } - float rowHeight = 0.0f; - if(rowHeightPx!=null){ - rowHeight = CSSUtils.parseFloat(rowHeightPx); - } - table.addCols(cells); - table.endRow(rowHeight); - - stack.push(table); - skipText = true; - return; - } - if (tag.equals("td") || tag.equals("th")) { - pendingTD = false; - cprops.removeChain("td"); - skipText = true; - return; - } - } catch (Exception e) { - throw new ExceptionConverter(e); - } - } - - public void text(String str) { - if (skipText) - return; - String content = str; - if (isPRE) { - if (currentParagraph == null) { - currentParagraph = FactoryProperties.createParagraph(cprops); - } - Chunk chunk = factoryProperties.createChunk(content, cprops); - currentParagraph.add(chunk); - return; - } - if (content.trim().length() == 0 && content.indexOf(' ') < 0) { - return; - } - - StringBuffer buf = new StringBuffer(); - int len = content.length(); - char character; - boolean 
newline = false; - for (int i = 0; i < len; i++) { - switch (character = content.charAt(i)) { - case ' ': - if (!newline) { - buf.append(character); - } - break; - case '\n': - if (i > 0) { - newline = true; - buf.append(' '); - } - break; - case '\r': - break; - case '\t': - break; - default: - newline = false; - buf.append(character); - } - } - if (currentParagraph == null) { - currentParagraph = FactoryProperties.createParagraph(cprops); - } - Chunk chunk = factoryProperties.createChunk(buf.toString(), cprops); - currentParagraph.add(chunk); - } - - public boolean add(Element element) throws DocumentException { - objectList.add(element); - return true; - } - - public void clearTextWrap() throws DocumentException { - } - - public void close() { - } - - public boolean newPage() { - return true; - } - - public void open() { - } - - public void resetFooter() { - } - - public void resetHeader() { - } - - public void resetPageCount() { - } - - public void setFooter(HeaderFooter footer) { - } - - public void setHeader(HeaderFooter header) { - } - - public boolean setMarginMirroring(boolean marginMirroring) { - return false; - } - - /** - * @see DocListener#setMarginMirroring(boolean) - * @since 2.1.6 - */ - public boolean setMarginMirroringTopBottom(boolean marginMirroring) { - return false; - } - - public boolean setMargins(float marginLeft, float marginRight, - float marginTop, float marginBottom) { - return true; - } - - public void setPageCount(int pageN) { - } - - public boolean setPageSize(Rectangle pageSize) { - return true; - } - - public static final String tagsSupportedString = "ol ul li a pre font span br p div body table td th tr i b u sub sup em strong s strike" - + " h1 h2 h3 h4 h5 h6 img hr"; - - public static final HashMap tagsSupported = new HashMap(); - public static final HashMap tagsPrefixSupported = new HashMap(); - - static { - StringTokenizer tok = new StringTokenizer(tagsSupportedString); - while (tok.hasMoreTokens()) { - String s = 
tok.nextToken(); - tagsSupported.put(s, null); - tagsPrefixSupported.put(s.charAt(0), null); - } - } -} diff --git a/fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java b/fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java deleted file mode 100755 index d61de9b2e..000000000 --- a/fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java +++ /dev/null @@ -1,780 +0,0 @@ -/* - * Copyright 2003 Paulo Soares - * - * The contents of this file are subject to the Mozilla Public License Version 1.1 - * (the "License"); you may not use this file except in compliance with the License. - * You may obtain a copy of the License at http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" basis, - * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License - * for the specific language governing rights and limitations under the License. - * - * The Original Code is 'iText, a free JAVA-PDF library'. - * - * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by - * the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie. - * All Rights Reserved. - * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer - * are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved. - * - * Contributor(s): all the names of the contributors are added in the source code - * where applicable. - * - * Alternatively, the contents of this file may be used under the terms of the - * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the - * provisions of LGPL are applicable instead of those above. 
If you wish to - * allow use of your version of this file only under the terms of the LGPL - * License and not to allow others to use your version of this file under - * the MPL, indicate your decision by deleting the provisions above and - * replace them with the notice and other provisions required by the LGPL. - * If you do not delete the provisions above, a recipient may use your version - * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE. - * - * This library is free software; you can redistribute it and/or modify it - * under the terms of the MPL as stated above or under the terms of the GNU - * Library General Public License as published by the Free Software Foundation; - * either version 2 of the License, or any later version. - * - * This library is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more - * details. - * - * If you didn't download this code from the following link, you should check if - * you aren't using an obsolete version: - * http://www.lowagie.com/iText/ - * - * The code to recognize the encoding in this class and in the convenience class IanaEncodings was taken from Apache Xerces published under the following license: - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * Part of this code is based on the Quick-and-Dirty XML parser by Steven Brandt. - * The code for the Quick-and-Dirty parser was published in JavaWorld (java tip 128). - * Steven Brandt and JavaWorld gave permission to use the code for free. - * (Bruno Lowagie and Paulo Soares chose to use it under the MPL/LGPL in - * conformance with the rest of the code). - * The original code can be found on this url: http://www.javaworld.com/javatips/jw-javatip128_p.html. - * It was substantially refactored by Bruno Lowagie. - * - * The method 'private static String getEncodingName(byte[] b4)' was found - * in org.apache.xerces.impl.XMLEntityManager, originaly published by the - * Apache Software Foundation under the Apache Software License; now being - * used in iText under the MPL. - */ -package com.fr.third.com.lowagie.text.xml.simpleparser; - -import com.fr.third.com.lowagie.text.html.simpleparser.HTMLWorker; -import java.io.BufferedReader; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.Reader; -import java.util.HashMap; -import java.util.Stack; - -/** - * A simple XML and HTML parser. This parser is, like the SAX parser, - * an event based parser, but with much less functionality. - *

- * The parser can: - *

- *

    - *
  • It recognizes the encoding used - *
  • It recognizes all the elements' start tags and end tags - *
  • It lists attributes, where attribute values can be enclosed in single or double quotes - *
  • It recognizes the <![CDATA[ ... ]]> construct - *
  • It recognizes the standard entities: &amp;, &lt;, &gt;, &quot;, and &apos;, as well as numeric entities - *
  • It maps lines ending in \r\n and \r to \n on input, in accordance with the XML Specification, Section 2.11 - *
- *

- */ -public final class SimpleXMLParser { - /** possible states */ - private final static int UNKNOWN = 0; - private final static int TEXT = 1; - private final static int TAG_ENCOUNTERED = 2; - private final static int EXAMIN_TAG = 3; - private final static int TAG_EXAMINED = 4; - private final static int IN_CLOSETAG = 5; - private final static int SINGLE_TAG = 6; - private final static int CDATA = 7; - private final static int COMMENT = 8; - private final static int PI = 9; - private final static int ENTITY = 10; - private final static int QUOTE = 11; - private final static int ATTRIBUTE_KEY = 12; - private final static int ATTRIBUTE_EQUAL = 13; - private final static int ATTRIBUTE_VALUE = 14; - - /** the state stack */ - Stack stack; - /** The current character. */ - int character = 0; - /** The previous character. */ - int previousCharacter = -1; - /** the line we are currently reading */ - int lines = 1; - /** the column where the current character occurs */ - int columns = 0; - /** was the last character equivalent to a newline? */ - boolean eol = false; - /** - * A boolean indicating if the next character should be taken into account - * if it's a space character. When nospace is false, the previous character - * wasn't whitespace. - * @since 2.1.5 - */ - boolean nowhite = false; - /** the current state */ - int state; - /** Are we parsing HTML? */ - boolean html; - /** current text (whatever is encountered between tags) */ - StringBuffer text = new StringBuffer(); - /** current entity (whatever is encountered between & and ;) */ - StringBuffer entity = new StringBuffer(); - /** current tagname */ - String tag = null; - /** current attributes */ - HashMap attributes = null; - /** The handler to which we are going to forward document content */ - SimpleXMLDocHandler doc; - /** The handler to which we are going to forward comments. */ - SimpleXMLDocHandlerComment comment; - /** Keeps track of the number of tags that are open. 
*/ - int nested = 0; - /** the quote character that was used to open the quote. */ - int quoteCharacter = '"'; - /** the attribute key. */ - String attributekey = null; - /** the attribute value. */ - String attributevalue = null; - - /** - * Creates a Simple XML parser object. - * Call go(BufferedReader) immediately after creation. - */ - private SimpleXMLParser(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, boolean html) { - this.doc = doc; - this.comment = comment; - this.html = html; - stack = new Stack(); - state = html ? TEXT : UNKNOWN; - } - - /** - * Does the actual parsing. Perform this immediately - * after creating the parser object. - */ - private void go(Reader r) throws IOException { - BufferedReader reader; - if (r instanceof BufferedReader) - reader = (BufferedReader)r; - else - reader = new BufferedReader(r); - doc.startDocument(); - while(true) { - // read a new character - if (previousCharacter == -1) { - character = reader.read(); - } - // or re-examine the previous character - else { - character = previousCharacter; - previousCharacter = -1; - } - - // the end of the file was reached - if (character == -1) { - if (html) { - if (html && state == TEXT) - flush(); - doc.endDocument(); - } else { - throwException("Missing end tag"); - } - return; - } - - // dealing with \n and \r - if (character == '\n' && eol) { - eol = false; - continue; - } else if (eol) { - eol = false; - } else if (character == '\n') { - lines++; - columns = 0; - } else if (character == '\r') { - eol = true; - character = '\n'; - lines++; - columns = 0; - } else { - columns++; - } - - switch(state) { - // we are in an unknown state before there's actual content - case UNKNOWN: - if(character == '<') { - beginnOfTag((char) reader.read(), UNKNOWN); - } - break; - // we can encounter any content - case TEXT: - if(character == '<') { - beginnOfTag((char) reader.read(), TEXT); - } else if(character == '&') { - saveState(state); - entity.setLength(0); - state = ENTITY; 
- } else if (Character.isWhitespace((char)character) && character != 12288) { - if (nowhite) - text.append((char)character); - nowhite = false; - } else { - text.append((char)character); - nowhite = true; - } - break; - // we have just seen a < and are wondering what we are looking at - // , , , etc. - case TAG_ENCOUNTERED: - initTag(); - if(character == '/') { - state = IN_CLOSETAG; - } else if (character == '?') { - restoreState(); - state = PI; - } else { - text.append((char)character); - state = EXAMIN_TAG; - } - break; - // we are processing something like this . - // It could still be a or something. - case EXAMIN_TAG: - if(character == '>') { - doTag(); - processTag(true); - initTag(); - state = restoreState(); - } else if(character == '/') { - state = SINGLE_TAG; - } else if(character == '-' && text.toString().equals("!-")) { - flush(); - state = COMMENT; - } else if(character == '[' && text.toString().equals("![CDATA")) { - flush(); - state = CDATA; - } else if(character == 'E' && text.toString().equals("!DOCTYP")) { - flush(); - state = PI; - } else if(Character.isWhitespace((char)character)) { - doTag(); - state = TAG_EXAMINED; - } else { - text.append((char)character); - } - break; - // we know the name of the tag now. - case TAG_EXAMINED: - if(character == '>') { - processTag(true); - initTag(); - state = restoreState(); - } else if(character == '/') { - state = SINGLE_TAG; - } else if(Character.isWhitespace((char)character)) { - // empty - } else { - text.append((char)character); - state = ATTRIBUTE_KEY; - } - break; - - // we are processing a closing tag: e.g. - case IN_CLOSETAG: - if(character == '>') { - doTag(); - processTag(false); - if(!html && nested==0) return; - state = restoreState(); - } else { - if (!Character.isWhitespace((char)character)) - text.append((char)character); - } - break; - - // we have just seen something like this: . 
- case SINGLE_TAG: - if(character != '>') - throwException("Expected > for tag: <"+tag+"/>"); - doTag(); - processTag(true); - processTag(false); - initTag(); - if(!html && nested==0) { - doc.endDocument(); - return; - } - state = restoreState(); - break; - - // we are processing CDATA - case CDATA: - if(character == '>' - && text.toString().endsWith("]]")) { - text.setLength(text.length()-2); - flush(); - state = restoreState(); - } else - text.append((char)character); - break; - - // we are processing a comment. We are inside - // the looking for the -->. - case COMMENT: - if(character == '>' - && text.toString().endsWith("--")) { - text.setLength(text.length() - 2); - flush(); - state = restoreState(); - } else - text.append((char)character); - break; - - // We are inside one of these or one of these - case PI: - if(character == '>') { - state = restoreState(); - if(state == TEXT) state = UNKNOWN; - } - break; - - // we are processing an entity, e.g. <, », etc. - case ENTITY: - if(character == ';') { - state = restoreState(); - String cent = entity.toString(); - entity.setLength(0); - char ce = EntitiesToUnicode.decodeEntity(cent); - if (ce == '\0') - text.append('&').append(cent).append(';'); - else - text.append(ce); - } else if ((character != '#' && (character < '0' || character > '9') && (character < 'a' || character > 'z') - && (character < 'A' || character > 'Z')) || entity.length() >= 7) { - state = restoreState(); - previousCharacter = character; - text.append('&').append(entity.toString()); - entity.setLength(0); - } - else { - entity.append((char)character); - } - break; - // We are processing the quoted right-hand side of an element's attribute. 
- case QUOTE: - if (html && quoteCharacter == ' ' && character == '>') { - flush(); - processTag(true); - initTag(); - state = restoreState(); - } - else if (html && quoteCharacter == ' ' && Character.isWhitespace((char)character)) { - flush(); - state = TAG_EXAMINED; - } - else if (html && quoteCharacter == ' ') { - text.append((char)character); - } - else if(character == quoteCharacter) { - flush(); - state = TAG_EXAMINED; - } else if(" \r\n\u0009".indexOf(character)>=0) { - text.append(' '); - } else if(character == '&') { - saveState(state); - state = ENTITY; - entity.setLength(0); - } else { - text.append((char)character); - } - break; - - case ATTRIBUTE_KEY: - if(Character.isWhitespace((char)character)) { - flush(); - state = ATTRIBUTE_EQUAL; - } else if(character == '=') { - flush(); - state = ATTRIBUTE_VALUE; - } else if (html && character == '>') { - text.setLength(0); - processTag(true); - initTag(); - state = restoreState(); - } else { - text.append((char)character); - } - break; - - case ATTRIBUTE_EQUAL: - if(character == '=') { - state = ATTRIBUTE_VALUE; - } else if(Character.isWhitespace((char)character)) { - // empty - } else if (html && character == '>') { - text.setLength(0); - processTag(true); - initTag(); - state = restoreState(); - } else if (html && character == '/') { - flush(); - state = SINGLE_TAG; - } else if (html) { - flush(); - text.append((char)character); - state = ATTRIBUTE_KEY; - } else { - throwException("Error in attribute processing."); - } - break; - - case ATTRIBUTE_VALUE: - if(character == '"' || character == '\'') { - quoteCharacter = character; - state = QUOTE; - } else if(Character.isWhitespace((char)character)) { - // empty - } else if (html && character == '>') { - flush(); - processTag(true); - initTag(); - state = restoreState(); - } else if (html) { - text.append((char)character); - quoteCharacter = ' '; - state = QUOTE; - } else { - throwException("Error in attribute processing"); - } - break; - } - } - } - - /** - * 
Gets a state from the stack - * @return the previous state - */ - private int restoreState() { - if(!stack.empty()) - return ((Integer)stack.pop()).intValue(); - else - return UNKNOWN; - } - /** - * Adds a state to the stack. - * @param s a state to add to the stack - */ - private void saveState(int s) { - stack.push(new Integer(s)); - } - - /** - * 处理标签的开头,若不在支持标签范围内,将<符号作为文本处理,例:<1111 (仿造浏览器的处理方式) - */ - public void beginnOfTag(char c, int type) { - previousCharacter = c; - if (c == -1) { - return; - } - if (c == '/' || HTMLWorker.tagsPrefixSupported.containsKey(c)) { - if (type == TEXT) { - flush(); - } - saveState(TEXT); - state = TAG_ENCOUNTERED; - return; - } - text.append((char) character); - nowhite = true; - } - - /** - * Flushes the text that is currently in the buffer. - * The text can be ignored, added to the document - * as content or as comment,... depending on the current state. - */ - private void flush() { - switch(state){ - case TEXT: - case CDATA: - if(text.length() > 0) { - doc.text(text.toString()); - } - break; - case COMMENT: - if (comment != null) { - comment.comment(text.toString()); - } - break; - case ATTRIBUTE_KEY: - attributekey = text.toString(); - if (html) - attributekey = attributekey.toLowerCase(); - break; - case QUOTE: - case ATTRIBUTE_VALUE: - attributevalue = text.toString(); - attributes.put(attributekey,attributevalue); - break; - default: - // do nothing - } - text.setLength(0); - } - /** - * Initialized the tag name and attributes. - */ - private void initTag() { - tag = null; - attributes = new HashMap(); - } - /** Sets the name of the tag. */ - private void doTag() { - if(tag == null) - tag = text.toString(); - if (html) - tag = tag.toLowerCase(); - text.setLength(0); - } - /** - * processes the tag. - * @param start if true we are dealing with a tag that has just been opened; if false we are closing a tag. 
- */ - private void processTag(boolean start) { - if (start) { - nested++; - doc.startElement(tag,attributes); - } - else { - nested--; - doc.endElement(tag); - } - } - /** Throws an exception */ - private void throwException(String s) throws IOException { - throw new IOException(s+" near line " + lines + ", column " + columns); - } - - /** - * Parses the XML document firing the events to the handler. - * @param doc the document handler - * @param r the document. The encoding is already resolved. The reader is not closed - * @throws IOException on error - */ - public static void parse(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, Reader r, boolean html) throws IOException { - SimpleXMLParser parser = new SimpleXMLParser(doc, comment, html); - parser.go(r); - } - - /** - * Parses the XML document firing the events to the handler. - * @param doc the document handler - * @param in the document. The encoding is deduced from the stream. The stream is not closed - * @throws IOException on error - */ - public static void parse(SimpleXMLDocHandler doc, InputStream in) throws IOException { - byte b4[] = new byte[4]; - int count = in.read(b4); - if (count != 4) - throw new IOException("Insufficient length."); - String encoding = getEncodingName(b4); - String decl = null; - if (encoding.equals("UTF-8")) { - StringBuffer sb = new StringBuffer(); - int c; - while ((c = in.read()) != -1) { - if (c == '>') - break; - sb.append((char)c); - } - decl = sb.toString(); - } - else if (encoding.equals("CP037")) { - ByteArrayOutputStream bi = new ByteArrayOutputStream(); - int c; - while ((c = in.read()) != -1) { - if (c == 0x6e) // that's '>' in ebcdic - break; - bi.write(c); - } - decl = new String(bi.toByteArray(), "CP037"); - } - if (decl != null) { - decl = getDeclaredEncoding(decl); - if (decl != null) - encoding = decl; - } - parse(doc, new InputStreamReader(in, IanaEncodings.getJavaEncoding(encoding))); - } - - private static String getDeclaredEncoding(String decl) 
{ - if (decl == null) - return null; - int idx = decl.indexOf("encoding"); - if (idx < 0) - return null; - int idx1 = decl.indexOf('"', idx); - int idx2 = decl.indexOf('\'', idx); - if (idx1 == idx2) - return null; - if ((idx1 < 0 && idx2 > 0) || (idx2 > 0 && idx2 < idx1)) { - int idx3 = decl.indexOf('\'', idx2 + 1); - if (idx3 < 0) - return null; - return decl.substring(idx2 + 1, idx3); - } - if ((idx2 < 0 && idx1 > 0) || (idx1 > 0 && idx1 < idx2)) { - int idx3 = decl.indexOf('"', idx1 + 1); - if (idx3 < 0) - return null; - return decl.substring(idx1 + 1, idx3); - } - return null; - } - - public static void parse(SimpleXMLDocHandler doc,Reader r) throws IOException { - parse(doc, null, r, false); - } - - /** - * Escapes a string with the appropriated XML codes. - * @param s the string to be escaped - * @param onlyASCII codes above 127 will always be escaped with &#nn; if true - * @return the escaped string - */ - public static String escapeXML(String s, boolean onlyASCII) { - char cc[] = s.toCharArray(); - int len = cc.length; - StringBuffer sb = new StringBuffer(); - for (int k = 0; k < len; ++k) { - int c = cc[k]; - switch (c) { - case '<': - sb.append("<"); - break; - case '>': - sb.append(">"); - break; - case '&': - sb.append("&"); - break; - case '"': - sb.append("""); - break; - case '\'': - sb.append("'"); - break; - default: - if ((c == 0x9) || (c == 0xA) || (c == 0xD) - || ((c >= 0x20) && (c <= 0xD7FF)) - || ((c >= 0xE000) && (c <= 0xFFFD)) - || ((c >= 0x10000) && (c <= 0x10FFFF))) { - if (onlyASCII && c > 127) - sb.append("&#").append(c).append(';'); - else - sb.append((char)c); - } - } - } - return sb.toString(); - } - /** - * Returns the IANA encoding name that is auto-detected from - * the bytes specified, with the endian-ness of that encoding where appropriate. 
- * (method found in org.apache.xerces.impl.XMLEntityManager, originally published - * by the Apache Software Foundation under the Apache Software License; now being - * used in iText under the MPL) - * @param b4 The first four bytes of the input. - * @return an IANA-encoding string - */ - private static String getEncodingName(byte[] b4) { - - // UTF-16, with BOM - int b0 = b4[0] & 0xFF; - int b1 = b4[1] & 0xFF; - if (b0 == 0xFE && b1 == 0xFF) { - // UTF-16, big-endian - return "UTF-16BE"; - } - if (b0 == 0xFF && b1 == 0xFE) { - // UTF-16, little-endian - return "UTF-16LE"; - } - - // UTF-8 with a BOM - int b2 = b4[2] & 0xFF; - if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { - return "UTF-8"; - } - - // other encodings - int b3 = b4[3] & 0xFF; - if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) { - // UCS-4, big endian (1234) - return "ISO-10646-UCS-4"; - } - if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) { - // UCS-4, little endian (4321) - return "ISO-10646-UCS-4"; - } - if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) { - // UCS-4, unusual octet order (2143) - // REVISIT: What should this be? - return "ISO-10646-UCS-4"; - } - if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) { - // UCS-4, unusual octet order (3412) - // REVISIT: What should this be? - return "ISO-10646-UCS-4"; - } - if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { - // UTF-16, big-endian, no BOM - // (or could turn out to be UCS-2... - // REVISIT: What should this be? - return "UTF-16BE"; - } - if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { - // UTF-16, little-endian, no BOM - // (or could turn out to be UCS-2... 
- return "UTF-16LE"; - } - if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) { - // EBCDIC - // a la xerces1, return CP037 instead of EBCDIC here - return "CP037"; - } - - // default encoding - return "UTF-8"; - } -} \ No newline at end of file diff --git a/fine-itext-old/src/main/java/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java b/fine-itext-old/src/main/java/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java index b48594a17..11e918722 100644 --- a/fine-itext-old/src/main/java/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java +++ b/fine-itext-old/src/main/java/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java @@ -775,11 +775,14 @@ public class HTMLWorker implements SimpleXMLDocHandler, DocListener { + " h1 h2 h3 h4 h5 h6 img hr"; public static final HashMap tagsSupported = new HashMap(); + public static final HashMap tagsPrefixSupported = new HashMap(); static { StringTokenizer tok = new StringTokenizer(tagsSupportedString); - while (tok.hasMoreTokens()) - tagsSupported.put(tok.nextToken(), null); + while (tok.hasMoreTokens()) { + String s = tok.nextToken(); + tagsSupported.put(s, null); + tagsPrefixSupported.put(s.charAt(0), null); + } } - } diff --git a/fine-itext-old/src/main/java/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java b/fine-itext-old/src/main/java/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java index 0ef41865b..e2873f902 100644 --- a/fine-itext-old/src/main/java/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java +++ b/fine-itext-old/src/main/java/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java @@ -75,6 +75,7 @@ */ package com.fr.third.com.lowagie.text.xml.simpleparser; +import com.fr.third.com.lowagie.text.html.simpleparser.HTMLWorker; import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.IOException; @@ -117,7 +118,7 @@ public final class SimpleXMLParser { private final static 
int ATTRIBUTE_KEY = 12; private final static int ATTRIBUTE_EQUAL = 13; private final static int ATTRIBUTE_VALUE = 14; - + /** the state stack */ Stack stack; /** The current character. */ @@ -161,7 +162,7 @@ public final class SimpleXMLParser { String attributekey = null; /** the attribute value. */ String attributevalue = null; - + /** * Creates a Simple XML parser object. * Call go(BufferedReader) immediately after creation. @@ -207,7 +208,7 @@ public final class SimpleXMLParser { } return; } - + // dealing with \n and \r if (character == '\n' && eol) { eol = false; @@ -225,21 +226,18 @@ public final class SimpleXMLParser { } else { columns++; } - + switch(state) { // we are in an unknown state before there's actual content case UNKNOWN: if(character == '<') { - saveState(TEXT); - state = TAG_ENCOUNTERED; + beginnOfTag((char) reader.read(), UNKNOWN); } break; // we can encounter any content case TEXT: if(character == '<') { - flush(); - saveState(state); - state = TAG_ENCOUNTERED; + beginnOfTag((char) reader.read(), TEXT); } else if(character == '&') { saveState(state); entity.setLength(0); @@ -499,6 +497,27 @@ public final class SimpleXMLParser { private void saveState(int s) { stack.push(new Integer(s)); } + + /** + * 处理标签的开头,若不在支持标签范围内,将<符号作为文本处理,例:<1111 (仿造浏览器的处理方式) + */ + public void beginnOfTag(char c, int type) { + previousCharacter = c; + if (c == -1) { + return; + } + if (c == '/' || HTMLWorker.tagsPrefixSupported.containsKey(c)) { + if (type == TEXT) { + flush(); + } + saveState(TEXT); + state = TAG_ENCOUNTERED; + return; + } + text.append((char) character); + nowhite = true; + } + /** * Flushes the text that is currently in the buffer. * The text can be ignored, added to the document