From 4f71896be84d6e4264e5a03c94e5bea252180598 Mon Sep 17 00:00:00 2001 From: "Hugh.C" Date: Tue, 17 Dec 2019 13:52:37 +0800 Subject: [PATCH] =?UTF-8?q?REPORT-25253=20Html=E6=97=A0=E6=B3=95=E8=A7=A3?= =?UTF-8?q?=E6=9E=90=E5=B0=8F=E4=BA=8E=E5=8F=B7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../text/html/simpleparser/HTMLWorker.java | 9 ++++-- .../xml/simpleparser/SimpleXMLParser.java | 30 +++++++++++++++---- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/fine-itext/src/com/fr/third/v2/lowagie/text/html/simpleparser/HTMLWorker.java b/fine-itext/src/com/fr/third/v2/lowagie/text/html/simpleparser/HTMLWorker.java index ade85f5cf..968747ecf 100644 --- a/fine-itext/src/com/fr/third/v2/lowagie/text/html/simpleparser/HTMLWorker.java +++ b/fine-itext/src/com/fr/third/v2/lowagie/text/html/simpleparser/HTMLWorker.java @@ -804,10 +804,15 @@ public class HTMLWorker implements SimpleXMLDocHandler, DocListener { public static final HashMap tagsSupported = new HashMap(); + public static final HashMap tagsPrefixSupported = new HashMap(); + static { StringTokenizer tok = new StringTokenizer(tagsSupportedString); - while (tok.hasMoreTokens()) - tagsSupported.put(tok.nextToken(), null); + while (tok.hasMoreTokens()){ + String s = tok.nextToken(); + tagsSupported.put(s, null); + tagsPrefixSupported.put(s.charAt(0), null); + } } } diff --git a/fine-itext/src/com/fr/third/v2/lowagie/text/xml/simpleparser/SimpleXMLParser.java b/fine-itext/src/com/fr/third/v2/lowagie/text/xml/simpleparser/SimpleXMLParser.java index 2a15f56ff..0642a11e7 100644 --- a/fine-itext/src/com/fr/third/v2/lowagie/text/xml/simpleparser/SimpleXMLParser.java +++ b/fine-itext/src/com/fr/third/v2/lowagie/text/xml/simpleparser/SimpleXMLParser.java @@ -75,6 +75,8 @@ */ package com.fr.third.v2.lowagie.text.xml.simpleparser; +import com.fr.third.v2.lowagie.text.html.simpleparser.HTMLWorker; + import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.IOException; @@ -229,16 +231,13 @@ public final class SimpleXMLParser { // we are in an unknown state before there's actual content case UNKNOWN: if(character == '<') { - saveState(TEXT); - state = TAG_ENCOUNTERED; + beginnOfTag((char) reader.read(), UNKNOWN); } break; // we can encounter any content case TEXT: if(character == '<') { - flush(); - saveState(state); - state = TAG_ENCOUNTERED; + beginnOfTag((char) reader.read(), TEXT); } else if(character == '&') { saveState(state); entity.setLength(0); @@ -481,6 +480,27 @@ public final class SimpleXMLParser { } } + + /** + * 处理标签的开头,若不在支持标签范围内,将<符号作为文本处理,例:<1111 (仿造浏览器的处理方式) + */ + public void beginnOfTag(char c, int type) { + previousCharacter = c; + if (c == -1) { + return; + } + if (c == '/' || HTMLWorker.tagsPrefixSupported.containsKey(c)) { + if (type == TEXT) { + flush(); + } + saveState(TEXT); + state = TAG_ENCOUNTERED; + return; + } + text.append((char) character); + nowhite = true; + } + /** * Gets a state from the stack * @return the previous state