From 2b0774489afbf24f55859cfb95b40dcea1511909 Mon Sep 17 00:00:00 2001 From: "Hugh.C" Date: Fri, 15 May 2020 13:48:37 +0800 Subject: [PATCH] =?UTF-8?q?REPORT-31492=20word=E5=AF=BC=E5=87=BA=E4=B8=ADH?= =?UTF-8?q?tml=E6=97=A0=E6=B3=95=E8=A7=A3=E6=9E=90=E5=B0=8F=E4=BA=8E?= =?UTF-8?q?=E5=8F=B7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../text/html/simpleparser/HTMLWorker.java | 9 ++++-- .../xml/simpleparser/SimpleXMLParser.java | 29 +++++++++++++++---- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/fine-itext-old/src/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java b/fine-itext-old/src/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java index f1ea48571..adb73e007 100644 --- a/fine-itext-old/src/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java +++ b/fine-itext-old/src/com/fr/third/com/lowagie/text/html/simpleparser/HTMLWorker.java @@ -775,11 +775,14 @@ public class HTMLWorker implements SimpleXMLDocHandler, DocListener { + " h1 h2 h3 h4 h5 h6 img hr"; public static final HashMap tagsSupported = new HashMap(); + public static final HashMap tagsPrefixSupported = new HashMap(); static { StringTokenizer tok = new StringTokenizer(tagsSupportedString); - while (tok.hasMoreTokens()) - tagsSupported.put(tok.nextToken(), null); + while (tok.hasMoreTokens()) { + String s = tok.nextToken(); + tagsSupported.put(s, null); + tagsPrefixSupported.put(s.charAt(0), null); + } } - } diff --git a/fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java b/fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java index fe9c80e03..0d7209ece 100755 --- a/fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java +++ b/fine-itext-old/src/com/fr/third/com/lowagie/text/xml/simpleparser/SimpleXMLParser.java @@ -75,6 +75,7 @@ */ package com.fr.third.com.lowagie.text.xml.simpleparser; +import 
com.fr.third.com.lowagie.text.html.simpleparser.HTMLWorker;
 import java.io.BufferedReader;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
@@ -230,16 +231,13 @@ public final class SimpleXMLParser {
                 // we are in an unknown state before there's actual content
                 case UNKNOWN:
                     if(character == '<') {
-                        saveState(TEXT);
-                        state = TAG_ENCOUNTERED;
+                        beginnOfTag((char) reader.read(), UNKNOWN);
                     }
                     break;
                 // we can encounter any content
                 case TEXT:
                     if(character == '<') {
-                        flush();
-                        saveState(state);
-                        state = TAG_ENCOUNTERED;
+                        beginnOfTag((char) reader.read(), TEXT);
                     } else if(character == '&') {
                         saveState(state);
                         entity.setLength(0);
@@ -499,6 +497,58 @@ public final class SimpleXMLParser {
     private void saveState(int s) {
         stack.push(new Integer(s));
     }
+
+    /**
+     * Handles the character that follows a {@code '<'} and decides whether the
+     * {@code '<'} opens markup or is literal text (imitating how browsers treat
+     * input such as {@code <1111}).
+     * <p>
+     * Markup starts with {@code '/'}, {@code '!'}, {@code '?'} or, ignoring case,
+     * the first letter of a tag supported by {@link HTMLWorker}. In that case any
+     * pending text is flushed when coming from {@code TEXT}, {@code TEXT} is
+     * saved as the state to return to, and the parser enters
+     * {@code TAG_ENCOUNTERED}. Otherwise the {@code '<'} is appended to the text
+     * buffer. Unless the input is exhausted, {@code c} is pushed back so the
+     * main loop examines it next.
+     * <p>
+     * NOTE(review): the HTML tag table is consulted unconditionally; if this
+     * parser is also fed non-HTML XML, tags with other initial letters would be
+     * treated as text. Confirm against the callers.
+     *
+     * @param c    the character read after the {@code '<'}; callers pass
+     *             {@code (char) reader.read()}, so end of input arrives as U+FFFF
+     * @param type the parser state in which the {@code '<'} was seen
+     *             ({@code UNKNOWN} or {@code TEXT})
+     */
+    public void beginnOfTag(char c, int type) {
+        // A char can never equal the int -1: the caller's cast turns the
+        // end-of-input marker into U+FFFF, so compare against that instead.
+        if (c == (char) -1) {
+            // Nothing follows the '<'. Keep it as text and push nothing back,
+            // so the end-of-input marker is not re-examined as a character.
+            text.append((char) character);
+            nowhite = true;
+            return;
+        }
+        // Let the main loop re-examine the look-ahead character.
+        previousCharacter = c;
+        Character prefix = Character.valueOf(Character.toLowerCase(c));
+        // '!' and '?' keep constructs such as <!-- and <? on the tag path they
+        // took before this check existed; the prefix lookup ignores case
+        // because the supported tag names are written in lower case.
+        if (c == '/' || c == '!' || c == '?'
+                || HTMLWorker.tagsPrefixSupported.containsKey(prefix)) {
+            if (type == TEXT) {
+                flush();
+            }
+            saveState(TEXT);
+            state = TAG_ENCOUNTERED;
+            return;
+        }
+        text.append((char) character);
+        nowhite = true;
+    }
+
     /**
      * Flushes the text that is currently in the buffer.
      * The text can be ignored, added to the document