From 7255663d3b2d93d62c9482e7eda7555c11358bec Mon Sep 17 00:00:00 2001
From: "Hugh.C" <Hugh.C@fanruan.com>
Date: Tue, 11 Feb 2020 11:51:14 +0800
Subject: [PATCH] =?UTF-8?q?REPORT-14598=20pdf=20HTML=E5=AF=BC=E5=87=BA?=
 =?UTF-8?q?=E6=8D=A2=E8=A1=8C=E4=B8=8E=E6=B5=8F=E8=A7=88=E5=99=A8=E4=B8=8D?=
 =?UTF-8?q?=E4=B8=80=E8=87=B4=E9=97=AE=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../SpaceWithPunctuationBreakIterator.java    | 30 +++++++-
 .../text/html/simpleparser/HTMLWorker.java    |  5 ++
 .../third/v2/lowagie/text/pdf/PdfChunk.java   | 40 +++++++---
 .../fr/third/v2/lowagie/text/pdf/PdfFont.java | 74 ++++++++++++++-----
 4 files changed, 121 insertions(+), 28 deletions(-)

diff --git a/fine-itext/src/com/fr/third/v2/lowagie/text/html/SpaceWithPunctuationBreakIterator.java b/fine-itext/src/com/fr/third/v2/lowagie/text/html/SpaceWithPunctuationBreakIterator.java
index 94322266b..1d4c7b035 100644
--- a/fine-itext/src/com/fr/third/v2/lowagie/text/html/SpaceWithPunctuationBreakIterator.java
+++ b/fine-itext/src/com/fr/third/v2/lowagie/text/html/SpaceWithPunctuationBreakIterator.java
@@ -13,16 +13,23 @@ public class SpaceWithPunctuationBreakIterator extends BreakIterator {
     private int currentPos =  -1;
     private int currentIndex = -1;
     private boolean[] spaceIndex;
+    // Marks the positions that must not be used as break points
+    private boolean[] noSwitchIndex;
 
     public SpaceWithPunctuationBreakIterator(String text, BreakIterator iterator){
         this.iterator = iterator;
         iterator.setText(text);
         this.spaceIndex = new boolean[text.length()];
+        this.noSwitchIndex = new boolean[text.length()];
         int ilen = text.length() - 1;
         if(ilen > 0) {
             for (int i = 0; i < ilen; i++) {
                 char c = text.charAt(i);
-                spaceIndex[i + 1] = (c == ' ' && isPunctuation(text.charAt(i + 1)) )|| c == '-' || c == '\u2010' || c== '\n';
+                // A line may be broken directly after any Chinese punctuation mark
+                spaceIndex[i + 1] = (c == ' ' && isPunctuation(text.charAt(i + 1)) )|| c == '-' || c == '\u2010' || c== '\n'|| isChinesePunctuation( c);
+                // The break is only suppressed when the next character is not Chinese; if the next character is Chinese, a line break is allowed
+                char nextC = text.charAt(i+1);
+                noSwitchIndex[i + 1] = (c=='/' || c == '.' || c == ':' || c == ';') && !isChinese(nextC);
             }
         }
     }
@@ -32,6 +39,23 @@ public class SpaceWithPunctuationBreakIterator extends BreakIterator {
         return code == 24 || code == 20 || code == 21 || code == 22 || code == 23;
     }
 
+    public boolean isChinese(char c){ // true if c lies in the CJK Unified Ideographs block
+        return c >= 0x4E00 && c <= 0x9FFF; // the block ends at U+9FFF; the former bound 0x9FBF cut off its last 64 code points
+    }
+
+    // Decides via Character.UnicodeBlock whether c counts as a Chinese punctuation mark
+    public boolean isChinesePunctuation(char c) {
+        Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
+        if (block == Character.UnicodeBlock.GENERAL_PUNCTUATION) {
+            return true;
+        }
+        if (block == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION) {
+            return true;
+        }
+        return block == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
+                || block == Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS;
+    }
+
     public int first() {
        throw new UnsupportedOperationException();
     }
@@ -48,6 +72,10 @@ public class SpaceWithPunctuationBreakIterator extends BreakIterator {
         if(currentIndex == currentPos) {
             currentPos = this.iterator.next();
         }
+        if (currentPos > -1 && currentPos < noSwitchIndex.length && noSwitchIndex[currentPos]) {
+            currentIndex = currentPos;
+            return this.next();
+        }
         for(int i = currentIndex + 1; i < currentPos; i++){
             if(spaceIndex[i]){
                 currentIndex = i;
diff --git a/fine-itext/src/com/fr/third/v2/lowagie/text/html/simpleparser/HTMLWorker.java b/fine-itext/src/com/fr/third/v2/lowagie/text/html/simpleparser/HTMLWorker.java
index 7e9254e71..245155736 100644
--- a/fine-itext/src/com/fr/third/v2/lowagie/text/html/simpleparser/HTMLWorker.java
+++ b/fine-itext/src/com/fr/third/v2/lowagie/text/html/simpleparser/HTMLWorker.java
@@ -50,6 +50,7 @@
 
 package com.fr.third.v2.lowagie.text.html.simpleparser;
 
+import com.fr.third.v2.lowagie.text.pdf.PdfFont;
 import java.awt.image.BufferedImage;
 import java.io.ByteArrayInputStream;
 import java.io.File;
@@ -159,6 +160,10 @@ public class HTMLWorker implements SimpleXMLDocHandler, DocListener {
 		return parseToList(reader, style, null);
 	}
 
+	public static void initDefaultFont(String fontName) { // forwards the default font name to PdfFont.initDefaultFont
+		PdfFont.initDefaultFont(fontName);
+	}
+
 	public static ArrayList parseToList(Reader reader, StyleSheet style,
 			HashMap interfaceProps) throws IOException {
 		HTMLWorker worker = new HTMLWorker(null);
diff --git a/fine-itext/src/com/fr/third/v2/lowagie/text/pdf/PdfChunk.java b/fine-itext/src/com/fr/third/v2/lowagie/text/pdf/PdfChunk.java
index 8d014cd80..8d12c2a14 100644
--- a/fine-itext/src/com/fr/third/v2/lowagie/text/pdf/PdfChunk.java
+++ b/fine-itext/src/com/fr/third/v2/lowagie/text/pdf/PdfChunk.java
@@ -53,6 +53,7 @@ import java.awt.Color;
 import java.awt.FontMetrics;
 import java.util.HashMap;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 
@@ -92,6 +93,8 @@ public class PdfChunk {
 
     private static final String BREAK_TAG = "<br>";
 
+    private final static char EMPTY_SYMBOL = ' ';
+
     private boolean breakTag = false;
 
     public float getHeight() {
@@ -309,7 +312,6 @@ public class PdfChunk {
         if (splitCharacter == null)
             splitCharacter = DefaultSplitCharacter.DEFAULT;
     }
-
     // methods
 
     /** Gets the Unicode equivalent to a CID.
@@ -362,11 +364,11 @@ public class PdfChunk {
         // or until the totalWidth is reached
         int length = value.length();
         char valueArray[] = value.toCharArray();
-        BreakIterator iterator = BreakIterator.getLineInstance(Locale.getDefault());
-        BreakIterator iterator1 = new SpaceWithPunctuationBreakIterator(value, iterator);
+        BreakIterator  iterator = new SpaceWithPunctuationBreakIterator(value,  BreakIterator.getLineInstance(Locale.getDefault()));
         char character = 0;
+        boolean hasEmptySymbolEndOfLine = false; // a space sits at the end of the line (note: consecutive spaces do not occur)
         while (currentPosition < length) {
-            int next = iterator1.next();
+            int next = iterator.next();
             if(next < 1){
                 break;
             }
@@ -386,11 +388,17 @@ public class PdfChunk {
             }
             String substring = value.substring(start, next);
             currentWidth += font.width(substring);
-            if (currentWidth + indent.getRight() > width){
-                currentPosition = start - 1;
+            if (currentWidth + indent.getRight() > width) {
+                if (dealWithEmptySymbol(substring, currentWidth + indent.getRight(), width)) {
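+                    // Trailing space: with it the text exceeds the allowed line width, without it the text fits. The space has to be dropped, otherwise underlines, strikethroughs and the like would become too long
+                    // The space must not be left over for the next line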
+                    hasEmptySymbolEndOfLine=true;
+                    start = next;
+                } else {
+                    currentPosition = start - 1;
+                }
                 break;
             }
-
             start = next;
 
         }
@@ -401,12 +409,26 @@ public class PdfChunk {
         }
         // otherwise, the string has to be truncated
         String returnValue = value.substring(start);
-        value = value.substring(0, start);
+        value = value.substring(0, start - (hasEmptySymbolEndOfLine ? 1 : 0));
         PdfChunk pc = new PdfChunk(returnValue, this);
         return pc;
     }
 
-/**
+    /**
+     * Tells whether the text ends with a space whose removal brings the total width below the line width.
+     *
+     * @param text       the text
+     * @param totalWidth summed width of the text handled so far plus the current text
+     * @param lineWidth  the line width
+     * @return true : the width is below the line width once the trailing space of text is removed
+     */
+    private boolean dealWithEmptySymbol(String text, float totalWidth, float lineWidth) {
+        // After HTML parsing there are never several consecutive spaces, so only the single trailing space is considered
+        boolean endsWithSpace = null != text && 0 != text.length() && text.charAt(text.length() - 1) == EMPTY_SYMBOL;
+        return endsWithSpace && totalWidth - getFont().width(EMPTY_SYMBOL) < lineWidth;
+    }
+
+    /**
  * Truncates this <CODE>PdfChunk</CODE> if it's too long for the given width.
  * <P>
  * Returns <VAR>null</VAR> if the <CODE>PdfChunk</CODE> wasn't truncated.
diff --git a/fine-itext/src/com/fr/third/v2/lowagie/text/pdf/PdfFont.java b/fine-itext/src/com/fr/third/v2/lowagie/text/pdf/PdfFont.java
index 5e5f7b675..31592eed7 100644
--- a/fine-itext/src/com/fr/third/v2/lowagie/text/pdf/PdfFont.java
+++ b/fine-itext/src/com/fr/third/v2/lowagie/text/pdf/PdfFont.java
@@ -77,11 +77,12 @@ import java.awt.geom.AffineTransform;
  */
 
 public class PdfFont implements Comparable {
-    private static final int ONE_THOUSAND = 1000 ;
 
     private Font oriFont;
 
-    public static int SCALE = 100;
+    public static float SCALE = 100;
+
+    private static String DEFAULT_FONT_NAME = "";
 
     /** the font metrics. */
 //    private BaseFont font;
@@ -101,6 +102,12 @@ public class PdfFont implements Comparable {
         this.oriFont = oriFont;
     }
 
+    public static void initDefaultFont(String fontName) { // records the default font name; null is ignored
+        if (0 == DEFAULT_FONT_NAME.length() && null != fontName) { // checked by value: the former `"" ==` compared references
+            DEFAULT_FONT_NAME = fontName;
+        }
+    }
+
     // methods
 
     /**
@@ -160,9 +167,14 @@ public class PdfFont implements Comparable {
 
     private FontMetrics metrics;
 
-    private FontMetrics getMetrics() {
-        if (null == metrics) {
-            metrics = FontDesignMetrics.getMetrics(getAwtFont(SCALE));
+    private java.awt.Font scaleFont;
+
+    private java.awt.Font scaleDefaultFont;
+
+    private FontMetrics getMetrics(java.awt.Font font) {
+        // fetch metrics again only when none are cached or the cached ones belong to a different font
+        if (null == metrics || !font.equals(metrics.getFont())) {
+            metrics = FontDesignMetrics.getMetrics(font);
         }
         return metrics;
     }
@@ -174,16 +186,27 @@ public class PdfFont implements Comparable {
      * @return		a width in Text Space
      */
 
-    float width(int character) {
-        return image == null ? getMetrics().charWidth(replaceNbsp(character))/SCALE : image.getScaledWidth();
-    }
-
-    float width(String s) {
-        return image == null ? getMetrics().stringWidth(replaceNbsp(s))/SCALE : image.getScaledWidth();
+    public float width(int character) {
+        if (null == image) {
+            // measure with the chunk's own scaled font, or with the scaled default font when the own font cannot display the character
+            java.awt.Font own = getScaleAwtFont();
+            return getMetrics(own.canDisplay(character) ? own : getScaleDefaultAwtFont()).charWidth(replaceNbsp(character)) / SCALE;
+        }
+        return image.getScaledWidth();
     }
 
-    String replaceNbsp(String str) {
-        return canDisplayNbsp() ? str : str.replaceAll(String.valueOf((char) 160), String.valueOf((char) 32));
+    public float width(String s) {
+        if (null != image) {
+            return image.getScaledWidth();
+        }
+        if (null == s) {
+            return 0f;
+        }
+        float num = 0f; // sum of the single character widths; stepping by code point so a surrogate pair is measured once
+        for (int i = 0; i < s.length(); i += Character.charCount(s.codePointAt(i))) {
+            num += width(s.codePointAt(i));
+        }
+        return num;
     }
 
     int replaceNbsp(int character) {
@@ -191,20 +214,35 @@ public class PdfFont implements Comparable {
     }
 
     private boolean canDisplayNbsp() {
-        return getAwtFont().canDisplay((char) 160);
+        return getScaleAwtFont().canDisplay((char) 160);
     }
 
     BaseFont getFont() {
-        return  oriFont.getCalculatedBaseFont(false);
+        return oriFont.getCalculatedBaseFont(false);
     }
 
     public java.awt.Font getAwtFont() {
-        return getAwtFont(1);
+        return getAwtFont(oriFont.getFontName(), 1f);
+    }
+
+    private java.awt.Font getScaleAwtFont() { // AWT font for oriFont's font name with its size multiplied by SCALE; built on first use, then cached
+        if (null == scaleFont) {
+            scaleFont = getAwtFont(oriFont.getFontName(), SCALE);
+        }
+        return scaleFont;
     }
 
-    private java.awt.Font getAwtFont(int scale) {
+    private java.awt.Font getScaleDefaultAwtFont() { // AWT font for DEFAULT_FONT_NAME with the size multiplied by SCALE; built on first use, then cached
+        if (null == scaleDefaultFont) {
+            scaleDefaultFont = getAwtFont(DEFAULT_FONT_NAME, SCALE);
+        }
+        return scaleDefaultFont;
+    }
+
+
+    private java.awt.Font getAwtFont(String fontName,float scale) {
         Map attrMap = new HashMap(4);
-        attrMap.put(TextAttribute.FAMILY, oriFont.getFontName());
+        attrMap.put(TextAttribute.FAMILY, fontName);
         attrMap.put(TextAttribute.SIZE, new Float(oriFont.getSize() * scale));
         if (oriFont.isBold()) {
             attrMap.put(TextAttribute.WEIGHT, TextAttribute.WEIGHT_BOLD);