Skip to content

Commit 5a3d9d8

Browse files
Merge pull request #251 from metanorma/table_width_fix
table width algorithm performance optimization, #245
2 parents ae5925f + 4f0be7c commit 5a3d9d8

File tree

6 files changed

+131
-52
lines changed

6 files changed

+131
-52
lines changed

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ SHELL ?= /bin/bash
66
endif
77

88
#JAR_VERSION := $(shell mvn -q -Dexec.executable="echo" -Dexec.args='$${project.version}' --non-recursive exec:exec -DforceStdout)
9-
JAR_VERSION := 1.88
9+
JAR_VERSION := 1.90
1010
JAR_FILE := mn2pdf-$(JAR_VERSION).jar
1111

1212
all: target/$(JAR_FILE)

README.adoc

+5-5
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,14 @@ You will need the Java Development Kit (JDK) version 8, Update 241 (8u241) or hi
1717

1818
[source,sh]
1919
----
20-
java -Xss5m -Xmx2048m -jar target/mn2pdf-1.89.jar --xml-file <XML-FileName> --xsl-file <XSLT-FileName> --pdf-file <Output-PDF-FileName> [--syntax-highlight]
20+
java -Xss5m -Xmx2048m -jar target/mn2pdf-1.90.jar --xml-file <XML-FileName> --xsl-file <XSLT-FileName> --pdf-file <Output-PDF-FileName> [--syntax-highlight]
2121
----
2222

2323
e.g.
2424

2525
[source,sh]
2626
----
27-
java -Xss5m -Xmx2048m -jar target/mn2pdf-1.89.jar --xml-file tests/G.191.xml --xsl-file tests/itu.recommendation.xsl --pdf-file tests/G.191.pdf
27+
java -Xss5m -Xmx2048m -jar target/mn2pdf-1.90.jar --xml-file tests/G.191.xml --xsl-file tests/itu.recommendation.xsl --pdf-file tests/G.191.pdf
2828
----
2929

3030
=== PDF encryption features
@@ -100,7 +100,7 @@ Update version in `pom.xml`, e.g.:
100100
----
101101
<groupId>org.metanorma.fop</groupId>
102102
<artifactId>mn2pdf</artifactId>
103-
<version>1.89</version>
103+
<version>1.90</version>
104104
<name>Metanorma XML to PDF converter</name>
105105
----
106106

@@ -111,8 +111,8 @@ Tag the same version in Git:
111111

112112
[source,xml]
113113
----
114-
git tag v1.89
115-
git push origin v1.89
114+
git tag v1.90
115+
git push origin v1.90
116116
----
117117

118118
Then the corresponding GitHub release will be automatically created at:

pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<modelVersion>4.0.0</modelVersion>
66
<groupId>org.metanorma.fop</groupId>
77
<artifactId>mn2pdf</artifactId>
8-
<version>1.89</version>
8+
<version>1.90</version>
99
<name>Metanorma XML to PDF converter</name>
1010
<packaging>jar</packaging>
1111
<url>https://www.metanorma.org</url>

src/main/java/org/metanorma/fop/PDFGenerator.java

+65-19
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import java.util.*;
1111
import java.util.logging.Level;
1212
import java.util.logging.Logger;
13+
import java.util.stream.Collectors;
1314
import javax.xml.parsers.*;
1415
import javax.xml.transform.OutputKeys;
1516
import javax.xml.transform.Result;
@@ -277,7 +278,8 @@ public boolean process() {
277278
isAddMathAsText = xsltConverter.hasParamAddMathAsText() && isMathExists;
278279
isAddMathAsAttachment = xsltConverter.hasParamAddMathAsAttachment();
279280

280-
isApplyAutolayoutAlgorithm = xsltConverter.isApplyAutolayoutAlgorithm();
281+
isApplyAutolayoutAlgorithm = xsltConverter.isApplyAutolayoutAlgorithm();
282+
281283

282284
if (isSyntaxHighlight) {
283285
xsltParams.put("syntax-highlight", "true");
@@ -1173,6 +1175,8 @@ private void readEncryptionParameters(File fEncryptionParameters) {
11731175

11741176
private void setTablesWidths(fontConfig fontcfg, XSLTconverter xsltConverter, File pdf) {
11751177

1178+
int TABLE_CELLS_COUNT_MAX = 30000;
1179+
11761180
String methodName = getClass().getSimpleName() + "." + (new Object(){}.getClass().getEnclosingMethod().getName());
11771181
Profiler.addMethodCall(methodName);
11781182
long startMethodTime = System.currentTimeMillis();
@@ -1196,7 +1200,7 @@ private void setTablesWidths(fontConfig fontcfg, XSLTconverter xsltConverter, Fi
11961200
SourceXMLDocument sourceXMLDocumentTablesOnly = new SourceXMLDocument(xmlTablesOnly);
11971201

11981202
int countTableCells = sourceXMLDocumentTablesOnly.getCountTableCells();
1199-
if (countTableCells < 30000) {
1203+
if (countTableCells < TABLE_CELLS_COUNT_MAX) {
12001204
// transform XML to XSL-FO (XML .fo file)
12011205
xsltConverter.transform(sourceXMLDocumentTablesOnly, false);
12021206

@@ -1218,46 +1222,79 @@ private void setTablesWidths(fontConfig fontcfg, XSLTconverter xsltConverter, Fi
12181222

12191223
} else { // for large tables, or large number of tables
12201224

1221-
List<String> tablesIds = sourceXMLDocumentTablesOnly.readElementsIds("//*[local-name() = 'table' or local-name() = 'dl']");
1222-
12231225
List<String> xmlTablesIF = new ArrayList<>();
1224-
// process each table separately for memory consumption optimization
1225-
int tableCounter = 0;
1226-
int tableCount = tablesIds.size();
1227-
for (String tableId : tablesIds) {
1228-
tableCounter++;
1229-
logger.info("[INFO] Generation of XSL-FO (" + tableCounter + "/" + tableCount + ") with information about the table widths with id='" + tableId + "'...");
12301226

1231-
// process table with id=tableId only
1232-
xsltConverter.setParam("table_only_with_id", tableId);
1227+
Map<String,Integer> tablesCellsCountMap = sourceXMLDocumentTablesOnly.getTablesCellsCountMap();
1228+
1229+
int portion = 1;
1230+
while(!tablesCellsCountMap.isEmpty()) {
1231+
int totalCells = 0;
1232+
List<String> tablesProcessed = new ArrayList<>();
1233+
1234+
Iterator<Map.Entry<String, Integer>> iterator = tablesCellsCountMap.entrySet().iterator();
1235+
while (iterator.hasNext() && totalCells < TABLE_CELLS_COUNT_MAX) {
1236+
Map.Entry<String, Integer> entry = iterator.next();
1237+
if (totalCells == 0 || totalCells + entry.getValue() < TABLE_CELLS_COUNT_MAX) {
1238+
totalCells += entry.getValue();
1239+
tablesProcessed.add(entry.getKey());
1240+
}
1241+
}
1242+
1243+
/*for (Map.Entry<String, Integer> entry : tablesCellsCountMap.entrySet()) {
1244+
else {
1245+
break;
1246+
}
1247+
}*/
1248+
logger.info("[INFO] Generation of XSL-FO (portion " + portion + ") with information about the table widths...");
1249+
1250+
// "table1 table2 table3 " (with space at the end)
1251+
String tableIds = tablesProcessed.stream().collect(Collectors.joining(" ")) + " ";
1252+
// call XSLT and pass the tables ids
1253+
1254+
// process table with ids=tableIds only
1255+
xsltConverter.setParam("table_only_with_ids", tableIds);
12331256

12341257
// transform XML to XSL-FO (XML .fo file)
12351258
xsltConverter.transform(sourceXMLDocumentTablesOnly, false);
12361259

12371260
String xmlFO = sourceXMLDocumentTablesOnly.getXMLFO();
12381261

12391262
//debug
1240-
debugSaveXML(xmlFO, pdf.getAbsolutePath() + "." + tableId + ".fo.tables.xml");
1263+
debugSaveXML(xmlFO, pdf.getAbsolutePath() + ".portion_" + portion + ".fo.tables.xml");
12411264

1242-
fontcfg.outputFontManifestLog(Paths.get(pdf.getAbsolutePath() + "." + tableId + ".tables.fontmanifest.log.txt"));
1265+
fontcfg.outputFontManifestLog(Paths.get(pdf.getAbsolutePath() + ".portion_" + portion + ".tables.fontmanifest.log.txt"));
12431266

12441267
fontcfg.setSourceDocumentFontList(sourceXMLDocumentTablesOnly.getDocumentFonts());
12451268

12461269
Source sourceFO = new StreamSource(new StringReader(xmlFO));
12471270

1248-
logger.info("[INFO] Generation of Intermediate Format (" + tableCounter + "/" + tableCount + ") with information about the table's widths with id='" + tableId + "'...");
1249-
String xmlIF = generateFOPIntermediateFormat(sourceFO, fontcfg.getConfig(), pdf, true, "." + tableId + ".tables");
1271+
logger.info("[INFO] Generation of Intermediate Format with information about the table's widths (portion " + portion + ") ...");
1272+
String xmlIF = generateFOPIntermediateFormat(sourceFO, fontcfg.getConfig(), pdf, true, ".portion_" + portion + ".tables");
12501273

12511274
xmlTableIF = createTableIF(xmlIF);
12521275

1253-
debugSaveXML(xmlTableIF, pdf.getAbsolutePath() + "." + tableId + ".tables.xml");
1276+
debugSaveXML(xmlTableIF, pdf.getAbsolutePath() + ".portion_" + portion + ".tables.xml");
12541277

12551278
xmlTableIF = tableWidthsCleanup(xmlTableIF);
12561279

12571280
xmlTablesIF.add(xmlTableIF);
1281+
1282+
// remove processed tables
1283+
tablesCellsCountMap.keySet().removeAll(tablesProcessed);
1284+
portion++;
12581285
}
1286+
1287+
/*List<String> tablesIds = sourceXMLDocumentTablesOnly.readElementsIds("//*[local-name() = 'table' or local-name() = 'dl']");
1288+
// process each table separately for memory consumption optimization
1289+
int tableCounter = 0;
1290+
int tableCount = tablesIds.size();
1291+
for (String tableId : tablesIds) {
1292+
tableCounter++;
1293+
logger.info("[INFO] Generation of XSL-FO (" + tableCounter + "/" + tableCount + ") with information about the table widths with id='" + tableId + "'...");
1294+
}*/
12591295
xmlTableIF = tablesWidthsUnion(xmlTablesIF);
12601296
xsltConverter.setParam("table_only_with_id", ""); // further process all tables
1297+
xsltConverter.setParam("table_only_with_ids", ""); // further process all tables
12611298
}
12621299

12631300
debugSaveXML(xmlTableIF, pdf.getAbsolutePath() + ".tables.xml");
@@ -1330,11 +1367,17 @@ private void saveDebugFO(String debugXSLFO) {
13301367
}
13311368

13321369
private String tableWidthsCleanup(String table) {
1333-
int startPos = table.indexOf("<table ");
1370+
try {
1371+
table = applyXSLT("table_if_clean.xsl", table, false);
1372+
} catch (Exception ex) {
1373+
logger.severe("Can't simplify the tables width information XML.");
1374+
ex.printStackTrace();
1375+
}
1376+
/*int startPos = table.indexOf("<table ");
13341377
int endPos = table.indexOf("</tables>");
13351378
table = table.substring(startPos, endPos);
13361379
int startPosTbody = table.indexOf("<tbody>");
1337-
table = table.substring(0,startPosTbody) + "</table>";
1380+
table = table.substring(0,startPosTbody) + "</table>";*/
13381381
return table;
13391382
}
13401383

@@ -1344,6 +1387,9 @@ private String tablesWidthsUnion(List<String> tables) {
13441387
sbTablesIF.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?><tables>");
13451388
}
13461389
for (String itemTableIF: tables) {
1390+
int startPos = itemTableIF.indexOf("<table ");
1391+
int endPos = itemTableIF.indexOf("</tables>");
1392+
itemTableIF = itemTableIF.substring(startPos, endPos);
13471393
sbTablesIF.append(itemTableIF);
13481394
}
13491395
if (!tables.isEmpty()) {

src/main/java/org/metanorma/fop/SourceXMLDocument.java

+46-26
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
package org.metanorma.fop;
22

33
import java.io.*;
4-
import java.nio.charset.StandardCharsets;
54
import java.nio.file.Files;
65
import java.nio.file.Path;
76
import java.nio.file.Paths;
@@ -14,10 +13,7 @@
1413
import java.util.logging.Level;
1514
import java.util.logging.Logger;
1615
import javax.xml.parsers.*;
17-
import javax.xml.transform.OutputKeys;
18-
import javax.xml.transform.Source;
19-
import javax.xml.transform.Transformer;
20-
import javax.xml.transform.TransformerFactory;
16+
import javax.xml.transform.*;
2117
import javax.xml.transform.dom.DOMSource;
2218
import javax.xml.transform.stream.StreamResult;
2319
import javax.xml.transform.stream.StreamSource;
@@ -26,7 +22,7 @@
2622
import javax.xml.xpath.XPathExpression;
2723
import javax.xml.xpath.XPathExpressionException;
2824
import javax.xml.xpath.XPathFactory;
29-
import static org.metanorma.fop.PDFGenerator.logger;
25+
3026
import static org.metanorma.fop.Util.getStreamFromResources;
3127

3228
import org.metanorma.utils.LoggerHelper;
@@ -52,6 +48,7 @@ public class SourceXMLDocument {
5248

5349
private boolean hasAnnotations = false;
5450
private boolean hasTables = false;
51+
private Map<String, Integer> tablesCellsCountMap = new HashMap<>();
5552
private boolean hasMath = false;
5653

5754
static final String TMPDIR = System.getProperty("java.io.tmpdir");
@@ -89,6 +86,7 @@ public SourceXMLDocument(String strXML) {
8986
DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
9087
InputSource xmlIFIS = new InputSource(new StringReader(strXML));
9188
sourceXML = dBuilder.parse(xmlIFIS);
89+
readMetaInformation();
9290
} catch (Exception ex) {
9391
logger.severe("Can't parse source XML.");
9492
ex.printStackTrace();
@@ -97,14 +95,41 @@ public SourceXMLDocument(String strXML) {
9795

9896
private void readMetaInformation() {
9997
String element_review = readValue("//*[local-name() = 'review'][1]");
100-
this.hasAnnotations = element_review.length() != 0;
101-
// check table without colgroup/col (width) or dl
102-
String element_table = readValue("//*[(local-name() = 'table' and not(*[local-name() = 'colgroup']/*[local-name() = 'col'])) or local-name() = 'dl'][1]");
103-
this.hasTables = element_table.length() != 0;
98+
hasAnnotations = element_review.length() != 0;
10499
String element_math = readValue("//*[local-name() = 'math'][1]");
105-
this.hasMath = element_math.length() != 0;
100+
hasMath = element_math.length() != 0;
101+
//tables without colgroup/col (width) or dl
102+
//String element_table = readValue("//*[(local-name() = 'table' and not(*[local-name() = 'colgroup']/*[local-name() = 'col'])) or local-name() = 'dl'][1]");
103+
//hasTables = element_table.length() != 0;
104+
obtainTablesCellsCount();
105+
hasTables = !tablesCellsCountMap.isEmpty();
106106
}
107107

108+
private void obtainTablesCellsCount() {
109+
try {
110+
XPath xPathAllTable = XPathFactory.newInstance().newXPath();
111+
// select all tables (without colgroup) and definition lists (dl)
112+
XPathExpression queryAllTables = xPathAllTable.compile("//*[(local-name() = 'table' and not(*[local-name() = 'colgroup']/*[local-name() = 'col'])) or local-name() = 'dl']");
113+
NodeList nodesTables = (NodeList)queryAllTables.evaluate(sourceXML, XPathConstants.NODESET);
114+
for (int i = 0; i < nodesTables.getLength(); i++) {
115+
Node nodeTable = nodesTables.item(i);
116+
String tableId = "";
117+
Node nodeId = nodeTable.getAttributes().getNamedItem("id");
118+
if (nodeId != null) {
119+
tableId =nodeId.getTextContent();
120+
}
121+
if (!tableId.isEmpty()) {
122+
XPath xPathTableCountCells = XPathFactory.newInstance().newXPath();
123+
XPathExpression queryTableCountCells = xPathTableCountCells.compile(".//*[local-name() = 'td' or local-name() = 'th' or local-name() = 'dt' or local-name() = 'dd']");
124+
NodeList nodesCells = (NodeList) queryTableCountCells.evaluate(nodeTable, XPathConstants.NODESET);
125+
int countCells = nodesCells.getLength();
126+
tablesCellsCountMap.put(tableId, countCells);
127+
}
128+
}
129+
} catch (XPathExpressionException ex) {
130+
logger.severe(ex.toString());
131+
}
132+
}
108133

109134
public StreamSource getStreamSource() {
110135
if (sourceXMLstr.isEmpty()) {
@@ -411,7 +436,6 @@ public String getDocumentFilePath() {
411436
return documentFilePath;
412437
}
413438

414-
415439
private String updatePreprocessXSLT(Document docXML) throws Exception {
416440

417441
Source srcXSL = new StreamSource(getStreamFromResources(getClass().getClassLoader(), "update_preprocess_xslt.xsl"));
@@ -441,19 +465,6 @@ private String readValue(String xpath) {
441465
return value;
442466
}
443467

444-
private int readTableCellsCount(){
445-
int count = 0;
446-
try {
447-
XPath xPath = XPathFactory.newInstance().newXPath();
448-
XPathExpression query = xPath.compile("//*[local-name() = 'td' or local-name() = 'th' or local-name() = 'dt' or local-name() = 'dd']");
449-
NodeList nodes = (NodeList)query.evaluate(sourceXML, XPathConstants.NODESET);
450-
count = nodes.getLength();
451-
} catch (Exception ex) {
452-
logger.severe(ex.toString());
453-
}
454-
return count;
455-
}
456-
457468
public List<String> readElementsIds(String xpath) {
458469
List<String> values = new ArrayList<>();
459470
try {
@@ -487,7 +498,12 @@ public boolean hasMath() {
487498
}
488499

489500
public int getCountTableCells() {
490-
int countTableCells = readTableCellsCount();
501+
int countTableCells = 0;
502+
try {
503+
countTableCells = tablesCellsCountMap.values().stream().mapToInt(Integer::intValue).sum();
504+
} catch (Exception ex) {
505+
logger.severe(ex.toString());
506+
};
491507
return countTableCells;
492508
}
493509

@@ -496,4 +512,8 @@ public void flushResources() {
496512
sourceXMLstr = "";
497513
xmlFO = null;
498514
}
515+
516+
public Map<String, Integer> getTablesCellsCountMap() {
517+
return tablesCellsCountMap;
518+
}
499519
}

src/test/java/org/metanorma/fop/SourceXMLDocumentTests.java

+13
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55
import java.nio.file.Path;
66
import java.nio.file.Paths;
77
import java.util.ArrayList;
8+
import java.util.HashMap;
89
import java.util.List;
10+
import java.util.Map;
911
import javax.xml.transform.TransformerException;
1012
import org.apache.commons.cli.ParseException;
1113

@@ -92,4 +94,15 @@ public void testGetDocumentPreprocessXSLT() {
9294

9395
assertTrue(strProcessXSLT.equals(strProcessXSLTEtalon));
9496
}
97+
98+
@Test
99+
public void testTablesCellsCount() {
100+
ClassLoader classLoader = getClass().getClassLoader();
101+
String xml = classLoader.getResource("G.191.xml").getFile();
102+
SourceXMLDocument sourceXMLDocument = new SourceXMLDocument(new File(xml));
103+
Map<String,Integer> tablesCellsCount = sourceXMLDocument.getTablesCellsCountMap();
104+
int countCells = sourceXMLDocument.getCountTableCells();
105+
assertTrue(tablesCellsCount.size() == 27);
106+
assertTrue(countCells == 725);
107+
}
95108
}

0 commit comments

Comments
 (0)