Commit df0d10bd authored by Erik Schilling's avatar Erik Schilling
Browse files

Added the ability to generate outlines for PDFs

--outline-tags allows specifying the HTML tags which should be
considered for the outline. The tags are expected to be given in
order of hierarchy; for example, 'h1,h2' will trigger a generation
with h1 elements as top-level outline entries and h2 elements as
their children.

Ideally this would not be required if Chromium would add
this directly. So if these bugs are closed this can probably be
removed again:
- https://bugs.chromium.org/p/chromium/issues/detail?id=840455
- https://github.com/GoogleChrome/puppeteer/issues/1778

This code is heavily based on @Hopding's comment at:
https://github.com/Hopding/pdf-lib/issues/127#issuecomment-502450179
parent 2e995afc
......@@ -32,6 +32,7 @@ pagedjs-cli ./path/to/index.html -o result.pdf
-ho, --hypher_only [str] Only hyphenate passed elements selector, such as ".hyphenate, aside"
-e, --encoding [type] Set the encoding of the input html, defaults to "utf-8"
-t, --timeout [ms] Set a max timeout of [ms]
--outline-tags [tags] Specifies that an outline should be generated for the resulting PDF document. [tags] specifies which HTML tags should be considered for that outline. "h1,h2" will trigger an outline with "h1" tags as root elements and "h2" elements as their children.
```
## Hyphenation
......
......@@ -26,6 +26,11 @@ program
.option('-t, --timeout [ms]', 'Set a max timeout of [ms]')
.option('-x, --html', 'output html file')
.option('-b, --blockLocal', 'Disallow access to filesystem for local files')
.option('--outline-tags [tags]', 'Specifies that an outline should be ' +
'generated for the resulting PDF document. [tags] specifies which ' +
'HTML tags should be considered for that outline. ' +
'"h1,h2" will trigger an outline with "h1" tags as root elements ' +
'and "h2" elements as their childs.')
.parse(process.argv);
......@@ -122,6 +127,7 @@ if (typeof input === "string") {
file = await printer.html(input, options);
output = replaceExt(output, '.html');
} else {
options.outlineTags = !program.outlineTags ? [] : program.outlineTags.split(',');
file = await printer.pdf(input, options);
}
} else {
......
......@@ -203,6 +203,100 @@ class PostProcesser extends EventEmitter {
console.log(page);
}
/**
* Adds a table of content to the generated PDF
*
* Ideally this would not be required if Chromium would add this directly.
* So if these bugs are closed this can probably be removed again:
* - https://bugs.chromium.org/p/chromium/issues/detail?id=840455
* - https://github.com/GoogleChrome/puppeteer/issues/1778
*
* This code is heavily based on @Hopding's comment at:
* https://github.com/Hopding/pdf-lib/issues/127#issuecomment-502450179
*/
addOutline(outlineSpec) {
const outline = JSON.parse(JSON.stringify(outlineSpec))
const pageRefs = [];
this.pdfDoc.catalog.Pages.traverse((kid, ref) => {
if (kid instanceof PDFLib.PDFPage)
pageRefs.push(ref);
});
const index = this.pdfDoc.index;
const outlineReference = index.nextObjectNumber();
const countOutlineLayer = (layer) => {
let count = 0;
for (const outlineEntry of layer) {
++count;
count += countOutlineLayer(outlineEntry.children);
}
return count;
}
const createItemsForOutlineLayer = (layer, parent) => {
layer.forEach((outlineItem, i) => {
let prev = i > 0 ? layer[i - 1].ref : null;
let next = i < layer.length - 1 ? layer[i + 1].ref : null;
const pdfItem = createOutlineItem(outlineItem, prev, next, parent);
index.assign(outlineItem.ref, pdfItem);
});
}
const createOutlineItem = (outlineItem, prev, next, parent) => {
if (!outlineItem.id) {
throw new Error(`Cannot generate outline item with title '${outlineItem.title} ` +
`without any target anchor. Please specify an 'id' attribute for ` +
`the relevant HTML element`);
}
const item = {
Title: PDFLib.PDFString.fromString(outlineItem.title),
Parent: parent,
Dest: PDFLib.PDFName.from(outlineItem.id),
};
if (prev) {
item.Prev = prev;
}
if (next) {
item.Next = next;
}
if (outlineItem.children.length > 0) {
item.First = outlineItem.children[0].ref;
item.Last = outlineItem.children[outlineItem.children.length - 1].ref;
item.Count = PDFLib.PDFNumber.fromNumber(countOutlineLayer(outlineItem.children));
createItemsForOutlineLayer(outlineItem.children, outlineItem.ref);
}
return PDFLib.PDFDictionary.from(item, index);
};
const createOutlineReferences = (outlineEntry) => {
outlineEntry.ref = index.nextObjectNumber();
for (const child of outlineEntry.children) {
createOutlineReferences(child);
}
}
for (const outlineItem of outline) {
createOutlineReferences(outlineItem);
}
createItemsForOutlineLayer(outline, outlineReference);
const pdfOutline = PDFLib.PDFDictionary.from(
{
First: outline[0].ref,
Last: outline[outline.length - 1].ref,
Count: PDFLib.PDFNumber.fromNumber(countOutlineLayer(outline)),
},
index,
);
index.assign(outlineReference, pdfOutline);
this.pdfDoc.catalog.set('Outlines', outlineReference);
}
save() {
let writer = new PDFDocumentWriter();
const pdfBytes = writer.saveToBytesWithXRefTable(this.pdfDoc);
......
......@@ -182,6 +182,54 @@ class Printer extends EventEmitter {
return page;
}
async _parseOutline(page, tags) {
return await page.evaluate((tags) => {
const tagsToProcess = [];
for (const node of document.querySelectorAll(tags.join(','))) {
tagsToProcess.push(node);
}
tagsToProcess.reverse();
const root = {children: [], depth: -1};
let currentOutlineNode = root;
while (tagsToProcess.length > 0) {
const tag = tagsToProcess.pop();
const orderDepth = tags.indexOf(tag.tagName.toLowerCase());
if (orderDepth < currentOutlineNode.depth) {
currentOutlineNode = currentOutlineNode.parent;
tagsToProcess.push(tag);
} else {
const newNode = {
title: tag.innerText,
id: tag.id,
children: [],
depth: orderDepth,
};
if (orderDepth == currentOutlineNode.depth) {
newNode.parent = currentOutlineNode.parent;
currentOutlineNode.parent.children.push(newNode);
currentOutlineNode = newNode;
} else if (orderDepth > currentOutlineNode.depth) {
newNode.parent = currentOutlineNode;
currentOutlineNode.children.push(newNode);
currentOutlineNode = newNode;
}
}
}
const stripParentProperty = (node) => {
node.parent = undefined;
for (const child of node.children) {
stripParentProperty(child);
}
}
stripParentProperty(root)
return root.children;
}, tags);
}
async pdf(input, options={}) {
let page = await this.render(input);
......@@ -201,6 +249,8 @@ class Printer extends EventEmitter {
return meta;
});
const outline = options.outlineTags.length > 0 ? await this._parseOutline(page, options.outlineTags) : null;
let settings = {
printBackground: true,
displayHeaderFooter: false,
......@@ -228,6 +278,9 @@ class Printer extends EventEmitter {
let post = new PostProcesser(pdf);
post.metadata(meta);
post.boxes(this.pages);
if (outline) {
post.addOutline(outline);
}
pdf = post.save();
return pdf;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment