Merge multiple PDF files into one PDF file
Hello Sambhashanam readers – a while back I wrote an article, Mail merge in java for Microsoft Word document and convert to PDF without iText – Part II.
Some of you asked how to get a single merged PDF file out of many PDF outputs. Here is an example that picks up all the PDF files from a directory and generates one merged PDF file. This example uses Apache PDFBox – a Java PDF library – to do the merging.
1. Create POM file with all dependencies as below
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.sambhashanam</groupId>
    <artifactId>pdfmerger</artifactId>
    <version>1.0</version>
    <packaging>jar</packaging>

    <name>pdfmerger</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- Unit testing. A Maven scope must be a single value; JUnit is only needed on the test classpath. -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>
        <!-- PDF merging -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.1</version>
        </dependency>
        <!-- XMP metadata for the merged document -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>xmpbox</artifactId>
            <version>2.0.1</version>
        </dependency>
        <!-- AutoCloseInputStream, the base class of LazyFileInputStream -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.5</version>
        </dependency>
        <dependency>
            <groupId>org.bouncycastle</groupId>
            <artifactId>bcprov-jdk16</artifactId>
            <version>1.46</version>
        </dependency>
    </dependencies>
</project>
2. Create a LazyFileInputStream [delays opening the file until the first byte is read]
package com.sambhashanam.io;

import java.io.File;
import java.io.FileDescriptor;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.commons.io.input.AutoCloseInputStream;
import org.apache.commons.io.input.ClosedInputStream;

/**
 * This input stream delays opening the file until the first byte is read, and
 * closes and discards the underlying stream as soon as the end of input has
 * been reached or when the stream is explicitly closed.
 */
public class LazyFileInputStream extends AutoCloseInputStream {

    /**
     * The file descriptor to use, or {@code null} when reading from {@link #file}.
     */
    protected final FileDescriptor fd;

    /**
     * The file to read from, or {@code null} when reading from {@link #fd}.
     */
    protected final File file;

    /**
     * True if the input stream was opened. It is also set to true if the stream
     * was closed without reading (to avoid opening the file after the stream
     * was closed).
     */
    protected boolean opened;

    /**
     * Creates a new <code>LazyFileInputStream</code> for the given file. If the
     * file is unreadable, a FileNotFoundException is thrown.
     * The file is not opened until the first byte is read from the stream.
     *
     * @param file the file to read from
     * @throws java.io.FileNotFoundException if the file cannot be read
     */
    public LazyFileInputStream(File file) throws FileNotFoundException {
        super(null);
        if (!file.canRead()) {
            throw new FileNotFoundException(file.getPath());
        }
        this.file = file;
        this.fd = null;
    }

    /**
     * Creates a new <code>LazyFileInputStream</code> for the given file
     * descriptor.
     * The file is not opened until the first byte is read from the stream.
     *
     * @param fd the file descriptor to read from
     */
    public LazyFileInputStream(FileDescriptor fd) {
        super(null);
        this.file = null;
        this.fd = fd;
    }

    /**
     * Creates a new <code>LazyFileInputStream</code> for the given file. If the
     * file is unreadable, a FileNotFoundException is thrown.
     *
     * @param name the path of the file to read from
     * @throws java.io.FileNotFoundException if the file cannot be read
     */
    public LazyFileInputStream(String name) throws FileNotFoundException {
        this(new File(name));
    }

    /**
     * Opens the underlying stream if that has not happened yet. Does nothing
     * once the stream has been opened or closed.
     *
     * @throws java.io.IOException if the file cannot be opened
     */
    protected void open() throws IOException {
        if (!opened) {
            opened = true;
            if (fd != null) {
                in = new FileInputStream(fd);
            } else {
                in = new FileInputStream(file);
            }
        }
    }

    @Override
    public int read() throws IOException {
        open();
        return super.read();
    }

    @Override
    public int available() throws IOException {
        open();
        return super.available();
    }

    /**
     * Closes the stream. If the file was never opened it stays unopened; any
     * later read then reports end of input instead of failing on a missing
     * delegate.
     */
    @Override
    public void close() throws IOException {
        // make sure the file is not opened afterwards
        opened = true;
        if (in != null) {
            // only close the file if it was in fact opened
            super.close();
        } else {
            // never opened: install the same closed placeholder that
            // AutoCloseInputStream uses after a regular close, so that later
            // calls do not dereference a null delegate
            in = new ClosedInputStream();
        }
    }

    @Override
    public synchronized void reset() throws IOException {
        open();
        super.reset();
    }

    /**
     * Opens the file if necessary and reports whether the underlying stream
     * supports mark/reset.
     *
     * @throws IllegalStateException if the file cannot be opened
     */
    @Override
    public boolean markSupported() {
        try {
            open();
        } catch (IOException e) {
            // same message as before (e.toString()), but the cause is preserved
            throw new IllegalStateException(e);
        }
        return super.markSupported();
    }

    /**
     * Opens the file if necessary and marks the current position.
     *
     * @throws IllegalStateException if the file cannot be opened
     */
    @Override
    public synchronized void mark(int readlimit) {
        try {
            open();
        } catch (IOException e) {
            // same message as before (e.toString()), but the cause is preserved
            throw new IllegalStateException(e);
        }
        super.mark(readlimit);
    }

    @Override
    public long skip(long n) throws IOException {
        open();
        return super.skip(n);
    }

    @Override
    public int read(byte[] b) throws IOException {
        open();
        return super.read(b, 0, b.length);
    }

    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        open();
        return super.read(b, off, len);
    }
}
3. Create a Java class that wraps reading the directory, collecting the PDF files, merging them and writing the output to the given location.
package com.sambhashanam.pdfmerger;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.List;

import javax.xml.transform.TransformerException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.multipdf.PDFMergerUtility;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.xmpbox.XMPMetadata;
import org.apache.xmpbox.schema.DublinCoreSchema;
import org.apache.xmpbox.schema.PDFAIdentificationSchema;
import org.apache.xmpbox.schema.XMPBasicSchema;
import org.apache.xmpbox.type.BadFieldValueException;
import org.apache.xmpbox.xml.XmpSerializer;

import com.sambhashanam.io.LazyFileInputStream;

/**
 * Merges all PDF files found in a directory (or supplied as streams) into a
 * single PDF document using Apache PDFBox.
 *
 * @author Dhananjay Kumar Jha
 */
public class PDFMergingUtility {

    private static final Log LOG = LogFactory.getLog(PDFMergingUtility.class);

    /** Accepts directory entries whose name ends with ".pdf" (case-sensitive). */
    private FilenameFilter filenameFilter = new FilenameFilter() {
        public boolean accept(File dir, String name) {
            return name.endsWith(".pdf");
        }
    };

    /**
     * Merges every PDF file in a directory into one output file and prints the
     * elapsed time to standard output.
     *
     * @param inputSourceDirectory directory containing the source PDF files
     * @param outputFile           path of the merged PDF file to create
     * @param mergedFileTitle      title stored in the merged document
     * @param creator              creator stored in the merged document
     * @param subject              subject stored in the merged document
     * @param maxMainMemoryBytes   passed to {@code MemoryUsageSetting.setupMixed}
     * @param maxStorageBytes      passed to {@code MemoryUsageSetting.setupMixed}
     * @throws IOException if the directory is invalid or cannot be listed, or
     *                     if the merge fails
     */
    public void merge(String inputSourceDirectory, String outputFile,
            String mergedFileTitle, String creator, String subject,
            long maxMainMemoryBytes, long maxStorageBytes) throws IOException {
        long t1 = System.currentTimeMillis();
        List<InputStream> sources = prepareLazyInputStreamSources(inputSourceDirectory);
        // The stream-based merge closes this stream in its finally block.
        FileOutputStream fileOutputStream = new FileOutputStream(outputFile);
        merge(sources, fileOutputStream, mergedFileTitle, creator, subject,
                maxMainMemoryBytes, maxStorageBytes);
        long t2 = System.currentTimeMillis();
        System.out.println("Total time taken in merging " + sources.size()
                + " pdf files were " + (t2 - t1) + " milisecond.");
    }

    /**
     * Builds one lazily-opened input stream per PDF file in the directory.
     * Files are sorted by path ({@link File#compareTo(File)}, i.e.
     * lexicographically, so "10.pdf" precedes "2.pdf") to make the merge order
     * deterministic.
     *
     * @param inputSourceDirectory directory containing the source PDF files
     * @return the source streams, in sorted file order
     * @throws IOException if the path is not an existing directory, the
     *                     directory cannot be listed, or a file is unreadable
     */
    protected List<InputStream> prepareLazyInputStreamSources(
            String inputSourceDirectory) throws IOException {
        File dir = new File(inputSourceDirectory);
        if (!dir.exists() || !dir.isDirectory()) {
            throw new IOException(
                    "Supplied path either does not exist or not a directory.");
        }
        File[] pdfFiles = dir.listFiles(filenameFilter);
        if (pdfFiles == null) {
            // listFiles returns null when an I/O error occurs
            throw new IOException("Unable to list files in directory: "
                    + dir.getAbsolutePath());
        }
        // listFiles gives no ordering guarantee; sort for reproducible output
        Arrays.sort(pdfFiles);
        List<InputStream> sources = new ArrayList<InputStream>(pdfFiles.length);
        for (File file : pdfFiles) {
            sources.add(new LazyFileInputStream(file.getAbsolutePath()));
        }
        return sources;
    }

    /**
     * Creates a compound PDF document from a list of input documents and
     * writes it to the given output stream.
     * <p>
     * PDF/A-1b identification metadata is attached to the merged document; the
     * result is intended to be PDF/A-1b compliant provided the source
     * documents are as well. Title, creator and subject are written to both
     * the document information and the XMP metadata.
     * <p>
     * All source streams and the output stream are closed before this method
     * returns, whether or not the merge succeeds.
     *
     * @param sources            list of source PDF document streams
     * @param outputStream       destination of the merged PDF
     * @param mergedFileTitle    title stored in the merged document
     * @param creator            creator stored in the merged document
     * @param subject            subject stored in the merged document
     * @param maxMainMemoryBytes passed to {@code MemoryUsageSetting.setupMixed}
     * @param maxStorageBytes    passed to {@code MemoryUsageSetting.setupMixed}
     * @throws IOException if anything goes wrong during PDF merge
     */
    public void merge(final List<InputStream> sources, OutputStream outputStream,
            String mergedFileTitle, String creator, String subject,
            long maxMainMemoryBytes, long maxStorageBytes) throws IOException {
        COSStream cosStream = null;
        try {
            cosStream = new COSStream();
            PDFMergerUtility pdfMerger = createPDFMergerUtility(sources, outputStream);

            // PDF and XMP properties must be identical, otherwise document is
            // not PDF/A compliant
            PDDocumentInformation pdfDocumentInfo = createPDFDocumentInfo(
                    mergedFileTitle, creator, subject);
            PDMetadata xmpMetadata = createXMPMetadata(cosStream,
                    mergedFileTitle, creator, subject);
            pdfMerger.setDestinationDocumentInformation(pdfDocumentInfo);
            pdfMerger.setDestinationMetadata(xmpMetadata);

            LOG.info("Merging " + sources.size() + " source documents into one PDF");
            pdfMerger.mergeDocuments(MemoryUsageSetting.setupMixed(
                    maxMainMemoryBytes, maxStorageBytes));
            LOG.info("PDF merge successful");
        } catch (BadFieldValueException e) {
            throw new IOException("Problem while merging PDFs", e);
        } catch (TransformerException e) {
            throw new IOException("Problem while merging PDFs", e);
        } finally {
            for (InputStream source : sources) {
                IOUtils.closeQuietly(source);
            }
            IOUtils.closeQuietly(cosStream);
            IOUtils.closeQuietly(outputStream);
        }
    }

    /** Creates a merger configured with the given sources and destination stream. */
    private PDFMergerUtility createPDFMergerUtility(List<InputStream> sources,
            OutputStream mergedPDFOutputStream) {
        LOG.info("Initialising PDF merge utility");
        PDFMergerUtility pdfMerger = new PDFMergerUtility();
        pdfMerger.addSources(sources);
        pdfMerger.setDestinationStream(mergedPDFOutputStream);
        return pdfMerger;
    }

    /** Builds the document information dictionary (title, creator, subject). */
    private PDDocumentInformation createPDFDocumentInfo(String title,
            String creator, String subject) {
        LOG.info("Setting document info (title, author, subject) for merged PDF");
        PDDocumentInformation documentInformation = new PDDocumentInformation();
        documentInformation.setTitle(title);
        documentInformation.setCreator(creator);
        documentInformation.setSubject(subject);
        return documentInformation;
    }

    /**
     * Builds the XMP metadata (PDF/A identification, Dublin Core and XMP Basic
     * schemas), serializes it as XML into the given COS stream and wraps that
     * stream as {@link PDMetadata}.
     */
    private PDMetadata createXMPMetadata(COSStream cosStream, String title,
            String creator, String subject) throws BadFieldValueException,
            TransformerException, IOException {
        LOG.info("Setting XMP metadata (title, author, subject) for merged PDF");
        XMPMetadata xmpMetadata = XMPMetadata.createXMPMetadata();

        // PDF/A-1b properties
        PDFAIdentificationSchema pdfaSchema = xmpMetadata
                .createAndAddPFAIdentificationSchema();
        pdfaSchema.setPart(1);
        pdfaSchema.setConformance("B");

        // Dublin Core properties
        DublinCoreSchema dublinCoreSchema = xmpMetadata
                .createAndAddDublinCoreSchema();
        dublinCoreSchema.setTitle(title);
        dublinCoreSchema.addCreator(creator);
        dublinCoreSchema.setDescription(subject);

        // XMP Basic properties
        XMPBasicSchema basicSchema = xmpMetadata.createAndAddXMPBasicSchema();
        Calendar creationDate = Calendar.getInstance();
        basicSchema.setCreateDate(creationDate);
        basicSchema.setModifyDate(creationDate);
        basicSchema.setMetadataDate(creationDate);
        basicSchema.setCreatorTool(creator);

        // Create and return XMP data structure in XML format
        ByteArrayOutputStream xmpOutputStream = null;
        OutputStream cosXMPStream = null;
        try {
            xmpOutputStream = new ByteArrayOutputStream();
            cosXMPStream = cosStream.createOutputStream();
            new XmpSerializer().serialize(xmpMetadata, xmpOutputStream, true);
            cosXMPStream.write(xmpOutputStream.toByteArray());
            return new PDMetadata(cosStream);
        } finally {
            IOUtils.closeQuietly(xmpOutputStream);
            IOUtils.closeQuietly(cosXMPStream);
        }
    }
}
4. And finally, the test case.
package test.com.sambhashanam.pdfmerger;

import java.io.IOException;

import org.junit.Test;

import com.sambhashanam.pdfmerger.PDFMergingUtility;

/**
 * Runs a merge of every PDF file in a directory into one output file.
 * <p>
 * The locations default to the paths used in the article and can be overridden
 * with the system properties {@code pdfmerger.inputDir} and
 * {@code pdfmerger.outputFile}.
 */
public class PDFMergingUtilityTest {

    /** Directory holding the PDF files to merge. */
    private static final String INPUT_DIRECTORY = System.getProperty(
            "pdfmerger.inputDir",
            "F:/Development/Projects/Java/Eclipse/PDFBox/pdfmerger/src/main/resources/pdf-files");

    /** Path of the merged PDF file to write. */
    private static final String OUTPUT_FILE = System.getProperty(
            "pdfmerger.outputFile",
            "F:/Development/Projects/Java/Eclipse/PDFBox/pdfmerger/src/main/resources/pdf-out-files/1-800.pdf");

    /** Main-memory limit handed to the merger. NOTE(review): 0 presumably means "buffer in temp files only" - confirm against PDFBox MemoryUsageSetting. */
    private static final long MAX_MAIN_MEMORY_BYTES = 0;

    /** Storage limit handed to the merger; upper-case L suffix so it is not misread as a trailing digit 1. */
    private static final long MAX_STORAGE_BYTES = 1000000000000L;

    @Test
    public void test() throws IOException {
        PDFMergingUtility pdfMergingUtility = new PDFMergingUtility();
        pdfMergingUtility.merge(INPUT_DIRECTORY, OUTPUT_FILE,
                "800 Merged files", "dhananjay.jha",
                "Test result of 800 merged files",
                MAX_MAIN_MEMORY_BYTES, MAX_STORAGE_BYTES);
    }
}
Happy coding!
The complete source files are available here: pdfmerger