Backend Development 7 min read

10 Practical Python Scripts for PDF Manipulation

This article presents ten useful Python scripts that demonstrate how to create, merge, split, extract text, add watermarks, convert, rotate, edit metadata, and encrypt PDF files using libraries such as PyPDF2, fpdf, pdf2image, and Pillow.

Test Development Learning Exchange
Test Development Learning Exchange
Test Development Learning Exchange
10 Practical Python Scripts for PDF Manipulation

PDF files are ubiquitous in daily work, and Python offers powerful libraries to handle them; this guide shares ten practical scripts covering PDF creation, editing, information extraction, and conversion.

1. PDF Merge

from PyPDF2 import PdfMerger

def merge_pdfs(paths, output):
    """Concatenate several PDF files into a single document.

    Args:
        paths: Iterable of input PDF paths, appended in iteration order.
        output: Destination passed straight to ``PdfMerger.write``.
    """
    merger = PdfMerger()
    try:
        for pdf in paths:
            merger.append(pdf)
        merger.write(output)
    finally:
        # Close the merger even when an append or the final write fails,
        # so its resources are not left dangling on error.
        merger.close()

merge_pdfs(["doc1.pdf", "doc2.pdf"], "merged.pdf")

2. PDF Page Split

from PyPDF2 import PdfReader, PdfWriter

def split_pdf(input_pdf, start_page, end_page, output_pdf):
    """Copy an inclusive range of pages from one PDF into a new file.

    Page positions are zero-based indices into ``reader.pages``, so
    ``start_page=0`` refers to the first page of the document.

    Args:
        input_pdf: Path of the source PDF.
        start_page: Zero-based index of the first page to copy.
        end_page: Zero-based index of the last page to copy (inclusive).
        output_pdf: Path of the PDF to write.

    Raises:
        ValueError: If ``start_page`` is negative or greater than ``end_page``.
        IndexError: If ``end_page`` lies beyond the last page.
    """
    reader = PdfReader(input_pdf)
    page_count = len(reader.pages)
    # Without these checks a negative index would silently wrap around to
    # the end of the document and an inverted range would write an empty PDF.
    if start_page < 0 or start_page > end_page:
        raise ValueError(
            f"invalid page range {start_page}-{end_page}: expected "
            "0 <= start_page <= end_page"
        )
    if end_page >= page_count:
        raise IndexError(
            f"end_page {end_page} is out of range for a document with "
            f"{page_count} pages"
        )
    writer = PdfWriter()
    for i in range(start_page, end_page + 1):
        writer.add_page(reader.pages[i])
    with open(output_pdf, "wb") as fp:
        writer.write(fp)

# Indices are zero-based: this copies the 11th through 21st pages.
split_pdf("large_document.pdf", 10, 20, "pages_10_to_20.pdf")

3. Text Extraction

from PyPDF2 import PdfReader

def extract_text(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Page texts are joined with no separator between them.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        A single string holding the extracted text of all pages.
    """
    reader = PdfReader(pdf_path)
    # ``or ""`` is defensive: it keeps the join working should a page yield
    # None instead of a string. Joining once also avoids building the result
    # through repeated string concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)

text = extract_text("example.pdf")
print(text)

4. Create PDF

from fpdf import FPDF

def create_pdf(output):
    """Write a one-page PDF containing the line "Hello, world!".

    Args:
        output: Destination handed to ``FPDF.output``.
    """
    document = FPDF()
    document.add_page()
    document.set_font(family="Arial", size=12)
    # One cell, 200 wide and 10 high, holds the greeting.
    document.cell(w=200, h=10, txt="Hello, world!", ln=True)
    document.output(output)

create_pdf("hello_world.pdf")

5. Add Watermark

from PyPDF2 import PdfReader, PdfWriter, PdfMerger

def add_watermark(input_pdf, watermark_pdf, output_pdf):
    """Stamp a watermark onto every page of a PDF.

    The first page of ``watermark_pdf`` is merged into each page of
    ``input_pdf`` and the result is written to ``output_pdf``.

    Args:
        input_pdf: Path of the document to watermark.
        watermark_pdf: Path of a PDF whose first page is the watermark.
        output_pdf: Path of the watermarked PDF to write.
    """
    watermark = PdfReader(watermark_pdf).pages[0]
    writer = PdfWriter()
    for page in PdfReader(input_pdf).pages:
        # merge_page adds the watermark's content to this page in place.
        page.merge_page(watermark)
        writer.add_page(page)
    with open(output_pdf, "wb") as fp:
        writer.write(fp)

add_watermark("original.pdf", "watermark.pdf", "watermarked.pdf")

6. PDF to Images

from pdf2image import convert_from_path

def pdf_to_images(pdf_path, output_folder):
    """Render each page of a PDF to a JPEG file.

    Files are written as ``page_0.jpg``, ``page_1.jpg``, ... inside
    ``output_folder``, numbered from zero in page order.

    Args:
        pdf_path: Path of the PDF to convert.
        output_folder: Directory for the images; created if it is missing.
    """
    import os  # local import keeps this snippet self-contained

    # Saving into a missing directory would fail, so create it up front.
    os.makedirs(output_folder, exist_ok=True)
    images = convert_from_path(pdf_path)
    for i, image in enumerate(images):
        image.save(os.path.join(output_folder, f"page_{i}.jpg"), "JPEG")

pdf_to_images("example.pdf", "images")

7. Images to PDF

from PIL import Image

def images_to_pdf(image_paths, output_pdf):
    """Combine image files into one PDF, one image per page.

    Args:
        image_paths: Iterable of image file paths, in page order.
        output_pdf: Path of the PDF to write.

    Raises:
        IndexError: If ``image_paths`` yields no images.
    """
    from contextlib import ExitStack  # local import keeps the snippet self-contained

    # Modes that can be written to PDF as-is; anything else (for example
    # RGBA) is converted to RGB first. NOTE(review): confirm this set
    # against the Pillow release in use.
    pdf_modes = {"1", "L", "P", "RGB", "CMYK"}

    # ExitStack closes every opened or converted image once the PDF is
    # written, including when saving fails.
    with ExitStack() as stack:
        pages = []
        for path in image_paths:
            img = stack.enter_context(Image.open(path))
            if img.mode not in pdf_modes:
                img = stack.enter_context(img.convert("RGB"))
            pages.append(img)
        if not pages:
            raise IndexError("image_paths must contain at least one image")
        pages[0].save(output_pdf, save_all=True, append_images=pages[1:])

images_to_pdf(["image1.jpg", "image2.jpg"], "images.pdf")

8. Rotate PDF Pages

from PyPDF2 import PdfWriter, PdfReader

def rotate_pages(pdf_path, output_pdf, angle=90):
    """Rotate every page of a PDF and write the result to a new file.

    Args:
        pdf_path: Path of the PDF to read.
        output_pdf: Path of the rotated PDF to write.
        angle: Rotation passed to ``page.rotate``; must be a multiple
            of 90. Defaults to 90.

    Raises:
        ValueError: If ``angle`` is not a multiple of 90.
    """
    if angle % 90 != 0:
        raise ValueError(f"angle must be a multiple of 90, got {angle}")
    pdf_writer = PdfWriter()
    pdf = PdfReader(pdf_path)
    for page in pdf.pages:
        page.rotate(angle)
        pdf_writer.add_page(page)
    with open(output_pdf, "wb") as out:
        pdf_writer.write(out)

rotate_pages("example.pdf", "rotated.pdf")

9. Modify PDF Metadata

from PyPDF2 import PdfWriter, PdfReader

def modify_metadata(input_pdf, output_pdf, title="New Title"):
    """Write a copy of a PDF with its document title replaced.

    All pages are copied, existing text-valued document information is
    carried over, and ``/Title`` is set to ``title``.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the PDF to write.
        title: New document title.
    """
    reader = PdfReader(input_pdf)
    writer = PdfWriter()
    for page in reader.pages:
        writer.add_page(page)

    # Carry over existing info entries. Only string values are kept because
    # add_metadata expects text. NOTE(review): non-text entries are dropped.
    existing = reader.metadata or {}
    preserved = {}
    for key in existing:
        value = existing[key]
        if isinstance(value, str):
            preserved[key] = value
    if preserved:
        writer.add_metadata(preserved)

    # Applied last so the new title overrides any title copied above.
    writer.add_metadata({"/Title": title})
    with open(output_pdf, "wb") as fp:
        writer.write(fp)

modify_metadata("example.pdf", "modified.pdf", "Updated Title")

10. Password Protect PDF

from PyPDF2 import PdfWriter, PdfReader

def encrypt_pdf(input_pdf, output_pdf, password):
    """Write a password-protected copy of a PDF.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the encrypted PDF to write.
        password: Password required to open the output file.

    Raises:
        ValueError: If ``password`` is empty.
    """
    # An empty password would produce a file that is not really protected.
    if not password:
        raise ValueError("password must be a non-empty string")
    pdf = PdfReader(input_pdf)
    pdf_writer = PdfWriter()
    pdf_writer.append_pages_from_reader(pdf)
    # The user password is passed positionally to avoid the deprecated
    # ``user_pwd`` / ``owner_pwd`` keyword names. No owner password is given,
    # matching the original ``owner_pwd=None``.
    pdf_writer.encrypt(password, use_128bit=True)
    with open(output_pdf, "wb") as fh:
        pdf_writer.write(fh)

encrypt_pdf("example.pdf", "protected.pdf", "mysecretpassword")

Together, these scripts cover the most common PDF tasks, from text extraction and page rotation to watermarking and password protection, and can be adapted to suit specific project needs.

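Before running these examples, ensure the required libraries (PyPDF2, fpdf, pdf2image, Pillow) are installed via pip. Note that pdf2image also depends on the Poppler utilities, which must be installed separately on the system.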

Tags: Python, PDF, PyPDF2, pillow, fpdf, pdf2image
Test Development Learning Exchange
Written by

Test Development Learning Exchange

Test Development Learning Exchange

0 followers
Reader feedback

How this landed with the community

login Sign in to like

Rate this article

Was this worth your time?

Sign in to rate
Discussion

0 Comments

Thoughtful readers leave field notes, pushback, and hard-won operational detail here.