10 Practical Python Scripts for PDF Manipulation
This article presents ten useful Python scripts that demonstrate how to create, merge, split, extract text, add watermarks, convert, rotate, edit metadata, and encrypt PDF files using libraries such as PyPDF2, fpdf, pdf2image, and Pillow.
PDF files are ubiquitous in daily work, and Python offers powerful libraries to handle them; this guide shares ten practical scripts covering PDF creation, editing, information extraction, and conversion.
1. PDF Merge
from PyPDF2 import PdfMerger
def merge_pdfs(paths, output):
    """Concatenate several PDF files into one document.

    Args:
        paths: Iterable of input PDF paths; files are appended in
            iteration order.
        output: Destination path of the merged PDF.
    """
    merger = PdfMerger()
    try:
        for pdf in paths:
            merger.append(pdf)
        merger.write(output)
    finally:
        # Always close the merger, even when an input cannot be appended
        # or the output cannot be written.
        merger.close()
merge_pdfs(["doc1.pdf", "doc2.pdf"], "merged.pdf")
2. PDF Page Split
from PyPDF2 import PdfReader, PdfWriter
def split_pdf(input_pdf, start_page, end_page, output_pdf):
    """Copy an inclusive range of pages into a new PDF file.

    Page numbers are zero-based indexes into the source document, so
    ``split_pdf(src, 0, 0, dst)`` extracts only the first page.

    Args:
        input_pdf: Path of the source PDF.
        start_page: Zero-based index of the first page to copy.
        end_page: Zero-based index of the last page to copy (inclusive).
        output_pdf: Path of the PDF to create.

    Raises:
        IndexError: If the range is negative, reversed, or extends past
            the last page of the source document.
    """
    reader = PdfReader(input_pdf)
    page_count = len(reader.pages)

    # Validate up front: a negative index would otherwise silently wrap
    # around to the end of the document instead of failing.
    if not 0 <= start_page <= end_page < page_count:
        raise IndexError(
            f"page range {start_page}-{end_page} is outside 0-{page_count - 1}"
        )

    writer = PdfWriter()
    for i in range(start_page, end_page + 1):
        writer.add_page(reader.pages[i])

    with open(output_pdf, "wb") as fp:
        writer.write(fp)
split_pdf("large_document.pdf", 10, 20, "pages_10_to_20.pdf")
3. Text Extraction
from PyPDF2 import PdfReader
def extract_text(pdf_path):
    """Return the text of every page of a PDF as one string.

    Page texts are concatenated in page order with no separator.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The concatenated text of all pages; an empty string when the
        document has no pages.
    """
    reader = PdfReader(pdf_path)
    # join() avoids repeated string reallocation, and ``or ""`` keeps a
    # page whose extraction yields None from raising TypeError.
    return "".join(page.extract_text() or "" for page in reader.pages)
text = extract_text("example.pdf")
print(text)
4. Create PDF
from fpdf import FPDF
def create_pdf(output):
    """Write a one-page PDF containing a single greeting line.

    Args:
        output: Destination path of the generated PDF.
    """
    document = FPDF()
    document.add_page()
    document.set_font("Arial", size=12)

    # One 200x10 cell holding the text, followed by a line break.
    document.cell(200, 10, txt="Hello, world!", ln=True)

    document.output(output)
create_pdf("hello_world.pdf")
5. Add Watermark
from PyPDF2 import PdfReader, PdfWriter, PdfMerger
def add_watermark(input_pdf, watermark_pdf, output_pdf):
    """Stamp the first page of a watermark PDF onto every page of a PDF.

    Args:
        input_pdf: Path of the document to watermark.
        watermark_pdf: Path of a PDF whose first page is used as the
            watermark.
        output_pdf: Path of the watermarked PDF to create.
    """
    watermark = PdfReader(watermark_pdf).pages[0]
    reader = PdfReader(input_pdf)
    writer = PdfWriter()

    for page in reader.pages:
        # merge_page draws the watermark's content onto this page.
        page.merge_page(watermark)
        writer.add_page(page)

    with open(output_pdf, "wb") as fp:
        writer.write(fp)
add_watermark("original.pdf", "watermark.pdf", "watermarked.pdf")
6. PDF to Images
from pdf2image import convert_from_path
def pdf_to_images(pdf_path, output_folder):
    """Render each page of a PDF to a JPEG file.

    Files are named ``page_0.jpg``, ``page_1.jpg``, ... inside
    ``output_folder``, which is created if it does not exist.

    Args:
        pdf_path: Path of the PDF to render.
        output_folder: Directory that receives the JPEG files.
    """
    from pathlib import Path

    target_dir = Path(output_folder)
    # Saving into a missing directory would fail on the first page.
    target_dir.mkdir(parents=True, exist_ok=True)

    for i, image in enumerate(convert_from_path(pdf_path)):
        image.save(str(target_dir / f"page_{i}.jpg"), "JPEG")
pdf_to_images("example.pdf", "images")
7. Images to PDF
from PIL import Image
def images_to_pdf(image_paths, output_pdf):
    """Combine image files into one PDF, one image per page.

    Args:
        image_paths: Sequence of image file paths, in page order.
        output_pdf: Destination path of the generated PDF.

    Raises:
        ValueError: If ``image_paths`` is empty.
    """
    from contextlib import ExitStack

    if not image_paths:
        raise ValueError("image_paths must contain at least one image")

    # ExitStack closes every opened image file once the PDF is written,
    # including when opening or saving fails part-way through.
    with ExitStack() as stack:
        images = [stack.enter_context(Image.open(x)) for x in image_paths]
        first, *rest = images
        first.save(output_pdf, save_all=True, append_images=rest)
images_to_pdf(["image1.jpg", "image2.jpg"], "images.pdf")
8. Rotate PDF Pages
from PyPDF2 import PdfWriter, PdfReader
def rotate_pages(pdf_path, output_pdf):
    """Write a copy of a PDF in which every page has been rotated by 90.

    Args:
        pdf_path: Path of the source PDF.
        output_pdf: Path of the rotated PDF to create.
    """
    source = PdfReader(pdf_path)
    rotated = PdfWriter()

    for sheet in source.pages:
        sheet.rotate(90)
        rotated.add_page(sheet)

    with open(output_pdf, "wb") as destination:
        rotated.write(destination)
rotate_pages("example.pdf", "rotated.pdf")
9. Modify PDF Metadata
from PyPDF2 import PdfWriter, PdfReader
def modify_metadata(input_pdf, output_pdf, title="New Title"):
    """Write a copy of a PDF with its title metadata replaced.

    All pages are copied; existing metadata is carried over and only the
    ``/Title`` entry is overwritten.

    Args:
        input_pdf: Path of the source PDF.
        output_pdf: Path of the PDF to create.
        title: New document title.
    """
    reader = PdfReader(input_pdf)
    writer = PdfWriter()

    # Copy the pages first: a writer with no pages produces an empty PDF.
    for page in reader.pages:
        writer.add_page(page)

    # Preserve whatever metadata the source carries (it may have none),
    # then override the title.
    # NOTE(review): assumes existing metadata values are string-like,
    # as add_metadata expects -- confirm for unusual documents.
    if reader.metadata:
        writer.add_metadata(dict(reader.metadata))
    writer.add_metadata({"/Title": title})

    with open(output_pdf, "wb") as fp:
        writer.write(fp)
modify_metadata("example.pdf", "modified.pdf", "Updated Title")
10. Password Protect PDF
from PyPDF2 import PdfWriter, PdfReader
def encrypt_pdf(input_pdf, output_pdf, password):
    """Write a password-protected copy of a PDF.

    Args:
        input_pdf: Path of the source PDF.
        output_pdf: Path of the encrypted PDF to create.
        password: User password required to open the output file.
    """
    reader = PdfReader(input_pdf)
    writer = PdfWriter()
    writer.append_pages_from_reader(reader)

    # Arguments are (user password, owner password, use 128-bit).
    # Passed positionally because the ``user_pwd``/``owner_pwd`` keyword
    # names are deprecated in newer PyPDF2 releases; no separate owner
    # password is supplied.
    writer.encrypt(password, None, True)

    with open(output_pdf, "wb") as fh:
        writer.write(fh)
encrypt_pdf("example.pdf", "protected.pdf", "mysecretpassword")
The above scripts cover a broad range of PDF handling tasks, from basic text extraction to page rotation and password protection, and can be adapted to suit specific project needs.
Before running these examples, install the required libraries with pip (pip install PyPDF2 fpdf pdf2image Pillow). Note that pdf2image additionally requires the Poppler utilities to be installed on the system.
Test Development Learning Exchange
Test Development Learning Exchange
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.