Python Utility Scripts for Data Cleaning, Translation, File Sync, Cloud Backup, and More
This article presents a collection of Python utility scripts that demonstrate how to clean CSV data, translate text files, synchronize folders, upload files to S3, count directory contents, classify files by type, perform OCR on images, convert video to audio, extract images from webpages, and generate text summaries using modern libraries.
CSV File Data Cleaning Tool
import csv
def clean_csv(input_file, output_file):
    """Copy a CSV file, stripping surrounding whitespace from every cell.

    Args:
        input_file: Path of the CSV file to read (decoded as UTF-8).
        output_file: Path of the CSV file to write (encoded as UTF-8).
            Opened in write mode, so existing content is replaced.
    """
    # newline='' lets the csv module handle line endings itself, which keeps
    # newlines embedded in quoted fields intact.
    with open(input_file, mode='r', newline='', encoding='utf-8') as infile, \
            open(output_file, mode='w', newline='', encoding='utf-8') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        # Rows are streamed one at a time; the file is never held in memory.
        writer.writerows([cell.strip() for cell in row] for row in reader)
input_file = '/path/to/input.csv'
output_file = '/path/to/cleaned_output.csv'
clean_csv(input_file, output_file)

Text File Translation Tool
from deep_translator import GoogleTranslator
def _split_into_chunks(text, max_chars):
    """Split text on line boundaries into chunks of at most max_chars characters."""
    chunks = []
    current = []
    size = 0
    for line in text.splitlines():
        # A single line longer than the limit is hard-split; an empty line is
        # kept as one empty piece so blank lines survive inside a chunk.
        pieces = [line[i:i + max_chars] for i in range(0, len(line), max_chars)] or ['']
        for piece in pieces:
            if current and size + len(piece) + 1 > max_chars:
                chunks.append('\n'.join(current))
                current = []
                size = 0
            current.append(piece)
            size += len(piece) + 1  # +1 accounts for the joining newline
    if current:
        chunks.append('\n'.join(current))
    return chunks


def translate_text(input_file, output_file, target_language):
    """Translate a UTF-8 text file and write the result to another file.

    The source language is auto-detected by the translator.

    Args:
        input_file: Path of the text file to translate.
        output_file: Path the translated text is written to (overwritten).
        target_language: Target language code passed to GoogleTranslator,
            for example 'es'.
    """
    # GoogleTranslator rejects payloads of 5000 characters or more, so longer
    # files are translated piecewise. The chunk size leaves some headroom.
    request_limit = 5000
    chunk_size = 4500

    with open(input_file, 'r', encoding='utf-8') as file:
        text = file.read()

    translator = GoogleTranslator(source='auto', target=target_language)
    if len(text) < request_limit:
        translated_text = translator.translate(text)
    else:
        # Chunks end on line boundaries, so joining with a newline restores
        # the break that separated them in the source.
        translated_text = '\n'.join(
            translator.translate(chunk)
            for chunk in _split_into_chunks(text, chunk_size)
        )

    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(translated_text)
input_file = '/path/to/your/text.txt'
output_file = '/path/to/translated_text.txt'
target_language = 'es' # Spanish
translate_text(input_file, output_file, target_language)

Folder Synchronization Tool
import shutil
import os
def sync_folders(src_folder, dst_folder):
    """Copy every entry of src_folder into dst_folder (one-way sync).

    Sub-directories are copied recursively and merged into any directory
    already present at the destination; files are copied with their
    metadata and overwrite same-named files. Entries that exist only in
    dst_folder are left untouched.

    Args:
        src_folder: Directory whose contents are copied.
        dst_folder: Directory to copy into; created if it does not exist.
    """
    # Without this, copying a plain file into a missing destination fails
    # because shutil.copy2 does not create parent directories.
    os.makedirs(dst_folder, exist_ok=True)

    for item in os.listdir(src_folder):
        s = os.path.join(src_folder, item)
        d = os.path.join(dst_folder, item)
        if os.path.isdir(s):
            shutil.copytree(s, d, dirs_exist_ok=True)
        else:
            shutil.copy2(s, d)
src_folder = '/path/to/source/folder'
dst_folder = '/path/to/destination/folder'
sync_folders(src_folder, dst_folder)

Data Backup to Cloud Storage Service (AWS S3)
import boto3
def upload_to_s3(bucket_name, local_file_path, remote_file_path):
    """Upload a local file to an S3 bucket.

    A new S3 client is created on each call with boto3's default
    configuration; no credentials are passed explicitly here.

    Args:
        bucket_name: Name of the destination bucket.
        local_file_path: Path of the file on disk to upload.
        remote_file_path: Object key the file is stored under in the bucket.
    """
    s3 = boto3.client('s3')
    # Note the argument order expected by upload_file: file, bucket, key.
    s3.upload_file(local_file_path, bucket_name, remote_file_path)
bucket_name = 'your-bucket-name'
local_file_path = '/path/to/your/local/file'
remote_file_path = 'path/to/remote/file'
upload_to_s3(bucket_name, local_file_path, remote_file_path)

Folder Content Statistics Tool
import os
def count_files_and_dirs(directory):
    """Count the files and sub-directories beneath a directory, recursively.

    The starting directory itself is not included in the directory count.

    Args:
        directory: Root of the tree to walk.

    Returns:
        A ``(total_files, total_dirs)`` tuple. A path that does not exist
        yields ``(0, 0)`` because os.walk ignores listing errors by default.
    """
    total_files = 0
    total_dirs = 0
    # Each os.walk step reports the immediate children of one directory, so
    # summing the per-step lengths counts every entry exactly once.
    for _root, dirs, files in os.walk(directory):
        total_files += len(files)
        total_dirs += len(dirs)
    return total_files, total_dirs
directory = '/path/to/your/directory'
files, dirs = count_files_and_dirs(directory)
print(f"Total files: {files}, Total directories: {dirs}")

File Type Classification Tool
import os
def classify_files_by_type(directory):
    """Count the regular files directly inside a directory by extension.

    Only the top level of the directory is inspected; sub-directories are
    neither counted nor descended into.

    Args:
        directory: Directory whose files are classified.

    Returns:
        A dict mapping each extension (including the leading dot, exactly as
        it appears in the file name) to the number of files that have it.
        Files without an extension are counted under the empty string.
    """
    file_types = {}
    for filename in os.listdir(directory):
        if not os.path.isfile(os.path.join(directory, filename)):
            continue
        extension = os.path.splitext(filename)[1]
        file_types[extension] = file_types.get(extension, 0) + 1
    return file_types
directory = '/path/to/your/directory'
file_types = classify_files_by_type(directory)
for ext, count in file_types.items():
print(f"Files with extension '{ext}': {count}")

Image Recognition Tag Generation (OCR)
from PIL import Image
from pytesseract import pytesseract
def image_to_text(image_path):
    """Run OCR on an image file and return the recognized text.

    Args:
        image_path: Path of the image to read.

    Returns:
        The string produced by pytesseract's ``image_to_string``.
    """
    # Image.open keeps the underlying file open; the context manager makes
    # sure it is closed once OCR has finished, even if OCR raises.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img)
image_path = '/path/to/your/image.png'
text = image_to_text(image_path)
print(f"Text from image: {text}")

Video to Audio Conversion Tool
from moviepy.editor import VideoFileClip
def convert_video_to_audio(video_path, audio_path):
    """Extract the audio track of a video file and save it as an audio file.

    Args:
        video_path: Path of the source video.
        audio_path: Path the audio is written to.

    Raises:
        ValueError: If the video has no audio track.
    """
    clip = VideoFileClip(video_path)
    try:
        # A video without sound has no audio clip attached; fail with a
        # clear message instead of an AttributeError on None.
        if clip.audio is None:
            raise ValueError(f"No audio track found in {video_path!r}")
        clip.audio.write_audiofile(audio_path)
    finally:
        # Release the reader resources held by the clip.
        clip.close()
video_path = '/path/to/your/video.mp4'
audio_path = '/path/to/converted/audio.mp3'
convert_video_to_audio(video_path, audio_path)

Extract Images from Webpage
import requests
from bs4 import BeautifulSoup
import os
def extract_images_from_webpage(url, output_dir):
    """Download every ``<img>`` referenced by a web page into a directory.

    Relative ``src`` values are resolved against the page URL. Images are
    saved under the last path component of their URL, so two images with
    the same file name overwrite each other.

    Args:
        url: Address of the page to scan.
        output_dir: Directory the images are written to; created if missing.

    Raises:
        requests.HTTPError: If the page itself returns an error status.
    """
    from urllib.parse import unquote, urljoin, urlparse

    timeout = 30  # seconds; without a timeout a stalled server blocks forever

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    os.makedirs(output_dir, exist_ok=True)

    for image in soup.find_all('img'):
        src = image.get('src')
        if not src:
            continue

        image_url = urljoin(url, src)
        parsed = urlparse(image_url)
        # Skip data: URIs and other schemes that cannot be fetched over HTTP.
        if parsed.scheme not in ('http', 'https'):
            continue

        # Build the file name from the URL path only, so query strings do not
        # end up in it, and reject names that would not denote a file inside
        # output_dir.
        name = os.path.basename(unquote(parsed.path))
        if name in ('', '.', '..'):
            continue

        image_response = requests.get(image_url, timeout=timeout)
        # Do not save an error page under an image file name.
        if not image_response.ok:
            continue

        with open(os.path.join(output_dir, name), 'wb') as f:
            f.write(image_response.content)
url = 'https://example.com'
output_dir = '/path/to/output/images'
extract_images_from_webpage(url, output_dir)

Text Summarization Tool
from transformers import pipeline
def generate_summary(text, max_length=130, min_length=30):
    """Summarize a piece of text with a transformers summarization pipeline.

    Args:
        text: The text to summarize.
        max_length: Upper length bound passed to the summarizer.
        min_length: Lower length bound passed to the summarizer.

    Returns:
        The ``summary_text`` of the first result, or an empty string when
        ``text`` is empty or contains only whitespace.
    """
    # Nothing to summarize; avoid loading the model for blank input.
    if not text or not text.strip():
        return ''

    # Building the pipeline is expensive, so it is created on first use and
    # kept on the function object for later calls.
    summarizer = getattr(generate_summary, '_summarizer', None)
    if summarizer is None:
        summarizer = pipeline("summarization")
        generate_summary._summarizer = summarizer

    summary = summarizer(
        text, max_length=max_length, min_length=min_length, do_sample=False
    )
    return summary[0]['summary_text']
text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Sed non risus. Suspendisse lectus tortor, dignissim sit amet, adipiscing nec, ultricies sed, dolor."""
summary = generate_summary(text)
print(f"Summary: {summary}")
Test Development Learning Exchange
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.