Fundamentals 6 min read

Python Utility Scripts for Data Cleaning, Translation, File Sync, Cloud Backup, and More

This article presents a collection of Python utility scripts that demonstrate how to clean CSV data, translate text files, synchronize folders, upload files to S3, count directory contents, classify files by type, perform OCR on images, convert video to audio, extract images from webpages, and generate text summaries using modern libraries.

Test Development Learning Exchange

Sep 1, 2024

Python Utility Scripts for Data Cleaning, Translation, File Sync, Cloud Backup, and More

CSV File Data Cleaning Tool

import csv

def clean_csv(input_file, output_file):
    """Copy a CSV file, trimming surrounding whitespace from every cell.

    Args:
        input_file: Path of the UTF-8 encoded CSV file to read.
        output_file: Path of the UTF-8 encoded CSV file to create or overwrite.
    """
    # newline='' hands line-ending handling over to the csv module.
    with open(input_file, mode='r', newline='', encoding='utf-8') as source:
        with open(output_file, mode='w', newline='', encoding='utf-8') as target:
            csv.writer(target).writerows(
                [value.strip() for value in record]
                for record in csv.reader(source)
            )

# Example run: the paths below are placeholders to replace with real files.
input_file = '/path/to/input.csv'
output_file = '/path/to/cleaned_output.csv'
clean_csv(input_file, output_file)

Text File Translation Tool

from deep_translator import GoogleTranslator

def translate_text(input_file, output_file, target_language):
    """Translate the contents of a text file and write the result to another file.

    The whole input is read as UTF-8, passed to ``GoogleTranslator`` in a
    single ``translate`` call with automatic source-language detection, and
    the returned text is written as UTF-8.

    Args:
        input_file: Path of the text file to translate.
        output_file: Path of the file that receives the translation.
        target_language: Language code passed to the translator as ``target``.
    """
    with open(input_file, 'r', encoding='utf-8') as src:
        original = src.read()

    translator = GoogleTranslator(source='auto', target=target_language)
    result = translator.translate(original)

    with open(output_file, 'w', encoding='utf-8') as dst:
        dst.write(result)

# Example run: placeholder paths; the translated copy is written to output_file.
input_file = '/path/to/your/text.txt'
output_file = '/path/to/translated_text.txt'
target_language = 'es'  # Spanish
translate_text(input_file, output_file, target_language)

Folder Synchronization Tool

import shutil
import os

def sync_folders(src_folder, dst_folder):
    """Copy every entry of ``src_folder`` into ``dst_folder``.

    Sub-directories are copied recursively and merged into an existing
    directory of the same name; files are copied with ``shutil.copy2`` and
    overwrite same-named files. This is a one-way copy: entries that exist
    only in the destination are left untouched.

    Args:
        src_folder: Directory whose entries are copied.
        dst_folder: Directory that receives the copies; created if missing.

    Raises:
        FileNotFoundError: If ``src_folder`` does not exist.
    """
    # List the source first so a missing source fails before anything is created.
    entries = os.listdir(src_folder)
    # copy2 does not create missing parent directories, so without this a
    # not-yet-existing destination fails on the first regular file.
    os.makedirs(dst_folder, exist_ok=True)
    for item in entries:
        s = os.path.join(src_folder, item)
        d = os.path.join(dst_folder, item)
        if os.path.isdir(s):
            shutil.copytree(s, d, dirs_exist_ok=True)
        else:
            shutil.copy2(s, d)

# Example run: placeholder directories to replace with real ones.
src_folder = '/path/to/source/folder'
dst_folder = '/path/to/destination/folder'
sync_folders(src_folder, dst_folder)

Data Backup to Cloud Storage Service (AWS S3)

import boto3

def upload_to_s3(bucket_name, local_file_path, remote_file_path):
    """Upload one local file to an S3 bucket.

    Args:
        bucket_name: Name of the destination bucket.
        local_file_path: Path of the local file to upload.
        remote_file_path: Object key the file is stored under in the bucket.
    """
    client = boto3.client('s3')
    client.upload_file(
        Filename=local_file_path,
        Bucket=bucket_name,
        Key=remote_file_path,
    )

# Example run: placeholder bucket name, local path and object key.
bucket_name = 'your-bucket-name'
local_file_path = '/path/to/your/local/file'
remote_file_path = 'path/to/remote/file'
upload_to_s3(bucket_name, local_file_path, remote_file_path)

Folder Content Statistics Tool

import os

def count_files_and_dirs(directory):
    """Count the files and sub-directories found beneath a directory.

    The tree is traversed with ``os.walk``; the starting directory itself is
    not counted.

    Args:
        directory: Root of the tree to walk.

    Returns:
        A ``(total_files, total_dirs)`` tuple.
    """
    per_level = [
        (len(filenames), len(subdirs))
        for _, subdirs, filenames in os.walk(directory)
    ]
    file_total = sum(n_files for n_files, _ in per_level)
    dir_total = sum(n_dirs for _, n_dirs in per_level)
    return file_total, dir_total

# Example run: placeholder directory; prints both totals on one line.
directory = '/path/to/your/directory'
files, dirs = count_files_and_dirs(directory)
print(f"Total files: {files}, Total directories: {dirs}")

File Type Classification Tool

import os

def classify_files_by_type(directory):
    """Tally the regular files directly inside a directory by extension.

    Only the top level is inspected; sub-directories are skipped rather than
    descended into. Extensions are taken from ``os.path.splitext``, so they
    include the leading dot and are the empty string for names without one.

    Args:
        directory: Directory whose entries are inspected.

    Returns:
        A dict mapping each extension to the number of files that have it.
    """
    counts = {}
    for name in os.listdir(directory):
        if not os.path.isfile(os.path.join(directory, name)):
            continue
        _, suffix = os.path.splitext(name)
        counts[suffix] = counts.get(suffix, 0) + 1
    return counts

# Example run: placeholder directory; prints one line per extension found.
directory = '/path/to/your/directory'
file_types = classify_files_by_type(directory)
for ext, count in file_types.items():
    print(f"Files with extension '{ext}': {count}")

Image Text Recognition (OCR)

from PIL import Image
from pytesseract import pytesseract

def image_to_text(image_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image to read.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # Image.open leaves the underlying file open until the image is closed;
    # the context manager releases it as soon as OCR has finished.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img)

# Example run: placeholder image path; prints the text returned by OCR.
image_path = '/path/to/your/image.png'
text = image_to_text(image_path)
print(f"Text from image: {text}")

Video to Audio Conversion Tool

from moviepy.editor import VideoFileClip

def convert_video_to_audio(video_path, audio_path):
    """Write the audio track of a video file to a separate audio file.

    Args:
        video_path: Path of the video to read.
        audio_path: Path of the audio file to write.
    """
    clip = VideoFileClip(video_path)
    try:
        clip.audio.write_audiofile(audio_path)
    finally:
        # Close the clip even when writing fails, so the resources it holds
        # for reading the video are released instead of lingering.
        clip.close()

# Example run: placeholder input video and output audio paths.
video_path = '/path/to/your/video.mp4'
audio_path = '/path/to/converted/audio.mp3'
convert_video_to_audio(video_path, audio_path)

Extract Images from Webpage

import requests
from bs4 import BeautifulSoup
import os

def extract_images_from_webpage(url, output_dir):
    """Download the images referenced by ``<img>`` tags on a web page.

    Relative and protocol-relative ``src`` values are resolved against the
    page URL. Only ``http``/``https`` targets are fetched, and an image whose
    download returns an HTTP error status is skipped. Files are named after
    the last path segment of the image URL (query string excluded), so two
    images with the same name overwrite each other.

    Args:
        url: Address of the page to scan.
        output_dir: Directory the images are written to; created if missing.
    """
    # Local import keeps this snippet self-contained.
    from urllib.parse import urljoin, urlparse

    # Without a timeout an unresponsive server would block the call forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    os.makedirs(output_dir, exist_ok=True)
    for index, image in enumerate(soup.find_all('img')):
        src = image.get('src')
        if not src:
            continue
        # Resolve against the final page URL so relative paths still work
        # after redirects.
        image_url = urljoin(response.url, src)
        parsed = urlparse(image_url)
        if parsed.scheme not in ('http', 'https'):
            continue  # e.g. inline data: URIs have nothing to download
        image_response = requests.get(image_url, timeout=30)
        if not image_response.ok:
            continue  # do not store an error page as if it were an image
        # Use only the URL path so query strings never end up in the file
        # name; fall back to a generated name when the path has no usable
        # final segment.
        name = os.path.basename(parsed.path)
        if name in ('', '.', '..'):
            name = f'image_{index}'
        with open(os.path.join(output_dir, name), 'wb') as f:
            f.write(image_response.content)

# Example run: placeholder page URL and output directory.
url = 'https://example.com'
output_dir = '/path/to/output/images'
extract_images_from_webpage(url, output_dir)

Text Summarization Tool

from transformers import pipeline

def generate_summary(text):
    """Summarise a piece of text with a ``transformers`` summarization pipeline.

    A new pipeline is built on every call and run without sampling, using
    ``min_length=30`` and ``max_length=130``.

    Args:
        text: The text to summarise.

    Returns:
        The ``'summary_text'`` value of the first result.
    """
    summarize = pipeline("summarization")
    results = summarize(text, do_sample=False, min_length=30, max_length=130)
    first_result = results[0]
    return first_result['summary_text']

# Example run: summarises a short filler passage and prints the result.
text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Sed non risus. Suspendisse lectus tortor, dignissim sit amet, adipiscing nec, ultricies sed, dolor."""
summary = generate_summary(text)
print(f"Summary: {summary}")

Original Source

Signed-in readers can open the original source through BestHub's protected redirect.

Republication Notice

This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contact us and we will review it promptly.

AI cloud storage utilities file-management data-cleaning

Written by

Test Development Learning Exchange

0 followers

Reader feedback

How this landed with the community

Rate this article

Was this worth your time?

Discussion

0 Comments

Thoughtful readers leave field notes, pushback, and hard-won operational detail here.