Fundamentals 12 min read

Processing Multiple Excel Files with Python pandas and openpyxl

This guide demonstrates how to install pandas and openpyxl, then use Python to read, merge, batch‑process, extract, clean, rename sheets, export to CSV, and analyze data across multiple Excel files, providing complete code examples for each task.

Test Development Learning Exchange

Nov 1, 2024

Processing Multiple Excel Files with Python pandas and openpyxl

Handling multiple Excel files is a common data‑analysis task; Python offers powerful libraries such as pandas and openpyxl to simplify merging, batch processing, and fine‑grained manipulation.

Installation : Ensure pandas and openpyxl are installed. pip install pandas openpyxl 1. Read and merge multiple Excel files : Iterate over a folder, read each file with pd.read_excel, and concatenate into a single DataFrame.

import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
all_data = pd.DataFrame()
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_excel(file_path)
    all_data = pd.concat([all_data, df], ignore_index=True)
print(all_data.head())

2. Batch processing (add column, filter, save) :

import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
output_folder = 'path/to/output/folder'
os.makedirs(output_folder, exist_ok=True)
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_excel(file_path)
    df['New_Column'] = 'Some Value'
    filtered_df = df[df['Some_Column'] > 100]
    output_file_path = os.path.join(output_folder, file)
    filtered_df.to_excel(output_file_path, index=False)
print("Processing complete.")

3. Extract specific information from each file (e.g., first cell):

import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
results = []
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_excel(file_path)
    specific_value = df.iloc[0, 0]
    results.append((file, specific_value))
for file, value in results:
    print(f"File: {file}, Specific Value: {value}")

4. Fine‑grained control with openpyxl (modify cells, save):

import openpyxl
import os
folder_path = 'path/to/your/excel/files'
output_folder = 'path/to/output/folder'
os.makedirs(output_folder, exist_ok=True)
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    workbook = openpyxl.load_workbook(file_path)
    sheet = workbook.active
    sheet['A1'] = 'New Value'
    output_file_path = os.path.join(output_folder, file)
    workbook.save(output_file_path)
print("Processing complete.")

5. Merge files into separate worksheets of a single workbook :

import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
output_file = 'merged_workbook.xlsx'
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    for file in excel_files:
        file_path = os.path.join(folder_path, file)
        df = pd.read_excel(file_path)
        sheet_name = os.path.splitext(file)[0]
        df.to_excel(writer, sheet_name=sheet_name, index=False)
print("Merging complete.")

6. Data cleaning (drop empty rows, fill missing values) :

import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
output_folder = 'path/to/output/folder'
os.makedirs(output_folder, exist_ok=True)
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_excel(file_path)
    df.dropna(how='all', inplace=True)
    df.fillna(0, inplace=True)
    output_file_path = os.path.join(output_folder, file)
    df.to_excel(output_file_path, index=False)
print("Data cleaning complete.")

7. Extract specific columns and concatenate :

import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
all_data = pd.DataFrame()
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_excel(file_path, usecols=['Column1', 'Column2'])
    all_data = pd.concat([all_data, df], ignore_index=True)
print(all_data.head())

8. Batch rename worksheets :

import openpyxl
import os
folder_path = 'path/to/your/excel/files'
output_folder = 'path/to/output/folder'
os.makedirs(output_folder, exist_ok=True)
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    workbook = openpyxl.load_workbook(file_path)
    if 'OldSheetName' in workbook.sheetnames:
        sheet = workbook['OldSheetName']
        sheet.title = 'NewSheetName'
    output_file_path = os.path.join(output_folder, file)
    workbook.save(output_file_path)
print("Sheet renaming complete.")

9. Export each Excel file to CSV :

import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
output_folder = 'path/to/output/csvs'
os.makedirs(output_folder, exist_ok=True)
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_excel(file_path)
    base_name = os.path.splitext(file)[0]
    output_file_path = os.path.join(output_folder, f"{base_name}.csv")
    df.to_csv(output_file_path, index=False)
print("Export to CSV complete.")

10. Simple data analysis across all files (sum and average of a column):

import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
all_data = pd.DataFrame()
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_excel(file_path)
    all_data = pd.concat([all_data, df], ignore_index=True)
total_sum = all_data['Some_Column'].sum()
average_value = all_data['Some_Column'].mean()
print(f"Total Sum: {total_sum}")
print(f"Average Value: {average_value}")

In summary, pandas excels at high‑level data merging, cleaning, and analysis, while openpyxl provides low‑level control for cell‑wise edits and sheet management; choose the approach that best fits your specific Excel‑processing needs.

Original Source

Signed-in readers can open the original source through BestHub's protected redirect.

Republication Notice

This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contactand we will review it promptly.

automation data-processing Excel openpyxl

Written by

Test Development Learning Exchange

0 followers

Reader feedback

How this landed with the community

Rate this article

Was this worth your time?

Discussion

0 Comments

Thoughtful readers leave field notes, pushback, and hard-won operational detail here.