Fundamentals 12 min read

Processing Multiple Excel Files with Python pandas and openpyxl

This guide demonstrates how to install pandas and openpyxl, then use Python to read, merge, batch‑process, extract, clean, rename sheets, export to CSV, and analyze data across multiple Excel files, providing complete code examples for each task.

Test Development Learning Exchange
Test Development Learning Exchange
Test Development Learning Exchange
Processing Multiple Excel Files with Python pandas and openpyxl

Handling multiple Excel files is a common data‑analysis task; Python offers powerful libraries such as pandas and openpyxl to simplify merging, batch processing, and fine‑grained manipulation.

Installation : Ensure pandas and openpyxl are installed.

pip install pandas openpyxl

1. Read and merge multiple Excel files : Iterate over a folder, read each file with pd.read_excel , and concatenate into a single DataFrame.

import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
all_data = pd.DataFrame()
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_excel(file_path)
    all_data = pd.concat([all_data, df], ignore_index=True)
print(all_data.head())

2. Batch processing (add column, filter, save) :

import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
output_folder = 'path/to/output/folder'
os.makedirs(output_folder, exist_ok=True)
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_excel(file_path)
    df['New_Column'] = 'Some Value'
    filtered_df = df[df['Some_Column'] > 100]
    output_file_path = os.path.join(output_folder, file)
    filtered_df.to_excel(output_file_path, index=False)
print("Processing complete.")

3. Extract specific information from each file (e.g., first cell):

import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
results = []
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_excel(file_path)
    specific_value = df.iloc[0, 0]
    results.append((file, specific_value))
for file, value in results:
    print(f"File: {file}, Specific Value: {value}")

4. Fine‑grained control with openpyxl (modify cells, save):

import openpyxl
import os
folder_path = 'path/to/your/excel/files'
output_folder = 'path/to/output/folder'
os.makedirs(output_folder, exist_ok=True)
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    workbook = openpyxl.load_workbook(file_path)
    sheet = workbook.active
    sheet['A1'] = 'New Value'
    output_file_path = os.path.join(output_folder, file)
    workbook.save(output_file_path)
print("Processing complete.")

5. Merge files into separate worksheets of a single workbook :

import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
output_file = 'merged_workbook.xlsx'
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    for file in excel_files:
        file_path = os.path.join(folder_path, file)
        df = pd.read_excel(file_path)
        sheet_name = os.path.splitext(file)[0]
        df.to_excel(writer, sheet_name=sheet_name, index=False)
print("Merging complete.")

6. Data cleaning (drop empty rows, fill missing values) :

import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
output_folder = 'path/to/output/folder'
os.makedirs(output_folder, exist_ok=True)
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_excel(file_path)
    df.dropna(how='all', inplace=True)
    df.fillna(0, inplace=True)
    output_file_path = os.path.join(output_folder, file)
    df.to_excel(output_file_path, index=False)
print("Data cleaning complete.")

7. Extract specific columns and concatenate :

import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
all_data = pd.DataFrame()
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_excel(file_path, usecols=['Column1', 'Column2'])
    all_data = pd.concat([all_data, df], ignore_index=True)
print(all_data.head())

8. Batch rename worksheets :

import openpyxl
import os
folder_path = 'path/to/your/excel/files'
output_folder = 'path/to/output/folder'
os.makedirs(output_folder, exist_ok=True)
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    workbook = openpyxl.load_workbook(file_path)
    if 'OldSheetName' in workbook.sheetnames:
        sheet = workbook['OldSheetName']
        sheet.title = 'NewSheetName'
    output_file_path = os.path.join(output_folder, file)
    workbook.save(output_file_path)
print("Sheet renaming complete.")

9. Export each Excel file to CSV :

import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
output_folder = 'path/to/output/csvs'
os.makedirs(output_folder, exist_ok=True)
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_excel(file_path)
    base_name = os.path.splitext(file)[0]
    output_file_path = os.path.join(output_folder, f"{base_name}.csv")
    df.to_csv(output_file_path, index=False)
print("Export to CSV complete.")

10. Simple data analysis across all files (sum and average of a column):

import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
all_data = pd.DataFrame()
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_excel(file_path)
    all_data = pd.concat([all_data, df], ignore_index=True)
total_sum = all_data['Some_Column'].sum()
average_value = all_data['Some_Column'].mean()
print(f"Total Sum: {total_sum}")
print(f"Average Value: {average_value}")

In summary, pandas excels at high‑level data merging, cleaning, and analysis, while openpyxl provides low‑level control for cell‑wise edits and sheet management; choose the approach that best fits your specific Excel‑processing needs.

AutomationData ProcessingExcelpandasopenpyxl
Test Development Learning Exchange
Written by

Test Development Learning Exchange

Test Development Learning Exchange

0 followers
Reader feedback

How this landed with the community

login Sign in to like

Rate this article

Was this worth your time?

Sign in to rate
Discussion

0 Comments

Thoughtful readers leave field notes, pushback, and hard-won operational detail here.