Processing Multiple Excel Files with Python pandas and openpyxl
This guide demonstrates how to install pandas and openpyxl, then use Python to read, merge, batch‑process, extract, clean, rename sheets, export to CSV, and analyze data across multiple Excel files, providing complete code examples for each task.
Handling multiple Excel files is a common data‑analysis task; Python offers powerful libraries such as pandas and openpyxl to simplify merging, batch processing, and fine‑grained manipulation.
Installation : Ensure pandas and openpyxl are installed.
pip install pandas openpyxl1. Read and merge multiple Excel files : Iterate over a folder, read each file with pd.read_excel , and concatenate into a single DataFrame.
import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
all_data = pd.DataFrame()
for file in excel_files:
file_path = os.path.join(folder_path, file)
df = pd.read_excel(file_path)
all_data = pd.concat([all_data, df], ignore_index=True)
print(all_data.head())2. Batch processing (add column, filter, save) :
import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
output_folder = 'path/to/output/folder'
os.makedirs(output_folder, exist_ok=True)
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
for file in excel_files:
file_path = os.path.join(folder_path, file)
df = pd.read_excel(file_path)
df['New_Column'] = 'Some Value'
filtered_df = df[df['Some_Column'] > 100]
output_file_path = os.path.join(output_folder, file)
filtered_df.to_excel(output_file_path, index=False)
print("Processing complete.")3. Extract specific information from each file (e.g., first cell):
import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
results = []
for file in excel_files:
file_path = os.path.join(folder_path, file)
df = pd.read_excel(file_path)
specific_value = df.iloc[0, 0]
results.append((file, specific_value))
for file, value in results:
print(f"File: {file}, Specific Value: {value}")4. Fine‑grained control with openpyxl (modify cells, save):
import openpyxl
import os
folder_path = 'path/to/your/excel/files'
output_folder = 'path/to/output/folder'
os.makedirs(output_folder, exist_ok=True)
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
for file in excel_files:
file_path = os.path.join(folder_path, file)
workbook = openpyxl.load_workbook(file_path)
sheet = workbook.active
sheet['A1'] = 'New Value'
output_file_path = os.path.join(output_folder, file)
workbook.save(output_file_path)
print("Processing complete.")5. Merge files into separate worksheets of a single workbook :
import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
output_file = 'merged_workbook.xlsx'
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
for file in excel_files:
file_path = os.path.join(folder_path, file)
df = pd.read_excel(file_path)
sheet_name = os.path.splitext(file)[0]
df.to_excel(writer, sheet_name=sheet_name, index=False)
print("Merging complete.")6. Data cleaning (drop empty rows, fill missing values) :
import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
output_folder = 'path/to/output/folder'
os.makedirs(output_folder, exist_ok=True)
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
for file in excel_files:
file_path = os.path.join(folder_path, file)
df = pd.read_excel(file_path)
df.dropna(how='all', inplace=True)
df.fillna(0, inplace=True)
output_file_path = os.path.join(output_folder, file)
df.to_excel(output_file_path, index=False)
print("Data cleaning complete.")7. Extract specific columns and concatenate :
import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
all_data = pd.DataFrame()
for file in excel_files:
file_path = os.path.join(folder_path, file)
df = pd.read_excel(file_path, usecols=['Column1', 'Column2'])
all_data = pd.concat([all_data, df], ignore_index=True)
print(all_data.head())8. Batch rename worksheets :
import openpyxl
import os
folder_path = 'path/to/your/excel/files'
output_folder = 'path/to/output/folder'
os.makedirs(output_folder, exist_ok=True)
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
for file in excel_files:
file_path = os.path.join(folder_path, file)
workbook = openpyxl.load_workbook(file_path)
if 'OldSheetName' in workbook.sheetnames:
sheet = workbook['OldSheetName']
sheet.title = 'NewSheetName'
output_file_path = os.path.join(output_folder, file)
workbook.save(output_file_path)
print("Sheet renaming complete.")9. Export each Excel file to CSV :
import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
output_folder = 'path/to/output/csvs'
os.makedirs(output_folder, exist_ok=True)
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
for file in excel_files:
file_path = os.path.join(folder_path, file)
df = pd.read_excel(file_path)
base_name = os.path.splitext(file)[0]
output_file_path = os.path.join(output_folder, f"{base_name}.csv")
df.to_csv(output_file_path, index=False)
print("Export to CSV complete.")10. Simple data analysis across all files (sum and average of a column):
import pandas as pd
import os
folder_path = 'path/to/your/excel/files'
excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx') or f.endswith('.xls')]
all_data = pd.DataFrame()
for file in excel_files:
file_path = os.path.join(folder_path, file)
df = pd.read_excel(file_path)
all_data = pd.concat([all_data, df], ignore_index=True)
total_sum = all_data['Some_Column'].sum()
average_value = all_data['Some_Column'].mean()
print(f"Total Sum: {total_sum}")
print(f"Average Value: {average_value}")In summary, pandas excels at high‑level data merging, cleaning, and analysis, while openpyxl provides low‑level control for cell‑wise edits and sheet management; choose the approach that best fits your specific Excel‑processing needs.
Test Development Learning Exchange
Test Development Learning Exchange
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.