Create New Sheet With Sums Of Specific Column For Each File In Directory Of Multiple Xlsx Files
I have many Excel files in a directory with the same structure for each file -- for example the data below could be test1.xlsx: Date Type Name Task Subtask
Solution 1:
I would collect all your data frames into one list and then concatenate them in one shot - it should be much faster:
import os
import glob
import pandas as pd
def merge_excel_to_df_add_filename(flist, **kwargs):
    """Read several Excel files into one DataFrame, tagging rows by source.

    Args:
        flist: Iterable of paths to the Excel files to read.
        **kwargs: Passed through unchanged to ``pandas.read_excel``.

    Returns:
        A single DataFrame holding the rows of every file, with a fresh
        integer index and an extra ``'file'`` column containing the path
        each row was read from.

    Raises:
        ValueError: If ``flist`` yields no paths.
    """
    dfs = []
    for f in flist:
        df = pd.read_excel(f, **kwargs)
        # Remember the origin so rows can be grouped per file later.
        df['file'] = f
        dfs.append(df)
    if not dfs:
        # pd.concat([]) would raise a less helpful "No objects to
        # concatenate"; keep the exception type but name the real cause.
        raise ValueError('no Excel files to merge: flist is empty')
    # One concat at the end is much cheaper than growing a frame per file.
    return pd.concat(dfs, ignore_index=True)
# Collect every .xlsx workbook in the directory and merge them.
fmask = os.path.join('/path/to/excel/files', '*.xlsx')
df = merge_excel_to_df_add_filename(
    glob.glob(fmask),
    skiprows=4,
    index_col=None,
    na_values=['NA'],
)
# Per-file total and mean of 'Hours'. A plain list of function names is used
# because the nested-dict renaming form ({'Hours': [...]}) on a grouped
# Series raises SpecificationError on pandas >= 1.0.
g = df.groupby('file')['Hours'].agg(['sum', 'mean']).reset_index()
# rename columns
g.columns = ['File_Name', 'sum of hours', 'average hours']
# write result to Excel file
g.to_excel('result.xlsx', index=False)
Solution 2:
While reading each file into memory, you should remember the filename you are currently processing:
# Read each workbook once, recording which file every row came from.
frames = []
for f in glob.glob("path/*.xlsx"):
    df = pd.read_excel(f, skiprows=4, index_col=None, na_values=['NA'])
    df['filename'] = f
    frames.append(df)
# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
# every iteration); concatenate once instead. With no matching files this
# still yields an empty DataFrame, as the original did.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
# String aggregation names produce the same 'sum'/'mean' columns without
# needing numpy, for which no import is visible in this snippet.
task_output = all_data.groupby(['filename', "Task", "Subtask"])["Hours"].agg(['sum', 'mean'])
Post a Comment for "Create New Sheet With Sums Of Specific Column For Each File In Directory Of Multiple Xlsx Files"