Hi, I’ve written a script that parses some logs and ultimately outputs two or more Excel sheets in a single workbook. I’m running into an issue in compute_durations(df): the pd.merge_asof call throws ValueError: left keys must be sorted, even though I sort both dataframes right above it. Can someone help me figure out why it still raises this error? I can reproduce it with a small toy example (the data below is made up, but the sort and the merge mirror my script):
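import pandas as pd

# Toy data: a 'load' event on one thread that happens after the 'query'
# events on another thread (operation names and times are invented).
left = pd.DataFrame({
    'operation': ['query', 'query', 'load'],
    'tid': ['1', '1', '2'],
    'timestamp': pd.to_datetime([
        '2023-08-21 18:18:53',
        '2023-08-21 18:18:54',
        '2023-08-21 18:18:55',
    ]),
}).sort_values(by=['operation', 'tid', 'timestamp'])  # same sort as my script

right = left.copy()

pd.merge_asof(
    left,
    right,
    by=['operation', 'tid'],
    on='timestamp',
    direction='forward',
)  # raises ValueError: left keys must be sorted

And here is my full script: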
import json
from math import ceil

import pandas as pd
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from tqdm import tqdm

# Constants
EXCEL_ROW_LIMIT = 1_048_576
HEADER_ROW_COUNT = 1
MAX_ROWS_PER_SHEET = EXCEL_ROW_LIMIT - HEADER_ROW_COUNT
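# (So each Logs sheet holds at most 1,048,575 data rows after its header.)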

def parse_tableau_prep_log(file_path):
    """
    Parses a Tableau Prep log file with line-delimited JSON entries.
    Preserves structured JSON in 'value' if applicable.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            try:
                log_entry = json.loads(line)
                value = log_entry.get('v')
                # Try to parse JSON if value is a string that looks like JSON
                if isinstance(value, str):
                    try:
                        parsed_value = json.loads(value)
                        if isinstance(parsed_value, (dict, list)):
                            value = parsed_value
                    except (json.JSONDecodeError, TypeError):
                        pass
                flat_record = {
                    'timestamp': log_entry.get('ts'),
                    'pid': log_entry.get('pid'),
                    'tid': log_entry.get('tid'),
                    'severity': log_entry.get('sev'),
                    'key': log_entry.get('k'),
                    'value': value,
                }
                records.append(flat_record)
            except json.JSONDecodeError as e:
                print(f"Skipping line due to JSON decode error: {e}")
    return pd.DataFrame(records)
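
# For context, each line of the log is a single JSON object. A simplified,
# hypothetical example (real lines carry more fields; these are the ones
# the parser reads):
# {"ts": "2023-08-21T18:18:52.123", "pid": 1234, "tid": "5678",
#  "sev": "info", "k": "begin-query", "v": "{\"rows\": 100}"}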

def compute_durations(df):
    """
    Computes durations between matching begin-/end- operation pairs.
    Matches operations in timestamp order within the same thread.
    """
    df_begin = df[df['key'].str.startswith('begin-')].copy()
    df_end = df[df['key'].str.startswith('end-')].copy()

    df_begin['operation'] = df_begin['key'].str.replace('begin-', '', regex=False)
    df_end['operation'] = df_end['key'].str.replace('end-', '', regex=False)

    df_begin['timestamp'] = pd.to_datetime(df_begin['timestamp'])
    df_end['timestamp'] = pd.to_datetime(df_end['timestamp'])

    # Sort before merge_asof
    df_begin.sort_values(by=['operation', 'tid', 'timestamp'], inplace=True)
    df_end.sort_values(by=['operation', 'tid', 'timestamp'], inplace=True)
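    # Note: after this sort, timestamps are monotonic within every
    # (operation, tid) group -- a check like the one below prints True --
    # which is why the error on the next call confuses me:
    # print(df_begin.groupby(['operation', 'tid'])['timestamp']
    #       .apply(lambda s: s.is_monotonic_increasing).all())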
    # Merge each begin with the next matching end on operation and tid
    # (this is the call that raises ValueError: left keys must be sorted)
    df_duration = pd.merge_asof(
        df_begin,
        df_end,
        by=['operation', 'tid'],
        left_on='timestamp',
        right_on='timestamp',
        direction='forward',
        suffixes=('_begin', '_end'),
    )

    df_duration = df_duration.dropna(subset=['timestamp_end'])
    df_duration['duration_seconds'] = (
        df_duration['timestamp_end'] - df_duration['timestamp_begin']
    ).dt.total_seconds()

    return (
        df_duration[['operation', 'timestamp_begin', 'timestamp_end', 'duration_seconds']]
        .sort_values(by='duration_seconds', ascending=False)
        .reset_index(drop=True)
    )

def write_logs_to_excel(df_logs, df_durations, output_path):
    """
    Writes logs and durations to Excel.
    Splits logs into multiple sheets if the row count exceeds Excel's limit.
    Uses tqdm for progress feedback.
    """
    wb = Workbook()
    wb.remove(wb.active)  # drop the default empty sheet

    total_rows = len(df_logs)
    total_parts = ceil(total_rows / MAX_ROWS_PER_SHEET)

    for i in range(0, total_rows, MAX_ROWS_PER_SHEET):
        chunk = df_logs.iloc[i:i + MAX_ROWS_PER_SHEET]
        part = (i // MAX_ROWS_PER_SHEET) + 1
        sheet_name = f"Logs_Part_{part}"
        print(f"📄 Writing {sheet_name} ({part}/{total_parts}) with {len(chunk)} rows...")
        ws = wb.create_sheet(title=sheet_name)
        for row in tqdm(dataframe_to_rows(chunk, index=False, header=True),
                        total=len(chunk) + 1,  # +1 for the header row
                        desc=f"Writing {sheet_name}",
                        unit="row"):
            # openpyxl can't store dicts/lists, so serialize them to JSON strings
            row = [json.dumps(cell) if isinstance(cell, (dict, list)) else cell for cell in row]
            ws.append(row)

    print("⏱ Writing Durations sheet...")
    ws = wb.create_sheet(title="Durations")
    for row in tqdm(dataframe_to_rows(df_durations, index=False, header=True),
                    total=len(df_durations) + 1,
                    desc="Writing Durations",
                    unit="row"):
        ws.append(row)

    wb.save(output_path)
    print(f"✅ Excel file saved to {output_path}")

if __name__ == "__main__":
    log_file_path = "log_2023_08_21_18_18_52.txt"  # Replace with actual path
    output_excel_path = "parsed_tableau_prep_log_with_durations.xlsx"

    print("📥 Parsing log file...")
    df_logs = parse_tableau_prep_log(log_file_path)

    print("⏳ Computing durations...")
    df_durations = compute_durations(df_logs)

    print("📤 Writing to Excel...")
    write_logs_to_excel(df_logs, df_durations, output_excel_path)