ValueError: left keys must be sorted

Hi, I’ve created a script to parse some logs and ultimately output two or more Excel sheets in a workbook. I’m running into an issue in compute_durations(df): the merge_asof call throws ValueError: left keys must be sorted. However, right above it I’m sorting both dataframes, so can someone help me figure out why it’s still giving me this error? Below is my code:

import json
import pandas as pd
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from tqdm import tqdm

# Constants
EXCEL_ROW_LIMIT = 1_048_576
HEADER_ROW_COUNT = 1
MAX_ROWS_PER_SHEET = EXCEL_ROW_LIMIT - HEADER_ROW_COUNT

def parse_tableau_prep_log(file_path):
    """
    Parses a Tableau Prep log file with line-delimited JSON entries.
    Preserves structured JSON in 'value' if applicable.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            try:
                log_entry = json.loads(line)
                value = log_entry.get('v')

                # Try to parse JSON if value is a string that looks like JSON
                if isinstance(value, str):
                    try:
                        parsed_value = json.loads(value)
                        if isinstance(parsed_value, (dict, list)):
                            value = parsed_value
                    except (json.JSONDecodeError, TypeError):
                        pass

                flat_record = {
                    'timestamp': log_entry.get('ts'),
                    'pid': log_entry.get('pid'),
                    'tid': log_entry.get('tid'),
                    'severity': log_entry.get('sev'),
                    'key': log_entry.get('k'),
                    'value': value,
                }
                records.append(flat_record)
            except json.JSONDecodeError as e:
                print(f"Skipping line due to JSON decode error: {e}")
    return pd.DataFrame(records)

def compute_durations(df):
    """
    Computes durations between matching begin-/end- operation pairs.
    Matches operations in timestamp order within the same thread.
    """
    # na=False guards against rows where 'key' was missing in the log entry
    df_begin = df[df['key'].str.startswith('begin-', na=False)].copy()
    df_end = df[df['key'].str.startswith('end-', na=False)].copy()

    df_begin['operation'] = df_begin['key'].str.replace('begin-', '', regex=False)
    df_end['operation'] = df_end['key'].str.replace('end-', '', regex=False)

    df_begin['timestamp'] = pd.to_datetime(df_begin['timestamp'])
    df_end['timestamp'] = pd.to_datetime(df_end['timestamp'])

    # Sort before merge_asof
    df_begin.sort_values(by=['operation', 'tid', 'timestamp'], inplace=True)
    df_end.sort_values(by=['operation', 'tid', 'timestamp'], inplace=True)

    # Merge each begin with next matching end on operation and tid
    df_duration = pd.merge_asof(
        df_begin,
        df_end,
        by=['operation', 'tid'],
        left_on='timestamp',
        right_on='timestamp',
        direction='forward',
        suffixes=('_begin', '_end')
    )

    df_duration = df_duration.dropna(subset=['timestamp_end'])

    df_duration['duration_seconds'] = (
        df_duration['timestamp_end'] - df_duration['timestamp_begin']
    ).dt.total_seconds()

    return (
        df_duration[['operation', 'timestamp_begin', 'timestamp_end', 'duration_seconds']]
        .sort_values(by='duration_seconds', ascending=False)
        .reset_index(drop=True)
    )

def write_logs_to_excel(df_logs, df_durations, output_path):
    """
    Writes logs and durations to Excel.
    Splits logs into multiple sheets if row count exceeds Excel's limit.
    Uses tqdm for progress feedback.
    """
    from math import ceil

    wb = Workbook()
    wb.remove(wb.active)

    total_rows = len(df_logs)
    total_parts = ceil(total_rows / MAX_ROWS_PER_SHEET)

    for i in range(0, total_rows, MAX_ROWS_PER_SHEET):
        chunk = df_logs.iloc[i:i + MAX_ROWS_PER_SHEET]
        sheet_name = f"Logs_Part_{(i // MAX_ROWS_PER_SHEET) + 1}"
        print(f"📄 Writing {sheet_name} with {len(chunk)} rows...")
        ws = wb.create_sheet(title=sheet_name)

        for row in tqdm(dataframe_to_rows(chunk, index=False, header=True),
                        total=len(chunk)+1,
                        desc=f"Writing {sheet_name}",
                        unit="row"):
            row = [json.dumps(cell) if isinstance(cell, (dict, list)) else cell for cell in row]
            ws.append(row)

    print("⏱ Writing Durations sheet...")
    ws = wb.create_sheet(title="Durations")
    for row in tqdm(dataframe_to_rows(df_durations, index=False, header=True),
                    total=len(df_durations)+1,
                    desc="Writing Durations",
                    unit="row"):
        ws.append(row)

    wb.save(output_path)
    print(f"✅ Excel file saved to {output_path}")

if __name__ == "__main__":
    log_file_path = "log_2023_08_21_18_18_52.txt"  # Replace with actual path
    output_excel_path = "parsed_tableau_prep_log_with_durations.xlsx"

    print("📥 Parsing log file...")
    df_logs = parse_tableau_prep_log(log_file_path)

    print("⏳ Computing durations...")
    df_durations = compute_durations(df_logs)

    print("📤 Writing to Excel...")
    write_logs_to_excel(df_logs, df_durations, output_excel_path)


I haven’t used merge_asof a great deal, only a few times, but one point that occurs to me in your code (compute_durations) is that the pre-merge sort columns (operation, tid, timestamp) aren’t the same as the merge join key, which is just timestamp (specified in left_on and right_on). merge_asof requires both dataframes to be sorted on the join key, and sorting on operation and tid first can leave timestamp non-monotonic across the frame as a whole: the rows get grouped by operation, so the timestamps restart within each group. That would explain the error.

Have you tried dropping operation and tid from the sort and keeping just timestamp? They’re probably redundant in the sort anyway, given you’re already passing by=['operation', 'tid'], which makes merge_asof match on those two columns before doing the asof merge on timestamp.
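Concretely, something like this for the sort, as a minimal sketch of just that part of compute_durations (the merge_asof call itself stays exactly as you have it):

# Sort on the join key alone: merge_asof needs 'timestamp' to be
# monotonically increasing across the whole frame, even when by= is used.
df_begin.sort_values(by='timestamp', inplace=True)
df_end.sort_values(by='timestamp', inplace=True)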

I don’t see the full traceback in your post, but from what you’ve described it seems to be purely a mismatch between the ordering merge_asof expects on the join key and the ordering your sort actually produces.

The forward matching (direction='forward') makes sense: for each begin timestamp in the left dataframe, you’re looking for the nearest subsequent end timestamp in the right dataframe with the same operation and tid values.
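If it helps, here’s a toy illustration of that, with made-up timestamps and operation names but mirroring your merge call; each begin row pairs with the nearest end row at or after it in the same operation/tid group:

import pandas as pd

begins = pd.DataFrame({
    'timestamp': pd.to_datetime(['2023-08-21 18:00:00', '2023-08-21 18:00:05']),
    'operation': ['load', 'load'],
    'tid': [1, 1],
})
ends = pd.DataFrame({
    'timestamp': pd.to_datetime(['2023-08-21 18:00:03', '2023-08-21 18:00:09']),
    'operation': ['load', 'load'],
    'tid': [1, 1],
})

# Both frames are already sorted on 'timestamp', so merge_asof is happy.
paired = pd.merge_asof(
    begins, ends,
    left_on='timestamp', right_on='timestamp',
    by=['operation', 'tid'],
    direction='forward',
    suffixes=('_begin', '_end'),
)
# 18:00:00 pairs with 18:00:03; 18:00:05 pairs with 18:00:09.
print(paired[['timestamp_begin', 'timestamp_end']])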

That worked, thank you
