# gateway/tool_stats_durations_from_log.py
# Snapshot: 2025-09-22 23:34:47 +02:00 — 132 lines, 4.3 KiB, Python
import argparse
import csv
import re
from datetime import datetime
from typing import Iterable, List, Optional, Tuple
TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"

# Pre-compiled patterns, hoisted to module level so they are built once
# rather than looked up/compiled on every call of the per-line parser.
# Layout: ^TIMESTAMP - LEVEL - LOGGER - ...
_LOGGER_RE = re.compile(
    r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - \w+ - ([\w\.]+) - "
)
# The function name is the final " - "-separated token on the line.
_FUNC_RE = re.compile(r" - (\w+)$")


def parse_line(line: str) -> Tuple[Optional[str], Optional[str], Optional[datetime]]:
    """
    Extract (logger, function, timestamp) from a single log line.

    Expected format example (one line):
        2025-09-18 16:35:04 - INFO - modules.workflows._transfer.handlingTasks - Task 1 - Starting action 3/4 - D:\\Athi\\...\\handlingTasks.py:572 - executeTask

    Returns:
        (logger, function, timestamp_dt) on success, or (None, None, None)
        if any of the three parts cannot be parsed.
    """
    # The timestamp occupies the first 19 characters of a well-formed line.
    if len(line) < 19:
        return None, None, None
    try:
        ts = datetime.strptime(line[:19], TIMESTAMP_FORMAT)
    except ValueError:
        return None, None, None
    # Logger name sits between the level and the free-text message.
    m_logger = _LOGGER_RE.search(line)
    if not m_logger:
        return None, None, None
    m_func = _FUNC_RE.search(line.rstrip())
    if not m_func:
        return None, None, None
    return m_logger.group(1), m_func.group(1), ts
def iter_jobs(
    lines: Iterable[str], ignore_substrings: Optional[List[str]] = None
) -> Iterable[Tuple[datetime, datetime, str]]:
    """
    Yield (start_ts, end_ts, job_label) for consecutive runs of the same job.

    A job label is "{logger}.{function}" derived from each parsed line.
    Consecutive lines carrying the same label merge into one group; a
    one-line group yields start_ts == end_ts (i.e. a 0-second duration).
    Lines containing any of *ignore_substrings*, or lines that fail to
    parse, are skipped entirely.
    """
    skip_tokens = ignore_substrings if ignore_substrings else []
    # The currently open run, or None: (label, start_ts, end_ts).
    group: Optional[Tuple[str, datetime, datetime]] = None
    for raw in lines:
        # Optional ignores by substring anywhere in the line.
        if any(token in raw for token in skip_tokens):
            continue
        logger, func, ts = parse_line(raw)
        if not (logger and func and ts):
            continue
        label = f"{logger}.{func}"
        if group is None:
            # First parseable line: open the initial run.
            group = (label, ts, ts)
        elif group[0] == label:
            # Same job as the open run: extend its end timestamp.
            group = (label, group[1], ts)
        else:
            # Different job: emit the finished run, open a new one.
            yield group[1], group[2], group[0]
            group = (label, ts, ts)
    # Flush whatever run is still open at end of input.
    if group is not None:
        yield group[1], group[2], group[0]
def main() -> None:
    """CLI entry point: parse a poweron.log and write job durations as CSV."""
    arg_parser = argparse.ArgumentParser(
        description="Extract job durations from poweron.log into CSV"
    )
    arg_parser.add_argument("log_path", help="Path to poweron.log")
    arg_parser.add_argument(
        "--output",
        "-o",
        default="job_durations.csv",
        help="Output CSV path (default: job_durations.csv)",
    )
    arg_parser.add_argument(
        "--ignore",
        nargs="*",
        default=[
            # Default ignore examples; add/remove as needed
            "Starting action",  # e.g., "Task 1 - Starting action 3/4"
        ],
        help="List of substrings; lines containing any will be ignored",
    )
    opts = arg_parser.parse_args()

    # errors="ignore" keeps the scan going past any mis-encoded log bytes.
    with open(opts.log_path, "r", encoding="utf-8", errors="ignore") as log_file:
        with open(opts.output, "w", newline="", encoding="utf-8") as csv_file:
            out = csv.writer(csv_file)
            out.writerow(["start_timestamp", "duration_seconds", "job_label"])
            for begin, finish, label in iter_jobs(
                log_file, ignore_substrings=opts.ignore
            ):
                seconds = int((finish - begin).total_seconds())
                out.writerow([begin.strftime(TIMESTAMP_FORMAT), seconds, label])


if __name__ == "__main__":
    main()