feat: extract content node angepasst für mehr optionen
This commit is contained in:
parent
9a498bf833
commit
41b2113bd5
9 changed files with 1564 additions and 33 deletions
|
|
@ -59,6 +59,207 @@ CONTEXT_NODES = [
|
|||
"auch kuenftige Nicht-Bild-Typen bleiben erhalten)."
|
||||
),
|
||||
},
|
||||
{
|
||||
"name": "outputMode",
|
||||
"type": "str",
|
||||
"required": False,
|
||||
"frontendType": "select",
|
||||
"frontendOptions": {
|
||||
"options": [
|
||||
{"value": "blob", "label": t("Ausgabe: ein Textblock (blob)")},
|
||||
{"value": "lines", "label": t("Ausgabe: Zeilen / Segmente")},
|
||||
{"value": "pages", "label": t("Ausgabe: nach Seite (z. B. PDF)")},
|
||||
{"value": "chunks", "label": t("Ausgabe: Chunks (fixe Groesse)")},
|
||||
{"value": "structured", "label": t("Ausgabe: Parts als Liste")},
|
||||
]
|
||||
},
|
||||
"default": "lines",
|
||||
"description": t(
|
||||
"Wie die extrahierten Inhalte unter ``presentation`` strukturiert werden "
|
||||
"(zusaetzlich zu den unveraenderten ``parts`` im Handover)."
|
||||
),
|
||||
},
|
||||
{
|
||||
"name": "splitBy",
|
||||
"type": "str",
|
||||
"required": False,
|
||||
"frontendType": "select",
|
||||
"frontendOptions": {
|
||||
"options": [
|
||||
{"value": "newline", "label": t("Trennen: Zeilenumbruch")},
|
||||
{"value": "paragraph", "label": t("Trennen: Absatz (Leerzeilen)")},
|
||||
{"value": "sentence", "label": t("Trennen: Saetze (heuristisch)")},
|
||||
]
|
||||
},
|
||||
"default": "newline",
|
||||
"description": t(
|
||||
"Gueltig fuer ``outputMode`` lines und chunks: welches Trennzeichen der "
|
||||
"zusammenhaengende Klartext zuerst erhaelt."
|
||||
),
|
||||
},
|
||||
{
|
||||
"name": "chunkSizeUnit",
|
||||
"type": "str",
|
||||
"required": False,
|
||||
"frontendType": "select",
|
||||
"frontendOptions": {
|
||||
"dependsOn": "outputMode",
|
||||
"showWhen": ["chunks"],
|
||||
"options": [
|
||||
{"value": "tokens", "label": t("Chunk-Groesse: Tokens (approx. ~4 Zeichen)")},
|
||||
{"value": "characters", "label": t("Chunk-Groesse: Zeichen")},
|
||||
{"value": "words", "label": t("Chunk-Groesse: Woerter")},
|
||||
]
|
||||
},
|
||||
"default": "tokens",
|
||||
"description": t("Einheit fuer ``chunkSize`` / ``chunkOverlap`` wenn outputMode chunks."),
|
||||
},
|
||||
{
|
||||
"name": "chunkSize",
|
||||
"type": "str",
|
||||
"required": False,
|
||||
"frontendType": "select",
|
||||
"frontendOptions": {
|
||||
"dependsOn": "outputMode",
|
||||
"showWhen": ["chunks"],
|
||||
"options": [
|
||||
{"value": "256", "label": "256"},
|
||||
{"value": "500", "label": "500"},
|
||||
{"value": "1000", "label": "1000"},
|
||||
{"value": "2000", "label": "2000"},
|
||||
{"value": "4000", "label": "4000"},
|
||||
]
|
||||
},
|
||||
"default": "500",
|
||||
"description": t("Zielgroesse pro Chunk (siehe chunkSizeUnit). Nur bei outputMode chunks."),
|
||||
},
|
||||
{
|
||||
"name": "chunkOverlap",
|
||||
"type": "str",
|
||||
"required": False,
|
||||
"frontendType": "select",
|
||||
"frontendOptions": {
|
||||
"dependsOn": "outputMode",
|
||||
"showWhen": ["chunks"],
|
||||
"options": [
|
||||
{"value": "0", "label": "0"},
|
||||
{"value": "25", "label": "25"},
|
||||
{"value": "50", "label": "50"},
|
||||
{"value": "100", "label": "100"},
|
||||
{"value": "200", "label": "200"},
|
||||
]
|
||||
},
|
||||
"default": "0",
|
||||
"description": t("Ueberlappung zwischen aufeinanderfolgenden Chunks (gleiche Einheit wie chunkSize)."),
|
||||
},
|
||||
{
|
||||
"name": "filterEmptyLines",
|
||||
"type": "str",
|
||||
"required": False,
|
||||
"frontendType": "select",
|
||||
"frontendOptions": {
|
||||
"options": [
|
||||
{"value": "true", "label": t("Ja")},
|
||||
{"value": "false", "label": t("Nein")},
|
||||
]
|
||||
},
|
||||
"default": "true",
|
||||
"description": t("Leere bzw. nur-Whitespace-Segmente bei lines/chunks entfernen."),
|
||||
},
|
||||
{
|
||||
"name": "trimWhitespace",
|
||||
"type": "str",
|
||||
"required": False,
|
||||
"frontendType": "select",
|
||||
"frontendOptions": {
|
||||
"options": [
|
||||
{"value": "true", "label": t("Ja")},
|
||||
{"value": "false", "label": t("Nein")},
|
||||
]
|
||||
},
|
||||
"default": "true",
|
||||
"description": t("Fuehrende und nachfolgende Leerzeichen pro Segment trimmen."),
|
||||
},
|
||||
{
|
||||
"name": "includeLineNumbers",
|
||||
"type": "str",
|
||||
"required": False,
|
||||
"frontendType": "select",
|
||||
"frontendOptions": {
|
||||
"options": [
|
||||
{"value": "true", "label": t("Ja")},
|
||||
{"value": "false", "label": t("Nein")},
|
||||
]
|
||||
},
|
||||
"default": "false",
|
||||
"description": t("Bei lines: jedem Eintrag eine Zeilennummer (1-based) zuweisen."),
|
||||
},
|
||||
{
|
||||
"name": "includeMetadata",
|
||||
"type": "str",
|
||||
"required": False,
|
||||
"frontendType": "select",
|
||||
"frontendOptions": {
|
||||
"options": [
|
||||
{"value": "true", "label": t("Ja")},
|
||||
{"value": "false", "label": t("Nein")},
|
||||
]
|
||||
},
|
||||
"default": "false",
|
||||
"description": t("Dateiname und einfache Offsets bei lines/chunks/pages an Eintraege haengen."),
|
||||
},
|
||||
{
|
||||
"name": "csvHeaderRow",
|
||||
"type": "str",
|
||||
"required": False,
|
||||
"frontendType": "select",
|
||||
"frontendOptions": {
|
||||
"options": [
|
||||
{"value": "true", "label": t("Ja")},
|
||||
{"value": "false", "label": t("Nein")},
|
||||
]
|
||||
},
|
||||
"default": "true",
|
||||
"description": t(
|
||||
"Bei CSV-Dateien: erste Zeile als Spaltenkoepfe interpretieren "
|
||||
"und ``csvRows`` als Liste von Objekten in ``presentation`` schreiben."
|
||||
),
|
||||
},
|
||||
{
|
||||
"name": "pdfExtractMode",
|
||||
"type": "str",
|
||||
"required": False,
|
||||
"frontendType": "select",
|
||||
"frontendOptions": {
|
||||
"options": [
|
||||
{"value": "text", "label": t("PDF/Parts: Text & Tabellen (keine Bild-Parts)")},
|
||||
{"value": "tables", "label": t("PDF/Parts: nur Tabellen-Parts")},
|
||||
{"value": "images", "label": t("PDF/Parts: nur Bild-Parts")},
|
||||
{"value": "all", "label": t("PDF/Parts: alle Typgruppen")},
|
||||
]
|
||||
},
|
||||
"default": "text",
|
||||
"description": t(
|
||||
"Filtert fuer die Presentation-Schicht nach typeGroup/MIME "
|
||||
"(gilt fuer alle Dokumenttypen analog, nicht nur PDF)."
|
||||
),
|
||||
},
|
||||
{
|
||||
"name": "markdownPreserveFormatting",
|
||||
"type": "str",
|
||||
"required": False,
|
||||
"frontendType": "select",
|
||||
"frontendOptions": {
|
||||
"options": [
|
||||
{"value": "true", "label": t("Markdown beibehalten")},
|
||||
{"value": "false", "label": t("zu vereinfachtem Klartext reduzieren")},
|
||||
]
|
||||
},
|
||||
"default": "false",
|
||||
"description": t(
|
||||
"Bei text/markdown-Parts: leichte Entfernung von Markup-Zeichen wenn false."
|
||||
),
|
||||
},
|
||||
],
|
||||
"inputs": 1,
|
||||
"outputs": 1,
|
||||
|
|
@ -79,6 +280,16 @@ CONTEXT_NODES = [
|
|||
"recommended": True,
|
||||
"type": "Any",
|
||||
},
|
||||
{
|
||||
"path": ["documents", 0, "documentData", "presentation"],
|
||||
"pickerLabel": t("Presentation (strukturierte Sicht)"),
|
||||
"detail": t(
|
||||
"Nur die konfigurierte Ausgabe-Struktur (blob/lines/pages/chunks/structured); "
|
||||
"unveraenderte Roh-Parts bleiben im umschliessenden Handover."
|
||||
),
|
||||
"recommended": False,
|
||||
"type": "Any",
|
||||
},
|
||||
{
|
||||
"path": ["response"],
|
||||
"pickerLabel": t("Nur Text"),
|
||||
|
|
|
|||
|
|
@ -19,6 +19,12 @@ def _resolveLogDir() -> str:
|
|||
logDir = os.path.join(gatewayDir, logDir)
|
||||
return logDir
|
||||
|
||||
|
||||
def resolve_app_log_dir() -> str:
|
||||
"""Absolute filesystem path for ``APP_LOGGING_LOG_DIR``."""
|
||||
return _resolveLogDir()
|
||||
|
||||
|
||||
def ensureDir(path: str) -> None:
|
||||
"""Create directory if it does not exist."""
|
||||
os.makedirs(path, exist_ok=True)
|
||||
|
|
|
|||
|
|
@ -32,6 +32,11 @@ from modules.features.graphicalEditor.portTypes import normalizeToSchema, wrapTr
|
|||
from modules.features.graphicalEditor.nodeDefinitions import STATIC_NODE_TYPES
|
||||
from modules.serviceCenter.services.serviceSubscription.mainServiceSubscription import SubscriptionInactiveException as _SubscriptionInactiveException
|
||||
from modules.serviceCenter.services.serviceBilling.mainServiceBilling import BillingContextError as _BillingContextError
|
||||
from modules.workflows.automation2.graphicalEditorRunFileLogger import (
|
||||
GraphicalEditorRunFileLogger,
|
||||
graphical_editor_run_file_logging_enabled,
|
||||
merge_run_context_with_ge_log_prefix,
|
||||
)
|
||||
from modules.workflows.automation2.runEnvelope import normalize_run_envelope
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -291,6 +296,78 @@ def _updateStepLog(iface, stepId: str, status: str, output: Dict = None, error:
|
|||
logger.debug("Could not update AutoStepLog %s: %s", stepId, e)
|
||||
|
||||
|
||||
def _ge_iso_timestamp() -> str:
|
||||
"""UTC timestamp for NDJSON logs (readable, milliseconds)."""
|
||||
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"
|
||||
|
||||
|
||||
async def _ge_log_node_finished(
|
||||
file_logger: Optional[GraphicalEditorRunFileLogger],
|
||||
*,
|
||||
run_id: Optional[str],
|
||||
node_outputs: Dict[str, Any],
|
||||
run_envelope: Optional[Dict[str, Any]],
|
||||
node_id: str,
|
||||
node_type: str,
|
||||
status: str,
|
||||
input_snap: Optional[Dict[str, Any]],
|
||||
output: Any = None,
|
||||
error: Optional[str] = None,
|
||||
duration_ms: Optional[int] = None,
|
||||
retry_count: Optional[int] = None,
|
||||
skip_reason: Optional[str] = None,
|
||||
loop_index: Optional[int] = None,
|
||||
loop_node_id: Optional[str] = None,
|
||||
loop_item: Optional[Any] = None,
|
||||
) -> None:
|
||||
"""Append one execution line + one workflow-context snapshot (NDJSON)."""
|
||||
if file_logger is None or not run_id:
|
||||
return
|
||||
ts = _ge_iso_timestamp()
|
||||
exec_rec: Dict[str, Any] = {
|
||||
"timestamp": ts,
|
||||
"runId": run_id,
|
||||
"nodeId": node_id,
|
||||
"nodeType": node_type,
|
||||
"status": status,
|
||||
"input": _stripBinaryValues(dict(input_snap or {})),
|
||||
}
|
||||
if skip_reason:
|
||||
exec_rec["skipReason"] = skip_reason
|
||||
if duration_ms is not None:
|
||||
exec_rec["durationMs"] = duration_ms
|
||||
if retry_count is not None:
|
||||
exec_rec["retryCount"] = retry_count
|
||||
if loop_index is not None:
|
||||
exec_rec["loopIndex"] = loop_index
|
||||
if loop_node_id is not None:
|
||||
exec_rec["loopNodeId"] = loop_node_id
|
||||
if loop_item is not None:
|
||||
exec_rec["loopItem"] = _stripBinaryValues(loop_item)
|
||||
if error is not None:
|
||||
exec_rec["error"] = error
|
||||
if output is not None:
|
||||
exec_rec["output"] = (
|
||||
_stripBinaryValues(output) if isinstance(output, dict) else {"value": _stripBinaryValues(output)}
|
||||
)
|
||||
await file_logger.append_node_execution_line(exec_rec)
|
||||
|
||||
ctx_rec: Dict[str, Any] = {
|
||||
"timestamp": ts,
|
||||
"runId": run_id,
|
||||
"afterNodeId": node_id,
|
||||
"afterNodeType": node_type,
|
||||
"afterStatus": status,
|
||||
"nodeOutputsSnapshot": _serializableOutputs(node_outputs),
|
||||
"runEnvelope": _stripBinaryValues(dict(run_envelope or {})),
|
||||
}
|
||||
if loop_index is not None:
|
||||
ctx_rec["loopIndex"] = loop_index
|
||||
if loop_node_id is not None:
|
||||
ctx_rec["loopNodeId"] = loop_node_id
|
||||
await file_logger.append_context_snapshot_line(ctx_rec)
|
||||
|
||||
|
||||
async def _executeWithRetry(executor, node, context, maxRetries: int = 0, retryDelaySeconds: float = 1.0):
|
||||
"""Execute a node with optional retry policy from node parameters."""
|
||||
params = node.get("parameters") or {}
|
||||
|
|
@ -356,6 +433,7 @@ async def _run_post_loop_done_nodes(
|
|||
automation2_interface: Optional[Any],
|
||||
runId: Optional[str],
|
||||
processed_in_loop: Set[str],
|
||||
ge_file_logger: Optional[GraphicalEditorRunFileLogger] = None,
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""After all loop iterations: merge upstream into loop output and run the Done (output 1) branch once."""
|
||||
_prim_in = getLoopPrimaryInputSource(loop_node_id, connectionMap, body_ids)
|
||||
|
|
@ -395,6 +473,17 @@ async def _run_post_loop_done_nodes(
|
|||
_skId = _createStepLog(automation2_interface, runId, _dnid, _dn.get("type", ""), status="skipped", inputSnapshot=_skipSnap)
|
||||
if _skId:
|
||||
_updateStepLog(automation2_interface, _skId, "skipped")
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=nodeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=_dnid,
|
||||
node_type=_dn.get("type", ""),
|
||||
status="skipped",
|
||||
input_snap=_skipSnap,
|
||||
skip_reason=str(_skipSnap.get("_skipReason") or "inactive_branch"),
|
||||
)
|
||||
continue
|
||||
_dexec = _getExecutor(_dn.get("type", ""), services, automation2_interface)
|
||||
if not _dexec:
|
||||
|
|
@ -415,21 +504,82 @@ async def _run_post_loop_done_nodes(
|
|||
_updateStepLog(automation2_interface, _dStepId, "completed",
|
||||
output=_dres if isinstance(_dres, dict) else {"value": _dres},
|
||||
durationMs=_dDur, tokensUsed=_dTok, retryCount=_dRetry)
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=nodeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=_dnid,
|
||||
node_type=_dn.get("type", ""),
|
||||
status="completed",
|
||||
input_snap=_dIn,
|
||||
output=_dres,
|
||||
duration_ms=_dDur,
|
||||
retry_count=_dRetry,
|
||||
)
|
||||
except PauseForHumanTaskError:
|
||||
_updateStepLog(automation2_interface, _dStepId, "completed",
|
||||
durationMs=int((time.time() - _dStart) * 1000))
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=nodeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=_dnid,
|
||||
node_type=_dn.get("type", ""),
|
||||
status="completed",
|
||||
input_snap=_dIn,
|
||||
duration_ms=int((time.time() - _dStart) * 1000),
|
||||
)
|
||||
raise
|
||||
except PauseForEmailWaitError:
|
||||
_updateStepLog(automation2_interface, _dStepId, "completed",
|
||||
durationMs=int((time.time() - _dStart) * 1000))
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=nodeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=_dnid,
|
||||
node_type=_dn.get("type", ""),
|
||||
status="completed",
|
||||
input_snap=_dIn,
|
||||
duration_ms=int((time.time() - _dStart) * 1000),
|
||||
)
|
||||
raise
|
||||
except (_SubscriptionInactiveException, _BillingContextError):
|
||||
_dFailDur = int((time.time() - _dStart) * 1000)
|
||||
_updateStepLog(automation2_interface, _dStepId, "failed",
|
||||
error="Subscription/Billing error", durationMs=int((time.time() - _dStart) * 1000))
|
||||
error="Subscription/Billing error", durationMs=_dFailDur)
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=nodeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=_dnid,
|
||||
node_type=_dn.get("type", ""),
|
||||
status="failed",
|
||||
input_snap=_dIn,
|
||||
error="Subscription/Billing error",
|
||||
duration_ms=_dFailDur,
|
||||
)
|
||||
raise
|
||||
except Exception as _dex:
|
||||
_dFailDur2 = int((time.time() - _dStart) * 1000)
|
||||
_updateStepLog(automation2_interface, _dStepId, "failed",
|
||||
error=str(_dex), durationMs=int((time.time() - _dStart) * 1000))
|
||||
error=str(_dex), durationMs=_dFailDur2)
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=nodeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=_dnid,
|
||||
node_type=_dn.get("type", ""),
|
||||
status="failed",
|
||||
input_snap=_dIn,
|
||||
error=str(_dex),
|
||||
duration_ms=_dFailDur2,
|
||||
)
|
||||
raise
|
||||
processed_in_loop.update(_done_only)
|
||||
return None
|
||||
|
|
@ -523,6 +673,7 @@ async def executeGraph(
|
|||
except Exception as valErr:
|
||||
logger.warning("executeGraph resume: schema validation failed for %s: %s", startAfterNodeId, valErr)
|
||||
|
||||
ge_file_logger: Optional[GraphicalEditorRunFileLogger] = None
|
||||
nodeOutputs: Dict[str, Any] = dict(initialNodeOutputs or {})
|
||||
if not runId and automation2_interface and workflowId and not is_resume:
|
||||
run_context = {
|
||||
|
|
@ -560,6 +711,12 @@ async def executeGraph(
|
|||
)
|
||||
runId = run.get("id") if run else None
|
||||
logger.info("executeGraph created run %s label=%s", runId, run_label)
|
||||
if runId and graphical_editor_run_file_logging_enabled():
|
||||
ge_file_logger = GraphicalEditorRunFileLogger.bootstrap_new_run(
|
||||
automation2_interface,
|
||||
runId,
|
||||
run_context,
|
||||
)
|
||||
|
||||
env_for_run = normalize_run_envelope(run_envelope, user_id=userId)
|
||||
|
||||
|
|
@ -586,6 +743,17 @@ async def executeGraph(
|
|||
if runId:
|
||||
_activeRunContexts[runId] = context
|
||||
|
||||
if (
|
||||
graphical_editor_run_file_logging_enabled()
|
||||
and automation2_interface
|
||||
and runId
|
||||
and ge_file_logger is None
|
||||
):
|
||||
ge_file_logger = GraphicalEditorRunFileLogger.ensure_attached(
|
||||
automation2_interface,
|
||||
runId,
|
||||
)
|
||||
|
||||
skip_until_passed = bool(startAfterNodeId)
|
||||
processed_in_loop: Set[str] = set()
|
||||
_aggregateAccumulators: Dict[str, list] = {}
|
||||
|
|
@ -648,28 +816,106 @@ async def executeGraph(
|
|||
_updateStepLog(automation2_interface, _rStepId, "completed",
|
||||
output=result if isinstance(result, dict) else {"value": result},
|
||||
durationMs=_rDur, retryCount=_rRetry)
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=nodeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=bnid,
|
||||
node_type=body_node.get("type", ""),
|
||||
status="completed",
|
||||
input_snap=_rInputSnap,
|
||||
output=result,
|
||||
duration_ms=_rDur,
|
||||
retry_count=_rRetry,
|
||||
loop_index=next_index,
|
||||
loop_node_id=loop_node_id,
|
||||
loop_item=items[next_index],
|
||||
)
|
||||
logger.info("executeGraph loop resume body node %s done (iter %d, retries=%d)", bnid, next_index, _rRetry)
|
||||
if _resume_feedback_body_node_id and bnid == _resume_feedback_body_node_id:
|
||||
_resume_body_results.append(result)
|
||||
except PauseForHumanTaskError as e:
|
||||
_rPauseDur = int((time.time() - _rStepStart) * 1000)
|
||||
_updateStepLog(automation2_interface, _rStepId, "completed",
|
||||
durationMs=int((time.time() - _rStepStart) * 1000))
|
||||
durationMs=_rPauseDur)
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=nodeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=bnid,
|
||||
node_type=body_node.get("type", ""),
|
||||
status="completed",
|
||||
input_snap=_rInputSnap,
|
||||
duration_ms=_rPauseDur,
|
||||
loop_index=next_index,
|
||||
loop_node_id=loop_node_id,
|
||||
loop_item=items[next_index],
|
||||
)
|
||||
if automation2_interface:
|
||||
run_ctx = dict(run.get("context") or {})
|
||||
run_ctx["_loopState"] = {"loopNodeId": loop_node_id, "currentIndex": next_index, "items": items}
|
||||
automation2_interface.updateRun(e.runId, status="paused", nodeOutputs=_serializableOutputs(nodeOutputs), currentNodeId=e.nodeId, context=run_ctx)
|
||||
automation2_interface.updateRun(runId, status="paused", nodeOutputs=_serializableOutputs(nodeOutputs), currentNodeId=e.nodeId, context=run_ctx)
|
||||
return {"success": False, "paused": True, "taskId": e.taskId, "runId": e.runId, "nodeId": e.nodeId, "nodeOutputs": _serializableOutputs(nodeOutputs)}
|
||||
except PauseForEmailWaitError as e:
|
||||
_rEmailDur = int((time.time() - _rStepStart) * 1000)
|
||||
_updateStepLog(automation2_interface, _rStepId, "completed",
|
||||
durationMs=int((time.time() - _rStepStart) * 1000))
|
||||
durationMs=_rEmailDur)
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=nodeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=bnid,
|
||||
node_type=body_node.get("type", ""),
|
||||
status="completed",
|
||||
input_snap=_rInputSnap,
|
||||
duration_ms=_rEmailDur,
|
||||
loop_index=next_index,
|
||||
loop_node_id=loop_node_id,
|
||||
loop_item=items[next_index],
|
||||
)
|
||||
raise
|
||||
except (_SubscriptionInactiveException, _BillingContextError):
|
||||
_rFailDurSb = int((time.time() - _rStepStart) * 1000)
|
||||
_updateStepLog(automation2_interface, _rStepId, "failed",
|
||||
error="Subscription/Billing error", durationMs=int((time.time() - _rStepStart) * 1000))
|
||||
error="Subscription/Billing error", durationMs=_rFailDurSb)
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=nodeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=bnid,
|
||||
node_type=body_node.get("type", ""),
|
||||
status="failed",
|
||||
input_snap=_rInputSnap,
|
||||
error="Subscription/Billing error",
|
||||
duration_ms=_rFailDurSb,
|
||||
loop_index=next_index,
|
||||
loop_node_id=loop_node_id,
|
||||
loop_item=items[next_index],
|
||||
)
|
||||
raise
|
||||
except Exception as ex:
|
||||
_rFailDurEx = int((time.time() - _rStepStart) * 1000)
|
||||
_updateStepLog(automation2_interface, _rStepId, "failed",
|
||||
error=str(ex), durationMs=int((time.time() - _rStepStart) * 1000))
|
||||
error=str(ex), durationMs=_rFailDurEx)
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=nodeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=bnid,
|
||||
node_type=body_node.get("type", ""),
|
||||
status="failed",
|
||||
input_snap=_rInputSnap,
|
||||
error=str(ex),
|
||||
duration_ms=_rFailDurEx,
|
||||
loop_index=next_index,
|
||||
loop_node_id=loop_node_id,
|
||||
loop_item=items[next_index],
|
||||
)
|
||||
logger.exception("executeGraph loop body node %s FAILED: %s", bnid, ex)
|
||||
nodeOutputs[bnid] = {"error": str(ex), "success": False}
|
||||
if runId and automation2_interface:
|
||||
|
|
@ -699,6 +945,7 @@ async def executeGraph(
|
|||
automation2_interface=automation2_interface,
|
||||
runId=runId,
|
||||
processed_in_loop=processed_in_loop,
|
||||
ge_file_logger=ge_file_logger,
|
||||
)
|
||||
|
||||
for i, node in enumerate(ordered):
|
||||
|
|
@ -735,6 +982,17 @@ async def executeGraph(
|
|||
_skipStepId = _createStepLog(automation2_interface, runId, nodeId, nodeType, status="skipped", inputSnapshot=_skipInputSnap)
|
||||
if _skipStepId:
|
||||
_updateStepLog(automation2_interface, _skipStepId, "skipped")
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=nodeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=nodeId,
|
||||
node_type=nodeType,
|
||||
status="skipped",
|
||||
input_snap=_skipInputSnap,
|
||||
skip_reason=str(_skipInputSnap.get("_skipReason") or "inactive_branch"),
|
||||
)
|
||||
continue
|
||||
executor = _getExecutor(nodeType, services, automation2_interface)
|
||||
logger.info(
|
||||
|
|
@ -806,10 +1064,20 @@ async def executeGraph(
|
|||
_activeOutputs[bnid] = None
|
||||
continue
|
||||
_bStepStart = time.time()
|
||||
_bInputSnapAlways: Dict[str, Any] = {"_loopItem": _item, "_loopIndex": _idx}
|
||||
for _bSnapSrc, _, _ in connectionMap.get(bnid, []):
|
||||
if _bSnapSrc in _activeOutputs:
|
||||
_bInputSnapAlways[_bSnapSrc] = _activeOutputs[_bSnapSrc]
|
||||
_bStepId = None
|
||||
if not _batchMode or _idx == 0 or _idx == len(items) - 1:
|
||||
_bInputSnap = {"_loopItem": _item, "_loopIndex": _idx}
|
||||
_bStepId = _createStepLog(automation2_interface, runId, bnid, body_node.get("type", ""), "running", _bInputSnap)
|
||||
_bStepId = _createStepLog(
|
||||
automation2_interface,
|
||||
runId,
|
||||
bnid,
|
||||
body_node.get("type", ""),
|
||||
"running",
|
||||
_bInputSnapAlways,
|
||||
)
|
||||
try:
|
||||
bres, _bRetry = await _executeWithRetry(bexec, body_node, _activeCtx)
|
||||
if body_node.get("type") == "data.aggregate":
|
||||
|
|
@ -822,17 +1090,48 @@ async def executeGraph(
|
|||
_aggregateTempChunks.setdefault(bnid, []).append(_aggregateAccumulators[bnid])
|
||||
_aggregateAccumulators[bnid] = []
|
||||
_activeOutputs[bnid] = bres
|
||||
_bDur = int((time.time() - _bStepStart) * 1000)
|
||||
if _bStepId:
|
||||
_bDur = int((time.time() - _bStepStart) * 1000)
|
||||
_updateStepLog(automation2_interface, _bStepId, "completed",
|
||||
output=bres if isinstance(bres, dict) else {"value": bres},
|
||||
durationMs=_bDur, retryCount=_bRetry)
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=_activeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=bnid,
|
||||
node_type=body_node.get("type", ""),
|
||||
status="completed",
|
||||
input_snap=_bInputSnapAlways,
|
||||
output=bres,
|
||||
duration_ms=_bDur,
|
||||
retry_count=_bRetry,
|
||||
loop_index=_idx,
|
||||
loop_node_id=nodeId,
|
||||
loop_item=_item,
|
||||
)
|
||||
if _loopConcurrency == 1:
|
||||
nodeOutputs[bnid] = bres
|
||||
except PauseForHumanTaskError as e:
|
||||
_bHd = int((time.time() - _bStepStart) * 1000)
|
||||
if _bStepId:
|
||||
_updateStepLog(automation2_interface, _bStepId, "completed",
|
||||
durationMs=int((time.time() - _bStepStart) * 1000))
|
||||
durationMs=_bHd)
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=_activeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=bnid,
|
||||
node_type=body_node.get("type", ""),
|
||||
status="completed",
|
||||
input_snap=_bInputSnapAlways,
|
||||
duration_ms=_bHd,
|
||||
loop_index=_idx,
|
||||
loop_node_id=nodeId,
|
||||
loop_item=_item,
|
||||
)
|
||||
if runId and automation2_interface:
|
||||
_run = automation2_interface.getRun(runId) or {}
|
||||
_run_ctx = dict(_run.get("context") or {})
|
||||
|
|
@ -840,19 +1139,66 @@ async def executeGraph(
|
|||
automation2_interface.updateRun(e.runId, status="paused", nodeOutputs=_serializableOutputs(nodeOutputs), currentNodeId=e.nodeId, context=_run_ctx)
|
||||
return {"_pause": True, "taskId": e.taskId, "runId": e.runId, "nodeId": e.nodeId}
|
||||
except PauseForEmailWaitError:
|
||||
_bEd = int((time.time() - _bStepStart) * 1000)
|
||||
if _bStepId:
|
||||
_updateStepLog(automation2_interface, _bStepId, "completed",
|
||||
durationMs=int((time.time() - _bStepStart) * 1000))
|
||||
durationMs=_bEd)
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=_activeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=bnid,
|
||||
node_type=body_node.get("type", ""),
|
||||
status="completed",
|
||||
input_snap=_bInputSnapAlways,
|
||||
duration_ms=_bEd,
|
||||
loop_index=_idx,
|
||||
loop_node_id=nodeId,
|
||||
loop_item=_item,
|
||||
)
|
||||
raise
|
||||
except (_SubscriptionInactiveException, _BillingContextError):
|
||||
_bSb = int((time.time() - _bStepStart) * 1000)
|
||||
if _bStepId:
|
||||
_updateStepLog(automation2_interface, _bStepId, "failed",
|
||||
error="Subscription/Billing error", durationMs=int((time.time() - _bStepStart) * 1000))
|
||||
error="Subscription/Billing error", durationMs=_bSb)
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=_activeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=bnid,
|
||||
node_type=body_node.get("type", ""),
|
||||
status="failed",
|
||||
input_snap=_bInputSnapAlways,
|
||||
error="Subscription/Billing error",
|
||||
duration_ms=_bSb,
|
||||
loop_index=_idx,
|
||||
loop_node_id=nodeId,
|
||||
loop_item=_item,
|
||||
)
|
||||
raise
|
||||
except Exception as ex:
|
||||
_bFail = int((time.time() - _bStepStart) * 1000)
|
||||
if _bStepId:
|
||||
_updateStepLog(automation2_interface, _bStepId, "failed",
|
||||
error=str(ex), durationMs=int((time.time() - _bStepStart) * 1000))
|
||||
error=str(ex), durationMs=_bFail)
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=_activeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=bnid,
|
||||
node_type=body_node.get("type", ""),
|
||||
status="failed",
|
||||
input_snap=_bInputSnapAlways,
|
||||
error=str(ex),
|
||||
duration_ms=_bFail,
|
||||
loop_index=_idx,
|
||||
loop_node_id=nodeId,
|
||||
loop_item=_item,
|
||||
)
|
||||
logger.exception("executeGraph loop body node %s FAILED (iter %d): %s", bnid, _idx, ex)
|
||||
return {"_error": str(ex), "failedNode": bnid}
|
||||
|
||||
|
|
@ -932,11 +1278,31 @@ async def executeGraph(
|
|||
automation2_interface=automation2_interface,
|
||||
runId=runId,
|
||||
processed_in_loop=processed_in_loop,
|
||||
ge_file_logger=ge_file_logger,
|
||||
)
|
||||
|
||||
_loopDurMs = int((time.time() - _stepStartMs) * 1000)
|
||||
_loopStepOut = {
|
||||
"iterationCount": len(items),
|
||||
"items": len(items),
|
||||
"concurrency": _loopConcurrency,
|
||||
"batchMode": _batchMode,
|
||||
}
|
||||
_updateStepLog(automation2_interface, _stepId, "completed",
|
||||
output={"iterationCount": len(items), "items": len(items), "concurrency": _loopConcurrency, "batchMode": _batchMode},
|
||||
durationMs=int((time.time() - _stepStartMs) * 1000))
|
||||
output=_loopStepOut,
|
||||
durationMs=_loopDurMs)
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=nodeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=nodeId,
|
||||
node_type=nodeType,
|
||||
status="completed",
|
||||
input_snap=_loopInputSnap,
|
||||
output=_loopStepOut,
|
||||
duration_ms=_loopDurMs,
|
||||
)
|
||||
logger.info("executeGraph flow.loop done: %d iterations (concurrency=%d, batchMode=%s)", len(items), _loopConcurrency, _batchMode)
|
||||
elif _isBarrierNode(nodeType):
|
||||
if not _allMergePredecessorsReady(nodeId, connectionMap, nodeOutputs):
|
||||
|
|
@ -952,6 +1318,24 @@ async def executeGraph(
|
|||
result, retryCount = await _executeWithRetry(executor, node, context)
|
||||
result = _normalizeResult(result, nodeType)
|
||||
nodeOutputs[nodeId] = result
|
||||
_mergeDurMs = int((time.time() - _stepStartMs) * 1000)
|
||||
_mergeTok = result.get("tokensUsed", 0) if isinstance(result, dict) else 0
|
||||
_updateStepLog(automation2_interface, _stepId, "completed",
|
||||
output=result if isinstance(result, dict) else {"value": result},
|
||||
durationMs=_mergeDurMs, tokensUsed=_mergeTok, retryCount=retryCount)
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=nodeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=nodeId,
|
||||
node_type=nodeType,
|
||||
status="completed",
|
||||
input_snap=_inputSnap,
|
||||
output=result,
|
||||
duration_ms=_mergeDurMs,
|
||||
retry_count=retryCount,
|
||||
)
|
||||
else:
|
||||
_stepStartMs = time.time()
|
||||
_inputSnap = {}
|
||||
|
|
@ -967,6 +1351,19 @@ async def executeGraph(
|
|||
_updateStepLog(automation2_interface, _stepId, "completed",
|
||||
output=result if isinstance(result, dict) else {"value": result},
|
||||
durationMs=_durMs, tokensUsed=_tokens, retryCount=retryCount)
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=nodeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=nodeId,
|
||||
node_type=nodeType,
|
||||
status="completed",
|
||||
input_snap=_inputSnap,
|
||||
output=result,
|
||||
duration_ms=_durMs,
|
||||
retry_count=retryCount,
|
||||
)
|
||||
logger.info(
|
||||
"executeGraph node %s done: result_type=%s result_keys=%s retries=%d duration=%dms",
|
||||
nodeId,
|
||||
|
|
@ -976,8 +1373,23 @@ async def executeGraph(
|
|||
_durMs,
|
||||
)
|
||||
except PauseForHumanTaskError as e:
|
||||
_huPauseMs = int((time.time() - _stepStartMs) * 1000)
|
||||
_updateStepLog(automation2_interface, _stepId, "completed",
|
||||
durationMs=int((time.time() - _stepStartMs) * 1000))
|
||||
durationMs=_huPauseMs)
|
||||
_ge_in = locals().get("_inputSnap")
|
||||
if _ge_in is None:
|
||||
_ge_in = locals().get("_loopInputSnap") or {}
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=nodeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=nodeId,
|
||||
node_type=nodeType,
|
||||
status="completed",
|
||||
input_snap=_ge_in,
|
||||
duration_ms=_huPauseMs,
|
||||
)
|
||||
logger.info("executeGraph paused for human task %s", e.taskId)
|
||||
return {
|
||||
"success": False,
|
||||
|
|
@ -988,8 +1400,23 @@ async def executeGraph(
|
|||
"nodeOutputs": _serializableOutputs(nodeOutputs),
|
||||
}
|
||||
except PauseForEmailWaitError as e:
|
||||
_emailPauseMs = int((time.time() - _stepStartMs) * 1000)
|
||||
_updateStepLog(automation2_interface, _stepId, "completed",
|
||||
durationMs=int((time.time() - _stepStartMs) * 1000))
|
||||
durationMs=_emailPauseMs)
|
||||
_ge_email_in = locals().get("_inputSnap")
|
||||
if _ge_email_in is None:
|
||||
_ge_email_in = locals().get("_loopInputSnap") or {}
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=nodeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=nodeId,
|
||||
node_type=nodeType,
|
||||
status="completed",
|
||||
input_snap=_ge_email_in,
|
||||
duration_ms=_emailPauseMs,
|
||||
)
|
||||
logger.info("executeGraph paused for email wait (run %s, node %s)", e.runId, e.nodeId)
|
||||
try:
|
||||
from modules.interfaces.interfaceDbApp import getRootInterface
|
||||
|
|
@ -1013,6 +1440,9 @@ async def executeGraph(
|
|||
"mandateId": context.get("mandateId"),
|
||||
"instanceId": context.get("instanceId"),
|
||||
}
|
||||
if automation2_interface and e.runId:
|
||||
prev_ctx = dict((automation2_interface.getRun(e.runId) or {}).get("context") or {})
|
||||
run_ctx = merge_run_context_with_ge_log_prefix(prev_ctx, run_ctx)
|
||||
automation2_interface.updateRun(
|
||||
e.runId,
|
||||
status="paused",
|
||||
|
|
@ -1033,6 +1463,21 @@ async def executeGraph(
|
|||
nodeOutputs[nodeId] = {"error": str(e), "success": False}
|
||||
_durMs = int((time.time() - _stepStartMs) * 1000)
|
||||
_updateStepLog(automation2_interface, _stepId, "failed", error=str(e), durationMs=_durMs)
|
||||
_ge_fail_in = locals().get("_inputSnap")
|
||||
if _ge_fail_in is None:
|
||||
_ge_fail_in = locals().get("_loopInputSnap") or {}
|
||||
await _ge_log_node_finished(
|
||||
ge_file_logger,
|
||||
run_id=runId,
|
||||
node_outputs=nodeOutputs,
|
||||
run_envelope=context.get("runEnvelope"),
|
||||
node_id=nodeId,
|
||||
node_type=nodeType,
|
||||
status="failed",
|
||||
input_snap=_ge_fail_in,
|
||||
error=str(e),
|
||||
duration_ms=_durMs,
|
||||
)
|
||||
if runId and automation2_interface:
|
||||
automation2_interface.updateRun(runId, status="failed", nodeOutputs=_serializableOutputs(nodeOutputs))
|
||||
if runId:
|
||||
|
|
|
|||
|
|
@ -65,16 +65,23 @@ class InputExecutor:
|
|||
)
|
||||
taskId = task.get("id")
|
||||
|
||||
self.automation2.updateRun(
|
||||
from modules.workflows.automation2.graphicalEditorRunFileLogger import merge_persisted_run_context
|
||||
|
||||
_pause_ctx = merge_persisted_run_context(
|
||||
self.automation2,
|
||||
runId,
|
||||
status="paused",
|
||||
nodeOutputs=context.get("nodeOutputs"),
|
||||
currentNodeId=nodeId,
|
||||
context={
|
||||
{
|
||||
"connectionMap": context.get("connectionMap"),
|
||||
"inputSources": context.get("inputSources"),
|
||||
"orderedNodeIds": [n.get("id") for n in context.get("_orderedNodes", []) if n.get("id")],
|
||||
},
|
||||
)
|
||||
self.automation2.updateRun(
|
||||
runId,
|
||||
status="paused",
|
||||
nodeOutputs=context.get("nodeOutputs"),
|
||||
currentNodeId=nodeId,
|
||||
context=_pause_ctx,
|
||||
)
|
||||
logger.info("InputExecutor node %s: created task %s, run %s paused", nodeId, taskId, runId)
|
||||
raise PauseForHumanTaskError(runId=runId, taskId=taskId, nodeId=nodeId)
|
||||
|
|
|
|||
215
modules/workflows/automation2/graphicalEditorRunFileLogger.py
Normal file
215
modules/workflows/automation2/graphicalEditorRunFileLogger.py
Normal file
|
|
@ -0,0 +1,215 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
"""Per-run NDJSON logs for persisted Automation2 / graphical-editor runs."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from modules.shared.configuration import APP_CONFIG
|
||||
from modules.shared.debugLogger import ensureDir, resolve_app_log_dir
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
RUN_FILE_LOG_RELATIVE_ROOT = "graphical_editor_runs"
|
||||
CONTEXT_KEY = "_geRunFileLogRelativeDir"
|
||||
EXECUTION_FILENAME = "node_execution.ndjson"
|
||||
CONTEXT_SNAPSHOT_FILENAME = "workflow_context.ndjson"
|
||||
|
||||
|
||||
def graphical_editor_run_file_logging_enabled() -> bool:
|
||||
"""True when NDJSON files should be written for each persisted run."""
|
||||
raw = APP_CONFIG.get("APP_GRAPHICAL_EDITOR_RUN_FILE_LOGGING", False)
|
||||
if isinstance(raw, bool):
|
||||
return raw
|
||||
s = str(raw).strip().lower()
|
||||
return s in ("1", "true", "yes", "on")
|
||||
|
||||
|
||||
def merge_run_context_with_ge_log_prefix(
|
||||
base_context: Optional[Dict[str, Any]],
|
||||
incoming: Dict[str, Any],
|
||||
) -> Dict[str, Any]:
|
||||
"""Copy ``CONTEXT_KEY`` from *base_context* onto *incoming* if present (pause paths)."""
|
||||
out = dict(incoming or {})
|
||||
prev = (base_context or {}).get(CONTEXT_KEY)
|
||||
if prev is not None:
|
||||
out[CONTEXT_KEY] = prev
|
||||
return out
|
||||
|
||||
|
||||
def merge_persisted_run_context(
|
||||
automation2_interface: Any,
|
||||
run_id: str,
|
||||
replacement: Dict[str, Any],
|
||||
) -> Dict[str, Any]:
|
||||
"""``{**db_context, **replacement}`` so *_geRunFileLogRelativeDir* and other keys survive pause updates."""
|
||||
prev = dict((automation2_interface.getRun(run_id) or {}).get("context") or {})
|
||||
return {**prev, **(replacement or {})}
|
||||
|
||||
|
||||
class GraphicalEditorRunFileLogger:
|
||||
"""Append-only NDJSON log for one run folder under ``resolve_app_log_dir()``."""
|
||||
|
||||
__slots__ = ("_exec_path", "_ctx_path", "_lock", "_run_id")
|
||||
|
||||
def __init__(self, run_id: str, absolute_run_dir: str) -> None:
|
||||
self._run_id = run_id
|
||||
ensureDir(absolute_run_dir)
|
||||
self._exec_path = os.path.join(absolute_run_dir, EXECUTION_FILENAME)
|
||||
self._ctx_path = os.path.join(absolute_run_dir, CONTEXT_SNAPSHOT_FILENAME)
|
||||
self._lock = asyncio.Lock()
|
||||
|
||||
@property
|
||||
def run_id(self) -> str:
|
||||
return self._run_id
|
||||
|
||||
@staticmethod
|
||||
def fresh_run_subdirectory_name(run_id: str) -> str:
|
||||
ts = datetime.now(timezone.utc).strftime("%Y_%m_%d_%H_%M_%S")
|
||||
return f"{ts}__{run_id}"
|
||||
|
||||
@staticmethod
|
||||
def relative_run_path(subdir_name: str) -> str:
|
||||
"""Path relative to ``APP_LOGGING_LOG_DIR`` (POSIX-style segments)."""
|
||||
return "/".join((RUN_FILE_LOG_RELATIVE_ROOT, subdir_name))
|
||||
|
||||
@classmethod
|
||||
def bootstrap_new_run(cls, automation2_interface: Any, run_id: str, run_context: Dict[str, Any]) -> GraphicalEditorRunFileLogger | None:
|
||||
"""Create filesystem folder + persist CONTEXT_KEY via ``updateRun``."""
|
||||
if not graphical_editor_run_file_logging_enabled():
|
||||
return None
|
||||
if not automation2_interface or not run_id:
|
||||
return None
|
||||
subdir = cls.fresh_run_subdirectory_name(run_id)
|
||||
rel = cls.relative_run_path(subdir)
|
||||
base = resolve_app_log_dir()
|
||||
absolute = os.path.join(base, RUN_FILE_LOG_RELATIVE_ROOT, subdir)
|
||||
|
||||
merged = dict(run_context or {})
|
||||
merged[CONTEXT_KEY] = rel
|
||||
try:
|
||||
automation2_interface.updateRun(run_id, context=merged)
|
||||
except Exception as ex:
|
||||
logger.warning("GeRunFileLog: could not persist log dir on run=%s: %s", run_id, ex)
|
||||
return None
|
||||
|
||||
logger.info(
|
||||
"GeRunFileLog: created run folder %s (run=%s)",
|
||||
absolute,
|
||||
run_id,
|
||||
)
|
||||
return cls(run_id, absolute)
|
||||
|
||||
@classmethod
|
||||
def open_from_run_record(cls, automation2_interface: Any, run_id: str) -> GraphicalEditorRunFileLogger | None:
|
||||
"""Open logger for an existing run using CONTEXT_KEY from DB."""
|
||||
if not graphical_editor_run_file_logging_enabled():
|
||||
return None
|
||||
if not automation2_interface or not run_id:
|
||||
return None
|
||||
try:
|
||||
run = automation2_interface.getRun(run_id) or {}
|
||||
except Exception as ex:
|
||||
logger.debug("GeRunFileLog: getRun failed run=%s: %s", run_id, ex)
|
||||
return None
|
||||
rel = (run.get("context") or {}).get(CONTEXT_KEY)
|
||||
if not rel or not isinstance(rel, str):
|
||||
return None
|
||||
base_norm = os.path.realpath(resolve_app_log_dir())
|
||||
allowed_root = os.path.realpath(os.path.join(base_norm, RUN_FILE_LOG_RELATIVE_ROOT))
|
||||
cand = os.path.realpath(os.path.join(base_norm, *rel.replace("\\", "/").split("/")))
|
||||
if cand != allowed_root and not cand.startswith(allowed_root + os.sep):
|
||||
logger.warning(
|
||||
"GeRunFileLog: path outside log root denied for run=%s rel=%s",
|
||||
run_id,
|
||||
rel,
|
||||
)
|
||||
return None
|
||||
absolute = cand
|
||||
return cls(run_id, absolute)
|
||||
|
||||
@classmethod
|
||||
def find_existing_absolute_dir(cls, run_id: str) -> Optional[str]:
|
||||
"""If a folder named ``*{timestamp}__{run_id}`` exists under the log root, return its absolute path."""
|
||||
root = os.path.realpath(os.path.join(resolve_app_log_dir(), RUN_FILE_LOG_RELATIVE_ROOT))
|
||||
if not os.path.isdir(root):
|
||||
return None
|
||||
suffix = f"__{run_id}"
|
||||
try:
|
||||
names = sorted((n for n in os.listdir(root) if n.endswith(suffix)), reverse=True)
|
||||
except OSError:
|
||||
return None
|
||||
if not names:
|
||||
return None
|
||||
cand = os.path.realpath(os.path.join(root, names[0]))
|
||||
allowed_root = root
|
||||
if cand != allowed_root and not cand.startswith(allowed_root + os.sep):
|
||||
return None
|
||||
return cand if os.path.isdir(cand) else None
|
||||
|
||||
@classmethod
|
||||
def ensure_attached(cls, automation2_interface: Any, run_id: str) -> GraphicalEditorRunFileLogger | None:
|
||||
"""Open logger from DB, or reattach an on-disk folder for *run_id*, or create a new one."""
|
||||
opened = cls.open_from_run_record(automation2_interface, run_id)
|
||||
if opened is not None:
|
||||
return opened
|
||||
if not graphical_editor_run_file_logging_enabled():
|
||||
return None
|
||||
if not automation2_interface or not run_id:
|
||||
return None
|
||||
try:
|
||||
run = automation2_interface.getRun(run_id) or {}
|
||||
except Exception as ex:
|
||||
logger.debug("GeRunFileLog: ensure getRun failed run=%s: %s", run_id, ex)
|
||||
return None
|
||||
prev_ctx = dict(run.get("context") or {})
|
||||
|
||||
existing_abs = cls.find_existing_absolute_dir(run_id)
|
||||
if existing_abs:
|
||||
base_norm = os.path.realpath(resolve_app_log_dir())
|
||||
rel = os.path.relpath(existing_abs, base_norm).replace(os.sep, "/")
|
||||
merged = {**prev_ctx, CONTEXT_KEY: rel}
|
||||
try:
|
||||
automation2_interface.updateRun(run_id, context=merged)
|
||||
except Exception as ex:
|
||||
logger.warning("GeRunFileLog: reattach persist failed run=%s: %s", run_id, ex)
|
||||
return None
|
||||
logger.info("GeRunFileLog: reattached existing folder for run=%s -> %s", run_id, existing_abs)
|
||||
return cls(run_id, existing_abs)
|
||||
|
||||
subdir = cls.fresh_run_subdirectory_name(run_id)
|
||||
rel = cls.relative_run_path(subdir)
|
||||
base = resolve_app_log_dir()
|
||||
absolute = os.path.join(base, RUN_FILE_LOG_RELATIVE_ROOT, subdir)
|
||||
merged = {**prev_ctx, CONTEXT_KEY: rel}
|
||||
try:
|
||||
automation2_interface.updateRun(run_id, context=merged)
|
||||
except Exception as ex:
|
||||
logger.warning("GeRunFileLog: ensure new folder persist failed run=%s: %s", run_id, ex)
|
||||
return None
|
||||
logger.info("GeRunFileLog: created late attach folder %s (run=%s)", absolute, run_id)
|
||||
return cls(run_id, absolute)
|
||||
|
||||
async def append_node_execution_line(self, record: Dict[str, Any]) -> None:
|
||||
line = json.dumps(record, ensure_ascii=False, default=str)
|
||||
async with self._lock:
|
||||
try:
|
||||
with open(self._exec_path, "a", encoding="utf-8") as f:
|
||||
f.write(line + "\n")
|
||||
except Exception as ex:
|
||||
logger.warning("GeRunFileLog: append execution failed run=%s: %s", self._run_id, ex)
|
||||
|
||||
async def append_context_snapshot_line(self, record: Dict[str, Any]) -> None:
|
||||
line = json.dumps(record, ensure_ascii=False, default=str)
|
||||
async with self._lock:
|
||||
try:
|
||||
with open(self._ctx_path, "a", encoding="utf-8") as f:
|
||||
f.write(line + "\n")
|
||||
except Exception as ex:
|
||||
logger.warning("GeRunFileLog: append context snapshot failed run=%s: %s", self._run_id, ex)
|
||||
|
|
@ -10,15 +10,19 @@ Returns a unified handover compatible with AiResult-style downstream wiring:
|
|||
``handoverMediaDocumentName`` matching a sibling blob document.
|
||||
- ``documents[1:]``: each extracted image as its own binary ``ActionDocument`` (like
|
||||
``ai.process`` artefact outputs).
|
||||
- ``ActionResult.data["response"]`` plus normalized executor field ``response``: concatenated
|
||||
plain text from all text parts — safe default for ``file.create`` / primaryTextRef."""
|
||||
- Root ``presentation`` inside the JSON (`schemaVersion`, per-file modes/lines/pages/chunks/…)
|
||||
— built from filtered ``parts`` without changing extractor output.
|
||||
- ``ActionResult.data["response"]`` plus normalized executor field ``response``: flat text derived
|
||||
from ``presentation`` (downstream-friendly wie zuvor fuer ``file.create`` / ``primaryTextRef``)."""
|
||||
|
||||
import base64 as _b64
|
||||
import binascii as _binascii
|
||||
import csv
|
||||
import logging
|
||||
import re
|
||||
from io import StringIO
|
||||
import time
|
||||
from typing import Any, Dict, List, Tuple
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
|
||||
from modules.datamodels.datamodelDocref import coerceDocumentReferenceList
|
||||
|
|
@ -32,6 +36,12 @@ HANDOVER_KIND = "context.extractContent.handover.v1"
|
|||
|
||||
_CONTENT_FILTER_OPTIONS = ("all", "textOnly", "imagesOnly", "noImages")
|
||||
|
||||
PRESENTATION_SCHEMA_VERSION = 1
|
||||
|
||||
_PDF_EXTRACT_PRESENTATION_MODES = ("text", "tables", "images", "all")
|
||||
_OUTPUT_MODES = ("blob", "lines", "pages", "chunks", "structured")
|
||||
_SPLIT_BY_VALUES = ("newline", "paragraph", "sentence")
|
||||
_CHUNK_UNITS = ("tokens", "characters", "words")
|
||||
|
||||
def _apply_content_filter(payload: Dict[str, Any], content_filter: str) -> Dict[str, Any]:
|
||||
"""Filter parts in the handover payload by content_filter.
|
||||
|
|
@ -141,6 +151,498 @@ def _joined_text_from_handover_payload(payload: Dict[str, Any]) -> str:
|
|||
return "\n\n".join(chunks)
|
||||
|
||||
|
||||
def _normalize_bool_select(value: Any, *, default: bool) -> bool:
|
||||
s = str(value if value is not None else "").strip().lower()
|
||||
if s in ("true", "1", "yes", "on"):
|
||||
return True
|
||||
if s in ("false", "0", "no", "off"):
|
||||
return False
|
||||
return default
|
||||
|
||||
|
||||
def _parse_positive_int(value: Any, default: int) -> int:
|
||||
try:
|
||||
n = int(str(value).strip())
|
||||
return n if n > 0 else default
|
||||
except (TypeError, ValueError):
|
||||
return default
|
||||
|
||||
|
||||
def _parse_non_negative_int(value: Any, default: int) -> int:
|
||||
try:
|
||||
n = int(str(value).strip())
|
||||
return n if n >= 0 else default
|
||||
except (TypeError, ValueError):
|
||||
return default
|
||||
|
||||
|
||||
def parse_presentation_parameters(parameters: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Defaults match ``context.extractContent`` node schema in ``context.py``."""
|
||||
output_mode = str(parameters.get("outputMode") or "lines").strip().lower()
|
||||
if output_mode not in _OUTPUT_MODES:
|
||||
output_mode = "lines"
|
||||
split_by = str(parameters.get("splitBy") or "newline").strip().lower()
|
||||
if split_by not in _SPLIT_BY_VALUES:
|
||||
split_by = "newline"
|
||||
chunk_unit = str(parameters.get("chunkSizeUnit") or "tokens").strip().lower()
|
||||
if chunk_unit not in _CHUNK_UNITS:
|
||||
chunk_unit = "tokens"
|
||||
pdf_mode = str(parameters.get("pdfExtractMode") or "text").strip().lower()
|
||||
if pdf_mode not in _PDF_EXTRACT_PRESENTATION_MODES:
|
||||
pdf_mode = "text"
|
||||
return {
|
||||
"outputMode": output_mode,
|
||||
"splitBy": split_by,
|
||||
"chunkSizeUnit": chunk_unit,
|
||||
"chunkSize": _parse_positive_int(parameters.get("chunkSize"), 500),
|
||||
"chunkOverlap": _parse_non_negative_int(parameters.get("chunkOverlap"), 0),
|
||||
"filterEmptyLines": _normalize_bool_select(parameters.get("filterEmptyLines"), default=True),
|
||||
"trimWhitespace": _normalize_bool_select(parameters.get("trimWhitespace"), default=True),
|
||||
"includeLineNumbers": _normalize_bool_select(parameters.get("includeLineNumbers"), default=False),
|
||||
"includeMetadata": _normalize_bool_select(parameters.get("includeMetadata"), default=False),
|
||||
"csvHeaderRow": _normalize_bool_select(parameters.get("csvHeaderRow"), default=True),
|
||||
"pdfExtractMode": pdf_mode,
|
||||
"markdownPreserveFormatting": _normalize_bool_select(
|
||||
parameters.get("markdownPreserveFormatting"),
|
||||
default=False,
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def _copy_part(p: Dict[str, Any]) -> Dict[str, Any]:
|
||||
return dict(p)
|
||||
|
||||
|
||||
def _presentation_filter_parts(parts: List[Dict[str, Any]], pdf_mode: str) -> List[Dict[str, Any]]:
|
||||
"""Filter **copies** of parts for the presentation layer (``pdfExtractMode``)."""
|
||||
if pdf_mode == "all":
|
||||
return [_copy_part(p) for p in parts if isinstance(p, dict)]
|
||||
out: List[Dict[str, Any]] = []
|
||||
for p in parts:
|
||||
if not isinstance(p, dict):
|
||||
continue
|
||||
tg = (p.get("typeGroup") or "").strip()
|
||||
if pdf_mode == "text":
|
||||
if tg == "image":
|
||||
continue
|
||||
if tg in ("text", "table", "structure"):
|
||||
out.append(_copy_part(p))
|
||||
elif pdf_mode == "tables":
|
||||
if tg == "table":
|
||||
out.append(_copy_part(p))
|
||||
elif pdf_mode == "images":
|
||||
if tg == "image":
|
||||
out.append(_copy_part(p))
|
||||
return out
|
||||
|
||||
|
||||
def _simplify_markdown_light(text: str) -> str:
|
||||
"""Cheap markdown-to-plain pass (no tokenizer library)."""
|
||||
s = text
|
||||
s = re.sub(r"`([^`]*)`", r"\1", s)
|
||||
s = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", s)
|
||||
s = re.sub(r"^#+\s*", "", s, flags=re.MULTILINE)
|
||||
s = s.replace("**", "").replace("__", "")
|
||||
s = re.sub(r"[*_]{1,2}([^*_]+)[*_]{1,2}", r"\1", s)
|
||||
return s.strip()
|
||||
|
||||
|
||||
def _apply_markdown_presentation_on_parts(parts: List[Dict[str, Any]], preserve: bool) -> None:
|
||||
if preserve:
|
||||
return
|
||||
for p in parts:
|
||||
mime = (p.get("mimeType") or "").strip().lower()
|
||||
if mime != "text/markdown":
|
||||
continue
|
||||
raw = p.get("data")
|
||||
if raw is None:
|
||||
continue
|
||||
p["data"] = _simplify_markdown_light(str(raw))
|
||||
|
||||
|
||||
def _part_metadata_dict(p: Dict[str, Any]) -> Dict[str, Any]:
|
||||
meta = p.get("metadata")
|
||||
if isinstance(meta, dict):
|
||||
return dict(meta)
|
||||
return {}
|
||||
|
||||
|
||||
def _page_index_from_part(p: Dict[str, Any]) -> int:
|
||||
meta = _part_metadata_dict(p)
|
||||
pi = meta.get("pageIndex")
|
||||
try:
|
||||
return int(pi) if pi is not None else 0
|
||||
except (TypeError, ValueError):
|
||||
return 0
|
||||
|
||||
|
||||
def _is_csv_source(source_file_name: str, parts: List[Dict[str, Any]]) -> bool:
|
||||
low = (source_file_name or "").lower()
|
||||
if low.endswith(".csv"):
|
||||
return True
|
||||
for p in parts:
|
||||
if not isinstance(p, dict):
|
||||
continue
|
||||
mime = (p.get("mimeType") or "").strip().lower()
|
||||
if mime == "text/csv" or mime.endswith("csv"):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _csv_text_from_parts(parts: List[Dict[str, Any]]) -> Optional[str]:
|
||||
"""Prefer explicit CSV table part payload; else None."""
|
||||
for p in parts:
|
||||
if not isinstance(p, dict):
|
||||
continue
|
||||
if (p.get("typeGroup") or "").strip() != "table":
|
||||
continue
|
||||
mime = (p.get("mimeType") or "").strip().lower()
|
||||
if "csv" in mime or mime == "text/plain":
|
||||
raw = p.get("data")
|
||||
if raw is None:
|
||||
continue
|
||||
return str(raw)
|
||||
for p in parts:
|
||||
if not isinstance(p, dict):
|
||||
continue
|
||||
if (p.get("typeGroup") or "").strip() == "text":
|
||||
mime = (p.get("mimeType") or "").strip().lower()
|
||||
if mime == "text/csv":
|
||||
raw = p.get("data")
|
||||
if raw is not None:
|
||||
return str(raw)
|
||||
return None
|
||||
|
||||
|
||||
def _parse_csv_rows(csv_text: str, header_row: bool) -> Optional[Dict[str, Any]]:
|
||||
try:
|
||||
reader = csv.reader(StringIO(csv_text))
|
||||
rows = [list(r) for r in reader]
|
||||
except csv.Error:
|
||||
return None
|
||||
if not rows:
|
||||
return {"headers": [], "rows": []}
|
||||
if not header_row:
|
||||
return {"headers": [], "rows": rows}
|
||||
headers = [str(c).strip() for c in rows[0]]
|
||||
body = rows[1:]
|
||||
dict_rows: List[Dict[str, str]] = []
|
||||
for r in body:
|
||||
item: Dict[str, str] = {}
|
||||
for i, h in enumerate(headers):
|
||||
key = h or f"column_{i + 1}"
|
||||
item[key] = str(r[i]).strip() if i < len(r) else ""
|
||||
dict_rows.append(item)
|
||||
return {"headers": headers, "rows": dict_rows}
|
||||
|
||||
|
||||
def _segment_merged_text(merged: str, split_by: str) -> List[str]:
|
||||
if split_by == "paragraph":
|
||||
return [s for s in re.split(r"\n\s*\n+", merged) if s != ""]
|
||||
if split_by == "sentence":
|
||||
pieces = re.split(r"(?<=[.!?])\s+", merged)
|
||||
return [s for s in pieces if s.strip() != ""]
|
||||
return merged.split("\n")
|
||||
|
||||
|
||||
def _apply_line_filters(
|
||||
segments: List[str],
|
||||
*,
|
||||
filter_empty: bool,
|
||||
trim_ws: bool,
|
||||
) -> List[str]:
|
||||
out: List[str] = []
|
||||
for seg in segments:
|
||||
s = seg
|
||||
if trim_ws:
|
||||
s = s.strip()
|
||||
else:
|
||||
s = str(s)
|
||||
if filter_empty and (not s or not s.strip()):
|
||||
continue
|
||||
out.append(s)
|
||||
return out
|
||||
|
||||
|
||||
def _chars_per_unit(unit: str, chunk_size: int) -> int:
|
||||
# Token path: rough heuristic ~4 characters per token (documented convention).
|
||||
if unit == "tokens":
|
||||
return max(1, chunk_size * 4)
|
||||
if unit == "words":
|
||||
return max(1, chunk_size * 6)
|
||||
return max(1, chunk_size)
|
||||
|
||||
|
||||
def _overlap_chars(unit: str, overlap: int, chunk_size: int) -> int:
|
||||
return min(_chars_per_unit(unit, overlap), _chars_per_unit(unit, chunk_size))
|
||||
|
||||
|
||||
def _chunk_plain_text(text: str, cfg: Dict[str, Any]) -> List[str]:
|
||||
unit = cfg["chunkSizeUnit"]
|
||||
size = cfg["chunkSize"]
|
||||
overlap_amount = cfg["chunkOverlap"]
|
||||
if unit == "words":
|
||||
words = text.split()
|
||||
if not words:
|
||||
return []
|
||||
out: List[str] = []
|
||||
step = max(1, size - overlap_amount)
|
||||
i = 0
|
||||
while i < len(words):
|
||||
chunk_words = words[i : i + size]
|
||||
out.append(" ".join(chunk_words))
|
||||
if len(chunk_words) < size:
|
||||
break
|
||||
i += step
|
||||
return out
|
||||
csize = _chars_per_unit(unit, size)
|
||||
ovl = min(_overlap_chars(unit, overlap_amount, size), csize - 1) if csize > 1 else 0
|
||||
if not text:
|
||||
return []
|
||||
out: List[str] = []
|
||||
start = 0
|
||||
while start < len(text):
|
||||
end = min(len(text), start + csize)
|
||||
out.append(text[start:end])
|
||||
if end >= len(text):
|
||||
break
|
||||
start = max(0, end - ovl)
|
||||
return out
|
||||
|
||||
|
||||
def _base_item_meta(
|
||||
source_file_name: str,
|
||||
cfg: Dict[str, Any],
|
||||
*,
|
||||
segment_index: int,
|
||||
offset_hint: Optional[int] = None,
|
||||
page_index: Optional[int] = None,
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
if not cfg.get("includeMetadata"):
|
||||
return None
|
||||
m: Dict[str, Any] = {"segmentIndex": segment_index}
|
||||
if source_file_name:
|
||||
m["sourceFileName"] = source_file_name
|
||||
if offset_hint is not None:
|
||||
m["charOffsetApprox"] = offset_hint
|
||||
if page_index is not None:
|
||||
m["pageIndex"] = page_index
|
||||
return m
|
||||
|
||||
|
||||
def presentation_response_text(
|
||||
presentation: Dict[str, Any],
|
||||
payload: Dict[str, Any],
|
||||
) -> str:
|
||||
"""Derive flattened ``response`` text from ``presentation.files``."""
|
||||
|
||||
files_section = presentation.get("files") or {}
|
||||
ordered = payload.get("fileOrder")
|
||||
keys: List[str] = ordered if isinstance(ordered, list) and ordered else list(files_section.keys())
|
||||
chunks: List[str] = []
|
||||
for fk in keys:
|
||||
bucket = files_section.get(fk)
|
||||
if not isinstance(bucket, dict):
|
||||
continue
|
||||
mode = (bucket.get("outputMode") or "").strip()
|
||||
if mode == "blob":
|
||||
t = bucket.get("text")
|
||||
if isinstance(t, str) and t.strip():
|
||||
chunks.append(t.strip())
|
||||
elif mode == "lines":
|
||||
for it in bucket.get("items") or []:
|
||||
if isinstance(it, dict):
|
||||
tx = it.get("text")
|
||||
if isinstance(tx, str) and tx.strip():
|
||||
chunks.append(tx.strip())
|
||||
elif mode == "pages":
|
||||
for pg in bucket.get("pages") or []:
|
||||
if not isinstance(pg, dict):
|
||||
continue
|
||||
for it in pg.get("items") or []:
|
||||
if isinstance(it, dict):
|
||||
tx = it.get("text")
|
||||
if isinstance(tx, str) and tx.strip():
|
||||
chunks.append(tx.strip())
|
||||
elif mode == "chunks":
|
||||
for it in bucket.get("chunks") or []:
|
||||
if isinstance(it, dict):
|
||||
tx = it.get("text")
|
||||
if isinstance(tx, str) and tx.strip():
|
||||
chunks.append(tx.strip())
|
||||
elif mode == "structured":
|
||||
for it in bucket.get("items") or []:
|
||||
if not isinstance(it, dict):
|
||||
continue
|
||||
if not _part_carries_plain_text(it):
|
||||
continue
|
||||
tx = it.get("data")
|
||||
if isinstance(tx, str) and tx.strip():
|
||||
chunks.append(tx.strip())
|
||||
return "\n\n".join(chunks)
|
||||
|
||||
|
||||
def build_presentation_for_payload(payload: Dict[str, Any], cfg: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Build root ``presentation`` object (does not mutate ``payload``)."""
|
||||
files_section = payload.get("files") or {}
|
||||
ordered = payload.get("fileOrder")
|
||||
keys: List[str] = ordered if isinstance(ordered, list) and ordered else list(files_section.keys())
|
||||
out_files: Dict[str, Any] = {}
|
||||
for fk in keys:
|
||||
bucket = files_section.get(fk)
|
||||
if not isinstance(bucket, dict):
|
||||
continue
|
||||
source_name = str(bucket.get("sourceFileName") or "")
|
||||
raw_parts = [p for p in (bucket.get("parts") or []) if isinstance(p, dict)]
|
||||
parts = _presentation_filter_parts(raw_parts, cfg["pdfExtractMode"])
|
||||
_apply_markdown_presentation_on_parts(parts, cfg["markdownPreserveFormatting"])
|
||||
out_files[fk] = _build_file_presentation(source_name, parts, cfg)
|
||||
return {
|
||||
"schemaVersion": PRESENTATION_SCHEMA_VERSION,
|
||||
"kind": "context.extractContent.presentation.v1",
|
||||
"outputMode": cfg["outputMode"],
|
||||
"fileOrder": keys,
|
||||
"files": out_files,
|
||||
}
|
||||
|
||||
|
||||
def _join_parts_plain_text(parts: List[Dict[str, Any]]) -> str:
|
||||
blocks: List[str] = []
|
||||
for p in parts:
|
||||
if not _part_carries_plain_text(p):
|
||||
continue
|
||||
raw = p.get("data")
|
||||
if raw is None:
|
||||
continue
|
||||
s = str(raw).strip()
|
||||
if s:
|
||||
blocks.append(s)
|
||||
return "\n\n".join(blocks)
|
||||
|
||||
|
||||
def _redact_large_part_payload(p: Dict[str, Any]) -> Dict[str, Any]:
|
||||
pc = dict(p)
|
||||
tg = (pc.get("typeGroup") or "").strip().lower()
|
||||
mime = (pc.get("mimeType") or "").strip().lower()
|
||||
if tg == "image" or mime.startswith("image/"):
|
||||
pc["data"] = ""
|
||||
return pc
|
||||
|
||||
|
||||
def _build_file_presentation(
|
||||
source_file_name: str,
|
||||
parts: List[Dict[str, Any]],
|
||||
cfg: Dict[str, Any],
|
||||
) -> Dict[str, Any]:
|
||||
output_mode = cfg["outputMode"]
|
||||
merge_plain = _join_parts_plain_text(parts)
|
||||
|
||||
csv_block: Optional[Dict[str, Any]] = None
|
||||
if _is_csv_source(source_file_name, parts):
|
||||
csv_txt = _csv_text_from_parts(parts)
|
||||
if csv_txt is not None:
|
||||
csv_block = _parse_csv_rows(csv_txt, cfg["csvHeaderRow"])
|
||||
|
||||
base: Dict[str, Any] = {
|
||||
"outputMode": output_mode,
|
||||
"sourceFileName": source_file_name or None,
|
||||
}
|
||||
if csv_block is not None:
|
||||
base["csv"] = csv_block
|
||||
|
||||
if output_mode == "blob":
|
||||
base["text"] = merge_plain
|
||||
return base
|
||||
|
||||
if output_mode == "structured":
|
||||
base["items"] = [_redact_large_part_payload(_copy_part(p)) for p in parts]
|
||||
return base
|
||||
|
||||
if output_mode == "pages":
|
||||
by_page: Dict[int, List[str]] = {}
|
||||
for p in parts:
|
||||
if not _part_carries_plain_text(p):
|
||||
continue
|
||||
raw = p.get("data")
|
||||
if raw is None:
|
||||
continue
|
||||
s = str(raw).strip()
|
||||
if not s:
|
||||
continue
|
||||
pi = _page_index_from_part(p)
|
||||
by_page.setdefault(pi, []).append(s)
|
||||
ordered_pages = sorted(by_page.keys())
|
||||
page_objs: List[Dict[str, Any]] = []
|
||||
for pi in ordered_pages:
|
||||
merged = "\n\n".join(by_page[pi])
|
||||
segs = _segment_merged_text(merged, cfg["splitBy"])
|
||||
segs = _apply_line_filters(
|
||||
segs,
|
||||
filter_empty=cfg["filterEmptyLines"],
|
||||
trim_ws=cfg["trimWhitespace"],
|
||||
)
|
||||
items: List[Dict[str, Any]] = []
|
||||
offset = 0
|
||||
for idx, seg in enumerate(segs, start=1):
|
||||
meta = _base_item_meta(
|
||||
source_file_name,
|
||||
cfg,
|
||||
segment_index=idx,
|
||||
offset_hint=offset,
|
||||
page_index=pi,
|
||||
)
|
||||
row: Dict[str, Any] = {"text": seg}
|
||||
if cfg["includeLineNumbers"]:
|
||||
row["lineNumber"] = idx
|
||||
if meta:
|
||||
row["metadata"] = meta
|
||||
items.append(row)
|
||||
offset += len(seg) + 1
|
||||
page_objs.append({"pageIndex": pi, "items": items})
|
||||
base["pages"] = page_objs
|
||||
return base
|
||||
|
||||
if output_mode == "chunks":
|
||||
segs = _segment_merged_text(merge_plain, cfg["splitBy"])
|
||||
segs = _apply_line_filters(
|
||||
segs,
|
||||
filter_empty=cfg["filterEmptyLines"],
|
||||
trim_ws=cfg["trimWhitespace"],
|
||||
)
|
||||
flat = "\n".join(segs)
|
||||
chunk_texts = _chunk_plain_text(flat, cfg)
|
||||
chunk_objs: List[Dict[str, Any]] = []
|
||||
for idx, ct in enumerate(chunk_texts, start=1):
|
||||
meta = _base_item_meta(source_file_name, cfg, segment_index=idx)
|
||||
row: Dict[str, Any] = {"index": idx, "text": ct}
|
||||
if meta:
|
||||
row["metadata"] = meta
|
||||
chunk_objs.append(row)
|
||||
base["chunks"] = chunk_objs
|
||||
return base
|
||||
|
||||
# lines (default): shared path with pages/chunks splitting
|
||||
segs = _segment_merged_text(merge_plain, cfg["splitBy"])
|
||||
segs = _apply_line_filters(
|
||||
segs,
|
||||
filter_empty=cfg["filterEmptyLines"],
|
||||
trim_ws=cfg["trimWhitespace"],
|
||||
)
|
||||
items: List[Dict[str, Any]] = []
|
||||
offset = 0
|
||||
for idx, seg in enumerate(segs, start=1):
|
||||
meta = _base_item_meta(source_file_name, cfg, segment_index=idx, offset_hint=offset)
|
||||
row = {"text": seg}
|
||||
if cfg["includeLineNumbers"]:
|
||||
row["lineNumber"] = idx
|
||||
if meta:
|
||||
row["metadata"] = meta
|
||||
items.append(row)
|
||||
offset += len(seg) + 1
|
||||
base["items"] = items
|
||||
return base
|
||||
|
||||
|
||||
def _mime_to_file_extension(mime: str) -> str:
|
||||
m = (mime or "").split(";")[0].strip().lower()
|
||||
mapping = {
|
||||
|
|
@ -364,6 +866,9 @@ async def extractContent(self, parameters: Dict[str, Any]) -> ActionResult:
|
|||
content_filter = "all"
|
||||
payload = _apply_content_filter(payload, content_filter)
|
||||
|
||||
pres_cfg = parse_presentation_parameters(parameters)
|
||||
presentation = build_presentation_for_payload(payload, pres_cfg)
|
||||
|
||||
stem = f"{wf}_{int(time.time())}"
|
||||
# Only split image sidecars when the filtered payload can still contain image parts.
|
||||
if content_filter in ("all", "imagesOnly"):
|
||||
|
|
@ -376,7 +881,8 @@ async def extractContent(self, parameters: Dict[str, Any]) -> ActionResult:
|
|||
stripped_payload = payload
|
||||
media_docs = []
|
||||
|
||||
joined_text = _joined_text_from_handover_payload(payload)
|
||||
stripped_payload["presentation"] = presentation
|
||||
joined_text = presentation_response_text(presentation, stripped_payload)
|
||||
|
||||
json_meta = {
|
||||
"actionType": "context.extractContent",
|
||||
|
|
|
|||
|
|
@ -320,18 +320,25 @@ def _pause_for_human_tasks(
|
|||
)
|
||||
task_id = str((task or {}).get("id") or "")
|
||||
ordered_ids = [n.get("id") for n in (run_context.get("_orderedNodes") or []) if n.get("id")]
|
||||
iface.updateRun(
|
||||
from modules.workflows.automation2.graphicalEditorRunFileLogger import merge_persisted_run_context
|
||||
|
||||
_pause_ctx = merge_persisted_run_context(
|
||||
iface,
|
||||
run_id,
|
||||
status="paused",
|
||||
nodeOutputs=run_context.get("nodeOutputs"),
|
||||
currentNodeId=node_id,
|
||||
context={
|
||||
{
|
||||
"connectionMap": run_context.get("connectionMap"),
|
||||
"inputSources": run_context.get("inputSources"),
|
||||
"orderedNodeIds": ordered_ids,
|
||||
"pauseReason": "contextAssignment",
|
||||
},
|
||||
)
|
||||
iface.updateRun(
|
||||
run_id,
|
||||
status="paused",
|
||||
nodeOutputs=run_context.get("nodeOutputs"),
|
||||
currentNodeId=node_id,
|
||||
context=_pause_ctx,
|
||||
)
|
||||
if not (run_id and task_id and node_id):
|
||||
raise RuntimeError("humanTask requires _runId, task id, and _workflowNodeId")
|
||||
raise PauseForHumanTaskError(runId=run_id, taskId=task_id, nodeId=node_id)
|
||||
|
|
|
|||
|
|
@ -7,6 +7,9 @@ from modules.workflows.methods.methodContext.actions.extractContent import (
|
|||
_apply_content_filter,
|
||||
_joined_text_from_handover_payload,
|
||||
_split_images_to_sidecar_documents,
|
||||
build_presentation_for_payload,
|
||||
parse_presentation_parameters,
|
||||
presentation_response_text,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -142,3 +145,116 @@ def test_content_filter_text_only_no_sidecars():
|
|||
result = _apply_content_filter(_mixed_payload(), "textOnly")
|
||||
stripped, blobs = _split_images_to_sidecar_documents(result, document_name_stem="test")
|
||||
assert blobs == []
|
||||
|
||||
|
||||
def test_presentation_lines_and_response():
|
||||
payload = {
|
||||
"kind": HANDOVER_KIND,
|
||||
"fileOrder": ["f1"],
|
||||
"files": {
|
||||
"f1": {
|
||||
"sourceFileName": "x.txt",
|
||||
"parts": [
|
||||
{"typeGroup": "text", "data": "a\n\nb", "id": "1"},
|
||||
],
|
||||
},
|
||||
},
|
||||
}
|
||||
cfg = parse_presentation_parameters({"outputMode": "lines", "splitBy": "paragraph"})
|
||||
pres = build_presentation_for_payload(payload, cfg)
|
||||
assert pres["files"]["f1"]["outputMode"] == "lines"
|
||||
assert [it["text"] for it in pres["files"]["f1"]["items"]] == ["a", "b"]
|
||||
assert presentation_response_text(pres, payload) == "a\n\nb"
|
||||
|
||||
|
||||
def test_presentation_pdf_mode_tables_only():
|
||||
payload = {
|
||||
"fileOrder": ["f1"],
|
||||
"files": {
|
||||
"f1": {
|
||||
"sourceFileName": "d.pdf",
|
||||
"parts": [
|
||||
{"typeGroup": "text", "data": "t", "id": "a"},
|
||||
{"typeGroup": "table", "mimeType": "text/csv", "data": "h1,h2\n1,2", "id": "b"},
|
||||
],
|
||||
},
|
||||
},
|
||||
}
|
||||
cfg = parse_presentation_parameters({"pdfExtractMode": "tables", "outputMode": "blob"})
|
||||
pres = build_presentation_for_payload(payload, cfg)
|
||||
assert pres["files"]["f1"]["text"] == "h1,h2\n1,2"
|
||||
|
||||
|
||||
def test_presentation_csv_rows():
|
||||
payload = {
|
||||
"fileOrder": ["f1"],
|
||||
"files": {
|
||||
"f1": {
|
||||
"sourceFileName": "f.csv",
|
||||
"parts": [{"typeGroup": "table", "mimeType": "text/csv", "data": "a,b\n1,2", "id": "t"}],
|
||||
},
|
||||
},
|
||||
}
|
||||
cfg = parse_presentation_parameters({"csvHeaderRow": "true"})
|
||||
pres = build_presentation_for_payload(payload, cfg)
|
||||
csv = pres["files"]["f1"]["csv"]
|
||||
assert csv["headers"] == ["a", "b"]
|
||||
assert csv["rows"] == [{"a": "1", "b": "2"}]
|
||||
|
||||
|
||||
def test_presentation_pages_groups_by_page_index():
|
||||
payload = {
|
||||
"fileOrder": ["f1"],
|
||||
"files": {
|
||||
"f1": {
|
||||
"sourceFileName": "p.pdf",
|
||||
"parts": [
|
||||
{"typeGroup": "text", "data": "p0", "metadata": {"pageIndex": 0}, "id": "a"},
|
||||
{"typeGroup": "text", "data": "p1a\np1b", "metadata": {"pageIndex": 1}, "id": "b"},
|
||||
],
|
||||
},
|
||||
},
|
||||
}
|
||||
cfg = parse_presentation_parameters({"outputMode": "pages", "splitBy": "newline"})
|
||||
pres = build_presentation_for_payload(payload, cfg)
|
||||
pages = pres["files"]["f1"]["pages"]
|
||||
assert [(p["pageIndex"], [it["text"] for it in p["items"]]) for p in pages] == [
|
||||
(0, ["p0"]),
|
||||
(1, ["p1a", "p1b"]),
|
||||
]
|
||||
|
||||
|
||||
def test_presentation_chunks_with_overlap_chars():
|
||||
payload = {
|
||||
"fileOrder": ["f1"],
|
||||
"files": {"f1": {"sourceFileName": "t.txt", "parts": [{"typeGroup": "text", "data": "abcdefghij", "id": "a"}]}},
|
||||
}
|
||||
cfg = parse_presentation_parameters(
|
||||
{"outputMode": "chunks", "chunkSizeUnit": "characters", "chunkSize": "4", "chunkOverlap": "2"}
|
||||
)
|
||||
pres = build_presentation_for_payload(payload, cfg)
|
||||
texts = [c["text"] for c in pres["files"]["f1"]["chunks"]]
|
||||
assert texts == ["abcd", "cdef", "efgh", "ghij"]
|
||||
|
||||
|
||||
def test_presentation_stripped_payload_gains_presentation_key_after_split():
|
||||
raw = b"x"
|
||||
b64 = base64.b64encode(raw).decode("ascii")
|
||||
payload = {
|
||||
"kind": HANDOVER_KIND,
|
||||
"schemaVersion": 1,
|
||||
"fileOrder": ["f1"],
|
||||
"files": {
|
||||
"f1": {
|
||||
"parts": [
|
||||
{"typeGroup": "text", "data": "txt", "id": "t"},
|
||||
{"typeGroup": "image", "mimeType": "image/png", "data": b64, "id": "img"},
|
||||
]
|
||||
}
|
||||
},
|
||||
}
|
||||
pres = build_presentation_for_payload(payload, parse_presentation_parameters({}))
|
||||
stripped, _blobs = _split_images_to_sidecar_documents(payload, document_name_stem="s")
|
||||
stripped["presentation"] = pres
|
||||
assert "presentation" in stripped
|
||||
assert stripped["presentation"]["files"]["f1"]["items"]
|
||||
|
|
|
|||
|
|
@ -25,7 +25,25 @@ def test_context_extractContent_node_shape():
|
|||
assert "DocumentList" in node["inputPorts"][0]["accepts"]
|
||||
assert "LoopItem" in node["inputPorts"][0]["accepts"]
|
||||
names = [p["name"] for p in node["parameters"]]
|
||||
assert names == ["documentList", "contentFilter"]
|
||||
assert names == [
|
||||
"documentList",
|
||||
"contentFilter",
|
||||
"outputMode",
|
||||
"splitBy",
|
||||
"chunkSizeUnit",
|
||||
"chunkSize",
|
||||
"chunkOverlap",
|
||||
"filterEmptyLines",
|
||||
"trimWhitespace",
|
||||
"includeLineNumbers",
|
||||
"includeMetadata",
|
||||
"csvHeaderRow",
|
||||
"pdfExtractMode",
|
||||
"markdownPreserveFormatting",
|
||||
]
|
||||
|
||||
pick_paths = [opt["path"] for opt in node["outputPorts"][0]["dataPickOptions"]]
|
||||
assert ["documents", 0, "documentData", "presentation"] in pick_paths
|
||||
|
||||
|
||||
def test_udm_port_types_registered():
|
||||
|
|
|
|||
Loading…
Reference in a new issue