diff --git a/modules/features/graphicalEditor/nodeDefinitions/ai.py b/modules/features/graphicalEditor/nodeDefinitions/ai.py
index ecdebcf6..a709f0be 100644
--- a/modules/features/graphicalEditor/nodeDefinitions/ai.py
+++ b/modules/features/graphicalEditor/nodeDefinitions/ai.py
@@ -6,6 +6,9 @@ from modules.shared.i18nRegistry import t
from modules.features.graphicalEditor.nodeDefinitions.contextPickerHelp import (
CONTEXT_BUILDER_PARAM_DESCRIPTION,
)
+from modules.features.graphicalEditor.nodeDefinitions.flow import (
+ CONTEXT_ENVELOPE_DATA_PICK_OPTIONS,
+)
# Shared authoritative DataPicker paths (same handover idea as ``context.extractContent`` outputPorts).
ACTION_RESULT_DATA_PICK_OPTIONS = [
@@ -43,6 +46,7 @@ ACTION_RESULT_DATA_PICK_OPTIONS = [
]
AI_RESULT_DATA_PICK_OPTIONS = [
+ *CONTEXT_ENVELOPE_DATA_PICK_OPTIONS,
{
"path": ["documents", 0, "documentData"],
"pickerLabel": t("Gesamter Inhalt"),
@@ -50,14 +54,14 @@ AI_RESULT_DATA_PICK_OPTIONS = [
"Hauptausgabedatei oder strukturierter Inhalt von ``documents[0]`` "
"(z. B. erzeugtes Dokument, JSON-Handover)."
),
- "recommended": True,
+ "recommended": False,
"type": "Any",
},
{
"path": ["response"],
"pickerLabel": t("Nur Text"),
"detail": t("Modell-Antwort als reiner Fließtext (ohne eingebettete Bildbytes)."),
- "recommended": True,
+ "recommended": False,
"type": "str",
},
{
diff --git a/modules/features/graphicalEditor/nodeDefinitions/context.py b/modules/features/graphicalEditor/nodeDefinitions/context.py
index 26c5b788..22e068dd 100644
--- a/modules/features/graphicalEditor/nodeDefinitions/context.py
+++ b/modules/features/graphicalEditor/nodeDefinitions/context.py
@@ -4,7 +4,10 @@
from modules.shared.i18nRegistry import t
-from modules.features.graphicalEditor.nodeDefinitions.flow import CONTEXT_MERGE_ACTION_RESULT_DATA_PICK_OPTIONS
+from modules.features.graphicalEditor.nodeDefinitions.flow import (
+ CONTEXT_ENVELOPE_DATA_PICK_OPTIONS,
+ CONTEXT_MERGE_ACTION_RESULT_DATA_PICK_OPTIONS,
+)
_CONTEXT_INPUT_SCHEMAS = [
"Transit",
@@ -27,11 +30,12 @@ CONTEXT_NODES = [
"category": "context",
"label": t("Inhalt extrahieren"),
"description": t(
- "Extrahiert Inhalt ohne KI. Ergebnis einheitlich wie KI-Schritte: `response` "
- "(gesammelter Klartext), strukturierte JSON-Unterlage in `documents[0]`, "
- "einzelne Bilder als eigene Dokumente `extract_media_*` (nur im Workflow, ohne Eintrag unter „Meine Dateien“) — "
- "Auswahl im Daten-Picker wie bei `ai.process`."
+ "Extrahiert Inhalt ohne KI. ``data`` ist die gewählte **Presentation** (`fileOrder`, `files` je "
+ "Quelldatei, kanonisches `data` pro Bucket) plus ``_meta`` (Quellnamen, Operation, Persist). "
+ "``response`` für diesen Knoten bleibt leer — kein zusätzlicher Fließtext. "
+ "``imageDocumentsOnly`` enthält Bilder über persistierte Artefakte."
),
+ "injectRunContext": True,
"parameters": [
{"name": "documentList", "type": "str", "required": True, "frontendType": "hidden",
"description": t("Dokumentenliste (via Wire oder DataRef)"), "default": "",
@@ -51,7 +55,7 @@ CONTEXT_NODES = [
},
"default": "all",
"description": t(
- "Welche Parts im Handover behalten werden. "
+ "Welche extrahierten Parts weiterverwendet werden. "
"all = alle Typgruppen inkl. Bilder; "
"textOnly = ausschliesslich Text-, Tabellen- und Struktur-Parts; "
"imagesOnly = ausschliesslich Bild-Parts; "
@@ -75,8 +79,7 @@ CONTEXT_NODES = [
},
"default": "lines",
"description": t(
- "Wie die extrahierten Inhalte unter ``presentation`` strukturiert werden "
- "(zusaetzlich zu den unveraenderten ``parts`` im Handover)."
+ "Wie das Ergebnis unter ``files`` strukturiert wird (``outputMode``: blob, lines, …)."
),
},
{
@@ -238,10 +241,11 @@ CONTEXT_NODES = [
{"value": "all", "label": t("PDF/Parts: alle Typgruppen")},
]
},
- "default": "text",
+ "default": "all",
"description": t(
"Filtert fuer die Presentation-Schicht nach typeGroup/MIME "
- "(gilt fuer alle Dokumenttypen analog, nicht nur PDF)."
+ "(gilt fuer alle Dokumenttypen analog, nicht nur PDF). "
+ "Passt zum Inhaltsfilter „Alles“; „Text & Tabellen“ blendet Bild-Parts in der Presentation aus."
),
},
{
@@ -271,51 +275,40 @@ CONTEXT_NODES = [
# Frontend uses only this list — no schema expansion merge for this port.
"dataPickOptions": [
{
- "path": ["documents", 0, "documentData"],
- "pickerLabel": t("Gesamter Inhalt"),
+ "path": ["data"],
+ "pickerLabel": t("Vollständiges data-Objekt"),
"detail": t(
- "Strukturiertes Handover als JSON inklusive aller Textteile "
- "und Verweisen auf ausgelagerte Bilder."
+ "Presentation-Envelope (``schemaVersion``, ``kind``, ``fileOrder``, ``files``) "
+ "plus ``_meta`` (``operationRef``, ``sourceFileNames``, Persist)."
),
"recommended": True,
"type": "Any",
},
{
- "path": ["documents", 0, "documentData", "presentation"],
- "pickerLabel": t("Presentation (strukturierte Sicht)"),
- "detail": t(
- "Nur die konfigurierte Ausgabe-Struktur (blob/lines/pages/chunks/structured); "
- "unveraenderte Roh-Parts bleiben im umschliessenden Handover."
- ),
+ "path": ["data", "files"],
+ "pickerLabel": t("Alle Dateibuckets"),
+ "detail": t("Map Dateischlüssel → Bucket (Zeilenliste, Blob, CSV-Tabelle bei structured, …)."),
"recommended": False,
"type": "Any",
},
- {
- "path": ["response"],
- "pickerLabel": t("Nur Text"),
- "detail": t(
- "Verketteter Klartext aus allen erkannten Textteilen."
- ),
- "recommended": True,
- "type": "str",
- },
{
"path": ["imageDocumentsOnly"],
"pickerLabel": t("Nur Bilder"),
"detail": t(
- "Nur die extrahierten Bilddokumente als Liste, ohne JSON-Handover."
+ "Nur die Bilder aus der Extraktion (persistierte Artefakte bzw. inline), "
+ "als Liste fuer nachgelagerte Schritte."
),
"recommended": False,
"type": "List[ActionDocument]",
},
{
- "path": ["documents"],
- "pickerLabel": t("Alle Dateitypen"),
+ "path": ["data", "_meta"],
+ "pickerLabel": t("Metadaten (_meta)"),
"detail": t(
- "Alle Ausgabedokumente nacheinander: JSON-Handover und Bilder."
+ "``operationRef``, ``sourceFileNames``, Presentation-Parameter, Liste persistierter Bilder."
),
"recommended": False,
- "type": "List[ActionDocument]",
+ "type": "Any",
},
],
}
@@ -330,6 +323,8 @@ CONTEXT_NODES = [
"label": t("Kontext zusammenführen"),
"description": t(
"Führt eine Liste von Ergebnissen zu einem einzigen Kontext zusammen. "
+ "Ausgabe ``data``: versionierter Umschlag (``schemaVersion``, ``kind``), Felder wie "
+ "``merged`` / ``first`` / ``response`` sowie ``_meta``. "
"Wähle als Datenquelle die Option Alle Schleifen-Ergebnisse einer Schleife, "
"um alle Iterationsergebnisse in einem Datensatz zu vereinen."
),
@@ -365,6 +360,8 @@ CONTEXT_NODES = [
"label": t("Kontext transformieren"),
"description": t(
"Verändert die Struktur des eingehenden Datenstroms. "
+ "Ausgabe ``data``: versionierter Umschlag (``schemaVersion``, ``kind``: transform), "
+ "konfigurierte Ausgabe-Felder und ``_meta``. "
"Operationen pro Mapping: 'rename' (Key umbenennen), 'cast' (Typ konvertieren), "
"'nest' (mehrere Felder unter neuem Objekt zusammenfassen), "
"'flatten' (verschachteltes Objekt auf oberste Ebene heben), "
@@ -423,6 +420,7 @@ CONTEXT_NODES = [
"dynamic": True,
"deriveFrom": "mappings",
"deriveNameField": "outputField",
+ "dataPickOptions": CONTEXT_ENVELOPE_DATA_PICK_OPTIONS,
}
},
"injectUpstreamPayload": True,
diff --git a/modules/features/graphicalEditor/nodeDefinitions/file.py b/modules/features/graphicalEditor/nodeDefinitions/file.py
index 8d4b390d..2b79f2e0 100644
--- a/modules/features/graphicalEditor/nodeDefinitions/file.py
+++ b/modules/features/graphicalEditor/nodeDefinitions/file.py
@@ -14,9 +14,8 @@ FILE_NODES = [
"category": "file",
"label": t("Datei erstellen"),
"description": t(
- "Erstellt eine Datei aus Kontext. Nach „Inhalt extrahieren“: „response“ für reinen Text; "
- "„Nur Bilder“ liefert alle extrahierten Bilder — Datei erstellen fasst sie zu einer PDF oder DOCX "
- "(Ausgabeformat pdf oder docx wählen)."
+ "Erstellt eine Datei aus der Presentation von „Inhalt extrahieren“ "
+ "(``data`` oder Schleifen-``bodyResults``). Ausgabe über den Generation-Service."
),
"parameters": [
{"name": "outputFormat", "type": "str", "required": True, "frontendType": "select",
@@ -29,7 +28,7 @@ FILE_NODES = [
"default": ""},
{"name": "context", "type": "Any", "required": False, "frontendType": "contextBuilder",
"description": CONTEXT_BUILDER_PARAM_DESCRIPTION, "default": "",
- "graphInherit": {"port": 0, "kind": "primaryTextRef"}},
+ "graphInherit": {"port": 0, "kind": "recommendedDataPickRef"}},
],
"inputs": 1,
"outputs": 1,
diff --git a/modules/features/graphicalEditor/nodeDefinitions/flow.py b/modules/features/graphicalEditor/nodeDefinitions/flow.py
index e47a063e..b2fc020b 100644
--- a/modules/features/graphicalEditor/nodeDefinitions/flow.py
+++ b/modules/features/graphicalEditor/nodeDefinitions/flow.py
@@ -63,6 +63,28 @@ LOOP_ITEM_DATA_PICK_OPTIONS = [
},
]
+# Base paths when ``ActionResult.data`` uses envelope + ``_meta`` (context.extractContent-style clarity).
+#
+# Shared DataPicker entries for outputs whose ``data`` is a versioned envelope:
+#   * ``["data"]``          - the whole envelope; the only entry flagged ``recommended``.
+#   * ``["data", "_meta"]`` - technical metadata only.
+# The list is spliced (``*``) into other picker lists and is also referenced directly as a
+# port's ``dataPickOptions``; treat it as read-only, since direct references share this object.
+CONTEXT_ENVELOPE_DATA_PICK_OPTIONS = [
+    {
+        "path": ["data"],
+        "pickerLabel": t("Vollständiges data-Objekt"),
+        "detail": t(
+            "Versionierter Kontext-Umschlag: ``schemaVersion``, ``kind``, Nutzdatenfelder, ``_meta``."
+        ),
+        "recommended": True,
+        "type": "Dict",
+    },
+    {
+        "path": ["data", "_meta"],
+        "pickerLabel": t("Technische Metadaten (_meta)"),
+        "detail": t(
+            "`actionType`, Payload-Schema-Version; bei Transform/Merge keine großen Payloads."
+        ),
+        "recommended": False,
+        "type": "Any",
+    },
+]
+
MERGE_RESULT_DATA_PICK_OPTIONS = [
{
"path": ["merged"],
@@ -90,6 +112,7 @@ MERGE_RESULT_DATA_PICK_OPTIONS = [
# Extended picker for ``context.mergeContext`` (ActionResult + ``surfaceDataAsTopLevel``): same
# merge keys as ``flow.merge`` plus ``count`` from the action payload.
CONTEXT_MERGE_ACTION_RESULT_DATA_PICK_OPTIONS = [
+ *CONTEXT_ENVELOPE_DATA_PICK_OPTIONS,
*MERGE_RESULT_DATA_PICK_OPTIONS,
{
"path": ["count"],
diff --git a/modules/features/graphicalEditor/portTypes.py b/modules/features/graphicalEditor/portTypes.py
index 24a97446..0784e436 100644
--- a/modules/features/graphicalEditor/portTypes.py
+++ b/modules/features/graphicalEditor/portTypes.py
@@ -315,14 +315,18 @@ PORT_TYPE_CATALOG: Dict[str, PortSchema] = {
# bindings like `processDocuments → documents → *` for syncToAccounting.
PortField(name="documents", type="List[ActionDocument]", required=False,
description=(
- "Dokumentliste: Index 0 oft JSON-Handover oder Hauptdatei; Einträge mit "
- "MIME image/* oder Namen extract_media_* sind ausgelagerte Bilder (documentData = Binär)."
+ "Dokumentliste für Actions mit echten Artefakt-Dokumenten. "
+ "Beim Knoten „Inhalt extrahieren“ fehlt dieses Feld in der Knotenausgabe."
),
picker_label=t("Alle Ausgabe-Dokumente"),
picker_item_label=t("je Dokument"),
),
PortField(name="data", type="Dict", required=False,
- description="Ergebnisdaten",
+ description=(
+ "Strukturierter Inhalt. Bei **context.extractContent**: **Presentation**-Root "
+ "(`schemaVersion`, `kind`, `fileOrder`, `files`) plus **`_meta`** — ohne "
+ "zusätzliches `response`/`contentExtracted`-Duplikat."
+ ),
picker_label=t("Technische Detaildaten (data)")),
# Mirror AiResult primary text fields so DataPicker / primaryTextRef behave the same
PortField(name="prompt", type="str", required=False,
@@ -330,7 +334,8 @@ PORT_TYPE_CATALOG: Dict[str, PortSchema] = {
picker_label=t("Auslöser / Prompt (falls vorhanden)")),
PortField(name="response", type="str", required=False,
description=(
- "Primär nur Fließtext (z. B. nach Extraktion: alle Text-Parts verkettet, keine Bilder)."
+ "Fließtext wo die Action einen liefert. Bei **„Inhalt extrahieren“** absichtlich leer — "
+ "Inhalt liegt in ``data``.``files``."
),
recommended=True,
picker_label=t("Nur Fließtext (gesamt)")),
@@ -339,12 +344,29 @@ PORT_TYPE_CATALOG: Dict[str, PortSchema] = {
picker_label=t("Mitgegebener Kontext")),
PortField(name="imageDocumentsOnly", type="List[ActionDocument]", required=False,
description=(
- "Nur Bildausgaben (ohne JSON-Handover), z. B. von context.extractContent."
+ "Nur Bild-bezogene Einträge. Bei „Inhalt extrahieren“: synthetische "
+ "Einträge mit ``fileId`` aus persistierten Extrakt-Bildern (kein separates JSON-Dokument)."
),
picker_label=t("Nur Bilder (Liste)")),
PortField(name="responseData", type="Dict", required=False,
description="Optional: strukturierte Zusatzdaten",
picker_label=t("Strukturierte Zusatzdaten")),
+ PortField(name="presentation", type="Dict", required=False,
+ description=(
+ "Selten: Top-Level-Spiegel von Präsentationsdaten andere Actions. "
+ "Bei „Inhalt extrahieren“ liegt alles direkt unter ``data`` (kein zusätzlicher Spiegel)."
+ ),
+ picker_label=t("Presentation (Top-Level-Spiegel)")),
+ PortField(name="presentationSummary", type="Dict", required=False,
+ description=(
+ "Kompakte Metadaten zu ``presentation`` (Debugging / traces)."
+ ),
+ picker_label=t("Presentation-Zusammenfassung")),
+ PortField(name="presentationConfig", type="Dict", required=False,
+ description=(
+ "Optional: Debugging-Konfiguration; bei Extract liegt die Primärquelle in ``validationMetadata`` des JSON-Dokuments."
+ ),
+ picker_label=t("Presentation-Konfiguration")),
]),
"Transit": PortSchema(name="Transit", fields=[]),
"UdmDocument": PortSchema(name="UdmDocument", carriesConnectionProvenance=True, fields=[
@@ -675,6 +697,8 @@ SYSTEM_VARIABLES: Dict[str, Dict[str, str]] = {
#
# When a parameter declares ``graphInherit.kind == "primaryTextRef"``, executeGraph
# inserts an explicit DataRef before run (see pickNotPushMigration.materializePrimaryTextHandover).
+# ``recommendedDataPickRef`` uses upstream ``outputPorts.dataPickOptions`` where ``recommended: true``
+# (see pickNotPushMigration.materializeRecommendedDataPickRef).
# Schema names are catalog output port types (e.g. AiResult).
PRIMARY_TEXT_HANDOVER_REF_PATH: Dict[str, List[Any]] = {
diff --git a/modules/features/graphicalEditor/upstreamPathsService.py b/modules/features/graphicalEditor/upstreamPathsService.py
index f0cb473e..13e84719 100644
--- a/modules/features/graphicalEditor/upstreamPathsService.py
+++ b/modules/features/graphicalEditor/upstreamPathsService.py
@@ -110,24 +110,29 @@ def compute_upstream_paths(graph: Dict[str, Any], target_node_id: str) -> List[D
out0 = (ndef.get("outputPorts") or {}).get(0, {})
out0 = out0 if isinstance(out0, dict) else {}
dpo = out0.get("dataPickOptions")
- if isinstance(dpo, list) and len(dpo) > 0:
+
+ bases: List[Dict[str, Any]] = []
+ if isinstance(dpo, list):
+ bases = _paths_for_data_pick_options(dpo, aid)
+ derived = parse_graph_defined_output_schema(anode, out0)
+ derived_paths: List[Dict[str, Any]] = []
+ if derived:
+ derived_paths = _paths_for_port_schema(derived, aid)
+
+ merged_list = bases + derived_paths
+ if merged_list:
plab = (anode.get("title") or "").strip() or aid
- for entry in _paths_for_data_pick_options(dpo, aid):
+ for entry in merged_list:
entry["producerLabel"] = plab
paths.append(entry)
continue
- derived = parse_graph_defined_output_schema(anode, out0)
- if derived:
- for entry in _paths_for_port_schema(derived, aid):
- entry["producerLabel"] = (anode.get("title") or "").strip() or aid
- paths.append(entry)
- else:
- raw_schema = out0.get("schema") if isinstance(out0, dict) else None
- schema_name = raw_schema if isinstance(raw_schema, str) and raw_schema else "ActionResult"
- for entry in _paths_for_schema(schema_name, aid):
- entry["producerLabel"] = (anode.get("title") or "").strip() or aid
- paths.append(entry)
+ raw_schema = out0.get("schema") if isinstance(out0, dict) else None
+ schema_name = raw_schema if isinstance(raw_schema, str) and raw_schema else "ActionResult"
+ plab = (anode.get("title") or "").strip() or aid
+ for entry in _paths_for_schema(schema_name, aid):
+ entry["producerLabel"] = plab
+ paths.append(entry)
# Lexical loop hints (flow.loop): only for nodes inside the loop body
for aid in ancestors:
diff --git a/modules/interfaces/interfaceDbManagement.py b/modules/interfaces/interfaceDbManagement.py
index 794063f4..f412cea7 100644
--- a/modules/interfaces/interfaceDbManagement.py
+++ b/modules/interfaces/interfaceDbManagement.py
@@ -990,6 +990,10 @@ class ComponentObjects:
If pagination is provided: PaginatedResult with items and metadata
"""
def _convertFileItems(files):
+ from modules.workflows.automation2.workflowArtifactVisibility import (
+ suppress_workflow_file_in_workspace_ui,
+ )
+
fileItems = []
for file in files:
try:
@@ -1002,6 +1006,8 @@ class ComponentObjects:
fileName = file.get("fileName")
if not fileName or fileName == "None":
continue
+ if suppress_workflow_file_in_workspace_ui(file):
+ continue
if file.get("scope") is None:
file["scope"] = "personal"
diff --git a/modules/routes/routeAutomationWorkspace.py b/modules/routes/routeAutomationWorkspace.py
index b742d7ea..32624363 100644
--- a/modules/routes/routeAutomationWorkspace.py
+++ b/modules/routes/routeAutomationWorkspace.py
@@ -26,6 +26,7 @@ from modules.features.graphicalEditor.datamodelFeatureGraphicalEditor import (
AutoWorkflow,
)
from modules.features.graphicalEditor.interfaceFeatureGraphicalEditor import graphicalEditorDatabase
+from modules.workflows.automation2.workflowArtifactVisibility import suppress_workflow_file_in_workspace_ui
from modules.shared.i18nRegistry import apiRouteContext
routeApiMsg = apiRouteContext("routeAutomationWorkspace")
@@ -265,7 +266,8 @@ def getWorkspaceRunDetail(
logger.warning("getWorkspaceRunDetail: file lookup failed: %s", e)
def _resolveFileList(ids: set[str]) -> list[dict]:
- return [fileMetaById[fid] for fid in ids if fid in fileMetaById]
+ rows = [dict(fileMetaById[fid]) for fid in ids if fid in fileMetaById]
+ return [m for m in rows if not suppress_workflow_file_in_workspace_ui(m)]
assignedFileIds: set[str] = set()
for step, (inputIds, outputIds) in zip(steps, perStepFileIds):
diff --git a/modules/serviceCenter/services/serviceExtraction/extractors/extractorPdf.py b/modules/serviceCenter/services/serviceExtraction/extractors/extractorPdf.py
index 1df4e7fc..657e3fc6 100644
--- a/modules/serviceCenter/services/serviceExtraction/extractors/extractorPdf.py
+++ b/modules/serviceCenter/services/serviceExtraction/extractors/extractorPdf.py
@@ -73,7 +73,30 @@ class PdfExtractor(Extractor):
))
return parts
- # Extract text per page with PyMuPDF (same lib as in-place search - ensures extraction matches PDF text layer)
+ file_name = context.get("fileName", "document.pdf")
+ ordered_ok = False
+ try:
+ doc = fitz.open(stream=fileBytes, filetype="pdf")
+ for page_index in range(len(doc)):
+ page = doc[page_index]
+ page_parts = self._extract_page_blocks_in_reading_order(
+ page,
+ doc,
+ page_index=page_index,
+ root_id=rootId,
+ file_name=file_name,
+ )
+ if page_parts:
+ parts.extend(page_parts)
+ ordered_ok = True
+ doc.close()
+ except Exception:
+ ordered_ok = False
+
+ if ordered_ok and any(getattr(p, "typeGroup", "") in ("text", "image") for p in parts):
+ return parts
+
+ parts = [parts[0]] # keep container only; fall back below
try:
doc = fitz.open(stream=fileBytes, filetype="pdf")
for i in range(len(doc)):
@@ -174,4 +197,196 @@ class PdfExtractor(Extractor):
return parts
+    @staticmethod
+    def _text_from_text_block(block: Dict[str, Any]) -> str:
+        """Flatten one text block of a ``get_text("dict")`` result into plain text.
+
+        Each dict entry under ``"lines"`` becomes one output line made of the
+        concatenated ``"text"`` values of its dict spans; non-dict lines and spans
+        are ignored. Lines are joined with ``"\\n"`` and the result is stripped.
+        """
+
+        def _render_line(line_dict: Dict[str, Any]) -> str:
+            # A missing or falsy span text contributes an empty string.
+            pieces = line_dict.get("spans") or []
+            return "".join(
+                str(piece.get("text") or "")
+                for piece in pieces
+                if isinstance(piece, dict)
+            )
+
+        rendered = [
+            _render_line(entry)
+            for entry in (block.get("lines") or [])
+            if isinstance(entry, dict)
+        ]
+        return "\n".join(rendered).strip()
+    @staticmethod
+    def _bbox_center(bbox: Any) -> tuple[float, float]:
+        """Return the centre of *bbox* as ``(x, y)`` - x first, then y.
+
+        *bbox* must be a list or tuple whose first four items are ``x0, y0, x1, y1``;
+        anything else yields ``(0.0, 0.0)``.
+        """
+        usable = isinstance(bbox, (list, tuple)) and len(bbox) >= 4
+        if not usable:
+            return 0.0, 0.0
+        xa, ya, xb, yb = (float(coord) for coord in bbox[:4])
+        return (xa + xb) / 2.0, (ya + yb) / 2.0
+
+    @staticmethod
+    def _point_inside_bbox(x: float, y: float, bbox: Any) -> bool:
+        """Tell whether the point ``(x, y)`` lies inside *bbox*, edges included.
+
+        *bbox* must be a list or tuple starting with ``x0, y0, x1, y1``; any other
+        value is treated as "no box" and yields ``False``.
+        """
+        if isinstance(bbox, (list, tuple)) and len(bbox) >= 4:
+            x_lo, y_lo, x_hi, y_hi = (float(edge) for edge in bbox[:4])
+            return (x_lo <= x <= x_hi) and (y_lo <= y <= y_hi)
+        return False
+
+    def _extract_page_blocks_in_reading_order(
+        self,
+        page: Any,
+        doc: Any,
+        *,
+        page_index: int,
+        root_id: str,
+        file_name: str,
+    ) -> List[ContentPart]:
+        """Emit text/image/table parts in on-page reading order (top-to-bottom, left-to-right).
+
+        Collects three kinds of candidates for one page, orders them by the centre of
+        their bounding box (y first, then x) and converts each into a ``ContentPart``
+        parented to ``root_id``:
+
+        * tables from ``page.find_tables()``, serialised via ``_rows_to_csv_payload``;
+        * text blocks (``type == 0``) from ``page.get_text("dict", sort=True)``; a text
+          block whose centre lies inside a detected table bbox is skipped (presumably
+          so table cell text is not emitted a second time);
+        * image blocks (``type == 1``); bytes come from the block itself or, when
+          absent, from ``doc.extract_image(xref)``. Image data is base64-encoded.
+
+        Args:
+            page: Page object providing ``find_tables()`` and ``get_text()``.
+            doc: Owning document; only used for ``extract_image``.
+            page_index: Zero-based page index; labels/locations use ``page_index + 1``.
+            root_id: ``parentId`` assigned to every emitted part.
+            file_name: Stored as ``contextRef.containerPath`` of every part.
+
+        Returns:
+            The page's parts in reading order; empty when nothing usable was found.
+            Failures of table detection or text extraction are swallowed (best effort).
+        """
+        # Each entry is (center_y, center_x, kind, payload); y comes first so that the
+        # sort below yields top-to-bottom, then left-to-right.
+        entries: List[tuple[float, float, str, Dict[str, Any]]] = []
+        table_bboxes: List[Any] = []
+
+        try:
+            table_finder = page.find_tables()
+            for ti, tab in enumerate(getattr(table_finder, "tables", []) or []):
+                try:
+                    matrix = tab.extract()
+                except Exception:
+                    matrix = None
+                if not matrix:
+                    continue
+                csv_data = self._rows_to_csv_payload(matrix)
+                if not csv_data.strip():
+                    continue
+                bbox = getattr(tab, "bbox", None)
+                if bbox is not None:
+                    table_bboxes.append(bbox)
+                # _bbox_center returns (x, y) - unpack in that order.
+                cx, cy = self._bbox_center(bbox)
+                entries.append((cy, cx, "table", {
+                    "label": f"table_{page_index + 1}_{ti}",
+                    "data": csv_data,
+                    "table_index": ti,
+                }))
+        except Exception:
+            # Best effort: a page without usable table detection still yields text/images.
+            pass
+
+        try:
+            page_dict = page.get_text("dict", sort=True)
+        except Exception:
+            page_dict = None
+        blocks = page_dict.get("blocks") if isinstance(page_dict, dict) else None
+        if isinstance(blocks, list):
+            text_block_no = 0
+            image_no = 0
+            for block in blocks:
+                if not isinstance(block, dict):
+                    continue
+                bbox = block.get("bbox")
+                # _bbox_center returns (x, y) - unpack in that order.
+                cx, cy = self._bbox_center(bbox)
+                btype = block.get("type")
+                if btype == 0:
+                    # Text whose centre falls inside a table is left to the table part.
+                    if any(self._point_inside_bbox(cx, cy, tb) for tb in table_bboxes):
+                        continue
+                    text = self._text_from_text_block(block)
+                    if not text:
+                        continue
+                    label = f"page_{page_index + 1}" if text_block_no == 0 else f"page_{page_index + 1}_t{text_block_no}"
+                    entries.append((cy, cx, "text", {
+                        "label": label,
+                        "data": text,
+                        "text_block_no": text_block_no,
+                    }))
+                    text_block_no += 1
+                    continue
+                if btype != 1:
+                    continue
+                img_bytes = block.get("image")
+                ext = str(block.get("ext") or "png").lower()
+                mime = f"image/{ext}"
+                if not img_bytes:
+                    # No inline bytes: try to resolve the image through its xref.
+                    xref = block.get("xref")
+                    if xref is not None:
+                        try:
+                            extracted = doc.extract_image(int(xref))
+                            img_bytes = extracted.get("image", b"")
+                            ext = str(extracted.get("ext") or ext).lower()
+                            mime = f"image/{ext}"
+                        except Exception:
+                            img_bytes = b""
+                if not img_bytes:
+                    continue
+                entries.append((cy, cx, "image", {
+                    "label": f"image_{page_index + 1}_{image_no}",
+                    "mime": mime,
+                    "bytes": img_bytes,
+                    "image_no": image_no,
+                }))
+                image_no += 1
+
+        # Sort on the coordinates only; the payload dicts are not comparable.
+        entries.sort(key=lambda item: (item[0], item[1]))
+        out: List[ContentPart] = []
+        for _y, _x, kind, payload in entries:
+            if kind == "text":
+                tbno = int(payload.get("text_block_no") or 0)
+                text = str(payload.get("data") or "")
+                out.append(ContentPart(
+                    id=makeId(),
+                    parentId=root_id,
+                    label=str(payload.get("label") or f"page_{page_index + 1}"),
+                    typeGroup="text",
+                    mimeType="text/plain",
+                    data=text,
+                    metadata={
+                        "pages": 1,
+                        "pageIndex": page_index,
+                        "size": len(text.encode("utf-8")),
+                        "contextRef": {
+                            "containerPath": file_name,
+                            "location": f"page:{page_index + 1}/block:{tbno}",
+                            "pageIndex": page_index,
+                        },
+                    },
+                ))
+            elif kind == "table":
+                ti = int(payload.get("table_index") or 0)
+                csv_data = str(payload.get("data") or "")
+                out.append(ContentPart(
+                    id=makeId(),
+                    parentId=root_id,
+                    label=str(payload.get("label") or f"table_{page_index + 1}_{ti}"),
+                    typeGroup="table",
+                    mimeType="text/csv",
+                    data=csv_data,
+                    metadata={
+                        "pageIndex": page_index,
+                        "size": len(csv_data.encode("utf-8")),
+                        "contextRef": {
+                            "containerPath": file_name,
+                            "location": f"page:{page_index + 1}/table:{ti}",
+                            "pageIndex": page_index,
+                        },
+                    },
+                ))
+            elif kind == "image":
+                ino = int(payload.get("image_no") or 0)
+                img_bytes = payload.get("bytes") or b""
+                mime = str(payload.get("mime") or "image/png")
+                out.append(ContentPart(
+                    id=makeId(),
+                    parentId=root_id,
+                    label=str(payload.get("label") or f"image_{page_index + 1}_{ino}"),
+                    typeGroup="image",
+                    mimeType=mime,
+                    # Binary image data is carried as base64 text.
+                    data=base64.b64encode(img_bytes).decode("utf-8"),
+                    metadata={
+                        "pageIndex": page_index,
+                        "size": len(img_bytes),
+                        "contextRef": {
+                            "containerPath": file_name,
+                            "location": f"page:{page_index + 1}/image:{ino}",
+                            "pageIndex": page_index,
+                        },
+                    },
+                ))
+        return out
+
+    @staticmethod
+    def _rows_to_csv_payload(rows: List[List[Any]]) -> str:
+        """Serialise a table matrix as CSV text with every field quoted.
+
+        Each cell is stringified, embedded double quotes are doubled, and the cell is
+        wrapped in double quotes; cells are joined with ``","`` and rows with ``"\\n"``
+        (no trailing newline). ``None`` cells become empty fields.
+
+        Args:
+            rows: Table rows; each row is a sequence of cell values.
+
+        Returns:
+            The CSV payload, or ``""`` when *rows* is empty.
+        """
+        lines: List[str] = []
+        for row in rows:
+            # Only None means "empty": a plain ``c or ""`` would also blank legitimate
+            # falsy values such as 0 or False.
+            cells = ["" if c is None else str(c).replace('"', '""') for c in (row or [])]
+            lines.append(",".join(f'"{c}"' for c in cells))
+        return "\n".join(lines)
diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py
index f75a5108..7ec05c5c 100644
--- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py
+++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py
@@ -670,7 +670,7 @@ class RendererPdf(BaseRenderer):
runType = run.get("type", "text")
value = self._escapeReportlabXml(run.get("value", ""))
if runType == "text":
- parts.append(value)
+ parts.append(value.replace("\n", "<br/>"))
elif runType == "bold":
parts.append(f"{value}")
elif runType == "italic":
@@ -691,6 +691,7 @@ class RendererPdf(BaseRenderer):
if not text:
return ""
s = self._escapeReportlabXml(text)
+ s = s.replace("\n", "<br/>")
s = _re_pdf.sub(r"\*\*(.+?)\*\*", r"\1", s, flags=_re_pdf.DOTALL)
s = _re_pdf.sub(r"__(.+?)__", r"\1", s, flags=_re_pdf.DOTALL)
s = _re_pdf.sub(r"(?\1", s)
diff --git a/modules/workflows/automation2/executionEngine.py b/modules/workflows/automation2/executionEngine.py
index 5f6a8592..f68a3feb 100644
--- a/modules/workflows/automation2/executionEngine.py
+++ b/modules/workflows/automation2/executionEngine.py
@@ -217,6 +217,30 @@ def _serializableOutputs(nodeOutputs: Dict[str, Any]) -> Dict[str, Any]:
return _stripBinaryValues(cleaned)
+def _merge_node_parameters_into_snap(
+    snap: Optional[Dict[str, Any]],
+    *,
+    node_id: Optional[str],
+    context: Optional[Dict[str, Any]],
+) -> Dict[str, Any]:
+    """Copy wire snapshot and attach **nodeParameters** from the graph definition (by ``node_id``).
+
+    Uses ``context['graphNodesById']`` populated at executeGraph start - stable even when
+    per-step node dict references differ. Field name is ``nodeParameters`` (no leading
+    underscore) so it survives consumers that hide ``_*`` keys.
+
+    Args:
+        snap: Input snapshot to copy (shallow); ``None`` is treated as empty.
+        node_id: Id of the node whose parameters should be attached.
+        context: Execution context expected to hold ``graphNodesById``.
+
+    Returns:
+        A new dict. It equals the copied snapshot when ``node_id`` is falsy, the
+        context/map is missing or not a dict, or the node is unknown; otherwise it
+        additionally carries ``nodeParameters`` (a shallow copy, ``{}`` when the
+        node's ``parameters`` is missing or not a dict). *snap* is never mutated.
+    """
+    merged: Dict[str, Any] = dict(snap or {})
+    if not node_id or not isinstance(context, dict):
+        return merged
+    cmap = context.get("graphNodesById")
+    if not isinstance(cmap, dict):
+        return merged
+    gnode = cmap.get(node_id)
+    if gnode is None:
+        # The map's keys are normalised with str(); retry so a non-string id still matches.
+        gnode = cmap.get(str(node_id))
+    if not isinstance(gnode, dict):
+        return merged
+    params = gnode.get("parameters")
+    # Logging helper: malformed parameters must not raise, fall back to an empty dict.
+    merged["nodeParameters"] = dict(params) if isinstance(params, dict) else {}
+    return merged
+
+
def _emitStepEvent(runId: str, stepData: Dict[str, Any]) -> None:
"""Emit a step-log SSE event to any listening client for this run."""
try:
@@ -319,18 +343,20 @@ async def _ge_log_node_finished(
loop_index: Optional[int] = None,
loop_node_id: Optional[str] = None,
loop_item: Optional[Any] = None,
+ exec_context: Optional[Dict[str, Any]] = None,
) -> None:
"""Append one execution line + one workflow-context snapshot (NDJSON)."""
if file_logger is None or not run_id:
return
ts = _ge_iso_timestamp()
+ snap = _merge_node_parameters_into_snap(input_snap, node_id=node_id, context=exec_context)
exec_rec: Dict[str, Any] = {
"timestamp": ts,
"runId": run_id,
"nodeId": node_id,
"nodeType": node_type,
"status": status,
- "input": _stripBinaryValues(dict(input_snap or {})),
+ "input": _stripBinaryValues(snap),
}
if skip_reason:
exec_rec["skipReason"] = skip_reason
@@ -470,6 +496,7 @@ async def _run_post_loop_done_nodes(
for _sSrc, _, _ in connectionMap.get(_dnid, []):
if _sSrc in nodeOutputs:
_skipSnap[_sSrc] = nodeOutputs[_sSrc]
+ _skipSnap = _merge_node_parameters_into_snap(_skipSnap, node_id=_dnid, context=context)
_skId = _createStepLog(automation2_interface, runId, _dnid, _dn.get("type", ""), status="skipped", inputSnapshot=_skipSnap)
if _skId:
_updateStepLog(automation2_interface, _skId, "skipped")
@@ -478,6 +505,7 @@ async def _run_post_loop_done_nodes(
run_id=runId,
node_outputs=nodeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=_dnid,
node_type=_dn.get("type", ""),
status="skipped",
@@ -494,6 +522,7 @@ async def _run_post_loop_done_nodes(
for _src, _, _ in connectionMap.get(_dnid, []):
if _src in nodeOutputs:
_dIn[_src] = nodeOutputs[_src]
+ _dIn = _merge_node_parameters_into_snap(_dIn, node_id=_dnid, context=context)
_dStepId = _createStepLog(automation2_interface, runId, _dnid, _dn.get("type", ""), "running", _dIn)
try:
_dres, _dRetry = await _executeWithRetry(_dexec, _dn, context)
@@ -509,6 +538,7 @@ async def _run_post_loop_done_nodes(
run_id=runId,
node_outputs=nodeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=_dnid,
node_type=_dn.get("type", ""),
status="completed",
@@ -525,6 +555,7 @@ async def _run_post_loop_done_nodes(
run_id=runId,
node_outputs=nodeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=_dnid,
node_type=_dn.get("type", ""),
status="completed",
@@ -540,6 +571,7 @@ async def _run_post_loop_done_nodes(
run_id=runId,
node_outputs=nodeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=_dnid,
node_type=_dn.get("type", ""),
status="completed",
@@ -556,6 +588,7 @@ async def _run_post_loop_done_nodes(
run_id=runId,
node_outputs=nodeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=_dnid,
node_type=_dn.get("type", ""),
status="failed",
@@ -573,6 +606,7 @@ async def _run_post_loop_done_nodes(
run_id=runId,
node_outputs=nodeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=_dnid,
node_type=_dn.get("type", ""),
status="failed",
@@ -622,6 +656,8 @@ async def executeGraph(
from modules.workflows.automation2.pickNotPushMigration import (
materializeConnectionRefs,
materializePrimaryTextHandover,
+ materializeRecommendedDataPickRef,
+ normalizeFileCreatePresentationRefs,
)
from modules.workflows.automation2.featureInstanceRefMigration import (
materializeFeatureInstanceRefs,
@@ -635,6 +671,8 @@ async def executeGraph(
graph = materializeFeatureInstanceRefs(graph)
graph = materializeConnectionRefs(graph)
graph = materializePrimaryTextHandover(graph)
+ graph = materializeRecommendedDataPickRef(graph)
+ graph = normalizeFileCreatePresentationRefs(graph)
nodeTypeIds = _getNodeTypeIds(services)
logger.debug("executeGraph nodeTypeIds (%d): %s", len(nodeTypeIds), sorted(nodeTypeIds))
errors = validateGraph(graph, nodeTypeIds)
@@ -720,6 +758,9 @@ async def executeGraph(
env_for_run = normalize_run_envelope(run_envelope, user_id=userId)
+ graph_nodes_by_id: Dict[str, Any] = {
+ str(n["id"]): n for n in nodes if n.get("id")
+ }
context = {
"workflowId": workflowId,
"instanceId": instanceId,
@@ -732,6 +773,7 @@ async def executeGraph(
"_runId": runId,
"_orderedNodes": ordered,
"runEnvelope": env_for_run,
+ "graphNodesById": graph_nodes_by_id,
}
# Lets graph actions (e.g. ``context.setContext`` human-task mode) call
# ``createTask`` / ``updateRun`` without threading the interface through services.
@@ -803,6 +845,7 @@ async def executeGraph(
for _rSrc, _, _ in connectionMap.get(bnid, []):
if _rSrc in nodeOutputs:
_rInputSnap[_rSrc] = nodeOutputs[_rSrc]
+ _rInputSnap = _merge_node_parameters_into_snap(_rInputSnap, node_id=bnid, context=context)
_rStepId = _createStepLog(automation2_interface, runId, bnid, body_node.get("type", ""), "running", _rInputSnap)
try:
result, _rRetry = await _executeWithRetry(executor, body_node, context)
@@ -821,6 +864,7 @@ async def executeGraph(
run_id=runId,
node_outputs=nodeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=bnid,
node_type=body_node.get("type", ""),
status="completed",
@@ -844,6 +888,7 @@ async def executeGraph(
run_id=runId,
node_outputs=nodeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=bnid,
node_type=body_node.get("type", ""),
status="completed",
@@ -867,6 +912,7 @@ async def executeGraph(
run_id=runId,
node_outputs=nodeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=bnid,
node_type=body_node.get("type", ""),
status="completed",
@@ -886,6 +932,7 @@ async def executeGraph(
run_id=runId,
node_outputs=nodeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=bnid,
node_type=body_node.get("type", ""),
status="failed",
@@ -906,6 +953,7 @@ async def executeGraph(
run_id=runId,
node_outputs=nodeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=bnid,
node_type=body_node.get("type", ""),
status="failed",
@@ -979,6 +1027,7 @@ async def executeGraph(
for _sSrc, _, _ in connectionMap.get(nodeId, []):
if _sSrc in nodeOutputs:
_skipInputSnap[_sSrc] = nodeOutputs[_sSrc]
+ _skipInputSnap = _merge_node_parameters_into_snap(_skipInputSnap, node_id=nodeId, context=context)
_skipStepId = _createStepLog(automation2_interface, runId, nodeId, nodeType, status="skipped", inputSnapshot=_skipInputSnap)
if _skipStepId:
_updateStepLog(automation2_interface, _skipStepId, "skipped")
@@ -987,6 +1036,7 @@ async def executeGraph(
run_id=runId,
node_outputs=nodeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=nodeId,
node_type=nodeType,
status="skipped",
@@ -1015,6 +1065,7 @@ async def executeGraph(
for _lSrc, _, _ in connectionMap.get(nodeId, []):
if _lSrc in nodeOutputs:
_loopInputSnap[_lSrc] = nodeOutputs[_lSrc]
+ _loopInputSnap = _merge_node_parameters_into_snap(_loopInputSnap, node_id=nodeId, context=context)
_stepId = _createStepLog(automation2_interface, runId, nodeId, nodeType, "running", _loopInputSnap)
result = await executor.execute(node, context)
items = result.get("items") or []
@@ -1068,6 +1119,9 @@ async def executeGraph(
for _bSnapSrc, _, _ in connectionMap.get(bnid, []):
if _bSnapSrc in _activeOutputs:
_bInputSnapAlways[_bSnapSrc] = _activeOutputs[_bSnapSrc]
+ _bInputSnapAlways = _merge_node_parameters_into_snap(
+ _bInputSnapAlways, node_id=bnid, context=context
+ )
_bStepId = None
if not _batchMode or _idx == 0 or _idx == len(items) - 1:
_bStepId = _createStepLog(
@@ -1100,6 +1154,7 @@ async def executeGraph(
run_id=runId,
node_outputs=_activeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=bnid,
node_type=body_node.get("type", ""),
status="completed",
@@ -1123,6 +1178,7 @@ async def executeGraph(
run_id=runId,
node_outputs=_activeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=bnid,
node_type=body_node.get("type", ""),
status="completed",
@@ -1148,6 +1204,7 @@ async def executeGraph(
run_id=runId,
node_outputs=_activeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=bnid,
node_type=body_node.get("type", ""),
status="completed",
@@ -1168,6 +1225,7 @@ async def executeGraph(
run_id=runId,
node_outputs=_activeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=bnid,
node_type=body_node.get("type", ""),
status="failed",
@@ -1189,6 +1247,7 @@ async def executeGraph(
run_id=runId,
node_outputs=_activeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=bnid,
node_type=body_node.get("type", ""),
status="failed",
@@ -1296,6 +1355,7 @@ async def executeGraph(
run_id=runId,
node_outputs=nodeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=nodeId,
node_type=nodeType,
status="completed",
@@ -1314,6 +1374,7 @@ async def executeGraph(
for src, _, _ in connectionMap.get(nodeId, []):
if src in nodeOutputs:
_inputSnap[src] = nodeOutputs[src]
+ _inputSnap = _merge_node_parameters_into_snap(_inputSnap, node_id=nodeId, context=context)
_stepId = _createStepLog(automation2_interface, runId, nodeId, nodeType, "running", _inputSnap)
result, retryCount = await _executeWithRetry(executor, node, context)
result = _normalizeResult(result, nodeType)
@@ -1328,6 +1389,7 @@ async def executeGraph(
run_id=runId,
node_outputs=nodeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=nodeId,
node_type=nodeType,
status="completed",
@@ -1342,6 +1404,7 @@ async def executeGraph(
for src, _, _ in connectionMap.get(nodeId, []):
if src in nodeOutputs:
_inputSnap[src] = nodeOutputs[src]
+ _inputSnap = _merge_node_parameters_into_snap(_inputSnap, node_id=nodeId, context=context)
_stepId = _createStepLog(automation2_interface, runId, nodeId, nodeType, "running", _inputSnap)
result, retryCount = await _executeWithRetry(executor, node, context)
result = _normalizeResult(result, nodeType)
@@ -1356,6 +1419,7 @@ async def executeGraph(
run_id=runId,
node_outputs=nodeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=nodeId,
node_type=nodeType,
status="completed",
@@ -1384,6 +1448,7 @@ async def executeGraph(
run_id=runId,
node_outputs=nodeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=nodeId,
node_type=nodeType,
status="completed",
@@ -1411,6 +1476,7 @@ async def executeGraph(
run_id=runId,
node_outputs=nodeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=nodeId,
node_type=nodeType,
status="completed",
@@ -1471,6 +1537,7 @@ async def executeGraph(
run_id=runId,
node_outputs=nodeOutputs,
run_envelope=context.get("runEnvelope"),
+ exec_context=context,
node_id=nodeId,
node_type=nodeType,
status="failed",
diff --git a/modules/workflows/automation2/executors/actionNodeExecutor.py b/modules/workflows/automation2/executors/actionNodeExecutor.py
index 5d03298f..d5a3fce8 100644
--- a/modules/workflows/automation2/executors/actionNodeExecutor.py
+++ b/modules/workflows/automation2/executors/actionNodeExecutor.py
@@ -21,10 +21,40 @@ from modules.features.graphicalEditor.portTypes import (
from modules.serviceCenter.services.serviceSubscription.mainServiceSubscription import SubscriptionInactiveException as _SubscriptionInactiveException
from modules.serviceCenter.services.serviceBilling.mainServiceBilling import BillingContextError as _BillingContextError
from modules.workflows.automation2.executors.inputExecutor import PauseForHumanTaskError
+from modules.workflows.methods.methodContext.actions.extractContent import (
+ PRESENTATION_KIND,
+ build_presentation_envelope_from_plain_text,
+ presentation_dict_without_meta,
+ presentation_response_text,
+)
logger = logging.getLogger(__name__)
_FILE_CREATE_CTX_LOG_MAX = 500
+_SKIP_UNIFIED_PRESENTATION_NODES = frozenset({"context.extractContent"})
+
+
+def _attach_unified_presentation_data(out: Dict[str, Any], *, node_type: str) -> None:
+ """Ensure ``out[\"data\"]`` carries ``context.extractContent.presentation.v1`` for ``file.create``."""
+ if node_type in _SKIP_UNIFIED_PRESENTATION_NODES:
+ return
+ data = out.get("data")
+ if isinstance(data, dict) and data.get("kind") == PRESENTATION_KIND:
+ return
+ text = str(out.get("response") or "").strip()
+ if not text and isinstance(data, dict):
+ text = str(data.get("response") or "").strip()
+ if not text:
+ return
+ pres = build_presentation_envelope_from_plain_text(text, source_name=node_type or "content")
+ if not pres:
+ return
+ meta: Dict[str, Any] = {"actionType": node_type}
+ if isinstance(data, dict):
+ prev = data.get("_meta")
+ if isinstance(prev, dict):
+ meta = {**prev, **meta}
+ out["data"] = {**pres, "_meta": meta}
def _truncate_for_log(val: Any, max_len: int = _FILE_CREATE_CTX_LOG_MAX) -> str:
@@ -147,6 +177,41 @@ def _image_documents_from_docs_list(docs_list: list) -> list:
]
+def _image_refs_from_extract_node_data(extract_data: Any) -> list:
+ """Synthetic image document dicts from ``context.extractContent`` ``_meta.persistedImageArtifacts``."""
+ if not isinstance(extract_data, dict):
+ return []
+ meta = extract_data.get("_meta")
+ if not isinstance(meta, dict):
+ return []
+ arts = meta.get("persistedImageArtifacts")
+ if not isinstance(arts, list):
+ return []
+ out: list = []
+ for a in arts:
+ if not isinstance(a, dict):
+ continue
+ fid = a.get("fileId")
+ if not fid:
+ continue
+ out.append(
+ {
+ "documentName": a.get("fileName") or f"extract_image_{fid}",
+ "mimeType": str(a.get("mimeType") or "application/octet-stream"),
+ "documentData": None,
+ "fileId": str(fid),
+ "_hasBinaryData": True,
+ "validationMetadata": {
+ "actionType": "context.extractContent",
+ "handoverRole": "extractedMedia",
+ "suppressInWorkflowFileLists": True,
+ "sourcePartId": a.get("sourcePartId"),
+ },
+ }
+ )
+ return out
+
+
_USER_CONNECTION_ID_RE = re.compile(
r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$",
re.IGNORECASE,
@@ -679,9 +744,12 @@ class ActionNodeExecutor:
extractedContext = ""
rd_early = getattr(result, "data", None)
if isinstance(rd_early, dict):
- _r = rd_early.get("response")
- if _r is not None and str(_r).strip():
- extractedContext = str(_r).strip()
+ if rd_early.get("kind") == PRESENTATION_KIND:
+ extractedContext = presentation_response_text(presentation_dict_without_meta(rd_early)).strip()
+ else:
+ _r = rd_early.get("response")
+ if _r is not None and str(_r).strip():
+ extractedContext = str(_r).strip()
promptText = str(resolvedParams.get("aiPrompt") or resolvedParams.get("prompt") or "").strip()
resultData = getattr(result, "data", None)
@@ -728,9 +796,17 @@ class ActionNodeExecutor:
out.setdefault("context", ctx_str if ctx_str else "")
rsp = str(out.get("response") or "").strip()
if not rsp:
- out["response"] = extractedContext or ""
+ if nodeType != "context.extractContent":
+ out["response"] = extractedContext or ""
+ else:
+ out["response"] = ""
if result.success:
img_only = _image_documents_from_docs_list(docsList)
+ if (
+ nodeType == "context.extractContent"
+ and isinstance(result.data, dict)
+ ):
+ img_only = list(img_only) + _image_refs_from_extract_node_data(result.data)
# mergeContext packs iterated payloads under ``data.merged`` only — ``documents``
# on the ActionResult is empty, so image sidecars live on ``merged.imageDocumentsOnly``.
if (
@@ -766,6 +842,12 @@ class ActionNodeExecutor:
_attachConnectionProvenance(cr_out, resolvedParams, outputSchema, chatService, self.services)
return normalizeToSchema(cr_out, outputSchema)
+ if nodeType == "context.extractContent":
+ out.pop("documents", None)
+
+ if outputSchema in ("AiResult", "ActionResult") and result.success:
+ _attach_unified_presentation_data(out, node_type=nodeType)
+
_attachConnectionProvenance(out, resolvedParams, outputSchema, chatService, self.services)
# When the node declares ``surfaceDataAsTopLevel`` (typical for
diff --git a/modules/workflows/automation2/executors/flowExecutor.py b/modules/workflows/automation2/executors/flowExecutor.py
index e0836db8..e64b1212 100644
--- a/modules/workflows/automation2/executors/flowExecutor.py
+++ b/modules/workflows/automation2/executors/flowExecutor.py
@@ -295,14 +295,42 @@ class FlowExecutor:
def _normalize_loop_items(self, raw: Any) -> List[Any]:
"""Coerce resolved `items` into a list (lists, dict children, or scalars)."""
if isinstance(raw, list):
- return raw
+ return self._expand_presentation_lines_loop_items(raw)
if isinstance(raw, dict):
children = raw.get("children")
if isinstance(children, list) and len(children) > 0:
- return children
- return [{"name": k, "value": v} for k, v in raw.items()]
+ return self._expand_presentation_lines_loop_items(children)
+ items = [{"name": k, "value": v} for k, v in raw.items()]
+ return self._expand_presentation_lines_loop_items(items)
return [raw] if raw is not None else []
+ def _expand_presentation_lines_loop_items(self, items: List[Any]) -> List[Any]:
+ """When looping ``presentation.files`` in ``lines`` mode, iterate per slot (e.g. CSV row)."""
+ if not items:
+ return items
+ expanded: List[Any] = []
+ saw_lines_bucket = False
+ for it in items:
+ if not isinstance(it, dict):
+ expanded.append(it)
+ continue
+ val = it.get("value")
+ if not isinstance(val, dict) or val.get("outputMode") != "lines":
+ expanded.append(it)
+ continue
+ data = val.get("data")
+ if not isinstance(data, list) or len(data) <= 1:
+ expanded.append(it)
+ continue
+ saw_lines_bucket = True
+ base_name = str(it.get("name") or val.get("sourceFileName") or "line")
+ for idx, slot in enumerate(data):
+ if not isinstance(slot, dict):
+ continue
+ sid = str(slot.get("id") or slot.get("label") or idx)
+ expanded.append({"name": f"{base_name}:{sid}", "value": slot})
+ return expanded if saw_lines_bucket else items
+
def _apply_iteration_mode(self, items: List[Any], mode: str, stride: int) -> List[Any]:
"""Select which elements to iterate over (backend-defined modes)."""
if not items:
diff --git a/modules/workflows/automation2/graphUtils.py b/modules/workflows/automation2/graphUtils.py
index 65f7084c..54cff2a1 100644
--- a/modules/workflows/automation2/graphUtils.py
+++ b/modules/workflows/automation2/graphUtils.py
@@ -435,6 +435,13 @@ def resolveParameterReferences(value: Any, nodeOutputs: Dict[str, Any]) -> Any:
data = data.get("data", data)
plist = list(path)
resolved = _get_by_path(data, plist)
+ if resolved is None:
+ from modules.workflows.automation2.pickNotPushMigration import (
+ remap_stale_presentation_ref_path,
+ )
+ alt_path = remap_stale_presentation_ref_path(plist)
+ if alt_path != plist:
+ resolved = _get_by_path(data, alt_path)
if resolved is None and isinstance(data, dict) and plist:
if plist[0] == "payload" and len(plist) > 1:
# Strip explicit "payload" prefix (legacy DataPicker paths)
@@ -491,13 +498,10 @@ def resolveParameterReferences(value: Any, nodeOutputs: Dict[str, Any]) -> Any:
# contextBuilder: list where every item is a `{"type":"ref",...}` envelope.
# Resolve each part; a single ref preserves the resolved type (str, list, dict).
if value and all(isinstance(v, dict) and v.get("type") == "ref" for v in value):
- from modules.workflows.methods.methodAi._common import serialize_context
-
resolved_parts = [resolveParameterReferences(v, nodeOutputs) for v in value]
if len(resolved_parts) == 1:
return resolved_parts[0]
- parts = [serialize_context(p, prefer_handover_primary=True) for p in resolved_parts]
- return "\n\n".join(p for p in parts if p)
+ return resolved_parts
return [resolveParameterReferences(v, nodeOutputs) for v in value]
return value
diff --git a/modules/workflows/automation2/pickNotPushMigration.py b/modules/workflows/automation2/pickNotPushMigration.py
index b6da00a2..0bc7072f 100644
--- a/modules/workflows/automation2/pickNotPushMigration.py
+++ b/modules/workflows/automation2/pickNotPushMigration.py
@@ -5,6 +5,8 @@ Graph helpers for Pick-not-Push: materialize typed DataRefs before executeGraph
- ``materializeConnectionRefs``: empty ``connectionReference`` from upstream connection provenance.
- ``materializePrimaryTextHandover``: parameters whose static definition includes
``graphInherit.kind == "primaryTextRef"`` (canonical paths: ``PRIMARY_TEXT_HANDOVER_REF_PATH``).
+- ``materializeRecommendedDataPickRef``: parameters with ``graphInherit.kind == "recommendedDataPickRef"``
+ use the upstream output port's ``dataPickOptions`` entry with ``recommended: true``.
Runtime: executeGraph deep-copies the version graph and applies these passes in order.
"""
@@ -12,7 +14,7 @@ from __future__ import annotations
import copy
import logging
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
from modules.features.graphicalEditor.nodeDefinitions import STATIC_NODE_TYPES
from modules.features.graphicalEditor.portTypes import (
@@ -154,3 +156,133 @@ def materializePrimaryTextHandover(graph: Dict[str, Any]) -> Dict[str, Any]:
)
return g
+
+
+def _recommended_data_pick_path(out_port: Dict[str, Any]) -> Optional[List[Any]]:
+ opts = out_port.get("dataPickOptions") if isinstance(out_port, dict) else None
+ if not isinstance(opts, list):
+ return None
+ for opt in opts:
+ if not isinstance(opt, dict):
+ continue
+ if opt.get("recommended") is True:
+ path = opt.get("path")
+ if isinstance(path, list) and path:
+ return list(path)
+ return None
+
+
def materializeRecommendedDataPickRef(graph: Dict[str, Any]) -> Dict[str, Any]:
    """Fill empty ``recommendedDataPickRef`` parameters with a typed data ref.

    Works on a deep copy of ``graph``; the input is never mutated. For each
    node parameter whose definition declares
    ``graphInherit.kind == "recommendedDataPickRef"`` and whose current value
    counts as empty, the node connected to the declared input port is looked
    up and the recommended ``dataPickOptions`` path of its output port is
    written as a ref. ``contextBuilder`` parameters receive a one-element
    list, all others the bare ref.

    Args:
        graph: Workflow graph with ``nodes`` and ``connections``.

    Returns:
        The deep-copied graph, with materialized parameters where applicable.
    """
    result = copy.deepcopy(graph)
    graph_nodes: List[Dict[str, Any]] = result.get("nodes") or []
    graph_connections = result.get("connections") or []
    if not graph_nodes:
        return result

    connection_map = buildConnectionMap(graph_connections)
    nodes_by_id = {entry["id"]: entry for entry in graph_nodes if entry.get("id")}

    for target in graph_nodes:
        target_id = target.get("id")
        target_type = target.get("type")
        if not (target_id and target_type):
            continue
        target_def = _NODE_DEF_BY_ID.get(target_type)
        if not target_def:
            continue

        values = target.get("parameters")
        if not isinstance(values, dict):
            # Normalize a missing or malformed parameter container on the copy.
            values = {}
            target["parameters"] = values

        for param_def in target_def.get("parameters") or []:
            inherit = param_def.get("graphInherit")
            if not (isinstance(inherit, dict) and inherit.get("kind") == "recommendedDataPickRef"):
                continue
            param_name = param_def.get("name")
            if not param_name:
                continue
            port_index = int(inherit.get("port", 0))
            if not _slot_empty_for_primary_text_inherit(values.get(param_name)):
                # An explicit value always wins over inheritance.
                continue

            sources = getInputSources(target_id, connection_map)
            if port_index not in sources:
                continue
            source_id, _ = sources[port_index]
            source_node = nodes_by_id.get(source_id) or {}
            source_def = _NODE_DEF_BY_ID.get(source_node.get("type") or "")
            if not source_def:
                continue

            # NOTE(review): the source's output port is looked up with the
            # target's *input* port index; the second element returned by
            # ``getInputSources`` is ignored. Confirm this is intended.
            port_def = (source_def.get("outputPorts") or {}).get(port_index, {}) or {}
            if not isinstance(port_def, dict):
                port_def = (source_def.get("outputPorts") or {}).get(0, {}) or {}
            pick_path = _recommended_data_pick_path(port_def if isinstance(port_def, dict) else {})
            if not pick_path:
                continue

            reference = _data_ref(source_id, pick_path)
            is_context_builder = param_def.get("frontendType") == "contextBuilder"
            values[param_name] = [reference] if is_context_builder else reference
            logger.debug(
                "materializeRecommendedDataPickRef: %s.%s -> ref %s path=%s",
                target_id,
                param_name,
                source_id,
                pick_path,
            )

    return result
+
+
# Legacy text-handover ref paths that now resolve to the unified presentation ``data``.
_STALE_FILE_CREATE_CONTEXT_PATHS = frozenset({
    ("responseData",),
    ("response",),
    ("merged",),
    ("documents", 0, "documentData"),
})


def remap_stale_presentation_ref_path(path: List[Any]) -> List[Any]:
    """Map legacy text-handover paths to unified presentation ``data``.

    Args:
        path: Ref path segments as stored in a graph ref.

    Returns:
        ``["data"]`` when ``path`` matches one of the legacy paths exactly,
        otherwise a shallow copy of ``path``. The input list is never mutated.
    """
    try:
        is_stale = tuple(path) in _STALE_FILE_CREATE_CONTEXT_PATHS
    except TypeError:
        # A path with an unhashable segment (dict/list) cannot be hashed for
        # the set lookup and can never equal a legacy path; treat it as
        # "not stale" instead of failing the caller's reference resolution.
        is_stale = False
    return ["data"] if is_stale else list(path)
+
+
+def _normalize_presentation_refs_in_value(val: Any) -> Any:
+ """Rewrite stale ref paths inside ``contextBuilder`` lists or bare refs."""
+ if isinstance(val, dict) and val.get("type") == "ref":
+ path = val.get("path")
+ if isinstance(path, list) and path:
+ new_path = remap_stale_presentation_ref_path(path)
+ if new_path != path:
+ return {**val, "path": new_path}
+ return val
+ if isinstance(val, list):
+ return [_normalize_presentation_refs_in_value(item) for item in val]
+ return val
+
+
def normalizeFileCreatePresentationRefs(graph: Dict[str, Any]) -> Dict[str, Any]:
    """Remap legacy ``file.create`` context refs to unified presentation ``data``.

    Operates on a deep copy of ``graph``. Only nodes of type ``file.create``
    with a dict ``parameters`` and a non-empty ``context`` are inspected.
    ``context`` is replaced only when normalization produces a different value.

    Returns:
        The deep-copied graph, with remapped ``context`` refs where applicable.
    """
    result = copy.deepcopy(graph)
    for entry in result.get("nodes") or []:
        if entry.get("type") != "file.create":
            continue
        parameters = entry.get("parameters")
        if not isinstance(parameters, dict):
            continue
        current = parameters.get("context")
        if current in (None, "", []):
            continue
        rewritten = _normalize_presentation_refs_in_value(current)
        if rewritten == current:
            continue
        parameters["context"] = rewritten
        logger.debug(
            "normalizeFileCreatePresentationRefs: %s.context remapped to presentation data ref",
            entry.get("id"),
        )
    return result
diff --git a/modules/workflows/automation2/workflowArtifactVisibility.py b/modules/workflows/automation2/workflowArtifactVisibility.py
new file mode 100644
index 00000000..0eb8d4bd
--- /dev/null
+++ b/modules/workflows/automation2/workflowArtifactVisibility.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2025 Patrick Motsch
+"""Heuristics for hiding internal workflow artefacts from user-facing file lists."""
+
+from __future__ import annotations
+
+from typing import Any, Mapping, Optional
+
+
# Tag marking a file row as an internal workflow artefact.
_WORKFLOW_INTERNAL_FILE_TAG = "_workflowInternal"


def suppress_workflow_file_in_workspace_ui(meta: Optional[Mapping[str, Any]]) -> bool:
    """Decide whether a file row should be hidden from user-facing file lists.

    A row is hidden when any of the following holds:

    - ``tags`` is a list containing ``_workflowInternal``;
    - the lower-cased ``fileName`` contains ``extracted_content_transient``
      or ``extract_media_``;
    - ``suppressInWorkflowFileLists`` is exactly ``True``.

    Args:
        meta: File metadata mapping; anything that is not a ``Mapping`` is
            never suppressed.

    Returns:
        ``True`` when the row should not be shown, ``False`` otherwise.
    """
    if not isinstance(meta, Mapping):
        return False

    tag_list = meta.get("tags")
    if isinstance(tag_list, list) and _WORKFLOW_INTERNAL_FILE_TAG in tag_list:
        return True

    lowered_name = str(meta.get("fileName") or "").lower()
    internal_markers = ("extracted_content_transient", "extract_media_")
    if any(marker in lowered_name for marker in internal_markers):
        return True

    # Explicit opt-out flag; truthy values other than ``True`` do not count.
    return meta.get("suppressInWorkflowFileLists") is True
diff --git a/modules/workflows/methods/methodAi/_common.py b/modules/workflows/methods/methodAi/_common.py
index 60609104..27b36663 100644
--- a/modules/workflows/methods/methodAi/_common.py
+++ b/modules/workflows/methods/methodAi/_common.py
@@ -30,6 +30,49 @@ def _handover_response_plain(val: Any) -> Optional[str]:
return str(r).strip().lstrip("\ufeff")
def primary_text_for_prompt_context(val: Any) -> str:
    """Flatten ActionResult / presentation / merge payloads to readable text.

    Handling by input type:

    - ``None``: empty string.
    - ``str``: stripped, with leading BOM characters removed. Text that looks
      like a JSON array or object is decoded and flattened recursively; if
      that raises ``JSONDecodeError``/``TypeError``/``ValueError``, the
      stripped text itself is returned.
    - ``list``: every element is flattened; non-empty results are joined with
      a blank line.
    - ``dict``: the plain handover response if there is one, else text joined
      from a dict ``data`` payload, else text joined from the dict itself.
    - anything else: ``str(val)`` stripped.
    """
    if val is None:
        return ""

    if isinstance(val, str):
        text = val.strip().lstrip("\ufeff")
        if not text:
            return ""
        looks_like_json = len(text) >= 2 and (
            (text.startswith("[") and text.endswith("]"))
            or (text.startswith("{") and text.endswith("}"))
        )
        if looks_like_json:
            # The recursive call stays inside ``try`` so failures while
            # flattening the decoded value also fall back to the raw text.
            try:
                return primary_text_for_prompt_context(json.loads(text))
            except (json.JSONDecodeError, TypeError, ValueError):
                return text
        return text

    if isinstance(val, list):
        flattened = (primary_text_for_prompt_context(member) for member in val)
        return "\n\n".join(piece for piece in flattened if piece)

    if isinstance(val, dict):
        handover_text = _handover_response_plain(val)
        if handover_text is not None:
            return handover_text

        # Imported at call time rather than at module import.
        from modules.workflows.methods.methodContext.actions.extractContent import (
            joined_text_from_extract_node_data,
        )

        nested = val.get("data")
        if isinstance(nested, dict):
            nested_text = (joined_text_from_extract_node_data(nested) or "").strip()
            if nested_text:
                return nested_text
        return (joined_text_from_extract_node_data(val) or "").strip()

    return str(val).strip()
+
+
def serialize_context(val: Any, *, prefer_handover_primary: bool = False) -> str:
"""Convert any context value to a readable string for use in AI prompts.
diff --git a/modules/workflows/methods/methodBase.py b/modules/workflows/methods/methodBase.py
index 02cae134..e666beff 100644
--- a/modules/workflows/methods/methodBase.py
+++ b/modules/workflows/methods/methodBase.py
@@ -202,7 +202,15 @@ class MethodBase:
validated = {}
# System parameters that should always be preserved, even if not in paramDefs
- systemParams = ['parentOperationId', 'expectedDocumentFormats']
+ systemParams = [
+ 'parentOperationId',
+ 'expectedDocumentFormats',
+ # Injected by automation2 ActionNodeExecutor (graph node definitions)
+ '_runContext',
+ '_upstreamPayload',
+ '_branchInputs',
+ '_workflowNodeId',
+ ]
for sysParam in systemParams:
if sysParam in parameters:
validated[sysParam] = parameters[sysParam]
diff --git a/modules/workflows/methods/methodContext/actions/extractContent.py b/modules/workflows/methods/methodContext/actions/extractContent.py
index 758d772e..866a0568 100644
--- a/modules/workflows/methods/methodContext/actions/extractContent.py
+++ b/modules/workflows/methods/methodContext/actions/extractContent.py
@@ -3,28 +3,27 @@
"""context.extractContent — extracts content without AI.
-Returns a unified handover compatible with AiResult-style downstream wiring:
+``ActionResult.data`` is one **presentation** envelope (`schemaVersion`, `kind`,
+`outputMode`, `fileOrder`, `files`) matching node parameters plus ``_meta`` (operation refs,
+persisted-image trace, presentation config).
-- ``documents[0]``: structured JSON (`context.extractContent.handover.v1`); image ``parts``
- keep metadata but omit pixel data; each dropped image references
- ``handoverMediaDocumentName`` matching a sibling blob document.
-- ``documents[1:]``: each extracted image as its own binary ``ActionDocument`` (like
- ``ai.process`` artefact outputs).
-- Root ``presentation`` inside the JSON (`schemaVersion`, per-file modes/lines/pages/chunks/…)
- — built from filtered ``parts`` without changing extractor output.
-- ``ActionResult.data["response"]`` plus normalized executor field ``response``: flat text derived
- from ``presentation`` (downstream-friendly wie zuvor fuer ``file.create`` / ``primaryTextRef``)."""
+Raw ``ContentExtracted`` is not emitted on the automation output; persistence still uses it
+internally when ``_runContext`` enables image uploads.
+
+Older ``kind: context.extractContent.handover.v1`` is legacy-only (merge/tests), not produced here."""
import base64 as _b64
import binascii as _binascii
+import copy
import csv
+import json
import logging
import re
-from io import StringIO
+from io import BytesIO, StringIO
import time
from typing import Any, Dict, List, Optional, Tuple
-from modules.datamodels.datamodelChat import ActionResult, ActionDocument
+from modules.datamodels.datamodelChat import ActionResult
from modules.datamodels.datamodelDocref import coerceDocumentReferenceList
from modules.datamodels.datamodelExtraction import ContentExtracted, ExtractionOptions
@@ -32,9 +31,26 @@ logger = logging.getLogger(__name__)
_UNSAFE_FILE_KEY = re.compile(r"[^\w\-.\(\)\[\]%@+]")
-HANDOVER_KIND = "context.extractContent.handover.v1"
+# Bumped when ``ActionResult.data`` shape changes (`_meta.extractPayloadSchemaVersion`).
+EXTRACT_PAYLOAD_SCHEMA_VERSION = 3
+
+LEGACY_HANDOVER_KIND = "context.extractContent.handover.v1"
+HANDOVER_KIND = LEGACY_HANDOVER_KIND
+PRESENTATION_KIND = "context.extractContent.presentation.v1"
_CONTENT_FILTER_OPTIONS = ("all", "textOnly", "imagesOnly", "noImages")
+_CONTENT_FILTER_BY_LOWER = {k.lower(): k for k in _CONTENT_FILTER_OPTIONS}
+
+
+def _canonical_content_filter(raw: Any) -> str:
+ """Map JSON / UI values to canonical ``_CONTENT_FILTER_OPTIONS`` keys (case-insensitive)."""
+ s = str(raw if raw is not None else "all").strip()
+ if not s:
+ return "all"
+ if s in _CONTENT_FILTER_OPTIONS:
+ return s
+ return _CONTENT_FILTER_BY_LOWER.get(s.lower()) or "all"
+
PRESENTATION_SCHEMA_VERSION = 1
@@ -73,6 +89,39 @@ def _apply_content_filter(payload: Dict[str, Any], content_filter: str) -> Dict[
return result
def _filter_extractions_by_content_filter(
    extracted_results: List[ContentExtracted],
    content_filter: str,
) -> List[ContentExtracted]:
    """Return extraction copies whose ``parts`` are trimmed by ``content_filter``.

    Parts are selected by their ``typeGroup`` attribute (missing or falsy
    counts as ``""``):

    - ``"all"``: the input list itself is returned, nothing is copied;
    - ``"textOnly"``: keeps ``text``, ``table`` and ``structure`` parts;
    - ``"imagesOnly"``: keeps ``image`` parts;
    - ``"noImages"``: keeps everything except ``image`` parts;
    - any other value: keeps all parts, but still returns copies.
    """
    if content_filter == "all":
        return extracted_results

    def type_group(part: Any) -> str:
        return getattr(part, "typeGroup", None) or ""

    if content_filter == "textOnly":
        keep = lambda part: type_group(part) in ("text", "table", "structure")  # noqa: E731
    elif content_filter == "imagesOnly":
        keep = lambda part: type_group(part) == "image"  # noqa: E731
    elif content_filter == "noImages":
        keep = lambda part: type_group(part) != "image"  # noqa: E731
    else:
        keep = None

    trimmed: List[ContentExtracted] = []
    for extraction in extracted_results:
        kept_parts = list(extraction.parts or [])
        if keep is not None:
            kept_parts = [part for part in kept_parts if keep(part)]
        trimmed.append(extraction.model_copy(update={"parts": kept_parts}))
    return trimmed
+
+
def _serialize_content_extracted_for_output(ec: ContentExtracted) -> Dict[str, Any]:
    """Dump ``ec`` to a plain structure without ``None`` values or ``summary``.

    Uses ``model_dump(mode="json", exclude_none=True)`` when the object offers
    it and falls back to ``dict(exclude_none=True)`` otherwise. ``summary`` is
    removed when the dump is a dict.
    """
    if hasattr(ec, "model_dump"):
        dumped = ec.model_dump(mode="json", exclude_none=True)
    else:
        dumped = ec.dict(exclude_none=True)
    if isinstance(dumped, dict):
        dumped.pop("summary", None)
    return dumped
+
+
def _default_extraction_options() -> ExtractionOptions:
"""No merge — keep all parts for downstream JSON selection."""
return ExtractionOptions(
@@ -177,7 +226,13 @@ def _parse_non_negative_int(value: Any, default: int) -> int:
def parse_presentation_parameters(parameters: Dict[str, Any]) -> Dict[str, Any]:
- """Defaults match ``context.extractContent`` node schema in ``context.py``."""
+ """Defaults match ``context.extractContent`` node schema in ``context.py``.
+
+ ``contentFilter=all`` plus legacy default ``pdfExtractMode=text`` would drop
+ image parts from **presentation** even though extraction kept them — we
+ coerce that combination to ``all``. When ``pdfExtractMode`` is omitted,
+ sensible defaults derive from ``contentFilter``.
+ """
output_mode = str(parameters.get("outputMode") or "lines").strip().lower()
if output_mode not in _OUTPUT_MODES:
output_mode = "lines"
@@ -187,9 +242,23 @@ def parse_presentation_parameters(parameters: Dict[str, Any]) -> Dict[str, Any]:
chunk_unit = str(parameters.get("chunkSizeUnit") or "tokens").strip().lower()
if chunk_unit not in _CHUNK_UNITS:
chunk_unit = "tokens"
- pdf_mode = str(parameters.get("pdfExtractMode") or "text").strip().lower()
- if pdf_mode not in _PDF_EXTRACT_PRESENTATION_MODES:
+ content_filter = _canonical_content_filter(parameters.get("contentFilter"))
+ raw_pdf = parameters.get("pdfExtractMode")
+ raw_pdf_str = str(raw_pdf).strip() if raw_pdf is not None else ""
+ if raw_pdf_str:
+ pdf_mode = raw_pdf_str.lower()
+ elif content_filter == "imagesOnly":
+ pdf_mode = "images"
+ elif content_filter in ("textOnly", "noImages"):
pdf_mode = "text"
+ else:
+ pdf_mode = "all"
+ if pdf_mode not in _PDF_EXTRACT_PRESENTATION_MODES:
+ pdf_mode = "all"
+ if content_filter == "all" and pdf_mode == "text":
+ pdf_mode = "all"
+ elif content_filter == "imagesOnly" and pdf_mode in ("text", "tables"):
+ pdf_mode = "images"
return {
"outputMode": output_mode,
"splitBy": split_by,
@@ -430,56 +499,191 @@ def _base_item_meta(
return m
def summarize_presentation_payload(presentation: Dict[str, Any]) -> Dict[str, Any]:
    """Describe a presentation envelope compactly, without its full ``data``.

    For every dict bucket in ``presentation["files"]`` the summary records the
    bucket's ``outputMode`` and the Python type name of its ``data``, plus:

    - for string data: length, the first 200 characters and, for strings
      longer than 320 characters, the last 120 (otherwise ``None``);
    - for list data: length and, when non-empty, the type name of the first
      element with either its length and first 160 characters (strings) or
      its first 12 keys (dicts).

    Non-dict buckets are skipped; file keys are stringified.

    Returns:
        Dict with ``schemaVersion``, ``kind``, ``rootOutputMode``,
        ``fileOrder`` and the per-file ``files`` shapes.
    """

    def describe(bucket: Dict[str, Any]) -> Dict[str, Any]:
        mode = bucket.get("outputMode")
        payload = bucket.get("data")
        info: Dict[str, Any] = {"outputMode": mode, "dataPythonType": type(payload).__name__}
        if isinstance(payload, str):
            info["stringLength"] = len(payload)
            info["head"] = payload[:200]
            info["tail"] = payload[-120:] if len(payload) > 320 else None
        elif isinstance(payload, list):
            info["listLength"] = len(payload)
            if payload:
                first = payload[0]
                info["firstElementPythonType"] = type(first).__name__
                if isinstance(first, str):
                    info["firstStringLength"] = len(first)
                    info["firstHead"] = first[:160]
                elif isinstance(first, dict):
                    info["firstKeys"] = list(first.keys())[:12]
        return info

    file_shapes: Dict[str, Any] = {
        str(file_key): describe(bucket)
        for file_key, bucket in (presentation.get("files") or {}).items()
        if isinstance(bucket, dict)
    }
    return {
        "schemaVersion": presentation.get("schemaVersion"),
        "kind": presentation.get("kind"),
        "rootOutputMode": presentation.get("outputMode"),
        "fileOrder": presentation.get("fileOrder"),
        "files": file_shapes,
    }
+
+
+def _joined_text_from_content_extracted_serial(items: List[Any]) -> str:
+ """Plain text from serialized ``contentExtracted`` list (dict items with ``parts``)."""
+ chunks: List[str] = []
+ for item in items:
+ if not isinstance(item, dict):
+ continue
+ for p in item.get("parts") or []:
+ if not isinstance(p, dict):
+ continue
+ if not _part_carries_plain_text(p):
+ continue
+ raw = p.get("data")
+ if raw is None:
+ continue
+ s = str(raw).strip()
+ if s:
+ chunks.append(s)
+ return "\n\n".join(chunks)
+
+
def presentation_dict_without_meta(data: Dict[str, Any]) -> Dict[str, Any]:
    """Return a shallow copy of ``data`` without its ``_meta`` entry.

    The input is not modified; all other keys keep their order and values.
    """
    stripped = dict(data)
    stripped.pop("_meta", None)
    return stripped
+
+
def joined_text_from_extract_node_data(data: Any) -> str:
    """Derive plain text from an extract node's ``data`` payload.

    Tried in this order:

    1. ``kind == PRESENTATION_KIND``: text of the presentation envelope with
       ``_meta`` stripped;
    2. a non-empty ``contentExtracted`` list: text joined from its parts;
    3. a ``files`` key that is not ``None``: text from the handover payload.

    Returns:
        The derived text, or ``""`` for non-dict input and for dicts that
        match none of the shapes above.
    """
    if not isinstance(data, dict):
        return ""
    if data.get("kind") == PRESENTATION_KIND:
        envelope = presentation_dict_without_meta(data)
        return presentation_response_text(envelope)

    serialized = data.get("contentExtracted")
    if isinstance(serialized, list) and serialized:
        return _joined_text_from_content_extracted_serial(serialized)

    if data.get("files") is None:
        return ""
    return _joined_text_from_handover_payload(data)
+
+
def presentation_response_text(
presentation: Dict[str, Any],
- payload: Dict[str, Any],
+ file_order_hint: Optional[Any] = None,
) -> str:
"""Derive flattened ``response`` text from ``presentation.files``."""
-
files_section = presentation.get("files") or {}
- ordered = payload.get("fileOrder")
- keys: List[str] = ordered if isinstance(ordered, list) and ordered else list(files_section.keys())
- chunks: List[str] = []
+ keys: List[str] = []
+ if isinstance(file_order_hint, dict):
+ ord0 = file_order_hint.get("fileOrder")
+ keys = ord0 if isinstance(ord0, list) and ord0 else []
+ elif isinstance(file_order_hint, list):
+ keys = file_order_hint
+ if not keys:
+ po = presentation.get("fileOrder")
+ keys = po if isinstance(po, list) and po else list(files_section.keys())
+ chunks_out: List[str] = []
for fk in keys:
bucket = files_section.get(fk)
if not isinstance(bucket, dict):
continue
- mode = (bucket.get("outputMode") or "").strip()
- if mode == "blob":
- t = bucket.get("text")
- if isinstance(t, str) and t.strip():
- chunks.append(t.strip())
- elif mode == "lines":
- for it in bucket.get("items") or []:
+ texts = _flat_text_segments_from_presentation_bucket(bucket)
+ chunks_out.extend(texts)
+ return "\n\n".join(chunks_out)
+
+
+def _flat_text_segments_from_presentation_bucket(bucket: Dict[str, Any]) -> List[str]:
+ """Derive plain-text segments from ``presentation.files[*]``.
+
+ Prefer **data** when set (canonical shape for tooling):
+ - ``blob``: ``data`` is a single ``str``.
+ - ``lines``: ``data`` is a ``list[dict]``, one dict per extraction part (order preserved): same
+ fields as serialised ``ContentPart`` (image ``data`` redacted) plus ``lines`` (split/filtered text;
+ empty for non-text/table/structure plain-text parts).
+ - ``chunks``: ``data`` is ``list[str]``.
+ - ``pages``: ``data`` is ``list[{"pageIndex": int, "lines": [...]}]``.
+ - ``structured``: ``data`` mirrors ``items`` — list of part-like dicts; text from ``data`` fields.
+ """
+ if not isinstance(bucket, dict):
+ return []
+ raw_data = bucket.get("data")
+ mode = str(bucket.get("outputMode") or "").strip()
+
+ if isinstance(raw_data, str):
+ s = raw_data.strip()
+ return [s] if s else []
+ if isinstance(raw_data, list):
+ extracted: List[str] = []
+ for el in raw_data:
+ if isinstance(el, str):
+ lt = el.strip()
+ if lt:
+ extracted.append(lt)
+ elif isinstance(el, dict):
+ if el.get("type") == "image":
+ continue
+ if el.get("typeGroup") == "image":
+ continue
+ line_block = el.get("lines")
+ if isinstance(line_block, list):
+ for ln in line_block:
+ if isinstance(ln, str):
+ s = ln.strip()
+ if s:
+ extracted.append(s)
+ elif ln is not None:
+ s = str(ln).strip()
+ if s:
+ extracted.append(s)
+ elif _part_carries_plain_text(el):
+ d = el.get("data")
+ if isinstance(d, str):
+ s = d.strip()
+ if s:
+ extracted.append(s)
+ if extracted:
+ return extracted
+
+ # Legacy layouts (omit ``data`` or empty list interpreted as fallback)
+ out: List[str] = []
+ if mode == "blob":
+ t = bucket.get("text")
+ if isinstance(t, str) and t.strip():
+ out.append(t.strip())
+ elif mode == "lines":
+ for it in bucket.get("items") or []:
+ if isinstance(it, dict):
+ tx = it.get("text")
+ if isinstance(tx, str) and tx.strip():
+ out.append(tx.strip())
+ elif mode == "pages":
+ for pg in bucket.get("pages") or []:
+ if not isinstance(pg, dict):
+ continue
+ for it in pg.get("items") or []:
if isinstance(it, dict):
tx = it.get("text")
if isinstance(tx, str) and tx.strip():
- chunks.append(tx.strip())
- elif mode == "pages":
- for pg in bucket.get("pages") or []:
- if not isinstance(pg, dict):
- continue
- for it in pg.get("items") or []:
- if isinstance(it, dict):
- tx = it.get("text")
- if isinstance(tx, str) and tx.strip():
- chunks.append(tx.strip())
- elif mode == "chunks":
- for it in bucket.get("chunks") or []:
- if isinstance(it, dict):
- tx = it.get("text")
- if isinstance(tx, str) and tx.strip():
- chunks.append(tx.strip())
- elif mode == "structured":
- for it in bucket.get("items") or []:
- if not isinstance(it, dict):
- continue
+ out.append(tx.strip())
+ elif mode == "chunks":
+ for it in bucket.get("chunks") or []:
+ if isinstance(it, dict):
+ tx = it.get("text")
+ if isinstance(tx, str) and tx.strip():
+ out.append(tx.strip())
+ elif mode == "structured":
+ for it in bucket.get("items") or []:
+ if isinstance(it, dict):
if not _part_carries_plain_text(it):
continue
tx = it.get("data")
if isinstance(tx, str) and tx.strip():
- chunks.append(tx.strip())
- return "\n\n".join(chunks)
+ out.append(tx.strip())
+ return out
def build_presentation_for_payload(payload: Dict[str, Any], cfg: Dict[str, Any]) -> Dict[str, Any]:
@@ -499,13 +703,75 @@ def build_presentation_for_payload(payload: Dict[str, Any], cfg: Dict[str, Any])
out_files[fk] = _build_file_presentation(source_name, parts, cfg)
return {
"schemaVersion": PRESENTATION_SCHEMA_VERSION,
- "kind": "context.extractContent.presentation.v1",
+ "kind": PRESENTATION_KIND,
"outputMode": cfg["outputMode"],
"fileOrder": keys,
"files": out_files,
}
def build_presentation_for_serial_extractions(
    serial_docs: List[Dict[str, Any]],
    source_file_names: List[str],
    cfg: Dict[str, Any],
) -> Dict[str, Any]:
    """Build a presentation envelope from serialized extraction dicts.

    Args:
        serial_docs: One serialized extraction per source document; entries
            that are not dicts are skipped (their index is still consumed).
        source_file_names: File names matched to ``serial_docs`` by index;
            a missing name is treated as ``""``.
        cfg: Presentation settings; ``pdfExtractMode``,
            ``markdownPreserveFormatting`` and ``outputMode`` are read here,
            and the whole mapping is forwarded to ``_build_file_presentation``.

    Returns:
        The envelope dict with ``schemaVersion``, ``kind``, ``outputMode``,
        ``fileOrder`` and ``files``.
    """
    seen_key_counts: Dict[str, int] = {}
    file_order: List[str] = []
    files: Dict[str, Any] = {}
    name_count = len(source_file_names)
    for index, document in enumerate(serial_docs):
        if not isinstance(document, dict):
            continue
        source_name = str(source_file_names[index]) if index < name_count else ""
        file_key = _file_json_key(source_name, index, seen_key_counts)
        file_order.append(file_key)
        candidate_parts = document.get("parts") or []
        dict_parts = [part for part in candidate_parts if isinstance(part, dict)]
        visible_parts = _presentation_filter_parts(dict_parts, cfg["pdfExtractMode"])
        _apply_markdown_presentation_on_parts(visible_parts, cfg["markdownPreserveFormatting"])
        files[file_key] = _build_file_presentation(source_name, visible_parts, cfg)
    envelope: Dict[str, Any] = {
        "schemaVersion": PRESENTATION_SCHEMA_VERSION,
        "kind": PRESENTATION_KIND,
        "outputMode": cfg["outputMode"],
        "fileOrder": file_order,
        "files": files,
    }
    return envelope
+
+
def build_presentation_for_extractions(
    extracted_results: List[ContentExtracted],
    source_file_names: List[str],
    cfg: Dict[str, Any],
) -> Dict[str, Any]:
    """Build a presentation envelope from ``ContentExtracted`` results.

    Each result is serialized with ``_serialize_content_extracted_for_output``
    and the list is handed to ``build_presentation_for_serial_extractions``
    together with the unchanged ``source_file_names`` and ``cfg``.
    """
    serialized = list(map(_serialize_content_extracted_for_output, extracted_results))
    return build_presentation_for_serial_extractions(serialized, source_file_names, cfg)
+
+
def build_presentation_envelope_from_plain_text(
    text: str,
    *,
    source_name: str = "content",
    output_mode: str = "lines",
) -> Dict[str, Any]:
    """Wrap plain text in a presentation envelope with a single text part.

    Args:
        text: The text to wrap; ``None`` or blank text yields ``{}``.
        source_name: Used as the part label, the part id suffix and the
            source file name; blank values fall back to ``"content"``.
        output_mode: Passed to ``parse_presentation_parameters`` as
            ``outputMode``.

    Returns:
        The envelope built by ``build_presentation_for_serial_extractions``,
        or an empty dict when there is no text.
    """
    body = (text or "").strip()
    if not body:
        return {}
    label = (source_name or "content").strip() or "content"
    cfg = parse_presentation_parameters({"outputMode": output_mode})
    text_part: Dict[str, Any] = {
        "typeGroup": "text",
        "mimeType": "text/plain",
        "data": body,
        "label": label,
        "id": f"plain_{label}",
    }
    return build_presentation_for_serial_extractions([{"parts": [text_part]}], [label], cfg)
+
+
def _join_parts_plain_text(parts: List[Dict[str, Any]]) -> str:
blocks: List[str] = []
for p in parts:
@@ -529,6 +795,138 @@ def _redact_large_part_payload(p: Dict[str, Any]) -> Dict[str, Any]:
return pc
def _attach_redacted_image_parts(bucket: Dict[str, Any], parts: List[Dict[str, Any]]) -> None:
    """Store redacted copies of all image parts under ``bucket["imageParts"]``.

    A part counts as an image when its stripped ``typeGroup`` equals
    ``"image"``. Each image part is copied with ``_copy_part`` and passed
    through ``_redact_large_part_payload``. ``bucket`` is modified in place
    and left unchanged when ``parts`` contains no image.
    """
    redacted_images: List[Dict[str, Any]] = []
    for part in parts:
        if (part.get("typeGroup") or "").strip() != "image":
            continue
        redacted_images.append(_redact_large_part_payload(_copy_part(part)))
    if redacted_images:
        bucket["imageParts"] = redacted_images
+
+
def _line_segments_filtered_for_text_fragment(fragment: str, cfg: Dict[str, Any]) -> List[str]:
    """Split one text fragment into filtered line segments.

    The stripped fragment is segmented with ``_segment_merged_text`` using
    ``cfg["splitBy"]`` and then filtered by ``_apply_line_filters`` according
    to ``cfg["filterEmptyLines"]`` and ``cfg["trimWhitespace"]``. A blank
    fragment returns ``[]`` without reading ``cfg``.
    """
    stripped = fragment.strip()
    if not stripped:
        return []
    return _apply_line_filters(
        _segment_merged_text(stripped, cfg["splitBy"]),
        filter_empty=cfg["filterEmptyLines"],
        trim_ws=cfg["trimWhitespace"],
    )
+
+
def _rows_to_csv_payload(rows: List[List[Any]]) -> str:
    """Render table rows as CSV text with every cell double-quoted.

    ``None`` cells become empty strings. Every other value is rendered with
    ``str()``, so falsy values such as ``0``, ``0.0`` or ``False`` keep their
    text instead of being blanked. Double quotes inside a cell are doubled.
    Rows are joined with a newline and no trailing newline is emitted.

    Args:
        rows: Sequence of rows, each a sequence of cell values.

    Returns:
        The CSV text; an empty string when ``rows`` is empty.
    """

    def _quote(cell: Any) -> str:
        # Only ``None`` means "no value"; ``cell or ""`` would also drop 0 / False.
        text = "" if cell is None else str(cell)
        return '"' + text.replace('"', '""') + '"'

    return "\n".join(",".join(_quote(cell) for cell in row) for row in rows)
+
+
def _table_matrix_from_csv(csv_text: str, *, header_row: bool) -> Optional[tuple[List[str], List[List[str]]]]:
    """Turn a CSV payload into ``(headers, body_rows)`` of strings.

    The text is parsed by ``_parse_csv_rows``; its result is read through the
    ``headers`` and ``rows`` keys. Rows may be dicts (keyed by header) or
    lists. When no headers are available they are taken from the keys of the
    first dict row, or synthesised as ``Column 1``, ``Column 2``, ... for list
    rows.

    Returns:
        ``None`` when parsing yields nothing, there are no rows, or no list
        row survives; otherwise the header list and the stringified body.
    """
    parsed = _parse_csv_rows(csv_text, header_row)
    if not parsed:
        return None
    column_names = list(map(str, parsed.get("headers") or []))
    rows = parsed.get("rows") or []
    if not rows:
        return None

    if isinstance(rows[0], dict):
        # Dict rows: the first row decides the columns when none were parsed.
        column_names = column_names or list(rows[0].keys())
        return column_names, [[str(row.get(name, "")) for name in column_names] for row in rows]

    matrix = [list(map(str, row)) for row in rows if isinstance(row, list)]
    if not matrix:
        return None
    if not column_names:
        column_names = [f"Column {number}" for number in range(1, len(matrix[0]) + 1)]
    return column_names, matrix
+
+
def _presentation_line_slot_from_part(part: Dict[str, Any], cfg: Dict[str, Any]) -> Dict[str, Any]:
    """Build the presentation slot for a single extraction part.

    The slot is a redacted copy of ``part`` plus a ``lines`` list. Table
    parts always get an empty ``lines`` list so their payload stays intact;
    parts accepted by ``_part_carries_plain_text`` get their ``data`` split
    and filtered; all other parts get an empty list.
    """
    slot = _redact_large_part_payload(_copy_part(part))
    is_table = (part.get("typeGroup") or "").strip() == "table"
    if not is_table and _part_carries_plain_text(part):
        slot["lines"] = _line_segments_filtered_for_text_fragment(str(part.get("data") or ""), cfg)
    else:
        slot["lines"] = []
    return slot
+
+
def _presentation_line_slots_from_part(part: Dict[str, Any], cfg: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Expand one extraction part into one or more presentation slots.

    Only a table part in ``lines`` output mode whose payload splits into more
    than one segment is expanded: each segment becomes its own text slot with
    id ``<part id>_line_<n>`` (1-based) and ``lines == [segment]``. Every
    other case returns the single slot from
    ``_presentation_line_slot_from_part``.
    """

    def _whole_part() -> List[Dict[str, Any]]:
        return [_presentation_line_slot_from_part(part, cfg)]

    if (part.get("typeGroup") or "").strip() != "table" or cfg.get("outputMode") != "lines":
        return _whole_part()

    csv_text = str(part.get("data") or "")
    if not csv_text.strip():
        return _whole_part()

    row_texts = _apply_line_filters(
        _segment_merged_text(csv_text, cfg["splitBy"]),
        filter_empty=cfg["filterEmptyLines"],
        trim_ws=cfg["trimWhitespace"],
    )
    if len(row_texts) <= 1:
        return _whole_part()

    base_id = str(part.get("id") or "table")
    row_label = str(part.get("label") or "row")
    slots: List[Dict[str, Any]] = []
    for number, row_text in enumerate(row_texts, start=1):
        row_part = _copy_part(part)
        # Each row is re-labelled as a plain-text part of its own.
        row_part.update({
            "typeGroup": "text",
            "mimeType": "text/plain",
            "data": row_text,
            "label": row_label,
            "id": f"{base_id}_line_{number}",
        })
        slot = _redact_large_part_payload(row_part)
        slot["lines"] = [row_text]
        slots.append(slot)
    return slots
+
+
def _presentation_image_marker_in_data(part: Dict[str, Any]) -> Dict[str, Any]:
    """Build a compact image reference dict from an image part.

    The marker always carries ``type``, ``typeGroup`` and ``partId``. The
    fields ``mimeType``, ``label``, ``embeddedImageFileId`` and
    ``embeddedImageFileName`` are copied from the redacted part when truthy.
    Page index and context-reference details found in ``metadata`` are
    collected under ``extra`` (omitted when empty).
    """
    redacted = _redact_large_part_payload(_copy_part(part))
    marker: Dict[str, Any] = {"type": "image", "typeGroup": "image", "partId": redacted.get("id")}

    optional_fields = (
        ("mimeType", lambda value: str(value).strip()),
        ("label", lambda value: value),
        ("embeddedImageFileId", str),
        ("embeddedImageFileName", str),
    )
    for field_name, convert in optional_fields:
        value = redacted.get(field_name)
        if value:
            marker[field_name] = convert(value)

    extra: Dict[str, Any] = {}
    metadata = redacted.get("metadata")
    if isinstance(metadata, dict):
        page_index = metadata.get("pageIndex")
        if page_index is not None:
            try:
                extra["pageIndex"] = int(page_index)
            except (TypeError, ValueError):
                # Keep the raw value when it cannot be read as an integer.
                extra["pageIndex"] = page_index
        context_ref = metadata.get("contextRef")
        if isinstance(context_ref, dict):
            for source_key, target_key in (
                ("location", "contextLocation"),
                ("containerPath", "contextContainerPath"),
            ):
                value = context_ref.get(source_key)
                if value:
                    extra[target_key] = value
    if extra:
        marker["extra"] = extra
    return marker
+
+
def _build_file_presentation(
source_file_name: str,
parts: List[Dict[str, Any]],
@@ -547,15 +945,33 @@ def _build_file_presentation(
"outputMode": output_mode,
"sourceFileName": source_file_name or None,
}
- if csv_block is not None:
- base["csv"] = csv_block
if output_mode == "blob":
- base["text"] = merge_plain
+ chunks_blob: List[str] = []
+ for p in parts:
+ tg = (p.get("typeGroup") or "").strip()
+ if tg == "image":
+ m = _presentation_image_marker_in_data(p)
+ pid = str(m.get("partId") or "").strip()
+ chunks_blob.append(f"[image:{pid}]" if pid else "[image]")
+ continue
+ if _part_carries_plain_text(p):
+ raw = p.get("data")
+ if raw is None:
+ continue
+ s = str(raw).strip()
+ if not s:
+ continue
+ chunks_blob.append(s)
+ base["data"] = "\n\n".join(chunks_blob)
return base
if output_mode == "structured":
- base["items"] = [_redact_large_part_payload(_copy_part(p)) for p in parts]
+ if csv_block is not None:
+ base["csv"] = csv_block
+ items_list = [_redact_large_part_payload(_copy_part(p)) for p in parts]
+ base["items"] = items_list
+ base["data"] = list(items_list)
return base
if output_mode == "pages":
@@ -600,6 +1016,19 @@ def _build_file_presentation(
offset += len(seg) + 1
page_objs.append({"pageIndex": pi, "items": items})
base["pages"] = page_objs
+ base["data"] = [
+ {
+ "pageIndex": int(po["pageIndex"]),
+ "lines": [
+ str(it["text"])
+ for it in (po.get("items") or [])
+ if isinstance(it, dict) and isinstance(it.get("text"), str)
+ ],
+ }
+ for po in page_objs
+ if isinstance(po, dict)
+ ]
+ _attach_redacted_image_parts(base, parts)
return base
if output_mode == "chunks":
@@ -619,27 +1048,62 @@ def _build_file_presentation(
row["metadata"] = meta
chunk_objs.append(row)
base["chunks"] = chunk_objs
+ base["data"] = [str(row["text"]) for row in chunk_objs if isinstance(row.get("text"), str)]
+ _attach_redacted_image_parts(base, parts)
return base
- # lines (default): shared path with pages/chunks splitting
- segs = _segment_merged_text(merge_plain, cfg["splitBy"])
- segs = _apply_line_filters(
- segs,
- filter_empty=cfg["filterEmptyLines"],
- trim_ws=cfg["trimWhitespace"],
- )
- items: List[Dict[str, Any]] = []
- offset = 0
- for idx, seg in enumerate(segs, start=1):
- meta = _base_item_meta(source_file_name, cfg, segment_index=idx, offset_hint=offset)
- row = {"text": seg}
- if cfg["includeLineNumbers"]:
- row["lineNumber"] = idx
- if meta:
- row["metadata"] = meta
- items.append(row)
- offset += len(seg) + 1
- base["items"] = items
+ # lines (default): same part order/cardinality as extraction; segmentation inside each part.
+ slots: List[Dict[str, Any]] = []
+ for p in parts:
+ if isinstance(p, dict):
+ slots.extend(_presentation_line_slots_from_part(p, cfg))
+ base["data"] = slots
+ if cfg["includeLineNumbers"] or cfg["includeMetadata"]:
+ flat_items: List[Dict[str, Any]] = []
+ line_no = 0
+ seg_off = 0
+ for slot in slots:
+ tg_slot = (slot.get("typeGroup") or "").strip()
+ part_id = slot.get("id")
+ page_ix = _page_index_from_part(slot)
+
+ if tg_slot == "image":
+ line_no += 1
+ meta_i = _base_item_meta(
+ source_file_name,
+ cfg,
+ segment_index=line_no,
+ offset_hint=seg_off,
+ page_index=page_ix,
+ )
+ row_im: Dict[str, Any] = {"type": "image", "partId": slot.get("id"), "mimeType": slot.get("mimeType")}
+ if cfg["includeLineNumbers"]:
+ row_im["lineNumber"] = line_no
+ if meta_i:
+ row_im["metadata"] = meta_i
+ flat_items.append(row_im)
+ seg_off += 1
+ continue
+
+ for ln in slot.get("lines") or []:
+ if not isinstance(ln, str):
+ continue
+ line_no += 1
+ meta_t = _base_item_meta(
+ source_file_name,
+ cfg,
+ segment_index=line_no,
+ offset_hint=seg_off,
+ page_index=page_ix,
+ )
+ row_t: Dict[str, Any] = {"text": ln}
+ if cfg["includeLineNumbers"]:
+ row_t["lineNumber"] = line_no
+ if meta_t:
+ row_t["metadata"] = meta_t
+ flat_items.append(row_t)
+ seg_off += len(ln) + 1
+ base["items"] = flat_items
return base
@@ -657,88 +1121,118 @@ def _mime_to_file_extension(mime: str) -> str:
return mapping.get(m, m.rsplit("/", 1)[-1] if "/" in m else "bin")
-def _split_images_to_sidecar_documents(
- payload: Dict[str, Any],
+def _persist_extracted_image_parts(
+ content_extracted_serial: List[Dict[str, Any]],
*,
- document_name_stem: str,
-) -> Tuple[Dict[str, Any], List[ActionDocument]]:
- """
- Deep-copy handover JSON, clear image pixel data from ``parts``, attach
- ``handoverMediaDocumentName`` on each image part, emit binary ActionDocuments.
- """
- import copy
+ name_stem: str,
+ run_context: Optional[Dict[str, Any]],
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+ """Decode base64 image parts, persist bytes, replace with ``embeddedImageFileId``; return artifacts meta."""
+ artifacts: List[Dict[str, Any]] = []
+ if not run_context or not isinstance(run_context, dict):
+ logger.warning("extractContent image persist: _runContext missing — images not stored")
+ return content_extracted_serial, artifacts
+ mandate_id = run_context.get("mandateId")
+ instance_id = run_context.get("instanceId")
+ if not mandate_id or not instance_id:
+ logger.warning(
+ "extractContent image persist: mandateId/instanceId missing in _runContext (mandate=%r instance=%r)",
+ mandate_id,
+ instance_id,
+ )
+ return content_extracted_serial, artifacts
- bundle = copy.deepcopy(payload)
- files_section = bundle.get("files") or {}
- ordered = bundle.get("fileOrder")
- key_order: List[str] = ordered if isinstance(ordered, list) and ordered else list(files_section.keys())
- media_docs: List[ActionDocument] = []
- kind = bundle.get("kind") or HANDOVER_KIND
+ try:
+ from modules.interfaces.interfaceDbManagement import getInterface as _get_mgmt
+ from modules.interfaces.interfaceDbApp import getInterface as _get_app
+ from modules.security.rootAccess import getRootUser
+ except Exception as exc:
+ logger.warning("extractContent image persist: import failed: %s", exc)
+ return content_extracted_serial, artifacts
- stem = re.sub(r"[^\w\-]+", "_", document_name_stem).strip("_") or "extract"
+ owner = getRootUser()
+ uid = run_context.get("userId")
+ if uid:
+ try:
+ umap = _get_app(getRootUser()).getUsersByIds([str(uid)])
+ owner = umap.get(str(uid)) or owner
+ except Exception:
+ pass
- for fk in key_order:
- bucket = files_section.get(fk)
- if not isinstance(bucket, dict):
+ try:
+ mgmt = _get_mgmt(owner, mandateId=str(mandate_id), featureInstanceId=str(instance_id))
+ except Exception as exc:
+ logger.warning("extractContent image persist: mgmt interface failed: %s", exc)
+ return content_extracted_serial, artifacts
+
+ stem = re.sub(r"[^\w\-]+", "_", name_stem).strip("_") or "extract"
+
+ for doc_idx, blob in enumerate(content_extracted_serial):
+ if not isinstance(blob, dict):
continue
- parts = bucket.get("parts")
+ parts = blob.get("parts")
if not isinstance(parts, list):
continue
- new_parts: List[Dict[str, Any]] = []
+ new_parts: List[Any] = []
for p in parts:
if not isinstance(p, dict):
new_parts.append(p)
continue
- pcopy = dict(p)
- tg = (pcopy.get("typeGroup") or "").strip()
- mime = (pcopy.get("mimeType") or "").strip()
- raw_data = pcopy.get("data")
- if tg == "image" and mime.lower().startswith("image/") and raw_data:
- raw_s = raw_data.strip() if isinstance(raw_data, str) else ""
+ tg = (p.get("typeGroup") or "").strip()
+ mime = (p.get("mimeType") or "").strip()
+ raw_data = p.get("data")
+ if tg != "image" or not mime.lower().startswith("image/") or not raw_data:
+ new_parts.append(p)
+ continue
+ raw_s = raw_data.strip() if isinstance(raw_data, str) else ""
+ try:
+ img_bytes = _b64.b64decode(raw_s, validate=True) if raw_s else b""
+ except (_binascii.Error, TypeError, ValueError):
+ new_parts.append(p)
+ continue
+ if not img_bytes:
+ new_parts.append(p)
+ continue
+ part_id = str(p.get("id") or "part")
+ safe_id = re.sub(r"[^\w\-.]+", "_", part_id).strip("_") or "media"
+ if len(safe_id) > 200:
+ safe_id = safe_id[:200]
+ ext = _mime_to_file_extension(mime)
+ # Stable name (no run timestamp) so duplicate content reuses the same FileItem.
+ media_name = f"extract_media_{safe_id}.{ext}"
+ try:
+ file_item = mgmt.createFile(media_name, mime, img_bytes, folderId=None)
+ mgmt.createFileData(file_item.id, img_bytes)
try:
- blob = _b64.b64decode(raw_s, validate=True) if raw_s else b""
- except (_binascii.Error, TypeError, ValueError) as e:
+ mgmt.updateFile(str(file_item.id), {"tags": ["_workflowInternal"]})
+ except Exception as tag_exc:
logger.warning(
- "extractContent: could not decode image part %s (keep inline): %s",
- pcopy.get("id"),
- e,
+ "extractContent image persist: could not tag internal file %s: %s",
+ file_item.id,
+ tag_exc,
)
- new_parts.append(pcopy)
- continue
- if not blob:
- new_parts.append(pcopy)
- continue
- part_id = str(pcopy.get("id") or "part")
- # Full part id (UUID) — must not truncate or names collide / break linking
- safe_id = re.sub(r"[^\w\-.]+", "_", part_id).strip("_") or "media"
- if len(safe_id) > 200:
- safe_id = safe_id[:200]
- ext = _mime_to_file_extension(mime)
- media_name = f"extract_media_{stem}_{safe_id}.{ext}"
- pcopy["data"] = ""
- pcopy["handoverMediaDocumentName"] = media_name
- media_docs.append(
- ActionDocument(
- documentName=media_name,
- documentData=blob,
- mimeType=mime,
- validationMetadata={
- "actionType": "context.extractContent",
- "handoverRole": "extractedMedia",
- "sourcePartId": part_id,
- "handoverSchema": kind,
- "containerFileKey": fk,
- },
- )
- )
- new_parts.append(pcopy)
- else:
- new_parts.append(pcopy)
- bucket["parts"] = new_parts
- bucket["byTypeGroup"] = _rebuild_by_type_group(new_parts)
- files_section[fk] = bucket
+ except Exception as exc:
+ logger.warning("extractContent image persist: createFile failed %s: %s", part_id, exc)
+ new_parts.append(p)
+ continue
+ p_new = dict(p)
+ p_new["data"] = ""
+ p_new["embeddedImageFileId"] = str(file_item.id)
+ p_new["embeddedImageFileName"] = str(getattr(file_item, "fileName", media_name))
+ new_parts.append(p_new)
+ artifacts.append(
+ {
+ "fileId": str(file_item.id),
+ "fileName": str(getattr(file_item, "fileName", media_name)),
+ "mimeType": mime,
+ "sourcePartId": part_id,
+ "documentIndex": doc_idx,
+ "suppressInWorkflowFileLists": True,
+ }
+ )
+ blob["parts"] = new_parts
- return bundle, media_docs
+ return content_extracted_serial, artifacts
def _one_file_bucket(ec: ContentExtracted, source_file_name: str) -> Dict[str, Any]:
@@ -766,28 +1260,341 @@ def _one_file_bucket(ec: ContentExtracted, source_file_name: str) -> Dict[str, A
}
-def build_extract_content_handover(
+
+_MAX_IMAGE_EMBED_BYTES = 300_000
+_IMAGE_MAX_DIMENSION = 1200
+
+
def _get_mgmt_for_presentation_render(services: Any) -> Optional[Any]:
    """Resolve the management interface used while rendering presentations.

    A truthy ``services.interfaceDbComponent`` is returned as is. Otherwise
    an interface is requested from ``interfaceDbManagement.getInterface`` for
    ``services.user``, with ``mandateId`` / ``featureInstanceId`` defaulting
    to ``""``. Returns ``None`` when ``services`` is falsy, has no user, or
    the lookup raises (the failure is logged as a warning).
    """
    if not services:
        return None
    preset = getattr(services, "interfaceDbComponent", None)
    if preset:
        return preset
    try:
        import modules.interfaces.interfaceDbManagement as iface

        acting_user = getattr(services, "user", None)
        if acting_user:
            return iface.getInterface(
                acting_user,
                mandateId=getattr(services, "mandateId", None) or "",
                featureInstanceId=getattr(services, "featureInstanceId", None) or "",
            )
    except Exception as exc:
        logger.warning("presentation render: mgmt interface failed: %s", exc)
    return None
+
+
def _resize_image_bytes_for_document(image_bytes: bytes) -> bytes:
    """Re-encode an image as an RGB JPEG bounded by ``_IMAGE_MAX_DIMENSION``.

    Palette images are first converted to RGBA; images with an alpha channel
    (``RGBA`` / ``LA``) are flattened onto a white background; any other
    non-RGB mode is converted to RGB. Images whose longest side exceeds
    ``_IMAGE_MAX_DIMENSION`` are shrunk in place with bilinear resampling.
    On any failure a warning is logged and the original bytes are returned.
    """
    try:
        from PIL import Image as PILImage

        picture = PILImage.open(BytesIO(image_bytes))
        if picture.mode == "P":
            # Palette images go through RGBA so they share the flatten step below.
            picture = picture.convert("RGBA")
        if picture.mode in ("RGBA", "LA"):
            canvas = PILImage.new("RGB", picture.size, (255, 255, 255))
            canvas.paste(picture, mask=picture.split()[-1])
            picture = canvas
        elif picture.mode != "RGB":
            picture = picture.convert("RGB")

        if max(picture.size) > _IMAGE_MAX_DIMENSION:
            picture.thumbnail((_IMAGE_MAX_DIMENSION, _IMAGE_MAX_DIMENSION), PILImage.BILINEAR)

        encoded = BytesIO()
        picture.save(encoded, format="JPEG", quality=85, optimize=True)
        return encoded.getvalue()
    except Exception as exc:
        logger.warning("presentation render: image resize failed (%s)", exc)
        return image_bytes
+
+
def _load_image_bytes_by_file_id(services: Any, file_id: str) -> Optional[bytes]:
    """Fetch stored file bytes for ``file_id`` through the management interface.

    Returns ``None`` when no interface can be resolved, when it offers no
    ``getFileData`` method, or when the call raises (logged as a warning).
    """
    mgmt = _get_mgmt_for_presentation_render(services)
    if mgmt and hasattr(mgmt, "getFileData"):
        try:
            return mgmt.getFileData(str(file_id))
        except Exception as exc:
            logger.warning("presentation render: getFileData(%s) failed: %s", file_id, exc)
    return None
+
+
def _inline_runs_from_presentation_lines(lines: List[Any]) -> List[Dict[str, Any]]:
    """Convert presentation ``lines`` into a flat list of inline runs.

    Every line after the first is preceded by an explicit ``"\\n"`` text run,
    including lines that are empty or ``None``. Non-empty lines are parsed
    with ``_parseInlineRuns``. When nothing is produced, a single empty text
    run is returned.
    """
    from modules.serviceCenter.services.serviceGeneration.subDocumentUtility import _parseInlineRuns

    collected: List[Dict[str, Any]] = []
    for position, line in enumerate(lines):
        if position:
            collected.append({"type": "text", "value": "\n"})
        text = "" if line is None else str(line)
        if text:
            collected.extend(_parseInlineRuns(text))
    if not collected:
        return [{"type": "text", "value": ""}]
    return collected
+
+
def _is_presentation_file_bucket(d: Dict[str, Any]) -> bool:
    """Tell whether ``d`` looks like one entry of ``presentation.files``.

    A bucket is not itself a presentation envelope, carries ``data`` as a
    list or string, and has an ``outputMode`` or ``sourceFileName`` key.
    """
    is_envelope = d.get("kind") == PRESENTATION_KIND
    has_payload = isinstance(d.get("data"), (list, str))
    has_bucket_key = any(key in d for key in ("outputMode", "sourceFileName"))
    return (not is_envelope) and has_payload and has_bucket_key
+
+
def _is_loop_presentation_file_item(d: Dict[str, Any]) -> bool:
    """Tell whether ``d`` is a ``{"name": str, "value": <file bucket>}`` pair."""
    if not isinstance(d.get("name"), str):
        return False
    candidate = d.get("value")
    return isinstance(candidate, dict) and _is_presentation_file_bucket(candidate)
+
+
def _is_presentation_line_slot(d: Dict[str, Any]) -> bool:
    """Tell whether ``d`` looks like one slot of a file bucket's ``data`` list.

    Envelopes and file buckets are rejected. Otherwise the dict qualifies
    when its stripped ``typeGroup`` is ``text``, ``table``, ``image`` or
    ``structure``, or when it carries a ``lines`` list.
    """
    if d.get("kind") == PRESENTATION_KIND or _is_presentation_file_bucket(d):
        return False
    type_group = (d.get("typeGroup") or "").strip()
    known_group = type_group in ("text", "table", "image", "structure")
    return known_group or isinstance(d.get("lines"), list)
+
+
+def presentation_envelope_from_file_bucket(
+ bucket: Dict[str, Any],
*,
- extracted_results: List[ContentExtracted],
- chat_file_names: List[str],
- operation_ref: str,
+ file_key: Optional[str] = None,
) -> Dict[str, Any]:
- key_counts: Dict[str, int] = {}
- files: Dict[str, Any] = {}
- ordered: List[str] = []
-
- for i, ec in enumerate(extracted_results):
- name = chat_file_names[i] if i < len(chat_file_names) else ""
- fk = _file_json_key(str(name), i, key_counts)
- files[fk] = _one_file_bucket(ec, str(name))
- ordered.append(fk)
-
+ """Wrap one ``presentation.files`` entry as a full presentation envelope."""
+ fk = (file_key or "").strip()
+ if not fk:
+ src = str(bucket.get("sourceFileName") or "").strip()
+ fk = f"file_1_{src}" if src else "file_1"
return {
- "schemaVersion": 1,
- "kind": HANDOVER_KIND,
- "operationRef": operation_ref,
- "fileOrder": ordered,
- "files": files,
+ "schemaVersion": PRESENTATION_SCHEMA_VERSION,
+ "kind": PRESENTATION_KIND,
+ "outputMode": bucket.get("outputMode") or "lines",
+ "fileOrder": [fk],
+ "files": {fk: bucket},
+ }
+
+
def normalize_presentation_envelopes(raw: Any) -> List[Dict[str, Any]]:
    """Collect presentation envelopes from a loosely shaped input.

    Accepted shapes, checked in this order for dict input:

    * an envelope (``kind == PRESENTATION_KIND``), returned as is;
    * a loop item ``{"name", "value": <file bucket>}``, wrapped under ``name``;
    * a bare file bucket, wrapped into an envelope;
    * a single line slot, wrapped into a ``lines`` bucket and then an envelope;
    * a dict nesting one of the above under ``data``, ``merged`` or ``value``
      (first key that yields a result wins).

    Lists are flattened recursively. Anything else yields ``[]``.
    """
    if isinstance(raw, list):
        collected: List[Dict[str, Any]] = []
        for element in raw:
            collected += normalize_presentation_envelopes(element)
        return collected
    if not isinstance(raw, dict):
        return []

    if raw.get("kind") == PRESENTATION_KIND:
        return [raw]
    if _is_loop_presentation_file_item(raw):
        loop_key = str(raw.get("name") or "file_1")
        return [presentation_envelope_from_file_bucket(raw["value"], file_key=loop_key)]
    if _is_presentation_file_bucket(raw):
        return [presentation_envelope_from_file_bucket(raw)]
    if _is_presentation_line_slot(raw):
        slot_bucket = {"outputMode": "lines", "sourceFileName": "", "data": [raw]}
        return [presentation_envelope_from_file_bucket(slot_bucket)]

    # An envelope stored directly under ``data`` is found by the first pass of
    # this loop, because the recursive call returns ``[nested]`` for it.
    for wrapper_key in ("data", "merged", "value"):
        nested = raw.get(wrapper_key)
        if not isinstance(nested, dict) or nested is raw:
            continue
        nested_envelopes = normalize_presentation_envelopes(nested)
        if nested_envelopes:
            return nested_envelopes
    return []
+
+
def presentation_envelopes_to_document_json(
    raw: Any,
    *,
    title: str,
    language: str,
    services: Any = None,
) -> Dict[str, Any]:
    """Convert presentation envelope(s) into a single-document section tree.

    ``raw`` is normalised with ``normalize_presentation_envelopes``. Each file
    bucket of each envelope is visited in ``fileOrder`` (falling back to the
    key order of ``files``) and turned into heading, paragraph, image and
    table sections. A heading with the bucket's ``sourceFileName`` is emitted
    only when the envelope lists more than one file.

    Args:
        raw: Envelope, list of envelopes, or a shape the normaliser accepts.
        title: Document title; blank values fall back to ``"Document"``.
        language: Document language; blank values fall back to ``"de"``.
        services: Passed to ``_load_image_bytes_by_file_id`` for image slots.

    Returns:
        A dict with ``metadata`` and one entry in ``documents`` holding the
        generated ``sections``.

    Raises:
        ValueError: if ``raw`` contains no presentation envelope, or if no
            section could be produced.
    """
    from modules.serviceCenter.services.serviceGeneration.subDocumentUtility import _parseInlineRuns

    found_envelopes = normalize_presentation_envelopes(raw)
    if not found_envelopes:
        raise ValueError(
            "context must be presentation data from Inhalt extrahieren (kind=context.extractContent.presentation.v1)"
        )

    sections: List[Dict[str, Any]] = []

    def _push(content_type: str, content: Dict[str, Any]) -> None:
        # Sections are numbered 1..n in emission order; id and order share that number.
        position = len(sections) + 1
        sections.append({
            "id": f"s_{position}",
            "content_type": content_type,
            "order": position,
            "elements": [{"content": content}],
        })

    def _add_heading(text: str, level: int = 2) -> None:
        cleaned = (text or "").strip()
        if cleaned:
            _push("heading", {"text": cleaned, "level": level})

    def _add_paragraph(text: str) -> None:
        cleaned = (text or "").strip()
        if cleaned:
            _push("paragraph", {"inlineRuns": _parseInlineRuns(cleaned)})

    def _add_image(slot: Dict[str, Any]) -> None:
        file_id = slot.get("embeddedImageFileId")
        if not file_id:
            return
        image_bytes = _load_image_bytes_by_file_id(services, str(file_id))
        if not image_bytes:
            return
        if len(image_bytes) > _MAX_IMAGE_EMBED_BYTES:
            image_bytes = _resize_image_bytes_for_document(image_bytes)
        alt_text = slot.get("embeddedImageFileName") or slot.get("label") or f"image_{file_id}"
        _push("image", {
            "altText": str(alt_text),
            "base64Data": _b64.b64encode(image_bytes).decode("ascii"),
        })

    def _add_text(slot: Dict[str, Any]) -> None:
        # Prefer the pre-split ``lines``; fall back to splitting raw ``data``.
        line_list = slot.get("lines")
        if isinstance(line_list, list) and line_list:
            _push("paragraph", {"inlineRuns": _inline_runs_from_presentation_lines(line_list)})
            return
        payload = slot.get("data")
        if isinstance(payload, str) and payload.strip():
            _push("paragraph", {"inlineRuns": _inline_runs_from_presentation_lines(payload.splitlines())})

    def _add_table(slot: Dict[str, Any]) -> None:
        payload = slot.get("data")
        if not isinstance(payload, str) or not payload.strip():
            return
        metadata = slot.get("metadata")
        # A header row is assumed unless metadata says ``csvHeaderRow`` is False.
        has_header = not (isinstance(metadata, dict) and metadata.get("csvHeaderRow") is False)
        matrix = _table_matrix_from_csv(payload, header_row=has_header)
        if not matrix:
            return
        headers, body = matrix
        _push("table", {"headers": headers, "rows": body})

    def _add_slot(slot: Dict[str, Any]) -> None:
        type_group = (slot.get("typeGroup") or "").strip().lower()
        mime = (slot.get("mimeType") or "").strip().lower()
        if type_group == "image" or mime.startswith("image/"):
            _add_image(slot)
        elif type_group == "container":
            return
        elif type_group == "table" or ("csv" in mime and slot.get("data")):
            _add_table(slot)
        elif _part_carries_plain_text(slot):
            _add_text(slot)

    def _add_bucket(bucket: Dict[str, Any], *, show_file_heading: bool) -> None:
        if show_file_heading:
            _add_heading(str(bucket.get("sourceFileName") or "").strip())
        payload = bucket.get("data")
        if isinstance(payload, str):
            _add_paragraph(payload)
        elif isinstance(payload, list):
            for element in payload:
                if isinstance(element, dict):
                    _add_slot(element)
                elif isinstance(element, str):
                    _add_paragraph(element)
        elif isinstance(payload, dict):
            _add_slot(payload)

    for envelope in found_envelopes:
        files_section = envelope.get("files") or {}
        declared_order = envelope.get("fileOrder")
        if isinstance(declared_order, list) and declared_order:
            file_keys: List[str] = list(declared_order)
        else:
            file_keys = list(files_section.keys())
        several_files = len(file_keys) > 1
        for file_key in file_keys:
            bucket = files_section.get(file_key)
            if isinstance(bucket, dict):
                _add_bucket(bucket, show_file_heading=several_files)

    if not sections:
        raise ValueError("presentation produced no renderable sections")

    doc_language = (language or "de").strip() or "de"
    doc_title = (title or "Document").strip() or "Document"
    metadata_block: Dict[str, Any] = {
        "split_strategy": "single_document",
        "source_documents": [],
        "extraction_method": "context_extract_presentation",
        "title": doc_title,
        "language": doc_language,
    }
    document_block: Dict[str, Any] = {
        "id": "doc_1",
        "title": doc_title,
        "language": doc_language,
        "sections": sections,
    }
    return {"metadata": metadata_block, "documents": [document_block]}
@@ -826,7 +1633,7 @@ async def extractContent(self, parameters: Dict[str, Any]) -> ActionResult:
self.services.chat.progressLogFinish(operation_id, False)
return ActionResult.isFailure(error="No documents found in documentList")
- logger.info(f"Extracting JSON handover from {len(chat_documents)} documents")
+ logger.info(f"Extracting content from {len(chat_documents)} documents")
self.services.chat.progressLogUpdate(operation_id, 0.3, "Preparing extraction options")
@@ -853,63 +1660,56 @@ async def extractContent(self, parameters: Dict[str, Any]) -> ActionResult:
file_names = [getattr(cd, "fileName", "") or "" for cd in chat_documents]
- payload = build_extract_content_handover(
- extracted_results=extracted_results,
- chat_file_names=file_names,
- operation_ref=operation_id,
- )
-
- self.services.chat.progressLogUpdate(operation_id, 0.9, "Building JSON")
-
- content_filter = str(parameters.get("contentFilter") or "all").strip().lower()
- if content_filter not in _CONTENT_FILTER_OPTIONS:
- content_filter = "all"
- payload = _apply_content_filter(payload, content_filter)
+ content_filter = _canonical_content_filter(parameters.get("contentFilter"))
+ filtered_extractions = _filter_extractions_by_content_filter(extracted_results, content_filter)
pres_cfg = parse_presentation_parameters(parameters)
- presentation = build_presentation_for_payload(payload, pres_cfg)
stem = f"{wf}_{int(time.time())}"
- # Only split image sidecars when the filtered payload can still contain image parts.
+ run_ctx = parameters.get("_runContext")
+
+ content_extracted_serial = [_serialize_content_extracted_for_output(ec) for ec in filtered_extractions]
+ image_artifacts: List[Dict[str, Any]] = []
if content_filter in ("all", "imagesOnly"):
- stripped_payload, media_docs = _split_images_to_sidecar_documents(
- payload,
- document_name_stem=stem,
+ content_extracted_serial, image_artifacts = _persist_extracted_image_parts(
+ content_extracted_serial,
+ name_stem=stem,
+ run_context=run_ctx if isinstance(run_ctx, dict) else None,
)
- else:
- # textOnly / noImages: no image parts remain → skip the split entirely.
- stripped_payload = payload
- media_docs = []
- stripped_payload["presentation"] = presentation
- joined_text = presentation_response_text(presentation, stripped_payload)
+ presentation = build_presentation_for_serial_extractions(content_extracted_serial, file_names, pres_cfg)
- json_meta = {
- "actionType": "context.extractContent",
- "documentCountInput": len(chat_documents),
- "documentCountRoots": len(extracted_results),
- "handoverSchema": stripped_payload.get("kind"),
- "handoverRole": "structuredHandover",
- "mediaDocumentCount": len(media_docs),
- }
+ try:
+ _pc_json = json.dumps(dict(pres_cfg), ensure_ascii=False, default=str)
+ _sum = summarize_presentation_payload(presentation)
+ _sum_json = json.dumps(_sum, ensure_ascii=False, default=str)
+ logger.info(
+ "extractContent op=%s presentationConfig=%s presentationSummary=%s",
+ operation_id,
+ _pc_json,
+ _sum_json[:8000] + ("…" if len(_sum_json) > 8000 else ""),
+ )
+ except Exception as _log_e:
+ logger.debug("extractContent presentation trace log skipped: %s", _log_e)
- json_doc = ActionDocument(
- documentName=f"extracted_content_{stem}.json",
- documentData=stripped_payload,
- mimeType="application/json",
- validationMetadata=json_meta,
- )
-
- handover_data = {
- "response": joined_text,
- "contentType": "text",
- "handoverKind": stripped_payload.get("kind"),
- "structuredDocumentIndex": 0,
- "mediaDocumentCount": len(media_docs),
+ data_out: Dict[str, Any] = {
+ **presentation,
+ "_meta": {
+ "actionType": "context.extractContent",
+ "operationRef": operation_id,
+ "sourceFileNames": list(file_names),
+ "documentCountInput": len(chat_documents),
+ "documentCountRoots": len(extracted_results),
+ "extractPayloadSchemaVersion": EXTRACT_PAYLOAD_SCHEMA_VERSION,
+ "presentationConfig": dict(pres_cfg),
+ "persistedImageArtifacts": image_artifacts,
+ "suppressInWorkflowFileLists": True,
+ "persistedImageCount": len(image_artifacts),
+ },
}
self.services.chat.progressLogFinish(operation_id, True)
- return ActionResult.isSuccess(documents=[json_doc] + media_docs, data=handover_data)
+ return ActionResult.isSuccess(documents=[], data=data_out)
except Exception as e:
logger.error(f"Error in content extraction: {str(e)}")
diff --git a/modules/workflows/methods/methodContext/actions/mergeContext.py b/modules/workflows/methods/methodContext/actions/mergeContext.py
index 3947db30..8bc76e4b 100644
--- a/modules/workflows/methods/methodContext/actions/mergeContext.py
+++ b/modules/workflows/methods/methodContext/actions/mergeContext.py
@@ -18,8 +18,9 @@ from typing import Any, Dict, List, Optional
from modules.datamodels.datamodelChat import ActionResult
from modules.workflows.methods.methodContext.actions.extractContent import (
- _joined_text_from_handover_payload,
+ joined_text_from_extract_node_data,
)
+from modules.workflows.methods.methodContext.contextEnvelope import wrap_merge_context_data
logger = logging.getLogger(__name__)
@@ -89,6 +90,9 @@ def _primary_text_from_item(it: Any) -> str:
r = inner.get("response")
if r is not None and str(r).strip():
return str(r).strip()
+ ce_text = joined_text_from_extract_node_data(inner)
+ if ce_text.strip():
+ return ce_text.strip()
docs = it.get("documents")
if not isinstance(docs, list) or not docs:
return ""
@@ -104,14 +108,14 @@ def _primary_text_from_item(it: Any) -> str:
except (UnicodeDecodeError, ValueError):
return ""
if isinstance(raw, dict):
- return (_joined_text_from_handover_payload(raw) or "").strip()
+ return (joined_text_from_extract_node_data(raw) or "").strip()
if isinstance(raw, str) and raw.strip():
s = raw.strip()
if s.startswith("{") and s.endswith("}"):
try:
parsed = json.loads(s)
if isinstance(parsed, dict):
- return (_joined_text_from_handover_payload(parsed) or "").strip()
+ return (joined_text_from_extract_node_data(parsed) or "").strip()
except (json.JSONDecodeError, TypeError):
pass
return s
@@ -126,6 +130,14 @@ def _sanitize_heading_title(name: str) -> str:
def _iteration_heading_from_item(it: Any) -> Optional[str]:
if not isinstance(it, dict):
return None
+ inner = it.get("data")
+ if isinstance(inner, dict):
+ meta = inner.get("_meta") if isinstance(inner.get("_meta"), dict) else {}
+ sf = inner.get("sourceFileNames") or meta.get("sourceFileNames")
+ if isinstance(sf, list) and sf:
+ first = sf[0]
+ if isinstance(first, str) and first.strip():
+ return _sanitize_heading_title(first.strip())
docs = it.get("documents")
if not isinstance(docs, list) or not docs:
return None
@@ -222,7 +234,7 @@ async def mergeContext(self, parameters: Dict[str, Any]) -> ActionResult:
(_ps[:200] + "…") if len(_ps) > 200 else _ps,
len(conflicts),
)
- data: Dict[str, Any] = {
+ payload: Dict[str, Any] = {
"merged": merged,
"inputs": inputs,
"first": inputs[0] if inputs else None,
@@ -230,7 +242,7 @@ async def mergeContext(self, parameters: Dict[str, Any]) -> ActionResult:
"conflicts": sorted(set(conflicts)) if conflicts else [],
"response": primary,
}
- return ActionResult.isSuccess(data=data)
+ return ActionResult.isSuccess(data=wrap_merge_context_data(payload))
except Exception as exc:
logger.exception("mergeContext failed")
return ActionResult.isFailure(error=str(exc))
diff --git a/modules/workflows/methods/methodContext/actions/transformContext.py b/modules/workflows/methods/methodContext/actions/transformContext.py
index 6fe05e03..ffff183d 100644
--- a/modules/workflows/methods/methodContext/actions/transformContext.py
+++ b/modules/workflows/methods/methodContext/actions/transformContext.py
@@ -18,6 +18,7 @@ import re
from typing import Any, Dict, List, Optional
from modules.datamodels.datamodelChat import ActionResult
+from modules.workflows.methods.methodContext.contextEnvelope import wrap_transform_context_data
logger = logging.getLogger(__name__)
@@ -216,7 +217,7 @@ async def transformContext(self, parameters: Dict[str, Any]) -> ActionResult:
if cast_errors:
result["_castErrors"] = cast_errors
- return ActionResult.isSuccess(data=result)
+ return ActionResult.isSuccess(data=wrap_transform_context_data(result))
except Exception as exc:
logger.exception("transformContext failed")
return ActionResult.isFailure(error=str(exc))
diff --git a/modules/workflows/methods/methodContext/contextEnvelope.py b/modules/workflows/methods/methodContext/contextEnvelope.py
new file mode 100644
index 00000000..c35836cf
--- /dev/null
+++ b/modules/workflows/methods/methodContext/contextEnvelope.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2026 Patrick Motsch
+"""Versioned ``ActionResult.data`` envelope for context.* actions (merge, transform)."""
+
+from __future__ import annotations
+
+from typing import Any, Dict
+
+CONTEXT_MERGE_KIND = "context.mergeContext.v1"
+CONTEXT_MERGE_SCHEMA_VERSION = 1
+
+CONTEXT_TRANSFORM_KIND = "context.transformContext.v1"
+CONTEXT_TRANSFORM_SCHEMA_VERSION = 1
+
+
+def wrap_merge_context_data(body: Dict[str, Any]) -> Dict[str, Any]:
+    """Build the versioned envelope for a ``context.mergeContext`` result.
+
+    A new dict is returned; ``body`` is not modified. Keys are inserted as
+    ``schemaVersion``, ``kind``, then every entry of ``body``, and finally
+    ``_meta``. Because ``body`` is applied after the header keys, an entry in
+    ``body`` with the same name replaces the header value, while ``_meta`` is
+    always set to the envelope's own metadata.
+    """
+    envelope: Dict[str, Any] = dict(
+        schemaVersion=CONTEXT_MERGE_SCHEMA_VERSION,
+        kind=CONTEXT_MERGE_KIND,
+    )
+    envelope.update(body)
+    envelope["_meta"] = dict(
+        actionType="context.mergeContext",
+        mergePayloadSchemaVersion=CONTEXT_MERGE_SCHEMA_VERSION,
+    )
+    return envelope
+
+
+def wrap_transform_context_data(fields: Dict[str, Any]) -> Dict[str, Any]:
+    """Build the versioned envelope for a ``context.transformContext`` result.
+
+    A new dict is returned; ``fields`` is not modified. The header keys
+    ``schemaVersion`` and ``kind`` come first, followed by the entries of
+    ``fields``. ``_meta`` is assigned last and therefore overwrites any
+    ``_meta`` entry that ``fields`` may have carried.
+    """
+    envelope: Dict[str, Any] = dict(
+        schemaVersion=CONTEXT_TRANSFORM_SCHEMA_VERSION,
+        kind=CONTEXT_TRANSFORM_KIND,
+    )
+    envelope.update(fields)
+    envelope["_meta"] = dict(
+        actionType="context.transformContext",
+        transformPayloadSchemaVersion=CONTEXT_TRANSFORM_SCHEMA_VERSION,
+    )
+    return envelope
diff --git a/modules/workflows/methods/methodContext/methodContext.py b/modules/workflows/methods/methodContext/methodContext.py
index b2e7220b..b82d4356 100644
--- a/modules/workflows/methods/methodContext/methodContext.py
+++ b/modules/workflows/methods/methodContext/methodContext.py
@@ -57,12 +57,9 @@ class MethodContext(MethodBase):
"extractContent": WorkflowActionDefinition(
actionId="context.extractContent",
description=(
- "Extract document content without AI. Unified handover: (1) `documents[0]` "
- "JSON `context.extractContent.handover.v1` with text in `parts` and image placeholders "
- "linking to sibling blobs via `handoverMediaDocumentName`; "
- "(2) each extracted image as a separate binary document (`extract_media_*`); "
- "(3) `data.response` / top-level `response` after normalization — concatenated plain text "
- "for prompts and file.create. Pick `response`, a specific document, or deep JSON paths."
+ "Extract document content without AI. Returns `data` as the configured presentation "
+ "envelope (`fileOrder`, `files`, …) plus `_meta`; no duplicated service payload or bundled "
+ "plain-text column. Persisted images appear via `embeddedImageFileId` in internal serial only."
),
dynamicMode=True,
outputType="UdmDocument",
@@ -151,8 +148,8 @@ class MethodContext(MethodBase):
"mergeContext": WorkflowActionDefinition(
actionId="context.mergeContext",
description=(
- "Führt eine Liste von Schrittergebnissen (z. B. ``bodyResults`` einer "
- "``flow.loop``) zu einem zusammengeführten Dict zusammen."
+ "Führt Schritte zu einem Dict zusammen. ``data`` enthält einen versionierten Umschlag "
+ "(``context.mergeContext.v1``, ``merged``, ``response``, …) und ``_meta``."
),
outputType="ActionResult",
parameters={
@@ -210,10 +207,9 @@ class MethodContext(MethodBase):
"transformContext": WorkflowActionDefinition(
actionId="context.transformContext",
description=(
- "Transform the upstream payload via a list of {sourceField, outputField, "
- "operation, type, expression} mappings. Operations: rename, cast, nest, "
- "flatten, compute. compute uses {{...}} templates; nesting is implicit "
- "via dotted outputField paths."
+ "Transform mappings on the upstream payload. ``data`` trägt "
+ "``schemaVersion``, ``kind: context.transformContext.v1``, die gemappten Felder "
+ "und optional ``_castErrors``, plus ``_meta``."
),
outputType="Transit",
parameters={
diff --git a/modules/workflows/methods/methodFile/actions/create.py b/modules/workflows/methods/methodFile/actions/create.py
index e7ef569c..9342767f 100644
--- a/modules/workflows/methods/methodFile/actions/create.py
+++ b/modules/workflows/methods/methodFile/actions/create.py
@@ -3,7 +3,7 @@
from typing import Any, Dict, List, Optional
-import asyncio
+import ast
import base64
import binascii
import io
@@ -12,79 +12,33 @@ import logging
import re
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
-from modules.serviceCenter.services.serviceGeneration.subDocumentUtility import (
- enhancePlainTextWithMarkdownTables,
- markdownToDocumentJson,
-)
from modules.shared.i18nRegistry import normalizePrimaryLanguageTag
from modules.workflows.automation2.executors.actionNodeExecutor import _coerce_document_data_to_bytes
-from modules.workflows.methods.methodAi._common import is_image_action_document_list, serialize_context
+from modules.workflows.methods.methodAi._common import is_image_action_document_list
+from modules.workflows.methods.methodContext.actions.extractContent import (
+ presentation_envelopes_to_document_json,
+)
logger = logging.getLogger(__name__)
_SAFE_FILENAME = re.compile(r'[^\w\-.\(\)\s\[\]%@+]')
-_HEAVY_CONTEXT_KEYS = frozenset({"imageDocumentsOnly", "documents", "inputs"})
-
-
-def _collect_image_documents_only(raw: Any) -> List[Any]:
- """Resolve ``imageDocumentsOnly`` whether the context is merged, nested, or surfaced."""
- if not isinstance(raw, dict):
- return []
- paths = (
- ("imageDocumentsOnly",),
- ("merged", "imageDocumentsOnly"),
- ("data", "merged", "imageDocumentsOnly"),
- ("data", "imageDocumentsOnly"),
- )
- for path in paths:
- cur: Any = raw
- ok = True
- for p in path:
- if not isinstance(cur, dict):
- ok = False
- break
- cur = cur.get(p)
- if ok and isinstance(cur, list) and cur:
- return cur
- return []
-
-
-def _context_string_for_report(raw: Any, output_format: str) -> str:
- """Build one narrative string for ``markdownToDocumentJson`` / render.
-
- Prefer plain ``response`` text (merge node surfaces it; nested ``merged.response``
- too). Never dump ``inputs`` / binary lists into the PDF body — that produced giant
- JSON + base64 "hash" paragraphs after merge + ``contextBuilder``.
- """
- of = (output_format or "docx").strip().lower().lstrip(".")
- if of == "json":
- return serialize_context(raw, prefer_handover_primary=False)
- if isinstance(raw, str):
- return raw.strip().lstrip("\ufeff")
- if isinstance(raw, dict):
- for path in (
- ("response",),
- ("merged", "response"),
- ("data", "response"),
- ("data", "merged", "response"),
- ):
- cur: Any = raw
- ok = True
- for k in path:
- if not isinstance(cur, dict):
- ok = False
- break
- cur = cur.get(k)
- if ok and cur is not None and str(cur).strip():
- return str(cur).strip().lstrip("\ufeff")
- lean = {k: v for k, v in raw.items() if k not in _HEAVY_CONTEXT_KEYS}
+def _coerce_structured_context(raw: Any) -> Any:
+    """Undo legacy ``str`` coercion on structured refs (loop ``bodyResults``, presentation).
+
+    Non-string values are returned unchanged. A string is only inspected when,
+    after trimming, it starts with ``[`` or ``{``; it is then parsed first as
+    JSON and, failing that, as a Python literal. The first parse that yields a
+    ``dict`` or ``list`` wins.
+
+    The conversion is best-effort: any parse failure, including recursion or
+    memory exhaustion on pathologically nested input, falls back to returning
+    ``raw`` untouched instead of raising.
+    """
+    if not isinstance(raw, str):
+        return raw
+    stripped = raw.strip()
+    if not stripped or stripped[0] not in ("[", "{"):
+        return raw
+    for loader in (json.loads, ast.literal_eval):
         try:
-            return json.dumps(lean, ensure_ascii=False, indent=2, default=str)
-        except Exception:
-            return serialize_context(lean, prefer_handover_primary=False)
-    return serialize_context(raw, prefer_handover_primary=False)
+            parsed = loader(stripped)
+        except (ValueError, SyntaxError, TypeError, RecursionError, MemoryError):
+            # ValueError already covers json.JSONDecodeError. RecursionError and
+            # MemoryError can be raised by both loaders on deeply nested input.
+            continue
+        if isinstance(parsed, (dict, list)):
+            return parsed
+    return raw
def _raw_context_preview_for_log(raw: Any, max_len: int = 500) -> str:
@@ -121,12 +75,6 @@ def _persistDocumentsToUserFiles(
return
if not mgmt:
return
- logger.info(
- "file.create persist: mgmt=%s id(mgmt)=%s has_createFileData=%s",
- type(mgmt).__name__,
- id(mgmt),
- hasattr(mgmt, "createFileData"),
- )
for doc in action_documents:
try:
doc_data = doc.documentData if hasattr(doc, "documentData") else doc.get("documentData")
@@ -149,15 +97,8 @@ def _persistDocumentsToUserFiles(
or doc.get("mimeType")
or "application/octet-stream"
)
- logger.info(
- "file.create persist: calling createFile name=%s bytes=%s",
- doc_name,
- len(content),
- )
file_item = mgmt.createFile(doc_name, mime, content, folderId=folder_id)
- logger.info("file.create persist: createFile returned id=%s", file_item.id)
- ok = mgmt.createFileData(file_item.id, content)
- logger.info("file.create persist: createFileData returned %s for id=%s", ok, file_item.id)
+ mgmt.createFileData(file_item.id, content)
meta = getattr(doc, "validationMetadata", None) or doc.get("validationMetadata") or {}
if isinstance(meta, dict):
meta["fileId"] = file_item.id
@@ -165,7 +106,6 @@ def _persistDocumentsToUserFiles(
doc.validationMetadata = meta
elif isinstance(doc, dict):
doc["validationMetadata"] = meta
- logger.info("file.create: persisted %s to user files (id=%s)", doc_name, file_item.id)
except Exception as e:
dname = getattr(doc, "documentName", None) or doc.get("documentName", "?")
logger.warning("file.create: failed to persist document %s: %s", dname, e)
@@ -215,100 +155,7 @@ def _load_image_bytes_from_action_doc(doc: dict, services) -> Optional[bytes]:
return None
-# Images larger than this threshold (decoded bytes) are resized before embedding
-# to avoid multi-minute PDF rendering of high-res raster scans.
-_MAX_IMAGE_EMBED_BYTES = 300_000 # 300 KB decoded ≈ ~400 KB base64
-_IMAGE_MAX_DIMENSION = 1200 # longest edge in pixels after resize
-
-
-def _resize_image_for_document(image_bytes: bytes) -> bytes:
- """Resize image to at most ``_IMAGE_MAX_DIMENSION`` px on the longest edge
- and re-encode as JPEG. Falls back to the original bytes on any error."""
- try:
- from PIL import Image as PILImage
- import io as _io
-
- img = PILImage.open(_io.BytesIO(image_bytes))
-
- # Flatten transparency / palette modes to RGB (required for JPEG)
- if img.mode in ("RGBA", "LA"):
- bg = PILImage.new("RGB", img.size, (255, 255, 255))
- bg.paste(img, mask=img.split()[-1])
- img = bg
- elif img.mode == "P":
- img = img.convert("RGBA")
- bg = PILImage.new("RGB", img.size, (255, 255, 255))
- bg.paste(img, mask=img.split()[-1])
- img = bg
- elif img.mode != "RGB":
- img = img.convert("RGB")
-
- w, h = img.size
- if max(w, h) > _IMAGE_MAX_DIMENSION:
- # thumbnail() is optimised for downscaling: it uses an intermediate
- # box-filter step before the final filter, making it 3-5× faster
- # than resize() on large images. BILINEAR is fast and sufficient
- # for document thumbnails.
- img.thumbnail((_IMAGE_MAX_DIMENSION, _IMAGE_MAX_DIMENSION), PILImage.BILINEAR)
-
- out = _io.BytesIO()
- img.save(out, format="JPEG", quality=85, optimize=True)
- return out.getvalue()
- except Exception as e:
- logger.warning("file.create: image resize failed (%s) — using original bytes", e)
- return image_bytes
-
-
-def _append_images_to_content(structured_content: dict, image_docs: list, services=None) -> dict:
- """Append images from imageDocumentsOnly as native image elements to the structured JSON.
-
- Each image becomes an ``image`` element with ``base64Data`` in a trailing
- "Bilder" section of the first document. Images larger than
- ``_MAX_IMAGE_EMBED_BYTES`` are automatically resized/compressed so the
- synchronous PDF renderer does not block for minutes on high-res scans.
- The renderers (DOCX / PDF) handle ``content.base64Data`` natively.
- """
- elements = []
- for doc in image_docs:
- b = _load_image_bytes_from_action_doc(doc, services)
- if not b:
- raw = doc.get("documentData") if isinstance(doc, dict) else None
- if isinstance(raw, str):
- try:
- b = base64.b64decode(raw)
- except Exception:
- pass
- if not b:
- continue
-
- if len(b) > _MAX_IMAGE_EMBED_BYTES:
- logger.info(
- "file.create: image %s is %d bytes — resizing to max %dpx for embedding",
- (doc.get("documentName") if isinstance(doc, dict) else "?") or "?",
- len(b),
- _IMAGE_MAX_DIMENSION,
- )
- b = _resize_image_for_document(b)
-
- elements.append({
- "type": "image",
- "content": {
- "base64Data": base64.b64encode(b).decode("ascii"),
- "alt": (doc.get("documentName") if isinstance(doc, dict) else None) or "image",
- },
- })
-
- if not elements:
- return structured_content
-
- docs = structured_content.get("documents")
- if isinstance(docs, list) and docs:
- docs[0].setdefault("sections", []).append({"heading": "Bilder", "elements": elements})
- return structured_content
-
-
def _images_list_to_pdf(image_bytes_list: List[bytes]) -> bytes:
- """One PDF page per image; embedded raster data via PyMuPDF."""
import fitz
pdf = fitz.open()
@@ -322,7 +169,6 @@ def _images_list_to_pdf(image_bytes_list: List[bytes]) -> bytes:
def _images_list_to_docx(image_bytes_list: List[bytes]) -> bytes:
- """Images embedded in the document package (inline shapes), not hyperlinks."""
from docx import Document
from docx.shared import Inches
@@ -403,28 +249,13 @@ async def _create_merged_image_documents(
async def create(self, parameters: Dict[str, Any]) -> ActionResult:
- """
- Create a file from context (text/markdown from upstream AI node).
- Uses GenerationService.renderReport to produce docx, pdf, txt, md, html, xlsx, etc.
- """
- raw_context = parameters.get("context", "")
+ """Create a file from ``context.extractContent`` presentation data via ``renderReport``."""
+ raw_context = _coerce_structured_context(parameters.get("context", ""))
if isinstance(raw_context, list) and is_image_action_document_list(raw_context):
return await _create_merged_image_documents(self, parameters, raw_context)
outputFormat = (parameters.get("outputFormat") or "docx").strip().lower().lstrip(".")
- context = _context_string_for_report(raw_context, outputFormat)
-
- if not context:
- logger.warning(
- "file.create: context empty after resolve — raw_context type=%s raw_summary=%r "
- "serialized_len=%s (check ActionNodeExecutor \"file.create context resolution\" log for DataRef / upstream).",
- type(raw_context).__name__,
- _raw_context_preview_for_log(raw_context),
- len(context or ""),
- )
- return ActionResult.isFailure(error="context is required (connect an AI node or provide text)")
-
title = (parameters.get("title") or "Document").strip()
templateName = parameters.get("templateName")
language = normalizePrimaryLanguageTag(
@@ -438,31 +269,30 @@ async def create(self, parameters: Dict[str, Any]) -> ActionResult:
folder_id = str(raw_folder).strip()
try:
- if outputFormat != "json":
- context = enhancePlainTextWithMarkdownTables(context)
- structured_content = markdownToDocumentJson(context, title, language)
- if templateName:
- structured_content.setdefault("metadata", {})["templateName"] = templateName
+ structured_content = presentation_envelopes_to_document_json(
+ raw_context,
+ title=title,
+ language=language,
+ services=self.services,
+ )
+ except ValueError as e:
+ logger.warning(
+ "file.create: invalid presentation context type=%s preview=%r: %s",
+ type(raw_context).__name__,
+ _raw_context_preview_for_log(raw_context),
+ e,
+ )
+ return ActionResult.isFailure(error=str(e))
- img_docs = _collect_image_documents_only(raw_context)
- if img_docs:
- # Image decoding and PIL resizing are CPU-bound; run them in a
- # thread pool so the event loop is not blocked while processing
- # high-res raster images (e.g. 3+ MB PNGs from PDF extraction).
- loop = asyncio.get_event_loop()
- structured_content = await loop.run_in_executor(
- None,
- _append_images_to_content,
- structured_content,
- img_docs,
- self.services,
- )
+ if templateName:
+ structured_content.setdefault("metadata", {})["templateName"] = templateName
- generation = getattr(self.services, "generation", None)
- if not generation:
- return ActionResult.isFailure(error="Generation service not available")
+ generation = getattr(self.services, "generation", None)
+ if not generation:
+ return ActionResult.isFailure(error="Generation service not available")
- ai_service = getattr(self.services, "ai", None)
+ ai_service = getattr(self.services, "ai", None)
+ try:
rendered_docs = await generation.renderReport(
extractedContent=structured_content,
outputFormat=outputFormat,
@@ -472,43 +302,50 @@ async def create(self, parameters: Dict[str, Any]) -> ActionResult:
aiService=ai_service,
parentOperationId=parameters.get("parentOperationId"),
)
-
- if not rendered_docs:
- return ActionResult.isFailure(error="Rendering produced no output")
-
- action_documents = []
- mime_map = {
- "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
- "pdf": "application/pdf",
- "txt": "text/plain",
- "md": "text/markdown",
- "html": "text/html",
- "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
- "csv": "text/csv",
- "json": "application/json",
- }
- for rd in rendered_docs:
- doc_data = rd.documentData if hasattr(rd, "documentData") else getattr(rd, "document_data", None)
- doc_name = getattr(rd, "filename", None) or getattr(rd, "documentName", None) or getattr(rd, "document_name", f"output.{outputFormat}")
- mime = getattr(rd, "mimeType", None) or getattr(rd, "mime_type", None) or mime_map.get(outputFormat, "application/octet-stream")
-
- if isinstance(doc_data, bytes):
- doc_data = base64.b64encode(doc_data).decode("ascii")
-
- action_documents.append(ActionDocument(
- documentName=doc_name,
- documentData=doc_data,
- mimeType=mime,
- validationMetadata={
- "actionType": "file.create",
- "outputFormat": outputFormat,
- "templateName": templateName,
- },
- ))
-
- _persistDocumentsToUserFiles(action_documents, self.services, folder_id=folder_id)
- return ActionResult.isSuccess(documents=action_documents)
-
except Exception as e:
- logger.error(f"file.create failed: {e}", exc_info=True)
+ logger.error("file.create failed: %s", e, exc_info=True)
return ActionResult.isFailure(error=str(e))
+
+ if not rendered_docs:
+ return ActionResult.isFailure(error="Rendering produced no output")
+
+ action_documents = []
+ mime_map = {
+ "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ "pdf": "application/pdf",
+ "txt": "text/plain",
+ "md": "text/markdown",
+ "html": "text/html",
+ "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ "csv": "text/csv",
+ "json": "application/json",
+ }
+ for rd in rendered_docs:
+ doc_data = rd.documentData if hasattr(rd, "documentData") else getattr(rd, "document_data", None)
+ doc_name = (
+ getattr(rd, "filename", None)
+ or getattr(rd, "documentName", None)
+ or getattr(rd, "document_name", f"output.{outputFormat}")
+ )
+ mime = (
+ getattr(rd, "mimeType", None)
+ or getattr(rd, "mime_type", None)
+ or mime_map.get(outputFormat, "application/octet-stream")
+ )
+
+ if isinstance(doc_data, bytes):
+ doc_data = base64.b64encode(doc_data).decode("ascii")
+
+ action_documents.append(ActionDocument(
+ documentName=doc_name,
+ documentData=doc_data,
+ mimeType=mime,
+ validationMetadata={
+ "actionType": "file.create",
+ "outputFormat": outputFormat,
+ "templateName": templateName,
+ },
+ ))
+
+ _persistDocumentsToUserFiles(action_documents, self.services, folder_id=folder_id)
+ return ActionResult.isSuccess(documents=action_documents)
diff --git a/modules/workflows/methods/methodFile/methodFile.py b/modules/workflows/methods/methodFile/methodFile.py
index 3f9dbd02..c30f86a4 100644
--- a/modules/workflows/methods/methodFile/methodFile.py
+++ b/modules/workflows/methods/methodFile/methodFile.py
@@ -35,10 +35,13 @@ class MethodFile(MethodBase):
),
"context": WorkflowActionParameter(
name="context",
- type="str",
+ type="Any",
frontendType=FrontendType.HIDDEN,
required=False,
- description="Injected from contentSource or upstream connection",
+ description=(
+ "Resolved context: presentation envelope(s) from context.extractContent "
+ "(dict or list, e.g. loop bodyResults), or legacy plain text string."
+ ),
),
"outputFormat": WorkflowActionParameter(
name="outputFormat",
diff --git a/tests/unit/workflow/test_extract_content_handover.py b/tests/unit/workflow/test_extract_content_handover.py
index e9a71636..9f436cbb 100644
--- a/tests/unit/workflow/test_extract_content_handover.py
+++ b/tests/unit/workflow/test_extract_content_handover.py
@@ -1,15 +1,26 @@
-# Unit tests: unified extractContent handover (text vs image sidecars).
+# Unit tests: context.extractContent serialize + presentation helpers (legacy handover dicts vs new paths).
import base64
+import copy as _copy
+
+from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
from modules.workflows.methods.methodContext.actions.extractContent import (
HANDOVER_KIND,
+ EXTRACT_PAYLOAD_SCHEMA_VERSION,
_apply_content_filter,
+ _canonical_content_filter,
+ _joined_text_from_content_extracted_serial,
+ _filter_extractions_by_content_filter,
_joined_text_from_handover_payload,
- _split_images_to_sidecar_documents,
+ _persist_extracted_image_parts,
+ _serialize_content_extracted_for_output,
+ build_presentation_for_extractions,
build_presentation_for_payload,
+ joined_text_from_extract_node_data,
parse_presentation_parameters,
presentation_response_text,
+ summarize_presentation_payload,
)
@@ -30,6 +41,120 @@ def test_joined_text_orders_text_table_and_skips_container():
assert _joined_text_from_handover_payload(payload) == "A\n\nB"
+def test_joined_text_from_extract_node_data_prefers_content_extracted():
+    """Node data with a single text part under ``contentExtracted`` yields that part's text."""
+    text_part = {"typeGroup": "text", "mimeType": "text/plain", "data": "Z", "id": "p"}
+    node_data = {"contentExtracted": [{"id": "x", "parts": [text_part]}]}
+    assert joined_text_from_extract_node_data(node_data) == "Z"
+
+
+def test_joined_text_serial_list():
+    """A serial list holding one text part joins to that part's ``data``."""
+    text_part = {"typeGroup": "text", "mimeType": "text/plain", "data": "a", "id": "1"}
+    serial = [{"parts": [text_part]}]
+    assert _joined_text_from_content_extracted_serial(serial) == "a"
+
+
+def test_serialize_content_extracted_drops_summary():
+    """The serialized form of a ``ContentExtracted`` must not contain a ``summary`` key."""
+    main_part = ContentPart(id="p", label="main", typeGroup="text", mimeType="text/plain", data="hi")
+    extracted = ContentExtracted(id="doc1", parts=[main_part], summary={"ignored": True})
+    serialized = _serialize_content_extracted_for_output(extracted)
+    assert "summary" not in serialized
+
+
+def test_persist_images_without_run_context_is_noop():
+    """With ``run_context=None`` no artifacts are reported and the serial list is unchanged.
+
+    The returned list is compared against a deep copy taken before the call.
+    """
+    encoded = base64.b64encode(b"fake-binary-image").decode("ascii")
+    text_part = {"typeGroup": "text", "data": "x", "mimeType": "text/plain", "id": "t1"}
+    image_part = {"typeGroup": "image", "mimeType": "image/png", "data": encoded, "id": "img1"}
+    serial = [{"id": "1", "parts": [text_part, image_part]}]
+    snapshot = _copy.deepcopy(serial)
+    result, artifacts = _persist_extracted_image_parts(serial, name_stem="stem", run_context=None)
+    assert artifacts == []
+    assert result == snapshot
+
+
+def test_filter_extractions_by_content_filter_text_only():
+    """``textOnly`` keeps the extraction but leaves only its text part."""
+    text_part = ContentPart(id="t", label="t", typeGroup="text", mimeType="text/plain", data="a")
+    image_part = ContentPart(id="i", label="i", typeGroup="image", mimeType="image/png", data="")
+    extraction = ContentExtracted(id="id1", parts=[text_part, image_part])
+    filtered = _filter_extractions_by_content_filter([extraction], "textOnly")
+    assert len(filtered) == 1
+    assert len(filtered[0].parts) == 1
+    assert filtered[0].parts[0].typeGroup == "text"
+
+
+def test_canonical_content_filter_is_case_insensitive():
+    """Known filter names map to their canonical spelling regardless of case; unknown names map to ``all``."""
+    expectations = (
+        ("imagesOnly", "imagesOnly"),
+        ("IMAGESONLY", "imagesOnly"),
+        ("textOnly", "textOnly"),
+        ("unknown", "all"),
+    )
+    for given, canonical in expectations:
+        assert _canonical_content_filter(given) == canonical
+
+
+def test_parse_presentation_parameters_content_filter_all_coerces_legacy_pdf_text():
+    """With content filter ``all``, a stored ``pdfExtractMode`` of ``text`` is resolved to ``all``.
+
+    Guards graphs that select "everything" but still carry the legacy PDF
+    mode, so image parts are not dropped from the presentation.
+    """
+    params = {"contentFilter": "all", "pdfExtractMode": "text"}
+    resolved = parse_presentation_parameters(params)
+    assert resolved["pdfExtractMode"] == "all"
+
+
+def test_parse_presentation_parameters_images_only_defaults_pdf_mode():
+    """Content filter ``imagesOnly`` without an explicit PDF mode resolves to ``images``."""
+    params = {"contentFilter": "imagesOnly"}
+    resolved = parse_presentation_parameters(params)
+    assert resolved["pdfExtractMode"] == "images"
+
+
+def test_presentation_lines_includes_redacted_image_parts_when_pdf_mode_all():
+    """In ``lines`` mode with PDF mode ``all``, the image part stays in ``data`` with its bytes blanked.
+
+    The text part keeps its lines, the image entry has empty ``lines`` and an
+    empty ``data`` string, and the bucket carries no ``imageParts`` key.
+    """
+    text_part = {"typeGroup": "text", "data": "body", "id": "t"}
+    image_part = {"typeGroup": "image", "mimeType": "image/png", "data": "YQ==", "id": "img1"}
+    payload = {
+        "fileOrder": ["f1"],
+        "files": {
+            "f1": {"sourceFileName": "x.pdf", "parts": [text_part, image_part]},
+        },
+    }
+    cfg = parse_presentation_parameters({"contentFilter": "all", "outputMode": "lines", "pdfExtractMode": "all"})
+    presentation = build_presentation_for_payload(payload, cfg)
+    bucket = presentation["files"]["f1"]
+    entries = bucket["data"]
+    assert len(entries) == 2
+    text_entry = entries[0]
+    assert text_entry["typeGroup"] == "text"
+    assert text_entry["lines"] == ["body"]
+    image_entry = entries[1]
+    assert image_entry["typeGroup"] == "image"
+    assert image_entry["lines"] == []
+    assert image_entry.get("data") == ""
+    assert "imageParts" not in bucket
+
+
+def test_build_presentation_for_extractions_matches_payload_path():
+    """Paragraph-split ``lines`` output for one text extraction.
+
+    The first file bucket reports ``outputMode`` ``lines``, holds a single data
+    entry whose lines are ``["a", "b"]``, and has no ``items`` key.
+    """
+    main_part = ContentPart(id="p", label="main", typeGroup="text", mimeType="text/plain", data="a\n\nb")
+    extraction = ContentExtracted(id="id", parts=[main_part])
+    cfg = parse_presentation_parameters({"outputMode": "lines", "splitBy": "paragraph"})
+    presentation = build_presentation_for_extractions([extraction], ["f.txt"], cfg)
+    first_key = presentation["fileOrder"][0]
+    bucket = presentation["files"][first_key]
+    assert bucket["outputMode"] == "lines"
+    assert len(bucket["data"]) == 1
+    assert bucket["data"][0]["lines"] == ["a", "b"]
+    assert "items" not in bucket
+
+
def test_joined_text_includes_csv_table_parts():
payload = {
"fileOrder": ["f1"],
@@ -44,47 +169,6 @@ def test_joined_text_includes_csv_table_parts():
assert _joined_text_from_handover_payload(payload) == "a,b\n1,2"
-def test_split_images_moves_pixels_to_blob_docs():
- raw = b"fake-binary-image"
- b64 = base64.b64encode(raw).decode("ascii")
- payload = {
- "kind": HANDOVER_KIND,
- "schemaVersion": 1,
- "fileOrder": ["f1"],
- "files": {
- "f1": {
- "parts": [
- {"typeGroup": "text", "data": "x", "id": "t1"},
- {
- "typeGroup": "image",
- "mimeType": "image/png",
- "data": b64,
- "id": "p1-img",
- "metadata": {},
- },
- ]
- }
- },
- }
- stripped, blobs = _split_images_to_sidecar_documents(payload, document_name_stem="abc")
- assert len(blobs) == 1
- assert blobs[0].mimeType == "image/png"
- assert blobs[0].documentData == raw
- assert blobs[0].documentName.endswith(".png")
- assert blobs[0].documentName.startswith("extract_media_")
- meta = blobs[0].validationMetadata or {}
- assert meta.get("handoverRole") == "extractedMedia"
- img_parts = [
- p
- for p in stripped["files"]["f1"]["parts"]
- if isinstance(p, dict) and (p.get("typeGroup") or "") == "image"
- ]
- assert len(img_parts) == 1
- assert img_parts[0]["data"] == ""
- assert img_parts[0]["handoverMediaDocumentName"] == blobs[0].documentName
- assert "image" in stripped["files"]["f1"]["byTypeGroup"]
-
-
def _mixed_payload():
return {
"kind": HANDOVER_KIND,
@@ -106,7 +190,7 @@ def _mixed_payload():
def test_content_filter_all_is_noop():
payload = _mixed_payload()
result = _apply_content_filter(payload, "all")
- assert result is payload # same object, no copy
+ assert result is payload
def test_content_filter_text_only_keeps_text_table_structure():
@@ -129,7 +213,6 @@ def test_content_filter_no_images_removes_only_images():
parts = result["files"]["f1"]["parts"]
type_groups = {p["typeGroup"] for p in parts}
assert "image" not in type_groups
- # text, table, structure all remain
assert {"text", "table", "structure"} == type_groups
@@ -137,14 +220,7 @@ def test_content_filter_text_only_joined_text_has_no_image_data():
result = _apply_content_filter(_mixed_payload(), "textOnly")
text = _joined_text_from_handover_payload(result)
assert "hello" in text
- assert "abc=" not in text # base64 image data must not appear
-
-
-def test_content_filter_text_only_no_sidecars():
- """textOnly: no image parts → _split produces zero sidecars."""
- result = _apply_content_filter(_mixed_payload(), "textOnly")
- stripped, blobs = _split_images_to_sidecar_documents(result, document_name_stem="test")
- assert blobs == []
+ assert "abc=" not in text
def test_presentation_lines_and_response():
@@ -162,9 +238,12 @@ def test_presentation_lines_and_response():
}
cfg = parse_presentation_parameters({"outputMode": "lines", "splitBy": "paragraph"})
pres = build_presentation_for_payload(payload, cfg)
- assert pres["files"]["f1"]["outputMode"] == "lines"
- assert [it["text"] for it in pres["files"]["f1"]["items"]] == ["a", "b"]
- assert presentation_response_text(pres, payload) == "a\n\nb"
+ b1 = pres["files"]["f1"]
+ assert b1["outputMode"] == "lines"
+ assert isinstance(b1["data"], list)
+ assert len(b1["data"]) == 1
+ assert b1["data"][0]["lines"] == ["a", "b"]
+ assert presentation_response_text(pres) == "a\n\nb"
def test_presentation_pdf_mode_tables_only():
@@ -182,7 +261,9 @@ def test_presentation_pdf_mode_tables_only():
}
cfg = parse_presentation_parameters({"pdfExtractMode": "tables", "outputMode": "blob"})
pres = build_presentation_for_payload(payload, cfg)
- assert pres["files"]["f1"]["text"] == "h1,h2\n1,2"
+ bf = pres["files"]["f1"]
+ assert isinstance(bf["data"], str)
+ assert bf["data"] == "h1,h2\n1,2"
def test_presentation_csv_rows():
@@ -195,7 +276,7 @@ def test_presentation_csv_rows():
},
},
}
- cfg = parse_presentation_parameters({"csvHeaderRow": "true"})
+ cfg = parse_presentation_parameters({"outputMode": "structured", "csvHeaderRow": "true"})
pres = build_presentation_for_payload(payload, cfg)
csv = pres["files"]["f1"]["csv"]
assert csv["headers"] == ["a", "b"]
@@ -222,6 +303,11 @@ def test_presentation_pages_groups_by_page_index():
(0, ["p0"]),
(1, ["p1a", "p1b"]),
]
+ pdata = pres["files"]["f1"]["data"]
+ assert pdata == [
+ {"pageIndex": 0, "lines": ["p0"]},
+ {"pageIndex": 1, "lines": ["p1a", "p1b"]},
+ ]
def test_presentation_chunks_with_overlap_chars():
@@ -235,9 +321,10 @@ def test_presentation_chunks_with_overlap_chars():
pres = build_presentation_for_payload(payload, cfg)
texts = [c["text"] for c in pres["files"]["f1"]["chunks"]]
assert texts == ["abcd", "cdef", "efgh", "ghij"]
+ assert pres["files"]["f1"]["data"] == texts
-def test_presentation_stripped_payload_gains_presentation_key_after_split():
+def test_presentation_keeps_pres_key_after_inline_image_strip_simulation():
raw = b"x"
b64 = base64.b64encode(raw).decode("ascii")
payload = {
@@ -254,7 +341,339 @@ def test_presentation_stripped_payload_gains_presentation_key_after_split():
},
}
pres = build_presentation_for_payload(payload, parse_presentation_parameters({}))
- stripped, _blobs = _split_images_to_sidecar_documents(payload, document_name_stem="s")
- stripped["presentation"] = pres
- assert "presentation" in stripped
- assert stripped["presentation"]["files"]["f1"]["items"]
+ serial = _copy.deepcopy([{"id": "1", "parts": payload["files"]["f1"]["parts"]}])
+ stayed, arts = _persist_extracted_image_parts(serial, name_stem="s", run_context=None)
+ assert arts == []
+ wrapper = {**pres, "_meta": {}}
+ fk = pres["fileOrder"][0]
+ assert isinstance(wrapper["files"][fk].get("data"), list)
+ assert len(wrapper["files"][fk]["data"]) == 2
+
+
+def test_summarize_presentation_payload_shape():
+ payload = {
+ "fileOrder": ["f1"],
+ "files": {"f1": {"sourceFileName": "t.txt", "parts": [{"typeGroup": "text", "data": "hello", "id": "a"}]}},
+ }
+ pres = build_presentation_for_payload(payload, parse_presentation_parameters({"outputMode": "blob"}))
+ s = summarize_presentation_payload(pres)
+ assert s["fileOrder"] == ["f1"]
+ assert "f1" in s["files"]
+ assert s["files"]["f1"]["outputMode"] == "blob"
+ assert s["files"]["f1"]["stringLength"] == 5
+ assert "hello" in (s["files"]["f1"].get("head") or "")
+
+
+def test_joined_text_from_extract_node_data_uses_presentation_root():
+ from modules.workflows.methods.methodContext.actions.extractContent import PRESENTATION_KIND
+
+ data = {
+ "schemaVersion": 1,
+ "kind": PRESENTATION_KIND,
+ "outputMode": "lines",
+ "fileOrder": ["f1"],
+ "files": {"f1": {"outputMode": "lines", "sourceFileName": "x.txt", "data": ["body"]}},
+ "_meta": {"extractPayloadSchemaVersion": EXTRACT_PAYLOAD_SCHEMA_VERSION},
+ }
+ assert joined_text_from_extract_node_data(data) == "body"
+ assert data["_meta"]["extractPayloadSchemaVersion"] == EXTRACT_PAYLOAD_SCHEMA_VERSION
+
+
+def test_action_result_contract_new_extract_payload_keys():
+ from modules.workflows.methods.methodContext.actions.extractContent import PRESENTATION_KIND
+
+ data = {
+ "schemaVersion": 1,
+ "kind": PRESENTATION_KIND,
+ "outputMode": "lines",
+ "fileOrder": ["f1"],
+ "files": {"f1": {"outputMode": "lines", "sourceFileName": "x.txt", "data": ["body"]}},
+ "_meta": {"actionType": "context.extractContent", "extractPayloadSchemaVersion": EXTRACT_PAYLOAD_SCHEMA_VERSION},
+ }
+ assert data["kind"] == PRESENTATION_KIND
+ assert joined_text_from_extract_node_data(data) == "body"
+
+
+def test_automation_workspace_suppresses_extract_artifacts():
+ from modules.workflows.automation2.workflowArtifactVisibility import suppress_workflow_file_in_workspace_ui
+
+ assert suppress_workflow_file_in_workspace_ui({"fileName": "extracted_content_transient-abc_99.json"})
+ assert suppress_workflow_file_in_workspace_ui({"fileName": "extract_media_stem_uuid.png"})
+ assert not suppress_workflow_file_in_workspace_ui({"fileName": "export_2026.csv"})
+ assert suppress_workflow_file_in_workspace_ui({"fileName": "", "suppressInWorkflowFileLists": True})
+ assert suppress_workflow_file_in_workspace_ui({"fileName": "report.pdf", "tags": ["_workflowInternal"]})
+ assert not suppress_workflow_file_in_workspace_ui({"fileName": "report.pdf", "tags": ["invoice"]})
+
+
+def test_normalize_presentation_envelopes_action_result_and_list():
+ from modules.workflows.methods.methodContext.actions.extractContent import (
+ PRESENTATION_KIND,
+ normalize_presentation_envelopes,
+ )
+
+ pres = {
+ "kind": PRESENTATION_KIND,
+ "fileOrder": ["f1"],
+ "files": {"f1": {"outputMode": "lines", "sourceFileName": "x.txt", "data": []}},
+ }
+ wrapped = {"success": True, "data": pres}
+ assert len(normalize_presentation_envelopes(wrapped)) == 1
+ assert len(normalize_presentation_envelopes([wrapped])) == 1
+
+
+def test_method_base_preserves_run_context_injection():
+ from modules.workflows.methods.methodFile.methodFile import MethodFile
+
+ class _Svc:
+ pass
+
+ action_def = MethodFile(_Svc())._actions["create"]
+ validated = MethodFile(_Svc())._validateParameters(
+ {"context": "x", "outputFormat": "pdf", "_runContext": {"mandateId": "m", "instanceId": "i"}},
+ action_def.parameters,
+ )
+ assert validated.get("_runContext") == {"mandateId": "m", "instanceId": "i"}
+
+
+def test_presentation_envelopes_to_document_json_one_section_per_data_slot():
+ from modules.workflows.methods.methodContext.actions.extractContent import (
+ PRESENTATION_KIND,
+ presentation_envelopes_to_document_json,
+ )
+
+ pres = {
+ "kind": PRESENTATION_KIND,
+ "outputMode": "lines",
+ "fileOrder": ["f1"],
+ "files": {
+ "f1": {
+ "outputMode": "lines",
+ "sourceFileName": "a.pdf",
+ "data": [
+ {
+ "typeGroup": "text",
+ "mimeType": "text/plain",
+ "data": "ignored",
+ "lines": ["Line A", "Line B"],
+ },
+ ],
+ },
+ },
+ }
+ out = presentation_envelopes_to_document_json(
+ {"success": True, "data": pres},
+ title="T",
+ language="de",
+ )
+ paragraphs = [
+ s for s in out["documents"][0]["sections"]
+ if s.get("content_type") == "paragraph"
+ ]
+ assert len(paragraphs) == 1
+ runs = paragraphs[0]["elements"][0]["content"]["inlineRuns"]
+ joined = "".join(r.get("value", "") for r in runs)
+ assert "Line A" in joined
+ assert "Line B" in joined
+ assert "\n" in joined
+
+
+def test_presentation_envelopes_table_slot_becomes_table_section():
+ from modules.workflows.methods.methodContext.actions.extractContent import (
+ PRESENTATION_KIND,
+ presentation_envelopes_to_document_json,
+ )
+
+ pres = {
+ "kind": PRESENTATION_KIND,
+ "outputMode": "lines",
+ "fileOrder": ["f1"],
+ "files": {
+ "f1": {
+ "outputMode": "lines",
+ "sourceFileName": "sheet.csv",
+ "data": [
+ {
+ "typeGroup": "table",
+ "mimeType": "text/csv",
+ "data": '"Name","Amount"\n"Alice","100"\n"Bob","200"',
+ "lines": [],
+ },
+ ],
+ },
+ },
+ }
+ out = presentation_envelopes_to_document_json(
+ {"success": True, "data": pres},
+ title="T",
+ language="de",
+ )
+ tables = [s for s in out["documents"][0]["sections"] if s.get("content_type") == "table"]
+ assert len(tables) == 1
+ content = tables[0]["elements"][0]["content"]
+ assert content["headers"] == ["Name", "Amount"]
+ assert content["rows"] == [["Alice", "100"], ["Bob", "200"]]
+
+
+def test_presentation_line_slot_preserves_table_without_lines():
+ from modules.workflows.methods.methodContext.actions.extractContent import (
+ _presentation_line_slot_from_part,
+ _presentation_line_slots_from_part,
+ parse_presentation_parameters,
+ )
+
+ cfg = parse_presentation_parameters({"outputMode": "lines", "splitBy": "newline"})
+ part = {
+ "typeGroup": "table",
+ "mimeType": "text/csv",
+ "data": '"A","B"\n"1","2"\n"3","4"',
+ "id": "t1",
+ }
+ slot = _presentation_line_slot_from_part(part, cfg)
+ assert slot.get("lines") == []
+ assert slot.get("data") == part["data"]
+ slots = _presentation_line_slots_from_part(part, cfg)
+ assert len(slots) == 3
+ assert slots[0]["lines"] == ['"A","B"']
+ assert slots[1]["lines"] == ['"1","2"']
+
+
+def test_presentation_envelopes_preserves_data_slot_order_text_image_text():
+ import base64
+
+ from modules.workflows.methods.methodContext.actions.extractContent import (
+ PRESENTATION_KIND,
+ presentation_envelopes_to_document_json,
+ )
+
+ class _Mgmt:
+ def getFileData(self, _fid: str) -> bytes:
+ return base64.b64decode(
+ "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8BQDwAEhQGAhKmMIQAAAABJRU5ErkJggg=="
+ )
+
+ class _Svc:
+ interfaceDbComponent = _Mgmt()
+
+ pres = {
+ "kind": PRESENTATION_KIND,
+ "outputMode": "lines",
+ "fileOrder": ["f1"],
+ "files": {
+ "f1": {
+ "outputMode": "lines",
+ "sourceFileName": "a.pdf",
+ "data": [
+ {"typeGroup": "text", "mimeType": "text/plain", "lines": ["Before"]},
+ {
+ "typeGroup": "image",
+ "mimeType": "image/png",
+ "embeddedImageFileId": "00000000-0000-0000-0000-000000000001",
+ },
+ {"typeGroup": "text", "mimeType": "text/plain", "lines": ["After"]},
+ ],
+ },
+ },
+ }
+ out = presentation_envelopes_to_document_json(
+ {"success": True, "data": pres},
+ title="T",
+ language="de",
+ services=_Svc(),
+ )
+ types = [s.get("content_type") for s in out["documents"][0]["sections"]]
+ assert types == ["paragraph", "image", "paragraph"]
+
+
+def test_presentation_envelopes_to_document_json_text_slots():
+ from modules.workflows.methods.methodContext.actions.extractContent import (
+ PRESENTATION_KIND,
+ presentation_envelopes_to_document_json,
+ )
+
+ pres = {
+ "kind": PRESENTATION_KIND,
+ "outputMode": "lines",
+ "fileOrder": ["f1"],
+ "files": {
+ "f1": {
+ "outputMode": "lines",
+ "sourceFileName": "a.pdf",
+ "data": [
+ {
+ "typeGroup": "text",
+ "mimeType": "text/plain",
+ "data": "Hello",
+ "lines": ["Hello", "World"],
+ },
+ ],
+ },
+ },
+ }
+ out = presentation_envelopes_to_document_json(
+ [{"success": True, "data": pres}],
+ title="T",
+ language="de",
+ )
+ paragraphs = [
+ s for s in out["documents"][0]["sections"]
+ if s.get("content_type") == "paragraph"
+ ]
+ assert len(paragraphs) == 1
+ all_text = []
+ for p in paragraphs:
+ runs = p["elements"][0]["content"]["inlineRuns"]
+ all_text.append("".join(r.get("value", "") for r in runs))
+ assert any("Hello" in t for t in all_text)
+ assert any("World" in t for t in all_text)
+
+
+def test_presentation_envelopes_to_document_json_image_slot():
+ import base64
+
+ from modules.workflows.methods.methodContext.actions.extractContent import (
+ PRESENTATION_KIND,
+ presentation_envelopes_to_document_json,
+ )
+
+ fid = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"
+ pres = {
+ "kind": PRESENTATION_KIND,
+ "outputMode": "lines",
+ "fileOrder": ["f1"],
+ "files": {
+ "f1": {
+ "outputMode": "lines",
+ "sourceFileName": "a.pdf",
+ "data": [
+ {
+ "typeGroup": "image",
+ "mimeType": "image/png",
+ "embeddedImageFileId": fid,
+ "embeddedImageFileName": "clip.png",
+ },
+ ],
+ },
+ },
+ }
+
+ class _Mgmt:
+ def getFileData(self, file_id):
+ assert file_id == fid
+ return b"\x89PNG\r\n"
+
+ class _Svc:
+ interfaceDbComponent = _Mgmt()
+
+ out = presentation_envelopes_to_document_json(
+ pres,
+ title="Img",
+ language="de",
+ services=_Svc(),
+ )
+ img_secs = [
+ s for s in out["documents"][0]["sections"]
+ if s.get("content_type") == "image"
+ ]
+ assert len(img_secs) == 1
+ b64 = img_secs[0]["elements"][0]["content"]["base64Data"]
+ assert base64.b64decode(b64).startswith(b"\x89PNG")
diff --git a/tests/unit/workflow/test_merge_context_handover.py b/tests/unit/workflow/test_merge_context_handover.py
index c89de1e3..cd2bdfc3 100644
--- a/tests/unit/workflow/test_merge_context_handover.py
+++ b/tests/unit/workflow/test_merge_context_handover.py
@@ -45,7 +45,7 @@ async def test_mergeContext_handover_only_in_documents_yields_data_response():
}
result = await mergeContext(object(), {"dataSource": [item]})
assert result.success
- assert result.data
+ assert result.data.get("kind") == "context.mergeContext.v1"
assert result.data.get("response") == "only-from-handover"
@@ -176,3 +176,24 @@ async def test_mergeContext_accumulates_image_documents_only_across_iterations()
names = [d.get("documentName") for d in imgs]
assert "img_a.png" in names
assert "img_b.png" in names
+
+
+@pytest.mark.asyncio
+async def test_transform_context_envelope_has_kind_and_meta():
+ from modules.workflows.methods.methodContext.actions.transformContext import transformContext
+
+ svc = object()
+ result = await transformContext(
+ svc,
+ {
+ "mappings": [{"operation": "rename", "sourceField": "a", "outputField": "b"}],
+ "_upstreamPayload": {"a": 42},
+ },
+ )
+ assert result.success and result.data
+ assert result.data.get("kind") == "context.transformContext.v1"
+ assert result.data.get("schemaVersion") == 1
+ assert result.data.get("b") == 42
+ meta = result.data.get("_meta")
+ assert isinstance(meta, dict)
+ assert meta.get("actionType") == "context.transformContext"
diff --git a/tests/unit/workflow/test_phase3_context_node.py b/tests/unit/workflow/test_phase3_context_node.py
index 07496025..76fbc972 100644
--- a/tests/unit/workflow/test_phase3_context_node.py
+++ b/tests/unit/workflow/test_phase3_context_node.py
@@ -18,6 +18,7 @@ def test_context_extractContent_node_exists():
def test_context_extractContent_node_shape():
node = next(n for n in STATIC_NODE_TYPES if n["id"] == "context.extractContent")
assert node["category"] == "context"
+ assert node.get("injectRunContext") is True
assert node["meta"]["usesAi"] is False
assert node["_method"] == "context"
assert node["_action"] == "extractContent"
@@ -43,7 +44,16 @@ def test_context_extractContent_node_shape():
]
pick_paths = [opt["path"] for opt in node["outputPorts"][0]["dataPickOptions"]]
- assert ["documents", 0, "documentData", "presentation"] in pick_paths
+ assert ["data", "files"] in pick_paths
+ assert ["data", "_meta"] in pick_paths
+
+
+
+def test_context_transformContext_has_envelope_data_pick_paths():
+ node = next(n for n in STATIC_NODE_TYPES if n["id"] == "context.transformContext")
+ pick_paths = [opt["path"] for opt in node["outputPorts"][0]["dataPickOptions"]]
+ assert ["data"] in pick_paths
+ assert ["data", "_meta"] in pick_paths
def test_udm_port_types_registered():
@@ -85,6 +95,14 @@ def test_getExecutor_dispatches_context():
assert isinstance(executor, ActionNodeExecutor)
+def test_context_mergeContext_has_envelope_data_pick_paths():
+ node = next(n for n in STATIC_NODE_TYPES if n["id"] == "context.mergeContext")
+ pick_paths = [opt["path"] for opt in node["outputPorts"][0]["dataPickOptions"]]
+ assert ["data"] in pick_paths
+ assert ["data", "_meta"] in pick_paths
+ assert ["merged"] in pick_paths
+
+
def test_context_mergeContext_surfaces_data_pick_paths_match_node_outputs():
"""DataPicker uses paths like ``merged``; executor must surface ``data.*`` to top level."""
node = next(n for n in STATIC_NODE_TYPES if n["id"] == "context.mergeContext")
diff --git a/tests/unit/workflow/test_serialize_context_and_file_create_context.py b/tests/unit/workflow/test_serialize_context_and_file_create_context.py
deleted file mode 100644
index 57ae3823..00000000
--- a/tests/unit/workflow/test_serialize_context_and_file_create_context.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# Copyright (c) 2025 Patrick Motsch
-# All rights reserved.
-
-import json
-
-from modules.workflows.methods.methodAi._common import serialize_context
-from modules.serviceCenter.services.serviceGeneration.subDocumentUtility import (
- enhancePlainTextWithMarkdownTables,
- markdownToDocumentJson,
-)
-from modules.workflows.methods.methodFile.actions.create import (
- _collect_image_documents_only,
- _context_string_for_report,
-)
-
-
-def test_serialize_context_nonserializable_embeds_via_default_str():
- class _Ns:
- def __str__(self):
- return "ns"
-
- s = serialize_context({"x": _Ns(), "n": 1})
- parsed = json.loads(s)
- assert parsed["n"] == 1
- assert "ns" in parsed["x"]
-
-
-def test_serialize_context_strips_bom_on_plain_string():
- assert serialize_context("\ufeffhello") == "hello"
-
-
-def test_context_string_docx_prefers_response_over_full_dict():
- body = "Datum;Mandant\n2026-01-01;acme"
- ctx = {"response": "\ufeff" + body, "data": {"foo": 1}}
- assert _context_string_for_report(ctx, "docx") == body
-
-
-def test_context_string_json_serializes_full_structure():
- ctx = {"response": "hi", "data": {"foo": 1}}
- out = _context_string_for_report(ctx, "json")
- assert json.loads(out)["data"]["foo"] == 1
-
-
-def test_serialize_context_prefers_response_when_json_fails():
- d: dict = {"response": "plain", "n": 1}
- d["_loop"] = d # circular — json.dumps fails
- assert serialize_context(d).strip() == "plain"
-
-
-def test_serialize_context_prefer_handover_primary_skips_metadata():
- blob = {"response": "LINE", "data": {"nested": {"x" * 200}}, "extra": {"y": 2}}
- s = serialize_context(blob, prefer_handover_primary=True)
- assert s == "LINE"
-
-
-def test_context_string_plain_str_passthrough_docx():
- assert _context_string_for_report(" hello ", "docx") == "hello"
-
-
-def test_collect_image_documents_nested_paths():
- imgs = [{"documentName": "m.png", "mimeType": "image/png"}]
- assert _collect_image_documents_only({"merged": {"imageDocumentsOnly": imgs}}) == imgs
- assert _collect_image_documents_only({"data": {"merged": {"imageDocumentsOnly": imgs}}}) == imgs
-
-
-def test_context_string_prefers_merged_response_over_inputs_noise():
- raw = {"merged": {"response": "from-merged"}, "inputs": {"0": {"documentData": "X" * 10000}}}
- assert _context_string_for_report(raw, "docx") == "from-merged"
-
-
-def test_context_string_fallback_json_strips_heavy_keys():
- raw = {"foo": 1, "inputs": {"nasty": True}, "imageDocumentsOnly": [{"documentName": "x"}]}
- out = _context_string_for_report(raw, "docx")
- parsed = json.loads(out)
- assert "inputs" not in parsed
- assert "imageDocumentsOnly" not in parsed
- assert parsed["foo"] == 1
-
-
-def test_enhance_plain_csv_semicolon_to_markdown_table():
- body = "Datum;Betrag\n2026-01-01;12.50\n2026-01-02;3.00"
- out = enhancePlainTextWithMarkdownTables(body)
- assert "| Datum |" in out
- assert "| Betrag |" in out
- assert "---" in out
-
-
-def test_enhance_preserves_normal_paragraphs():
- body = "Ein Absatz ohne Raster.\n\nZweiter Gedanke."
- assert enhancePlainTextWithMarkdownTables(body) == body
-
-
-def test_enhance_then_markdown_json_contains_table_section():
- body = "Datum;Betrag\n2026-01-01;12\n2026-01-02;3"
- enhanced = enhancePlainTextWithMarkdownTables(body)
- doc = markdownToDocumentJson(enhanced, "Report", "de")
- sections = doc["documents"][0]["sections"]
- assert any(s.get("content_type") == "table" for s in sections)