From 64b58802a4ec8d738b9255588ddcd0f7ac7f3217 Mon Sep 17 00:00:00 2001 From: Ida Date: Thu, 14 May 2026 16:41:43 +0200 Subject: [PATCH] fix: handover nochmal zentralisiert --- .../graphicalEditor/nodeDefinitions/ai.py | 8 +- .../nodeDefinitions/context.py | 66 +- .../graphicalEditor/nodeDefinitions/file.py | 7 +- .../graphicalEditor/nodeDefinitions/flow.py | 23 + modules/features/graphicalEditor/portTypes.py | 34 +- .../graphicalEditor/upstreamPathsService.py | 31 +- modules/interfaces/interfaceDbManagement.py | 6 + modules/routes/routeAutomationWorkspace.py | 4 +- .../extractors/extractorPdf.py | 217 ++- .../renderers/rendererPdf.py | 3 +- .../workflows/automation2/executionEngine.py | 69 +- .../executors/actionNodeExecutor.py | 90 +- .../automation2/executors/flowExecutor.py | 34 +- modules/workflows/automation2/graphUtils.py | 12 +- .../automation2/pickNotPushMigration.py | 134 +- .../automation2/workflowArtifactVisibility.py | 32 + modules/workflows/methods/methodAi/_common.py | 43 + modules/workflows/methods/methodBase.py | 10 +- .../methodContext/actions/extractContent.py | 1212 ++++++++++++++--- .../methodContext/actions/mergeContext.py | 22 +- .../methodContext/actions/transformContext.py | 3 +- .../methods/methodContext/contextEnvelope.py | 42 + .../methods/methodContext/methodContext.py | 20 +- .../methods/methodFile/actions/create.py | 339 ++--- .../methods/methodFile/methodFile.py | 7 +- .../workflow/test_extract_content_handover.py | 545 +++++++- .../workflow/test_merge_context_handover.py | 23 +- .../unit/workflow/test_phase3_context_node.py | 20 +- ...rialize_context_and_file_create_context.py | 98 -- 29 files changed, 2439 insertions(+), 715 deletions(-) create mode 100644 modules/workflows/automation2/workflowArtifactVisibility.py create mode 100644 modules/workflows/methods/methodContext/contextEnvelope.py delete mode 100644 tests/unit/workflow/test_serialize_context_and_file_create_context.py diff --git a/modules/features/graphicalEditor/nodeDefinitions/ai.py b/modules/features/graphicalEditor/nodeDefinitions/ai.py index ecdebcf6..a709f0be 100644 --- a/modules/features/graphicalEditor/nodeDefinitions/ai.py +++ b/modules/features/graphicalEditor/nodeDefinitions/ai.py @@ -6,6 +6,9 @@ from modules.shared.i18nRegistry import t from modules.features.graphicalEditor.nodeDefinitions.contextPickerHelp import ( CONTEXT_BUILDER_PARAM_DESCRIPTION, ) +from modules.features.graphicalEditor.nodeDefinitions.flow import ( + CONTEXT_ENVELOPE_DATA_PICK_OPTIONS, +) # Shared authoritative DataPicker paths (same handover idea as ``context.extractContent`` outputPorts). ACTION_RESULT_DATA_PICK_OPTIONS = [ @@ -43,6 +46,7 @@ ACTION_RESULT_DATA_PICK_OPTIONS = [ ] AI_RESULT_DATA_PICK_OPTIONS = [ + *CONTEXT_ENVELOPE_DATA_PICK_OPTIONS, { "path": ["documents", 0, "documentData"], "pickerLabel": t("Gesamter Inhalt"), @@ -50,14 +54,14 @@ AI_RESULT_DATA_PICK_OPTIONS = [ "Hauptausgabedatei oder strukturierter Inhalt von ``documents[0]`` " "(z. B. erzeugtes Dokument, JSON-Handover)." ), - "recommended": True, + "recommended": False, "type": "Any", }, { "path": ["response"], "pickerLabel": t("Nur Text"), "detail": t("Modell-Antwort als reiner Fließtext (ohne eingebettete Bildbytes)."), - "recommended": True, + "recommended": False, "type": "str", }, { diff --git a/modules/features/graphicalEditor/nodeDefinitions/context.py b/modules/features/graphicalEditor/nodeDefinitions/context.py index 26c5b788..22e068dd 100644 --- a/modules/features/graphicalEditor/nodeDefinitions/context.py +++ b/modules/features/graphicalEditor/nodeDefinitions/context.py @@ -4,7 +4,10 @@ from modules.shared.i18nRegistry import t -from modules.features.graphicalEditor.nodeDefinitions.flow import CONTEXT_MERGE_ACTION_RESULT_DATA_PICK_OPTIONS +from modules.features.graphicalEditor.nodeDefinitions.flow import ( + CONTEXT_ENVELOPE_DATA_PICK_OPTIONS, + CONTEXT_MERGE_ACTION_RESULT_DATA_PICK_OPTIONS, +) _CONTEXT_INPUT_SCHEMAS = [ "Transit", @@ -27,11 +30,12 @@ CONTEXT_NODES = [ "category": "context", "label": t("Inhalt extrahieren"), "description": t( - "Extrahiert Inhalt ohne KI. Ergebnis einheitlich wie KI-Schritte: `response` " - "(gesammelter Klartext), strukturierte JSON-Unterlage in `documents[0]`, " - "einzelne Bilder als eigene Dokumente `extract_media_*` (nur im Workflow, ohne Eintrag unter „Meine Dateien“) — " - "Auswahl im Daten-Picker wie bei `ai.process`." + "Extrahiert Inhalt ohne KI. ``data`` ist die gewählte **Presentation** (`fileOrder`, `files` je " + "Quelldatei, kanonisches `data` pro Bucket) plus ``_meta`` (Quellnamen, Operation, Persist). " + "``response`` für diesen Knoten bleibt leer — kein zusätzlicher Fließtext. " + "``imageDocumentsOnly`` enthält Bilder über persistierte Artefakte." ), + "injectRunContext": True, "parameters": [ {"name": "documentList", "type": "str", "required": True, "frontendType": "hidden", "description": t("Dokumentenliste (via Wire oder DataRef)"), "default": "", @@ -51,7 +55,7 @@ CONTEXT_NODES = [ }, "default": "all", "description": t( - "Welche Parts im Handover behalten werden. " + "Welche extrahierten Parts weiterverwendet werden. " "all = alle Typgruppen inkl. Bilder; " "textOnly = ausschliesslich Text-, Tabellen- und Struktur-Parts; " "imagesOnly = ausschliesslich Bild-Parts; " @@ -75,8 +79,7 @@ CONTEXT_NODES = [ }, "default": "lines", "description": t( - "Wie die extrahierten Inhalte unter ``presentation`` strukturiert werden " - "(zusaetzlich zu den unveraenderten ``parts`` im Handover)." + "Wie das Ergebnis unter ``files`` strukturiert wird (``outputMode``: blob, lines, …)." ), }, { @@ -238,10 +241,11 @@ CONTEXT_NODES = [ {"value": "all", "label": t("PDF/Parts: alle Typgruppen")}, ] }, - "default": "text", + "default": "all", "description": t( "Filtert fuer die Presentation-Schicht nach typeGroup/MIME " - "(gilt fuer alle Dokumenttypen analog, nicht nur PDF)." + "(gilt fuer alle Dokumenttypen analog, nicht nur PDF). " + "Passt zum Inhaltsfilter „Alles“; „Text & Tabellen“ blendet Bild-Parts in der Presentation aus." ), }, { @@ -271,51 +275,40 @@ CONTEXT_NODES = [ # Frontend uses only this list — no schema expansion merge for this port. "dataPickOptions": [ { - "path": ["documents", 0, "documentData"], - "pickerLabel": t("Gesamter Inhalt"), + "path": ["data"], + "pickerLabel": t("Vollständiges data-Objekt"), "detail": t( - "Strukturiertes Handover als JSON inklusive aller Textteile " - "und Verweisen auf ausgelagerte Bilder." + "Presentation-Envelope (``schemaVersion``, ``kind``, ``fileOrder``, ``files``) " + "plus ``_meta`` (``operationRef``, ``sourceFileNames``, Persist)." ), "recommended": True, "type": "Any", }, { - "path": ["documents", 0, "documentData", "presentation"], - "pickerLabel": t("Presentation (strukturierte Sicht)"), - "detail": t( - "Nur die konfigurierte Ausgabe-Struktur (blob/lines/pages/chunks/structured); " - "unveraenderte Roh-Parts bleiben im umschliessenden Handover." - ), + "path": ["data", "files"], + "pickerLabel": t("Alle Dateibuckets"), + "detail": t("Map Dateischlüssel → Bucket (Zeilenliste, Blob, CSV-Tabelle bei structured, …)."), "recommended": False, "type": "Any", }, - { - "path": ["response"], - "pickerLabel": t("Nur Text"), - "detail": t( - "Verketteter Klartext aus allen erkannten Textteilen." - ), - "recommended": True, - "type": "str", - }, { "path": ["imageDocumentsOnly"], "pickerLabel": t("Nur Bilder"), "detail": t( - "Nur die extrahierten Bilddokumente als Liste, ohne JSON-Handover." + "Nur die Bilder aus der Extraktion (persistierte Artefakte bzw. inline), " + "als Liste fuer nachgelagerte Schritte." ), "recommended": False, "type": "List[ActionDocument]", }, { - "path": ["documents"], - "pickerLabel": t("Alle Dateitypen"), + "path": ["data", "_meta"], + "pickerLabel": t("Metadaten (_meta)"), "detail": t( - "Alle Ausgabedokumente nacheinander: JSON-Handover und Bilder." + "``operationRef``, ``sourceFileNames``, Presentation-Parameter, Liste persistierter Bilder." ), "recommended": False, - "type": "List[ActionDocument]", + "type": "Any", }, ], } @@ -330,6 +323,8 @@ CONTEXT_NODES = [ "label": t("Kontext zusammenführen"), "description": t( "Führt eine Liste von Ergebnissen zu einem einzigen Kontext zusammen. " + "Ausgabe ``data``: versionierter Umschlag (``schemaVersion``, ``kind``), Felder wie " + "``merged`` / ``first`` / ``response`` sowie ``_meta``. " "Wähle als Datenquelle die Option Alle Schleifen-Ergebnisse einer Schleife, " "um alle Iterationsergebnisse in einem Datensatz zu vereinen." ), @@ -365,6 +360,8 @@ CONTEXT_NODES = [ "label": t("Kontext transformieren"), "description": t( "Verändert die Struktur des eingehenden Datenstroms. " + "Ausgabe ``data``: versionierter Umschlag (``schemaVersion``, ``kind``: transform), " + "konfigurierte Ausgabe-Felder und ``_meta``. " "Operationen pro Mapping: 'rename' (Key umbenennen), 'cast' (Typ konvertieren), " "'nest' (mehrere Felder unter neuem Objekt zusammenfassen), " "'flatten' (verschachteltes Objekt auf oberste Ebene heben), " @@ -423,6 +420,7 @@ CONTEXT_NODES = [ "dynamic": True, "deriveFrom": "mappings", "deriveNameField": "outputField", + "dataPickOptions": CONTEXT_ENVELOPE_DATA_PICK_OPTIONS, } }, "injectUpstreamPayload": True, diff --git a/modules/features/graphicalEditor/nodeDefinitions/file.py b/modules/features/graphicalEditor/nodeDefinitions/file.py index 8d4b390d..2b79f2e0 100644 --- a/modules/features/graphicalEditor/nodeDefinitions/file.py +++ b/modules/features/graphicalEditor/nodeDefinitions/file.py @@ -14,9 +14,8 @@ FILE_NODES = [ "category": "file", "label": t("Datei erstellen"), "description": t( - "Erstellt eine Datei aus Kontext. Nach „Inhalt extrahieren“: „response“ für reinen Text; " - "„Nur Bilder“ liefert alle extrahierten Bilder — Datei erstellen fasst sie zu einer PDF oder DOCX " - "(Ausgabeformat pdf oder docx wählen)." + "Erstellt eine Datei aus der Presentation von „Inhalt extrahieren“ " + "(``data`` oder Schleifen-``bodyResults``). Ausgabe über den Generation-Service." ), "parameters": [ {"name": "outputFormat", "type": "str", "required": True, "frontendType": "select", @@ -29,7 +28,7 @@ FILE_NODES = [ "default": ""}, {"name": "context", "type": "Any", "required": False, "frontendType": "contextBuilder", "description": CONTEXT_BUILDER_PARAM_DESCRIPTION, "default": "", - "graphInherit": {"port": 0, "kind": "primaryTextRef"}}, + "graphInherit": {"port": 0, "kind": "recommendedDataPickRef"}}, ], "inputs": 1, "outputs": 1, diff --git a/modules/features/graphicalEditor/nodeDefinitions/flow.py b/modules/features/graphicalEditor/nodeDefinitions/flow.py index e47a063e..b2fc020b 100644 --- a/modules/features/graphicalEditor/nodeDefinitions/flow.py +++ b/modules/features/graphicalEditor/nodeDefinitions/flow.py @@ -63,6 +63,28 @@ LOOP_ITEM_DATA_PICK_OPTIONS = [ }, ] +# Base paths when ``ActionResult.data`` uses envelope + ``_meta`` (context.extractContent-style clarity). +CONTEXT_ENVELOPE_DATA_PICK_OPTIONS = [ + { + "path": ["data"], + "pickerLabel": t("Vollständiges data-Objekt"), + "detail": t( + "Versionierter Kontext-Umschlag: ``schemaVersion``, ``kind``, Nutzdatenfelder, ``_meta``." + ), + "recommended": True, + "type": "Dict", + }, + { + "path": ["data", "_meta"], + "pickerLabel": t("Technische Metadaten (_meta)"), + "detail": t( + "`actionType`, Payload-Schema-Version; bei Transform/Merge keine großen Payloads." + ), + "recommended": False, + "type": "Any", + }, +] + MERGE_RESULT_DATA_PICK_OPTIONS = [ { "path": ["merged"], @@ -90,6 +112,7 @@ MERGE_RESULT_DATA_PICK_OPTIONS = [ # Extended picker for ``context.mergeContext`` (ActionResult + ``surfaceDataAsTopLevel``): same # merge keys as ``flow.merge`` plus ``count`` from the action payload. CONTEXT_MERGE_ACTION_RESULT_DATA_PICK_OPTIONS = [ + *CONTEXT_ENVELOPE_DATA_PICK_OPTIONS, *MERGE_RESULT_DATA_PICK_OPTIONS, { "path": ["count"], diff --git a/modules/features/graphicalEditor/portTypes.py b/modules/features/graphicalEditor/portTypes.py index 24a97446..0784e436 100644 --- a/modules/features/graphicalEditor/portTypes.py +++ b/modules/features/graphicalEditor/portTypes.py @@ -315,14 +315,18 @@ PORT_TYPE_CATALOG: Dict[str, PortSchema] = { # bindings like `processDocuments → documents → *` for syncToAccounting. PortField(name="documents", type="List[ActionDocument]", required=False, description=( - "Dokumentliste: Index 0 oft JSON-Handover oder Hauptdatei; Einträge mit " - "MIME image/* oder Namen extract_media_* sind ausgelagerte Bilder (documentData = Binär)." + "Dokumentliste für Actions mit echten Artefakt-Dokumenten. " + "Beim Knoten „Inhalt extrahieren“ fehlt dieses Feld in der Knotenausgabe." ), picker_label=t("Alle Ausgabe-Dokumente"), picker_item_label=t("je Dokument"), ), PortField(name="data", type="Dict", required=False, - description="Ergebnisdaten", + description=( + "Strukturierter Inhalt. Bei **context.extractContent**: **Presentation**-Root " + "(`schemaVersion`, `kind`, `fileOrder`, `files`) plus **`_meta`** — ohne " + "zusätzliches `response`/`contentExtracted`-Duplikat." + ), picker_label=t("Technische Detaildaten (data)")), # Mirror AiResult primary text fields so DataPicker / primaryTextRef behave the same PortField(name="prompt", type="str", required=False, @@ -330,7 +334,8 @@ PORT_TYPE_CATALOG: Dict[str, PortSchema] = { picker_label=t("Auslöser / Prompt (falls vorhanden)")), PortField(name="response", type="str", required=False, description=( - "Primär nur Fließtext (z. B. nach Extraktion: alle Text-Parts verkettet, keine Bilder)." + "Fließtext wo die Action einen liefert. Bei **„Inhalt extrahieren“** absichtlich leer — " + "Inhalt liegt in ``data``.``files``." ), recommended=True, picker_label=t("Nur Fließtext (gesamt)")), @@ -339,12 +344,29 @@ PORT_TYPE_CATALOG: Dict[str, PortSchema] = { picker_label=t("Mitgegebener Kontext")), PortField(name="imageDocumentsOnly", type="List[ActionDocument]", required=False, description=( - "Nur Bildausgaben (ohne JSON-Handover), z. B. von context.extractContent." + "Nur Bild-bezogene Einträge. Bei „Inhalt extrahieren“: synthetische " + "Einträge mit ``fileId`` aus persistierten Extrakt-Bildern (kein separates JSON-Dokument)." ), picker_label=t("Nur Bilder (Liste)")), PortField(name="responseData", type="Dict", required=False, description="Optional: strukturierte Zusatzdaten", picker_label=t("Strukturierte Zusatzdaten")), + PortField(name="presentation", type="Dict", required=False, + description=( + "Selten: Top-Level-Spiegel von Präsentationsdaten andere Actions. " + "Bei „Inhalt extrahieren“ liegt alles direkt unter ``data`` (kein zusätzlicher Spiegel)." + ), + picker_label=t("Presentation (Top-Level-Spiegel)")), + PortField(name="presentationSummary", type="Dict", required=False, + description=( + "Kompakte Metadaten zu ``presentation`` (Debugging / traces)." + ), + picker_label=t("Presentation-Zusammenfassung")), + PortField(name="presentationConfig", type="Dict", required=False, + description=( + "Optional: Debugging-Konfiguration; bei Extract liegt die Primärquelle in ``validationMetadata`` des JSON-Dokuments." + ), + picker_label=t("Presentation-Konfiguration")), ]), "Transit": PortSchema(name="Transit", fields=[]), "UdmDocument": PortSchema(name="UdmDocument", carriesConnectionProvenance=True, fields=[ @@ -675,6 +697,8 @@ SYSTEM_VARIABLES: Dict[str, Dict[str, str]] = { # # When a parameter declares ``graphInherit.kind == "primaryTextRef"``, executeGraph # inserts an explicit DataRef before run (see pickNotPushMigration.materializePrimaryTextHandover). +# ``recommendedDataPickRef`` uses upstream ``outputPorts.dataPickOptions`` where ``recommended: true`` +# (see pickNotPushMigration.materializeRecommendedDataPickRef). # Schema names are catalog output port types (e.g. AiResult). PRIMARY_TEXT_HANDOVER_REF_PATH: Dict[str, List[Any]] = { diff --git a/modules/features/graphicalEditor/upstreamPathsService.py b/modules/features/graphicalEditor/upstreamPathsService.py index f0cb473e..13e84719 100644 --- a/modules/features/graphicalEditor/upstreamPathsService.py +++ b/modules/features/graphicalEditor/upstreamPathsService.py @@ -110,24 +110,29 @@ def compute_upstream_paths(graph: Dict[str, Any], target_node_id: str) -> List[D out0 = (ndef.get("outputPorts") or {}).get(0, {}) out0 = out0 if isinstance(out0, dict) else {} dpo = out0.get("dataPickOptions") - if isinstance(dpo, list) and len(dpo) > 0: + + bases: List[Dict[str, Any]] = [] + if isinstance(dpo, list): + bases = _paths_for_data_pick_options(dpo, aid) + derived = parse_graph_defined_output_schema(anode, out0) + derived_paths: List[Dict[str, Any]] = [] + if derived: + derived_paths = _paths_for_port_schema(derived, aid) + + merged_list = bases + derived_paths + if merged_list: plab = (anode.get("title") or "").strip() or aid - for entry in _paths_for_data_pick_options(dpo, aid): + for entry in merged_list: entry["producerLabel"] = plab paths.append(entry) continue - derived = parse_graph_defined_output_schema(anode, out0) - if derived: - for entry in _paths_for_port_schema(derived, aid): - entry["producerLabel"] = (anode.get("title") or "").strip() or aid - paths.append(entry) - else: - raw_schema = out0.get("schema") if isinstance(out0, dict) else None - schema_name = raw_schema if isinstance(raw_schema, str) and raw_schema else "ActionResult" - for entry in _paths_for_schema(schema_name, aid): - entry["producerLabel"] = (anode.get("title") or "").strip() or aid - paths.append(entry) + raw_schema = out0.get("schema") if isinstance(out0, dict) else None + schema_name = raw_schema if isinstance(raw_schema, str) and raw_schema else "ActionResult" + plab = (anode.get("title") or "").strip() or aid + for entry in _paths_for_schema(schema_name, aid): + entry["producerLabel"] = plab + paths.append(entry) # Lexical loop hints (flow.loop): only for nodes inside the loop body for aid in ancestors: diff --git a/modules/interfaces/interfaceDbManagement.py b/modules/interfaces/interfaceDbManagement.py index 794063f4..f412cea7 100644 --- a/modules/interfaces/interfaceDbManagement.py +++ b/modules/interfaces/interfaceDbManagement.py @@ -990,6 +990,10 @@ class ComponentObjects: If pagination is provided: PaginatedResult with items and metadata """ def _convertFileItems(files): + from modules.workflows.automation2.workflowArtifactVisibility import ( + suppress_workflow_file_in_workspace_ui, + ) + fileItems = [] for file in files: try: @@ -1002,6 +1006,8 @@ class ComponentObjects: fileName = file.get("fileName") if not fileName or fileName == "None": continue + if suppress_workflow_file_in_workspace_ui(file): + continue if file.get("scope") is None: file["scope"] = "personal" diff --git a/modules/routes/routeAutomationWorkspace.py b/modules/routes/routeAutomationWorkspace.py index b742d7ea..32624363 100644 --- a/modules/routes/routeAutomationWorkspace.py +++ b/modules/routes/routeAutomationWorkspace.py @@ -26,6 +26,7 @@ from modules.features.graphicalEditor.datamodelFeatureGraphicalEditor import ( AutoWorkflow, ) from modules.features.graphicalEditor.interfaceFeatureGraphicalEditor import graphicalEditorDatabase +from modules.workflows.automation2.workflowArtifactVisibility import suppress_workflow_file_in_workspace_ui from modules.shared.i18nRegistry import apiRouteContext routeApiMsg = apiRouteContext("routeAutomationWorkspace") @@ -265,7 +266,8 @@ def getWorkspaceRunDetail( logger.warning("getWorkspaceRunDetail: file lookup failed: %s", e) def _resolveFileList(ids: set[str]) -> list[dict]: - return [fileMetaById[fid] for fid in ids if fid in fileMetaById] + rows = [dict(fileMetaById[fid]) for fid in ids if fid in fileMetaById] + return [m for m in rows if not suppress_workflow_file_in_workspace_ui(m)] assignedFileIds: set[str] = set() for step, (inputIds, outputIds) in zip(steps, perStepFileIds): diff --git a/modules/serviceCenter/services/serviceExtraction/extractors/extractorPdf.py b/modules/serviceCenter/services/serviceExtraction/extractors/extractorPdf.py index 1df4e7fc..657e3fc6 100644 --- a/modules/serviceCenter/services/serviceExtraction/extractors/extractorPdf.py +++ b/modules/serviceCenter/services/serviceExtraction/extractors/extractorPdf.py @@ -73,7 +73,30 @@ class PdfExtractor(Extractor): )) return parts - # Extract text per page with PyMuPDF (same lib as in-place search - ensures extraction matches PDF text layer) + file_name = context.get("fileName", "document.pdf") + ordered_ok = False + try: + doc = fitz.open(stream=fileBytes, filetype="pdf") + for page_index in range(len(doc)): + page = doc[page_index] + page_parts = self._extract_page_blocks_in_reading_order( + page, + doc, + page_index=page_index, + root_id=rootId, + file_name=file_name, + ) + if page_parts: + parts.extend(page_parts) + ordered_ok = True + doc.close() + except Exception: + ordered_ok = False + + if ordered_ok and any(getattr(p, "typeGroup", "") in ("text", "image") for p in parts): + return parts + + parts = [parts[0]] # keep container only; fall back below try: doc = fitz.open(stream=fileBytes, filetype="pdf") for i in range(len(doc)): @@ -174,4 +197,196 @@ class PdfExtractor(Extractor): return parts + @staticmethod + def _text_from_text_block(block: Dict[str, Any]) -> str: + lines_out: List[str] = [] + for line in block.get("lines") or []: + if not isinstance(line, dict): + continue + spans = line.get("spans") or [] + line_text = "".join( + str(span.get("text") or "") + for span in spans + if isinstance(span, dict) + ) + lines_out.append(line_text) + return "\n".join(lines_out).strip() + @staticmethod + def _bbox_center(bbox: Any) -> tuple[float, float]: + if not isinstance(bbox, (list, tuple)) or len(bbox) < 4: + return 0.0, 0.0 + x0, y0, x1, y1 = float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3]) + return (x0 + x1) / 2.0, (y0 + y1) / 2.0 + + @staticmethod + def _point_inside_bbox(x: float, y: float, bbox: Any) -> bool: + if not isinstance(bbox, (list, tuple)) or len(bbox) < 4: + return False + x0, y0, x1, y1 = float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3]) + return x0 <= x <= x1 and y0 <= y <= y1 + + def _extract_page_blocks_in_reading_order( + self, + page: Any, + doc: Any, + *, + page_index: int, + root_id: str, + file_name: str, + ) -> List[ContentPart]: + """Emit text/image/table parts in on-page reading order (top-to-bottom, left-to-right).""" + entries: List[tuple[float, float, str, Dict[str, Any]]] = [] + table_bboxes: List[Any] = [] + + try: + table_finder = page.find_tables() + for ti, tab in enumerate(getattr(table_finder, "tables", []) or []): + try: + matrix = tab.extract() + except Exception: + matrix = None + if not matrix: + continue + csv_data = self._rows_to_csv_payload(matrix) + if not csv_data.strip(): + continue + bbox = getattr(tab, "bbox", None) + if bbox is not None: + table_bboxes.append(bbox) + cy, cx = self._bbox_center(bbox) + entries.append((cy, cx, "table", { + "label": f"table_{page_index + 1}_{ti}", + "data": csv_data, + "table_index": ti, + })) + except Exception: + pass + + try: + page_dict = page.get_text("dict", sort=True) + except Exception: + page_dict = None + blocks = page_dict.get("blocks") if isinstance(page_dict, dict) else None + if isinstance(blocks, list): + text_block_no = 0 + image_no = 0 + for block in blocks: + if not isinstance(block, dict): + continue + bbox = block.get("bbox") + cy, cx = self._bbox_center(bbox) + btype = block.get("type") + if btype == 0: + if any(self._point_inside_bbox(cx, cy, tb) for tb in table_bboxes): + continue + text = self._text_from_text_block(block) + if not text: + continue + label = f"page_{page_index + 1}" if text_block_no == 0 else f"page_{page_index + 1}_t{text_block_no}" + entries.append((cy, cx, "text", { + "label": label, + "data": text, + "text_block_no": text_block_no, + })) + text_block_no += 1 + continue + if btype != 1: + continue + img_bytes = block.get("image") + ext = str(block.get("ext") or "png").lower() + mime = f"image/{ext}" + if not img_bytes: + xref = block.get("xref") + if xref is not None: + try: + extracted = doc.extract_image(int(xref)) + img_bytes = extracted.get("image", b"") + ext = str(extracted.get("ext") or ext).lower() + mime = f"image/{ext}" + except Exception: + img_bytes = b"" + if not img_bytes: + continue + entries.append((cy, cx, "image", { + "label": f"image_{page_index + 1}_{image_no}", + "mime": mime, + "bytes": img_bytes, + "image_no": image_no, + })) + image_no += 1 + + entries.sort(key=lambda item: (item[0], item[1])) + out: List[ContentPart] = [] + for _y, _x, kind, payload in entries: + if kind == "text": + tbno = int(payload.get("text_block_no") or 0) + text = str(payload.get("data") or "") + out.append(ContentPart( + id=makeId(), + parentId=root_id, + label=str(payload.get("label") or f"page_{page_index + 1}"), + typeGroup="text", + mimeType="text/plain", + data=text, + metadata={ + "pages": 1, + "pageIndex": page_index, + "size": len(text.encode("utf-8")), + "contextRef": { + "containerPath": file_name, + "location": f"page:{page_index + 1}/block:{tbno}", + "pageIndex": page_index, + }, + }, + )) + elif kind == "table": + ti = int(payload.get("table_index") or 0) + csv_data = str(payload.get("data") or "") + out.append(ContentPart( + id=makeId(), + parentId=root_id, + label=str(payload.get("label") or f"table_{page_index + 1}_{ti}"), + typeGroup="table", + mimeType="text/csv", + data=csv_data, + metadata={ + "pageIndex": page_index, + "size": len(csv_data.encode("utf-8")), + "contextRef": { + "containerPath": file_name, + "location": f"page:{page_index + 1}/table:{ti}", + "pageIndex": page_index, + }, + }, + )) + elif kind == "image": + ino = int(payload.get("image_no") or 0) + img_bytes = payload.get("bytes") or b"" + mime = str(payload.get("mime") or "image/png") + out.append(ContentPart( + id=makeId(), + parentId=root_id, + label=str(payload.get("label") or f"image_{page_index + 1}_{ino}"), + typeGroup="image", + mimeType=mime, + data=base64.b64encode(img_bytes).decode("utf-8"), + metadata={ + "pageIndex": page_index, + "size": len(img_bytes), + "contextRef": { + "containerPath": file_name, + "location": f"page:{page_index + 1}/image:{ino}", + "pageIndex": page_index, + }, + }, + )) + return out + + @staticmethod + def _rows_to_csv_payload(rows: List[List[Any]]) -> str: + lines: List[str] = [] + for row in rows: + cells = [str(c or "").replace('"', '""') for c in row] + lines.append(",".join(f'"{c}"' for c in cells)) + return "\n".join(lines) diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py index f75a5108..7ec05c5c 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py @@ -670,7 +670,7 @@ class RendererPdf(BaseRenderer): runType = run.get("type", "text") value = self._escapeReportlabXml(run.get("value", "")) if runType == "text": - parts.append(value) + parts.append(value.replace("\n", "
")) elif runType == "bold": parts.append(f"{value}") elif runType == "italic": @@ -691,6 +691,7 @@ class RendererPdf(BaseRenderer): if not text: return "" s = self._escapeReportlabXml(text) + s = s.replace("\n", "
") s = _re_pdf.sub(r"\*\*(.+?)\*\*", r"\1", s, flags=_re_pdf.DOTALL) s = _re_pdf.sub(r"__(.+?)__", r"\1", s, flags=_re_pdf.DOTALL) s = _re_pdf.sub(r"(?\1", s) diff --git a/modules/workflows/automation2/executionEngine.py b/modules/workflows/automation2/executionEngine.py index 5f6a8592..f68a3feb 100644 --- a/modules/workflows/automation2/executionEngine.py +++ b/modules/workflows/automation2/executionEngine.py @@ -217,6 +217,30 @@ def _serializableOutputs(nodeOutputs: Dict[str, Any]) -> Dict[str, Any]: return _stripBinaryValues(cleaned) +def _merge_node_parameters_into_snap( + snap: Optional[Dict[str, Any]], + *, + node_id: Optional[str], + context: Optional[Dict[str, Any]], +) -> Dict[str, Any]: + """Copy wire snapshot and attach **nodeParameters** from the graph definition (by ``node_id``). + + Uses ``context['graphNodesById']`` populated at executeGraph start — stable even when + per-step node dict references differ. Field name is ``nodeParameters`` (no leading + underscore) so it survives consumers that hide ``_*`` keys.""" + merged: Dict[str, Any] = dict(snap or {}) + if not node_id or not isinstance(context, dict): + return merged + cmap = context.get("graphNodesById") + if not isinstance(cmap, dict): + return merged + gnode = cmap.get(node_id) + if not isinstance(gnode, dict): + return merged + merged["nodeParameters"] = dict(gnode.get("parameters") or {}) + return merged + + def _emitStepEvent(runId: str, stepData: Dict[str, Any]) -> None: """Emit a step-log SSE event to any listening client for this run.""" try: @@ -319,18 +343,20 @@ async def _ge_log_node_finished( loop_index: Optional[int] = None, loop_node_id: Optional[str] = None, loop_item: Optional[Any] = None, + exec_context: Optional[Dict[str, Any]] = None, ) -> None: """Append one execution line + one workflow-context snapshot (NDJSON).""" if file_logger is None or not run_id: return ts = _ge_iso_timestamp() + snap = _merge_node_parameters_into_snap(input_snap, node_id=node_id, context=exec_context) exec_rec: Dict[str, Any] = { "timestamp": ts, "runId": run_id, "nodeId": node_id, "nodeType": node_type, "status": status, - "input": _stripBinaryValues(dict(input_snap or {})), + "input": _stripBinaryValues(snap), } if skip_reason: exec_rec["skipReason"] = skip_reason @@ -470,6 +496,7 @@ async def _run_post_loop_done_nodes( for _sSrc, _, _ in connectionMap.get(_dnid, []): if _sSrc in nodeOutputs: _skipSnap[_sSrc] = nodeOutputs[_sSrc] + _skipSnap = _merge_node_parameters_into_snap(_skipSnap, node_id=_dnid, context=context) _skId = _createStepLog(automation2_interface, runId, _dnid, _dn.get("type", ""), status="skipped", inputSnapshot=_skipSnap) if _skId: _updateStepLog(automation2_interface, _skId, "skipped") @@ -478,6 +505,7 @@ async def _run_post_loop_done_nodes( run_id=runId, node_outputs=nodeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=_dnid, node_type=_dn.get("type", ""), status="skipped", @@ -494,6 +522,7 @@ async def _run_post_loop_done_nodes( for _src, _, _ in connectionMap.get(_dnid, []): if _src in nodeOutputs: _dIn[_src] = nodeOutputs[_src] + _dIn = _merge_node_parameters_into_snap(_dIn, node_id=_dnid, context=context) _dStepId = _createStepLog(automation2_interface, runId, _dnid, _dn.get("type", ""), "running", _dIn) try: _dres, _dRetry = await _executeWithRetry(_dexec, _dn, context) @@ -509,6 +538,7 @@ async def _run_post_loop_done_nodes( run_id=runId, node_outputs=nodeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=_dnid, node_type=_dn.get("type", ""), status="completed", @@ -525,6 +555,7 @@ async def _run_post_loop_done_nodes( run_id=runId, node_outputs=nodeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=_dnid, node_type=_dn.get("type", ""), status="completed", @@ -540,6 +571,7 @@ async def _run_post_loop_done_nodes( run_id=runId, node_outputs=nodeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=_dnid, node_type=_dn.get("type", ""), status="completed", @@ -556,6 +588,7 @@ async def _run_post_loop_done_nodes( run_id=runId, node_outputs=nodeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=_dnid, node_type=_dn.get("type", ""), status="failed", @@ -573,6 +606,7 @@ async def _run_post_loop_done_nodes( run_id=runId, node_outputs=nodeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=_dnid, node_type=_dn.get("type", ""), status="failed", @@ -622,6 +656,8 @@ async def executeGraph( from modules.workflows.automation2.pickNotPushMigration import ( materializeConnectionRefs, materializePrimaryTextHandover, + materializeRecommendedDataPickRef, + normalizeFileCreatePresentationRefs, ) from modules.workflows.automation2.featureInstanceRefMigration import ( materializeFeatureInstanceRefs, @@ -635,6 +671,8 @@ async def executeGraph( graph = materializeFeatureInstanceRefs(graph) graph = materializeConnectionRefs(graph) graph = materializePrimaryTextHandover(graph) + graph = materializeRecommendedDataPickRef(graph) + graph = normalizeFileCreatePresentationRefs(graph) nodeTypeIds = _getNodeTypeIds(services) logger.debug("executeGraph nodeTypeIds (%d): %s", len(nodeTypeIds), sorted(nodeTypeIds)) errors = validateGraph(graph, nodeTypeIds) @@ -720,6 +758,9 @@ async def executeGraph( env_for_run = normalize_run_envelope(run_envelope, user_id=userId) + graph_nodes_by_id: Dict[str, Any] = { + str(n["id"]): n for n in nodes if n.get("id") + } context = { "workflowId": workflowId, "instanceId": instanceId, @@ -732,6 +773,7 @@ async def executeGraph( "_runId": runId, "_orderedNodes": ordered, "runEnvelope": env_for_run, + "graphNodesById": graph_nodes_by_id, } # Lets graph actions (e.g. ``context.setContext`` human-task mode) call # ``createTask`` / ``updateRun`` without threading the interface through services. @@ -803,6 +845,7 @@ async def executeGraph( for _rSrc, _, _ in connectionMap.get(bnid, []): if _rSrc in nodeOutputs: _rInputSnap[_rSrc] = nodeOutputs[_rSrc] + _rInputSnap = _merge_node_parameters_into_snap(_rInputSnap, node_id=bnid, context=context) _rStepId = _createStepLog(automation2_interface, runId, bnid, body_node.get("type", ""), "running", _rInputSnap) try: result, _rRetry = await _executeWithRetry(executor, body_node, context) @@ -821,6 +864,7 @@ async def executeGraph( run_id=runId, node_outputs=nodeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=bnid, node_type=body_node.get("type", ""), status="completed", @@ -844,6 +888,7 @@ async def executeGraph( run_id=runId, node_outputs=nodeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=bnid, node_type=body_node.get("type", ""), status="completed", @@ -867,6 +912,7 @@ async def executeGraph( run_id=runId, node_outputs=nodeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=bnid, node_type=body_node.get("type", ""), status="completed", @@ -886,6 +932,7 @@ async def executeGraph( run_id=runId, node_outputs=nodeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=bnid, node_type=body_node.get("type", ""), status="failed", @@ -906,6 +953,7 @@ async def executeGraph( run_id=runId, node_outputs=nodeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=bnid, node_type=body_node.get("type", ""), status="failed", @@ -979,6 +1027,7 @@ async def executeGraph( for _sSrc, _, _ in connectionMap.get(nodeId, []): if _sSrc in nodeOutputs: _skipInputSnap[_sSrc] = nodeOutputs[_sSrc] + _skipInputSnap = _merge_node_parameters_into_snap(_skipInputSnap, node_id=nodeId, context=context) _skipStepId = _createStepLog(automation2_interface, runId, nodeId, nodeType, status="skipped", inputSnapshot=_skipInputSnap) if _skipStepId: _updateStepLog(automation2_interface, _skipStepId, "skipped") @@ -987,6 +1036,7 @@ async def executeGraph( run_id=runId, node_outputs=nodeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=nodeId, node_type=nodeType, status="skipped", @@ -1015,6 +1065,7 @@ async def executeGraph( for _lSrc, _, _ in connectionMap.get(nodeId, []): if _lSrc in nodeOutputs: _loopInputSnap[_lSrc] = nodeOutputs[_lSrc] + _loopInputSnap = _merge_node_parameters_into_snap(_loopInputSnap, node_id=nodeId, context=context) _stepId = _createStepLog(automation2_interface, runId, nodeId, nodeType, "running", _loopInputSnap) result = await executor.execute(node, context) items = result.get("items") or [] @@ -1068,6 +1119,9 @@ async def executeGraph( for _bSnapSrc, _, _ in connectionMap.get(bnid, []): if _bSnapSrc in _activeOutputs: _bInputSnapAlways[_bSnapSrc] = _activeOutputs[_bSnapSrc] + _bInputSnapAlways = _merge_node_parameters_into_snap( + _bInputSnapAlways, node_id=bnid, context=context + ) _bStepId = None if not _batchMode or _idx == 0 or _idx == len(items) - 1: _bStepId = _createStepLog( @@ -1100,6 +1154,7 @@ async def executeGraph( run_id=runId, node_outputs=_activeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=bnid, node_type=body_node.get("type", ""), status="completed", @@ -1123,6 +1178,7 @@ async def executeGraph( run_id=runId, node_outputs=_activeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=bnid, node_type=body_node.get("type", ""), status="completed", @@ -1148,6 +1204,7 @@ async def executeGraph( run_id=runId, node_outputs=_activeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=bnid, node_type=body_node.get("type", ""), status="completed", @@ -1168,6 +1225,7 @@ async def executeGraph( run_id=runId, node_outputs=_activeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=bnid, node_type=body_node.get("type", ""), status="failed", @@ -1189,6 +1247,7 @@ async def executeGraph( run_id=runId, node_outputs=_activeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=bnid, node_type=body_node.get("type", ""), status="failed", @@ -1296,6 +1355,7 @@ async def executeGraph( run_id=runId, node_outputs=nodeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=nodeId, node_type=nodeType, status="completed", @@ -1314,6 +1374,7 @@ async def executeGraph( for src, _, _ in connectionMap.get(nodeId, []): if src in nodeOutputs: _inputSnap[src] = nodeOutputs[src] + _inputSnap = _merge_node_parameters_into_snap(_inputSnap, node_id=nodeId, context=context) _stepId = _createStepLog(automation2_interface, runId, nodeId, nodeType, "running", _inputSnap) result, retryCount = await _executeWithRetry(executor, node, context) result = _normalizeResult(result, nodeType) @@ -1328,6 +1389,7 @@ async def executeGraph( run_id=runId, node_outputs=nodeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=nodeId, node_type=nodeType, status="completed", @@ -1342,6 +1404,7 @@ async def executeGraph( for src, _, _ in connectionMap.get(nodeId, []): if src in nodeOutputs: _inputSnap[src] = nodeOutputs[src] + _inputSnap = _merge_node_parameters_into_snap(_inputSnap, node_id=nodeId, context=context) _stepId = _createStepLog(automation2_interface, runId, nodeId, nodeType, "running", _inputSnap) result, retryCount = await _executeWithRetry(executor, node, context) result = _normalizeResult(result, nodeType) @@ -1356,6 +1419,7 @@ async def executeGraph( run_id=runId, node_outputs=nodeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=nodeId, node_type=nodeType, status="completed", @@ -1384,6 +1448,7 @@ async def executeGraph( run_id=runId, node_outputs=nodeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=nodeId, node_type=nodeType, status="completed", @@ -1411,6 +1476,7 @@ async def executeGraph( run_id=runId, node_outputs=nodeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=nodeId, node_type=nodeType, status="completed", @@ -1471,6 +1537,7 @@ async def executeGraph( run_id=runId, node_outputs=nodeOutputs, run_envelope=context.get("runEnvelope"), + exec_context=context, node_id=nodeId, node_type=nodeType, status="failed", diff --git a/modules/workflows/automation2/executors/actionNodeExecutor.py b/modules/workflows/automation2/executors/actionNodeExecutor.py index 5d03298f..d5a3fce8 100644 --- a/modules/workflows/automation2/executors/actionNodeExecutor.py +++ b/modules/workflows/automation2/executors/actionNodeExecutor.py @@ -21,10 +21,40 @@ from modules.features.graphicalEditor.portTypes import ( from modules.serviceCenter.services.serviceSubscription.mainServiceSubscription import SubscriptionInactiveException as _SubscriptionInactiveException from modules.serviceCenter.services.serviceBilling.mainServiceBilling import BillingContextError as _BillingContextError from modules.workflows.automation2.executors.inputExecutor import PauseForHumanTaskError +from modules.workflows.methods.methodContext.actions.extractContent import ( + PRESENTATION_KIND, + build_presentation_envelope_from_plain_text, + presentation_dict_without_meta, + presentation_response_text, +) logger = logging.getLogger(__name__) _FILE_CREATE_CTX_LOG_MAX = 500 +_SKIP_UNIFIED_PRESENTATION_NODES = frozenset({"context.extractContent"}) + + +def _attach_unified_presentation_data(out: Dict[str, Any], *, node_type: str) -> None: + """Ensure ``out[\"data\"]`` carries ``context.extractContent.presentation.v1`` for ``file.create``.""" + if node_type in _SKIP_UNIFIED_PRESENTATION_NODES: + return + data = out.get("data") + if isinstance(data, dict) and data.get("kind") == PRESENTATION_KIND: + return + text = str(out.get("response") or "").strip() + if not text and isinstance(data, dict): + text = str(data.get("response") or "").strip() + if not text: + return + pres = build_presentation_envelope_from_plain_text(text, source_name=node_type or "content") + if not pres: + return + meta: Dict[str, Any] = {"actionType": node_type} + if isinstance(data, dict): + prev = data.get("_meta") + if isinstance(prev, dict): + meta = {**prev, **meta} + out["data"] = {**pres, "_meta": meta} def _truncate_for_log(val: Any, max_len: int = _FILE_CREATE_CTX_LOG_MAX) -> str: @@ -147,6 +177,41 @@ def _image_documents_from_docs_list(docs_list: list) -> list: ] +def _image_refs_from_extract_node_data(extract_data: Any) -> list: + """Synthetic image document dicts from ``context.extractContent`` ``_meta.persistedImageArtifacts``.""" + if not isinstance(extract_data, dict): + return [] + meta = extract_data.get("_meta") + if not isinstance(meta, dict): + return [] + arts = meta.get("persistedImageArtifacts") + if not isinstance(arts, list): + return [] + out: list = [] + for a in arts: + if not isinstance(a, dict): + continue + fid = a.get("fileId") + if not fid: + continue + out.append( + { + "documentName": a.get("fileName") or f"extract_image_{fid}", + "mimeType": str(a.get("mimeType") or "application/octet-stream"), + "documentData": None, + "fileId": str(fid), + "_hasBinaryData": True, + "validationMetadata": { + "actionType": "context.extractContent", + "handoverRole": "extractedMedia", + "suppressInWorkflowFileLists": True, + "sourcePartId": a.get("sourcePartId"), + }, + } + ) + return out + + _USER_CONNECTION_ID_RE = re.compile( r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", re.IGNORECASE, @@ -679,9 +744,12 @@ class ActionNodeExecutor: extractedContext = "" rd_early = getattr(result, "data", None) if isinstance(rd_early, dict): - _r = rd_early.get("response") - if _r is not None and str(_r).strip(): - extractedContext = str(_r).strip() + if rd_early.get("kind") == PRESENTATION_KIND: + extractedContext = presentation_response_text(presentation_dict_without_meta(rd_early)).strip() + else: + _r = rd_early.get("response") + if _r is not None and str(_r).strip(): + extractedContext = str(_r).strip() promptText = str(resolvedParams.get("aiPrompt") or resolvedParams.get("prompt") or "").strip() resultData = getattr(result, "data", None) @@ -728,9 +796,17 @@ class ActionNodeExecutor: out.setdefault("context", ctx_str if ctx_str else "") rsp = str(out.get("response") or "").strip() if not rsp: - out["response"] = extractedContext or "" + if nodeType != "context.extractContent": + out["response"] = extractedContext or "" + else: + out["response"] = "" if result.success: img_only = _image_documents_from_docs_list(docsList) + if ( + nodeType == "context.extractContent" + and isinstance(result.data, dict) + ): + img_only = list(img_only) + _image_refs_from_extract_node_data(result.data) # mergeContext packs iterated payloads under ``data.merged`` only — ``documents`` # on the ActionResult is empty, so image sidecars live on ``merged.imageDocumentsOnly``. if ( @@ -766,6 +842,12 @@ class ActionNodeExecutor: _attachConnectionProvenance(cr_out, resolvedParams, outputSchema, chatService, self.services) return normalizeToSchema(cr_out, outputSchema) + if nodeType == "context.extractContent": + out.pop("documents", None) + + if outputSchema in ("AiResult", "ActionResult") and result.success: + _attach_unified_presentation_data(out, node_type=nodeType) + _attachConnectionProvenance(out, resolvedParams, outputSchema, chatService, self.services) # When the node declares ``surfaceDataAsTopLevel`` (typical for diff --git a/modules/workflows/automation2/executors/flowExecutor.py b/modules/workflows/automation2/executors/flowExecutor.py index e0836db8..e64b1212 100644 --- a/modules/workflows/automation2/executors/flowExecutor.py +++ b/modules/workflows/automation2/executors/flowExecutor.py @@ -295,14 +295,42 @@ class FlowExecutor: def _normalize_loop_items(self, raw: Any) -> List[Any]: """Coerce resolved `items` into a list (lists, dict children, or scalars).""" if isinstance(raw, list): - return raw + return self._expand_presentation_lines_loop_items(raw) if isinstance(raw, dict): children = raw.get("children") if isinstance(children, list) and len(children) > 0: - return children - return [{"name": k, "value": v} for k, v in raw.items()] + return self._expand_presentation_lines_loop_items(children) + items = [{"name": k, "value": v} for k, v in raw.items()] + return self._expand_presentation_lines_loop_items(items) return [raw] if raw is not None else [] + def _expand_presentation_lines_loop_items(self, items: List[Any]) -> List[Any]: + """When looping ``presentation.files`` in ``lines`` mode, iterate per slot (e.g. CSV row).""" + if not items: + return items + expanded: List[Any] = [] + saw_lines_bucket = False + for it in items: + if not isinstance(it, dict): + expanded.append(it) + continue + val = it.get("value") + if not isinstance(val, dict) or val.get("outputMode") != "lines": + expanded.append(it) + continue + data = val.get("data") + if not isinstance(data, list) or len(data) <= 1: + expanded.append(it) + continue + saw_lines_bucket = True + base_name = str(it.get("name") or val.get("sourceFileName") or "line") + for idx, slot in enumerate(data): + if not isinstance(slot, dict): + continue + sid = str(slot.get("id") or slot.get("label") or idx) + expanded.append({"name": f"{base_name}:{sid}", "value": slot}) + return expanded if saw_lines_bucket else items + def _apply_iteration_mode(self, items: List[Any], mode: str, stride: int) -> List[Any]: """Select which elements to iterate over (backend-defined modes).""" if not items: diff --git a/modules/workflows/automation2/graphUtils.py b/modules/workflows/automation2/graphUtils.py index 65f7084c..54cff2a1 100644 --- a/modules/workflows/automation2/graphUtils.py +++ b/modules/workflows/automation2/graphUtils.py @@ -435,6 +435,13 @@ def resolveParameterReferences(value: Any, nodeOutputs: Dict[str, Any]) -> Any: data = data.get("data", data) plist = list(path) resolved = _get_by_path(data, plist) + if resolved is None: + from modules.workflows.automation2.pickNotPushMigration import ( + remap_stale_presentation_ref_path, + ) + alt_path = remap_stale_presentation_ref_path(plist) + if alt_path != plist: + resolved = _get_by_path(data, alt_path) if resolved is None and isinstance(data, dict) and plist: if plist[0] == "payload" and len(plist) > 1: # Strip explicit "payload" prefix (legacy DataPicker paths) @@ -491,13 +498,10 @@ def resolveParameterReferences(value: Any, nodeOutputs: Dict[str, Any]) -> Any: # contextBuilder: list where every item is a `{"type":"ref",...}` envelope. # Resolve each part; a single ref preserves the resolved type (str, list, dict). if value and all(isinstance(v, dict) and v.get("type") == "ref" for v in value): - from modules.workflows.methods.methodAi._common import serialize_context - resolved_parts = [resolveParameterReferences(v, nodeOutputs) for v in value] if len(resolved_parts) == 1: return resolved_parts[0] - parts = [serialize_context(p, prefer_handover_primary=True) for p in resolved_parts] - return "\n\n".join(p for p in parts if p) + return resolved_parts return [resolveParameterReferences(v, nodeOutputs) for v in value] return value diff --git a/modules/workflows/automation2/pickNotPushMigration.py b/modules/workflows/automation2/pickNotPushMigration.py index b6da00a2..0bc7072f 100644 --- a/modules/workflows/automation2/pickNotPushMigration.py +++ b/modules/workflows/automation2/pickNotPushMigration.py @@ -5,6 +5,8 @@ Graph helpers for Pick-not-Push: materialize typed DataRefs before executeGraph - ``materializeConnectionRefs``: empty ``connectionReference`` from upstream connection provenance. - ``materializePrimaryTextHandover``: parameters whose static definition includes ``graphInherit.kind == "primaryTextRef"`` (canonical paths: ``PRIMARY_TEXT_HANDOVER_REF_PATH``). +- ``materializeRecommendedDataPickRef``: parameters with ``graphInherit.kind == "recommendedDataPickRef"`` + use the upstream output port's ``dataPickOptions`` entry with ``recommended: true``. Runtime: executeGraph deep-copies the version graph and applies these passes in order. """ @@ -12,7 +14,7 @@ from __future__ import annotations import copy import logging -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional from modules.features.graphicalEditor.nodeDefinitions import STATIC_NODE_TYPES from modules.features.graphicalEditor.portTypes import ( @@ -154,3 +156,133 @@ def materializePrimaryTextHandover(graph: Dict[str, Any]) -> Dict[str, Any]: ) return g + + +def _recommended_data_pick_path(out_port: Dict[str, Any]) -> Optional[List[Any]]: + opts = out_port.get("dataPickOptions") if isinstance(out_port, dict) else None + if not isinstance(opts, list): + return None + for opt in opts: + if not isinstance(opt, dict): + continue + if opt.get("recommended") is True: + path = opt.get("path") + if isinstance(path, list) and path: + return list(path) + return None + + +def materializeRecommendedDataPickRef(graph: Dict[str, Any]) -> Dict[str, Any]: + """Materialize empty parameters that declare ``graphInherit.kind == \"recommendedDataPickRef\"``.""" + g = copy.deepcopy(graph) + nodes: List[Dict[str, Any]] = g.get("nodes") or [] + connections = g.get("connections") or [] + if not nodes: + return g + + conn_map = buildConnectionMap(connections) + node_by_id = {n["id"]: n for n in nodes if n.get("id")} + + for node in nodes: + nid = node.get("id") + ntype = node.get("type") + if not nid or not ntype: + continue + node_def = _NODE_DEF_BY_ID.get(ntype) + if not node_def: + continue + params = node.get("parameters") + if not isinstance(params, dict): + node["parameters"] = {} + params = node["parameters"] + + for pdef in node_def.get("parameters") or []: + gi = pdef.get("graphInherit") + if not isinstance(gi, dict) or gi.get("kind") != "recommendedDataPickRef": + continue + pname = pdef.get("name") + if not pname: + continue + port_ix = int(gi.get("port", 0)) + if not _slot_empty_for_primary_text_inherit(params.get(pname)): + continue + input_sources = getInputSources(nid, conn_map) + if port_ix not in input_sources: + continue + src_id, _ = input_sources[port_ix] + src_node = node_by_id.get(src_id) or {} + src_def = _NODE_DEF_BY_ID.get(src_node.get("type") or "") + if not src_def: + continue + out_port = (src_def.get("outputPorts") or {}).get(port_ix, {}) or {} + if not isinstance(out_port, dict): + out_port = (src_def.get("outputPorts") or {}).get(0, {}) or {} + ref_path = _recommended_data_pick_path(out_port if isinstance(out_port, dict) else {}) + if not ref_path: + continue + ref = _data_ref(src_id, ref_path) + if pdef.get("frontendType") == "contextBuilder": + params[pname] = [ref] + else: + params[pname] = ref + logger.debug( + "materializeRecommendedDataPickRef: %s.%s -> ref %s path=%s", + nid, + pname, + src_id, + ref_path, + ) + + return g + + +_STALE_FILE_CREATE_CONTEXT_PATHS = frozenset({ + ("responseData",), + ("response",), + ("merged",), + ("documents", 0, "documentData"), +}) + + +def remap_stale_presentation_ref_path(path: List[Any]) -> List[Any]: + """Map legacy text-handover paths to unified presentation ``data``.""" + if tuple(path) in _STALE_FILE_CREATE_CONTEXT_PATHS: + return ["data"] + return list(path) + + +def _normalize_presentation_refs_in_value(val: Any) -> Any: + """Rewrite stale ref paths inside ``contextBuilder`` lists or bare refs.""" + if isinstance(val, dict) and val.get("type") == "ref": + path = val.get("path") + if isinstance(path, list) and path: + new_path = remap_stale_presentation_ref_path(path) + if new_path != path: + return {**val, "path": new_path} + return val + if isinstance(val, list): + return [_normalize_presentation_refs_in_value(item) for item in val] + return val + + +def normalizeFileCreatePresentationRefs(graph: Dict[str, Any]) -> Dict[str, Any]: + """Remap legacy ``file.create`` context refs to unified presentation ``data``.""" + g = copy.deepcopy(graph) + nodes: List[Dict[str, Any]] = g.get("nodes") or [] + for node in nodes: + if node.get("type") != "file.create": + continue + params = node.get("parameters") + if not isinstance(params, dict): + continue + ctx = params.get("context") + if ctx in (None, "", []): + continue + normalized = _normalize_presentation_refs_in_value(ctx) + if normalized != ctx: + params["context"] = normalized + logger.debug( + "normalizeFileCreatePresentationRefs: %s.context remapped to presentation data ref", + node.get("id"), + ) + return g diff --git a/modules/workflows/automation2/workflowArtifactVisibility.py b/modules/workflows/automation2/workflowArtifactVisibility.py new file mode 100644 index 00000000..0eb8d4bd --- /dev/null +++ b/modules/workflows/automation2/workflowArtifactVisibility.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025 Patrick Motsch +"""Heuristics for hiding internal workflow artefacts from user-facing file lists.""" + +from __future__ import annotations + +from typing import Any, Mapping, Optional + + +_WORKFLOW_INTERNAL_FILE_TAG = "_workflowInternal" + + +def suppress_workflow_file_in_workspace_ui(meta: Optional[Mapping[str, Any]]) -> bool: + """True when a file row should not appear in user-facing file lists. + + Used by Automation Workspace **and** ``/api/files/list`` (Meine Dateien). + Matches persisted JSON handovers from transient runs (``extracted_content_transient*``), + internal extract image files (``extract_media_*``), the ``_workflowInternal`` tag, and + optional explicit flags. + """ + if not isinstance(meta, Mapping): + return False + tags = meta.get("tags") + if isinstance(tags, list) and _WORKFLOW_INTERNAL_FILE_TAG in tags: + return True + fn = str(meta.get("fileName") or "").lower() + if "extracted_content_transient" in fn: + return True + if "extract_media_" in fn: + return True + if meta.get("suppressInWorkflowFileLists") is True: + return True + return False diff --git a/modules/workflows/methods/methodAi/_common.py b/modules/workflows/methods/methodAi/_common.py index 60609104..27b36663 100644 --- a/modules/workflows/methods/methodAi/_common.py +++ b/modules/workflows/methods/methodAi/_common.py @@ -30,6 +30,49 @@ def _handover_response_plain(val: Any) -> Optional[str]: return str(r).strip().lstrip("\ufeff") +def primary_text_for_prompt_context(val: Any) -> str: + """Flatten ActionResult / presentation / merge payloads to readable text. + + Used when merging multiple context-builder refs so extract outputs are not + turned into giant JSON via ``serialize_context`` (empty ``response``). + """ + if val is None: + return "" + if isinstance(val, str): + s = val.strip().lstrip("\ufeff") + if not s: + return "" + if len(s) >= 2 and ((s.startswith("[") and s.endswith("]")) or (s.startswith("{") and s.endswith("}"))): + try: + return primary_text_for_prompt_context(json.loads(s)) + except (json.JSONDecodeError, TypeError, ValueError): + pass + return s + if isinstance(val, list): + chunks = [primary_text_for_prompt_context(item) for item in val] + chunks = [c for c in chunks if c] + return "\n\n".join(chunks) + if isinstance(val, dict): + got = _handover_response_plain(val) + if got is not None: + return got + inner = val.get("data") + if isinstance(inner, dict): + from modules.workflows.methods.methodContext.actions.extractContent import ( + joined_text_from_extract_node_data, + ) + + t = (joined_text_from_extract_node_data(inner) or "").strip() + if t: + return t + from modules.workflows.methods.methodContext.actions.extractContent import ( + joined_text_from_extract_node_data, + ) + + return (joined_text_from_extract_node_data(val) or "").strip() + return str(val).strip() if str(val).strip() else "" + + def serialize_context(val: Any, *, prefer_handover_primary: bool = False) -> str: """Convert any context value to a readable string for use in AI prompts. diff --git a/modules/workflows/methods/methodBase.py b/modules/workflows/methods/methodBase.py index 02cae134..e666beff 100644 --- a/modules/workflows/methods/methodBase.py +++ b/modules/workflows/methods/methodBase.py @@ -202,7 +202,15 @@ class MethodBase: validated = {} # System parameters that should always be preserved, even if not in paramDefs - systemParams = ['parentOperationId', 'expectedDocumentFormats'] + systemParams = [ + 'parentOperationId', + 'expectedDocumentFormats', + # Injected by automation2 ActionNodeExecutor (graph node definitions) + '_runContext', + '_upstreamPayload', + '_branchInputs', + '_workflowNodeId', + ] for sysParam in systemParams: if sysParam in parameters: validated[sysParam] = parameters[sysParam] diff --git a/modules/workflows/methods/methodContext/actions/extractContent.py b/modules/workflows/methods/methodContext/actions/extractContent.py index 758d772e..866a0568 100644 --- a/modules/workflows/methods/methodContext/actions/extractContent.py +++ b/modules/workflows/methods/methodContext/actions/extractContent.py @@ -3,28 +3,27 @@ """context.extractContent — extracts content without AI. -Returns a unified handover compatible with AiResult-style downstream wiring: +``ActionResult.data`` is one **presentation** envelope (`schemaVersion`, `kind`, +`outputMode`, `fileOrder`, `files`) matching node parameters plus ``_meta`` (operation refs, +persisted-image trace, presentation config). -- ``documents[0]``: structured JSON (`context.extractContent.handover.v1`); image ``parts`` - keep metadata but omit pixel data; each dropped image references - ``handoverMediaDocumentName`` matching a sibling blob document. -- ``documents[1:]``: each extracted image as its own binary ``ActionDocument`` (like - ``ai.process`` artefact outputs). -- Root ``presentation`` inside the JSON (`schemaVersion`, per-file modes/lines/pages/chunks/…) - — built from filtered ``parts`` without changing extractor output. -- ``ActionResult.data["response"]`` plus normalized executor field ``response``: flat text derived - from ``presentation`` (downstream-friendly wie zuvor fuer ``file.create`` / ``primaryTextRef``).""" +Raw ``ContentExtracted`` is not emitted on the automation output; persistence still uses it +internally when ``_runContext`` enables image uploads. + +Older ``kind: context.extractContent.handover.v1`` is legacy-only (merge/tests), not produced here.""" import base64 as _b64 import binascii as _binascii +import copy import csv +import json import logging import re -from io import StringIO +from io import BytesIO, StringIO import time from typing import Any, Dict, List, Optional, Tuple -from modules.datamodels.datamodelChat import ActionResult, ActionDocument +from modules.datamodels.datamodelChat import ActionResult from modules.datamodels.datamodelDocref import coerceDocumentReferenceList from modules.datamodels.datamodelExtraction import ContentExtracted, ExtractionOptions @@ -32,9 +31,26 @@ logger = logging.getLogger(__name__) _UNSAFE_FILE_KEY = re.compile(r"[^\w\-.\(\)\[\]%@+]") -HANDOVER_KIND = "context.extractContent.handover.v1" +# Bumped when ``ActionResult.data`` shape changes (`_meta.extractPayloadSchemaVersion`). +EXTRACT_PAYLOAD_SCHEMA_VERSION = 3 + +LEGACY_HANDOVER_KIND = "context.extractContent.handover.v1" +HANDOVER_KIND = LEGACY_HANDOVER_KIND +PRESENTATION_KIND = "context.extractContent.presentation.v1" _CONTENT_FILTER_OPTIONS = ("all", "textOnly", "imagesOnly", "noImages") +_CONTENT_FILTER_BY_LOWER = {k.lower(): k for k in _CONTENT_FILTER_OPTIONS} + + +def _canonical_content_filter(raw: Any) -> str: + """Map JSON / UI values to canonical ``_CONTENT_FILTER_OPTIONS`` keys (case-insensitive).""" + s = str(raw if raw is not None else "all").strip() + if not s: + return "all" + if s in _CONTENT_FILTER_OPTIONS: + return s + return _CONTENT_FILTER_BY_LOWER.get(s.lower()) or "all" + PRESENTATION_SCHEMA_VERSION = 1 @@ -73,6 +89,39 @@ def _apply_content_filter(payload: Dict[str, Any], content_filter: str) -> Dict[ return result +def _filter_extractions_by_content_filter( + extracted_results: List[ContentExtracted], + content_filter: str, +) -> List[ContentExtracted]: + """Return copies with ``parts`` trimmed (same semantics as ``_apply_content_filter``).""" + if content_filter == "all": + return extracted_results + out: List[ContentExtracted] = [] + for ec in extracted_results: + parts = list(ec.parts or []) + if content_filter == "textOnly": + parts = [ + p + for p in parts + if (getattr(p, "typeGroup", None) or "") in ("text", "table", "structure") + ] + elif content_filter == "imagesOnly": + parts = [p for p in parts if (getattr(p, "typeGroup", None) or "") == "image"] + elif content_filter == "noImages": + parts = [p for p in parts if (getattr(p, "typeGroup", None) or "") != "image"] + copied = ec.model_copy(update={"parts": parts}) + out.append(copied) + return out + + +def _serialize_content_extracted_for_output(ec: ContentExtracted) -> Dict[str, Any]: + """Serialize for internal persist path (no exported ``summary``); not emitted on ``ActionResult.data``.""" + d = ec.model_dump(mode="json", exclude_none=True) if hasattr(ec, "model_dump") else ec.dict(exclude_none=True) + if isinstance(d, dict): + d.pop("summary", None) + return d + + def _default_extraction_options() -> ExtractionOptions: """No merge — keep all parts for downstream JSON selection.""" return ExtractionOptions( @@ -177,7 +226,13 @@ def _parse_non_negative_int(value: Any, default: int) -> int: def parse_presentation_parameters(parameters: Dict[str, Any]) -> Dict[str, Any]: - """Defaults match ``context.extractContent`` node schema in ``context.py``.""" + """Defaults match ``context.extractContent`` node schema in ``context.py``. + + ``contentFilter=all`` plus legacy default ``pdfExtractMode=text`` would drop + image parts from **presentation** even though extraction kept them — we + coerce that combination to ``all``. When ``pdfExtractMode`` is omitted, + sensible defaults derive from ``contentFilter``. + """ output_mode = str(parameters.get("outputMode") or "lines").strip().lower() if output_mode not in _OUTPUT_MODES: output_mode = "lines" @@ -187,9 +242,23 @@ def parse_presentation_parameters(parameters: Dict[str, Any]) -> Dict[str, Any]: chunk_unit = str(parameters.get("chunkSizeUnit") or "tokens").strip().lower() if chunk_unit not in _CHUNK_UNITS: chunk_unit = "tokens" - pdf_mode = str(parameters.get("pdfExtractMode") or "text").strip().lower() - if pdf_mode not in _PDF_EXTRACT_PRESENTATION_MODES: + content_filter = _canonical_content_filter(parameters.get("contentFilter")) + raw_pdf = parameters.get("pdfExtractMode") + raw_pdf_str = str(raw_pdf).strip() if raw_pdf is not None else "" + if raw_pdf_str: + pdf_mode = raw_pdf_str.lower() + elif content_filter == "imagesOnly": + pdf_mode = "images" + elif content_filter in ("textOnly", "noImages"): pdf_mode = "text" + else: + pdf_mode = "all" + if pdf_mode not in _PDF_EXTRACT_PRESENTATION_MODES: + pdf_mode = "all" + if content_filter == "all" and pdf_mode == "text": + pdf_mode = "all" + elif content_filter == "imagesOnly" and pdf_mode in ("text", "tables"): + pdf_mode = "images" return { "outputMode": output_mode, "splitBy": split_by, @@ -430,56 +499,191 @@ def _base_item_meta( return m +def summarize_presentation_payload(presentation: Dict[str, Any]) -> Dict[str, Any]: + """Compact shape for logs / run traces (no full ``data`` payload).""" + files_out: Dict[str, Any] = {} + for fk, bucket in (presentation.get("files") or {}).items(): + if not isinstance(bucket, dict): + continue + om = bucket.get("outputMode") + d = bucket.get("data") + shape: Dict[str, Any] = {"outputMode": om, "dataPythonType": type(d).__name__} + if isinstance(d, str): + shape["stringLength"] = len(d) + shape["head"] = d[:200] + shape["tail"] = d[-120:] if len(d) > 320 else None + elif isinstance(d, list): + shape["listLength"] = len(d) + if d: + el0 = d[0] + shape["firstElementPythonType"] = type(el0).__name__ + if isinstance(el0, str): + shape["firstStringLength"] = len(el0) + shape["firstHead"] = el0[:160] + elif isinstance(el0, dict): + shape["firstKeys"] = list(el0.keys())[:12] + files_out[str(fk)] = shape + return { + "schemaVersion": presentation.get("schemaVersion"), + "kind": presentation.get("kind"), + "rootOutputMode": presentation.get("outputMode"), + "fileOrder": presentation.get("fileOrder"), + "files": files_out, + } + + +def _joined_text_from_content_extracted_serial(items: List[Any]) -> str: + """Plain text from serialized ``contentExtracted`` list (dict items with ``parts``).""" + chunks: List[str] = [] + for item in items: + if not isinstance(item, dict): + continue + for p in item.get("parts") or []: + if not isinstance(p, dict): + continue + if not _part_carries_plain_text(p): + continue + raw = p.get("data") + if raw is None: + continue + s = str(raw).strip() + if s: + chunks.append(s) + return "\n\n".join(chunks) + + +def presentation_dict_without_meta(data: Dict[str, Any]) -> Dict[str, Any]: + """Strip ``_meta`` for helpers that expect a bare presentation envelope.""" + return {k: v for k, v in data.items() if k != "_meta"} + + +def joined_text_from_extract_node_data(data: Any) -> str: + """Primary text / mergeContext: presentation-root ``data``, ``contentExtracted``, or legacy handover.""" + if not isinstance(data, dict): + return "" + if data.get("kind") == PRESENTATION_KIND: + return presentation_response_text(presentation_dict_without_meta(data)) + ce = data.get("contentExtracted") + if isinstance(ce, list) and ce: + return _joined_text_from_content_extracted_serial(ce) + if data.get("files") is not None: + return _joined_text_from_handover_payload(data) + return "" + + def presentation_response_text( presentation: Dict[str, Any], - payload: Dict[str, Any], + file_order_hint: Optional[Any] = None, ) -> str: """Derive flattened ``response`` text from ``presentation.files``.""" - files_section = presentation.get("files") or {} - ordered = payload.get("fileOrder") - keys: List[str] = ordered if isinstance(ordered, list) and ordered else list(files_section.keys()) - chunks: List[str] = [] + keys: List[str] = [] + if isinstance(file_order_hint, dict): + ord0 = file_order_hint.get("fileOrder") + keys = ord0 if isinstance(ord0, list) and ord0 else [] + elif isinstance(file_order_hint, list): + keys = file_order_hint + if not keys: + po = presentation.get("fileOrder") + keys = po if isinstance(po, list) and po else list(files_section.keys()) + chunks_out: List[str] = [] for fk in keys: bucket = files_section.get(fk) if not isinstance(bucket, dict): continue - mode = (bucket.get("outputMode") or "").strip() - if mode == "blob": - t = bucket.get("text") - if isinstance(t, str) and t.strip(): - chunks.append(t.strip()) - elif mode == "lines": - for it in bucket.get("items") or []: + texts = _flat_text_segments_from_presentation_bucket(bucket) + chunks_out.extend(texts) + return "\n\n".join(chunks_out) + + +def _flat_text_segments_from_presentation_bucket(bucket: Dict[str, Any]) -> List[str]: + """Derive plain-text segments from ``presentation.files[*]``. + + Prefer **data** when set (canonical shape for tooling): + - ``blob``: ``data`` is a single ``str``. + - ``lines``: ``data`` is a ``list[dict]``, one dict per extraction part (order preserved): same + fields as serialised ``ContentPart`` (image ``data`` redacted) plus ``lines`` (split/filtered text; + empty for non-text/table/structure plain-text parts). + - ``chunks``: ``data`` is ``list[str]``. + - ``pages``: ``data`` is ``list[{"pageIndex": int, "lines": [...]}]``. + - ``structured``: ``data`` mirrors ``items`` — list of part-like dicts; text from ``data`` fields. + """ + if not isinstance(bucket, dict): + return [] + raw_data = bucket.get("data") + mode = str(bucket.get("outputMode") or "").strip() + + if isinstance(raw_data, str): + s = raw_data.strip() + return [s] if s else [] + if isinstance(raw_data, list): + extracted: List[str] = [] + for el in raw_data: + if isinstance(el, str): + lt = el.strip() + if lt: + extracted.append(lt) + elif isinstance(el, dict): + if el.get("type") == "image": + continue + if el.get("typeGroup") == "image": + continue + line_block = el.get("lines") + if isinstance(line_block, list): + for ln in line_block: + if isinstance(ln, str): + s = ln.strip() + if s: + extracted.append(s) + elif ln is not None: + s = str(ln).strip() + if s: + extracted.append(s) + elif _part_carries_plain_text(el): + d = el.get("data") + if isinstance(d, str): + s = d.strip() + if s: + extracted.append(s) + if extracted: + return extracted + + # Legacy layouts (omit ``data`` or empty list interpreted as fallback) + out: List[str] = [] + if mode == "blob": + t = bucket.get("text") + if isinstance(t, str) and t.strip(): + out.append(t.strip()) + elif mode == "lines": + for it in bucket.get("items") or []: + if isinstance(it, dict): + tx = it.get("text") + if isinstance(tx, str) and tx.strip(): + out.append(tx.strip()) + elif mode == "pages": + for pg in bucket.get("pages") or []: + if not isinstance(pg, dict): + continue + for it in pg.get("items") or []: if isinstance(it, dict): tx = it.get("text") if isinstance(tx, str) and tx.strip(): - chunks.append(tx.strip()) - elif mode == "pages": - for pg in bucket.get("pages") or []: - if not isinstance(pg, dict): - continue - for it in pg.get("items") or []: - if isinstance(it, dict): - tx = it.get("text") - if isinstance(tx, str) and tx.strip(): - chunks.append(tx.strip()) - elif mode == "chunks": - for it in bucket.get("chunks") or []: - if isinstance(it, dict): - tx = it.get("text") - if isinstance(tx, str) and tx.strip(): - chunks.append(tx.strip()) - elif mode == "structured": - for it in bucket.get("items") or []: - if not isinstance(it, dict): - continue + out.append(tx.strip()) + elif mode == "chunks": + for it in bucket.get("chunks") or []: + if isinstance(it, dict): + tx = it.get("text") + if isinstance(tx, str) and tx.strip(): + out.append(tx.strip()) + elif mode == "structured": + for it in bucket.get("items") or []: + if isinstance(it, dict): if not _part_carries_plain_text(it): continue tx = it.get("data") if isinstance(tx, str) and tx.strip(): - chunks.append(tx.strip()) - return "\n\n".join(chunks) + out.append(tx.strip()) + return out def build_presentation_for_payload(payload: Dict[str, Any], cfg: Dict[str, Any]) -> Dict[str, Any]: @@ -499,13 +703,75 @@ def build_presentation_for_payload(payload: Dict[str, Any], cfg: Dict[str, Any]) out_files[fk] = _build_file_presentation(source_name, parts, cfg) return { "schemaVersion": PRESENTATION_SCHEMA_VERSION, - "kind": "context.extractContent.presentation.v1", + "kind": PRESENTATION_KIND, "outputMode": cfg["outputMode"], "fileOrder": keys, "files": out_files, } +def build_presentation_for_serial_extractions( + serial_docs: List[Dict[str, Any]], + source_file_names: List[str], + cfg: Dict[str, Any], +) -> Dict[str, Any]: + """Build presentation from serialized extraction dicts (possibly after image persist).""" + key_counts: Dict[str, int] = {} + keys: List[str] = [] + out_files: Dict[str, Any] = {} + for i, blob in enumerate(serial_docs): + if not isinstance(blob, dict): + continue + name = source_file_names[i] if i < len(source_file_names) else "" + fk = _file_json_key(str(name), i, key_counts) + keys.append(fk) + raw_parts = [p for p in (blob.get("parts") or []) if isinstance(p, dict)] + parts = _presentation_filter_parts(raw_parts, cfg["pdfExtractMode"]) + _apply_markdown_presentation_on_parts(parts, cfg["markdownPreserveFormatting"]) + out_files[fk] = _build_file_presentation(str(name), parts, cfg) + return { + "schemaVersion": PRESENTATION_SCHEMA_VERSION, + "kind": PRESENTATION_KIND, + "outputMode": cfg["outputMode"], + "fileOrder": keys, + "files": out_files, + } + + +def build_presentation_for_extractions( + extracted_results: List[ContentExtracted], + source_file_names: List[str], + cfg: Dict[str, Any], +) -> Dict[str, Any]: + """Build ``presentation`` from [`mainServiceExtraction.extractContent`] results.""" + serial = [_serialize_content_extracted_for_output(ec) for ec in extracted_results] + return build_presentation_for_serial_extractions(serial, source_file_names, cfg) + + +def build_presentation_envelope_from_plain_text( + text: str, + *, + source_name: str = "content", + output_mode: str = "lines", +) -> Dict[str, Any]: + """Wrap plain text in ``context.extractContent.presentation.v1`` for unified ``file.create`` handover.""" + t = (text or "").strip() + if not t: + return {} + cfg = parse_presentation_parameters({"outputMode": output_mode}) + label = (source_name or "content").strip() or "content" + serial = [{ + "parts": [{ + "typeGroup": "text", + "mimeType": "text/plain", + "data": t, + "label": label, + "id": f"plain_{label}", + }], + }] + return build_presentation_for_serial_extractions(serial, [label], cfg) + + def _join_parts_plain_text(parts: List[Dict[str, Any]]) -> str: blocks: List[str] = [] for p in parts: @@ -529,6 +795,138 @@ def _redact_large_part_payload(p: Dict[str, Any]) -> Dict[str, Any]: return pc +def _attach_redacted_image_parts(bucket: Dict[str, Any], parts: List[Dict[str, Any]]) -> None: + """Attach aggregate ``imageParts`` for ``pages`` / ``chunks`` where ``data`` stays non-part-shaped. + + ``lines`` mode carries each image as its own entry in ``data`` (same order as extraction parts). + """ + imgs = [_redact_large_part_payload(_copy_part(p)) for p in parts if (p.get("typeGroup") or "").strip() == "image"] + if imgs: + bucket["imageParts"] = imgs + + +def _line_segments_filtered_for_text_fragment(fragment: str, cfg: Dict[str, Any]) -> List[str]: + frag = fragment.strip() + if not frag: + return [] + segs = _segment_merged_text(frag, cfg["splitBy"]) + return _apply_line_filters(segs, filter_empty=cfg["filterEmptyLines"], trim_ws=cfg["trimWhitespace"]) + + +def _rows_to_csv_payload(rows: List[List[Any]]) -> str: + lines: List[str] = [] + for row in rows: + cells = [str(c or "").replace('"', '""') for c in row] + lines.append(",".join(f'"{c}"' for c in cells)) + return "\n".join(lines) + + +def _table_matrix_from_csv(csv_text: str, *, header_row: bool) -> Optional[tuple[List[str], List[List[str]]]]: + """Parse CSV table payload into (headers, body rows) for ``renderReport`` tables.""" + parsed = _parse_csv_rows(csv_text, header_row) + if not parsed: + return None + headers = [str(h) for h in (parsed.get("headers") or [])] + raw_rows = parsed.get("rows") or [] + if not raw_rows: + return None + if isinstance(raw_rows[0], dict): + if not headers: + headers = list(raw_rows[0].keys()) + body = [[str(row.get(h, "")) for h in headers] for row in raw_rows] + return headers, body + body = [[str(c) for c in row] for row in raw_rows if isinstance(row, list)] + if not body: + return None + if not headers: + headers = [f"Column {i + 1}" for i in range(len(body[0]))] + return headers, body + + +def _presentation_line_slot_from_part(part: Dict[str, Any], cfg: Dict[str, Any]) -> Dict[str, Any]: + """One presentation row per extraction part: serialised part (redacted) + ``lines`` for this part only.""" + slot = _redact_large_part_payload(_copy_part(part)) + if (part.get("typeGroup") or "").strip() == "table": + # Keep CSV / structured table payload intact — do not split into ``lines``. + slot["lines"] = [] + return slot + if _part_carries_plain_text(part): + slot["lines"] = _line_segments_filtered_for_text_fragment(str(part.get("data") or ""), cfg) + else: + slot["lines"] = [] + return slot + + +def _presentation_line_slots_from_part(part: Dict[str, Any], cfg: Dict[str, Any]) -> List[Dict[str, Any]]: + """Expand one extraction part to presentation slots (CSV tables → one slot per row in ``lines`` mode).""" + if (part.get("typeGroup") or "").strip() != "table": + return [_presentation_line_slot_from_part(part, cfg)] + if cfg.get("outputMode") != "lines": + return [_presentation_line_slot_from_part(part, cfg)] + csv_txt = str(part.get("data") or "") + if not csv_txt.strip(): + return [_presentation_line_slot_from_part(part, cfg)] + segs = _segment_merged_text(csv_txt, cfg["splitBy"]) + segs = _apply_line_filters( + segs, + filter_empty=cfg["filterEmptyLines"], + trim_ws=cfg["trimWhitespace"], + ) + if len(segs) <= 1: + return [_presentation_line_slot_from_part(part, cfg)] + out: List[Dict[str, Any]] = [] + part_id = str(part.get("id") or "table") + for idx, seg in enumerate(segs, start=1): + row_part = _copy_part(part) + row_part["typeGroup"] = "text" + row_part["mimeType"] = "text/plain" + row_part["data"] = seg + row_part["label"] = str(part.get("label") or "row") + row_part["id"] = f"{part_id}_line_{idx}" + slot = _redact_large_part_payload(row_part) + slot["lines"] = [seg] + out.append(slot) + return out + + +def _presentation_image_marker_in_data(part: Dict[str, Any]) -> Dict[str, Any]: + """Builds an image reference blob (used by ``blob`` output as ``[image:]`` token only).""" + rp = _redact_large_part_payload(_copy_part(part)) + marker: Dict[str, Any] = {"type": "image", "typeGroup": "image", "partId": rp.get("id")} + mime = rp.get("mimeType") + if mime: + marker["mimeType"] = str(mime).strip() + lbl = rp.get("label") + if lbl: + marker["label"] = lbl + eid = rp.get("embeddedImageFileId") + if eid: + marker["embeddedImageFileId"] = str(eid) + enfn = rp.get("embeddedImageFileName") + if enfn: + marker["embeddedImageFileName"] = str(enfn) + meta = rp.get("metadata") + extra: Dict[str, Any] = {} + if isinstance(meta, dict): + pi = meta.get("pageIndex") + if pi is not None: + try: + extra["pageIndex"] = int(pi) + except (TypeError, ValueError): + extra["pageIndex"] = pi + cr = meta.get("contextRef") + if isinstance(cr, dict): + loc = cr.get("location") + if loc: + extra["contextLocation"] = loc + cp = cr.get("containerPath") + if cp: + extra["contextContainerPath"] = cp + if extra: + marker["extra"] = extra + return marker + + def _build_file_presentation( source_file_name: str, parts: List[Dict[str, Any]], @@ -547,15 +945,33 @@ def _build_file_presentation( "outputMode": output_mode, "sourceFileName": source_file_name or None, } - if csv_block is not None: - base["csv"] = csv_block if output_mode == "blob": - base["text"] = merge_plain + chunks_blob: List[str] = [] + for p in parts: + tg = (p.get("typeGroup") or "").strip() + if tg == "image": + m = _presentation_image_marker_in_data(p) + pid = str(m.get("partId") or "").strip() + chunks_blob.append(f"[image:{pid}]" if pid else "[image]") + continue + if _part_carries_plain_text(p): + raw = p.get("data") + if raw is None: + continue + s = str(raw).strip() + if not s: + continue + chunks_blob.append(s) + base["data"] = "\n\n".join(chunks_blob) return base if output_mode == "structured": - base["items"] = [_redact_large_part_payload(_copy_part(p)) for p in parts] + if csv_block is not None: + base["csv"] = csv_block + items_list = [_redact_large_part_payload(_copy_part(p)) for p in parts] + base["items"] = items_list + base["data"] = list(items_list) return base if output_mode == "pages": @@ -600,6 +1016,19 @@ def _build_file_presentation( offset += len(seg) + 1 page_objs.append({"pageIndex": pi, "items": items}) base["pages"] = page_objs + base["data"] = [ + { + "pageIndex": int(po["pageIndex"]), + "lines": [ + str(it["text"]) + for it in (po.get("items") or []) + if isinstance(it, dict) and isinstance(it.get("text"), str) + ], + } + for po in page_objs + if isinstance(po, dict) + ] + _attach_redacted_image_parts(base, parts) return base if output_mode == "chunks": @@ -619,27 +1048,62 @@ def _build_file_presentation( row["metadata"] = meta chunk_objs.append(row) base["chunks"] = chunk_objs + base["data"] = [str(row["text"]) for row in chunk_objs if isinstance(row.get("text"), str)] + _attach_redacted_image_parts(base, parts) return base - # lines (default): shared path with pages/chunks splitting - segs = _segment_merged_text(merge_plain, cfg["splitBy"]) - segs = _apply_line_filters( - segs, - filter_empty=cfg["filterEmptyLines"], - trim_ws=cfg["trimWhitespace"], - ) - items: List[Dict[str, Any]] = [] - offset = 0 - for idx, seg in enumerate(segs, start=1): - meta = _base_item_meta(source_file_name, cfg, segment_index=idx, offset_hint=offset) - row = {"text": seg} - if cfg["includeLineNumbers"]: - row["lineNumber"] = idx - if meta: - row["metadata"] = meta - items.append(row) - offset += len(seg) + 1 - base["items"] = items + # lines (default): same part order/cardinality as extraction; segmentation inside each part. + slots: List[Dict[str, Any]] = [] + for p in parts: + if isinstance(p, dict): + slots.extend(_presentation_line_slots_from_part(p, cfg)) + base["data"] = slots + if cfg["includeLineNumbers"] or cfg["includeMetadata"]: + flat_items: List[Dict[str, Any]] = [] + line_no = 0 + seg_off = 0 + for slot in slots: + tg_slot = (slot.get("typeGroup") or "").strip() + part_id = slot.get("id") + page_ix = _page_index_from_part(slot) + + if tg_slot == "image": + line_no += 1 + meta_i = _base_item_meta( + source_file_name, + cfg, + segment_index=line_no, + offset_hint=seg_off, + page_index=page_ix, + ) + row_im: Dict[str, Any] = {"type": "image", "partId": slot.get("id"), "mimeType": slot.get("mimeType")} + if cfg["includeLineNumbers"]: + row_im["lineNumber"] = line_no + if meta_i: + row_im["metadata"] = meta_i + flat_items.append(row_im) + seg_off += 1 + continue + + for ln in slot.get("lines") or []: + if not isinstance(ln, str): + continue + line_no += 1 + meta_t = _base_item_meta( + source_file_name, + cfg, + segment_index=line_no, + offset_hint=seg_off, + page_index=page_ix, + ) + row_t: Dict[str, Any] = {"text": ln} + if cfg["includeLineNumbers"]: + row_t["lineNumber"] = line_no + if meta_t: + row_t["metadata"] = meta_t + flat_items.append(row_t) + seg_off += len(ln) + 1 + base["items"] = flat_items return base @@ -657,88 +1121,118 @@ def _mime_to_file_extension(mime: str) -> str: return mapping.get(m, m.rsplit("/", 1)[-1] if "/" in m else "bin") -def _split_images_to_sidecar_documents( - payload: Dict[str, Any], +def _persist_extracted_image_parts( + content_extracted_serial: List[Dict[str, Any]], *, - document_name_stem: str, -) -> Tuple[Dict[str, Any], List[ActionDocument]]: - """ - Deep-copy handover JSON, clear image pixel data from ``parts``, attach - ``handoverMediaDocumentName`` on each image part, emit binary ActionDocuments. - """ - import copy + name_stem: str, + run_context: Optional[Dict[str, Any]], +) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: + """Decode base64 image parts, persist bytes, replace with ``embeddedImageFileId``; return artifacts meta.""" + artifacts: List[Dict[str, Any]] = [] + if not run_context or not isinstance(run_context, dict): + logger.warning("extractContent image persist: _runContext missing — images not stored") + return content_extracted_serial, artifacts + mandate_id = run_context.get("mandateId") + instance_id = run_context.get("instanceId") + if not mandate_id or not instance_id: + logger.warning( + "extractContent image persist: mandateId/instanceId missing in _runContext (mandate=%r instance=%r)", + mandate_id, + instance_id, + ) + return content_extracted_serial, artifacts - bundle = copy.deepcopy(payload) - files_section = bundle.get("files") or {} - ordered = bundle.get("fileOrder") - key_order: List[str] = ordered if isinstance(ordered, list) and ordered else list(files_section.keys()) - media_docs: List[ActionDocument] = [] - kind = bundle.get("kind") or HANDOVER_KIND + try: + from modules.interfaces.interfaceDbManagement import getInterface as _get_mgmt + from modules.interfaces.interfaceDbApp import getInterface as _get_app + from modules.security.rootAccess import getRootUser + except Exception as exc: + logger.warning("extractContent image persist: import failed: %s", exc) + return content_extracted_serial, artifacts - stem = re.sub(r"[^\w\-]+", "_", document_name_stem).strip("_") or "extract" + owner = getRootUser() + uid = run_context.get("userId") + if uid: + try: + umap = _get_app(getRootUser()).getUsersByIds([str(uid)]) + owner = umap.get(str(uid)) or owner + except Exception: + pass - for fk in key_order: - bucket = files_section.get(fk) - if not isinstance(bucket, dict): + try: + mgmt = _get_mgmt(owner, mandateId=str(mandate_id), featureInstanceId=str(instance_id)) + except Exception as exc: + logger.warning("extractContent image persist: mgmt interface failed: %s", exc) + return content_extracted_serial, artifacts + + stem = re.sub(r"[^\w\-]+", "_", name_stem).strip("_") or "extract" + + for doc_idx, blob in enumerate(content_extracted_serial): + if not isinstance(blob, dict): continue - parts = bucket.get("parts") + parts = blob.get("parts") if not isinstance(parts, list): continue - new_parts: List[Dict[str, Any]] = [] + new_parts: List[Any] = [] for p in parts: if not isinstance(p, dict): new_parts.append(p) continue - pcopy = dict(p) - tg = (pcopy.get("typeGroup") or "").strip() - mime = (pcopy.get("mimeType") or "").strip() - raw_data = pcopy.get("data") - if tg == "image" and mime.lower().startswith("image/") and raw_data: - raw_s = raw_data.strip() if isinstance(raw_data, str) else "" + tg = (p.get("typeGroup") or "").strip() + mime = (p.get("mimeType") or "").strip() + raw_data = p.get("data") + if tg != "image" or not mime.lower().startswith("image/") or not raw_data: + new_parts.append(p) + continue + raw_s = raw_data.strip() if isinstance(raw_data, str) else "" + try: + img_bytes = _b64.b64decode(raw_s, validate=True) if raw_s else b"" + except (_binascii.Error, TypeError, ValueError): + new_parts.append(p) + continue + if not img_bytes: + new_parts.append(p) + continue + part_id = str(p.get("id") or "part") + safe_id = re.sub(r"[^\w\-.]+", "_", part_id).strip("_") or "media" + if len(safe_id) > 200: + safe_id = safe_id[:200] + ext = _mime_to_file_extension(mime) + # Stable name (no run timestamp) so duplicate content reuses the same FileItem. + media_name = f"extract_media_{safe_id}.{ext}" + try: + file_item = mgmt.createFile(media_name, mime, img_bytes, folderId=None) + mgmt.createFileData(file_item.id, img_bytes) try: - blob = _b64.b64decode(raw_s, validate=True) if raw_s else b"" - except (_binascii.Error, TypeError, ValueError) as e: + mgmt.updateFile(str(file_item.id), {"tags": ["_workflowInternal"]}) + except Exception as tag_exc: logger.warning( - "extractContent: could not decode image part %s (keep inline): %s", - pcopy.get("id"), - e, + "extractContent image persist: could not tag internal file %s: %s", + file_item.id, + tag_exc, ) - new_parts.append(pcopy) - continue - if not blob: - new_parts.append(pcopy) - continue - part_id = str(pcopy.get("id") or "part") - # Full part id (UUID) — must not truncate or names collide / break linking - safe_id = re.sub(r"[^\w\-.]+", "_", part_id).strip("_") or "media" - if len(safe_id) > 200: - safe_id = safe_id[:200] - ext = _mime_to_file_extension(mime) - media_name = f"extract_media_{stem}_{safe_id}.{ext}" - pcopy["data"] = "" - pcopy["handoverMediaDocumentName"] = media_name - media_docs.append( - ActionDocument( - documentName=media_name, - documentData=blob, - mimeType=mime, - validationMetadata={ - "actionType": "context.extractContent", - "handoverRole": "extractedMedia", - "sourcePartId": part_id, - "handoverSchema": kind, - "containerFileKey": fk, - }, - ) - ) - new_parts.append(pcopy) - else: - new_parts.append(pcopy) - bucket["parts"] = new_parts - bucket["byTypeGroup"] = _rebuild_by_type_group(new_parts) - files_section[fk] = bucket + except Exception as exc: + logger.warning("extractContent image persist: createFile failed %s: %s", part_id, exc) + new_parts.append(p) + continue + p_new = dict(p) + p_new["data"] = "" + p_new["embeddedImageFileId"] = str(file_item.id) + p_new["embeddedImageFileName"] = str(getattr(file_item, "fileName", media_name)) + new_parts.append(p_new) + artifacts.append( + { + "fileId": str(file_item.id), + "fileName": str(getattr(file_item, "fileName", media_name)), + "mimeType": mime, + "sourcePartId": part_id, + "documentIndex": doc_idx, + "suppressInWorkflowFileLists": True, + } + ) + blob["parts"] = new_parts - return bundle, media_docs + return content_extracted_serial, artifacts def _one_file_bucket(ec: ContentExtracted, source_file_name: str) -> Dict[str, Any]: @@ -766,28 +1260,341 @@ def _one_file_bucket(ec: ContentExtracted, source_file_name: str) -> Dict[str, A } -def build_extract_content_handover( + +_MAX_IMAGE_EMBED_BYTES = 300_000 +_IMAGE_MAX_DIMENSION = 1200 + + +def _get_mgmt_for_presentation_render(services: Any) -> Optional[Any]: + mgmt = getattr(services, "interfaceDbComponent", None) if services else None + if mgmt: + return mgmt + if not services: + return None + try: + import modules.interfaces.interfaceDbManagement as iface + + user = getattr(services, "user", None) + if not user: + return None + return iface.getInterface( + user, + mandateId=getattr(services, "mandateId", None) or "", + featureInstanceId=getattr(services, "featureInstanceId", None) or "", + ) + except Exception as exc: + logger.warning("presentation render: mgmt interface failed: %s", exc) + return None + + +def _resize_image_bytes_for_document(image_bytes: bytes) -> bytes: + try: + from PIL import Image as PILImage + + img = PILImage.open(BytesIO(image_bytes)) + if img.mode in ("RGBA", "LA"): + bg = PILImage.new("RGB", img.size, (255, 255, 255)) + bg.paste(img, mask=img.split()[-1]) + img = bg + elif img.mode == "P": + img = img.convert("RGBA") + bg = PILImage.new("RGB", img.size, (255, 255, 255)) + bg.paste(img, mask=img.split()[-1]) + img = bg + elif img.mode != "RGB": + img = img.convert("RGB") + if max(img.size) > _IMAGE_MAX_DIMENSION: + img.thumbnail((_IMAGE_MAX_DIMENSION, _IMAGE_MAX_DIMENSION), PILImage.BILINEAR) + out = BytesIO() + img.save(out, format="JPEG", quality=85, optimize=True) + return out.getvalue() + except Exception as exc: + logger.warning("presentation render: image resize failed (%s)", exc) + return image_bytes + + +def _load_image_bytes_by_file_id(services: Any, file_id: str) -> Optional[bytes]: + mgmt = _get_mgmt_for_presentation_render(services) + if not mgmt or not hasattr(mgmt, "getFileData"): + return None + try: + return mgmt.getFileData(str(file_id)) + except Exception as exc: + logger.warning("presentation render: getFileData(%s) failed: %s", file_id, exc) + return None + + +def _inline_runs_from_presentation_lines(lines: List[Any]) -> List[Dict[str, Any]]: + """Map presentation ``lines`` to inline runs, preserving line order with explicit breaks.""" + from modules.serviceCenter.services.serviceGeneration.subDocumentUtility import _parseInlineRuns + + runs: List[Dict[str, Any]] = [] + first = True + for ln in lines: + if not first: + runs.append({"type": "text", "value": "\n"}) + first = False + piece = str(ln) if ln is not None else "" + if not piece: + continue + runs.extend(_parseInlineRuns(piece)) + return runs if runs else [{"type": "text", "value": ""}] + + +def _is_presentation_file_bucket(d: Dict[str, Any]) -> bool: + """True for a single ``presentation.files[*]`` bucket (loop item value / per-file handover).""" + if d.get("kind") == PRESENTATION_KIND: + return False + data = d.get("data") + if not isinstance(data, (list, str)): + return False + return "outputMode" in d or "sourceFileName" in d + + +def _is_loop_presentation_file_item(d: Dict[str, Any]) -> bool: + val = d.get("value") + return isinstance(d.get("name"), str) and isinstance(val, dict) and _is_presentation_file_bucket(val) + + +def _is_presentation_line_slot(d: Dict[str, Any]) -> bool: + """Single slot from ``presentation.files[*].data[]`` (e.g. loop iteration over one CSV row).""" + if d.get("kind") == PRESENTATION_KIND or _is_presentation_file_bucket(d): + return False + tg = (d.get("typeGroup") or "").strip() + if tg in ("text", "table", "image", "structure"): + return True + return isinstance(d.get("lines"), list) + + +def presentation_envelope_from_file_bucket( + bucket: Dict[str, Any], *, - extracted_results: List[ContentExtracted], - chat_file_names: List[str], - operation_ref: str, + file_key: Optional[str] = None, ) -> Dict[str, Any]: - key_counts: Dict[str, int] = {} - files: Dict[str, Any] = {} - ordered: List[str] = [] - - for i, ec in enumerate(extracted_results): - name = chat_file_names[i] if i < len(chat_file_names) else "" - fk = _file_json_key(str(name), i, key_counts) - files[fk] = _one_file_bucket(ec, str(name)) - ordered.append(fk) - + """Wrap one ``presentation.files`` entry as a full presentation envelope.""" + fk = (file_key or "").strip() + if not fk: + src = str(bucket.get("sourceFileName") or "").strip() + fk = f"file_1_{src}" if src else "file_1" return { - "schemaVersion": 1, - "kind": HANDOVER_KIND, - "operationRef": operation_ref, - "fileOrder": ordered, - "files": files, + "schemaVersion": PRESENTATION_SCHEMA_VERSION, + "kind": PRESENTATION_KIND, + "outputMode": bucket.get("outputMode") or "lines", + "fileOrder": [fk], + "files": {fk: bucket}, + } + + +def normalize_presentation_envelopes(raw: Any) -> List[Dict[str, Any]]: + """Collect ``context.extractContent.presentation.v1`` dicts from ActionResult / list shapes.""" + if raw is None: + return [] + if isinstance(raw, list): + out: List[Dict[str, Any]] = [] + for item in raw: + out.extend(normalize_presentation_envelopes(item)) + return out + if isinstance(raw, dict): + if raw.get("kind") == PRESENTATION_KIND: + return [raw] + if _is_loop_presentation_file_item(raw): + return [ + presentation_envelope_from_file_bucket( + raw["value"], + file_key=str(raw.get("name") or "file_1"), + ) + ] + if _is_presentation_file_bucket(raw): + return [presentation_envelope_from_file_bucket(raw)] + if _is_presentation_line_slot(raw): + bucket = {"outputMode": "lines", "sourceFileName": "", "data": [raw]} + return [presentation_envelope_from_file_bucket(bucket)] + inner = raw.get("data") + if isinstance(inner, dict) and inner.get("kind") == PRESENTATION_KIND: + return [inner] + for key in ("data", "merged", "value"): + nested = raw.get(key) + if isinstance(nested, dict) and nested is not raw: + found = normalize_presentation_envelopes(nested) + if found: + return found + return [] + + +def presentation_envelopes_to_document_json( + raw: Any, + *, + title: str, + language: str, + services: Any = None, +) -> Dict[str, Any]: + """Map presentation envelope(s) to ``renderReport`` ``extractedContent`` (documents/sections).""" + from modules.serviceCenter.services.serviceGeneration.subDocumentUtility import _parseInlineRuns + + envelopes = normalize_presentation_envelopes(raw) + if not envelopes: + raise ValueError( + "context must be presentation data from Inhalt extrahieren (kind=context.extractContent.presentation.v1)" + ) + + sections: List[Dict[str, Any]] = [] + order = 0 + + def _next_id() -> str: + nonlocal order + order += 1 + return f"s_{order}" + + def _append_heading(text: str, level: int = 2) -> None: + t = (text or "").strip() + if not t: + return + sections.append({ + "id": _next_id(), + "content_type": "heading", + "order": order, + "elements": [{"content": {"text": t, "level": level}}], + }) + + def _append_paragraph(text: str) -> None: + t = (text or "").strip() + if not t: + return + sections.append({ + "id": _next_id(), + "content_type": "paragraph", + "order": order, + "elements": [{"content": {"inlineRuns": _parseInlineRuns(t)}}], + }) + + def _append_image_slot(slot: Dict[str, Any]) -> None: + fid = slot.get("embeddedImageFileId") + if not fid: + return + blob = _load_image_bytes_by_file_id(services, str(fid)) + if not blob: + return + if len(blob) > _MAX_IMAGE_EMBED_BYTES: + blob = _resize_image_bytes_for_document(blob) + alt = ( + slot.get("embeddedImageFileName") + or slot.get("label") + or f"image_{fid}" + ) + sections.append({ + "id": _next_id(), + "content_type": "image", + "order": order, + "elements": [{ + "content": { + "altText": str(alt), + "base64Data": _b64.b64encode(blob).decode("ascii"), + }, + }], + }) + + def _append_text_slot(slot: Dict[str, Any]) -> None: + lines = slot.get("lines") + if isinstance(lines, list) and lines: + sections.append({ + "id": _next_id(), + "content_type": "paragraph", + "order": order, + "elements": [{"content": {"inlineRuns": _inline_runs_from_presentation_lines(lines)}}], + }) + return + raw_d = slot.get("data") + if isinstance(raw_d, str) and raw_d.strip(): + sections.append({ + "id": _next_id(), + "content_type": "paragraph", + "order": order, + "elements": [{"content": {"inlineRuns": _inline_runs_from_presentation_lines(raw_d.splitlines())}}], + }) + + def _append_table_slot(slot: Dict[str, Any]) -> None: + raw = slot.get("data") + if not isinstance(raw, str) or not raw.strip(): + return + header_row = True + meta = slot.get("metadata") + if isinstance(meta, dict) and meta.get("csvHeaderRow") is False: + header_row = False + parsed = _table_matrix_from_csv(raw, header_row=header_row) + if not parsed: + return + headers, body = parsed + sections.append({ + "id": _next_id(), + "content_type": "table", + "order": order, + "elements": [{"content": {"headers": headers, "rows": body}}], + }) + + def _append_slot(slot: Dict[str, Any]) -> None: + tg = (slot.get("typeGroup") or "").strip().lower() + mime = (slot.get("mimeType") or "").strip().lower() + if tg == "image" or mime.startswith("image/"): + _append_image_slot(slot) + return + if tg == "container": + return + if tg == "table" or ("csv" in mime and slot.get("data")): + _append_table_slot(slot) + return + if _part_carries_plain_text(slot): + _append_text_slot(slot) + + def _append_bucket(bucket: Dict[str, Any], *, show_file_heading: bool) -> None: + if show_file_heading: + src = str(bucket.get("sourceFileName") or "").strip() + if src: + _append_heading(src) + raw_data = bucket.get("data") + if isinstance(raw_data, str): + _append_paragraph(raw_data) + return + if isinstance(raw_data, list): + for el in raw_data: + if isinstance(el, dict): + _append_slot(el) + elif isinstance(el, str): + _append_paragraph(el) + return + if isinstance(raw_data, dict): + _append_slot(raw_data) + + for envelope in envelopes: + files_section = envelope.get("files") or {} + file_order = envelope.get("fileOrder") + keys: List[str] = ( + list(file_order) if isinstance(file_order, list) and file_order else list(files_section.keys()) + ) + multi_files = len(keys) > 1 + for fk in keys: + bucket = files_section.get(fk) + if isinstance(bucket, dict): + _append_bucket(bucket, show_file_heading=multi_files) + + if not sections: + raise ValueError("presentation produced no renderable sections") + + lang = (language or "de").strip() or "de" + doc_title = (title or "Document").strip() or "Document" + return { + "metadata": { + "split_strategy": "single_document", + "source_documents": [], + "extraction_method": "context_extract_presentation", + "title": doc_title, + "language": lang, + }, + "documents": [{ + "id": "doc_1", + "title": doc_title, + "language": lang, + "sections": sections, + }], } @@ -826,7 +1633,7 @@ async def extractContent(self, parameters: Dict[str, Any]) -> ActionResult: self.services.chat.progressLogFinish(operation_id, False) return ActionResult.isFailure(error="No documents found in documentList") - logger.info(f"Extracting JSON handover from {len(chat_documents)} documents") + logger.info(f"Extracting content from {len(chat_documents)} documents") self.services.chat.progressLogUpdate(operation_id, 0.3, "Preparing extraction options") @@ -853,63 +1660,56 @@ async def extractContent(self, parameters: Dict[str, Any]) -> ActionResult: file_names = [getattr(cd, "fileName", "") or "" for cd in chat_documents] - payload = build_extract_content_handover( - extracted_results=extracted_results, - chat_file_names=file_names, - operation_ref=operation_id, - ) - - self.services.chat.progressLogUpdate(operation_id, 0.9, "Building JSON") - - content_filter = str(parameters.get("contentFilter") or "all").strip().lower() - if content_filter not in _CONTENT_FILTER_OPTIONS: - content_filter = "all" - payload = _apply_content_filter(payload, content_filter) + content_filter = _canonical_content_filter(parameters.get("contentFilter")) + filtered_extractions = _filter_extractions_by_content_filter(extracted_results, content_filter) pres_cfg = parse_presentation_parameters(parameters) - presentation = build_presentation_for_payload(payload, pres_cfg) stem = f"{wf}_{int(time.time())}" - # Only split image sidecars when the filtered payload can still contain image parts. + run_ctx = parameters.get("_runContext") + + content_extracted_serial = [_serialize_content_extracted_for_output(ec) for ec in filtered_extractions] + image_artifacts: List[Dict[str, Any]] = [] if content_filter in ("all", "imagesOnly"): - stripped_payload, media_docs = _split_images_to_sidecar_documents( - payload, - document_name_stem=stem, + content_extracted_serial, image_artifacts = _persist_extracted_image_parts( + content_extracted_serial, + name_stem=stem, + run_context=run_ctx if isinstance(run_ctx, dict) else None, ) - else: - # textOnly / noImages: no image parts remain → skip the split entirely. - stripped_payload = payload - media_docs = [] - stripped_payload["presentation"] = presentation - joined_text = presentation_response_text(presentation, stripped_payload) + presentation = build_presentation_for_serial_extractions(content_extracted_serial, file_names, pres_cfg) - json_meta = { - "actionType": "context.extractContent", - "documentCountInput": len(chat_documents), - "documentCountRoots": len(extracted_results), - "handoverSchema": stripped_payload.get("kind"), - "handoverRole": "structuredHandover", - "mediaDocumentCount": len(media_docs), - } + try: + _pc_json = json.dumps(dict(pres_cfg), ensure_ascii=False, default=str) + _sum = summarize_presentation_payload(presentation) + _sum_json = json.dumps(_sum, ensure_ascii=False, default=str) + logger.info( + "extractContent op=%s presentationConfig=%s presentationSummary=%s", + operation_id, + _pc_json, + _sum_json[:8000] + ("…" if len(_sum_json) > 8000 else ""), + ) + except Exception as _log_e: + logger.debug("extractContent presentation trace log skipped: %s", _log_e) - json_doc = ActionDocument( - documentName=f"extracted_content_{stem}.json", - documentData=stripped_payload, - mimeType="application/json", - validationMetadata=json_meta, - ) - - handover_data = { - "response": joined_text, - "contentType": "text", - "handoverKind": stripped_payload.get("kind"), - "structuredDocumentIndex": 0, - "mediaDocumentCount": len(media_docs), + data_out: Dict[str, Any] = { + **presentation, + "_meta": { + "actionType": "context.extractContent", + "operationRef": operation_id, + "sourceFileNames": list(file_names), + "documentCountInput": len(chat_documents), + "documentCountRoots": len(extracted_results), + "extractPayloadSchemaVersion": EXTRACT_PAYLOAD_SCHEMA_VERSION, + "presentationConfig": dict(pres_cfg), + "persistedImageArtifacts": image_artifacts, + "suppressInWorkflowFileLists": True, + "persistedImageCount": len(image_artifacts), + }, } self.services.chat.progressLogFinish(operation_id, True) - return ActionResult.isSuccess(documents=[json_doc] + media_docs, data=handover_data) + return ActionResult.isSuccess(documents=[], data=data_out) except Exception as e: logger.error(f"Error in content extraction: {str(e)}") diff --git a/modules/workflows/methods/methodContext/actions/mergeContext.py b/modules/workflows/methods/methodContext/actions/mergeContext.py index 3947db30..8bc76e4b 100644 --- a/modules/workflows/methods/methodContext/actions/mergeContext.py +++ b/modules/workflows/methods/methodContext/actions/mergeContext.py @@ -18,8 +18,9 @@ from typing import Any, Dict, List, Optional from modules.datamodels.datamodelChat import ActionResult from modules.workflows.methods.methodContext.actions.extractContent import ( - _joined_text_from_handover_payload, + joined_text_from_extract_node_data, ) +from modules.workflows.methods.methodContext.contextEnvelope import wrap_merge_context_data logger = logging.getLogger(__name__) @@ -89,6 +90,9 @@ def _primary_text_from_item(it: Any) -> str: r = inner.get("response") if r is not None and str(r).strip(): return str(r).strip() + ce_text = joined_text_from_extract_node_data(inner) + if ce_text.strip(): + return ce_text.strip() docs = it.get("documents") if not isinstance(docs, list) or not docs: return "" @@ -104,14 +108,14 @@ def _primary_text_from_item(it: Any) -> str: except (UnicodeDecodeError, ValueError): return "" if isinstance(raw, dict): - return (_joined_text_from_handover_payload(raw) or "").strip() + return (joined_text_from_extract_node_data(raw) or "").strip() if isinstance(raw, str) and raw.strip(): s = raw.strip() if s.startswith("{") and s.endswith("}"): try: parsed = json.loads(s) if isinstance(parsed, dict): - return (_joined_text_from_handover_payload(parsed) or "").strip() + return (joined_text_from_extract_node_data(parsed) or "").strip() except (json.JSONDecodeError, TypeError): pass return s @@ -126,6 +130,14 @@ def _sanitize_heading_title(name: str) -> str: def _iteration_heading_from_item(it: Any) -> Optional[str]: if not isinstance(it, dict): return None + inner = it.get("data") + if isinstance(inner, dict): + meta = inner.get("_meta") if isinstance(inner.get("_meta"), dict) else {} + sf = inner.get("sourceFileNames") or meta.get("sourceFileNames") + if isinstance(sf, list) and sf: + first = sf[0] + if isinstance(first, str) and first.strip(): + return _sanitize_heading_title(first.strip()) docs = it.get("documents") if not isinstance(docs, list) or not docs: return None @@ -222,7 +234,7 @@ async def mergeContext(self, parameters: Dict[str, Any]) -> ActionResult: (_ps[:200] + "…") if len(_ps) > 200 else _ps, len(conflicts), ) - data: Dict[str, Any] = { + payload: Dict[str, Any] = { "merged": merged, "inputs": inputs, "first": inputs[0] if inputs else None, @@ -230,7 +242,7 @@ async def mergeContext(self, parameters: Dict[str, Any]) -> ActionResult: "conflicts": sorted(set(conflicts)) if conflicts else [], "response": primary, } - return ActionResult.isSuccess(data=data) + return ActionResult.isSuccess(data=wrap_merge_context_data(payload)) except Exception as exc: logger.exception("mergeContext failed") return ActionResult.isFailure(error=str(exc)) diff --git a/modules/workflows/methods/methodContext/actions/transformContext.py b/modules/workflows/methods/methodContext/actions/transformContext.py index 6fe05e03..ffff183d 100644 --- a/modules/workflows/methods/methodContext/actions/transformContext.py +++ b/modules/workflows/methods/methodContext/actions/transformContext.py @@ -18,6 +18,7 @@ import re from typing import Any, Dict, List, Optional from modules.datamodels.datamodelChat import ActionResult +from modules.workflows.methods.methodContext.contextEnvelope import wrap_transform_context_data logger = logging.getLogger(__name__) @@ -216,7 +217,7 @@ async def transformContext(self, parameters: Dict[str, Any]) -> ActionResult: if cast_errors: result["_castErrors"] = cast_errors - return ActionResult.isSuccess(data=result) + return ActionResult.isSuccess(data=wrap_transform_context_data(result)) except Exception as exc: logger.exception("transformContext failed") return ActionResult.isFailure(error=str(exc)) diff --git a/modules/workflows/methods/methodContext/contextEnvelope.py b/modules/workflows/methods/methodContext/contextEnvelope.py new file mode 100644 index 00000000..c35836cf --- /dev/null +++ b/modules/workflows/methods/methodContext/contextEnvelope.py @@ -0,0 +1,42 @@ +# Copyright (c) 2026 Patrick Motsch +"""Versioned ``ActionResult.data`` envelope for context.* actions (merge, transform).""" + +from __future__ import annotations + +from typing import Any, Dict + +CONTEXT_MERGE_KIND = "context.mergeContext.v1" +CONTEXT_MERGE_SCHEMA_VERSION = 1 + +CONTEXT_TRANSFORM_KIND = "context.transformContext.v1" +CONTEXT_TRANSFORM_SCHEMA_VERSION = 1 + + +def wrap_merge_context_data(body: Dict[str, Any]) -> Dict[str, Any]: + """Wrap merge payload: ``schemaVersion``, ``kind``, body fields, ``_meta`` last.""" + meta: Dict[str, Any] = { + "actionType": "context.mergeContext", + "mergePayloadSchemaVersion": CONTEXT_MERGE_SCHEMA_VERSION, + } + out: Dict[str, Any] = { + "schemaVersion": CONTEXT_MERGE_SCHEMA_VERSION, + "kind": CONTEXT_MERGE_KIND, + } + out.update(body) + out["_meta"] = meta + return out + + +def wrap_transform_context_data(fields: Dict[str, Any]) -> Dict[str, Any]: + """Wrap transform output fields under a versioned envelope (``_meta`` overwrites same key in fields).""" + meta: Dict[str, Any] = { + "actionType": "context.transformContext", + "transformPayloadSchemaVersion": CONTEXT_TRANSFORM_SCHEMA_VERSION, + } + out: Dict[str, Any] = { + "schemaVersion": CONTEXT_TRANSFORM_SCHEMA_VERSION, + "kind": CONTEXT_TRANSFORM_KIND, + } + out.update(fields) + out["_meta"] = meta + return out diff --git a/modules/workflows/methods/methodContext/methodContext.py b/modules/workflows/methods/methodContext/methodContext.py index b2e7220b..b82d4356 100644 --- a/modules/workflows/methods/methodContext/methodContext.py +++ b/modules/workflows/methods/methodContext/methodContext.py @@ -57,12 +57,9 @@ class MethodContext(MethodBase): "extractContent": WorkflowActionDefinition( actionId="context.extractContent", description=( - "Extract document content without AI. Unified handover: (1) `documents[0]` " - "JSON `context.extractContent.handover.v1` with text in `parts` and image placeholders " - "linking to sibling blobs via `handoverMediaDocumentName`; " - "(2) each extracted image as a separate binary document (`extract_media_*`); " - "(3) `data.response` / top-level `response` after normalization — concatenated plain text " - "for prompts and file.create. Pick `response`, a specific document, or deep JSON paths." + "Extract document content without AI. Returns `data` as the configured presentation " + "envelope (`fileOrder`, `files`, …) plus `_meta`; no duplicated service payload or bundled " + "plain-text column. Persisted images appear via `embeddedImageFileId` in internal serial only." ), dynamicMode=True, outputType="UdmDocument", @@ -151,8 +148,8 @@ class MethodContext(MethodBase): "mergeContext": WorkflowActionDefinition( actionId="context.mergeContext", description=( - "Führt eine Liste von Schrittergebnissen (z. B. ``bodyResults`` einer " - "``flow.loop``) zu einem zusammengeführten Dict zusammen." + "Führt Schritte zu einem Dict zusammen. ``data`` enthält einen versionierten Umschlag " + "(``context.mergeContext.v1``, ``merged``, ``response``, …) und ``_meta``." ), outputType="ActionResult", parameters={ @@ -210,10 +207,9 @@ class MethodContext(MethodBase): "transformContext": WorkflowActionDefinition( actionId="context.transformContext", description=( - "Transform the upstream payload via a list of {sourceField, outputField, " - "operation, type, expression} mappings. Operations: rename, cast, nest, " - "flatten, compute. compute uses {{...}} templates; nesting is implicit " - "via dotted outputField paths." + "Transform mappings on the upstream payload. ``data`` trägt " + "``schemaVersion``, ``kind: context.transformContext.v1``, die gemappten Felder " + "und optional ``_castErrors``, plus ``_meta``." ), outputType="Transit", parameters={ diff --git a/modules/workflows/methods/methodFile/actions/create.py b/modules/workflows/methods/methodFile/actions/create.py index e7ef569c..9342767f 100644 --- a/modules/workflows/methods/methodFile/actions/create.py +++ b/modules/workflows/methods/methodFile/actions/create.py @@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional -import asyncio +import ast import base64 import binascii import io @@ -12,79 +12,33 @@ import logging import re from modules.datamodels.datamodelChat import ActionResult, ActionDocument -from modules.serviceCenter.services.serviceGeneration.subDocumentUtility import ( - enhancePlainTextWithMarkdownTables, - markdownToDocumentJson, -) from modules.shared.i18nRegistry import normalizePrimaryLanguageTag from modules.workflows.automation2.executors.actionNodeExecutor import _coerce_document_data_to_bytes -from modules.workflows.methods.methodAi._common import is_image_action_document_list, serialize_context +from modules.workflows.methods.methodAi._common import is_image_action_document_list +from modules.workflows.methods.methodContext.actions.extractContent import ( + presentation_envelopes_to_document_json, +) logger = logging.getLogger(__name__) _SAFE_FILENAME = re.compile(r'[^\w\-.\(\)\s\[\]%@+]') -_HEAVY_CONTEXT_KEYS = frozenset({"imageDocumentsOnly", "documents", "inputs"}) - - -def _collect_image_documents_only(raw: Any) -> List[Any]: - """Resolve ``imageDocumentsOnly`` whether the context is merged, nested, or surfaced.""" - if not isinstance(raw, dict): - return [] - paths = ( - ("imageDocumentsOnly",), - ("merged", "imageDocumentsOnly"), - ("data", "merged", "imageDocumentsOnly"), - ("data", "imageDocumentsOnly"), - ) - for path in paths: - cur: Any = raw - ok = True - for p in path: - if not isinstance(cur, dict): - ok = False - break - cur = cur.get(p) - if ok and isinstance(cur, list) and cur: - return cur - return [] - - -def _context_string_for_report(raw: Any, output_format: str) -> str: - """Build one narrative string for ``markdownToDocumentJson`` / render. - - Prefer plain ``response`` text (merge node surfaces it; nested ``merged.response`` - too). Never dump ``inputs`` / binary lists into the PDF body — that produced giant - JSON + base64 "hash" paragraphs after merge + ``contextBuilder``. - """ - of = (output_format or "docx").strip().lower().lstrip(".") - if of == "json": - return serialize_context(raw, prefer_handover_primary=False) - if isinstance(raw, str): - return raw.strip().lstrip("\ufeff") - if isinstance(raw, dict): - for path in ( - ("response",), - ("merged", "response"), - ("data", "response"), - ("data", "merged", "response"), - ): - cur: Any = raw - ok = True - for k in path: - if not isinstance(cur, dict): - ok = False - break - cur = cur.get(k) - if ok and cur is not None and str(cur).strip(): - return str(cur).strip().lstrip("\ufeff") - lean = {k: v for k, v in raw.items() if k not in _HEAVY_CONTEXT_KEYS} +def _coerce_structured_context(raw: Any) -> Any: + """Undo legacy ``str`` coercion on structured refs (loop ``bodyResults``, presentation).""" + if not isinstance(raw, str): + return raw + stripped = raw.strip() + if not stripped or stripped[0] not in ("[", "{"): + return raw + for loader in (json.loads, ast.literal_eval): try: - return json.dumps(lean, ensure_ascii=False, indent=2, default=str) - except Exception: - return serialize_context(lean, prefer_handover_primary=False) - return serialize_context(raw, prefer_handover_primary=False) + parsed = loader(stripped) + except (json.JSONDecodeError, ValueError, SyntaxError, TypeError): + continue + if isinstance(parsed, (dict, list)): + return parsed + return raw def _raw_context_preview_for_log(raw: Any, max_len: int = 500) -> str: @@ -121,12 +75,6 @@ def _persistDocumentsToUserFiles( return if not mgmt: return - logger.info( - "file.create persist: mgmt=%s id(mgmt)=%s has_createFileData=%s", - type(mgmt).__name__, - id(mgmt), - hasattr(mgmt, "createFileData"), - ) for doc in action_documents: try: doc_data = doc.documentData if hasattr(doc, "documentData") else doc.get("documentData") @@ -149,15 +97,8 @@ def _persistDocumentsToUserFiles( or doc.get("mimeType") or "application/octet-stream" ) - logger.info( - "file.create persist: calling createFile name=%s bytes=%s", - doc_name, - len(content), - ) file_item = mgmt.createFile(doc_name, mime, content, folderId=folder_id) - logger.info("file.create persist: createFile returned id=%s", file_item.id) - ok = mgmt.createFileData(file_item.id, content) - logger.info("file.create persist: createFileData returned %s for id=%s", ok, file_item.id) + mgmt.createFileData(file_item.id, content) meta = getattr(doc, "validationMetadata", None) or doc.get("validationMetadata") or {} if isinstance(meta, dict): meta["fileId"] = file_item.id @@ -165,7 +106,6 @@ def _persistDocumentsToUserFiles( doc.validationMetadata = meta elif isinstance(doc, dict): doc["validationMetadata"] = meta - logger.info("file.create: persisted %s to user files (id=%s)", doc_name, file_item.id) except Exception as e: dname = getattr(doc, "documentName", None) or doc.get("documentName", "?") logger.warning("file.create: failed to persist document %s: %s", dname, e) @@ -215,100 +155,7 @@ def _load_image_bytes_from_action_doc(doc: dict, services) -> Optional[bytes]: return None -# Images larger than this threshold (decoded bytes) are resized before embedding -# to avoid multi-minute PDF rendering of high-res raster scans. -_MAX_IMAGE_EMBED_BYTES = 300_000 # 300 KB decoded ≈ ~400 KB base64 -_IMAGE_MAX_DIMENSION = 1200 # longest edge in pixels after resize - - -def _resize_image_for_document(image_bytes: bytes) -> bytes: - """Resize image to at most ``_IMAGE_MAX_DIMENSION`` px on the longest edge - and re-encode as JPEG. Falls back to the original bytes on any error.""" - try: - from PIL import Image as PILImage - import io as _io - - img = PILImage.open(_io.BytesIO(image_bytes)) - - # Flatten transparency / palette modes to RGB (required for JPEG) - if img.mode in ("RGBA", "LA"): - bg = PILImage.new("RGB", img.size, (255, 255, 255)) - bg.paste(img, mask=img.split()[-1]) - img = bg - elif img.mode == "P": - img = img.convert("RGBA") - bg = PILImage.new("RGB", img.size, (255, 255, 255)) - bg.paste(img, mask=img.split()[-1]) - img = bg - elif img.mode != "RGB": - img = img.convert("RGB") - - w, h = img.size - if max(w, h) > _IMAGE_MAX_DIMENSION: - # thumbnail() is optimised for downscaling: it uses an intermediate - # box-filter step before the final filter, making it 3-5× faster - # than resize() on large images. BILINEAR is fast and sufficient - # for document thumbnails. - img.thumbnail((_IMAGE_MAX_DIMENSION, _IMAGE_MAX_DIMENSION), PILImage.BILINEAR) - - out = _io.BytesIO() - img.save(out, format="JPEG", quality=85, optimize=True) - return out.getvalue() - except Exception as e: - logger.warning("file.create: image resize failed (%s) — using original bytes", e) - return image_bytes - - -def _append_images_to_content(structured_content: dict, image_docs: list, services=None) -> dict: - """Append images from imageDocumentsOnly as native image elements to the structured JSON. - - Each image becomes an ``image`` element with ``base64Data`` in a trailing - "Bilder" section of the first document. Images larger than - ``_MAX_IMAGE_EMBED_BYTES`` are automatically resized/compressed so the - synchronous PDF renderer does not block for minutes on high-res scans. - The renderers (DOCX / PDF) handle ``content.base64Data`` natively. - """ - elements = [] - for doc in image_docs: - b = _load_image_bytes_from_action_doc(doc, services) - if not b: - raw = doc.get("documentData") if isinstance(doc, dict) else None - if isinstance(raw, str): - try: - b = base64.b64decode(raw) - except Exception: - pass - if not b: - continue - - if len(b) > _MAX_IMAGE_EMBED_BYTES: - logger.info( - "file.create: image %s is %d bytes — resizing to max %dpx for embedding", - (doc.get("documentName") if isinstance(doc, dict) else "?") or "?", - len(b), - _IMAGE_MAX_DIMENSION, - ) - b = _resize_image_for_document(b) - - elements.append({ - "type": "image", - "content": { - "base64Data": base64.b64encode(b).decode("ascii"), - "alt": (doc.get("documentName") if isinstance(doc, dict) else None) or "image", - }, - }) - - if not elements: - return structured_content - - docs = structured_content.get("documents") - if isinstance(docs, list) and docs: - docs[0].setdefault("sections", []).append({"heading": "Bilder", "elements": elements}) - return structured_content - - def _images_list_to_pdf(image_bytes_list: List[bytes]) -> bytes: - """One PDF page per image; embedded raster data via PyMuPDF.""" import fitz pdf = fitz.open() @@ -322,7 +169,6 @@ def _images_list_to_pdf(image_bytes_list: List[bytes]) -> bytes: def _images_list_to_docx(image_bytes_list: List[bytes]) -> bytes: - """Images embedded in the document package (inline shapes), not hyperlinks.""" from docx import Document from docx.shared import Inches @@ -403,28 +249,13 @@ async def _create_merged_image_documents( async def create(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Create a file from context (text/markdown from upstream AI node). - Uses GenerationService.renderReport to produce docx, pdf, txt, md, html, xlsx, etc. - """ - raw_context = parameters.get("context", "") + """Create a file from ``context.extractContent`` presentation data via ``renderReport``.""" + raw_context = _coerce_structured_context(parameters.get("context", "")) if isinstance(raw_context, list) and is_image_action_document_list(raw_context): return await _create_merged_image_documents(self, parameters, raw_context) outputFormat = (parameters.get("outputFormat") or "docx").strip().lower().lstrip(".") - context = _context_string_for_report(raw_context, outputFormat) - - if not context: - logger.warning( - "file.create: context empty after resolve — raw_context type=%s raw_summary=%r " - "serialized_len=%s (check ActionNodeExecutor \"file.create context resolution\" log for DataRef / upstream).", - type(raw_context).__name__, - _raw_context_preview_for_log(raw_context), - len(context or ""), - ) - return ActionResult.isFailure(error="context is required (connect an AI node or provide text)") - title = (parameters.get("title") or "Document").strip() templateName = parameters.get("templateName") language = normalizePrimaryLanguageTag( @@ -438,31 +269,30 @@ async def create(self, parameters: Dict[str, Any]) -> ActionResult: folder_id = str(raw_folder).strip() try: - if outputFormat != "json": - context = enhancePlainTextWithMarkdownTables(context) - structured_content = markdownToDocumentJson(context, title, language) - if templateName: - structured_content.setdefault("metadata", {})["templateName"] = templateName + structured_content = presentation_envelopes_to_document_json( + raw_context, + title=title, + language=language, + services=self.services, + ) + except ValueError as e: + logger.warning( + "file.create: invalid presentation context type=%s preview=%r: %s", + type(raw_context).__name__, + _raw_context_preview_for_log(raw_context), + e, + ) + return ActionResult.isFailure(error=str(e)) - img_docs = _collect_image_documents_only(raw_context) - if img_docs: - # Image decoding and PIL resizing are CPU-bound; run them in a - # thread pool so the event loop is not blocked while processing - # high-res raster images (e.g. 3+ MB PNGs from PDF extraction). - loop = asyncio.get_event_loop() - structured_content = await loop.run_in_executor( - None, - _append_images_to_content, - structured_content, - img_docs, - self.services, - ) + if templateName: + structured_content.setdefault("metadata", {})["templateName"] = templateName - generation = getattr(self.services, "generation", None) - if not generation: - return ActionResult.isFailure(error="Generation service not available") + generation = getattr(self.services, "generation", None) + if not generation: + return ActionResult.isFailure(error="Generation service not available") - ai_service = getattr(self.services, "ai", None) + ai_service = getattr(self.services, "ai", None) + try: rendered_docs = await generation.renderReport( extractedContent=structured_content, outputFormat=outputFormat, @@ -472,43 +302,50 @@ async def create(self, parameters: Dict[str, Any]) -> ActionResult: aiService=ai_service, parentOperationId=parameters.get("parentOperationId"), ) - - if not rendered_docs: - return ActionResult.isFailure(error="Rendering produced no output") - - action_documents = [] - mime_map = { - "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "pdf": "application/pdf", - "txt": "text/plain", - "md": "text/markdown", - "html": "text/html", - "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - "csv": "text/csv", - "json": "application/json", - } - for rd in rendered_docs: - doc_data = rd.documentData if hasattr(rd, "documentData") else getattr(rd, "document_data", None) - doc_name = getattr(rd, "filename", None) or getattr(rd, "documentName", None) or getattr(rd, "document_name", f"output.{outputFormat}") - mime = getattr(rd, "mimeType", None) or getattr(rd, "mime_type", None) or mime_map.get(outputFormat, "application/octet-stream") - - if isinstance(doc_data, bytes): - doc_data = base64.b64encode(doc_data).decode("ascii") - - action_documents.append(ActionDocument( - documentName=doc_name, - documentData=doc_data, - mimeType=mime, - validationMetadata={ - "actionType": "file.create", - "outputFormat": outputFormat, - "templateName": templateName, - }, - )) - - _persistDocumentsToUserFiles(action_documents, self.services, folder_id=folder_id) - return ActionResult.isSuccess(documents=action_documents) - except Exception as e: - logger.error(f"file.create failed: {e}", exc_info=True) + logger.error("file.create failed: %s", e, exc_info=True) return ActionResult.isFailure(error=str(e)) + + if not rendered_docs: + return ActionResult.isFailure(error="Rendering produced no output") + + action_documents = [] + mime_map = { + "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "pdf": "application/pdf", + "txt": "text/plain", + "md": "text/markdown", + "html": "text/html", + "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "csv": "text/csv", + "json": "application/json", + } + for rd in rendered_docs: + doc_data = rd.documentData if hasattr(rd, "documentData") else getattr(rd, "document_data", None) + doc_name = ( + getattr(rd, "filename", None) + or getattr(rd, "documentName", None) + or getattr(rd, "document_name", f"output.{outputFormat}") + ) + mime = ( + getattr(rd, "mimeType", None) + or getattr(rd, "mime_type", None) + or mime_map.get(outputFormat, "application/octet-stream") + ) + + if isinstance(doc_data, bytes): + doc_data = base64.b64encode(doc_data).decode("ascii") + + action_documents.append(ActionDocument( + documentName=doc_name, + documentData=doc_data, + mimeType=mime, + validationMetadata={ + "actionType": "file.create", + "outputFormat": outputFormat, + "templateName": templateName, + }, + )) + + _persistDocumentsToUserFiles(action_documents, self.services, folder_id=folder_id) + return ActionResult.isSuccess(documents=action_documents) diff --git a/modules/workflows/methods/methodFile/methodFile.py b/modules/workflows/methods/methodFile/methodFile.py index 3f9dbd02..c30f86a4 100644 --- a/modules/workflows/methods/methodFile/methodFile.py +++ b/modules/workflows/methods/methodFile/methodFile.py @@ -35,10 +35,13 @@ class MethodFile(MethodBase): ), "context": WorkflowActionParameter( name="context", - type="str", + type="Any", frontendType=FrontendType.HIDDEN, required=False, - description="Injected from contentSource or upstream connection", + description=( + "Resolved context: presentation envelope(s) from context.extractContent " + "(dict or list, e.g. loop bodyResults), or legacy plain text string." + ), ), "outputFormat": WorkflowActionParameter( name="outputFormat", diff --git a/tests/unit/workflow/test_extract_content_handover.py b/tests/unit/workflow/test_extract_content_handover.py index e9a71636..9f436cbb 100644 --- a/tests/unit/workflow/test_extract_content_handover.py +++ b/tests/unit/workflow/test_extract_content_handover.py @@ -1,15 +1,26 @@ -# Unit tests: unified extractContent handover (text vs image sidecars). +# Unit tests: context.extractContent serialize + presentation helpers (legacy handover dicts vs new paths). import base64 +import copy as _copy + +from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart from modules.workflows.methods.methodContext.actions.extractContent import ( HANDOVER_KIND, + EXTRACT_PAYLOAD_SCHEMA_VERSION, _apply_content_filter, + _canonical_content_filter, + _joined_text_from_content_extracted_serial, + _filter_extractions_by_content_filter, _joined_text_from_handover_payload, - _split_images_to_sidecar_documents, + _persist_extracted_image_parts, + _serialize_content_extracted_for_output, + build_presentation_for_extractions, build_presentation_for_payload, + joined_text_from_extract_node_data, parse_presentation_parameters, presentation_response_text, + summarize_presentation_payload, ) @@ -30,6 +41,120 @@ def test_joined_text_orders_text_table_and_skips_container(): assert _joined_text_from_handover_payload(payload) == "A\n\nB" +def test_joined_text_from_extract_node_data_prefers_content_extracted(): + data = { + "contentExtracted": [ + {"id": "x", "parts": [{"typeGroup": "text", "mimeType": "text/plain", "data": "Z", "id": "p"}]} + ] + } + assert joined_text_from_extract_node_data(data) == "Z" + + +def test_joined_text_serial_list(): + items = [{"parts": [{"typeGroup": "text", "mimeType": "text/plain", "data": "a", "id": "1"}]}] + assert _joined_text_from_content_extracted_serial(items) == "a" + + +def test_serialize_content_extracted_drops_summary(): + ce = ContentExtracted( + id="doc1", + parts=[ContentPart(id="p", label="main", typeGroup="text", mimeType="text/plain", data="hi")], + summary={"ignored": True}, + ) + d = _serialize_content_extracted_for_output(ce) + assert "summary" not in d + + +def test_persist_images_without_run_context_is_noop(): + raw = b"fake-binary-image" + b64 = base64.b64encode(raw).decode("ascii") + serial = [ + { + "id": "1", + "parts": [ + {"typeGroup": "text", "data": "x", "mimeType": "text/plain", "id": "t1"}, + {"typeGroup": "image", "mimeType": "image/png", "data": b64, "id": "img1"}, + ], + } + ] + original = _copy.deepcopy(serial) + out, arts = _persist_extracted_image_parts(serial, name_stem="stem", run_context=None) + assert arts == [] + assert out == original + + +def test_filter_extractions_by_content_filter_text_only(): + ec = ContentExtracted( + id="id1", + parts=[ + ContentPart(id="t", label="t", typeGroup="text", mimeType="text/plain", data="a"), + ContentPart(id="i", label="i", typeGroup="image", mimeType="image/png", data=""), + ], + ) + out = _filter_extractions_by_content_filter([ec], "textOnly") + assert len(out) == 1 + assert len(out[0].parts) == 1 + assert out[0].parts[0].typeGroup == "text" + + +def test_canonical_content_filter_is_case_insensitive(): + assert _canonical_content_filter("imagesOnly") == "imagesOnly" + assert _canonical_content_filter("IMAGESONLY") == "imagesOnly" + assert _canonical_content_filter("textOnly") == "textOnly" + assert _canonical_content_filter("unknown") == "all" + + +def test_parse_presentation_parameters_content_filter_all_coerces_legacy_pdf_text(): + """Graphs with „Alles“ but stored pdfExtractMode ``text`` must not drop image parts in presentation.""" + cfg = parse_presentation_parameters({"contentFilter": "all", "pdfExtractMode": "text"}) + assert cfg["pdfExtractMode"] == "all" + + +def test_parse_presentation_parameters_images_only_defaults_pdf_mode(): + cfg = parse_presentation_parameters({"contentFilter": "imagesOnly"}) + assert cfg["pdfExtractMode"] == "images" + + +def test_presentation_lines_includes_redacted_image_parts_when_pdf_mode_all(): + payload = { + "fileOrder": ["f1"], + "files": { + "f1": { + "sourceFileName": "x.pdf", + "parts": [ + {"typeGroup": "text", "data": "body", "id": "t"}, + {"typeGroup": "image", "mimeType": "image/png", "data": "YQ==", "id": "img1"}, + ], + }, + }, + } + cfg = parse_presentation_parameters({"contentFilter": "all", "outputMode": "lines", "pdfExtractMode": "all"}) + pres = build_presentation_for_payload(payload, cfg) + bf = pres["files"]["f1"] + assert len(bf["data"]) == 2 + assert bf["data"][0]["typeGroup"] == "text" + assert bf["data"][0]["lines"] == ["body"] + assert bf["data"][1]["typeGroup"] == "image" + assert bf["data"][1]["lines"] == [] + assert bf["data"][1].get("data") == "" + assert "imageParts" not in bf + + +def test_build_presentation_for_extractions_matches_payload_path(): + ce = ContentExtracted( + id="id", + parts=[ContentPart(id="p", label="main", typeGroup="text", mimeType="text/plain", data="a\n\nb")], + ) + cfg = parse_presentation_parameters({"outputMode": "lines", "splitBy": "paragraph"}) + pres = build_presentation_for_extractions([ce], ["f.txt"], cfg) + fk = pres["fileOrder"][0] + b1 = pres["files"][fk] + assert b1["outputMode"] == "lines" + assert len(b1["data"]) == 1 + assert b1["data"][0]["lines"] == ["a", "b"] + assert "items" not in b1 + + def test_joined_text_includes_csv_table_parts(): payload = { "fileOrder": ["f1"], @@ -44,47 +169,6 @@ def test_joined_text_includes_csv_table_parts(): assert _joined_text_from_handover_payload(payload) == "a,b\n1,2" -def test_split_images_moves_pixels_to_blob_docs(): - raw = b"fake-binary-image" - b64 = base64.b64encode(raw).decode("ascii") - payload = { - "kind": HANDOVER_KIND, - "schemaVersion": 1, - "fileOrder": ["f1"], - "files": { - "f1": { - "parts": [ - {"typeGroup": "text", "data": "x", "id": "t1"}, - { - "typeGroup": "image", - "mimeType": "image/png", - "data": b64, - "id": "p1-img", - "metadata": {}, - }, - ] - } - }, - } - stripped, blobs = _split_images_to_sidecar_documents(payload, document_name_stem="abc") - assert len(blobs) == 1 - assert blobs[0].mimeType == "image/png" - assert blobs[0].documentData == raw - assert blobs[0].documentName.endswith(".png") - assert blobs[0].documentName.startswith("extract_media_") - meta = blobs[0].validationMetadata or {} - assert meta.get("handoverRole") == "extractedMedia" - img_parts = [ - p - for p in stripped["files"]["f1"]["parts"] - if isinstance(p, dict) and (p.get("typeGroup") or "") == "image" - ] - assert len(img_parts) == 1 - assert img_parts[0]["data"] == "" - assert img_parts[0]["handoverMediaDocumentName"] == blobs[0].documentName - assert "image" in stripped["files"]["f1"]["byTypeGroup"] - - def _mixed_payload(): return { "kind": HANDOVER_KIND, @@ -106,7 +190,7 @@ def _mixed_payload(): def test_content_filter_all_is_noop(): payload = _mixed_payload() result = _apply_content_filter(payload, "all") - assert result is payload # same object, no copy + assert result is payload def test_content_filter_text_only_keeps_text_table_structure(): @@ -129,7 +213,6 @@ def test_content_filter_no_images_removes_only_images(): parts = result["files"]["f1"]["parts"] type_groups = {p["typeGroup"] for p in parts} assert "image" not in type_groups - # text, table, structure all remain assert {"text", "table", "structure"} == type_groups @@ -137,14 +220,7 @@ def test_content_filter_text_only_joined_text_has_no_image_data(): result = _apply_content_filter(_mixed_payload(), "textOnly") text = _joined_text_from_handover_payload(result) assert "hello" in text - assert "abc=" not in text # base64 image data must not appear - - -def test_content_filter_text_only_no_sidecars(): - """textOnly: no image parts → _split produces zero sidecars.""" - result = _apply_content_filter(_mixed_payload(), "textOnly") - stripped, blobs = _split_images_to_sidecar_documents(result, document_name_stem="test") - assert blobs == [] + assert "abc=" not in text def test_presentation_lines_and_response(): @@ -162,9 +238,12 @@ def test_presentation_lines_and_response(): } cfg = parse_presentation_parameters({"outputMode": "lines", "splitBy": "paragraph"}) pres = build_presentation_for_payload(payload, cfg) - assert pres["files"]["f1"]["outputMode"] == "lines" - assert [it["text"] for it in pres["files"]["f1"]["items"]] == ["a", "b"] - assert presentation_response_text(pres, payload) == "a\n\nb" + b1 = pres["files"]["f1"] + assert b1["outputMode"] == "lines" + assert isinstance(b1["data"], list) + assert len(b1["data"]) == 1 + assert b1["data"][0]["lines"] == ["a", "b"] + assert presentation_response_text(pres) == "a\n\nb" def test_presentation_pdf_mode_tables_only(): @@ -182,7 +261,9 @@ def test_presentation_pdf_mode_tables_only(): } cfg = parse_presentation_parameters({"pdfExtractMode": "tables", "outputMode": "blob"}) pres = build_presentation_for_payload(payload, cfg) - assert pres["files"]["f1"]["text"] == "h1,h2\n1,2" + bf = pres["files"]["f1"] + assert isinstance(bf["data"], str) + assert bf["data"] == "h1,h2\n1,2" def test_presentation_csv_rows(): @@ -195,7 +276,7 @@ def test_presentation_csv_rows(): }, }, } - cfg = parse_presentation_parameters({"csvHeaderRow": "true"}) + cfg = parse_presentation_parameters({"outputMode": "structured", "csvHeaderRow": "true"}) pres = build_presentation_for_payload(payload, cfg) csv = pres["files"]["f1"]["csv"] assert csv["headers"] == ["a", "b"] @@ -222,6 +303,11 @@ def test_presentation_pages_groups_by_page_index(): (0, ["p0"]), (1, ["p1a", "p1b"]), ] + pdata = pres["files"]["f1"]["data"] + assert pdata == [ + {"pageIndex": 0, "lines": ["p0"]}, + {"pageIndex": 1, "lines": ["p1a", "p1b"]}, + ] def test_presentation_chunks_with_overlap_chars(): @@ -235,9 +321,10 @@ def test_presentation_chunks_with_overlap_chars(): pres = build_presentation_for_payload(payload, cfg) texts = [c["text"] for c in pres["files"]["f1"]["chunks"]] assert texts == ["abcd", "cdef", "efgh", "ghij"] + assert pres["files"]["f1"]["data"] == texts -def test_presentation_stripped_payload_gains_presentation_key_after_split(): +def test_presentation_keeps_pres_key_after_inline_image_strip_simulation(): raw = b"x" b64 = base64.b64encode(raw).decode("ascii") payload = { @@ -254,7 +341,339 @@ def test_presentation_stripped_payload_gains_presentation_key_after_split(): }, } pres = build_presentation_for_payload(payload, parse_presentation_parameters({})) - stripped, _blobs = _split_images_to_sidecar_documents(payload, document_name_stem="s") - stripped["presentation"] = pres - assert "presentation" in stripped - assert stripped["presentation"]["files"]["f1"]["items"] + serial = _copy.deepcopy([{"id": "1", "parts": payload["files"]["f1"]["parts"]}]) + stayed, arts = _persist_extracted_image_parts(serial, name_stem="s", run_context=None) + assert arts == [] + wrapper = {**pres, "_meta": {}} + fk = pres["fileOrder"][0] + assert isinstance(wrapper["files"][fk].get("data"), list) + assert len(wrapper["files"][fk]["data"]) == 2 + + +def test_summarize_presentation_payload_shape(): + payload = { + "fileOrder": ["f1"], + "files": {"f1": {"sourceFileName": "t.txt", "parts": [{"typeGroup": "text", "data": "hello", "id": "a"}]}}, + } + pres = build_presentation_for_payload(payload, parse_presentation_parameters({"outputMode": "blob"})) + s = summarize_presentation_payload(pres) + assert s["fileOrder"] == ["f1"] + assert "f1" in s["files"] + assert s["files"]["f1"]["outputMode"] == "blob" + assert s["files"]["f1"]["stringLength"] == 5 + assert "hello" in (s["files"]["f1"].get("head") or "") + + +def test_joined_text_from_extract_node_data_uses_presentation_root(): + from modules.workflows.methods.methodContext.actions.extractContent import PRESENTATION_KIND + + data = { + "schemaVersion": 1, + "kind": PRESENTATION_KIND, + "outputMode": "lines", + "fileOrder": ["f1"], + "files": {"f1": {"outputMode": "lines", "sourceFileName": "x.txt", "data": ["body"]}}, + "_meta": {"extractPayloadSchemaVersion": EXTRACT_PAYLOAD_SCHEMA_VERSION}, + } + assert joined_text_from_extract_node_data(data) == "body" + assert data["_meta"]["extractPayloadSchemaVersion"] == EXTRACT_PAYLOAD_SCHEMA_VERSION + + +def test_action_result_contract_new_extract_payload_keys(): + from modules.workflows.methods.methodContext.actions.extractContent import PRESENTATION_KIND + + data = { + "schemaVersion": 1, + "kind": PRESENTATION_KIND, + "outputMode": "lines", + "fileOrder": ["f1"], + "files": {"f1": {"outputMode": "lines", "sourceFileName": "x.txt", "data": ["body"]}}, + "_meta": {"actionType": "context.extractContent", "extractPayloadSchemaVersion": EXTRACT_PAYLOAD_SCHEMA_VERSION}, + } + assert data["kind"] == PRESENTATION_KIND + assert joined_text_from_extract_node_data(data) == "body" + + +def test_automation_workspace_suppresses_extract_artifacts(): + from modules.workflows.automation2.workflowArtifactVisibility import suppress_workflow_file_in_workspace_ui + + assert suppress_workflow_file_in_workspace_ui({"fileName": "extracted_content_transient-abc_99.json"}) + assert suppress_workflow_file_in_workspace_ui({"fileName": "extract_media_stem_uuid.png"}) + assert not suppress_workflow_file_in_workspace_ui({"fileName": "export_2026.csv"}) + assert suppress_workflow_file_in_workspace_ui({"fileName": "", "suppressInWorkflowFileLists": True}) + assert suppress_workflow_file_in_workspace_ui({"fileName": "report.pdf", "tags": ["_workflowInternal"]}) + assert not suppress_workflow_file_in_workspace_ui({"fileName": "report.pdf", "tags": ["invoice"]}) + + +def test_normalize_presentation_envelopes_action_result_and_list(): + from modules.workflows.methods.methodContext.actions.extractContent import ( + PRESENTATION_KIND, + normalize_presentation_envelopes, + ) + + pres = { + "kind": PRESENTATION_KIND, + "fileOrder": ["f1"], + "files": {"f1": {"outputMode": "lines", "sourceFileName": "x.txt", "data": []}}, + } + wrapped = {"success": True, "data": pres} + assert len(normalize_presentation_envelopes(wrapped)) == 1 + assert len(normalize_presentation_envelopes([wrapped])) == 1 + + +def test_method_base_preserves_run_context_injection(): + from modules.workflows.methods.methodFile.methodFile import MethodFile + + class _Svc: + pass + + action_def = MethodFile(_Svc())._actions["create"] + validated = MethodFile(_Svc())._validateParameters( + {"context": "x", "outputFormat": "pdf", "_runContext": {"mandateId": "m", "instanceId": "i"}}, + action_def.parameters, + ) + assert validated.get("_runContext") == {"mandateId": "m", "instanceId": "i"} + + +def test_presentation_envelopes_to_document_json_one_section_per_data_slot(): + from modules.workflows.methods.methodContext.actions.extractContent import ( + PRESENTATION_KIND, + presentation_envelopes_to_document_json, + ) + + pres = { + "kind": PRESENTATION_KIND, + "outputMode": "lines", + "fileOrder": ["f1"], + "files": { + "f1": { + "outputMode": "lines", + "sourceFileName": "a.pdf", + "data": [ + { + "typeGroup": "text", + "mimeType": "text/plain", + "data": "ignored", + "lines": ["Line A", "Line B"], + }, + ], + }, + }, + } + out = presentation_envelopes_to_document_json( + {"success": True, "data": pres}, + title="T", + language="de", + ) + paragraphs = [ + s for s in out["documents"][0]["sections"] + if s.get("content_type") == "paragraph" + ] + assert len(paragraphs) == 1 + runs = paragraphs[0]["elements"][0]["content"]["inlineRuns"] + joined = "".join(r.get("value", "") for r in runs) + assert "Line A" in joined + assert "Line B" in joined + assert "\n" in joined + + +def test_presentation_envelopes_table_slot_becomes_table_section(): + from modules.workflows.methods.methodContext.actions.extractContent import ( + PRESENTATION_KIND, + presentation_envelopes_to_document_json, + ) + + pres = { + "kind": PRESENTATION_KIND, + "outputMode": "lines", + "fileOrder": ["f1"], + "files": { + "f1": { + "outputMode": "lines", + "sourceFileName": "sheet.csv", + "data": [ + { + "typeGroup": "table", + "mimeType": "text/csv", + "data": '"Name","Amount"\n"Alice","100"\n"Bob","200"', + "lines": [], + }, + ], + }, + }, + } + out = presentation_envelopes_to_document_json( + {"success": True, "data": pres}, + title="T", + language="de", + ) + tables = [s for s in out["documents"][0]["sections"] if s.get("content_type") == "table"] + assert len(tables) == 1 + content = tables[0]["elements"][0]["content"] + assert content["headers"] == ["Name", "Amount"] + assert content["rows"] == [["Alice", "100"], ["Bob", "200"]] + + +def test_presentation_line_slot_preserves_table_without_lines(): + from modules.workflows.methods.methodContext.actions.extractContent import ( + _presentation_line_slot_from_part, + _presentation_line_slots_from_part, + parse_presentation_parameters, + ) + + cfg = parse_presentation_parameters({"outputMode": "lines", "splitBy": "newline"}) + part = { + "typeGroup": "table", + "mimeType": "text/csv", + "data": '"A","B"\n"1","2"\n"3","4"', + "id": "t1", + } + slot = _presentation_line_slot_from_part(part, cfg) + assert slot.get("lines") == [] + assert slot.get("data") == part["data"] + slots = _presentation_line_slots_from_part(part, cfg) + assert len(slots) == 3 + assert slots[0]["lines"] == ['"A","B"'] + assert slots[1]["lines"] == ['"1","2"'] + + +def test_presentation_envelopes_preserves_data_slot_order_text_image_text(): + import base64 + + from modules.workflows.methods.methodContext.actions.extractContent import ( + PRESENTATION_KIND, + presentation_envelopes_to_document_json, + ) + + class _Mgmt: + def getFileData(self, _fid: str) -> bytes: + return base64.b64decode( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8BQDwAEhQGAhKmMIQAAAABJRU5ErkJggg==" + ) + + class _Svc: + interfaceDbComponent = _Mgmt() + + pres = { + "kind": PRESENTATION_KIND, + "outputMode": "lines", + "fileOrder": ["f1"], + "files": { + "f1": { + "outputMode": "lines", + "sourceFileName": "a.pdf", + "data": [ + {"typeGroup": "text", "mimeType": "text/plain", "lines": ["Before"]}, + { + "typeGroup": "image", + "mimeType": "image/png", + "embeddedImageFileId": "00000000-0000-0000-0000-000000000001", + }, + {"typeGroup": "text", "mimeType": "text/plain", "lines": ["After"]}, + ], + }, + }, + } + out = presentation_envelopes_to_document_json( + {"success": True, "data": pres}, + title="T", + language="de", + services=_Svc(), + ) + types = [s.get("content_type") for s in out["documents"][0]["sections"]] + assert types == ["paragraph", "image", "paragraph"] + + +def test_presentation_envelopes_to_document_json_text_slots(): + from modules.workflows.methods.methodContext.actions.extractContent import ( + PRESENTATION_KIND, + presentation_envelopes_to_document_json, + ) + + pres = { + "kind": PRESENTATION_KIND, + "outputMode": "lines", + "fileOrder": ["f1"], + "files": { + "f1": { + "outputMode": "lines", + "sourceFileName": "a.pdf", + "data": [ + { + "typeGroup": "text", + "mimeType": "text/plain", + "data": "Hello", + "lines": ["Hello", "World"], + }, + ], + }, + }, + } + out = presentation_envelopes_to_document_json( + [{"success": True, "data": pres}], + title="T", + language="de", + ) + paragraphs = [ + s for s in out["documents"][0]["sections"] + if s.get("content_type") == "paragraph" + ] + assert len(paragraphs) == 1 + all_text = [] + for p in paragraphs: + runs = p["elements"][0]["content"]["inlineRuns"] + all_text.append("".join(r.get("value", "") for r in runs)) + assert any("Hello" in t for t in all_text) + assert any("World" in t for t in all_text) + + +def test_presentation_envelopes_to_document_json_image_slot(): + import base64 + + from modules.workflows.methods.methodContext.actions.extractContent import ( + PRESENTATION_KIND, + presentation_envelopes_to_document_json, + ) + + fid = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee" + pres = { + "kind": PRESENTATION_KIND, + "outputMode": "lines", + "fileOrder": ["f1"], + "files": { + "f1": { + "outputMode": "lines", + "sourceFileName": "a.pdf", + "data": [ + { + "typeGroup": "image", + "mimeType": "image/png", + "embeddedImageFileId": fid, + "embeddedImageFileName": "clip.png", + }, + ], + }, + }, + } + + class _Mgmt: + def getFileData(self, file_id): + assert file_id == fid + return b"\x89PNG\r\n" + + class _Svc: + interfaceDbComponent = _Mgmt() + + out = presentation_envelopes_to_document_json( + pres, + title="Img", + language="de", + services=_Svc(), + ) + img_secs = [ + s for s in out["documents"][0]["sections"] + if s.get("content_type") == "image" + ] + assert len(img_secs) == 1 + b64 = img_secs[0]["elements"][0]["content"]["base64Data"] + assert base64.b64decode(b64).startswith(b"\x89PNG") diff --git a/tests/unit/workflow/test_merge_context_handover.py b/tests/unit/workflow/test_merge_context_handover.py index c89de1e3..cd2bdfc3 100644 --- a/tests/unit/workflow/test_merge_context_handover.py +++ b/tests/unit/workflow/test_merge_context_handover.py @@ -45,7 +45,7 @@ async def test_mergeContext_handover_only_in_documents_yields_data_response(): } result = await mergeContext(object(), {"dataSource": [item]}) assert result.success - assert result.data + assert result.data.get("kind") == "context.mergeContext.v1" assert result.data.get("response") == "only-from-handover" @@ -176,3 +176,24 @@ async def test_mergeContext_accumulates_image_documents_only_across_iterations() names = [d.get("documentName") for d in imgs] assert "img_a.png" in names assert "img_b.png" in names + + +@pytest.mark.asyncio +async def test_transform_context_envelope_has_kind_and_meta(): + from modules.workflows.methods.methodContext.actions.transformContext import transformContext + + svc = object() + result = await transformContext( + svc, + { + "mappings": [{"operation": "rename", "sourceField": "a", "outputField": "b"}], + "_upstreamPayload": {"a": 42}, + }, + ) + assert result.success and result.data + assert result.data.get("kind") == "context.transformContext.v1" + assert result.data.get("schemaVersion") == 1 + assert result.data.get("b") == 42 + meta = result.data.get("_meta") + assert isinstance(meta, dict) + assert meta.get("actionType") == "context.transformContext" diff --git a/tests/unit/workflow/test_phase3_context_node.py b/tests/unit/workflow/test_phase3_context_node.py index 07496025..76fbc972 100644 --- a/tests/unit/workflow/test_phase3_context_node.py +++ b/tests/unit/workflow/test_phase3_context_node.py @@ -18,6 +18,7 @@ def test_context_extractContent_node_exists(): def test_context_extractContent_node_shape(): node = next(n for n in STATIC_NODE_TYPES if n["id"] == "context.extractContent") assert node["category"] == "context" + assert node.get("injectRunContext") is True assert node["meta"]["usesAi"] is False assert node["_method"] == "context" assert node["_action"] == "extractContent" @@ -43,7 +44,16 @@ def test_context_extractContent_node_shape(): ] pick_paths = [opt["path"] for opt in node["outputPorts"][0]["dataPickOptions"]] - assert ["documents", 0, "documentData", "presentation"] in pick_paths + assert ["data", "files"] in pick_paths + assert ["data", "_meta"] in pick_paths + + + +def test_context_transformContext_has_envelope_data_pick_paths(): + node = next(n for n in STATIC_NODE_TYPES if n["id"] == "context.transformContext") + pick_paths = [opt["path"] for opt in node["outputPorts"][0]["dataPickOptions"]] + assert ["data"] in pick_paths + assert ["data", "_meta"] in pick_paths def test_udm_port_types_registered(): @@ -85,6 +95,14 @@ def test_getExecutor_dispatches_context(): assert isinstance(executor, ActionNodeExecutor) +def test_context_mergeContext_has_envelope_data_pick_paths(): + node = next(n for n in STATIC_NODE_TYPES if n["id"] == "context.mergeContext") + pick_paths = [opt["path"] for opt in node["outputPorts"][0]["dataPickOptions"]] + assert ["data"] in pick_paths + assert ["data", "_meta"] in pick_paths + assert ["merged"] in pick_paths + + def test_context_mergeContext_surfaces_data_pick_paths_match_node_outputs(): """DataPicker uses paths like ``merged``; executor must surface ``data.*`` to top level.""" node = next(n for n in STATIC_NODE_TYPES if n["id"] == "context.mergeContext") diff --git a/tests/unit/workflow/test_serialize_context_and_file_create_context.py b/tests/unit/workflow/test_serialize_context_and_file_create_context.py deleted file mode 100644 index 57ae3823..00000000 --- a/tests/unit/workflow/test_serialize_context_and_file_create_context.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2025 Patrick Motsch -# All rights reserved. - -import json - -from modules.workflows.methods.methodAi._common import serialize_context -from modules.serviceCenter.services.serviceGeneration.subDocumentUtility import ( - enhancePlainTextWithMarkdownTables, - markdownToDocumentJson, -) -from modules.workflows.methods.methodFile.actions.create import ( - _collect_image_documents_only, - _context_string_for_report, -) - - -def test_serialize_context_nonserializable_embeds_via_default_str(): - class _Ns: - def __str__(self): - return "ns" - - s = serialize_context({"x": _Ns(), "n": 1}) - parsed = json.loads(s) - assert parsed["n"] == 1 - assert "ns" in parsed["x"] - - -def test_serialize_context_strips_bom_on_plain_string(): - assert serialize_context("\ufeffhello") == "hello" - - -def test_context_string_docx_prefers_response_over_full_dict(): - body = "Datum;Mandant\n2026-01-01;acme" - ctx = {"response": "\ufeff" + body, "data": {"foo": 1}} - assert _context_string_for_report(ctx, "docx") == body - - -def test_context_string_json_serializes_full_structure(): - ctx = {"response": "hi", "data": {"foo": 1}} - out = _context_string_for_report(ctx, "json") - assert json.loads(out)["data"]["foo"] == 1 - - -def test_serialize_context_prefers_response_when_json_fails(): - d: dict = {"response": "plain", "n": 1} - d["_loop"] = d # circular — json.dumps fails - assert serialize_context(d).strip() == "plain" - - -def test_serialize_context_prefer_handover_primary_skips_metadata(): - blob = {"response": "LINE", "data": {"nested": {"x" * 200}}, "extra": {"y": 2}} - s = serialize_context(blob, prefer_handover_primary=True) - assert s == "LINE" - - -def test_context_string_plain_str_passthrough_docx(): - assert _context_string_for_report(" hello ", "docx") == "hello" - - -def test_collect_image_documents_nested_paths(): - imgs = [{"documentName": "m.png", "mimeType": "image/png"}] - assert _collect_image_documents_only({"merged": {"imageDocumentsOnly": imgs}}) == imgs - assert _collect_image_documents_only({"data": {"merged": {"imageDocumentsOnly": imgs}}}) == imgs - - -def test_context_string_prefers_merged_response_over_inputs_noise(): - raw = {"merged": {"response": "from-merged"}, "inputs": {"0": {"documentData": "X" * 10000}}} - assert _context_string_for_report(raw, "docx") == "from-merged" - - -def test_context_string_fallback_json_strips_heavy_keys(): - raw = {"foo": 1, "inputs": {"nasty": True}, "imageDocumentsOnly": [{"documentName": "x"}]} - out = _context_string_for_report(raw, "docx") - parsed = json.loads(out) - assert "inputs" not in parsed - assert "imageDocumentsOnly" not in parsed - assert parsed["foo"] == 1 - - -def test_enhance_plain_csv_semicolon_to_markdown_table(): - body = "Datum;Betrag\n2026-01-01;12.50\n2026-01-02;3.00" - out = enhancePlainTextWithMarkdownTables(body) - assert "| Datum |" in out - assert "| Betrag |" in out - assert "---" in out - - -def test_enhance_preserves_normal_paragraphs(): - body = "Ein Absatz ohne Raster.\n\nZweiter Gedanke." - assert enhancePlainTextWithMarkdownTables(body) == body - - -def test_enhance_then_markdown_json_contains_table_section(): - body = "Datum;Betrag\n2026-01-01;12\n2026-01-02;3" - enhanced = enhancePlainTextWithMarkdownTables(body) - doc = markdownToDocumentJson(enhanced, "Report", "de") - sections = doc["documents"][0]["sections"] - assert any(s.get("content_type") == "table" for s in sections)