514 lines
20 KiB
Python
514 lines
20 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
||
# All rights reserved.
|
||
|
||
from typing import Any, Dict, List, Optional
|
||
|
||
import asyncio
|
||
import base64
|
||
import binascii
|
||
import io
|
||
import json
|
||
import logging
|
||
import re
|
||
|
||
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
|
||
from modules.serviceCenter.services.serviceGeneration.subDocumentUtility import (
|
||
enhancePlainTextWithMarkdownTables,
|
||
markdownToDocumentJson,
|
||
)
|
||
from modules.shared.i18nRegistry import normalizePrimaryLanguageTag
|
||
from modules.workflows.automation2.executors.actionNodeExecutor import _coerce_document_data_to_bytes
|
||
from modules.workflows.methods.methodAi._common import is_image_action_document_list, serialize_context
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
# Replacement target for characters that are unsafe in generated filenames:
# everything outside word chars, dash, dot, parentheses, whitespace,
# brackets, percent, at-sign and plus (used by _sanitize_output_stem).
_SAFE_FILENAME = re.compile(r'[^\w\-.\(\)\s\[\]%@+]')


# Context keys carrying large/binary payloads (document lists, raw inputs);
# stripped before a dict context is serialized into a report body so the
# rendered PDF/DOCX does not contain giant JSON/base64 dumps.
_HEAVY_CONTEXT_KEYS = frozenset({"imageDocumentsOnly", "documents", "inputs"})
|
||
|
||
|
||
def _collect_image_documents_only(raw: Any) -> List[Any]:
|
||
"""Resolve ``imageDocumentsOnly`` whether the context is merged, nested, or surfaced."""
|
||
if not isinstance(raw, dict):
|
||
return []
|
||
paths = (
|
||
("imageDocumentsOnly",),
|
||
("merged", "imageDocumentsOnly"),
|
||
("data", "merged", "imageDocumentsOnly"),
|
||
("data", "imageDocumentsOnly"),
|
||
)
|
||
for path in paths:
|
||
cur: Any = raw
|
||
ok = True
|
||
for p in path:
|
||
if not isinstance(cur, dict):
|
||
ok = False
|
||
break
|
||
cur = cur.get(p)
|
||
if ok and isinstance(cur, list) and cur:
|
||
return cur
|
||
return []
|
||
|
||
|
||
def _context_string_for_report(raw: Any, output_format: str) -> str:
|
||
"""Build one narrative string for ``markdownToDocumentJson`` / render.
|
||
|
||
Prefer plain ``response`` text (merge node surfaces it; nested ``merged.response``
|
||
too). Never dump ``inputs`` / binary lists into the PDF body — that produced giant
|
||
JSON + base64 "hash" paragraphs after merge + ``contextBuilder``.
|
||
"""
|
||
of = (output_format or "docx").strip().lower().lstrip(".")
|
||
if of == "json":
|
||
return serialize_context(raw, prefer_handover_primary=False)
|
||
if isinstance(raw, str):
|
||
return raw.strip().lstrip("\ufeff")
|
||
if isinstance(raw, dict):
|
||
for path in (
|
||
("response",),
|
||
("merged", "response"),
|
||
("data", "response"),
|
||
("data", "merged", "response"),
|
||
):
|
||
cur: Any = raw
|
||
ok = True
|
||
for k in path:
|
||
if not isinstance(cur, dict):
|
||
ok = False
|
||
break
|
||
cur = cur.get(k)
|
||
if ok and cur is not None and str(cur).strip():
|
||
return str(cur).strip().lstrip("\ufeff")
|
||
lean = {k: v for k, v in raw.items() if k not in _HEAVY_CONTEXT_KEYS}
|
||
try:
|
||
return json.dumps(lean, ensure_ascii=False, indent=2, default=str)
|
||
except Exception:
|
||
return serialize_context(lean, prefer_handover_primary=False)
|
||
return serialize_context(raw, prefer_handover_primary=False)
|
||
|
||
|
||
def _raw_context_preview_for_log(raw: Any, max_len: int = 500) -> str:
|
||
if raw is None:
|
||
return "None"
|
||
s = raw if isinstance(raw, str) else repr(raw)
|
||
s = s.replace("\r", "\\r").replace("\n", "\\n")
|
||
if len(s) <= max_len:
|
||
return s
|
||
return s[:max_len] + f"...<{len(s)} chars>"
|
||
|
||
|
||
def _doc_field(doc: Any, key: str, default: Any = None) -> Any:
    """Read ``key`` from an ActionDocument-like object or a plain dict.

    Tries attribute access first, then dict access; returns ``default`` when
    the value is missing or falsy. Never raises for either document shape
    (the previous inline ``doc.get(...)`` calls raised AttributeError for
    attribute-style documents whose attribute was None).
    """
    value = getattr(doc, key, None)
    if not value and isinstance(doc, dict):
        value = doc.get(key)
    return value if value else default


def _persistDocumentsToUserFiles(
    action_documents: list,
    services,
    folder_id: Optional[str] = None,
) -> None:
    """Persist file.create output documents to user's file storage (like upload).

    Best-effort: failures are logged per document and never raised. Adds
    ``fileId`` to each document's ``validationMetadata`` so the UI can build
    download links.
    """
    # Reuse the shared resolver instead of duplicating its fallback logic.
    mgmt = _get_management_interface(services)
    if not mgmt:
        return
    logger.info(
        "file.create persist: mgmt=%s id(mgmt)=%s has_createFileData=%s",
        type(mgmt).__name__,
        id(mgmt),
        hasattr(mgmt, "createFileData"),
    )
    for doc in action_documents:
        try:
            doc_data = _doc_field(doc, "documentData")
            if not doc_data:
                continue
            if isinstance(doc_data, str):
                # Prefer strict base64 decoding; fall back to raw UTF-8 bytes
                # for plain-text payloads.
                try:
                    content = base64.b64decode(doc_data, validate=True)
                except (TypeError, ValueError, binascii.Error):
                    content = doc_data.encode("utf-8")
            else:
                content = doc_data
            doc_name = _doc_field(doc, "documentName", "output.pdf")
            mime = _doc_field(doc, "mimeType", "application/octet-stream")
            logger.info(
                "file.create persist: calling createFile name=%s bytes=%s",
                doc_name,
                len(content),
            )
            file_item = mgmt.createFile(doc_name, mime, content, folderId=folder_id)
            logger.info("file.create persist: createFile returned id=%s", file_item.id)
            ok = mgmt.createFileData(file_item.id, content)
            logger.info("file.create persist: createFileData returned %s for id=%s", ok, file_item.id)
            meta = _doc_field(doc, "validationMetadata") or {}
            if isinstance(meta, dict):
                meta["fileId"] = file_item.id
                if hasattr(doc, "validationMetadata"):
                    doc.validationMetadata = meta
                elif isinstance(doc, dict):
                    doc["validationMetadata"] = meta
            logger.info("file.create: persisted %s to user files (id=%s)", doc_name, file_item.id)
        except Exception as e:
            # _doc_field cannot raise here, so one bad document can no longer
            # abort persistence of the remaining ones.
            logger.warning(
                "file.create: failed to persist document %s: %s",
                _doc_field(doc, "documentName", "?"),
                e,
            )
|
||
|
||
|
||
def _sanitize_output_stem(title: str) -> str:
    """Turn ``title`` into a filesystem-safe filename stem (at most 120 chars)."""
    cleaned = (title or "").strip()
    if not cleaned:
        cleaned = "Document"
    # Replace unsafe characters, then drop leading/trailing dots/underscores.
    stem = _SAFE_FILENAME.sub("_", cleaned).strip("._")
    if not stem:
        return "Document"
    return stem[:120]
|
||
|
||
|
||
def _get_management_interface(services) -> Optional[Any]:
|
||
mgmt = getattr(services, "interfaceDbComponent", None)
|
||
if mgmt:
|
||
return mgmt
|
||
try:
|
||
import modules.interfaces.interfaceDbManagement as iface
|
||
|
||
user = getattr(services, "user", None)
|
||
if not user:
|
||
return None
|
||
return iface.getInterface(
|
||
user,
|
||
mandateId=getattr(services, "mandateId", None) or "",
|
||
featureInstanceId=getattr(services, "featureInstanceId", None) or "",
|
||
)
|
||
except Exception as e:
|
||
logger.warning("file.create: could not get management interface: %s", e)
|
||
return None
|
||
|
||
|
||
def _load_image_bytes_from_action_doc(doc: dict, services) -> Optional[bytes]:
    """Return raw image bytes for ``doc``: inline data first, then file storage."""
    inline = _coerce_document_data_to_bytes(doc.get("documentData"))
    if inline:
        return inline
    # No inline payload — look for a file id, either top-level or inside
    # validationMetadata.
    file_id = doc.get("fileId")
    if not file_id:
        meta = doc.get("validationMetadata")
        if isinstance(meta, dict):
            file_id = meta.get("fileId")
    if file_id and str(file_id).strip():
        mgmt = _get_management_interface(services)
        if mgmt and hasattr(mgmt, "getFileData"):
            try:
                return mgmt.getFileData(str(file_id))
            except Exception as e:
                logger.warning("file.create: getFileData(%s) failed: %s", file_id, e)
    return None
|
||
|
||
|
||
# Images larger than this threshold (decoded bytes) are resized before embedding
# to avoid multi-minute PDF rendering of high-res raster scans.
_MAX_IMAGE_EMBED_BYTES = 300_000  # 300 KB decoded ≈ ~400 KB base64
# Resize target used by _resize_image_for_document.
_IMAGE_MAX_DIMENSION = 1200  # longest edge in pixels after resize
|
||
|
||
|
||
def _resize_image_for_document(image_bytes: bytes) -> bytes:
    """Resize image to at most ``_IMAGE_MAX_DIMENSION`` px on the longest edge
    and re-encode as JPEG. Falls back to the original bytes on any error."""
    try:
        from PIL import Image as PILImage
        import io as _io

        picture = PILImage.open(_io.BytesIO(image_bytes))

        # Give palette images an explicit alpha channel first so they go
        # through the same flattening step as other transparent images.
        if picture.mode == "P":
            picture = picture.convert("RGBA")

        # JPEG cannot store transparency: composite onto a white background.
        if picture.mode in ("RGBA", "LA"):
            canvas = PILImage.new("RGB", picture.size, (255, 255, 255))
            canvas.paste(picture, mask=picture.split()[-1])
            picture = canvas
        elif picture.mode != "RGB":
            picture = picture.convert("RGB")

        width, height = picture.size
        if max(width, height) > _IMAGE_MAX_DIMENSION:
            # thumbnail() downscales via an intermediate box-filter pass,
            # making it several times faster than resize() on large rasters;
            # BILINEAR is fast and sufficient for document thumbnails.
            picture.thumbnail((_IMAGE_MAX_DIMENSION, _IMAGE_MAX_DIMENSION), PILImage.BILINEAR)

        buffer = _io.BytesIO()
        picture.save(buffer, format="JPEG", quality=85, optimize=True)
        return buffer.getvalue()
    except Exception as e:
        logger.warning("file.create: image resize failed (%s) — using original bytes", e)
        return image_bytes
|
||
|
||
|
||
def _append_images_to_content(structured_content: dict, image_docs: list, services=None) -> dict:
    """Append images from imageDocumentsOnly as native image elements to the structured JSON.

    Each image becomes an ``image`` element with ``base64Data`` in a trailing
    "Bilder" section of the first document. Images larger than
    ``_MAX_IMAGE_EMBED_BYTES`` are automatically resized/compressed so the
    synchronous PDF renderer does not block for minutes on high-res scans.
    The renderers (DOCX / PDF) handle ``content.base64Data`` natively.
    """
    image_elements = []
    for image_doc in image_docs:
        blob = _load_image_bytes_from_action_doc(image_doc, services)
        if not blob:
            # Last resort: lenient base64 decode of the inline payload.
            inline = image_doc.get("documentData") if isinstance(image_doc, dict) else None
            if isinstance(inline, str):
                try:
                    blob = base64.b64decode(inline)
                except Exception:
                    pass
        if not blob:
            continue

        if len(blob) > _MAX_IMAGE_EMBED_BYTES:
            logger.info(
                "file.create: image %s is %d bytes — resizing to max %dpx for embedding",
                (image_doc.get("documentName") if isinstance(image_doc, dict) else "?") or "?",
                len(blob),
                _IMAGE_MAX_DIMENSION,
            )
            blob = _resize_image_for_document(blob)

        alt_text = (image_doc.get("documentName") if isinstance(image_doc, dict) else None) or "image"
        image_elements.append(
            {
                "type": "image",
                "content": {
                    "base64Data": base64.b64encode(blob).decode("ascii"),
                    "alt": alt_text,
                },
            }
        )

    if not image_elements:
        return structured_content

    documents = structured_content.get("documents")
    if isinstance(documents, list) and documents:
        documents[0].setdefault("sections", []).append({"heading": "Bilder", "elements": image_elements})
    return structured_content
|
||
|
||
|
||
def _images_list_to_pdf(image_bytes_list: List[bytes]) -> bytes:
    """One PDF page per image; embedded raster data via PyMuPDF."""
    import fitz

    document = fitz.open()
    try:
        for raster in image_bytes_list:
            new_page = document.new_page()
            # Fill the page while keeping the image's aspect ratio.
            new_page.insert_image(new_page.rect, stream=raster, keep_proportion=True)
        return document.tobytes()
    finally:
        document.close()
|
||
|
||
|
||
def _images_list_to_docx(image_bytes_list: List[bytes]) -> bytes:
    """Images embedded in the document package (inline shapes), not hyperlinks."""
    from docx import Document
    from docx.shared import Inches

    package = Document()
    for raster in image_bytes_list:
        paragraph = package.add_paragraph()
        picture_run = paragraph.add_run()
        picture_run.add_picture(io.BytesIO(raster), width=Inches(6.5))
        # Blank paragraph as spacer between images.
        package.add_paragraph()
    buffer = io.BytesIO()
    package.save(buffer)
    return buffer.getvalue()
|
||
|
||
|
||
async def _create_merged_image_documents(
    self,
    parameters: Dict[str, Any],
    image_docs: List[dict],
) -> ActionResult:
    """Build one PDF or DOCX containing all extracted images (``imageDocumentsOnly``).

    Parameters:
        parameters: action parameters (``outputFormat``, ``title``, ``folderId``).
        image_docs: ActionDocument-like dicts with inline data or a fileId.

    Returns an ActionResult with a single merged document. All expected
    failures are reported via ``ActionResult.isFailure`` instead of raising —
    this coroutine runs *before* the try/except in ``create``.
    """
    output_format = (parameters.get("outputFormat") or "docx").strip().lower().lstrip(".")
    title = (parameters.get("title") or "Document").strip()
    stem = _sanitize_output_stem(title)
    folder_id: Optional[str] = None
    raw_folder = parameters.get("folderId")
    if raw_folder is not None and str(raw_folder).strip():
        folder_id = str(raw_folder).strip()

    if output_format not in ("pdf", "docx"):
        return ActionResult.isFailure(
            error=(
                f"Nur-Bilder-Kontext: „{output_format}“ wird nicht unterstützt. "
                "Bitte Ausgabeformat „pdf“ oder „docx“ wählen."
            )
        )

    # Resolve all image payloads up front; one unreadable image fails the action.
    blobs: List[bytes] = []
    for d in image_docs:
        b = _load_image_bytes_from_action_doc(d, self.services)
        if not b:
            name = d.get("documentName") or "?"
            return ActionResult.isFailure(
                error=f"Bilddaten fehlen oder sind nicht lesbar (Datei: {name})."
            )
        blobs.append(b)

    if output_format == "pdf":
        try:
            combined = _images_list_to_pdf(blobs)
        except Exception as e:
            logger.warning("file.create: PDF merge failed: %s", e, exc_info=True)
            return ActionResult.isFailure(error=f"PDF aus Bildern konnte nicht erzeugt werden: {e}")
        out_name = f"{stem}.pdf"
        mime = "application/pdf"
    else:
        # Mirror the PDF branch: a DOCX rendering error must surface as an
        # ActionResult failure, not as an unhandled exception (previously this
        # branch had no try/except and the exception escaped the action).
        try:
            combined = _images_list_to_docx(blobs)
        except Exception as e:
            logger.warning("file.create: DOCX merge failed: %s", e, exc_info=True)
            return ActionResult.isFailure(error=f"DOCX aus Bildern konnte nicht erzeugt werden: {e}")
        out_name = f"{stem}.docx"
        mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

    if not combined:
        return ActionResult.isFailure(error="Zusammenfügen der Bilder ergab leere Ausgabe")

    doc_b64 = base64.b64encode(combined).decode("ascii")
    action_documents = [
        ActionDocument(
            documentName=out_name,
            documentData=doc_b64,
            mimeType=mime,
            validationMetadata={
                "actionType": "file.create",
                "outputFormat": output_format,
                "source": "mergedImageDocumentsOnly",
            },
        )
    ]
    _persistDocumentsToUserFiles(action_documents, self.services, folder_id=folder_id)
    return ActionResult.isSuccess(documents=action_documents)
|
||
|
||
|
||
async def create(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    Create a file from context (text/markdown from upstream AI node).
    Uses GenerationService.renderReport to produce docx, pdf, txt, md, html, xlsx, etc.

    Parameters (read from ``parameters``):
        context: upstream text, merged dict, or image-document list.
        outputFormat: target extension (default "docx").
        title / templateName / language / folderId / parentOperationId: optional.

    Returns ActionResult with the rendered document(s); any unexpected error
    is caught and reported as a failure result.
    """
    raw_context = parameters.get("context", "")

    # A pure list of image documents bypasses text rendering entirely.
    if isinstance(raw_context, list) and is_image_action_document_list(raw_context):
        return await _create_merged_image_documents(self, parameters, raw_context)

    outputFormat = (parameters.get("outputFormat") or "docx").strip().lower().lstrip(".")
    context = _context_string_for_report(raw_context, outputFormat)

    if not context:
        logger.warning(
            "file.create: context empty after resolve — raw_context type=%s raw_summary=%r "
            "serialized_len=%s (check ActionNodeExecutor \"file.create context resolution\" log for DataRef / upstream).",
            type(raw_context).__name__,
            _raw_context_preview_for_log(raw_context),
            len(context or ""),
        )
        return ActionResult.isFailure(error="context is required (connect an AI node or provide text)")

    title = (parameters.get("title") or "Document").strip()
    templateName = parameters.get("templateName")
    language = normalizePrimaryLanguageTag(
        str(parameters.get("language") or "de"),
        "de",
    )

    folder_id: Optional[str] = None
    raw_folder = parameters.get("folderId")
    if raw_folder is not None and str(raw_folder).strip():
        folder_id = str(raw_folder).strip()

    try:
        if outputFormat != "json":
            context = enhancePlainTextWithMarkdownTables(context)
        structured_content = markdownToDocumentJson(context, title, language)
        if templateName:
            structured_content.setdefault("metadata", {})["templateName"] = templateName

        img_docs = _collect_image_documents_only(raw_context)
        if img_docs:
            # Image decoding and PIL resizing are CPU-bound; run them in a
            # thread pool so the event loop is not blocked while processing
            # high-res raster images (e.g. 3+ MB PNGs from PDF extraction).
            # get_running_loop() is the correct call inside a coroutine;
            # get_event_loop() is deprecated here (Python 3.10+/3.12).
            loop = asyncio.get_running_loop()
            structured_content = await loop.run_in_executor(
                None,
                _append_images_to_content,
                structured_content,
                img_docs,
                self.services,
            )

        generation = getattr(self.services, "generation", None)
        if not generation:
            return ActionResult.isFailure(error="Generation service not available")

        ai_service = getattr(self.services, "ai", None)
        rendered_docs = await generation.renderReport(
            extractedContent=structured_content,
            outputFormat=outputFormat,
            language=language,
            title=title,
            userPrompt=None,
            aiService=ai_service,
            parentOperationId=parameters.get("parentOperationId"),
        )

        if not rendered_docs:
            return ActionResult.isFailure(error="Rendering produced no output")

        action_documents = []
        mime_map = {
            "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "pdf": "application/pdf",
            "txt": "text/plain",
            "md": "text/markdown",
            "html": "text/html",
            "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "csv": "text/csv",
            "json": "application/json",
        }
        for rd in rendered_docs:
            # Rendered documents may use camelCase or snake_case attributes
            # depending on the renderer; probe both.
            doc_data = rd.documentData if hasattr(rd, "documentData") else getattr(rd, "document_data", None)
            doc_name = getattr(rd, "filename", None) or getattr(rd, "documentName", None) or getattr(rd, "document_name", f"output.{outputFormat}")
            mime = getattr(rd, "mimeType", None) or getattr(rd, "mime_type", None) or mime_map.get(outputFormat, "application/octet-stream")

            if isinstance(doc_data, bytes):
                doc_data = base64.b64encode(doc_data).decode("ascii")

            action_documents.append(ActionDocument(
                documentName=doc_name,
                documentData=doc_data,
                mimeType=mime,
                validationMetadata={
                    "actionType": "file.create",
                    "outputFormat": outputFormat,
                    "templateName": templateName,
                },
            ))

        _persistDocumentsToUserFiles(action_documents, self.services, folder_id=folder_id)
        return ActionResult.isSuccess(documents=action_documents)

    except Exception as e:
        # Lazy %-formatting instead of an f-string keeps the log call cheap.
        logger.error("file.create failed: %s", e, exc_info=True)
        return ActionResult.isFailure(error=str(e))
|