gateway/modules/workflows/methods/methodContext/actions/extractContent.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""context.extractContent — extracts content without AI.
Returns a unified handover compatible with AiResult-style downstream wiring:
- ``documents[0]``: structured JSON (``context.extractContent.handover.v1``); image ``parts``
keep metadata but omit pixel data; each dropped image references
``handoverMediaDocumentName`` matching a sibling blob document.
- ``documents[1:]``: each extracted image as its own binary ``ActionDocument`` (like
``ai.process`` artefact outputs).
- ``ActionResult.data["response"]`` plus normalized executor field ``response``: concatenated
plain text from all text parts — safe default for ``file.create`` / primaryTextRef."""
import base64 as _b64
import binascii as _binascii
import copy
import logging
import re
import time
from typing import Any, Dict, List, Tuple
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
from modules.datamodels.datamodelDocref import coerceDocumentReferenceList
from modules.datamodels.datamodelExtraction import ContentExtracted, ExtractionOptions
logger = logging.getLogger(__name__)
_UNSAFE_FILE_KEY = re.compile(r"[^\w\-.\(\)\[\]%@+]")
HANDOVER_KIND = "context.extractContent.handover.v1"


def _default_extraction_options() -> ExtractionOptions:
"""No merge — keep all parts for downstream JSON selection."""
return ExtractionOptions(
prompt="Extract all content from the document",
mergeStrategy=None,
processDocumentsIndividually=True,
outputFormat="parts",
outputDetail="full",
)


def _file_json_key(display_name: str, index: int, key_counts: Dict[str, int]) -> str:
stem = (display_name or "").strip() or f"document_{index + 1}"
slug = stem.replace("/", "_").replace("\\", "_").replace(" ", "_")
slug = _UNSAFE_FILE_KEY.sub("_", slug).strip("_") or f"document_{index + 1}"
base = f"file_{index + 1}_{slug}"
n = key_counts.get(base, 0)
key_counts[base] = n + 1
return base if n == 0 else f"{base}__{n}"
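
# A minimal sketch of how _file_json_key behaves (values are illustrative, not
# from a real run); the shared counts dict deduplicates repeated display names:
#   counts: Dict[str, int] = {}
#   _file_json_key("Q3 Report.pdf", 0, counts)  -> "file_1_Q3_Report.pdf"
#   _file_json_key("Q3 Report.pdf", 0, counts)  -> "file_1_Q3_Report.pdf__1"
#   _file_json_key("", 2, counts)               -> "file_3_document_3"
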
def _serialize_parts(parts: Any) -> List[Dict[str, Any]]:
out: List[Dict[str, Any]] = []
for p in parts or []:
if hasattr(p, "model_dump"):
out.append(p.model_dump(mode="json"))
elif isinstance(p, dict):
out.append(dict(p))
return out


def _rebuild_by_type_group(parts_ser: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
by_type: Dict[str, List[Dict[str, Any]]] = {}
for entry in parts_ser:
if not isinstance(entry, dict):
continue
tg = (entry.get("typeGroup") or "").strip() or "_other"
by_type.setdefault(tg, []).append(entry)
return by_type
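
# Sketch of the regrouping above (part dicts abbreviated; shapes assumed from
# _serialize_parts output): entries without a typeGroup land under "_other".
#   _rebuild_by_type_group([
#       {"typeGroup": "text", "data": "hi"},
#       {"typeGroup": "image", "mimeType": "image/png", "data": "..."},
#       {"data": "untyped"},
#   ])
#   -> {"text": [{...}], "image": [{...}], "_other": [{...}]}
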
def _joined_text_from_handover_payload(payload: Dict[str, Any]) -> str:
"""Concatenate text parts across fileOrder for AiResult-compatible ``response``."""
files_section = payload.get("files") or {}
ordered = payload.get("fileOrder")
keys: List[str] = ordered if isinstance(ordered, list) and ordered else list(files_section.keys())
chunks: List[str] = []
for fk in keys:
bucket = files_section.get(fk)
if not isinstance(bucket, dict):
continue
for p in bucket.get("parts") or []:
if not isinstance(p, dict):
continue
if (p.get("typeGroup") or "").strip() != "text":
continue
raw = p.get("data")
if raw is None:
continue
s = str(raw).strip()
if s:
chunks.append(s)
return "\n\n".join(chunks)
def _mime_to_file_extension(mime: str) -> str:
m = (mime or "").split(";")[0].strip().lower()
mapping = {
"image/jpeg": "jpg",
"image/jpg": "jpg",
"image/png": "png",
"image/gif": "gif",
"image/webp": "webp",
"image/bmp": "bmp",
"image/tiff": "tiff",
}
return mapping.get(m, m.rsplit("/", 1)[-1] if "/" in m else "bin")
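
# Spot checks for the mapping above (illustrative):
#   _mime_to_file_extension("image/jpeg")                -> "jpg"
#   _mime_to_file_extension("image/png; charset=binary") -> "png"      (parameters dropped)
#   _mime_to_file_extension("image/svg+xml")             -> "svg+xml"  (generic subtype fallback)
#   _mime_to_file_extension("")                          -> "bin"
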
def _split_images_to_sidecar_documents(
payload: Dict[str, Any],
*,
document_name_stem: str,
) -> Tuple[Dict[str, Any], List[ActionDocument]]:
"""
Deep-copy handover JSON, clear image pixel data from ``parts``, attach
``handoverMediaDocumentName`` on each image part, emit binary ActionDocuments.
"""
    bundle = copy.deepcopy(payload)
files_section = bundle.get("files") or {}
ordered = bundle.get("fileOrder")
key_order: List[str] = ordered if isinstance(ordered, list) and ordered else list(files_section.keys())
media_docs: List[ActionDocument] = []
kind = bundle.get("kind") or HANDOVER_KIND
stem = re.sub(r"[^\w\-]+", "_", document_name_stem).strip("_") or "extract"
for fk in key_order:
bucket = files_section.get(fk)
if not isinstance(bucket, dict):
continue
parts = bucket.get("parts")
if not isinstance(parts, list):
continue
new_parts: List[Dict[str, Any]] = []
for p in parts:
if not isinstance(p, dict):
new_parts.append(p)
continue
pcopy = dict(p)
tg = (pcopy.get("typeGroup") or "").strip()
mime = (pcopy.get("mimeType") or "").strip()
raw_data = pcopy.get("data")
if tg == "image" and mime.lower().startswith("image/") and raw_data:
raw_s = raw_data.strip() if isinstance(raw_data, str) else ""
try:
blob = _b64.b64decode(raw_s, validate=True) if raw_s else b""
except (_binascii.Error, TypeError, ValueError) as e:
logger.warning(
"extractContent: could not decode image part %s (keep inline): %s",
pcopy.get("id"),
e,
)
new_parts.append(pcopy)
continue
if not blob:
new_parts.append(pcopy)
continue
part_id = str(pcopy.get("id") or "part")
                # Keep the full part id (a UUID fits comfortably); truncating to a short
                # prefix could collide names and break part-to-blob linking. The 200-char
                # cap only guards against pathological ids while staying filesystem-safe.
                safe_id = re.sub(r"[^\w\-.]+", "_", part_id).strip("_") or "media"
                if len(safe_id) > 200:
                    safe_id = safe_id[:200]
ext = _mime_to_file_extension(mime)
media_name = f"extract_media_{stem}_{safe_id}.{ext}"
pcopy["data"] = ""
pcopy["handoverMediaDocumentName"] = media_name
media_docs.append(
ActionDocument(
documentName=media_name,
documentData=blob,
mimeType=mime,
validationMetadata={
"actionType": "context.extractContent",
"handoverRole": "extractedMedia",
"sourcePartId": part_id,
"handoverSchema": kind,
"containerFileKey": fk,
},
)
)
new_parts.append(pcopy)
else:
new_parts.append(pcopy)
bucket["parts"] = new_parts
bucket["byTypeGroup"] = _rebuild_by_type_group(new_parts)
files_section[fk] = bucket
return bundle, media_docs
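
# Sketch of the split for one image part (ids and base64 are made up):
#   before: {"id": "9f2c0d1e", "typeGroup": "image", "mimeType": "image/png",
#            "data": "iVBORw0KGgo="}
#   after:  {"id": "9f2c0d1e", "typeGroup": "image", "mimeType": "image/png",
#            "data": "", "handoverMediaDocumentName": "extract_media_<stem>_9f2c0d1e.png"}
# plus one binary ActionDocument named "extract_media_<stem>_9f2c0d1e.png" carrying
# the decoded bytes; non-image parts and undecodable data pass through unchanged.
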
def _one_file_bucket(ec: ContentExtracted, source_file_name: str) -> Dict[str, Any]:
parts_ser = _serialize_parts(ec.parts)
ud = getattr(ec, "udm", None)
if hasattr(ud, "model_dump"):
ud = ud.model_dump(mode="json")
summary = getattr(ec, "summary", None)
if hasattr(summary, "model_dump"):
summary = summary.model_dump(mode="json")
elif isinstance(summary, dict):
summary = dict(summary)
elif summary is None:
summary = {}
return {
"sourceFileName": source_file_name,
"extractedId": getattr(ec, "id", ""),
"summary": summary,
"udm": ud,
"parts": parts_ser,
"byTypeGroup": _rebuild_by_type_group(parts_ser),
}


def build_extract_content_handover(
*,
extracted_results: List[ContentExtracted],
chat_file_names: List[str],
operation_ref: str,
) -> Dict[str, Any]:
key_counts: Dict[str, int] = {}
files: Dict[str, Any] = {}
ordered: List[str] = []
for i, ec in enumerate(extracted_results):
name = chat_file_names[i] if i < len(chat_file_names) else ""
fk = _file_json_key(str(name), i, key_counts)
files[fk] = _one_file_bucket(ec, str(name))
ordered.append(fk)
return {
"schemaVersion": 1,
"kind": HANDOVER_KIND,
"operationRef": operation_ref,
"fileOrder": ordered,
"files": files,
}
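
# Illustrative top-level handover shape (file names and the operationRef are made up):
#   {
#       "schemaVersion": 1,
#       "kind": "context.extractContent.handover.v1",
#       "operationRef": "context_extract_wf1_1700000000",
#       "fileOrder": ["file_1_report.pdf", "file_2_notes.txt"],
#       "files": {"file_1_report.pdf": {...}, "file_2_notes.txt": {...}},
#   }
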
async def extractContent(self, parameters: Dict[str, Any]) -> ActionResult:
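    """Extract content from ``parameters["documentList"]`` without AI.

    Returns an ``ActionResult`` whose ``documents[0]`` is the structured JSON
    handover, ``documents[1:]`` are the extracted image blobs, and
    ``data["response"]`` is the concatenated plain text (see module docstring).
    """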
operation_id = None
try:
wf = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
operation_id = f"context_extract_{wf}_{int(time.time())}"
document_list_param = parameters.get("documentList")
if not document_list_param:
return ActionResult.isFailure(error="documentList is required")
dl = coerceDocumentReferenceList(document_list_param)
if not dl.references:
return ActionResult.isFailure(
error=(
f"documentList could not be parsed (type={type(document_list_param).__name__}); "
"expected DocumentReferenceList, list of strings/dicts, or "
"a wrapper dict like {'documents': [...]}"
),
)
parent_operation_id = parameters.get("parentOperationId")
self.services.chat.progressLogStart(
operation_id,
"Extracting content from documents",
"Content Extraction",
f"Documents: {len(dl.references)}",
parentOperationId=parent_operation_id,
)
self.services.chat.progressLogUpdate(operation_id, 0.2, "Loading documents")
chat_documents = self.services.chat.getChatDocumentsFromDocumentList(dl)
if not chat_documents:
self.services.chat.progressLogFinish(operation_id, False)
return ActionResult.isFailure(error="No documents found in documentList")
logger.info(f"Extracting JSON handover from {len(chat_documents)} documents")
self.services.chat.progressLogUpdate(operation_id, 0.3, "Preparing extraction options")
eo_param = parameters.get("extractionOptions")
extraction_options: ExtractionOptions
if isinstance(eo_param, dict) and eo_param:
eo = dict(eo_param)
eo.setdefault("prompt", "Extract all content from the document")
if "mergeStrategy" not in eo:
eo["mergeStrategy"] = None
try:
extraction_options = ExtractionOptions(**eo)
except Exception as e:
logger.warning(f"Invalid extractionOptions, using defaults: {e}")
extraction_options = _default_extraction_options()
elif isinstance(eo_param, ExtractionOptions):
extraction_options = eo_param
else:
extraction_options = _default_extraction_options()
        self.services.chat.progressLogUpdate(operation_id, 0.4, f"Extracting {len(chat_documents)} document(s)")
        extracted_results = self.services.extraction.extractContent(
            chat_documents, extraction_options, operationId=operation_id
        )
file_names = [getattr(cd, "fileName", "") or "" for cd in chat_documents]
payload = build_extract_content_handover(
extracted_results=extracted_results,
chat_file_names=file_names,
operation_ref=operation_id,
)
        self.services.chat.progressLogUpdate(operation_id, 0.9, "Packaging JSON handover")
stem = f"{wf}_{int(time.time())}"
stripped_payload, media_docs = _split_images_to_sidecar_documents(
payload,
document_name_stem=stem,
)
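        # Join text from the pre-split payload; text parts are identical in
        # stripped_payload, so either copy yields the same response string.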
joined_text = _joined_text_from_handover_payload(payload)
json_meta = {
"actionType": "context.extractContent",
"documentCountInput": len(chat_documents),
"documentCountRoots": len(extracted_results),
"handoverSchema": stripped_payload.get("kind"),
"handoverRole": "structuredHandover",
"mediaDocumentCount": len(media_docs),
}
json_doc = ActionDocument(
documentName=f"extracted_content_{stem}.json",
documentData=stripped_payload,
mimeType="application/json",
validationMetadata=json_meta,
)
handover_data = {
"response": joined_text,
"contentType": "text",
"handoverKind": stripped_payload.get("kind"),
"structuredDocumentIndex": 0,
"mediaDocumentCount": len(media_docs),
}
self.services.chat.progressLogFinish(operation_id, True)
return ActionResult.isSuccess(documents=[json_doc] + media_docs, data=handover_data)
except Exception as e:
logger.error(f"Error in content extraction: {str(e)}")
try:
if operation_id:
self.services.chat.progressLogFinish(operation_id, False)
except Exception:
pass
return ActionResult.isFailure(error=str(e))
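

# Sketch of downstream consumption (hedged: the executor wiring lives outside this
# module, and `ctx` below is a stand-in for whatever object these actions bind to):
#   result = await ctx.extractContent({"documentList": ["chatdoc-1", "chatdoc-2"]})
#   result.documents[0]       # structured JSON handover (structuredDocumentIndex == 0)
#   result.documents[1:]      # one binary ActionDocument per extracted image
#   result.data["response"]   # concatenated plain text, safe for file.create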