# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

"""context.extractContent — extracts content without AI.

Returns a unified handover compatible with AiResult-style downstream wiring:

- ``documents[0]``: structured JSON (``context.extractContent.handover.v1``); image
  ``parts`` keep their metadata but omit pixel data; each stripped image part
  references a ``handoverMediaDocumentName`` matching a sibling blob document.
- ``documents[1:]``: each extracted image as its own binary ``ActionDocument``
  (like ``ai.process`` artefact outputs).
- ``ActionResult.data["response"]`` plus the normalized executor field
  ``response``: concatenated plain text from all text parts — a safe default for
  ``file.create`` / primaryTextRef.
"""

import base64 as _b64
import binascii as _binascii
import logging
import re
import time
from typing import Any, Dict, List, Tuple

from modules.datamodels.datamodelChat import ActionResult, ActionDocument
from modules.datamodels.datamodelDocref import coerceDocumentReferenceList
from modules.datamodels.datamodelExtraction import ContentExtracted, ExtractionOptions

logger = logging.getLogger(__name__)

_UNSAFE_FILE_KEY = re.compile(r"[^\w\-.\(\)\[\]%@+]")

HANDOVER_KIND = "context.extractContent.handover.v1"


def _default_extraction_options() -> ExtractionOptions:
    """No merge — keep all parts for downstream JSON selection."""
    return ExtractionOptions(
        prompt="Extract all content from the document",
        mergeStrategy=None,
        processDocumentsIndividually=True,
        outputFormat="parts",
        outputDetail="full",
    )


def _file_json_key(display_name: str, index: int, key_counts: Dict[str, int]) -> str:
    stem = (display_name or "").strip() or f"document_{index + 1}"
    slug = stem.replace("/", "_").replace("\\", "_").replace(" ", "_")
    slug = _UNSAFE_FILE_KEY.sub("_", slug).strip("_") or f"document_{index + 1}"
    base = f"file_{index + 1}_{slug}"
    n = key_counts.get(base, 0)
    key_counts[base] = n + 1
    return base if n == 0 else f"{base}__{n}"
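
# Key derivation, for illustration (hypothetical values, not executed here):
#   _file_json_key("Q1 Report.pdf", 0, {})  ->  "file_1_Q1_Report.pdf"
# A base that collides within the same handover gets a "__1", "__2", ...
# suffix via key_counts.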


def _serialize_parts(parts: Any) -> List[Dict[str, Any]]:
    out: List[Dict[str, Any]] = []
    for p in parts or []:
        if hasattr(p, "model_dump"):
            out.append(p.model_dump(mode="json"))
        elif isinstance(p, dict):
            out.append(dict(p))
    return out


def _rebuild_by_type_group(parts_ser: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    by_type: Dict[str, List[Dict[str, Any]]] = {}
    for entry in parts_ser:
        if not isinstance(entry, dict):
            continue
        tg = (entry.get("typeGroup") or "").strip() or "_other"
        by_type.setdefault(tg, []).append(entry)
    return by_type
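
# Illustration: parts tagged {"typeGroup": "text"} and {"typeGroup": "image"}
# regroup into {"text": [...], "image": [...]}; entries with a missing or
# blank typeGroup land under "_other".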


def _joined_text_from_handover_payload(payload: Dict[str, Any]) -> str:
    """Concatenate text parts across fileOrder for AiResult-compatible ``response``."""
    files_section = payload.get("files") or {}
    ordered = payload.get("fileOrder")
    keys: List[str] = ordered if isinstance(ordered, list) and ordered else list(files_section.keys())
    chunks: List[str] = []
    for fk in keys:
        bucket = files_section.get(fk)
        if not isinstance(bucket, dict):
            continue
        for p in bucket.get("parts") or []:
            if not isinstance(p, dict):
                continue
            if (p.get("typeGroup") or "").strip() != "text":
                continue
            raw = p.get("data")
            if raw is None:
                continue
            s = str(raw).strip()
            if s:
                chunks.append(s)
    return "\n\n".join(chunks)
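
# Illustration (hypothetical payload): text parts with data "Intro" and
# "Details", in fileOrder, join to "Intro\n\nDetails"; non-text parts and
# whitespace-only strings are skipped.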


def _mime_to_file_extension(mime: str) -> str:
    m = (mime or "").split(";")[0].strip().lower()
    mapping = {
        "image/jpeg": "jpg",
        "image/jpg": "jpg",
        "image/png": "png",
        "image/gif": "gif",
        "image/webp": "webp",
        "image/bmp": "bmp",
        "image/tiff": "tiff",
    }
    return mapping.get(m, m.rsplit("/", 1)[-1] if "/" in m else "bin")
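
# Fallback behaviour, for illustration: "image/png" -> "png" via the mapping;
# "image/svg+xml;charset=utf-8" -> "svg+xml" via the subtype fallback; an
# empty or slash-less mime string -> "bin".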


def _split_images_to_sidecar_documents(
    payload: Dict[str, Any],
    *,
    document_name_stem: str,
) -> Tuple[Dict[str, Any], List[ActionDocument]]:
    """
    Deep-copy handover JSON, clear image pixel data from ``parts``, attach
    ``handoverMediaDocumentName`` on each image part, emit binary ActionDocuments.
    """
    import copy

    bundle = copy.deepcopy(payload)
    files_section = bundle.get("files") or {}
    ordered = bundle.get("fileOrder")
    key_order: List[str] = ordered if isinstance(ordered, list) and ordered else list(files_section.keys())
    media_docs: List[ActionDocument] = []
    kind = bundle.get("kind") or HANDOVER_KIND

    stem = re.sub(r"[^\w\-]+", "_", document_name_stem).strip("_") or "extract"

    for fk in key_order:
        bucket = files_section.get(fk)
        if not isinstance(bucket, dict):
            continue
        parts = bucket.get("parts")
        if not isinstance(parts, list):
            continue
        new_parts: List[Dict[str, Any]] = []
        for p in parts:
            if not isinstance(p, dict):
                new_parts.append(p)
                continue
            pcopy = dict(p)
            tg = (pcopy.get("typeGroup") or "").strip()
            mime = (pcopy.get("mimeType") or "").strip()
            raw_data = pcopy.get("data")
            if tg == "image" and mime.lower().startswith("image/") and raw_data:
                raw_s = raw_data.strip() if isinstance(raw_data, str) else ""
                try:
                    blob = _b64.b64decode(raw_s, validate=True) if raw_s else b""
                except (_binascii.Error, TypeError, ValueError) as e:
                    logger.warning(
                        "extractContent: could not decode image part %s (keep inline): %s",
                        pcopy.get("id"),
                        e,
                    )
                    new_parts.append(pcopy)
                    continue
                if not blob:
                    new_parts.append(pcopy)
                    continue
                part_id = str(pcopy.get("id") or "part")
                # Keep the full part id (UUID): truncating it would let media
                # names collide and break part-to-blob linking. The 200-char
                # cap only guards against pathological ids.
                safe_id = re.sub(r"[^\w\-.]+", "_", part_id).strip("_") or "media"
                if len(safe_id) > 200:
                    safe_id = safe_id[:200]
                ext = _mime_to_file_extension(mime)
                media_name = f"extract_media_{stem}_{safe_id}.{ext}"
                pcopy["data"] = ""
                pcopy["handoverMediaDocumentName"] = media_name
                media_docs.append(
                    ActionDocument(
                        documentName=media_name,
                        documentData=blob,
                        mimeType=mime,
                        validationMetadata={
                            "actionType": "context.extractContent",
                            "handoverRole": "extractedMedia",
                            "sourcePartId": part_id,
                            "handoverSchema": kind,
                            "containerFileKey": fk,
                        },
                    )
                )
                new_parts.append(pcopy)
            else:
                new_parts.append(pcopy)
        bucket["parts"] = new_parts
        bucket["byTypeGroup"] = _rebuild_by_type_group(new_parts)
        files_section[fk] = bucket

    return bundle, media_docs
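
# Illustration (hypothetical part): an image part such as
#   {"id": "ab12", "typeGroup": "image", "mimeType": "image/png", "data": "<b64>"}
# stays in the bundle as
#   {"id": "ab12", ..., "data": "",
#    "handoverMediaDocumentName": "extract_media_<stem>_ab12.png"}
# while its decoded bytes are emitted as a sibling ActionDocument of that name.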


def _one_file_bucket(ec: ContentExtracted, source_file_name: str) -> Dict[str, Any]:
    parts_ser = _serialize_parts(ec.parts)

    ud = getattr(ec, "udm", None)
    if hasattr(ud, "model_dump"):
        ud = ud.model_dump(mode="json")

    summary = getattr(ec, "summary", None)
    if hasattr(summary, "model_dump"):
        summary = summary.model_dump(mode="json")
    elif isinstance(summary, dict):
        summary = dict(summary)
    elif summary is None:
        summary = {}

    return {
        "sourceFileName": source_file_name,
        "extractedId": getattr(ec, "id", ""),
        "summary": summary,
        "udm": ud,
        "parts": parts_ser,
        "byTypeGroup": _rebuild_by_type_group(parts_ser),
    }


def build_extract_content_handover(
    *,
    extracted_results: List[ContentExtracted],
    chat_file_names: List[str],
    operation_ref: str,
) -> Dict[str, Any]:
    key_counts: Dict[str, int] = {}
    files: Dict[str, Any] = {}
    ordered: List[str] = []

    for i, ec in enumerate(extracted_results):
        name = chat_file_names[i] if i < len(chat_file_names) else ""
        fk = _file_json_key(str(name), i, key_counts)
        files[fk] = _one_file_bucket(ec, str(name))
        ordered.append(fk)

    return {
        "schemaVersion": 1,
        "kind": HANDOVER_KIND,
        "operationRef": operation_ref,
        "fileOrder": ordered,
        "files": files,
    }
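
# Shape of the returned handover (illustrative values, not real output):
#   {
#       "schemaVersion": 1,
#       "kind": "context.extractContent.handover.v1",
#       "operationRef": "context_extract_<wf>_<ts>",
#       "fileOrder": ["file_1_report.pdf"],
#       "files": {
#           "file_1_report.pdf": {
#               "sourceFileName": "report.pdf",
#               "extractedId": "...",
#               "summary": {...},
#               "udm": {...},
#               "parts": [...],
#               "byTypeGroup": {"text": [...], "image": [...]},
#           }
#       },
#   }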


async def extractContent(self, parameters: Dict[str, Any]) -> ActionResult:
    operation_id = None
    try:
        wf = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
        operation_id = f"context_extract_{wf}_{int(time.time())}"

        document_list_param = parameters.get("documentList")
        if not document_list_param:
            return ActionResult.isFailure(error="documentList is required")

        dl = coerceDocumentReferenceList(document_list_param)
        if not dl.references:
            return ActionResult.isFailure(
                error=(
                    f"documentList could not be parsed (type={type(document_list_param).__name__}); "
                    "expected DocumentReferenceList, list of strings/dicts, or "
                    "a wrapper dict like {'documents': [...]}"
                ),
            )

        parent_operation_id = parameters.get("parentOperationId")
        self.services.chat.progressLogStart(
            operation_id,
            "Extracting content from documents",
            "Content Extraction",
            f"Documents: {len(dl.references)}",
            parentOperationId=parent_operation_id,
        )

        self.services.chat.progressLogUpdate(operation_id, 0.2, "Loading documents")
        chat_documents = self.services.chat.getChatDocumentsFromDocumentList(dl)
        if not chat_documents:
            self.services.chat.progressLogFinish(operation_id, False)
            return ActionResult.isFailure(error="No documents found in documentList")

        logger.info(f"Extracting JSON handover from {len(chat_documents)} documents")

        self.services.chat.progressLogUpdate(operation_id, 0.3, "Preparing extraction options")

        eo_param = parameters.get("extractionOptions")
        extraction_options: ExtractionOptions
        if isinstance(eo_param, dict) and eo_param:
            eo = dict(eo_param)
            eo.setdefault("prompt", "Extract all content from the document")
            if "mergeStrategy" not in eo:
                eo["mergeStrategy"] = None
            try:
                extraction_options = ExtractionOptions(**eo)
            except Exception as e:
                logger.warning(f"Invalid extractionOptions, using defaults: {e}")
                extraction_options = _default_extraction_options()
        elif isinstance(eo_param, ExtractionOptions):
            extraction_options = eo_param
        else:
            extraction_options = _default_extraction_options()

        self.services.chat.progressLogUpdate(operation_id, 0.4, "Extracting …")
        self.services.chat.progressLogUpdate(operation_id, 0.5, f"Extracting {len(chat_documents)} document(s)")
        extracted_results = self.services.extraction.extractContent(
            chat_documents, extraction_options, operationId=operation_id
        )

        file_names = [getattr(cd, "fileName", "") or "" for cd in chat_documents]

        payload = build_extract_content_handover(
            extracted_results=extracted_results,
            chat_file_names=file_names,
            operation_ref=operation_id,
        )

        self.services.chat.progressLogUpdate(operation_id, 0.9, "Building JSON")

        stem = f"{wf}_{int(time.time())}"
        stripped_payload, media_docs = _split_images_to_sidecar_documents(
            payload,
            document_name_stem=stem,
        )
        joined_text = _joined_text_from_handover_payload(payload)

        json_meta = {
            "actionType": "context.extractContent",
            "documentCountInput": len(chat_documents),
            "documentCountRoots": len(extracted_results),
            "handoverSchema": stripped_payload.get("kind"),
            "handoverRole": "structuredHandover",
            "mediaDocumentCount": len(media_docs),
        }

        json_doc = ActionDocument(
            documentName=f"extracted_content_{stem}.json",
            documentData=stripped_payload,
            mimeType="application/json",
            validationMetadata=json_meta,
        )

        handover_data = {
            "response": joined_text,
            "contentType": "text",
            "handoverKind": stripped_payload.get("kind"),
            "structuredDocumentIndex": 0,
            "mediaDocumentCount": len(media_docs),
        }

        self.services.chat.progressLogFinish(operation_id, True)
        return ActionResult.isSuccess(documents=[json_doc] + media_docs, data=handover_data)

    except Exception as e:
        logger.error(f"Error in content extraction: {str(e)}")
        try:
            if operation_id:
                self.services.chat.progressLogFinish(operation_id, False)
        except Exception:
            pass
        return ActionResult.isFailure(error=str(e))
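
# Minimal usage sketch (hypothetical wiring: "executor" stands for whatever
# object this method is bound to; only extractContent and ActionResult come
# from this module):
#
#   result = await executor.extractContent({"documentList": ["doc-ref-1"]})
#   handover_json = result.documents[0]    # structured handover document
#   media_blobs = result.documents[1:]     # extracted images as binary docs
#   plain_text = result.data["response"]   # concatenated text parts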