# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

"""context.extractContent — extracts content without AI.

Returns a unified handover compatible with AiResult-style downstream wiring:

- ``documents[0]``: structured JSON (`context.extractContent.handover.v1`);
  image ``parts`` keep metadata but omit pixel data; each dropped image
  references ``handoverMediaDocumentName`` matching a sibling blob document.
- ``documents[1:]``: each extracted image as its own binary ``ActionDocument``
  (like ``ai.process`` artefact outputs).
- ``ActionResult.data["response"]`` plus normalized executor field
  ``response``: concatenated plain text from all text parts — safe default
  for ``file.create`` / primaryTextRef.
"""

import base64 as _b64
import binascii as _binascii
import copy
import logging
import re
import time
from typing import Any, Dict, List, Tuple

from modules.datamodels.datamodelChat import ActionResult, ActionDocument
from modules.datamodels.datamodelDocref import coerceDocumentReferenceList
from modules.datamodels.datamodelExtraction import ContentExtracted, ExtractionOptions

logger = logging.getLogger(__name__)

# Any character NOT in this allow-list is replaced with "_" in JSON file keys.
_UNSAFE_FILE_KEY = re.compile(r"[^\w\-.\(\)\[\]%@+]")
HANDOVER_KIND = "context.extractContent.handover.v1"


def _default_extraction_options() -> ExtractionOptions:
    """No merge — keep all parts for downstream JSON selection."""
    return ExtractionOptions(
        prompt="Extract all content from the document",
        mergeStrategy=None,
        processDocumentsIndividually=True,
        outputFormat="parts",
        outputDetail="full",
    )


def _file_json_key(display_name: str, index: int, key_counts: Dict[str, int]) -> str:
    """Return a stable, unique JSON key ``file_<n>_<slug>`` for one source file.

    ``key_counts`` is mutated to count base-key occurrences; a repeated base
    key gets a ``__<n>`` suffix so files with identical names stay distinct.
    """
    stem = (display_name or "").strip() or f"document_{index + 1}"
    slug = stem.replace("/", "_").replace("\\", "_").replace(" ", "_")
    slug = _UNSAFE_FILE_KEY.sub("_", slug).strip("_") or f"document_{index + 1}"
    base = f"file_{index + 1}_{slug}"
    n = key_counts.get(base, 0)
    key_counts[base] = n + 1
    return base if n == 0 else f"{base}__{n}"


def _serialize_parts(parts: Any) -> List[Dict[str, Any]]:
    """Serialize extraction parts to plain dicts (pydantic models via
    ``model_dump``; dicts are shallow-copied; anything else is dropped)."""
    out: List[Dict[str, Any]] = []
    for p in parts or []:
        if hasattr(p, "model_dump"):
            out.append(p.model_dump(mode="json"))
        elif isinstance(p, dict):
            out.append(dict(p))
    return out


def _rebuild_by_type_group(parts_ser: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """Group serialized parts by their ``typeGroup`` (empty/missing → ``_other``)."""
    by_type: Dict[str, List[Dict[str, Any]]] = {}
    for entry in parts_ser:
        if not isinstance(entry, dict):
            continue
        tg = (entry.get("typeGroup") or "").strip() or "_other"
        by_type.setdefault(tg, []).append(entry)
    return by_type


def _joined_text_from_handover_payload(payload: Dict[str, Any]) -> str:
    """Concatenate text parts across fileOrder for AiResult-compatible ``response``."""
    files_section = payload.get("files") or {}
    ordered = payload.get("fileOrder")
    # Respect the explicit file order when present; fall back to dict order.
    keys: List[str] = ordered if isinstance(ordered, list) and ordered else list(files_section.keys())
    chunks: List[str] = []
    for fk in keys:
        bucket = files_section.get(fk)
        if not isinstance(bucket, dict):
            continue
        for p in bucket.get("parts") or []:
            if not isinstance(p, dict):
                continue
            if (p.get("typeGroup") or "").strip() != "text":
                continue
            raw = p.get("data")
            if raw is None:
                continue
            s = str(raw).strip()
            if s:
                chunks.append(s)
    return "\n\n".join(chunks)


def _mime_to_file_extension(mime: str) -> str:
    """Map an image MIME type to a file extension; unknown types fall back to
    the MIME subtype, or ``bin`` when the string has no ``/`` at all."""
    m = (mime or "").split(";")[0].strip().lower()
    mapping = {
        "image/jpeg": "jpg",
        "image/jpg": "jpg",
        "image/png": "png",
        "image/gif": "gif",
        "image/webp": "webp",
        "image/bmp": "bmp",
        "image/tiff": "tiff",
    }
    return mapping.get(m, m.rsplit("/", 1)[-1] if "/" in m else "bin")


def _split_images_to_sidecar_documents(
    payload: Dict[str, Any],
    *,
    document_name_stem: str,
) -> Tuple[Dict[str, Any], List[ActionDocument]]:
    """
    Deep-copy handover JSON, clear image pixel data from ``parts``, attach
    ``handoverMediaDocumentName`` on each image part, emit binary ActionDocuments.

    Image parts whose base64 payload cannot be decoded (or is empty/non-str)
    are kept inline untouched. The input ``payload`` is never mutated.
    """
    bundle = copy.deepcopy(payload)
    files_section = bundle.get("files") or {}
    ordered = bundle.get("fileOrder")
    key_order: List[str] = ordered if isinstance(ordered, list) and ordered else list(files_section.keys())
    media_docs: List[ActionDocument] = []
    kind = bundle.get("kind") or HANDOVER_KIND
    stem = re.sub(r"[^\w\-]+", "_", document_name_stem).strip("_") or "extract"
    for fk in key_order:
        bucket = files_section.get(fk)
        if not isinstance(bucket, dict):
            continue
        parts = bucket.get("parts")
        if not isinstance(parts, list):
            continue
        new_parts: List[Dict[str, Any]] = []
        for p in parts:
            if not isinstance(p, dict):
                new_parts.append(p)
                continue
            pcopy = dict(p)
            tg = (pcopy.get("typeGroup") or "").strip()
            mime = (pcopy.get("mimeType") or "").strip()
            raw_data = pcopy.get("data")
            if tg == "image" and mime.lower().startswith("image/") and raw_data:
                # Only str payloads are decodable base64; anything else yields
                # an empty blob and the part is kept inline as-is.
                raw_s = raw_data.strip() if isinstance(raw_data, str) else ""
                try:
                    blob = _b64.b64decode(raw_s, validate=True) if raw_s else b""
                except (_binascii.Error, TypeError, ValueError) as e:
                    logger.warning(
                        "extractContent: could not decode image part %s (keep inline): %s",
                        pcopy.get("id"),
                        e,
                    )
                    new_parts.append(pcopy)
                    continue
                if not blob:
                    new_parts.append(pcopy)
                    continue
                part_id = str(pcopy.get("id") or "part")
                # Full part id (UUID) — must not truncate or names collide / break linking
                safe_id = re.sub(r"[^\w\-.]+", "_", part_id).strip("_") or "media"
                if len(safe_id) > 200:
                    # Hard safety cap against pathological ids / filesystem limits.
                    safe_id = safe_id[:200]
                ext = _mime_to_file_extension(mime)
                media_name = f"extract_media_{stem}_{safe_id}.{ext}"
                # Drop pixel data from the JSON part; keep a link to the blob doc.
                pcopy["data"] = ""
                pcopy["handoverMediaDocumentName"] = media_name
                media_docs.append(
                    ActionDocument(
                        documentName=media_name,
                        documentData=blob,
                        mimeType=mime,
                        validationMetadata={
                            "actionType": "context.extractContent",
                            "handoverRole": "extractedMedia",
                            "sourcePartId": part_id,
                            "handoverSchema": kind,
                            "containerFileKey": fk,
                        },
                    )
                )
                new_parts.append(pcopy)
            else:
                new_parts.append(pcopy)
        bucket["parts"] = new_parts
        # The grouping index must mirror the (now stripped) parts list.
        bucket["byTypeGroup"] = _rebuild_by_type_group(new_parts)
        files_section[fk] = bucket
    return bundle, media_docs


def _one_file_bucket(ec: ContentExtracted, source_file_name: str) -> Dict[str, Any]:
    """Build the per-file JSON bucket (source name, summary, udm, parts,
    byTypeGroup index) from one ``ContentExtracted`` result."""
    parts_ser = _serialize_parts(ec.parts)
    ud = getattr(ec, "udm", None)
    if hasattr(ud, "model_dump"):
        ud = ud.model_dump(mode="json")
    summary = getattr(ec, "summary", None)
    if hasattr(summary, "model_dump"):
        summary = summary.model_dump(mode="json")
    elif isinstance(summary, dict):
        summary = dict(summary)
    elif summary is None:
        summary = {}
    return {
        "sourceFileName": source_file_name,
        "extractedId": getattr(ec, "id", ""),
        "summary": summary,
        "udm": ud,
        "parts": parts_ser,
        "byTypeGroup": _rebuild_by_type_group(parts_ser),
    }


def build_extract_content_handover(
    *,
    extracted_results: List[ContentExtracted],
    chat_file_names: List[str],
    operation_ref: str,
) -> Dict[str, Any]:
    """Assemble the full handover payload (schema ``HANDOVER_KIND``).

    ``chat_file_names`` is matched positionally to ``extracted_results``;
    missing names become empty strings.
    """
    key_counts: Dict[str, int] = {}
    files: Dict[str, Any] = {}
    ordered: List[str] = []
    for i, ec in enumerate(extracted_results):
        name = chat_file_names[i] if i < len(chat_file_names) else ""
        fk = _file_json_key(str(name), i, key_counts)
        files[fk] = _one_file_bucket(ec, str(name))
        ordered.append(fk)
    return {
        "schemaVersion": 1,
        "kind": HANDOVER_KIND,
        "operationRef": operation_ref,
        "fileOrder": ordered,
        "files": files,
    }


async def extractContent(self, parameters: Dict[str, Any]) -> ActionResult:
    """Action entry point: extract document content into a JSON handover.

    Expects ``parameters["documentList"]`` (required) and optional
    ``extractionOptions`` / ``parentOperationId``. Returns an ``ActionResult``
    whose documents are the structured JSON handover followed by any extracted
    image blobs; failures (including unexpected exceptions) are reported via
    ``ActionResult.isFailure`` rather than raised.
    """
    operation_id = None
    try:
        wf = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
        operation_id = f"context_extract_{wf}_{int(time.time())}"
        document_list_param = parameters.get("documentList")
        if not document_list_param:
            return ActionResult.isFailure(error="documentList is required")
        dl = coerceDocumentReferenceList(document_list_param)
        if not dl.references:
            return ActionResult.isFailure(
                error=(
                    f"documentList could not be parsed (type={type(document_list_param).__name__}); "
                    "expected DocumentReferenceList, list of strings/dicts, or "
                    "a wrapper dict like {'documents': [...]}"
                ),
            )
        parent_operation_id = parameters.get("parentOperationId")
        self.services.chat.progressLogStart(
            operation_id,
            "Extracting content from documents",
            "Content Extraction",
            f"Documents: {len(dl.references)}",
            parentOperationId=parent_operation_id,
        )
        self.services.chat.progressLogUpdate(operation_id, 0.2, "Loading documents")
        chat_documents = self.services.chat.getChatDocumentsFromDocumentList(dl)
        if not chat_documents:
            self.services.chat.progressLogFinish(operation_id, False)
            return ActionResult.isFailure(error="No documents found in documentList")
        # Lazy %-args so formatting cost is skipped when INFO is disabled.
        logger.info("Extracting JSON handover from %s documents", len(chat_documents))
        self.services.chat.progressLogUpdate(operation_id, 0.3, "Preparing extraction options")
        eo_param = parameters.get("extractionOptions")
        extraction_options: ExtractionOptions
        if isinstance(eo_param, dict) and eo_param:
            eo = dict(eo_param)
            eo.setdefault("prompt", "Extract all content from the document")
            if "mergeStrategy" not in eo:
                # Explicit None (no merge) unless the caller set a strategy.
                eo["mergeStrategy"] = None
            try:
                extraction_options = ExtractionOptions(**eo)
            except Exception as e:
                logger.warning("Invalid extractionOptions, using defaults: %s", e)
                extraction_options = _default_extraction_options()
        elif isinstance(eo_param, ExtractionOptions):
            extraction_options = eo_param
        else:
            extraction_options = _default_extraction_options()
        self.services.chat.progressLogUpdate(operation_id, 0.4, "Extracting …")
        self.services.chat.progressLogUpdate(operation_id, 0.5, f"Extracting {len(chat_documents)} document(s)")
        extracted_results = self.services.extraction.extractContent(chat_documents, extraction_options, operationId=operation_id)
        file_names = [getattr(cd, "fileName", "") or "" for cd in chat_documents]
        payload = build_extract_content_handover(
            extracted_results=extracted_results,
            chat_file_names=file_names,
            operation_ref=operation_id,
        )
        self.services.chat.progressLogUpdate(operation_id, 0.9, "Building JSON")
        stem = f"{wf}_{int(time.time())}"
        stripped_payload, media_docs = _split_images_to_sidecar_documents(
            payload,
            document_name_stem=stem,
        )
        # Text parts are identical in payload and stripped_payload; the
        # unstripped payload is used here to avoid depending on the stripping.
        joined_text = _joined_text_from_handover_payload(payload)
        json_meta = {
            "actionType": "context.extractContent",
            "documentCountInput": len(chat_documents),
            "documentCountRoots": len(extracted_results),
            "handoverSchema": stripped_payload.get("kind"),
            "handoverRole": "structuredHandover",
            "mediaDocumentCount": len(media_docs),
        }
        json_doc = ActionDocument(
            documentName=f"extracted_content_{stem}.json",
            documentData=stripped_payload,
            mimeType="application/json",
            validationMetadata=json_meta,
        )
        handover_data = {
            "response": joined_text,
            "contentType": "text",
            "handoverKind": stripped_payload.get("kind"),
            "structuredDocumentIndex": 0,
            "mediaDocumentCount": len(media_docs),
        }
        self.services.chat.progressLogFinish(operation_id, True)
        return ActionResult.isSuccess(documents=[json_doc] + media_docs, data=handover_data)
    except Exception as e:
        # logger.exception captures the traceback; message text unchanged.
        logger.exception("Error in content extraction: %s", e)
        try:
            if operation_id:
                self.services.chat.progressLogFinish(operation_id, False)
        except Exception:
            # Best-effort progress cleanup — never mask the original error.
            pass
        return ActionResult.isFailure(error=str(e))