# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Adapter to convert ContentPart list (from extraction) to renderer JSON schema.
Schema: { metadata: {...}, documents: [{ sections: [{ content_type, elements: [...] }] }] }
"""
import csv
import io
from typing import Any, Dict, List, Optional, Tuple

from modules.datamodels.datamodelExtraction import ContentPart

# Type groups that carry no text and are therefore dropped from the output.
_SKIPPED_TYPE_GROUPS = ("binary", "image")


def content_parts_to_renderer_schema(
    parts: List[ContentPart],
    title: str = "Neutralized Document",
) -> Dict[str, Any]:
    """
    Convert ContentPart list to the standardized renderer schema.

    Each part may be an object exposing ``typeGroup``/``data`` attributes, a
    plain dict with the same keys, or an object providing ``model_dump()``.

    Args:
        parts: List of ContentPart from extraction
        title: Document title for metadata

    Returns:
        Dict with metadata, documents[0].sections structure for renderers
    """
    sections: List[Dict[str, Any]] = []

    for part in parts:
        type_group, data, label = _read_part_fields(part)

        # Skip binary/image parts without text - they can't be neutralized meaningfully
        if type_group in _SKIPPED_TYPE_GROUPS:
            continue

        # Skip empty data
        if not (data and str(data).strip()):
            continue

        sections.append(_part_to_section(type_group, data, label))

    # Ensure at least one section (renderers require it)
    if not sections:
        sections = [{
            "content_type": "paragraph",
            "elements": [{"type": "paragraph", "content": {"text": ""}}],
        }]

    return {
        "metadata": {"title": title},
        "documents": [{"sections": sections}],
    }


def _read_part_fields(part: Any) -> Tuple[str, Any, str]:
    """Return ``(type_group, data, label)`` for a part, with None normalized away.

    Both access paths apply the same ``or`` defaults so that a part carrying
    ``label=None`` (or ``data=None``) behaves identically whether it arrives
    as an object or as a dict.
    """
    if hasattr(part, "typeGroup") and hasattr(part, "data"):
        type_group = part.typeGroup
        data = part.data
        # Only typeGroup/data were checked above; label may be absent.
        label = getattr(part, "label", None)
    else:
        part_dict = part if isinstance(part, dict) else part.model_dump()
        type_group = part_dict.get("typeGroup")
        data = part_dict.get("data")
        label = part_dict.get("label")

    return type_group or "text", data or "", label or ""


def _part_to_section(type_group: str, data: str, label: str) -> Dict[str, Any]:
    """Convert a single ContentPart to a section dict.

    Args:
        type_group: Type group of the part (e.g. "table", "structure", "text").
        data: Raw content of the part; converted with ``str()`` and stripped.
        label: Source label of the part; ``None`` is treated as "".

    Returns:
        A "table" section when ``type_group`` is "table" and the data parses
        as delimited text; otherwise a "paragraph" section wrapping the text.
    """
    data_str = str(data).strip()
    label = label or ""

    looks_delimited = "csv" in label.lower() or "," in data_str or "\t" in data_str
    if type_group == "table" and looks_delimited:
        table_section = _parse_table_section(data_str, label)
        if table_section is not None:
            return table_section

    # "structure" parts (PPTX slide content - often markdown-like), tables that
    # could not be parsed, and all remaining type groups share the same shape.
    return _text_section(data_str, label)


def _detect_delimiter(text: str, label: str) -> str:
    """Choose between tab and comma as the field delimiter.

    A label mentioning "tsv" forces tab. Otherwise the first line decides:
    tab wins only when it occurs more often than comma, so comma-separated
    data keeps being parsed as before.
    """
    if "tsv" in label.lower():
        return "\t"
    first_line = text.split("\n", 1)[0]
    return "\t" if first_line.count("\t") > first_line.count(",") else ","


def _parse_table_section(text: str, label: str) -> Optional[Dict[str, Any]]:
    """Parse CSV/TSV text into a table section, or return None on failure.

    The first parsed row becomes the headers, the remaining rows the body.
    """
    delimiter = _detect_delimiter(text, label)
    try:
        rows = list(csv.reader(io.StringIO(text), delimiter=delimiter))
    except csv.Error:
        # Malformed delimited text: best effort, caller falls back to paragraph.
        return None

    if not rows:
        return None

    headers, *rows_data = rows
    return {
        "content_type": "table",
        "elements": [{
            "type": "table",
            "content": {"headers": headers, "rows": rows_data},
        }],
    }


def _text_section(text: str, label: str) -> Dict[str, Any]:
    """Build a paragraph section holding one extracted_text element."""
    return {
        "content_type": "paragraph",
        "elements": [{
            "type": "extracted_text",
            "content": text,
            "source": label,
        }],
    }