115 lines
3.5 KiB
Python
115 lines
3.5 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""
|
|
Adapter that converts a list of ContentPart objects (from extraction) into the renderer JSON schema.
Schema: { metadata: {...}, documents: [{ sections: [{ content_type, elements: [...] }] }] }
|
|
"""
|
|
|
|
import csv
|
|
import io
|
|
from typing import Dict, List, Any
|
|
|
|
from modules.datamodels.datamodelExtraction import ContentPart
|
|
|
|
|
|
def content_parts_to_renderer_schema(parts: List[ContentPart], title: str = "Neutralized Document") -> Dict[str, Any]:
    """
    Convert ContentPart list to the standardized renderer schema.

    Each part may be an object exposing ``typeGroup`` / ``data`` (and
    optionally ``label``) attributes, a plain dict with those keys, or any
    object offering ``model_dump()``. Parts of type group "binary" or
    "image" and parts whose data is empty or whitespace-only are skipped.

    Args:
        parts: List of ContentPart from extraction (``None`` is treated as
            an empty list)
        title: Document title for metadata

    Returns:
        Dict with metadata, documents[0].sections structure for renderers.
        When no part yields a section, a single empty paragraph section is
        emitted so that ``sections`` is never empty.
    """
    sections: List[Dict[str, Any]] = []

    for part in parts or []:
        if hasattr(part, "typeGroup") and hasattr(part, "data"):
            type_group = part.typeGroup
            data = part.data
            # "label" is not covered by the hasattr check above, so read it
            # defensively instead of assuming the attribute exists.
            label = getattr(part, "label", None)
        else:
            part_dict = part if isinstance(part, dict) else part.model_dump()
            type_group = part_dict.get("typeGroup", "text")
            data = part_dict.get("data")
            label = part_dict.get("label")

        # Normalise missing values identically for both input shapes; a None
        # label would otherwise break the string handling in the section
        # builder.
        data = data or ""
        label = label or ""

        # Skip binary/image parts without text - they can't be neutralized meaningfully
        if type_group in ("binary", "image"):
            continue

        # Skip empty data
        if not str(data).strip():
            continue

        section = _part_to_section(type_group, data, label)
        if section:
            sections.append(section)

    # Ensure at least one section (renderers require it)
    if not sections:
        sections = [{
            "content_type": "paragraph",
            "elements": [{"type": "paragraph", "content": {"text": ""}}],
        }]

    return {
        "metadata": {"title": title},
        "documents": [{"sections": sections}],
    }
|
|
|
|
|
|
def _part_to_section(type_group: str, data: str, label: str) -> Dict[str, Any]:
|
|
"""Convert a single ContentPart to a section dict."""
|
|
data_str = str(data).strip()
|
|
|
|
if type_group == "table" and ("csv" in label.lower() or "," in data_str or "\t" in data_str):
|
|
# Parse CSV/TSV into table structure
|
|
try:
|
|
rows = list(csv.reader(io.StringIO(data_str)))
|
|
if rows:
|
|
headers = rows[0]
|
|
rows_data = rows[1:]
|
|
return {
|
|
"content_type": "table",
|
|
"elements": [{
|
|
"type": "table",
|
|
"content": {"headers": headers, "rows": rows_data}
|
|
}]
|
|
}
|
|
except Exception:
|
|
pass
|
|
# Fallback: treat as paragraph
|
|
return {
|
|
"content_type": "paragraph",
|
|
"elements": [{
|
|
"type": "extracted_text",
|
|
"content": data_str,
|
|
"source": label
|
|
}]
|
|
}
|
|
|
|
if type_group == "structure":
|
|
# PPTX slide content - often markdown-like
|
|
return {
|
|
"content_type": "paragraph",
|
|
"elements": [{
|
|
"type": "extracted_text",
|
|
"content": data_str,
|
|
"source": label
|
|
}]
|
|
}
|
|
|
|
# Default: text/paragraph
|
|
return {
|
|
"content_type": "paragraph",
|
|
"elements": [{
|
|
"type": "extracted_text",
|
|
"content": data_str,
|
|
"source": label
|
|
}]
|
|
}
|