# gateway/modules/features/neutralization/serviceNeutralization/subContentPartAdapter.py
# (listing metadata: 115 lines, 3.5 KiB, Python)

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Adapter to convert ContentPart list (from extraction) to renderer JSON schema.
Schema: { metadata: {...}, documents: [{ sections: [{ content_type, elements: [...] }] }] }
"""
import csv
import io
from typing import Dict, List, Any
from modules.datamodels.datamodelExtraction import ContentPart
def content_parts_to_renderer_schema(parts: List[ContentPart], title: str = "Neutralized Document") -> Dict[str, Any]:
    """
    Convert ContentPart list to the standardized renderer schema.

    Each part is read either through its ``typeGroup`` / ``data`` / ``label``
    attributes or, when those attributes are missing, as a plain dict (or via
    ``model_dump()``). Parts whose type group is "binary" or "image" and parts
    whose data is empty or whitespace-only are skipped. If nothing remains, a
    single empty paragraph section is emitted.

    Args:
        parts: List of ContentPart from extraction (dict-shaped parts are
            accepted as well); ``None`` is treated as an empty list
        title: Document title for metadata

    Returns:
        Dict with metadata, documents[0].sections structure for renderers
    """
    sections: List[Dict[str, Any]] = []
    for part in parts or []:
        if hasattr(part, "typeGroup") and hasattr(part, "data"):
            type_group = part.typeGroup
            data = part.data
            # Only typeGroup and data are checked above, so read label defensively.
            label = getattr(part, "label", None)
        else:
            part_dict = part if isinstance(part, dict) else part.model_dump()
            type_group = part_dict.get("typeGroup")
            data = part_dict.get("data")
            label = part_dict.get("label")

        # Normalise missing/None values identically for both input shapes, so
        # a key that is present but None behaves like an absent key.
        type_group = type_group or "text"
        data = data or ""
        label = label or ""

        # Skip binary/image parts without text - they can't be neutralized meaningfully
        if type_group in ("binary", "image"):
            continue
        # Skip empty data
        if not str(data).strip():
            continue

        section = _part_to_section(type_group, data, label)
        if section:
            sections.append(section)

    # Ensure at least one section (renderers require it)
    if not sections:
        sections = [{
            "content_type": "paragraph",
            "elements": [{"type": "paragraph", "content": {"text": ""}}]
        }]

    return {
        "metadata": {"title": title},
        "documents": [{
            "sections": sections
        }]
    }
def _part_to_section(type_group: str, data: str, label: str) -> Dict[str, Any]:
"""Convert a single ContentPart to a section dict."""
data_str = str(data).strip()
if type_group == "table" and ("csv" in label.lower() or "," in data_str or "\t" in data_str):
# Parse CSV/TSV into table structure
try:
rows = list(csv.reader(io.StringIO(data_str)))
if rows:
headers = rows[0]
rows_data = rows[1:]
return {
"content_type": "table",
"elements": [{
"type": "table",
"content": {"headers": headers, "rows": rows_data}
}]
}
except Exception:
pass
# Fallback: treat as paragraph
return {
"content_type": "paragraph",
"elements": [{
"type": "extracted_text",
"content": data_str,
"source": label
}]
}
if type_group == "structure":
# PPTX slide content - often markdown-like
return {
"content_type": "paragraph",
"elements": [{
"type": "extracted_text",
"content": data_str,
"source": label
}]
}
# Default: text/paragraph
return {
"content_type": "paragraph",
"elements": [{
"type": "extracted_text",
"content": data_str,
"source": label
}]
}