# Listing metadata: 125 lines, 4.2 KiB, Python
from typing import Any, Dict, List, Optional, Literal
|
|
from pydantic import BaseModel, Field
|
|
|
|
|
|
class ContentPart(BaseModel):
    """One content part: an identified, typed payload with an optional parent link and metadata."""

    # Identity and hierarchy.
    id: str = Field(..., description="Unique content part identifier")
    parentId: Optional[str] = Field(
        None,
        description="Optional parent content part id",
    )

    # Classification of the part.
    label: str = Field(..., description="Human readable label of the part")
    typeGroup: str = Field(
        ...,
        description="Logical type group: text, table, structure, binary, ...",
    )
    mimeType: str = Field(..., description="MIME type of the part payload")

    # Payload and free-form annotations; both fall back to empty values.
    data: str = Field(
        "",
        description="Primary data payload, often extracted text",
    )
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Arbitrary metadata for the part",
    )
|
|
|
|
|
|
class ExtractedContent(BaseModel):
    """Extraction result: an id, the list of extracted content parts, and an optional summary."""

    id: str = Field(..., description="Extraction id or source document id")

    # Starts as an empty list when no parts are supplied.
    parts: List[ContentPart] = Field(
        default_factory=list,
        description="List of extracted parts",
    )

    # Free-form summary mapping; None when no summary was supplied.
    summary: Optional[Dict[str, Any]] = Field(
        None,
        description="Optional extraction summary",
    )
|
|
|
|
|
|
class MergeStrategy(BaseModel):
    """Strategy configuration for merging content parts and AI results.

    The model is a plain bag of settings plus two small helpers:
    ``getTypeSpecificSettings`` and ``shouldPreserveChunk``. Of the
    ``onError`` values, only ``"skip"`` is consulted by the helpers defined
    here; the other values are left for the code that consumes the strategy.
    """

    # Grouping configuration
    groupBy: str = Field(
        default="typeGroup",
        description="Field to group parts by (typeGroup, parentId, label, etc.)",
    )

    # Ordering configuration
    orderBy: str = Field(
        default="id",
        description="Field to order parts within groups (id, order, pageIndex, etc.)",
    )

    # Merge behavior
    mergeType: Literal["concatenate", "hierarchical", "intelligent"] = Field(
        default="concatenate",
        description="How to merge content within groups",
    )

    # Size limits
    maxSize: Optional[int] = Field(
        default=None,
        description="Maximum size for merged content in bytes",
    )

    # Type-specific merge settings
    textMerge: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Text-specific merge settings (separator, formatting, etc.)",
    )

    tableMerge: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Table-specific merge settings (header handling, etc.)",
    )

    structureMerge: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Structure-specific merge settings (hierarchy, etc.)",
    )

    # AI result merging
    aiResultMerge: Optional[Dict[str, Any]] = Field(
        default=None,
        description="AI result merging settings (prompt, context, etc.)",
    )

    # Chunk handling
    preserveChunks: bool = Field(
        default=False,
        description="Whether to preserve individual chunks or merge them",
    )

    chunkSeparator: str = Field(
        default="\n\n---\n\n",
        description="Separator between chunks when merging",
    )

    # Metadata handling
    preserveMetadata: bool = Field(
        default=True,
        description="Whether to preserve metadata from original parts",
    )

    metadataFields: Optional[List[str]] = Field(
        default=None,
        description="Specific metadata fields to preserve (None = all)",
    )

    # Error handling
    onError: Literal["skip", "include", "fail"] = Field(
        default="skip",
        description="How to handle errors during merging",
    )

    # Validation
    validateContent: bool = Field(
        default=True,
        description="Whether to validate content before merging",
    )

    def getTypeSpecificSettings(self, typeGroup: str) -> Dict[str, Any]:
        """Get type-specific merge settings for a content type.

        Args:
            typeGroup: Type group name; ``"text"``, ``"table"`` and
                ``"structure"`` have dedicated settings fields.

        Returns:
            The configured settings dict for the type group (the same object
            stored on the model, not a copy), or a new empty dict when the
            type group is unknown or its settings are unset or empty.
        """
        settingsByType = {
            "text": self.textMerge,
            "table": self.tableMerge,
            "structure": self.structureMerge,
        }
        # `or {}` covers unknown type groups as well as None / empty settings.
        return settingsByType.get(typeGroup) or {}

    def shouldPreserveChunk(self, chunk: Dict[str, Any]) -> bool:
        """Determine if a chunk should be preserved based on strategy.

        Args:
            chunk: Chunk mapping; its optional ``"metadata"`` entry is
                inspected for a truthy ``"error"`` value.

        Returns:
            False when ``preserveChunks`` is disabled, or when ``onError`` is
            ``"skip"`` and the chunk's metadata carries a truthy ``"error"``;
            True otherwise.
        """
        if not self.preserveChunks:
            return False

        if self.onError == "skip":
            # The "metadata" key may exist with a None value; a `.get` default
            # only applies to missing keys, so normalise None to an empty dict
            # before looking up "error".
            metadata = chunk.get("metadata") or {}
            if metadata.get("error"):
                return False

        return True
|
|
|