From 64590aa61e2cd3ad36c50c0c0eda0b6ef31d2d39 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Sun, 4 Jan 2026 20:01:34 +0100
Subject: [PATCH] fixes
---
.../serviceAi/CONTENT_EXTRACTION_ANALYSIS.md | 2564 -----------------
.../serviceAi/PARALLEL_PROCESSING_CONCEPT.md | 376 ---
.../serviceAi/README_MODULE_STRUCTURE.md | 78 -
modules/services/serviceAi/mainServiceAi.py | 12 -
modules/services/serviceAi/merge_1.txt | 529 ++++
.../services/serviceAi/subAiCallLooping.py | 591 ++--
modules/services/serviceAi/subJsonMerger.py | 2049 +++++++++++++
.../serviceAi/subJsonResponseHandling.py | 1522 +++++++++-
.../services/serviceAi/subLoopingUseCases.py | 37 +-
.../services/serviceAi/subStructureFilling.py | 291 +-
.../serviceAi/subStructureGeneration.py | 7 +-
.../services/serviceAi/test_json_merger.py | 594 ++++
.../serviceGeneration/paths/codePath.py | 138 +
modules/shared/jsonUtils.py | 1425 ++++++++-
14 files changed, 6478 insertions(+), 3735 deletions(-)
delete mode 100644 modules/services/serviceAi/CONTENT_EXTRACTION_ANALYSIS.md
delete mode 100644 modules/services/serviceAi/PARALLEL_PROCESSING_CONCEPT.md
delete mode 100644 modules/services/serviceAi/README_MODULE_STRUCTURE.md
create mode 100644 modules/services/serviceAi/merge_1.txt
create mode 100644 modules/services/serviceAi/subJsonMerger.py
create mode 100644 modules/services/serviceAi/test_json_merger.py
diff --git a/modules/services/serviceAi/CONTENT_EXTRACTION_ANALYSIS.md b/modules/services/serviceAi/CONTENT_EXTRACTION_ANALYSIS.md
deleted file mode 100644
index b83d328f..00000000
--- a/modules/services/serviceAi/CONTENT_EXTRACTION_ANALYSIS.md
+++ /dev/null
@@ -1,2564 +0,0 @@
-# Content Extraction Logic Analysis - ai.process Action
-
-## Overview
-This document provides a step-by-step, structured analysis of the content extraction logic in the main AI call (the `ai.process` action). It covers input formats, document processing, AI service communication, and content handling.
-
----
-
-## 1. Input Content Formats
-
-### 1.1 Document Input Formats
-The `ai.process` action accepts documents in the following formats:
-
-#### Supported Document Types (via Extraction Service)
-- **PDF** (`application/pdf`) - Extracted via `PdfExtractor`
-- **Word Documents** (`application/vnd.openxmlformats-officedocument.wordprocessingml.document`) - Extracted via `DocxExtractor`
-- **Excel** (`application/vnd.openxmlformats-officedocument.spreadsheetml.sheet`) - Extracted via `XlsxExtractor`
-- **PowerPoint** (`application/vnd.openxmlformats-officedocument.presentationml.presentation`) - Extracted via `PptxExtractor`
-- **CSV** (`text/csv`) - Extracted via `CsvExtractor`
-- **HTML** (`text/html`) - Extracted via `HtmlExtractor`
-- **XML** (`application/xml`, `text/xml`) - Extracted via `XmlExtractor`
-- **JSON** (`application/json`) - Extracted via `JsonExtractor`
-- **Images** (`image/jpeg`, `image/png`, `image/gif`, `image/webp`) - Extracted via `ImageExtractor`
-- **Text** (`text/plain`) - Extracted via `TextExtractor`
-- **SQL** (`application/sql`) - Extracted via `SqlExtractor`
-- **Binary** (other formats) - Extracted via `BinaryExtractor`
-
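-As a minimal sketch of how this dispatch can work, the mapping below pairs a few of the MIME types above with the extractor names they route to (the dict and function names here are assumptions for illustration, not the actual registry wiring):
-
-```python
-# Illustrative subset of the MIME-type -> extractor mapping listed above.
-EXTRACTOR_BY_MIME_TYPE = {
-    "application/pdf": "PdfExtractor",
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "DocxExtractor",
-    "text/csv": "CsvExtractor",
-    "application/json": "JsonExtractor",
-    "image/png": "ImageExtractor",
-    "text/plain": "TextExtractor",
-}
-
-def selectExtractorName(mimeType: str) -> str:
-    # Any unrecognized format falls back to the binary extractor
-    return EXTRACTOR_BY_MIME_TYPE.get(mimeType, "BinaryExtractor")
-```
-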
-#### Document Reference Formats
-Documents are provided via the `documentList` parameter which accepts:
-- `DocumentReferenceList` object (preferred)
-- List of strings (document references)
-- Single string (single document reference)
-- `None` (no documents)
-
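-Because several input shapes are accepted, the action has to normalize them before use. A minimal normalization sketch (the simplified `DocumentReferenceList` below is a stand-in for the real class, reduced to the one field used here):
-
-```python
-from dataclasses import dataclass, field
-from typing import List, Union
-
-@dataclass
-class DocumentReferenceList:
-    # Simplified stand-in for the real class; only the field used here
-    references: List[str] = field(default_factory=list)
-
-def normalizeDocumentList(
-    value: Union[DocumentReferenceList, List[str], str, None]
-) -> DocumentReferenceList:
-    # Coerce all accepted input shapes into a DocumentReferenceList
-    if value is None:
-        return DocumentReferenceList()
-    if isinstance(value, DocumentReferenceList):
-        return value
-    if isinstance(value, str):
-        return DocumentReferenceList(references=[value])
-    return DocumentReferenceList(references=list(value))
-```
-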
-### 1.2 Content Parts Input Format
-Alternatively, pre-extracted content can be provided via `contentParts` parameter:
-- **Type**: `List[ContentPart]`
-- **ContentPart Structure**:
- ```python
- ContentPart(
- id: str, # Unique identifier
- parentId: Optional[str], # Parent part ID (for hierarchical content)
- label: str, # Human-readable label
- typeGroup: str, # "text", "table", "image", "structure", "container", "binary"
- mimeType: str, # MIME type of the content
- data: Union[str, bytes], # Actual content data
- metadata: Dict[str, Any] # Metadata including:
- # - documentId
- # - documentMimeType
- # - originalFileName
- # - contentFormat ("extracted", "object", "reference")
- # - intent ("extract", "display", "analyze")
- # - usageHint
- # - extractionPrompt
- # - sourceAction
- )
- ```
-
-### 1.3 Prompt Input Format
-- **Type**: `str`
-- **Required**: Yes
-- **Description**: Instruction for the AI describing what processing to perform
-
-### 1.4 Result Type Format
-- **Type**: `str`
-- **Default**: `"txt"`
-- **Supported Formats**: `txt`, `json`, `md`, `csv`, `xml`, `html`, `pdf`, `docx`, `xlsx`, `pptx`, `png`, `jpg`, `jpeg`, `gif`, `webp`
-- **Purpose**: Determines output file extension and generation intent
-
----
-
-## 2. Document Processing Flow
-
-### 2.1 Entry Point: `ai.process` Action
-**Location**: `gateway/modules/workflows/methods/methodAi/actions/process.py`
-
-**Flow**:
-1. **Parameter Extraction** (lines 35-55)
- - Extract `aiPrompt` from parameters
- - Extract `documentList` and convert to `DocumentReferenceList`
- - Extract `resultType` (default: "txt")
- - Extract `contentParts` if already provided
-
-2. **Content Extraction Decision** (lines 72-119)
- - **Path A**: If `contentParts` already provided → Skip extraction, use provided parts
- - **Path B**: If `documentList` provided but no `contentParts` → Extract content from documents
- - **Path C**: If BOTH `contentParts` AND `documentList` provided:
- - **In `ai.process` action** (lines 85-86, 167-174):
- - Condition: `if not contentParts and documentList.references:` (line 86)
- - **Behavior**: Only extracts from `documentList` if `contentParts` is NOT provided
- - **Result**: If both provided, `contentParts` takes precedence
- - **Important**: `documentList` is **NOT passed** to `callAiContent()` (line 167)
- - Only `contentParts` is passed to the AI service
- - **Conclusion**: `documentList` is **ignored** when `contentParts` is provided
- - **Note**: Merging logic exists in document generation path (`DocumentGenerationPath.generateDocument`, lines 109-119), but this only applies when `documentList` is passed separately to `callAiContent()` (not from `ai.process` action)
- - **Note**: Similar merging exists in data extraction path (`_handleDataExtraction`, lines 727-733), but also requires `documentList` to be passed to `callAiContent()`
-
-### 2.2 Content Extraction Process (Path B)
-
-**Location**: `gateway/modules/services/serviceExtraction/mainServiceExtraction.py`
-
-#### Step 1: Document Resolution (lines 86-94 in process.py)
-```python
-chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(documentList)
-```
-- Converts `DocumentReferenceList` to `List[ChatDocument]`
-- Each `ChatDocument` contains:
- - `id`: Document ID
- - `fileId`: File ID for database lookup
- - `fileName`: Original filename
- - `mimeType`: MIME type
-
-#### Step 2: Extraction Options Preparation (lines 96-108 in process.py)
-```python
-extractionOptions = ExtractionOptions(
- prompt="Extract all content from the document",
- mergeStrategy=MergeStrategy(
- mergeType="concatenate",
- groupBy="typeGroup",
- orderBy="id"
- ),
- processDocumentsIndividually=True
-)
-```
-
-#### Step 3: Content Extraction (line 111 in process.py)
-```python
-extractedResults = self.services.extraction.extractContent(chatDocuments, extractionOptions)
-```
-
-**Extraction Service Flow** (`mainServiceExtraction.py:extractContent`):
-
-1. **For each document** (lines 69-288):
- - **Load document bytes** (line 96):
- ```python
- documentBytes = dbInterface.getFileData(doc.fileId)
- ```
-
- - **Run extraction pipeline** (lines 113-120):
- ```python
- ec = runExtraction(
- extractorRegistry=self._extractorRegistry,
- chunkerRegistry=self._chunkerRegistry,
- documentBytes=documentData["bytes"],
- fileName=documentData["fileName"],
- mimeType=documentData["mimeType"],
- options=options
- )
- ```
-
- - **Extraction Process**:
- - **Extractor Selection**: Based on MIME type, select appropriate extractor (PDF, DOCX, XLSX, etc.)
- - **Content Parsing**: Extractor parses document and extracts structured content
- - **Chunking** (if needed): Large content is chunked based on size limits
- - **ContentPart Creation**: Each extracted piece becomes a `ContentPart` with:
- - `typeGroup`: "text", "table", "image", "structure", "container", "binary"
- - `data`: Extracted content (text, table data, base64 image, etc.)
- - `mimeType`: Original MIME type
- - `label`: Descriptive label
-
- - **Metadata Attachment** (lines 132-166):
- ```python
- # Required metadata fields
- p.metadata["documentId"] = documentData["id"]
- p.metadata["documentMimeType"] = documentData["mimeType"]
- p.metadata["originalFileName"] = documentData["fileName"]
- p.metadata["contentFormat"] = "extracted" # Default
- p.metadata["intent"] = "extract" # Default
- p.metadata["extractionPrompt"] = options.prompt
- p.metadata["usageHint"] = f"Use extracted content from {documentData['fileName']}"
- p.metadata["sourceAction"] = "extraction.extractContent"
- ```
-
-2. **Return Results**:
- - Returns `List[ContentExtracted]` (one per input document)
- - Each `ContentExtracted` contains:
- - `id`: Document ID
- - `parts`: `List[ContentPart]` - All extracted content parts
-
-#### Step 4: Combine ContentParts (lines 113-119 in process.py)
-```python
-contentParts = []
-for extracted in extractedResults:
- if extracted.parts:
- contentParts.extend(extracted.parts)
-```
-
-**Result**: Single `List[ContentPart]` containing all extracted content from all documents.
-
----
-
-## 3. What is Sent to the AI Service
-
-### 3.1 AI Service Call
-**Location**: `gateway/modules/workflows/methods/methodAi/actions/process.py` (line 167)
-
-```python
-aiResponse = await self.services.ai.callAiContent(
- prompt=aiPrompt,
- options=options,
- contentParts=contentParts, # Already extracted (or None if no documents)
- outputFormat=output_format,
- parentOperationId=operationId,
- generationIntent=generationIntent # REQUIRED for DATA_GENERATE
-)
-```
-
-### 3.2 Parameters Sent to AI Service
-
-#### 3.2.1 Prompt
-- **Type**: `str`
-- **Content**: User-provided instruction describing what processing to perform
-- **Example**: "Extract all content from the document"
-
-#### 3.2.2 Options (`AiCallOptions`)
-```python
-options = AiCallOptions(
- resultFormat=output_format, # e.g., "txt", "json", "docx"
- operationType=OperationTypeEnum.DATA_GENERATE # or IMAGE_GENERATE
-)
-```
-
-**Operation Types**:
-- `DATA_GENERATE`: Generate structured content (documents, code)
-- `IMAGE_GENERATE`: Generate images
-- `DATA_EXTRACT`: Extract and process content
-- `DATA_ANALYSE`: Analyze content
-- `IMAGE_ANALYSE`: Analyze images
-
-#### 3.2.3 ContentParts (`List[ContentPart]`)
-**Structure per ContentPart**:
-```python
-ContentPart(
- id="part_123",
- parentId=None,
- label="Chapter 1 Text",
- typeGroup="text", # or "table", "image", "structure", "container", "binary"
- mimeType="text/plain",
- data="Actual content text here...", # or base64 for images
- metadata={
- "documentId": "doc_456",
- "documentMimeType": "application/pdf",
- "originalFileName": "document.pdf",
- "contentFormat": "extracted",
- "intent": "extract",
- "usageHint": "Use extracted content from document.pdf",
- "extractionPrompt": "Extract all content from the document",
- "sourceAction": "extraction.extractContent"
- }
-)
-```
-
-#### 3.2.4 Output Format
-- **Type**: `str`
-- **Examples**: `"txt"`, `"json"`, `"docx"`, `"pdf"`, `"xlsx"`, `"png"`
-
-#### 3.2.5 Generation Intent
-- **Type**: `str`
-- **Values**: `"document"`, `"code"`, `"image"`
-- **Default Logic** (lines 142-160 in process.py):
- - Document formats (xlsx, docx, pdf, txt, md, html, csv, xml, pptx) → `"document"`
- - Code formats (py, js, ts, java, cpp, c, go, rs, rb, php, swift, kt) → `"code"`
- - Image formats (png, jpg, jpeg, gif, webp) → `"image"` (handled separately)
-
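-A sketch of that default mapping (the format sets are copied from the lists above; the function name and the fallback to `"document"` for unknown formats are assumptions of this sketch):
-
-```python
-DOCUMENT_FORMATS = {"xlsx", "docx", "pdf", "txt", "md", "html", "csv", "xml", "pptx"}
-CODE_FORMATS = {"py", "js", "ts", "java", "cpp", "c", "go", "rs", "rb", "php", "swift", "kt"}
-IMAGE_FORMATS = {"png", "jpg", "jpeg", "gif", "webp"}
-
-def defaultGenerationIntent(outputFormat: str) -> str:
-    # Mirrors the default logic described above (process.py lines 142-160)
-    fmt = outputFormat.lower()
-    if fmt in IMAGE_FORMATS:
-        return "image"  # handled separately via IMAGE_GENERATE
-    if fmt in CODE_FORMATS:
-        return "code"
-    if fmt in DOCUMENT_FORMATS:
-        return "document"
-    return "document"  # fallback for unknown formats (assumption in this sketch)
-```
-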
----
-
-## 4. What the AI Service Does with Documents and Contents
-
-### 4.1 AI Service Entry Point
-**Location**: `gateway/modules/services/serviceAi/mainServiceAi.py:callAiContent` (line 540)
-
-### 4.2 Operation Type Routing
-
-#### 4.2.1 IMAGE_GENERATE (lines 599-601)
-- Routes to `_handleImageGeneration()`
-- Generates images from prompt (no document processing)
-
-#### 4.2.2 DATA_GENERATE (lines 607-640)
-- **Requires**: `generationIntent` parameter
-- **Routes based on intent**:
- - `generationIntent == "code"` → `_handleCodeGeneration()`
- - `generationIntent == "document"` → `_handleDocumentGeneration()`
-
-#### 4.2.3 DATA_EXTRACT (lines 643-653)
-- Routes to `_handleDataExtraction()`
-- Extracts content from documents, then processes with AI
-
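-Conceptually the routing is a flat dispatch on the operation type. A sketch of that dispatch (the handler names follow the methods referenced above and are returned here as strings rather than called):
-
-```python
-def routeByOperationType(operationType, generationIntent=None):
-    # Illustrative dispatch mirroring the callAiContent() routing above
-    if operationType == "IMAGE_GENERATE":
-        return "_handleImageGeneration"
-    if operationType == "DATA_GENERATE":
-        if not generationIntent:
-            raise ValueError("DATA_GENERATE requires a generationIntent")
-        return ("_handleCodeGeneration" if generationIntent == "code"
-                else "_handleDocumentGeneration")
-    if operationType == "DATA_EXTRACT":
-        return "_handleDataExtraction"
-    raise ValueError(f"unsupported operation type: {operationType}")
-```
-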
-### 4.3 Document Generation Flow (`_handleDocumentGeneration`)
-
-**Location**: `mainServiceAi.py:_handleDocumentGeneration` (referenced at line 631)
-
-**CRITICAL**: When called from `ai.process` action:
-- **Only `contentParts` is passed** to `callAiContent()` (line 167 in `process.py`)
-- **`documentList` is NOT passed** (it's `None`)
-- Therefore, **extraction does NOT happen again** in the document generation path
-- The `contentParts` already extracted in `ai.process` are used directly
-- **Steps 1-2 below are SKIPPED** for `ai.process` flow (no `documentList` to process)
-
-**Note**: `DocumentGenerationPath.generateDocument()` can also be called directly from other code paths with `documentList`, so it handles both cases. The following steps describe the general flow when `documentList` IS provided (not from `ai.process`).
-
-#### Step 1: Document Intent Clarification
-- **Condition**: `if documentList:` AND `documentIntents` not provided
-- If documents exist:
- - Calls `clarifyDocumentIntents()` to analyze document purposes
- - Determines how each document should be used (extract, display, analyze)
-- **For `ai.process` flow**: This step is **skipped** (no `documentList` passed)
-
-#### Step 2: Content Extraction and Preparation
-- **Condition**: `if documents:` (i.e., if `documentList` was provided and converted to documents)
-- If documents exist:
- - Calls `extractAndPrepareContent()`:
- - **RAW Extraction (NO AI)**: Uses `extractContent()` service for pure document parsing
- - **What it does**: Parses PDF, DOCX, XLSX, etc. to extract structured content
- - **What it creates**: ContentParts with raw extracted data
-      - **AI involved**: NONE - this is pure parsing, no AI calls
- - **Prompt Used**: `intent.extractionPrompt` or default `"Extract all content from the document"`
- - **Important**: This prompt is stored in metadata but NOT used for AI extraction here
- - It's only used later during section generation (Step 4) for Vision AI extraction
- - **Purpose**: Just metadata storage, not actual AI prompt execution
- - **ContentPart Preparation**:
- - **For Images**:
- - Creates image ContentPart with base64 image data
- - Marks with `needsVisionExtraction: True`
- - Stores `extractionPrompt` in metadata for later use
- - **Reason**: Vision AI extraction is expensive, so it's deferred to section generation
- - **No AI extraction happens here** - image is just parsed and stored
- - **For Text**:
- - Creates text ContentPart with extracted text (from PDF text layer, DOCX text, etc.)
- - Marks with `skipExtraction: True` (already extracted from parsing, no AI needed)
- - **No AI extraction happens here** - text is already extracted from document parsing
- - **For Objects**: Creates object ContentParts for rendering (images, videos, etc.)
- - Then merges with provided `contentParts` (if any)
-- **For `ai.process` flow**: This step is **skipped** (no `documentList` passed, `contentParts` already extracted)
-- **Why Extract (Parse) Before Structure Generation?**
- - **ContentParts are needed BEFORE structure generation** so AI can assign them to chapters
- - Structure generation needs to know:
- - What documents exist (documentId)
- - What content types are available (typeGroup: text, image, table, etc.)
- - What content formats exist (contentFormat: extracted, object, reference)
- - **Structure generation doesn't need AI-extracted text from images** - it just needs to know images exist
- - Vision AI extraction (converting images to text) is deferred to section generation (Step 4) for efficiency
- - **Key Point**: Only RAW parsing happens here - NO AI calls, NO Vision AI, NO text extraction from images
-
-#### Step 3: Structure Generation (for document formats)
-- Calls `structureGenerator.generateStructure()`:
- - Generates document structure (chapters, sections)
- - Creates JSON structure with:
- - `metadata`: Title, language
- - `documents`: Array of document structures
- - `chapters`: Array of chapter structures with:
- - `id`, `level`, `title`
- - `contentParts`: Assignment of ContentParts to chapters
- - `generationHint`: Description of chapter content
-
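-An illustrative instance of that structure (all values are made up, and the exact nesting of `chapters` under `documents` is an assumption for this sketch; the real schema may differ):
-
-```python
-# Made-up example of a generated structure; field names follow the list above
-structure = {
-    "metadata": {"title": "Quarterly Report", "language": "en"},
-    "documents": [
-        {
-            "chapters": [
-                {
-                    "id": "ch_1",
-                    "level": 1,
-                    "title": "Introduction",
-                    "contentParts": ["part_123"],
-                    "generationHint": "Summarize scope and data sources",
-                },
-            ],
-        },
-    ],
-}
-```
-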
-#### Step 4: Structure Filling
-- Calls `structureFiller.fillStructure()`:
- - For each chapter:
- - Extracts relevant ContentParts assigned to chapter
- - **Vision AI Extraction (if needed)**:
- - Checks for ContentParts with `needsVisionExtraction == True` (images)
- - Calls Vision AI with `extractionPrompt` from metadata (line 651 in `subStructureFilling.py`)
- - Converts image ContentPart to text ContentPart with extracted text
- - **Prompt Used**: `part.metadata.get("extractionPrompt")` or default `"Extract all text content from this image..."`
- - **Section Generation**:
- - Generates section content using AI with processed ContentParts
- - Processes ContentParts with model-aware chunking if needed
- - Merges results intelligently
-- **Two-Phase Extraction Explained**:
- - **Phase 1 (Step 2)**: RAW extraction (parsing) - creates ContentParts for structure generation
- - **Phase 2 (Step 4)**: Vision AI extraction (for images only) - happens during section generation
- - **Why Two Phases?**
- - Structure generation needs ContentParts early (to assign to chapters)
- - Vision AI extraction is expensive and only needed when generating content
- - Text content doesn't need AI extraction (already extracted in Phase 1)
-
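-In code, the phase split reduces to a metadata flag check during structure filling. A minimal sketch of the phase-2 gate, using plain dicts in place of `ContentPart` objects:
-
-```python
-def partsNeedingVision(parts):
-    # Phase 2 only touches parts that phase 1 flagged for Vision AI;
-    # text parts carry skipExtraction=True and pass through unchanged
-    return [p for p in parts if p.get("metadata", {}).get("needsVisionExtraction")]
-
-parts = [
-    {"typeGroup": "image",
-     "metadata": {"needsVisionExtraction": True,
-                  "extractionPrompt": "Extract all text content from this image"}},
-    {"typeGroup": "text", "metadata": {"skipExtraction": True}},
-]
-assert len(partsNeedingVision(parts)) == 1  # only the image needs Vision AI
-```
-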
-#### Step 5: Document Rendering
-- Converts filled structure to final document format (PDF, DOCX, XLSX, etc.)
-- Returns `AiResponse` with rendered documents
-
-### 4.4 Content Parts Processing (`processContentPartsWithAi`)
-
-**Location**: `gateway/modules/services/serviceExtraction/mainServiceExtraction.py:processContentPartsWithAi` (line 1499)
-
-#### Step 1: Model Selection
-```python
-availableModels = modelRegistry.getAvailableModels()
-failoverModelList = modelSelector.getFailoverModelList(prompt, "", options, availableModels)
-```
-- Selects appropriate AI models based on:
- - Operation type
- - Content type (text, images, etc.)
- - Model capabilities
-
-#### Step 2: Parallel Processing
-- Processes all ContentParts in parallel (max 5 concurrent by default)
-- For each ContentPart:
- - Calls `processContentPartWithFallback()`
-
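-A bounded-concurrency pattern like the following is one way to cap parallelism at 5 (a sketch; `processOne` stands in for the actual per-part call, and the real implementation may differ):
-
-```python
-import asyncio
-
-async def processAllParts(contentParts, processOne, maxConcurrent=5):
-    # Cap concurrent AI calls with a semaphore (default max 5, as above)
-    semaphore = asyncio.Semaphore(maxConcurrent)
-
-    async def guarded(part):
-        async with semaphore:
-            return await processOne(part)
-
-    return await asyncio.gather(*(guarded(p) for p in contentParts))
-```
-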
-#### Step 3: ContentPart Processing (`processContentPartWithFallback`)
-
-**Location**: `mainServiceExtraction.py:processContentPartWithFallback` (line 1232)
-
-**Flow**:
-
-1. **Size Check** (lines 1328-1379):
- ```python
- # Calculate if content fits in model context
- partSize = len(contentPart.data.encode('utf-8'))
- modelContextTokens = model.contextLength
- availableContentTokens = int((modelContextTokens - totalReservedTokens) * 0.8)
- ```
-
-2. **Chunking Decision**:
- - If content exceeds model limits → **Chunk content**
- - If content fits → **Process directly**
-
-3. **Chunking Process** (`chunkContentPartForAi`, line 1146):
- - Calculates model-specific chunk sizes:
- ```python
- # Reserve tokens for:
- # - Prompt
- # - System message wrapper
- # - Max output tokens
- # - Message overhead
- availableContentTokens = int((modelContextTokens - totalReservedTokens) * 0.60)
- ```
- - Uses appropriate chunker based on `typeGroup`:
- - `TextChunker` for text
- - `StructureChunker` for JSON/structured content
- - `TableChunker` for tables
- - `ImageChunker` for images
-
-4. **AI Call**:
- - **For chunks**: Process each chunk separately, then merge results
- - **For single part**: Call AI directly
- - **For images**: Special handling with vision models (base64 encoding)
-
-5. **Model Fallback**:
- - If model fails → Try next model in failover list
- - Continues until success or all models exhausted
-
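-The model fallback in the last step amounts to iterating the failover list until one model succeeds. A minimal sketch, with `callModel` as a placeholder for the actual per-model AI call:
-
-```python
-async def callWithFallback(failoverModels, callModel, prompt, data):
-    # Try each model in failover order; the first success wins
-    lastError = None
-    for model in failoverModels:
-        try:
-            return await callModel(model, prompt, data)
-        except Exception as exc:
-            lastError = exc  # remember the failure, fall through to next model
-    raise RuntimeError("all models in the failover list failed") from lastError
-```
-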
-#### Step 4: Result Merging (`mergePartResults`)
-
-**Location**: `mainServiceExtraction.py:mergePartResults` (line 615)
-
-**Merging Strategies**:
-
-1. **Elements Response Format** (detected at line 657):
- - Merges JSON responses with `"elements"` array
- - Specifically merges tables by headers
- - Combines rows from tables with same headers
-
-2. **JSON Extraction Response Format** (detected at line 669):
- - Merges `{"extracted_content": {...}}` structures
- - Combines:
- - Text blocks
- - Tables (by headers)
- - Headings
- - Lists
- - Images
-
-3. **Regular Merging** (line 680):
- - Uses `MergeStrategy`:
- - `groupBy`: "typeGroup" or "documentId"
- - `orderBy`: "id" or "originalIndex"
- - `mergeType`: "concatenate"
- - Applies intelligent token-aware merging if enabled
- - Preserves ContentPart metadata
-
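-"Merging tables by headers" means concatenating the row sets of tables whose header tuples match. A minimal sketch of that idea, assuming a `{"headers": [...], "rows": [...]}` table shape:
-
-```python
-def mergeTablesByHeaders(tables):
-    # Tables sharing the same header tuple get their rows concatenated;
-    # first-seen order of header sets is preserved
-    merged = {}
-    for table in tables:
-        key = tuple(table["headers"])
-        if key in merged:
-            merged[key]["rows"].extend(table["rows"])
-        else:
-            merged[key] = {"headers": list(table["headers"]),
-                           "rows": list(table["rows"])}
-    return list(merged.values())
-```
-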
-#### Step 5: Return Merged Content
-- Returns single `AiCallResponse` with:
- - `content`: Merged content string
- - `modelName`: "multiple" (if multiple models used)
- - `priceUsd`: Sum of all model costs
- - `processingTime`: Sum of all processing times
- - `bytesSent`: Sum of all bytes sent
- - `bytesReceived`: Sum of all bytes received
-
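-Aggregating the per-part responses is a straightforward fold over the numeric fields. A sketch using dicts in place of `AiCallResponse` objects (the `"\n\n"` content separator is an assumption):
-
-```python
-def aggregateResponses(responses):
-    # Sum cost/latency/byte counters across all per-part responses;
-    # modelName collapses to "multiple" when more than one model was used
-    models = {r["modelName"] for r in responses}
-    return {
-        "content": "\n\n".join(r["content"] for r in responses),
-        "modelName": models.pop() if len(models) == 1 else "multiple",
-        "priceUsd": sum(r["priceUsd"] for r in responses),
-        "processingTime": sum(r["processingTime"] for r in responses),
-        "bytesSent": sum(r["bytesSent"] for r in responses),
-        "bytesReceived": sum(r["bytesReceived"] for r in responses),
-    }
-```
-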
----
-
-## 5. Summary Flow Diagram
-
-```
-ai.process Action
- │
- ├─→ Extract Parameters (aiPrompt, documentList, resultType)
- │
- ├─→ Check contentParts
- │ ├─→ If provided → Use directly
- │ └─→ If not provided → Extract from documents
- │ │
- │ ├─→ Convert documentList → ChatDocuments
- │ │
- │ ├─→ For each document:
- │ │ ├─→ Load document bytes from database
- │ │ ├─→ Select extractor (PDF, DOCX, XLSX, etc.)
- │ │ ├─→ Extract content → ContentParts
- │ │ ├─→ Chunk if needed (size-based)
- │ │ └─→ Attach metadata
- │ │
- │ └─→ Combine all ContentParts
- │
- ├─→ Determine operationType (DATA_GENERATE, IMAGE_GENERATE, etc.)
- │
- ├─→ Determine generationIntent (document, code, image)
- │
- └─→ Call AI Service (callAiContent)
- │
- ├─→ Route by operationType
- │ │
- │ ├─→ DATA_GENERATE + document → Document Generation
- │ │ ├─→ Clarify document intents
- │ │ ├─→ Extract/prepare content
- │ │ ├─→ Generate structure (chapters, sections)
- │ │ ├─→ Fill structure (generate content per section)
- │ │ └─→ Render document (PDF, DOCX, etc.)
- │ │
- │ ├─→ DATA_GENERATE + code → Code Generation
- │ │ └─→ Generate code directly
- │ │
- │ └─→ DATA_EXTRACT → Data Extraction
- │ ├─→ Extract content from documents
- │ └─→ Process with AI (simple text processing)
- │
- └─→ Process ContentParts (if provided)
- │
- ├─→ For each ContentPart:
- │ ├─→ Check size vs model limits
- │ ├─→ If too large → Chunk (model-aware)
- │ ├─→ Call AI with chunk/part
- │ ├─→ Handle model fallback if needed
- │ └─→ Collect results
- │
- └─→ Merge results
- ├─→ Detect response format (elements, extraction, regular)
- ├─→ Apply merging strategy
- └─→ Return merged content
-```
-
----
-
-## 6. Key Data Structures
-
-### 6.1 ContentPart
-```python
-ContentPart(
- id: str, # Unique identifier
- parentId: Optional[str], # Parent part ID
- label: str, # Human-readable label
- typeGroup: str, # "text", "table", "image", "structure", "container", "binary"
- mimeType: str, # MIME type
- data: Union[str, bytes], # Content data
- metadata: Dict[str, Any] # Metadata dictionary
-)
-```
-
-### 6.2 ContentExtracted
-```python
-ContentExtracted(
- id: str, # Document ID
- parts: List[ContentPart] # Extracted content parts
-)
-```
-
-### 6.3 AiCallOptions
-```python
-AiCallOptions(
- resultFormat: str, # Output format ("txt", "json", "docx", etc.)
- operationType: OperationTypeEnum, # Operation type
- priority: PriorityEnum, # Quality vs speed
- processingMode: ProcessingModeEnum, # Detailed vs fast
- compressPrompt: bool, # Compress prompt
- compressContext: bool # Compress context
-)
-```
-
-### 6.4 AiCallResponse
-```python
-AiCallResponse(
- content: str, # Generated/processed content
- modelName: str, # Model used
- priceUsd: float, # Cost in USD
- processingTime: float, # Processing time in seconds
- bytesSent: int, # Bytes sent to model
- bytesReceived: int, # Bytes received from model
- errorCount: int # Number of errors
-)
-```
-
----
-
-## 7. Important Notes
-
-### 7.1 Content Extraction Separation
-- **Extraction** (no AI): Pure document parsing and content extraction
-- **AI Processing**: Content analysis, generation, transformation
-
-### 7.2 Model-Aware Chunking
-- Chunking considers:
- - Model context length
- - Model max output tokens
- - Prompt size
- - System message overhead
- - Conservative safety margins (60% of available tokens)
-
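-Putting those inputs together, the available budget works out roughly as below (a sketch; the 60% safety factor matches the chunking path quoted in section 4.4, and the function name is illustrative):
-
-```python
-def availableContentTokens(modelContextTokens, promptTokens,
-                           systemOverheadTokens, maxOutputTokens,
-                           safetyFactor=0.60):
-    # Reserve prompt, system wrapper, and output budget first,
-    # then keep only a conservative fraction of what remains
-    reserved = promptTokens + systemOverheadTokens + maxOutputTokens
-    return max(0, int((modelContextTokens - reserved) * safetyFactor))
-
-# Example: 128k-context model, 2k prompt, 1k overhead, 8k output budget
-print(availableContentTokens(128_000, 2_000, 1_000, 8_000))  # -> 70200
-```
-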
-### 7.3 Parallel Processing
-- ContentParts are processed in parallel (max 5 concurrent)
-- Improves performance for multiple documents/parts
-
-### 7.4 Intelligent Merging
-- Merges content intelligently:
- - Tables by headers
- - Text blocks with separators
- - Preserves document structure
- - Token-aware optimization
-
-### 7.5 Metadata Preservation
-- ContentPart metadata is preserved throughout the pipeline
-- Includes document source, extraction prompt, usage hints
-- Enables traceability and proper content assignment
-
----
-
-## 8. Debug Files Generated
-
-During processing, the following debug files may be generated:
-
-1. **Extraction Results**: `extraction_result_{filename}.txt`
- - Contains extraction summary per document
- - Includes part metadata and data previews
-
-2. **Text Parts**: `extraction_text_part_{N}_{filename}.txt`
- - Contains full extracted text for each text part
-
-3. **Per-Part Extracted Data**: `content_extraction_per_part.txt`
- - Contains per-part extracted content summary
-
-4. **Original Parts Extracted Data**: `content_extraction_original_parts.txt`
- - Contains original parts with extracted content
-
-5. **Generation Prompts/Responses**: `generation_contentPart_{id}_{label}_{prompt|response}.txt`
- - Contains prompts and responses for generation phase
-
-6. **Structure Generation**: `chapter_structure_generation_{prompt|response}.txt`
- - Contains structure generation prompts and responses
-
----
-
-## 9. Recommendations and Next Steps
-
-This section documents architectural findings, recommendations, and planned improvements. Topics will be added step by step as the analysis progresses.
-
-### 9.1 Architectural Inconsistency: contentParts + documentList Merging Behavior
-
-#### Problem Statement
-
-The `ai.process` action exhibits **inconsistent behavior** when both `contentParts` and `documentList` parameters are provided:
-
-**Current Behavior Across Code Paths:**
-
-1. **`ai.process` Action** (`process.py` lines 85-86):
- - **Logic**: `if not contentParts and documentList.references:`
- - **Behavior**: If both provided → Only `contentParts` used, `documentList` ignored
- - **Issue**: `documentList` is not passed to `callAiContent()`, so it's completely ignored
-
-2. **Document Generation Path** (`documentPath.py` lines 109-119):
- - **Logic**: Extracts from `documentList`, then merges with `contentParts`
- - **Behavior**: If both provided → **MERGES** both
- - **Code**: `preparedContentParts.extend(contentParts)`
-
-3. **Data Extraction Path** (`mainServiceAi.py` lines 727-733):
- - **Logic**: Extracts from `documentList`, then merges with `contentParts`
- - **Behavior**: If both provided → **MERGES** both
- - **Code**: `preparedContentParts.extend(contentParts)`
-
-#### Analysis
-
-**Arguments FOR Current Behavior (Skip documentList):**
-- Performance: Avoids redundant extraction if contentParts already provided
-- Explicit Intent: If user provides contentParts, they may want only those
-- Pre-extracted Content: contentParts might be pre-processed/filtered content
-- Simplicity: Simpler logic, fewer edge cases
-
-**Arguments AGAINST Current Behavior (Should Merge):**
-- **Inconsistency**: Other paths merge, creating confusion
-- **User Intent**: If user provides both, they likely want both used
-- **Flexibility**: Allows combining pre-extracted content with additional documents
-- **Architectural Pattern**: Document generation path already handles this correctly
-- **No Performance Issue**: Extraction is fast, merging is trivial
-
-#### Recommendation
-
-**The current behavior in `ai.process` does NOT make architectural sense** because:
-
-1. **Inconsistency**: The action routes to paths that DO merge, but the action itself doesn't
-2. **Lost Functionality**: User cannot combine pre-extracted contentParts with additional documents
-3. **Unexpected Behavior**: Users might expect both to be used (like in other paths)
-
-#### Proposed Fix
-
-Change `ai.process` to merge both with intelligent deduplication:
-
-**Logic Requirements:**
-- Extract content parts from documents (without AI) **only if** that document is not already represented in the `contentParts` list
-- Merge all contentParts
-- Result: Complete list of contentParts for all provided documents (no duplicates)
-
-**Current Implementation** (lines 85-119):
-```python
-# If contentParts not provided but documentList is, extract content first
-if not contentParts and documentList.references:
- # Extract from documentList
- extractedResults = self.services.extraction.extractContent(...)
- contentParts = []
- for extracted in extractedResults:
- if extracted.parts:
- contentParts.extend(extracted.parts)
-```
-
-**Proposed Implementation**:
-```python
-# Step 1: Identify documents already represented in contentParts
-documentsAlreadyExtracted = set()
-if contentParts:
- for part in contentParts:
- documentId = part.metadata.get("documentId")
- if documentId:
- documentsAlreadyExtracted.add(documentId)
- logger.info(f"Found {len(documentsAlreadyExtracted)} documents already represented in contentParts: {documentsAlreadyExtracted}")
-
-# Step 2: Extract from documentList only for documents NOT already in contentParts
-extractedParts = []
-if documentList and documentList.references:
- self.services.chat.progressLogUpdate(operationId, 0.3, "Extracting content from documents")
- chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(documentList)
-
- if chatDocuments:
- # Filter: Only extract documents not already represented
- documentsToExtract = [
- doc for doc in chatDocuments
- if doc.id not in documentsAlreadyExtracted
- ]
-
- if documentsToExtract:
- logger.info(f"Extracting content from {len(documentsToExtract)} new documents (skipping {len(chatDocuments) - len(documentsToExtract)} already represented)")
-
- # Prepare extraction options
- extractionOptions = parameters.get("extractionOptions")
- if not extractionOptions:
- extractionOptions = ExtractionOptions(
- prompt="Extract all content from the document",
- mergeStrategy=MergeStrategy(
- mergeType="concatenate",
- groupBy="typeGroup",
- orderBy="id"
- ),
- processDocumentsIndividually=True
- )
-
- # Extract content (without AI - pure extraction)
- extractedResults = self.services.extraction.extractContent(documentsToExtract, extractionOptions)
-
- # Combine all ContentParts from extracted results
- for extracted in extractedResults:
- if extracted.parts:
- extractedParts.extend(extracted.parts)
-
- logger.info(f"Extracted {len(extractedParts)} content parts from {len(extractedResults)} documents")
- else:
- logger.info(f"All documents from documentList are already represented in contentParts, skipping extraction")
-
-# Step 3: Merge all contentParts
-if contentParts:
- # Preserve pre-extracted content metadata
- for part in contentParts:
- if part.metadata.get("skipExtraction", False):
- part.metadata.setdefault("contentFormat", "extracted")
- part.metadata.setdefault("isPreExtracted", True)
-
- # Merge: extracted parts first, then provided contentParts
- # This ensures extracted content comes before pre-extracted content
- finalContentParts = extractedParts + contentParts
- contentParts = finalContentParts
- logger.info(f"Merged contentParts: {len(extractedParts)} extracted + {len(contentParts) - len(extractedParts)} provided = {len(contentParts)} total")
-elif extractedParts:
- contentParts = extractedParts
-```
-
-**Benefits:**
-- Makes behavior consistent across all paths
-- Allows users to combine pre-extracted content with documents
-- Matches user expectations
-- Follows the architectural pattern already established in document generation path
-
-#### Edge Cases Handled
-
-1. **Duplicate Documents**: Same document in both `contentParts` and `documentList`
- - **Solution**: Check `documentId` in `contentParts` metadata before extracting
- - **Implementation**: Build set of `documentsAlreadyExtracted` from `part.metadata.get("documentId")`
- - **Result**: Only extract documents NOT already represented in `contentParts`
- - **Benefit**: Avoids redundant extraction, prevents duplicate content
-
-2. **Different Extraction Options**: contentParts might have different extraction settings
- - **Solution**: Preserve metadata, let AI handle differences
- - **Note**: Each ContentPart retains its own metadata (extractionPrompt, etc.)
- - **Behavior**: Documents extracted with current options, pre-extracted parts keep their original metadata
-
-3. **Ordering**: Which comes first - extracted or provided?
- - **Solution**: Extracted parts first, then provided contentParts
- - **Rationale**: Newly extracted content comes first, pre-extracted content follows
- - **Implementation**: `finalContentParts = extractedParts + contentParts`
-
-4. **Performance**: Avoids unnecessary extraction
- - **Solution**: Only extracts documents not already in `contentParts`
- - **Benefit**: Skips extraction for documents already represented
- - **Logging**: Logs which documents are skipped and why
-
-5. **Missing documentId in Metadata**: What if contentPart doesn't have documentId?
- - **Solution**: Only documents with `documentId` in metadata are considered "already extracted"
- - **Behavior**: If `documentId` missing, document will be extracted (safe default)
- - **Note**: Extraction service always sets `documentId` in metadata, so this is rare
-
-#### Implementation Steps
-
-1. **Update `ai.process` action** (`process.py` lines 85-119):
- - **Step 1**: Build set of `documentsAlreadyExtracted` from `contentParts` metadata
- - **Step 2**: Filter `chatDocuments` to only include documents NOT in `documentsAlreadyExtracted`
- - **Step 3**: Extract content only from filtered documents (pure extraction, no AI)
- - **Step 4**: Merge extracted parts with provided `contentParts` (extracted first, then provided)
- - **Step 5**: Preserve metadata for pre-extracted contentParts
- - **Step 6**: Add logging for transparency (which documents skipped, counts, etc.)
-
-2. **Update Documentation**:
- - Update action parameter documentation to clarify deduplication behavior
- - Document that extraction only happens for documents not already in `contentParts`
- - Add examples showing both parameters used together
- - Explain how `documentId` metadata is used for deduplication
-
-3. **Testing**:
- - **Test Case 1**: Both parameters provided, no overlap → Both extracted and merged
- - **Test Case 2**: Both parameters provided, full overlap → Only contentParts used, no extraction
- - **Test Case 3**: Both parameters provided, partial overlap → Extract only new documents, merge all
- - **Test Case 4**: Only contentParts → Use as-is
- - **Test Case 5**: Only documentList → Extract all documents
- - **Test Case 6**: contentParts without documentId metadata → Extract all documents (safe default)
-
-4. **Migration**:
- - No breaking changes expected (only adds functionality)
- - Existing code using only one parameter continues to work
- - New behavior: When both provided, intelligently deduplicates before merging
-
-### 9.2 Architectural Redundancy: Duplicate Extraction Logic
-
-#### Problem Statement
-
-**Current Architecture:**
-- `ai.process` action extracts documents and creates `contentParts` (lines 86-119)
-- Then passes only `contentParts` to `callAiContent()` (line 167)
-- `callAiContent()` accepts both `contentParts` AND `documentList` (line 545)
-- Document generation path has `extractAndPrepareContent()` logic (line 103 in `documentPath.py`)
-- But this extraction logic is **never used** when called from `ai.process` (because `documentList` is not passed)
-
-**Question**: Why does `ai.process` extract documents when the AI service already has extraction logic?
-
-#### Analysis
-
-**Current Flow:**
-```
-ai.process
- ├─→ Extract documents → contentParts (lines 86-119)
- ├─→ Pass contentParts to callAiContent() (line 167)
- └─→ callAiContent() routes to document generation path
- └─→ extractAndPrepareContent() exists but is SKIPPED (no documentList)
-```
-
-**Alternative Flow (More Logical):**
-```
-ai.process
- ├─→ Pass documentList to callAiContent() (line 167)
- └─→ callAiContent() routes to document generation path
- └─→ extractAndPrepareContent() handles extraction
-```
-
-#### Issues with Current Architecture
-
-1. **Code Duplication**: Extraction logic exists in both `ai.process` and document generation path
-2. **Inconsistency**: Different extraction paths use different extraction options/logic
-3. **Maintenance Burden**: Changes to extraction logic must be made in multiple places
-4. **Unused Code**: `extractAndPrepareContent()` in document generation path is unused when called from `ai.process`
-5. **Loss of Flexibility**: `ai.process` can't leverage document intent clarification and other features in `extractAndPrepareContent()`
-
-#### Why Current Architecture Exists (Possible Reasons)
-
-1. **Historical**: Extraction may have been added to `ai.process` before AI service had extraction
-2. **Separation of Concerns**: `ai.process` might be intended as a simpler entry point
-3. **Progress Tracking**: Early extraction allows better progress tracking at action level
-4. **Performance**: Early extraction might allow parallel processing
-
-However, these don't justify the duplication and inconsistency.
-
-#### Recommendation
-
-**Option A: Remove Extraction from `ai.process` (Preferred)**
-- `ai.process` should pass `documentList` to `callAiContent()` instead of extracting
-- Let the AI service handle all extraction through `extractAndPrepareContent()`
-- Benefits:
- - Single source of truth for extraction logic
- - Consistent extraction options and behavior
- - Leverages document intent clarification
- - Simpler `ai.process` action
- - Better separation: action layer vs service layer
-
-**Option B: Keep Extraction in `ai.process` but Make it Optional**
-- Add parameter to control whether extraction happens in `ai.process` or AI service
-- Still creates complexity and potential inconsistency
-
-**Option C: Keep Current Architecture (Not Recommended)**
-- Document the duplication and accept it
-- Maintain extraction logic in both places
-- Risk of divergence over time
-
-#### Proposed Refactoring (Option A)
-
-**Current Implementation** (`process.py` lines 85-119):
-```python
-# Extract in ai.process
-if not contentParts and documentList.references:
- extractedResults = self.services.extraction.extractContent(...)
- contentParts = combineExtractedResults(extractedResults)
-
-# Pass only contentParts
-aiResponse = await self.services.ai.callAiContent(
- contentParts=contentParts, # documentList NOT passed
- ...
-)
-```
-
-**Proposed Implementation**:
-```python
-# Don't extract in ai.process - let AI service handle it
-# Pass documentList to AI service
-aiResponse = await self.services.ai.callAiContent(
- prompt=aiPrompt,
- options=options,
- documentList=documentList, # Pass documentList instead
- contentParts=contentParts, # Still support pre-extracted contentParts
- outputFormat=output_format,
- parentOperationId=operationId,
- generationIntent=generationIntent
-)
-```
-
-**Benefits:**
-- Single extraction path in AI service
-- Consistent extraction behavior
-- Leverages document intent clarification
-- Simpler `ai.process` action
-- Better architecture: action layer delegates to service layer
-
-**Migration Path:**
-1. Update `ai.process` to pass `documentList` to `callAiContent()`
-2. Remove extraction logic from `ai.process` (or make it optional)
-3. Ensure `extractAndPrepareContent()` handles all extraction cases
-4. Test that all existing workflows continue to work
-5. Update documentation
-
-**Edge Cases:**
-- Pre-extracted `contentParts` should still be supported (merge with extracted)
-- Extraction options should be configurable via parameters
-- Progress tracking should work at both levels
-
-### 9.3 Target State: Ideal Architecture and Flow
-
-#### Target Architecture Overview
-
-The target state addresses all architectural issues identified:
-1. **Single extraction path** in AI service (no duplication in `ai.process`)
-2. **Intelligent merging** of `contentParts` and `documentList` with deduplication
-3. **Clear separation** of concerns: action layer delegates to service layer
-4. **Consistent behavior** across all code paths
-
-#### Target Flow Diagram
-
-```
-┌─────────────────────────────────────────────────────────────────┐
-│ ai.process Action │
-│ │
-│ 1. Extract Parameters │
-│ ├─→ aiPrompt │
-│ ├─→ documentList (optional) │
-│ ├─→ contentParts (optional) │
-│ ├─→ resultType │
-│ └─→ generationIntent │
-│ │
-│ 2. Determine Operation Type │
-│ ├─→ IMAGE_GENERATE → Route to image generation │
-│ ├─→ DATA_GENERATE → Route to document/code generation │
-│ └─→ DATA_EXTRACT → Route to data extraction │
-│ │
-│ 3. Pass Parameters to AI Service │
-│ └─→ callAiContent( │
-│ prompt=aiPrompt, │
-│ documentList=documentList, ← PASS documentList │
-│ contentParts=contentParts, ← PASS contentParts │
-│ options=options, │
-│ generationIntent=generationIntent │
-│ ) │
-└─────────────────────────────────────────────────────────────────┘
- │
- ▼
-┌─────────────────────────────────────────────────────────────────┐
-│ AI Service: callAiContent() │
-│ │
-│ 1. Route by Operation Type │
-│ └─→ DATA_GENERATE → _handleDocumentGeneration() │
-└─────────────────────────────────────────────────────────────────┘
- │
- ▼
-┌─────────────────────────────────────────────────────────────────┐
-│ Document Generation Path: generateDocument() │
-│ │
-│ Phase 1: Document Intent Clarification │
-│ ┌─────────────────────────────────────────────────────────┐ │
-│ │ if documentList: │ │
-│ │ documents = getChatDocumentsFromDocumentList() │ │
-│ │ │ │
-│ │ # Step 1: Map pre-extracted JSONs to original docs │ │
-│ │ # (for intent analysis, analyze original docs, not JSON)│ │
-│ │ documentMapping = {} │ │
-│ │ resolvedDocuments = [] │ │
-│ │ for doc in documents: │ │
-│ │ preExtracted = resolvePreExtractedDocument(doc) │ │
-│ │ if preExtracted: │ │
-│ │ originalDocId = preExtracted["originalDocument"]["id"]│
-│ │ documentMapping[originalDocId] = doc.id │ │
-│ │ resolvedDocuments.append(originalDoc) │ │
-│ │ else: │ │
-│ │ resolvedDocuments.append(doc) │ │
-│ │ │ │
-│ │ # Step 2: AI analyzes document purposes │ │
-│ │ documentIntents = clarifyDocumentIntents( │ │
-│ │ resolvedDocuments, │ │
-│ │ userPrompt, │ │
-│ │ actionParameters │ │
-│ │ ) │ │
-│ │ │ │
-│ │ # Step 3: Map intents back to JSON doc IDs │ │
-│ │ # (if intent was for original doc, map to JSON doc) │ │
-│ │ for intent in documentIntents: │ │
-│ │ if intent.documentId in documentMapping: │ │
-│ │ intent.documentId = documentMapping[intent.documentId]│
-│ │ │ │
-│ │ # Result: List[DocumentIntent] with: │ │
-│ │ # - documentId: Document ID │ │
-│ │ # - intents: ["extract", "render", "reference"] │ │
-│ │ # - extractionPrompt: Prompt for extraction │ │
-│ │ # - reasoning: Why these intents were chosen │ │
-│ └─────────────────────────────────────────────────────────┘ │
-│ │
-│ Phase 2: Content Extraction and Preparation │
-│ ┌─────────────────────────────────────────────────────────┐ │
-│ │ Step 1: Identify Pre-Extracted JSON Documents │ │
-│ │ preExtractedDocs = [] │ │
-│ │ originalDocIdsCovered = set() │ │
-│ │ for doc in documents: │ │
-│ │ preExtracted = resolvePreExtractedDocument(doc) │ │
-│ │ if preExtracted: │ │
-│ │ preExtractedDocs.append(doc) │ │
-│ │ originalDocId = preExtracted["originalDocument"]["id"]│
-│ │ originalDocIdsCovered.add(originalDocId) │ │
-│ │ │ │
-│ │ Step 2: Filter Out Original Documents │ │
-│ │ # Remove original documents covered by pre-extracted │ │
-│ │ filteredDocuments = [ │ │
-│ │ doc for doc in documents │ │
-│ │ if doc.id not in originalDocIdsCovered │ │
-│ │ ] │ │
-│ │ │ │
-│ │ Step 3: Identify Already Extracted Documents │ │
-│ │ documentsAlreadyExtracted = set() │ │
-│ │ for part in contentParts: │ │
-│ │ if part.metadata.get("documentId"): │ │
-│ │        documentsAlreadyExtracted.add(part.metadata["documentId"]) │ │
-│ │ │ │
-│ │ Step 4: Filter Documents to Extract │ │
-│ │ documentsToExtract = [ │ │
-│ │ doc for doc in filteredDocuments │ │
-│ │ if doc.id not in documentsAlreadyExtracted │ │
-│ │ ] │ │
-│ │ │ │
-│ │ Step 5: Process Pre-Extracted JSON Documents │ │
-│ │ preExtractedParts = [] │ │
-│ │ for doc in preExtractedDocs: │ │
-│ │ preExtracted = resolvePreExtractedDocument(doc) │ │
-│ │ contentExtracted = preExtracted["contentExtracted"] │ │
-│ │ # Extract ContentParts from JSON (not regular JSON) │ │
-│ │ for part in contentExtracted.parts: │ │
-│ │ # Process nested parts if structure part │ │
-│ │ # Apply intents (extract, render, reference) │ │
-│ │ # Mark as pre-extracted │ │
-│ │ part.metadata["isPreExtracted"] = True │ │
-│ │ part.metadata["fromPreExtractedJson"] = True │ │
-│ │ preExtractedParts.append(part) │ │
-│ │ │ │
-│ │ Step 6: RAW Extraction (NO AI) for Regular Documents │ │
-│ │ if documentsToExtract: │ │
-│ │ extractedResults = extractContent( │ │
-│ │ documentsToExtract, │ │
-│ │ extractionOptions │ │
-│ │ ) │ │
-│ │ extractedParts = combineResults(extractedResults) │ │
-│ │ else: │ │
-│ │ extractedParts = [] │ │
-│ │ │ │
-│ │ Step 7: Merge All ContentParts │ │
-│ │ allParts = [] │ │
-│ │ allParts.extend(preExtractedParts) # Pre-extracted first│
-│ │ allParts.extend(extractedParts) # Then extracted │ │
-│ │ if contentParts: │ │
-│ │ # Preserve metadata │ │
-│ │ for part in contentParts: │ │
-│ │ part.metadata.setdefault("isPreExtracted", True) │ │
-│ │ allParts.extend(contentParts) # Then provided │ │
-│ │ │ │
-│ │ finalContentParts = allParts │ │
-│ └─────────────────────────────────────────────────────────┘ │
-│ │
-│ Phase 3: Structure Generation │
-│ ┌─────────────────────────────────────────────────────────┐ │
-│ │ structure = generateStructure( │ │
-│ │ userPrompt, │ │
-│ │ finalContentParts, ← Uses ContentParts metadata │ │
-│ │ outputFormat │ │
-│ │ ) │ │
-│ │ │ │
-│ │ Result: JSON structure with chapters │ │
-│ │ - Each chapter has contentParts assignments │ │
-│ │ - Based on ContentPart metadata (documentId, etc.) │ │
-│ └─────────────────────────────────────────────────────────┘ │
-│ │
-│ Phase 4: Structure Filling │
-│ ┌─────────────────────────────────────────────────────────┐ │
-│ │ filledStructure = fillStructure( │ │
-│ │ structure, │ │
-│ │ finalContentParts, │ │
-│ │ userPrompt │ │
-│ │ ) │ │
-│ │ │ │
-│ │ For each section: │ │
-│ │ 1. Check if ContentPart needsVisionExtraction │ │
-│ │ 2. If yes: Call Vision AI (Phase 2 extraction) │ │
-│ │ 3. Generate section content with AI │ │
-│ └─────────────────────────────────────────────────────────┘ │
-│ │
-│ Phase 5: Document Rendering │
-│ ┌─────────────────────────────────────────────────────────┐ │
-│ │ renderedDocuments = renderDocuments( │ │
-│ │ filledStructure, │ │
-│ │ outputFormat │ │
-│ │ ) │ │
-│ └─────────────────────────────────────────────────────────┘ │
-└─────────────────────────────────────────────────────────────────┘
-```
-
-#### Key Differences from Current State
-
-**Current State Issues:**
-1. ❌ `ai.process` extracts documents (duplication)
-2. ❌ `ai.process` doesn't pass `documentList` to AI service
-3. ❌ No deduplication when both `contentParts` and `documentList` provided
-4. ❌ Inconsistent behavior across code paths
-5. ❌ Pre-extracted JSON documents in `documentList` may not be properly identified
-
-**Target State Benefits:**
-1. ✅ Single extraction path in AI service
-2. ✅ `ai.process` passes both `documentList` and `contentParts`
-3. ✅ Intelligent deduplication (extract only new documents)
-4. ✅ Pre-extracted JSON documents identified and processed as ContentParts (not regular JSON)
-5. ✅ Original documents filtered out if covered by pre-extracted JSON
-6. ✅ Consistent behavior across all code paths
-7. ✅ Better separation of concerns
-
-#### Document Intent Clarification Details
-
-**What Happens in Phase 1:**
-
-1. **Document Resolution**:
- - Maps pre-extracted JSON documents to their original documents
- - Creates `documentMapping` to track original → JSON document ID mapping
- - Resolves documents for intent analysis (analyze original docs, not JSON)
-
-2. **AI Analysis** (`clarifyDocumentIntents`):
- - **Input**: User prompt, resolved documents, action parameters (outputFormat, etc.)
- - **Process**: Uses AI (`callAiPlanning()`) to analyze how each document should be used
- - **Output**: List of `DocumentIntent` objects, one per document
- - **AI Call**: Structured JSON response with intents and reasoning
-
-3. **Intent Determination**:
- - **"extract"**: Content extraction needed (text, structure, OCR, etc.)
- - Used for: PDFs, DOCX, images with text, tables, etc.
- - Generates `extractionPrompt` for specific extraction needs
- - Example: `"Extract all text content, preserving structure"`
- - **"render"**: Image/binary should be rendered as-is (visual element)
- - Used for: Images that should appear in final document
- - No extraction prompt needed
- - Example: Image that should be displayed in PDF/DOCX
- - **"reference"**: Document reference/attachment (no extraction)
- - Used for: Documents mentioned but not extracted
- - No extraction prompt needed
- - Example: Template document referenced but not included
-
-4. **Multiple Intents**:
- - A document can have multiple intents (e.g., `["extract", "render"]`)
- - Example: Image that needs text extraction AND visual rendering
- - Each intent creates a separate ContentPart later in extraction phase
-
-5. **Extraction Prompt Generation**:
- - AI generates specific extraction prompt for each document
- - Based on user prompt, document type, and output format
- - Examples:
- - `"Extract all text content, preserving structure"`
- - `"Extract text content from image using vision AI"`
- - `"Extract tables and data, preserving formatting"`
- - Stored in `DocumentIntent.extractionPrompt` for later use
-
-6. **Mapping Back**:
- - If intent was for original document, map back to JSON document ID
- - Ensures intents are associated with correct documents
- - Pre-extracted JSON documents get intents mapped correctly
-
-**Example Flow**:
-```
-Input:
- documents = [
- ChatDocument(id="doc_1", fileName="report.pdf"),
- ChatDocument(id="doc_2", fileName="image.jpg"),
- ChatDocument(id="json_3", fileName="pre_extracted.json") # Pre-extracted
- ]
- userPrompt = "Create a report with the PDF content and show the image"
-
-Step 1: Map pre-extracted JSON
- → json_3 maps to original_doc_3
- → resolvedDocuments = [doc_1, doc_2, original_doc_3]
-
-Step 2: AI Analysis
- → Analyzes: "Create report with PDF content and show image"
- → Determines:
- - doc_1: ["extract"] (needs text extraction)
- extractionPrompt: "Extract all text content, preserving structure"
- - doc_2: ["render"] (needs visual rendering)
- extractionPrompt: null
- - original_doc_3: ["extract"] (needs extraction)
- extractionPrompt: "Extract all text content, preserving structure"
-
-Step 3: Map back
- → original_doc_3 intent mapped to json_3
- → Final intents:
- - doc_1: ["extract"]
- - doc_2: ["render"]
- - json_3: ["extract"]
-```
-
-**Why This Matters**:
-- Determines HOW each document should be processed (extract vs. render vs. reference)
-- Generates appropriate extraction prompts for each document
-- Handles pre-extracted JSON documents correctly (maps to original for analysis)
-- Enables multiple intents per document (extract + render for images)
-- Guides content extraction phase (Phase 2) on what to extract and how
-
-**Output Structure**:
-```python
-DocumentIntent(
- documentId: str, # Document ID
- intents: List[str], # ["extract", "render", "reference"]
- extractionPrompt: Optional[str], # Prompt for extraction (if extract intent)
- reasoning: str # Why these intents were chosen
-)
-```
-
-#### Pre-Extracted JSON Documents Handling
-
-**Scenario**: ContentParts are already extracted and handed over as JSON documents in `documentList`
-
-**Target State Behavior**:
-
-1. **Identification** (Step 1 in Phase 2):
- - Use `resolvePreExtractedDocument()` to identify JSON documents containing `ContentExtracted` structure
- - These are NOT regular JSON documents - they contain pre-processed ContentParts
- - Map back to original document ID to identify which original documents are covered
-
-2. **Filtering** (Step 2 in Phase 2):
- - Keep pre-extracted JSON documents (will be processed as ContentParts)
- - Remove original documents if covered by pre-extracted JSON (prevents duplicate extraction)
- - Keep regular documents (not pre-extracted, not covered)
-
-3. **Processing** (Step 5 in Phase 2):
- - Extract ContentParts from pre-extracted JSON (not treat as regular JSON)
- - Process nested parts if structure parts contain nested ContentParts
- - Apply intents (extract, render, reference) to each ContentPart
- - Mark with metadata:
- - `isPreExtracted: True`
- - `fromPreExtractedJson: True`
- - `originalFileName`: Original document filename
- - `documentId`: Pre-extracted JSON document ID
-
-4. **Merging** (Step 7 in Phase 2):
- - Merge order: pre-extracted parts → extracted parts → provided contentParts
- - All ContentParts treated equally regardless of source
-
-**Example Flow**:
-```
-documentList = [
- "doc:original_pdf_123", # Original PDF document
- "doc:pre_extracted_json_456" # Pre-extracted JSON (contains ContentParts from original_pdf_123)
-]
-
-Step 1: Identify pre-extracted JSON
- → pre_extracted_json_456 is identified as pre-extracted
- → Maps to original_pdf_123
-
-Step 2: Filter documents
- → Keep pre_extracted_json_456 (will extract ContentParts from JSON)
- → Remove original_pdf_123 (covered by pre-extracted JSON)
-
-Step 5: Process pre-extracted JSON
- → Extract ContentParts from pre_extracted_json_456
- → Mark as isPreExtracted=True, fromPreExtractedJson=True
-
-Step 6: Extract regular documents
- → No documents to extract (all filtered out or pre-extracted)
-
-Step 7: Merge
- → finalContentParts = [ContentParts from pre_extracted_json_456]
-```
-
-**Key Point**: Pre-extracted JSON documents are identified BEFORE deduplication and processed as ContentParts, NOT as regular JSON documents. This prevents treating them as regular JSON and ensures ContentParts are properly extracted and used.
-
-#### Migration Steps
-
-**Phase 1: Update `ai.process` Action**
-
-**Step 1.1: Remove Extraction Logic from `ai.process`**
-- **File**: `gateway/modules/workflows/methods/methodAi/actions/process.py`
-- **Lines**: 85-119
-- **Action**: Remove or comment out extraction logic
-- **Code Change**:
- ```python
- # REMOVE THIS:
- # if not contentParts and documentList.references:
- # extractedResults = self.services.extraction.extractContent(...)
- # contentParts = combineExtractedResults(extractedResults)
- ```
-
-**Step 1.2: Pass `documentList` to `callAiContent()`**
-- **File**: `gateway/modules/workflows/methods/methodAi/actions/process.py`
-- **Line**: 167
-- **Action**: Add `documentList` parameter
-- **Code Change**:
- ```python
- # CURRENT:
- aiResponse = await self.services.ai.callAiContent(
- prompt=aiPrompt,
- options=options,
- contentParts=contentParts, # Only contentParts
- outputFormat=output_format,
- parentOperationId=operationId,
- generationIntent=generationIntent
- )
-
- # TARGET:
- aiResponse = await self.services.ai.callAiContent(
- prompt=aiPrompt,
- options=options,
- documentList=documentList, # ADD documentList
- contentParts=contentParts, # Keep contentParts
- outputFormat=output_format,
- parentOperationId=operationId,
- generationIntent=generationIntent
- )
- ```
-
-**Step 1.3: Update Progress Tracking**
-- **File**: `gateway/modules/workflows/methods/methodAi/actions/process.py`
-- **Action**: Remove extraction progress tracking (moved to AI service)
-- **Note**: Progress tracking will happen in `extractAndPrepareContent()`
-
-**Phase 2: Update Document Generation Path**
-
-**Step 2.1: Document Intent Clarification (Already Exists)**
-- **File**: `gateway/modules/services/serviceAi/subDocumentIntents.py`
-- **Lines**: 30-120
-- **Action**: Verify intent clarification works correctly with new flow
-- **What it does**:
- - **AI Analysis**: Uses AI to analyze user prompt and documents
- - **Determines Intents**: For each document, determines how it should be used:
- - `"extract"`: Content extraction needed (text, structure, OCR, etc.)
- - `"render"`: Image/binary should be rendered as-is (visual element)
- - `"reference"`: Document reference/attachment (no extraction, just reference)
- - **Multiple Intents**: A document can have multiple intents (e.g., `["extract", "render"]` for images)
- - **Extraction Prompt**: Generates specific extraction prompt for each document
- - **Pre-Extracted JSON Handling**: Maps pre-extracted JSONs to original documents for analysis, then maps back
-- **Example Output**:
- ```python
- [
- DocumentIntent(
- documentId="doc_1",
- intents=["extract"],
- extractionPrompt="Extract all text content, preserving structure",
- reasoning="User needs text content for document generation"
- ),
- DocumentIntent(
- documentId="doc_2",
- intents=["extract", "render"], # Both!
- extractionPrompt="Extract text content from image using vision AI",
- reasoning="Image contains text that needs extraction, but also should be rendered visually"
- )
- ]
- ```
-- **Note**: This step already exists and works correctly; it just needs to be verified against the new flow
-
-**Step 2.2: Identify Pre-Extracted JSON Documents**
-- **File**: `gateway/modules/services/serviceGeneration/paths/documentPath.py`
-- **Lines**: 62-87 (already exists, but needs to be integrated with deduplication)
-- **Action**: Ensure pre-extracted JSON documents are identified BEFORE deduplication
-- **Code Change**:
- ```python
- # Step 1: Identify pre-extracted JSON documents
- preExtractedDocs = []
- originalDocIdsCoveredByPreExtracted = set()
- for doc in documents:
- preExtracted = self.services.ai.intentAnalyzer.resolvePreExtractedDocument(doc)
- if preExtracted:
- preExtractedDocs.append(doc)
- originalDocId = preExtracted["originalDocument"]["id"]
- originalDocIdsCoveredByPreExtracted.add(originalDocId)
- logger.info(f"Found pre-extracted JSON {doc.id} covering original document {originalDocId}")
-
- # Step 2: Filter out original documents covered by pre-extracted JSONs
- filteredDocuments = []
- for doc in documents:
- preExtracted = self.services.ai.intentAnalyzer.resolvePreExtractedDocument(doc)
- if preExtracted:
- # Pre-extracted JSON - keep it (will be processed as ContentParts, not regular JSON)
- filteredDocuments.append(doc)
- elif doc.id in originalDocIdsCoveredByPreExtracted:
- # Original document covered by pre-extracted JSON - skip it
- logger.info(f"Skipping original document {doc.id} - already covered by pre-extracted JSON")
- else:
- # Regular document - keep it
- filteredDocuments.append(doc)
-
- documents = filteredDocuments
- ```
-
-**Step 2.3: Add Deduplication Logic for Regular Documents**
-- **File**: `gateway/modules/services/serviceGeneration/paths/documentPath.py`
-- **Lines**: 101-119
-- **Action**: Add deduplication before extraction (after pre-extracted JSON handling)
-- **Code Change**:
- ```python
- # Step 3: Identify already extracted documents (from contentParts)
- documentsAlreadyExtracted = set()
- if contentParts:
- for part in contentParts:
- documentId = part.metadata.get("documentId")
- if documentId:
- documentsAlreadyExtracted.add(documentId)
-
- # Step 4: Filter documents to extract (exclude pre-extracted JSONs and already extracted)
- documentsToExtract = [
- doc for doc in documents
- if doc.id not in documentsAlreadyExtracted
- and not self.services.ai.intentAnalyzer.resolvePreExtractedDocument(doc) # Not pre-extracted JSON
- ]
-
- # Step 5: Process pre-extracted JSON documents (handled in extractAndPrepareContent)
- # Step 6: Extract regular documents
- if documentsToExtract:
- preparedContentParts = await extractAndPrepareContent(
- documentsToExtract, # Only new documents (not pre-extracted, not already extracted)
- documentIntents or [],
- docOperationId
- )
-
- # Merge: pre-extracted parts + extracted parts + provided contentParts
- if contentParts:
- # Preserve metadata
- for part in contentParts:
- part.metadata.setdefault("isPreExtracted", True)
- preparedContentParts.extend(contentParts)
-
- contentParts = preparedContentParts
- elif contentParts:
-        # All documents already extracted or pre-extracted, keep contentParts as-is
-        pass
- ```
-
-**Step 2.4: Ensure Pre-Extracted JSON Processing**
-- **File**: `gateway/modules/services/serviceAi/subContentExtraction.py`
-- **Lines**: 75-253
-- **Action**: Ensure `extractAndPrepareContent()` properly handles pre-extracted JSON documents
-- **Note**: This logic already exists (lines 75-253) but needs to be verified:
- - Pre-extracted JSON documents are identified via `resolvePreExtractedDocument()`
- - ContentParts are extracted from JSON (not treated as regular JSON)
- - Original documents are skipped if covered by pre-extracted JSON
- - Metadata is preserved (`isPreExtracted`, `fromPreExtractedJson`)
-
-**Step 2.5: Verify Pre-Extracted JSON Identification**
-- **File**: `gateway/modules/services/serviceAi/subDocumentIntents.py`
-- **Action**: Ensure `resolvePreExtractedDocument()` correctly identifies pre-extracted JSON documents
-- **Requirements**:
- - Must identify JSON documents containing `ContentExtracted` structure
- - Must map back to original document ID
- - Must extract ContentParts from JSON (not treat as regular JSON)
- - Must preserve metadata (`isPreExtracted`, `fromPreExtractedJson`)
-
-**Step 2.6: Update Extraction Logic**
-- **File**: `gateway/modules/services/serviceAi/subContentExtraction.py`
-- **Action**: Ensure extraction handles deduplication gracefully
-- **Note**: The extraction service already supports this; we just need to pass the filtered documents
-- **Important**: Pre-extracted JSON documents should be processed BEFORE regular extraction
-
-**Phase 3: Testing and Validation**
-
-**Step 3.1: Unit Tests**
-- Test `ai.process` with only `documentList`
-- Test `ai.process` with only `contentParts`
-- Test `ai.process` with both `documentList` and `contentParts` (no overlap)
-- Test `ai.process` with both `documentList` and `contentParts` (full overlap)
-- Test `ai.process` with both `documentList` and `contentParts` (partial overlap)
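-
-A hypothetical sketch of the partial-overlap case (the fixtures `aiServiceStub` and `extractionCalls` are illustrative stand-ins, and test-framework wiring is omitted):
-
-```python
-async def test_partial_overlap_skips_already_extracted(aiServiceStub, extractionCalls):
-    # doc_1 is already covered by a provided ContentPart; only doc_2 should be extracted
-    part = ContentPart(id="p1", typeGroup="text", mimeType="text/plain",
-                       data="already extracted text", metadata={"documentId": "doc_1"})
-    await aiServiceStub.callAiContent(
-        prompt="Summarize",
-        documentList=["doc_1", "doc_2"],
-        contentParts=[part],
-    )
-    assert extractionCalls == ["doc_2"]  # no duplicate extraction for doc_1
-```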
-
-**Step 3.2: Integration Tests**
-- Test full document generation flow
-- Test progress tracking at all levels
-- Test error handling (missing documents, extraction failures)
-- Test performance (no duplicate extraction)
-
-**Step 3.3: Regression Tests**
-- Ensure existing workflows continue to work
-- Test backward compatibility
-- Test edge cases (empty lists, missing metadata, etc.)
-
-**Phase 4: Documentation Updates**
-
-**Step 4.1: Update Action Documentation**
-- **File**: `gateway/modules/workflows/methods/methodAi/methodAi.py`
-- **Action**: Update parameter descriptions to clarify merging behavior
-- **Content**: Document that both parameters can be provided and will be merged intelligently
-
-**Step 4.2: Update API Documentation**
-- Document new behavior in API docs
-- Add examples showing both parameters used together
-- Explain deduplication logic
-
-**Step 4.3: Update This Analysis Document**
-- Mark current state sections as "Current State (Pre-Migration)"
-- Add "Target State" sections (this chapter)
-- Document migration progress
-
-**Phase 5: Rollout Strategy**
-
-**Step 5.1: Feature Flag (Optional)**
-- Add feature flag to control new vs. old behavior
-- Allows gradual rollout
-- Easy rollback if issues found
-
-**Step 5.2: Gradual Migration**
-- Migrate one workflow at a time
-- Monitor for issues
-- Collect feedback
-
-**Step 5.3: Full Migration**
-- Remove old extraction logic from `ai.process`
-- Remove feature flag
-- Update all documentation
-
-#### Migration Checklist
-
-- [ ] **Phase 1: Update `ai.process` Action**
- - [ ] Remove extraction logic from `ai.process`
- - [ ] Pass `documentList` to `callAiContent()`
- - [ ] Update progress tracking
- - [ ] Test `ai.process` with new parameters
-
-- [ ] **Phase 2: Update Document Generation Path**
- - [ ] Identify pre-extracted JSON documents (before deduplication)
- - [ ] Filter out original documents covered by pre-extracted JSONs
- - [ ] Add deduplication logic for regular documents
- - [ ] Ensure pre-extracted JSON processing (extract ContentParts, not treat as JSON)
- - [ ] Update extraction to handle filtered documents
- - [ ] Test merging behavior (pre-extracted + extracted + provided)
- - [ ] Test pre-extracted JSON identification
-
-- [ ] **Phase 3: Testing and Validation**
- - [ ] Unit tests for all scenarios
- - [ ] Integration tests for full flow
- - [ ] Regression tests for existing workflows
- - [ ] Performance tests (no duplicate extraction)
-
-- [ ] **Phase 4: Documentation Updates**
- - [ ] Update action parameter documentation
- - [ ] Update API documentation
- - [ ] Update analysis document
-
-- [ ] **Phase 5: Rollout**
- - [ ] Feature flag (if needed)
- - [ ] Gradual migration
- - [ ] Full migration
- - [ ] Remove old code
-
-- [ ] **Phase 6: Security and Design Improvements**
- - [ ] **CRITICAL: Fix unfenced user input** (Finding 1)
- - [ ] Add fencing around `userPrompt` in intent analysis prompt
- - [ ] Test with various user inputs (special chars, JSON, newlines)
- - [ ] Verify AI still correctly parses user request
- - [ ] **IMPROVEMENT: Per-document output format** (Finding 2)
- - [ ] Add `outputFormat` field to `DocumentIntent` model (optional)
- - [ ] Update intent analysis prompt to determine format per document
- - [ ] Update structure generation to use per-document format
- - [ ] Fallback to global format if not specified
-
-#### Expected Benefits After Migration
-
-1. **Architectural Improvements**:
- - Single source of truth for extraction logic
- - Consistent behavior across all code paths
- - Better separation of concerns
-
-2. **Functional Improvements**:
- - Users can combine pre-extracted content with documents
- - Intelligent deduplication prevents redundant extraction
- - More flexible and powerful API
-
-3. **Maintenance Improvements**:
- - Less code duplication
- - Easier to maintain and extend
- - Clearer code organization
-
-4. **Performance Improvements**:
- - No duplicate extraction
- - Better resource utilization
- - Faster processing for common cases
-
-### 9.4 Two-Phase Extraction: Why Extract Before Structure Generation?
-
-#### Problem Statement
-
-**Question**: Why do we extract content (Step 2) BEFORE structure generation (Step 3), when we need AI to fill sections (Step 4) anyway? Are we extracting twice?
-
-**Answer**: Yes, but it's intentional and necessary. There are TWO different types of extraction happening at different phases:
-
-1. **Phase 1 (Step 2)**: RAW extraction (parsing) - NO AI
-2. **Phase 2 (Step 4)**: Vision AI extraction (for images only) - WITH AI
-
-#### Analysis
-
-**Phase 1: RAW Extraction (Step 2 - `extractAndPrepareContent`)**
-
-**What happens:**
-- Uses `extractContent()` service for pure document parsing
-- Parses PDF, DOCX, XLSX, etc. to extract structured content
-- Creates ContentParts with raw extracted data
-- **No AI involved** - just parsing
-
-**Prompt used:**
-- `intent.extractionPrompt` or default `"Extract all content from the document"`
-- **Important**: This prompt is stored in metadata but NOT used for AI extraction here
-- It's only used later during section generation (Step 4) for Vision AI
-
-**ContentPart preparation:**
-- **For Images**:
- - Marks with `needsVisionExtraction: True`
- - Stores `extractionPrompt` in metadata
- - **Reason**: Vision AI extraction is expensive, so it's deferred to section generation
-- **For Text**:
- - Marks with `skipExtraction: True` (already extracted, no AI needed)
- - Text is already extracted from document parsing
-- **For Objects**:
- - Creates object ContentParts for rendering (images, videos, etc.)
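-
-A minimal sketch of this Phase 1 marking (field names match the metadata listed above; `intent` stands for the `DocumentIntent` resolved for the part's source document):
-
-```python
-# Phase 1: mark parts for later handling - no AI calls happen here
-for part in contentParts:
-    if part.typeGroup == "image":
-        part.metadata["needsVisionExtraction"] = True  # defer expensive Vision AI to Step 4
-        part.metadata["extractionPrompt"] = intent.extractionPrompt or "Extract all content from the document"
-    elif part.typeGroup == "text":
-        part.metadata["skipExtraction"] = True  # text is already available from parsing
-```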
-
-**Why extract before structure generation?**
-- ContentParts are needed BEFORE structure generation so AI can assign them to chapters
-- Structure generation needs to know what content is available to assign to chapters
-- The AI needs ContentPart metadata (documentId, typeGroup, etc.) to make intelligent assignments
-
-**Phase 2: Vision AI Extraction (Step 4 - `fillStructure`)**
-
-**What happens:**
-- During section generation, checks for ContentParts with `needsVisionExtraction == True`
-- Calls Vision AI with `extractionPrompt` from metadata (line 651 in `subStructureFilling.py`)
-- Converts image ContentPart to text ContentPart with extracted text
-- Then uses the text part for section generation
-
-**Prompt used:**
-- `part.metadata.get("extractionPrompt")` or default `"Extract all text content from this image. Return only the extracted text, no additional formatting."`
-- This is the actual AI extraction prompt
-
-**Why extract during section generation?**
-- Vision AI extraction is expensive (costs tokens, takes time)
-- Only needed when actually generating content for a section
-- Not needed for structure generation (just needs to know images exist)
-- Deferred extraction saves costs and improves performance
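-
-A minimal sketch of the deferred check during section filling (`extractTextFromImage()` exists in `subContentExtraction.py`, but the exact signature used here is an assumption):
-
-```python
-# Step 4: extract image text only when the section is actually generated
-if part.metadata.get("needsVisionExtraction"):
-    visionPrompt = part.metadata.get("extractionPrompt") or \
-        "Extract all text content from this image. Return only the extracted text, no additional formatting."
-    extractedText = await self.contentExtractor.extractTextFromImage(part, visionPrompt)
-    part = ContentPart(id=part.id, typeGroup="text", mimeType="text/plain",
-                       data=extractedText, metadata=part.metadata)  # image part → text part
-```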
-
-#### Current Flow
-
-```
-Step 2: extractAndPrepareContent()
- ├─→ RAW extraction (parsing PDF/DOCX/etc.) - NO AI
- ├─→ Creates ContentParts with raw data
- ├─→ For images: marks needsVisionExtraction=True, stores extractionPrompt
- └─→ For text: marks skipExtraction=True (already extracted)
-
-Step 3: generateStructure()
- ├─→ Uses ContentParts metadata to assign to chapters
- └─→ Creates structure with contentPart assignments
-
-Step 4: fillStructure()
- ├─→ For each section:
- │ ├─→ Check if ContentPart needsVisionExtraction==True
- │ ├─→ If yes: Call Vision AI with extractionPrompt (Phase 2 extraction)
- │ ├─→ Convert image → text ContentPart
- │ └─→ Generate section content with processed ContentParts
- └─→ Text ContentParts: Used directly (skipExtraction=True)
-```
-
-#### Is This Optimal?
-
-**Arguments FOR current approach:**
-- Structure generation needs ContentParts early (to assign to chapters)
-- Vision AI extraction is expensive - deferring saves costs
-- Text content doesn't need AI extraction (already extracted in Phase 1)
-- Clear separation: parsing vs. AI extraction
-
-**Arguments AGAINST current approach:**
-- Two-phase extraction can be confusing
-- `extractionPrompt` stored but not used until later (unclear)
-- Could potentially extract images earlier if structure generation needs text content
-
-#### Recommendation
-
-**Current approach is reasonable** but documentation should be clearer:
-
-1. **Clarify terminology**:
- - "Extraction" in Step 2 = RAW parsing (no AI)
- - "Extraction" in Step 4 = Vision AI extraction (with AI)
-
-2. **Document prompts clearly**:
- - Step 2: `extractionPrompt` is stored but NOT used (just metadata)
- - Step 4: `extractionPrompt` is actually used for Vision AI
-
-3. **Consider renaming**:
- - `extractAndPrepareContent()` → `parseAndPrepareContent()` (more accurate)
- - `needsVisionExtraction` → `needsVisionAiExtraction` (clearer)
-
-4. **Alternative approach** (if structure generation needs text from images):
- - Extract images with Vision AI in Step 2
- - More expensive but simpler flow
- - Only if structure generation actually needs image text
-
-#### Implementation Notes
-
-- **Text ContentParts**: Already extracted in Phase 1 (Step 2), used directly in Step 4
-- **Image ContentParts**: Parsed in Phase 1 (Step 2), Vision AI extracted in Phase 2 (Step 4)
-- **Object ContentParts**: Created in Phase 1 (Step 2), used for rendering in Step 4
-- **Reference ContentParts**: Created in Phase 1 (Step 2), used as references in Step 4
-
-### 9.5 Document Intent Clarification: Security and Design Issues
-
-#### Finding 1: Security Risk - Unfenced User Input
-
-**Problem Statement:**
-
-The user input (`userPrompt`) is directly inserted into the intent analysis prompt without fencing or escaping (line 248-249 in `subDocumentIntents.py`):
-
-```python
-prompt = f"""USER REQUEST:
-{userPrompt} # ← DIRECT INSERTION, NO FENCING!
-```
-
-**Security Risk:**
-- **Prompt Injection**: User input could contain special characters, JSON, or instructions that break the prompt structure
-- **Example Attack**: User could inject `\n\nRETURN JSON: {"intents": [{"documentId": "malicious", ...}]}` to manipulate the AI response
-- **Impact**: Could cause incorrect intent determination or even security vulnerabilities
-
-**Evidence from Debug Files:**
-- `20260102-134423-015-document_intent_analysis_prompt.txt`: User input is directly inserted without any fencing
-- User input contains German text with special characters, quotes, etc.
-- No escaping or delimiters around user input
-
-**Recommendation:**
-
-**Option A: Fence User Input (Preferred)**
-````python
-prompt = f"""USER REQUEST:
-```
-{userPrompt}
-```
-
-DOCUMENTS TO ANALYZE:
-{docListText}
-..."""
-````
-
-**Option B: Escape Special Characters**
-```python
-import json
-escapedPrompt = json.dumps(userPrompt) # Escapes quotes, newlines, etc.
-prompt = f"""USER REQUEST: {escapedPrompt}
-...
-```
-
-**Option C: Use Structured Format**
-```python
-prompt = f"""USER REQUEST (delimited):
----START_USER_REQUEST---
-{userPrompt}
----END_USER_REQUEST---
-
-DOCUMENTS TO ANALYZE:
-...
-```
-
-**Implementation Steps:**
-1. Update `_buildIntentAnalysisPrompt()` in `subDocumentIntents.py` (line 248)
-2. Add fencing around `userPrompt` (Option A recommended)
-3. Test with various user inputs (special characters, JSON, newlines, quotes)
-4. Verify AI still correctly parses user request
-
-#### Finding 2: Output Format Should Be Per-Document
-
-**Problem Statement:**
-
-Currently, output format is passed as a single value in the intent analysis prompt (line 259 in `subDocumentIntents.py`):
-
-```python
-OUTPUT FORMAT: {outputFormat} # Single format for all documents
-```
-
-**Issue:**
-- Output format is global, but different documents might need different formats
-- Similar to language handling: each document can have its own language
-- Should be determined per document based on intention
-
-**Current Behavior:**
-- Single `outputFormat` parameter (e.g., "docx")
-- All documents analyzed with same output format in mind
-- AI considers output format when determining intents (e.g., DOCX → images need "render")
-
-**Proposed Behavior:**
-- Each `DocumentIntent` should have optional `outputFormat` field
-- AI determines output format per document based on user intention
-- If not specified, use global output format as fallback
-- Similar to language: per-document with fallback to global
-
-**Example:**
-```python
-class DocumentIntent(BaseModel):
-    documentId: str
-    intents: List[str]
-    extractionPrompt: Optional[str] = None
-    reasoning: str
-    outputFormat: Optional[str] = None  # NEW: Per-document format
-```
-
-**Benefits:**
-- More flexible: Different documents can have different output formats
-- Better intention analysis: AI can determine format based on document purpose
-- Consistent with language handling (per-document with fallback)
-
-**Migration Steps:**
-1. Add `outputFormat` field to `DocumentIntent` model (optional)
-2. Update intent analysis prompt to ask AI to determine format per document
-3. Update prompt to show: "OUTPUT FORMAT (default: {outputFormat})" instead of "OUTPUT FORMAT: {outputFormat}"
-4. Update structure generation to use per-document format if available
-5. Fallback to global format if not specified per document
-
-**Updated Prompt Structure:**
-```python
-OUTPUT FORMAT (default: {outputFormat}):
-- If not specified per document, use default format above
-- Determine format per document based on user intention
-- Examples: "docx", "pdf", "html", "json", etc.
-
-RETURN JSON:
-{{
- "intents": [
- {{
- "documentId": "doc_1",
- "intents": ["extract"],
- "extractionPrompt": "...",
- "outputFormat": "docx", # NEW: Per-document format
- "reasoning": "..."
- }}
- ]
-}}
-```
-
-#### Implementation Priority
-
-**High Priority:**
-- Finding 1 (Security Risk): **CRITICAL** - Fix immediately
- - Security vulnerability that could be exploited
- - Easy to fix (add fencing)
- - Low risk change
-
-**Medium Priority:**
-- Finding 2 (Output Format): **IMPROVEMENT** - Plan for next iteration
- - Architectural improvement
- - Requires model changes
- - More complex migration
-
----
-
-## 10. Implementation Plan: Target State Migration
-
-This section provides a detailed implementation plan for migrating to the target architecture described in Section 9.3. The plan focuses on documents/content handling, output formats, languages, and clear handover states between phases.
-
-### 10.1 Overview: Major Phases and Handover States
-
-#### Phase Flow Diagram
-
-```
-┌─────────────────────────────────────────────────────────────────────┐
-│ PHASE 1: Document Intent Clarification │
-│ ────────────────────────────────────────────────────────────────── │
-│ INPUT: │
-│ - userPrompt: str (fenced) │
-│ - documentList: DocumentReferenceList (optional) │
-│ - contentParts: List[ContentPart] (optional) │
-│ - actionParameters: Dict (outputFormat, language, etc.) │
-│ │
-│ THROUGHPUT: │
-│ 1. Resolve documents from documentList │
-│ 2. Map pre-extracted JSONs to original documents │
-│ 3. AI analyzes document purposes │
-│ 4. Map intents back to JSON doc IDs (if applicable) │
-│ │
-│ OUTPUT: │
-│ - documentIntents: List[DocumentIntent] │
-│ * documentId: str │
-│ * intents: List[str] (["extract", "render", "reference"]) │
-│ * extractionPrompt: str (optional) │
-│ * outputFormat: str (optional, per-document) ← NEW │
-│ * language: str (optional, per-document) ← NEW │
-│ * reasoning: str │
-│ │
-│ HANDOVER STATE: │
-│ - documentIntents: Complete intent analysis │
-│ - documents: Resolved ChatDocuments │
-│ - preExtractedMapping: Map[originalDocId, jsonDocId] │
-└─────────────────────────────────────────────────────────────────────┘
- │
- ▼
-┌─────────────────────────────────────────────────────────────────────┐
-│ PHASE 2: Content Extraction and Preparation │
-│ ────────────────────────────────────────────────────────────────── │
-│ INPUT: │
-│ - documents: List[ChatDocument] │
-│ - documentIntents: List[DocumentIntent] │
-│ - contentParts: List[ContentPart] (optional, pre-extracted) │
-│ - preExtractedMapping: Map[originalDocId, jsonDocId] │
-│ │
-│ THROUGHPUT: │
-│ 1. Identify pre-extracted JSON documents │
-│ 2. Filter out original documents covered by pre-extracted │
-│ 3. Identify already extracted documents (from contentParts) │
-│ 4. Filter documents to extract (exclude duplicates) │
-│ 5. Process pre-extracted JSON documents → ContentParts │
-│ 6. RAW extraction (NO AI) for regular documents │
-│ 7. Merge: pre-extracted + extracted + provided contentParts │
-│ 8. Apply intents to ContentParts (extract, render, reference) │
-│ 9. Mark images for Vision AI extraction (deferred) │
-│ │
-│ OUTPUT: │
-│ - finalContentParts: List[ContentPart] │
-│ * id: str │
-│ * typeGroup: str │
-│ * mimeType: str │
-│ * data: Union[str, bytes] │
-│ * metadata: Dict │
-│ - documentId: str │
-│ - contentFormat: str ("extracted", "object", "reference") │
-│ - intent: str │
-│ - needsVisionExtraction: bool (for images) │
-│ - extractionPrompt: str (for Vision AI) │
-│ - originalFileName: str │
-│ - isPreExtracted: bool │
-│ - outputFormat: str (from DocumentIntent) ← NEW │
-│ - language: str (from DocumentIntent) ← NEW │
-│ │
-│ HANDOVER STATE: │
-│ - finalContentParts: Complete, ready for structure generation │
-│ - All documents processed (extracted or pre-extracted) │
-│ - Vision AI extraction deferred to Phase 4 │
-└─────────────────────────────────────────────────────────────────────┘
- │
- ▼
-┌─────────────────────────────────────────────────────────────────────┐
-│ PHASE 3: Structure Generation │
-│ ────────────────────────────────────────────────────────────────── │
-│ INPUT: │
-│ - userPrompt: str │
-│ - finalContentParts: List[ContentPart] │
-│ - globalOutputFormat: str (fallback) │
-│ - globalLanguage: str (fallback) │
-│ │
-│ THROUGHPUT: │
-│ 1. Group ContentParts by documentId │
-│ 2. Determine per-document outputFormat (from ContentPart.metadata│
-│ or global fallback) │
-│ 3. Determine per-document language (from ContentPart.metadata │
-│ or global fallback) │
-│ 4. AI generates structure with chapters │
-│ 5. Assign ContentParts to chapters │
-│ │
-│ OUTPUT: │
-│ - chapterStructure: Dict │
-│ * documents: List[Dict] │
-│ - id: str │
-│ - title: str │
-│ - outputFormat: str (per-document) ← NEW │
-│ - language: str (per-document) ← NEW │
-│ - chapters: List[Dict] │
-│ * id: str │
-│ * level: int │
-│ * title: str │
-│ * generationHint: str │
-│ * contentParts: List[str] (ContentPart IDs) │
-│ │
-│ HANDOVER STATE: │
-│ - chapterStructure: Complete structure with ContentPart │
-│ assignments │
-│ - Per-document format/language determined │
-└─────────────────────────────────────────────────────────────────────┘
- │
- ▼
-┌─────────────────────────────────────────────────────────────────────┐
-│ PHASE 4: Structure Filling │
-│ ────────────────────────────────────────────────────────────────── │
-│ INPUT: │
-│ - chapterStructure: Dict │
-│ - finalContentParts: List[ContentPart] │
-│ - userPrompt: str │
-│ │
-│ THROUGHPUT: │
-│ For each chapter: │
-│ 1. Generate sections structure (parallel) │
-│ 2. For each section: │
-│ a. Check if ContentParts need Vision AI extraction │
-│ b. If yes: Call Vision AI (Phase 2 deferred extraction) │
-│ c. Determine prompt type: │
-│ - WITH CONTENT: If contentParts assigned │
-│ → Use aggregation prompt (isAggregation=True) │
-│ → ContentParts passed as parameters │
-│ - WITHOUT CONTENT: If no contentParts │
-│ → Use generation prompt (isAggregation=False) │
-│ → Only generationHint in prompt │
-│ d. Generate section content with AI │
-│ │
-│ OUTPUT: │
-│ - filledStructure: Dict │
-│ * documents: List[Dict] │
-│ - chapters: List[Dict] │
-│ * sections: List[Dict] │
-│ - id: str │
-│ - content_type: str │
-│ - elements: List[Dict] │
-│ * type: str │
-│ * content: str (or base64 for images) │
-│ │
-│ HANDOVER STATE: │
-│ - filledStructure: Complete content, ready for rendering │
-│ - All Vision AI extractions completed │
-└─────────────────────────────────────────────────────────────────────┘
- │
- ▼
-┌─────────────────────────────────────────────────────────────────────┐
-│ PHASE 5: Document Rendering │
-│ ────────────────────────────────────────────────────────────────── │
-│ INPUT: │
-│ - filledStructure: Dict │
-│ - per-document outputFormat (from Phase 3) │
-│ - per-document language (from Phase 3) │
-│ │
-│ THROUGHPUT: │
-│ 1. Group sections by document (from structure) │
-│ 2. For each document: │
-│ a. Use per-document outputFormat │
-│ b. Use per-document language │
-│ c. Render document in specified format │
-│ │
-│ OUTPUT: │
-│ - renderedDocuments: List[DocumentData] │
-│ * documentName: str │
-│ * documentData: bytes │
-│ * mimeType: str │
-│ │
-│ HANDOVER STATE: │
-│ - renderedDocuments: Final output ready for user │
-└─────────────────────────────────────────────────────────────────────┘
-```
-
-### 10.2 Detailed Implementation Steps
-
-#### Step 1: Update DocumentIntent Model
-
-**File**: `gateway/modules/datamodels/datamodelExtraction.py`
-
-**Changes**:
-```python
-class DocumentIntent(BaseModel):
- documentId: str
- intents: List[str] # ["extract", "render", "reference"]
- extractionPrompt: Optional[str] = None
- outputFormat: Optional[str] = None # ← NEW: Per-document format
- language: Optional[str] = None # ← NEW: Per-document language
- reasoning: str
-```
-
-**Rationale**:
-- Enables per-document output format and language determination
-- Aligns with existing language handling pattern
-- Allows AI to determine format/language based on document purpose
-
-#### Step 2: Update Intent Analysis Prompt
-
-**File**: `gateway/modules/services/serviceAi/subDocumentIntents.py`
-
-**Changes**:
-
-1. **Add fencing around userPrompt** (Security Fix):
-```python
-def _buildIntentAnalysisPrompt(
- self,
- userPrompt: str,
- documents: List[ChatDocument],
- actionParameters: Dict[str, Any]
-) -> str:
- # FENCE user input to prevent prompt injection
- fencedUserPrompt = f"""```user_request
-{userPrompt}
-```"""
-
- prompt = f"""USER REQUEST:
-{fencedUserPrompt}
-
-DOCUMENTS TO ANALYZE:
-{docListText}
-
-TASK: For each document, determine:
-1. Intents (can be multiple): "extract", "render", "reference"
-2. Output format (optional): If document should be rendered in specific format
-3. Language (optional): If document content should be in specific language
-
-OUTPUT FORMAT: {outputFormat} (global fallback)
-
-RETURN JSON:
-{{
- "intents": [
- {{
- "documentId": "doc_1",
- "intents": ["extract"],
- "extractionPrompt": "Extract all text content",
- "outputFormat": "pdf", // ← NEW: Optional, per-document
- "language": "de", // ← NEW: Optional, per-document
- "reasoning": "..."
- }}
- ]
-}}
-"""
-```
-
-2. **Remove global outputFormat from prompt** (or keep as fallback only):
- - Output format should be determined per document based on intent
- - Global format remains as fallback if not specified per document
-
-#### Step 3: Update ContentPart Metadata Propagation
-
-**File**: `gateway/modules/services/serviceAi/subContentExtraction.py`
-
-**Changes**:
-```python
-async def extractAndPrepareContent(
- self,
- documents: List[ChatDocument],
- documentIntents: List[DocumentIntent],
- parentOperationId: str,
- getIntentForDocument: callable
-) -> List[ContentPart]:
- # ... existing extraction logic ...
-
- # When creating ContentParts, propagate outputFormat and language from DocumentIntent
- for part in allContentParts:
- intent = getIntentForDocument(part.metadata.get("documentId"), documentIntents)
- if intent:
- # Propagate per-document format and language to ContentPart
- if intent.outputFormat:
- part.metadata["outputFormat"] = intent.outputFormat
- if intent.language:
- part.metadata["language"] = intent.language
-```
-
-**Rationale**:
-- ContentParts carry format/language information through pipeline
-- Enables per-document rendering in Phase 5
-
-#### Step 4: Update Structure Generation
-
-**File**: `gateway/modules/services/serviceAi/subStructureGeneration.py`
-
-**Changes**:
-
-1. **Determine per-document format/language from ContentParts**:
-```python
-def generateStructure(
- self,
- userPrompt: str,
- contentParts: List[ContentPart],
- outputFormat: str, # Global fallback
- language: str, # Global fallback
- parentOperationId: str
-) -> Dict[str, Any]:
- # Group ContentParts by documentId
- partsByDocument = {}
- for part in contentParts:
- docId = part.metadata.get("documentId", "default")
- if docId not in partsByDocument:
- partsByDocument[docId] = []
- partsByDocument[docId].append(part)
-
- # Determine per-document format and language
- documentFormats = {}
- documentLanguages = {}
- for docId, parts in partsByDocument.items():
- # Get format from first ContentPart (all parts from same doc should have same format)
- docFormat = parts[0].metadata.get("outputFormat") or outputFormat
- docLanguage = parts[0].metadata.get("language") or language
- documentFormats[docId] = docFormat
- documentLanguages[docId] = docLanguage
-
- # Update prompt to include per-document format/language
- prompt = self._buildStructureGenerationPrompt(
- userPrompt=userPrompt,
- contentParts=contentParts,
- documentFormats=documentFormats, # ← NEW
- documentLanguages=documentLanguages, # ← NEW
- globalOutputFormat=outputFormat, # Fallback
- globalLanguage=language # Fallback
- )
-```
-
-2. **Update prompt to include per-document format/language**:
-```python
-def _buildStructureGenerationPrompt(
- self,
- userPrompt: str,
- contentParts: List[ContentPart],
- documentFormats: Dict[str, str], # ← NEW
- documentLanguages: Dict[str, str], # ← NEW
- globalOutputFormat: str,
- globalLanguage: str
-) -> str:
- # ... existing prompt building ...
-
- # Add per-document format/language information
- formatLanguageInfo = "\n## PER-DOCUMENT OUTPUT FORMATS AND LANGUAGES\n"
- for docId, docFormat in documentFormats.items():
- docLanguage = documentLanguages.get(docId, globalLanguage)
- formatLanguageInfo += f"- Document {docId}: Format={docFormat}, Language={docLanguage}\n"
-
- prompt += formatLanguageInfo
-
- prompt += """
-## DOCUMENT LANGUAGE
-- Each document can have its own language (ISO 639-1 code: "de", "en", "fr", etc.)
-- Per-document languages are listed above
-- If not specified, use global language: "{globalLanguage}"
-
-## OUTPUT FORMAT
-- Each document can have its own output format
-- Per-document formats are listed above
-- If not specified, use global format: "{globalOutputFormat}"
-"""
-```
-
-#### Step 5: Update Structure Filling - Two Prompt Types
-
-**File**: `gateway/modules/services/serviceAi/subStructureFilling.py`
-
-**Changes**:
-
-1. **Ensure two prompt types are used** (already implemented, verify):
-```python
-async def _fillSingleSection(
- self,
- section: Dict[str, Any],
- contentParts: List[ContentPart],
- userPrompt: str,
- generationHint: str,
- # ... other params ...
-) -> List[Dict[str, Any]]:
- contentPartIds = section.get("contentPartIds", [])
- hasContentParts = len(contentPartIds) > 0
-
- if hasContentParts:
- # PROMPT TYPE 1: WITH CONTENT (Aggregation)
- # ContentParts passed as parameters, not in prompt text
- isAggregation = True
- relevantParts = [p for p in contentParts if p.id in contentPartIds]
-
- generationPrompt = self._buildSectionGenerationPrompt(
- section=section,
- contentParts=relevantParts, # Passed as parameters
- userPrompt=userPrompt,
- generationHint=generationHint,
- isAggregation=True, # ← Key flag
- language=language
- )
- else:
- # PROMPT TYPE 2: WITHOUT CONTENT (Generation)
- # Only generationHint in prompt, no ContentParts
- isAggregation = False
-
- generationPrompt = self._buildSectionGenerationPrompt(
- section=section,
- contentParts=[], # Empty
- userPrompt=userPrompt,
- generationHint=generationHint,
- isAggregation=False, # ← Key flag
- language=language
- )
-```
-
-2. **Verify `_buildSectionGenerationPrompt` handles both cases**:
-```python
-def _buildSectionGenerationPrompt(
- self,
- section: Dict[str, Any],
- contentParts: List[ContentPart],
- userPrompt: str,
- generationHint: str,
- isAggregation: bool, # ← Determines prompt type
- language: str
-) -> str:
-    sectionTitle = section.get("title", "")
-    if isAggregation:
- # TYPE 1: WITH CONTENT
- # ContentParts are passed as parameters to AI call
- # Don't include full content in prompt text (token efficiency)
- prompt = f"""Generate content for section based on provided ContentParts.
-
-Section: {sectionTitle}
-Generation Hint: {generationHint}
-Language: {language}
-
-ContentParts are provided as parameters (not shown in prompt for efficiency).
-Use the ContentParts data to generate the section content.
-"""
- else:
- # TYPE 2: WITHOUT CONTENT
- # Only generationHint, no ContentParts
- prompt = f"""Generate content for section based on generation hint.
-
-Section: {sectionTitle}
-Generation Hint: {generationHint}
-Language: {language}
-
-Generate content based on the generation hint without referencing external content.
-"""
-```
-
-**Rationale**:
-- **Type 1 (with content)**: Efficient for large content (ContentParts as parameters)
-- **Type 2 (without content)**: Simple generation based on hint only
-- Already implemented via `isAggregation` flag, verify it's used correctly
-
-#### Step 6: Update Document Rendering
-
-**File**: `gateway/modules/services/serviceGeneration/paths/documentPath.py`
-
-**Changes**:
-```python
-async def renderDocuments(
- self,
- filledStructure: Dict[str, Any],
- outputFormat: str, # Global fallback
- language: str # Global fallback
-) -> List[DocumentData]:
- renderedDocuments = []
-
- for doc in filledStructure.get("documents", []):
- docId = doc.get("id")
- docFormat = doc.get("outputFormat") or outputFormat # ← Use per-document format
- docLanguage = doc.get("language") or language # ← Use per-document language
-
- # Render document with per-document format and language
- renderedDoc = await self._renderSingleDocument(
- doc=doc,
- outputFormat=docFormat,
- language=docLanguage
- )
- renderedDocuments.append(renderedDoc)
-
- return renderedDocuments
-```
-
-#### Step 7: Update ai.process to Pass documentList
-
-**File**: `gateway/modules/workflows/methods/methodAi/actions/process.py`
-
-**Changes**:
-```python
-# Phase 7.3: Pass both documentList and contentParts to AI service
-# (Remove extraction logic from here - handled by AI service)
-
-# Use unified callAiContent method with BOTH parameters
-aiResponse = await self.services.ai.callAiContent(
- prompt=aiPrompt,
- options=options,
- documentList=documentList, # ← PASS documentList (was missing)
- contentParts=contentParts, # ← PASS contentParts
- outputFormat=output_format,
- parentOperationId=operationId,
- generationIntent=generationIntent
-)
-```
-
-**Rationale**:
-- Centralizes extraction logic in AI service
-- Enables intelligent merging with deduplication
-- Consistent behavior across all code paths
-
-### 10.3 Handover State Definitions
-
-#### State 1: After Intent Clarification
-```python
-class IntentClarificationState:
- documentIntents: List[DocumentIntent] # Complete intent analysis
- documents: List[ChatDocument] # Resolved documents
- preExtractedMapping: Dict[str, str] # Map[originalDocId, jsonDocId]
-
- # Validation
- assert len(documentIntents) == len(documents) # One intent per document
- assert all(intent.documentId in [d.id for d in documents] for intent in documentIntents)
-```
-
-#### State 2: After Content Extraction
-```python
-class ContentExtractionState:
- finalContentParts: List[ContentPart] # All content parts ready
-
- # Validation
- assert all(part.metadata.get("documentId") for part in finalContentParts)
- assert all(part.metadata.get("contentFormat") in ["extracted", "object", "reference"]
- for part in finalContentParts)
- # All documents either extracted or pre-extracted
- assert len(set(p.metadata.get("documentId") for p in finalContentParts)) == len(documents)
-```
-
-#### State 3: After Structure Generation
-```python
-class StructureGenerationState:
- chapterStructure: Dict[str, Any] # Complete structure
-
- # Validation
- assert "documents" in chapterStructure
- for doc in chapterStructure["documents"]:
- assert "outputFormat" in doc # Per-document format
- assert "language" in doc # Per-document language
- assert "chapters" in doc
- for chapter in doc["chapters"]:
- assert "contentParts" in chapter # ContentPart assignments
-```
-
-#### State 4: After Structure Filling
-```python
-class StructureFillingState:
- filledStructure: Dict[str, Any] # Complete content
-
- # Validation
- assert "documents" in filledStructure
- for doc in filledStructure["documents"]:
- for chapter in doc.get("chapters", []):
- for section in chapter.get("sections", []):
- assert "elements" in section # Generated elements
- # All Vision AI extractions completed
- assert not any(p.metadata.get("needsVisionExtraction")
- for p in contentParts)
-```
-
-#### State 5: After Document Rendering
-```python
-class DocumentRenderingState:
- renderedDocuments: List[DocumentData] # Final output
-
- # Validation
- assert len(renderedDocuments) > 0
- for doc in renderedDocuments:
- assert doc.documentData # Non-empty
- assert doc.mimeType # Valid MIME type
-```
-
-### 10.4 Migration Checklist
-
-#### Phase 1: Model Updates
-- [ ] Add `outputFormat` and `language` to `DocumentIntent` model
-- [ ] Update intent analysis prompt parser to handle new fields
-- [ ] Add validation for new fields
-
-#### Phase 2: Intent Analysis Updates
-- [ ] **CRITICAL**: Add fencing around `userPrompt` in intent analysis prompt
-- [ ] Update prompt to ask for per-document format/language
-- [ ] Update prompt to remove global outputFormat dependency (or keep as fallback)
-- [ ] Test with various user inputs (special chars, JSON, newlines)
-
-#### Phase 3: Content Extraction Updates
-- [ ] Propagate `outputFormat` and `language` from `DocumentIntent` to `ContentPart.metadata`
-- [ ] Verify pre-extracted JSON handling preserves format/language
-- [ ] Test merging logic with format/language propagation
-
-#### Phase 4: Structure Generation Updates
-- [ ] Group ContentParts by documentId
-- [ ] Determine per-document format/language from ContentPart metadata
-- [ ] Update structure generation prompt to include per-document info
-- [ ] Update structure output to include per-document format/language
-
-#### Phase 5: Structure Filling Verification
-- [ ] Verify two prompt types are correctly used:
- - [ ] `isAggregation=True`: ContentParts as parameters
- - [ ] `isAggregation=False`: Only generationHint
-- [ ] Test both prompt types with various scenarios
-- [ ] Verify Vision AI extraction happens during filling phase
-
-#### Phase 6: Document Rendering Updates
-- [ ] Use per-document format from structure
-- [ ] Use per-document language from structure
-- [ ] Fallback to global format/language if not specified
-- [ ] Test multi-document rendering with different formats/languages
-
-#### Phase 7: ai.process Refactoring
-- [ ] Remove extraction logic from `ai.process`
-- [ ] Pass `documentList` to `callAiContent()`
-- [ ] Pass `contentParts` to `callAiContent()`
-- [ ] Verify intelligent merging in AI service works correctly
-
-#### Phase 8: Testing
-- [ ] Test with pre-extracted JSON documents
-- [ ] Test with mixed `documentList` + `contentParts`
-- [ ] Test per-document format/language determination
-- [ ] Test two prompt types in structure filling
-- [ ] Test multi-document output with different formats/languages
-- [ ] Test security: prompt injection attempts with fenced input
-
-#### Phase 9: Documentation
-- [ ] Update API documentation
-- [ ] Update developer documentation
-- [ ] Update user documentation (if applicable)
-
----
-
-## End of Analysis
-
-This document provides a comprehensive overview of the content extraction and processing logic in the `ai.process` action. For implementation details, refer to the source files referenced throughout this document.
-
-**Note**: The "Recommendations and Next Steps" section (Section 9) will be expanded with additional findings and improvements as analysis continues.
diff --git a/modules/services/serviceAi/PARALLEL_PROCESSING_CONCEPT.md b/modules/services/serviceAi/PARALLEL_PROCESSING_CONCEPT.md
deleted file mode 100644
index d8b55298..00000000
--- a/modules/services/serviceAi/PARALLEL_PROCESSING_CONCEPT.md
+++ /dev/null
@@ -1,376 +0,0 @@
-# Parallel Processing Refactoring Concept
-
-## Current State (Sequential)
-
-### Chapter Sections Structure Generation (`_generateChapterSectionsStructure`)
-- **Current**: Processes chapters sequentially, one after another
-- **Flow**:
- 1. Iterate through documents
- 2. For each document, iterate through chapters
- 3. For each chapter, generate sections structure using AI
- 4. Update progress after each chapter
-
-### Section Content Generation (`_fillChapterSections`)
-- **Current**: Processes chapters sequentially, sections within each chapter sequentially
-- **Flow**:
- 1. Iterate through documents
- 2. For each document, iterate through chapters
- 3. For each chapter, iterate through sections
- 4. For each section, generate content using AI
- 5. Update progress after each section
-
-## Desired State (Parallel)
-
-### Chapter Sections Structure Generation
-- **Target**: Process all chapters in parallel
-- **Requirements**:
- - Maintain chapter order in final result
- - Each chapter can be processed independently
- - Progress updates should reflect parallel processing
- - Errors in one chapter should not stop others
-
-### Section Content Generation
-- **Target**: Process sections within each chapter in parallel
-- **Requirements**:
- - Maintain section order within each chapter
- - Sections within a chapter can be processed independently
- - Chapters still processed sequentially (to maintain order)
- - Progress updates should reflect parallel processing
- - Errors in one section should not stop others
-
-## Implementation Strategy
-
-### Phase 1: Chapter Sections Structure Generation Parallelization
-
-#### Step 1.1: Extract Single Chapter Processing
-- **Create**: `_generateSingleChapterSectionsStructure()` method
-- **Purpose**: Process one chapter independently
-- **Parameters**:
- - `chapter`: Chapter dict
- - `chapterIndex`: Index for ordering
- - `chapterId`, `chapterLevel`, `chapterTitle`: Chapter metadata
- - `generationHint`: Generation instructions
- - `contentPartIds`, `contentPartInstructions`: Content part info
- - `contentParts`: Full content parts list
- - `userPrompt`: User's original prompt
- - `language`: Language for generation
- - `parentOperationId`: For progress logging
-- **Returns**: None (modifies chapter dict in place)
-- **Error Handling**: Logs errors, raises exception to be caught by caller
-
-#### Step 1.2: Refactor Main Method
-- **Modify**: `_generateChapterSectionsStructure()`
-- **Changes**:
- 1. Collect all chapters with their indices
- 2. Create async tasks for each chapter using `_generateSingleChapterSectionsStructure`
- 3. Use `asyncio.gather()` to execute all tasks in parallel
- 4. Process results in order (using `zip` with original order)
- 5. Handle errors per chapter (don't fail entire operation)
- 6. Update progress after each chapter completes
-
-#### Step 1.3: Progress Reporting
-- **Maintain**: Overall progress tracking
-- **Update**: Progress after each chapter completes (not sequentially)
-- **Format**: "Chapter X/Y completed" or "Chapter X/Y error"
-
-### Phase 2: Section Content Generation Parallelization
-
-#### Step 2.1: Extract Single Section Processing
-- **Create**: `_processSingleSection()` method
-- **Purpose**: Process one section independently
-- **Parameters**:
- - `section`: Section dict
- - `sectionIndex`: Index for ordering
- - `totalSections`: Total sections in chapter
- - `chapterIndex`: Chapter index
- - `totalChapters`: Total chapters
- - `chapterId`: Chapter ID
- - `chapterOperationId`: Chapter progress operation ID
- - `fillOperationId`: Overall fill operation ID
- - `contentParts`: Full content parts list
- - `userPrompt`: User's original prompt
- - `all_sections_list`: All sections for context
- - `language`: Language for generation
- - `calculateOverallProgress`: Function to calculate overall progress
-- **Returns**: `List[Dict[str, Any]]` (elements for the section)
-- **Error Handling**: Returns error element instead of raising
-
-#### Step 2.2: Extract Section Processing Logic
-- **Create**: Helper methods for different processing paths:
- - `_processSectionAggregation()`: Handle aggregation path (multiple parts)
- - `_processSectionGeneration()`: Handle generation without parts (only generationHint)
- - `_processSectionParts()`: Handle individual part processing
-- **Purpose**: Keep logic organized and reusable
-
-#### Step 2.3: Refactor Main Method
-- **Modify**: `_fillChapterSections()`
-- **Changes**:
- 1. Keep sequential chapter processing (maintains order)
- 2. For each chapter, collect all sections with indices
- 3. Create async tasks for each section using `_processSingleSection`
- 4. Use `asyncio.gather()` to execute all section tasks in parallel
- 5. Process results in order (using `zip` with original order)
- 6. Assign elements to sections in correct order
- 7. Update progress after each section completes
- 8. Handle errors per section (don't fail entire chapter)
-
-#### Step 2.4: Progress Reporting
-- **Maintain**: Hierarchical progress tracking
-- **Update**:
- - Section progress: After each section completes
- - Chapter progress: After all sections in chapter complete
- - Overall progress: After each section/chapter completes
-- **Format**: "Chapter X/Y, Section A/B completed"
-
-## Key Considerations
-
-### Order Preservation
-- **Chapters**: Final result must preserve the original document order
-- **Sections**: Final result must preserve the original section order within each chapter
-- **Solution**: Use `asyncio.gather()` with an ordered task list, then `zip` results with the original order
-
-### Error Handling
-- **Chapters**: Error in one chapter should not stop others
-- **Sections**: Error in one section should not stop others
-- **Solution**: Use `return_exceptions=True` in `asyncio.gather()`, check `isinstance(result, Exception)`
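-
-A minimal sketch combining ordered results with per-task error handling (`processChapter` is a hypothetical stand-in for the per-chapter worker):
-
-```python
-import asyncio
-
-async def generateAllChapters(chapters):
-    # gather() returns results in task order, so building the task list in
-    # chapter order preserves the final ordering even though execution overlaps
-    tasks = [processChapter(chapter, idx) for idx, chapter in enumerate(chapters)]
-    results = await asyncio.gather(*tasks, return_exceptions=True)
-    for chapter, result in zip(chapters, results):  # zip restores original order
-        if isinstance(result, Exception):
-            chapter["error"] = str(result)  # one failed chapter does not stop the others
-```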
-
-### Progress Reporting
-- **Challenge**: Progress updates happen out of order
-- **Solution**: Update progress when each task completes, not sequentially
-- **Format**: Show completed count, not sequential position
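-
-A minimal sketch of completion-based progress reporting (the `updateProgress` callback and `counter` dict are illustrative):
-
-```python
-async def withProgress(coro, counter, total, updateProgress):
-    result = await coro
-    counter["done"] += 1  # fires when this task finishes, regardless of start order
-    updateProgress(f"{counter['done']}/{total} completed")
-    return result
-```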
-
-### Shared State
-- **Chapters**: Modify chapter dicts in place (safe, each chapter is independent)
-- **Sections**: Return elements, assign to sections in order (safe, each section is independent)
-- **Content Parts**: Read-only, passed to all tasks (safe)
-
-### Dependencies
-- **Chapters**: No dependencies between chapters
-- **Sections**: No dependencies between sections (each is self-contained)
-- **Solution**: All tasks can run truly in parallel
-
-## Implementation Steps
-
-### Step 1: Clean Current Code
-1. Ensure current sequential implementation is correct
-2. Fix any existing bugs
-3. Verify all tests pass
-
-### Step 2: Implement Chapter Parallelization
-1. Create `_generateSingleChapterSectionsStructure()` method
-2. Extract chapter processing logic
-3. Refactor `_generateChapterSectionsStructure()` to use parallel processing
-4. Test with single chapter
-5. Test with multiple chapters
-6. Verify order preservation
-7. Verify error handling
-
-### Step 3: Implement Section Parallelization
-1. Create `_processSingleSection()` method
-2. Extract section processing logic into helper methods
-3. Refactor `_fillChapterSections()` to use parallel processing for sections
-4. Test with single section
-5. Test with multiple sections
-6. Test with multiple chapters
-7. Verify order preservation
-8. Verify error handling
-
-### Step 4: Testing & Validation
-1. Test with various document structures
-2. Test error scenarios
-3. Verify progress reporting accuracy
-4. Performance testing (compare sequential vs parallel)
-5. Verify final output order matches input order
-
-## Code Structure
-
-### New Methods to Create
-
-```python
-async def _generateSingleChapterSectionsStructure(
- self,
- chapter: Dict[str, Any],
- chapterIndex: int,
- chapterId: str,
- chapterLevel: int,
- chapterTitle: str,
- generationHint: str,
- contentPartIds: List[str],
- contentPartInstructions: Dict[str, Any],
- contentParts: List[ContentPart],
- userPrompt: str,
- language: str,
- parentOperationId: str
-) -> None:
- """Generate sections structure for a single chapter (used for parallel processing)."""
- # Extract logic from current sequential loop
- # Modify chapter dict in place
- # Handle errors internally, raise if critical
-
-async def _processSingleSection(
- self,
- section: Dict[str, Any],
- sectionIndex: int,
- totalSections: int,
- chapterIndex: int,
- totalChapters: int,
- chapterId: str,
- chapterOperationId: str,
- fillOperationId: str,
- contentParts: List[ContentPart],
- userPrompt: str,
- all_sections_list: List[Dict[str, Any]],
- language: str,
- calculateOverallProgress: Callable
-) -> List[Dict[str, Any]]:
- """Process a single section and return its elements."""
- # Extract logic from current sequential loop
- # Return elements list
- # Return error element on failure (don't raise)
-
-async def _processSectionAggregation(
- self,
- section: Dict[str, Any],
- sectionId: str,
- sectionTitle: str,
- sectionIndex: int,
- totalSections: int,
- chapterId: str,
- chapterOperationId: str,
- fillOperationId: str,
- contentPartIds: List[str],
- contentFormats: Dict[str, str],
- contentParts: List[ContentPart],
- userPrompt: str,
- generationHint: str,
- all_sections_list: List[Dict[str, Any]],
- language: str
-) -> List[Dict[str, Any]]:
- """Process section with aggregation (multiple parts together)."""
- # Extract aggregation logic
- # Return elements list
-
-async def _processSectionGeneration(
- self,
- section: Dict[str, Any],
- sectionId: str,
- sectionTitle: str,
- sectionIndex: int,
- totalSections: int,
- chapterId: str,
- chapterOperationId: str,
- fillOperationId: str,
- contentType: str,
- userPrompt: str,
- generationHint: str,
- all_sections_list: List[Dict[str, Any]],
- language: str
-) -> List[Dict[str, Any]]:
- """Process section generation without content parts (only generationHint)."""
- # Extract generation logic
- # Return elements list
-
-async def _processSectionParts(
- self,
- section: Dict[str, Any],
- sectionId: str,
- sectionTitle: str,
- sectionIndex: int,
- totalSections: int,
- chapterId: str,
- chapterOperationId: str,
- fillOperationId: str,
- contentPartIds: List[str],
- contentFormats: Dict[str, str],
- contentParts: List[ContentPart],
- contentType: str,
- useAiCall: bool,
- generationHint: str,
- userPrompt: str,
- all_sections_list: List[Dict[str, Any]],
- language: str
-) -> List[Dict[str, Any]]:
- """Process individual parts in a section."""
- # Extract individual part processing logic
- # Return elements list
-```
-
-### Modified Methods
-
-```python
-async def _generateChapterSectionsStructure(
- self,
- chapterStructure: Dict[str, Any],
- contentParts: List[ContentPart],
- userPrompt: str,
- parentOperationId: str
-) -> Dict[str, Any]:
- """Generate sections structure for all chapters in parallel."""
- # Collect chapters with indices
- # Create tasks
- # Execute in parallel
- # Process results in order
- # Update progress
-
-async def _fillChapterSections(
- self,
- chapterStructure: Dict[str, Any],
- contentParts: List[ContentPart],
- userPrompt: str,
- fillOperationId: str
-) -> Dict[str, Any]:
- """Fill sections with content, processing sections in parallel within each chapter."""
- # Process chapters sequentially
- # For each chapter, process sections in parallel
- # Maintain order
- # Update progress
-```
-
-## Testing Strategy
-
-### Unit Tests
-1. Test `_generateSingleChapterSectionsStructure` independently
-2. Test `_processSingleSection` independently
-3. Test helper methods independently
-
-### Integration Tests
-1. Test parallel chapter processing with multiple chapters
-2. Test parallel section processing with multiple sections
-3. Test error handling (one chapter/section fails)
-4. Test order preservation
-
-### Performance Tests
-1. Measure sequential vs parallel execution time
-2. Verify parallel processing is faster
-3. Check resource usage (memory, CPU)
-
-## Risk Mitigation
-
-### Risks
-1. **Order not preserved**: Use `zip` with original order
-2. **Race conditions**: No shared mutable state between tasks
-3. **Progress reporting incorrect**: Update progress when tasks complete
-4. **Errors not handled**: Use `return_exceptions=True` and check results
-5. **Performance degradation**: Test and measure, fallback to sequential if needed
-
-### Safety Measures
-1. Keep sequential implementation as fallback (commented out)
-2. Add feature flag to enable/disable parallel processing
-3. Extensive logging for debugging
-4. Gradual rollout (test with small datasets first)
-
-## Migration Path
-
-1. **Phase 1**: Implement chapter parallelization, test thoroughly
-2. **Phase 2**: Implement section parallelization, test thoroughly
-3. **Phase 3**: Enable both in production with monitoring
-4. **Phase 4**: Remove sequential fallback code (if stable)
-
-## Notes
-
-- All async methods must use `await` correctly
-- Progress updates happen asynchronously (may appear out of order in logs)
-- Final result order is guaranteed by processing results in order
-- Error handling is per-task, not global
-- No shared mutable state between parallel tasks (read-only contentParts, independent chapter/section dicts)
-
diff --git a/modules/services/serviceAi/README_MODULE_STRUCTURE.md b/modules/services/serviceAi/README_MODULE_STRUCTURE.md
deleted file mode 100644
index d2fca8f5..00000000
--- a/modules/services/serviceAi/README_MODULE_STRUCTURE.md
+++ /dev/null
@@ -1,78 +0,0 @@
-# Module Structure - serviceAi
-
-## Übersicht
-
-Das `mainServiceAi.py` Modul wurde in mehrere Submodule aufgeteilt, um die Übersichtlichkeit zu verbessern.
-
-## Modulstruktur
-
-### Hauptmodul
-- **mainServiceAi.py** (~800 Zeilen)
- - Initialisierung (`__init__`, `create`, `ensureAiObjectsInitialized`)
- - Public API (`callAiPlanning`, `callAiContent`)
- - Routing zu Submodulen
- - Helper-Methoden
-
-### Submodules
-
-1. **subJsonResponseHandling.py** (already present)
- - JSON Response Merging
- - Section Merging
- - Fragment Detection
-
-2. **subResponseParsing.py** (~200 lines)
- - `ResponseParser.extractSectionsFromResponse()` - Extracts sections from AI responses
- - `ResponseParser.shouldContinueGeneration()` - Decides whether generation should continue
- - `ResponseParser._isStuckInLoop()` - Loop detection
- - `ResponseParser.extractDocumentMetadata()` - Extracts metadata
- - `ResponseParser.buildFinalResultFromSections()` - Builds the final JSON
-
-3. **subDocumentIntents.py** (~300 lines)
- - `DocumentIntentAnalyzer.clarifyDocumentIntents()` - Analyzes document intents
- - `DocumentIntentAnalyzer.resolvePreExtractedDocument()` - Resolves pre-extracted documents
- - `DocumentIntentAnalyzer._buildIntentAnalysisPrompt()` - Builds the intent analysis prompt
-
-4. **subContentExtraction.py** (~600 lines)
- - `ContentExtractor.extractAndPrepareContent()` - Extracts and prepares content
- - `ContentExtractor.extractTextFromImage()` - Vision AI for images
- - `ContentExtractor.processTextContentWithAi()` - AI processing of text content
- - `ContentExtractor._isBinary()` - Helper for binary detection
-
-5. **subStructureGeneration.py** (~200 lines)
- - `StructureGenerator.generateStructure()` - Generates the document structure
- - `StructureGenerator._buildStructurePrompt()` - Builds the structure prompt
-
-6. **subStructureFilling.py** (~400 lines)
- - `StructureFiller.fillStructure()` - Fills the structure with content
- - `StructureFiller._buildSectionGenerationPrompt()` - Builds the section generation prompt
- - `StructureFiller._findContentPartById()` - Helper for ContentPart lookup
- - `StructureFiller._needsAggregation()` - Decides whether aggregation is needed
-
-7. **subAiCallLooping.py** (~400 lines)
- - `AiCallLooper.callAiWithLooping()` - Main looping logic
- - `AiCallLooper._defineKpisFromPrompt()` - KPI definition
-
-## Usage
-
-All submodules are accessed through the main `AiService` module:
-
-```python
-# Initialization
-aiService = await AiService.create(serviceCenter)
-
-# Submodules are initialized automatically
-# aiService.responseParser
-# aiService.intentAnalyzer
-# aiService.contentExtractor
-# etc.
-```
-
-## Migration
-
-The public API remains unchanged. Internal methods were moved into submodules:
-
-- `_extractSectionsFromResponse` → `responseParser.extractSectionsFromResponse`
-- `_clarifyDocumentIntents` → `intentAnalyzer.clarifyDocumentIntents`
-- `_extractAndPrepareContent` → `contentExtractor.extractAndPrepareContent`
-- etc.
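-
-Each moved method can keep a thin delegate in `AiService` so callers are unaffected; a sketch (exact signatures may differ):
-
-```python
-class AiService:
-    def _extractSectionsFromResponse(self, response):
-        # Thin delegate: public surface unchanged, logic lives in the submodule.
-        return self.responseParser.extractSectionsFromResponse(response)
-```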
-
diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py
index a07aa441..e7bab8a3 100644
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@@ -222,18 +222,6 @@ Respond with ONLY a JSON object in this exact format:
prompt, options, debugPrefix, promptBuilder, promptArgs, operationId, userPrompt, contentParts, useCaseId
)
- async def _defineKpisFromPrompt(
- self,
- userPrompt: str,
- rawJsonString: Optional[str],
- continuationContext: Dict[str, Any],
- debugPrefix: str = "kpi"
- ) -> List[Dict[str, Any]]:
- """Delegate to AiCallLooper."""
- return await self.aiCallLooper._defineKpisFromPrompt(
- userPrompt, rawJsonString, continuationContext, debugPrefix
- )
-
# JSON merging logic moved to subJsonResponseHandling.py
def _extractSectionsFromResponse(
diff --git a/modules/services/serviceAi/merge_1.txt b/modules/services/serviceAi/merge_1.txt
new file mode 100644
index 00000000..7892d50a
--- /dev/null
+++ b/modules/services/serviceAi/merge_1.txt
@@ -0,0 +1,529 @@
+================================================================================
+JSON MERGE OPERATION #1
+================================================================================
+Timestamp: 2026-01-04T15:18:28.448964
+
+INPUT:
+ Accumulated length: 36937 chars
+ New Fragment length: 36843 chars
+ Accumulated: 223 lines (showing first 5 and last 5)
+ {
+ "elements": [
+ {
+ "type": "table",
+ "content": {
+ ... (213 lines omitted) ...
+ ["2111", "18433", "2112", "18439", "2113", "18443", "2114", "18451", "2115", "18457", "2116", "18461", "2117", "18481", "2118", "18493", "2119", "18503", "2120", "18517"],
+ ["2121", "18521", "2122", "18523", "2123", "18539", "2124", "18541", "2125", "18553", "2126", "18583", "2127", "18587", "2128", "18593", "2129", "18617", "2130", "18637"],
+ ["2131", "18661", "2132", "18671", "2133", "18679", "2134", "18691", "2135", "18701", "2136", "18713", "2137", "18719", "2138", "18731", "2139", "18743", "2140", "18749"],
+ ["2141", "18757", "2142", "18773", "2143", "18787", "2144", "18793", "2145", "18797", "2146", "18803", "2147", "18839", "2148", "18859", "2149", "18869", "2150", "18899"],
+ ["2151", "189
+ New Fragment: 209 lines (showing first 5 and last 5)
+ ```json
+ {
+ "elements": [
+ {
+ "type": "table",
+ ... (199 lines omitted) ...
+ ["4061", "38569", "4062", "38593", "4063", "38603", "4064", "38609", "4065", "38611", "4066", "38629", "4067", "38639", "4068", "38651", "4069", "38653", "4070", "38669"],
+ ["4071", "38671", "4072", "38677", "4073", "38693", "4074", "38699", "4075", "38707", "4076", "38711", "4077", "38713", "4078", "38723", "4079", "38729", "4080", "38737"],
+ ["4081", "38747", "4082", "38749", "4083", "38767", "4084", "38783", "4085", "38791", "4086", "38803", "4087", "38821", "4088", "38833", "4089", "38839", "4090", "38851"],
+ ["4091", "38861", "4092", "38867", "4093", "38873", "4094", "38891", "4095", "38903", "4096", "38917", "4097", "38921", "4098", "38923", "4099", "38933", "4100", "38953"],
+ ["4101", "38959", "4102", "38971", "4103", "38977", "4104", "38993", "4105", "39019", "4106", "39023", "4107
+
+
+ Normalized Accumulated (36937 chars)
+ (showing first 5 and last 5 of 223 lines)
+ {
+ "elements": [
+ {
+ "type": "table",
+ "content": {
+ ... (213 lines omitted) ...
+ ["2111", "18433", "2112", "18439", "2113", "18443", "2114", "18451", "2115", "18457", "2116", "18461", "2117", "18481", "2118", "18493", "2119", "18503", "2120", "18517"],
+ ["2121", "18521", "2122", "18523", "2123", "18539", "2124", "18541", "2125", "18553", "2126", "18583", "2127", "18587", "2128", "18593", "2129", "18617", "2130", "18637"],
+ ["2131", "18661", "2132", "18671", "2133", "18679", "2134", "18691", "2135", "18701", "2136", "18713", "2137", "18719", "2138", "18731", "2139", "18743", "2140", "18749"],
+ ["2141", "18757", "2142", "18773", "2143", "18787", "2144", "18793", "2145", "18797", "2146", "18803", "2147", "18839", "2148", "18859", "2149", "18869", "2150", "18899"],
+ ["2151", "189
+
+ Normalized New Fragment (36835 chars)
+ (showing first 5 and last 5 of 208 lines)
+ {
+ "elements": [
+ {
+ "type": "table",
+ "content": {
+ ... (198 lines omitted) ...
+ ["4061", "38569", "4062", "38593", "4063", "38603", "4064", "38609", "4065", "38611", "4066", "38629", "4067", "38639", "4068", "38651", "4069", "38653", "4070", "38669"],
+ ["4071", "38671", "4072", "38677", "4073", "38693", "4074", "38699", "4075", "38707", "4076", "38711", "4077", "38713", "4078", "38723", "4079", "38729", "4080", "38737"],
+ ["4081", "38747", "4082", "38749", "4083", "38767", "4084", "38783", "4085", "38791", "4086", "38803", "4087", "38821", "4088", "38833", "4089", "38839", "4090", "38851"],
+ ["4091", "38861", "4092", "38867", "4093", "38873", "4094", "38891", "4095", "38903", "4096", "38917", "4097", "38921", "4098", "38923", "4099", "38933", "4100", "38953"],
+ ["4101", "38959", "4102", "38971", "4103", "38977", "4104", "38993", "4105", "39019", "4106", "39023", "4107
+STEP: PHASE 1
+ Description: Finding overlap between JSON strings
+ ⏳ In progress...
+
+ Overlap Detection (string):
+ Overlap length: 0
+ ⚠️ No overlap detected - appending all
+
+ ⚠️ NO OVERLAP FOUND - This indicates iterations should stop
+ Closing JSON and returning final result
+
+ Closed JSON (36944 chars):
+ ==============================================================================
+ {
+ "elements": [
+ {
+ "type": "table",
+ "content": {
+ "headers": ["Nr.1", "Primzahl1", "Nr.2", "Primzahl2", "Nr.3", "Primzahl3", "Nr.4", "Primzahl4", "Nr.5", "Primzahl5", "Nr.6", "Primzahl6", "Nr.7", "Primzahl7", "Nr.8", "Primzahl8", "Nr.9", "Primzahl9", "Nr.10", "Primzahl10"],
+ "rows": [
+ ["1", "2", "2", "3", "3", "5", "4", "7", "5", "11", "6", "13", "7", "17", "8", "19", "9", "23", "10", "29"],
+ ["11", "31", "12", "37", "13", "41", "14", "43", "15", "47", "16", "53", "17", "59", "18", "61", "19", "67", "20", "71"],
+ ["21", "73", "22", "79", "23", "83", "24", "89", "25", "97", "26", "101", "27", "103", "28", "107", "29", "109", "30", "113"],
+ ["31", "127", "32", "131", "33", "137", "34", "139", "35", "149", "36", "151", "37", "157", "38", "163", "39", "167", "40", "173"],
+ ["41", "179", "42", "181", "43", "191", "44", "193", "45", "197", "46", "199", "47", "211", "48", "223", "49", "227", "50", "229"],
+ ["51", "233", "52", "239", "53", "241", "54", "251", "55", "257", "56", "263", "57", "269", "58", "271", "59", "277", "60", "281"],
+ ["61", "283", "62", "293", "63", "307", "64", "311", "65", "313", "66", "317", "67", "331", "68", "337", "69", "347", "70", "349"],
+ ["71", "353", "72", "359", "73", "367", "74", "373", "75", "379", "76", "383", "77", "389", "78", "397", "79", "401", "80", "409"],
+ ["81", "419", "82", "421", "83", "431", "84", "433", "85", "439", "86", "443", "87", "449", "88", "457", "89", "461", "90", "463"],
+ ["91", "467", "92", "479", "93", "487", "94", "491", "95", "499", "96", "503", "97", "509", "98", "521", "99", "523", "100", "541"],
+ ["101", "547", "102", "557", "103", "563", "104", "569", "105", "571", "106", "577", "107", "587", "108", "593", "109", "599", "110", "601"],
+ ["111", "607", "112", "613", "113", "617", "114", "619", "115", "631", "116", "641", "117", "643", "118", "647", "119", "653", "120", "659"],
+ ["121", "661", "122", "673", "123", "677", "124", "683", "125", "691", "126", "701", "127", "709", "128", "719", "129", "727", "130", "733"],
+ ["131", "739", "132", "743", "133", "751", "134", "757", "135", "761", "136", "769", "137", "773", "138", "787", "139", "797", "140", "809"],
+ ["141", "811", "142", "821", "143", "823", "144", "827", "145", "829", "146", "839", "147", "853", "148", "857", "149", "859", "150", "863"],
+ ["151", "877", "152", "881", "153", "883", "154", "887", "155", "907", "156", "911", "157", "919", "158", "929", "159", "937", "160", "941"],
+ ["161", "947", "162", "953", "163", "967", "164", "971", "165", "977", "166", "983", "167", "991", "168", "997", "169", "1009", "170", "1013"],
+ ["171", "1019", "172", "1021", "173", "1031", "174", "1033", "175", "1039", "176", "1049", "177", "1051", "178", "1061", "179", "1063", "180", "1069"],
+ ["181", "1087", "182", "1091", "183", "1093", "184", "1097", "185", "1103", "186", "1109", "187", "1117", "188", "1123", "189", "1129", "190", "1151"],
+ ["191", "1153", "192", "1163", "193", "1171", "194", "1181", "195", "1187", "196", "1193", "197", "1201", "198", "1213", "199", "1217", "200", "1223"],
+ ["201", "1229", "202", "1231", "203", "1237", "204", "1249", "205", "1259", "206", "1277", "207", "1279", "208", "1283", "209", "1289", "210", "1291"],
+ ["211", "1297", "212", "1301", "213", "1303", "214", "1307", "215", "1319", "216", "1321", "217", "1327", "218", "1361", "219", "1367", "220", "1373"],
+ ["221", "1381", "222", "1399", "223", "1409", "224", "1423", "225", "1427", "226", "1429", "227", "1433", "228", "1439", "229", "1447", "230", "1451"],
+ ["231", "1453", "232", "1459", "233", "1471", "234", "1481", "235", "1483", "236", "1487", "237", "1489", "238", "1493", "239", "1499", "240", "1511"],
+ ["241", "1523", "242", "1531", "243", "1543", "244", "1549", "245", "1553", "246", "1559", "247", "1567", "248", "1571", "249", "1579", "250", "1583"],
+ ["251", "1597", "252", "1601", "253", "1607", "254", "1609", "255", "1613", "256", "1619", "257", "1621", "258", "1627", "259", "1637", "260", "1657"],
+ ["261", "1663", "262", "1667", "263", "1669", "264", "1693", "265", "1697", "266", "1699", "267", "1709", "268", "1721", "269", "1723", "270", "1733"],
+ ["271", "1741", "272", "1747", "273", "1753", "274", "1759", "275", "1777", "276", "1783", "277", "1787", "278", "1789", "279", "1801", "280", "1811"],
+ ["281", "1823", "282", "1831", "283", "1847", "284", "1861", "285", "1867", "286", "1871", "287", "1873", "288", "1877", "289", "1879", "290", "1889"],
+ ["291", "1901", "292", "1907", "293", "1913", "294", "1931", "295", "1933", "296", "1949", "297", "1951", "298", "1973", "299", "1979", "300", "1987"],
+ ["301", "1993", "302", "1997", "303", "1999", "304", "2003", "305", "2011", "306", "2017", "307", "2027", "308", "2029", "309", "2039", "310", "2053"],
+ ["311", "2063", "312", "2069", "313", "2081", "314", "2083", "315", "2087", "316", "2089", "317", "2099", "318", "2111", "319", "2113", "320", "2129"],
+ ["321", "2131", "322", "2137", "323", "2141", "324", "2143", "325", "2153", "326", "2161", "327", "2179", "328", "2203", "329", "2207", "330", "2213"],
+ ["331", "2221", "332", "2237", "333", "2239", "334", "2243", "335", "2251", "336", "2267", "337", "2269", "338", "2273", "339", "2281", "340", "2287"],
+ ["341", "2293", "342", "2297", "343", "2309", "344", "2311", "345", "2333", "346", "2339", "347", "2341", "348", "2347", "349", "2351", "350", "2357"],
+ ["351", "2371", "352", "2377", "353", "2381", "354", "2383", "355", "2389", "356", "2393", "357", "2399", "358", "2411", "359", "2417", "360", "2423"],
+ ["361", "2437", "362", "2441", "363", "2447", "364", "2459", "365", "2467", "366", "2473", "367", "2477", "368", "2503", "369", "2521", "370", "2531"],
+ ["371", "2539", "372", "2543", "373", "2549", "374", "2551", "375", "2557", "376", "2579", "377", "2591", "378", "2593", "379", "2609", "380", "2617"],
+ ["381", "2621", "382", "2633", "383", "2647", "384", "2657", "385", "2659", "386", "2663", "387", "2671", "388", "2677", "389", "2683", "390", "2687"],
+ ["391", "2689", "392", "2693", "393", "2699", "394", "2707", "395", "2711", "396", "2713", "397", "2719", "398", "2729", "399", "2731", "400", "2741"],
+ ["401", "2749", "402", "2753", "403", "2767", "404", "2777", "405", "2789", "406", "2791", "407", "2797", "408", "2801", "409", "2803", "410", "2819"],
+ ["411", "2833", "412", "2837", "413", "2843", "414", "2851", "415", "2857", "416", "2861", "417", "2879", "418", "2887", "419", "2897", "420", "2903"],
+ ["421", "2909", "422", "2917", "423", "2927", "424", "2939", "425", "2953", "426", "2957", "427", "2963", "428", "2969", "429", "2971", "430", "2999"],
+ ["431", "3001", "432", "3011", "433", "3019", "434", "3023", "435", "3037", "436", "3041", "437", "3049", "438", "3061", "439", "3067", "440", "3079"],
+ ["441", "3083", "442", "3089", "443", "3109", "444", "3119", "445", "3121", "446", "3137", "447", "3163", "448", "3167", "449", "3169", "450", "3181"],
+ ["451", "3187", "452", "3191", "453", "3203", "454", "3209", "455", "3217", "456", "3221", "457", "3229", "458", "3251", "459", "3253", "460", "3257"],
+ ["461", "3259", "462", "3271", "463", "3299", "464", "3301", "465", "3307", "466", "3313", "467", "3319", "468", "3323", "469", "3329", "470", "3331"],
+ ["471", "3343", "472", "3347", "473", "3359", "474", "3361", "475", "3371", "476", "3373", "477", "3389", "478", "3391", "479", "3407", "480", "3413"],
+ ["481", "3433", "482", "3449", "483", "3457", "484", "3461", "485", "3463", "486", "3467", "487", "3469", "488", "3491", "489", "3499", "490", "3511"],
+ ["491", "3517", "492", "3527", "493", "3529", "494", "3533", "495", "3539", "496", "3541", "497", "3547", "498", "3557", "499", "3559", "500", "3571"],
+ ["501", "3581", "502", "3583", "503", "3593", "504", "3607", "505", "3613", "506", "3617", "507", "3623", "508", "3631", "509", "3637", "510", "3643"],
+ ["511", "3659", "512", "3671", "513", "3673", "514", "3677", "515", "3691", "516", "3697", "517", "3701", "518", "3709", "519", "3719", "520", "3727"],
+ ["521", "3733", "522", "3739", "523", "3761", "524", "3767", "525", "3769", "526", "3779", "527", "3793", "528", "3797", "529", "3803", "530", "3821"],
+ ["531", "3823", "532", "3833", "533", "3847", "534", "3851", "535", "3853", "536", "3863", "537", "3877", "538", "3881", "539", "3889", "540", "3907"],
+ ["541", "3911", "542", "3917", "543", "3919", "544", "3923", "545", "3929", "546", "3931", "547", "3943", "548", "3947", "549", "3967", "550", "3989"],
+ ["551", "4001", "552", "4003", "553", "4007", "554", "4013", "555", "4019", "556", "4021", "557", "4027", "558", "4049", "559", "4051", "560", "4057"],
+ ["561", "4073", "562", "4079", "563", "4091", "564", "4093", "565", "4099", "566", "4111", "567", "4127", "568", "4129", "569", "4133", "570", "4139"],
+ ["571", "4153", "572", "4157", "573", "4159", "574", "4177", "575", "4201", "576", "4211", "577", "4217", "578", "4219", "579", "4229", "580", "4231"],
+ ["581", "4241", "582", "4243", "583", "4253", "584", "4259", "585", "4261", "586", "4271", "587", "4273", "588", "4283", "589", "4289", "590", "4297"],
+ ["591", "4327", "592", "4337", "593", "4339", "594", "4349", "595", "4357", "596", "4363", "597", "4373", "598", "4391", "599", "4397", "600", "4409"],
+ ["601", "4421", "602", "4423", "603", "4441", "604", "4447", "605", "4451", "606", "4457", "607", "4463", "608", "4481", "609", "4483", "610", "4493"],
+ ["611", "4507", "612", "4513", "613", "4517", "614", "4519", "615", "4523", "616", "4547", "617", "4549", "618", "4561", "619", "4567", "620", "4583"],
+ ["621", "4591", "622", "4597", "623", "4603", "624", "4621", "625", "4637", "626", "4639", "627", "4643", "628", "4649", "629", "4651", "630", "4657"],
+ ["631", "4663", "632", "4673", "633", "4679", "634", "4691", "635", "4703", "636", "4721", "637", "4723", "638", "4729", "639", "4733", "640", "4751"],
+ ["641", "4759", "642", "4783", "643", "4787", "644", "4789", "645", "4793", "646", "4799", "647", "4801", "648", "4813", "649", "4817", "650", "4831"],
+ ["651", "4861", "652", "4871", "653", "4877", "654", "4889", "655", "4903", "656", "4909", "657", "4919", "658", "4931", "659", "4933", "660", "4937"],
+ ["661", "4943", "662", "4951", "663", "4957", "664", "4967", "665", "4969", "666", "4973", "667", "4987", "668", "4993", "669", "4999", "670", "5003"],
+ ["671", "5009", "672", "5011", "673", "5021", "674", "5023", "675", "5039", "676", "5051", "677", "5059", "678", "5077", "679", "5081", "680", "5087"],
+ ["681", "5099", "682", "5101", "683", "5107", "684", "5113", "685", "5119", "686", "5147", "687", "5153", "688", "5167", "689", "5171", "690", "5179"],
+ ["691", "5189", "692", "5197", "693", "5209", "694", "5227", "695", "5231", "696", "5233", "697", "5237", "698", "5261", "699", "5273", "700", "5279"],
+ ["701", "5281", "702", "5297", "703", "5303", "704", "5309", "705", "5323", "706", "5333", "707", "5347", "708", "5351", "709", "5381", "710", "5387"],
+ ["711", "5393", "712", "5399", "713", "5407", "714", "5413", "715", "5417", "716", "5419", "717", "5431", "718", "5437", "719", "5441", "720", "5443"],
+ ["721", "5449", "722", "5471", "723", "5477", "724", "5479", "725", "5483", "726", "5501", "727", "5503", "728", "5507", "729", "5519", "730", "5521"],
+ ["731", "5527", "732", "5531", "733", "5557", "734", "5563", "735", "5569", "736", "5573", "737", "5581", "738", "5591", "739", "5623", "740", "5639"],
+ ["741", "5641", "742", "5647", "743", "5651", "744", "5653", "745", "5657", "746", "5659", "747", "5669", "748", "5683", "749", "5689", "750", "5693"],
+ ["751", "5701", "752", "5711", "753", "5717", "754", "5737", "755", "5741", "756", "5743", "757", "5749", "758", "5779", "759", "5783", "760", "5791"],
+ ["761", "5801", "762", "5807", "763", "5813", "764", "5821", "765", "5827", "766", "5839", "767", "5843", "768", "5849", "769", "5851", "770", "5857"],
+ ["771", "5861", "772", "5867", "773", "5869", "774", "5879", "775", "5881", "776", "5897", "777", "5903", "778", "5923", "779", "5927", "780", "5939"],
+ ["781", "5953", "782", "5981", "783", "5987", "784", "6007", "785", "6011", "786", "6029", "787", "6037", "788", "6043", "789", "6047", "790", "6053"],
+ ["791", "6067", "792", "6073", "793", "6079", "794", "6089", "795", "6091", "796", "6101", "797", "6113", "798", "6121", "799", "6131", "800", "6133"],
+ ["801", "6143", "802", "6151", "803", "6163", "804", "6173", "805", "6197", "806", "6199", "807", "6203", "808", "6211", "809", "6217", "810", "6221"],
+ ["811", "6229", "812", "6247", "813", "6257", "814", "6263", "815", "6269", "816", "6271", "817", "6277", "818", "6287", "819", "6299", "820", "6301"],
+ ["821", "6311", "822", "6317", "823", "6323", "824", "6329", "825", "6337", "826", "6343", "827", "6353", "828", "6359", "829", "6361", "830", "6367"],
+ ["831", "6373", "832", "6379", "833", "6389", "834", "6397", "835", "6421", "836", "6427", "837", "6449", "838", "6451", "839", "6469", "840", "6473"],
+ ["841", "6481", "842", "6491", "843", "6521", "844", "6529", "845", "6547", "846", "6551", "847", "6553", "848", "6563", "849", "6569", "850", "6571"],
+ ["851", "6577", "852", "6581", "853", "6599", "854", "6607", "855", "6619", "856", "6637", "857", "6653", "858", "6659", "859", "6661", "860", "6673"],
+ ["861", "6679", "862", "6689", "863", "6691", "864", "6701", "865", "6703", "866", "6709", "867", "6719", "868", "6733", "869", "6737", "870", "6761"],
+ ["871", "6763", "872", "6779", "873", "6781", "874", "6791", "875", "6793", "876", "6803", "877", "6823", "878", "6827", "879", "6829", "880", "6833"],
+ ["881", "6841", "882", "6857", "883", "6863", "884", "6869", "885", "6871", "886", "6883", "887", "6899", "888", "6907", "889", "6911", "890", "6917"],
+ ["891", "6947", "892", "6949", "893", "6959", "894", "6961", "895", "6967", "896", "6971", "897", "6977", "898", "6983", "899", "6991", "900", "6997"],
+ ["901", "7001", "902", "7013", "903", "7019", "904", "7027", "905", "7039", "906", "7043", "907", "7057", "908", "7069", "909", "7079", "910", "7103"],
+ ["911", "7109", "912", "7121", "913", "7127", "914", "7129", "915", "7151", "916", "7159", "917", "7177", "918", "7187", "919", "7193", "920", "7207"],
+ ["921", "7211", "922", "7213", "923", "7219", "924", "7229", "925", "7237", "926", "7243", "927", "7247", "928", "7253", "929", "7283", "930", "7297"],
+ ["931", "7307", "932", "7309", "933", "7321", "934", "7331", "935", "7333", "936", "7349", "937", "7351", "938", "7369", "939", "7393", "940", "7411"],
+ ["941", "7417", "942", "7433", "943", "7451", "944", "7457", "945", "7459", "946", "7477", "947", "7481", "948", "7487", "949", "7489", "950", "7499"],
+ ["951", "7507", "952", "7517", "953", "7523", "954", "7529", "955", "7537", "956", "7541", "957", "7547", "958", "7549", "959", "7559", "960", "7561"],
+ ["961", "7573", "962", "7577", "963", "7583", "964", "7589", "965", "7591", "966", "7603", "967", "7607", "968", "7621", "969", "7639", "970", "7643"],
+ ["971", "7649", "972", "7669", "973", "7673", "974", "7681", "975", "7687", "976", "7691", "977", "7699", "978", "7703", "979", "7717", "980", "7723"],
+ ["981", "7727", "982", "7741", "983", "7753", "984", "7757", "985", "7759", "986", "7789", "987", "7793", "988", "7817", "989", "7823", "990", "7829"],
+ ["991", "7841", "992", "7853", "993", "7867", "994", "7873", "995", "7877", "996", "7879", "997", "7883", "998", "7901", "999", "7907", "1000", "7919"],
+ ["1001", "7927", "1002", "7933", "1003", "7937", "1004", "7949", "1005", "7951", "1006", "7963", "1007", "7993", "1008", "8009", "1009", "8011", "1010", "8017"],
+ ["1011", "8039", "1012", "8053", "1013", "8059", "1014", "8069", "1015", "8081", "1016", "8087", "1017", "8089", "1018", "8093", "1019", "8101", "1020", "8111"],
+ ["1021", "8117", "1022", "8123", "1023", "8147", "1024", "8161", "1025", "8167", "1026", "8171", "1027", "8179", "1028", "8191", "1029", "8209", "1030", "8219"],
+ ["1031", "8221", "1032", "8231", "1033", "8233", "1034", "8237", "1035", "8243", "1036", "8263", "1037", "8269", "1038", "8273", "1039", "8287", "1040", "8291"],
+ ["1041", "8293", "1042", "8297", "1043", "8311", "1044", "8317", "1045", "8329", "1046", "8353", "1047", "8363", "1048", "8369", "1049", "8377", "1050", "8387"],
+ ["1051", "8389", "1052", "8419", "1053", "8423", "1054", "8429", "1055", "8431", "1056", "8443", "1057", "8447", "1058", "8461", "1059", "8467", "1060", "8501"],
+ ["1061", "8513", "1062", "8521", "1063", "8527", "1064", "8537", "1065", "8539", "1066", "8543", "1067", "8563", "1068", "8573", "1069", "8581", "1070", "8597"],
+ ["1071", "8599", "1072", "8609", "1073", "8623", "1074", "8627", "1075", "8629", "1076", "8641", "1077", "8647", "1078", "8663", "1079", "8669", "1080", "8677"],
+ ["1081", "8681", "1082", "8689", "1083", "8693", "1084", "8699", "1085", "8707", "1086", "8713", "1087", "8719", "1088", "8731", "1089", "8737", "1090", "8741"],
+ ["1091", "8747", "1092", "8753", "1093", "8761", "1094", "8779", "1095", "8783", "1096", "8803", "1097", "8807", "1098", "8819", "1099", "8821", "1100", "8831"],
+ ["1101", "8837", "1102", "8839", "1103", "8849", "1104", "8861", "1105", "8863", "1106", "8867", "1107", "8887", "1108", "8893", "1109", "8923", "1110", "8929"],
+ ["1111", "8933", "1112", "8941", "1113", "8951", "1114", "8963", "1115", "8969", "1116", "8971", "1117", "8999", "1118", "9001", "1119", "9007", "1120", "9011"],
+ ["1121", "9013", "1122", "9029", "1123", "9041", "1124", "9043", "1125", "9049", "1126", "9059", "1127", "9067", "1128", "9091", "1129", "9103", "1130", "9109"],
+ ["1131", "9127", "1132", "9133", "1133", "9137", "1134", "9151", "1135", "9157", "1136", "9161", "1137", "9173", "1138", "9181", "1139", "9187", "1140", "9199"],
+ ["1141", "9203", "1142", "9209", "1143", "9221", "1144", "9227", "1145", "9239", "1146", "9241", "1147", "9257", "1148", "9277", "1149", "9281", "1150", "9283"],
+ ["1151", "9293", "1152", "9311", "1153", "9319", "1154", "9323", "1155", "9337", "1156", "9341", "1157", "9343", "1158", "9349", "1159", "9371", "1160", "9377"],
+ ["1161", "9391", "1162", "9397", "1163", "9403", "1164", "9413", "1165", "9419", "1166", "9421", "1167", "9431", "1168", "9433", "1169", "9437", "1170", "9439"],
+ ["1171", "9461", "1172", "9463", "1173", "9467", "1174", "9473", "1175", "9479", "1176", "9491", "1177", "9497", "1178", "9511", "1179", "9521", "1180", "9533"],
+ ["1181", "9539", "1182", "9547", "1183", "9551", "1184", "9587", "1185", "9601", "1186", "9613", "1187", "9619", "1188", "9623", "1189", "9629", "1190", "9631"],
+ ["1191", "9643", "1192", "9649", "1193", "9661", "1194", "9677", "1195", "9679", "1196", "9689", "1197", "9697", "1198", "9719", "1199", "9721", "1200", "9733"],
+ ["1201", "9739", "1202", "9743", "1203", "9749", "1204", "9767", "1205", "9769", "1206", "9781", "1207", "9787", "1208", "9791", "1209", "9803", "1210", "9811"],
+ ["1211", "9817", "1212", "9829", "1213", "9833", "1214", "9839", "1215", "9851", "1216", "9857", "1217", "9859", "1218", "9871", "1219", "9883", "1220", "9887"],
+ ["1221", "9901", "1222", "9907", "1223", "9923", "1224", "9929", "1225", "9931", "1226", "9941", "1227", "9949", "1228", "9967", "1229", "9973", "1230", "10007"],
+ ["1231", "10009", "1232", "10037", "1233", "10039", "1234", "10061", "1235", "10067", "1236", "10069", "1237", "10079", "1238", "10091", "1239", "10093", "1240", "10099"],
+ ["1241", "10103", "1242", "10111", "1243", "10133", "1244", "10139", "1245", "10141", "1246", "10151", "1247", "10159", "1248", "10163", "1249", "10169", "1250", "10177"],
+ ["1251", "10181", "1252", "10193", "1253", "10211", "1254", "10223", "1255", "10243", "1256", "10247", "1257", "10253", "1258", "10259", "1259", "10267", "1260", "10271"],
+ ["1261", "10273", "1262", "10289", "1263", "10301", "1264", "10303", "1265", "10313", "1266", "10321", "1267", "10331", "1268", "10333", "1269", "10337", "1270", "10343"],
+ ["1271", "10357", "1272", "10369", "1273", "10391", "1274", "10399", "1275", "10427", "1276", "10429", "1277", "10433", "1278", "10453", "1279", "10457", "1280", "10459"],
+ ["1281", "10463", "1282", "10477", "1283", "10487", "1284", "10499", "1285", "10501", "1286", "10513", "1287", "10529", "1288", "10531", "1289", "10559", "1290", "10567"],
+ ["1291", "10589", "1292", "10597", "1293", "10601", "1294", "10607", "1295", "10613", "1296", "10627", "1297", "10631", "1298", "10639", "1299", "10651", "1300", "10657"],
+ ["1301", "10663", "1302", "10667", "1303", "10687", "1304", "10691", "1305", "10709", "1306", "10711", "1307", "10723", "1308", "10729", "1309", "10733", "1310", "10739"],
+ ["1311", "10753", "1312", "10771", "1313", "10781", "1314", "10789", "1315", "10799", "1316", "10831", "1317", "10837", "1318", "10847", "1319", "10853", "1320", "10859"],
+ ["1321", "10861", "1322", "10867", "1323", "10883", "1324", "10889", "1325", "10891", "1326", "10903", "1327", "10909", "1328", "10937", "1329", "10939", "1330", "10949"],
+ ["1331", "10957", "1332", "10973", "1333", "10979", "1334", "10987", "1335", "10993", "1336", "11003", "1337", "11027", "1338", "11047", "1339", "11057", "1340", "11059"],
+ ["1341", "11069", "1342", "11071", "1343", "11083", "1344", "11087", "1345", "11093", "1346", "11113", "1347", "11117", "1348", "11119", "1349", "11131", "1350", "11149"],
+ ["1351", "11159", "1352", "11161", "1353", "11171", "1354", "11173", "1355", "11177", "1356", "11197", "1357", "11213", "1358", "11239", "1359", "11243", "1360", "11251"],
+ ["1361", "11257", "1362", "11261", "1363", "11273", "1364", "11279", "1365", "11287", "1366", "11299", "1367", "11311", "1368", "11317", "1369", "11321", "1370", "11329"],
+ ["1371", "11351", "1372", "11353", "1373", "11369", "1374", "11383", "1375", "11393", "1376", "11399", "1377", "11411", "1378", "11423", "1379", "11437", "1380", "11443"],
+ ["1381", "11447", "1382", "11467", "1383", "11471", "1384", "11483", "1385", "11489", "1386", "11491", "1387", "11497", "1388", "11503", "1389", "11519", "1390", "11527"],
+ ["1391", "11549", "1392", "11551", "1393", "11579", "1394", "11587", "1395", "11593", "1396", "11597", "1397", "11617", "1398", "11621", "1399", "11633", "1400", "11657"],
+ ["1401", "11677", "1402", "11681", "1403", "11689", "1404", "11699", "1405", "11701", "1406", "11717", "1407", "11719", "1408", "11731", "1409", "11743", "1410", "11777"],
+ ["1411", "11779", "1412", "11783", "1413", "11789", "1414", "11801", "1415", "11807", "1416", "11813", "1417", "11821", "1418", "11827", "1419", "11831", "1420", "11833"],
+ ["1421", "11839", "1422", "11863", "1423", "11867", "1424", "11887", "1425", "11897", "1426", "11903", "1427", "11909", "1428", "11923", "1429", "11927", "1430", "11933"],
+ ["1431", "11939", "1432", "11941", "1433", "11953", "1434", "11959", "1435", "11969", "1436", "11971", "1437", "11981", "1438", "11987", "1439", "12007", "1440", "12011"],
+ ["1441", "12037", "1442", "12041", "1443", "12043", "1444", "12049", "1445", "12071", "1446", "12073", "1447", "12097", "1448", "12101", "1449", "12107", "1450", "12109"],
+ ["1451", "12113", "1452", "12119", "1453", "12143", "1454", "12149", "1455", "12157", "1456", "12161", "1457", "12163", "1458", "12197", "1459", "12203", "1460", "12211"],
+ ["1461", "12227", "1462", "12239", "1463", "12241", "1464", "12251", "1465", "12253", "1466", "12263", "1467", "12269", "1468", "12277", "1469", "12281", "1470", "12289"],
+ ["1471", "12301", "1472", "12323", "1473", "12329", "1474", "12343", "1475", "12347", "1476", "12373", "1477", "12377", "1478", "12379", "1479", "12391", "1480", "12401"],
+ ["1481", "12409", "1482", "12413", "1483", "12421", "1484", "12433", "1485", "12437", "1486", "12451", "1487", "12457", "1488", "12473", "1489", "12479", "1490", "12487"],
+ ["1491", "12491", "1492", "12497", "1493", "12503", "1494", "12511", "1495", "12517", "1496", "12527", "1497", "12539", "1498", "12541", "1499", "12547", "1500", "12553"],
+ ["1501", "12569", "1502", "12577", "1503", "12583", "1504", "12589", "1505", "12601", "1506", "12611", "1507", "12613", "1508", "12619", "1509", "12637", "1510", "12641"],
+ ["1511", "12647", "1512", "12653", "1513", "12659", "1514", "12671", "1515", "12689", "1516", "12697", "1517", "12703", "1518", "12713", "1519", "12721", "1520", "12739"],
+ ["1521", "12743", "1522", "12757", "1523", "12763", "1524", "12781", "1525", "12791", "1526", "12799", "1527", "12809", "1528", "12821", "1529", "12823", "1530", "12829"],
+ ["1531", "12841", "1532", "12853", "1533", "12889", "1534", "12893", "1535", "12899", "1536", "12907", "1537", "12911", "1538", "12917", "1539", "12919", "1540", "12923"],
+ ["1541", "12941", "1542", "12953", "1543", "12959", "1544", "12967", "1545", "12973", "1546", "12979", "1547", "12983", "1548", "13001", "1549", "13003", "1550", "13007"],
+ ["1551", "13009", "1552", "13033", "1553", "13037", "1554", "13043", "1555", "13049", "1556", "13063", "1557", "13093", "1558", "13099", "1559", "13103", "1560", "13109"],
+ ["1561", "13121", "1562", "13127", "1563", "13147", "1564", "13151", "1565", "13159", "1566", "13163", "1567", "13171", "1568", "13177", "1569", "13183", "1570", "13187"],
+ ["1571", "13217", "1572", "13219", "1573", "13229", "1574", "13241", "1575", "13249", "1576", "13259", "1577", "13267", "1578", "13291", "1579", "13297", "1580", "13309"],
+ ["1581", "13313", "1582", "13327", "1583", "13331", "1584", "13337", "1585", "13339", "1586", "13367", "1587", "13381", "1588", "13397", "1589", "13399", "1590", "13411"],
+ ["1591", "13417", "1592", "13421", "1593", "13441", "1594", "13451", "1595", "13457", "1596", "13463", "1597", "13469", "1598", "13477", "1599", "13487", "1600", "13499"],
+ ["1601", "13513", "1602", "13523", "1603", "13537", "1604", "13553", "1605", "13567", "1606", "13577", "1607", "13591", "1608", "13597", "1609", "13613", "1610", "13619"],
+ ["1611", "13627", "1612", "13633", "1613", "13649", "1614", "13669", "1615", "13679", "1616", "13681", "1617", "13687", "1618", "13691", "1619", "13693", "1620", "13697"],
+ ["1621", "13709", "1622", "13711", "1623", "13721", "1624", "13723", "1625", "13729", "1626", "13751", "1627", "13757", "1628", "13759", "1629", "13763", "1630", "13781"],
+ ["1631", "13789", "1632", "13799", "1633", "13807", "1634", "13829", "1635", "13831", "1636", "13841", "1637", "13859", "1638", "13873", "1639", "13877", "1640", "13879"],
+ ["1641", "13883", "1642", "13901", "1643", "13903", "1644", "13907", "1645", "13913", "1646", "13921", "1647", "13931", "1648", "13933", "1649", "13963", "1650", "13967"],
+ ["1651", "13997", "1652", "13999", "1653", "14009", "1654", "14011", "1655", "14029", "1656", "14033", "1657", "14051", "1658", "14057", "1659", "14071", "1660", "14081"],
+ ["1661", "14083", "1662", "14087", "1663", "14107", "1664", "14143", "1665", "14149", "1666", "14153", "1667", "14159", "1668", "14173", "1669", "14177", "1670", "14197"],
+ ["1671", "14207", "1672", "14221", "1673", "14243", "1674", "14249", "1675", "14251", "1676", "14281", "1677", "14293", "1678", "14303", "1679", "14321", "1680", "14323"],
+ ["1681", "14327", "1682", "14341", "1683", "14347", "1684", "14369", "1685", "14387", "1686", "14389", "1687", "14401", "1688", "14407", "1689", "14411", "1690", "14419"],
+ ["1691", "14423", "1692", "14431", "1693", "14437", "1694", "14447", "1695", "14449", "1696", "14461", "1697", "14479", "1698", "14489", "1699", "14503", "1700", "14519"],
+ ["1701", "14533", "1702", "14537", "1703", "14543", "1704", "14549", "1705", "14551", "1706", "14557", "1707", "14561", "1708", "14563", "1709", "14591", "1710", "14593"],
+ ["1711", "14621", "1712", "14627", "1713", "14629", "1714", "14633", "1715", "14639", "1716", "14653", "1717", "14657", "1718", "14669", "1719", "14683", "1720", "14699"],
+ ["1721", "14713", "1722", "14717", "1723", "14723", "1724", "14731", "1725", "14737", "1726", "14741", "1727", "14747", "1728", "14753", "1729", "14759", "1730", "14767"],
+ ["1731", "14771", "1732", "14779", "1733", "14783", "1734", "14797", "1735", "14813", "1736", "14821", "1737", "14827", "1738", "14831", "1739", "14843", "1740", "14851"],
+ ["1741", "14867", "1742", "14869", "1743", "14879", "1744", "14887", "1745", "14891", "1746", "14897", "1747", "14923", "1748", "14929", "1749", "14939", "1750", "14947"],
+ ["1751", "14951", "1752", "14957", "1753", "14969", "1754", "14983", "1755", "15013", "1756", "15017", "1757", "15031", "1758", "15053", "1759", "15061", "1760", "15073"],
+ ["1761", "15077", "1762", "15083", "1763", "15091", "1764", "15101", "1765", "15107", "1766", "15121", "1767", "15131", "1768", "15137", "1769", "15139", "1770", "15149"],
+ ["1771", "15161", "1772", "15173", "1773", "15187", "1774", "15193", "1775", "15199", "1776", "15217", "1777", "15227", "1778", "15233", "1779", "15241", "1780", "15259"],
+ ["1781", "15263", "1782", "15269", "1783", "15271", "1784", "15277", "1785", "15287", "1786", "15289", "1787", "15299", "1788", "15307", "1789", "15313", "1790", "15319"],
+ ["1791", "15329", "1792", "15331", "1793", "15349", "1794", "15359", "1795", "15361", "1796", "15373", "1797", "15377", "1798", "15383", "1799", "15391", "1800", "15401"],
+ ["1801", "15413", "1802", "15427", "1803", "15439", "1804", "15443", "1805", "15451", "1806", "15461", "1807", "15467", "1808", "15473", "1809", "15493", "1810", "15497"],
+ ["1811", "15511", "1812", "15527", "1813", "15541", "1814", "15551", "1815", "15559", "1816", "15569", "1817", "15581", "1818", "15583", "1819", "15601", "1820", "15607"],
+ ["1821", "15619", "1822", "15629", "1823", "15641", "1824", "15643", "1825", "15647", "1826", "15649", "1827", "15661", "1828", "15667", "1829", "15671", "1830", "15679"],
+ ["1831", "15683", "1832", "15727", "1833", "15731", "1834", "15733", "1835", "15737", "1836", "15739", "1837", "15749", "1838", "15761", "1839", "15767", "1840", "15773"],
+ ["1841", "15787", "1842", "15791", "1843", "15797", "1844", "15803", "1845", "15809", "1846", "15817", "1847", "15823", "1848", "15859", "1849", "15877", "1850", "15881"],
+ ["1851", "15887", "1852", "15889", "1853", "15901", "1854", "15907", "1855", "15913", "1856", "15919", "1857", "15923", "1858", "15937", "1859", "15959", "1860", "15971"],
+ ["1861", "15973", "1862", "15991", "1863", "16001", "1864", "16007", "1865", "16033", "1866", "16057", "1867", "16061", "1868", "16063", "1869", "16067", "1870", "16069"],
+ ["1871", "16073", "1872", "16087", "1873", "16091", "1874", "16097", "1875", "16103", "1876", "16111", "1877", "16127", "1878", "16139", "1879", "16141", "1880", "16183"],
+ ["1881", "16187", "1882", "16189", "1883", "16193", "1884", "16217", "1885", "16223", "1886", "16229", "1887", "16231", "1888", "16249", "1889", "16253", "1890", "16267"],
+ ["1891", "16273", "1892", "16301", "1893", "16319", "1894", "16333", "1895", "16339", "1896", "16349", "1897", "16361", "1898", "16363", "1899", "16369", "1900", "16381"],
+ ["1901", "16411", "1902", "16417", "1903", "16421", "1904", "16427", "1905", "16433", "1906", "16447", "1907", "16451", "1908", "16453", "1909", "16477", "1910", "16481"],
+ ["1911", "16487", "1912", "16493", "1913", "16519", "1914", "16529", "1915", "16547", "1916", "16553", "1917", "16561", "1918", "16567", "1919", "16573", "1920", "16603"],
+ ["1921", "16607", "1922", "16619", "1923", "16631", "1924", "16633", "1925", "16649", "1926", "16651", "1927", "16657", "1928", "16661", "1929", "16673", "1930", "16691"],
+ ["1931", "16693", "1932", "16699", "1933", "16703", "1934", "16729", "1935", "16741", "1936", "16747", "1937", "16759", "1938", "16763", "1939", "16787", "1940", "16811"],
+ ["1941", "16823", "1942", "16829", "1943", "16831", "1944", "16843", "1945", "16871", "1946", "16879", "1947", "16883", "1948", "16889", "1949", "16901", "1950", "16903"],
+ ["1951", "16921", "1952", "16927", "1953", "16931", "1954", "16937", "1955", "16943", "1956", "16963", "1957", "16979", "1958", "16981", "1959", "16987", "1960", "16993"],
+ ["1961", "17011", "1962", "17021", "1963", "17027", "1964", "17029", "1965", "17033", "1966", "17041", "1967", "17047", "1968", "17053", "1969", "17077", "1970", "17093"],
+ ["1971", "17099", "1972", "17107", "1973", "17117", "1974", "17123", "1975", "17137", "1976", "17159", "1977", "17167", "1978", "17183", "1979", "17189", "1980", "17191"],
+ ["1981", "17203", "1982", "17207", "1983", "17209", "1984", "17231", "1985", "17239", "1986", "17257", "1987", "17291", "1988", "17293", "1989", "17299", "1990", "17317"],
+ ["1991", "17321", "1992", "17327", "1993", "17333", "1994", "17341", "1995", "17351", "1996", "17359", "1997", "17377", "1998", "17383", "1999", "17387", "2000", "17389"],
+ ["2001", "17393", "2002", "17401", "2003", "17417", "2004", "17419", "2005", "17431", "2006", "17443", "2007", "17449", "2008", "17467", "2009", "17471", "2010", "17477"],
+ ["2011", "17483", "2012", "17489", "2013", "17491", "2014", "17497", "2015", "17509", "2016", "17519", "2017", "17539", "2018", "17551", "2019", "17569", "2020", "17573"],
+ ["2021", "17579", "2022", "17581", "2023", "17597", "2024", "17599", "2025", "17609", "2026", "17623", "2027", "17627", "2028", "17657", "2029", "17659", "2030", "17669"],
+ ["2031", "17681", "2032", "17683", "2033", "17707", "2034", "17713", "2035", "17729", "2036", "17737", "2037", "17747", "2038", "17749", "2039", "17761", "2040", "17783"],
+ ["2041", "17789", "2042", "17791", "2043", "17807", "2044", "17827", "2045", "17837", "2046", "17839", "2047", "17851", "2048", "17863", "2049", "17881", "2050", "17891"],
+ ["2051", "17903", "2052", "17909", "2053", "17911", "2054", "17921", "2055", "17923", "2056", "17929", "2057", "17939", "2058", "17957", "2059", "17959", "2060", "17971"],
+ ["2061", "17977", "2062", "17981", "2063", "17987", "2064", "17989", "2065", "18013", "2066", "18041", "2067", "18043", "2068", "18047", "2069", "18049", "2070", "18059"],
+ ["2071", "18061", "2072", "18077", "2073", "18089", "2074", "18097", "2075", "18119", "2076", "18121", "2077", "18127", "2078", "18131", "2079", "18133", "2080", "18143"],
+ ["2081", "18149", "2082", "18169", "2083", "18181", "2084", "18191", "2085", "18199", "2086", "18211", "2087", "18217", "2088", "18223", "2089", "18229", "2090", "18233"],
+ ["2091", "18251", "2092", "18253", "2093", "18257", "2094", "18269", "2095", "18287", "2096", "18289", "2097", "18301", "2098", "18307", "2099", "18311", "2100", "18313"],
+ ["2101", "18329", "2102", "18341", "2103", "18353", "2104", "18367", "2105", "18371", "2106", "18379", "2107", "18397", "2108", "18401", "2109", "18413", "2110", "18427"],
+ ["2111", "18433", "2112", "18439", "2113", "18443", "2114", "18451", "2115", "18457", "2116", "18461", "2117", "18481", "2118", "18493", "2119", "18503", "2120", "18517"],
+ ["2121", "18521", "2122", "18523", "2123", "18539", "2124", "18541", "2125", "18553", "2126", "18583", "2127", "18587", "2128", "18593", "2129", "18617", "2130", "18637"],
+ ["2131", "18661", "2132", "18671", "2133", "18679", "2134", "18691", "2135", "18701", "2136", "18713", "2137", "18719", "2138", "18731", "2139", "18743", "2140", "18749"],
+ ["2141", "18757", "2142", "18773", "2143", "18787", "2144", "18793", "2145", "18797", "2146", "18803", "2147", "18839", "2148", "18859", "2149", "18869", "2150", "18899"],
+ ["2151", "189"]]}}]}
+ ==============================================================================
+
+================================================================================
+MERGE RESULT: ✅ SUCCESS
+================================================================================
+Final result length: 36944 chars
+Final result (COMPLETE):
+================================================================================
+{
+ "elements": [
+ {
+ "type": "table",
+ "content": {
+ "headers": ["Nr.1", "Primzahl1", "Nr.2", "Primzahl2", "Nr.3", "Primzahl3", "Nr.4", "Primzahl4", "Nr.5", "Primzahl5", "Nr.6", "Primzahl6", "Nr.7", "Primzahl7", "Nr.8", "Primzahl8", "Nr.9", "Primzahl9", "Nr.10", "Primzahl10"],
+ "rows": [
+ ["1", "2", "2", "3", "3", "5", "4", "7", "5", "11", "6", "13", "7", "17", "8", "19", "9", "23", "10", "29"],
+ ["11", "31", "12", "37", "13", "41", "14", "43", "15", "47", "16", "53", "17", "59", "18", "61", "19", "67", "20", "71"],
+ ["21", "73", "22", "79", "23", "83", "24", "89", "25", "97", "26", "101", "27", "103", "28", "107", "29", "109", "30", "113"],
+ ["31", "127", "32", "131", "33", "137", "34", "139", "35", "149", "36", "151", "37", "157", "38", "163", "39", "167", "40", "173"],
+ ["41", "179", "42", "181", "43", "191", "44", "193", "45", "197", "46", "199", "47", "211", "48", "223", "49", "227", "50", "229"],
+ ["51", "233", "52", "239", "53", "241", "54", "251", "55", "257", "56", "263", "57", "269", "58", "271", "59", "277", "60", "281"],
+ ["61", "283", "62", "293", "63", "307", "64", "311", "65", "313", "66", "317", "67", "331", "68", "337", "69", "347", "70", "349"],
+ ["71", "353", "72", "359", "73", "367", "74", "373", "75", "379", "76", "383", "77", "389", "78", "397", "79", "401", "80", "409"],
+ ["81", "419", "82", "421", "83", "431", "84", "433", "85", "439", "86", "443", "87", "449", "88", "457", "89", "461", "90", "463"],
+ ["91", "467", "92", "479", "93", "487", "94", "491", "95", "499", "96", "503", "97", "509", "98", "521", "99", "523", "100", "541"],
+ ["101", "547", "102", "557", "103", "563", "104", "569", "105", "571", "106", "577", "107", "587", "108", "593", "109", "599", "110", "601"],
+ ["111", "607", "112", "613", "113", "617", "114", "619", "115", "631", "116", "641", "117", "643", "118", "647", "119", "653", "120", "659"],
+ ["121", "661", "122", "673", "123", "677", "124", "683", "125", "691", "126", "701", "127", "709", "128", "719", "129", "727", "130", "733"],
+ ["131", "739", "132", "743", "133", "751", "134", "757", "135", "761", "136", "769", "137", "773", "138", "787", "139", "797", "140", "809"],
+ ["141", "811", "142", "821", "143", "823", "144", "827", "145", "829", "146", "839", "147", "853", "148", "857", "149", "859", "150", "863"],
+ ["151", "877", "152", "881", "153", "883", "154", "887", "155", "907", "156", "911", "157", "919", "158", "929", "159", "937", "160", "941"],
+ ["161", "947", "162", "953", "163", "967", "164", "971", "165", "977", "166", "983", "167", "991", "168", "997", "169", "1009", "170", "1013"],
+ ["171", "1019", "172", "1021", "173", "1031", "174", "1033", "175", "1039", "176", "1049", "177", "1051", "178", "1061", "179", "1063", "180", "1069"],
+ ["181", "1087", "182", "1091", "183", "1093", "184", "1097", "185", "1103", "186", "1109", "187", "1117", "188", "1123", "189", "1129", "190", "1151"],
+ ["191", "1153", "192", "1163", "193", "1171", "194", "1181", "195", "1187", "196", "1193", "197", "1201", "198", "1213", "199", "1217", "200", "1223"],
+ ["201", "1229", "202", "1231", "203", "1237", "204", "1249", "205", "1259", "206", "1277", "207", "1279", "208", "1283", "209", "1289", "210", "1291"],
+ ["211", "1297", "212", "1301", "213", "1303", "214", "1307", "215", "1319", "216", "1321", "217", "1327", "218", "1361", "219", "1367", "220", "1373"],
+ ["221", "1381", "222", "1399", "223", "1409", "224", "1423", "225", "1427", "226", "1429", "227", "1433", "228", "1439", "229", "1447", "230", "1451"],
+ ["231", "1453", "232", "1459", "233", "1471", "234", "1481", "235", "1483", "236", "1487", "237", "1489", "238", "1493", "239", "1499", "240", "1511"],
+ ["241", "1523", "242", "1531", "243", "1543", "244", "1549", "245", "1553", "246", "1559", "247", "1567", "248", "1571", "249", "1579", "250", "1583"],
+ ["251", "1597", "252", "1601", "253", "1607", "254", "1609", "255", "1613", "256", "1619", "257", "1621", "258", "1627", "259", "1637", "260", "1657"],
+ ["261", "1663", "262", "1667", "263", "1669", "264", "1693", "265", "1697", "266", "1699", "267", "1709", "268", "1721", "269", "1723", "270", "1733"],
+ ["271", "1741", "272", "1747", "273", "1753", "274", "1759", "275", "1777", "276", "1783", "277", "1787", "278", "1789", "279", "1801", "280", "1811"],
+ ["281", "1823", "282", "1831", "283", "1847", "284", "1861", "285", "1867", "286", "1871", "287", "1873", "288", "1877", "289", "1879", "290", "1889"],
+ ["291", "1901", "292", "1907", "293", "1913", "294", "1931", "295", "1933", "296", "1949", "297", "1951", "298", "1973", "299", "1979", "300", "1987"],
+ ["301", "1993", "302", "1997", "303", "1999", "304", "2003", "305", "2011", "306", "2017", "307", "2027", "308", "2029", "309", "2039", "310", "2053"],
+ ["311", "2063", "312", "2069", "313", "2081", "314", "2083", "315", "2087", "316", "2089", "317", "2099", "318", "2111", "319", "2113", "320", "2129"],
+ ["321", "2131", "322", "2137", "323", "2141", "324", "2143", "325", "2153", "326", "2161", "327", "2179", "328", "2203", "329", "2207", "330", "2213"],
+ ["331", "2221", "332", "2237", "333", "2239", "334", "2243", "335", "2251", "336", "2267", "337", "2269", "338", "2273", "339", "2281", "340", "2287"],
+ ["341", "2293", "342", "2297", "343", "2309", "344", "2311", "345", "2333", "346", "2339", "347", "2341", "348", "2347", "349", "2351", "350", "2357"],
+ ["351", "2371", "352", "2377", "353", "2381", "354", "2383", "355", "2389", "356", "2393", "357", "2399", "358", "2411", "359", "2417", "360", "2423"],
+ ["361", "2437", "362", "2441", "363", "2447", "364", "2459", "365", "2467", "366", "2473", "367", "2477", "368", "2503", "369", "2521", "370", "2531"],
+ ["371", "2539", "372", "2543", "373", "2549", "374", "2551", "375", "2557", "376", "2579", "377", "2591", "378", "2593", "379", "2609", "380", "2617"],
+ ["381", "2621", "382", "2633", "383", "2647", "384", "2657", "385", "2659", "386", "2663", "387", "2671", "388", "2677", "389", "2683", "390", "2687"],
+ ["391", "2689", "392", "2693", "393", "2699", "394", "2707", "395", "2711", "396", "2713", "397", "2719", "398", "2729", "399", "2731", "400", "2741"],
+ ["401", "2749", "402", "2753", "403", "2767", "404", "2777", "405", "2789", "406", "2791", "407", "2797", "408", "2801", "409", "2803", "410", "2819"],
+ ["411", "2833", "412", "2837", "413", "2843", "414", "2851", "415", "2857", "416", "2861", "417", "2879", "418", "2887", "419", "2897", "420", "2903"],
+ ["421", "2909", "422", "2917", "423", "2927", "424", "2939", "425", "2953", "426", "2957", "427", "2963", "428", "2969", "429", "2971", "430", "2999"],
+ ["431", "3001", "432", "3011", "433", "3019", "434", "3023", "435", "3037", "436", "3041", "437", "3049", "438", "3061", "439", "3067", "440", "3079"],
+ ["441", "3083", "442", "3089", "443", "3109", "444", "3119", "445", "3121", "446", "3137", "447", "3163", "448", "3167", "449", "3169", "450", "3181"],
+ ["451", "3187", "452", "3191", "453", "3203", "454", "3209", "455", "3217", "456", "3221", "457", "3229", "458", "3251", "459", "3253", "460", "3257"],
+ ["461", "3259", "462", "3271", "463", "3299", "464", "3301", "465", "3307", "466", "3313", "467", "3319", "468", "3323", "469", "3329", "470", "3331"],
+ ["471", "3343", "472", "3347", "473", "3359", "474", "3361", "475", "3371", "476", "3373", "477", "3389", "478", "3391", "479", "3407", "480", "3413"],
+ ["481", "3433", "482", "3449", "483", "3457", "484", "3461", "485", "3463", "486", "3467", "487", "3469", "488", "3491", "489", "3499", "490", "3511"],
+ ["491", "3517", "492", "3527", "493", "3529", "494", "3533", "495", "3539", "496", "3541", "497", "3547", "498", "3557", "499", "3559", "500", "3571"],
+ ["501", "3581", "502", "3583", "503", "3593", "504", "3607", "505", "3613", "506", "3617", "507", "3623", "508", "3631", "509", "3637", "510", "3643"],
+ ["511", "3659", "512", "3671", "513", "3673", "514", "3677", "515", "3691", "516", "3697", "517", "3701", "518", "3709", "519", "3719", "520", "3727"],
+ ["521", "3733", "522", "3739", "523", "3761", "524", "3767", "525", "3769", "526", "3779", "527", "3793", "528", "3797", "529", "3803", "530", "3821"],
+ ["531", "3823", "532", "3833", "533", "3847", "534", "3851", "535", "3853", "536", "3863", "537", "3877", "538", "3881", "539", "3889", "540", "3907"],
+ ["541", "3911", "542", "3917", "543", "3919", "544", "3923", "545", "3929", "546", "3931", "547", "3943", "548", "3947", "549", "3967", "550", "3989"],
+ ["551", "4001", "552", "4003", "553", "4007", "554", "4013", "555", "4019", "556", "4021", "557", "4027", "558", "4049", "559", "4051", "560", "4057"],
+ ["561", "4073", "562", "4079", "563", "4091", "564", "4093", "565", "4099", "566", "4111", "567", "4127", "568", "4129", "569", "4133", "570", "4139"],
+ ["571", "4153", "572", "4157", "573", "4159", "574", "4177", "575", "4201", "576", "4211", "577", "4217", "578", "4219", "579", "4229", "580", "4231"],
+ ["581", "4241", "582", "4243", "583", "4253", "584", "4259", "585", "4261", "586", "4271", "587", "4273", "588", "4283", "589", "4289", "590", "4297"],
+ ["591", "4327", "592", "4337", "593", "4339", "594", "4349", "595", "4357", "596", "4363", "597", "4373", "598", "4391", "599", "4397", "600", "4409"],
+ ["601", "4421", "602", "4423", "603", "4441", "604", "4447", "605", "4451", "606", "4457", "607", "4463", "608", "4481", "609", "4483", "610", "4493"],
+ ["611", "4507", "612", "4513", "613", "4517", "614", "4519", "615", "4523", "616", "4547", "617", "4549", "618", "4561", "619", "4567", "620", "4583"],
+ ["621", "4591", "622", "4597", "623", "4603", "624", "4621", "625", "4637", "626", "4639", "627", "4643", "628", "4649", "629", "4651", "630", "4657"],
+ ["631", "4663", "632", "4673", "633", "4679", "634", "4691", "635", "4703", "636", "4721", "637", "4723", "638", "4729", "639", "4733", "640", "4751"],
+ ["641", "4759", "642", "4783", "643", "4787", "644", "4789", "645", "4793", "646", "4799", "647", "4801", "648", "4813", "649", "4817", "650", "4831"],
+ ["651", "4861", "652", "4871", "653", "4877", "654", "4889", "655", "4903", "656", "4909", "657", "4919", "658", "4931", "659", "4933", "660", "4937"],
+ ["661", "4943", "662", "4951", "663", "4957", "664", "4967", "665", "4969", "666", "4973", "667", "4987", "668", "4993", "669", "4999", "670", "5003"],
+ ["671", "5009", "672", "5011", "673", "5021", "674", "5023", "675", "5039", "676", "5051", "677", "5059", "678", "5077", "679", "5081", "680", "5087"],
+ ["681", "5099", "682", "5101", "683", "5107", "684", "5113", "685", "5119", "686", "5147", "687", "5153", "688", "5167", "689", "5171", "690", "5179"],
+ ["691", "5189", "692", "5197", "693", "5209", "694", "5227", "695", "5231", "696", "5233", "697", "5237", "698", "5261", "699", "5273", "700", "5279"],
+ ["701", "5281", "702", "5297", "703", "5303", "704", "5309", "705", "5323", "706", "5333", "707", "5347", "708", "5351", "709", "5381", "710", "5387"],
+ ["711", "5393", "712", "5399", "713", "5407", "714", "5413", "715", "5417", "716", "5419", "717", "5431", "718", "5437", "719", "5441", "720", "5443"],
+ ["721", "5449", "722", "5471", "723", "5477", "724", "5479", "725", "5483", "726", "5501", "727", "5503", "728", "5507", "729", "5519", "730", "5521"],
+ ["731", "5527", "732", "5531", "733", "5557", "734", "5563", "735", "5569", "736", "5573", "737", "5581", "738", "5591", "739", "5623", "740", "5639"],
+ ["741", "5641", "742", "5647", "743", "5651", "744", "5653", "745", "5657", "746", "5659", "747", "5669", "748", "5683", "749", "5689", "750", "5693"],
+ ["751", "5701", "752", "5711", "753", "5717", "754", "5737", "755", "5741", "756", "5743", "757", "5749", "758", "5779", "759", "5783", "760", "5791"],
+ ["761", "5801", "762", "5807", "763", "5813", "764", "5821", "765", "5827", "766", "5839", "767", "5843", "768", "5849", "769", "5851", "770", "5857"],
+ ["771", "5861", "772", "5867", "773", "5869", "774", "5879", "775", "5881", "776", "5897", "777", "5903", "778", "5923", "779", "5927", "780", "5939"],
+ ["781", "5953", "782", "5981", "783", "5987", "784", "6007", "785", "6011", "786", "6029", "787", "6037", "788", "6043", "789", "6047", "790", "6053"],
+ ["791", "6067", "792", "6073", "793", "6079", "794", "6089", "795", "6091", "796", "6101", "797", "6113", "798", "6121", "799", "6131", "800", "6133"],
+ ["801", "6143", "802", "6151", "803", "6163", "804", "6173", "805", "6197", "806", "6199", "807", "6203", "808", "6211", "809", "6217", "810", "6221"],
+ ["811", "6229", "812", "6247", "813", "6257", "814", "6263", "815", "6269", "816", "6271", "817", "6277", "818", "6287", "819", "6299", "820", "6301"],
+ ["821", "6311", "822", "6317", "823", "6323", "824", "6329", "825", "6337", "826", "6343", "827", "6353", "828", "6359", "829", "6361", "830", "6367"],
+ ["831", "6373", "832", "6379", "833", "6389", "834", "6397", "835", "6421", "836", "6427", "837", "6449", "838", "6451", "839", "6469", "840", "6473"],
+ ["841", "6481", "842", "6491", "843", "6521", "844", "6529", "845", "6547", "846", "6551", "847", "6553", "848", "6563", "849", "6569", "850", "6571"],
+ ["851", "6577", "852", "6581", "853", "6599", "854", "6607", "855", "6619", "856", "6637", "857", "6653", "858", "6659", "859", "6661", "860", "6673"],
+ ["861", "6679", "862", "6689", "863", "6691", "864", "6701", "865", "6703", "866", "6709", "867", "6719", "868", "6733", "869", "6737", "870", "6761"],
+ ["871", "6763", "872", "6779", "873", "6781", "874", "6791", "875", "6793", "876", "6803", "877", "6823", "878", "6827", "879", "6829", "880", "6833"],
+ ["881", "6841", "882", "6857", "883", "6863", "884", "6869", "885", "6871", "886", "6883", "887", "6899", "888", "6907", "889", "6911", "890", "6917"],
+ ["891", "6947", "892", "6949", "893", "6959", "894", "6961", "895", "6967", "896", "6971", "897", "6977", "898", "6983", "899", "6991", "900", "6997"],
+ ["901", "7001", "902", "7013", "903", "7019", "904", "7027", "905", "7039", "906", "7043", "907", "7057", "908", "7069", "909", "7079", "910", "7103"],
+ ["911", "7109", "912", "7121", "913", "7127", "914", "7129", "915", "7151", "916", "7159", "917", "7177", "918", "7187", "919", "7193", "920", "7207"],
+ ["921", "7211", "922", "7213", "923", "7219", "924", "7229", "925", "7237", "926", "7243", "927", "7247", "928", "7253", "929", "7283", "930", "7297"],
+ ["931", "7307", "932", "7309", "933", "7321", "934", "7331", "935", "7333", "936", "7349", "937", "7351", "938", "7369", "939", "7393", "940", "7411"],
+ ["941", "7417", "942", "7433", "943", "7451", "944", "7457", "945", "7459", "946", "7477", "947", "7481", "948", "7487", "949", "7489", "950", "7499"],
+ ["951", "7507", "952", "7517", "953", "7523", "954", "7529", "955", "7537", "956", "7541", "957", "7547", "958", "7549", "959", "7559", "960", "7561"],
+ ["961", "7573", "962", "7577", "963", "7583", "964", "7589", "965", "7591", "966", "7603", "967", "7607", "968", "7621", "969", "7639", "970", "7643"],
+ ["971", "7649", "972", "7669", "973", "7673", "974", "7681", "975", "7687", "976", "7691", "977", "7699", "978", "7703", "979", "7717", "980", "7723"],
+ ["981", "7727", "982", "7741", "983", "7753", "984", "7757", "985", "7759", "986", "7789", "987", "7793", "988", "7817", "989", "7823", "990", "7829"],
+ ["991", "7841", "992", "7853", "993", "7867", "994", "7873", "995", "7877", "996", "7879", "997", "7883", "998", "7901", "999", "7907", "1000", "7919"],
+ ["1001", "7927", "1002", "7933", "1003", "7937", "1004", "7949", "1005", "7951", "1006", "7963", "1007", "7993", "1008", "8009", "1009", "8011", "1010", "8017"],
+ ["1011", "8039", "1012", "8053", "1013", "8059", "1014", "8069", "1015", "8081", "1016", "8087", "1017", "8089", "1018", "8093", "1019", "8101", "1020", "8111"],
+ ["1021", "8117", "1022", "8123", "1023", "8147", "1024", "8161", "1025", "8167", "1026", "8171", "1027", "8179", "1028", "8191", "1029", "8209", "1030", "8219"],
+ ["1031", "8221", "1032", "8231", "1033", "8233", "1034", "8237", "1035", "8243", "1036", "8263", "1037", "8269", "1038", "8273", "1039", "8287", "1040", "8291"],
+ ["1041", "8293", "1042", "8297", "1043", "8311", "1044", "8317", "1045", "8329", "1046", "8353", "1047", "8363", "1048", "8369", "1049", "8377", "1050", "8387"],
+ ["1051", "8389", "1052", "8419", "1053", "8423", "1054", "8429", "1055", "8431", "1056", "8443", "1057", "8447", "1058", "8461", "1059", "8467", "1060", "8501"],
+ ["1061", "8513", "1062", "8521", "1063", "8527", "1064", "8537", "1065", "8539", "1066", "8543", "1067", "8563", "1068", "8573", "1069", "8581", "1070", "8597"],
+ ["1071", "8599", "1072", "8609", "1073", "8623", "1074", "8627", "1075", "8629", "1076", "8641", "1077", "8647", "1078", "8663", "1079", "8669", "1080", "8677"],
+ ["1081", "8681", "1082", "8689", "1083", "8693", "1084", "8699", "1085", "8707", "1086", "8713", "1087", "8719", "1088", "8731", "1089", "8737", "1090", "8741"],
+ ["1091", "8747", "1092", "8753", "1093", "8761", "1094", "8779", "1095", "8783", "1096", "8803", "1097", "8807", "1098", "8819", "1099", "8821", "1100", "8831"],
+ ["1101", "8837", "1102", "8839", "1103", "8849", "1104", "8861", "1105", "8863", "1106", "8867", "1107", "8887", "1108", "8893", "1109", "8923", "1110", "8929"],
+ ["1111", "8933", "1112", "8941", "1113", "8951", "1114", "8963", "1115", "8969", "1116", "8971", "1117", "8999", "1118", "9001", "1119", "9007", "1120", "9011"],
+ ["1121", "9013", "1122", "9029", "1123", "9041", "1124", "9043", "1125", "9049", "1126", "9059", "1127", "9067", "1128", "9091", "1129", "9103", "1130", "9109"],
+ ["1131", "9127", "1132", "9133", "1133", "9137", "1134", "9151", "1135", "9157", "1136", "9161", "1137", "9173", "1138", "9181", "1139", "9187", "1140", "9199"],
+ ["1141", "9203", "1142", "9209", "1143", "9221", "1144", "9227", "1145", "9239", "1146", "9241", "1147", "9257", "1148", "9277", "1149", "9281", "1150", "9283"],
+ ["1151", "9293", "1152", "9311", "1153", "9319", "1154", "9323", "1155", "9337", "1156", "9341", "1157", "9343", "1158", "9349", "1159", "9371", "1160", "9377"],
+ ["1161", "9391", "1162", "9397", "1163", "9403", "1164", "9413", "1165", "9419", "1166", "9421", "1167", "9431", "1168", "9433", "1169", "9437", "1170", "9439"],
+ ["1171", "9461", "1172", "9463", "1173", "9467", "1174", "9473", "1175", "9479", "1176", "9491", "1177", "9497", "1178", "9511", "1179", "9521", "1180", "9533"],
+ ["1181", "9539", "1182", "9547", "1183", "9551", "1184", "9587", "1185", "9601", "1186", "9613", "1187", "9619", "1188", "9623", "1189", "9629", "1190", "9631"],
+ ["1191", "9643", "1192", "9649", "1193", "9661", "1194", "9677", "1195", "9679", "1196", "9689", "1197", "9697", "1198", "9719", "1199", "9721", "1200", "9733"],
+ ["1201", "9739", "1202", "9743", "1203", "9749", "1204", "9767", "1205", "9769", "1206", "9781", "1207", "9787", "1208", "9791", "1209", "9803", "1210", "9811"],
+ ["1211", "9817", "1212", "9829", "1213", "9833", "1214", "9839", "1215", "9851", "1216", "9857", "1217", "9859", "1218", "9871", "1219", "9883", "1220", "9887"],
+ ["1221", "9901", "1222", "9907", "1223", "9923", "1224", "9929", "1225", "9931", "1226", "9941", "1227", "9949", "1228", "9967", "1229", "9973", "1230", "10007"],
+ ["1231", "10009", "1232", "10037", "1233", "10039", "1234", "10061", "1235", "10067", "1236", "10069", "1237", "10079", "1238", "10091", "1239", "10093", "1240", "10099"],
+ ["1241", "10103", "1242", "10111", "1243", "10133", "1244", "10139", "1245", "10141", "1246", "10151", "1247", "10159", "1248", "10163", "1249", "10169", "1250", "10177"],
+ ["1251", "10181", "1252", "10193", "1253", "10211", "1254", "10223", "1255", "10243", "1256", "10247", "1257", "10253", "1258", "10259", "1259", "10267", "1260", "10271"],
+ ["1261", "10273", "1262", "10289", "1263", "10301", "1264", "10303", "1265", "10313", "1266", "10321", "1267", "10331", "1268", "10333", "1269", "10337", "1270", "10343"],
+ ["1271", "10357", "1272", "10369", "1273", "10391", "1274", "10399", "1275", "10427", "1276", "10429", "1277", "10433", "1278", "10453", "1279", "10457", "1280", "10459"],
+ ["1281", "10463", "1282", "10477", "1283", "10487", "1284", "10499", "1285", "10501", "1286", "10513", "1287", "10529", "1288", "10531", "1289", "10559", "1290", "10567"],
+ ["1291", "10589", "1292", "10597", "1293", "10601", "1294", "10607", "1295", "10613", "1296", "10627", "1297", "10631", "1298", "10639", "1299", "10651", "1300", "10657"],
+ ["1301", "10663", "1302", "10667", "1303", "10687", "1304", "10691", "1305", "10709", "1306", "10711", "1307", "10723", "1308", "10729", "1309", "10733", "1310", "10739"],
+ ["1311", "10753", "1312", "10771", "1313", "10781", "1314", "10789", "1315", "10799", "1316", "10831", "1317", "10837", "1318", "10847", "1319", "10853", "1320", "10859"],
+ ["1321", "10861", "1322", "10867", "1323", "10883", "1324", "10889", "1325", "10891", "1326", "10903", "1327", "10909", "1328", "10937", "1329", "10939", "1330", "10949"],
+ ["1331", "10957", "1332", "10973", "1333", "10979", "1334", "10987", "1335", "10993", "1336", "11003", "1337", "11027", "1338", "11047", "1339", "11057", "1340", "11059"],
+ ["1341", "11069", "1342", "11071", "1343", "11083", "1344", "11087", "1345", "11093", "1346", "11113", "1347", "11117", "1348", "11119", "1349", "11131", "1350", "11149"],
+ ["1351", "11159", "1352", "11161", "1353", "11171", "1354", "11173", "1355", "11177", "1356", "11197", "1357", "11213", "1358", "11239", "1359", "11243", "1360", "11251"],
+ ["1361", "11257", "1362", "11261", "1363", "11273", "1364", "11279", "1365", "11287", "1366", "11299", "1367", "11311", "1368", "11317", "1369", "11321", "1370", "11329"],
+ ["1371", "11351", "1372", "11353", "1373", "11369", "1374", "11383", "1375", "11393", "1376", "11399", "1377", "11411", "1378", "11423", "1379", "11437", "1380", "11443"],
+ ["1381", "11447", "1382", "11467", "1383", "11471", "1384", "11483", "1385", "11489", "1386", "11491", "1387", "11497", "1388", "11503", "1389", "11519", "1390", "11527"],
+ ["1391", "11549", "1392", "11551", "1393", "11579", "1394", "11587", "1395", "11593", "1396", "11597", "1397", "11617", "1398", "11621", "1399", "11633", "1400", "11657"],
+ ["1401", "11677", "1402", "11681", "1403", "11689", "1404", "11699", "1405", "11701", "1406", "11717", "1407", "11719", "1408", "11731", "1409", "11743", "1410", "11777"],
+ ["1411", "11779", "1412", "11783", "1413", "11789", "1414", "11801", "1415", "11807", "1416", "11813", "1417", "11821", "1418", "11827", "1419", "11831", "1420", "11833"],
+ ["1421", "11839", "1422", "11863", "1423", "11867", "1424", "11887", "1425", "11897", "1426", "11903", "1427", "11909", "1428", "11923", "1429", "11927", "1430", "11933"],
+ ["1431", "11939", "1432", "11941", "1433", "11953", "1434", "11959", "1435", "11969", "1436", "11971", "1437", "11981", "1438", "11987", "1439", "12007", "1440", "12011"],
+ ["1441", "12037", "1442", "12041", "1443", "12043", "1444", "12049", "1445", "12071", "1446", "12073", "1447", "12097", "1448", "12101", "1449", "12107", "1450", "12109"],
+ ["1451", "12113", "1452", "12119", "1453", "12143", "1454", "12149", "1455", "12157", "1456", "12161", "1457", "12163", "1458", "12197", "1459", "12203", "1460", "12211"],
+ ["1461", "12227", "1462", "12239", "1463", "12241", "1464", "12251", "1465", "12253", "1466", "12263", "1467", "12269", "1468", "12277", "1469", "12281", "1470", "12289"],
+ ["1471", "12301", "1472", "12323", "1473", "12329", "1474", "12343", "1475", "12347", "1476", "12373", "1477", "12377", "1478", "12379", "1479", "12391", "1480", "12401"],
+ ["1481", "12409", "1482", "12413", "1483", "12421", "1484", "12433", "1485", "12437", "1486", "12451", "1487", "12457", "1488", "12473", "1489", "12479", "1490", "12487"],
+ ["1491", "12491", "1492", "12497", "1493", "12503", "1494", "12511", "1495", "12517", "1496", "12527", "1497", "12539", "1498", "12541", "1499", "12547", "1500", "12553"],
+ ["1501", "12569", "1502", "12577", "1503", "12583", "1504", "12589", "1505", "12601", "1506", "12611", "1507", "12613", "1508", "12619", "1509", "12637", "1510", "12641"],
+ ["1511", "12647", "1512", "12653", "1513", "12659", "1514", "12671", "1515", "12689", "1516", "12697", "1517", "12703", "1518", "12713", "1519", "12721", "1520", "12739"],
+ ["1521", "12743", "1522", "12757", "1523", "12763", "1524", "12781", "1525", "12791", "1526", "12799", "1527", "12809", "1528", "12821", "1529", "12823", "1530", "12829"],
+ ["1531", "12841", "1532", "12853", "1533", "12889", "1534", "12893", "1535", "12899", "1536", "12907", "1537", "12911", "1538", "12917", "1539", "12919", "1540", "12923"],
+ ["1541", "12941", "1542", "12953", "1543", "12959", "1544", "12967", "1545", "12973", "1546", "12979", "1547", "12983", "1548", "13001", "1549", "13003", "1550", "13007"],
+ ["1551", "13009", "1552", "13033", "1553", "13037", "1554", "13043", "1555", "13049", "1556", "13063", "1557", "13093", "1558", "13099", "1559", "13103", "1560", "13109"],
+ ["1561", "13121", "1562", "13127", "1563", "13147", "1564", "13151", "1565", "13159", "1566", "13163", "1567", "13171", "1568", "13177", "1569", "13183", "1570", "13187"],
+ ["1571", "13217", "1572", "13219", "1573", "13229", "1574", "13241", "1575", "13249", "1576", "13259", "1577", "13267", "1578", "13291", "1579", "13297", "1580", "13309"],
+ ["1581", "13313", "1582", "13327", "1583", "13331", "1584", "13337", "1585", "13339", "1586", "13367", "1587", "13381", "1588", "13397", "1589", "13399", "1590", "13411"],
+ ["1591", "13417", "1592", "13421", "1593", "13441", "1594", "13451", "1595", "13457", "1596", "13463", "1597", "13469", "1598", "13477", "1599", "13487", "1600", "13499"],
+ ["1601", "13513", "1602", "13523", "1603", "13537", "1604", "13553", "1605", "13567", "1606", "13577", "1607", "13591", "1608", "13597", "1609", "13613", "1610", "13619"],
+ ["1611", "13627", "1612", "13633", "1613", "13649", "1614", "13669", "1615", "13679", "1616", "13681", "1617", "13687", "1618", "13691", "1619", "13693", "1620", "13697"],
+ ["1621", "13709", "1622", "13711", "1623", "13721", "1624", "13723", "1625", "13729", "1626", "13751", "1627", "13757", "1628", "13759", "1629", "13763", "1630", "13781"],
+ ["1631", "13789", "1632", "13799", "1633", "13807", "1634", "13829", "1635", "13831", "1636", "13841", "1637", "13859", "1638", "13873", "1639", "13877", "1640", "13879"],
+ ["1641", "13883", "1642", "13901", "1643", "13903", "1644", "13907", "1645", "13913", "1646", "13921", "1647", "13931", "1648", "13933", "1649", "13963", "1650", "13967"],
+ ["1651", "13997", "1652", "13999", "1653", "14009", "1654", "14011", "1655", "14029", "1656", "14033", "1657", "14051", "1658", "14057", "1659", "14071", "1660", "14081"],
+ ["1661", "14083", "1662", "14087", "1663", "14107", "1664", "14143", "1665", "14149", "1666", "14153", "1667", "14159", "1668", "14173", "1669", "14177", "1670", "14197"],
+ ["1671", "14207", "1672", "14221", "1673", "14243", "1674", "14249", "1675", "14251", "1676", "14281", "1677", "14293", "1678", "14303", "1679", "14321", "1680", "14323"],
+ ["1681", "14327", "1682", "14341", "1683", "14347", "1684", "14369", "1685", "14387", "1686", "14389", "1687", "14401", "1688", "14407", "1689", "14411", "1690", "14419"],
+ ["1691", "14423", "1692", "14431", "1693", "14437", "1694", "14447", "1695", "14449", "1696", "14461", "1697", "14479", "1698", "14489", "1699", "14503", "1700", "14519"],
+ ["1701", "14533", "1702", "14537", "1703", "14543", "1704", "14549", "1705", "14551", "1706", "14557", "1707", "14561", "1708", "14563", "1709", "14591", "1710", "14593"],
+ ["1711", "14621", "1712", "14627", "1713", "14629", "1714", "14633", "1715", "14639", "1716", "14653", "1717", "14657", "1718", "14669", "1719", "14683", "1720", "14699"],
+ ["1721", "14713", "1722", "14717", "1723", "14723", "1724", "14731", "1725", "14737", "1726", "14741", "1727", "14747", "1728", "14753", "1729", "14759", "1730", "14767"],
+ ["1731", "14771", "1732", "14779", "1733", "14783", "1734", "14797", "1735", "14813", "1736", "14821", "1737", "14827", "1738", "14831", "1739", "14843", "1740", "14851"],
+ ["1741", "14867", "1742", "14869", "1743", "14879", "1744", "14887", "1745", "14891", "1746", "14897", "1747", "14923", "1748", "14929", "1749", "14939", "1750", "14947"],
+ ["1751", "14951", "1752", "14957", "1753", "14969", "1754", "14983", "1755", "15013", "1756", "15017", "1757", "15031", "1758", "15053", "1759", "15061", "1760", "15073"],
+ ["1761", "15077", "1762", "15083", "1763", "15091", "1764", "15101", "1765", "15107", "1766", "15121", "1767", "15131", "1768", "15137", "1769", "15139", "1770", "15149"],
+ ["1771", "15161", "1772", "15173", "1773", "15187", "1774", "15193", "1775", "15199", "1776", "15217", "1777", "15227", "1778", "15233", "1779", "15241", "1780", "15259"],
+ ["1781", "15263", "1782", "15269", "1783", "15271", "1784", "15277", "1785", "15287", "1786", "15289", "1787", "15299", "1788", "15307", "1789", "15313", "1790", "15319"],
+ ["1791", "15329", "1792", "15331", "1793", "15349", "1794", "15359", "1795", "15361", "1796", "15373", "1797", "15377", "1798", "15383", "1799", "15391", "1800", "15401"],
+ ["1801", "15413", "1802", "15427", "1803", "15439", "1804", "15443", "1805", "15451", "1806", "15461", "1807", "15467", "1808", "15473", "1809", "15493", "1810", "15497"],
+ ["1811", "15511", "1812", "15527", "1813", "15541", "1814", "15551", "1815", "15559", "1816", "15569", "1817", "15581", "1818", "15583", "1819", "15601", "1820", "15607"],
+ ["1821", "15619", "1822", "15629", "1823", "15641", "1824", "15643", "1825", "15647", "1826", "15649", "1827", "15661", "1828", "15667", "1829", "15671", "1830", "15679"],
+ ["1831", "15683", "1832", "15727", "1833", "15731", "1834", "15733", "1835", "15737", "1836", "15739", "1837", "15749", "1838", "15761", "1839", "15767", "1840", "15773"],
+ ["1841", "15787", "1842", "15791", "1843", "15797", "1844", "15803", "1845", "15809", "1846", "15817", "1847", "15823", "1848", "15859", "1849", "15877", "1850", "15881"],
+ ["1851", "15887", "1852", "15889", "1853", "15901", "1854", "15907", "1855", "15913", "1856", "15919", "1857", "15923", "1858", "15937", "1859", "15959", "1860", "15971"],
+ ["1861", "15973", "1862", "15991", "1863", "16001", "1864", "16007", "1865", "16033", "1866", "16057", "1867", "16061", "1868", "16063", "1869", "16067", "1870", "16069"],
+ ["1871", "16073", "1872", "16087", "1873", "16091", "1874", "16097", "1875", "16103", "1876", "16111", "1877", "16127", "1878", "16139", "1879", "16141", "1880", "16183"],
+ ["1881", "16187", "1882", "16189", "1883", "16193", "1884", "16217", "1885", "16223", "1886", "16229", "1887", "16231", "1888", "16249", "1889", "16253", "1890", "16267"],
+ ["1891", "16273", "1892", "16301", "1893", "16319", "1894", "16333", "1895", "16339", "1896", "16349", "1897", "16361", "1898", "16363", "1899", "16369", "1900", "16381"],
+ ["1901", "16411", "1902", "16417", "1903", "16421", "1904", "16427", "1905", "16433", "1906", "16447", "1907", "16451", "1908", "16453", "1909", "16477", "1910", "16481"],
+ ["1911", "16487", "1912", "16493", "1913", "16519", "1914", "16529", "1915", "16547", "1916", "16553", "1917", "16561", "1918", "16567", "1919", "16573", "1920", "16603"],
+ ["1921", "16607", "1922", "16619", "1923", "16631", "1924", "16633", "1925", "16649", "1926", "16651", "1927", "16657", "1928", "16661", "1929", "16673", "1930", "16691"],
+ ["1931", "16693", "1932", "16699", "1933", "16703", "1934", "16729", "1935", "16741", "1936", "16747", "1937", "16759", "1938", "16763", "1939", "16787", "1940", "16811"],
+ ["1941", "16823", "1942", "16829", "1943", "16831", "1944", "16843", "1945", "16871", "1946", "16879", "1947", "16883", "1948", "16889", "1949", "16901", "1950", "16903"],
+ ["1951", "16921", "1952", "16927", "1953", "16931", "1954", "16937", "1955", "16943", "1956", "16963", "1957", "16979", "1958", "16981", "1959", "16987", "1960", "16993"],
+ ["1961", "17011", "1962", "17021", "1963", "17027", "1964", "17029", "1965", "17033", "1966", "17041", "1967", "17047", "1968", "17053", "1969", "17077", "1970", "17093"],
+ ["1971", "17099", "1972", "17107", "1973", "17117", "1974", "17123", "1975", "17137", "1976", "17159", "1977", "17167", "1978", "17183", "1979", "17189", "1980", "17191"],
+ ["1981", "17203", "1982", "17207", "1983", "17209", "1984", "17231", "1985", "17239", "1986", "17257", "1987", "17291", "1988", "17293", "1989", "17299", "1990", "17317"],
+ ["1991", "17321", "1992", "17327", "1993", "17333", "1994", "17341", "1995", "17351", "1996", "17359", "1997", "17377", "1998", "17383", "1999", "17387", "2000", "17389"],
+ ["2001", "17393", "2002", "17401", "2003", "17417", "2004", "17419", "2005", "17431", "2006", "17443", "2007", "17449", "2008", "17467", "2009", "17471", "2010", "17477"],
+ ["2011", "17483", "2012", "17489", "2013", "17491", "2014", "17497", "2015", "17509", "2016", "17519", "2017", "17539", "2018", "17551", "2019", "17569", "2020", "17573"],
+ ["2021", "17579", "2022", "17581", "2023", "17597", "2024", "17599", "2025", "17609", "2026", "17623", "2027", "17627", "2028", "17657", "2029", "17659", "2030", "17669"],
+ ["2031", "17681", "2032", "17683", "2033", "17707", "2034", "17713", "2035", "17729", "2036", "17737", "2037", "17747", "2038", "17749", "2039", "17761", "2040", "17783"],
+ ["2041", "17789", "2042", "17791", "2043", "17807", "2044", "17827", "2045", "17837", "2046", "17839", "2047", "17851", "2048", "17863", "2049", "17881", "2050", "17891"],
+ ["2051", "17903", "2052", "17909", "2053", "17911", "2054", "17921", "2055", "17923", "2056", "17929", "2057", "17939", "2058", "17957", "2059", "17959", "2060", "17971"],
+ ["2061", "17977", "2062", "17981", "2063", "17987", "2064", "17989", "2065", "18013", "2066", "18041", "2067", "18043", "2068", "18047", "2069", "18049", "2070", "18059"],
+ ["2071", "18061", "2072", "18077", "2073", "18089", "2074", "18097", "2075", "18119", "2076", "18121", "2077", "18127", "2078", "18131", "2079", "18133", "2080", "18143"],
+ ["2081", "18149", "2082", "18169", "2083", "18181", "2084", "18191", "2085", "18199", "2086", "18211", "2087", "18217", "2088", "18223", "2089", "18229", "2090", "18233"],
+ ["2091", "18251", "2092", "18253", "2093", "18257", "2094", "18269", "2095", "18287", "2096", "18289", "2097", "18301", "2098", "18307", "2099", "18311", "2100", "18313"],
+ ["2101", "18329", "2102", "18341", "2103", "18353", "2104", "18367", "2105", "18371", "2106", "18379", "2107", "18397", "2108", "18401", "2109", "18413", "2110", "18427"],
+ ["2111", "18433", "2112", "18439", "2113", "18443", "2114", "18451", "2115", "18457", "2116", "18461", "2117", "18481", "2118", "18493", "2119", "18503", "2120", "18517"],
+ ["2121", "18521", "2122", "18523", "2123", "18539", "2124", "18541", "2125", "18553", "2126", "18583", "2127", "18587", "2128", "18593", "2129", "18617", "2130", "18637"],
+ ["2131", "18661", "2132", "18671", "2133", "18679", "2134", "18691", "2135", "18701", "2136", "18713", "2137", "18719", "2138", "18731", "2139", "18743", "2140", "18749"],
+ ["2141", "18757", "2142", "18773", "2143", "18787", "2144", "18793", "2145", "18797", "2146", "18803", "2147", "18839", "2148", "18859", "2149", "18869", "2150", "18899"],
+ ["2151", "189"]]}}]}
+================================================================================
diff --git a/modules/services/serviceAi/subAiCallLooping.py b/modules/services/serviceAi/subAiCallLooping.py
index 2dcfa1c4..2bb2afd8 100644
--- a/modules/services/serviceAi/subAiCallLooping.py
+++ b/modules/services/serviceAi/subAiCallLooping.py
@@ -86,8 +86,6 @@ class AiCallLooper:
iteration = 0
allSections = [] # Accumulate all sections across iterations
lastRawResponse = None # Store last raw JSON response for continuation
- documentMetadata = None # Store document metadata (title, filename) from first iteration
- accumulationState = None # Track accumulation state for string accumulation
accumulatedDirectJson = [] # Accumulate JSON strings for direct return use cases (chapter_structure, code_structure)
# Get parent operation ID for iteration operations (parentId should be operationId, not log entry ID)
@@ -113,28 +111,17 @@ class AiCallLooper:
# This ensures continuation prompts are built even when JSON is so broken that no sections can be extracted
if (len(allSections) > 0 or lastRawResponse) and promptBuilder and promptArgs:
# This is a continuation - build continuation context with raw JSON and rebuild prompt
- continuationContext = buildContinuationContext(allSections, lastRawResponse)
+ continuationContext = buildContinuationContext(allSections, lastRawResponse, useCaseId)
if not lastRawResponse:
logger.warning(f"Iteration {iteration}: No previous response available for continuation!")
- # For section_content, pass all promptArgs (it uses buildSectionPromptWithContinuation which needs all args)
- # For other use cases (chapter_structure, code_structure), filter to only accepted parameters
- if useCaseId == "section_content":
- # Pass all promptArgs plus continuationContext for section_content
- iterationPrompt = await promptBuilder(**promptArgs, continuationContext=continuationContext)
- else:
- # Filter promptArgs to only include parameters that buildGenerationPrompt accepts
- # buildGenerationPrompt accepts: outputFormat, userPrompt, title, extracted_content, continuationContext, services
- filteredPromptArgs = {
- k: v for k, v in promptArgs.items()
- if k in ['outputFormat', 'userPrompt', 'title', 'extracted_content', 'services']
- }
- # Always include services if available
- if not filteredPromptArgs.get('services') and hasattr(self, 'services'):
- filteredPromptArgs['services'] = self.services
-
- # Rebuild prompt with continuation context using the provided prompt builder
- iterationPrompt = await promptBuilder(**filteredPromptArgs, continuationContext=continuationContext)
+ # Unified prompt builder call: All prompt builders accept continuationContext and **kwargs
+ # Each builder extracts only the parameters it needs from kwargs
+ # This ensures consistent architecture across all use cases
+ if not promptArgs.get('services') and hasattr(self, 'services'):
+ promptArgs['services'] = self.services
+
+ iterationPrompt = await promptBuilder(continuationContext=continuationContext, **promptArgs)
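+                    # Sketch of a conforming builder signature (hypothetical, based on the
+                    # parameters buildGenerationPrompt accepts: outputFormat, userPrompt,
+                    # title, extracted_content, services, continuationContext):
+                    #   async def buildGenerationPrompt(*, continuationContext=None,
+                    #                                   services=None, **kwargs) -> str: ...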
else:
# First iteration - use original prompt
iterationPrompt = prompt
@@ -251,11 +238,16 @@ class AiCallLooper:
pass
# Handle use cases that return JSON directly (no section extraction needed)
- directReturnUseCases = ["section_content", "chapter_structure", "code_structure", "code_content", "image_batch"]
+ directReturnUseCases = ["section_content", "chapter_structure", "code_structure", "code_content"]
if useCaseId in directReturnUseCases:
- # For chapter_structure, code_structure, and section_content, check completeness and support looping
- loopingUseCases = ["chapter_structure", "code_structure", "section_content"]
+ # For chapter_structure, code_structure, section_content, and code_content, check completeness and support looping
+ loopingUseCases = ["chapter_structure", "code_structure", "section_content", "code_content"]
if useCaseId in loopingUseCases:
+ # CRITICAL: Check if JSON string is incomplete BEFORE parsing
+ # If JSON is truncated, it will be closed for parsing, making it appear complete
+ # So we need to check the original string, not the parsed JSON
+ isStringIncomplete = self._isJsonStringIncomplete(extractedJsonForUseCase if extractedJsonForUseCase else result)
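+                        # Illustrative case: '{"elements": [{"type": "table"' is truncated,
+                        # yet once auto-closed it parses cleanly, so only the raw string
+                        # reveals that a continuation is still needed.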
+
# If parsing failed (e.g., invalid JSON with comments or truncated JSON), continue looping to get valid JSON
if not parsedJsonForUseCase:
logger.info(f"Iteration {iteration}: Use case '{useCaseId}' - JSON parsing failed (likely incomplete/truncated), continuing iteration to complete")
@@ -268,8 +260,12 @@ class AiCallLooper:
self.services.chat.progressLogFinish(iterationOperationId, True)
continue
- # Check completeness if we have parsed JSON
- isComplete = JsonResponseHandler.isJsonComplete(parsedJsonForUseCase)
+ # Check completeness: Use string-based check if available, otherwise fall back to parsed JSON check
+ if isStringIncomplete:
+ isComplete = False
+ else:
+ # Check completeness if we have parsed JSON
+ isComplete = JsonResponseHandler.isJsonComplete(parsedJsonForUseCase)
if not isComplete:
logger.warning(f"Iteration {iteration}: Use case '{useCaseId}' - JSON is incomplete, continuing for continuation")
@@ -294,22 +290,45 @@ class AiCallLooper:
# Step 1: Merge all JSON strings using existing overlap detection
mergedJsonString = allJsonStrings[0] if allJsonStrings else ""
+ hasOverlap = True # Track if any overlap was found
for jsonStr in allJsonStrings[1:]:
- mergedJsonString = JsonResponseHandler.mergeJsonStringsWithOverlap(mergedJsonString, jsonStr)
+ mergedJsonString, hasOverlapInMerge = JsonResponseHandler.mergeJsonStringsWithOverlap(mergedJsonString, jsonStr)
+ # If no overlap found in any merge, stop iterations
+ if not hasOverlapInMerge:
+ hasOverlap = False
+ logger.info(f"Iteration {iteration}: No overlap found during merge - stopping iterations and closing JSON")
+ break
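+                            # Overlap here means a shared text region between the tail of the
+                            # accumulated string and the head of the new fragment (assumed
+                            # semantics of mergeJsonStringsWithOverlap); without such a region
+                            # the fragment cannot be aligned, so iteration stops.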
- # Step 2: Try to parse the merged string
- extracted = extractJsonString(mergedJsonString)
- parsed, parseErr, _ = tryParseJson(extracted)
-
- if parseErr is None and parsed:
- # Parsing succeeded - normalize and use
- normalized = self._normalizeJsonStructure(parsed, useCaseId)
- parsedJsonForUseCase = normalized
- result = json.dumps(normalized, indent=2, ensure_ascii=False)
+ # If no overlap was found, mark as complete and use closed JSON
+ if not hasOverlap:
+ isComplete = True
+ # JSON is already closed by mergeJsonStringsWithOverlap when no overlap
+ # Use the merged (closed) JSON string directly
+ result = mergedJsonString
+ # Try to parse it to get parsedJsonForUseCase
+ try:
+ extracted = extractJsonString(mergedJsonString)
+ parsed, parseErr, _ = tryParseJson(extracted)
+ if parseErr is None and parsed:
+ normalized = self._normalizeJsonStructure(parsed, useCaseId)
+ parsedJsonForUseCase = normalized
+ result = json.dumps(normalized, indent=2, ensure_ascii=False)
+ except Exception:
+ pass # Use string result if parsing fails
else:
- # Parsing failed - try to extract partial data for section_content
- if useCaseId == "section_content":
- # Use existing mergeDeepStructures approach: parse what we can from each part
+ # Overlap found - continue with normal processing
+ # Step 2: Try to parse the merged string
+ extracted = extractJsonString(mergedJsonString)
+ parsed, parseErr, _ = tryParseJson(extracted)
+
+ if parseErr is None and parsed:
+ # Parsing succeeded - normalize and use
+ normalized = self._normalizeJsonStructure(parsed, useCaseId)
+ parsedJsonForUseCase = normalized
+ result = json.dumps(normalized, indent=2, ensure_ascii=False)
+ else:
+ # Parsing failed - try to extract partial data using Deep-Structure-Merging
+ # This fallback works for all use cases: parse what we can from each part
allParsed = []
for jsonStr in allJsonStrings:
extracted = extractJsonString(jsonStr)
@@ -319,12 +338,12 @@ class AiCallLooper:
allParsed.append(normalized)
if allParsed:
- # Use existing mergeDeepStructures for intelligent merging
+ # Use mergeDeepStructures for intelligent merging across all use cases
if len(allParsed) > 1:
mergedJsonObj = allParsed[0]
for nextObj in allParsed[1:]:
mergedJsonObj = JsonResponseHandler.mergeDeepStructures(
- mergedJsonObj, nextObj, iteration, f"section_content.merge"
+ mergedJsonObj, nextObj, iteration, f"{useCaseId}.merge"
)
else:
mergedJsonObj = allParsed[0]
@@ -334,18 +353,37 @@ class AiCallLooper:
else:
# All parsing failed - use string merge result
result = mergedJsonString
- else:
- # Not section_content - use string merge result
- result = mergedJsonString
except Exception as e:
logger.warning(f"Failed data-based merge, falling back to string merging: {e}")
# Fallback to string merging
mergedJsonString = accumulatedDirectJson[0] if accumulatedDirectJson else result
+ hasOverlap = True # Track if any overlap was found
for prevJson in accumulatedDirectJson[1:]:
- mergedJsonString = JsonResponseHandler.mergeJsonStringsWithOverlap(mergedJsonString, prevJson)
- mergedJsonString = JsonResponseHandler.mergeJsonStringsWithOverlap(mergedJsonString, result)
+ mergedJsonString, hasOverlapInMerge = JsonResponseHandler.mergeJsonStringsWithOverlap(mergedJsonString, prevJson)
+ if not hasOverlapInMerge:
+ hasOverlap = False
+ logger.info(f"Iteration {iteration}: No overlap found during fallback merge - stopping iterations")
+ break
+ if hasOverlap:
+ mergedJsonString, hasOverlapInMerge = JsonResponseHandler.mergeJsonStringsWithOverlap(mergedJsonString, result)
+ if not hasOverlapInMerge:
+ hasOverlap = False
+ logger.info(f"Iteration {iteration}: No overlap found in final fallback merge - stopping iterations")
result = mergedJsonString
+ # If no overlap was found, mark as complete and use closed JSON
+ if not hasOverlap:
+ isComplete = True
+ # JSON is already closed by mergeJsonStringsWithOverlap when no overlap
+ # Try to parse it to get parsedJsonForUseCase
+ try:
+ extractedMerged = extractJsonString(result)
+ parsedMerged, parseError, _ = tryParseJson(extractedMerged)
+ if parseError is None and parsedMerged:
+ parsedJsonForUseCase = parsedMerged
+ except Exception:
+ pass # Use string result if parsing fails
+
# Try to parse the string-merged result
try:
extractedMerged = extractJsonString(result)
@@ -375,233 +413,6 @@ class AiCallLooper:
self.services.utils.writeDebugFile(final_json, f"{debugPrefix}_final_result")
return final_json
-
- # Extract sections from response (handles both valid and broken JSON)
- # Only for document generation (JSON responses)
- # CRITICAL: Pass allSections and accumulationState to enable string accumulation
- extractedSections, wasJsonComplete, parsedResult, accumulationState = self.responseParser.extractSectionsFromResponse(
- result, iteration, debugPrefix, allSections, accumulationState
- )
-
- # CRITICAL: Merge sections BEFORE KPI validation
- # This ensures sections are preserved even if KPI validation fails
- if extractedSections:
- allSections = JsonResponseHandler.mergeSectionsIntelligently(allSections, extractedSections, iteration)
-
- # Define KPIs if we just entered accumulation mode (iteration 1, incomplete JSON)
- if accumulationState and accumulationState.isAccumulationMode and iteration == 1 and not accumulationState.kpis:
- logger.info(f"Iteration {iteration}: Defining KPIs for accumulation tracking")
- continuationContext = buildContinuationContext(allSections, result)
- # Pass raw response string from first iteration for KPI definition
- kpiDefinitions = await self._defineKpisFromPrompt(
- userPrompt or prompt,
- result, # Pass raw JSON string from first iteration
- continuationContext,
- debugPrefix
- )
- # Initialize KPIs with currentValue = 0
- accumulationState.kpis = [{**kpi, "currentValue": 0} for kpi in kpiDefinitions]
- logger.info(f"Defined {len(accumulationState.kpis)} KPIs: {[kpi.get('id') for kpi in accumulationState.kpis]}")
-
- # Extract and validate KPIs (if in accumulation mode with KPIs defined)
- if accumulationState and accumulationState.isAccumulationMode and accumulationState.kpis:
- # For KPI extraction, prefer accumulated JSON string over repaired JSON
- # because repairBrokenJson may lose data (e.g., empty rows array when JSON is incomplete)
- updatedKpis = []
-
- # First try to extract from parsedResult (repaired JSON)
- if parsedResult:
- try:
- updatedKpis = JsonResponseHandler.extractKpiValuesFromJson(
- parsedResult,
- accumulationState.kpis
- )
- # Check if we got meaningful values (non-zero)
- hasValidValues = any(kpi.get("currentValue", 0) > 0 for kpi in updatedKpis)
- if not hasValidValues and accumulationState.accumulatedJsonString:
- # Repaired JSON has empty values, try accumulated string
- logger.debug("Repaired JSON has empty KPI values, trying accumulated JSON string")
- updatedKpis = JsonResponseHandler.extractKpiValuesFromIncompleteJson(
- accumulationState.accumulatedJsonString,
- accumulationState.kpis
- )
- except Exception as e:
- logger.debug(f"Error extracting KPIs from parsedResult: {e}")
- updatedKpis = []
-
- # If no parsedResult or extraction failed, try accumulated string
- if not updatedKpis and accumulationState.accumulatedJsonString:
- try:
- updatedKpis = JsonResponseHandler.extractKpiValuesFromIncompleteJson(
- accumulationState.accumulatedJsonString,
- accumulationState.kpis
- )
- except Exception as e:
- logger.debug(f"Error extracting KPIs from accumulated JSON string: {e}")
- updatedKpis = []
-
- if updatedKpis:
- shouldProceed, reason = JsonResponseHandler.validateKpiProgression(
- accumulationState,
- updatedKpis
- )
-
- if not shouldProceed:
- logger.warning(f"Iteration {iteration}: KPI validation failed: {reason}")
- if iterationOperationId:
- self.services.chat.progressLogFinish(iterationOperationId, False)
- if operationId:
- self.services.chat.progressLogUpdate(operationId, 0.9, f"KPI validation failed: {reason} ({iteration} iterations)")
- break
-
- # Update KPIs in accumulation state
- accumulationState.kpis = updatedKpis
- logger.info(f"Iteration {iteration}: KPIs updated: {[(kpi.get('id'), kpi.get('currentValue')) for kpi in updatedKpis]}")
-
- # Check if all KPIs completed
- allCompleted = True
- for kpi in updatedKpis:
- targetValue = kpi.get("targetValue", 0)
- currentValue = kpi.get("currentValue", 0)
- if currentValue < targetValue:
- allCompleted = False
- break
-
- if allCompleted:
- logger.info(f"Iteration {iteration}: All KPIs completed, finishing accumulation")
- wasJsonComplete = True # Mark as complete to exit loop
-
- # CRITICAL: Handle JSON fragments (continuation content)
- # Fragment merging happens inside extractSectionsFromResponse
- # If merge fails (returns wasJsonComplete=True), stop iterations and complete JSON
- if not extractedSections and allSections:
- if wasJsonComplete:
- # Merge failed - stop iterations, complete JSON with available data
- logger.error(f"Iteration {iteration}: ❌ MERGE FAILED - Stopping iterations, completing JSON with available data")
- if iterationOperationId:
- self.services.chat.progressLogFinish(iterationOperationId, False)
- if operationId:
- self.services.chat.progressLogUpdate(operationId, 0.9, f"Merge failed, completing JSON ({iteration} iterations)")
- break
-
- # Fragment was detected and merged successfully
- logger.info(f"Iteration {iteration}: JSON fragment detected and merged, continuing")
- # Don't break - fragment was merged, continue to get more content if needed
- # Check if we should continue based on JSON completeness
- shouldContinue = self.responseParser.shouldContinueGeneration(
- allSections,
- iteration,
- wasJsonComplete,
- result
- )
- if shouldContinue:
- if iterationOperationId:
- self.services.chat.progressLogUpdate(iterationOperationId, 0.8, "Fragment merged, continuing")
- self.services.chat.progressLogFinish(iterationOperationId, True)
- continue
- else:
- # Done - fragment was merged and JSON is complete
- if iterationOperationId:
- self.services.chat.progressLogFinish(iterationOperationId, True)
- if operationId:
- self.services.chat.progressLogUpdate(operationId, 0.95, f"Generation complete ({iteration} iterations, fragment merged)")
- logger.info(f"Generation complete after {iteration} iterations: fragment merged")
- break
-
- # Extract document metadata from first iteration if available
- if iteration == 1 and parsedResult and not documentMetadata:
- documentMetadata = self.responseParser.extractDocumentMetadata(parsedResult)
-
- # Update progress after parsing
- if iterationOperationId:
- if extractedSections:
- self.services.chat.progressLogUpdate(iterationOperationId, 0.8, f"Extracted {len(extractedSections)} sections")
-
- if not extractedSections:
- # CRITICAL: If JSON was incomplete/broken, continue even if no sections extracted
- # This allows the AI to retry and complete the broken JSON
- if not wasJsonComplete:
- logger.warning(f"Iteration {iteration}: No sections extracted from broken JSON, continuing for another attempt")
- continue
- # If JSON was complete but no sections extracted - check if it was a fragment
- # Fragments are handled above, so if we get here and it's complete, it's an error
- logger.warning(f"Iteration {iteration}: No sections extracted from complete JSON, stopping")
- break
-
- # NOTE: Section merging now happens BEFORE KPI validation (see above)
- # This ensures sections are preserved even if KPI validation fails
-
- # Calculate total bytes in merged content for progress display
- merged_json_str = json.dumps(allSections, indent=2, ensure_ascii=False)
- totalBytesGenerated = len(merged_json_str.encode('utf-8'))
-
- # Update main operation with byte progress
- if operationId:
- # Format bytes for display
- if totalBytesGenerated < 1024:
- bytesDisplay = f"{totalBytesGenerated}B"
- elif totalBytesGenerated < 1024 * 1024:
- bytesDisplay = f"{totalBytesGenerated / 1024:.1f}kB"
- else:
- bytesDisplay = f"{totalBytesGenerated / (1024 * 1024):.1f}MB"
- # Estimate progress based on iterations (rough estimate)
- estimatedProgress = min(0.9, 0.4 + (iteration * 0.1))
- self.services.chat.progressLogUpdate(operationId, estimatedProgress, f"Pipeline: {bytesDisplay} (iteration {iteration})")
-
- # Log merged sections for debugging
- # For section content generation: skip merged sections debug files (only one prompt/response needed)
- isSectionContent = "_section_" in debugPrefix
- if not isSectionContent:
- self.services.utils.writeDebugFile(merged_json_str, f"{debugPrefix}_merged_sections_iteration_{iteration}")
-
- # Check if we should continue (completion detection)
- # Simple logic: JSON completeness determines continuation
- shouldContinue = self.responseParser.shouldContinueGeneration(
- allSections,
- iteration,
- wasJsonComplete,
- result
- )
-
- if shouldContinue:
- # Finish iteration operation (will continue with next iteration)
- if iterationOperationId:
- # Show byte progress in iteration completion
- iterBytes = len(result.encode('utf-8')) if result else 0
- if iterBytes < 1024:
- iterBytesDisplay = f"{iterBytes}B"
- elif iterBytes < 1024 * 1024:
- iterBytesDisplay = f"{iterBytes / 1024:.1f}kB"
- else:
- iterBytesDisplay = f"{iterBytes / (1024 * 1024):.1f}MB"
- self.services.chat.progressLogUpdate(iterationOperationId, 0.95, f"Completed ({iterBytesDisplay})")
- self.services.chat.progressLogFinish(iterationOperationId, True)
- continue
- else:
- # Done - finish iteration and update main operation
- if iterationOperationId:
- # Show final byte count
- finalBytes = len(merged_json_str.encode('utf-8'))
- if finalBytes < 1024:
- finalBytesDisplay = f"{finalBytes}B"
- elif finalBytes < 1024 * 1024:
- finalBytesDisplay = f"{finalBytes / 1024:.1f}kB"
- else:
- finalBytesDisplay = f"{finalBytes / (1024 * 1024):.1f}MB"
- self.services.chat.progressLogUpdate(iterationOperationId, 0.95, f"Complete ({finalBytesDisplay})")
- self.services.chat.progressLogFinish(iterationOperationId, True)
- if operationId:
- # Show final size in main operation
- finalBytes = len(merged_json_str.encode('utf-8'))
- if finalBytes < 1024:
- finalBytesDisplay = f"{finalBytes}B"
- elif finalBytes < 1024 * 1024:
- finalBytesDisplay = f"{finalBytes / 1024:.1f}kB"
- else:
- finalBytesDisplay = f"{finalBytes / (1024 * 1024):.1f}MB"
- self.services.chat.progressLogUpdate(operationId, 0.95, f"Generation complete: {finalBytesDisplay} ({iteration} iterations, {len(allSections)} sections)")
- logger.info(f"Generation complete after {iteration} iterations: {len(allSections)} sections")
- break
except Exception as e:
logger.error(f"Error in AI call iteration {iteration}: {str(e)}")
@@ -612,20 +423,121 @@ class AiCallLooper:
if iteration >= maxIterations:
logger.warning(f"AI call stopped after maximum iterations ({maxIterations})")
- # CRITICAL: Complete any incomplete structures in sections before building final result
- # This ensures JSON is properly closed even if merge failed or iterations stopped early
- allSections = JsonResponseHandler.completeIncompleteStructures(allSections)
+ # This code path is never reached because all use cases are in directReturnUseCases
+ # and return early at line 417. This code would only execute for use cases that
+ # require section extraction, but no such use cases are currently registered.
+ logger.error(f"Unexpected code path: reached end of loop without return for use case '{useCaseId}'")
+ return result if result else ""
+
+ def _isJsonStringIncomplete(self, jsonString: str) -> bool:
+ """
+ Check if JSON string is incomplete (truncated) BEFORE closing/parsing.
- # Build final result from accumulated sections
- final_result = self.responseParser.buildFinalResultFromSections(allSections, documentMetadata)
+ This is critical because if JSON is truncated, closing it makes it appear complete,
+ but we need to detect the truncation to continue iteration.
- # Write final result to debug file
- # For section content generation: skip final_result debug file (response already written)
- isSectionContent = "_section_" in debugPrefix
- if not isSectionContent:
- self.services.utils.writeDebugFile(final_result, f"{debugPrefix}_final_result")
+ Args:
+ jsonString: JSON string to check
+
+ Returns:
+ True if JSON string appears incomplete/truncated, False otherwise
+ """
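+    # Examples (illustrative):
+    #   '{"rows": [["1", "2"'    -> True  (unbalanced braces/brackets)
+    #   '{"rows": [["1", "2"]]}' -> False (parses as-is)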
+ if not jsonString or not jsonString.strip():
+ return False
- return final_result
+ from modules.shared.jsonUtils import stripCodeFences, normalizeJsonText
+
+ # Normalize JSON string
+ normalized = stripCodeFences(normalizeJsonText(jsonString)).strip()
+ if not normalized:
+ return False
+
+ # Find first '{' or '[' to start
+ startIdx = -1
+ for i, char in enumerate(normalized):
+ if char in '{[':
+ startIdx = i
+ break
+
+ if startIdx == -1:
+ return False
+
+ jsonContent = normalized[startIdx:]
+
+ # Check if structures are balanced (all opened structures are closed)
+ braceCount = 0
+ bracketCount = 0
+ inString = False
+ escapeNext = False
+
+ for char in jsonContent:
+ if escapeNext:
+ escapeNext = False
+ continue
+
+ if char == '\\':
+ escapeNext = True
+ continue
+
+ if char == '"':
+ inString = not inString
+ continue
+
+ if not inString:
+ if char == '{':
+ braceCount += 1
+ elif char == '}':
+ braceCount -= 1
+ elif char == '[':
+ bracketCount += 1
+ elif char == ']':
+ bracketCount -= 1
+
+ # If structures are unbalanced, JSON is incomplete
+ if braceCount > 0 or bracketCount > 0:
+ return True
+
+ # Check if JSON ends with incomplete value (e.g., unclosed string, incomplete number, trailing comma)
+ trimmed = jsonContent.rstrip()
+ if not trimmed:
+ return False
+
+ # Check for trailing comma (might indicate incomplete)
+ if trimmed.endswith(','):
+        # A trailing comma is ambiguous: the structures above are already balanced,
+        # so a dangling comma on its own is not treated as truncation
+        return False
+
+    # Check whether the balance scan above ended inside a string literal.
+    # The escape-aware inString flag is used instead of a raw quote count,
+    # which escaped quotes (\") inside values would throw off.
+    if inString:
+        return True
+
+    # Check if the string ends mid-value (e.g., ends with "417 instead of "41719"])
+ # Look for patterns that suggest truncation:
+ # - Ends with incomplete number (e.g., "417)
+ # - Ends with incomplete array element (e.g., ["417)
+ # - Ends with incomplete object property (e.g., {"key": "val)
+
+ # If JSON parses successfully without closing, it's complete
+ from modules.shared.jsonUtils import tryParseJson
+ parsed, parseErr, _ = tryParseJson(jsonContent)
+ if parseErr is None:
+ # Parses successfully - it's complete
+ return False
+
+ # If it doesn't parse, try closing it and see if that helps
+ from modules.shared.jsonUtils import closeJsonStructures
+ closed = closeJsonStructures(jsonContent)
+ parsedClosed, parseErrClosed, _ = tryParseJson(closed)
+
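+    # Illustrative: '{"a": [1, 2' fails to parse, but closeJsonStructures yields
+    # '{"a": [1, 2]}', which does parse - evidence the original was truncated.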
+ if parseErrClosed is None:
+ # Only parses after closing - it was incomplete
+ return True
+
+ # Doesn't parse even after closing - might be malformed, but assume incomplete to be safe
+ return True
def _normalizeJsonStructure(self, parsed: Any, useCaseId: str) -> Any:
"""
@@ -645,9 +557,19 @@ class AiCallLooper:
# Check if list contains strings (invalid format) or element objects
if parsed and isinstance(parsed[0], str):
# Invalid format - list of strings instead of elements
- # This shouldn't happen, but we'll log a warning and return empty structure
- logger.warning(f"Invalid response format: received list of strings instead of elements array. Expected {{'elements': [...]}} structure.")
- return {"elements": []}
+ # Try to convert strings to paragraph elements as fallback
+ # This can happen if AI returns raw text instead of structured JSON
+            logger.debug("Received list of strings instead of elements array, converting to paragraph elements")
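+            # Illustrative conversion: ["First para", "Second para"] ->
+            # {"elements": [{"type": "paragraph", "content": {"text": "First para"}},
+            #               {"type": "paragraph", "content": {"text": "Second para"}}]}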
+ elements = []
+ for text in parsed:
+ if isinstance(text, str) and text.strip():
+ elements.append({
+ "type": "paragraph",
+ "content": {
+ "text": text.strip()
+ }
+ })
+ return {"elements": elements} if elements else {"elements": []}
else:
# Convert plain list of elements to elements structure
return {"elements": parsed}
@@ -664,99 +586,4 @@ class AiCallLooper:
# For other use cases, return as-is (they have their own structures)
return parsed
-
- async def _defineKpisFromPrompt(
- self,
- userPrompt: str,
- rawJsonString: Optional[str],
- continuationContext: Dict[str, Any],
- debugPrefix: str = "kpi"
- ) -> List[Dict[str, Any]]:
- """
- Make separate AI call to define KPIs based on user prompt and incomplete JSON.
-
- Args:
- userPrompt: Original user prompt
- rawJsonString: Raw JSON string from first iteration response
- continuationContext: Continuation context (not used for JSON, kept for compatibility)
- debugPrefix: Prefix for debug file names
-
- Returns:
- List of KPI definitions: [{"id": str, "description": str, "jsonPath": str, "targetValue": int}, ...]
- """
- # Use raw JSON string from first iteration response
- if rawJsonString:
- # Remove markdown code fences if present
- from modules.shared.jsonUtils import stripCodeFences
- incompleteJson = stripCodeFences(rawJsonString.strip())
- else:
- incompleteJson = "Not available"
-
- kpiDefinitionPrompt = f"""Analyze the user request and incomplete JSON to define KPIs (Key Performance Indicators) for tracking progress.
-
-User Request:
-{userPrompt}
-
-Delivered JSON part:
-{incompleteJson}
-
-Task: Define which JSON items should be tracked to measure completion progress.
-
-IMPORTANT: Analyze the Delivered JSON part structure to understand what is being tracked:
-1. Identify the structure type (table with rows, list with items, etc.)
-2. Determine what the jsonPath actually counts (number of rows, number of items, etc.)
-3. Calculate targetValue based on what is being tracked, NOT the total quantity requested
-
-For each trackable item, provide:
-- id: Unique identifier (use descriptive name)
-- description: What this KPI measures (be specific about what is counted)
-- jsonPath: Path to extract value from JSON (use dot notation with array indices, e.g., "documents[0].sections[1].elements[0].rows")
-- targetValue: Target value to reach (integer) - MUST match what jsonPath actually tracks (rows count, items count, etc.)
-
-Return ONLY valid JSON in this format:
-{{
- "kpis": [
- {{
- "id": "unique_id",
- "description": "Description of what is measured",
- "jsonPath": "path.to.value",
- "targetValue": 0
- }}
- ]
-}}
-
-If no trackable items can be identified, return: {{"kpis": []}}
-"""
-
- try:
- request = AiCallRequest(
- prompt=kpiDefinitionPrompt,
- options=AiCallOptions(
- operationType=OperationTypeEnum.DATA_ANALYSE,
- priority=PriorityEnum.SPEED,
- processingMode=ProcessingModeEnum.BASIC
- )
- )
-
- # Write KPI definition prompt to debug file
- self.services.utils.writeDebugFile(kpiDefinitionPrompt, f"{debugPrefix}_kpi_definition_prompt")
-
- checkWorkflowStopped(self.services)
- response = await self.aiService.callAi(request)
-
- # Write KPI definition response to debug file
- self.services.utils.writeDebugFile(response.content, f"{debugPrefix}_kpi_definition_response")
-
- # Parse response
- extracted = extractJsonString(response.content)
- kpiResponse = json.loads(extracted)
-
- kpiDefinitions = kpiResponse.get("kpis", [])
- logger.info(f"Defined {len(kpiDefinitions)} KPIs for tracking")
-
- return kpiDefinitions
-
- except Exception as e:
- logger.warning(f"Failed to define KPIs: {e}, continuing without KPI tracking")
- return []
diff --git a/modules/services/serviceAi/subJsonMerger.py b/modules/services/serviceAi/subJsonMerger.py
new file mode 100644
index 00000000..005b0f95
--- /dev/null
+++ b/modules/services/serviceAi/subJsonMerger.py
@@ -0,0 +1,2049 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""
+Modular JSON Merger - Intelligent JSON Fragment Merging
+
+A clean, modular approach to merging JSON fragments that may be truncated at
+arbitrary points. Designed to be simple, robust, and always return valid data.
+
+Architecture:
+1. Data Extractor: Extracts all possible data from fragments (even incomplete)
+2. Structure Detector: Detects JSON structure type (elements, documents, files, etc.)
+3. Data Merger: Intelligently merges data with overlap detection
+4. Result Builder: Always returns valid JSON structure
+"""
+
+import json
+import re
+import logging
+import os
+from datetime import datetime
+from typing import Dict, Any, List, Optional, Tuple, Union
+
+from modules.shared.jsonUtils import (
+ normalizeJsonText, stripCodeFences, closeJsonStructures, tryParseJson
+)
+
+logger = logging.getLogger(__name__)
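+
+# Processing flow implied by the architecture above (sketch; the component names
+# refer to the roles in the module docstring, not to concrete classes):
+#   fragment text -> extract recoverable data -> detect structure type
+#   (elements / documents / files / ...) -> merge with overlap detection
+#   -> always emit a valid JSON structure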
+
+
+class JsonMergeLogger:
+ """Consolidated logger for JSON merging process."""
+
+ _logBuffer: List[str] = []
+ _mergeId: int = 0
+ _currentLogFile: Optional[str] = None
+ _appendMode: bool = False
+
+ @staticmethod
+ def initializeLogFile(logFileName: Optional[str] = None):
+ """Initialize a new log file for a test run."""
+ JsonMergeLogger._logBuffer = []
+ JsonMergeLogger._mergeId = 0
+
+ if logFileName:
+ JsonMergeLogger._currentLogFile = logFileName
+ JsonMergeLogger._appendMode = False
+ # Clear existing file
+ try:
+ currentFileDir = os.path.dirname(os.path.abspath(__file__))
+ logFilePath = os.path.join(currentFileDir, logFileName)
+ with open(logFilePath, 'w', encoding='utf-8') as f:
+ f.write("") # Clear file
+ except Exception:
+ pass
+ else:
+ JsonMergeLogger._currentLogFile = None
+ JsonMergeLogger._appendMode = False
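+
+    # Typical usage (illustrative; the file name is hypothetical):
+    #   JsonMergeLogger.initializeLogFile("merge_debug.log")
+    #   mergeId = JsonMergeLogger.startMerge(accumulatedJson, newFragment)
+    #   JsonMergeLogger.logStep("normalize", "Stripping code fences", result=cleaned)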
+
+ @staticmethod
+ def startMerge(accumulated: str, newFragment: str) -> str:
+ """Start a new merge operation and return merge ID."""
+ JsonMergeLogger._mergeId += 1
+ mergeId = f"merge_{JsonMergeLogger._mergeId}"
+
+ JsonMergeLogger._log(f"{'='*80}")
+ JsonMergeLogger._log(f"JSON MERGE OPERATION #{JsonMergeLogger._mergeId}")
+ JsonMergeLogger._log(f"{'='*80}")
+ JsonMergeLogger._log(f"Timestamp: {datetime.now().isoformat()}")
+ JsonMergeLogger._log("")
+
+ JsonMergeLogger._log("INPUT:")
+ JsonMergeLogger._log(f" Accumulated length: {len(accumulated)} chars")
+ JsonMergeLogger._log(f" New Fragment length: {len(newFragment)} chars")
+ # Log only summary (first 5 and last 5 lines) to avoid log spam
+ accLines = accumulated.split('\n')
+ fragLines = newFragment.split('\n')
+ JsonMergeLogger._log(f" Accumulated: {len(accLines)} lines (showing first 5 and last 5)")
+ if len(accLines) > 10:
+ for line in accLines[:5]:
+ JsonMergeLogger._log(f" {line}")
+ JsonMergeLogger._log(f" ... ({len(accLines) - 10} lines omitted) ...")
+ for line in accLines[-5:]:
+ JsonMergeLogger._log(f" {line}")
+ else:
+ for line in accLines:
+ JsonMergeLogger._log(f" {line}")
+ JsonMergeLogger._log(f" New Fragment: {len(fragLines)} lines (showing first 5 and last 5)")
+ if len(fragLines) > 10:
+ for line in fragLines[:5]:
+ JsonMergeLogger._log(f" {line}")
+ JsonMergeLogger._log(f" ... ({len(fragLines) - 10} lines omitted) ...")
+ for line in fragLines[-5:]:
+ JsonMergeLogger._log(f" {line}")
+ else:
+ for line in fragLines:
+ JsonMergeLogger._log(f" {line}")
+ JsonMergeLogger._log("")
+
+ return mergeId
+
+ @staticmethod
+ def logStep(stepName: str, description: str, result: Any = None, error: Optional[str] = None):
+ """Log a step with its result."""
+ JsonMergeLogger._log(f"STEP: {stepName}")
+ JsonMergeLogger._log(f" Description: {description}")
+
+ if error:
+ JsonMergeLogger._log(f" ❌ ERROR: {error}")
+ elif result is not None:
+ if isinstance(result, str):
+ resultLines = result.split('\n')
+ JsonMergeLogger._log(f" ✅ Result (string, {len(result)} chars, {len(resultLines)} lines)")
+ if len(resultLines) > 10:
+ JsonMergeLogger._log(f" (showing first 5 and last 5 lines)")
+ for line in resultLines[:5]:
+ JsonMergeLogger._log(f" {line}")
+ JsonMergeLogger._log(f" ... ({len(resultLines) - 10} lines omitted) ...")
+ for line in resultLines[-5:]:
+ JsonMergeLogger._log(f" {line}")
+ else:
+ for line in resultLines:
+ JsonMergeLogger._log(f" {line}")
+ elif isinstance(result, dict):
+ keys = list(result.keys())
+ JsonMergeLogger._log(f" ✅ Result (dict): keys={keys}, size={len(str(result))} chars")
+ # Log full structure with JSON formatting - NO TRUNCATION
+ try:
+ jsonStr = json.dumps(result, indent=2, ensure_ascii=False)
+ JsonMergeLogger._log(f" Full data (COMPLETE, {len(jsonStr)} chars):")
+ JsonMergeLogger._log(" " + "="*76)
+ for line in jsonStr.split('\n'):
+ JsonMergeLogger._log(f" {line}")
+ JsonMergeLogger._log(" " + "="*76)
+ except Exception as e:
+ JsonMergeLogger._log(f" Could not serialize: {e}")
+ strRepr = str(result)
+ strLines = strRepr.split('\n')
+ JsonMergeLogger._log(f" String representation ({len(strRepr)} chars, {len(strLines)} lines)")
+ if len(strLines) > 10:
+ JsonMergeLogger._log(f" (showing first 5 and last 5 lines)")
+ for line in strLines[:5]:
+ JsonMergeLogger._log(f" {line}")
+ JsonMergeLogger._log(f" ... ({len(strLines) - 10} lines omitted) ...")
+ for line in strLines[-5:]:
+ JsonMergeLogger._log(f" {line}")
+ else:
+ for line in strLines:
+ JsonMergeLogger._log(f" {line}")
+ # Log structure details
+ if "elements" in result:
+ elemCount = len(result["elements"]) if isinstance(result["elements"], list) else 0
+ JsonMergeLogger._log(f" - elements: {elemCount} items")
+ if isinstance(result["elements"], list) and elemCount > 0:
+ JsonMergeLogger._log(f" First element type: {result['elements'][0].get('type', 'unknown') if isinstance(result['elements'][0], dict) else 'not a dict'}")
+ if "documents" in result:
+ docCount = len(result["documents"]) if isinstance(result["documents"], list) else 0
+ JsonMergeLogger._log(f" - documents: {docCount} items")
+ elif isinstance(result, list):
+ JsonMergeLogger._log(f" ✅ Result (list): {len(result)} items (COMPLETE)")
+ if len(result) > 0:
+ JsonMergeLogger._log(f" First item type: {type(result[0]).__name__}")
+ try:
+ jsonStr = json.dumps(result, indent=2, ensure_ascii=False) # ALL items
+ JsonMergeLogger._log(f" All items (COMPLETE, {len(jsonStr)} chars):")
+ JsonMergeLogger._log(" " + "="*76)
+ for line in jsonStr.split('\n'):
+ JsonMergeLogger._log(f" {line}")
+ JsonMergeLogger._log(" " + "="*76)
+ except Exception:
+ strRepr = str(result)
+ strLines = strRepr.split('\n')
+ JsonMergeLogger._log(f" String representation ({len(strRepr)} chars, {len(strLines)} lines)")
+ if len(strLines) > 10:
+ JsonMergeLogger._log(f" (showing first 5 and last 5 lines)")
+ for line in strLines[:5]:
+ JsonMergeLogger._log(f" {line}")
+ JsonMergeLogger._log(f" ... ({len(strLines) - 10} lines omitted) ...")
+ for line in strLines[-5:]:
+ JsonMergeLogger._log(f" {line}")
+ else:
+ for line in strLines:
+ JsonMergeLogger._log(f" {line}")
+ else:
+ JsonMergeLogger._log(f" ✅ Result: {type(result).__name__} = {str(result)[:200]}")
+ else:
+ JsonMergeLogger._log(f" ⏳ In progress...")
+
+ JsonMergeLogger._log("")
+
+ @staticmethod
+ def logExtraction(strategy: str, success: bool, data: Any = None, error: Optional[str] = None):
+ """Log extraction strategy result."""
+ status = "✅ SUCCESS" if success else "❌ FAILED"
+ JsonMergeLogger._log(f" Extraction Strategy: {strategy} - {status}")
+ if error:
+ JsonMergeLogger._log(f" Error: {error}")
+ elif data is not None:
+ if isinstance(data, dict):
+ keys = list(data.keys())
+ JsonMergeLogger._log(f" Extracted keys: {keys}")
+ # Log full extracted data - NO TRUNCATION
+ try:
+ jsonStr = json.dumps(data, indent=2, ensure_ascii=False)
+ JsonMergeLogger._log(f" Extracted data (COMPLETE, {len(jsonStr)} chars):")
+ JsonMergeLogger._log(" " + "="*76)
+ for line in jsonStr.split('\n'):
+ JsonMergeLogger._log(f" {line}")
+ JsonMergeLogger._log(" " + "="*76)
+ except Exception as e:
+ JsonMergeLogger._log(f" Could not serialize extracted data: {e}")
+ strRepr = str(data)
+ strLines = strRepr.split('\n')
+ JsonMergeLogger._log(f" String representation ({len(strRepr)} chars, {len(strLines)} lines)")
+ if len(strLines) > 10:
+ JsonMergeLogger._log(f" (showing first 5 and last 5 lines)")
+ for line in strLines[:5]:
+ JsonMergeLogger._log(f" {line}")
+ JsonMergeLogger._log(f" ... ({len(strLines) - 10} lines omitted) ...")
+ for line in strLines[-5:]:
+ JsonMergeLogger._log(f" {line}")
+ else:
+ for line in strLines:
+ JsonMergeLogger._log(f" {line}")
+ elif isinstance(data, list):
+ JsonMergeLogger._log(f" Extracted {len(data)} items (COMPLETE)")
+ if len(data) > 0:
+ try:
+ jsonStr = json.dumps(data, indent=2, ensure_ascii=False) # ALL items
+ JsonMergeLogger._log(f" All items (COMPLETE, {len(jsonStr)} chars):")
+ JsonMergeLogger._log(" " + "="*76)
+ for line in jsonStr.split('\n'):
+ JsonMergeLogger._log(f" {line}")
+ JsonMergeLogger._log(" " + "="*76)
+ except Exception as e:
+ JsonMergeLogger._log(f" Could not serialize list: {e}")
+ strRepr = str(data)
+ strLines = strRepr.split('\n')
+ JsonMergeLogger._log(f" String representation ({len(strRepr)} chars, {len(strLines)} lines)")
+ if len(strLines) > 10:
+ JsonMergeLogger._log(f" (showing first 5 and last 5 lines)")
+ for line in strLines[:5]:
+ JsonMergeLogger._log(f" {line}")
+ JsonMergeLogger._log(f" ... ({len(strLines) - 10} lines omitted) ...")
+ for line in strLines[-5:]:
+ JsonMergeLogger._log(f" {line}")
+ else:
+ for line in strLines:
+ JsonMergeLogger._log(f" {line}")
+
+ @staticmethod
+ def logOverlap(overlapType: str, overlapLen: int, accSuffix: Any = None, fragPrefix: Any = None):
+ """Log overlap detection result."""
+ JsonMergeLogger._log(f" Overlap Detection ({overlapType}):")
+ JsonMergeLogger._log(f" Overlap length: {overlapLen}")
+ if overlapLen > 0:
+ JsonMergeLogger._log(f" ✅ Found overlap of {overlapLen} chars")
+ if accSuffix is not None:
+ if isinstance(accSuffix, str):
+ JsonMergeLogger._log(f" Accumulated suffix (COMPLETE, {len(accSuffix)} chars):")
+ JsonMergeLogger._log(" " + "="*76)
+ for line in accSuffix.split('\n'):
+ JsonMergeLogger._log(f" {line}")
+ JsonMergeLogger._log(" " + "="*76)
+ else:
+ JsonMergeLogger._log(f" Accumulated suffix (COMPLETE): {accSuffix}")
+ if fragPrefix is not None:
+ if isinstance(fragPrefix, str):
+ prefixLines = fragPrefix.split('\n')
+ JsonMergeLogger._log(f" Fragment prefix ({len(fragPrefix)} chars, {len(prefixLines)} lines)")
+ if len(prefixLines) > 10:
+ JsonMergeLogger._log(f" (showing first 5 and last 5 lines)")
+ for line in prefixLines[:5]:
+ JsonMergeLogger._log(f" {line}")
+ JsonMergeLogger._log(f" ... ({len(prefixLines) - 10} lines omitted) ...")
+ for line in prefixLines[-5:]:
+ JsonMergeLogger._log(f" {line}")
+ else:
+ for line in prefixLines:
+ JsonMergeLogger._log(f" {line}")
+ else:
+ JsonMergeLogger._log(f" Fragment prefix (COMPLETE): {fragPrefix}")
+ else:
+ JsonMergeLogger._log(f" ⚠️ No overlap detected - appending all")
+
+ @staticmethod
+ def logValidation(validationType: str, success: bool, error: Optional[str] = None):
+ """Log validation result."""
+ status = "✅ VALID" if success else "❌ INVALID"
+ JsonMergeLogger._log(f" Validation ({validationType}): {status}")
+ if error:
+ JsonMergeLogger._log(f" Error: {error}")
+
+ @staticmethod
+ def finishMerge(mergeId: str, finalResult: str, success: bool):
+ """Finish merge operation and write log file."""
+ JsonMergeLogger._log("")
+ JsonMergeLogger._log(f"{'='*80}")
+ JsonMergeLogger._log(f"MERGE RESULT: {'✅ SUCCESS' if success else '❌ FAILED'}")
+ JsonMergeLogger._log(f"{'='*80}")
+ JsonMergeLogger._log(f"Final result length: {len(finalResult)} chars")
+ JsonMergeLogger._log("Final result (COMPLETE):")
+ JsonMergeLogger._log("="*80)
+ for line in finalResult.split('\n'):
+ JsonMergeLogger._log(line)
+ JsonMergeLogger._log("="*80)
+ JsonMergeLogger._log("")
+
+ # Write log content to buffer (will be written at end of test run)
+ logContent = "\n".join(JsonMergeLogger._logBuffer)
+
+ # If we have a current log file, append to it
+ if JsonMergeLogger._currentLogFile:
+ try:
+ currentFileDir = os.path.dirname(os.path.abspath(__file__))
+ logFilePath = os.path.join(currentFileDir, JsonMergeLogger._currentLogFile)
+ mode = 'a' if JsonMergeLogger._appendMode else 'w'
+ with open(logFilePath, mode, encoding='utf-8') as f:
+ f.write(logContent)
+ f.write("\n\n") # Add separator between merges
+ JsonMergeLogger._appendMode = True # Next writes will append
+ logger.debug(f"JSON merge log appended to: {logFilePath}")
+ except Exception as e:
+ logger.error(f"Failed to write merge log file: {e}")
+ else:
+ # No log file set - write individual file (fallback)
+ currentFileDir = os.path.dirname(os.path.abspath(__file__))
+ logDir = currentFileDir
+ os.makedirs(logDir, exist_ok=True)
+ logFilePath = os.path.join(logDir, f"{mergeId}.txt")
+ try:
+ with open(logFilePath, 'w', encoding='utf-8') as f:
+ f.write(logContent)
+ logger.info(f"JSON merge log written to: {logFilePath}")
+ except Exception as e:
+ logger.error(f"Failed to write merge log file: {e}")
+
+ # Clear buffer for next merge
+ JsonMergeLogger._logBuffer = []
+
+ @staticmethod
+ def _log(message: str):
+ """Internal log method."""
+ JsonMergeLogger._logBuffer.append(message)
+ logger.debug(message)
+
+
+class JsonDataExtractor:
+ """Extracts data from JSON fragments, even if incomplete."""
+
+ @staticmethod
+ def extract(jsonString: str, mergeId: Optional[str] = None, removeFromEnd: bool = True) -> Dict[str, Any]:
+ """
+ Extract complete data from JSON fragment.
+
+ For merging: We know exactly where to clean:
+ - accumulated: remove incomplete parts at the END
+ - newFragment: remove incomplete parts at the BEGINNING
+
+ Simple approach: Remove incomplete parts at specified position, then parse.
+ """
+ if mergeId:
+ position = "END" if removeFromEnd else "BEGINNING"
+ JsonMergeLogger.logStep("EXTRACTION", f"Extracting data from JSON fragment ({len(jsonString)} chars) - cleaning from {position}")
+
+ if not jsonString or not jsonString.strip():
+ if mergeId:
+ JsonMergeLogger.logExtraction("Empty input", False, error="Input is empty")
+ return {}
+
+ normalized = stripCodeFences(normalizeJsonText(jsonString)).strip()
+ if not normalized:
+ if mergeId:
+ JsonMergeLogger.logExtraction("Normalization", False, error="Normalized string is empty")
+ return {}
+
+ # Try to parse as complete JSON first
+ parsed, parseErr, _ = tryParseJson(normalized)
+ if parseErr is None and parsed is not None:
+ if isinstance(parsed, dict):
+ finalResult = parsed
+ elif isinstance(parsed, list):
+ finalResult = {"elements": parsed}
+ else:
+ finalResult = {"elements": [parsed]} if parsed else {}
+
+ if mergeId:
+ JsonMergeLogger.logExtraction("Direct parsing", True, finalResult)
+ JsonMergeLogger.logStep("EXTRACTION", "Direct parsing successful", finalResult)
+
+ return finalResult if finalResult else {}
+
+ # Remove incomplete parts from specified position
+ if removeFromEnd:
+ cleaned = JsonDataExtractor._removeIncompleteFromEnd(normalized)
+ else:
+ cleaned = JsonDataExtractor._removeIncompleteFromBeginning(normalized)
+
+ if cleaned:
+ # Close structures and try to parse
+ closed = closeJsonStructures(cleaned)
+ parsed, parseErr2, _ = tryParseJson(closed)
+ if parseErr2 is None and parsed is not None:
+ if isinstance(parsed, dict):
+ finalResult = parsed
+ elif isinstance(parsed, list):
+ finalResult = {"elements": parsed}
+ else:
+ finalResult = {"elements": [parsed]} if parsed else {}
+
+ if mergeId:
+ JsonMergeLogger.logExtraction("Remove incomplete + close", True, finalResult)
+ JsonMergeLogger.logStep("EXTRACTION", "Remove incomplete + close successful", finalResult)
+
+ return finalResult if finalResult else {}
+
+ # Return empty dict if nothing worked
+ if mergeId:
+ JsonMergeLogger.logStep("EXTRACTION", "No data extracted", {}, error="All strategies failed")
+ return {}
+
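+    # Illustrative sketch of the intended behavior, assuming the jsonUtils
+    # helpers (normalizeJsonText, stripCodeFences, tryParseJson) act as plain
+    # normalize-and-parse on input like this. A fragment truncated inside a
+    # string value keeps its complete key-value pairs and drops the dangling one:
+    #
+    #     fragment = '{"elements": [{"type": "text"}], "extra": "trunc'
+    #     JsonDataExtractor.extract(fragment)
+    #     # -> {"elements": [{"type": "text"}]}
+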
+ @staticmethod
+ def _removeIncompleteFromEnd(jsonString: str) -> str:
+ """
+ Remove incomplete parts from the END of JSON string.
+ Goes through structure level by level, keeps complete elements, removes incomplete ones at the end.
+ """
+ # Find first '{' or '[' to start
+ startIdx = -1
+ for i, char in enumerate(jsonString):
+ if char in '{[':
+ startIdx = i
+ break
+
+ if startIdx == -1:
+ return ""
+
+ # Remove incomplete parts from end recursively
+ cleaned = JsonDataExtractor._cleanJsonFromEnd(jsonString[startIdx:])
+ return cleaned
+
+ @staticmethod
+ def _removeIncompleteFromBeginning(jsonString: str) -> str:
+ """
+ Remove incomplete parts from the BEGINNING of JSON string.
+ Finds where valid JSON starts and removes everything before it.
+ """
+ # Find first '{' or '[' to start
+ startIdx = -1
+ for i, char in enumerate(jsonString):
+ if char in '{[':
+ startIdx = i
+ break
+
+ if startIdx == -1:
+ return ""
+
+ # Return from start position - beginning cleanup is just finding the start
+ return jsonString[startIdx:]
+
+ @staticmethod
+ def _cleanJsonFromEnd(jsonStr: str) -> str:
+ """
+ Recursively clean JSON from the END: keep complete elements, remove incomplete ones at the end.
+ Goes through structure level by level.
+ """
+        # Try to parse as-is first; if it already parses there is nothing to clean
+        try:
+            json.loads(jsonStr)
+            return jsonStr
+        except Exception:
+            pass
+
+ # If dict: go through each key-value pair, remove incomplete ones at the end
+ if jsonStr.strip().startswith('{'):
+ return JsonDataExtractor._cleanDictFromEnd(jsonStr)
+
+ # If array: go through each element, remove incomplete ones at the end
+ if jsonStr.strip().startswith('['):
+ return JsonDataExtractor._cleanArrayFromEnd(jsonStr)
+
+ return ""
+
+ @staticmethod
+ def _cleanDictFromEnd(jsonStr: str) -> str:
+ """Clean dict from END: keep complete key-value pairs, remove incomplete ones at the end."""
+ if not jsonStr.strip().startswith('{'):
+ return ""
+
+ result = ['{']
+ i = 1 # Skip opening '{'
+ first = True
+
+ while i < len(jsonStr):
+ # Skip whitespace
+ while i < len(jsonStr) and jsonStr[i] in ' \n\r\t':
+ i += 1
+
+ if i >= len(jsonStr):
+ break
+
+ # Check if we hit closing brace
+ if jsonStr[i] == '}':
+ break
+
+ # Skip comma
+ if jsonStr[i] == ',':
+ i += 1
+ continue
+
+ # Try to extract key-value pair
+ keyStart = i
+ # Find key (string)
+ if jsonStr[i] == '"':
+ i += 1
+ while i < len(jsonStr) and jsonStr[i] != '"':
+ if jsonStr[i] == '\\':
+ i += 2
+ else:
+ i += 1
+ if i < len(jsonStr):
+ i += 1 # Skip closing quote
+ else:
+ # Invalid key - stop here (incomplete at end)
+ break
+
+ # Skip whitespace and colon
+ while i < len(jsonStr) and jsonStr[i] in ' \n\r\t:':
+ i += 1
+
+ if i >= len(jsonStr):
+ break
+
+ # Try to extract value
+ valueStart = i
+ valueEnd = JsonDataExtractor._findCompleteValue(jsonStr, i)
+
+ if valueEnd > valueStart:
+ # Try to parse this key-value pair
+ pairStr = jsonStr[keyStart:valueEnd]
+ try:
+ # Test if it's valid JSON
+ testStr = '{' + pairStr + '}'
+ json.loads(testStr)
+ # Valid pair - add it
+ if not first:
+ result.append(',')
+ result.append(pairStr)
+ first = False
+ i = valueEnd
+ except Exception:
+ # Invalid pair - stop here (incomplete at end)
+ break
+ else:
+ # Incomplete value - stop here (incomplete at end)
+ break
+
+ result.append('}')
+ return ''.join(result)
+
+ @staticmethod
+ def _cleanArrayFromEnd(jsonStr: str) -> str:
+ """Clean array from END: keep complete elements, remove incomplete ones at the end."""
+ if not jsonStr.strip().startswith('['):
+ return ""
+
+ result = ['[']
+ i = 1 # Skip opening '['
+ first = True
+
+ while i < len(jsonStr):
+ # Skip whitespace
+ while i < len(jsonStr) and jsonStr[i] in ' \n\r\t':
+ i += 1
+
+ if i >= len(jsonStr):
+ break
+
+ # Check if we hit closing bracket
+ if jsonStr[i] == ']':
+ break
+
+ # Skip comma
+ if jsonStr[i] == ',':
+ i += 1
+ continue
+
+ # Try to extract element
+ elemStart = i
+ elemEnd = JsonDataExtractor._findCompleteValue(jsonStr, i)
+
+ if elemEnd > elemStart:
+ # Try to parse this element
+ elemStr = jsonStr[elemStart:elemEnd]
+ try:
+ # Test if it's valid JSON
+ json.loads(elemStr)
+ # Valid element - add it
+ if not first:
+ result.append(',')
+ result.append(elemStr)
+ first = False
+ i = elemEnd
+ except Exception:
+ # Invalid element - stop here (incomplete at end)
+ break
+ else:
+ # Incomplete element - stop here (incomplete at end)
+ break
+
+ result.append(']')
+ return ''.join(result)
+
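+    # Example of the end-cleaning helpers (illustrative):
+    #
+    #     JsonDataExtractor._cleanArrayFromEnd('[1, 2, {"x":')
+    #     # -> '[1,2]'  (the dangling object is dropped)
+    #     JsonDataExtractor._cleanDictFromEnd('{"a": 1, "b": {"c":')
+    #     # -> '{"a": 1}'  (the incomplete "b" pair is dropped)
+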
+ @staticmethod
+ def _findCompleteValue(jsonStr: str, start: int) -> int:
+ """Find the end of a complete JSON value starting at start position."""
+ if start >= len(jsonStr):
+ return start
+
+ i = start
+
+ # Skip whitespace
+ while i < len(jsonStr) and jsonStr[i] in ' \n\r\t':
+ i += 1
+
+ if i >= len(jsonStr):
+ return start
+
+ char = jsonStr[i]
+
+ # String
+ if char == '"':
+ i += 1
+ while i < len(jsonStr):
+ if jsonStr[i] == '\\':
+ i += 2
+ elif jsonStr[i] == '"':
+ return i + 1
+ else:
+ i += 1
+ return start # Incomplete string
+
+ # Number, boolean, null
+ if char in '-0123456789tfn':
+ while i < len(jsonStr) and jsonStr[i] not in ',}]':
+ i += 1
+ return i
+
+ # Object
+ if char == '{':
+ braceCount = 1
+ i += 1
+ while i < len(jsonStr) and braceCount > 0:
+ if jsonStr[i] == '\\':
+ i += 2
+ elif jsonStr[i] == '"':
+ # Skip string
+ i += 1
+ while i < len(jsonStr):
+ if jsonStr[i] == '\\':
+ i += 2
+ elif jsonStr[i] == '"':
+ i += 1
+ break
+ else:
+ i += 1
+ elif jsonStr[i] == '{':
+ braceCount += 1
+ i += 1
+ elif jsonStr[i] == '}':
+ braceCount -= 1
+ i += 1
+ else:
+ i += 1
+ if braceCount == 0:
+ return i
+ return start # Incomplete object
+
+ # Array
+ if char == '[':
+ bracketCount = 1
+ i += 1
+ while i < len(jsonStr) and bracketCount > 0:
+ if jsonStr[i] == '\\':
+ i += 2
+ elif jsonStr[i] == '"':
+ # Skip string
+ i += 1
+ while i < len(jsonStr):
+ if jsonStr[i] == '\\':
+ i += 2
+ elif jsonStr[i] == '"':
+ i += 1
+ break
+ else:
+ i += 1
+ elif jsonStr[i] == '[':
+ bracketCount += 1
+ i += 1
+ elif jsonStr[i] == ']':
+ bracketCount -= 1
+ i += 1
+ else:
+ i += 1
+ if bracketCount == 0:
+ return i
+ return start # Incomplete array
+
+ return start
+
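+    # _findCompleteValue scans a single JSON value and returns the index just
+    # past its end, or the start index if the value is incomplete, e.g.:
+    #
+    #     JsonDataExtractor._findCompleteValue('"abc", 1', 0)   # -> 5
+    #     JsonDataExtractor._findCompleteValue('{"a": 1', 0)    # -> 0
+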
+ @staticmethod
+ def _extractAllCompleteObjects(jsonString: str) -> List[Dict[str, Any]]:
+ """
+ Extract ALL complete objects from JSON string using balanced brace matching.
+ Ignores incomplete objects at the end.
+
+ Core principle: Every fragment can be cut anywhere - extract only complete objects.
+ """
+ foundObjs = []
+ braceCount = 0
+ startPos = -1
+
+ for i, char in enumerate(jsonString):
+ if char == '{':
+ if braceCount == 0:
+ startPos = i
+ braceCount += 1
+ elif char == '}':
+ braceCount -= 1
+ if braceCount == 0 and startPos >= 0:
+ # Found a complete object
+ objStr = jsonString[startPos:i+1]
+ try:
+ obj = json.loads(objStr)
+ if isinstance(obj, dict) and obj:
+ foundObjs.append(obj)
+ except Exception:
+ # Not valid JSON - skip it
+ pass
+ startPos = -1
+ elif braceCount < 0:
+ # Unbalanced - reset
+ braceCount = 0
+ startPos = -1
+
+ # If we end with an incomplete object (startPos >= 0 and braceCount > 0), ignore it
+ # It will be in the next fragment
+
+ return foundObjs
+
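+    # Balanced-brace extraction in action (illustrative):
+    #
+    #     JsonDataExtractor._extractAllCompleteObjects('{"a": 1} noise {"b": 2} {"c":')
+    #     # -> [{"a": 1}, {"b": 2}]  (the trailing incomplete object is ignored)
+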
+ @staticmethod
+ def _extractElements(jsonString: str) -> List[Dict[str, Any]]:
+ """Extract elements array from JSON string - extracts ALL complete elements."""
+ elements = []
+
+ # Pattern 1: Look for "elements": [...] (including incomplete at end)
+ elementsPattern = r'"elements"\s*:\s*\[(.*)'
+ match = re.search(elementsPattern, jsonString, re.DOTALL)
+ if match:
+ elementsContent = match.group(1)
+ # Extract ALL complete element objects using balanced brace matching
+ braceCount = 0
+ startPos = -1
+ for i, char in enumerate(elementsContent):
+ if char == '{':
+ if braceCount == 0:
+ startPos = i
+ braceCount += 1
+ elif char == '}':
+ braceCount -= 1
+ if braceCount == 0 and startPos >= 0:
+ elementStr = elementsContent[startPos:i+1]
+ try:
+ element = json.loads(elementStr)
+ if isinstance(element, dict):
+ elements.append(element)
+ except Exception:
+ # Try to extract table rows from incomplete element
+ rows = JsonDataExtractor._extractTableRowsFromElement(elementStr)
+ if rows:
+ elements.append({
+ "type": "table",
+ "content": {
+ "rows": rows
+ }
+ })
+ startPos = -1
+ elif braceCount < 0:
+ break # Unbalanced - stop
+
+ # Pattern 2: Look for table structure directly (even if incomplete)
+ if not elements:
+ # Look for "type": "table" pattern
+ tablePattern = r'"type"\s*:\s*"table"[^}]*"rows"\s*:\s*\[(.*?)(?:\]|$)'
+ tableMatch = re.search(tablePattern, jsonString, re.DOTALL)
+ if tableMatch:
+ rowsContent = tableMatch.group(1)
+ rows = JsonDataExtractor._extractRowsFromContent(rowsContent)
+ if rows:
+ elements.append({
+ "type": "table",
+ "content": {
+ "rows": rows
+ }
+ })
+
+ # Pattern 3: Look for table rows directly (without structure)
+ if not elements:
+ rows = JsonDataExtractor._extractTableRows(jsonString)
+ if rows:
+ elements.append({
+ "type": "table",
+ "content": {
+ "rows": rows
+ }
+ })
+
+ return elements
+
+ @staticmethod
+ def _extractTableRowsFromElement(elementStr: str) -> List[List[str]]:
+ """Extract table rows from incomplete element string."""
+ # Look for rows array in element
+ rowsPattern = r'"rows"\s*:\s*\[(.*?)(?:\]|$)'
+ match = re.search(rowsPattern, elementStr, re.DOTALL)
+ if match:
+ return JsonDataExtractor._extractRowsFromContent(match.group(1))
+ return []
+
+ @staticmethod
+ def _extractRowsFromContent(rowsContent: str) -> List[List[str]]:
+ """Extract rows from rows content string."""
+ rows = []
+ # Extract all array patterns: ["value1", "value2"]
+ # Use non-greedy matching but ensure we get complete arrays
+ arrayPattern = r'\[(.*?)\]'
+ arrayMatches = re.findall(arrayPattern, rowsContent)
+ for arrayContent in arrayMatches:
+ # Extract cells - handle both quoted strings and numbers
+ # First try to find quoted strings
+ cellPattern = r'"([^"]*)"'
+ cells = re.findall(cellPattern, arrayContent)
+ # If no quoted strings, try numbers or other values
+ if not cells:
+ # Try to find any values (numbers, booleans, etc.)
+ valuePattern = r'(-?\d+\.?\d*|true|false|null)'
+ cells = re.findall(valuePattern, arrayContent)
+ # Only add rows with at least 1 cell (allow single-column tables)
+ if len(cells) >= 1:
+ rows.append(cells)
+ return rows
+
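+    # Row extraction tolerates a truncated trailing row (illustrative):
+    #
+    #     JsonDataExtractor._extractRowsFromContent('["a", "1"], ["b", "2')
+    #     # -> [["a", "1"]]
+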
+ @staticmethod
+ def _extractTableRows(jsonString: str) -> List[List[str]]:
+ """Extract table rows from JSON string using multiple strategies."""
+ rows = []
+
+ # Strategy 1: Look for "rows": [[...], [...]]
+ rowsPattern = r'"rows"\s*:\s*\[(.*?)(?:\]|$)'
+ match = re.search(rowsPattern, jsonString, re.DOTALL)
+ if match:
+ rowsContent = match.group(1)
+ rows = JsonDataExtractor._extractRowsFromContent(rowsContent)
+ if rows:
+ return rows
+
+ # Strategy 2: Look for standalone array patterns ["value1", "value2"]
+ # Pattern for complete arrays with 2 columns
+ completeArrayPattern = r'\["([^"]*)",\s*"([^"]*)"\]'
+ matches = re.findall(completeArrayPattern, jsonString)
+ if len(matches) >= 2: # Need at least 2 rows to be confident
+ return [[m[0], m[1]] for m in matches]
+
+ # Strategy 3: Extract any array patterns (more lenient)
+ # Find all [ ... ] patterns that contain quoted strings
+ allArrays = re.findall(r'\[([^\]]*)\]', jsonString)
+ for arrayContent in allArrays:
+ # Extract quoted strings
+ cells = re.findall(r'"([^"]*)"', arrayContent)
+ if len(cells) >= 2: # At least 2 columns
+ rows.append(cells)
+
+ # Only return if we have multiple rows (likely a table)
+ if len(rows) >= 2:
+ return rows
+
+ return []
+
+ @staticmethod
+ def _extractDocuments(jsonString: str) -> List[Dict[str, Any]]:
+ """
+ Extract documents structure from JSON string - extracts ALL complete documents/chapters/sections.
+ Ignores incomplete ones at the end.
+
+ Core principle: Fragment can be cut anywhere - extract only complete objects.
+ """
+ documents = []
+
+ # Pattern 1: Look for "documents": [...] structure (including incomplete at end)
+ documentsPattern = r'"documents"\s*:\s*\[(.*)'
+ match = re.search(documentsPattern, jsonString, re.DOTALL)
+ if match:
+ documentsContent = match.group(1)
+ # Extract ALL complete document objects using balanced brace matching
+ braceCount = 0
+ startPos = -1
+ for i, char in enumerate(documentsContent):
+ if char == '{':
+ if braceCount == 0:
+ startPos = i
+ braceCount += 1
+ elif char == '}':
+ braceCount -= 1
+ if braceCount == 0 and startPos >= 0:
+ # Found a complete document object
+ docStr = documentsContent[startPos:i+1]
+ try:
+ doc = json.loads(docStr)
+ if isinstance(doc, dict):
+ # Extract chapters/sections from document
+ chapters = JsonDataExtractor._extractChaptersFromDocument(docStr)
+ sections = JsonDataExtractor._extractSectionsFromDocument(docStr)
+ if chapters:
+ doc["chapters"] = chapters
+ if sections:
+ doc["sections"] = sections
+ if doc:
+ documents.append(doc)
+ except Exception:
+ # Not valid JSON - try to extract chapters/sections directly
+ chapters = JsonDataExtractor._extractChaptersFromDocument(docStr)
+ sections = JsonDataExtractor._extractSectionsFromDocument(docStr)
+ if chapters or sections:
+ doc = {}
+ if chapters:
+ doc["chapters"] = chapters
+ if sections:
+ doc["sections"] = sections
+ if doc:
+ documents.append(doc)
+ startPos = -1
+ elif braceCount < 0:
+ break
+
+ # If we end with an incomplete document (startPos >= 0 and braceCount > 0), ignore it
+ # It will be in the next fragment
+
+ if documents:
+ return documents
+
+ # Pattern 2: Look for "chapters": [...] pattern directly (fragment might start mid-document)
+ chapters = JsonDataExtractor._extractChaptersFromString(jsonString)
+ if chapters:
+ documents.append({"chapters": chapters})
+
+ # Pattern 3: Look for "sections": [...] pattern directly
+ sections = JsonDataExtractor._extractSectionsFromString(jsonString)
+ if sections:
+ documents.append({"sections": sections})
+
+ return documents
+
+ @staticmethod
+ def _extractChaptersFromDocument(docStr: str) -> List[Dict[str, Any]]:
+ """Extract chapters array from document string."""
+ return JsonDataExtractor._extractChaptersFromString(docStr)
+
+ @staticmethod
+ def _extractChaptersFromString(jsonString: str) -> List[Dict[str, Any]]:
+ """
+ Extract chapters array from JSON string - extracts ALL complete chapters.
+ Ignores incomplete chapters at the end.
+
+ Core principle: Fragment can be cut anywhere - extract only complete objects.
+ """
+ chapters = []
+
+ # Look for "chapters": [...] pattern (including incomplete at end)
+ chaptersPattern = r'"chapters"\s*:\s*\[(.*)'
+ match = re.search(chaptersPattern, jsonString, re.DOTALL)
+ if match:
+ chaptersContent = match.group(1)
+ # Extract ALL complete chapter objects using balanced brace matching
+ braceCount = 0
+ startPos = -1
+ for i, char in enumerate(chaptersContent):
+ if char == '{':
+ if braceCount == 0:
+ startPos = i
+ braceCount += 1
+ elif char == '}':
+ braceCount -= 1
+ if braceCount == 0 and startPos >= 0:
+ # Found a complete chapter object
+ chapterStr = chaptersContent[startPos:i+1]
+ try:
+ chapter = json.loads(chapterStr)
+ if isinstance(chapter, dict):
+ chapters.append(chapter)
+ except Exception:
+ # Not valid JSON - skip it (incomplete chapter)
+ pass
+ startPos = -1
+ elif braceCount < 0:
+ # Unbalanced - stop here
+ break
+
+ # If we end with an incomplete chapter (startPos >= 0 and braceCount > 0), ignore it
+ # It will be in the next fragment
+
+ # Also try to extract chapters that might be standalone (fragment starts mid-array)
+ # Look for complete chapter objects anywhere in the string
+ if not chapters:
+ # Try to find complete chapter objects using balanced brace matching
+ allObjs = JsonDataExtractor._extractAllCompleteObjects(jsonString)
+ # Filter for objects that look like chapters (have id and title)
+ for obj in allObjs:
+ if isinstance(obj, dict) and "id" in obj and "title" in obj:
+ chapters.append(obj)
+
+ return chapters
+
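+    # Chapter extraction keeps only complete chapter objects (illustrative):
+    #
+    #     JsonDataExtractor._extractChaptersFromString(
+    #         '"chapters": [{"id": "c1", "title": "Intro"}, {"id": "c2"')
+    #     # -> [{"id": "c1", "title": "Intro"}]
+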
+ @staticmethod
+ def _extractSectionsFromDocument(docStr: str) -> List[Dict[str, Any]]:
+ """Extract sections array from document string."""
+ return JsonDataExtractor._extractSectionsFromString(docStr)
+
+ @staticmethod
+ def _extractSectionsFromString(jsonString: str) -> List[Dict[str, Any]]:
+ """Extract sections array from JSON string, even if incomplete."""
+ sections = []
+
+ # Look for "sections": [...]
+ sectionsPattern = r'"sections"\s*:\s*\[(.*?)(?:\]|$)'
+ match = re.search(sectionsPattern, jsonString, re.DOTALL)
+ if match:
+ sectionsContent = match.group(1)
+ # Extract section objects using balanced brace matching
+ braceCount = 0
+ startPos = -1
+ for i, char in enumerate(sectionsContent):
+ if char == '{':
+ if braceCount == 0:
+ startPos = i
+ braceCount += 1
+ elif char == '}':
+ braceCount -= 1
+ if braceCount == 0 and startPos >= 0:
+ sectionStr = sectionsContent[startPos:i+1]
+ try:
+ section = json.loads(sectionStr)
+ if isinstance(section, dict):
+ sections.append(section)
+ except Exception:
+ # Incomplete section - try to extract what we can
+ idMatch = re.search(r'"id"\s*:\s*"([^"]*)"', sectionStr)
+ contentTypeMatch = re.search(r'"content_type"\s*:\s*"([^"]*)"', sectionStr)
+ if idMatch or contentTypeMatch:
+ section = {}
+ if idMatch:
+ section["id"] = idMatch.group(1)
+ if contentTypeMatch:
+ section["content_type"] = contentTypeMatch.group(1)
+ if section:
+ sections.append(section)
+ startPos = -1
+
+ return sections
+
+ @staticmethod
+ def _extractFiles(jsonString: str) -> List[Dict[str, Any]]:
+ """Extract files array from JSON string, even if incomplete."""
+ files = []
+
+ # Look for "files": [...]
+ filesPattern = r'"files"\s*:\s*\[(.*?)(?:\]|$)'
+ match = re.search(filesPattern, jsonString, re.DOTALL)
+ if match:
+ filesContent = match.group(1)
+ # Extract file objects using balanced brace matching
+ braceCount = 0
+ startPos = -1
+ for i, char in enumerate(filesContent):
+ if char == '{':
+ if braceCount == 0:
+ startPos = i
+ braceCount += 1
+ elif char == '}':
+ braceCount -= 1
+ if braceCount == 0 and startPos >= 0:
+ fileStr = filesContent[startPos:i+1]
+ try:
+ fileObj = json.loads(fileStr)
+ if isinstance(fileObj, dict):
+ files.append(fileObj)
+ except Exception:
+ # Incomplete file - try to extract what we can
+ idMatch = re.search(r'"id"\s*:\s*"([^"]*)"', fileStr)
+ filenameMatch = re.search(r'"filename"\s*:\s*"([^"]*)"', fileStr)
+ if idMatch or filenameMatch:
+ fileObj = {}
+ if idMatch:
+ fileObj["id"] = idMatch.group(1)
+ if filenameMatch:
+ fileObj["filename"] = filenameMatch.group(1)
+ if fileObj:
+ files.append(fileObj)
+ startPos = -1
+
+ return files
+
+ @staticmethod
+ def _extractImages(jsonString: str) -> List[Dict[str, Any]]:
+ """Extract images array from JSON string, even if incomplete."""
+ images = []
+
+ # Look for "images": [...]
+ imagesPattern = r'"images"\s*:\s*\[(.*?)(?:\]|$)'
+ match = re.search(imagesPattern, jsonString, re.DOTALL)
+ if match:
+ imagesContent = match.group(1)
+ # Extract image objects using balanced brace matching
+ braceCount = 0
+ startPos = -1
+ for i, char in enumerate(imagesContent):
+ if char == '{':
+ if braceCount == 0:
+ startPos = i
+ braceCount += 1
+ elif char == '}':
+ braceCount -= 1
+ if braceCount == 0 and startPos >= 0:
+ imageStr = imagesContent[startPos:i+1]
+ try:
+ image = json.loads(imageStr)
+ if isinstance(image, dict):
+ images.append(image)
+ except Exception:
+ # Incomplete image - try to extract what we can
+ idMatch = re.search(r'"id"\s*:\s*"([^"]*)"', imageStr)
+ urlMatch = re.search(r'"url"\s*:\s*"([^"]*)"', imageStr)
+ if idMatch or urlMatch:
+ image = {}
+ if idMatch:
+ image["id"] = idMatch.group(1)
+ if urlMatch:
+ image["url"] = urlMatch.group(1)
+ if image:
+ images.append(image)
+ startPos = -1
+
+ return images
+
+
+class JsonStructureDetector:
+ """Detects JSON structure type from extracted data."""
+
+ @staticmethod
+ def detect(data: Dict[str, Any], mergeId: Optional[str] = None) -> str:
+ """
+ Detect structure type from data - GENERIC approach.
+
+ Only checks for top-level keys, no content analysis.
+
+ Returns:
+ Structure type: "elements", "documents", "files", "images", or "unknown"
+ """
+ if "elements" in data:
+ structureType = "elements"
+ elif "documents" in data:
+ structureType = "documents"
+ elif "files" in data:
+ structureType = "files"
+ elif "images" in data:
+ structureType = "images"
+ else:
+ # Unknown structure - will be handled generically
+ structureType = "unknown"
+
+ if mergeId:
+ JsonMergeLogger.logStep("DETECTION", f"Detected structure type: {structureType}", structureType)
+
+ return structureType
+
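+    # Detection is purely key-based (illustrative):
+    #
+    #     JsonStructureDetector.detect({"documents": []})   # -> "documents"
+    #     JsonStructureDetector.detect({"foo": "bar"})      # -> "unknown"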
+
+class JsonDataMerger:
+ """Merges JSON data intelligently with overlap detection."""
+
+ @staticmethod
+ def merge(
+ accumulated: Dict[str, Any],
+ newFragment: Dict[str, Any],
+ structureType: str,
+ mergeId: Optional[str] = None
+ ) -> Dict[str, Any]:
+ """
+ Merge two JSON data structures.
+
+ Args:
+ accumulated: Previously accumulated data
+ newFragment: New fragment data
+ structureType: Detected structure type
+ mergeId: Optional merge ID for logging
+
+ Returns:
+ Merged data structure
+ """
+ if mergeId:
+ JsonMergeLogger.logStep("MERGING", f"Merging {structureType} structures", {
+ "acc_keys": list(accumulated.keys()) if accumulated else [],
+ "frag_keys": list(newFragment.keys()) if newFragment else []
+ })
+
+ if not accumulated:
+ if mergeId:
+ JsonMergeLogger.logStep("MERGING", "No accumulated data, returning fragment", newFragment)
+ return newFragment if newFragment else {}
+ if not newFragment:
+ if mergeId:
+ JsonMergeLogger.logStep("MERGING", "No fragment data, returning accumulated", accumulated)
+ return accumulated
+
+ # Merge based on structure type
+ if structureType == "elements":
+ result = JsonDataMerger._mergeElements(accumulated, newFragment)
+ elif structureType == "documents":
+ result = JsonDataMerger._mergeDocuments(accumulated, newFragment)
+ elif structureType == "files":
+ result = JsonDataMerger._mergeFiles(accumulated, newFragment)
+ elif structureType == "images":
+ result = JsonDataMerger._mergeImages(accumulated, newFragment)
+ else:
+ # Unknown structure - try to merge generically
+ result = JsonDataMerger._mergeGeneric(accumulated, newFragment)
+
+ if mergeId:
+ JsonMergeLogger.logStep("MERGING", f"Merged {structureType} structures", result)
+
+ return result
+
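+    # Dispatch example (illustrative): two non-overlapping element lists are
+    # simply concatenated:
+    #
+    #     acc  = {"elements": [{"type": "text", "content": "A"}]}
+    #     frag = {"elements": [{"type": "text", "content": "B"}]}
+    #     JsonDataMerger.merge(acc, frag, "elements")
+    #     # -> {"elements": [{"type": "text", "content": "A"},
+    #     #                  {"type": "text", "content": "B"}]}
+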
+ @staticmethod
+ def _mergeElements(accumulated: Dict[str, Any], newFragment: Dict[str, Any]) -> Dict[str, Any]:
+ """Merge elements structures."""
+ accElements = accumulated.get("elements", [])
+ fragElements = newFragment.get("elements", [])
+
+ if not accElements:
+ return {"elements": fragElements} if fragElements else accumulated
+ if not fragElements:
+ return {"elements": accElements}
+
+ # Merge elements with overlap detection
+ mergedElements = JsonDataMerger._mergeElementList(accElements, fragElements)
+
+ return {"elements": mergedElements}
+
+ @staticmethod
+ def _mergeElementList(accElements: List[Dict[str, Any]], fragElements: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+ """Merge two element lists with overlap detection."""
+ if not accElements:
+ return fragElements
+ if not fragElements:
+ return accElements
+
+ # Special handling: if both have table elements, merge them intelligently
+ accTables = [e for e in accElements if isinstance(e, dict) and e.get("type") == "table"]
+ fragTables = [e for e in fragElements if isinstance(e, dict) and e.get("type") == "table"]
+
+ if accTables and fragTables:
+ # Merge table elements
+ mergedTable = JsonDataMerger._mergeTableElements(accTables[0], fragTables[0])
+ if mergedTable:
+ # Replace tables with merged table
+ otherAccElements = [e for e in accElements if not (isinstance(e, dict) and e.get("type") == "table")]
+ otherFragElements = [e for e in fragElements if not (isinstance(e, dict) and e.get("type") == "table")]
+ return otherAccElements + [mergedTable] + otherFragElements
+
+ # Find overlap by comparing elements
+ overlapStart = JsonDataMerger._findOverlap(accElements, fragElements, None, "elements")
+
+ if overlapStart > 0:
+ # Found overlap - remove overlapping elements from fragment
+ merged = accElements + fragElements[overlapStart:]
+ return merged
+ else:
+ # No overlap - append all
+ return accElements + fragElements
+
+ @staticmethod
+ def _mergeTableElements(accTable: Dict[str, Any], fragTable: Dict[str, Any]) -> Dict[str, Any]:
+ """Merge two table elements by merging their rows."""
+ accRows = JsonDataMerger._getTableRows(accTable)
+ fragRows = JsonDataMerger._getTableRows(fragTable)
+
+ if not accRows:
+ return fragTable
+ if not fragRows:
+ return accTable
+
+ # Find overlap in rows
+ overlapStart = JsonDataMerger._findOverlap(accRows, fragRows, None, "table_rows")
+
+ # Merge rows
+ mergedRows = accRows + fragRows[overlapStart:] if overlapStart > 0 else accRows + fragRows
+
+ # Build merged table
+ mergedTable = accTable.copy()
+ content = mergedTable.get("content", {})
+ if not isinstance(content, dict):
+ content = {}
+ content["rows"] = mergedRows
+
+ # Preserve headers
+ if "headers" not in content:
+ fragContent = fragTable.get("content", {})
+ if isinstance(fragContent, dict) and "headers" in fragContent:
+ content["headers"] = fragContent["headers"]
+
+ mergedTable["content"] = content
+ return mergedTable
+
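+    # Table merge with a one-row overlap (illustrative):
+    #
+    #     acc  = {"type": "table", "content": {"rows": [["a", "1"], ["b", "2"]]}}
+    #     frag = {"type": "table", "content": {"rows": [["b", "2"], ["c", "3"]]}}
+    #     JsonDataMerger._mergeTableElements(acc, frag)
+    #     # -> rows become [["a", "1"], ["b", "2"], ["c", "3"]]
+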
+ @staticmethod
+ def _findOverlap(accList: List[Any], fragList: List[Any], mergeId: Optional[str] = None, overlapType: str = "generic") -> int:
+ """Find overlap between two lists. Returns index where overlap starts in fragList."""
+ if not accList or not fragList:
+ if mergeId:
+ JsonMergeLogger.logOverlap(overlapType, 0)
+ return 0
+
+ # Try to find longest common suffix/prefix
+ maxOverlap = min(len(accList), len(fragList))
+
+ for overlapLen in range(maxOverlap, 0, -1):
+ accSuffix = accList[-overlapLen:]
+ fragPrefix = fragList[:overlapLen]
+
+ # Compare elements
+ if JsonDataMerger._listsEqual(accSuffix, fragPrefix):
+ if mergeId:
+ JsonMergeLogger.logOverlap(overlapType, overlapLen, accSuffix, fragPrefix)
+ return overlapLen
+
+ if mergeId:
+ JsonMergeLogger.logOverlap(overlapType, 0)
+ return 0
+
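+    # Overlap is the longest accumulated suffix equal to a fragment prefix:
+    #
+    #     JsonDataMerger._findOverlap([1, 2, 3], [3, 4])   # -> 1
+    #     JsonDataMerger._findOverlap([1, 2], [3, 4])      # -> 0
+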
+ @staticmethod
+ def _listsEqual(list1: List[Any], list2: List[Any]) -> bool:
+ """Check if two lists are equal (deep comparison for dicts)."""
+ if len(list1) != len(list2):
+ return False
+
+ for i in range(len(list1)):
+ if isinstance(list1[i], dict) and isinstance(list2[i], dict):
+ # Compare dicts by comparing their content
+ if not JsonDataMerger._dictsEqual(list1[i], list2[i]):
+ return False
+ elif list1[i] != list2[i]:
+ return False
+
+ return True
+
+ @staticmethod
+ def _dictsEqual(dict1: Dict[str, Any], dict2: Dict[str, Any]) -> bool:
+ """Check if two dicts are equal (comparing key content)."""
+ # For table elements, compare rows
+ if dict1.get("type") == "table" and dict2.get("type") == "table":
+ rows1 = JsonDataMerger._getTableRows(dict1)
+ rows2 = JsonDataMerger._getTableRows(dict2)
+ return rows1 == rows2
+
+ # For other elements, compare type and key content
+ if dict1.get("type") != dict2.get("type"):
+ return False
+
+ # Compare content
+ content1 = dict1.get("content", {})
+ content2 = dict2.get("content", {})
+
+ if isinstance(content1, dict) and isinstance(content2, dict):
+ # Compare rows for tables
+ if "rows" in content1 and "rows" in content2:
+ return content1["rows"] == content2["rows"]
+ # Compare items for lists
+ if "items" in content1 and "items" in content2:
+ return content1["items"] == content2["items"]
+
+ return dict1 == dict2
+
+ @staticmethod
+ def _getTableRows(element: Dict[str, Any]) -> List[List[str]]:
+ """Extract table rows from element."""
+ content = element.get("content", {})
+ if isinstance(content, dict):
+ return content.get("rows", [])
+ return element.get("rows", [])
+
+ @staticmethod
+ def _mergeDocuments(accumulated: Dict[str, Any], newFragment: Dict[str, Any]) -> Dict[str, Any]:
+ """Merge documents structures."""
+ accDocs = accumulated.get("documents", [])
+ fragDocs = newFragment.get("documents", [])
+
+ if not accDocs:
+ return {"documents": fragDocs} if fragDocs else accumulated
+ if not fragDocs:
+ return {"documents": accDocs}
+
+ # Merge documents (simplified - would need proper merging logic)
+ mergedDocs = accDocs + fragDocs
+ return {"documents": mergedDocs}
+
+ @staticmethod
+ def _mergeFiles(accumulated: Dict[str, Any], newFragment: Dict[str, Any]) -> Dict[str, Any]:
+ """Merge files structures."""
+ accFiles = accumulated.get("files", [])
+ fragFiles = newFragment.get("files", [])
+
+ if not accFiles:
+ return {"files": fragFiles} if fragFiles else accumulated
+ if not fragFiles:
+ return {"files": accFiles}
+
+ mergedFiles = accFiles + fragFiles
+ return {"files": mergedFiles}
+
+ @staticmethod
+ def _mergeImages(accumulated: Dict[str, Any], newFragment: Dict[str, Any]) -> Dict[str, Any]:
+ """Merge images structures."""
+ accImages = accumulated.get("images", [])
+ fragImages = newFragment.get("images", [])
+
+ if not accImages:
+ return {"images": fragImages} if fragImages else accumulated
+ if not fragImages:
+ return {"images": accImages}
+
+ mergedImages = accImages + fragImages
+ return {"images": mergedImages}
+
+ @staticmethod
+ def _mergeGeneric(accumulated: Dict[str, Any], newFragment: Dict[str, Any]) -> Dict[str, Any]:
+ """Generic merge for unknown structures."""
+ # Try to merge by combining keys
+ merged = accumulated.copy()
+ for key, value in newFragment.items():
+ if key in merged:
+ # Key exists - try to merge values
+ if isinstance(merged[key], list) and isinstance(value, list):
+ merged[key] = merged[key] + value
+ elif isinstance(merged[key], dict) and isinstance(value, dict):
+ merged[key] = JsonDataMerger._mergeGeneric(merged[key], value)
+ else:
+ merged[key] = value
+ else:
+ merged[key] = value
+
+ return merged
+
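+    # Generic merge concatenates lists and recurses into dicts (illustrative):
+    #
+    #     JsonDataMerger._mergeGeneric(
+    #         {"a": [1], "b": {"x": 1}},
+    #         {"a": [2], "b": {"y": 2}, "c": 3})
+    #     # -> {"a": [1, 2], "b": {"x": 1, "y": 2}, "c": 3}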
+
+class JsonResultBuilder:
+ """Builds final JSON result, ensuring it's always valid."""
+
+ @staticmethod
+ def build(mergedData: Dict[str, Any], structureType: str, mergeId: Optional[str] = None) -> str:
+ """
+ Build final JSON string from merged data.
+
+ Args:
+ mergedData: Merged data structure
+ structureType: Detected structure type
+
+ Returns:
+ Valid JSON string (never empty)
+ """
+ if not mergedData:
+ # Return empty structure based on type
+ if structureType == "elements":
+ return json.dumps({"elements": []}, indent=2, ensure_ascii=False)
+ elif structureType == "documents":
+ return json.dumps({"documents": [{}]}, indent=2, ensure_ascii=False)
+ elif structureType == "files":
+ return json.dumps({"files": []}, indent=2, ensure_ascii=False)
+ elif structureType == "images":
+ return json.dumps({"images": []}, indent=2, ensure_ascii=False)
+ else:
+ return json.dumps({}, indent=2, ensure_ascii=False)
+
+ # Ensure structure is correct - GENERIC approach
+ if structureType == "elements" and "elements" not in mergedData:
+ # Try to wrap data in elements structure
+ if isinstance(mergedData, dict):
+ # Generic: If it has any data, wrap it as an element
+ if mergedData:
+ mergedData = {"elements": [mergedData]}
+ if mergeId:
+ JsonMergeLogger.logStep("BUILDING", "Wrapping single object as element (generic)", mergedData)
+ else:
+ # Empty dict - return empty elements
+ mergedData = {"elements": []}
+
+ elif structureType == "documents" and "documents" not in mergedData:
+ # Try to wrap data in documents structure
+ if isinstance(mergedData, dict):
+ if mergedData:
+ # Generic: Wrap single object in documents structure
+ # Try to detect if it should be chapters or sections by checking accumulated data
+ # But for now, use generic approach: wrap in documents with a generic key
+ mergedData = {"documents": [mergedData]}
+ if mergeId:
+ JsonMergeLogger.logStep("BUILDING", "Wrapping single object in documents structure (generic)", mergedData)
+ else:
+ mergedData = {"documents": [{}]}
+
+ elif structureType == "files" and "files" not in mergedData:
+ # Try to wrap data in files structure
+ if isinstance(mergedData, dict):
+ if mergedData:
+ mergedData = {"files": [mergedData]}
+ if mergeId:
+ JsonMergeLogger.logStep("BUILDING", "Wrapping single object in files structure (generic)", mergedData)
+ else:
+ mergedData = {"files": []}
+
+ elif structureType == "images" and "images" not in mergedData:
+ # Try to wrap data in images structure
+ if isinstance(mergedData, dict):
+ if mergedData:
+ mergedData = {"images": [mergedData]}
+ if mergeId:
+ JsonMergeLogger.logStep("BUILDING", "Wrapping single object in images structure (generic)", mergedData)
+ else:
+ mergedData = {"images": []}
+
+ elif structureType == "unknown" and isinstance(mergedData, dict) and mergedData:
+ # Unknown structure but has data - wrap generically as elements
+ mergedData = {"elements": [mergedData]}
+ if mergeId:
+ JsonMergeLogger.logStep("BUILDING", "Unknown structure, wrapping as elements (generic)", mergedData)
+
+ # Clean data structure before serialization
+ cleanedData = JsonResultBuilder._cleanDataStructure(mergedData)
+
+ # Try to serialize
+ try:
+ jsonString = json.dumps(cleanedData, indent=2, ensure_ascii=False)
+
+ # Validate the JSON string by trying to parse it
+ try:
+ parsed, parseErr, _ = tryParseJson(jsonString)
+ if parseErr is None:
+ # Valid JSON - return it
+ return jsonString
+ else:
+ # Invalid JSON - try to repair
+ logger.warning(f"Generated JSON is invalid: {parseErr}, attempting repair")
+ repaired = closeJsonStructures(jsonString)
+ parsed2, parseErr2, _ = tryParseJson(repaired)
+ if parseErr2 is None:
+ return repaired
+ else:
+ # Repair failed - return minimal valid structure
+ logger.error(f"Repair failed: {parseErr2}, returning minimal structure")
+ return json.dumps({"elements": []}, indent=2, ensure_ascii=False)
+ except Exception as parseEx:
+ # Parse validation failed - try repair
+ logger.warning(f"Parse validation failed: {parseEx}, attempting repair")
+ try:
+ repaired = closeJsonStructures(jsonString)
+ parsed2, parseErr2, _ = tryParseJson(repaired)
+ if parseErr2 is None:
+ return repaired
+ except Exception:
+ pass
+ # Return minimal valid structure
+ return json.dumps({"elements": []}, indent=2, ensure_ascii=False)
+
+ except (TypeError, ValueError) as e:
+ logger.error(f"Error serializing JSON: {e}")
+ # Try to clean more aggressively and retry
+ try:
+ cleanedData2 = JsonResultBuilder._cleanDataStructure(cleanedData, aggressive=True)
+ jsonString = json.dumps(cleanedData2, indent=2, ensure_ascii=False)
+ # Validate
+ parsed, parseErr, _ = tryParseJson(jsonString)
+ if parseErr is None:
+ return jsonString
+ except Exception:
+ pass
+ # Fallback to empty structure
+ return json.dumps({"elements": []}, indent=2, ensure_ascii=False)
+ except Exception as e:
+ logger.error(f"Unexpected error building JSON: {e}")
+ # Fallback to empty structure
+ return json.dumps({"elements": []}, indent=2, ensure_ascii=False)
+
+ @staticmethod
+ def _cleanDataStructure(data: Any, aggressive: bool = False) -> Any:
+ """
+ Clean data structure to ensure it's JSON-serializable.
+
+ Removes None values, ensures lists contain only valid items,
+ and repairs incomplete structures.
+ """
+ if data is None:
+ return {} if aggressive else None
+
+ if isinstance(data, dict):
+ cleaned = {}
+ for key, value in data.items():
+ if value is None and aggressive:
+ continue # Skip None values in aggressive mode
+ cleaned[key] = JsonResultBuilder._cleanDataStructure(value, aggressive)
+ return cleaned
+
+ elif isinstance(data, list):
+ cleaned = []
+ for item in data:
+ cleanedItem = JsonResultBuilder._cleanDataStructure(item, aggressive)
+ if cleanedItem is not None or not aggressive:
+ cleaned.append(cleanedItem)
+ return cleaned
+
+ elif isinstance(data, (str, int, float, bool)):
+ return data
+
+ else:
+ # Unknown type - try to convert to string or skip
+ if aggressive:
+ return str(data)
+ return data
+
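+    # build() is meant to always return parseable JSON; an unwrapped object is
+    # wrapped under the detected top-level key (illustrative sketch, assuming
+    # tryParseJson accepts the serialized output):
+    #
+    #     JsonResultBuilder.build({"type": "text", "content": "hi"}, "elements")
+    #     # -> pretty-printed '{"elements": [{"type": "text", "content": "hi"}]}'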
+
+class ModularJsonMerger:
+ """
+ Modular JSON Merger - Main entry point.
+
+ Simple pipeline:
+ 1. Find overlap between JSON strings
+ 2. Merge strings together
+ 3. Parse and clean the merged JSON
+ """
+
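+    # Typical driver loop (sketch; fetchNextFragment is a hypothetical
+    # stand-in for whatever produces the next AI continuation):
+    #
+    #     accumulated = fetchNextFragment()
+    #     while True:
+    #         accumulated, hasOverlap = ModularJsonMerger.merge(
+    #             accumulated, fetchNextFragment())
+    #         if not hasOverlap:
+    #             break  # merge() closed the JSON; accumulated is final
+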
+ @staticmethod
+ def _findStringOverlap(accStr: str, fragStr: str, mergeId: Optional[str] = None) -> int:
+ """
+ Find overlap between two JSON strings - GENERIC solution.
+
+ Works for any JSON structure (arrays, objects, nested, minified, formatted).
+ Uses multiple strategies to find overlap regardless of JSON format.
+
+ Strategy:
+ 1. Exact suffix/prefix match (fastest, works for any format)
+ 2. Structure-aware: Find last complete JSON elements in accumulated that match start of fragment
+ 3. Line-based: If JSON is formatted, use line matching (for better performance)
+ 4. Partial match: Handle incomplete elements at cut point
+
+ Returns the length of the overlap (number of characters).
+ """
+ if not accStr or not fragStr:
+ if mergeId:
+ JsonMergeLogger.logOverlap("string", 0)
+ return 0
+
+ # Strategy 1: Try exact suffix/prefix match (fastest, works for any format)
+ maxOverlap = min(len(accStr), len(fragStr))
+
+ # Start from maximum possible overlap and work backwards
+ for overlapLen in range(maxOverlap, 0, -1):
+ accSuffix = accStr[-overlapLen:]
+ fragPrefix = fragStr[:overlapLen]
+
+ if accSuffix == fragPrefix:
+ if mergeId:
+ JsonMergeLogger.logOverlap("string (exact)", overlapLen, accSuffix[:200], fragPrefix[:200])
+ return overlapLen
+
+ # Strategy 2: Structure-aware overlap detection (GENERIC - works for any JSON structure)
+ # Find last complete JSON elements in accumulated and check if they appear at start of fragment
+ overlapLen = ModularJsonMerger._findStructureBasedOverlap(accStr, fragStr, mergeId)
+ if overlapLen > 0:
+ return overlapLen
+
+ # Strategy 3: Line-based overlap (works well for formatted JSON)
+ # Only use if JSON appears to be formatted (has newlines)
+ if '\n' in accStr and '\n' in fragStr:
+ overlapLen = ModularJsonMerger._findLineBasedOverlap(accStr, fragStr, mergeId)
+ if overlapLen > 0:
+ return overlapLen
+
+ # Strategy 4: Partial overlap (incomplete element at cut point)
+ overlapLen = ModularJsonMerger._findPartialOverlap(accStr, fragStr, mergeId)
+ if overlapLen > 0:
+ return overlapLen
+
+ if mergeId:
+ JsonMergeLogger.logOverlap("string", 0)
+ return 0
+
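+    # Strategy 1 in action (illustrative): the shared text '["b", "2"]' is the
+    # longest exact suffix/prefix match, so 10 chars of overlap are reported:
+    #
+    #     acc  = '[["a", "1"], ["b", "2"]'
+    #     frag = '["b", "2"], ["c", "3"]]'
+    #     ModularJsonMerger._findStringOverlap(acc, frag)   # -> 10
+    #     ModularJsonMerger._mergeStrings(acc, frag, 10)
+    #     # -> '[["a", "1"], ["b", "2"], ["c", "3"]]'
+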
+ @staticmethod
+ def _findStructureBasedOverlap(accStr: str, fragStr: str, mergeId: Optional[str] = None) -> int:
+ """
+ Find overlap by detecting complete JSON elements (structure-aware, GENERIC).
+
+ Works for ANY JSON structure:
+ - Arrays: Finds last complete array elements
+ - Objects: Finds last complete object properties
+ - Nested structures: Recursively finds complete elements
+ - Minified or formatted JSON: Structure-aware, not format-dependent
+ - Any use case: section_content, chapter_structure, code_structure, etc.
+
+ Strategy: Find last complete JSON elements in accumulated that match start of fragment.
+ Uses balanced bracket/brace matching to identify complete elements regardless of format.
+ """
+ accTrimmed = accStr.rstrip()
+ fragTrimmed = fragStr.lstrip()
+
+ if not accTrimmed or not fragTrimmed:
+ return 0
+
+ # Find last complete elements in accumulated by parsing backwards
+ # Look for complete array elements or object properties
+
+ # Strategy: Find where accumulated has complete elements at the end
+ # and check if fragment starts with the same elements
+
+ # Use a sliding window approach: check different suffix lengths from accumulated
+ maxCheckLength = min(2000, len(accTrimmed), len(fragTrimmed))
+
+ # Check in reverse order (largest to smallest) to find longest overlap first
+ for checkLen in range(maxCheckLength, 50, -5): # Step by 5 for performance
+ if checkLen > len(accTrimmed) or checkLen > len(fragTrimmed):
+ continue
+
+ accSuffix = accTrimmed[-checkLen:]
+ fragPrefix = fragTrimmed[:checkLen]
+
+ # Check if accSuffix ends with complete JSON element(s) and fragPrefix starts with same
+ # A complete element ends with proper closing brackets/braces
+
+ # Verify that accSuffix ends with complete structure
+ # and fragPrefix starts with the same structure
+ if ModularJsonMerger._isCompleteJsonElement(accSuffix) and \
+ ModularJsonMerger._startsWithSameElement(accSuffix, fragPrefix):
+ # Found overlap! Verify it's meaningful (not just whitespace)
+ if len(accSuffix.strip()) > 20:
+ if mergeId:
+ JsonMergeLogger.logOverlap("string (structure-based)", checkLen, accSuffix[:200], fragPrefix[:200])
+ return checkLen
+
+ # Alternative: Try to find common substring that represents complete elements
+ # Look for patterns like complete array rows or object properties
+ # Check last 500 chars of accumulated against first 500 chars of fragment
+ checkWindow = min(500, len(accTrimmed), len(fragTrimmed))
+ if checkWindow > 100:
+ accWindow = accTrimmed[-checkWindow:]
+ fragWindow = fragTrimmed[:checkWindow]
+
+ # Find longest common substring that represents complete elements
+ # Look for boundaries like ], [ or }, { or ", "
+ for i in range(checkWindow - 50, 50, -5):
+ accSub = accWindow[-i:]
+ fragSub = fragWindow[:i]
+
+ if accSub == fragSub:
+ # Check if it's a complete element boundary
+ if ModularJsonMerger._isCompleteElementBoundary(accSub):
+ if mergeId:
+ JsonMergeLogger.logOverlap("string (structure-boundary)", i, accSub[:200], fragSub[:200])
+ return i
+
+ return 0
+
+ @staticmethod
+ def _isCompleteJsonElement(jsonStr: str) -> bool:
+ """Check if string ends with a complete JSON element (balanced brackets/braces)."""
+ jsonStr = jsonStr.strip()
+ if not jsonStr:
+ return False
+
+ # Check if it ends with complete structure markers
+ # Complete array element: ends with ] or ], or ],
+ # Complete object element: ends with } or }, or },
+ if jsonStr[-1] in ']}':
+            # Check if brackets/braces are balanced. Plain counting ignores
+            # braces inside string values, so this is a heuristic check only.
+            braceCount = jsonStr.count('{') - jsonStr.count('}')
+            bracketCount = jsonStr.count('[') - jsonStr.count(']')
+            return braceCount == 0 and bracketCount == 0
+
+ return False
+
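+    # Balance check on the suffix candidate (illustrative):
+    #
+    #     ModularJsonMerger._isCompleteJsonElement('{"a": 1}')   # -> True
+    #     ModularJsonMerger._isCompleteJsonElement('{"a": [1')   # -> False
+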
+ @staticmethod
+ def _startsWithSameElement(accSuffix: str, fragPrefix: str) -> bool:
+ """Check if fragment prefix starts with the same element as accumulated suffix."""
+ # Normalize whitespace for comparison
+ accNorm = accSuffix.strip()
+ fragNorm = fragPrefix.strip()
+
+ # Check if fragPrefix starts with accSuffix (or vice versa for partial matches)
+ if fragNorm.startswith(accNorm):
+ return True
+
+ # Check if they have common prefix (for partial element completion)
+ minLen = min(len(accNorm), len(fragNorm))
+ if minLen > 20:
+ # Check if first 80% of accSuffix matches start of fragPrefix
+ checkLen = int(minLen * 0.8)
+ return accNorm[:checkLen] == fragNorm[:checkLen]
+
+ return False
+
+ @staticmethod
+ def _isCompleteElementBoundary(jsonStr: str) -> bool:
+ """Check if string represents a complete element boundary (e.g., ], [ or }, {)."""
+ jsonStr = jsonStr.strip()
+ if not jsonStr:
+ return False
+
+ # Check if it contains complete element boundaries
+ # Pattern: ends with ], or }, or ],\n or },\n
+ if jsonStr.rstrip().endswith(('],', '},', ']', '}')):
+ return True
+
+ # Check if it's a complete array element or object property
+ if '],' in jsonStr or '},' in jsonStr:
+ return True
+
+ return False
+
+ @staticmethod
+ def _findLineBasedOverlap(accStr: str, fragStr: str, mergeId: Optional[str] = None) -> int:
+ """
+ Find overlap using line-based matching (for formatted JSON).
+ """
+ accLines = accStr.rstrip().split('\n')
+ fragLines = fragStr.lstrip().split('\n')
+
+ # Try to find matching lines from the end of accumulated at the start of fragment
+ maxLinesToCheck = min(10, len(accLines), len(fragLines))
+
+ for numLines in range(maxLinesToCheck, 0, -1):
+ # Get last N lines from accumulated (excluding empty lines)
+ accLastLines = [line.strip() for line in accLines[-numLines:] if line.strip()]
+ # Get first N lines from fragment (excluding empty lines)
+ fragFirstLines = [line.strip() for line in fragLines[:numLines] if line.strip()]
+
+ # Check if they match
+ if len(accLastLines) > 0 and len(fragFirstLines) > 0:
+ # Try to find where accLastLines match fragFirstLines
+ for i in range(len(accLastLines)):
+ # Check if accLastLines[i:] matches fragFirstLines[:len(accLastLines)-i]
+ accSuffixLines = accLastLines[i:]
+ fragPrefixLines = fragFirstLines[:len(accSuffixLines)]
+
+ if accSuffixLines == fragPrefixLines and len(accSuffixLines) > 0:
+ # Found overlap! Calculate character length
+ accSuffixText = '\n'.join(accLastLines[i:])
+ fragPrefixText = '\n'.join(fragPrefixLines)
+
+ # Find where this text appears in the original strings
+ accPos = accStr.rfind(accSuffixText)
+ fragPos = fragStr.find(fragPrefixText)
+
+ if accPos >= 0 and fragPos == 0:
+ # Found valid overlap
+ overlapLen = len(accSuffixText)
+ if mergeId:
+ JsonMergeLogger.logOverlap("string (line-based)", overlapLen, accSuffixText[:200], fragPrefixText[:200])
+ return overlapLen
+
+ return 0
+
+ @staticmethod
+ def _findPartialOverlap(accStr: str, fragStr: str, mergeId: Optional[str] = None) -> int:
+ """
+ Find partial overlap (incomplete element at cut point).
+ """
+ accLines = accStr.rstrip().split('\n')
+ fragLines = fragStr.lstrip().split('\n')
+
+ if accLines and fragLines:
+ lastAccLine = accLines[-1].strip()
+ firstFragLine = fragLines[0].strip()
+
+            # Check if lastAccLine is a prefix of firstFragLine (incomplete line completed)
+            if lastAccLine and firstFragLine.startswith(lastAccLine):
+                # The fragment re-emits the accumulated partial line in full,
+                # so only that duplicated prefix is removed from the fragment
+                overlapLen = len(lastAccLine)
+                if overlapLen > 20:  # Only if meaningful overlap
+                    if mergeId:
+                        JsonMergeLogger.logOverlap("string (partial line)", overlapLen, lastAccLine[:200], firstFragLine[:200])
+                    return overlapLen
+
+ return 0
+
+ @staticmethod
+ def _mergeStrings(accStr: str, fragStr: str, overlapLength: int) -> str:
+ """
+ Merge two JSON strings together, removing the overlap.
+ """
+ if overlapLength > 0:
+ # Remove overlap from fragment and append
+ merged = accStr + fragStr[overlapLength:]
+ else:
+ # No overlap - just concatenate (might need comma or other separator)
+ # Try to add comma if needed
+ accTrimmed = accStr.rstrip().rstrip(',')
+ fragTrimmed = fragStr.lstrip().lstrip(',')
+
+ # Check if we need a separator
+ if accTrimmed and fragTrimmed:
+ # If accumulated ends with } or ] and fragment starts with { or [, we might need comma
+ if (accTrimmed[-1] in '}]' and fragTrimmed[0] in '{['):
+ merged = accTrimmed + ',' + fragTrimmed
+ else:
+ merged = accTrimmed + fragTrimmed
+ else:
+ merged = accStr + fragStr
+
+ return merged
+
+ @staticmethod
+ def merge(accumulated: str, newFragment: str) -> Tuple[str, bool]:
+ """
+ Merge two JSON fragments intelligently.
+
+ Args:
+ accumulated: Previously accumulated JSON string
+ newFragment: New fragment JSON string
+
+ Returns:
+ Tuple of (merged_json_string, has_overlap):
+ - merged_json_string: Merged JSON string (closed if no overlap, unclosed if overlap found)
+ - has_overlap: True if overlap was found (iterations should continue), False if no overlap (iterations should stop)
+ """
+ # Start logging
+ mergeId = JsonMergeLogger.startMerge(accumulated, newFragment)
+
+ if not accumulated:
+ result = newFragment if newFragment else "{}"
+ JsonMergeLogger.finishMerge(mergeId, result, True)
+ return (result, False) # No overlap if no accumulated data
+ if not newFragment:
+ JsonMergeLogger.finishMerge(mergeId, accumulated, True)
+ return (accumulated, False) # No overlap if no new fragment
+
+ try:
+ # Normalize both strings
+ accNormalized = stripCodeFences(normalizeJsonText(accumulated)).strip()
+ fragNormalized = stripCodeFences(normalizeJsonText(newFragment)).strip()
+
+ JsonMergeLogger._log(f"\n Normalized Accumulated ({len(accNormalized)} chars)")
+ accNormLines = accNormalized.split('\n')
+ if len(accNormLines) > 10:
+ JsonMergeLogger._log(f" (showing first 5 and last 5 of {len(accNormLines)} lines)")
+ for line in accNormLines[:5]:
+ JsonMergeLogger._log(f" {line}")
+ JsonMergeLogger._log(f" ... ({len(accNormLines) - 10} lines omitted) ...")
+ for line in accNormLines[-5:]:
+ JsonMergeLogger._log(f" {line}")
+ else:
+ for line in accNormLines:
+ JsonMergeLogger._log(f" {line}")
+ JsonMergeLogger._log(f"\n Normalized New Fragment ({len(fragNormalized)} chars)")
+ fragNormLines = fragNormalized.split('\n')
+ if len(fragNormLines) > 10:
+ JsonMergeLogger._log(f" (showing first 5 and last 5 of {len(fragNormLines)} lines)")
+ for line in fragNormLines[:5]:
+ JsonMergeLogger._log(f" {line}")
+ JsonMergeLogger._log(f" ... ({len(fragNormLines) - 10} lines omitted) ...")
+ for line in fragNormLines[-5:]:
+ JsonMergeLogger._log(f" {line}")
+ else:
+ for line in fragNormLines:
+ JsonMergeLogger._log(f" {line}")
+
+ # Step 1: Find overlap between JSON strings
+ JsonMergeLogger.logStep("PHASE 1", "Finding overlap between JSON strings", None)
+ overlapLength = ModularJsonMerger._findStringOverlap(accNormalized, fragNormalized, mergeId)
+
+ if overlapLength > 0:
+ accSuffix = accNormalized[-overlapLength:]
+ fragPrefix = fragNormalized[:overlapLength]
+ JsonMergeLogger._log(f"\n Overlap found ({overlapLength} chars):")
+ JsonMergeLogger._log(f" Accumulated suffix: {accSuffix}")
+ JsonMergeLogger._log(f" Fragment prefix: {fragPrefix}")
+ else:
+ # CRITICAL: No overlap found - this means iterations should stop
+ JsonMergeLogger._log(f"\n ⚠️ NO OVERLAP FOUND - This indicates iterations should stop")
+ JsonMergeLogger._log(f" Closing JSON and returning final result")
+
+ # Close the accumulated JSON (it's complete as far as we can tell)
+ closedJson = closeJsonStructures(accNormalized)
+ JsonMergeLogger._log(f"\n Closed JSON ({len(closedJson)} chars):")
+ JsonMergeLogger._log(" " + "="*78)
+ for line in closedJson.split('\n'):
+ JsonMergeLogger._log(f" {line}")
+ JsonMergeLogger._log(" " + "="*78)
+
+ JsonMergeLogger.finishMerge(mergeId, closedJson, True)
+ # Return closed JSON with has_overlap=False to indicate iterations should stop
+ return (closedJson, False)
+
+ # Step 2: Merge strings together (only if overlap was found)
+ JsonMergeLogger.logStep("PHASE 2", f"Merging strings (overlap: {overlapLength} chars)", None)
+ mergedString = ModularJsonMerger._mergeStrings(accNormalized, fragNormalized, overlapLength)
+
+ JsonMergeLogger._log(f"\n Merged String ({len(mergedString)} chars)")
+ mergedLines = mergedString.split('\n')
+ if len(mergedLines) > 10:
+ JsonMergeLogger._log(f" (showing first 5 and last 5 of {len(mergedLines)} lines)")
+ for line in mergedLines[:5]:
+ JsonMergeLogger._log(f" {line}")
+ JsonMergeLogger._log(f" ... ({len(mergedLines) - 10} lines omitted) ...")
+ for line in mergedLines[-5:]:
+ JsonMergeLogger._log(f" {line}")
+ else:
+ for line in mergedLines:
+ JsonMergeLogger._log(f" {line}")
+
+ # Step 3: Return merged string (with incomplete element at end for next iteration)
+ JsonMergeLogger.logStep("PHASE 3", "Returning merged string (may be unclosed)", None)
+ JsonMergeLogger._log(f"\n Returning merged string (preserving incomplete element at end for next iteration)")
+
+ JsonMergeLogger.finishMerge(mergeId, mergedString, True)
+ # Return merged string with has_overlap=True to indicate iterations should continue
+ return (mergedString, True)
+
+ except Exception as e:
+ logger.error(f"Error in modular merger: {e}")
+ JsonMergeLogger.logStep("ERROR", f"Exception occurred: {str(e)}", None, error=str(e))
+ # Fallback: try to return accumulated if valid
+ try:
+ accParsed, accErr, _ = tryParseJson(accumulated)
+ if accErr is None:
+ JsonMergeLogger.finishMerge(mergeId, accumulated, False)
+ return (accumulated, False) # No overlap on error
+ except Exception:
+ pass
+ # Last resort: return empty valid JSON
+ fallback = json.dumps({"elements": []}, indent=2, ensure_ascii=False)
+ JsonMergeLogger.finishMerge(mergeId, fallback, False)
+ return (fallback, False) # No overlap on error
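+
+
+# Illustrative caller loop (a sketch, assuming an `aiResponses` iterable of
+# fragment strings; not part of this module):
+#
+#   accumulated = ""
+#   for fragment in aiResponses:
+#       accumulated, hasOverlap = ModularJsonMerger.merge(accumulated, fragment)
+#       if not hasOverlap:
+#           break  # merge() has closed the JSON; stop requesting continuations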
diff --git a/modules/services/serviceAi/subJsonResponseHandling.py b/modules/services/serviceAi/subJsonResponseHandling.py
index 0e8820f2..c088d598 100644
--- a/modules/services/serviceAi/subJsonResponseHandling.py
+++ b/modules/services/serviceAi/subJsonResponseHandling.py
@@ -1322,153 +1322,1314 @@ class JsonResponseHandler:
def mergeJsonStringsWithOverlap(
accumulated: str,
newFragment: str
- ) -> str:
+ ) -> Tuple[str, bool]:
"""
- ROBUST generic function to merge two JSON strings, handling fragments cut anywhere.
+ Merge JSON fragments intelligently using the modular parser.
- Works for ANY JSON structure - handles cuts at beginning, middle, or end.
-
- Fragment scenarios (all handled):
- - Cut at beginning: newFragment starts mid-structure (e.g., `["item1", ...]`)
- - Cut in middle: newFragment continues incomplete structure from accumulated
- - Cut at end: accumulated ends mid-structure (e.g., `["item1", "item2", `)
- - Full overlap: newFragment overlaps with end of accumulated
- - Exact continuation: newFragment starts exactly where accumulated ends
-
- Strategy:
- 1. Extract JSON from both strings (handles code fences, extra text)
- 2. Detect if newFragment is a fragment (doesn't start with { or [ as root)
- 3. Detect if accumulated ends mid-structure (incomplete)
- 4. Repair incomplete structures using existing jsonUtils
- 5. Merge intelligently based on structure analysis
+ Uses the new ModularJsonMerger for clean, robust merging.
+ Falls back to legacy code only if the new merger fails completely.
Args:
- accumulated: Previously accumulated JSON string (may be incomplete)
- newFragment: New fragment string to append (may be a fragment)
+ accumulated: Previously accumulated JSON string (may be incomplete/fragmented)
+ newFragment: New fragment string to append (may be incomplete/fragmented)
Returns:
- Combined JSON string with fragments properly merged
+ Tuple of (merged_json_string, has_overlap):
+ - merged_json_string: Combined JSON string with fragments properly merged
+ - has_overlap: True if overlap was found (iterations should continue), False if no overlap (iterations should stop)
"""
if not accumulated:
- return newFragment
+ result = newFragment if newFragment else "{}"
+ return (result, False) # No overlap if no accumulated data
if not newFragment:
+ return (accumulated, False) # No overlap if no new fragment
+
+ # Use new modular merger
+ try:
+ from modules.services.serviceAi.subJsonMerger import ModularJsonMerger
+ result, hasOverlap = ModularJsonMerger.merge(accumulated, newFragment)
+ # IMPORTANT: ModularJsonMerger returns unclosed JSON if overlap found (with incomplete element at end)
+ # If no overlap, returns closed JSON (iterations should stop)
+ if result and result.strip() and result.strip() != "{}":
+ # Return result with overlap flag
+ return (result, hasOverlap)
+ except Exception as e:
+ logger.debug(f"Modular merger failed, using fallback: {e}")
+
+ # Fallback to legacy merger (simplified)
+ from modules.shared.jsonUtils import normalizeJsonText, stripCodeFences, closeJsonStructures, tryParseJson
+
+ accumulatedExtracted = stripCodeFences(normalizeJsonText(accumulated)).strip()
+ newFragmentExtracted = stripCodeFences(normalizeJsonText(newFragment)).strip()
+
+ # Try simple string merge with repair
+ try:
+ # Close structures
+ accClosed = closeJsonStructures(accumulatedExtracted) if accumulatedExtracted else "{}"
+ fragClosed = closeJsonStructures(newFragmentExtracted) if newFragmentExtracted else "{}"
+
+ # Try to parse both
+ accParsed, accErr, _ = tryParseJson(accClosed)
+ fragParsed, fragErr, _ = tryParseJson(fragClosed)
+
+ # If both parse, merge structurally
+ if accErr is None and fragErr is None:
+ merged = JsonResponseHandler._mergeParsedJson(accParsed, fragParsed)
+ if merged:
+ result = json.dumps(merged, indent=2, ensure_ascii=False)
+ return (result, False) # No overlap in fallback - close and stop
+
+ # If only accumulated parses, return it
+ if accErr is None and accParsed:
+ result = json.dumps(accParsed, indent=2, ensure_ascii=False)
+ return (result, False) # No overlap - close and stop
+ except Exception:
+ pass
+
+ # Last resort: return accumulated (at least we have that) - close it
+ if accumulatedExtracted:
+ try:
+ closed = closeJsonStructures(accumulatedExtracted)
+ return (closed, False) # No overlap - close and stop
+ except Exception:
+ return (accumulatedExtracted, False) # No overlap - return as-is
+
+ result = accumulated if accumulated else "{}"
+ return (result, False) # No overlap - return as-is
+
+ @staticmethod
+ def _mergeParsedJson(accParsed: Any, fragParsed: Any) -> Optional[Dict[str, Any]]:
+ """Simple merge of two parsed JSON objects."""
+ if isinstance(accParsed, dict) and isinstance(fragParsed, dict):
+ # Merge dicts
+ merged = accParsed.copy()
+
+ # Merge elements if both have them
+ if "elements" in accParsed and "elements" in fragParsed:
+ accElements = accParsed.get("elements", [])
+ fragElements = fragParsed.get("elements", [])
+ # Simple merge - append new elements
+ merged["elements"] = accElements + fragElements
+ elif "elements" in fragParsed:
+ merged["elements"] = fragParsed["elements"]
+
+ # Merge other keys
+ for key, value in fragParsed.items():
+ if key != "elements":
+ if key in merged and isinstance(merged[key], list) and isinstance(value, list):
+ merged[key] = merged[key] + value
+ else:
+ merged[key] = value
+
+ return merged
+
+ return None
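+
+    # Illustrative example (assumed inputs): element lists are concatenated,
+    # other list-valued keys are appended, and scalar keys take the fragment's
+    # value:
+    #
+    #   _mergeParsedJson({"elements": [e1], "title": "A"},
+    #                    {"elements": [e2], "title": "B"})
+    #       -> {"elements": [e1, e2], "title": "B"}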
+
+ @staticmethod
+ def _normalizeToElementsStructure(
+ jsonString: str,
+ originalString: str
+ ) -> Optional[Dict[str, Any]]:
+ """
+ Normalize any JSON structure (Dict, List, None, or parse error) to {"elements": [...]} format.
+
+ Handles:
+ - Dict with "elements" → return as-is
+ - Dict without "elements" but with "type" → wrap in elements array
+ - List → wrap in elements structure
+ - Parse error → try repairBrokenJson
+ - None → return None
+
+ Args:
+ jsonString: Extracted JSON string
+ originalString: Original string (for context)
+
+ Returns:
+ Normalized Dict with "elements" array, or None if normalization fails
+ """
+ if not jsonString:
+ return None
+
+ from modules.shared.jsonUtils import tryParseJson, repairBrokenJson, closeJsonStructures
+
+ # Try to parse directly first
+ try:
+ parsed = json.loads(jsonString)
+ parseErr = None
+ except Exception as e:
+ parseErr = e
+ parsed = None
+
+ # If parsing failed, try closing structures first (for incomplete fragments)
+ if parseErr is not None:
+ try:
+ closed = closeJsonStructures(jsonString)
+ parsed = json.loads(closed)
+ parseErr = None
+ except Exception:
+ pass
+
+ # If still failed, try repairBrokenJson ONLY if it looks like document structure
+ # For other structures (like section_content), use fragment detection instead
+ if parseErr is not None:
+ # Check if this looks like a document structure (has "documents" or "sections")
+ isDocumentStructure = '"documents"' in jsonString or '"sections"' in jsonString
+
+ if isDocumentStructure:
+ # Use repairBrokenJson for document structures
+ repaired = repairBrokenJson(jsonString)
+ if repaired:
+ parsed = repaired
+ parseErr = None
+ else:
+ # Still can't parse - try to detect fragment structure
+ return JsonResponseHandler._detectAndNormalizeFragment(jsonString, originalString)
+ else:
+ # For non-document structures, skip repairBrokenJson and go straight to fragment detection
+ # repairBrokenJson tries to extract "sections" which doesn't work for other structures
+ return JsonResponseHandler._detectAndNormalizeFragment(jsonString, originalString)
+
+ # Normalize based on type
+ if parsed is None:
+ return None
+ elif isinstance(parsed, dict):
+ # Already a dict
+ if "elements" in parsed:
+ return parsed
+ elif "type" in parsed:
+ # Single element - wrap in elements array
+ return {"elements": [parsed]}
+ else:
+ # Unknown dict structure - try to extract elements
+ return JsonResponseHandler._extractElementsFromDict(parsed)
+ elif isinstance(parsed, list):
+ # List - check if it's a list of elements or a fragment
+ if parsed and isinstance(parsed[0], dict) and "type" in parsed[0]:
+ # List of elements
+ return {"elements": parsed}
+ else:
+ # Fragment list (e.g., array of rows) - detect structure
+ return JsonResponseHandler._detectAndNormalizeFragment(jsonString, originalString)
+ else:
+ # Primitive type - can't normalize
+ return None
+
+ @staticmethod
+ def _detectAndNormalizeFragment(
+ jsonString: str,
+ originalString: str
+ ) -> Optional[Dict[str, Any]]:
+ """
+ Detect fragment structure and normalize it.
+
+ Fragments can be:
+ - Array of arrays (table rows): `[["row1"], ["row2"]]` or `["1947", "16883"], ["1948", "16889"]`
+ - Array of strings (list items): `["item1", "item2"]`
+ - Incomplete structure: `["item1", "item2", ` (ends with comma)
+ - Partial object: `{"type": "table", "content": {"rows": [["1947"...` (cut mid-string)
+
+ Returns normalized structure or None if detection fails.
+ """
+ jsonStripped = jsonString.strip()
+
+ # Strategy 1: Check if it's an array fragment
+ if jsonStripped.startswith('['):
+ # Try to parse as array
+ from modules.shared.jsonUtils import tryParseJson, closeJsonStructures
+
+ # Close incomplete structures
+ closed = closeJsonStructures(jsonStripped)
+ parsed, parseErr, _ = tryParseJson(closed)
+
+ if parseErr is None and isinstance(parsed, list):
+ # Check structure: array of arrays (table rows) or array of strings (list items)
+ if parsed and isinstance(parsed[0], list):
+ # Array of arrays - likely table rows fragment
+ return {
+ "elements": [{
+ "type": "table",
+ "content": {
+ "rows": parsed
+ }
+ }]
+ }
+ elif parsed and isinstance(parsed[0], str):
+ # Array of strings - likely list items fragment
+ return {
+ "elements": [{
+ "type": "bullet_list",
+ "content": {
+ "items": parsed
+ }
+ }]
+ }
+ elif parseErr is not None:
+ # Can't parse - try regex extraction for table rows
+ rows = JsonResponseHandler._extractRowsFromFragment(jsonStripped)
+ if rows:
+ return {
+ "elements": [{
+ "type": "table",
+ "content": {
+ "rows": rows
+ }
+ }]
+ }
+
+ # Strategy 2: Check if it's a partial object (cut mid-structure)
+ # Look for patterns like: {"elements": [...] or {"type": "table"...
+ if jsonStripped.startswith('{'):
+ from modules.shared.jsonUtils import tryParseJson, closeJsonStructures
+
+ # Try to close and parse
+ closed = closeJsonStructures(jsonStripped)
+ parsed, parseErr, _ = tryParseJson(closed)
+
+ if parseErr is None and isinstance(parsed, dict):
+ # Successfully parsed - normalize it
+ return JsonResponseHandler._normalizeToElementsStructure(closed, originalString)
+ elif parseErr is not None:
+ # Can't parse - try to extract table rows from the raw string
+ # This handles cases like: {"elements": [{"type": "table", "content": {"rows": [["1947"...
+ rows = JsonResponseHandler._extractRowsFromFragment(jsonStripped)
+ if rows:
+ return {
+ "elements": [{
+ "type": "table",
+ "content": {
+ "rows": rows
+ }
+ }]
+ }
+
+ # Try to extract any array patterns that might be table rows
+ # Look for patterns like: ["1947", "10000"], ["1948", "10100"]
+ import re
+ # Pattern: ["value1", "value2"], ["value3", "value4"]
+ rowPattern = r'\["([^"]*)",\s*"([^"]*)"\]'
+ matches = re.findall(rowPattern, jsonStripped)
+ if matches and len(matches) >= 2:
+ # Found multiple row patterns - likely table rows
+ rows = [[match[0], match[1]] for match in matches]
+ return {
+ "elements": [{
+ "type": "table",
+ "content": {
+ "rows": rows
+ }
+ }]
+ }
+
+ # Strategy 3: Try to extract rows from any text (even if not starting with [ or {)
+ rows = JsonResponseHandler._extractRowsFromFragment(jsonStripped)
+ if rows:
+ return {
+ "elements": [{
+ "type": "table",
+ "content": {
+ "rows": rows
+ }
+ }]
+ }
+
+ return None
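+
+    # Illustrative sketch (assumes closeJsonStructures appends the missing
+    # brackets): an array-of-arrays fragment cut mid-stream is closed, parsed,
+    # and wrapped as a table element:
+    #
+    #   '[["1947", "16883"], ["1948", "16889"]'
+    #       -> {"elements": [{"type": "table",
+    #                         "content": {"rows": [["1947", "16883"],
+    #                                              ["1948", "16889"]]}}]}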
+
+ @staticmethod
+ def _extractElementsFromDict(d: Dict[str, Any]) -> Dict[str, Any]:
+ """
+ Try to extract elements from unknown dict structure.
+ Returns normalized structure or empty elements array.
+ """
+ # Check common patterns
+ if "sections" in d:
+ # Document structure with sections
+ sections = d.get("sections", [])
+ elements = []
+ for section in sections:
+ if isinstance(section, dict) and "elements" in section:
+ elements.extend(section.get("elements", []))
+ return {"elements": elements}
+
+ # Unknown structure - return empty
+ return {"elements": []}
+
+ @staticmethod
+ def _mergeJsonStructuresGeneric(
+ accumulatedObj: Dict[str, Any],
+ newFragmentObj: Dict[str, Any],
+ accumulatedRaw: str,
+ newFragmentRaw: str,
+ overlapElements: Optional[List[Dict[str, Any]]] = None
+ ) -> Optional[Dict[str, Any]]:
+ """
+ GENERIC merge of two JSON structures, handling overlaps and missing parts.
+
+ Strategy:
+ 1. Extract elements from both structures (both are normalized to {"elements": [...]})
+ 2. Use overlap elements if provided to identify merge point
+ 3. Detect if both have same structure (same content type)
+ 4. Group elements by type
+ 5. Merge elements of same type using content-type-specific logic with overlap detection
+ 6. Handle overlaps and missing parts intelligently
+
+ Args:
+ accumulatedObj: Normalized accumulated JSON object (guaranteed to have "elements")
+ newFragmentObj: Normalized new fragment JSON object (guaranteed to have "elements")
+ accumulatedRaw: Raw accumulated string (for fragment detection)
+ newFragmentRaw: Raw new fragment string (for fragment detection)
+ overlapElements: Optional list of overlap elements from continuation response
+
+ Returns:
+ Merged JSON object or None if merging fails
+ """
+ try:
+ # Step 1: Extract elements (both are normalized, so this should always work)
+ accumulatedElements = accumulatedObj.get("elements", []) if isinstance(accumulatedObj, dict) else []
+ newFragmentElements = newFragmentObj.get("elements", []) if isinstance(newFragmentObj, dict) else []
+
+ if not accumulatedElements and not newFragmentElements:
+ # No elements found - try to extract from raw strings
+ # Try to extract any valid JSON structure from raw strings
+ from modules.shared.jsonUtils import tryParseJson, closeJsonStructures
+
+ # Try accumulated first
+ if accumulatedRaw:
+ try:
+ closedAccumulated = closeJsonStructures(accumulatedRaw)
+ parsed, parseErr, _ = tryParseJson(closedAccumulated)
+ if parseErr is None and parsed:
+ normalized = JsonResponseHandler._normalizeToElementsStructure(closedAccumulated, accumulatedRaw)
+ if normalized:
+ return normalized
+ except Exception:
+ pass
+
+ # Try new fragment
+ if newFragmentRaw:
+ try:
+ closedFragment = closeJsonStructures(newFragmentRaw)
+ parsed, parseErr, _ = tryParseJson(closedFragment)
+ if parseErr is None and parsed:
+ normalized = JsonResponseHandler._normalizeToElementsStructure(closedFragment, newFragmentRaw)
+ if normalized:
+ return normalized
+ except Exception:
+ pass
+
+ # If still nothing, return empty structure (never None)
+ return {"elements": []}
+
+ # Step 2: Use overlap elements to identify merge point
+ # If overlap elements are provided, use them to find where to merge
+ if overlapElements and isinstance(overlapElements, list) and len(overlapElements) > 0:
+ # Find overlap in accumulated elements
+ overlapStartIndex = JsonResponseHandler._findOverlapStartIndex(accumulatedElements, overlapElements)
+ if overlapStartIndex >= 0:
+                # Remove overlapping elements from accumulated (they'll be replaced by continuation)
+                removedCount = len(accumulatedElements) - overlapStartIndex
+                accumulatedElements = accumulatedElements[:overlapStartIndex]
+                logger.debug(f"Found overlap at index {overlapStartIndex}, removed {removedCount} overlapping elements")
+
+ # Step 3: Detect if newFragment is a continuation fragment
+ # Check if newFragment starts with array elements (fragment, not full JSON)
+ isFragment = JsonResponseHandler._isFragment(newFragmentRaw, newFragmentElements)
+
+ # Step 4: Group elements by type for intelligent merging
+ accumulatedByType = {}
+ for elem in accumulatedElements:
+ if isinstance(elem, dict):
+ elemType = elem.get("type", "unknown")
+ if elemType not in accumulatedByType:
+ accumulatedByType[elemType] = []
+ accumulatedByType[elemType].append(elem)
+
+ newFragmentByType = {}
+ for elem in newFragmentElements:
+ if isinstance(elem, dict):
+ elemType = elem.get("type", "unknown")
+ if elemType not in newFragmentByType:
+ newFragmentByType[elemType] = []
+ newFragmentByType[elemType].append(elem)
+
+ # Step 5: Merge elements intelligently
+ mergedElements = []
+ allTypes = set(accumulatedByType.keys()) | set(newFragmentByType.keys())
+
+ for elemType in allTypes:
+ accElems = accumulatedByType.get(elemType, [])
+ fragElems = newFragmentByType.get(elemType, [])
+
+ if not accElems:
+ # Only in fragment - add all
+ mergedElements.extend(fragElems)
+ elif not fragElems:
+ # Only in accumulated - add all
+ mergedElements.extend(accElems)
+ else:
+ # Both have elements of this type - merge them using content-type-specific logic
+ mergedElem = JsonResponseHandler._mergeElementsOfSameTypeGeneric(
+ accElems[0], fragElems[0], elemType, accumulatedRaw, newFragmentRaw, isFragment
+ )
+ if mergedElem:
+ mergedElements.append(mergedElem)
+
+ # Step 6: Reconstruct base structure
+ if mergedElements:
+ return {"elements": mergedElements}
+ else:
+ # No merged elements - return accumulated if available (NEVER return None)
+ if accumulatedElements:
+ return {"elements": accumulatedElements}
+ # If no accumulated, return new fragment if available
+ if newFragmentElements:
+ return {"elements": newFragmentElements}
+ # Last resort: return empty structure (never None)
+ return {"elements": []}
+
+ except Exception as e:
+ logger.debug(f"Structure-based merge failed: {e}")
+ import traceback
+ logger.debug(traceback.format_exc())
+ return None
+
+ @staticmethod
+ def _isFragment(jsonString: str, elements: List[Dict[str, Any]]) -> bool:
+ """
+ Detect if JSON string is a fragment (not a complete JSON object).
+
+ Fragments:
+ - Start with `[` but not `[{"` (array fragment, not full elements array)
+ - Start with array elements like `["cell1", "cell2"],` (table rows fragment)
+ - Don't have full structure (missing outer object with "elements")
+ - Are continuations of previous structure
+ """
+ jsonStripped = jsonString.strip()
+
+ # Check if it starts with array (fragment)
+ if jsonStripped.startswith('['):
+ # Check if it's a full elements array `[{"type": ...}]` or a fragment `["cell1", "cell2"]`
+            if jsonStripped.startswith('[{'):
+ # Could be full structure - check if it has "type" field
+ if elements and isinstance(elements[0], dict) and "type" in elements[0]:
+ return False # Full structure
+ # Otherwise it's a fragment (array of primitives or incomplete)
+ return True
+
+ # Check if it starts with object but missing "elements" wrapper
+ if jsonStripped.startswith('{'):
+ # Check if it has "elements" field
+ if '"elements"' not in jsonStripped[:200]: # Check first 200 chars
+ # Might be a single element fragment
+ return True
+
+ # Check if elements are incomplete (no full structure)
+ if elements and isinstance(elements[0], dict):
+ # Check if first element is missing required fields
+ firstElem = elements[0]
+ if "type" not in firstElem and "content" not in firstElem:
+ return True
+
+ return False
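+
+    # Illustrative classifications (assumed inputs):
+    #
+    #   '["1947", "16883"], ["1948", ...'            -> True  (array of primitives)
+    #   '[{"type": "paragraph", "content": {...}}]'  -> False (full elements array)
+    #   '{"type": "table", "content": {...}}'        -> True  (no "elements" wrapper)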
+
+ @staticmethod
+ def _mergeElementsOfSameTypeGeneric(
+ accumulatedElem: Dict[str, Any],
+ newFragmentElem: Dict[str, Any],
+ elemType: str,
+ accumulatedRaw: str,
+ newFragmentRaw: str,
+ isFragment: bool
+ ) -> Optional[Dict[str, Any]]:
+ """
+ GENERIC merge of two elements of the same type, with content-type-specific optimizations.
+
+ Content-type-specific merging:
+ - table: Merge rows arrays with overlap detection
+ - paragraph: Merge text content
+ - code_block: Merge code strings
+ - bullet_list/numbered_list: Merge items arrays
+ - heading: Use new fragment (usually complete)
+ - image: Use new fragment (usually complete)
+ - Other: Generic deep merge
+
+ Args:
+ accumulatedElem: Accumulated element
+ newFragmentElem: New fragment element
+ elemType: Content type (table, paragraph, etc.)
+ accumulatedRaw: Raw accumulated string
+ newFragmentRaw: Raw new fragment string
+ isFragment: Whether newFragment is a fragment (continuation)
+
+ Returns:
+ Merged element or None if merging fails
+ """
+ if elemType == "table":
+ return JsonResponseHandler._mergeTableElementsGeneric(
+ accumulatedElem, newFragmentElem, accumulatedRaw, newFragmentRaw, isFragment
+ )
+ elif elemType == "paragraph":
+ return JsonResponseHandler._mergeParagraphElements(
+ accumulatedElem, newFragmentElem, isFragment
+ )
+ elif elemType == "code_block":
+ return JsonResponseHandler._mergeCodeBlockElements(
+ accumulatedElem, newFragmentElem, isFragment
+ )
+ elif elemType in ["bullet_list", "numbered_list"]:
+ return JsonResponseHandler._mergeListElements(
+ accumulatedElem, newFragmentElem, isFragment
+ )
+ elif elemType in ["heading", "image"]:
+ # Usually complete - use new fragment if it exists, otherwise accumulated
+ return newFragmentElem if newFragmentElem else accumulatedElem
+ else:
+ # Generic merge: use mergeDeepStructures
+ return JsonResponseHandler.mergeDeepStructures(
+ accumulatedElem, newFragmentElem, 0, f"element_merge.{elemType}"
+ )
+
+ @staticmethod
+ def _mergeTableElementsGeneric(
+ accumulatedElem: Dict[str, Any],
+ newFragmentElem: Dict[str, Any],
+ accumulatedRaw: str,
+ newFragmentRaw: str,
+ isFragment: bool
+ ) -> Dict[str, Any]:
+ """
+ GENERIC merge of two table elements with content-type-specific optimizations.
+
+ Handles:
+ - Overlapping rows (detect duplicates by comparing row content)
+ - Missing headers (complete with existing headers)
+ - Incomplete rows (complete with null values if needed)
+ - Fragment rows (if newFragment is a fragment, extract rows from raw string)
+
+ Args:
+ accumulatedElem: Accumulated table element
+ newFragmentElem: New fragment table element
+ accumulatedRaw: Raw accumulated string (for fragment detection)
+ newFragmentRaw: Raw new fragment string (for fragment extraction)
+ isFragment: Whether newFragment is a fragment
+
+ Returns:
+ Merged table element
+ """
+ # Extract content (handle both nested and flat structures)
+ accContent = accumulatedElem.get("content", {})
+ if not accContent and "rows" in accumulatedElem:
+ accContent = accumulatedElem
+
+ fragContent = newFragmentElem.get("content", {})
+ if not fragContent and "rows" in newFragmentElem:
+ fragContent = newFragmentElem
+
+ # Extract rows
+ accRows = accContent.get("rows", []) if isinstance(accContent, dict) else []
+
+ # If fragment, try to extract rows from raw string
+ fragRows = fragContent.get("rows", []) if isinstance(fragContent, dict) else []
+ if isFragment and not fragRows:
+ fragRows = JsonResponseHandler._extractRowsFromFragment(newFragmentRaw)
+
+ # Extract headers (complete missing with existing)
+ accHeaders = accContent.get("headers", []) if isinstance(accContent, dict) else []
+ fragHeaders = fragContent.get("headers", []) if isinstance(fragContent, dict) else []
+ mergedHeaders = accHeaders if accHeaders else fragHeaders
+
+ # Merge rows with overlap detection
+ mergedRows = JsonResponseHandler._mergeRowsWithOverlapDetection(accRows, fragRows)
+
+ # Reconstruct table element
+ mergedContent = {
+ "headers": mergedHeaders,
+ "rows": mergedRows
+ }
+
+ # Preserve other fields (caption, etc.)
+ if isinstance(accContent, dict) and "caption" in accContent:
+ mergedContent["caption"] = accContent["caption"]
+ elif isinstance(fragContent, dict) and "caption" in fragContent:
+ mergedContent["caption"] = fragContent["caption"]
+
+ return {
+ "type": "table",
+ "content": mergedContent
+ }
+
+ @staticmethod
+ def _extractRowsFromFragment(fragmentRaw: str) -> List[List[str]]:
+ """
+ Extract table rows from fragment string.
+
+ Handles fragments like:
+ - `["1947", "16883"], ["1948", "16889"], ...`
+ - `"rows": [["1947", "10000"], ["1948", "10100"]...`
+ - Incomplete fragments cut mid-string
+ Also handles fragments with more than 2 columns.
+ """
+ import re
+ rows = []
+
+ # Pattern 1: Array of arrays with 2 columns `["cell1", "cell2"], ["cell3", "cell4"]`
+ # This pattern matches complete arrays: ["value1", "value2"]
+ pattern2Col = r'\["([^"]*)",\s*"([^"]*)"\]'
+ matches2Col = re.findall(pattern2Col, fragmentRaw)
+
+ if matches2Col and len(matches2Col) >= 2: # Need at least 2 rows to be confident
+ for match in matches2Col:
+ if len(match) == 2:
+ rows.append([match[0], match[1]])
+ if rows:
+ return rows
+
+ # Pattern 2: Array of arrays with variable columns (more robust)
+ # Find all array patterns: ["...", "...", ...]
+ # Use non-greedy matching but ensure we get complete arrays
+ arrayPattern = r'\[(.*?)\]'
+ arrayMatches = re.findall(arrayPattern, fragmentRaw)
+
+ # Filter to only arrays that look like table rows (have multiple quoted values)
+ validArrays = []
+ for arrayContent in arrayMatches:
+ # Extract quoted strings from array content
+ cellPattern = r'"([^"]*)"'
+ cells = re.findall(cellPattern, arrayContent)
+ # Only consider arrays with 2+ cells (likely table rows)
+ if len(cells) >= 2:
+ validArrays.append(cells)
+
+ if validArrays and len(validArrays) >= 2: # Need at least 2 rows
+ return validArrays
+
+        # Pattern 3: Look for "rows": [...] pattern in incomplete JSON
+        # This handles cases like: "rows": [["1947", "10000"], ["1948", "10100"]...
+        # Capture greedily: a lazy match would stop at the first ']' (the end of
+        # the first inner row) and leave no complete [...] arrays to extract
+        rowsPattern = r'"rows"\s*:\s*\[(.*)'
+ rowsMatch = re.search(rowsPattern, fragmentRaw, re.DOTALL)
+ if rowsMatch:
+ rowsContent = rowsMatch.group(1)
+ # Extract all array patterns from rows content
+ arrayPattern = r'\[(.*?)\]'
+ arrayMatches = re.findall(arrayPattern, rowsContent)
+ for arrayContent in arrayMatches:
+ cellPattern = r'"([^"]*)"'
+ cells = re.findall(cellPattern, arrayContent)
+ if len(cells) >= 2: # At least 2 columns
+ rows.append(cells)
+ if rows:
+ return rows
+
+ # Pattern 4: Try to parse as JSON array (handles complete arrays)
+ from modules.shared.jsonUtils import tryParseJson, closeJsonStructures
+
+ # Try to close incomplete structures
+ closed = closeJsonStructures(fragmentRaw.strip())
+ parsed, parseErr, _ = tryParseJson(closed)
+
+ if parseErr is None and isinstance(parsed, list):
+ if parsed and isinstance(parsed[0], list):
+ # Array of arrays - table rows
+ return parsed
+ elif parsed and isinstance(parsed[0], str):
+ # Array of strings - might be single column table
+ return [[item] for item in parsed]
+
+ # Pattern 5: Last resort - extract any array patterns we can find
+ # Even if incomplete, try to extract what we can
+ if not rows:
+ # Find all patterns like ["value1", "value2"] even if incomplete
+ # Use a more lenient pattern that handles incomplete strings
+ incompletePattern = r'\["([^"]*)"(?:,\s*"([^"]*)")?'
+ incompleteMatches = re.findall(incompletePattern, fragmentRaw)
+ for match in incompleteMatches:
+ if match[0]: # First value exists
+ if match[1]: # Second value exists
+ rows.append([match[0], match[1]])
+ else:
+ # Only one value - might be incomplete, skip for now
+ pass
+
+ return rows
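+
+    # Illustrative example (assumed fragment): complete two-column row arrays
+    # are recovered by Pattern 1 even when the surrounding JSON is cut off:
+    #
+    #   _extractRowsFromFragment('["1947", "16883"], ["1948", "16889"],')
+    #       -> [["1947", "16883"], ["1948", "16889"]]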
+
+ @staticmethod
+ def _mergeParagraphElements(
+ accumulatedElem: Dict[str, Any],
+ newFragmentElem: Dict[str, Any],
+ isFragment: bool
+ ) -> Dict[str, Any]:
+ """Merge two paragraph elements."""
+ accContent = accumulatedElem.get("content", {})
+ fragContent = newFragmentElem.get("content", {})
+
+ accText = accContent.get("text", "") if isinstance(accContent, dict) else ""
+ fragText = fragContent.get("text", "") if isinstance(fragContent, dict) else ""
+
+        # Merge text: plain concatenation for complete elements; for fragments,
+        # join with a single space so words at the cut point don't run together
+        if isFragment:
+            mergedText = accText.rstrip() + " " + fragText.lstrip()
+        else:
+            mergedText = accText + fragText
+
+ return {
+ "type": "paragraph",
+ "content": {"text": mergedText}
+ }
+
+ @staticmethod
+ def _mergeCodeBlockElements(
+ accumulatedElem: Dict[str, Any],
+ newFragmentElem: Dict[str, Any],
+ isFragment: bool
+ ) -> Dict[str, Any]:
+ """Merge two code block elements."""
+ accContent = accumulatedElem.get("content", {})
+ fragContent = newFragmentElem.get("content", {})
+
+ accCode = accContent.get("code", "") if isinstance(accContent, dict) else ""
+ fragCode = fragContent.get("code", "") if isinstance(fragContent, dict) else ""
+
+ accLanguage = accContent.get("language") if isinstance(accContent, dict) else None
+ fragLanguage = fragContent.get("language") if isinstance(fragContent, dict) else None
+
+ mergedCode = accCode + "\n" + fragCode if fragCode else accCode
+ mergedLanguage = accLanguage or fragLanguage
+
+ result = {
+ "type": "code_block",
+ "content": {"code": mergedCode}
+ }
+ if mergedLanguage:
+ result["content"]["language"] = mergedLanguage
+
+ return result
+
+ @staticmethod
+ def _mergeListElements(
+ accumulatedElem: Dict[str, Any],
+ newFragmentElem: Dict[str, Any],
+ isFragment: bool
+ ) -> Dict[str, Any]:
+ """Merge two list elements (bullet_list or numbered_list)."""
+ accContent = accumulatedElem.get("content", {})
+ fragContent = newFragmentElem.get("content", {})
+
+ accItems = accContent.get("items", []) if isinstance(accContent, dict) else []
+ fragItems = fragContent.get("items", []) if isinstance(fragContent, dict) else []
+
+ # Merge items with overlap detection
+ mergedItems = JsonResponseHandler._mergeItemsWithOverlapDetection(accItems, fragItems)
+
+ elemType = accumulatedElem.get("type") or newFragmentElem.get("type")
+
+ return {
+ "type": elemType,
+ "content": {"items": mergedItems}
+ }
+
+ @staticmethod
+ def _findOverlapStartIndex(
+ accumulatedElements: List[Dict[str, Any]],
+ overlapElements: List[Dict[str, Any]]
+ ) -> int:
+ """
+ Find the start index in accumulatedElements where overlapElements begin.
+
+ This helps identify where to merge continuation elements by matching
+ the overlap elements with the end of accumulated elements.
+
+ Args:
+ accumulatedElements: List of accumulated elements
+ overlapElements: List of overlap elements from continuation response
+
+ Returns:
+ Index where overlap starts, or -1 if not found
+ """
+ if not overlapElements or not accumulatedElements:
+ return -1
+
+ # Try to find overlap by matching element structures
+ # Start from the end of accumulatedElements and work backwards
+ overlapLen = len(overlapElements)
+ accLen = len(accumulatedElements)
+
+ if overlapLen > accLen:
+ return -1
+
+ # Try matching from different start positions
+ for startIdx in range(max(0, accLen - overlapLen), accLen):
+ # Check if elements from startIdx match overlapElements
+ matches = True
+ for i in range(min(overlapLen, accLen - startIdx)):
+ accElem = accumulatedElements[startIdx + i]
+ overlapElem = overlapElements[i]
+
+ # Compare element types
+ if isinstance(accElem, dict) and isinstance(overlapElem, dict):
+ accType = accElem.get("type")
+ overlapType = overlapElem.get("type")
+ if accType != overlapType:
+ matches = False
+ break
+
+ # For tables, compare row counts or last rows
+ if accType == "table":
+ accRows = accElem.get("rows", []) or (accElem.get("content", {}).get("rows", []) if isinstance(accElem.get("content"), dict) else [])
+ overlapRows = overlapElem.get("rows", []) or (overlapElem.get("content", {}).get("rows", []) if isinstance(overlapElem.get("content"), dict) else [])
+ if accRows and overlapRows:
+ # Check if last rows match
+ if len(accRows) >= len(overlapRows):
+ lastAccRows = accRows[-len(overlapRows):]
+ if lastAccRows != overlapRows:
+ matches = False
+ break
+ # For lists, compare items
+ elif accType in ["bullet_list", "numbered_list"]:
+ accItems = accElem.get("items", []) or (accElem.get("content", {}).get("items", []) if isinstance(accElem.get("content"), dict) else [])
+ overlapItems = overlapElem.get("items", []) or (overlapElem.get("content", {}).get("items", []) if isinstance(overlapElem.get("content"), dict) else [])
+ if accItems and overlapItems:
+ if len(accItems) >= len(overlapItems):
+ lastAccItems = accItems[-len(overlapItems):]
+ if lastAccItems != overlapItems:
+ matches = False
+ break
+ else:
+ matches = False
+ break
+
+ if matches:
+ return startIdx
+
+ return -1
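+
+    # Illustrative example (assumed elements): the overlap is matched against
+    # the tail of the accumulated list by element type (and by row/item content
+    # for tables and lists):
+    #
+    #   acc     = [{"type": "heading", ...}, {"type": "paragraph", ...}]
+    #   overlap = [{"type": "paragraph", ...}]
+    #   _findOverlapStartIndex(acc, overlap)  -> 1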
+
+ @staticmethod
+ def _mergeRowsWithOverlapDetection(
+ accRows: List[List[str]],
+ fragRows: List[List[str]]
+ ) -> List[List[str]]:
+ """
+ Merge two row arrays, detecting and removing overlaps.
+
+ Overlap detection: Compare rows to find duplicates.
+ Missing parts: Complete with null values if needed.
+ """
+ if not accRows:
+ return fragRows
+ if not fragRows:
+ return accRows
+
+ # Find overlap by comparing last rows of accRows with first rows of fragRows
+ overlapStart = 0
+ maxOverlap = min(len(accRows), len(fragRows))
+
+ # Find the longest overlap
+ for overlapLen in range(maxOverlap, 0, -1):
+ accSuffix = accRows[-overlapLen:]
+ fragPrefix = fragRows[:overlapLen]
+
+ # Compare rows (exact match)
+ if accSuffix == fragPrefix:
+ overlapStart = overlapLen
+ break
+
+ # Merge: accumulated rows + non-overlapping fragment rows
+ merged = accRows + fragRows[overlapStart:]
+
+ return merged
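+
+    # Illustrative example: the longest matching row suffix/prefix is
+    # deduplicated before concatenation:
+    #
+    #   _mergeRowsWithOverlapDetection([["1947", "A"], ["1948", "B"]],
+    #                                  [["1948", "B"], ["1949", "C"]])
+    #       -> [["1947", "A"], ["1948", "B"], ["1949", "C"]]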
+
+ @staticmethod
+ def _mergeItemsWithOverlapDetection(
+ accItems: List[str],
+ fragItems: List[str]
+ ) -> List[str]:
+ """
+ Merge two item arrays (for lists), detecting and removing overlaps.
+
+ Overlap detection: Compare items to find duplicates.
+ """
+ if not accItems:
+ return fragItems
+ if not fragItems:
+ return accItems
+
+ # Find overlap by comparing last items of accItems with first items of fragItems
+ overlapStart = 0
+ maxOverlap = min(len(accItems), len(fragItems))
+
+ # Find the longest overlap
+ for overlapLen in range(maxOverlap, 0, -1):
+ accSuffix = accItems[-overlapLen:]
+ fragPrefix = fragItems[:overlapLen]
+
+ # Compare items (exact match)
+ if accSuffix == fragPrefix:
+ overlapStart = overlapLen
+ break
+
+ # Merge: accumulated items + non-overlapping fragment items
+ merged = accItems + fragItems[overlapStart:]
+
+ return merged
+
+ @staticmethod
+ def _extractOverlapAndContinuation(jsonString: str) -> Tuple[Optional[List[Dict[str, Any]]], Optional[str]]:
+ """
+ Extract overlap and continuation sections from AI response with explicit overlap structure.
+
+ Expected format:
+ {
+ "overlap": [...], // Elements to repeat for merging
+ "continuation": [...] // New elements to add
+ }
+
+ Or alternative format:
+ {
+ "overlap": "...", // Overlap as string
+ "continuation": "..." // Continuation as string
+ }
+
+ Args:
+ jsonString: JSON string that may contain overlap/continuation structure
+
+ Returns:
+ Tuple of (overlap_elements, continuation_json_string) or (None, None) if not found
+ """
+ if not jsonString:
+ return None, None
+
+ from modules.shared.jsonUtils import stripCodeFences, normalizeJsonText, tryParseJson, closeJsonStructures
+
+ # Extract and normalize JSON
+ extracted = stripCodeFences(normalizeJsonText(jsonString)).strip()
+ if not extracted:
+ return None, None
+
+ # Try to parse
+ try:
+ closed = closeJsonStructures(extracted)
+ parsed, parseErr, _ = tryParseJson(closed)
+
+ if parseErr is None and isinstance(parsed, dict):
+ # Check for overlap/continuation structure
+ overlap = parsed.get("overlap")
+ continuation = parsed.get("continuation")
+
+ if overlap is not None and continuation is not None:
+ # Found explicit overlap structure
+ overlapElements = None
+ continuationJson = None
+
+ # Extract overlap elements
+ if isinstance(overlap, list):
+ overlapElements = overlap
+ elif isinstance(overlap, str):
+ # Overlap is a string - try to parse it
+ try:
+ overlapParsed, _, _ = tryParseJson(closeJsonStructures(overlap))
+ if isinstance(overlapParsed, list):
+ overlapElements = overlapParsed
+ except Exception:
+ pass
+
+ # Extract continuation JSON
+ if isinstance(continuation, (dict, list)):
+ continuationJson = json.dumps(continuation, indent=2, ensure_ascii=False)
+ elif isinstance(continuation, str):
+ continuationJson = continuation
+
+ if overlapElements is not None and continuationJson:
+ return overlapElements, continuationJson
+ except Exception:
+ pass
+
+ return None, None
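+
+    # Illustrative example (assumed response shape): an explicit
+    # overlap/continuation payload is split into its two parts:
+    #
+    #   '{"overlap": [{"type": "table", ...}], "continuation": {"elements": [...]}}'
+    #       -> ([{"type": "table", ...}], '{\n  "elements": [...]\n}')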
+
+ @staticmethod
+ def _mergeWithExplicitOverlap(
+ accumulated: str,
+ continuationJson: str,
+ overlapElements: List[Dict[str, Any]]
+ ) -> str:
+ """
+ Merge accumulated JSON with continuation JSON using explicit overlap information.
+
+ Strategy:
+ 1. Find overlap in accumulated using overlapElements
+ 2. Remove overlapping elements from accumulated
+ 3. Append continuation JSON
+
+ Args:
+ accumulated: Previously accumulated JSON string
+ continuationJson: Continuation JSON string (new content)
+ overlapElements: List of overlap elements from AI response
+
+ Returns:
+ Merged JSON string
+ """
+ if not accumulated:
+ return continuationJson
+ if not continuationJson:
return accumulated
- # Step 1: Extract JSON from both strings (handles code fences, extra text)
- from modules.shared.jsonUtils import extractJsonString, closeJsonStructures, tryParseJson
+ from modules.shared.jsonUtils import stripCodeFences, normalizeJsonText, tryParseJson, closeJsonStructures
- accumulatedExtracted = extractJsonString(accumulated)
- newFragmentExtracted = extractJsonString(newFragment)
+ # Normalize accumulated
+ accumulatedExtracted = stripCodeFences(normalizeJsonText(accumulated)).strip()
+ accumulatedNormalized = JsonResponseHandler._normalizeToElementsStructure(
+ accumulatedExtracted, accumulated
+ )
- # Step 2: Detect fragment type and incomplete structures
- accumulatedStripped = accumulatedExtracted.strip()
- newFragmentStripped = newFragmentExtracted.strip()
+ # Normalize continuation
+ continuationExtracted = stripCodeFences(normalizeJsonText(continuationJson)).strip()
+ continuationNormalized = JsonResponseHandler._normalizeToElementsStructure(
+ continuationExtracted, continuationJson
+ )
- # Check if accumulated ends mid-structure
- accumulatedEndsMidStructure = False
- accumulatedParsed, accumulatedParseErr, _ = tryParseJson(accumulatedExtracted)
- if accumulatedParseErr is not None:
- # Cannot parse - likely incomplete
- accumulatedEndsMidStructure = True
- elif accumulatedStripped:
- # Check if it ends with incomplete patterns (comma, incomplete string, etc.)
- lastChar = accumulatedStripped[-1] if accumulatedStripped else ''
- # Ends with comma - likely incomplete array/object element
- if lastChar == ',' or accumulatedStripped.rstrip().endswith(','):
- accumulatedEndsMidStructure = True
- # Ends with incomplete string pattern (quote but no closing)
- elif lastChar == '"' and accumulatedStripped.count('"') % 2 != 0:
- accumulatedEndsMidStructure = True
-
- # Check if newFragment is a fragment
- # A fragment can be:
- # 1. Doesn't start with { or [ as root (plain text continuation)
- # 2. Starts with [ but is part of a larger array (e.g., continuation of table rows)
- # 3. Starts with { but is part of a larger object
- isNewFragment = False
- newFragmentParsed, newFragmentParseErr, _ = tryParseJson(newFragmentExtracted)
-
- if newFragmentParseErr is not None:
- # Cannot parse - it's a fragment
- isNewFragment = True
- elif not (newFragmentStripped.startswith('{') or newFragmentStripped.startswith('[')):
- # Doesn't start with JSON structure - it's a fragment
- isNewFragment = True
- elif accumulatedEndsMidStructure:
- # Accumulated is incomplete - newFragment is likely a continuation fragment
- # Even if it starts with [ or {, it might be continuing an incomplete structure
- isNewFragment = True
-
- # Step 3: Handle fragment merging
- if isNewFragment or accumulatedEndsMidStructure:
- # This is a fragment continuation - merge by repairing and combining
-
- # Strategy: String-based merging for fragments
- # 1. Remove trailing comma from accumulated if it ends with comma
- accumulatedForMerge = accumulatedExtracted
- if accumulatedStripped.rstrip().endswith(','):
- # Remove trailing comma and whitespace
- accumulatedForMerge = accumulatedExtracted.rstrip().rstrip(',').rstrip()
-
- # 2. Handle newFragment - if it starts with [ or {, it might be continuing an array/object
- newFragmentForMerge = newFragmentExtracted.strip()
-
- # Special case: If accumulated ends with incomplete array element and newFragment starts with array element
- # Pattern: accumulated ends with `["item1", "item2", ` and newFragment starts with `["item3", ...]`
- # We need to merge them: `["item1", "item2", "item3", ...]`
- if accumulatedStripped.rstrip().endswith(',') and newFragmentStripped.startswith('['):
- # Check if newFragment is a complete array element or just starts with [
- # If it's a complete array element, we need to extract its content and merge
- # Try to parse newFragment as a complete array
- newFragmentArrayParsed, newFragmentArrayErr, _ = tryParseJson(newFragmentStripped)
- if newFragmentArrayErr is None and isinstance(newFragmentArrayParsed, list):
- # newFragment is a complete array - extract its content
- # We need to merge the arrays: accumulated array + newFragment array
- # But accumulated ends with comma, so we need to close it first
- # Strategy: Remove trailing comma, add closing bracket, then merge arrays
- accumulatedClosed = accumulatedForMerge + ']'
- accumulatedClosedParsed, accumulatedClosedErr, _ = tryParseJson(accumulatedClosed)
- if accumulatedClosedErr is None:
- # Both are valid - merge parsed arrays
- if isinstance(accumulatedClosedParsed, list):
- mergedArray = accumulatedClosedParsed + newFragmentArrayParsed
- # Now we need to reconstruct the JSON structure
- # Find where the array starts in accumulated
- # For now, use string replacement
- # Find the last [ before the comma
- lastBracketPos = accumulatedForMerge.rfind('[')
- if lastBracketPos >= 0:
- # Replace from [ to end with merged array
- merged = accumulatedForMerge[:lastBracketPos] + json.dumps(mergedArray, ensure_ascii=False)
- # Try to repair and parse
- mergedRepaired = closeJsonStructures(merged)
- mergedParsed, mergedErr, _ = tryParseJson(mergedRepaired)
- if mergedErr is None and mergedParsed:
- return json.dumps(mergedParsed, indent=2, ensure_ascii=False)
-
- # 3. Merge strings: accumulated (without trailing comma) + fragment
- merged = accumulatedForMerge + newFragmentForMerge
-
- # 4. Try to repair the merged result
- mergedRepaired = closeJsonStructures(merged)
-
- # 5. Try to parse the repaired result
- mergedParsed, mergedErr, _ = tryParseJson(mergedRepaired)
-
- if mergedErr is None and mergedParsed:
- # Successfully parsed - return formatted JSON
- return json.dumps(mergedParsed, indent=2, ensure_ascii=False)
- else:
- # Still can't parse - try overlap detection as fallback
- logger.debug(f"Fragment merge repair failed, trying overlap detection: {mergedErr}")
- return JsonResponseHandler._mergeJsonStringsWithOverlapFallback(
- accumulatedExtracted, newFragmentExtracted
- )
- else:
- # Both are complete JSON - use original overlap detection
- return JsonResponseHandler._mergeJsonStringsWithOverlapFallback(
- accumulatedExtracted, newFragmentExtracted
+ # If both normalized successfully, use structure-based merge with overlap
+ if accumulatedNormalized and continuationNormalized:
+ merged = JsonResponseHandler._mergeJsonStructuresGeneric(
+ accumulatedNormalized, continuationNormalized, accumulatedExtracted, continuationExtracted,
+ overlapElements=overlapElements
)
+ if merged:
+ return json.dumps(merged, indent=2, ensure_ascii=False)
+
+ # Fallback: use overlap elements to find merge point in accumulated
+ # Find where overlap elements match in accumulated
+ if accumulatedNormalized and overlapElements:
+ accumulatedElements = accumulatedNormalized.get("elements", [])
+ overlapStartIndex = JsonResponseHandler._findOverlapStartIndex(accumulatedElements, overlapElements)
+
+ if overlapStartIndex >= 0:
+ # Remove overlapping elements
+ accumulatedElements = accumulatedElements[:overlapStartIndex]
+ accumulatedNormalized["elements"] = accumulatedElements
+
+ # Merge continuation
+ if continuationNormalized:
+ continuationElements = continuationNormalized.get("elements", [])
+ accumulatedElements.extend(continuationElements)
+ accumulatedNormalized["elements"] = accumulatedElements
+ return json.dumps(accumulatedNormalized, indent=2, ensure_ascii=False)
+
+ # Last resort: simple concatenation
+ return JsonResponseHandler._mergeJsonStringsWithOverlapFallback(accumulated, continuationJson)
+
+ @staticmethod
+ def _extractValidJsonPrefix(jsonString: str) -> str:
+ """
+ Extract the longest valid JSON prefix from a string that may be cut randomly.
+
+ Strategy:
+ 1. Try to find the longest prefix that can be closed and parsed
+ 2. Handle random cuts (mid-string, mid-number, etc.)
+ 3. Return the longest valid prefix found
+
+ Args:
+ jsonString: JSON string that may be cut randomly
+
+ Returns:
+ Longest valid JSON prefix, or empty string if none found
+ """
+ if not jsonString or not jsonString.strip():
+ return ""
+
+ from modules.shared.jsonUtils import tryParseJson, closeJsonStructures
+
+        # Strategy 1: Try progressive truncation to find longest valid JSON
+        # Test coarse percentage steps plus a few near-end offsets instead of
+        # every possible cut point, to keep the number of parse attempts small
+ bestValid = ""
+ bestLength = 0
+ maxLen = len(jsonString)
+
+ # Generate test lengths: full, 95%, 90%, ..., 10%
+ testLengths = []
+ for percent in range(100, 9, -5):
+ testLen = int(maxLen * percent / 100)
+ if testLen > bestLength:
+ testLengths.append(testLen)
+
+ # Also test specific points near the end (common cut points)
+ for offset in [200, 100, 50, 20, 10, 5, 2, 1]:
+ if maxLen > offset:
+ testLen = maxLen - offset
+ if testLen > bestLength:
+ testLengths.append(testLen)
+
+ # Sort and deduplicate
+ testLengths = sorted(set(testLengths), reverse=True)
+
+ for testLen in testLengths:
+ if testLen <= bestLength:
+ continue # Already found better
+
+ testStr = jsonString[:testLen]
+ if not testStr.strip():
+ continue
+
+ # Try to close and parse
+ try:
+ closed = closeJsonStructures(testStr)
+ parsed, parseErr, _ = tryParseJson(closed)
+
+ if parseErr is None and parsed is not None:
+ # Valid JSON found
+ if testLen > bestLength:
+ bestValid = closed
+ bestLength = testLen
+ except Exception:
+ continue
+
+ # Strategy 2: If we found valid JSON, return it
+ if bestValid:
+ return bestValid
+
+ # Strategy 3: Try to extract balanced JSON (find first complete structure)
+ jsonStripped = jsonString.strip()
+
+ if jsonStripped.startswith('{') or jsonStripped.startswith('['):
+ # Try to extract balanced JSON
+ from modules.shared.jsonUtils import extractFirstBalancedJson
+ balanced = extractFirstBalancedJson(jsonStripped)
+ if balanced and balanced != jsonStripped:
+ try:
+ closed = closeJsonStructures(balanced)
+ parsed, parseErr, _ = tryParseJson(closed)
+ if parseErr is None:
+ return closed
+ except Exception:
+ pass
+
+ # Strategy 4: Try to repair by removing incomplete trailing structures
+ # Find the last complete element/item before the cut
+ try:
+ # For arrays: find last complete element
+ if jsonStripped.startswith('['):
+ # Find last complete array element
+ lastComma = jsonStripped.rfind(',')
+ if lastComma > 0:
+ # Try prefix up to last comma
+ prefix = jsonStripped[:lastComma].strip()
+ if prefix.endswith(','):
+ prefix = prefix[:-1].strip()
+ if prefix:
+ closed = closeJsonStructures(prefix + ']')
+ parsed, parseErr, _ = tryParseJson(closed)
+ if parseErr is None:
+ return closed
+
+ # For objects: find last complete key-value pair
+ elif jsonStripped.startswith('{'):
+ # Find last complete key-value pair
+ lastComma = jsonStripped.rfind(',')
+ if lastComma > 0:
+ # Try prefix up to last comma
+ prefix = jsonStripped[:lastComma].strip()
+ if prefix.endswith(','):
+ prefix = prefix[:-1].strip()
+ if prefix:
+ closed = closeJsonStructures(prefix + '}')
+ parsed, parseErr, _ = tryParseJson(closed)
+ if parseErr is None:
+ return closed
+ except Exception:
+ pass
+
+ # Last resort: return empty (caller will handle)
+ return ""
+
+ @staticmethod
+ def _smartConcatenate(accumulated: str, newFragment: str) -> str:
+ """
+ Smart concatenation that tries to merge JSON fragments intelligently.
+
+ Strategy:
+ 1. Extract valid JSON from both fragments
+ 2. Parse both as JSON objects/arrays
+ 3. Merge them structurally
+ 4. Return valid JSON
+
+ Args:
+ accumulated: Accumulated JSON string
+ newFragment: New fragment to append
+
+ Returns:
+ Merged string with valid JSON, or empty if merging not possible
+ """
+ if not accumulated or not newFragment:
+ return ""
+
+ from modules.shared.jsonUtils import closeJsonStructures, tryParseJson
+
+ # Extract valid JSON prefixes from both
+ accumulatedValid = JsonResponseHandler._extractValidJsonPrefix(accumulated)
+ newFragmentValid = JsonResponseHandler._extractValidJsonPrefix(newFragment)
+
+ if not accumulatedValid:
+ accumulatedValid = accumulated
+ if not newFragmentValid:
+ newFragmentValid = newFragment
+
+ # Try to parse both
+ try:
+ closedAccumulated = closeJsonStructures(accumulatedValid)
+ parsedAccumulated, parseErr1, _ = tryParseJson(closedAccumulated)
+
+ closedNewFragment = closeJsonStructures(newFragmentValid)
+ parsedNewFragment, parseErr2, _ = tryParseJson(closedNewFragment)
+
+ # If both parse successfully, merge structurally
+ if parseErr1 is None and parseErr2 is None:
+ # Normalize both to elements structure
+ accNormalized = JsonResponseHandler._normalizeToElementsStructure(closedAccumulated, accumulated)
+ newNormalized = JsonResponseHandler._normalizeToElementsStructure(closedNewFragment, newFragment)
+
+ if accNormalized and newNormalized:
+ merged = JsonResponseHandler._mergeJsonStructuresGeneric(
+ accNormalized, newNormalized, closedAccumulated, closedNewFragment
+ )
+ if merged:
+ return json.dumps(merged, indent=2, ensure_ascii=False)
+
+ # If only accumulated parses, return it
+ if parseErr1 is None and parsedAccumulated:
+ return json.dumps(parsedAccumulated, indent=2, ensure_ascii=False)
+
+ # If only new fragment parses, return it
+ if parseErr2 is None and parsedNewFragment:
+ return json.dumps(parsedNewFragment, indent=2, ensure_ascii=False)
+ except Exception:
+ pass
+
+ # Fallback: Try simple string concatenation with repair
+ accumulatedStripped = accumulated.strip()
+ newFragmentStripped = newFragment.strip()
+
+ # If accumulated doesn't end with } or ], it might be incomplete
+ if accumulatedStripped and not accumulatedStripped.endswith(('}', ']')):
+ try:
+ closedAccumulated = closeJsonStructures(accumulatedStripped)
+
+ # Check if newFragment starts with continuation
+ if newFragmentStripped.startswith(','):
+ # Remove leading comma and append
+ merged = closedAccumulated.rstrip() + newFragmentStripped.lstrip(',').strip()
+ elif newFragmentStripped.startswith(('}', ']')):
+ # Fragment starts with closing - might be completing accumulated
+ merged = closedAccumulated.rstrip() + newFragmentStripped
+ else:
+ # Try to append as continuation
+ # Check if we need a comma separator
+ if not closedAccumulated.rstrip().endswith((',', '[', '{')):
+ merged = closedAccumulated.rstrip() + ',' + newFragmentStripped
+ else:
+ merged = closedAccumulated.rstrip() + newFragmentStripped
+
+ # Try to repair and parse the merged result
+ repaired = closeJsonStructures(merged)
+ parsed, parseErr, _ = tryParseJson(repaired)
+ if parseErr is None:
+ return json.dumps(parsed, indent=2, ensure_ascii=False)
+ except Exception:
+ pass
+
+ # If smart concatenation failed, return empty (caller will handle)
+ return ""
@staticmethod
def _mergeJsonStringsWithOverlapFallback(
@@ -1477,28 +2638,107 @@ class JsonResponseHandler:
) -> str:
"""
Fallback overlap detection using string comparison.
- Used when both strings are complete JSON structures.
+ Used when both strings are complete JSON structures or fragments.
+
+ CRITICAL: Never returns empty JSON - always returns at least accumulated.
"""
if not accumulated:
- return newFragment
+ return newFragment if newFragment else "{}"
if not newFragment:
return accumulated
- # Find longest common suffix/prefix match
+ from modules.shared.jsonUtils import tryParseJson, closeJsonStructures
+
+ # Strategy 1: Try to extract valid JSON parts from both fragments
+ # This handles random cuts better by finding the longest valid prefix/suffix
+
+ # Extract valid JSON from accumulated (find longest valid prefix)
+ accumulatedValid = JsonResponseHandler._extractValidJsonPrefix(accumulated)
+
+ # Extract valid JSON from newFragment (find longest valid prefix)
+ newFragmentValid = JsonResponseHandler._extractValidJsonPrefix(newFragment)
+
+ # If we have valid JSON from both, try structure-based merge
+ if accumulatedValid and newFragmentValid:
+ try:
+ parsedAccumulated, parseErr1, _ = tryParseJson(closeJsonStructures(accumulatedValid))
+ parsedNewFragment, parseErr2, _ = tryParseJson(closeJsonStructures(newFragmentValid))
+
+ if parseErr1 is None and parseErr2 is None:
+ # Both are valid JSON - try structure merge
+ accNormalized = JsonResponseHandler._normalizeToElementsStructure(accumulatedValid, accumulated)
+ newNormalized = JsonResponseHandler._normalizeToElementsStructure(newFragmentValid, newFragment)
+
+ if accNormalized and newNormalized:
+ merged = JsonResponseHandler._mergeJsonStructuresGeneric(
+ accNormalized, newNormalized, accumulatedValid, newFragmentValid
+ )
+ if merged:
+ return json.dumps(merged, indent=2, ensure_ascii=False)
+ except Exception:
+ pass
+
+ # Strategy 2: Find longest common suffix/prefix match (character-level overlap)
maxOverlapLen = min(len(accumulated), len(newFragment))
# Start from maximum possible overlap down to 1 character
- for overlapLen in range(maxOverlapLen, 0, -1):
+ # But limit to reasonable overlap (max 50% of shorter string)
+        maxReasonableOverlap = maxOverlapLen // 2  # maxOverlapLen is already min(len(accumulated), len(newFragment))
+
+ for overlapLen in range(maxReasonableOverlap, 0, -1):
accumulatedSuffix = accumulated[-overlapLen:]
newFragmentPrefix = newFragment[:overlapLen]
if accumulatedSuffix == newFragmentPrefix:
# Found overlap - remove duplicate part
logger.debug(f"Found overlap of {overlapLen} characters, removing duplicate")
- return accumulated + newFragment[overlapLen:]
+ merged = accumulated + newFragment[overlapLen:]
+ # Ensure result is not empty
+ if merged and merged.strip():
+ return merged
- # No overlap found - simple concatenation
- return accumulated + newFragment
+ # Strategy 3: No overlap found - try smart concatenation
+ # Check if we can append newFragment to accumulated without breaking JSON structure
+ merged = JsonResponseHandler._smartConcatenate(accumulated, newFragment)
+ if merged and merged.strip():
+ return merged
+
+ # Strategy 4: Last resort - simple concatenation (but ensure non-empty and valid JSON)
+ result = accumulated + newFragment
+ if not result or result.strip() in ['{}', '[]', '']:
+ # Return accumulated as fallback (at least we have that)
+ return accumulated if accumulated else "{}"
+
+ # CRITICAL: Try to repair and validate the merged result
+ try:
+ repaired = closeJsonStructures(result)
+ parsed, parseErr, _ = tryParseJson(repaired)
+ if parseErr is None:
+ # Valid JSON - return it
+ return json.dumps(parsed, indent=2, ensure_ascii=False)
+ else:
+ # Still invalid - try to extract valid parts
+ validPrefix = JsonResponseHandler._extractValidJsonPrefix(result)
+ if validPrefix:
+ parsedPrefix, parseErr2, _ = tryParseJson(validPrefix)
+ if parseErr2 is None:
+ return json.dumps(parsedPrefix, indent=2, ensure_ascii=False)
+ except Exception:
+ pass
+
+ # If repair failed, return accumulated (at least we have that)
+ if accumulated:
+ try:
+ repairedAccumulated = closeJsonStructures(accumulated)
+ parsedAcc, parseErrAcc, _ = tryParseJson(repairedAccumulated)
+ if parseErrAcc is None:
+ return json.dumps(parsedAcc, indent=2, ensure_ascii=False)
+ except Exception:
+ pass
+ return accumulated
+
+ # Last resort: return empty structure
+ return "{}"
@staticmethod
def isJsonComplete(parsedJson: Dict[str, Any]) -> bool:
@@ -1851,10 +3091,12 @@ class JsonResponseHandler:
cleanedFragment = JsonResponseHandler.cleanEncodingIssues(newFragmentString)
# Step 3: Concatenate with overlap handling
- combinedString = JsonResponseHandler.mergeJsonStringsWithOverlap(
+ combinedString, hasOverlap = JsonResponseHandler.mergeJsonStringsWithOverlap(
cleanedAccumulated,
cleanedFragment
)
+    # Note: hasOverlap signals whether iterations should continue; this function
+    # does not control iteration, so only the merged string is used here
# Step 4: Try to parse
try:
diff --git a/modules/services/serviceAi/subLoopingUseCases.py b/modules/services/serviceAi/subLoopingUseCases.py
index c52ed1bc..dcf3e31e 100644
--- a/modules/services/serviceAi/subLoopingUseCases.py
+++ b/modules/services/serviceAi/subLoopingUseCases.py
@@ -18,7 +18,7 @@ class LoopingUseCase:
"""Configuration for a specific looping use case."""
# Identification
- useCaseId: str # "section_content", "chapter_structure", "document_structure", "code_structure", "code_content", "image_batch"
+ useCaseId: str # "section_content", "chapter_structure", "code_structure", "code_content"
# JSON Format Detection
jsonTemplate: Dict[str, Any] # Expected JSON structure template
@@ -145,24 +145,7 @@ class LoopingUseCaseRegistry:
requiresExtraction=False
))
- # Use Case 3: Document Structure Generation
- # Returns JSON with "documents[0].sections" structure, requires extraction and accumulation
- self.register(LoopingUseCase(
- useCaseId="document_structure",
- jsonTemplate={"documents": [{"sections": []}]},
- detectionKeys=["sections"],
- detectionPath="documents[0].sections",
- initialPromptBuilder=None,
- continuationPromptBuilder=None,
- accumulator=None, # Will use default accumulator
- merger=None, # Will use default merger
- continuationContextBuilder=None,
- resultBuilder=None, # Will use default result builder
- supportsAccumulation=True,
- requiresExtraction=True
- ))
-
- # Use Case 4: Code Structure Generation (NEW)
+ # Use Case 3: Code Structure Generation
self.register(LoopingUseCase(
useCaseId="code_structure",
jsonTemplate={
@@ -211,21 +194,5 @@ class LoopingUseCaseRegistry:
requiresExtraction=False
))
- # Use Case 6: Image Batch Generation (NEW)
- self.register(LoopingUseCase(
- useCaseId="image_batch",
- jsonTemplate={"images": []},
- detectionKeys=["images"],
- detectionPath="images",
- initialPromptBuilder=None,
- continuationPromptBuilder=None,
- accumulator=None, # Direct return
- merger=None,
- continuationContextBuilder=None,
- resultBuilder=None,
- supportsAccumulation=False,
- requiresExtraction=False
- ))
-
logger.info(f"Registered {len(self.useCases)} default looping use cases")
diff --git a/modules/services/serviceAi/subStructureFilling.py b/modules/services/serviceAi/subStructureFilling.py
index cad01980..81e82dcc 100644
--- a/modules/services/serviceAi/subStructureFilling.py
+++ b/modules/services/serviceAi/subStructureFilling.py
@@ -812,16 +812,18 @@ class StructureFiller:
)
else:
async def buildSectionPromptWithContinuation(
- section: Dict[str, Any],
- contentParts: List[ContentPart],
- userPrompt: str,
- generationHint: str,
- allSections: List[Dict[str, Any]],
- sectionIndex: int,
- isAggregation: bool,
continuationContext: Dict[str, Any],
- services: Any
+ **kwargs
) -> str:
+ """Build section prompt with continuation context. Extracts section-specific parameters from kwargs."""
+ # Extract parameters from kwargs (for section_content use case)
+ section = kwargs.get("section")
+ contentParts = kwargs.get("contentParts", [])
+ userPrompt = kwargs.get("userPrompt", "")
+ generationHint = kwargs.get("generationHint", "")
+ allSections = kwargs.get("allSections", [])
+ sectionIndex = kwargs.get("sectionIndex", 0)
+ isAggregation = kwargs.get("isAggregation", False)
basePrompt = self._buildSectionGenerationPrompt(
section=section,
contentParts=contentParts,
@@ -833,25 +835,81 @@ class StructureFiller:
language=language
)
- continuationInfo = continuationContext.get("delivered_summary", "")
- cutOffElement = continuationContext.get("cut_off_element", "")
+ # Extract JSON structure context for continuation
+ incompletePart = continuationContext.get("incomplete_part", "")
+ lastRawJson = continuationContext.get("last_raw_json", "")
+
+ # Build overlap context: extract last ~100 characters from the response for overlap
+ overlapContext = ""
+ if lastRawJson:
+ # Get last 100 characters for overlap
+ overlapContext = lastRawJson[-100:].strip()
+
+ # Build unified context showing structure hierarchy with cut point
+ # This combines structure template, last complete part, and incomplete part in one view
+ unifiedContext = ""
+ if lastRawJson:
+ # Find break position in raw JSON
+ if incompletePart:
+ breakPos = lastRawJson.find(incompletePart)
+ if breakPos == -1:
+ # Try to find where JSON ends
+ breakPos = len(lastRawJson.rstrip())
+ else:
+ # No incomplete part found - assume end of JSON
+ breakPos = len(lastRawJson.rstrip())
+
+ # Build intelligent context showing hierarchy
+ from modules.shared.jsonUtils import _buildIncompleteContext
+ unifiedContext = _buildIncompleteContext(lastRawJson, breakPos)
+ elif incompletePart:
+ # Fallback: use incomplete part directly
+ unifiedContext = incompletePart
+ else:
+ unifiedContext = "Unable to extract context - response was completely broken"
+
+ # Use the SAME template structure as in initial prompt
+ # Get contentType and contentStructureExample exactly like in _buildSectionGenerationPrompt
+ contentType = section.get("content_type", "paragraph")
+ contentStructureExample = self._getContentStructureExample(contentType)
+
+ # Build the exact same JSON structure template as in initial prompt
+ structureTemplate = f"""JSON Structure Template:
+{{
+ "elements": [
+ {{
+ "type": "{contentType}",
+ "content": {contentStructureExample}
+ }}
+ ]
+}}
+
+"""
continuationPrompt = f"""{basePrompt}
--- CONTINUATION REQUEST ---
-The previous JSON response was incomplete. Please continue from where it stopped.
+The previous JSON response was incomplete. Continue from where it stopped.
-PREVIOUSLY DELIVERED SUMMARY:
-{continuationInfo}
+{structureTemplate}Context showing structure hierarchy with cut point:
+{unifiedContext}
-LAST INCOMPLETE ELEMENT:
-{cutOffElement}
+Overlap Requirement:
+To ensure proper merging, your response MUST start by repeating approximately the last 100 characters from the previous response, then continue with new content.
-TASK: Continue generating the JSON elements array from where it was cut off.
-Complete the incomplete element and continue with remaining elements.
+Last ~100 characters from previous response (repeat these at the start):
+{overlapContext if overlapContext else "No overlap context available"}
-Return ONLY the continuation JSON (starting from the incomplete element).
-The JSON should be a fragment that can be merged with the previous response."""
+TASK:
+1. Start your response by repeating the last ~100 characters shown above (for overlap/merging)
+2. Complete the incomplete element shown in the context above (marked with CUT POINT)
+3. Continue generating the remaining content following the JSON structure template above
+4. Return ONLY valid JSON following the structure template - no overlap/continuation wrapper objects
+
+CRITICAL:
+- Your response must be valid JSON matching the structure template above
+- Start with overlap (~100 chars) then continue seamlessly
+- Complete the incomplete element and continue with remaining elements"""
return continuationPrompt
options = AiCallOptions(
@@ -1040,16 +1098,18 @@ The JSON should be a fragment that can be merged with the previous response."""
isAggregation = False
async def buildSectionPromptWithContinuation(
- section: Dict[str, Any],
- contentParts: List[ContentPart],
- userPrompt: str,
- generationHint: str,
- allSections: List[Dict[str, Any]],
- sectionIndex: int,
- isAggregation: bool,
continuationContext: Dict[str, Any],
- services: Any
+ **kwargs
) -> str:
+ """Build section prompt with continuation context. Extracts section-specific parameters from kwargs."""
+ # Extract parameters from kwargs (for section_content use case)
+ section = kwargs.get("section")
+ contentParts = kwargs.get("contentParts", [])
+ userPrompt = kwargs.get("userPrompt", "")
+ generationHint = kwargs.get("generationHint", "")
+ allSections = kwargs.get("allSections", [])
+ sectionIndex = kwargs.get("sectionIndex", 0)
+ isAggregation = kwargs.get("isAggregation", False)
basePrompt = self._buildSectionGenerationPrompt(
section=section,
contentParts=contentParts,
@@ -1061,25 +1121,81 @@ The JSON should be a fragment that can be merged with the previous response."""
language=language
)
- continuationInfo = continuationContext.get("delivered_summary", "")
- cutOffElement = continuationContext.get("cut_off_element", "")
+ # Extract JSON structure context for continuation
+ incompletePart = continuationContext.get("incomplete_part", "")
+ lastRawJson = continuationContext.get("last_raw_json", "")
+
+ # Build overlap context: extract last ~100 characters from the response for overlap
+ overlapContext = ""
+ if lastRawJson:
+ # Get last 100 characters for overlap
+ overlapContext = lastRawJson[-100:].strip()
+
+ # Build unified context showing structure hierarchy with cut point
+ # This combines structure template, last complete part, and incomplete part in one view
+ unifiedContext = ""
+ if lastRawJson:
+ # Find break position in raw JSON
+ if incompletePart:
+ breakPos = lastRawJson.find(incompletePart)
+ if breakPos == -1:
+                    # Incomplete part not found in raw JSON - fall back to end of trimmed text
+ breakPos = len(lastRawJson.rstrip())
+ else:
+ # No incomplete part found - assume end of JSON
+ breakPos = len(lastRawJson.rstrip())
+
+ # Build intelligent context showing hierarchy
+ from modules.shared.jsonUtils import _buildIncompleteContext
+ unifiedContext = _buildIncompleteContext(lastRawJson, breakPos)
+ elif incompletePart:
+ # Fallback: use incomplete part directly
+ unifiedContext = incompletePart
+ else:
+ unifiedContext = "Unable to extract context - response was completely broken"
+
+ # Use the SAME template structure as in initial prompt
+ # Get contentType and contentStructureExample exactly like in _buildSectionGenerationPrompt
+ contentType = section.get("content_type", "paragraph")
+ contentStructureExample = self._getContentStructureExample(contentType)
+
+ # Build the exact same JSON structure template as in initial prompt
+ structureTemplate = f"""JSON Structure Template:
+{{
+ "elements": [
+ {{
+ "type": "{contentType}",
+ "content": {contentStructureExample}
+ }}
+ ]
+}}
+
+"""
continuationPrompt = f"""{basePrompt}
--- CONTINUATION REQUEST ---
-The previous JSON response was incomplete. Please continue from where it stopped.
+The previous JSON response was incomplete. Continue from where it stopped.
-PREVIOUSLY DELIVERED SUMMARY:
-{continuationInfo}
+{structureTemplate}Context showing structure hierarchy with cut point:
+{unifiedContext}
-LAST INCOMPLETE ELEMENT:
-{cutOffElement}
+Overlap Requirement:
+To ensure proper merging, your response MUST start by repeating approximately the last 100 characters from the previous response, then continue with new content.
-TASK: Continue generating the JSON elements array from where it was cut off.
-Complete the incomplete element and continue with remaining elements.
+Last ~100 characters from previous response (repeat these at the start):
+{overlapContext if overlapContext else "No overlap context available"}
-Return ONLY the continuation JSON (starting from the incomplete element).
-The JSON should be a fragment that can be merged with the previous response."""
+TASK:
+1. Start your response by repeating the last ~100 characters shown above (for overlap/merging)
+2. Complete the incomplete element shown in the context above (marked with CUT POINT)
+3. Continue generating the remaining content following the JSON structure template above
+4. Return ONLY valid JSON following the structure template - no overlap/continuation wrapper objects
+
+CRITICAL:
+- Your response must be valid JSON matching the structure template above
+- Start with overlap (~100 chars) then continue seamlessly
+- Complete the incomplete element and continue with remaining elements"""
return continuationPrompt
options = AiCallOptions(
@@ -1343,16 +1459,19 @@ The JSON should be a fragment that can be merged with the previous response."""
isAggregation = False
async def buildSectionPromptWithContinuation(
- section: Dict[str, Any],
- contentParts: List[ContentPart],
- userPrompt: str,
- generationHint: str,
- allSections: List[Dict[str, Any]],
- sectionIndex: int,
- isAggregation: bool,
continuationContext: Dict[str, Any],
- services: Any
+ **kwargs
) -> str:
+ """Build section prompt with continuation context. Extracts section-specific parameters from kwargs."""
+ # Extract parameters from kwargs (for section_content use case)
+ section = kwargs.get("section")
+ contentParts = kwargs.get("contentParts", [])
+ userPrompt = kwargs.get("userPrompt", "")
+ generationHint = kwargs.get("generationHint", "")
+ allSections = kwargs.get("allSections", [])
+ sectionIndex = kwargs.get("sectionIndex", 0)
+ isAggregation = kwargs.get("isAggregation", False)
+ services = kwargs.get("services")
basePrompt = self._buildSectionGenerationPrompt(
section=section,
contentParts=contentParts,
@@ -1364,25 +1483,83 @@ The JSON should be a fragment that can be merged with the previous response."""
language=language
)
- continuationInfo = continuationContext.get("delivered_summary", "")
- cutOffElement = continuationContext.get("cut_off_element", "")
+ # Extract JSON structure context for continuation
+            incompletePart = continuationContext.get("incomplete_part", "")
+            lastRawJson = continuationContext.get("last_raw_json", "")
+
+ # Build overlap context: extract last ~100 characters from the response for overlap
+ overlapContext = ""
+ if lastRawJson:
+ # Get last 100 characters for overlap
+ overlapContext = lastRawJson[-100:].strip()
+
+ # Build unified context showing structure hierarchy with cut point
+ unifiedContext = ""
+ if lastRawJson:
+ # Find break position in raw JSON
+ if incompletePart:
+ breakPos = lastRawJson.find(incompletePart)
+ if breakPos == -1:
+                    # Incomplete part not found in raw JSON - fall back to end of trimmed text
+ breakPos = len(lastRawJson.rstrip())
+ else:
+ # No incomplete part found - assume end of JSON
+ breakPos = len(lastRawJson.rstrip())
+
+ # Build intelligent context showing hierarchy
+ from modules.shared.jsonUtils import _buildIncompleteContext
+ unifiedContext = _buildIncompleteContext(lastRawJson, breakPos)
+ elif incompletePart:
+ # Fallback: use incomplete part directly
+ unifiedContext = incompletePart
+ else:
+ unifiedContext = "Unable to extract context - response was completely broken"
+
+ # Use the SAME template structure as in initial prompt
+ # Get contentType and contentStructureExample exactly like in _buildSectionGenerationPrompt
+ contentType = section.get("content_type", "paragraph")
+ contentStructureExample = self._getContentStructureExample(contentType)
+
+ # Build the exact same JSON structure template as in initial prompt
+ structureTemplate = f"""JSON Structure Template:
+{{
+ "elements": [
+ {{
+ "type": "{contentType}",
+ "content": {contentStructureExample}
+ }}
+ ]
+}}
+
+"""
continuationPrompt = f"""{basePrompt}
--- CONTINUATION REQUEST ---
-The previous JSON response was incomplete. Please continue from where it stopped.
+The previous JSON response was incomplete. Continue from where it stopped.
-PREVIOUSLY DELIVERED SUMMARY:
-{continuationInfo}
+{structureTemplate}Context showing structure hierarchy with cut point:
+{unifiedContext}
-LAST INCOMPLETE ELEMENT:
-{cutOffElement}
+Overlap Requirement:
+To ensure proper merging, your response MUST start by repeating approximately the last 100 characters from the previous response, then continue with new content.
-TASK: Continue generating the JSON elements array from where it was cut off.
-Complete the incomplete element and continue with remaining elements.
+Last ~100 characters from previous response (repeat these at the start):
+{overlapContext if overlapContext else "No overlap context available"}
-Return ONLY the continuation JSON (starting from the incomplete element).
-The JSON should be a fragment that can be merged with the previous response."""
+TASK:
+1. Start your response by repeating the last ~100 characters shown above (for overlap/merging)
+2. Complete the incomplete element shown in the context above (marked with CUT POINT)
+3. Continue generating the remaining content following the JSON structure template above
+4. Return ONLY valid JSON following the structure template - no overlap/continuation wrapper objects
+
+CRITICAL:
+- Your response must be valid JSON matching the structure template above
+- Start with overlap (~100 chars) then continue seamlessly
+- Complete the incomplete element and continue with remaining elements"""
return continuationPrompt
options = AiCallOptions(
diff --git a/modules/services/serviceAi/subStructureGeneration.py b/modules/services/serviceAi/subStructureGeneration.py
index f029a432..085815fd 100644
--- a/modules/services/serviceAi/subStructureGeneration.py
+++ b/modules/services/serviceAi/subStructureGeneration.py
@@ -112,7 +112,12 @@ class StructureGenerator:
continuationContext: Optional[Dict[str, Any]] = None,
**kwargs
) -> str:
- """Build chapter structure prompt with optional continuation context."""
+ """Build chapter structure prompt with optional continuation context. Extracts chapter-specific parameters from kwargs."""
+ # Extract parameters from kwargs (for chapter_structure use case)
+ userPrompt = kwargs.get("userPrompt", "")
+ contentParts = kwargs.get("contentParts", [])
+ outputFormat = kwargs.get("outputFormat", "txt")
+
basePrompt = self._buildChapterStructurePrompt(
userPrompt=userPrompt,
contentParts=contentParts,
diff --git a/modules/services/serviceAi/test_json_merger.py b/modules/services/serviceAi/test_json_merger.py
new file mode 100644
index 00000000..13fa780c
--- /dev/null
+++ b/modules/services/serviceAi/test_json_merger.py
@@ -0,0 +1,594 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""
+Test cases for JSON merger with different use cases and random cuts.
+
+Tests the robustness of the JSON merger by:
+1. Creating test JSON for different use cases
+2. Cutting it randomly at various points
+3. Running the merger for each piece
+4. Checking completeness against original
+"""
+
+import json
+import random
+import logging
+import sys
+import os
+from typing import Dict, Any, List, Tuple
+
+# Add project root to Python path
+# Find project root by looking for gateway/modules structure
+currentFile = os.path.abspath(__file__)
+currentDir = os.path.dirname(currentFile)
+
+# Navigate up from: gateway/modules/services/serviceAi/test_json_merger.py
+# To project root: D:\Athi\Local\Web\poweron
+# Try different levels up
+candidates = [
+ os.path.abspath(os.path.join(currentDir, '../../../../')), # From gateway/modules/services/serviceAi
+ os.path.abspath(os.path.join(currentDir, '../../..')), # Alternative
+ os.path.abspath(os.path.join(currentDir, '../..')), # Another alternative
+]
+
+projectRoot = None
+for candidate in candidates:
+ gatewayModulesPath = os.path.join(candidate, 'gateway', 'modules')
+ if os.path.exists(gatewayModulesPath):
+ projectRoot = candidate
+ break
+
+# If still not found, try to find by looking for gateway directory
+if projectRoot is None:
+ searchDir = currentDir
+ for _ in range(10): # Max 10 levels up
+ gatewayPath = os.path.join(searchDir, 'gateway')
+ if os.path.exists(gatewayPath) and os.path.exists(os.path.join(gatewayPath, 'modules')):
+ projectRoot = searchDir
+ break
+ parent = os.path.dirname(searchDir)
+ if parent == searchDir: # Reached root
+ break
+ searchDir = parent
+
+if projectRoot is None:
+ raise RuntimeError(f"Could not find project root. Current file: {currentFile}")
+
+# Add gateway directory to Python path (not project root)
+gatewayPath = os.path.join(projectRoot, 'gateway')
+if gatewayPath not in sys.path:
+ sys.path.insert(0, gatewayPath)
+
+# Verify the path is correct
+modulesPath = os.path.join(projectRoot, 'gateway', 'modules')
+if not os.path.exists(modulesPath):
+ raise RuntimeError(f"Project root verification failed. Expected gateway/modules at: {modulesPath}")
+
+try:
+ from modules.services.serviceAi.subJsonResponseHandling import JsonResponseHandler
+ from modules.services.serviceAi.subJsonMerger import JsonMergeLogger
+ from modules.shared.jsonUtils import (
+ normalizeJsonText, stripCodeFences, closeJsonStructures, tryParseJson,
+ extractJsonStructureContext
+ )
+except ImportError as e:
+ # Try to help debug
+ print(f"Import error: {e}")
+ print(f"Project root: {projectRoot}")
+ print(f"Gateway path: {gatewayPath}")
+ print(f"Python path (first 3): {sys.path[:3]}")
+ print(f"Looking for modules at: {modulesPath}")
+ print(f"Exists: {os.path.exists(modulesPath)}")
+ if os.path.exists(modulesPath):
+ print(f"Contents: {os.listdir(modulesPath)[:5]}")
+ raise
+
+logger = logging.getLogger(__name__)
+
+
+def createTestJsonForUseCase(useCaseId: str, size: int = 100) -> Dict[str, Any]:
+ """
+ Create test JSON for a specific use case.
+
+ Args:
+ useCaseId: Use case ID (section_content, chapter_structure, etc.)
+ size: Size of test data (number of elements/rows/items)
+
+ Returns:
+ Test JSON dictionary
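+
+    Example (illustrative):
+        createTestJsonForUseCase("chapter_structure", size=2)
+        # -> {"documents": [{"chapters": [
+        #      {"id": "chapter_0", "title": "Chapter 0", "level": 1},
+        #      {"id": "chapter_1", "title": "Chapter 1", "level": 1}]}]}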
+ """
+ if useCaseId == "section_content":
+ # Create table with rows
+ elements = [{
+ "type": "table",
+ "content": {
+ "headers": ["Year", "Value"],
+ "rows": [[str(1947 + i), str(10000 + i * 100)] for i in range(size)]
+ }
+ }]
+ return {"elements": elements}
+
+ elif useCaseId == "chapter_structure":
+ chapters = [{
+ "id": f"chapter_{i}",
+ "title": f"Chapter {i}",
+ "level": 1
+ } for i in range(size)]
+ return {"documents": [{"chapters": chapters}]}
+
+ elif useCaseId == "code_structure":
+ files = [{
+ "id": f"file_{i}",
+ "filename": f"file_{i}.py",
+ "fileType": "python",
+ "functions": [f"function_{i}_{j}" for j in range(5)]
+ } for i in range(size)]
+ return {"files": files}
+
+ elif useCaseId == "code_content":
+ files = [{
+ "id": f"file_{i}",
+ "content": f"# File {i}\ndef function_{i}():\n pass\n" * 10,
+ "functions": [{"name": f"function_{i}_{j}", "line": j * 3} for j in range(5)]
+ } for i in range(size)]
+ return {"files": files}
+
+ else:
+ raise ValueError(f"Unknown use case: {useCaseId}")
+
+
+def cutJsonRandomly(jsonString: str, numCuts: int = 5, overlapSize: int = 100) -> List[str]:
+ """
+ Cut JSON string RANDOMLY at different points WITH OVERLAP between fragments.
+ Each fragment overlaps with the previous one to help merging.
+
+ Args:
+ jsonString: JSON string to cut
+ numCuts: Number of cuts to make
+ overlapSize: Size of overlap between fragments (in characters)
+
+ Returns:
+ List of JSON fragments with overlap
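+
+    Example (illustrative):
+        fragments = cutJsonRandomly(jsonString, numCuts=3, overlapSize=20)
+        # fragments[1] starts with (up to) the last 20 characters of fragments[0],
+        # mimicking the overlap a continuation prompt asks the model to repeat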
+ """
+ fragments = []
+ currentPos = 0
+ totalLength = len(jsonString)
+
+ if totalLength == 0:
+ return []
+
+ # First fragment: from start to first cut point
+ if numCuts > 0:
+ # First cut point (between 20% and 40% of total)
+ firstCutPoint = random.randint(int(totalLength * 0.2), int(totalLength * 0.4))
+ fragment = jsonString[:firstCutPoint]
+ fragments.append(fragment)
+ currentPos = firstCutPoint
+ else:
+ # No cuts - return whole string
+ return [jsonString]
+
+ # Subsequent fragments: each starts with overlap from previous, then continues
+ for i in range(numCuts - 1):
+ if currentPos >= totalLength:
+ break
+
+ # Calculate overlap start (go back overlapSize from current position)
+ overlapStart = max(0, currentPos - overlapSize)
+
+ # Calculate next cut point (between 20% and 40% of remaining)
+ remaining = totalLength - currentPos
+ if remaining < overlapSize * 2:
+ # Not enough remaining - add rest as last fragment
+ fragment = jsonString[overlapStart:]
+ fragments.append(fragment)
+ break
+
+ # Next cut point from current position
+ nextCutPoint = currentPos + random.randint(int(remaining * 0.2), int(remaining * 0.4))
+ nextCutPoint = min(nextCutPoint, totalLength)
+
+ # Fragment: from overlap start to next cut point
+ fragment = jsonString[overlapStart:nextCutPoint]
+ fragments.append(fragment)
+
+ currentPos = nextCutPoint
+
+ # Add remaining as last fragment (with overlap)
+ if currentPos < totalLength:
+ overlapStart = max(0, currentPos - overlapSize)
+ fragment = jsonString[overlapStart:]
+ fragments.append(fragment)
+
+ return fragments
+
+
+def testMergerWithFragments(
+ originalJson: Dict[str, Any],
+ fragments: List[str],
+ useCaseId: str
+) -> Tuple[bool, Dict[str, Any], str]:
+ """
+ Test merger by merging fragments sequentially.
+
+ Args:
+ originalJson: Original complete JSON
+ fragments: List of JSON fragments to merge
+ useCaseId: Use case ID
+
+ Returns:
+ Tuple of (success, merged_json, error_message)
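+
+    Example (illustrative):
+        ok, merged, err = testMergerWithFragments(original, fragments, "section_content")
+        # ok is False (with details in err) if any merge step yields empty JSON
+        # or the final accumulated string cannot be parsed or repaired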
+ """
+ if not fragments:
+ return False, {}, "No fragments provided"
+
+ # Log structure context for each fragment (especially incomplete ones)
+ print(f"\n{'='*60}")
+ print(f"FRAGMENT ANALYSIS (use case: {useCaseId})")
+ print(f"{'='*60}")
+
+ for fragIdx, fragment in enumerate(fragments):
+ print(f"\nFragment {fragIdx + 1}/{len(fragments)}:")
+ print(f" Length: {len(fragment)} chars")
+
+ # Extract structure context for this fragment
+ try:
+ structureContext = extractJsonStructureContext(fragment, useCaseId)
+
+ templateStructure = structureContext.get("template_structure", "")
+ lastCompletePart = structureContext.get("last_complete_part", "")
+ incompletePart = structureContext.get("incomplete_part", "")
+ structureContextJson = structureContext.get("structure_context", "")
+
+ # Check if fragment is incomplete
+ normalized = stripCodeFences(normalizeJsonText(fragment)).strip()
+ parsed, parseErr, _ = tryParseJson(normalized)
+ isIncomplete = parseErr is not None or (parsed is None)
+
+ if isIncomplete:
+ print(f" Status: INCOMPLETE (cut off)")
+ print(f" Template Structure:")
+ if templateStructure:
+ # Show first few lines of template
+ templateLines = templateStructure.split('\n')
+ templateLinesToShow = templateLines[:5]
+ for line in templateLinesToShow:
+ print(f" {line}")
+ if len(templateLines) > 5:
+ remainingLines = len(templateLines) - 5
+ print(f" ... ({remainingLines} more lines)")
+ else:
+ print(f" (not available)")
+
+ print(f" Structure Context:")
+ if structureContextJson:
+ # Show structure context
+ contextLines = structureContextJson.split('\n')
+ contextLinesToShow = contextLines[:5]
+ for line in contextLinesToShow:
+ print(f" {line}")
+ if len(contextLines) > 5:
+ remainingContextLines = len(contextLines) - 5
+ print(f" ... ({remainingContextLines} more lines)")
+ else:
+ print(f" (not available)")
+
+ print(f" Last Complete Part:")
+ if lastCompletePart:
+ # Show last complete part (truncated if too long)
+ if len(lastCompletePart) > 200:
+ print(f" {lastCompletePart[:200]}... ({len(lastCompletePart)} chars total)")
+ else:
+ print(f" {lastCompletePart}")
+ else:
+ print(f" (not available)")
+
+ print(f" Incomplete Part:")
+ if incompletePart:
+ # Show incomplete part (truncated if too long)
+ if len(incompletePart) > 200:
+ print(f" {incompletePart[:200]}... ({len(incompletePart)} chars total)")
+ else:
+ print(f" {incompletePart}")
+ else:
+ print(f" (not available)")
+ else:
+ print(f" Status: COMPLETE")
+ if structureContextJson:
+ print(f" Structure Context:")
+ contextLines = structureContextJson.split('\n')
+ contextLinesToShow = contextLines[:3]
+ for line in contextLinesToShow:
+ print(f" {line}")
+ if len(contextLines) > 3:
+ remainingContextLines = len(contextLines) - 3
+ print(f" ... ({remainingContextLines} more lines)")
+ except Exception as e:
+ print(f" Error extracting structure context: {e}")
+
+ print(f"\n{'='*60}\n")
+
+ # Start with first fragment
+ accumulated = fragments[0]
+
+ # Merge each subsequent fragment
+ for i, fragment in enumerate(fragments[1:], 1):
+ try:
+ accumulated, hasOverlap = JsonResponseHandler.mergeJsonStringsWithOverlap(
+ accumulated, fragment
+ )
+ # Log if no overlap was found (iterations would stop in real scenario)
+ if not hasOverlap:
+ print(f" ⚠️ Fragment {i}: No overlap found - iterations would stop here")
+
+ # Check if result is empty (should never happen)
+ if not accumulated or accumulated.strip() in ['{"elements": []}', '{}', '']:
+ return False, {}, f"Merge {i} returned empty JSON"
+
+ except Exception as e:
+ return False, {}, f"Merge {i} failed with error: {str(e)}"
+
+ # Parse merged result
+ try:
+ # Normalize and try to parse
+ normalized = stripCodeFences(normalizeJsonText(accumulated)).strip()
+
+ # Try to parse directly
+ parsed, parseErr, _ = tryParseJson(normalized)
+
+ if parseErr is not None:
+ # Try closing structures if incomplete
+ try:
+ closed = closeJsonStructures(normalized)
+ parsed, parseErr2, _ = tryParseJson(closed)
+ if parseErr2 is not None:
+ # Try to extract valid JSON prefix
+ # JsonResponseHandler is already imported at module level
+ validPrefix = JsonResponseHandler._extractValidJsonPrefix(normalized)
+ if validPrefix:
+ parsed, parseErr3, _ = tryParseJson(validPrefix)
+ if parseErr3 is not None:
+ return False, {}, f"Final parse error: {str(parseErr3)}"
+ else:
+ return False, {}, f"Final parse error: {str(parseErr2)}"
+            except Exception as repairErr:  # avoid shadowing the outer parseErr
+                return False, {}, f"Final parse error: {str(repairErr)}"
+
+ if not parsed:
+ return False, {}, "Final parse returned None"
+
+ # CRITICAL: Ensure parsed is a dict, not a list
+ # If it's a list, wrap it in the expected structure based on use case
+ if isinstance(parsed, list):
+ # Try to normalize list to expected structure
+ if useCaseId == "section_content":
+ # List of elements - wrap in elements structure
+ parsed = {"elements": parsed}
+ elif useCaseId == "chapter_structure":
+ # List of chapters - wrap in documents structure
+ parsed = {"documents": [{"chapters": parsed}]}
+ elif useCaseId == "code_structure":
+ # List of files - wrap in files structure
+ parsed = {"files": parsed}
+ elif useCaseId == "code_content":
+ # List of files - wrap in files structure
+ parsed = {"files": parsed}
+ else:
+ # Unknown use case - try to wrap as elements
+ parsed = {"elements": parsed}
+
+ # Ensure it's a dict now
+ if not isinstance(parsed, dict):
+ return False, {}, f"Final parse returned unexpected type: {type(parsed).__name__}"
+
+ return True, parsed, ""
+
+ except Exception as e:
+ return False, {}, f"Final parse failed: {str(e)}"
+
+
+def compareJsonCompleteness(
+ original: Dict[str, Any],
+ merged: Dict[str, Any],
+ useCaseId: str
+) -> Tuple[bool, str]:
+ """
+ Compare merged JSON with original to check completeness.
+
+ Args:
+ original: Original JSON
+ merged: Merged JSON (must be a dict)
+ useCaseId: Use case ID
+
+ Returns:
+ Tuple of (is_complete, message)
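+
+    Example (illustrative):
+        compareJsonCompleteness({"files": [{}, {}]}, {"files": [{}]}, "code_structure")
+        # -> (False, "Missing files: 2 expected, 1 found")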
+ """
+ # CRITICAL: Ensure merged is a dict
+ if not isinstance(merged, dict):
+ return False, f"Merged JSON is not a dict, got {type(merged).__name__}"
+
+ if useCaseId == "section_content":
+ origElements = original.get("elements", [])
+ mergedElements = merged.get("elements", [])
+
+ if not isinstance(origElements, list):
+ return False, f"Original elements is not a list: {type(origElements).__name__}"
+ if not isinstance(mergedElements, list):
+ return False, f"Merged elements is not a list: {type(mergedElements).__name__}"
+
+ if len(mergedElements) < len(origElements):
+ return False, f"Missing elements: {len(origElements)} expected, {len(mergedElements)} found"
+
+ # Check table rows
+ if origElements and mergedElements:
+ origTable = origElements[0] if isinstance(origElements[0], dict) else {}
+ mergedTable = mergedElements[0] if isinstance(mergedElements[0], dict) else {}
+
+ if not origTable or not mergedTable:
+ return False, f"Table structure missing: origTable={bool(origTable)}, mergedTable={bool(mergedTable)}"
+
+ origRows = origTable.get("content", {}).get("rows", []) if isinstance(origTable.get("content"), dict) else origTable.get("rows", [])
+ mergedRows = mergedTable.get("content", {}).get("rows", []) if isinstance(mergedTable.get("content"), dict) else mergedTable.get("rows", [])
+
+ if not isinstance(origRows, list):
+ return False, f"Original rows is not a list: {type(origRows).__name__}"
+ if not isinstance(mergedRows, list):
+ return False, f"Merged rows is not a list: {type(mergedRows).__name__}"
+
+ if len(mergedRows) < len(origRows):
+ return False, f"Missing rows: {len(origRows)} expected, {len(mergedRows)} found"
+
+ return True, "Complete"
+
+ elif useCaseId == "chapter_structure":
+ origChapters = original.get("documents", [{}])[0].get("chapters", [])
+ mergedChapters = merged.get("documents", [{}])[0].get("chapters", [])
+
+ if len(mergedChapters) < len(origChapters):
+ return False, f"Missing chapters: {len(origChapters)} expected, {len(mergedChapters)} found"
+
+ return True, "Complete"
+
+ elif useCaseId == "code_structure":
+ origFiles = original.get("files", [])
+ mergedFiles = merged.get("files", [])
+
+ if len(mergedFiles) < len(origFiles):
+ return False, f"Missing files: {len(origFiles)} expected, {len(mergedFiles)} found"
+
+ return True, "Complete"
+
+ elif useCaseId == "code_content":
+ origFiles = original.get("files", [])
+ mergedFiles = merged.get("files", [])
+
+ if len(mergedFiles) < len(origFiles):
+ return False, f"Missing files: {len(origFiles)} expected, {len(mergedFiles)} found"
+
+ return True, "Complete"
+
+ else:
+ return False, f"Unknown use case: {useCaseId}"
+
+
+def runTestForUseCase(useCaseId: str, size: int = 50, numTests: int = 10) -> Dict[str, Any]:
+ """
+ Run multiple tests for a use case with random cuts.
+
+ Args:
+ useCaseId: Use case ID
+ size: Size of test data
+ numTests: Number of test runs
+
+ Returns:
+ Test results dictionary
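+
+    Example (illustrative, assuming all runs succeed):
+        runTestForUseCase("code_structure", size=10, numTests=3)
+        # -> {"useCaseId": "code_structure", "size": 10, "numTests": 3,
+        #     "passed": 3, "failed": 0, "errors": []}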
+ """
+ results = {
+ "useCaseId": useCaseId,
+ "size": size,
+ "numTests": numTests,
+ "passed": 0,
+ "failed": 0,
+ "errors": []
+ }
+
+ for testNum in range(numTests):
+ try:
+ # Create test JSON
+ originalJson = createTestJsonForUseCase(useCaseId, size)
+ originalString = json.dumps(originalJson, indent=2, ensure_ascii=False)
+
+ # Cut randomly
+ fragments = cutJsonRandomly(originalString, numCuts=random.randint(3, 7))
+
+ # Test merger
+ success, mergedJson, errorMsg = testMergerWithFragments(
+ originalJson, fragments, useCaseId
+ )
+
+ if not success:
+ results["failed"] += 1
+ results["errors"].append(f"Test {testNum + 1}: {errorMsg}")
+ continue
+
+ # Check completeness
+ isComplete, completenessMsg = compareJsonCompleteness(
+ originalJson, mergedJson, useCaseId
+ )
+
+ if isComplete:
+ results["passed"] += 1
+ else:
+ results["failed"] += 1
+ results["errors"].append(f"Test {testNum + 1}: {completenessMsg}")
+
+ except Exception as e:
+ results["failed"] += 1
+ results["errors"].append(f"Test {testNum + 1}: Exception - {str(e)}")
+
+ return results
+
+
+def runAllTests():
+ """Run tests for all use cases."""
+ useCases = [
+ "section_content",
+ "chapter_structure",
+ "code_structure",
+ "code_content"
+ ]
+
+ allResults = []
+
+ for useCaseId in useCases:
+ print(f"\n{'='*60}")
+ print(f"Testing use case: {useCaseId}")
+ print(f"{'='*60}")
+
+        # Initialize log file for this use case (overwritten on each test run)
+ logFileName = f"json_merger_{useCaseId}.txt"
+ JsonMergeLogger.initializeLogFile(logFileName)
+ print(f"Log file: {logFileName}")
+
+ results = runTestForUseCase(useCaseId, size=50, numTests=10)
+ allResults.append(results)
+
+ print(f"Passed: {results['passed']}/{results['numTests']}")
+ print(f"Failed: {results['failed']}/{results['numTests']}")
+
+ if results["errors"]:
+ print("\nErrors:")
+ for error in results["errors"][:5]: # Show first 5 errors
+ print(f" - {error}")
+
+ # Summary
+ print(f"\n{'='*60}")
+ print("SUMMARY")
+ print(f"{'='*60}")
+
+ totalPassed = sum(r["passed"] for r in allResults)
+ totalFailed = sum(r["failed"] for r in allResults)
+ totalTests = sum(r["numTests"] for r in allResults)
+
+ print(f"Total tests: {totalTests}")
+ print(f"Passed: {totalPassed}")
+ print(f"Failed: {totalFailed}")
+ print(f"Success rate: {totalPassed / totalTests * 100:.1f}%")
+
+ return allResults
+
+
+if __name__ == "__main__":
+ # Set up logging - use WARNING level to reduce noise from jsonUtils
+ logging.basicConfig(level=logging.WARNING)
+
+ # Run tests
+ results = runAllTests()
+
+ # Save results to file (in project root)
+ resultsFile = os.path.join(projectRoot, "test_json_merger_results.json")
+ with open(resultsFile, "w", encoding="utf-8") as f:
+ json.dump(results, f, indent=2, ensure_ascii=False)
+
+ print(f"\nResults saved to {resultsFile}")
diff --git a/modules/services/serviceGeneration/paths/codePath.py b/modules/services/serviceGeneration/paths/codePath.py
index e25cfccc..336c30d8 100644
--- a/modules/services/serviceGeneration/paths/codePath.py
+++ b/modules/services/serviceGeneration/paths/codePath.py
@@ -640,6 +640,7 @@ Return ONLY valid JSON matching the request above.
```
"""
+ # Build base prompt
contentPrompt = f"""# TASK: Generate Code File Content
Generate complete, executable code for the file: {filename}
@@ -678,6 +679,130 @@ Return ONLY valid JSON in this format:
}}
"""
+ # Build continuation prompt builder
+ async def buildCodeContentPromptWithContinuation(
+ continuationContext: Optional[Dict[str, Any]] = None,
+ **kwargs
+ ) -> str:
+ """Build code content prompt with optional continuation context. Extracts code-specific parameters from kwargs."""
+ # Extract parameters from kwargs (for code_content use case)
+ filename = kwargs.get("filename", "")
+ fileType = kwargs.get("fileType", "")
+ functions = kwargs.get("functions", [])
+ classes = kwargs.get("classes", [])
+ dependencies = kwargs.get("dependencies", [])
+ metadata = kwargs.get("metadata", {})
+ userPrompt = kwargs.get("userPrompt", "")
+ contentParts = kwargs.get("contentParts", [])
+ contextInfo = kwargs.get("contextInfo", "")
+
+ # Rebuild base prompt (same as initial prompt)
+ userRequestSection = ""
+ if userPrompt:
+ userRequestSection = f"""
+## ORIGINAL USER REQUEST
+```
+{userPrompt}
+```
+"""
+
+ contentPartsSection = ""
+ if contentParts:
+ relevantParts = []
+ for part in contentParts:
+ usageHint = part.metadata.get('usageHint', '').lower()
+ originalFileName = part.metadata.get('originalFileName', '').lower()
+ filenameLower = filename.lower()
+
+ if (filenameLower in usageHint or
+ filenameLower in originalFileName or
+ part.metadata.get('contentFormat') == 'reference' or
+ (part.data and len(str(part.data).strip()) > 0)):
+ relevantParts.append(part)
+
+ if relevantParts:
+ contentPartsSection = "\n## AVAILABLE CONTENT PARTS\n"
+ for i, part in enumerate(relevantParts, 1):
+ contentFormat = part.metadata.get("contentFormat", "unknown")
+ originalFileName = part.metadata.get('originalFileName', 'N/A')
+ contentPartsSection += f"\n{i}. ContentPart ID: {part.id}\n"
+ contentPartsSection += f" Format: {contentFormat}\n"
+ contentPartsSection += f" Type: {part.typeGroup}\n"
+ contentPartsSection += f" Original file name: {originalFileName}\n"
+ contentPartsSection += f" Usage hint: {part.metadata.get('usageHint', 'N/A')}\n"
+ if part.data and isinstance(part.data, str) and len(part.data) < 2000:
+ contentPartsSection += f" Content preview: {part.data[:500]}...\n"
+
+ basePrompt = f"""# TASK: Generate Code File Content
+
+Generate complete, executable code for the file: {filename}
+{userRequestSection}## FILE SPECIFICATIONS
+
+File Type: {fileType}
+Language: {metadata.get('language', 'python') if metadata else 'python'}
+{contentPartsSection}
+
+Required functions:
+{json.dumps(functions, indent=2) if functions else 'None specified'}
+
+Required classes:
+{json.dumps(classes, indent=2) if classes else 'None specified'}
+
+Dependencies on other files: {', '.join(dependencies) if dependencies else 'None'}
+{contextInfo}
+
+Generate complete, production-ready code with:
+1. Proper imports (including imports from other files in the project if dependencies exist)
+2. All required functions and classes
+3. Error handling
+4. Documentation/docstrings
+5. Type hints where appropriate
+
+Return ONLY valid JSON in this format:
+{{
+ "files": [
+ {{
+ "filename": "{filename}",
+ "content": "// Complete code here",
+ "functions": {json.dumps(functions, indent=2) if functions else '[]'},
+ "classes": {json.dumps(classes, indent=2) if classes else '[]'}
+ }}
+ ]
+}}
+"""
+
+ if continuationContext:
+ # Add continuation instructions
+ deliveredSummary = continuationContext.get("delivered_summary", "")
+ elementBeforeCutoff = continuationContext.get("element_before_cutoff", "")
+ cutOffElement = continuationContext.get("cut_off_element", "")
+
+ continuationText = f"{deliveredSummary}\n\n"
+ continuationText += "⚠️ CONTINUATION: Response was cut off. Generate ONLY the remaining content that comes AFTER the reference elements below.\n\n"
+
+ if elementBeforeCutoff:
+ continuationText += "# REFERENCE: Last complete element (already delivered - DO NOT repeat):\n"
+ continuationText += f"{elementBeforeCutoff}\n\n"
+
+ if cutOffElement:
+ continuationText += "# REFERENCE: Incomplete element (cut off here - DO NOT repeat):\n"
+ continuationText += f"{cutOffElement}\n\n"
+
+ continuationText += "⚠️ CRITICAL: The elements above are REFERENCE ONLY. They are already delivered.\n"
+ continuationText += "Generate ONLY what comes AFTER these elements. DO NOT regenerate the entire JSON structure.\n"
+ continuationText += "Continue generating the remaining code content now.\n\n"
+
+ return f"""{basePrompt}
+
+--- CONTINUATION REQUEST ---
+
+{continuationText}
+
+Continue generating the remaining code content now.
+"""
+ else:
+ return basePrompt
+
# Use generic looping system with code_content use case
options = AiCallOptions(
operationType=OperationTypeEnum.DATA_GENERATE,
@@ -687,6 +812,19 @@ Return ONLY valid JSON in this format:
contentJson = await self.services.ai.callAiWithLooping(
prompt=contentPrompt,
options=options,
+ promptBuilder=buildCodeContentPromptWithContinuation,
+ promptArgs={
+ "filename": filename,
+ "fileType": fileType,
+ "functions": functions,
+ "classes": classes,
+ "dependencies": dependencies,
+ "metadata": metadata,
+ "userPrompt": userPrompt,
+ "contentParts": contentParts,
+ "contextInfo": contextInfo,
+ "services": self.services
+ },
useCaseId="code_content",
debugPrefix=f"code_content_{fileStructure.get('id', 'file')}",
)
diff --git a/modules/shared/jsonUtils.py b/modules/shared/jsonUtils.py
index 9a7cffab..7769e0d9 100644
--- a/modules/shared/jsonUtils.py
+++ b/modules/shared/jsonUtils.py
@@ -212,106 +212,77 @@ def repairBrokenJson(text: str) -> Optional[Dict[str, Any]]:
def closeJsonStructures(text: str) -> str:
"""
- Close incomplete JSON structures by adding missing closing brackets.
- Also handles unterminated strings by closing them.
+ Close incomplete JSON structures generically and correctly.
+
+ Generic approach:
+ 1. Close unterminated strings (if odd number of quotes)
+ 2. Track structure opening order with stack (LIFO)
+ 3. Close structures in reverse order (last opened, first closed)
+ 4. Remove trailing commas only directly before closing brackets/braces
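+
+    Example (illustrative):
+        closeJsonStructures('{"a": [1, 2,')  # -> '{"a": [1, 2]}'
+        closeJsonStructures('{"a": "val')    # -> '{"a": "val"}'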
"""
if not text:
return text
result = text
- # Handle unterminated strings: find the last unclosed string
- # Look for patterns like: "value" or "value\n (unterminated)
- # Check if we're in the middle of a string value when text ends
- if result.strip():
- # re is already imported at module level
- # Count quotes - if odd number, we have an unterminated string
- quoteCount = result.count('"')
- if quoteCount % 2 == 1:
- # Find the last opening quote that's not escaped
- lastQuotePos = result.rfind('"')
- if lastQuotePos >= 0:
- # Check if it's escaped
+ # Step 1: Close unterminated strings
+ # Simple: if odd number of quotes, find last unescaped quote and close it
+ quoteCount = result.count('"')
+ if quoteCount % 2 == 1:
+ # Find last unescaped quote
+ i = len(result) - 1
+ while i >= 0:
+ if result[i] == '"':
+ # Count backslashes before quote
escapeCount = 0
- i = lastQuotePos - 1
- while i >= 0 and result[i] == '\\':
+ j = i - 1
+ while j >= 0 and result[j] == '\\':
escapeCount += 1
- i -= 1
- # If not escaped (even number of backslashes), close the string
+ j -= 1
+ # If even number of backslashes, quote is not escaped
if escapeCount % 2 == 0:
- # Find where the string should end (before next comma, bracket, or brace)
- # For now, just close it at the end
result += '"'
- else:
- # Even number of quotes, but might still be in middle of string if cut off
- # More robust detection: check if text ends with alphanumeric/text chars after a quote
- # This handles cases like: "text": "value cut off mid-word
-
- # Pattern 1: ends with colon + quote + text (no closing quote)
- if re.search(r':\s*"[^"]*$', result):
- # We're in the middle of a string value, close it
- result += '"'
- else:
- # Pattern 2: find last quote and check what comes after
- lastQuotePos = result.rfind('"')
- if lastQuotePos >= 0:
- afterQuote = result[lastQuotePos + 1:]
- # If after quote we have text (alphanumeric/whitespace) but no closing quote/comma/brace
- # and the text doesn't end with structural characters, we're likely in a string
- if afterQuote:
- # Check if it looks like we're in a string value (has text, no closing quote)
- # Pattern: ends with letters/numbers/spaces, not ending with quote, comma, }, or ]
- if re.search(r'[a-zA-Z0-9\s]$', result) and not re.match(r'^\s*[,}\]\]]', afterQuote):
- # Check if it's escaped
- escapeCount = 0
- i = lastQuotePos - 1
- while i >= 0 and result[i] == '\\':
- escapeCount += 1
- i -= 1
- if escapeCount % 2 == 0:
- # Verify we're actually in a string context (not in a key name)
- # Look backwards to see if we have ": " before the quote (value context)
- beforeQuote = result[:lastQuotePos]
- # Check if we're in a value context (has ": " before quote) or in an array (has "[ before quote)
- if re.search(r':\s*"', beforeQuote[-50:]) or re.search(r'\[\s*"', beforeQuote[-50:]):
- result += '"'
- # Also check if text ends with alphanumeric (likely cut off mid-word)
- elif re.search(r'[a-zA-Z]$', result):
- # If we end with a letter and have a quote before it, likely in a string
- result += '"'
-
- # Final fallback: if text ends with alphanumeric and we have quotes, try to close the last string
- # This handles edge cases where patterns above didn't match
- if result.strip() and re.search(r'[a-zA-Z0-9]$', result):
- # Count quotes - if we have quotes and end with text, might be in a string
- if quoteCount > 0:
- lastQuotePos = result.rfind('"')
- if lastQuotePos >= 0:
- afterQuote = result[lastQuotePos + 1:]
- # If after quote is text (not empty, not structural), close it
- if afterQuote and re.search(r'^[a-zA-Z0-9\s]+$', afterQuote[:50]): # Check first 50 chars after quote
- # Make sure we're not already closed (check if next char would be quote/comma/brace)
- if not result.endswith('"') and not result.endswith(',') and not result.endswith('}') and not result.endswith(']'):
- # Check if escaped
- escapeCount = 0
- i = lastQuotePos - 1
- while i >= 0 and result[i] == '\\':
- escapeCount += 1
- i -= 1
- if escapeCount % 2 == 0:
- result += '"'
+ break
+ i -= 1
- # Count open/close brackets and braces
- openBraces = result.count('{')
- closeBraces = result.count('}')
- openBrackets = result.count('[')
- closeBrackets = result.count(']')
+ # Step 2: Track structure opening order with stack
+ stack = []
+ inString = False
+ escapeNext = False
- # Close incomplete structures
- for _ in range(openBraces - closeBraces):
- result += '}'
- for _ in range(openBrackets - closeBrackets):
- result += ']'
+ for char in result:
+ if escapeNext:
+ escapeNext = False
+ continue
+
+ if char == '\\':
+ escapeNext = True
+ continue
+
+ if char == '"':
+ inString = not inString
+ continue
+
+ # Only track braces/brackets outside of strings
+ if not inString:
+ if char == '{':
+ stack.append('}')
+ elif char == '[':
+ stack.append(']')
+ elif char == '}' or char == ']':
+ # Pop matching closing bracket/brace from stack
+ if stack and stack[-1] == char:
+ stack.pop()
+
+ # Step 3: Close remaining structures in reverse order (LIFO)
+ # Remove trailing comma ONLY directly before each closing bracket/brace
+ while stack:
+ closingChar = stack.pop()
+ result = result.rstrip()
+ # Remove trailing comma if present (invalid before closing)
+ if result and result[-1] == ',':
+ result = result[:-1].rstrip()
+ result += closingChar
return result
@@ -731,7 +702,149 @@ def extractSectionsFromDocument(documentData: Dict[str, Any]) -> List[Dict[str,
return []
-def buildContinuationContext(allSections: List[Dict[str, Any]], lastRawResponse: Optional[str] = None) -> Dict[str, Any]:
+def _extractOverlapFromElement(elem: Dict[str, Any], elemType: str) -> Optional[Dict[str, Any]]:
+ """
+ GENERIC function to extract overlap portion from an element.
+
+ Handles elements of any size, including very long strings:
+ - Paragraphs: Extract last N characters/words
+ - Code blocks: Extract last N lines
+ - Tables: Extract last N rows
+ - Lists: Extract last N items
+ - Other elements: Extract representative portion
+
+ Args:
+ elem: Element dictionary
+ elemType: Element type (table, paragraph, code_block, etc.)
+
+ Returns:
+ Overlap element dictionary with size-limited content, or None
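+
+    Example (illustrative):
+        elem = {"type": "table", "content": {"headers": ["Year", "Value"],
+                "rows": [[str(y), "0"] for y in range(1947, 1960)]}}
+        _extractOverlapFromElement(elem, "table")
+        # -> same headers, but only the last 5 rows are kept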
+ """
+ if not isinstance(elem, dict):
+ return None
+
+ # Get content (handle both flat and nested structures)
+ content = elem.get("content", {}) if isinstance(elem.get("content"), dict) else {}
+
+ if elemType == "table":
+ rows = elem.get("rows", []) or content.get("rows", [])
+ headers = elem.get("headers", []) or content.get("headers", [])
+
+ if rows:
+            # Extract up to the last 5 rows as overlap (enough context, not too large)
+ overlapRowCount = min(5, len(rows))
+ overlapRows = rows[-overlapRowCount:]
+
+ overlapElem = {
+ "type": "table",
+ "content": {
+ "headers": headers,
+ "rows": overlapRows
+ }
+ }
+ return overlapElem
+
+ elif elemType in ["bullet_list", "numbered_list"]:
+ items = elem.get("items", []) or content.get("items", [])
+
+ if items:
+            # Extract up to the last 10 items as overlap
+ overlapItemCount = min(10, len(items))
+ overlapItems = items[-overlapItemCount:]
+
+ overlapElem = {
+ "type": elemType,
+ "content": {
+ "items": overlapItems
+ }
+ }
+ return overlapElem
+
+ elif elemType == "paragraph":
+ text = elem.get("text", "") or content.get("text", "")
+
+ if text:
+            # Extract the trailing portion of the text:
+            # long text (>500 chars) keeps the last 500, medium (>100) the last 100,
+            # and short text is kept in full
+ maxOverlapChars = 500
+ minOverlapChars = 100
+
+ if len(text) > maxOverlapChars:
+ # Very long text - extract last portion
+ # Try to break at word boundary for readability
+ textSnippet = text[-maxOverlapChars:]
+ # Find first space/newline to start from word boundary
+ firstSpace = textSnippet.find(' ')
+ if firstSpace > 0 and firstSpace < 50:
+ textSnippet = textSnippet[firstSpace + 1:]
+ overlapText = textSnippet
+ elif len(text) > minOverlapChars:
+ # Medium text - use last portion
+ overlapText = text[-minOverlapChars:]
+ else:
+ # Short text - use all
+ overlapText = text
+
+ overlapElem = {
+ "type": "paragraph",
+ "content": {
+ "text": overlapText
+ }
+ }
+ return overlapElem
+
+ elif elemType == "code_block":
+ code = elem.get("code", "") or content.get("code", "")
+
+ if code:
+ # Extract last N lines of code
+ codeLines = code.split('\n')
+            # Use up to the last 20 lines as overlap (enough context for continuation)
+ overlapLineCount = min(20, len(codeLines))
+ overlapLines = codeLines[-overlapLineCount:]
+ overlapCode = '\n'.join(overlapLines)
+
+ overlapElem = {
+ "type": "code_block",
+ "content": {
+ "code": overlapCode
+ }
+ }
+ return overlapElem
+
+ elif elemType == "heading":
+ # Headings are usually short - return as-is
+ return elem
+
+ elif elemType == "image":
+ # Images are usually small - return as-is
+ return elem
+
+ else:
+ # Generic element - try to extract a representative portion
+ # Convert to JSON and limit size
+ elemJson = json.dumps(elem, ensure_ascii=False)
+
+ # If element is very large, try to extract key fields only
+ if len(elemJson) > 1000:
+ # Extract only essential fields
+ overlapElem = {
+ "type": elemType,
+ "id": elem.get("id"),
+ "content": "..." # Indicate truncated content
+ }
+ return overlapElem
+
+ # Small element - return as-is
+ return elem
+
+
+def buildContinuationContext(
+ allSections: List[Dict[str, Any]],
+ lastRawResponse: Optional[str] = None,
+ useCaseId: Optional[str] = None
+) -> Dict[str, Any]:
"""
Build context information from accumulated sections for continuation prompt.
@@ -740,9 +853,11 @@ def buildContinuationContext(allSections: List[Dict[str, Any]], lastRawResponse:
Args:
allSections: List of ALL sections accumulated across ALL iterations
lastRawResponse: Raw JSON response from last iteration (can be broken/incomplete)
+ useCaseId: Optional use case ID to determine expected JSON structure
Returns:
- Dict with delivered_summary, cut_off_element, element_before_cutoff
+ Dict with delivered_summary, cut_off_element, element_before_cutoff, template_structure,
+ last_complete_part, incomplete_part, structure_context
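+
+    Example (illustrative):
+        ctx = buildContinuationContext(allSections, lastRawResponse=rawJson,
+                                       useCaseId="section_content")
+        # ctx["last_raw_json"] holds the raw response; ctx["overlap_string"]
+        # holds the size-limited tail elements used as overlap in the next prompt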
"""
context = {
"section_count": len(allSections),
@@ -917,15 +1032,1145 @@ def buildContinuationContext(allSections: List[Dict[str, Any]], lastRawResponse:
context["element_before_cutoff"] = element_before_cutoff
context["cut_off_element"] = cut_off_element
+ # Extract overlap information for continuation prompt
+ # GENERIC overlap extraction: handles elements of any size, including long strings
+ # Strategy: Extract last N elements, but if an element is very large, extract only a portion
+ overlapElements = []
+ overlapString = ""
+
+ if allSections:
+ # Get last section
+ lastSection = allSections[-1]
+ elements = lastSection.get("elements", [])
+
+ if isinstance(elements, list) and len(elements) > 0:
+            # Extract up to the last 3 complete elements as overlap context
+ # This helps the AI understand what was already delivered
+ overlapCount = min(3, len(elements))
+ overlapElements = elements[-overlapCount:]
+
+ # Build overlap string showing these elements (with size limits for large elements)
+ overlapStrings = []
+ for elem in overlapElements:
+ if isinstance(elem, dict):
+ elemType = elem.get("type", "unknown")
+ overlapElem = _extractOverlapFromElement(elem, elemType)
+ if overlapElem:
+ overlapStrings.append(json.dumps(overlapElem, ensure_ascii=False))
+ else:
+ # Non-dict element - show as-is (but limit size)
+ elemStr = json.dumps(elem, ensure_ascii=False)
+ if len(elemStr) > 500:
+ elemStr = elemStr[:500] + "..."
+ overlapStrings.append(elemStr)
+
+ if overlapStrings:
+ overlapString = ",\n".join(overlapStrings)
+
+ context["overlap_elements"] = overlapElements
+ context["overlap_string"] = overlapString
+
# Store raw JSON response for prompt builder to check
if lastRawResponse:
context["last_raw_json"] = lastRawResponse
+
+ # Extract JSON structure context for continuation prompt
+ # This provides: template structure, last complete part, incomplete part, structure context
+ try:
+ structureContext = extractJsonStructureContext(lastRawResponse, useCaseId)
+ context["template_structure"] = structureContext.get("template_structure", "")
+ context["last_complete_part"] = structureContext.get("last_complete_part", "")
+ context["incomplete_part"] = structureContext.get("incomplete_part", "")
+ context["structure_context"] = structureContext.get("structure_context", "")
+ # Log if extraction succeeded but returned empty values
+ if not context["template_structure"] and not context["structure_context"]:
+ logger.debug(f"JSON structure context extraction returned empty values for useCaseId={useCaseId}")
+ except Exception as e:
+ logger.warning(f"Error extracting JSON structure context: {e}", exc_info=True)
+ context["template_structure"] = ""
+ context["last_complete_part"] = ""
+ context["incomplete_part"] = ""
+ context["structure_context"] = ""
else:
context["last_raw_json"] = ""
+ context["template_structure"] = ""
+ context["last_complete_part"] = ""
+ context["incomplete_part"] = ""
+ context["structure_context"] = ""
return context
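+
+
+# A minimal usage sketch (hypothetical data) of the fields assembled above:
+#
+#   sections = [{"elements": [{"type": "paragraph", "content": {"text": "..."}}]}]
+#   ctx = buildContinuationContext(sections, lastRawResponse='{"elements": [{"ty')
+#   ctx["overlap_string"]   # JSON tail of the last delivered elements
+#   ctx["incomplete_part"]  # cut-off fragment, extracted by the helpers below
+
+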
+def extractJsonStructureContext(
+ incompleteJson: str,
+ useCaseId: Optional[str] = None
+) -> Dict[str, Any]:
+ """
+ Extract JSON structure context from incomplete JSON for continuation prompts.
+
+ Extracts:
+ 1. Template JSON structure of the complete object (structure only, no content)
+ 2. Last complete part (last complete element/object)
+ 3. Incomplete part (the cut-off portion)
+ 4. Structure context (parent structure metadata only, no content)
+
+ Args:
+ incompleteJson: Incomplete JSON string (may be cut off mid-element)
+ useCaseId: Optional use case ID to determine expected structure
+
+ Returns:
+ Dict with:
+ - template_structure: Template JSON structure (structure only)
+ - last_complete_part: Last complete element/object as JSON string
+ - incomplete_part: Incomplete/cut-off portion as JSON string
+ - structure_context: Parent structure metadata (keys only, no content)
+ """
+ from modules.shared.jsonUtils import stripCodeFences, normalizeJsonText
+
+ result = {
+ "template_structure": "",
+ "last_complete_part": "",
+ "incomplete_part": "",
+ "structure_context": ""
+ }
+
+ if not incompleteJson or not incompleteJson.strip():
+ return result
+
+ # Normalize JSON string
+ normalized = stripCodeFences(normalizeJsonText(incompleteJson)).strip()
+ if not normalized:
+ return result
+
+ # Find first '{' or '[' to start
+ startIdx = -1
+ for i, char in enumerate(normalized):
+ if char in '{[':
+ startIdx = i
+ break
+
+ if startIdx == -1:
+ return result
+
+ jsonContent = normalized[startIdx:]
+
+ # Step 1: Extract template structure (structure only, no content)
+ templateStructure = _extractTemplateStructure(jsonContent, useCaseId)
+ result["template_structure"] = templateStructure
+
+ # Step 2: Find last complete part and incomplete part
+ lastComplete, incompletePart = _extractLastCompleteAndIncomplete(jsonContent)
+ result["last_complete_part"] = lastComplete
+ result["incomplete_part"] = incompletePart
+
+ # Step 3: Extract structure context (parent structure metadata only)
+ # Pass both incomplete part and last complete part to show positions
+ structureContext = _extractStructureContext(jsonContent, incompletePart, lastComplete)
+ result["structure_context"] = structureContext
+
+ return result
+
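+# Example (a hedged sketch - exact strings depend on closeJsonStructures()):
+# for a response cut off inside an "elements" array, e.g.
+#   extractJsonStructureContext('{"elements": [{"type": "paragraph"', "section_content")
+# the returned dict's incomplete_part carries the cut-off fragment, and its
+# template_structure echoes the {"elements": [...]} skeleton, contents blanked.
+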
+
+def _extractTemplateStructure(jsonContent: str, useCaseId: Optional[str] = None) -> str:
+ """
+ Extract template JSON structure (structure only, no content).
+
+ Examples:
+ - {"documents": [{"chapters": [{"sections": [...]}]}]}
+ - {"elements": [{"type": "...", "content": {...}}]}
+ """
+ import json
+ import re
+
+ # Try to parse JSON to understand structure
+ try:
+ # Try to close and parse
+ closed = closeJsonStructures(jsonContent)
+ parsed = json.loads(closed)
+
+ # Build template structure (keys only, no content)
+ template = _buildStructureTemplate(parsed)
+ return json.dumps(template, indent=2, ensure_ascii=False)
+ except Exception:
+ # If parsing fails, try to extract structure from string
+ # Look for top-level keys
+ topLevelKeys = []
+
+ # Pattern: "key": { or "key": [
+ keyPattern = r'"([^"]+)"\s*:\s*[{\[]'
+ matches = re.findall(keyPattern, jsonContent)
+ if matches:
+ topLevelKeys = matches[:3] # Take first 3 keys
+
+ # Build template based on use case or detected keys
+ if useCaseId == "chapter_structure":
+ return json.dumps({"documents": [{"chapters": [{"id": "", "title": "", "level": 0}]}]}, indent=2, ensure_ascii=False)
+ elif useCaseId == "section_content":
+ return json.dumps({"elements": [{"type": "", "content": {}}]}, indent=2, ensure_ascii=False)
+ elif useCaseId == "code_structure":
+ return json.dumps({"files": [{"id": "", "filename": "", "fileType": ""}]}, indent=2, ensure_ascii=False)
+ elif topLevelKeys:
+ # Build generic template
+ template = {}
+ for key in topLevelKeys:
+ template[key] = []
+ return json.dumps(template, indent=2, ensure_ascii=False)
+ else:
+ return json.dumps({}, indent=2, ensure_ascii=False)
+
+
+def _buildStructureTemplate(obj: Any, maxDepth: int = 3) -> Any:
+ """
+ Build structure template from parsed JSON (keys only, no content).
+ """
+ if isinstance(obj, dict):
+ template = {}
+ for key, value in obj.items():
+ if isinstance(value, (dict, list)):
+ template[key] = _buildStructureTemplate(value, maxDepth - 1) if maxDepth > 0 else None
+ else:
+ # Keep key but use empty value of same type
+ if isinstance(value, str):
+ template[key] = ""
+                elif isinstance(value, bool):
+                    # Checked before int/float: bool is a subclass of int
+                    template[key] = False
+                elif isinstance(value, (int, float)):
+                    template[key] = 0
+ else:
+ template[key] = None
+ return template
+ elif isinstance(obj, list) and obj:
+ # Use first element as template
+ return [_buildStructureTemplate(obj[0], maxDepth - 1) if maxDepth > 0 else None]
+ else:
+ return None
+
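+# Illustrative input/output (hypothetical data):
+#   _buildStructureTemplate({"chapters": [{"id": "c1", "title": "Intro"}]})
+#   -> {"chapters": [{"id": "", "title": ""}]}
+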
+
+def _extractLastCompleteAndIncomplete(jsonContent: str) -> Tuple[str, str]:
+ """
+ Extract last complete part and incomplete part from JSON.
+
+ Returns:
+ Tuple of (last_complete_part, incomplete_part) as JSON strings
+ """
+
+ # Try to find the last complete element/object
+ # Strategy: Parse backwards, find where structures are balanced
+
+ # Count braces and brackets to find where JSON becomes incomplete
+ braceCount = 0
+ bracketCount = 0
+ lastCompleteEnd = -1
+
+ inString = False
+ escapeNext = False
+
+ for i, char in enumerate(jsonContent):
+ if escapeNext:
+ escapeNext = False
+ continue
+
+ if char == '\\':
+ escapeNext = True
+ continue
+
+ if char == '"':
+ inString = not inString
+ continue
+
+ if not inString:
+ if char == '{':
+ braceCount += 1
+ elif char == '}':
+ braceCount -= 1
+ if braceCount == 0 and bracketCount == 0:
+ # Found end of complete structure
+ lastCompleteEnd = i + 1
+ elif char == '[':
+ bracketCount += 1
+ elif char == ']':
+ bracketCount -= 1
+ if braceCount == 0 and bracketCount == 0:
+ # Found end of complete structure
+ lastCompleteEnd = i + 1
+
+ # Extract parts
+ if lastCompleteEnd > 0:
+ lastCompletePart = jsonContent[:lastCompleteEnd]
+ incompletePart = jsonContent[lastCompleteEnd:].strip()
+
+ # Try to find last complete element within the structure
+ # Look for last complete object/array element
+ lastCompleteElement = _findLastCompleteElement(lastCompletePart)
+ if lastCompleteElement:
+ # Build context for incomplete part - show structure around the break
+ incompleteWithContext = _buildIncompleteContext(jsonContent, lastCompleteEnd)
+ return lastCompleteElement, incompleteWithContext
+ else:
+ # Build context for incomplete part
+ incompleteWithContext = _buildIncompleteContext(jsonContent, lastCompleteEnd)
+ return lastCompletePart, incompleteWithContext
+ else:
+ # No complete structure found - everything is incomplete
+ # Still try to show context
+ incompleteWithContext = _buildIncompleteContext(jsonContent, 0)
+ return "", incompleteWithContext
+
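+# Sketch of the split performed above (hypothetical inputs):
+#   '[{"id": 1}, {"id": 2}, {"id'  -> never returns to depth zero, so no
+#                                     complete part is found; everything is
+#                                     rendered as incomplete context
+#   '{"a": 1}{"b"'                 -> '{"a": 1}' is the complete part and the
+#                                     remainder feeds _buildIncompleteContext()
+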
+
+def _findLastCompleteElement(jsonStr: str) -> str:
+ """
+ Find the last complete element in JSON string.
+ """
+ import json
+
+ # Try to parse and extract last element
+ try:
+ closed = closeJsonStructures(jsonStr)
+ parsed = json.loads(closed)
+
+ # If it's a dict with arrays, get last element from first array
+ if isinstance(parsed, dict):
+ for key, value in parsed.items():
+ if isinstance(value, list) and value:
+ lastElem = value[-1]
+ return json.dumps(lastElem, indent=2, ensure_ascii=False)
+
+ # If it's a list, get last element
+ if isinstance(parsed, list) and parsed:
+ lastElem = parsed[-1]
+ return json.dumps(lastElem, indent=2, ensure_ascii=False)
+ except Exception:
+ pass
+
+ # Fallback: try to find last complete object using brace matching
+ braceCount = 0
+ startPos = -1
+ lastCompleteEnd = -1
+
+ for i, char in enumerate(jsonStr):
+ if char == '{':
+ if braceCount == 0:
+ startPos = i
+ braceCount += 1
+ elif char == '}':
+ braceCount -= 1
+ if braceCount == 0 and startPos >= 0:
+ lastCompleteEnd = i + 1
+
+ if lastCompleteEnd > 0:
+ return jsonStr[startPos:lastCompleteEnd]
+
+ return ""
+
+
+def _buildIncompleteContext(jsonContent: str, breakPosition: int) -> str:
+ """
+ Build intelligent context showing the incomplete element with its parent structure hierarchy.
+
+    Logic:
+    1. Cut piece level: the incomplete list element at the cut point
+    2. Parent of the cut element: the list/array containing the cut piece (with the cut point shown)
+    3. The last complete object on the same level as the cut object (if one exists), plus further
+       previous content from the JSON string (at most 1000 characters)
+    4. The remaining parent levels up to the root: up to another 1000 characters of content
+       (complete objects only - elided if too large), then metadata only up to the root
+
+ Example output structure:
+ {
+ "elements": [
+ {
+ "content": {
+ "rows": [
+ [37847, 37853, 37861, 37871, 37879, 37889, 37897, 37907, 37951, 37957],
+ [37957, 37963, 37967, 37987, 37991, <-- CUT POINT (incomplete)
+ """
+
+ if breakPosition <= 0 or breakPosition >= len(jsonContent):
+ # Invalid break position - show last 500 chars
+ return jsonContent[-500:] if len(jsonContent) > 500 else jsonContent
+
+ contextParts = []
+
+ # Find structure hierarchy backwards from break point
+ hierarchy = _findStructureHierarchy(jsonContent, breakPosition)
+
+ if not hierarchy:
+ # Fallback: show simple context
+ contextParts.append("Cut point context:\n")
+ contextStart = max(0, breakPosition - 500)
+ contextParts.append(jsonContent[contextStart:breakPosition + 100])
+ return "\n".join(contextParts)
+
+ # Step 1: Extract cut piece (incomplete element at cut point)
+ cutPiece = _extractCutPiece(jsonContent, breakPosition)
+
+ # Step 2: Find the cut level (the array/object containing the cut piece)
+ cutLevel = hierarchy[-1] if hierarchy else None
+
+ if not cutLevel:
+ # Fallback
+ contextParts.append("Cut point context:\n")
+ contextStart = max(0, breakPosition - 500)
+ contextParts.append(jsonContent[contextStart:breakPosition + 100])
+ return "\n".join(contextParts)
+
+ # Build context following the exact structure requested
+ # Show hierarchical structure from root to cut point
+
+ # Extract the actual JSON structure from root to cut point
+ # Build the full hierarchical structure showing:
+ # 4. Parent levels until root (with content/metadata limits)
+ # 3. Last complete elements on same level + previous content (max 1000 chars)
+ # 2. Parent container (the list) with cut piece
+ # 1. Cut piece
+
+ resultLines = []
+
+ # Build structure from root to cut level
+ # Extract actual JSON content for each level
+ for i, level in enumerate(hierarchy):
+ levelType = level['type']
+ start = level['start_pos']
+ end = level['end_pos'] if i < len(hierarchy) - 1 else breakPosition
+ key = level.get('key')
+ depth = level['depth']
+
+ indent = " " * depth
+
+ if i < len(hierarchy) - 1:
+ # Parent levels - show opening structure
+ levelContent = jsonContent[start:end]
+
+ # If content is too large, show only metadata
+ if len(levelContent) > 1000:
+                    # Show only the key and the matching opening bracket
+                    opener = '[' if levelType == 'array' else '{'
+                    if key:
+                        resultLines.append(f'{indent}"{key}": {opener}')
+                    else:
+                        resultLines.append(f'{indent}{opener}')
+                    resultLines.append(f'{indent}  ...')
+ else:
+ # Show opening structure
+ if key:
+ # Find where the key's value starts
+ keyEnd = jsonContent.find(':', start)
+ if keyEnd > 0:
+ opening = jsonContent[start:min(keyEnd + 50, end)]
+ resultLines.append(f'{indent}{opening}')
+ else:
+ opening = jsonContent[start:min(start + 50, end)]
+ resultLines.append(f'{indent}{opening}')
+ else:
+ # Cut level - show detailed context
+ cutLevelType = levelType
+ cutLevelStart = start
+ cutLevelKey = key
+
+            if cutLevelType == 'array':
+                # Show the array opening with its key, if one was found
+                if cutLevelKey:
+                    resultLines.append(f'{indent}"{cutLevelKey}": [')
+                else:
+                    resultLines.append(f'{indent}[')
+                indent += "  "
+
+ # 3. Show last complete elements on same level + previous content (max 1000 chars)
+ contentBeforeBreak = jsonContent[cutLevelStart:breakPosition]
+ lastCompleteElements = _extractLastCompleteArrayElementsWithContext(
+ contentBeforeBreak, jsonContent, cutLevelStart, maxChars=1000
+ )
+ if lastCompleteElements:
+ resultLines.append(lastCompleteElements)
+
+ # 2. Show parent container (the list) with cut piece
+ cutArrayElement = _findCutArrayElement(jsonContent, breakPosition, cutLevelStart)
+ if cutArrayElement:
+ resultLines.append(f'{indent}{cutArrayElement} <-- CUT POINT (incomplete)')
+ else:
+ # Fallback: show what we have at break point
+ cutPart = jsonContent[breakPosition:breakPosition + 200].strip()
+ resultLines.append(f'{indent}{cutPart} <-- CUT POINT (incomplete)')
+
+ # Close the array
+ indent = indent[:-2] if len(indent) >= 2 else indent
+ resultLines.append(f'{indent}]')
+            else:
+                # Object at cut level - show its key and the tail before the cut
+                if cutLevelKey:
+                    resultLines.append(f'{indent}"{cutLevelKey}": {{')
+                    indent += "  "
+                cutPart = jsonContent[breakPosition:breakPosition + 200].strip()
+                preview = jsonContent[cutLevelStart:breakPosition]
+                preview = preview[-500:] if len(preview) > 500 else preview
+                resultLines.append(f'{indent}{preview}... {cutPart} <-- CUT POINT (incomplete)')
+
+    # Close all parent structures with the matching bracket type
+    for i in range(len(hierarchy) - 2, -1, -1):
+        level = hierarchy[i]
+        depth = level['depth']
+        indent = "  " * depth
+        closer = ']' if level['type'] == 'array' else '}'
+        resultLines.append(f'{indent}{closer}')
+
+ contextParts.append("\n".join(resultLines))
+
+ return "\n".join(contextParts)
+
+
+def _extractCutPiece(jsonContent: str, breakPosition: int) -> str:
+ """Extract the incomplete piece at the cut point."""
+ # Get characters after break point (incomplete part)
+ afterBreak = jsonContent[breakPosition:breakPosition + 200].strip()
+ # Find where the incomplete piece ends (next comma, bracket, brace, or end)
+ for i, char in enumerate(afterBreak):
+ if char in [',', ']', '}', '\n']:
+ return afterBreak[:i].strip()
+ return afterBreak[:50].strip() # Limit to 50 chars if no delimiter found
+
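+# Example (hypothetical): if the text after the break is '37991, 37993], [38'
+# the first delimiter is the ',' at offset 5, so '37991' is returned.
+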
+
+def _findStructureHierarchy(jsonContent: str, breakPosition: int) -> List[Dict[str, Any]]:
+ """
+ Find the structure hierarchy backwards from break point to root.
+
+ Returns list of level info dicts, from root to cut level.
+    Each level has: type, start_pos, end_pos, depth, key
+ """
+ hierarchy = []
+
+ # Track depth and positions
+ braceDepth = 0
+ bracketDepth = 0
+ inString = False
+ escapeNext = False
+
+ # Find all structure boundaries before break point
+ structureStack = [] # Stack of (type, start_pos, depth)
+
+ for i in range(breakPosition):
+ if i >= len(jsonContent):
+ break
+
+ char = jsonContent[i]
+
+ if escapeNext:
+ escapeNext = False
+ continue
+
+ if char == '\\':
+ escapeNext = True
+ continue
+
+ if char == '"':
+ inString = not inString
+ continue
+
+ if not inString:
+ if char == '{':
+ structureStack.append(('object', i, braceDepth + bracketDepth))
+ braceDepth += 1
+ elif char == '}':
+ if structureStack and structureStack[-1][0] == 'object':
+ _, start, depth = structureStack.pop()
+ hierarchy.append({
+ 'type': 'object',
+ 'start_pos': start,
+ 'end_pos': i + 1,
+ 'depth': depth,
+ 'key': _findKeyBefore(jsonContent, start)
+ })
+ braceDepth -= 1
+ elif char == '[':
+ structureStack.append(('array', i, braceDepth + bracketDepth))
+ bracketDepth += 1
+ elif char == ']':
+ if structureStack and structureStack[-1][0] == 'array':
+ _, start, depth = structureStack.pop()
+ hierarchy.append({
+ 'type': 'array',
+ 'start_pos': start,
+ 'end_pos': i + 1,
+ 'depth': depth,
+ 'key': _findKeyBefore(jsonContent, start)
+ })
+ bracketDepth -= 1
+
+    # Structures still open at the break point never received a closing
+    # character, so append them from the stack; these open structures are
+    # exactly the levels that contain the break position.
+    for structType, start, depth in structureStack:
+        hierarchy.append({
+            'type': structType,
+            'start_pos': start,
+            'end_pos': breakPosition,
+            'depth': depth,
+            'key': _findKeyBefore(jsonContent, start)
+        })
+
+    # Sort by depth (root first)
+    hierarchy.sort(key=lambda x: x['depth'])
+
+    # Keep only the levels that contain the break point - this is the ancestor
+    # path from the root down to the cut level (closed siblings fall away)
+    ancestors = [
+        level for level in hierarchy
+        if level['start_pos'] < breakPosition <= level['end_pos']
+    ]
+    return ancestors
+
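+# Sketch (hypothetical, relying on the open-structure handling above): for
+# '{"elements": [{"a": 1}, {"b"' broken at the end, the root object, the
+# "elements" array and the cut-off '{"b"' object are still open, so the
+# returned hierarchy runs root object -> "elements" array -> incomplete object.
+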
+
+def _findKeyBefore(jsonContent: str, pos: int) -> Optional[str]:
+ """Find the key name before a structure start position."""
+    # `pos` points at the opening '{' or '[' itself, so the slice below ends
+    # just before that bracket - match the '"key":' that immediately precedes it
+    before = jsonContent[max(0, pos - 100):pos]
+    match = re.search(r'"([^"]+)"\s*:\s*$', before)
+ if match:
+ return match.group(1)
+ return None
+
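+# Example (hypothetical snippet): with jsonContent = '{"rows": [[1, 2]' and pos
+# pointing at the first '[', the slice before the bracket ends in '"rows": ',
+# so "rows" is returned.
+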
+
+def _formatLevelContext(level: Dict[str, Any], jsonContent: str, maxContentChars: int = 1000) -> str:
+ """Format a level in the hierarchy for display."""
+ levelType = level['type']
+ start = level['start_pos']
+ end = level['end_pos']
+ key = level.get('key')
+
+ # Get content for this level
+ levelContent = jsonContent[start:end]
+
+ # If content is too large, show only metadata
+ if len(levelContent) > maxContentChars:
+ # Show opening and key if available
+ if key:
+ return f' "{key}": {levelType} (content too large, {len(levelContent)} chars)'
+ else:
+ return f' {levelType} (content too large, {len(levelContent)} chars)'
+ else:
+ # Show full content (formatted)
+ indent = " " * level['depth']
+ if key:
+ return f'{indent}"{key}": {levelContent[:maxContentChars]}'
+ else:
+ return f'{indent}{levelContent[:maxContentChars]}'
+
+
+def _formatCutLevelContextDetailed(level: Dict[str, Any], cutPiece: str, jsonContent: str, breakPosition: int) -> str:
+ """
+    Format the cut level as a detailed hierarchy:
+    1. Cut piece level: the incomplete list element
+    2. Parent of the cut element: the list containing the cut piece (with the cut point shown)
+    3. The last complete object on the same level as the cut object (if one exists), plus further
+       previous content from the JSON string (at most 1000 characters)
+ """
+ levelType = level['type']
+ start = level['start_pos']
+ key = level.get('key')
+
+ # Get content before break point in this level
+ contentBeforeBreak = jsonContent[start:breakPosition]
+
+ result = []
+
+ if levelType == 'array':
+ # Step 3: Show last complete elements on same level + previous content (max 1000 chars)
+ # Extract last complete array elements with context (up to 1000 chars)
+ lastCompleteElements = _extractLastCompleteArrayElementsWithContext(
+ contentBeforeBreak, jsonContent, start, maxChars=1000
+ )
+ if lastCompleteElements:
+ result.append("3. Last complete elements on same level (plus previous content, max 1000 chars):")
+ result.append(lastCompleteElements)
+ result.append("")
+
+ # Step 2: Show parent container (the list) with cut piece
+ # Find the array element that contains the cut piece
+ cutArrayElement = _findCutArrayElement(jsonContent, breakPosition, start)
+ if cutArrayElement:
+ result.append("2. Parent container (list containing cut piece):")
+ result.append(f" {cutArrayElement}")
+ else:
+ # Fallback: show cut piece directly
+ cutPart = jsonContent[breakPosition:breakPosition + 200].strip()
+ result.append("2. Parent container (list containing cut piece):")
+ result.append(f" {cutPart}")
+ result.append("")
+
+ # Step 1: Show cut piece (incomplete element at cut point)
+ result.append("1. Cut piece level (incomplete element at cut point):")
+ if cutPiece:
+ result.append(f" {cutPiece}")
+ else:
+ cutPart = jsonContent[breakPosition:breakPosition + 50].strip()
+ result.append(f" {cutPart}")
+ else:
+ # Object - show structure with cut point
+ result.append("Cut point in object:")
+ cutPart = jsonContent[breakPosition:breakPosition + 200].strip()
+ preview = contentBeforeBreak[-500:] if len(contentBeforeBreak) > 500 else contentBeforeBreak
+ result.append(f" {preview}... {cutPart} <-- CUT POINT")
+
+ return "\n".join(result)
+
+
+def _formatParentLevelContext(level: Dict[str, Any], jsonContent: str, maxContentChars: int = 1000) -> str:
+ """
+ Format a parent level showing content (if small enough) or metadata only.
+ Used for levels above the cut level, showing path to root.
+ """
+ levelType = level['type']
+ start = level['start_pos']
+ end = level['end_pos']
+ key = level.get('key')
+
+ # Get content for this level
+ levelContent = jsonContent[start:end]
+
+ # If content is too large, show only metadata
+ if len(levelContent) > maxContentChars:
+ # Show opening structure with key if available
+ opening = jsonContent[start:start + 200].strip()
+ if key:
+ return f' "{key}": {levelType} (content too large, {len(levelContent)} chars)\n {opening}...'
+ else:
+ return f' {levelType} (content too large, {len(levelContent)} chars)\n {opening}...'
+ else:
+ # Show full content (formatted, but limit to maxContentChars)
+ content = levelContent[:maxContentChars]
+ if key:
+ return f' "{key}": {content}'
+ else:
+ return f' {content}'
+
+
+def _extractLastCompleteArrayElementsWithContext(
+ arrayContent: str, fullJsonContent: str, arrayStart: int, maxChars: int = 1000
+) -> str:
+ """
+ Extract last complete array elements PLUS further previous content from json string (max 1000 chars).
+
+ This shows:
+ - Last complete elements on the same level as the cut element
+ - Additional previous content from the JSON string (up to maxChars total)
+ """
+ # First, extract last complete elements from arrayContent
+ completeElements = []
+ currentElement = ""
+ braceDepth = 0
+ bracketDepth = 0
+ inString = False
+ escapeNext = False
+ totalChars = 0
+
+    # Parse backwards to find complete elements.
+    # Note: scanning in reverse sees a backslash only after the quote it
+    # escapes, so string-escape handling is approximate - acceptable here
+    # because the output is only used as prompt context.
+ for i in range(len(arrayContent) - 1, -1, -1):
+ char = arrayContent[i]
+
+ if escapeNext:
+ escapeNext = False
+ currentElement = char + currentElement
+ continue
+
+ if char == '\\':
+ escapeNext = True
+ currentElement = char + currentElement
+ continue
+
+ if char == '"':
+ inString = not inString
+ currentElement = char + currentElement
+ continue
+
+ if not inString:
+ if char == '}':
+ braceDepth += 1
+ currentElement = char + currentElement
+ elif char == '{':
+ braceDepth -= 1
+ currentElement = char + currentElement
+ if braceDepth == 0 and bracketDepth == 0:
+ # Found complete element
+ element = currentElement.strip()
+ if element and element[0] in ['{', '[']:
+ completeElements.insert(0, element)
+ totalChars += len(element)
+ if totalChars >= maxChars:
+ break
+ currentElement = ""
+ elif char == ']':
+ bracketDepth += 1
+ currentElement = char + currentElement
+ elif char == '[':
+ bracketDepth -= 1
+ currentElement = char + currentElement
+ if braceDepth == 0 and bracketDepth == 0:
+ # Found complete element
+ element = currentElement.strip()
+ if element and element[0] == '[':
+ completeElements.insert(0, element)
+ totalChars += len(element)
+ if totalChars >= maxChars:
+ break
+ currentElement = ""
+ elif char == ',' and braceDepth == 0 and bracketDepth == 0:
+ # Element boundary
+ if currentElement.strip():
+ element = currentElement.strip()
+ if element and element[0] in ['{', '[', '"']:
+ completeElements.insert(0, element)
+ totalChars += len(element)
+ if totalChars >= maxChars:
+ break
+ currentElement = ""
+ else:
+ currentElement = char + currentElement
+
+ # Format the elements
+ if completeElements:
+ # Show last few complete elements (up to maxChars)
+ formattedElements = []
+ charsUsed = 0
+ for elem in reversed(completeElements): # Show from newest to oldest
+ if charsUsed + len(elem) <= maxChars:
+ formattedElements.insert(0, elem)
+ charsUsed += len(elem)
+ else:
+ break
+
+ if formattedElements:
+ # Format as JSON array rows
+ result = []
+ for elem in formattedElements:
+ result.append(f" {elem},")
+ return "\n".join(result)
+
+ return ""
+
+
+def _findCutArrayElement(jsonContent: str, breakPosition: int, arrayStart: int) -> Optional[str]:
+ """Find the array element that contains the cut piece."""
+ # Look backwards from break position to find the start of the current array element
+ braceDepth = 0
+ bracketDepth = 0
+ inString = False
+ escapeNext = False
+ elementStart = -1
+
+ # Search backwards from break position
+ for i in range(breakPosition - 1, arrayStart - 1, -1):
+ if i < 0:
+ break
+
+ char = jsonContent[i]
+
+ if escapeNext:
+ escapeNext = False
+ continue
+
+ if char == '\\':
+ escapeNext = True
+ continue
+
+ if char == '"':
+ inString = not inString
+ continue
+
+ if not inString:
+ if char == '}':
+ braceDepth += 1
+ elif char == '{':
+ braceDepth -= 1
+ if braceDepth == 0 and bracketDepth == 0:
+ elementStart = i
+ break
+ elif char == ']':
+ bracketDepth += 1
+ elif char == '[':
+ bracketDepth -= 1
+ if braceDepth == 0 and bracketDepth == 0:
+ elementStart = i
+ break
+ elif char == ',' and braceDepth == 0 and bracketDepth == 0:
+ # Found element boundary
+ elementStart = i + 1
+ break
+
+ if elementStart >= 0:
+ # Extract the element (including incomplete part)
+ elementContent = jsonContent[elementStart:breakPosition + 100].strip()
+ # Clean up - remove leading comma if present
+ if elementContent.startswith(','):
+ elementContent = elementContent[1:].strip()
+ return elementContent[:300] # Limit length
+
+ return None
+
+
+def _extractLastCompleteArrayElements(arrayContent: str, maxChars: int = 1000) -> str:
+ """Extract last complete array elements, up to maxChars."""
+ # Count complete elements from the end
+ elements = []
+ currentElement = ""
+ braceDepth = 0
+ bracketDepth = 0
+ inString = False
+ escapeNext = False
+ totalChars = 0
+
+    # Parse backwards to find complete elements (escape handling is
+    # approximate when scanning in reverse; see note in the helper above)
+ for i in range(len(arrayContent) - 1, -1, -1):
+ char = arrayContent[i]
+
+ if escapeNext:
+ escapeNext = False
+ currentElement = char + currentElement
+ continue
+
+ if char == '\\':
+ escapeNext = True
+ currentElement = char + currentElement
+ continue
+
+ if char == '"':
+ inString = not inString
+ currentElement = char + currentElement
+ continue
+
+ if not inString:
+ if char == '}':
+ braceDepth += 1
+ currentElement = char + currentElement
+ elif char == '{':
+ braceDepth -= 1
+ currentElement = char + currentElement
+ if braceDepth == 0 and bracketDepth == 0:
+ # Found complete element
+ element = currentElement.strip()
+ if element and element[0] in ['{', '[']:
+ elements.insert(0, element)
+ totalChars += len(element)
+ if totalChars >= maxChars:
+ break
+ currentElement = ""
+ elif char == ']':
+ bracketDepth += 1
+ currentElement = char + currentElement
+ elif char == '[':
+ bracketDepth -= 1
+ currentElement = char + currentElement
+ if braceDepth == 0 and bracketDepth == 0:
+ # Found complete element
+ element = currentElement.strip()
+ if element and element[0] == '[':
+ elements.insert(0, element)
+ totalChars += len(element)
+ if totalChars >= maxChars:
+ break
+ currentElement = ""
+ elif char == ',' and braceDepth == 0 and bracketDepth == 0:
+ # Element boundary
+ if currentElement.strip():
+ element = currentElement.strip()
+ if element and element[0] in ['{', '[', '"']:
+ elements.insert(0, element)
+ totalChars += len(element)
+ if totalChars >= maxChars:
+ break
+ currentElement = ""
+ else:
+ currentElement = char + currentElement
+
+ if elements:
+ indent = " "
+ formatted = ",\n".join([f"{indent}{elem}" for elem in elements[-5:]]) # Show last 5 elements
+ if len(elements) > 5:
+ formatted = f"... ({len(elements) - 5} more elements) ...\n{formatted}"
+ return formatted
+
+ return ""
+
+
+def _extractStructureContext(jsonContent: str, incompletePart: str, lastCompletePart: str = "") -> str:
+ """
+ Extract structure context showing WHERE in the structure the last complete and incomplete elements are.
+
+ Returns a clear description of the structure context for the broken element.
+ """
+ import json
+
+ if not incompletePart:
+ # No incomplete part extracted - try to show context from raw JSON
+ try:
+ # Show last part of JSON to indicate where it broke
+ lastPart = jsonContent[-300:] if len(jsonContent) > 300 else jsonContent
+ return f"Structure context unavailable. Last part of response:\n{lastPart}"
+ except Exception:
+ return "Structure context unavailable - response was completely broken"
+
+ # Find where incomplete part starts
+ incompleteStart = jsonContent.find(incompletePart)
+ if incompleteStart == -1:
+ incompleteStart = len(jsonContent)
+
+ # Try to extract the structure context showing the broken element
+ try:
+ # Get the part before incomplete to understand structure
+ beforeIncomplete = jsonContent[:incompleteStart]
+
+ # Try to find the array/object context where the break occurred
+ # Look for the last complete structure before the break
+ structureContext = ""
+
+ # Try to parse what we have before the incomplete part
+ try:
+ closed = closeJsonStructures(beforeIncomplete)
+ parsed = json.loads(closed)
+
+ # Build structure showing where we are
+ if isinstance(parsed, dict) and "elements" in parsed:
+ elements = parsed.get("elements", [])
+ if isinstance(elements, list):
+ structureContext = f"Structure: elements array with {len(elements)} complete elements\n"
+ structureContext += f"Break occurred in element at index {len(elements)}"
+ else:
+ structureContext = "Structure: elements (not an array)"
+ else:
+ structureContext = "Structure: " + json.dumps(_buildStructureContext(parsed), indent=2, ensure_ascii=False)
+ except Exception:
+ # Can't parse - show raw context
+ structureContext = f"Structure parsing failed. Context before break:\n{beforeIncomplete[-200:]}"
+
+ return structureContext
+
+ except Exception:
+ # Fallback: show minimal context
+ return f"Structure context unavailable. Break occurred at position {incompleteStart} in JSON string"
+
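+# Sketch (hypothetical): when the text before the break parses (after closing)
+# to {"elements": [ ...two complete objects... ]}, the returned context reads:
+#   Structure: elements array with 2 complete elements
+#   Break occurred in element at index 2
+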
+
+def _findElementPath(parsed: Any, elementStr: str, originalJson: str, isIncomplete: bool = False) -> str:
+ """
+ Find the path to an element in the parsed JSON structure.
+
+ Returns a path like "elements[2]" or "documents[0].chapters[1].sections[3]"
+ """
+    import re
+
+ if not elementStr or not elementStr.strip():
+ return ""
+
+ # Strategy: Find position in original JSON string, then determine path from structure
+ elementStart = originalJson.find(elementStr.strip())
+ if elementStart == -1:
+ return ""
+
+ # Find the array context by looking backwards from element position
+ beforeElement = originalJson[:elementStart]
+
+ # Find the nearest array declaration before this position
+ # Look for patterns like "elements": [ or "chapters": [
+ arrayPattern = r'"(\w+)"\s*:\s*\['
+ matches = list(re.finditer(arrayPattern, beforeElement))
+ if not matches:
+ return ""
+
+ # Get the most recent array (closest to element)
+ lastMatch = matches[-1]
+ arrayName = lastMatch.group(1)
+ arrayStartPos = lastMatch.end()
+
+ # Count complete array elements before this position
+ arrayContent = beforeElement[arrayStartPos:]
+
+ # Count complete objects (balanced braces) - each complete object is an array element
+ braceCount = 0
+ elementIndex = 0
+ inString = False
+ escapeNext = False
+ lastCompleteObjectEnd = -1
+
+ for i, char in enumerate(arrayContent):
+ if escapeNext:
+ escapeNext = False
+ continue
+ if char == '\\':
+ escapeNext = True
+ continue
+ if char == '"':
+ inString = not inString
+ continue
+ if not inString:
+ if char == '{':
+ if braceCount == 0:
+ # Start of new object
+ elementIndex += 1
+ braceCount += 1
+ elif char == '}':
+ braceCount -= 1
+ if braceCount == 0:
+ # End of complete object
+ lastCompleteObjectEnd = i
+
+ # Determine the index
+ # If we're looking for incomplete element, it's at the current elementIndex
+ # If we're looking for last complete element, it's at elementIndex - 1
+ if isIncomplete:
+ index = elementIndex
+ else:
+ index = elementIndex - 1 if elementIndex > 0 else 0
+
+ # Build the full path by traversing the parsed structure
+ def _buildPathToArray(obj: Any, targetArrayName: str, targetIndex: int, currentPath: str = "") -> Optional[str]:
+ """Recursively find path to array element."""
+ if isinstance(obj, dict):
+ for key, value in obj.items():
+ newPath = f"{currentPath}.{key}" if currentPath else key
+ if key == targetArrayName and isinstance(value, list):
+ # Found the target array
+ if 0 <= targetIndex < len(value):
+ return f"{newPath}[{targetIndex}]"
+ elif targetIndex >= len(value):
+ # Index beyond array - return array path with index
+ return f"{newPath}[{targetIndex}]"
+ result = _buildPathToArray(value, targetArrayName, targetIndex, newPath)
+ if result:
+ return result
+ elif isinstance(obj, list):
+ for i, item in enumerate(obj):
+ result = _buildPathToArray(item, targetArrayName, targetIndex, currentPath)
+ if result:
+ return result
+ return None
+
+ # Try to find full path in parsed structure
+ fullPath = _buildPathToArray(parsed, arrayName, index)
+ if fullPath:
+ return fullPath
+
+ # Fallback: return simple array path
+ return f"{arrayName}[{index}]"
+
+
+def _buildStructureContext(obj: Any, maxDepth: int = 5) -> Any:
+ """
+ Build structure context (metadata only, no content).
+ Similar to _buildStructureTemplate but focuses on parent structure.
+ """
+ if isinstance(obj, dict):
+ structure = {}
+ for key, value in obj.items():
+ if isinstance(value, (dict, list)):
+ structure[key] = _buildStructureContext(value, maxDepth - 1) if maxDepth > 0 else []
+ else:
+ # Skip content values - only keep structure
+ pass
+ return structure
+ elif isinstance(obj, list) and obj:
+ # Return empty list structure (no content)
+ return []
+ else:
+ return None
+
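+# Illustrative input/output (hypothetical data) - content values are dropped,
+# only the container skeleton survives:
+#   _buildStructureContext({"documents": [{"title": "A", "chapters": []}]})
+#   -> {"documents": []}
+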
+
def _findIncompleteSectionInRaw(raw_json: str) -> Optional[Dict[str, Any]]:
"""
Find the incomplete section in raw JSON.