32 lines
1.2 KiB
Python
32 lines
1.2 KiB
Python
from typing import Any, Dict, List
|
|
|
|
from modules.datamodels.datamodelExtraction import ContentPart
|
|
from ..subRegistry import Chunker
|
|
|
|
|
|
class TextChunker(Chunker):
    """Chunker that regroups the lines of a text part into byte-bounded chunks."""

    @staticmethod
    def _buildChunk(lines: List[str], countedBytes: int, order: int) -> Dict[str, Any]:
        """Build one chunk dict from the accumulated lines.

        ``countedBytes`` includes one newline byte for every line, but the
        joined text has no newline after its last line, hence the ``- 1``.
        """
        return {"data": "\n".join(lines), "size": countedBytes - 1, "order": order}

    def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
        """Split ``part.data`` on newlines and regroup the lines into chunks.

        Lines are appended to the current chunk until adding the next line
        would push the running byte count (UTF-8 bytes of each line plus one
        newline byte per line) above the configured budget; the current chunk
        is then emitted and a new one is started with that line.

        Lines are never split: a single line that is larger than the budget
        is emitted as its own, oversized chunk.

        Args:
            part: Content part whose ``data`` attribute holds the text to
                split. A ``data`` of ``None`` is treated as empty text.
            options: Chunking options. ``"textChunkSize"`` is the byte budget
                per chunk and defaults to 40000 when the key is absent.

        Returns:
            A list of dicts, one per chunk, with the keys ``"data"`` (the
            chunk's lines joined with newlines), ``"size"`` (UTF-8 byte length
            of ``"data"``) and ``"order"`` (zero-based position in the list).
            Empty text yields a single chunk whose ``"data"`` is ``""``.
        """
        # Imported at function scope so this class needs no extra
        # module-level import.
        import logging

        logger = logging.getLogger(__name__)

        maxBytes = int(options.get("textChunkSize", 40000))
        # %-style arguments defer string formatting until DEBUG is enabled.
        logger.debug(
            "TextChunker: textChunkSize from options: %s",
            options.get("textChunkSize", "NOT_FOUND"),
        )
        logger.debug("TextChunker: using maxBytes: %s", maxBytes)

        # Guard against a part without text instead of failing on None.split.
        text = part.data if part.data is not None else ""

        chunks: List[Dict[str, Any]] = []
        pending: List[str] = []
        pendingBytes = 0

        for line in text.split("\n"):
            lineBytes = len(line.encode("utf-8")) + 1  # +1 for the newline
            # Only flush when something is pending, so an oversized line is
            # still emitted (alone) rather than dropped.
            if pending and pendingBytes + lineBytes > maxBytes:
                chunks.append(TextChunker._buildChunk(pending, pendingBytes, len(chunks)))
                pending = []
                pendingBytes = 0
            pending.append(line)
            pendingBytes += lineBytes

        if pending:
            chunks.append(TextChunker._buildChunk(pending, pendingBytes, len(chunks)))

        return chunks
|
|
|
|
|