Fix STT params

This commit is contained in:
ValueOn AG 2026-05-12 23:33:43 +02:00
parent 16ab816c65
commit e3284994d0

View file

@ -602,6 +602,13 @@ class TeamsbotService:
self._lastTranscriptText: Optional[str] = None
self._lastTranscriptId: Optional[str] = None
self._lastSttTime: float = 0.0
# Audio chunk aggregation: collect chunks and send to STT only
# after a speech pause or when the buffer reaches a target duration.
self._audioBuffer: bytes = b""
self._audioBufferStartTime: float = 0.0
self._audioBufferLastChunkTime: float = 0.0
self._audioBufferSampleRate: int = 16000
self._lastBotResponseText: Optional[str] = None
self._lastBotResponseTs: float = 0.0
@ -1203,6 +1210,14 @@ class TeamsbotService:
interface.updateSession(sessionId, updates)
await _emitSessionEvent(sessionId, "statusChange", {"status": status, "errorMessage": errorMessage})
# Flush remaining audio buffer before generating summary
if dbStatus in [TeamsbotSessionStatus.ENDED.value, TeamsbotSessionStatus.ERROR.value]:
if self._audioBuffer:
logger.info(f"[AudioChunk] Flushing remaining buffer on session end ({len(self._audioBuffer)} bytes)")
self._audioBuffer = b""
self._audioBufferStartTime = 0.0
self._audioBufferLastChunkTime = 0.0
# Generate summary when session ends
if dbStatus == TeamsbotSessionStatus.ENDED.value:
asyncio.create_task(self._generateMeetingSummary(sessionId))
@ -1217,11 +1232,18 @@ class TeamsbotService:
voiceInterface,
websocket: WebSocket,
):
"""Process an audio chunk from WebRTC capture — run STT and feed into transcript pipeline."""
"""Process an audio chunk from WebRTC capture. The bot-side VAD
(AudioWorklet / ScriptProcessor) already segments speech into 1-8s
voiced chunks. Here we apply a minimum-duration safety net: very short
chunks (<1s) are buffered until they reach 1s; everything else goes
straight to STT. A wall-clock timeout flushes stale buffers."""
import base64
_MIN_CHUNK_SEC = 1.0
_STALE_TIMEOUT_SEC = 3.0
try:
audioBytes = base64.b64decode(audioBase64)
if len(audioBytes) < 1000:
if len(audioBytes) < 500:
return
if captureDiagnostics:
@ -1234,14 +1256,12 @@ class TeamsbotService:
f"rms={rms}, nativeRate={nativeSampleRate}, bytes={len(audioBytes)}"
)
# Use RMS from capture diagnostics to skip real silence.
# Byte-variation heuristics produced false positives and dropped valid speech.
isSilent = False
if captureDiagnostics and captureDiagnostics.get("rms") is not None:
try:
rmsVal = float(captureDiagnostics.get("rms"))
if rmsVal < 0.0003:
logger.debug(f"[AudioChunk] Skipping silent audio ({len(audioBytes)} bytes, rms={rmsVal:.6f})")
return
isSilent = True
except Exception:
pass
@ -1249,23 +1269,47 @@ class TeamsbotService:
logger.warning(f"[AudioChunk] No voice interface available for session {sessionId}")
return
# Treat sampleRate=0 as unknown (triggers auto-detection)
effectiveSampleRate = sampleRate if sampleRate and sampleRate > 0 else None
now = time.time()
effectiveRate = sampleRate if sampleRate and sampleRate > 0 else 16000
if not isSilent:
if not self._audioBuffer:
self._audioBufferStartTime = now
self._audioBuffer += audioBytes
self._audioBufferLastChunkTime = now
self._audioBufferSampleRate = effectiveRate
bufferDuration = len(self._audioBuffer) / (effectiveRate * 2) if self._audioBuffer else 0.0
bufferAge = (now - self._audioBufferStartTime) if self._audioBuffer else 0.0
shouldFlush = (
self._audioBuffer
and (
bufferDuration >= _MIN_CHUNK_SEC
or (bufferAge >= _STALE_TIMEOUT_SEC and bufferDuration > 0.3)
)
)
if not shouldFlush:
return
flushBytes = self._audioBuffer
flushRate = self._audioBufferSampleRate
self._audioBuffer = b""
self._audioBufferStartTime = 0.0
self._audioBufferLastChunkTime = 0.0
flushDuration = len(flushBytes) / (flushRate * 2)
logger.info(f"[AudioChunk] Flushing buffer: {len(flushBytes)} bytes, {flushDuration:.1f}s, {flushRate}Hz")
phraseHints = list(self._knownSpeakers)
if self.config.botName:
phraseHints.append(self.config.botName)
        # The language comes exclusively from the session/instance config
        # (TeamsbotUserSettings.language overrides
        # TeamsbotConfig.language; the schema falls back to de-DE).
        # NO hardcoded pool of alternative languages — that previously
        # caused Google STT to jump to en-US on noisy audio and return
        # English gibberish.
sttResult = await voiceInterface.speechToText(
audioContent=audioBytes,
audioContent=flushBytes,
language=self.config.language or "de-DE",
sampleRate=effectiveSampleRate,
sampleRate=flushRate,
channels=1,
skipFallbacks=True,
phraseHints=phraseHints if phraseHints else None,