Use laptop MP3 TTS playback

2026-04-28 15:54:10 +02:00
parent 96435e53e1
commit aa3397c6f6
9 changed files with 1120 additions and 54 deletions
--- a/.env.example
+++ b/.env.example
@@ -1,7 +1,6 @@
 EXPO_PUBLIC_OLLAMA_BASE_URL=http://localhost:11434
 EXPO_PUBLIC_OLLAMA_MODEL=llama3.2
-EXPO_PUBLIC_SPEECH_LANGUAGE=en-US
+EXPO_PUBLIC_TTS_BASE_URL=http://localhost:3333
-EXPO_PUBLIC_SPEECH_RATE=0.85
+EXPO_PUBLIC_TTS_VOICE=en-US-JennyNeural
-EXPO_PUBLIC_SPEECH_PITCH=1
+EXPO_PUBLIC_TTS_RATE=0.88
-# Optional iOS voice identifier. Leave empty to use the first matching iPhone voice.
+EXPO_PUBLIC_TTS_PITCH=+0Hz
 # EXPO_PUBLIC_SPEECH_VOICE=com.apple.voice.compact.en-US.Samantha
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@ node_modules/
 dist/
 web-build/
 expo-env.d.ts
 .tts-cache/
 # Native
 .kotlin/
--- a/README.md
+++ b/README.md
@@ -109,7 +109,39 @@ npm run start
 ## Speech Input And Playback
-Playback works directly in Expo Go. AI replies are read aloud through the iPhone speaker. The chat also has buttons for `Read last answer` and `Stop`.
+Playback uses a local MP3 TTS server on the laptop. AI replies are sent to the laptop, converted to an MP3 with a Microsoft neural English voice, and then played on the iPhone. This avoids the robotic iPhone system voice.
 Start the TTS server in a second terminal:
 ```bash
 npm run tts:start
 ```
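 For Expo Go on iPhone, `.env` must point to the laptop's LAN IP address, because `localhost` on the phone refers to the phone itself. Replace the example address below with your laptop's own: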
 ```text
 EXPO_PUBLIC_TTS_BASE_URL=http://192.168.10.33:3333
 EXPO_PUBLIC_TTS_VOICE=en-US-JennyNeural
 EXPO_PUBLIC_TTS_RATE=0.88
 EXPO_PUBLIC_TTS_PITCH=+0Hz
 ```
 Useful English voices:
 - `en-US-JennyNeural`
 - `en-US-AvaNeural`
 - `en-US-EmmaNeural`
 - `en-US-GuyNeural`
 - `en-GB-SoniaNeural`
 - `en-GB-RyanNeural`
 You can list available voices while the TTS server is running:
 ```text
 http://192.168.10.33:3333/voices
 ```
 The chat has buttons for `Read last answer` and `Stop`.
 Speech input currently works through the iPhone keyboard:
--- a/package-lock.json
+++ b/package-lock.json
--- a/package.json
+++ b/package.json
@@ -7,14 +7,18 @@
    "android": "expo start --android",
    "ios": "expo start --ios",
    "web": "expo start --web",
-    "prebuild:ios": "expo prebuild --platform ios"
+    "prebuild:ios": "expo prebuild --platform ios",
    "tts:start": "node tools/tts-server.mjs"
  },
  "dependencies": {
    "@react-navigation/native": "^7.2.2",
    "@react-navigation/native-stack": "^7.14.12",
    "cors": "^2.8.6",
    "expo": "~54.0.33",
-    "expo-speech": "~14.0.8",
+    "expo-av": "~16.0.8",
    "expo-status-bar": "~3.0.9",
    "express": "^5.2.1",
    "msedge-tts": "^2.0.5",
    "react": "19.1.0",
    "react-native": "0.81.5",
    "react-native-safe-area-context": "~5.6.0",
--- a/src/config/speech.ts
+++ b/src/config/speech.ts
@@ -1,6 +0,0 @@
 // Raw values of the optional speech environment variables (undefined when unset).
 const languageFromEnv = process.env.EXPO_PUBLIC_SPEECH_LANGUAGE;
 const pitchFromEnv = process.env.EXPO_PUBLIC_SPEECH_PITCH;
 const rateFromEnv = process.env.EXPO_PUBLIC_SPEECH_RATE;

 /**
  * Settings for on-device speech playback, read from Expo public environment
  * variables. `pitch` and `rate` are converted with `Number(...)`;
  * `preferredVoice` stays undefined when no voice identifier is configured.
  */
 export const speechConfig = {
  language: languageFromEnv ?? 'en-US',
  preferredVoice: process.env.EXPO_PUBLIC_SPEECH_VOICE,
  pitch: Number(pitchFromEnv ?? 1),
  rate: Number(rateFromEnv ?? 0.85),
 };
--- a/src/config/tts.ts
+++ b/src/config/tts.ts
@@ -0,0 +1,6 @@
 /**
  * Trims surrounding whitespace and trailing slashes from a base URL so that it
  * can be joined with an absolute path such as `/tts` without producing `//tts`.
  *
  * @param url Base URL as configured; may be empty.
  * @returns The URL without trailing slashes, or an empty string.
  */
 function normalizeBaseUrl(url: string): string {
  return url.trim().replace(/\/+$/, '');
 }

 /**
  * Client-side settings for the laptop TTS server, read from Expo public
  * environment variables.
  *
  * Each variable is referenced with plain `process.env.NAME` dot notation on
  * purpose: that is the form Expo documents for `EXPO_PUBLIC_` variables.
  */
 export const ttsConfig = {
  // Empty when unset; stored without a trailing slash so `${baseUrl}/tts` is well-formed.
  baseUrl: normalizeBaseUrl(process.env.EXPO_PUBLIC_TTS_BASE_URL ?? ''),
  voice: process.env.EXPO_PUBLIC_TTS_VOICE ?? 'en-US-JennyNeural',
  rate: process.env.EXPO_PUBLIC_TTS_RATE ?? '-8%',
  pitch: process.env.EXPO_PUBLIC_TTS_PITCH ?? '+0Hz',
 };
--- a/src/services/speechService.ts
+++ b/src/services/speechService.ts
@@ -1,43 +1,62 @@
-import * as Speech from 'expo-speech';
+import { Audio, type AVPlaybackSource } from 'expo-av';
-import { speechConfig } from '../config/speech';
+import { ttsConfig } from '../config/tts';
-let cachedVoice: string | undefined;
+let currentSound: Audio.Sound | undefined;
 // Identifies the most recent speakText() call so that an older call which is
 // still waiting on the network can detect that it has been superseded.
 let latestSpeakRequest = 0;

 /**
  * Sends `text` to the TTS server configured in `ttsConfig` and plays the MP3
  * it returns.
  *
  * Any sound that is currently playing is stopped first. Nothing is played when
  * the cleaned text is empty, when no TTS base URL is configured, or when the
  * server response carries no usable `audioUrl`. If another speakText() call
  * starts while this one is still loading, this call gives up so that only the
  * newest reply is heard.
  *
  * @param text Raw assistant reply; markup is removed by `stripThinkingBlocks`.
  * @throws Error when the TTS server answers with a non-2xx status.
  */
 export async function speakText(text: string): Promise<void> {
  const requestId = ++latestSpeakRequest;
  const isSuperseded = () => requestId !== latestSpeakRequest;

  await stopSpeaking();

  const cleanText = stripThinkingBlocks(text);
  if (!cleanText || !ttsConfig.baseUrl) {
    return;
  }

  const response = await fetch(`${ttsConfig.baseUrl}/tts`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      text: cleanText,
      voice: ttsConfig.voice,
      rate: ttsConfig.rate,
      pitch: ttsConfig.pitch,
    }),
  });

  if (!response.ok) {
    throw new Error(`TTS request failed with ${response.status}`);
  }

  const data = (await response.json()) as { audioUrl?: unknown };
  if (typeof data.audioUrl !== 'string' || !data.audioUrl || isSuperseded()) {
    return;
  }

  const source: AVPlaybackSource = { uri: data.audioUrl };
  const { sound } = await Audio.Sound.createAsync(source, { shouldPlay: true });

  if (isSuperseded()) {
    // A newer request took over while this sound was loading; drop it instead
    // of overwriting the handle that belongs to the newer request.
    await sound.unloadAsync();
    return;
  }

  currentSound = sound;

  // Release the player once playback has finished; otherwise every spoken
  // reply would stay loaded until the next speakText()/stopSpeaking() call.
  sound.setOnPlaybackStatusUpdate((status) => {
    if (status.isLoaded && status.didJustFinish) {
      if (currentSound === sound) {
        currentSound = undefined;
      }
      // Best-effort cleanup: there is nobody to report an unload failure to.
      sound.unloadAsync().catch(() => undefined);
    }
  });
 }
-export function stopSpeaking() {
+export async function stopSpeaking() {
-  Speech.stop();
+  if (!currentSound) {
    return;
  }
  await currentSound.stopAsync();
  await currentSound.unloadAsync();
  currentSound = undefined;
 }
 /**
  * Reduces an assistant reply to plain text that is suitable for speech.
  *
  * Removes `<think>` blocks (including one that was never closed, e.g. a reply
  * that was cut off mid-reasoning), fenced code blocks, markdown emphasis and
  * heading characters, and bare URLs. Markdown links and inline code keep only
  * their visible text. Whitespace is collapsed to single spaces.
  *
  * @param text Raw reply text.
  * @returns The cleaned text; empty when nothing speakable remains.
  */
 function stripThinkingBlocks(text: string): string {
  return text
    .replace(/<think>[\s\S]*?<\/think>/gi, '')
    // An opening tag without a closing tag: everything after it is reasoning.
    .replace(/<think>[\s\S]*$/i, '')
    .replace(/```[\s\S]*?```/g, ' ')
    .replace(/`([^`]+)`/g, '$1')
    // Keep the label of [label](url) / ![alt](url); must run before URLs are
    // stripped, otherwise "[label](" would be left behind.
    .replace(/!?\[([^\]]*)\]\([^)]*\)/g, '$1')
    .replace(/\*\*([^*]+)\*\*/g, '$1')
    .replace(/[*_#>~]/g, '')
    .replace(/https?:\/\/\S+/g, '')
    .replace(/\s+/g, ' ')
    .trim();
 }
--- a/tools/tts-server.mjs
+++ b/tools/tts-server.mjs
@@ -0,0 +1,96 @@
 import crypto from 'node:crypto';
 import fs from 'node:fs';
 import path from 'node:path';
 import cors from 'cors';
 import express from 'express';
 import { MsEdgeTTS, OUTPUT_FORMAT } from 'msedge-tts';
 // Express application that turns text into cached MP3 files and serves them.
 const app = express();
 // Listening port and interface; overridable through TTS_PORT / TTS_HOST.
 const port = Number(process.env.TTS_PORT ?? 3333);
 const host = process.env.TTS_HOST ?? '0.0.0.0';
 // Generated MP3 files live in `.tts-cache` under the current working directory.
 const cacheDir = path.resolve(process.cwd(), '.tts-cache');
 // Create the cache directory up front; `recursive` makes this a no-op when it exists.
 fs.mkdirSync(cacheDir, { recursive: true });
 // Enable CORS for all routes.
 app.use(cors());
 // Parse JSON request bodies, rejecting payloads larger than 64 KB.
 app.use(express.json({ limit: '64kb' }));
 // Expose the cache directory under /audio with a one-hour max-age.
 app.use('/audio', express.static(cacheDir, { maxAge: '1h' }));
 // GET /health: reports that the server is up and which default voice it uses.
 app.get('/health', (_req, res) => {
  const status = { ok: true, voice: getVoiceFromEnv() };
  res.json(status);
 });
 // GET /voices: lists the voices whose locale starts with "en-", reduced to
 // name, friendly name, gender and locale.
 app.get('/voices', async (_req, res) => {
  const client = new MsEdgeTTS({});
  const allVoices = await client.getVoices();

  const englishVoices = [];
  for (const entry of allVoices) {
    if (!entry.Locale.startsWith('en-')) {
      continue;
    }
    englishVoices.push({
      name: entry.ShortName,
      friendlyName: entry.FriendlyName,
      gender: entry.Gender,
      locale: entry.Locale,
    });
  }

  res.json(englishVoices);
 });
 // POST /tts: synthesizes `text` to an MP3 (cached by voice, rate, pitch and
 // text) and responds with `{ audioUrl }` pointing at the /audio route.
 //
 // Body fields: `text` (required), `voice`, `rate`, `pitch` (optional; fall back
 // to TTS_VOICE / TTS_RATE / TTS_PITCH or built-in defaults).
 // Responds 400 when `text` is empty after sanitizing or when voice/rate/pitch
 // contain unexpected characters.
 app.post('/tts', async (request, response) => {
  const text = sanitizeText(String(request.body?.text ?? ''));
  if (!text) {
    response.status(400).json({ error: 'Missing text' });
    return;
  }

  const voice = String(request.body?.voice ?? getVoiceFromEnv());
  const rate = String(request.body?.rate ?? process.env.TTS_RATE ?? '-8%');
  const pitch = String(request.body?.pitch ?? process.env.TTS_PITCH ?? '+0Hz');

  // Only `text` is XML-escaped below; these three values are handed to the TTS
  // library as-is, so restrict them to the characters that voice names and
  // prosody values (e.g. "en-US-JennyNeural", "-8%", "0.88", "+0Hz") consist of.
  const voicePattern = /^[\w ,.()-]+$/;
  const prosodyPattern = /^[\w.+%-]+$/;
  if (!voicePattern.test(voice) || !prosodyPattern.test(rate) || !prosodyPattern.test(pitch)) {
    response.status(400).json({ error: 'Invalid voice, rate or pitch' });
    return;
  }

  const hash = crypto.createHash('sha256').update(`${voice}|${rate}|${pitch}|${text}`).digest('hex');
  const filename = `${hash}.mp3`;
  const filePath = path.join(cacheDir, filename);

  if (!fs.existsSync(filePath)) {
    // Synthesize into a private scratch directory so that concurrent requests
    // never write to the same intermediate files. It lives inside the cache
    // directory so the final rename stays on one filesystem.
    const workDir = fs.mkdtempSync(path.join(cacheDir, '.tmp-'));
    const tts = new MsEdgeTTS({});
    try {
      await tts.setMetadata(voice, OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3);
      const { audioFilePath } = await tts.toFile(workDir, escapeXml(text), { rate, pitch });
      fs.renameSync(audioFilePath, filePath);
    } finally {
      // Always release the TTS client and the scratch directory (which also
      // holds the metadata file, if any), even when synthesis failed.
      try {
        tts.close();
      } finally {
        fs.rmSync(workDir, { recursive: true, force: true });
      }
    }
  }

  response.json({ audioUrl: `${request.protocol}://${request.get('host')}/audio/${filename}` });
 });
 // Start the HTTP server and log the bound address and the default voice.
 app.listen(port, host, () => {
  const address = `http://${host}:${port}`;
  console.log(`TTS server listening on ${address}`);
  console.log(`Voice: ${getVoiceFromEnv()}`);
 });
 /**
  * Returns the default voice name: the TTS_VOICE environment variable when it
  * is set, otherwise 'en-US-JennyNeural'.
  */
 function getVoiceFromEnv() {
  const { TTS_VOICE: configuredVoice } = process.env;
  return configuredVoice ?? 'en-US-JennyNeural';
 }
 /**
  * Reduces request text to plain, speakable text of at most 1800 UTF-16 code
  * units.
  *
  * Removes `<think>` blocks (including one that was never closed), fenced code
  * blocks, markdown emphasis and heading characters, and bare URLs. Markdown
  * links and inline code keep only their visible text. Whitespace is collapsed
  * to single spaces. Truncation never leaves half of a surrogate pair or a
  * trailing space at the end.
  *
  * @param value Raw text from the request body.
  * @returns The cleaned, length-limited text; empty when nothing remains.
  */
 function sanitizeText(value) {
  const maxLength = 1800;

  const cleaned = value
    .replace(/<think>[\s\S]*?<\/think>/gi, '')
    // An opening tag without a closing tag: everything after it is reasoning.
    .replace(/<think>[\s\S]*$/i, '')
    .replace(/```[\s\S]*?```/g, ' ')
    .replace(/`([^`]+)`/g, '$1')
    // Keep the label of [label](url) / ![alt](url); must run before URLs are
    // stripped, otherwise "[label](" would be left behind.
    .replace(/!?\[([^\]]*)\]\([^)]*\)/g, '$1')
    .replace(/\*\*([^*]+)\*\*/g, '$1')
    .replace(/[*_#>~]/g, '')
    .replace(/https?:\/\/\S+/g, '')
    .replace(/\s+/g, ' ')
    .trim();

  if (cleaned.length <= maxLength) {
    return cleaned;
  }

  let truncated = cleaned.slice(0, maxLength);
  // If the cut landed between the two halves of a surrogate pair (e.g. an
  // emoji), drop the dangling high surrogate rather than emit invalid text.
  const lastUnit = truncated.charCodeAt(truncated.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    truncated = truncated.slice(0, -1);
  }
  return truncated.trimEnd();
 }
 /**
  * Escapes the five XML special characters (& < > " ') in `value` so the text
  * can be embedded in XML markup.
  */
 function escapeXml(value) {
  // The ampersand must be handled first so that the entities produced by the
  // later replacements are not escaped a second time.
  const replacements = [
    ['&', '&amp;'],
    ['<', '&lt;'],
    ['>', '&gt;'],
    ['"', '&quot;'],
    ["'", '&apos;'],
  ];

  let escaped = value;
  for (const [raw, entity] of replacements) {
    escaped = escaped.split(raw).join(entity);
  }
  return escaped;
 }