Use laptop MP3 TTS playback

2026-04-28 15:54:10 +02:00
parent 96435e53e1
commit aa3397c6f6
9 changed files with 1120 additions and 54 deletions
--- a/.env.example
+++ b/.env.example
@@ -1,7 +1,6 @@
 EXPO_PUBLIC_OLLAMA_BASE_URL=http://localhost:11434
 EXPO_PUBLIC_OLLAMA_MODEL=llama3.2
-EXPO_PUBLIC_SPEECH_LANGUAGE=en-US
-EXPO_PUBLIC_SPEECH_RATE=0.85
-EXPO_PUBLIC_SPEECH_PITCH=1
-# Optional iOS voice identifier. Leave empty to use the first matching iPhone voice.
-# EXPO_PUBLIC_SPEECH_VOICE=com.apple.voice.compact.en-US.Samantha
+EXPO_PUBLIC_TTS_BASE_URL=http://localhost:3333
+EXPO_PUBLIC_TTS_VOICE=en-US-JennyNeural
+EXPO_PUBLIC_TTS_RATE=0.88
+EXPO_PUBLIC_TTS_PITCH=+0Hz
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@ node_modules/
 dist/
 web-build/
 expo-env.d.ts
+.tts-cache/

 # Native
 .kotlin/
--- a/README.md
+++ b/README.md
@@ -109,7 +109,39 @@ npm run start

 ## Speech Input And Playback

-Playback works directly in Expo Go. AI replies are read aloud through the iPhone speaker. The chat also has buttons for `Read last answer` and `Stop`.
+Playback uses a local MP3 TTS server on the laptop. AI replies are sent to the laptop, converted to an MP3 with a Microsoft neural English voice, and then played on the iPhone. This avoids the robotic iPhone system voice.
+
+Start the TTS server in a second terminal:
+
+```bash
+npm run tts:start
+```
+
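+For Expo Go on iPhone, `.env` must point to the laptop's LAN IP address rather than `localhost`, which on the phone would refer to the phone itself. Replace the example address below with your own: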
+
+```text
+EXPO_PUBLIC_TTS_BASE_URL=http://192.168.10.33:3333
+EXPO_PUBLIC_TTS_VOICE=en-US-JennyNeural
+EXPO_PUBLIC_TTS_RATE=0.88
+EXPO_PUBLIC_TTS_PITCH=+0Hz
+```
+
+Useful English voices:
+
+- `en-US-JennyNeural`
+- `en-US-AvaNeural`
+- `en-US-EmmaNeural`
+- `en-US-GuyNeural`
+- `en-GB-SoniaNeural`
+- `en-GB-RyanNeural`
+
+You can list the available English voices (the endpoint returns only `en-*` locales) by opening this URL while the TTS server is running:
+
+```text
+http://192.168.10.33:3333/voices
+```
+
+The chat has buttons for `Read last answer` and `Stop`.

 Speech input currently works through the iPhone keyboard:

--- a/package-lock.json
+++ b/package-lock.json
--- a/package.json
+++ b/package.json
@@ -7,14 +7,18 @@
    "android": "expo start --android",
    "ios": "expo start --ios",
    "web": "expo start --web",
-    "prebuild:ios": "expo prebuild --platform ios"
+    "prebuild:ios": "expo prebuild --platform ios",
+    "tts:start": "node tools/tts-server.mjs"
  },
  "dependencies": {
    "@react-navigation/native": "^7.2.2",
    "@react-navigation/native-stack": "^7.14.12",
+    "cors": "^2.8.6",
    "expo": "~54.0.33",
-    "expo-speech": "~14.0.8",
+    "expo-av": "~16.0.8",
    "expo-status-bar": "~3.0.9",
+    "express": "^5.2.1",
+    "msedge-tts": "^2.0.5",
    "react": "19.1.0",
    "react-native": "0.81.5",
    "react-native-safe-area-context": "~5.6.0",
--- a/src/config/speech.ts
+++ b/src/config/speech.ts
@@ -1,6 +0,0 @@
-export const speechConfig = {
-  language: process.env.EXPO_PUBLIC_SPEECH_LANGUAGE ?? 'en-US',
-  preferredVoice: process.env.EXPO_PUBLIC_SPEECH_VOICE,
-  pitch: Number(process.env.EXPO_PUBLIC_SPEECH_PITCH ?? 1),
-  rate: Number(process.env.EXPO_PUBLIC_SPEECH_RATE ?? 0.85),
-};
--- a/src/config/tts.ts
+++ b/src/config/tts.ts
@@ -0,0 +1,6 @@
+export const ttsConfig = {
+  baseUrl: process.env.EXPO_PUBLIC_TTS_BASE_URL ?? '',
+  voice: process.env.EXPO_PUBLIC_TTS_VOICE ?? 'en-US-JennyNeural',
+  rate: process.env.EXPO_PUBLIC_TTS_RATE ?? '-8%',
+  pitch: process.env.EXPO_PUBLIC_TTS_PITCH ?? '+0Hz',
+};
--- a/src/services/speechService.ts
+++ b/src/services/speechService.ts
@@ -1,43 +1,62 @@
-import * as Speech from 'expo-speech';
+import { Audio, type AVPlaybackSource } from 'expo-av';

-import { speechConfig } from '../config/speech';
+import { ttsConfig } from '../config/tts';

-let cachedVoice: string | undefined;
+let currentSound: Audio.Sound | undefined;

 export async function speakText(text: string) {
-  Speech.stop();
+  await stopSpeaking();

-  const voice = await getPreferredVoice();
+  const cleanText = stripThinkingBlocks(text);
+  if (!cleanText || !ttsConfig.baseUrl) {
+    return;
+  }

-  Speech.speak(stripThinkingBlocks(text), {
-    language: speechConfig.language,
-    pitch: speechConfig.pitch,
-    rate: speechConfig.rate,
-    voice,
+  const response = await fetch(`${ttsConfig.baseUrl}/tts`, {
+    method: 'POST',
+    headers: {
+      'Content-Type': 'application/json',
+    },
+    body: JSON.stringify({
+      text: cleanText,
+      voice: ttsConfig.voice,
+      rate: ttsConfig.rate,
+      pitch: ttsConfig.pitch,
+    }),
  });
+
+  if (!response.ok) {
+    throw new Error(`TTS request failed with ${response.status}`);
+  }
+
+  const data = (await response.json()) as { audioUrl?: string };
+  if (!data.audioUrl) {
+    return;
+  }
+
+  const source: AVPlaybackSource = { uri: data.audioUrl };
+  const { sound } = await Audio.Sound.createAsync(source, { shouldPlay: true });
+  currentSound = sound;
 }

-export function stopSpeaking() {
-  Speech.stop();
+export async function stopSpeaking() {
+  if (!currentSound) {
+    return;
+  }
+
+  await currentSound.stopAsync();
+  await currentSound.unloadAsync();
+  currentSound = undefined;
 }

 function stripThinkingBlocks(text: string) {
-  return text.replace(/<think>[\s\S]*?<\/think>/gi, '').trim();
-}
-
-async function getPreferredVoice() {
-  if (cachedVoice) {
-    return cachedVoice;
-  }
-
-  if (speechConfig.preferredVoice) {
-    cachedVoice = speechConfig.preferredVoice;
-    return cachedVoice;
-  }
-
-  const voices = await Speech.getAvailableVoicesAsync();
-  const matchingVoice = voices.find((voice) => voice.language === speechConfig.language);
-  cachedVoice = matchingVoice?.identifier;
-
-  return cachedVoice;
+  return text
+    .replace(/<think>[\s\S]*?<\/think>/gi, '')
+    .replace(/```[\s\S]*?```/g, ' ')
+    .replace(/`([^`]+)`/g, '$1')
+    .replace(/\*\*([^*]+)\*\*/g, '$1')
+    .replace(/[*_#>~]/g, '')
+    .replace(/https?:\/\/\S+/g, '')
+    .replace(/\s+/g, ' ')
+    .trim();
 }
--- a/tools/tts-server.mjs
+++ b/tools/tts-server.mjs
@@ -0,0 +1,96 @@
+import crypto from 'node:crypto';
+import fs from 'node:fs';
+import path from 'node:path';
+import cors from 'cors';
+import express from 'express';
+import { MsEdgeTTS, OUTPUT_FORMAT } from 'msedge-tts';
+
+const app = express();
+const port = Number(process.env.TTS_PORT ?? 3333);
+const host = process.env.TTS_HOST ?? '0.0.0.0';
+const cacheDir = path.resolve(process.cwd(), '.tts-cache');
+
+fs.mkdirSync(cacheDir, { recursive: true });
+
+app.use(cors());
+app.use(express.json({ limit: '64kb' }));
+app.use('/audio', express.static(cacheDir, { maxAge: '1h' }));
+
+app.get('/health', (_request, response) => {
+  response.json({ ok: true, voice: getVoiceFromEnv() });
+});
+
+app.get('/voices', async (_request, response) => {
+  const tts = new MsEdgeTTS({});
+  const voices = await tts.getVoices();
+  response.json(
+    voices
+      .filter((voice) => voice.Locale.startsWith('en-'))
+      .map((voice) => ({
+        name: voice.ShortName,
+        friendlyName: voice.FriendlyName,
+        gender: voice.Gender,
+        locale: voice.Locale,
+      })),
+  );
+});
+
+app.post('/tts', async (request, response) => {
+  const text = sanitizeText(String(request.body?.text ?? ''));
+
+  if (!text) {
+    response.status(400).json({ error: 'Missing text' });
+    return;
+  }
+
+  const voice = String(request.body?.voice ?? getVoiceFromEnv());
+  const rate = String(request.body?.rate ?? process.env.TTS_RATE ?? '-8%');
+  const pitch = String(request.body?.pitch ?? process.env.TTS_PITCH ?? '+0Hz');
+  const hash = crypto.createHash('sha256').update(`${voice}|${rate}|${pitch}|${text}`).digest('hex');
+  const filename = `${hash}.mp3`;
+  const filePath = path.join(cacheDir, filename);
+
+  if (!fs.existsSync(filePath)) {
+    const tts = new MsEdgeTTS({});
+    await tts.setMetadata(voice, OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3);
+    const { audioFilePath, metadataFilePath } = await tts.toFile(cacheDir, escapeXml(text), { rate, pitch });
+    fs.renameSync(audioFilePath, filePath);
+    if (metadataFilePath && fs.existsSync(metadataFilePath)) {
+      fs.unlinkSync(metadataFilePath);
+    }
+    tts.close();
+  }
+
+  response.json({ audioUrl: `${request.protocol}://${request.get('host')}/audio/${filename}` });
+});
+
+app.listen(port, host, () => {
+  console.log(`TTS server listening on http://${host}:${port}`);
+  console.log(`Voice: ${getVoiceFromEnv()}`);
+});
+
+function getVoiceFromEnv() {
+  return process.env.TTS_VOICE ?? 'en-US-JennyNeural';
+}
+
+function sanitizeText(value) {
+  return value
+    .replace(/<think>[\s\S]*?<\/think>/gi, '')
+    .replace(/```[\s\S]*?```/g, ' ')
+    .replace(/`([^`]+)`/g, '$1')
+    .replace(/\*\*([^*]+)\*\*/g, '$1')
+    .replace(/[*_#>~]/g, '')
+    .replace(/https?:\/\/\S+/g, '')
+    .replace(/\s+/g, ' ')
+    .trim()
+    .slice(0, 1800);
+}
+
+function escapeXml(value) {
+  return value
+    .replace(/&/g, '&amp;')
+    .replace(/</g, '&lt;')
+    .replace(/>/g, '&gt;')
+    .replace(/"/g, '&quot;')
+    .replace(/'/g, '&apos;');
+}