Use laptop MP3 TTS playback

This commit is contained in:
Ismail Ali
2026-04-28 15:54:10 +02:00
parent 96435e53e1
commit aa3397c6f6
9 changed files with 1120 additions and 54 deletions

View File

@@ -1,7 +1,6 @@
EXPO_PUBLIC_OLLAMA_BASE_URL=http://localhost:11434
EXPO_PUBLIC_OLLAMA_MODEL=llama3.2
EXPO_PUBLIC_SPEECH_LANGUAGE=en-US
EXPO_PUBLIC_SPEECH_RATE=0.85
EXPO_PUBLIC_SPEECH_PITCH=1
# Optional iOS voice identifier. Leave empty to use the first matching iPhone voice.
# EXPO_PUBLIC_SPEECH_VOICE=com.apple.voice.compact.en-US.Samantha
EXPO_PUBLIC_TTS_BASE_URL=http://localhost:3333
EXPO_PUBLIC_TTS_VOICE=en-US-JennyNeural
EXPO_PUBLIC_TTS_RATE=0.88
EXPO_PUBLIC_TTS_PITCH=+0Hz

1
.gitignore vendored
View File

@@ -8,6 +8,7 @@ node_modules/
dist/
web-build/
expo-env.d.ts
.tts-cache/
# Native
.kotlin/

View File

@@ -109,7 +109,39 @@ npm run start
## Speech Input And Playback
Playback works directly in Expo Go. AI replies are read aloud through the iPhone speaker. The chat also has buttons for `Read last answer` and `Stop`.
Playback uses a local MP3 TTS server on the laptop. AI replies are sent to the laptop, converted to an MP3 with a Microsoft neural English voice, and then played on the iPhone. This avoids the robotic iPhone system voice.
Start the TTS server in a second terminal:
```bash
npm run tts:start
```
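For Expo Go on iPhone, `.env` must point to the laptop's LAN IP address rather than `localhost`, because on the phone `localhost` refers to the phone itself. Replace `192.168.10.33` below with your laptop's own address: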
```text
EXPO_PUBLIC_TTS_BASE_URL=http://192.168.10.33:3333
EXPO_PUBLIC_TTS_VOICE=en-US-JennyNeural
EXPO_PUBLIC_TTS_RATE=0.88
EXPO_PUBLIC_TTS_PITCH=+0Hz
```
Useful English voices:
- `en-US-JennyNeural`
- `en-US-AvaNeural`
- `en-US-EmmaNeural`
- `en-US-GuyNeural`
- `en-GB-SoniaNeural`
- `en-GB-RyanNeural`
You can list available voices while the TTS server is running:
```text
http://192.168.10.33:3333/voices
```
The chat has buttons for `Read last answer` and `Stop`.
Speech input currently works through the iPhone keyboard:

935
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -7,14 +7,18 @@
"android": "expo start --android",
"ios": "expo start --ios",
"web": "expo start --web",
"prebuild:ios": "expo prebuild --platform ios"
"prebuild:ios": "expo prebuild --platform ios",
"tts:start": "node tools/tts-server.mjs"
},
"dependencies": {
"@react-navigation/native": "^7.2.2",
"@react-navigation/native-stack": "^7.14.12",
"cors": "^2.8.6",
"expo": "~54.0.33",
"expo-speech": "~14.0.8",
"expo-av": "~16.0.8",
"expo-status-bar": "~3.0.9",
"express": "^5.2.1",
"msedge-tts": "^2.0.5",
"react": "19.1.0",
"react-native": "0.81.5",
"react-native-safe-area-context": "~5.6.0",

View File

@@ -1,6 +0,0 @@
/**
 * Parses a numeric setting taken from an environment variable.
 *
 * `Number('')` is `0` and `Number('abc')` is `NaN`; neither is a usable
 * pitch or rate, so blank or non-numeric values fall back to the default
 * instead of being passed on silently.
 *
 * @param raw - The raw environment value, or `undefined` when unset.
 * @param fallback - The value to use when `raw` is missing or not a finite number.
 * @returns The parsed number, or `fallback`.
 */
function parseNumericSetting(raw: string | undefined, fallback: number): number {
  if (raw === undefined || raw.trim() === '') {
    return fallback;
  }

  const parsed = Number(raw);
  return Number.isFinite(parsed) ? parsed : fallback;
}

/**
 * Speech settings read from `EXPO_PUBLIC_SPEECH_*` environment variables.
 *
 * The variables are referenced with static dot access on `process.env`.
 */
export const speechConfig = {
  language: process.env.EXPO_PUBLIC_SPEECH_LANGUAGE ?? 'en-US',
  // Optional: stays `undefined` when no voice identifier is configured.
  preferredVoice: process.env.EXPO_PUBLIC_SPEECH_VOICE,
  pitch: parseNumericSetting(process.env.EXPO_PUBLIC_SPEECH_PITCH, 1),
  rate: parseNumericSetting(process.env.EXPO_PUBLIC_SPEECH_RATE, 0.85),
};

6
src/config/tts.ts Normal file
View File

@@ -0,0 +1,6 @@
/**
 * Normalizes the configured TTS server URL so that path segments can be
 * appended with a single slash.
 *
 * Without this, a value such as `http://host:3333/` would produce
 * `http://host:3333//tts` when `/tts` is appended.
 *
 * @param rawUrl - The base URL as written in the environment.
 * @returns The URL without surrounding whitespace or trailing slashes.
 */
function normalizeBaseUrl(rawUrl: string): string {
  return rawUrl.trim().replace(/\/+$/, '');
}

/**
 * Settings for the TTS server, read from `EXPO_PUBLIC_TTS_*` environment
 * variables.
 *
 * The variables are referenced with static dot access on `process.env`.
 * An empty `baseUrl` means no TTS server is configured.
 */
export const ttsConfig = {
  baseUrl: normalizeBaseUrl(process.env.EXPO_PUBLIC_TTS_BASE_URL ?? ''),
  voice: process.env.EXPO_PUBLIC_TTS_VOICE ?? 'en-US-JennyNeural',
  rate: process.env.EXPO_PUBLIC_TTS_RATE ?? '-8%',
  pitch: process.env.EXPO_PUBLIC_TTS_PITCH ?? '+0Hz',
};

View File

@@ -1,43 +1,62 @@
import * as Speech from 'expo-speech';
import { Audio, type AVPlaybackSource } from 'expo-av';
import { speechConfig } from '../config/speech';
import { ttsConfig } from '../config/tts';
let cachedVoice: string | undefined;
let currentSound: Audio.Sound | undefined;
export async function speakText(text: string) {
Speech.stop();
await stopSpeaking();
const voice = await getPreferredVoice();
const cleanText = stripThinkingBlocks(text);
if (!cleanText || !ttsConfig.baseUrl) {
return;
}
Speech.speak(stripThinkingBlocks(text), {
language: speechConfig.language,
pitch: speechConfig.pitch,
rate: speechConfig.rate,
voice,
const response = await fetch(`${ttsConfig.baseUrl}/tts`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
text: cleanText,
voice: ttsConfig.voice,
rate: ttsConfig.rate,
pitch: ttsConfig.pitch,
}),
});
if (!response.ok) {
throw new Error(`TTS request failed with ${response.status}`);
}
const data = (await response.json()) as { audioUrl?: string };
if (!data.audioUrl) {
return;
}
const source: AVPlaybackSource = { uri: data.audioUrl };
const { sound } = await Audio.Sound.createAsync(source, { shouldPlay: true });
currentSound = sound;
}
export function stopSpeaking() {
Speech.stop();
/**
 * Stops the sound currently tracked in `currentSound`, if any, and releases it.
 *
 * The module-level reference is cleared before any awaiting, so a failure
 * while stopping cannot leave a stale sound behind for the next call, and
 * overlapping calls do not act on the same sound twice.
 *
 * @returns A promise that settles once the sound has been stopped and unloaded.
 * @throws Whatever `stopAsync` or `unloadAsync` rejects with.
 */
export async function stopSpeaking() {
  const sound = currentSound;
  if (!sound) {
    return;
  }

  // Detach first: from here on this call owns the sound exclusively.
  currentSound = undefined;

  try {
    await sound.stopAsync();
  } finally {
    // Unload even when stopping failed, so the sound is still released.
    // NOTE(review): presumably stopAsync rejects for a sound that never
    // finished loading — confirm against the expo-av documentation.
    await sound.unloadAsync();
  }
}
function stripThinkingBlocks(text: string) {
return text.replace(/<think>[\s\S]*?<\/think>/gi, '').trim();
}
async function getPreferredVoice() {
if (cachedVoice) {
return cachedVoice;
}
if (speechConfig.preferredVoice) {
cachedVoice = speechConfig.preferredVoice;
return cachedVoice;
}
const voices = await Speech.getAvailableVoicesAsync();
const matchingVoice = voices.find((voice) => voice.language === speechConfig.language);
cachedVoice = matchingVoice?.identifier;
return cachedVoice;
return text
.replace(/<think>[\s\S]*?<\/think>/gi, '')
.replace(/```[\s\S]*?```/g, ' ')
.replace(/`([^`]+)`/g, '$1')
.replace(/\*\*([^*]+)\*\*/g, '$1')
.replace(/[*_#>~]/g, '')
.replace(/https?:\/\/\S+/g, '')
.replace(/\s+/g, ' ')
.trim();
}

96
tools/tts-server.mjs Normal file
View File

@@ -0,0 +1,96 @@
import crypto from 'node:crypto';
import fs from 'node:fs';
import path from 'node:path';
import cors from 'cors';
import express from 'express';
import { MsEdgeTTS, OUTPUT_FORMAT } from 'msedge-tts';
// Express application that serves the TTS endpoints and the generated audio files.
const app = express();
// Listening port and host; overridable through TTS_PORT and TTS_HOST.
const port = Number(process.env.TTS_PORT ?? 3333);
const host = process.env.TTS_HOST ?? '0.0.0.0';
// Generated MP3 files are kept in `.tts-cache` under the current working directory.
const cacheDir = path.resolve(process.cwd(), '.tts-cache');
// Create the cache directory up front; `recursive` makes this a no-op when it already exists.
fs.mkdirSync(cacheDir, { recursive: true });
// Middleware is registered in this order: CORS (package defaults), JSON body
// parsing capped at 64kb, then static serving of the cache directory under /audio.
app.use(cors());
app.use(express.json({ limit: '64kb' }));
app.use('/audio', express.static(cacheDir, { maxAge: '1h' }));
// GET /health — reports that the server is up and which voice the environment selects.
app.get('/health', (_request, response) => {
response.json({ ok: true, voice: getVoiceFromEnv() });
});
// GET /voices — responds with the voices whose locale starts with "en-",
// reduced to name, friendly name, gender and locale.
app.get('/voices', async (_request, response) => {
  const client = new MsEdgeTTS({});
  const allVoices = await client.getVoices();

  const englishVoices = [];
  for (const entry of allVoices) {
    if (!entry.Locale.startsWith('en-')) {
      continue;
    }
    englishVoices.push({
      name: entry.ShortName,
      friendlyName: entry.FriendlyName,
      gender: entry.Gender,
      locale: entry.Locale,
    });
  }

  response.json(englishVoices);
});
/**
 * POST /tts — synthesizes the request text to an MP3 and responds with its URL.
 *
 * Request body fields: `text` (required), plus optional `voice`, `rate` and
 * `pitch`, which fall back to TTS_VOICE / TTS_RATE / TTS_PITCH and then to
 * built-in defaults. Responds 400 when no text is left after sanitizing.
 *
 * Files are cached under a SHA-256 of voice, rate, pitch and text, so a
 * repeated request reuses the existing MP3 instead of synthesizing again.
 */
app.post('/tts', async (request, response) => {
  const text = sanitizeText(String(request.body?.text ?? ''));
  if (!text) {
    response.status(400).json({ error: 'Missing text' });
    return;
  }

  const voice = String(request.body?.voice ?? getVoiceFromEnv());
  const rate = String(request.body?.rate ?? process.env.TTS_RATE ?? '-8%');
  const pitch = String(request.body?.pitch ?? process.env.TTS_PITCH ?? '+0Hz');

  const hash = crypto.createHash('sha256').update(`${voice}|${rate}|${pitch}|${text}`).digest('hex');
  const filename = `${hash}.mp3`;
  const filePath = path.join(cacheDir, filename);

  if (!fs.existsSync(filePath)) {
    const tts = new MsEdgeTTS({});
    try {
      await tts.setMetadata(voice, OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3);
      const { audioFilePath, metadataFilePath } = await tts.toFile(cacheDir, escapeXml(text), { rate, pitch });
      // Move the output to its content-addressed name so later requests hit the cache.
      fs.renameSync(audioFilePath, filePath);
      if (metadataFilePath && fs.existsSync(metadataFilePath)) {
        fs.unlinkSync(metadataFilePath);
      }
    } finally {
      // Always release the client, including when synthesis or the rename fails.
      tts.close();
    }
  }

  // Build the URL from the incoming request so the client gets the same
  // protocol and host it used to reach this server.
  response.json({ audioUrl: `${request.protocol}://${request.get('host')}/audio/${filename}` });
});
// Start the server and log the bind address and the voice chosen by the environment.
app.listen(port, host, () => {
  const startupLines = [
    `TTS server listening on http://${host}:${port}`,
    `Voice: ${getVoiceFromEnv()}`,
  ];
  for (const line of startupLines) {
    console.log(line);
  }
});
/**
 * Returns the voice name from the TTS_VOICE environment variable, or
 * 'en-US-JennyNeural' when the variable is not set.
 */
function getVoiceFromEnv() {
  const { TTS_VOICE: configuredVoice } = process.env;
  return configuredVoice ?? 'en-US-JennyNeural';
}
/**
 * Reduces chat text to plain prose and caps its length.
 *
 * Removes `<think>…</think>` blocks, fenced code blocks and URLs, unwraps
 * inline code and `**bold**`, strips remaining `* _ # > ~` characters,
 * collapses whitespace, trims, and finally truncates.
 *
 * @param {string} value - The raw text.
 * @param {number} [maxLength=1800] - Maximum length of the result in UTF-16 code units.
 * @returns {string} The cleaned text, at most `maxLength` code units long.
 */
function sanitizeText(value, maxLength = 1800) {
  const cleaned = value
    .replace(/<think>[\s\S]*?<\/think>/gi, '')
    .replace(/```[\s\S]*?```/g, ' ')
    .replace(/`([^`]+)`/g, '$1')
    .replace(/\*\*([^*]+)\*\*/g, '$1')
    .replace(/[*_#>~]/g, '')
    .replace(/https?:\/\/\S+/g, '')
    .replace(/\s+/g, ' ')
    .trim();

  if (cleaned.length <= maxLength) {
    return cleaned;
  }

  // The limit counts UTF-16 code units, so it can fall between the two halves
  // of a surrogate pair (an emoji, for example). Drop a dangling high
  // surrogate rather than returning half a character.
  let end = maxLength;
  const lastUnit = cleaned.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    end -= 1;
  }
  return cleaned.slice(0, end);
}
/**
 * Escapes the five XML special characters (& < > " ') as entities.
 *
 * A single pass over the input, so the ampersands introduced by the
 * replacements are never escaped a second time.
 *
 * @param {string} value - The text to escape.
 * @returns {string} The text with those characters replaced by entities.
 */
function escapeXml(value) {
  const entityFor = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&apos;',
  };
  return value.replace(/[&<>"']/g, (symbol) => entityFor[symbol]);
}