Use laptop MP3 TTS playback
This commit is contained in:
@@ -1,7 +1,6 @@
|
||||
EXPO_PUBLIC_OLLAMA_BASE_URL=http://localhost:11434
|
||||
EXPO_PUBLIC_OLLAMA_MODEL=llama3.2
|
||||
EXPO_PUBLIC_SPEECH_LANGUAGE=en-US
|
||||
EXPO_PUBLIC_SPEECH_RATE=0.85
|
||||
EXPO_PUBLIC_SPEECH_PITCH=1
|
||||
# Optional iOS voice identifier. Leave empty to use the first matching iPhone voice.
|
||||
# EXPO_PUBLIC_SPEECH_VOICE=com.apple.voice.compact.en-US.Samantha
|
||||
EXPO_PUBLIC_TTS_BASE_URL=http://localhost:3333
|
||||
EXPO_PUBLIC_TTS_VOICE=en-US-JennyNeural
|
||||
EXPO_PUBLIC_TTS_RATE=0.88
|
||||
EXPO_PUBLIC_TTS_PITCH=+0Hz
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -8,6 +8,7 @@ node_modules/
|
||||
dist/
|
||||
web-build/
|
||||
expo-env.d.ts
|
||||
.tts-cache/
|
||||
|
||||
# Native
|
||||
.kotlin/
|
||||
|
||||
34
README.md
34
README.md
@@ -109,7 +109,39 @@ npm run start
|
||||
|
||||
## Speech Input And Playback
|
||||
|
||||
Playback works directly in Expo Go. AI replies are read aloud through the iPhone speaker. The chat also has buttons for `Read last answer` and `Stop`.
|
||||
Playback uses a local MP3 TTS server on the laptop. AI replies are sent to the laptop, converted to an MP3 with a Microsoft neural English voice, and then played on the iPhone. This avoids the robotic iPhone system voice.
|
||||
|
||||
Start the TTS server in a second terminal:
|
||||
|
||||
```bash
|
||||
npm run tts:start
|
||||
```
|
||||
|
||||
For Expo Go on iPhone, `.env` must point to the laptop's LAN IP address instead of `localhost` (replace the example address below with your own):
|
||||
|
||||
```text
|
||||
EXPO_PUBLIC_TTS_BASE_URL=http://192.168.10.33:3333
|
||||
EXPO_PUBLIC_TTS_VOICE=en-US-JennyNeural
|
||||
EXPO_PUBLIC_TTS_RATE=0.88
|
||||
EXPO_PUBLIC_TTS_PITCH=+0Hz
|
||||
```
|
||||
|
||||
Useful English voices:
|
||||
|
||||
- `en-US-JennyNeural`
|
||||
- `en-US-AvaNeural`
|
||||
- `en-US-EmmaNeural`
|
||||
- `en-US-GuyNeural`
|
||||
- `en-GB-SoniaNeural`
|
||||
- `en-GB-RyanNeural`
|
||||
|
||||
You can list available voices while the TTS server is running:
|
||||
|
||||
```text
|
||||
http://192.168.10.33:3333/voices
|
||||
```
|
||||
|
||||
The chat has buttons for `Read last answer` and `Stop`.
|
||||
|
||||
Speech input currently works through the iPhone keyboard:
|
||||
|
||||
|
||||
935
package-lock.json
generated
935
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -7,14 +7,18 @@
|
||||
"android": "expo start --android",
|
||||
"ios": "expo start --ios",
|
||||
"web": "expo start --web",
|
||||
"prebuild:ios": "expo prebuild --platform ios"
|
||||
"prebuild:ios": "expo prebuild --platform ios",
|
||||
"tts:start": "node tools/tts-server.mjs"
|
||||
},
|
||||
"dependencies": {
|
||||
"@react-navigation/native": "^7.2.2",
|
||||
"@react-navigation/native-stack": "^7.14.12",
|
||||
"cors": "^2.8.6",
|
||||
"expo": "~54.0.33",
|
||||
"expo-speech": "~14.0.8",
|
||||
"expo-av": "~16.0.8",
|
||||
"expo-status-bar": "~3.0.9",
|
||||
"express": "^5.2.1",
|
||||
"msedge-tts": "^2.0.5",
|
||||
"react": "19.1.0",
|
||||
"react-native": "0.81.5",
|
||||
"react-native-safe-area-context": "~5.6.0",
|
||||
|
||||
@@ -1,6 +0,0 @@
|
||||
/**
 * Parses a numeric setting taken from an environment variable.
 *
 * `Number('')` is `0` and `Number('abc')` is `NaN`; neither is a usable
 * speech pitch or rate, so unset, blank, and non-numeric values all fall
 * back to the supplied default.
 *
 * @param raw - The raw environment value, if any.
 * @param fallback - The value to use when `raw` is missing or not a finite number.
 * @returns The parsed number, or `fallback`.
 */
function numberFromEnv(raw: string | undefined, fallback: number): number {
  if (raw === undefined || raw.trim() === '') {
    return fallback;
  }

  const parsed = Number(raw);
  return Number.isFinite(parsed) ? parsed : fallback;
}

/**
 * Settings for on-device speech playback, read from `EXPO_PUBLIC_SPEECH_*`
 * environment variables with built-in defaults.
 */
export const speechConfig = {
  language: process.env.EXPO_PUBLIC_SPEECH_LANGUAGE ?? 'en-US',
  // Optional; left undefined when no specific voice is configured.
  preferredVoice: process.env.EXPO_PUBLIC_SPEECH_VOICE,
  pitch: numberFromEnv(process.env.EXPO_PUBLIC_SPEECH_PITCH, 1),
  rate: numberFromEnv(process.env.EXPO_PUBLIC_SPEECH_RATE, 0.85),
};
|
||||
6
src/config/tts.ts
Normal file
6
src/config/tts.ts
Normal file
@@ -0,0 +1,6 @@
|
||||
/**
 * Normalises the configured TTS server address.
 *
 * Surrounding whitespace and trailing slashes are removed so that a path can
 * be appended with a single `/` (for example `${baseUrl}/tts`) without
 * producing `//tts`.
 *
 * @param raw - The raw environment value, if any.
 * @returns The cleaned base URL, or an empty string when none is configured.
 */
function normalizeBaseUrl(raw: string | undefined): string {
  return (raw ?? '').trim().replace(/\/+$/, '');
}

/**
 * Settings for the MP3 TTS server, read from `EXPO_PUBLIC_TTS_*` environment
 * variables with built-in defaults. `baseUrl` is an empty string when no
 * server is configured.
 */
export const ttsConfig = {
  baseUrl: normalizeBaseUrl(process.env.EXPO_PUBLIC_TTS_BASE_URL),
  voice: process.env.EXPO_PUBLIC_TTS_VOICE ?? 'en-US-JennyNeural',
  rate: process.env.EXPO_PUBLIC_TTS_RATE ?? '-8%',
  pitch: process.env.EXPO_PUBLIC_TTS_PITCH ?? '+0Hz',
};
|
||||
@@ -1,43 +1,62 @@
|
||||
import * as Speech from 'expo-speech';
|
||||
import { Audio, type AVPlaybackSource } from 'expo-av';
|
||||
|
||||
import { speechConfig } from '../config/speech';
|
||||
import { ttsConfig } from '../config/tts';
|
||||
|
||||
let cachedVoice: string | undefined;
|
||||
let currentSound: Audio.Sound | undefined;
|
||||
|
||||
export async function speakText(text: string) {
|
||||
Speech.stop();
|
||||
await stopSpeaking();
|
||||
|
||||
const voice = await getPreferredVoice();
|
||||
|
||||
Speech.speak(stripThinkingBlocks(text), {
|
||||
language: speechConfig.language,
|
||||
pitch: speechConfig.pitch,
|
||||
rate: speechConfig.rate,
|
||||
voice,
|
||||
});
|
||||
const cleanText = stripThinkingBlocks(text);
|
||||
if (!cleanText || !ttsConfig.baseUrl) {
|
||||
return;
|
||||
}
|
||||
|
||||
export function stopSpeaking() {
|
||||
Speech.stop();
|
||||
const response = await fetch(`${ttsConfig.baseUrl}/tts`, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
body: JSON.stringify({
|
||||
text: cleanText,
|
||||
voice: ttsConfig.voice,
|
||||
rate: ttsConfig.rate,
|
||||
pitch: ttsConfig.pitch,
|
||||
}),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`TTS request failed with ${response.status}`);
|
||||
}
|
||||
|
||||
const data = (await response.json()) as { audioUrl?: string };
|
||||
if (!data.audioUrl) {
|
||||
return;
|
||||
}
|
||||
|
||||
const source: AVPlaybackSource = { uri: data.audioUrl };
|
||||
const { sound } = await Audio.Sound.createAsync(source, { shouldPlay: true });
|
||||
currentSound = sound;
|
||||
}
|
||||
|
||||
/**
 * Stops and releases the sound that is currently registered, if any.
 *
 * Cleanup is best-effort: a failure to stop or unload is logged rather than
 * thrown, so callers that stop before starting new playback are not blocked
 * by a sound that is already in a bad state.
 */
export async function stopSpeaking(): Promise<void> {
  const sound = currentSound;
  if (!sound) {
    return;
  }

  // Drop the reference before awaiting anything: if stopping fails, a stale
  // sound must not stay registered and make every later call fail again.
  currentSound = undefined;

  try {
    await sound.stopAsync();
  } catch (error: unknown) {
    console.warn('Failed to stop TTS playback', error);
  }

  try {
    // Always attempt to unload so the sound's resources are released even
    // when stopping failed.
    await sound.unloadAsync();
  } catch (error: unknown) {
    console.warn('Failed to unload TTS sound', error);
  }
}
|
||||
|
||||
function stripThinkingBlocks(text: string) {
|
||||
return text.replace(/<think>[\s\S]*?<\/think>/gi, '').trim();
|
||||
}
|
||||
|
||||
async function getPreferredVoice() {
|
||||
if (cachedVoice) {
|
||||
return cachedVoice;
|
||||
}
|
||||
|
||||
if (speechConfig.preferredVoice) {
|
||||
cachedVoice = speechConfig.preferredVoice;
|
||||
return cachedVoice;
|
||||
}
|
||||
|
||||
const voices = await Speech.getAvailableVoicesAsync();
|
||||
const matchingVoice = voices.find((voice) => voice.language === speechConfig.language);
|
||||
cachedVoice = matchingVoice?.identifier;
|
||||
|
||||
return cachedVoice;
|
||||
return text
|
||||
.replace(/<think>[\s\S]*?<\/think>/gi, '')
|
||||
.replace(/```[\s\S]*?```/g, ' ')
|
||||
.replace(/`([^`]+)`/g, '$1')
|
||||
.replace(/\*\*([^*]+)\*\*/g, '$1')
|
||||
.replace(/[*_#>~]/g, '')
|
||||
.replace(/https?:\/\/\S+/g, '')
|
||||
.replace(/\s+/g, ' ')
|
||||
.trim();
|
||||
}
|
||||
|
||||
96
tools/tts-server.mjs
Normal file
96
tools/tts-server.mjs
Normal file
@@ -0,0 +1,96 @@
|
||||
import crypto from 'node:crypto';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import cors from 'cors';
|
||||
import express from 'express';
|
||||
import { MsEdgeTTS, OUTPUT_FORMAT } from 'msedge-tts';
|
||||
|
||||
// Listen address and port; both can be overridden through the environment.
const port = Number(process.env.TTS_PORT ?? 3333);
const host = process.env.TTS_HOST ?? '0.0.0.0';

// Generated MP3 files live in `.tts-cache` under the current working directory.
const cacheDir = path.resolve('.tts-cache');
fs.mkdirSync(cacheDir, { recursive: true });

const app = express();

// Middleware order: CORS, JSON body parsing (64 kB cap), then static serving
// of the cached audio files under /audio.
app.use(cors());
app.use(express.json({ limit: '64kb' }));
app.use('/audio', express.static(cacheDir, { maxAge: '1h' }));
|
||||
|
||||
// GET /health: liveness probe that also reports the default voice.
app.get('/health', (_req, res) => {
  const voice = getVoiceFromEnv();
  res.json({ ok: true, voice });
});
|
||||
|
||||
// GET /voices: lists the voices whose locale starts with "en-", reduced to
// the fields a client needs to pick one.
app.get('/voices', async (_req, res) => {
  const catalogue = await new MsEdgeTTS({}).getVoices();

  const english = [];
  for (const entry of catalogue) {
    if (!entry.Locale.startsWith('en-')) {
      continue;
    }

    english.push({
      name: entry.ShortName,
      friendlyName: entry.FriendlyName,
      gender: entry.Gender,
      locale: entry.Locale,
    });
  }

  res.json(english);
});
|
||||
|
||||
// POST /tts: synthesises `text` to an MP3 in the cache directory (unless an
// identical request was already rendered) and replies with `{ audioUrl }`.
// Body fields: `text` (required), `voice`, `rate`, `pitch` (optional; they
// fall back to the TTS_* environment variables and then to built-in defaults).
app.post('/tts', async (request, response) => {
  const text = sanitizeText(String(request.body?.text ?? ''));

  if (!text) {
    response.status(400).json({ error: 'Missing text' });
    return;
  }

  const voice = String(request.body?.voice ?? getVoiceFromEnv());
  const rate = String(request.body?.rate ?? process.env.TTS_RATE ?? '-8%');
  const pitch = String(request.body?.pitch ?? process.env.TTS_PITCH ?? '+0Hz');

  // These values come straight from the client and, unlike `text`, are not
  // escaped here. NOTE(review): the TTS library presumably places them into
  // SSML attributes, so reject markup characters instead of forwarding them.
  const markupChars = /[<>&"']/;
  if ([voice, rate, pitch].some((option) => markupChars.test(option))) {
    response.status(400).json({ error: 'Invalid voice, rate or pitch' });
    return;
  }

  // The cache key covers every input that affects the audio, so the same
  // request always maps to the same file name.
  const hash = crypto.createHash('sha256').update(`${voice}|${rate}|${pitch}|${text}`).digest('hex');
  const filename = `${hash}.mp3`;
  const filePath = path.join(cacheDir, filename);

  if (!fs.existsSync(filePath)) {
    const tts = new MsEdgeTTS({});
    try {
      await tts.setMetadata(voice, OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3);
      const { audioFilePath, metadataFilePath } = await tts.toFile(cacheDir, escapeXml(text), { rate, pitch });
      // Move the generated file to its content-addressed name.
      fs.renameSync(audioFilePath, filePath);
      if (metadataFilePath && fs.existsSync(metadataFilePath)) {
        fs.unlinkSync(metadataFilePath);
      }
    } finally {
      // Release the client even when synthesis or the file operations throw;
      // the error itself still propagates to the framework's error handling.
      tts.close();
    }
  }

  response.json({ audioUrl: `${request.protocol}://${request.get('host')}/audio/${filename}` });
});
|
||||
|
||||
// Start the HTTP server and print the listen address and default voice.
app.listen(port, host, () => {
  const banner = [
    `TTS server listening on http://${host}:${port}`,
    `Voice: ${getVoiceFromEnv()}`,
  ];

  for (const line of banner) {
    console.log(line);
  }
});
|
||||
|
||||
/**
 * Returns the default voice name: the `TTS_VOICE` environment variable when
 * it is set (even to an empty string), otherwise `en-US-JennyNeural`.
 */
function getVoiceFromEnv() {
  const configured = process.env.TTS_VOICE;
  if (configured == null) {
    return 'en-US-JennyNeural';
  }
  return configured;
}
|
||||
|
||||
/**
 * Turns chat/markdown text into plain text suitable for speech.
 *
 * Applied in order: drop `<think>…</think>` blocks, replace fenced code
 * blocks with a space, unwrap inline code and `**bold**`, remove leftover
 * markdown punctuation, remove http(s) URLs, collapse whitespace, trim, and
 * cap the result at 1800 characters.
 *
 * @param value - The raw text.
 * @returns The cleaned text, at most 1800 characters long.
 */
function sanitizeText(value) {
  const maxLength = 1800;

  // Order matters: fenced blocks must go before inline code, and markers are
  // unwrapped before the remaining punctuation is stripped.
  const rewrites = [
    [/<think>[\s\S]*?<\/think>/gi, ''],
    [/```[\s\S]*?```/g, ' '],
    [/`([^`]+)`/g, '$1'],
    [/\*\*([^*]+)\*\*/g, '$1'],
    [/[*_#>~]/g, ''],
    [/https?:\/\/\S+/g, ''],
    [/\s+/g, ' '],
  ];

  let spoken = value;
  for (const [pattern, replacement] of rewrites) {
    spoken = spoken.replace(pattern, replacement);
  }

  return spoken.trim().slice(0, maxLength);
}
|
||||
|
||||
/**
 * Escapes the five XML special characters so arbitrary text can be embedded
 * in XML/SSML content.
 *
 * @param value - The plain text to escape.
 * @returns The text with `&`, `<`, `>`, `"` and `'` replaced by entities.
 */
function escapeXml(value) {
  return value
    // The ampersand must be handled first; otherwise the `&` of the entities
    // produced by the later replacements would be escaped a second time.
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&apos;');
}
|
||||
Reference in New Issue
Block a user