Bot - Full Example: Speech Translation API
// https://github.com/MicrosoftTranslator/NodeJs-Example/blob/master/app.js
Node.js Speech Translation Sample App
The sample illustrates how to use two methods of the Microsoft Translator Speech Translation API:
Calling ~/languages to get the list of supported languages for speech, text, and text-to-speech.
Calling ~/speech/translate to get the recognition and translation of an audio file. The audio file must be in PCM 16-bit, 16 kHz, mono WAV format (with header).
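For reference, these are the two endpoints as wired up in app.js below (from=en&to=fr is simply the sample's default language pair):
GET https://dev.microsofttranslator.com/languages?api-version=1.0&scope=text,tts,speech
wss://dev.microsofttranslator.com/speech/translate?api-version=1.0&from=en&to=fr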
Instructions
The sample requires a subscription to the Microsoft Translator Speech Translation API, which is part of Microsoft Azure Cognitive Services. Visit the Speech Translation API documentation page for instructions on getting a subscription.
To run the sample:
First, run npm install to install the package dependencies.
Edit app.js and enter your Azure Cognitive Services subscription key for Microsoft Translator Speech Translation API:
var azureClientSecret = '[subscription secret key]';
Run the sample by executing node app.js
If you want to use a different audio file (it must also be PCM 16-bit, 16 kHz, mono WAV with header), change the line:
var file = 'helloworld.wav';
//**********************************************************************************//
// Copyright (c) Microsoft. All rights reserved.
//
// MIT License
//
// You may obtain a copy of the License at
// http://opensource.org/licenses/MIT
//
// THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
// OR OTHER DEALINGS IN THE SOFTWARE.
//
//**********************************************************************************//
var request = require('request');
var wsClient = require('websocket').client;
var fs = require('fs');
var streamBuffers = require('stream-buffers');
var azureClientSecret = '[Speech Translation API Subscription Key]';
var speechTranslateUrl = 'wss://dev.microsofttranslator.com/speech/translate?api-version=1.0&from=en&to=fr';
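// to also receive synthesized audio of the translation, append '&features=texttospeech&voice=...'
// to this URL (see processMessage below); voice names come from the tts list returned by ~/languages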
// input wav file is in PCM 16bit, 16kHz, mono with proper WAV header
var file = 'helloworld.wav';
// get all the supported languages for speech/text/text to speech
request.get({
url: 'https://dev.microsofttranslator.com/languages?api-version=1.0&scope=text,tts,speech',
headers: {
'Accept-Language': 'fr' // language names in the response are localized according to this header
}
},
function (error, response, body) {
if (!error && response.statusCode == 200) {
// helper functions for sorting and getting voices given a language code
var nameSortFunc = function (x, y) { return x.name.localeCompare(y.name); };
// ttsDict is declared below; by the time getVoices is called it has been populated
var getVoices = function (code) {
    return ttsDict[code] == null
        ? null
        : ttsDict[code].sort(nameSortFunc).map(function (item) { return item.name; });
};
var jsonBody = JSON.parse(body);
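// expected response shape (inferred from the fields used below):
// {
//   speech: { '<speech code>': { name: '...', language: '<text lang code>' }, ... },
//   tts: { '<voice code>': { displayName: '...', gender: '...', regionName: '...', language: '<text lang code>' }, ... },
//   text: { '<text lang code>': { name: '...' }, ... }
// }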
// list of languages that support speech input (the 'from' language in speech/translate)
var speechDict = {};
var speechLang = jsonBody.speech;
for (var speechCode in speechLang) {
speechDict[speechLang[speechCode].language] = { name : speechLang[speechCode].name, code: speechCode };
}
// list of text to speech output voices
var ttsDict = {};
var ttsLang = jsonBody.tts;
for (var voiceName in ttsLang) {
var langCode = ttsLang[voiceName].language;
if (ttsDict[langCode] == null)
ttsDict[langCode] = [];
ttsDict[langCode].push({ name: ttsLang[voiceName].regionName + ' (' + ttsLang[voiceName].displayName + ' ' + ttsLang[voiceName].gender + ')', code: voiceName });
}
// list of languages that we can use for text translation (the 'to' language in speech/translate)
var langArr = [];
var textLang = jsonBody.text;
for (var langCode in textLang) {
var item = {
name : textLang[langCode].name,
code : langCode
};
// get the list of voices for this language code
var voices = getVoices(langCode);
if (voices != null)
item.voices = voices;
// does the language support speech input
if (speechDict[langCode] != null)
item.speech = speechDict[langCode];
langArr.push(item);
}
// sort the list based on name
langArr.sort(nameSortFunc);
// print out to console
console.log(langArr);
}
});
// Speech Translation API
// get an Azure Cognitive Services access token for the Translator APIs
request.post(
{
url: 'https://api.cognitive.microsoft.com/sts/v1.0/issueToken',
headers: {
'Ocp-Apim-Subscription-Key': azureClientSecret
}
},
// once we get the access token, we hook up the necessary websocket events for sending audio and processing the response
function (error, response, body) {
if (!error && response.statusCode == 200) {
// get the access token
var accessToken = body;
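// the token is a short-lived JWT (valid for about 10 minutes); a long-running app would need to refresh it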
// connect to the speech translate api
var ws = new wsClient();
// event for connection failure
ws.on('connectFailed', function (error) {
console.log('Initial connection failed: ' + error.toString());
});
// event for a successful connection
ws.on('connect', function (connection) {
console.log('Websocket client connected');
// process message that is returned
connection.on('message', processMessage);
connection.on('close', function (reasonCode, description) {
console.log('Connection closed: ' + reasonCode);
});
// print out the error
connection.on('error', function (error) {
console.log('Connection error: ' + error.toString());
});
// send the file to the websocket endpoint
sendData(connection, file);
});
// connect to the service
ws.connect(speechTranslateUrl, null, null, { 'Authorization' : 'Bearer ' + accessToken });
}
}
);
// process the response from the service
function processMessage(message) {
if (message.type == 'utf8') {
var result = JSON.parse(message.utf8Data);
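// result.type distinguishes interim hypotheses ('partial') from the completed utterance ('final')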
console.log('type:%s recognition:%s translation:%s', result.type, result.recognition, result.translation);
}
else {
// text to speech binary audio data if features=texttospeech is passed in the url
// the format will be PCM 16bit 16kHz mono
console.log(message.type);
}
}
// load the file and send the data to the websocket connection in chunks
function sendData(connection, filename) {
// the stream buffer raises its 'data' event at the given frequency and chunkSize
var myReadableStreamBuffer = new streamBuffers.ReadableStreamBuffer({
frequency: 100, // in milliseconds.
chunkSize: 32000 // 32 bytes per millisecond for PCM 16-bit, 16 kHz, mono, so we are sending 1 second's worth of audio every 100 ms
});
// read the file and put it to the buffer
myReadableStreamBuffer.put(fs.readFileSync(filename));
// silence bytes. If the audio file ends too soon after the user finishes speaking,
// we need to append some silence to tell the service that it has reached the end of the sentence.
// 32 bytes / ms, so 3200000 bytes = 100 seconds of silence
myReadableStreamBuffer.put(Buffer.alloc(3200000)); // Buffer.alloc zero-fills, so the bytes are true silence (new Buffer(size) is deprecated and uninitialized)
// no more data to send
myReadableStreamBuffer.stop();
// send data to underlying connection
myReadableStreamBuffer.on('data', function (data) {
connection.sendBytes(data);
});
myReadableStreamBuffer.on('end', function () {
console.log('All data sent, closing connection');
connection.close(1000);
});
}
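The sample assumes helloworld.wav is already in the required format. As an optional sanity check, not part of the original sample, a helper along these lines could validate the header before sending; the offsets follow the canonical 44-byte RIFF/WAVE layout and would need adjusting for files with extra chunks:
// Optional helper (not in the original sample): verify the WAV header matches
// the PCM 16-bit, 16 kHz, mono format the service expects.
function checkWavFormat(filename) {
    var header = fs.readFileSync(filename).slice(0, 44); // canonical 44-byte RIFF header
    return header.toString('ascii', 0, 4) === 'RIFF'
        && header.toString('ascii', 8, 12) === 'WAVE'
        && header.readUInt16LE(20) === 1       // audio format: 1 = PCM
        && header.readUInt16LE(22) === 1       // channels: 1 = mono
        && header.readUInt32LE(24) === 16000   // sample rate: 16 kHz
        && header.readUInt16LE(34) === 16;     // bits per sample: 16
}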