This page explains how you can incorporate the Open Vokaturi software in your own C app (real-time version).
In the batch examples, there were precisely as many calls to VokaturiVoice_fill() as to VokaturiVoice_extract(). Now consider a real-time situation, where a recording callback gives you 512 samples every 12 milliseconds, and a timer thread wants to analyze the incoming samples every 100 milliseconds and show the results to the user in some way. The samples have to go from the recording callback in the recording thread to the VokaturiVoice buffer via VokaturiVoice_fill(). Meanwhile, the analysing thread retrieves samples from the VokaturiVoice with VokaturiVoice_extract(). You can achieve this magic by switching on the “multithreading” parameter when creating the VokaturiVoice. The procedure works like this:
// initialize at start-up with VokaturiVoice_create (fsamp, bufSize, 1):
VokaturiVoice ourVoice;

// called every 12 ms:
void recordingCallback (int numberOfSamples, int16_t samples []) {
	VokaturiVoice_fill_int16array (ourVoice, numberOfSamples, samples);
}

// called every 100 ms:
void timerCallback () {
	VokaturiQuality quality;
	VokaturiEmotionProbabilities emotionProbabilities;
	VokaturiVoice_extract (ourVoice, & quality, & emotionProbabilities);
	if (quality.valid)
		printf (
			"%.6f %.6f %.6f %.6f %.6f\n",
			emotionProbabilities.neutrality,
			emotionProbabilities.happiness,
			emotionProbabilities.sadness,
			emotionProbabilities.anger,
			emotionProbabilities.fear
		);
}
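For completeness, the initialization hinted at in the first comment could look like this. This is only a sketch, assuming a 44100-Hz input and a 10-second buffer (the same values used in the iOS example further down); ourInitialize is a hypothetical start-up hook of your app, not part of the Vokaturi API:

void ourInitialize () {
	ourVoice = VokaturiVoice_create (
		44100.0,   // sampling frequency in hertz
		441000,   // buffer size: 10 seconds of audio
		1   // multithreading: fill() and extract() run in different threads
	);
}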
The simplest app for real-time emotion measurement would include an audio input library such as portaudio. A bare version can look like this:
/*
	VokaListen.c
	public-domain sample code by Vokaturi, 2022-09-03
	(note that the Vokaturi functions are not public-domain)

	A program that calls the Vokaturi API
	to extract the emotions from speech in real time.
*/

#include <stdio.h>
#include <portaudio.h>
#include "../api/Vokaturi.h"

static int callback (const void *input, void *output,
	unsigned long frameCount,
	const PaStreamCallbackTimeInfo* timeInfo,
	PaStreamCallbackFlags statusFlags,
	void *userData
) {
	VokaturiVoice voice = userData;
	VokaturiVoice_fill_float32array (voice, frameCount, (float *) input);
	return paContinue;
}

int main (int argc, const char * argv[]) {
	Pa_Initialize ();
	const int samplingFrequency = 44100;   // hertz
	const double bufferDuration = 10.0;   // seconds
	const int bufferLength = samplingFrequency * bufferDuration;
	VokaturiVoice voice = VokaturiVoice_create (
		samplingFrequency,
		bufferLength,
		1   // because fill() and extract() operate in different threads
	);
	printf ("PLEASE START TO SPEAK\n");
	PaStream *stream;
	Pa_OpenDefaultStream (
		& stream,   // the return parameter
		1,   // mono input
		0,   // no output channels
		paFloat32,
		samplingFrequency,
		paFramesPerBufferUnspecified,
		callback,
		voice
	);
	double approximateTimeElapsed = 0.0;
		// will not include extract() processing time
	const double timeStep = 0.5;   // seconds
	Pa_StartStream (stream);
	while (Pa_IsStreamActive (stream)) {
		Pa_Sleep (timeStep * 1000);
		VokaturiQuality quality;
		VokaturiEmotionProbabilities emotionProbabilities;
		VokaturiVoice_extract (voice, & quality, & emotionProbabilities);
		approximateTimeElapsed += timeStep;
		if (quality.valid) {
			printf ("%5.1f time %5.0f N %5.0f H %5.0f S %5.0f A %5.0f F\n",
				approximateTimeElapsed,
				100 * emotionProbabilities.neutrality,
				100 * emotionProbabilities.happiness,
				100 * emotionProbabilities.sadness,
				100 * emotionProbabilities.anger,
				100 * emotionProbabilities.fear);
		} else {
			printf ("%5.1f no valid emotions\n", approximateTimeElapsed);
		}
	}
	Pa_StopStream (stream);   // will not be reached
	VokaturiVoice_destroy (voice);   // will not be reached
}
You would run this from the command line (a sample run is shown after the build commands below). Building the example requires installing portaudio (on the Mac via Homebrew and on Windows under Cygwin, for instance), so that portaudio.h is found in /usr/local/include or /usr/x86_64-w64-mingw32/sys-root/mingw/include/, and libportaudio.a is found in /usr/local/lib or /usr/x86_64-w64-mingw32/sys-root/mingw/lib/.
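On the Mac, for instance, installing portaudio through Homebrew could be as simple as the following (a sketch; the exact install prefix depends on your Homebrew setup):
brew install portaudio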
The build command on an Intel Mac would then be (after navigating to the root folder of the OpenVokaturi distribution):
clang -std=gnu11 -O3 -arch x86_64 -isysroot /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk -mmacosx-version-min=11.0 examples/VokaListen.c -I /usr/local/include /usr/local/lib/libportaudio.a lib/open/macos/OpenVokaturi-mac_intel64.o -framework CoreServices -framework CoreAudio -framework AudioUnit -framework AudioToolbox -o bin/OpenVokaListen-4-0-mac_intel64
and on Windows under Cygwin:
x86_64-w64-mingw32-gcc -std=gnu11 -O3 -mwindows -mconsole examples/VokaListen.c lib/open/win/OpenVokaturi-win64.o -static -lportaudio -lwinmm -lsetupapi -lole32 -o bin/OpenVokaListen-4-0-win64.exe
On Linux, perhaps after installing portaudio19-dev, one can simply do:
gcc -std=gnu11 -O3 examples/VokaListen.c lib/open/linux/OpenVokaturi-linux.o -lm -lportaudio -o bin/OpenVokaListen-4-0-linux
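Running the resulting binary then prints one line every half second; with the printf formats used above, the output could look roughly like this (the emotion numbers here are purely illustrative):
bin/OpenVokaListen-4-0-linux
PLEASE START TO SPEAK
  0.5 no valid emotions
  1.0 time    52 N    11 H    16 S    13 A     8 F
  1.5 time    48 N    14 H    17 S    12 A     9 F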
The files VokaMonoOpen-4-0-ios.zip and VokaStereoOpen-4-0-ios.zip contain complete demo apps called “VokaMono” and “VokaStereo”, respectively, which you can open directly with Xcode. Each of these projects contains a copy of the OpenVokaturi-4-0-ios.a library.
This section describes how Vokaturi can be used in real time on iOS, by explaining parts of the VokaMono demo app.
This is how the recording callback would fit in the aurioTouch source code from Apple:
static VokaturiVoice theVoice;

struct CallbackData {
	AudioUnit rioUnit;
	BOOL *audioChainIsBeingReconstructed;
	CallbackData (): rioUnit (NULL), audioChainIsBeingReconstructed (NULL) {}
} cd;

static OSStatus renderCallback (
	void *inRefCon,
	AudioUnitRenderActionFlags *ioActionFlags,
	const AudioTimeStamp *inTimeStamp,
	UInt32 inBusNumber,
	UInt32 inNumberFrames,
	AudioBufferList *ioData
) {
	OSStatus err = noErr;
	if (*cd.audioChainIsBeingReconstructed == NO) {
		// we are calling AudioUnitRender on the input bus of AURemoteIO
		// this will store the audio data captured by
		// the microphone in ioData
		err = AudioUnitRender (cd.rioUnit, ioActionFlags, inTimeStamp, 1,
			inNumberFrames, ioData);
		float *source = (float *) ioData -> mBuffers [0]. mData;
		if (theVoice)
			VokaturiVoice_fill_float32array (theVoice,
				inNumberFrames, source);
		/*
			The audio unit is a bidirectional one:
			it does both input and output.
			Silence the output sound.
		*/
		for (int i = 0; i < ioData -> mNumberBuffers; ++ i)
			memset (ioData -> mBuffers [i]. mData, 0,
				ioData -> mBuffers [i]. mDataByteSize);
	}
	return err;
}
In the GUI thread, we find (in Objective-C):
- (void) timerCallback
{
	if (! theVoice) {
		theVoice = VokaturiVoice_create (
			44100.0,   // sampling frequency in hertz
			441000,   // the buffer size (10 seconds)
			1   // because fill() and extract() run in different threads
		);
		if (! theVoice)
			return;
	}
	static VokaturiQuality quality;
	static VokaturiEmotionProbabilities emotionProbabilities;
	VokaturiVoice_extract (theVoice, & quality, & emotionProbabilities);
	if (quality.valid)
		ourShowInGUI (
			emotionProbabilities.neutrality,
			emotionProbabilities.happiness,
			emotionProbabilities.sadness,
			emotionProbabilities.anger,
			emotionProbabilities.fear
		);
}
Here is how the iOS audio elements are initialized (based on source code from aurioTouch by Apple):
/*
	AudioController.h
	By Vokaturi 2016-04-17, with source code from aurioTouch by Apple.
*/
#import <AudioToolbox/AudioToolbox.h>
#import <AVFoundation/AVFoundation.h>

@interface AudioController : NSObject {
	AudioUnit _rioUnit;
	AVAudioPlayer* _audioPlayer;   // for button pressed sound
	BOOL _audioChainIsBeingReconstructed;
}
@property (nonatomic, assign, readonly) BOOL audioChainIsBeingReconstructed;
- (OSStatus) startIOUnit;
- (OSStatus) stopIOUnit;
@end
/*
	AudioController.mm
	By Vokaturi 2022-08-25, with source code from aurioTouch by Apple.
*/
#import "AudioController.h"

// Framework includes
#import <AVFoundation/AVAudioSession.h>

- (void) setupIOUnit
{
	// Create a new instance of AURemoteIO
	AudioComponentDescription desc;
	desc.componentType = kAudioUnitType_Output;
	desc.componentSubType = kAudioUnitSubType_RemoteIO;
	desc.componentManufacturer = kAudioUnitManufacturer_Apple;
	desc.componentFlags = 0;
	desc.componentFlagsMask = 0;
	AudioComponent comp = AudioComponentFindNext (NULL, & desc);
	AudioComponentInstanceNew (comp, & _rioUnit);
	/*
		Enable input and output on AURemoteIO.
		Input is enabled on the input scope of the input element.
		Output is enabled on the output scope of the output element.
	*/
	UInt32 one = 1;
	AudioUnitSetProperty (_rioUnit, kAudioOutputUnitProperty_EnableIO,
		kAudioUnitScope_Input, 1, & one, sizeof one);
	AudioUnitSetProperty (_rioUnit, kAudioOutputUnitProperty_EnableIO,
		kAudioUnitScope_Output, 0, & one, sizeof one);
	/*
		Explicitly set the input and output client formats:
		sample rate = 44100 Hz,
		number of channels = 1,
		format = 32-bit floating point
	*/
	AudioStreamBasicDescription ioFormat;
	int numberOfChannels = 1;   // set to 1 for mono, or 2 for stereo
	bool channelsAreInterleaved = false;
		// true: left[0], right[0], left[1], right[1]
		// false: separate buffers for left and right
	ioFormat. mSampleRate = 44100;
	ioFormat. mFormatID = kAudioFormatLinearPCM;
	ioFormat. mFormatFlags =
		kAudioFormatFlagsNativeEndian |
		kAudioFormatFlagIsPacked |
		kAudioFormatFlagIsFloat |
		( channelsAreInterleaved ? 0 : kAudioFormatFlagIsNonInterleaved );
	ioFormat. mBytesPerPacket = sizeof (float) *
		( channelsAreInterleaved ? numberOfChannels : 1 );
	ioFormat. mFramesPerPacket = 1;
	ioFormat. mBytesPerFrame = ioFormat. mBytesPerPacket;
	ioFormat. mChannelsPerFrame = numberOfChannels;
	ioFormat. mBitsPerChannel = sizeof (float) * 8;
	ioFormat. mReserved = 0;
	AudioUnitSetProperty (_rioUnit, kAudioUnitProperty_StreamFormat,
		kAudioUnitScope_Output, 1, & ioFormat, sizeof ioFormat);
	AudioUnitSetProperty (_rioUnit, kAudioUnitProperty_StreamFormat,
		kAudioUnitScope_Input, 0, & ioFormat, sizeof ioFormat);
	/*
		Set the MaximumFramesPerSlice property.
		This property is used to describe to an audio unit
		the maximum number of samples it will be asked to produce on
		any single given call to AudioUnitRender.
	*/
	UInt32 maxFramesPerSlice = 4096;
	AudioUnitSetProperty (
		_rioUnit, kAudioUnitProperty_MaximumFramesPerSlice,
		kAudioUnitScope_Global, 0,
		& maxFramesPerSlice, sizeof maxFramesPerSlice
	);
	/*
		Get the property value back from AURemoteIO.
		We are going to use this value to allocate buffers accordingly.
	*/
	UInt32 propSize = sizeof (UInt32);
	AudioUnitGetProperty (
		_rioUnit, kAudioUnitProperty_MaximumFramesPerSlice,
		kAudioUnitScope_Global, 0,
		& maxFramesPerSlice, & propSize
	);
	/*
		We need references to certain data in the render callback.
		This simple struct is used to hold that information.
	*/
	cd.rioUnit = _rioUnit;
	cd.audioChainIsBeingReconstructed = &_audioChainIsBeingReconstructed;
	/*
		Set the render callback on AURemoteIO.
	*/
	AURenderCallbackStruct renderCallbackStruct;
	renderCallbackStruct.inputProc = renderCallback;
	renderCallbackStruct.inputProcRefCon = NULL;
	AudioUnitSetProperty (
		_rioUnit, kAudioUnitProperty_SetRenderCallback,
		kAudioUnitScope_Input, 0,
		& renderCallbackStruct, sizeof renderCallbackStruct
	);
	/*
		Initialize the AURemoteIO instance.
	*/
	AudioUnitInitialize (_rioUnit);
}
- (OSStatus) startIOUnit
{
	OSStatus err = AudioOutputUnitStart (_rioUnit);
	if (err)
		NSLog (@"couldn't start AURemoteIO: %d", (int) err);
	return err;
}
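The header also declares stopIOUnit, whose implementation is not part of the excerpt above. A minimal sketch, assuming the symmetric Core Audio call, could be:

- (OSStatus) stopIOUnit
{
	OSStatus err = AudioOutputUnitStop (_rioUnit);   // counterpart of AudioOutputUnitStart
	if (err)
		NSLog (@"couldn't stop AURemoteIO: %d", (int) err);
	return err;
}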
The files VokaMonoOpen-4-0-android.zip and VokaStereoOpen-4-0-android.zip contain complete demo apps called “VokaMono” and “VokaStereo”, respectively, which you can open directly with Android Studio. Each of these projects contains a copy of the OpenVokaturi-4-0-android.aar library.
If you want to experiment with the demo code, please understand the following about the licence: you may include OpenVokaturi-4-0-ios.a or OpenVokaturi-4-0-android.aar (or any other OpenVokaturi library, or the OpenVokaturi source code) in your app only if you distribute your app under the General Public Licence, i.e. as open source. This is because the open-source edition of the Vokaturi library is released under the General Public Licence.