Usage¶
The aim of opensmile
is to provide
a high-level interface to openSMILE.
It ships pre-compiled binaries and default feature sets,
but it’s also possible to run custom config files.
Getting ready¶
Let’s do some imports and load some files from the emodb database.
import os
import time
import numpy as np
import pandas as pd
import audb
import audiofile
import opensmile
db = audb.load(
"emodb",
version="1.1.1",
format="wav",
mixdown=True,
sampling_rate=16000,
media="wav/03a01.*", # load subset
full_path=False,
verbose=False,
)
Process signal¶
Read the first ten seconds of a file into memory.
file = os.path.join(db.root, db.files[0])
signal, sampling_rate = audiofile.read(
file,
duration=10,
always_2d=True,
)
We set up a feature extractor for functionals of a pre-defined feature set.
smile = opensmile.Smile(
feature_set=opensmile.FeatureSet.eGeMAPSv02,
feature_level=opensmile.FeatureLevel.Functionals,
)
smile.feature_names
['F0semitoneFrom27.5Hz_sma3nz_amean',
'F0semitoneFrom27.5Hz_sma3nz_stddevNorm',
'F0semitoneFrom27.5Hz_sma3nz_percentile20.0',
'F0semitoneFrom27.5Hz_sma3nz_percentile50.0',
'F0semitoneFrom27.5Hz_sma3nz_percentile80.0',
'F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2',
'F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope',
'F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope',
'F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope',
'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope',
'loudness_sma3_amean',
'loudness_sma3_stddevNorm',
'loudness_sma3_percentile20.0',
'loudness_sma3_percentile50.0',
'loudness_sma3_percentile80.0',
'loudness_sma3_pctlrange0-2',
'loudness_sma3_meanRisingSlope',
'loudness_sma3_stddevRisingSlope',
'loudness_sma3_meanFallingSlope',
'loudness_sma3_stddevFallingSlope',
'spectralFlux_sma3_amean',
'spectralFlux_sma3_stddevNorm',
'mfcc1_sma3_amean',
'mfcc1_sma3_stddevNorm',
'mfcc2_sma3_amean',
'mfcc2_sma3_stddevNorm',
'mfcc3_sma3_amean',
'mfcc3_sma3_stddevNorm',
'mfcc4_sma3_amean',
'mfcc4_sma3_stddevNorm',
'jitterLocal_sma3nz_amean',
'jitterLocal_sma3nz_stddevNorm',
'shimmerLocaldB_sma3nz_amean',
'shimmerLocaldB_sma3nz_stddevNorm',
'HNRdBACF_sma3nz_amean',
'HNRdBACF_sma3nz_stddevNorm',
'logRelF0-H1-H2_sma3nz_amean',
'logRelF0-H1-H2_sma3nz_stddevNorm',
'logRelF0-H1-A3_sma3nz_amean',
'logRelF0-H1-A3_sma3nz_stddevNorm',
'F1frequency_sma3nz_amean',
'F1frequency_sma3nz_stddevNorm',
'F1bandwidth_sma3nz_amean',
'F1bandwidth_sma3nz_stddevNorm',
'F1amplitudeLogRelF0_sma3nz_amean',
'F1amplitudeLogRelF0_sma3nz_stddevNorm',
'F2frequency_sma3nz_amean',
'F2frequency_sma3nz_stddevNorm',
'F2bandwidth_sma3nz_amean',
'F2bandwidth_sma3nz_stddevNorm',
'F2amplitudeLogRelF0_sma3nz_amean',
'F2amplitudeLogRelF0_sma3nz_stddevNorm',
'F3frequency_sma3nz_amean',
'F3frequency_sma3nz_stddevNorm',
'F3bandwidth_sma3nz_amean',
'F3bandwidth_sma3nz_stddevNorm',
'F3amplitudeLogRelF0_sma3nz_amean',
'F3amplitudeLogRelF0_sma3nz_stddevNorm',
'alphaRatioV_sma3nz_amean',
'alphaRatioV_sma3nz_stddevNorm',
'hammarbergIndexV_sma3nz_amean',
'hammarbergIndexV_sma3nz_stddevNorm',
'slopeV0-500_sma3nz_amean',
'slopeV0-500_sma3nz_stddevNorm',
'slopeV500-1500_sma3nz_amean',
'slopeV500-1500_sma3nz_stddevNorm',
'spectralFluxV_sma3nz_amean',
'spectralFluxV_sma3nz_stddevNorm',
'mfcc1V_sma3nz_amean',
'mfcc1V_sma3nz_stddevNorm',
'mfcc2V_sma3nz_amean',
'mfcc2V_sma3nz_stddevNorm',
'mfcc3V_sma3nz_amean',
'mfcc3V_sma3nz_stddevNorm',
'mfcc4V_sma3nz_amean',
'mfcc4V_sma3nz_stddevNorm',
'alphaRatioUV_sma3nz_amean',
'hammarbergIndexUV_sma3nz_amean',
'slopeUV0-500_sma3nz_amean',
'slopeUV500-1500_sma3nz_amean',
'spectralFluxUV_sma3nz_amean',
'loudnessPeaksPerSec',
'VoicedSegmentsPerSec',
'MeanVoicedSegmentLengthSec',
'StddevVoicedSegmentLengthSec',
'MeanUnvoicedSegmentLength',
'StddevUnvoicedSegmentLength',
'equivalentSoundLevel_dBp']
And extract features for the signal.
smile.process_signal(
signal,
sampling_rate
)
F0semitoneFrom27.5Hz_sma3nz_amean | F0semitoneFrom27.5Hz_sma3nz_stddevNorm | ... | StddevUnvoicedSegmentLength | equivalentSoundLevel_dBp | ||
---|---|---|---|---|---|---|
start | end | |||||
0 days | 0 days 00:00:01.898250 | 31.188166 | 0.15256 | ... | 0.036422 | -21.647932 |
1 rows × 88 columns
Now we create a feature extractor for low-level descriptors (LLDs).
smile = opensmile.Smile(
feature_set=opensmile.FeatureSet.eGeMAPSv02,
feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
)
smile.feature_names
['Loudness_sma3',
'alphaRatio_sma3',
'hammarbergIndex_sma3',
'slope0-500_sma3',
'slope500-1500_sma3',
'spectralFlux_sma3',
'mfcc1_sma3',
'mfcc2_sma3',
'mfcc3_sma3',
'mfcc4_sma3',
'F0semitoneFrom27.5Hz_sma3nz',
'jitterLocal_sma3nz',
'shimmerLocaldB_sma3nz',
'HNRdBACF_sma3nz',
'logRelF0-H1-H2_sma3nz',
'logRelF0-H1-A3_sma3nz',
'F1frequency_sma3nz',
'F1bandwidth_sma3nz',
'F1amplitudeLogRelF0_sma3nz',
'F2frequency_sma3nz',
'F2bandwidth_sma3nz',
'F2amplitudeLogRelF0_sma3nz',
'F3frequency_sma3nz',
'F3bandwidth_sma3nz',
'F3amplitudeLogRelF0_sma3nz']
And re-run feature extraction.
smile = opensmile.Smile(
feature_set=opensmile.FeatureSet.eGeMAPSv02,
feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
)
smile.process_signal(
signal,
sampling_rate
)
Loudness_sma3 | alphaRatio_sma3 | ... | F3bandwidth_sma3nz | F3amplitudeLogRelF0_sma3nz | ||
---|---|---|---|---|---|---|
start | end | |||||
0 days 00:00:00 | 0 days 00:00:00.020000 | 0.036621 | -22.231188 | ... | 863.510254 | -201.0 |
0 days 00:00:00.010000 | 0 days 00:00:00.030000 | 0.035347 | -24.530127 | ... | 895.209167 | -201.0 |
0 days 00:00:00.020000 | 0 days 00:00:00.040000 | 0.034774 | -25.316446 | ... | 959.248840 | -201.0 |
0 days 00:00:00.030000 | 0 days 00:00:00.050000 | 0.037970 | -21.849371 | ... | 871.751770 | -201.0 |
0 days 00:00:00.040000 | 0 days 00:00:00.060000 | 0.038702 | -21.143106 | ... | 882.373474 | -201.0 |
... | ... | ... | ... | ... | ... | ... |
0 days 00:00:01.800000 | 0 days 00:00:01.820000 | 0.139252 | -2.237949 | ... | 852.564026 | -201.0 |
0 days 00:00:01.810000 | 0 days 00:00:01.830000 | 0.117584 | -7.198416 | ... | 860.577881 | -201.0 |
0 days 00:00:01.820000 | 0 days 00:00:01.840000 | 0.093313 | -12.354507 | ... | 888.525818 | -201.0 |
0 days 00:00:01.830000 | 0 days 00:00:01.850000 | 0.080512 | -14.350678 | ... | 933.154114 | -201.0 |
0 days 00:00:01.840000 | 0 days 00:00:01.898250 | 0.067301 | -14.426320 | ... | 883.508606 | -201.0 |
185 rows × 25 columns
Logging¶
To know what happens under the hood we can create a log file.
smile = opensmile.Smile(
feature_set=opensmile.FeatureSet.eGeMAPSv02,
feature_level=opensmile.FeatureLevel.Functionals,
loglevel=2,
logfile="smile.log",
)
smile.process_signal(
signal,
sampling_rate
)
with open("./smile.log", "r") as fp:
log = fp.readlines()
log
['[ 16.12.2024 - 16:20:56 ]\n',
' (MSG) [2] SMILEapi: openSMILE starting!\n',
'[ 16.12.2024 - 16:20:56 ]\n',
' (MSG) [2] SMILEapi: config file is: /home/runner/work/opensmile-python/opensmile-python/opensmile/core/config/egemaps/v02/eGeMAPSv02.conf\n',
'[ 16.12.2024 - 16:20:56 ]\n',
' (MSG) [2] cComponentManager: successfully registered 102 component types.\n',
'[ 16.12.2024 - 16:20:56 ]\n',
" (MSG) [2] instance 'gemapsv01b_logSpectral': logSpecFloor = -140.00 (specFloor = 1.000000e-14)\n",
'[ 16.12.2024 - 16:20:56 ]\n',
" (MSG) [2] instance 'egemapsv02_logSpectral_flux': logSpecFloor = -140.00 (specFloor = 1.000000e-14)\n",
'[ 16.12.2024 - 16:20:56 ]\n',
' (MSG) [2] cComponentManager: successfully finished createInstances (73 component instances were finalised, 1 data memories were finalised)\n',
'[ 16.12.2024 - 16:20:56 ]\n',
' (MSG) [2] cComponentManager: starting single thread processing loop\n',
'[ 16.12.2024 - 16:20:56 ]\n',
' (WRN) [2] cComponentManager: The following component(s) could not perform any work because destination levels are full but no other component performed any work either: egemapsv02_smoE. Processing will possibly be incomplete. For more details, enable the execDebug option of cComponentManager.\n',
'[ 16.12.2024 - 16:20:56 ]\n',
' (WRN) [2] cComponentManager: The following component(s) could not perform any work because destination levels are full but no other component performed any work either: egemapsv02_smoE. Processing will possibly be incomplete. For more details, enable the execDebug option of cComponentManager.\n',
'[ 16.12.2024 - 16:20:56 ]\n',
' (WRN) [2] cComponentManager: The following component(s) could not perform any work because destination levels are full but no other component performed any work either: egemapsv02_smoE. Processing will possibly be incomplete. For more details, enable the execDebug option of cComponentManager.\n',
'[ 16.12.2024 - 16:20:56 ]\n',
' (MSG) [2] cComponentManager: Processing finished! System ran for 199 ticks.\n']
Custom config¶
We can create a custom config.
config_str = """
[componentInstances:cComponentManager]
instance[dataMemory].type=cDataMemory
;;; default source
[componentInstances:cComponentManager]
instance[dataMemory].type=cDataMemory
;;; source
\{\cm[source{?}:include external source]}
;;; main section
[componentInstances:cComponentManager]
instance[framer].type = cFramer
instance[lld].type = cEnergy
instance[func].type=cFunctionals
[framer:cFramer]
reader.dmLevel = wave
writer.dmLevel = frames
copyInputName = 1
frameMode = fixed
frameSize = 0.025000
frameStep = 0.010000
frameCenterSpecial = left
noPostEOIprocessing = 1
[lld:cEnergy]
reader.dmLevel = frames
writer.dmLevel = lld
\{\cm[bufferModeRbConf{?}:path to included config to set the buffer mode for the standard ringbuffer levels]}
nameAppend = energy
copyInputName = 1
rms = 1
log = 1
[func:cFunctionals]
reader.dmLevel=lld
writer.dmLevel=func
copyInputName = 1
\{\cm[bufferModeRbConf]}
\{\cm[frameModeFunctionalsConf{?}:path to included config to set frame mode for all functionals]}
functionalsEnabled=Moments
Moments.variance = 0
Moments.stddev = 1
Moments.skewness = 0
Moments.kurtosis = 0
Moments.amean = 1
Moments.doRatioLimit = 0
;;; sink
\{\cm[sink{?}:include external sink]}
"""
It’s important to always set the
source
and sink
as we did above.
But we are free to choose the levels.
In the config above we have defined two
levels, "func"
and "lld".
Now, we simply pass the level
we are interested in.
with open("my.conf", "w") as fp:
fp.write(config_str)
smile = opensmile.Smile(
feature_set="my.conf",
feature_level="func",
)
smile.process_signal(
signal,
sampling_rate
)
pcm_RMSenergy_stddev | pcm_RMSenergy_amean | pcm_LOGenergy_stddev | pcm_LOGenergy_amean | ||
---|---|---|---|---|---|
start | end | ||||
0 days | 0 days 00:00:01.898250 | 0.000195 | 0.001623 | 0.23714 | -12.861267 |
And…
smile = opensmile.Smile(
feature_set="my.conf",
feature_level="lld",
)
smile.process_signal(
signal,
sampling_rate,
)
pcm_RMSenergy | pcm_LOGenergy | ||
---|---|---|---|
start | end | ||
0 days 00:00:00 | 0 days 00:00:00.025000 | 0.001434 | -13.095183 |
0 days 00:00:00.010000 | 0 days 00:00:00.035000 | 0.001859 | -12.575962 |
0 days 00:00:00.020000 | 0 days 00:00:00.045000 | 0.001858 | -12.576361 |
0 days 00:00:00.030000 | 0 days 00:00:00.055000 | 0.001527 | -12.968824 |
0 days 00:00:00.040000 | 0 days 00:00:00.065000 | 0.001437 | -13.090006 |
0 days 00:00:00.050000 | 0 days 00:00:01.898250 | 0.001395 | -13.149714 |
Resample¶
It’s possible to resample the input signals on the fly.
smile = opensmile.Smile(
feature_set=opensmile.FeatureSet.eGeMAPSv02,
feature_level=opensmile.FeatureLevel.Functionals,
sampling_rate=8000,
resample=True,
)
smile.process_signal(
signal,
sampling_rate,
)
F0semitoneFrom27.5Hz_sma3nz_amean | F0semitoneFrom27.5Hz_sma3nz_stddevNorm | ... | StddevUnvoicedSegmentLength | equivalentSoundLevel_dBp | ||
---|---|---|---|---|---|---|
start | end | |||||
0 days | 0 days 00:00:01.898250 | 31.717445 | 0.140366 | ... | 0.066639 | -21.728216 |
1 rows × 88 columns
Multi-channel¶
We can process multi-channel audio. Note that we need to set the channels we want to process when we create the feature extractor.
smile = opensmile.Smile(
feature_set=opensmile.FeatureSet.eGeMAPSv02,
feature_level=opensmile.FeatureLevel.Functionals,
channels=[0, -1], # process first and last channel
)
signal = np.concatenate([signal, signal, signal], axis=0)
smile.process_signal(
signal,
sampling_rate,
)
0 | ... | -1 | ||||
---|---|---|---|---|---|---|
F0semitoneFrom27.5Hz_sma3nz_amean | F0semitoneFrom27.5Hz_sma3nz_stddevNorm | ... | StddevUnvoicedSegmentLength | equivalentSoundLevel_dBp | ||
start | end | |||||
0 days | 0 days 00:00:01.898250 | 31.188166 | 0.15256 | ... | 0.036422 | -21.647932 |
1 rows × 176 columns
File input¶
We can extract features from files. Note that we only process the first two seconds of each file.
files = db.files # pick files
smile = opensmile.Smile(
feature_set=opensmile.FeatureSet.eGeMAPSv02,
feature_level=opensmile.FeatureLevel.Functionals,
)
smile.process_files(
files,
ends=["2s"] * len(files),
root=db.root,
)
F0semitoneFrom27.5Hz_sma3nz_amean | F0semitoneFrom27.5Hz_sma3nz_stddevNorm | ... | StddevUnvoicedSegmentLength | equivalentSoundLevel_dBp | |||
---|---|---|---|---|---|---|---|
file | start | end | |||||
wav/03a01Fa.wav | 0 days | 0 days 00:00:01.898250 | 31.188166 | 0.152560 | ... | 0.036422 | -21.647932 |
wav/03a01Nc.wav | 0 days | 0 days 00:00:01.611250 | 25.022938 | 0.148540 | ... | 0.049816 | -18.010019 |
wav/03a01Wa.wav | 0 days | 0 days 00:00:01.877812500 | 34.292320 | 0.102067 | ... | 0.060339 | -17.855310 |
3 rows × 88 columns
audformat¶
We can extract features from an index that follows the audformat conventions. Note that we set five workers to speed up the processing.
index = db["emotion"].index # pick table index
smile = opensmile.Smile(
feature_set=opensmile.FeatureSet.eGeMAPSv02,
feature_level=opensmile.FeatureLevel.Functionals,
num_workers=5,
)
smile.process_index(
index,
root=db.root,
)
F0semitoneFrom27.5Hz_sma3nz_amean | F0semitoneFrom27.5Hz_sma3nz_stddevNorm | ... | StddevUnvoicedSegmentLength | equivalentSoundLevel_dBp | |||
---|---|---|---|---|---|---|---|
file | start | end | |||||
wav/03a01Fa.wav | 0 days | 0 days 00:00:01.898250 | 31.188166 | 0.152560 | ... | 0.036422 | -21.647932 |
wav/03a01Nc.wav | 0 days | 0 days 00:00:01.611250 | 25.022938 | 0.148540 | ... | 0.049816 | -18.010019 |
wav/03a01Wa.wav | 0 days | 0 days 00:00:01.877812500 | 34.292320 | 0.102067 | ... | 0.060339 | -17.855310 |
3 rows × 88 columns