84 lines
2.6 KiB
C++
84 lines
2.6 KiB
C++
#include "whisper.h"
|
|
#include "common-whisper.h"
|
|
|
|
#include <cstdio>
|
|
#include <string>
|
|
|
|
#ifdef NDEBUG
|
|
#undef NDEBUG
|
|
#endif
|
|
#include <cassert>
|
|
|
|
void assert_default_params(const struct whisper_vad_params & params) {
|
|
assert(params.threshold == 0.5);
|
|
assert(params.min_speech_duration_ms == 250);
|
|
assert(params.min_silence_duration_ms == 100);
|
|
assert(params.samples_overlap == 0.1f);
|
|
}
|
|
|
|
void assert_default_context_params(const struct whisper_vad_context_params & params) {
|
|
assert(params.n_threads == 4);
|
|
assert(params.use_gpu == false);
|
|
assert(params.gpu_device == 0);
|
|
}
|
|
|
|
void test_detect_speech(
|
|
struct whisper_vad_context * vctx,
|
|
struct whisper_vad_params params,
|
|
const float * pcmf32,
|
|
int n_samples) {
|
|
assert(whisper_vad_detect_speech(vctx, pcmf32, n_samples));
|
|
assert(whisper_vad_n_probs(vctx) == 344);
|
|
assert(whisper_vad_probs(vctx) != nullptr);
|
|
}
|
|
|
|
struct whisper_vad_segments * test_detect_timestamps(
|
|
struct whisper_vad_context * vctx,
|
|
struct whisper_vad_params params) {
|
|
struct whisper_vad_segments * timestamps = whisper_vad_segments_from_probs(vctx, params);
|
|
assert(whisper_vad_segments_n_segments(timestamps) == 5);
|
|
|
|
for (int i = 0; i < whisper_vad_segments_n_segments(timestamps); ++i) {
|
|
printf("VAD segment %d: start = %.2f, end = %.2f\n", i,
|
|
whisper_vad_segments_get_segment_t0(timestamps, i),
|
|
whisper_vad_segments_get_segment_t1(timestamps, i));
|
|
}
|
|
|
|
return timestamps;
|
|
}
|
|
|
|
int main() {
|
|
std::string vad_model_path = VAD_MODEL_PATH;
|
|
std::string sample_path = SAMPLE_PATH;
|
|
|
|
// Load the sample audio file
|
|
std::vector<float> pcmf32;
|
|
std::vector<std::vector<float>> pcmf32s;
|
|
assert(read_audio_data(sample_path.c_str(), pcmf32, pcmf32s, false));
|
|
assert(pcmf32.size() > 0);
|
|
assert(pcmf32s.size() == 0); // no stereo vector
|
|
|
|
// Load the VAD model
|
|
struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
|
|
assert_default_context_params(ctx_params);
|
|
|
|
struct whisper_vad_context * vctx = whisper_vad_init_from_file_with_params(
|
|
vad_model_path.c_str(),
|
|
ctx_params);
|
|
assert(vctx != nullptr);
|
|
|
|
struct whisper_vad_params params = whisper_vad_default_params();
|
|
assert_default_params(params);
|
|
|
|
// Test speech probabilites
|
|
test_detect_speech(vctx, params, pcmf32.data(), pcmf32.size());
|
|
|
|
// Test speech timestamps (uses speech probabilities from above)
|
|
struct whisper_vad_segments * timestamps = test_detect_timestamps(vctx, params);
|
|
|
|
whisper_vad_free_segments(timestamps);
|
|
whisper_vad_free(vctx);
|
|
|
|
return 0;
|
|
}
|