ruby : VAD separately from ASR (#3518)

* Add Whisper::VAD::Context

* Add test for Whisper::VAD::Context

* Add Whisper::VAD::Segment

* Add Whisper::VAD::Segments

* Add Whisper::VAD::Context#detect

* Define Whisper::VAD::Segments#each

* Define Whisper::VAD::Segment#start_time and #end_time

* Define Whisper::VAD::Segment#deconstruct_keys

* Add tests for Whisper::VAD family

* Add signatures for VAD family

* Add document on VAD in README

* Define Whisper::VAD::Segments#length

* Add test for Whisper::VAD::Segments#length

* Add signature of Segments#length

* Make vad_segments responsible to initialize VAD::Segments

* Remove meaningless argument check

* Check NULL of segments member

* Add tests for Whisper::VAD::Segments

* Initialize Whisper::VAD::Segment on .allocate

* Add tests for Whisper::VAD::Segment

* Check NULL of context member

* Add test for Whisper::VAD::Context.allocate
This commit is contained in:
KITAITI Makoto 2025-11-13 10:15:26 +09:00 committed by GitHub
parent a1867e0dad
commit d9b7613b34
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 528 additions and 0 deletions

View File

@ -324,6 +324,22 @@ whisper
The second argument `samples` may be an array, an object with `length` and `each` method, or a MemoryView. If you can prepare audio data as C array and export it as a MemoryView, whispercpp accepts and works with it with zero copy.
Using VAD separately from ASR
-----------------------------
The VAD feature is useful on its own. You can use it separately from ASR:
```ruby
vad = Whisper::VAD::Context.new("silero-v5.1.2")
vad
.detect("path/to/audio.wav", Whisper::VAD::Params.new)
.each_with_index do |segment, index|
segment => {start_time: st, end_time: ed} # `Segment` responds to `#deconstruct_keys`
puts "[%{nth}: %{st} --> %{ed}]" % {nth: index + 1, st:, ed:}
end
```
Development
-----------

View File

@ -6,7 +6,10 @@ VALUE mWhisper;
VALUE mVAD;
VALUE cContext;
VALUE cParams;
VALUE cVADContext;
VALUE cVADParams;
VALUE cVADSegments;
VALUE cVADSegment;
VALUE eError;
VALUE cSegment;
@ -37,6 +40,9 @@ extern void init_ruby_whisper_error(VALUE *mWhisper);
extern void init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cSegment);
extern void init_ruby_whisper_model(VALUE *mWhisper);
extern void init_ruby_whisper_vad_params(VALUE *mVAD);
extern void init_ruby_whisper_vad_context(VALUE *mVAD);
extern void init_ruby_whisper_vad_segment(VALUE *mVAD);
extern void init_ruby_whisper_vad_segments(VALUE *mVAD);
extern void register_callbacks(ruby_whisper_params *rwp, VALUE *context);
/*
@ -170,6 +176,9 @@ void Init_whisper() {
init_ruby_whisper_segment(&mWhisper, &cContext);
init_ruby_whisper_model(&mWhisper);
init_ruby_whisper_vad_params(&mVAD);
init_ruby_whisper_vad_segment(&mVAD);
init_ruby_whisper_vad_segments(&mVAD);
init_ruby_whisper_vad_context(&mVAD);
rb_require("whisper/context");
rb_require("whisper/segment");

View File

@ -37,4 +37,17 @@ typedef struct {
VALUE context;
} ruby_whisper_model;
/* Native payload of a Whisper::VAD::Segments object. */
typedef struct {
/* Native segment list; NULL until a detection result is attached. */
struct whisper_vad_segments *segments;
} ruby_whisper_vad_segments;
/* Native payload of a Whisper::VAD::Segment object: a position inside a segments object. */
typedef struct {
/* Ruby Whisper::VAD::Segments object this entry belongs to (Qnil when unset). */
VALUE segments;
/* Position of this entry inside the segments list (-1 when unset). */
int index;
} ruby_whisper_vad_segment;
/* Native payload of a Whisper::VAD::Context object. */
typedef struct {
/* Native VAD context; NULL until #initialize has loaded a model. */
struct whisper_vad_context *context;
} ruby_whisper_vad_context;
#endif

View File

@ -29,6 +29,9 @@ ruby_whisper_segment_memsize(const void *p)
if (!rws) {
return 0;
}
if (rws->index) {
size += sizeof(rws->index);
}
return size;
}

View File

@ -0,0 +1,75 @@
#include <ruby.h>
#include "ruby_whisper.h"
extern ID id_to_s;
extern VALUE cVADContext;
extern VALUE ruby_whisper_vad_detect(VALUE self, VALUE file_path, VALUE params);
extern VALUE ruby_whisper_normalize_model_path(VALUE model_path);
/*
 * dsize callback for Whisper::VAD::Context.
 *
 * Reports the size of the wrapper struct itself. The previous version used
 * sizeof(rwvc), which is the size of a pointer rather than of the struct, and
 * then added the size of the `context` pointer member a second time.
 *
 * The memory owned by the native context behind `context` is not included;
 * nothing visible in this file reports its size.
 *
 * Returns 0 when p is NULL.
 */
static size_t
ruby_whisper_vad_context_memsize(const void *p)
{
  const ruby_whisper_vad_context *rwvc = p;

  if (!rwvc) {
    return 0;
  }
  return sizeof(*rwvc);
}
/*
 * dfree callback for Whisper::VAD::Context.
 * Releases the native VAD context, if one was ever attached, and then the
 * wrapper struct itself.
 */
static void
ruby_whisper_vad_context_free(void *p)
{
  ruby_whisper_vad_context *wrapper = (ruby_whisper_vad_context *)p;
  struct whisper_vad_context *native = wrapper->context;

  if (native != NULL) {
    wrapper->context = NULL;
    whisper_vad_free(native);
  }

  xfree(wrapper);
}
/*
 * TypedData description for Whisper::VAD::Context.
 * Not static: the detect implementation refers to it through an extern
 * declaration.
 */
const rb_data_type_t ruby_whisper_vad_context_type = {
"ruby_whisper_vad_context",
/* No mark callback; custom free and memsize callbacks. */
{0, ruby_whisper_vad_context_free, ruby_whisper_vad_context_memsize,},
0, 0,
0
};
/*
 * Allocator for Whisper::VAD::Context.
 * Produces an instance whose native context is explicitly unset; #initialize
 * is what attaches a real one.
 */
static VALUE
ruby_whisper_vad_context_s_allocate(VALUE klass)
{
  ruby_whisper_vad_context *wrapper;
  VALUE instance;

  instance = TypedData_Make_Struct(klass, ruby_whisper_vad_context, &ruby_whisper_vad_context_type, wrapper);
  wrapper->context = NULL;

  return instance;
}
/*
 * call-seq:
 *   Whisper::VAD::Context.new(model_path) -> context
 *
 * Loads a VAD model and attaches the resulting native context to self.
 *
 * model_path is first passed through ruby_whisper_normalize_model_path and
 * the result is handed to whisper_vad_init_from_file_with_params together
 * with the default context params.
 *
 * Raises RuntimeError when the native context cannot be created.
 *
 * If self already owns a native context (initialize called again on the same
 * object), the old one is released before being replaced; previously it was
 * overwritten and leaked.
 */
static VALUE
ruby_whisper_vad_context_initialize(VALUE self, VALUE model_path)
{
  ruby_whisper_vad_context *rwvc;
  struct whisper_vad_context *context;

  model_path = ruby_whisper_normalize_model_path(model_path);
  context = whisper_vad_init_from_file_with_params(StringValueCStr(model_path), whisper_vad_default_context_params());
  if (context == NULL) {
    rb_raise(rb_eRuntimeError, "Failed to initialize whisper VAD context");
  }

  TypedData_Get_Struct(self, ruby_whisper_vad_context, &ruby_whisper_vad_context_type, rwvc);
  if (rwvc->context != NULL) {
    /* Re-initialization: do not leak the previously loaded context. */
    whisper_vad_free(rwvc->context);
  }
  rwvc->context = context;

  return Qnil;
}
/*
 * Defines Whisper::VAD::Context under the given VAD module and wires up its
 * allocator, #initialize and #detect.
 */
void init_ruby_whisper_vad_context(VALUE *mVAD)
{
  const VALUE vad_module = *mVAD;

  cVADContext = rb_define_class_under(vad_module, "Context", rb_cObject);

  rb_define_alloc_func(cVADContext, ruby_whisper_vad_context_s_allocate);
  rb_define_method(cVADContext, "initialize", ruby_whisper_vad_context_initialize, 1);
  rb_define_method(cVADContext, "detect", ruby_whisper_vad_detect, 2);
}

View File

@ -0,0 +1,50 @@
#include <ruby.h>
#include "ruby_whisper.h"
#include "common-whisper.h"
#include <string>
#include <vector>
#ifdef __cplusplus
extern "C" {
#endif
extern VALUE cVADSegments;
extern const rb_data_type_t ruby_whisper_vad_context_type;
extern const rb_data_type_t ruby_whisper_vad_params_type;
extern const rb_data_type_t ruby_whisper_vad_segments_type;
extern VALUE ruby_whisper_vad_segments_s_init(struct whisper_vad_segments *segments);
/*
 * call-seq:
 *   context.detect(file_path, params) -> Whisper::VAD::Segments
 *
 * Reads the audio file at file_path, runs voice activity detection on its
 * samples with the given Whisper::VAD::Params and wraps the native result in
 * a Whisper::VAD::Segments object.
 *
 * Raises RuntimeError when self has no native context, when the file cannot
 * be read as audio, or when detection fails. Raises TypeError when params is
 * not a Whisper::VAD::Params.
 *
 * rb_raise leaves this function via longjmp, which does not run C++
 * destructors. The sample buffers are therefore confined to an inner scope
 * and every rb_raise happens only while no C++ object with a non-trivial
 * destructor is alive, so failures no longer leak the decoded audio.
 */
VALUE
ruby_whisper_vad_detect(VALUE self, VALUE file_path, VALUE params) {
  ruby_whisper_vad_context *rwvc;
  ruby_whisper_vad_params *rwvp;
  const char *c_file_path;
  whisper_vad_segments *segments = nullptr;
  bool audio_loaded = false;

  TypedData_Get_Struct(self, ruby_whisper_vad_context, &ruby_whisper_vad_context_type, rwvc);
  if (rwvc->context == NULL) {
    rb_raise(rb_eRuntimeError, "Doesn't have reference to context internally");
  }
  TypedData_Get_Struct(params, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);

  /* May raise; done before any C++ container is constructed. */
  c_file_path = StringValueCStr(file_path);

  {
    std::vector<float> pcmf32;
    std::vector<std::vector<float>> pcmf32s;

    audio_loaded = read_audio_data(c_file_path, pcmf32, pcmf32s, false);
    if (audio_loaded) {
      segments = whisper_vad_segments_from_samples(rwvc->context, rwvp->params, pcmf32.data(), pcmf32.size());
    }
  } /* sample buffers are released here, before any rb_raise below */

  if (!audio_loaded) {
    rb_raise(rb_eRuntimeError, "Failed to open '%s' as WAV file\n", c_file_path);
  }
  if (segments == nullptr) {
    rb_raise(rb_eRuntimeError, "Failed to process audio\n");
  }

  return ruby_whisper_vad_segments_s_init(segments);
}
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,141 @@
#include <ruby.h>
#include "ruby_whisper.h"
#define N_KEY_NAMES 2
extern VALUE cVADSegment;
extern const rb_data_type_t ruby_whisper_vad_segments_type;
static VALUE sym_start_time;
static VALUE sym_end_time;
static VALUE key_names;
/*
 * dmark callback for Whisper::VAD::Segment.
 * Marks the referenced segments VALUE so the GC sees it as reachable from
 * this entry.
 */
static void
rb_whisper_vad_segment_mark(void *p)
{
  const ruby_whisper_vad_segment *entry = p;

  rb_gc_mark(entry->segments);
}
/*
 * dsize callback for Whisper::VAD::Segment.
 *
 * Reports the size of the wrapper struct. The previous version measured
 * sizeof(rwvs) (a pointer) and added sizeof(index) only when index was
 * non-zero, so the result depended on the segment's position.
 *
 * Returns 0 when p is NULL.
 */
static size_t
ruby_whisper_vad_segment_memsize(const void *p)
{
  const ruby_whisper_vad_segment *rwvs = p;

  if (!rwvs) {
    return 0;
  }
  /* Both members (segments VALUE and index) are part of the struct. */
  return sizeof(*rwvs);
}
/* TypedData description for Whisper::VAD::Segment. */
static const rb_data_type_t ruby_whisper_vad_segment_type = {
"ruby_whisper_vad_segment",
/* Custom mark (keeps the segments VALUE alive), default free, custom memsize. */
{rb_whisper_vad_segment_mark, RUBY_DEFAULT_FREE, ruby_whisper_vad_segment_memsize,},
0, 0,
0
};
/*
 * Allocator for Whisper::VAD::Segment.
 * The new entry refers to no segments object (Qnil) and has index -1.
 */
static VALUE
ruby_whisper_vad_segment_s_allocate(VALUE klass)
{
  ruby_whisper_vad_segment *entry;
  VALUE instance;

  instance = TypedData_Make_Struct(klass, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, entry);
  entry->index = -1;
  entry->segments = Qnil;

  return instance;
}
/*
 * Builds a Whisper::VAD::Segment that refers to position `index` of the
 * given segments object.
 */
VALUE
rb_whisper_vad_segment_s_new(VALUE segments, int index)
{
  ruby_whisper_vad_segment *entry;
  VALUE instance = ruby_whisper_vad_segment_s_allocate(cVADSegment);

  TypedData_Get_Struct(instance, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, entry);
  entry->index = index;
  entry->segments = segments;

  return instance;
}
/*
 * call-seq:
 *   segment.start_time -> Float
 *
 * Returns the native t0 value of this segment multiplied by 10.
 * NOTE(review): the factor presumably converts the native unit to
 * milliseconds; confirm against whisper.h.
 *
 * Raises TypeError when the segment does not refer to a segments object.
 */
static VALUE
ruby_whisper_vad_segment_get_start_time(VALUE self)
{
  ruby_whisper_vad_segment *entry;
  ruby_whisper_vad_segments *owner;
  float start;

  TypedData_Get_Struct(self, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, entry);
  TypedData_Get_Struct(entry->segments, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, owner);

  start = whisper_vad_segments_get_segment_t0(owner->segments, entry->index);

  return DBL2NUM(start * 10);
}
/*
 * call-seq:
 *   segment.end_time -> Float
 *
 * Returns the native t1 value of this segment multiplied by 10.
 * NOTE(review): the factor presumably converts the native unit to
 * milliseconds; confirm against whisper.h.
 *
 * Raises TypeError when the segment does not refer to a segments object.
 */
static VALUE
ruby_whisper_vad_segment_get_end_time(VALUE self)
{
  ruby_whisper_vad_segment *entry;
  ruby_whisper_vad_segments *owner;
  float finish;

  TypedData_Get_Struct(self, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, entry);
  TypedData_Get_Struct(entry->segments, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, owner);

  finish = whisper_vad_segments_get_segment_t1(owner->segments, entry->index);

  return DBL2NUM(finish * 10);
}
/*
 * call-seq:
 *   segment.deconstruct_keys(keys) -> Hash
 *
 * Pattern-matching support. `keys` is either nil (all known keys are
 * returned) or an Array of requested keys; only :start_time and :end_time
 * are recognised, other entries are ignored. When more keys are requested
 * than this class knows, an empty Hash is returned.
 *
 * Raises TypeError when the segment does not refer to a segments object, or
 * when `keys` is neither nil nor an Array. The Array check is new: the
 * argument was previously handed to RARRAY_LEN unchecked, which is undefined
 * behaviour for a non-Array.
 */
static VALUE
ruby_whisper_vad_segment_deconstruct_keys(VALUE self, VALUE keys)
{
  ruby_whisper_vad_segment *rwvs;
  ruby_whisper_vad_segments *rwvss;
  VALUE hash, key;
  long n_keys, i;

  /* Type checks only: these raise for a segment that has no segments object. */
  TypedData_Get_Struct(self, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, rwvs);
  TypedData_Get_Struct(rwvs->segments, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, rwvss);

  hash = rb_hash_new();
  if (NIL_P(keys)) {
    keys = key_names;
    n_keys = N_KEY_NAMES;
  } else {
    Check_Type(keys, T_ARRAY);
    n_keys = RARRAY_LEN(keys);
    if (n_keys > N_KEY_NAMES) {
      return hash;
    }
  }

  for (i = 0; i < n_keys; i++) {
    key = rb_ary_entry(keys, i);
    if (key == sym_start_time) {
      rb_hash_aset(hash, key, ruby_whisper_vad_segment_get_start_time(self));
    } else if (key == sym_end_time) {
      rb_hash_aset(hash, key, ruby_whisper_vad_segment_get_end_time(self));
    }
  }

  return hash;
}
/*
 * Defines Whisper::VAD::Segment under the given VAD module, prepares the
 * symbols and key list used by #deconstruct_keys, and registers the
 * allocator and instance methods.
 */
void
init_ruby_whisper_vad_segment(VALUE *mVAD)
{
  cVADSegment = rb_define_class_under(*mVAD, "Segment", rb_cObject);

  sym_start_time = ID2SYM(rb_intern("start_time"));
  sym_end_time = ID2SYM(rb_intern("end_time"));

  /*
   * key_names lives only in a C static, which the GC does not scan. Register
   * its address (while it still holds Qnil) so the Array assigned below is
   * kept alive for the lifetime of the process instead of being collected.
   */
  key_names = Qnil;
  rb_global_variable(&key_names);
  key_names = rb_ary_new3(
    N_KEY_NAMES,
    sym_start_time,
    sym_end_time
  );

  rb_define_alloc_func(cVADSegment, ruby_whisper_vad_segment_s_allocate);
  rb_define_method(cVADSegment, "start_time", ruby_whisper_vad_segment_get_start_time, 0);
  rb_define_method(cVADSegment, "end_time", ruby_whisper_vad_segment_get_end_time, 0);
  rb_define_method(cVADSegment, "deconstruct_keys", ruby_whisper_vad_segment_deconstruct_keys, 1);
}

View File

@ -0,0 +1,112 @@
#include <ruby.h>
#include "ruby_whisper.h"
extern ID id___method__;
extern ID id_to_enum;
extern VALUE cVADSegments;
extern VALUE rb_whisper_vad_segment_s_new(VALUE segments, int index);
/*
 * dsize callback for Whisper::VAD::Segments.
 *
 * Reports the size of the wrapper struct itself. The previous version used
 * sizeof(rwvss), which is the size of a pointer rather than of the struct,
 * and then added the size of the `segments` pointer member a second time.
 *
 * The memory owned by the native segment list behind `segments` is not
 * included; nothing visible in this file reports its size.
 *
 * Returns 0 when p is NULL.
 */
static size_t
ruby_whisper_vad_segments_memsize(const void *p)
{
  const ruby_whisper_vad_segments *rwvss = p;

  if (!rwvss) {
    return 0;
  }
  return sizeof(*rwvss);
}
/*
 * dfree callback for Whisper::VAD::Segments.
 * Releases the native segment list, if one is attached, and then the wrapper
 * struct itself.
 */
static void
ruby_whisper_vad_segments_free(void *p)
{
  ruby_whisper_vad_segments *wrapper = (ruby_whisper_vad_segments *)p;
  struct whisper_vad_segments *native = wrapper->segments;

  if (native != NULL) {
    wrapper->segments = NULL;
    whisper_vad_free_segments(native);
  }

  xfree(wrapper);
}
/*
 * TypedData description for Whisper::VAD::Segments.
 * Not static: the segment and detect implementations refer to it through
 * extern declarations.
 */
const rb_data_type_t ruby_whisper_vad_segments_type = {
"ruby_whisper_vad_segments",
/* No mark callback; custom free and memsize callbacks. */
{0, ruby_whisper_vad_segments_free, ruby_whisper_vad_segments_memsize,},
0, 0,
0
};
/*
 * Allocator for Whisper::VAD::Segments.
 * Produces an instance with no native segment list attached.
 */
static VALUE
ruby_whisper_vad_segments_s_allocate(VALUE klass)
{
  ruby_whisper_vad_segments *wrapper;
  VALUE instance;

  instance = TypedData_Make_Struct(klass, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, wrapper);
  wrapper->segments = NULL;

  return instance;
}
VALUE
ruby_whisper_vad_segments_s_init(struct whisper_vad_segments *segments)
{
VALUE rb_segments;
ruby_whisper_vad_segments *rwvss;
rb_segments = ruby_whisper_vad_segments_s_allocate(cVADSegments);
TypedData_Get_Struct(rb_segments, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, rwvss);
rwvss->segments = segments;
return rb_segments;
}
/*
 * call-seq:
 *   segments.each { |segment| ... } -> self
 *   segments.each -> Enumerator
 *
 * Yields one Whisper::VAD::Segment per entry of the native list, in index
 * order. Without a block, returns the result of to_enum for this method.
 *
 * Raises RuntimeError when a block is given but no native list is attached.
 */
static VALUE
ruby_whisper_vad_segments_each(VALUE self)
{
  ruby_whisper_vad_segments *list;
  int count, pos;

  if (!rb_block_given_p()) {
    return rb_funcall(self, id_to_enum, 1, rb_funcall(self, id___method__, 0));
  }

  TypedData_Get_Struct(self, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, list);
  if (!list->segments) {
    rb_raise(rb_eRuntimeError, "Doesn't have reference to segments internally");
  }

  /* The count is read once, before the first yield. */
  count = whisper_vad_segments_n_segments(list->segments);
  pos = 0;
  while (pos < count) {
    rb_yield(rb_whisper_vad_segment_s_new(self, pos));
    ++pos;
  }

  return self;
}
/*
 * call-seq:
 *   segments.length -> Integer
 *
 * Returns the number of entries in the native segment list.
 *
 * Raises RuntimeError when no native list is attached.
 */
static VALUE
ruby_whisper_vad_segments_get_length(VALUE self)
{
  ruby_whisper_vad_segments *list;

  TypedData_Get_Struct(self, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, list);
  if (!list->segments) {
    rb_raise(rb_eRuntimeError, "Doesn't have reference to segments internally");
  }

  return INT2NUM(whisper_vad_segments_n_segments(list->segments));
}
/*
 * Defines Whisper::VAD::Segments under the given VAD module, registers its
 * allocator, #each and #length, and mixes in Enumerable.
 */
void
init_ruby_whisper_vad_segments(VALUE *mVAD)
{
  const VALUE vad_module = *mVAD;

  cVADSegments = rb_define_class_under(vad_module, "Segments", rb_cObject);

  rb_define_alloc_func(cVADSegments, ruby_whisper_vad_segments_s_allocate);
  rb_define_method(cVADSegments, "each", ruby_whisper_vad_segments_each, 0);
  rb_define_method(cVADSegments, "length", ruby_whisper_vad_segments_get_length, 0);

  rb_include_module(cVADSegments, rb_path2class("Enumerable"));
}

View File

@ -510,6 +510,30 @@ module Whisper
def samples_overlap: () -> Float
def ==: (Params) -> (true | false)
end
# Voice activity detector that can be used without running ASR.
class Context
# Loads the VAD model identified by a model name or a path.
# Raises when the native context cannot be initialized.
def self.new: (String | path | ::URI::HTTP model_name_or_path) -> instance
# Runs detection on the audio file at the given path with the given params.
def detect: (path wav_file_path, Params) -> Segments
end
# Enumerable collection of detected segments.
class Segments
include Enumerable[Segment]
# Yields each Segment in order; without a block returns an Enumerator.
def each: { (Segment) -> void } -> void
| () -> Enumerator[Segment]
# Number of detected segments.
def length: -> Integer
end
# One detected segment. Supports pattern matching via #deconstruct_keys.
class Segment
  # Times are Float: the native implementation builds them with DBL2NUM and
  # the test suite asserts Float, so the former Integer annotations were wrong.
  type deconstructed_keys = {
    start_time: (Float | nil),
    end_time: (Float | nil),
  }

  # Start of the segment.
  def start_time: () -> Float
  # End of the segment.
  def end_time: () -> Float
  # Returns the requested keys (all of them when nil is given).
  def deconstruct_keys: (Array[:start_time | :end_time] | nil) -> deconstructed_keys
end
end
class Error < StandardError

View File

@ -0,0 +1,50 @@
require_relative "helper"
# Exercises Whisper::VAD::Context: construction, detection results and the
# failure modes for bad models and uninitialized instances.
class TestVADContext < TestBase
  def test_initialize
    assert_instance_of Whisper::VAD::Context, Whisper::VAD::Context.new("silero-v5.1.2")
  end

  def test_detect
    vad = Whisper::VAD::Context.new("silero-v5.1.2")
    detected = vad.detect(AUDIO, Whisper::VAD::Params.new)
    assert_instance_of Whisper::VAD::Segments, detected

    # #each with a block yields Segment objects, at least once.
    yielded = 0
    detected.each do |seg|
      yielded += 1
      assert_instance_of Whisper::VAD::Segment, seg
    end
    assert yielded > 0

    # Enumerable methods are available on top of #each.
    detected.each_with_index do |_seg, position|
      assert_instance_of Integer, position
    end

    # #each without a block hands back an Enumerator.
    assert_instance_of Enumerator, detected.each

    head = detected.each.first
    assert_instance_of Float, head.start_time
    assert_instance_of Float, head.end_time

    # Pattern matching goes through #deconstruct_keys.
    head => {start_time:, end_time:}
    assert_equal head.start_time, start_time
    assert_equal head.end_time, end_time

    assert_equal 5, detected.length
  end

  def test_invalid_model_type
    assert_raise(TypeError) { Whisper::VAD::Context.new(Object.new) }
  end

  def test_allocate
    unset = Whisper::VAD::Context.allocate
    assert_raise { unset.detect(AUDIO, Whisper::VAD::Params.new) }
  end
end

View File

@ -0,0 +1,19 @@
require_relative "helper"
# A Segment created directly (not obtained from Segments#each) refers to no
# segments object, so every accessor is expected to raise.
class TestVADSegment < TestBase
  def test_initialize
    segment = Whisper::VAD::Segment.new

    assert_raise do
      segment.start_time
    end

    # Was `segments.end_time`: an undefined local, so the assertion passed on
    # NameError without ever calling #end_time.
    assert_raise do
      segment.end_time
    end

    assert_raise do
      segment => {start_time:, end_time:}
    end
  end
end

View File

@ -0,0 +1,16 @@
require_relative "helper"
# A Segments object created directly has no native list attached, so both
# iteration and #length are expected to raise.
class TestVADSegments < TestBase
  def test_initialize
    unset = Whisper::VAD::Segments.new

    assert_raise { unset.each { |_segment| } }
    assert_raise { unset.length }
  end
end