diff --git a/bindings/ruby/README.md b/bindings/ruby/README.md index fff6efc7..2b586ef3 100644 --- a/bindings/ruby/README.md +++ b/bindings/ruby/README.md @@ -324,6 +324,22 @@ whisper The second argument `samples` may be an array, an object with `length` and `each` method, or a MemoryView. If you can prepare audio data as C array and export it as a MemoryView, whispercpp accepts and works with it with zero copy. +Using VAD separately from ASR +----------------------------- + +The VAD feature is also useful on its own. You can use it separately from ASR: + +```ruby +vad = Whisper::VAD::Context.new("silero-v5.1.2") +vad + .detect("path/to/audio.wav", Whisper::VAD::Params.new) + .each_with_index do |segment, index| + segment => {start_time: st, end_time: ed} # `Segment` responds to `#deconstruct_keys` + + puts "[%{nth}: %{st} --> %{ed}]" % {nth: index + 1, st:, ed:} + end +``` + Development ----------- diff --git a/bindings/ruby/ext/ruby_whisper.c b/bindings/ruby/ext/ruby_whisper.c index 533bda74..59c7818e 100644 --- a/bindings/ruby/ext/ruby_whisper.c +++ b/bindings/ruby/ext/ruby_whisper.c @@ -6,7 +6,10 @@ VALUE mWhisper; VALUE mVAD; VALUE cContext; VALUE cParams; +VALUE cVADContext; VALUE cVADParams; +VALUE cVADSegments; +VALUE cVADSegment; VALUE eError; VALUE cSegment; @@ -37,6 +40,9 @@ extern void init_ruby_whisper_error(VALUE *mWhisper); extern void init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cSegment); extern void init_ruby_whisper_model(VALUE *mWhisper); extern void init_ruby_whisper_vad_params(VALUE *mVAD); +extern void init_ruby_whisper_vad_context(VALUE *mVAD); +extern void init_ruby_whisper_vad_segment(VALUE *mVAD); +extern void init_ruby_whisper_vad_segments(VALUE *mVAD); extern void register_callbacks(ruby_whisper_params *rwp, VALUE *context); /* @@ -170,6 +176,9 @@ void Init_whisper() { init_ruby_whisper_segment(&mWhisper, &cContext); init_ruby_whisper_model(&mWhisper); init_ruby_whisper_vad_params(&mVAD); + init_ruby_whisper_vad_segment(&mVAD); +
init_ruby_whisper_vad_segments(&mVAD); + init_ruby_whisper_vad_context(&mVAD); rb_require("whisper/context"); rb_require("whisper/segment"); diff --git a/bindings/ruby/ext/ruby_whisper.h b/bindings/ruby/ext/ruby_whisper.h index 65b88122..ff8591aa 100644 --- a/bindings/ruby/ext/ruby_whisper.h +++ b/bindings/ruby/ext/ruby_whisper.h @@ -37,4 +37,17 @@ typedef struct { VALUE context; } ruby_whisper_model; +typedef struct { + struct whisper_vad_segments *segments; +} ruby_whisper_vad_segments; + +typedef struct { + VALUE segments; + int index; +} ruby_whisper_vad_segment; + +typedef struct { + struct whisper_vad_context *context; +} ruby_whisper_vad_context; + #endif diff --git a/bindings/ruby/ext/ruby_whisper_segment.c b/bindings/ruby/ext/ruby_whisper_segment.c index a303187c..c05632c7 100644 --- a/bindings/ruby/ext/ruby_whisper_segment.c +++ b/bindings/ruby/ext/ruby_whisper_segment.c @@ -29,6 +29,9 @@ ruby_whisper_segment_memsize(const void *p) if (!rws) { return 0; } + if (rws->index) { + size += sizeof(rws->index); + } return size; } diff --git a/bindings/ruby/ext/ruby_whisper_vad_context.c b/bindings/ruby/ext/ruby_whisper_vad_context.c new file mode 100644 index 00000000..bf2ed2ba --- /dev/null +++ b/bindings/ruby/ext/ruby_whisper_vad_context.c @@ -0,0 +1,75 @@ +#include <ruby.h> +#include "ruby_whisper.h" + +extern ID id_to_s; + +extern VALUE cVADContext; + +extern VALUE ruby_whisper_vad_detect(VALUE self, VALUE file_path, VALUE params); +extern VALUE ruby_whisper_normalize_model_path(VALUE model_path); + +static size_t +ruby_whisper_vad_context_memsize(const void *p) +{ + const ruby_whisper_vad_context *rwvc = p; + size_t size = sizeof(*rwvc); /* the wrapper struct itself, not the pointer to it */ + if (!rwvc) { + return 0; + } + if (rwvc->context) { + size += sizeof(rwvc->context); + } + return size; +} + +static void +ruby_whisper_vad_context_free(void *p) +{ + ruby_whisper_vad_context *rwvc = (ruby_whisper_vad_context *)p; + if (rwvc->context) { + whisper_vad_free(rwvc->context); + rwvc->context = NULL; + } + xfree(rwvc);
+} + +const rb_data_type_t ruby_whisper_vad_context_type = { + "ruby_whisper_vad_context", + {0, ruby_whisper_vad_context_free, ruby_whisper_vad_context_memsize,}, + 0, 0, + 0 +}; + +static VALUE +ruby_whisper_vad_context_s_allocate(VALUE klass) +{ + ruby_whisper_vad_context *rwvc; + VALUE obj = TypedData_Make_Struct(klass, ruby_whisper_vad_context, &ruby_whisper_vad_context_type, rwvc); + rwvc->context = NULL; + return obj; +} + +static VALUE +ruby_whisper_vad_context_initialize(VALUE self, VALUE model_path) +{ + ruby_whisper_vad_context *rwvc; + struct whisper_vad_context *context; + + model_path = ruby_whisper_normalize_model_path(model_path); + context = whisper_vad_init_from_file_with_params(StringValueCStr(model_path), whisper_vad_default_context_params()); + if (context == NULL) { + rb_raise(rb_eRuntimeError, "Failed to initialize whisper VAD context"); + } + TypedData_Get_Struct(self, ruby_whisper_vad_context, &ruby_whisper_vad_context_type, rwvc); + rwvc->context = context; + + return Qnil; +} + +void init_ruby_whisper_vad_context(VALUE *mVAD) +{ + cVADContext = rb_define_class_under(*mVAD, "Context", rb_cObject); + rb_define_alloc_func(cVADContext, ruby_whisper_vad_context_s_allocate); + rb_define_method(cVADContext, "initialize", ruby_whisper_vad_context_initialize, 1); + rb_define_method(cVADContext, "detect", ruby_whisper_vad_detect, 2); +} diff --git a/bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp b/bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp new file mode 100644 index 00000000..58609f87 --- /dev/null +++ b/bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp @@ -0,0 +1,50 @@ +#include <ruby.h> +#include "ruby_whisper.h" +#include "common-whisper.h" +#include <string> +#include <vector> + +#ifdef __cplusplus +extern "C" { +#endif + +extern VALUE cVADSegments; + +extern const rb_data_type_t ruby_whisper_vad_context_type; +extern const rb_data_type_t ruby_whisper_vad_params_type; +extern const rb_data_type_t ruby_whisper_vad_segments_type; + +extern VALUE
ruby_whisper_vad_segments_s_init(struct whisper_vad_segments *segments); + +VALUE +ruby_whisper_vad_detect(VALUE self, VALUE file_path, VALUE params) { + ruby_whisper_vad_context *rwvc; + ruby_whisper_vad_params *rwvp; + std::string cpp_file_path; + std::vector<float> pcmf32; + std::vector<std::vector<float>> pcmf32s; + whisper_vad_segments *segments; + + TypedData_Get_Struct(self, ruby_whisper_vad_context, &ruby_whisper_vad_context_type, rwvc); + if (rwvc->context == NULL) { + rb_raise(rb_eRuntimeError, "Doesn't have reference to context internally"); + } + TypedData_Get_Struct(params, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp); + + cpp_file_path = StringValueCStr(file_path); + + if (!read_audio_data(cpp_file_path, pcmf32, pcmf32s, false)) { + rb_raise(rb_eRuntimeError, "Failed to open '%s' as WAV file\n", cpp_file_path.c_str()); + } + + segments = whisper_vad_segments_from_samples(rwvc->context, rwvp->params, pcmf32.data(), pcmf32.size()); + if (segments == nullptr) { + rb_raise(rb_eRuntimeError, "Failed to process audio\n"); /* NOTE(review): rb_raise longjmps past the std::string/std::vector destructors above, so their buffers leak on the error paths */ + } + + return ruby_whisper_vad_segments_s_init(segments); +} + +#ifdef __cplusplus +} +#endif diff --git a/bindings/ruby/ext/ruby_whisper_vad_segment.c b/bindings/ruby/ext/ruby_whisper_vad_segment.c new file mode 100644 index 00000000..f444b419 --- /dev/null +++ b/bindings/ruby/ext/ruby_whisper_vad_segment.c @@ -0,0 +1,141 @@ +#include <ruby.h> +#include "ruby_whisper.h" + +#define N_KEY_NAMES 2 + +extern VALUE cVADSegment; + +extern const rb_data_type_t ruby_whisper_vad_segments_type; + +static VALUE sym_start_time; +static VALUE sym_end_time; +static VALUE key_names; + +static void +rb_whisper_vad_segment_mark(void *p) +{ + ruby_whisper_vad_segment *rwvs = (ruby_whisper_vad_segment *)p; + rb_gc_mark(rwvs->segments); +} + +static size_t +ruby_whisper_vad_segment_memsize(const void *p) +{ + const ruby_whisper_vad_segment *rwvs = p; + size_t size = sizeof(*rwvs); /* the wrapper struct itself, not the pointer to it */ + if (!rwvs) { + return 0; + } + if (rwvs->index) { + size += sizeof(rwvs->index); + } + return
size; +} +/* TypedData descriptor for Whisper::VAD::Segment: marks the referenced Segments object and uses the default free. */ +static const rb_data_type_t ruby_whisper_vad_segment_type = { + "ruby_whisper_vad_segment", + {rb_whisper_vad_segment_mark, RUBY_DEFAULT_FREE, ruby_whisper_vad_segment_memsize,}, + 0, 0, + 0 +}; +/* Allocator: creates a Segment that is not bound to anything yet (segments = nil, index = -1). */ +static VALUE +ruby_whisper_vad_segment_s_allocate(VALUE klass) +{ + ruby_whisper_vad_segment *rwvs; + VALUE obj = TypedData_Make_Struct(klass, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, rwvs); + rwvs->segments = Qnil; + rwvs->index = -1; + return obj; +} +/* Builds a Segment that refers to entry `index` of the Ruby-level `segments` object. */ +VALUE +rb_whisper_vad_segment_s_new(VALUE segments, int index) +{ + ruby_whisper_vad_segment *rwvs; + const VALUE segment = ruby_whisper_vad_segment_s_allocate(cVADSegment); + TypedData_Get_Struct(segment, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, rwvs); + rwvs->segments = segments; + rwvs->index = index; + return segment; +} +/* Segment#start_time: reads t0 for this index from the referenced segments and returns it multiplied by 10 as a Float. NOTE(review): presumably a centisecond-to-millisecond conversion; confirm against whisper.h. */ +static VALUE +ruby_whisper_vad_segment_get_start_time(VALUE self) +{ + ruby_whisper_vad_segment *rwvs; + ruby_whisper_vad_segments *rwvss; + float t0; + + TypedData_Get_Struct(self, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, rwvs); + TypedData_Get_Struct(rwvs->segments, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, rwvss); + t0 = whisper_vad_segments_get_segment_t0(rwvss->segments, rwvs->index); + return DBL2NUM(t0 * 10); +} +/* Segment#end_time: same as start_time, but reads t1. */ +static VALUE +ruby_whisper_vad_segment_get_end_time(VALUE self) +{ + ruby_whisper_vad_segment *rwvs; + ruby_whisper_vad_segments *rwvss; + float t1; + + TypedData_Get_Struct(self, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, rwvs); + TypedData_Get_Struct(rwvs->segments, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, rwvss); + t1 = whisper_vad_segments_get_segment_t1(rwvss->segments, rwvs->index); + return DBL2NUM(t1 * 10); +} +/* Segment#deconstruct_keys: builds the hash used for pattern matching; `keys` is nil or an array of requested symbols. */ +static VALUE +ruby_whisper_vad_segment_deconstruct_keys(VALUE self, VALUE keys) +{ + ruby_whisper_vad_segment *rwvs; + ruby_whisper_vad_segments *rwvss; + VALUE hash, key; + long n_keys; + int i; + + TypedData_Get_Struct(self,
ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, rwvs); + TypedData_Get_Struct(rwvs->segments, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, rwvss); + + hash = rb_hash_new(); + if (NIL_P(keys)) { + keys = key_names; + n_keys = N_KEY_NAMES; + } else { + n_keys = RARRAY_LEN(keys); + if (n_keys > N_KEY_NAMES) { + return hash; + } + } + for (i = 0; i < n_keys; i++) { + key = rb_ary_entry(keys, i); + if (key == sym_start_time) { + rb_hash_aset(hash, key, ruby_whisper_vad_segment_get_start_time(self)); + } + if (key == sym_end_time) { + rb_hash_aset(hash, key, ruby_whisper_vad_segment_get_end_time(self)); + } + } + + return hash; +} + +void +init_ruby_whisper_vad_segment(VALUE *mVAD) +{ + cVADSegment = rb_define_class_under(*mVAD, "Segment", rb_cObject); + + sym_start_time = ID2SYM(rb_intern("start_time")); + sym_end_time = ID2SYM(rb_intern("end_time")); + key_names = rb_ary_new3( + N_KEY_NAMES, + sym_start_time, + sym_end_time + ); + rb_global_variable(&key_names); /* GC root: no Ruby object references this array, so it would otherwise be collectable */ + rb_define_alloc_func(cVADSegment, ruby_whisper_vad_segment_s_allocate); + rb_define_method(cVADSegment, "start_time", ruby_whisper_vad_segment_get_start_time, 0); + rb_define_method(cVADSegment, "end_time", ruby_whisper_vad_segment_get_end_time, 0); + rb_define_method(cVADSegment, "deconstruct_keys", ruby_whisper_vad_segment_deconstruct_keys, 1); +} diff --git a/bindings/ruby/ext/ruby_whisper_vad_segments.c b/bindings/ruby/ext/ruby_whisper_vad_segments.c new file mode 100644 index 00000000..ae1c21b6 --- /dev/null +++ b/bindings/ruby/ext/ruby_whisper_vad_segments.c @@ -0,0 +1,112 @@ +#include <ruby.h> +#include "ruby_whisper.h" + +extern ID id___method__; +extern ID id_to_enum; + +extern VALUE cVADSegments; + +extern VALUE rb_whisper_vad_segment_s_new(VALUE segments, int index); + +static size_t +ruby_whisper_vad_segments_memsize(const void *p) +{ + const ruby_whisper_vad_segments *rwvss = p; + size_t size = sizeof(*rwvss); /* the wrapper struct itself, not the pointer to it */ + if (!rwvss) { + return 0; + } + if (rwvss->segments) { + size += sizeof(rwvss->segments); + } + return
size; +} +/* Free callback: releases the native segments (if any), then the wrapper struct. */ +static void +ruby_whisper_vad_segments_free(void *p) +{ + ruby_whisper_vad_segments *rwvss = (ruby_whisper_vad_segments *)p; + if (rwvss->segments) { + whisper_vad_free_segments(rwvss->segments); + rwvss->segments = NULL; + } + xfree(rwvss); +} +/* TypedData descriptor for Whisper::VAD::Segments; no mark function is registered. */ +const rb_data_type_t ruby_whisper_vad_segments_type = { + "ruby_whisper_vad_segments", + {0, ruby_whisper_vad_segments_free, ruby_whisper_vad_segments_memsize,}, + 0, 0, + 0 +}; +/* Allocator: the wrapper starts without native segments (NULL). */ +static VALUE +ruby_whisper_vad_segments_s_allocate(VALUE klass) +{ + ruby_whisper_vad_segments *rwvss; + VALUE obj = TypedData_Make_Struct(klass, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, rwvss); + rwvss->segments = NULL; + return obj; +} +/* Wraps native `segments` in a new Segments object; the free callback above releases them later. */ +VALUE +ruby_whisper_vad_segments_s_init(struct whisper_vad_segments *segments) +{ + VALUE rb_segments; + ruby_whisper_vad_segments *rwvss; + + rb_segments = ruby_whisper_vad_segments_s_allocate(cVADSegments); + TypedData_Get_Struct(rb_segments, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, rwvss); + rwvss->segments = segments; + + return rb_segments; +} +/* Segments#each: without a block, returns the result of calling id_to_enum with the current method name (presumably to_enum(__method__)); with a block, yields one Segment per index and returns self. Raises RuntimeError when no native segments are attached. */ +static VALUE +ruby_whisper_vad_segments_each(VALUE self) +{ + ruby_whisper_vad_segments *rwvss; + VALUE method_name; + int n_segments, i; + + if (!rb_block_given_p()) { + method_name = rb_funcall(self, id___method__, 0); + return rb_funcall(self, id_to_enum, 1, method_name); + } + + TypedData_Get_Struct(self, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, rwvss); + if (rwvss->segments == NULL) { + rb_raise(rb_eRuntimeError, "Doesn't have reference to segments internally"); + } + n_segments = whisper_vad_segments_n_segments(rwvss->segments); + for (i = 0; i < n_segments; ++i) { + rb_yield(rb_whisper_vad_segment_s_new(self, i)); + } + + return self; +} +/* Segments#length: number of native segments as an Integer; raises RuntimeError when none are attached. */ +static VALUE +ruby_whisper_vad_segments_get_length(VALUE self) +{ + ruby_whisper_vad_segments *rwvss; + int n_segments; + + TypedData_Get_Struct(self, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, rwvss); + if (rwvss->segments == NULL)
{ + rb_raise(rb_eRuntimeError, "Doesn't have reference to segments internally"); + } + n_segments = whisper_vad_segments_n_segments(rwvss->segments); + + return INT2NUM(n_segments); +} + +void +init_ruby_whisper_vad_segments(VALUE *mVAD) +{ + cVADSegments = rb_define_class_under(*mVAD, "Segments", rb_cObject); + rb_define_alloc_func(cVADSegments, ruby_whisper_vad_segments_s_allocate); + rb_define_method(cVADSegments, "each", ruby_whisper_vad_segments_each, 0); + rb_define_method(cVADSegments, "length", ruby_whisper_vad_segments_get_length, 0); + rb_include_module(cVADSegments, rb_path2class("Enumerable")); +} diff --git a/bindings/ruby/sig/whisper.rbs b/bindings/ruby/sig/whisper.rbs index d5905dd7..dcb387a2 100644 --- a/bindings/ruby/sig/whisper.rbs +++ b/bindings/ruby/sig/whisper.rbs @@ -510,6 +510,30 @@ module Whisper def samples_overlap: () -> Float def ==: (Params) -> (true | false) end + + class Context + def self.new: (String | path | ::URI::HTTP model_name_or_path) -> instance + def detect: (path wav_file_path, Params) -> Segments + end + + class Segments + include Enumerable[Segment] + + def each: { (Segment) -> void } -> void + | () -> Enumerator[Segment] + def length: -> Integer + end + + class Segment + type deconstructed_keys = { + start_time: (Float | nil), + end_time: (Float | nil), + } + + def start_time: () -> Float + def end_time: () -> Float + def deconstruct_keys: (Array[:start_time | :end_time] | nil) -> deconstructed_keys + end end class Error < StandardError diff --git a/bindings/ruby/test/test_vad_context.rb b/bindings/ruby/test/test_vad_context.rb new file mode 100644 index 00000000..bfc83adf --- /dev/null +++ b/bindings/ruby/test/test_vad_context.rb @@ -0,0 +1,50 @@ +require_relative "helper" + +class TestVADContext < TestBase + def test_initialize + context = Whisper::VAD::Context.new("silero-v5.1.2") + assert_instance_of Whisper::VAD::Context, context + end + + def test_detect + context = Whisper::VAD::Context.new("silero-v5.1.2")
+ segments = context.detect(AUDIO, Whisper::VAD::Params.new) + assert_instance_of Whisper::VAD::Segments, segments + + i = 0 + segments.each do |segment| + i += 1 + assert_instance_of Whisper::VAD::Segment, segment + end + assert i > 0 + + segments.each_with_index do |segment, index| + assert_instance_of Integer, index + end + + assert_instance_of Enumerator, segments.each + + segment = segments.each.first + assert_instance_of Float, segment.start_time + assert_instance_of Float, segment.end_time + + segment => {start_time:, end_time:} + assert_equal segment.start_time, start_time + assert_equal segment.end_time, end_time + + assert_equal 5, segments.length + end + + def test_invalid_model_type + assert_raise TypeError do + Whisper::VAD::Context.new(Object.new) + end + end + + def test_allocate + vad = Whisper::VAD::Context.allocate + assert_raise do + vad.detect(AUDIO, Whisper::VAD::Params.new) + end + end +end diff --git a/bindings/ruby/test/test_vad_segment.rb b/bindings/ruby/test/test_vad_segment.rb new file mode 100644 index 00000000..7348562c --- /dev/null +++ b/bindings/ruby/test/test_vad_segment.rb @@ -0,0 +1,19 @@ +require_relative "helper" + +class TestVADSegment < TestBase + def test_initialize + segment = Whisper::VAD::Segment.new + + assert_raise do + segment.start_time + end + + assert_raise do + segment.end_time + end + + assert_raise do + segment => {start_time:, end_time:} + end + end +end diff --git a/bindings/ruby/test/test_vad_segments.rb b/bindings/ruby/test/test_vad_segments.rb new file mode 100644 index 00000000..855dc48e --- /dev/null +++ b/bindings/ruby/test/test_vad_segments.rb @@ -0,0 +1,16 @@ +require_relative "helper" + +class TestVADSegments < TestBase + def test_initialize + segments = Whisper::VAD::Segments.new + + assert_raise do + segments.each do |segment| + end + end + + assert_raise do + segments.length + end + end +end