diff --git a/README.md b/README.md index 8a8c342..c1d24be 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,13 @@ decoder = Xav.Decoder.new(:vp8) {:ok, %Xav.Frame{} = frame} = Xav.Decoder.decode(decoder, <<"somebinary">>) ``` +Decode with audio resampling + +```elixir +decoder = Xav.Decoder.new(:opus, out_format: :f32, out_sample_rate: 16_000) +{:ok, %Xav.Frame{} = frame} = Xav.Decoder.decode(decoder, <<"somebinary">>) +``` + Read from a file: ```elixir @@ -52,7 +59,8 @@ Kino.Image.new(tensor) Speech to text: ```elixir -r = Xav.Reader.new!("sample.mp3", read: :audio) +# See https://hexdocs.pm/bumblebee/Bumblebee.Audio.WhisperFeaturizer.html for default sampling rate +r = Xav.Reader.new!("sample.mp3", read: :audio, out_format: :f32, out_channels: 1, out_sample_rate: 16_000) {:ok, whisper} = Bumblebee.load_model({:hf, "openai/whisper-tiny"}) {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"}) diff --git a/c_src/xav/audio_converter.c b/c_src/xav/audio_converter.c index 98c1926..f98aadd 100644 --- a/c_src/xav/audio_converter.c +++ b/c_src/xav/audio_converter.c @@ -28,9 +28,11 @@ int audio_converter_init(struct AudioConverter *c, struct ChannelLayout in_chlay #if LIBAVUTIL_VERSION_MAJOR >= 58 av_opt_set_chlayout(c->swr_ctx, "in_chlayout", &in_chlayout.layout, 0); av_opt_set_chlayout(c->swr_ctx, "out_chlayout", &out_chlayout.layout, 0); + c->out_channels = out_chlayout.layout.nb_channels; #else av_opt_set_channel_layout(c->swr_ctx, "in_channel_layout", in_chlayout.layout, 0); av_opt_set_channel_layout(c->swr_ctx, "out_channel_layout", out_chlayout.layout, 0); + c->out_channels = av_get_channel_layout_nb_channels(out_chlayout.layout); #endif av_opt_set_int(c->swr_ctx, "in_sample_rate", in_sample_rate, 0); diff --git a/c_src/xav/audio_converter.h b/c_src/xav/audio_converter.h index 7912072..a55803b 100644 --- a/c_src/xav/audio_converter.h +++ b/c_src/xav/audio_converter.h @@ -10,6 +10,7 @@ struct AudioConverter { SwrContext *swr_ctx; int64_t 
in_sample_rate; int64_t out_sample_rate; + int64_t out_channels; struct ChannelLayout out_chlayout; enum AVSampleFormat out_sample_fmt; }; diff --git a/c_src/xav/xav_decoder.c b/c_src/xav/xav_decoder.c index 5a9c60c..8332a72 100644 --- a/c_src/xav/xav_decoder.c +++ b/c_src/xav/xav_decoder.c @@ -7,7 +7,7 @@ ErlNifResourceType *xav_decoder_resource_type; static int init_audio_converter(struct XavDecoder *xav_decoder); ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { - if (argc != 1) { + if (argc != 4) { return xav_nif_raise(env, "invalid_arg_count"); } @@ -22,10 +22,34 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { return xav_nif_raise(env, "failed_to_get_atom"); } + unsigned int out_format_len; + if (!enif_get_atom_length(env, argv[1], &out_format_len, ERL_NIF_LATIN1)) { + return xav_nif_raise(env, "failed_to_get_atom_length"); + } + + char *out_format = (char *)XAV_ALLOC((out_format_len + 1) * sizeof(char *)); + + if (enif_get_atom(env, argv[1], out_format, out_format_len + 1, ERL_NIF_LATIN1) == 0) { + return xav_nif_raise(env, "failed_to_get_atom"); + } + + int out_sample_rate; + if (!enif_get_int(env, argv[2], &out_sample_rate)) { + return xav_nif_raise(env, "invalid_out_sample_rate"); + } + + int out_channels; + if (!enif_get_int(env, argv[3], &out_channels)) { + return xav_nif_raise(env, "invalid_out_channels"); + } + struct XavDecoder *xav_decoder = enif_alloc_resource(xav_decoder_resource_type, sizeof(struct XavDecoder)); xav_decoder->decoder = NULL; xav_decoder->ac = NULL; + xav_decoder->out_format = out_format; + xav_decoder->out_sample_rate = out_sample_rate; + xav_decoder->out_channels = out_channels; xav_decoder->decoder = decoder_alloc(); if (xav_decoder->decoder == NULL) { @@ -120,6 +144,12 @@ ERL_NIF_TERM decode(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { const char *out_format = av_get_sample_fmt_name(xav_decoder->ac->out_sample_fmt); + if (strcmp(out_format, "flt") == 0) { + out_format = 
"f32"; + } else if (strcmp(out_format, "dbl") == 0) { + out_format = "f64"; + } + frame_term = xav_nif_audio_frame_to_term(env, out_data, out_samples, out_size, out_format, xav_decoder->decoder->frame->pts); @@ -139,16 +169,47 @@ static int init_audio_converter(struct XavDecoder *xav_decoder) { return -1; } - int out_sample_rate = xav_decoder->decoder->c->sample_rate; - enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_FLT; + int out_sample_rate; + if (xav_decoder->out_sample_rate == 0) { + out_sample_rate = xav_decoder->decoder->c->sample_rate; + } else { + out_sample_rate = xav_decoder->out_sample_rate; + } + + enum AVSampleFormat out_sample_fmt; + if (strcmp(xav_decoder->out_format, "u8") == 0) { + out_sample_fmt = AV_SAMPLE_FMT_U8; + } else if (strcmp(xav_decoder->out_format, "s16") == 0) { + out_sample_fmt = AV_SAMPLE_FMT_S16; + } else if (strcmp(xav_decoder->out_format, "s32") == 0) { + out_sample_fmt = AV_SAMPLE_FMT_S32; + } else if (strcmp(xav_decoder->out_format, "s64") == 0) { + out_sample_fmt = AV_SAMPLE_FMT_S64; + } else if (strcmp(xav_decoder->out_format, "f32") == 0) { + out_sample_fmt = AV_SAMPLE_FMT_FLT; + } else if (strcmp(xav_decoder->out_format, "f64") == 0) { + out_sample_fmt = AV_SAMPLE_FMT_DBL; + } else if (strcmp(xav_decoder->out_format, "nil") == 0) { + out_sample_fmt = av_get_alt_sample_fmt(xav_decoder->decoder->c->sample_fmt, 0); + } else { + return -1; + } struct ChannelLayout in_chlayout, out_chlayout; #if LIBAVUTIL_VERSION_MAJOR >= 58 in_chlayout.layout = xav_decoder->decoder->c->ch_layout; - out_chlayout.layout = xav_decoder->decoder->c->ch_layout; + if (xav_decoder->out_channels == 0) { + out_chlayout.layout = in_chlayout.layout; + } else { + av_channel_layout_default(&out_chlayout.layout, xav_decoder->out_channels); + } #else in_chlayout.layout = xav_decoder->decoder->c->channel_layout; - out_chlayout.layout = xav_decoder->decoder->c->channel_layout; + if (xav_decoder->out_channels == 0) { + out_chlayout.layout = in_chlayout.layout; 
+ } else { + out_chlayout.layout = av_get_default_channel_layout(xav_decoder->out_channels); + } #endif return audio_converter_init(xav_decoder->ac, in_chlayout, xav_decoder->decoder->c->sample_rate, @@ -168,7 +229,7 @@ void free_xav_decoder(ErlNifEnv *env, void *obj) { } } -static ErlNifFunc xav_funcs[] = {{"new", 1, new}, +static ErlNifFunc xav_funcs[] = {{"new", 4, new}, {"decode", 4, decode, ERL_NIF_DIRTY_JOB_CPU_BOUND}}; static int load(ErlNifEnv *env, void **priv, ERL_NIF_TERM load_info) { diff --git a/c_src/xav/xav_decoder.h b/c_src/xav/xav_decoder.h index eb263ea..08e01fe 100644 --- a/c_src/xav/xav_decoder.h +++ b/c_src/xav/xav_decoder.h @@ -4,4 +4,7 @@ struct XavDecoder { struct Decoder *decoder; struct AudioConverter *ac; + char *out_format; + int out_sample_rate; + int out_channels; }; \ No newline at end of file diff --git a/c_src/xav/xav_reader.c b/c_src/xav/xav_reader.c index d4bab4b..e27b40d 100644 --- a/c_src/xav/xav_reader.c +++ b/c_src/xav/xav_reader.c @@ -5,7 +5,7 @@ static int init_audio_converter(struct XavReader *xav_reader); ErlNifResourceType *xav_reader_resource_type; ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { - if (argc != 3) { + if (argc != 6) { return xav_nif_raise(env, "invalid_arg_count"); } @@ -31,10 +31,34 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { media_type = AVMEDIA_TYPE_AUDIO; } + unsigned int out_format_len; + if (!enif_get_atom_length(env, argv[3], &out_format_len, ERL_NIF_LATIN1)) { + return xav_nif_raise(env, "failed_to_get_atom_length"); + } + + char *out_format = (char *)XAV_ALLOC((out_format_len + 1) * sizeof(char *)); + + if (enif_get_atom(env, argv[3], out_format, out_format_len + 1, ERL_NIF_LATIN1) == 0) { + return xav_nif_raise(env, "failed_to_get_atom"); + } + + int out_sample_rate; + if (!enif_get_int(env, argv[4], &out_sample_rate)) { + return xav_nif_raise(env, "invalid_out_sample_rate"); + } + + int out_channels; + if (!enif_get_int(env, argv[5], 
&out_channels)) { + return xav_nif_raise(env, "invalid_out_channels"); + } + struct XavReader *xav_reader = enif_alloc_resource(xav_reader_resource_type, sizeof(struct XavReader)); xav_reader->reader = NULL; xav_reader->ac = NULL; + xav_reader->out_format = out_format; + xav_reader->out_sample_rate = out_sample_rate; + xav_reader->out_channels = out_channels; xav_reader->reader = reader_alloc(); if (xav_reader->reader == NULL) { @@ -65,13 +89,24 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { enif_release_resource(xav_reader); if (xav_reader->reader->media_type == AVMEDIA_TYPE_AUDIO) { - ERL_NIF_TERM sample_rate_term = enif_make_int(env, xav_reader->reader->c->sample_rate); + ERL_NIF_TERM in_sample_rate_term = enif_make_int(env, xav_reader->reader->c->sample_rate); ERL_NIF_TERM in_format_term = enif_make_atom(env, av_get_sample_fmt_name(xav_reader->reader->c->sample_fmt)); + +#if LIBAVUTIL_VERSION_MAJOR >= 58 + ERL_NIF_TERM in_channels_term = + enif_make_int(env, xav_reader->reader->c->ch_layout.nb_channels); +#else + ERL_NIF_TERM in_channels_term = enif_make_int(env, xav_reader->reader->c->channels); +#endif + ERL_NIF_TERM out_format_term = enif_make_atom(env, av_get_sample_fmt_name(xav_reader->ac->out_sample_fmt)); - return enif_make_tuple(env, 8, ok_term, xav_term, in_format_term, out_format_term, - sample_rate_term, bit_rate_term, duration_term, codec_term); + ERL_NIF_TERM out_sample_rate_term = enif_make_int(env, xav_reader->ac->out_sample_rate); + ERL_NIF_TERM out_channels_term = enif_make_int(env, xav_reader->ac->out_channels); + return enif_make_tuple(env, 11, ok_term, xav_term, in_format_term, out_format_term, + in_sample_rate_term, out_sample_rate_term, in_channels_term, + out_channels_term, bit_rate_term, duration_term, codec_term); } else if (xav_reader->reader->media_type == AVMEDIA_TYPE_VIDEO) { ERL_NIF_TERM in_format_term = @@ -133,6 +168,12 @@ ERL_NIF_TERM next_frame(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { 
const char *out_format = av_get_sample_fmt_name(xav_reader->ac->out_sample_fmt); + if (strcmp(out_format, "flt") == 0) { + out_format = "f32"; + } else if (strcmp(out_format, "dbl") == 0) { + out_format = "f64"; + } + frame_term = xav_nif_audio_frame_to_term(env, out_data, out_samples, out_size, out_format, xav_reader->reader->frame->pts); av_freep(&out_data[0]); @@ -151,16 +192,42 @@ static int init_audio_converter(struct XavReader *xav_reader) { return -1; } - int out_sample_rate = 16000; - enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_FLT; + int out_sample_rate; + if (xav_reader->out_sample_rate == 0) { + out_sample_rate = xav_reader->reader->c->sample_rate; + } else { + out_sample_rate = xav_reader->out_sample_rate; + } + + enum AVSampleFormat out_sample_fmt; + if (strcmp(xav_reader->out_format, "u8") == 0) { + out_sample_fmt = AV_SAMPLE_FMT_U8; + } else if (strcmp(xav_reader->out_format, "s16") == 0) { + out_sample_fmt = AV_SAMPLE_FMT_S16; + } else if (strcmp(xav_reader->out_format, "s32") == 0) { + out_sample_fmt = AV_SAMPLE_FMT_S32; + } else if (strcmp(xav_reader->out_format, "s64") == 0) { + out_sample_fmt = AV_SAMPLE_FMT_S64; + } else if (strcmp(xav_reader->out_format, "f32") == 0) { + out_sample_fmt = AV_SAMPLE_FMT_FLT; + } else if (strcmp(xav_reader->out_format, "f64") == 0) { + out_sample_fmt = AV_SAMPLE_FMT_DBL; + } else if (strcmp(xav_reader->out_format, "nil") == 0) { + out_sample_fmt = av_get_alt_sample_fmt(xav_reader->reader->c->sample_fmt, 0); + } else { + return -1; + } struct ChannelLayout in_chlayout, out_chlayout; #if LIBAVUTIL_VERSION_MAJOR >= 58 in_chlayout.layout = xav_reader->reader->c->ch_layout; - av_channel_layout_from_mask(&out_chlayout.layout, AV_CH_LAYOUT_MONO); + if (xav_reader->out_channels == 0) { + out_chlayout.layout = in_chlayout.layout; + } else { + av_channel_layout_default(&out_chlayout.layout, xav_reader->out_channels); + } #else in_chlayout.layout = xav_reader->reader->c->channel_layout; - out_chlayout.layout = 
AV_CH_LAYOUT_MONO; if (xav_reader->reader->c->channel_layout == 0 && xav_reader->reader->c->channels > 0) { // In newer FFmpeg versions, 0 means that the order of channels is @@ -176,6 +243,11 @@ static int init_audio_converter(struct XavReader *xav_reader) { return -1; } + if (xav_reader->out_channels == 0) { + out_chlayout.layout = in_chlayout.layout; + } else { + out_chlayout.layout = av_get_default_channel_layout(xav_reader->out_channels); + } #endif return audio_converter_init(xav_reader->ac, in_chlayout, xav_reader->reader->c->sample_rate, @@ -195,7 +267,7 @@ void free_xav_reader(ErlNifEnv *env, void *obj) { } } -static ErlNifFunc xav_funcs[] = {{"new", 3, new}, +static ErlNifFunc xav_funcs[] = {{"new", 6, new}, {"next_frame", 1, next_frame, ERL_NIF_DIRTY_JOB_CPU_BOUND}}; static int load(ErlNifEnv *env, void **priv, ERL_NIF_TERM load_info) { diff --git a/c_src/xav/xav_reader.h b/c_src/xav/xav_reader.h index 08ee17f..610bb0a 100644 --- a/c_src/xav/xav_reader.h +++ b/c_src/xav/xav_reader.h @@ -4,4 +4,7 @@ struct XavReader { struct Reader *reader; struct AudioConverter *ac; + char *out_format; + int out_sample_rate; + int out_channels; }; \ No newline at end of file diff --git a/lib/decoder.ex b/lib/decoder.ex index 7ac80bb..e6be460 100644 --- a/lib/decoder.ex +++ b/lib/decoder.ex @@ -10,19 +10,59 @@ defmodule Xav.Decoder do @type t() :: reference() + @typedoc """ + Opts that can be passed to `new/2`. + """ + @type opts :: [ + out_format: Xav.Frame.format(), + out_sample_rate: integer(), + out_channels: integer() + ] + @doc """ Creates a new decoder. + + `opts` can be used to specify desired output parameters. + + E.g. if you want to change audio samples format just pass: + + ```elixir + [out_format: :f32] + ``` + + Video frames are always returned in RGB format. + This setting cannot be changed. 
+ + Audio samples are always in the packed form - + samples from different channels are interleaved in the same, single binary: + + ``` + <<c10, c20, c30, c11, c21, c31, c12, c22, c32>> + ``` + + The way in which samples are interleaved is not specified. + + An alternative would be to return a list of binaries, where + each binary represents a different channel: + + ``` + [ + <<c10, c11, c12>>, + <<c20, c21, c22>>, + <<c30, c31, c32>> + ] + ``` """ - @spec new(codec()) :: t() - def new(codec) do - Xav.Decoder.NIF.new(codec) + @spec new(codec(), opts()) :: t() + def new(codec, opts \\ []) do + out_format = opts[:out_format] + out_sample_rate = opts[:out_sample_rate] || 0 + out_channels = opts[:out_channels] || 0 + Xav.Decoder.NIF.new(codec, out_format, out_sample_rate, out_channels) end @doc """ - Decodes an audio or video frame. - - Video frames are always in the RGB format. - Audio samples are always interleaved. + Decodes an audio/video frame. """ @spec decode(t(), binary(), pts: integer(), dts: integer()) :: {:ok, Xav.Frame.t()} | {:error, atom()} diff --git a/lib/decoder_nif.ex b/lib/decoder_nif.ex index 6cad870..df6751c 100644 --- a/lib/decoder_nif.ex +++ b/lib/decoder_nif.ex @@ -8,7 +8,7 @@ defmodule Xav.Decoder.NIF do :ok = :erlang.load_nif(path, 0) end - def new(_codec), do: :erlang.nif_error(:undef) + def new(_codec, _out_format, _out_sample_rate, _out_channels), do: :erlang.nif_error(:undef) def decode(_decoder, _data, _pts, _dts), do: :erlang.nif_error(:undef) end diff --git a/lib/frame.ex b/lib/frame.ex index d2424a5..982b9fb 100644 --- a/lib/frame.ex +++ b/lib/frame.ex @@ -3,10 +3,24 @@ defmodule Xav.Frame do Audio/video frame. """ + @typedoc """ + Possible audio samples formats. + """ + @type audio_format() :: :u8 | :s16 | :s32 | :s64 | :f32 | :f64 + + @typedoc """ + Possible video frame formats. + + Currently, only RGB is supported. 
+ """ + @type video_format() :: :rgb + + @type format() :: audio_format() | video_format() + @type t() :: %__MODULE__{ type: :audio | :video, data: binary(), - format: atom(), + format: format(), width: non_neg_integer() | nil, height: non_neg_integer() | nil, samples: integer() | nil, @@ -23,7 +37,10 @@ defmodule Xav.Frame do :pts ] - @spec new(binary(), atom(), non_neg_integer(), non_neg_integer(), integer()) :: t() + @doc """ + Creates a new audio/video frame. + """ + @spec new(binary(), format(), non_neg_integer(), non_neg_integer(), integer()) :: t() def new(data, format, width, height, pts) do %__MODULE__{ type: :video, @@ -35,7 +52,7 @@ defmodule Xav.Frame do } end - @spec new(binary(), atom(), integer(), integer()) :: t() + @spec new(binary(), format(), integer(), integer()) :: t() def new(data, format, samples, pts) do %__MODULE__{ type: :audio, @@ -47,7 +64,7 @@ defmodule Xav.Frame do end @doc """ - Converts frame to Nx tensor. + Converts a frame to an Nx tensor. """ @spec to_nx(t()) :: Nx.Tensor.t() def to_nx(%__MODULE__{type: :video} = frame) do @@ -57,13 +74,6 @@ defmodule Xav.Frame do end def to_nx(%__MODULE__{type: :audio} = frame) do - Nx.from_binary(frame.data, to_nx_format(frame.format)) + Nx.from_binary(frame.data, frame.format) end - - defp to_nx_format(:u8), do: :u8 - defp to_nx_format(:s16), do: :s16 - defp to_nx_format(:s32), do: :s32 - defp to_nx_format(:s64), do: :s64 - defp to_nx_format(:flt), do: :f32 - defp to_nx_format(:dbl), do: :f64 end diff --git a/lib/reader.ex b/lib/reader.ex index eddff52..2d83228 100644 --- a/lib/reader.ex +++ b/lib/reader.ex @@ -10,20 +10,29 @@ defmodule Xav.Reader do Defaults to `:video`. * `device?` - determines whether path points to the camera. Defaults to `false`. 
""" - @type opts :: [read: :audio | :video, device?: boolean] + @type opts :: [ + read: :audio | :video, + device?: boolean, + out_format: Xav.Frame.format(), + out_sample_rate: integer(), + out_channels: integer() + ] @type t() :: %__MODULE__{ reader: reference(), in_format: atom(), out_format: atom(), - sample_rate: integer() | nil, + in_sample_rate: integer() | nil, + out_sample_rate: integer() | nil, + in_channels: integer() | nil, + out_channels: integer() | nil, bit_rate: integer(), duration: integer(), codec: atom() } @enforce_keys [:reader, :in_format, :out_format, :bit_rate, :duration, :codec] - defstruct @enforce_keys ++ [:sample_rate] + defstruct @enforce_keys ++ [:in_sample_rate, :out_sample_rate, :in_channels, :out_channels] @doc """ The same as new/1 but raises on error. @@ -44,20 +53,39 @@ defmodule Xav.Reader do locked to 10. Microphone input is not supported. + + `opts` can be used to specify desired output parameters. + Video frames are always returned in RGB format. This setting cannot be changed. + Audio samples are always in the packed form. + See `Xav.Decoder.new/2` for more information. """ @spec new(String.t(), opts()) :: {:ok, t()} | {:error, term()} def new(path, opts \\ []) do read = opts[:read] || :video device? = opts[:device?] 
|| false - - case Xav.Reader.NIF.new(path, to_int(device?), to_int(read)) do - {:ok, reader, in_format, out_format, sample_rate, bit_rate, duration, codec} -> + out_format = opts[:out_format] + out_sample_rate = opts[:out_sample_rate] || 0 + out_channels = opts[:out_channels] || 0 + + case Xav.Reader.NIF.new( + path, + to_int(device?), + to_int(read), + out_format, + out_sample_rate, + out_channels + ) do + {:ok, reader, in_format, out_format, in_sample_rate, out_sample_rate, in_channels, + out_channels, bit_rate, duration, codec} -> {:ok, %__MODULE__{ reader: reader, in_format: in_format, out_format: out_format, - sample_rate: sample_rate, + in_sample_rate: in_sample_rate, + out_sample_rate: out_sample_rate, + in_channels: in_channels, + out_channels: out_channels, bit_rate: bit_rate, duration: duration, codec: to_human_readable(codec) @@ -80,11 +108,7 @@ defmodule Xav.Reader do end @doc """ - Reads the next frame. - - A frame is always decoded. - Video frames are always in the RGB format. - Audio samples are always interleaved. + Reads and decodes the next frame. 
""" @spec next_frame(t()) :: {:ok, Xav.Frame.t()} | {:error, :eof} def next_frame(%__MODULE__{reader: reader}) do diff --git a/lib/reader_nif.ex b/lib/reader_nif.ex index b642c27..0161450 100644 --- a/lib/reader_nif.ex +++ b/lib/reader_nif.ex @@ -8,7 +8,8 @@ defmodule Xav.Reader.NIF do :ok = :erlang.load_nif(path, 0) end - def new(_path, _device, _video), do: :erlang.nif_error(:undef) + def new(_path, _device, _video, _out_format, _out_sample_rate, _out_channels), + do: :erlang.nif_error(:undef) def next_frame(_reader), do: :erlang.nif_error(:undef) end diff --git a/test/decoder_test.exs b/test/decoder_test.exs index f4163ad..2682ffd 100644 --- a/test/decoder_test.exs +++ b/test/decoder_test.exs @@ -300,8 +300,21 @@ defmodule Xav.DecoderTest do test "audio" do decoder = Xav.Decoder.new(:opus) - assert {:ok, %Xav.Frame{samples: 960, pts: 0, format: :flt}} = + assert {:ok, %Xav.Frame{data: data, samples: 960, pts: 0, format: :f32}} = Xav.Decoder.decode(decoder, @opus_frame) + + assert byte_size(data) == 7680 + end + + test "audio with resampling" do + decoder = Xav.Decoder.new(:opus, out_format: :u8, out_sample_rate: 16_000, out_channels: 1) + + # after changing out_format and out_sample rate, we should have less samples + # and the data should be smaller + assert {:ok, %Xav.Frame{data: data, samples: 304, pts: 0, format: :u8}} = + Xav.Decoder.decode(decoder, @opus_frame) + + assert byte_size(data) == 304 end test "video keyframe" do diff --git a/test/fixtures/README.md b/test/fixtures/stt/README.md similarity index 95% rename from test/fixtures/README.md rename to test/fixtures/stt/README.md index cfa4773..3de472b 100644 --- a/test/fixtures/README.md +++ b/test/fixtures/stt/README.md @@ -1,4 +1,4 @@ -# Fixtures +# STT Fixtures * [melnet_sample_0.mp3](https://audio-samples.github.io/) - 22050Hz, 1 channel, fltp * [harvard.wav](https://www.kaggle.com/datasets/pavanelisetty/sample-audio-files-for-speech-recognition) - 44100Hz, 2 channels, s16 diff --git 
a/test/fixtures/harvard.mp3 b/test/fixtures/stt/harvard.mp3 similarity index 100% rename from test/fixtures/harvard.mp3 rename to test/fixtures/stt/harvard.mp3 diff --git a/test/fixtures/harvard.wav b/test/fixtures/stt/harvard.wav similarity index 100% rename from test/fixtures/harvard.wav rename to test/fixtures/stt/harvard.wav diff --git a/test/fixtures/melnet_sample_0.mp3 b/test/fixtures/stt/melnet_sample_0.mp3 similarity index 100% rename from test/fixtures/melnet_sample_0.mp3 rename to test/fixtures/stt/melnet_sample_0.mp3 diff --git a/test/reader_test.exs b/test/reader_test.exs index 2dd441c..7798504 100644 --- a/test/reader_test.exs +++ b/test/reader_test.exs @@ -45,13 +45,13 @@ defmodule Xav.ReaderTest do for {path, expected_output} <- [ # This file has been downloaded from https://audio-samples.github.io/ # Section: Samples from the model without biasing or priming. - {"./test/fixtures/melnet_sample_0.mp3", + {"./test/fixtures/stt/melnet_sample_0.mp3", """ My thought, I have nobody by a beauty and will as you poured. \ Mr. Rochester has served and that so don't find a simple and \ devoted aboud to what might in a\ """}, - {"./test/fixtures/harvard.wav", + {"./test/fixtures/stt/harvard.wav", """ The stale smell of old beer lingers. It takes heat to bring out the odor. \ A cold dip restores health in zest. A salt pickle tastes fine with ham. \ @@ -63,7 +63,13 @@ defmodule Xav.ReaderTest do end defp test_speech_to_text(path, expected_output) do - reader = Xav.Reader.new!(path, read: :audio) + reader = + Xav.Reader.new!(path, + read: :audio, + out_channels: 1, + out_format: :f32, + out_sample_rate: 16_000 + ) {:ok, whisper} = Bumblebee.load_model({:hf, "openai/whisper-tiny"}) {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})