Add ability to specify output parameters (#9)

elixir-webrtc · Aug 7, 2024 · 499bf2c · 499bf2c
1 parent dad836a
commit 499bf2c
Show file tree

Hide file tree

Showing 18 changed files with 298 additions and 54 deletions.
diff --git a/README.md b/README.md
@@ -31,6 +31,13 @@ decoder = Xav.Decoder.new(:vp8)
 {:ok, %Xav.Frame{} = frame} = Xav.Decoder.decode(decoder, <<"somebinary">>)
 ```
 
+Decode with audio resampling:
+
+```elixir
+decoder = Xav.Decoder.new(:opus, out_format: :f32, out_sample_rate: 16_000)
+{:ok, %Xav.Frame{} = frame} = Xav.Decoder.decode(decoder, <<"somebinary">>)
+```
+
 Read from a file:
 
 ```elixir
@@ -52,7 +59,8 @@ Kino.Image.new(tensor)
 Speech to text:
 
 ```elixir
-r = Xav.Reader.new!("sample.mp3", read: :audio)
+# See https://hexdocs.pm/bumblebee/Bumblebee.Audio.WhisperFeaturizer.html for the default sampling rate
+r = Xav.Reader.new!("sample.mp3", read: :audio, out_format: :f32, out_channels: 1, out_sample_rate: 16_000)
 
 {:ok, whisper} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
 {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})

diff --git a/c_src/xav/audio_converter.c b/c_src/xav/audio_converter.c
@@ -28,9 +28,11 @@ int audio_converter_init(struct AudioConverter *c, struct ChannelLayout in_chlay
 #if LIBAVUTIL_VERSION_MAJOR >= 58
   av_opt_set_chlayout(c->swr_ctx, "in_chlayout", &in_chlayout.layout, 0);
   av_opt_set_chlayout(c->swr_ctx, "out_chlayout", &out_chlayout.layout, 0);
+  c->out_channels = out_chlayout.layout.nb_channels;
 #else
   av_opt_set_channel_layout(c->swr_ctx, "in_channel_layout", in_chlayout.layout, 0);
   av_opt_set_channel_layout(c->swr_ctx, "out_channel_layout", out_chlayout.layout, 0);
+  c->out_channels = av_get_channel_layout_nb_channels(out_chlayout.layout);
 #endif
 
   av_opt_set_int(c->swr_ctx, "in_sample_rate", in_sample_rate, 0);

diff --git a/c_src/xav/audio_converter.h b/c_src/xav/audio_converter.h
@@ -10,6 +10,7 @@ struct AudioConverter {
   SwrContext *swr_ctx;
   int64_t in_sample_rate;
   int64_t out_sample_rate;
+  int64_t out_channels;
   struct ChannelLayout out_chlayout;
   enum AVSampleFormat out_sample_fmt;
 };

diff --git a/c_src/xav/xav_decoder.c b/c_src/xav/xav_decoder.c
@@ -7,7 +7,7 @@ ErlNifResourceType *xav_decoder_resource_type;
 static int init_audio_converter(struct XavDecoder *xav_decoder);
 
 ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
-  if (argc != 1) {
+  if (argc != 4) {
     return xav_nif_raise(env, "invalid_arg_count");
   }
 
@@ -22,10 +22,34 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
     return xav_nif_raise(env, "failed_to_get_atom");
   }
 
+  unsigned int out_format_len;
+  if (!enif_get_atom_length(env, argv[1], &out_format_len, ERL_NIF_LATIN1)) {
+    return xav_nif_raise(env, "failed_to_get_atom_length");
+  }
+
+  char *out_format = (char *)XAV_ALLOC((out_format_len + 1) * sizeof(char *));
+
+  if (enif_get_atom(env, argv[1], out_format, out_format_len + 1, ERL_NIF_LATIN1) == 0) {
+    return xav_nif_raise(env, "failed_to_get_atom");
+  }
+
+  int out_sample_rate;
+  if (!enif_get_int(env, argv[2], &out_sample_rate)) {
+    return xav_nif_raise(env, "invalid_out_sample_rate");
+  }
+
+  int out_channels;
+  if (!enif_get_int(env, argv[3], &out_channels)) {
+    return xav_nif_raise(env, "invalid_out_channels");
+  }
+
   struct XavDecoder *xav_decoder =
       enif_alloc_resource(xav_decoder_resource_type, sizeof(struct XavDecoder));
   xav_decoder->decoder = NULL;
   xav_decoder->ac = NULL;
+  xav_decoder->out_format = out_format;
+  xav_decoder->out_sample_rate = out_sample_rate;
+  xav_decoder->out_channels = out_channels;
 
   xav_decoder->decoder = decoder_alloc();
   if (xav_decoder->decoder == NULL) {
@@ -120,6 +144,12 @@ ERL_NIF_TERM decode(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
 
     const char *out_format = av_get_sample_fmt_name(xav_decoder->ac->out_sample_fmt);
 
+    if (strcmp(out_format, "flt") == 0) {
+      out_format = "f32";
+    } else if (strcmp(out_format, "dbl") == 0) {
+      out_format = "f64";
+    }
+
     frame_term = xav_nif_audio_frame_to_term(env, out_data, out_samples, out_size, out_format,
                                              xav_decoder->decoder->frame->pts);
 
@@ -139,16 +169,47 @@ static int init_audio_converter(struct XavDecoder *xav_decoder) {
     return -1;
   }
 
-  int out_sample_rate = xav_decoder->decoder->c->sample_rate;
-  enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_FLT;
+  int out_sample_rate;
+  if (xav_decoder->out_sample_rate == 0) {
+    out_sample_rate = xav_decoder->decoder->c->sample_rate;
+  } else {
+    out_sample_rate = xav_decoder->out_sample_rate;
+  }
+
+  enum AVSampleFormat out_sample_fmt;
+  if (strcmp(xav_decoder->out_format, "u8") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_U8;
+  } else if (strcmp(xav_decoder->out_format, "s16") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_S16;
+  } else if (strcmp(xav_decoder->out_format, "s32") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_S32;
+  } else if (strcmp(xav_decoder->out_format, "s64") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_S64;
+  } else if (strcmp(xav_decoder->out_format, "f32") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_FLT;
+  } else if (strcmp(xav_decoder->out_format, "f64") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_DBL;
+  } else if (strcmp(xav_decoder->out_format, "nil") == 0) {
+    out_sample_fmt = av_get_alt_sample_fmt(xav_decoder->decoder->c->sample_fmt, 0);
+  } else {
+    return -1;
+  }
 
   struct ChannelLayout in_chlayout, out_chlayout;
 #if LIBAVUTIL_VERSION_MAJOR >= 58
   in_chlayout.layout = xav_decoder->decoder->c->ch_layout;
-  out_chlayout.layout = xav_decoder->decoder->c->ch_layout;
+  if (xav_decoder->out_channels == 0) {
+    out_chlayout.layout = in_chlayout.layout;
+  } else {
+    av_channel_layout_default(&out_chlayout.layout, xav_decoder->out_channels);
+  }
 #else
   in_chlayout.layout = xav_decoder->decoder->c->channel_layout;
-  out_chlayout.layout = xav_decoder->decoder->c->channel_layout;
+  if (xav_decoder->out_channels == 0) {
+    out_chlayout.layout = in_chlayout.layout;
+  } else {
+    out_chlayout.layout = av_get_default_channel_layout(xav_decoder->out_channels);
+  }
 #endif
 
   return audio_converter_init(xav_decoder->ac, in_chlayout, xav_decoder->decoder->c->sample_rate,
@@ -168,7 +229,7 @@ void free_xav_decoder(ErlNifEnv *env, void *obj) {
   }
 }
 
-static ErlNifFunc xav_funcs[] = {{"new", 1, new},
+static ErlNifFunc xav_funcs[] = {{"new", 4, new},
                                  {"decode", 4, decode, ERL_NIF_DIRTY_JOB_CPU_BOUND}};
 
 static int load(ErlNifEnv *env, void **priv, ERL_NIF_TERM load_info) {

diff --git a/c_src/xav/xav_decoder.h b/c_src/xav/xav_decoder.h
@@ -4,4 +4,7 @@
 struct XavDecoder {
   struct Decoder *decoder;
   struct AudioConverter *ac;
+  char *out_format;
+  int out_sample_rate;
+  int out_channels;
 };
diff --git a/c_src/xav/xav_reader.c b/c_src/xav/xav_reader.c
@@ -5,7 +5,7 @@ static int init_audio_converter(struct XavReader *xav_reader);
 ErlNifResourceType *xav_reader_resource_type;
 
 ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
-  if (argc != 3) {
+  if (argc != 6) {
     return xav_nif_raise(env, "invalid_arg_count");
   }
 
@@ -31,10 +31,34 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
     media_type = AVMEDIA_TYPE_AUDIO;
   }
 
+  unsigned int out_format_len;
+  if (!enif_get_atom_length(env, argv[3], &out_format_len, ERL_NIF_LATIN1)) {
+    return xav_nif_raise(env, "failed_to_get_atom_length");
+  }
+
+  char *out_format = (char *)XAV_ALLOC((out_format_len + 1) * sizeof(char *));
+
+  if (enif_get_atom(env, argv[3], out_format, out_format_len + 1, ERL_NIF_LATIN1) == 0) {
+    return xav_nif_raise(env, "failed_to_get_atom");
+  }
+
+  int out_sample_rate;
+  if (!enif_get_int(env, argv[4], &out_sample_rate)) {
+    return xav_nif_raise(env, "invalid_out_sample_rate");
+  }
+
+  int out_channels;
+  if (!enif_get_int(env, argv[5], &out_channels)) {
+    return xav_nif_raise(env, "invalid_out_channels");
+  }
+
   struct XavReader *xav_reader =
       enif_alloc_resource(xav_reader_resource_type, sizeof(struct XavReader));
   xav_reader->reader = NULL;
   xav_reader->ac = NULL;
+  xav_reader->out_format = out_format;
+  xav_reader->out_sample_rate = out_sample_rate;
+  xav_reader->out_channels = out_channels;
 
   xav_reader->reader = reader_alloc();
   if (xav_reader->reader == NULL) {
@@ -65,13 +89,24 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
   enif_release_resource(xav_reader);
 
   if (xav_reader->reader->media_type == AVMEDIA_TYPE_AUDIO) {
-    ERL_NIF_TERM sample_rate_term = enif_make_int(env, xav_reader->reader->c->sample_rate);
+    ERL_NIF_TERM in_sample_rate_term = enif_make_int(env, xav_reader->reader->c->sample_rate);
     ERL_NIF_TERM in_format_term =
         enif_make_atom(env, av_get_sample_fmt_name(xav_reader->reader->c->sample_fmt));
+
+#if LIBAVUTIL_VERSION_MAJOR >= 58
+    ERL_NIF_TERM in_channels_term =
+        enif_make_int(env, xav_reader->reader->c->ch_layout.nb_channels);
+#else
+    ERL_NIF_TERM in_channels_term = enif_make_int(env, xav_reader->reader->c->channels);
+#endif
+
     ERL_NIF_TERM out_format_term =
         enif_make_atom(env, av_get_sample_fmt_name(xav_reader->ac->out_sample_fmt));
-    return enif_make_tuple(env, 8, ok_term, xav_term, in_format_term, out_format_term,
-                           sample_rate_term, bit_rate_term, duration_term, codec_term);
+    ERL_NIF_TERM out_sample_rate_term = enif_make_int(env, xav_reader->ac->out_sample_rate);
+    ERL_NIF_TERM out_channels_term = enif_make_int(env, xav_reader->ac->out_channels);
+    return enif_make_tuple(env, 11, ok_term, xav_term, in_format_term, out_format_term,
+                           in_sample_rate_term, out_sample_rate_term, in_channels_term,
+                           out_channels_term, bit_rate_term, duration_term, codec_term);
 
   } else if (xav_reader->reader->media_type == AVMEDIA_TYPE_VIDEO) {
     ERL_NIF_TERM in_format_term =
@@ -133,6 +168,12 @@ ERL_NIF_TERM next_frame(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
 
     const char *out_format = av_get_sample_fmt_name(xav_reader->ac->out_sample_fmt);
 
+    if (strcmp(out_format, "flt") == 0) {
+      out_format = "f32";
+    } else if (strcmp(out_format, "dbl") == 0) {
+      out_format = "f64";
+    }
+
     frame_term = xav_nif_audio_frame_to_term(env, out_data, out_samples, out_size, out_format,
                                              xav_reader->reader->frame->pts);
     av_freep(&out_data[0]);
@@ -151,16 +192,42 @@ static int init_audio_converter(struct XavReader *xav_reader) {
     return -1;
   }
 
-  int out_sample_rate = 16000;
-  enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_FLT;
+  int out_sample_rate;
+  if (xav_reader->out_sample_rate == 0) {
+    out_sample_rate = xav_reader->reader->c->sample_rate;
+  } else {
+    out_sample_rate = xav_reader->out_sample_rate;
+  }
+
+  enum AVSampleFormat out_sample_fmt;
+  if (strcmp(xav_reader->out_format, "u8") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_U8;
+  } else if (strcmp(xav_reader->out_format, "s16") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_S16;
+  } else if (strcmp(xav_reader->out_format, "s32") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_S32;
+  } else if (strcmp(xav_reader->out_format, "s64") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_S64;
+  } else if (strcmp(xav_reader->out_format, "f32") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_FLT;
+  } else if (strcmp(xav_reader->out_format, "f64") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_DBL;
+  } else if (strcmp(xav_reader->out_format, "nil") == 0) {
+    out_sample_fmt = av_get_alt_sample_fmt(xav_reader->reader->c->sample_fmt, 0);
+  } else {
+    return -1;
+  }
 
   struct ChannelLayout in_chlayout, out_chlayout;
 #if LIBAVUTIL_VERSION_MAJOR >= 58
   in_chlayout.layout = xav_reader->reader->c->ch_layout;
-  av_channel_layout_from_mask(&out_chlayout.layout, AV_CH_LAYOUT_MONO);
+  if (xav_reader->out_channels == 0) {
+    out_chlayout.layout = in_chlayout.layout;
+  } else {
+    av_channel_layout_default(&out_chlayout.layout, xav_reader->out_channels);
+  }
 #else
   in_chlayout.layout = xav_reader->reader->c->channel_layout;
-  out_chlayout.layout = AV_CH_LAYOUT_MONO;
 
   if (xav_reader->reader->c->channel_layout == 0 && xav_reader->reader->c->channels > 0) {
     // In newer FFmpeg versions, 0 means that the order of channels is
@@ -176,6 +243,11 @@ static int init_audio_converter(struct XavReader *xav_reader) {
     return -1;
   }
 
+  if (xav_reader->out_channels == 0) {
+    out_chlayout.layout = in_chlayout.layout;
+  } else {
+    out_chlayout.layout = av_get_default_channel_layout(xav_reader->out_channels);
+  }
 #endif
 
   return audio_converter_init(xav_reader->ac, in_chlayout, xav_reader->reader->c->sample_rate,
@@ -195,7 +267,7 @@ void free_xav_reader(ErlNifEnv *env, void *obj) {
   }
 }
 
-static ErlNifFunc xav_funcs[] = {{"new", 3, new},
+static ErlNifFunc xav_funcs[] = {{"new", 6, new},
                                  {"next_frame", 1, next_frame, ERL_NIF_DIRTY_JOB_CPU_BOUND}};
 
 static int load(ErlNifEnv *env, void **priv, ERL_NIF_TERM load_info) {

diff --git a/c_src/xav/xav_reader.h b/c_src/xav/xav_reader.h
@@ -4,4 +4,7 @@
 struct XavReader {
   struct Reader *reader;
   struct AudioConverter *ac;
+  char *out_format;
+  int out_sample_rate;
+  int out_channels;
 };
diff --git a/lib/decoder.ex b/lib/decoder.ex
@@ -10,19 +10,59 @@ defmodule Xav.Decoder do
 
   @type t() :: reference()
 
+  @typedoc """
+  Opts that can be passed to `new/2`.
+  """
+  @type opts :: [
+          out_format: Xav.Frame.format(),
+          out_sample_rate: integer(),
+          out_channels: integer()
+        ]
+
   @doc """
   Creates a new decoder.
+
+  `opts` can be used to specify desired output parameters.
+
+  E.g. if you want to change the audio sample format, just pass:
+
+  ```elixir
+  [out_format: :f32]
+  ```
+
+  Video frames are always returned in RGB format.
+  This setting cannot be changed.
+
+  Audio samples are always returned in packed form, i.e.
+  samples from different channels are interleaved in a single binary:
+
+  ```
+  <<c10, c20, c30, c11, c21, c31, c12, c22, c32>>
+  ```
+
+  The way in which samples are interleaved is not specified.
+
+  An alternative would be to return a list of binaries, where
+  each binary represents a different channel:
+
+  ```
+  [
+    <<c10, c11, c12, c13, c14>>,
+    <<c20, c21, c22, c23, c24>>,
+    <<c30, c31, c32, c33, c34>>
+  ]
+  ```
   """
-  @spec new(codec()) :: t()
-  def new(codec) do
-    Xav.Decoder.NIF.new(codec)
+  @spec new(codec(), opts()) :: t()
+  def new(codec, opts \\ []) do
+    out_format = opts[:out_format]
+    out_sample_rate = opts[:out_sample_rate] || 0
+    out_channels = opts[:out_channels] || 0
+    Xav.Decoder.NIF.new(codec, out_format, out_sample_rate, out_channels)
   end
 
   @doc """
-  Decodes an audio or video frame.
-
-  Video frames are always in the RGB format.
-  Audio samples are always interleaved.
+  Decodes an audio/video frame.
   """
   @spec decode(t(), binary(), pts: integer(), dts: integer()) ::
           {:ok, Xav.Frame.t()} | {:error, atom()}