Skip to content

Commit

Permalink
Add ability to specify output parameters (#9)
Browse files Browse the repository at this point in the history
  • Loading branch information
mickel8 authored Aug 7, 2024
1 parent dad836a commit 499bf2c
Show file tree
Hide file tree
Showing 18 changed files with 298 additions and 54 deletions.
10 changes: 9 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,13 @@ decoder = Xav.Decoder.new(:vp8)
{:ok, %Xav.Frame{} = frame} = Xav.Decoder.decode(decoder, <<"somebinary">>)
```

Decode with audio resampling:

```elixir
decoder = Xav.Decoder.new(:opus, out_format: :f32, out_sample_rate: 16_000)
{:ok, %Xav.Frame{} = frame} = Xav.Decoder.decode(decoder, <<"somebinary">>)
```

Read from a file:

```elixir
Expand All @@ -52,7 +59,8 @@ Kino.Image.new(tensor)
Speech to text:

```elixir
r = Xav.Reader.new!("sample.mp3", read: :audio)
# See https://hexdocs.pm/bumblebee/Bumblebee.Audio.WhisperFeaturizer.html for the featurizer's default sampling rate
r = Xav.Reader.new!("sample.mp3", read: :audio, out_format: :f32, out_channels: 1, out_sample_rate: 16_000)

{:ok, whisper} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
{:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
Expand Down
2 changes: 2 additions & 0 deletions c_src/xav/audio_converter.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,11 @@ int audio_converter_init(struct AudioConverter *c, struct ChannelLayout in_chlay
#if LIBAVUTIL_VERSION_MAJOR >= 58
av_opt_set_chlayout(c->swr_ctx, "in_chlayout", &in_chlayout.layout, 0);
av_opt_set_chlayout(c->swr_ctx, "out_chlayout", &out_chlayout.layout, 0);
c->out_channels = out_chlayout.layout.nb_channels;
#else
av_opt_set_channel_layout(c->swr_ctx, "in_channel_layout", in_chlayout.layout, 0);
av_opt_set_channel_layout(c->swr_ctx, "out_channel_layout", out_chlayout.layout, 0);
c->out_channels = av_get_channel_layout_nb_channels(out_chlayout.layout);
#endif

av_opt_set_int(c->swr_ctx, "in_sample_rate", in_sample_rate, 0);
Expand Down
1 change: 1 addition & 0 deletions c_src/xav/audio_converter.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ struct AudioConverter {
SwrContext *swr_ctx;
int64_t in_sample_rate;
int64_t out_sample_rate;
int64_t out_channels;
struct ChannelLayout out_chlayout;
enum AVSampleFormat out_sample_fmt;
};
Expand Down
73 changes: 67 additions & 6 deletions c_src/xav/xav_decoder.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ ErlNifResourceType *xav_decoder_resource_type;
static int init_audio_converter(struct XavDecoder *xav_decoder);

ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
if (argc != 1) {
if (argc != 4) {
return xav_nif_raise(env, "invalid_arg_count");
}

Expand All @@ -22,10 +22,34 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
return xav_nif_raise(env, "failed_to_get_atom");
}

unsigned int out_format_len;
if (!enif_get_atom_length(env, argv[1], &out_format_len, ERL_NIF_LATIN1)) {
return xav_nif_raise(env, "failed_to_get_atom_length");
}

char *out_format = (char *)XAV_ALLOC((out_format_len + 1) * sizeof(char *));

if (enif_get_atom(env, argv[1], out_format, out_format_len + 1, ERL_NIF_LATIN1) == 0) {
return xav_nif_raise(env, "failed_to_get_atom");
}

int out_sample_rate;
if (!enif_get_int(env, argv[2], &out_sample_rate)) {
return xav_nif_raise(env, "invalid_out_sample_rate");
}

int out_channels;
if (!enif_get_int(env, argv[3], &out_channels)) {
return xav_nif_raise(env, "invalid_out_channels");
}

struct XavDecoder *xav_decoder =
enif_alloc_resource(xav_decoder_resource_type, sizeof(struct XavDecoder));
xav_decoder->decoder = NULL;
xav_decoder->ac = NULL;
xav_decoder->out_format = out_format;
xav_decoder->out_sample_rate = out_sample_rate;
xav_decoder->out_channels = out_channels;

xav_decoder->decoder = decoder_alloc();
if (xav_decoder->decoder == NULL) {
Expand Down Expand Up @@ -120,6 +144,12 @@ ERL_NIF_TERM decode(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {

const char *out_format = av_get_sample_fmt_name(xav_decoder->ac->out_sample_fmt);

if (strcmp(out_format, "flt") == 0) {
out_format = "f32";
} else if (strcmp(out_format, "dbl") == 0) {
out_format = "f64";
}

frame_term = xav_nif_audio_frame_to_term(env, out_data, out_samples, out_size, out_format,
xav_decoder->decoder->frame->pts);

Expand All @@ -139,16 +169,47 @@ static int init_audio_converter(struct XavDecoder *xav_decoder) {
return -1;
}

int out_sample_rate = xav_decoder->decoder->c->sample_rate;
enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_FLT;
int out_sample_rate;
if (xav_decoder->out_sample_rate == 0) {
out_sample_rate = xav_decoder->decoder->c->sample_rate;
} else {
out_sample_rate = xav_decoder->out_sample_rate;
}

enum AVSampleFormat out_sample_fmt;
if (strcmp(xav_decoder->out_format, "u8") == 0) {
out_sample_fmt = AV_SAMPLE_FMT_U8;
} else if (strcmp(xav_decoder->out_format, "s16") == 0) {
out_sample_fmt = AV_SAMPLE_FMT_S16;
} else if (strcmp(xav_decoder->out_format, "s32") == 0) {
out_sample_fmt = AV_SAMPLE_FMT_S32;
} else if (strcmp(xav_decoder->out_format, "s64") == 0) {
out_sample_fmt = AV_SAMPLE_FMT_S64;
} else if (strcmp(xav_decoder->out_format, "f32") == 0) {
out_sample_fmt = AV_SAMPLE_FMT_FLT;
} else if (strcmp(xav_decoder->out_format, "f64") == 0) {
out_sample_fmt = AV_SAMPLE_FMT_DBL;
} else if (strcmp(xav_decoder->out_format, "nil") == 0) {
out_sample_fmt = av_get_alt_sample_fmt(xav_decoder->decoder->c->sample_fmt, 0);
} else {
return -1;
}

struct ChannelLayout in_chlayout, out_chlayout;
#if LIBAVUTIL_VERSION_MAJOR >= 58
in_chlayout.layout = xav_decoder->decoder->c->ch_layout;
out_chlayout.layout = xav_decoder->decoder->c->ch_layout;
if (xav_decoder->out_channels == 0) {
out_chlayout.layout = in_chlayout.layout;
} else {
av_channel_layout_default(&out_chlayout.layout, xav_decoder->out_channels);
}
#else
in_chlayout.layout = xav_decoder->decoder->c->channel_layout;
out_chlayout.layout = xav_decoder->decoder->c->channel_layout;
if (xav_decoder->out_channels == 0) {
out_chlayout.layout = in_chlayout.layout;
} else {
out_chlayout.layout = av_get_default_channel_layout(xav_decoder->out_channels);
}
#endif

return audio_converter_init(xav_decoder->ac, in_chlayout, xav_decoder->decoder->c->sample_rate,
Expand All @@ -168,7 +229,7 @@ void free_xav_decoder(ErlNifEnv *env, void *obj) {
}
}

static ErlNifFunc xav_funcs[] = {{"new", 1, new},
static ErlNifFunc xav_funcs[] = {{"new", 4, new},
{"decode", 4, decode, ERL_NIF_DIRTY_JOB_CPU_BOUND}};

static int load(ErlNifEnv *env, void **priv, ERL_NIF_TERM load_info) {
Expand Down
3 changes: 3 additions & 0 deletions c_src/xav/xav_decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,7 @@
// State stored in the decoder NIF resource; new() allocates it with
// enif_alloc_resource() and fills in the requested output parameters.
struct XavDecoder {
// Underlying decoder, obtained from decoder_alloc() in new().
struct Decoder *decoder;
// Audio converter passed to audio_converter_init(); set to NULL in new().
struct AudioConverter *ac;
// Requested output sample format, copied from the atom given to new():
// "u8", "s16", "s32", "s64", "f32", "f64", or "nil" to derive the format
// from the decoder's own sample format.
// NOTE(review): allocated with XAV_ALLOC in new() - confirm it is released
// when the resource is freed.
char *out_format;
// Requested output sample rate; 0 means keep the decoder's sample rate.
int out_sample_rate;
// Requested number of output channels; 0 means keep the input channel layout.
int out_channels;
};
90 changes: 81 additions & 9 deletions c_src/xav/xav_reader.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ static int init_audio_converter(struct XavReader *xav_reader);
ErlNifResourceType *xav_reader_resource_type;

ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
if (argc != 3) {
if (argc != 6) {
return xav_nif_raise(env, "invalid_arg_count");
}

Expand All @@ -31,10 +31,34 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
media_type = AVMEDIA_TYPE_AUDIO;
}

unsigned int out_format_len;
if (!enif_get_atom_length(env, argv[3], &out_format_len, ERL_NIF_LATIN1)) {
return xav_nif_raise(env, "failed_to_get_atom_length");
}

char *out_format = (char *)XAV_ALLOC((out_format_len + 1) * sizeof(char *));

if (enif_get_atom(env, argv[3], out_format, out_format_len + 1, ERL_NIF_LATIN1) == 0) {
return xav_nif_raise(env, "failed_to_get_atom");
}

int out_sample_rate;
if (!enif_get_int(env, argv[4], &out_sample_rate)) {
return xav_nif_raise(env, "invalid_out_sample_rate");
}

int out_channels;
if (!enif_get_int(env, argv[5], &out_channels)) {
return xav_nif_raise(env, "invalid_out_channels");
}

struct XavReader *xav_reader =
enif_alloc_resource(xav_reader_resource_type, sizeof(struct XavReader));
xav_reader->reader = NULL;
xav_reader->ac = NULL;
xav_reader->out_format = out_format;
xav_reader->out_sample_rate = out_sample_rate;
xav_reader->out_channels = out_channels;

xav_reader->reader = reader_alloc();
if (xav_reader->reader == NULL) {
Expand Down Expand Up @@ -65,13 +89,24 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
enif_release_resource(xav_reader);

if (xav_reader->reader->media_type == AVMEDIA_TYPE_AUDIO) {
ERL_NIF_TERM sample_rate_term = enif_make_int(env, xav_reader->reader->c->sample_rate);
ERL_NIF_TERM in_sample_rate_term = enif_make_int(env, xav_reader->reader->c->sample_rate);
ERL_NIF_TERM in_format_term =
enif_make_atom(env, av_get_sample_fmt_name(xav_reader->reader->c->sample_fmt));

#if LIBAVUTIL_VERSION_MAJOR >= 58
ERL_NIF_TERM in_channels_term =
enif_make_int(env, xav_reader->reader->c->ch_layout.nb_channels);
#else
ERL_NIF_TERM in_channels_term = enif_make_int(env, xav_reader->reader->c->channels);
#endif

ERL_NIF_TERM out_format_term =
enif_make_atom(env, av_get_sample_fmt_name(xav_reader->ac->out_sample_fmt));
return enif_make_tuple(env, 8, ok_term, xav_term, in_format_term, out_format_term,
sample_rate_term, bit_rate_term, duration_term, codec_term);
ERL_NIF_TERM out_sample_rate_term = enif_make_int(env, xav_reader->ac->out_sample_rate);
ERL_NIF_TERM out_channels_term = enif_make_int(env, xav_reader->ac->out_channels);
return enif_make_tuple(env, 11, ok_term, xav_term, in_format_term, out_format_term,
in_sample_rate_term, out_sample_rate_term, in_channels_term,
out_channels_term, bit_rate_term, duration_term, codec_term);

} else if (xav_reader->reader->media_type == AVMEDIA_TYPE_VIDEO) {
ERL_NIF_TERM in_format_term =
Expand Down Expand Up @@ -133,6 +168,12 @@ ERL_NIF_TERM next_frame(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {

const char *out_format = av_get_sample_fmt_name(xav_reader->ac->out_sample_fmt);

if (strcmp(out_format, "flt") == 0) {
out_format = "f32";
} else if (strcmp(out_format, "dbl") == 0) {
out_format = "f64";
}

frame_term = xav_nif_audio_frame_to_term(env, out_data, out_samples, out_size, out_format,
xav_reader->reader->frame->pts);
av_freep(&out_data[0]);
Expand All @@ -151,16 +192,42 @@ static int init_audio_converter(struct XavReader *xav_reader) {
return -1;
}

int out_sample_rate = 16000;
enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_FLT;
int out_sample_rate;
if (xav_reader->out_sample_rate == 0) {
out_sample_rate = xav_reader->reader->c->sample_rate;
} else {
out_sample_rate = xav_reader->out_sample_rate;
}

enum AVSampleFormat out_sample_fmt;
if (strcmp(xav_reader->out_format, "u8") == 0) {
out_sample_fmt = AV_SAMPLE_FMT_U8;
} else if (strcmp(xav_reader->out_format, "s16") == 0) {
out_sample_fmt = AV_SAMPLE_FMT_S16;
} else if (strcmp(xav_reader->out_format, "s32") == 0) {
out_sample_fmt = AV_SAMPLE_FMT_S32;
} else if (strcmp(xav_reader->out_format, "s64") == 0) {
out_sample_fmt = AV_SAMPLE_FMT_S64;
} else if (strcmp(xav_reader->out_format, "f32") == 0) {
out_sample_fmt = AV_SAMPLE_FMT_FLT;
} else if (strcmp(xav_reader->out_format, "f64") == 0) {
out_sample_fmt = AV_SAMPLE_FMT_DBL;
} else if (strcmp(xav_reader->out_format, "nil") == 0) {
out_sample_fmt = av_get_alt_sample_fmt(xav_reader->reader->c->sample_fmt, 0);
} else {
return -1;
}

struct ChannelLayout in_chlayout, out_chlayout;
#if LIBAVUTIL_VERSION_MAJOR >= 58
in_chlayout.layout = xav_reader->reader->c->ch_layout;
av_channel_layout_from_mask(&out_chlayout.layout, AV_CH_LAYOUT_MONO);
if (xav_reader->out_channels == 0) {
out_chlayout.layout = in_chlayout.layout;
} else {
av_channel_layout_default(&out_chlayout.layout, xav_reader->out_channels);
}
#else
in_chlayout.layout = xav_reader->reader->c->channel_layout;
out_chlayout.layout = AV_CH_LAYOUT_MONO;

if (xav_reader->reader->c->channel_layout == 0 && xav_reader->reader->c->channels > 0) {
// In newer FFmpeg versions, 0 means that the order of channels is
Expand All @@ -176,6 +243,11 @@ static int init_audio_converter(struct XavReader *xav_reader) {
return -1;
}

if (xav_reader->out_channels == 0) {
out_chlayout.layout = in_chlayout.layout;
} else {
out_chlayout.layout = av_get_default_channel_layout(xav_reader->out_channels);
}
#endif

return audio_converter_init(xav_reader->ac, in_chlayout, xav_reader->reader->c->sample_rate,
Expand All @@ -195,7 +267,7 @@ void free_xav_reader(ErlNifEnv *env, void *obj) {
}
}

static ErlNifFunc xav_funcs[] = {{"new", 3, new},
static ErlNifFunc xav_funcs[] = {{"new", 6, new},
{"next_frame", 1, next_frame, ERL_NIF_DIRTY_JOB_CPU_BOUND}};

static int load(ErlNifEnv *env, void **priv, ERL_NIF_TERM load_info) {
Expand Down
3 changes: 3 additions & 0 deletions c_src/xav/xav_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,7 @@
// State stored in the reader NIF resource; new() allocates it with
// enif_alloc_resource() and fills in the requested output parameters.
struct XavReader {
// Underlying reader, obtained from reader_alloc() in new().
struct Reader *reader;
// Audio converter passed to audio_converter_init(); set to NULL in new().
struct AudioConverter *ac;
// Requested output sample format, copied from the atom given to new():
// "u8", "s16", "s32", "s64", "f32", "f64", or "nil" to derive the format
// from the reader's codec sample format.
// NOTE(review): allocated with XAV_ALLOC in new() - confirm it is released
// when the resource is freed.
char *out_format;
// Requested output sample rate; 0 means keep the input stream's sample rate.
int out_sample_rate;
// Requested number of output channels; 0 means keep the input channel layout.
int out_channels;
};
54 changes: 47 additions & 7 deletions lib/decoder.ex
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,59 @@ defmodule Xav.Decoder do

@type t() :: reference()

@typedoc """
Opts that can be passed to `new/2`.
"""
@type opts :: [
out_format: Xav.Frame.format(),
out_sample_rate: integer(),
out_channels: integer()
]

@doc """
Creates a new decoder.
`opts` can be used to specify desired output parameters.
For example, to change the audio sample format, pass:
```elixir
[out_format: :f32]
```
Video frames are always returned in RGB format.
This setting cannot be changed.
Audio samples are always returned in packed form:
samples from different channels are interleaved in a single binary:
```
<<c10, c20, c30, c11, c21, c31, c12, c22, c32>>
```
The way in which samples are interleaved is not specified.
An alternative would be to return a list of binaries, where
each binary represents a different channel:
```
[
<<c10, c11, c12, c13, c14>>,
<<c20, c21, c22, c23, c24>>,
<<c30, c31, c32, c33, c34>>
]
```
"""
@spec new(codec()) :: t()
def new(codec) do
Xav.Decoder.NIF.new(codec)
@spec new(codec(), opts()) :: t()
def new(codec, opts \\ []) do
out_format = opts[:out_format]
out_sample_rate = opts[:out_sample_rate] || 0
out_channels = opts[:out_channels] || 0
Xav.Decoder.NIF.new(codec, out_format, out_sample_rate, out_channels)
end

@doc """
Decodes an audio or video frame.
Video frames are always in the RGB format.
Audio samples are always interleaved.
Decodes an audio/video frame.
"""
@spec decode(t(), binary(), pts: integer(), dts: integer()) ::
{:ok, Xav.Frame.t()} | {:error, atom()}
Expand Down
Loading

0 comments on commit 499bf2c

Please sign in to comment.