diff --git a/README.md b/README.md
index 8a8c342..c1d24be 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,13 @@ decoder = Xav.Decoder.new(:vp8)
 {:ok, %Xav.Frame{} = frame} = Xav.Decoder.decode(decoder, <<"somebinary">>)
 ```
 
+Decode with audio resampling:
+
+```elixir
+decoder = Xav.Decoder.new(:opus, out_format: :f32, out_sample_rate: 16_000)
+{:ok, %Xav.Frame{} = frame} = Xav.Decoder.decode(decoder, <<"somebinary">>)
+```
+
 Read from a file:
 
 ```elixir
@@ -52,7 +59,8 @@ Kino.Image.new(tensor)
 Speech to text:
 
 ```elixir
-r = Xav.Reader.new!("sample.mp3", read: :audio)
+# See https://hexdocs.pm/bumblebee/Bumblebee.Audio.WhisperFeaturizer.html for the default sampling rate
+r = Xav.Reader.new!("sample.mp3", read: :audio, out_format: :f32, out_channels: 1, out_sample_rate: 16_000)
 
 {:ok, whisper} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
 {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
diff --git a/c_src/xav/audio_converter.c b/c_src/xav/audio_converter.c
index 98c1926..f98aadd 100644
--- a/c_src/xav/audio_converter.c
+++ b/c_src/xav/audio_converter.c
@@ -28,9 +28,11 @@ int audio_converter_init(struct AudioConverter *c, struct ChannelLayout in_chlay
 #if LIBAVUTIL_VERSION_MAJOR >= 58
   av_opt_set_chlayout(c->swr_ctx, "in_chlayout", &in_chlayout.layout, 0);
   av_opt_set_chlayout(c->swr_ctx, "out_chlayout", &out_chlayout.layout, 0);
+  c->out_channels = out_chlayout.layout.nb_channels;
 #else
   av_opt_set_channel_layout(c->swr_ctx, "in_channel_layout", in_chlayout.layout, 0);
   av_opt_set_channel_layout(c->swr_ctx, "out_channel_layout", out_chlayout.layout, 0);
+  c->out_channels = av_get_channel_layout_nb_channels(out_chlayout.layout);
 #endif
 
   av_opt_set_int(c->swr_ctx, "in_sample_rate", in_sample_rate, 0);
diff --git a/c_src/xav/audio_converter.h b/c_src/xav/audio_converter.h
index 7912072..a55803b 100644
--- a/c_src/xav/audio_converter.h
+++ b/c_src/xav/audio_converter.h
@@ -10,6 +10,7 @@ struct AudioConverter {
   SwrContext *swr_ctx;
   int64_t in_sample_rate;
   int64_t out_sample_rate;
+  int64_t out_channels;
   struct ChannelLayout out_chlayout;
   enum AVSampleFormat out_sample_fmt;
 };
diff --git a/c_src/xav/xav_decoder.c b/c_src/xav/xav_decoder.c
index 5a9c60c..8332a72 100644
--- a/c_src/xav/xav_decoder.c
+++ b/c_src/xav/xav_decoder.c
@@ -7,7 +7,7 @@ ErlNifResourceType *xav_decoder_resource_type;
 static int init_audio_converter(struct XavDecoder *xav_decoder);
 
 ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
-  if (argc != 1) {
+  if (argc != 4) {
     return xav_nif_raise(env, "invalid_arg_count");
   }
 
@@ -22,10 +22,34 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
     return xav_nif_raise(env, "failed_to_get_atom");
   }
 
+  unsigned int out_format_len;
+  if (!enif_get_atom_length(env, argv[1], &out_format_len, ERL_NIF_LATIN1)) {
+    return xav_nif_raise(env, "failed_to_get_atom_length");
+  }
+
+  char *out_format = (char *)XAV_ALLOC((out_format_len + 1) * sizeof(char *));
+
+  if (enif_get_atom(env, argv[1], out_format, out_format_len + 1, ERL_NIF_LATIN1) == 0) {
+    return xav_nif_raise(env, "failed_to_get_atom");
+  }
+
+  int out_sample_rate;
+  if (!enif_get_int(env, argv[2], &out_sample_rate)) {
+    return xav_nif_raise(env, "invalid_out_sample_rate");
+  }
+
+  int out_channels;
+  if (!enif_get_int(env, argv[3], &out_channels)) {
+    return xav_nif_raise(env, "invalid_out_channels");
+  }
+
   struct XavDecoder *xav_decoder =
       enif_alloc_resource(xav_decoder_resource_type, sizeof(struct XavDecoder));
   xav_decoder->decoder = NULL;
   xav_decoder->ac = NULL;
+  xav_decoder->out_format = out_format;
+  xav_decoder->out_sample_rate = out_sample_rate;
+  xav_decoder->out_channels = out_channels;
 
   xav_decoder->decoder = decoder_alloc();
   if (xav_decoder->decoder == NULL) {
@@ -120,6 +144,12 @@ ERL_NIF_TERM decode(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
 
     const char *out_format = av_get_sample_fmt_name(xav_decoder->ac->out_sample_fmt);
 
+    if (strcmp(out_format, "flt") == 0) {
+      out_format = "f32";
+    } else if (strcmp(out_format, "dbl") == 0) {
+      out_format = "f64";
+    }
+
     frame_term = xav_nif_audio_frame_to_term(env, out_data, out_samples, out_size, out_format,
                                              xav_decoder->decoder->frame->pts);
 
@@ -139,16 +169,47 @@ static int init_audio_converter(struct XavDecoder *xav_decoder) {
     return -1;
   }
 
-  int out_sample_rate = xav_decoder->decoder->c->sample_rate;
-  enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_FLT;
+  int out_sample_rate;
+  if (xav_decoder->out_sample_rate == 0) {
+    out_sample_rate = xav_decoder->decoder->c->sample_rate;
+  } else {
+    out_sample_rate = xav_decoder->out_sample_rate;
+  }
+
+  enum AVSampleFormat out_sample_fmt;
+  if (strcmp(xav_decoder->out_format, "u8") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_U8;
+  } else if (strcmp(xav_decoder->out_format, "s16") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_S16;
+  } else if (strcmp(xav_decoder->out_format, "s32") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_S32;
+  } else if (strcmp(xav_decoder->out_format, "s64") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_S64;
+  } else if (strcmp(xav_decoder->out_format, "f32") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_FLT;
+  } else if (strcmp(xav_decoder->out_format, "f64") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_DBL;
+  } else if (strcmp(xav_decoder->out_format, "nil") == 0) {
+    out_sample_fmt = av_get_alt_sample_fmt(xav_decoder->decoder->c->sample_fmt, 0);
+  } else {
+    return -1;
+  }
 
   struct ChannelLayout in_chlayout, out_chlayout;
 #if LIBAVUTIL_VERSION_MAJOR >= 58
   in_chlayout.layout = xav_decoder->decoder->c->ch_layout;
-  out_chlayout.layout = xav_decoder->decoder->c->ch_layout;
+  if (xav_decoder->out_channels == 0) {
+    out_chlayout.layout = in_chlayout.layout;
+  } else {
+    av_channel_layout_default(&out_chlayout.layout, xav_decoder->out_channels);
+  }
 #else
   in_chlayout.layout = xav_decoder->decoder->c->channel_layout;
-  out_chlayout.layout = xav_decoder->decoder->c->channel_layout;
+  if (xav_decoder->out_channels == 0) {
+    out_chlayout.layout = in_chlayout.layout;
+  } else {
+    out_chlayout.layout = av_get_default_channel_layout(xav_decoder->out_channels);
+  }
 #endif
 
   return audio_converter_init(xav_decoder->ac, in_chlayout, xav_decoder->decoder->c->sample_rate,
@@ -168,7 +229,7 @@ void free_xav_decoder(ErlNifEnv *env, void *obj) {
   }
 }
 
-static ErlNifFunc xav_funcs[] = {{"new", 1, new},
+static ErlNifFunc xav_funcs[] = {{"new", 4, new},
                                  {"decode", 4, decode, ERL_NIF_DIRTY_JOB_CPU_BOUND}};
 
 static int load(ErlNifEnv *env, void **priv, ERL_NIF_TERM load_info) {
diff --git a/c_src/xav/xav_decoder.h b/c_src/xav/xav_decoder.h
index eb263ea..08e01fe 100644
--- a/c_src/xav/xav_decoder.h
+++ b/c_src/xav/xav_decoder.h
@@ -4,4 +4,7 @@
 struct XavDecoder {
   struct Decoder *decoder;
   struct AudioConverter *ac;
+  char *out_format;
+  int out_sample_rate;
+  int out_channels;
 };
\ No newline at end of file
diff --git a/c_src/xav/xav_reader.c b/c_src/xav/xav_reader.c
index d4bab4b..e27b40d 100644
--- a/c_src/xav/xav_reader.c
+++ b/c_src/xav/xav_reader.c
@@ -5,7 +5,7 @@ static int init_audio_converter(struct XavReader *xav_reader);
 ErlNifResourceType *xav_reader_resource_type;
 
 ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
-  if (argc != 3) {
+  if (argc != 6) {
     return xav_nif_raise(env, "invalid_arg_count");
   }
 
@@ -31,10 +31,34 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
     media_type = AVMEDIA_TYPE_AUDIO;
   }
 
+  unsigned int out_format_len;
+  if (!enif_get_atom_length(env, argv[3], &out_format_len, ERL_NIF_LATIN1)) {
+    return xav_nif_raise(env, "failed_to_get_atom_length");
+  }
+
+  char *out_format = (char *)XAV_ALLOC((out_format_len + 1) * sizeof(char *));
+
+  if (enif_get_atom(env, argv[3], out_format, out_format_len + 1, ERL_NIF_LATIN1) == 0) {
+    return xav_nif_raise(env, "failed_to_get_atom");
+  }
+
+  int out_sample_rate;
+  if (!enif_get_int(env, argv[4], &out_sample_rate)) {
+    return xav_nif_raise(env, "invalid_out_sample_rate");
+  }
+
+  int out_channels;
+  if (!enif_get_int(env, argv[5], &out_channels)) {
+    return xav_nif_raise(env, "invalid_out_channels");
+  }
+
   struct XavReader *xav_reader =
       enif_alloc_resource(xav_reader_resource_type, sizeof(struct XavReader));
   xav_reader->reader = NULL;
   xav_reader->ac = NULL;
+  xav_reader->out_format = out_format;
+  xav_reader->out_sample_rate = out_sample_rate;
+  xav_reader->out_channels = out_channels;
 
   xav_reader->reader = reader_alloc();
   if (xav_reader->reader == NULL) {
@@ -65,13 +89,24 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
   enif_release_resource(xav_reader);
 
   if (xav_reader->reader->media_type == AVMEDIA_TYPE_AUDIO) {
-    ERL_NIF_TERM sample_rate_term = enif_make_int(env, xav_reader->reader->c->sample_rate);
+    ERL_NIF_TERM in_sample_rate_term = enif_make_int(env, xav_reader->reader->c->sample_rate);
     ERL_NIF_TERM in_format_term =
         enif_make_atom(env, av_get_sample_fmt_name(xav_reader->reader->c->sample_fmt));
+
+#if LIBAVUTIL_VERSION_MAJOR >= 58
+    ERL_NIF_TERM in_channels_term =
+        enif_make_int(env, xav_reader->reader->c->ch_layout.nb_channels);
+#else
+    ERL_NIF_TERM in_channels_term = enif_make_int(env, xav_reader->reader->c->channels);
+#endif
+
     ERL_NIF_TERM out_format_term =
         enif_make_atom(env, av_get_sample_fmt_name(xav_reader->ac->out_sample_fmt));
-    return enif_make_tuple(env, 8, ok_term, xav_term, in_format_term, out_format_term,
-                           sample_rate_term, bit_rate_term, duration_term, codec_term);
+    ERL_NIF_TERM out_sample_rate_term = enif_make_int(env, xav_reader->ac->out_sample_rate);
+    ERL_NIF_TERM out_channels_term = enif_make_int(env, xav_reader->ac->out_channels);
+    return enif_make_tuple(env, 11, ok_term, xav_term, in_format_term, out_format_term,
+                           in_sample_rate_term, out_sample_rate_term, in_channels_term,
+                           out_channels_term, bit_rate_term, duration_term, codec_term);
 
   } else if (xav_reader->reader->media_type == AVMEDIA_TYPE_VIDEO) {
     ERL_NIF_TERM in_format_term =
@@ -133,6 +168,12 @@ ERL_NIF_TERM next_frame(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
 
     const char *out_format = av_get_sample_fmt_name(xav_reader->ac->out_sample_fmt);
 
+    if (strcmp(out_format, "flt") == 0) {
+      out_format = "f32";
+    } else if (strcmp(out_format, "dbl") == 0) {
+      out_format = "f64";
+    }
+
     frame_term = xav_nif_audio_frame_to_term(env, out_data, out_samples, out_size, out_format,
                                              xav_reader->reader->frame->pts);
     av_freep(&out_data[0]);
@@ -151,16 +192,42 @@ static int init_audio_converter(struct XavReader *xav_reader) {
     return -1;
   }
 
-  int out_sample_rate = 16000;
-  enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_FLT;
+  int out_sample_rate;
+  if (xav_reader->out_sample_rate == 0) {
+    out_sample_rate = xav_reader->reader->c->sample_rate;
+  } else {
+    out_sample_rate = xav_reader->out_sample_rate;
+  }
+
+  enum AVSampleFormat out_sample_fmt;
+  if (strcmp(xav_reader->out_format, "u8") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_U8;
+  } else if (strcmp(xav_reader->out_format, "s16") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_S16;
+  } else if (strcmp(xav_reader->out_format, "s32") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_S32;
+  } else if (strcmp(xav_reader->out_format, "s64") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_S64;
+  } else if (strcmp(xav_reader->out_format, "f32") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_FLT;
+  } else if (strcmp(xav_reader->out_format, "f64") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_DBL;
+  } else if (strcmp(xav_reader->out_format, "nil") == 0) {
+    out_sample_fmt = av_get_alt_sample_fmt(xav_reader->reader->c->sample_fmt, 0);
+  } else {
+    return -1;
+  }
 
   struct ChannelLayout in_chlayout, out_chlayout;
 #if LIBAVUTIL_VERSION_MAJOR >= 58
   in_chlayout.layout = xav_reader->reader->c->ch_layout;
-  av_channel_layout_from_mask(&out_chlayout.layout, AV_CH_LAYOUT_MONO);
+  if (xav_reader->out_channels == 0) {
+    out_chlayout.layout = in_chlayout.layout;
+  } else {
+    av_channel_layout_default(&out_chlayout.layout, xav_reader->out_channels);
+  }
 #else
   in_chlayout.layout = xav_reader->reader->c->channel_layout;
-  out_chlayout.layout = AV_CH_LAYOUT_MONO;
 
   if (xav_reader->reader->c->channel_layout == 0 && xav_reader->reader->c->channels > 0) {
     // In newer FFmpeg versions, 0 means that the order of channels is
@@ -176,6 +243,11 @@ static int init_audio_converter(struct XavReader *xav_reader) {
     return -1;
   }
 
+  if (xav_reader->out_channels == 0) {
+    out_chlayout.layout = in_chlayout.layout;
+  } else {
+    out_chlayout.layout = av_get_default_channel_layout(xav_reader->out_channels);
+  }
 #endif
 
   return audio_converter_init(xav_reader->ac, in_chlayout, xav_reader->reader->c->sample_rate,
@@ -195,7 +267,7 @@ void free_xav_reader(ErlNifEnv *env, void *obj) {
   }
 }
 
-static ErlNifFunc xav_funcs[] = {{"new", 3, new},
+static ErlNifFunc xav_funcs[] = {{"new", 6, new},
                                  {"next_frame", 1, next_frame, ERL_NIF_DIRTY_JOB_CPU_BOUND}};
 
 static int load(ErlNifEnv *env, void **priv, ERL_NIF_TERM load_info) {
diff --git a/c_src/xav/xav_reader.h b/c_src/xav/xav_reader.h
index 08ee17f..610bb0a 100644
--- a/c_src/xav/xav_reader.h
+++ b/c_src/xav/xav_reader.h
@@ -4,4 +4,7 @@
 struct XavReader {
   struct Reader *reader;
   struct AudioConverter *ac;
+  char *out_format;
+  int out_sample_rate;
+  int out_channels;
 };
\ No newline at end of file
diff --git a/lib/decoder.ex b/lib/decoder.ex
index 7ac80bb..e6be460 100644
--- a/lib/decoder.ex
+++ b/lib/decoder.ex
@@ -10,19 +10,59 @@ defmodule Xav.Decoder do
 
   @type t() :: reference()
 
+  @typedoc """
+  Opts that can be passed to `new/2`.
+  """
+  @type opts :: [
+          out_format: Xav.Frame.format(),
+          out_sample_rate: integer(),
+          out_channels: integer()
+        ]
+
   @doc """
   Creates a new decoder.
+
+  `opts` can be used to specify desired output parameters.
+
+  E.g. if you want to change the audio sample format, just pass:
+
+  ```elixir
+  [out_format: :f32]
+  ```
+
+  Video frames are always returned in RGB format.
+  This setting cannot be changed.
+
+  Audio samples are always in the packed form -
+  samples from different channels are interleaved in the same, single binary:
+
+  ```
+  <<c10, c20, c30, c11, c21, c31, c12, c22, c32>>
+  ```
+
+  The way in which samples are interleaved is not specified.
+
+  An alternative would be to return a list of binaries, where
+  each binary represents a different channel:
+
+  ```
+  [
+    <<c10, c11, c12, c13, c14>>,
+    <<c20, c21, c22, c23, c24>>,
+    <<c30, c31, c32, c33, c34>>
+  ]
+  ```
   """
-  @spec new(codec()) :: t()
-  def new(codec) do
-    Xav.Decoder.NIF.new(codec)
+  @spec new(codec(), opts()) :: t()
+  def new(codec, opts \\ []) do
+    out_format = opts[:out_format]
+    out_sample_rate = opts[:out_sample_rate] || 0
+    out_channels = opts[:out_channels] || 0
+    Xav.Decoder.NIF.new(codec, out_format, out_sample_rate, out_channels)
   end
 
   @doc """
-  Decodes an audio or video frame.
-
-  Video frames are always in the RGB format.
-  Audio samples are always interleaved.
+  Decodes an audio/video frame.
   """
   @spec decode(t(), binary(), pts: integer(), dts: integer()) ::
           {:ok, Xav.Frame.t()} | {:error, atom()}
diff --git a/lib/decoder_nif.ex b/lib/decoder_nif.ex
index 6cad870..df6751c 100644
--- a/lib/decoder_nif.ex
+++ b/lib/decoder_nif.ex
@@ -8,7 +8,7 @@ defmodule Xav.Decoder.NIF do
     :ok = :erlang.load_nif(path, 0)
   end
 
-  def new(_codec), do: :erlang.nif_error(:undef)
+  def new(_codec, _out_format, _out_sample_rate, _out_channels), do: :erlang.nif_error(:undef)
 
   def decode(_decoder, _data, _pts, _dts), do: :erlang.nif_error(:undef)
 end
diff --git a/lib/frame.ex b/lib/frame.ex
index d2424a5..982b9fb 100644
--- a/lib/frame.ex
+++ b/lib/frame.ex
@@ -3,10 +3,24 @@ defmodule Xav.Frame do
   Audio/video frame.
   """
 
+  @typedoc """
+  Possible audio sample formats.
+  """
+  @type audio_format() :: :u8 | :s16 | :s32 | :s64 | :f32 | :f64
+
+  @typedoc """
+  Possible video frame formats.
+
+  Currently, only RGB is supported.
+  """
+  @type video_format() :: :rgb
+
+  @type format() :: audio_format() | video_format()
+
   @type t() :: %__MODULE__{
           type: :audio | :video,
           data: binary(),
-          format: atom(),
+          format: format(),
           width: non_neg_integer() | nil,
           height: non_neg_integer() | nil,
           samples: integer() | nil,
@@ -23,7 +37,10 @@ defmodule Xav.Frame do
     :pts
   ]
 
-  @spec new(binary(), atom(), non_neg_integer(), non_neg_integer(), integer()) :: t()
+  @doc """
+  Creates a new audio/video frame.
+  """
+  @spec new(binary(), format(), non_neg_integer(), non_neg_integer(), integer()) :: t()
   def new(data, format, width, height, pts) do
     %__MODULE__{
       type: :video,
@@ -35,7 +52,7 @@ defmodule Xav.Frame do
     }
   end
 
-  @spec new(binary(), atom(), integer(), integer()) :: t()
+  @spec new(binary(), format(), integer(), integer()) :: t()
   def new(data, format, samples, pts) do
     %__MODULE__{
       type: :audio,
@@ -47,7 +64,7 @@ defmodule Xav.Frame do
   end
 
   @doc """
-  Converts frame to Nx tensor.
+  Converts a frame to an Nx tensor.
   """
   @spec to_nx(t()) :: Nx.Tensor.t()
   def to_nx(%__MODULE__{type: :video} = frame) do
@@ -57,13 +74,6 @@ defmodule Xav.Frame do
   end
 
   def to_nx(%__MODULE__{type: :audio} = frame) do
-    Nx.from_binary(frame.data, to_nx_format(frame.format))
+    Nx.from_binary(frame.data, frame.format)
   end
-
-  defp to_nx_format(:u8), do: :u8
-  defp to_nx_format(:s16), do: :s16
-  defp to_nx_format(:s32), do: :s32
-  defp to_nx_format(:s64), do: :s64
-  defp to_nx_format(:flt), do: :f32
-  defp to_nx_format(:dbl), do: :f64
 end
diff --git a/lib/reader.ex b/lib/reader.ex
index eddff52..2d83228 100644
--- a/lib/reader.ex
+++ b/lib/reader.ex
@@ -10,20 +10,29 @@ defmodule Xav.Reader do
   Defaults to `:video`.
   * `device?` - determines whether path points to the camera. Defaults to `false`.
   """
-  @type opts :: [read: :audio | :video, device?: boolean]
+  @type opts :: [
+          read: :audio | :video,
+          device?: boolean,
+          out_format: Xav.Frame.format(),
+          out_sample_rate: integer(),
+          out_channels: integer()
+        ]
 
   @type t() :: %__MODULE__{
           reader: reference(),
           in_format: atom(),
           out_format: atom(),
-          sample_rate: integer() | nil,
+          in_sample_rate: integer() | nil,
+          out_sample_rate: integer() | nil,
+          in_channels: integer() | nil,
+          out_channels: integer() | nil,
           bit_rate: integer(),
           duration: integer(),
           codec: atom()
         }
 
   @enforce_keys [:reader, :in_format, :out_format, :bit_rate, :duration, :codec]
-  defstruct @enforce_keys ++ [:sample_rate]
+  defstruct @enforce_keys ++ [:in_sample_rate, :out_sample_rate, :in_channels, :out_channels]
 
   @doc """
   The same as new/1 but raises on error.
@@ -44,20 +53,39 @@ defmodule Xav.Reader do
   locked to 10.
 
   Microphone input is not supported.
+
+  `opts` can be used to specify desired output parameters.
+  Video frames are always returned in RGB format. This setting cannot be changed.
+  Audio samples are always in the packed form.
+  See `Xav.Decoder.new/2` for more information.
   """
   @spec new(String.t(), opts()) :: {:ok, t()} | {:error, term()}
   def new(path, opts \\ []) do
     read = opts[:read] || :video
     device? = opts[:device?] || false
-
-    case Xav.Reader.NIF.new(path, to_int(device?), to_int(read)) do
-      {:ok, reader, in_format, out_format, sample_rate, bit_rate, duration, codec} ->
+    out_format = opts[:out_format]
+    out_sample_rate = opts[:out_sample_rate] || 0
+    out_channels = opts[:out_channels] || 0
+
+    case Xav.Reader.NIF.new(
+           path,
+           to_int(device?),
+           to_int(read),
+           out_format,
+           out_sample_rate,
+           out_channels
+         ) do
+      {:ok, reader, in_format, out_format, in_sample_rate, out_sample_rate, in_channels,
+       out_channels, bit_rate, duration, codec} ->
         {:ok,
          %__MODULE__{
            reader: reader,
            in_format: in_format,
            out_format: out_format,
-           sample_rate: sample_rate,
+           in_sample_rate: in_sample_rate,
+           out_sample_rate: out_sample_rate,
+           in_channels: in_channels,
+           out_channels: out_channels,
            bit_rate: bit_rate,
            duration: duration,
            codec: to_human_readable(codec)
@@ -80,11 +108,7 @@ defmodule Xav.Reader do
   end
 
   @doc """
-  Reads the next frame.
-
-  A frame is always decoded.
-  Video frames are always in the RGB format.
-  Audio samples are always interleaved.
+  Reads and decodes the next frame.
   """
   @spec next_frame(t()) :: {:ok, Xav.Frame.t()} | {:error, :eof}
   def next_frame(%__MODULE__{reader: reader}) do
diff --git a/lib/reader_nif.ex b/lib/reader_nif.ex
index b642c27..0161450 100644
--- a/lib/reader_nif.ex
+++ b/lib/reader_nif.ex
@@ -8,7 +8,8 @@ defmodule Xav.Reader.NIF do
     :ok = :erlang.load_nif(path, 0)
   end
 
-  def new(_path, _device, _video), do: :erlang.nif_error(:undef)
+  def new(_path, _device, _video, _out_format, _out_sample_rate, _out_channels),
+    do: :erlang.nif_error(:undef)
 
   def next_frame(_reader), do: :erlang.nif_error(:undef)
 end
diff --git a/test/decoder_test.exs b/test/decoder_test.exs
index f4163ad..2682ffd 100644
--- a/test/decoder_test.exs
+++ b/test/decoder_test.exs
@@ -300,8 +300,21 @@ defmodule Xav.DecoderTest do
     test "audio" do
       decoder = Xav.Decoder.new(:opus)
 
-      assert {:ok, %Xav.Frame{samples: 960, pts: 0, format: :flt}} =
+      assert {:ok, %Xav.Frame{data: data, samples: 960, pts: 0, format: :f32}} =
                Xav.Decoder.decode(decoder, @opus_frame)
+
+      assert byte_size(data) == 7680
+    end
+
+    test "audio with resampling" do
+      decoder = Xav.Decoder.new(:opus, out_format: :u8, out_sample_rate: 16_000, out_channels: 1)
+
+      # after changing out_format and out_sample_rate, we should have fewer samples
+      # and the data should be smaller
+      assert {:ok, %Xav.Frame{data: data, samples: 304, pts: 0, format: :u8}} =
+               Xav.Decoder.decode(decoder, @opus_frame)
+
+      assert byte_size(data) == 304
     end
 
     test "video keyframe" do
diff --git a/test/fixtures/README.md b/test/fixtures/stt/README.md
similarity index 95%
rename from test/fixtures/README.md
rename to test/fixtures/stt/README.md
index cfa4773..3de472b 100644
--- a/test/fixtures/README.md
+++ b/test/fixtures/stt/README.md
@@ -1,4 +1,4 @@
-# Fixtures
+# STT Fixtures
 
 * [melnet_sample_0.mp3](https://audio-samples.github.io/) - 22050Hz, 1 channel, fltp
 * [harvard.wav](https://www.kaggle.com/datasets/pavanelisetty/sample-audio-files-for-speech-recognition) - 44100Hz, 2 channels, s16
diff --git a/test/fixtures/harvard.mp3 b/test/fixtures/stt/harvard.mp3
similarity index 100%
rename from test/fixtures/harvard.mp3
rename to test/fixtures/stt/harvard.mp3
diff --git a/test/fixtures/harvard.wav b/test/fixtures/stt/harvard.wav
similarity index 100%
rename from test/fixtures/harvard.wav
rename to test/fixtures/stt/harvard.wav
diff --git a/test/fixtures/melnet_sample_0.mp3 b/test/fixtures/stt/melnet_sample_0.mp3
similarity index 100%
rename from test/fixtures/melnet_sample_0.mp3
rename to test/fixtures/stt/melnet_sample_0.mp3
diff --git a/test/reader_test.exs b/test/reader_test.exs
index 2dd441c..7798504 100644
--- a/test/reader_test.exs
+++ b/test/reader_test.exs
@@ -45,13 +45,13 @@ defmodule Xav.ReaderTest do
     for {path, expected_output} <- [
           # This file has been downloaded from https://audio-samples.github.io/
           # Section: Samples from the model without biasing or priming.
-          {"./test/fixtures/melnet_sample_0.mp3",
+          {"./test/fixtures/stt/melnet_sample_0.mp3",
            """
             My thought, I have nobody by a beauty and will as you poured. \
            Mr. Rochester has served and that so don't find a simple and \
            devoted aboud to what might in a\
            """},
-          {"./test/fixtures/harvard.wav",
+          {"./test/fixtures/stt/harvard.wav",
            """
             The stale smell of old beer lingers. It takes heat to bring out the odor. \
            A cold dip restores health in zest. A salt pickle tastes fine with ham. \
@@ -63,7 +63,13 @@ defmodule Xav.ReaderTest do
   end
 
   defp test_speech_to_text(path, expected_output) do
-    reader = Xav.Reader.new!(path, read: :audio)
+    reader =
+      Xav.Reader.new!(path,
+        read: :audio,
+        out_channels: 1,
+        out_format: :f32,
+        out_sample_rate: 16_000
+      )
 
     {:ok, whisper} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
     {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})