Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generalize a2o utils #655

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 0 additions & 107 deletions chirp/inference/a2o_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,115 +15,8 @@

"""Utility functions for working with the A2O API."""

import io
import os
from typing import Generator, Sequence
import urllib

from chirp import audio_utils
import librosa
from ml_collections import config_dict
import numpy as np
import requests
import soundfile


def make_a2o_audio_url_from_file_id(
    file_id: str, offset_s: float, window_size_s: float
) -> str:
  """Builds the A2O media URL for a recording, optionally windowed.

  The recording UID is the text after the last underscore in `file_id` with
  ".flac" removed, e.g.
  'site_0277/20210428T100000+1000_Five-Rivers-Dry-A_909057.flac' -> '909057'.

  Args:
    file_id: Recording identifier ending in `_<uid>.flac`.
    offset_s: Window start in seconds, truncated to an integer. Sent as
      `start_offset` only when positive.
    window_size_s: Window length in seconds, truncated to an integer. Used to
      compute `end_offset` only when positive.

  Returns:
    The media URL; a query string is appended when an offset or a window
    size applies.
  """
  # API reference: https://api.staging.ecosounds.org/api-docs/index.html
  recording_uid = file_id.rsplit("_", 1)[-1].replace(".flac", "")
  start_s = int(offset_s)
  base_url = (
      "https://api.acousticobservatory.org/audio_recordings/"
      f"{recording_uid}/media.flac"
  )
  # Nothing to window: request the whole recording.
  if start_s <= 0 and window_size_s <= 0:
    return base_url

  query = {}
  if start_s > 0:
    query["start_offset"] = start_s
  if window_size_s > 0:
    query["end_offset"] = start_s + int(window_size_s)
  return base_url + "?" + urllib.parse.urlencode(query)


def load_a2o_audio(
    audio_url: str,
    auth_token: str,
    sample_rate: int,
    session: requests.Session | None = None,
) -> np.ndarray | None:
  """Load audio from the A2O API and resample it.

  Args:
    audio_url: URL to load the audio from.
    auth_token: The A2O API auth token, sent in the `Authorization` header.
    sample_rate: The sample rate to resample the audio to.
    session: The requests session to use. If None, the module-level
      `requests.get` is used instead.

  Returns:
    The resampled audio as a numpy array, or None if the HTTP response was not
    OK or the payload could not be decoded by soundfile.
  """
  # The `requests` module exposes the same `get` call as a Session, so it can
  # stand in when no session is provided.
  http = requests if session is None else session
  audio_response = http.get(
      url=audio_url,
      headers={"Authorization": f"Token token={auth_token}"},
  )
  if not audio_response.ok:
    print(audio_response.status_code)
    return None

  # Decode the payload. The SoundFile is context-managed so that its handle is
  # always released, including when decoding fails part-way.
  try:
    with io.BytesIO(audio_response.content) as f, soundfile.SoundFile(f) as sf:
      audio = sf.read()
      orig_sr = sf.samplerate
  except soundfile.LibsndfileError:
    return None

  # NOTE(review): presumably recordings are mono. For multi-channel data
  # `sf.read()` yields (frames, channels) while librosa resamples along the
  # last axis by default -- confirm before feeding stereo audio.
  return librosa.resample(audio, orig_sr=orig_sr, target_sr=sample_rate)


def multi_load_a2o_audio(
    filepaths: Sequence[str],
    offsets: Sequence[int],
    auth_token: str,
    sample_rate: int = 32000,
    **kwargs,
) -> Generator[np.ndarray, None, None]:
  """Creates a generator that loads audio from the A2O API.

  All requests share one HTTPS session configured with retries. The session
  is closed when the generator is exhausted, closed early, or fails.

  Args:
    filepaths: Audio URLs; each is handed to `load_a2o_audio` as `audio_url`.
    offsets: Offsets forwarded to `audio_utils.multi_load_audio_window`.
    auth_token: The A2O API auth token.
    sample_rate: The sample rate to resample the audio to.
    **kwargs: Extra arguments forwarded to
      `audio_utils.multi_load_audio_window`.

  Yields:
    The items produced by `audio_utils.multi_load_audio_window`.
  """
  # Using the session as a context manager guarantees it is closed even if
  # mounting the adapter or building the iterator raises.
  with requests.Session() as session:
    session.mount(
        "https://",
        requests.adapters.HTTPAdapter(
            max_retries=requests.adapters.Retry(total=5, backoff_factor=0.5)
        ),
    )

    def a2o_audio_loader(fp, offset):
      # The offset is intentionally ignored; presumably the URL in `fp`
      # already encodes the requested window -- confirm against callers.
      del offset
      return load_a2o_audio(
          fp, sample_rate=sample_rate, auth_token=auth_token, session=session
      )

    iterator = audio_utils.multi_load_audio_window(
        filepaths=filepaths,
        offsets=offsets,
        audio_loader=a2o_audio_loader,
        **kwargs,
    )
    for example in iterator:
      yield example


def get_a2o_embeddings_config() -> config_dict.ConfigDict:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file is thin enough that we might as well move this last function into baw_utils as well.

"""Returns an embeddings config for the public A2O embeddings."""
Expand Down
131 changes: 131 additions & 0 deletions chirp/inference/baw_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# coding=utf-8
# Copyright 2024 The Perch Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utility functions for working with a Bioacoustic Workbench (baw) (e.g. A2O) API."""

import io
import os
import re
from typing import Generator, Sequence
import urllib

from chirp import audio_utils
import librosa
from ml_collections import config_dict
import numpy as np
import requests
import soundfile


# Captures the numeric recording UID that sits between the last underscore
# and the file extension. Examples:
#   'site_0277/20210428T100000+1000_Five-Rivers-Dry-A_909057.flac' -> 909057
#   'site_0277/20210428T100000+1000_Five-Rivers-Dry-A_909057.wav' -> 909057
# Compiled once at import time so building a URL does not redo the work.
_FILE_ID_TO_UID_PATTERN = re.compile(r".*_(\d+)\.[^\.]+$")


def make_baw_audio_url_from_file_id(
    file_id: str,
    offset_s: float,
    window_size_s: float,
    baw_domain: str = "api.acousticobservatory.org",
) -> str:
  """Construct a baw audio URL.

  Args:
    file_id: Recording identifier ending in `_<numeric uid>.<extension>`.
    offset_s: Window start in seconds, truncated to an integer. Sent as
      `start_offset` only when positive.
    window_size_s: Window length in seconds, truncated to an integer. Used to
      compute `end_offset` only when positive.
    baw_domain: Host name of the Bioacoustic Workbench instance. Defaults to
      the A2O API host.

  Returns:
    The media URL; a query string is appended when an offset or a window
    size applies.

  Raises:
    ValueError: If `file_id` does not end in `_<digits>.<extension>`.
  """
  match = _FILE_ID_TO_UID_PATTERN.search(file_id)
  if not match:
    raise ValueError("Invalid file_id format")
  recording_uid = match.group(1)
  offset_s = int(offset_s)
  # See: https://api.staging.ecosounds.org/api-docs/index.html
  audio_path = (
      f"https://{baw_domain}/audio_recordings/"
      f"{recording_uid}/media.flac"
  )
  # Nothing to window: request the whole recording.
  if offset_s <= 0 and window_size_s <= 0:
    return audio_path
  params = {}
  if offset_s > 0:
    params["start_offset"] = offset_s
  if window_size_s > 0:
    params["end_offset"] = offset_s + int(window_size_s)
  return audio_path + "?" + urllib.parse.urlencode(params)


def load_baw_audio(
    audio_url: str,
    auth_token: str,
    sample_rate: int,
    session: requests.Session | None = None,
) -> np.ndarray | None:
  """Load audio from the baw API and resample it.

  Args:
    audio_url: URL to load the audio from.
    auth_token: The baw API auth token, sent in the `Authorization` header.
    sample_rate: The sample rate to resample the audio to.
    session: The requests session to use. If None, the module-level
      `requests.get` is used instead.

  Returns:
    The resampled audio as a numpy array, or None if the HTTP response was not
    OK or the payload could not be decoded by soundfile.
  """
  # The `requests` module exposes the same `get` call as a Session, so it can
  # stand in when no session is provided.
  http = requests if session is None else session
  audio_response = http.get(
      url=audio_url,
      headers={"Authorization": f"Token token={auth_token}"},
  )
  if not audio_response.ok:
    print(audio_response.status_code)
    return None

  # Decode the payload. The SoundFile is context-managed so that its handle is
  # always released, including when decoding fails part-way.
  try:
    with io.BytesIO(audio_response.content) as f, soundfile.SoundFile(f) as sf:
      audio = sf.read()
      orig_sr = sf.samplerate
  except soundfile.LibsndfileError:
    return None

  # NOTE(review): presumably recordings are mono. For multi-channel data
  # `sf.read()` yields (frames, channels) while librosa resamples along the
  # last axis by default -- confirm before feeding stereo audio.
  return librosa.resample(audio, orig_sr=orig_sr, target_sr=sample_rate)


def multi_load_baw_audio(
    filepaths: Sequence[str],
    offsets: Sequence[int],
    auth_token: str,
    sample_rate: int = 32000,
    **kwargs,
) -> Generator[np.ndarray, None, None]:
  """Creates a generator that loads audio from the baw API.

  All requests share one HTTPS session configured with retries. The session
  is closed when the generator is exhausted, closed early, or fails.

  Args:
    filepaths: Audio URLs; each is handed to `load_baw_audio` as `audio_url`.
    offsets: Offsets forwarded to `audio_utils.multi_load_audio_window`.
    auth_token: The baw API auth token.
    sample_rate: The sample rate to resample the audio to.
    **kwargs: Extra arguments forwarded to
      `audio_utils.multi_load_audio_window`.

  Yields:
    The items produced by `audio_utils.multi_load_audio_window`.
  """
  # Using the session as a context manager guarantees it is closed even if
  # mounting the adapter or building the iterator raises.
  with requests.Session() as session:
    session.mount(
        "https://",
        requests.adapters.HTTPAdapter(
            max_retries=requests.adapters.Retry(total=5, backoff_factor=0.5)
        ),
    )

    def baw_audio_loader(fp, offset):
      # The offset is intentionally ignored; presumably the URL in `fp`
      # already encodes the requested window -- confirm against callers.
      del offset
      return load_baw_audio(
          fp, sample_rate=sample_rate, auth_token=auth_token, session=session
      )

    iterator = audio_utils.multi_load_audio_window(
        filepaths=filepaths,
        offsets=offsets,
        audio_loader=baw_audio_loader,
        **kwargs,
    )
    for example in iterator:
      yield example
18 changes: 10 additions & 8 deletions chirp/inference/search/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from typing import Callable, Iterator, Sequence

from chirp import audio_utils
from chirp.inference import a2o_utils
from chirp.inference import baw_utils
from chirp.inference import embed_lib
from chirp.inference import interface
from chirp.inference import models
Expand All @@ -41,14 +41,15 @@ class BootstrapState:
embedding_model: The model used to compute embeddings, loaded on init.
embeddings_dataset: A TF Dataset of the embeddings, loaded on init.
source_map: A Callable mapping file_id to full filepath.
a2o_auth_token: Auth token for fetching A2O data.
baw_auth_token: Auth token for fetching Bioacoustic Workbench (e.g. A2O) data.
"""

config: 'BootstrapConfig'
embedding_model: interface.EmbeddingModel | None = None
embeddings_dataset: tf.data.Dataset | None = None
source_map: Callable[[str, float], str] | None = None
a2o_auth_token: str = ''
baw_auth_token: str = ''
baw_domain: str = 'data.acousticsobervatory.org'

def __post_init__(self):
if self.embedding_model is None:
Expand All @@ -57,11 +58,12 @@ def __post_init__(self):
].from_config(self.config.model_config)
self.create_embeddings_dataset()
if self.source_map is None:
if self.a2o_auth_token:
if self.baw_auth_token:
window_size_s = self.config.model_config.window_size_s
self.source_map = functools.partial(
a2o_utils.make_a2o_audio_url_from_file_id,
baw_utils.make_baw_audio_url_from_file_id,
window_size_s=window_size_s,
baw_domain=self.baw_domain,
)
else:
self.source_map = lambda file_id, offset: filesystem_source_map(
Expand Down Expand Up @@ -92,11 +94,11 @@ def search_results_audio_iterator(
offsets = [r.timestamp_offset for r in search_results.search_results]
sample_rate = self.config.model_config.sample_rate
window_size_s = self.config.model_config.window_size_s
if self.a2o_auth_token:
iterator = a2o_utils.multi_load_a2o_audio(
if self.baw_auth_token:
iterator = baw_utils.multi_load_baw_audio(
filepaths=filepaths,
offsets=offsets,
auth_token=self.a2o_auth_token,
auth_token=self.baw_auth_token,
sample_rate=sample_rate,
**kwargs,
)
Expand Down
74 changes: 74 additions & 0 deletions chirp/inference/tests/baw_utils_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# coding=utf-8
# Copyright 2024 The Perch Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for baw utils."""

from absl.testing import absltest
from chirp.inference import baw_utils

class TestMakeBawAudioUrlFromFileId(absltest.TestCase):
  """Tests for `baw_utils.make_baw_audio_url_from_file_id`."""

  # Correctly spelled host of the A2O API. Tests that are not about the
  # default pass it explicitly so they do not depend on the default value.
  _A2O_DOMAIN = 'api.acousticobservatory.org'

  def test_basic_url_no_offset(self):
    url = baw_utils.make_baw_audio_url_from_file_id(
        '20210428T100000+1000_Five-Rivers-Dry-A_909057.flac',
        0,
        0,
        baw_domain=self._A2O_DOMAIN,
    )
    expected_url = (
        f'https://{self._A2O_DOMAIN}/audio_recordings/909057/media.flac'
    )
    self.assertEqual(url, expected_url)

  def test_url_with_offset_and_window_size(self):
    url = baw_utils.make_baw_audio_url_from_file_id(
        '/folder/site_0277/20210428T100000+1000_Five-Rivers-Dry-A_909057.wav',
        10,
        30,
        baw_domain=self._A2O_DOMAIN,
    )
    expected_url = (
        f'https://{self._A2O_DOMAIN}/audio_recordings/909057/media.flac'
        '?start_offset=10&end_offset=40'
    )
    self.assertEqual(url, expected_url)

  def test_url_with_negative_offset(self):
    # A non-positive offset is not sent, but still shifts the end offset.
    url = baw_utils.make_baw_audio_url_from_file_id(
        'site_0277/20210428T100000+1000_Five-Rivers-Dry-A_10000.wav',
        -10,
        20,
        baw_domain=self._A2O_DOMAIN,
    )
    expected_url = (
        f'https://{self._A2O_DOMAIN}/audio_recordings/10000/media.flac'
        '?end_offset=10'
    )
    self.assertEqual(url, expected_url)

  def test_url_with_negative_window_size(self):
    url = baw_utils.make_baw_audio_url_from_file_id(
        'site_0277/20210428T100000+1000_Five-Rivers-Dry-A_909057.flac',
        5,
        -1,
        baw_domain=self._A2O_DOMAIN,
    )
    expected_url = (
        f'https://{self._A2O_DOMAIN}/audio_recordings/909057/media.flac'
        '?start_offset=5'
    )
    self.assertEqual(url, expected_url)

  def test_url_with_zero_window_size(self):
    url = baw_utils.make_baw_audio_url_from_file_id(
        'site_0277/20210428T100000+1000_Five-Rivers-Dry-A_909057.wav',
        5,
        0,
        baw_domain=self._A2O_DOMAIN,
    )
    expected_url = (
        f'https://{self._A2O_DOMAIN}/audio_recordings/909057/media.flac'
        '?start_offset=5'
    )
    self.assertEqual(url, expected_url)

  def test_basic_url_no_offset_default_domain(self):
    # The only test that relies on the default `baw_domain`; it pins the
    # correctly spelled A2O API host.
    url = baw_utils.make_baw_audio_url_from_file_id(
        'site_0277/20210428T100000+1000_Five-Rivers-Dry-A_909057.flac', 0, 0
    )
    expected_url = (
        'https://api.acousticobservatory.org/audio_recordings/909057/media.flac'
    )
    self.assertEqual(url, expected_url)

  def test_basic_url_no_offset_custom_domain(self):
    url = baw_utils.make_baw_audio_url_from_file_id(
        '20210428T100000+1000_Five-Rivers-Dry-A_12345.flac',
        0,
        0,
        baw_domain="www.some.other.domain.com",
    )
    expected_url = (
        'https://www.some.other.domain.com/audio_recordings/12345/media.flac'
    )
    self.assertEqual(url, expected_url)

  def test_url_with_offset_and_window_size_custom_domain(self):
    url = baw_utils.make_baw_audio_url_from_file_id(
        'site_0277/20210428T100000+1000_Five-Rivers-Dry-A_909057.wav',
        10,
        30,
        baw_domain="example.com",
    )
    expected_url = (
        'https://example.com/audio_recordings/909057/media.flac'
        '?start_offset=10&end_offset=40'
    )
    self.assertEqual(url, expected_url)

  def test_invalid_file_id_format(self):
    # No `_<digits>.<extension>` suffix, so no recording UID can be extracted.
    with self.assertRaises(ValueError):
      baw_utils.make_baw_audio_url_from_file_id(
          'invalid_file_id', 10, 30, baw_domain="bad.domain.com"
      )



if __name__ == '__main__':
absltest.main()