Skip to content

Commit

Permalink
Sanitize direct download (#13313)
Browse files Browse the repository at this point in the history
The 'direct' option in 'spacy download' is supposed to only download from our model releases repository. However, users were able to pass in a relative path, allowing download from arbitrary repositories. This meant that a service that sourced strings from user input and which used the direct option would allow users to install arbitrary packages.
  • Loading branch information
honnibal committed Feb 20, 2024
1 parent bff8725 commit 0518c36
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 2 deletions.
2 changes: 2 additions & 0 deletions spacy/cli/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from wasabi import msg

# Needed for testing
from . import download as download_module # noqa: F401
from ._util import app, setup_cli # noqa: F401
from .apply import apply # noqa: F401
from .assemble import assemble_cli # noqa: F401
Expand Down
19 changes: 18 additions & 1 deletion spacy/cli/download.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import sys
from typing import Optional, Sequence
from urllib.parse import urljoin

import requests
import typer
Expand Down Expand Up @@ -63,6 +64,13 @@ def download(
)
pip_args = pip_args + ("--no-deps",)
if direct:
# Reject model names with '/', in order to prevent shenanigans.
if "/" in model:
msg.fail(
title="Model download rejected",
text=f"Cannot download model '{model}'. Models are expected to be file names, not URLs or fragments",
exits=True,
)
components = model.split("-")
model_name = "".join(components[:-1])
version = components[-1]
Expand Down Expand Up @@ -153,7 +161,16 @@ def get_latest_version(model: str) -> str:
def download_model(
filename: str, user_pip_args: Optional[Sequence[str]] = None
) -> None:
download_url = about.__download_url__ + "/" + filename
# Construct the download URL carefully. We need to make sure we don't
# allow relative paths or other shenanigans to trick us into download
# from outside our own repo.
base_url = about.__download_url__
# urljoin requires that the path ends with /, or the last path part will be dropped
if not base_url.endswith("/"):
base_url = about.__download_url__ + "/"
download_url = urljoin(base_url, filename)
if not download_url.startswith(about.__download_url__):
raise ValueError(f"Download from {filename} rejected. Was it a relative path?")
pip_args = list(user_pip_args) if user_pip_args is not None else []
cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
run_command(cmd)
14 changes: 13 additions & 1 deletion spacy/tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

import spacy
from spacy import about
from spacy.cli import info
from spacy.cli import download_module, info
from spacy.cli._util import parse_config_overrides, string_to_list, walk_directory
from spacy.cli.apply import apply
from spacy.cli.debug_data import (
Expand Down Expand Up @@ -1066,3 +1066,15 @@ def test_debug_data_trainable_lemmatizer_not_annotated():
def test_project_api_imports():
from spacy.cli import project_run
from spacy.cli.project.run import project_run # noqa: F401, F811


def test_download_rejects_relative_urls(monkeypatch):
"""Test that we can't tell spacy download to get an arbitrary model by using a
relative path in the filename"""

monkeypatch.setattr(download_module, "run_command", lambda cmd: None)

# Check that normal download works
download_module.download("en_core_web_sm-3.7.1", direct=True)
with pytest.raises(SystemExit):
download_module.download("../en_core_web_sm-3.7.1", direct=True)

0 comments on commit 0518c36

Please sign in to comment.