diff --git a/bookwyrm/activitypub/base_activity.py b/bookwyrm/activitypub/base_activity.py index c78b4f1954..615db440a3 100644 --- a/bookwyrm/activitypub/base_activity.py +++ b/bookwyrm/activitypub/base_activity.py @@ -2,6 +2,8 @@ from dataclasses import dataclass, fields, MISSING from json import JSONEncoder import logging +from typing import Optional, Union, TypeVar, overload, Any + import requests from django.apps import apps @@ -10,12 +12,15 @@ from bookwyrm import models from bookwyrm.connectors import ConnectorException, get_data +from bookwyrm.models import base_model from bookwyrm.signatures import make_signature from bookwyrm.settings import DOMAIN, INSTANCE_ACTOR_USERNAME from bookwyrm.tasks import app, MISC logger = logging.getLogger(__name__) +TBookWyrmModel = TypeVar("TBookWyrmModel", bound=base_model.BookWyrmModel) + class ActivitySerializerError(ValueError): """routine problems serializing activitypub json""" @@ -65,7 +70,11 @@ class ActivityObject: id: str type: str - def __init__(self, activity_objects=None, **kwargs): + def __init__( + self, + activity_objects: Optional[list[str, base_model.BookWyrmModel]] = None, + **kwargs: dict[str, Any], + ): """this lets you pass in an object with fields that aren't in the dataclass, which it ignores. Any field in the dataclass is required or has a default value""" @@ -101,13 +110,13 @@ def __init__(self, activity_objects=None, **kwargs): # pylint: disable=too-many-locals,too-many-branches,too-many-arguments def to_model( self, - model=None, - instance=None, - allow_create=True, - save=True, - overwrite=True, - allow_external_connections=True, - ): + model: Optional[type[TBookWyrmModel]] = None, + instance: Optional[TBookWyrmModel] = None, + allow_create: bool = True, + save: bool = True, + overwrite: bool = True, + allow_external_connections: bool = True, + ) -> Optional[TBookWyrmModel]: """convert from an activity to a model instance. Args: model: the django model that this object is being converted to (will guess if not known) @@ -296,14 +305,40 @@ def get_model_from_type(activity_type): # pylint: disable=too-many-arguments +@overload def resolve_remote_id( - remote_id, - model=None, - refresh=False, - save=True, - get_activity=False, - allow_external_connections=True, -): + remote_id: str, + model: type[TBookWyrmModel], + refresh: bool = False, + save: bool = True, + get_activity: bool = False, + allow_external_connections: bool = True, +) -> TBookWyrmModel: + ... + + +# pylint: disable=too-many-arguments +@overload +def resolve_remote_id( + remote_id: str, + model: Optional[str] = None, + refresh: bool = False, + save: bool = True, + get_activity: bool = False, + allow_external_connections: bool = True, +) -> base_model.BookWyrmModel: + ... + + +# pylint: disable=too-many-arguments +def resolve_remote_id( + remote_id: str, + model: Optional[Union[str, type[base_model.BookWyrmModel]]] = None, + refresh: bool = False, + save: bool = True, + get_activity: bool = False, + allow_external_connections: bool = True, +) -> base_model.BookWyrmModel: """take a remote_id and return an instance, creating if necessary. Args: remote_id: the unique url for looking up the object in the db or by http model: a string or object representing the model that corresponds to the object diff --git a/bookwyrm/activitypub/book.py b/bookwyrm/activitypub/book.py index d3aca4471d..5db0dc3ac1 100644 --- a/bookwyrm/activitypub/book.py +++ b/bookwyrm/activitypub/book.py @@ -1,6 +1,6 @@ """ book and author data """ from dataclasses import dataclass, field -from typing import List +from typing import Optional from .base_activity import ActivityObject from .image import Document @@ -11,19 +11,19 @@ class BookData(ActivityObject): """shared fields for all book data and authors""" - openlibraryKey: str = None - inventaireId: str = None - librarythingKey: str = None - goodreadsKey: str = None - bnfId: str = None - viaf: str = None - wikidata: str = None - asin: str = None - aasin: str = None - isfdb: str = None - lastEditedBy: str = None - links: List[str] = field(default_factory=lambda: []) - fileLinks: List[str] = field(default_factory=lambda: []) + openlibraryKey: Optional[str] = None + inventaireId: Optional[str] = None + librarythingKey: Optional[str] = None + goodreadsKey: Optional[str] = None + bnfId: Optional[str] = None + viaf: Optional[str] = None + wikidata: Optional[str] = None + asin: Optional[str] = None + aasin: Optional[str] = None + isfdb: Optional[str] = None + lastEditedBy: Optional[str] = None + links: list[str] = field(default_factory=list) + fileLinks: list[str] = field(default_factory=list) # pylint: disable=invalid-name @@ -35,17 +35,17 @@ class Book(BookData): sortTitle: str = None subtitle: str = None description: str = "" - languages: List[str] = field(default_factory=lambda: []) + languages: list[str] = field(default_factory=list) series: str = "" seriesNumber: str = "" - subjects: List[str] = field(default_factory=lambda: []) - subjectPlaces: List[str] = field(default_factory=lambda: []) + subjects: list[str] = field(default_factory=list) + subjectPlaces: list[str] = field(default_factory=list) - authors: List[str] = field(default_factory=lambda: []) + authors: list[str] = field(default_factory=list) firstPublishedDate: str = "" publishedDate: str = "" - cover: Document = None + cover: Optional[Document] = None type: str = "Book" @@ -58,10 +58,10 @@ class Edition(Book): isbn10: str = "" isbn13: str = "" oclcNumber: str = "" - pages: int = None + pages: Optional[int] = None physicalFormat: str = "" physicalFormatDetail: str = "" - publishers: List[str] = field(default_factory=lambda: []) + publishers: list[str] = field(default_factory=list) editionRank: int = 0 type: str = "Edition" @@ -73,7 +73,7 @@ class Work(Book): """work instance of a book object""" lccn: str = "" - editions: List[str] = field(default_factory=lambda: []) + editions: list[str] = field(default_factory=list) type: str = "Work" @@ -83,12 +83,12 @@ class Author(BookData): """author of a book""" name: str - isni: str = None - viafId: str = None - gutenbergId: str = None - born: str = None - died: str = None - aliases: List[str] = field(default_factory=lambda: []) + isni: Optional[str] = None + viafId: Optional[str] = None + gutenbergId: Optional[str] = None + born: Optional[str] = None + died: Optional[str] = None + aliases: list[str] = field(default_factory=list) bio: str = "" wikipediaLink: str = "" type: str = "Author" diff --git a/bookwyrm/book_search.py b/bookwyrm/book_search.py index 822c87f016..ceb228f40c 100644 --- a/bookwyrm/book_search.py +++ b/bookwyrm/book_search.py @@ -1,22 +1,53 @@ """ using a bookwyrm instance as a source of book data """ +from __future__ import annotations from dataclasses import asdict, dataclass from functools import reduce import operator +from typing import Optional, Union, Any, Literal, overload from django.contrib.postgres.search import SearchRank, SearchQuery from django.db.models import F, Q +from django.db.models.query import QuerySet from bookwyrm import models from bookwyrm import connectors from bookwyrm.settings import MEDIA_FULL_URL +@overload +def search( + query: str, + *, + min_confidence: float = 0, + filters: Optional[list[Any]] = None, + return_first: Literal[False], +) -> QuerySet[models.Edition]: + ... + + +@overload +def search( + query: str, + *, + min_confidence: float = 0, + filters: Optional[list[Any]] = None, + return_first: Literal[True], +) -> Optional[models.Edition]: + ... + + # pylint: disable=arguments-differ -def search(query, min_confidence=0, filters=None, return_first=False): +def search( + query: str, + *, + min_confidence: float = 0, + filters: Optional[list[Any]] = None, + return_first: bool = False, +) -> Union[Optional[models.Edition], QuerySet[models.Edition]]: """search your local database""" filters = filters or [] if not query: - return [] + return None if return_first else [] query = query.strip() results = None @@ -66,7 +97,9 @@ def format_search_result(search_result): ).json() -def search_identifiers(query, *filters, return_first=False): +def search_identifiers( + query, *filters, return_first=False +) -> Union[Optional[models.Edition], QuerySet[models.Edition]]: """tries remote_id, isbn; defined as dedupe fields on the model""" if connectors.maybe_isbn(query): # Oh did you think the 'S' in ISBN stood for 'standard'? @@ -87,7 +120,9 @@ def search_identifiers(query, *filters, return_first=False): return results -def search_title_author(query, min_confidence, *filters, return_first=False): +def search_title_author( + query, min_confidence, *filters, return_first=False +) -> QuerySet[models.Edition]: """searches for title and author""" query = SearchQuery(query, config="simple") | SearchQuery(query, config="english") results = ( @@ -122,11 +157,11 @@ class SearchResult: title: str key: str connector: object - view_link: str = None - author: str = None - year: str = None - cover: str = None - confidence: int = 1 + view_link: Optional[str] = None + author: Optional[str] = None + year: Optional[str] = None + cover: Optional[str] = None + confidence: float = 1.0 def __repr__(self): # pylint: disable=consider-using-f-string diff --git a/bookwyrm/connectors/abstract_connector.py b/bookwyrm/connectors/abstract_connector.py index 950bb11f98..8b6dcb8858 100644 --- a/bookwyrm/connectors/abstract_connector.py +++ b/bookwyrm/connectors/abstract_connector.py @@ -1,5 +1,7 @@ """ functionality outline for a book data connector """ +from __future__ import annotations from abc import ABC, abstractmethod +from typing import Optional, TypedDict, Any, Callable, Union, Iterator from urllib.parse import quote_plus import imghdr import logging @@ -16,33 +18,38 @@ from bookwyrm.settings import USER_AGENT from .connector_manager import load_more_data, ConnectorException, raise_not_valid_url from .format_mappings import format_mappings - +from ..book_search import SearchResult logger = logging.getLogger(__name__) +JsonDict = dict[str, Any] + + +class ConnectorResults(TypedDict): + """TypedDict for results returned by connector""" + + connector: AbstractMinimalConnector + results: list[SearchResult] + class AbstractMinimalConnector(ABC): """just the bare bones, for other bookwyrm instances""" - def __init__(self, identifier): + def __init__(self, identifier: str): # load connector settings info = models.Connector.objects.get(identifier=identifier) self.connector = info # the things in the connector model to copy over - self_fields = [ - "base_url", - "books_url", - "covers_url", - "search_url", - "isbn_search_url", - "name", - "identifier", - ] - for field in self_fields: - setattr(self, field, getattr(info, field)) - - def get_search_url(self, query): + self.base_url = info.base_url + self.books_url = info.books_url + self.covers_url = info.covers_url + self.search_url = info.search_url + self.isbn_search_url = info.isbn_search_url + self.name = info.name + self.identifier = info.identifier + + def get_search_url(self, query: str) -> str: """format the query url""" # Check if the query resembles an ISBN if maybe_isbn(query) and self.isbn_search_url and self.isbn_search_url != "": @@ -54,13 +61,21 @@ def get_search_url(self, query): # searched as free text. This, instead, only searches isbn if it's isbn-y return f"{self.search_url}{quote_plus(query)}" - def process_search_response(self, query, data, min_confidence): + def process_search_response( + self, query: str, data: Any, min_confidence: float + ) -> list[SearchResult]: """Format the search results based on the format of the query""" if maybe_isbn(query): return list(self.parse_isbn_search_data(data))[:10] return list(self.parse_search_data(data, min_confidence))[:10] - async def get_results(self, session, url, min_confidence, query): + async def get_results( + self, + session: aiohttp.ClientSession, + url: str, + min_confidence: float, + query: str, + ) -> Optional[ConnectorResults]: """try this specific connector""" # pylint: disable=line-too-long headers = { @@ -74,55 +89,63 @@ async def get_results(self, session, url, min_confidence, query): async with session.get(url, headers=headers, params=params) as response: if not response.ok: logger.info("Unable to connect to %s: %s", url, response.reason) - return + return None try: raw_data = await response.json() except aiohttp.client_exceptions.ContentTypeError as err: logger.exception(err) - return + return None - return { - "connector": self, - "results": self.process_search_response( + return ConnectorResults( + connector=self, + results=self.process_search_response( query, raw_data, min_confidence ), - } + ) except asyncio.TimeoutError: logger.info("Connection timed out for url: %s", url) except aiohttp.ClientError as err: logger.info(err) + return None @abstractmethod - def get_or_create_book(self, remote_id): + def get_or_create_book(self, remote_id: str) -> Optional[models.Book]: """pull up a book record by whatever means possible""" @abstractmethod - def parse_search_data(self, data, min_confidence): + def parse_search_data( + self, data: Any, min_confidence: float + ) -> Iterator[SearchResult]: """turn the result json from a search into a list""" @abstractmethod - def parse_isbn_search_data(self, data): + def parse_isbn_search_data(self, data: Any) -> Iterator[SearchResult]: """turn the result json from a search into a list""" class AbstractConnector(AbstractMinimalConnector): """generic book data connector""" - def __init__(self, identifier): + generated_remote_link_field = "" + + def __init__(self, identifier: str): super().__init__(identifier) # fields we want to look for in book data to copy over # title we handle separately. - self.book_mappings = [] + self.book_mappings: list[Mapping] = [] + self.author_mappings: list[Mapping] = [] - def get_or_create_book(self, remote_id): + def get_or_create_book(self, remote_id: str) -> Optional[models.Book]: """translate arbitrary json into an Activitypub dataclass""" # first, check if we have the origin_id saved existing = models.Edition.find_existing_by_remote_id( remote_id ) or models.Work.find_existing_by_remote_id(remote_id) if existing: - if hasattr(existing, "default_edition"): + if hasattr(existing, "default_edition") and isinstance( + existing.default_edition, models.Edition + ): return existing.default_edition return existing @@ -154,6 +177,9 @@ def get_or_create_book(self, remote_id): ) # this will dedupe automatically work = work_activity.to_model(model=models.Work, overwrite=False) + if not work: + return None + for author in self.get_authors_from_data(work_data): work.authors.add(author) @@ -161,12 +187,21 @@ def get_or_create_book(self, remote_id): load_more_data.delay(self.connector.id, work.id) return edition - def get_book_data(self, remote_id): # pylint: disable=no-self-use + def get_book_data(self, remote_id: str) -> JsonDict: # pylint: disable=no-self-use """this allows connectors to override the default behavior""" return get_data(remote_id) - def create_edition_from_data(self, work, edition_data, instance=None): + def create_edition_from_data( + self, + work: models.Work, + edition_data: Union[str, JsonDict], + instance: Optional[models.Edition] = None, + ) -> Optional[models.Edition]: """if we already have the work, we're ready""" + if isinstance(edition_data, str): + # We don't expect a string here + return None + mapped_data = dict_from_mappings(edition_data, self.book_mappings) mapped_data["work"] = work.remote_id edition_activity = activitypub.Edition(**mapped_data) @@ -174,6 +209,9 @@ def create_edition_from_data(self, work, edition_data, instance=None): model=models.Edition, overwrite=False, instance=instance ) + if not edition: + return None + # if we're updating an existing instance, we don't need to load authors if instance: return edition @@ -190,7 +228,9 @@ def create_edition_from_data(self, work, edition_data, instance=None): return edition - def get_or_create_author(self, remote_id, instance=None): + def get_or_create_author( + self, remote_id: str, instance: Optional[models.Author] = None + ) -> Optional[models.Author]: """load that author""" if not instance: existing = models.Author.find_existing_by_remote_id(remote_id) @@ -210,46 +250,51 @@ def get_or_create_author(self, remote_id, instance=None): model=models.Author, overwrite=False, instance=instance ) - def get_remote_id_from_model(self, obj): + def get_remote_id_from_model(self, obj: models.BookDataModel) -> Optional[str]: """given the data stored, how can we look this up""" - return getattr(obj, getattr(self, "generated_remote_link_field")) + remote_id: Optional[str] = getattr(obj, self.generated_remote_link_field) + return remote_id - def update_author_from_remote(self, obj): + def update_author_from_remote(self, obj: models.Author) -> Optional[models.Author]: """load the remote data from this connector and add it to an existing author""" remote_id = self.get_remote_id_from_model(obj) + if not remote_id: + return None return self.get_or_create_author(remote_id, instance=obj) - def update_book_from_remote(self, obj): + def update_book_from_remote(self, obj: models.Edition) -> Optional[models.Edition]: """load the remote data from this connector and add it to an existing book""" remote_id = self.get_remote_id_from_model(obj) + if not remote_id: + return None data = self.get_book_data(remote_id) return self.create_edition_from_data(obj.parent_work, data, instance=obj) @abstractmethod - def is_work_data(self, data): + def is_work_data(self, data: JsonDict) -> bool: """differentiate works and editions""" @abstractmethod - def get_edition_from_work_data(self, data): + def get_edition_from_work_data(self, data: JsonDict) -> JsonDict: """every work needs at least one edition""" @abstractmethod - def get_work_from_edition_data(self, data): + def get_work_from_edition_data(self, data: JsonDict) -> JsonDict: """every edition needs a work""" @abstractmethod - def get_authors_from_data(self, data): + def get_authors_from_data(self, data: JsonDict) -> Iterator[models.Author]: """load author data""" @abstractmethod - def expand_book_data(self, book): + def expand_book_data(self, book: models.Book) -> None: """get more info on a book""" -def dict_from_mappings(data, mappings): +def dict_from_mappings(data: JsonDict, mappings: list[Mapping]) -> JsonDict: """create a dict in Activitypub format, using mappings supplies by the subclass""" - result = {} + result: JsonDict = {} for mapping in mappings: # sometimes there are multiple mappings for one field, don't # overwrite earlier writes in that case @@ -259,7 +304,11 @@ def dict_from_mappings(data, mappings): return result -def get_data(url, params=None, timeout=settings.QUERY_TIMEOUT): +def get_data( + url: str, + params: Optional[dict[str, str]] = None, + timeout: int = settings.QUERY_TIMEOUT, +) -> JsonDict: """wrapper for request.get""" # check if the url is blocked raise_not_valid_url(url) @@ -292,10 +341,15 @@ def get_data(url, params=None, timeout=settings.QUERY_TIMEOUT): logger.info(err) raise ConnectorException(err) + if not isinstance(data, dict): + raise ConnectorException("Unexpected data format") + return data -def get_image(url, timeout=10): +def get_image( + url: str, timeout: int = 10 +) -> Union[tuple[ContentFile[bytes], str], tuple[None, None]]: """wrapper for requesting an image""" raise_not_valid_url(url) try: @@ -325,14 +379,19 @@ def get_image(url, timeout=10): class Mapping: """associate a local database field with a field in an external dataset""" - def __init__(self, local_field, remote_field=None, formatter=None): + def __init__( + self, + local_field: str, + remote_field: Optional[str] = None, + formatter: Optional[Callable[[Any], Any]] = None, + ): noop = lambda x: x self.local_field = local_field self.remote_field = remote_field or local_field self.formatter = formatter or noop - def get_value(self, data): + def get_value(self, data: JsonDict) -> Optional[Any]: """pull a field from incoming json and return the formatted version""" value = data.get(self.remote_field) if not value: @@ -343,7 +402,7 @@ def get_value(self, data): return None -def infer_physical_format(format_text): +def infer_physical_format(format_text: str) -> Optional[str]: """try to figure out what the standardized format is from the free value""" format_text = format_text.lower() if format_text in format_mappings: @@ -356,7 +415,7 @@ def infer_physical_format(format_text): return matches[0] -def unique_physical_format(format_text): +def unique_physical_format(format_text: str) -> Optional[str]: """only store the format if it isn't directly in the format mappings""" format_text = format_text.lower() if format_text in format_mappings: @@ -365,7 +424,7 @@ def unique_physical_format(format_text): return format_text -def maybe_isbn(query): +def maybe_isbn(query: str) -> bool: """check if a query looks like an isbn""" isbn = re.sub(r"[\W_]", "", query) # removes filler characters # ISBNs must be numeric except an ISBN10 checkdigit can be 'X' diff --git a/bookwyrm/connectors/bookwyrm_connector.py b/bookwyrm/connectors/bookwyrm_connector.py index e07a0b281a..4064f4b4c5 100644 --- a/bookwyrm/connectors/bookwyrm_connector.py +++ b/bookwyrm/connectors/bookwyrm_connector.py @@ -1,4 +1,7 @@ """ using another bookwyrm instance as a source of book data """ +from __future__ import annotations +from typing import Any, Iterator + from bookwyrm import activitypub, models from bookwyrm.book_search import SearchResult from .abstract_connector import AbstractMinimalConnector @@ -7,15 +10,19 @@ class Connector(AbstractMinimalConnector): """this is basically just for search""" - def get_or_create_book(self, remote_id): + def get_or_create_book(self, remote_id: str) -> models.Edition: return activitypub.resolve_remote_id(remote_id, model=models.Edition) - def parse_search_data(self, data, min_confidence): + def parse_search_data( + self, data: list[dict[str, Any]], min_confidence: float + ) -> Iterator[SearchResult]: for search_result in data: search_result["connector"] = self yield SearchResult(**search_result) - def parse_isbn_search_data(self, data): + def parse_isbn_search_data( + self, data: list[dict[str, Any]] + ) -> Iterator[SearchResult]: for search_result in data: search_result["connector"] = self yield SearchResult(**search_result) diff --git a/bookwyrm/connectors/connector_manager.py b/bookwyrm/connectors/connector_manager.py index e32da7c00f..444a626ba6 100644 --- a/bookwyrm/connectors/connector_manager.py +++ b/bookwyrm/connectors/connector_manager.py @@ -1,8 +1,11 @@ """ interface with whatever connectors the app has """ +from __future__ import annotations import asyncio import importlib import ipaddress import logging +from asyncio import Future +from typing import Iterator, Any, Optional, Union, overload, Literal from urllib.parse import urlparse import aiohttp @@ -12,6 +15,8 @@ from requests import HTTPError from bookwyrm import book_search, models +from bookwyrm.book_search import SearchResult +from bookwyrm.connectors import abstract_connector from bookwyrm.settings import SEARCH_TIMEOUT from bookwyrm.tasks import app, CONNECTORS @@ -22,11 +27,15 @@ class ConnectorException(HTTPError): """when the connector can't do what was asked""" -async def async_connector_search(query, items, min_confidence): +async def async_connector_search( + query: str, + items: list[tuple[str, abstract_connector.AbstractConnector]], + min_confidence: float, +) -> list[Optional[abstract_connector.ConnectorResults]]: """Try a number of requests simultaneously""" timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT) async with aiohttp.ClientSession(timeout=timeout) as session: - tasks = [] + tasks: list[Future[Optional[abstract_connector.ConnectorResults]]] = [] for url, connector in items: tasks.append( asyncio.ensure_future( @@ -35,14 +44,29 @@ async def async_connector_search(query, items, min_confidence): ) results = await asyncio.gather(*tasks) - return results + return list(results) -def search(query, min_confidence=0.1, return_first=False): +@overload +def search( + query: str, *, min_confidence: float = 0.1, return_first: Literal[False] +) -> list[abstract_connector.ConnectorResults]: + ... + + +@overload +def search( + query: str, *, min_confidence: float = 0.1, return_first: Literal[True] +) -> Optional[SearchResult]: + ... + + +def search( + query: str, *, min_confidence: float = 0.1, return_first: bool = False +) -> Union[list[abstract_connector.ConnectorResults], Optional[SearchResult]]: """find books based on arbitrary keywords""" if not query: - return [] - results = [] + return None if return_first else [] items = [] for connector in get_connectors(): @@ -57,8 +81,12 @@ def search(query, min_confidence=0.1, return_first=False): items.append((url, connector)) # load as many results as we can - results = asyncio.run(async_connector_search(query, items, min_confidence)) - results = [r for r in results if r] + # failed requests will return None, so filter those out + results = [ + r + for r in asyncio.run(async_connector_search(query, items, min_confidence)) + if r + ] if return_first: # find the best result from all the responses and return that @@ -66,11 +94,12 @@ def search(query, min_confidence=0.1, return_first=False): all_results = sorted(all_results, key=lambda r: r.confidence, reverse=True) return all_results[0] if all_results else None - # failed requests will return None, so filter those out return results -def first_search_result(query, min_confidence=0.1): +def first_search_result( + query: str, min_confidence: float = 0.1 +) -> Union[models.Edition, SearchResult, None]: """search until you find a result that fits""" # try local search first result = book_search.search(query, min_confidence=min_confidence, return_first=True) @@ -80,13 +109,13 @@ def first_search_result(query, min_confidence=0.1): return search(query, min_confidence=min_confidence, return_first=True) or None -def get_connectors(): +def get_connectors() -> Iterator[abstract_connector.AbstractConnector]: """load all connectors""" for info in models.Connector.objects.filter(active=True).order_by("priority").all(): yield load_connector(info) -def get_or_create_connector(remote_id): +def get_or_create_connector(remote_id: str) -> abstract_connector.AbstractConnector: """get the connector related to the object's server""" url = urlparse(remote_id) identifier = url.netloc @@ -110,7 +139,7 @@ def get_or_create_connector(remote_id): @app.task(queue=CONNECTORS) -def load_more_data(connector_id, book_id): +def load_more_data(connector_id: str, book_id: str) -> None: """background the work of getting all 10,000 editions of LoTR""" connector_info = models.Connector.objects.get(id=connector_id) connector = load_connector(connector_info) @@ -119,7 +148,9 @@ def load_more_data(connector_id, book_id): @app.task(queue=CONNECTORS) -def create_edition_task(connector_id, work_id, data): +def create_edition_task( + connector_id: int, work_id: int, data: Union[str, abstract_connector.JsonDict] +) -> None: """separate task for each of the 10,000 editions of LoTR""" connector_info = models.Connector.objects.get(id=connector_id) connector = load_connector(connector_info) @@ -127,23 +158,31 @@ def create_edition_task(connector_id, work_id, data): connector.create_edition_from_data(work, data) -def load_connector(connector_info): +def load_connector( + connector_info: models.Connector, +) -> abstract_connector.AbstractConnector: """instantiate the connector class""" connector = importlib.import_module( f"bookwyrm.connectors.{connector_info.connector_file}" ) - return connector.Connector(connector_info.identifier) + return connector.Connector(connector_info.identifier) # type: ignore[no-any-return] @receiver(signals.post_save, sender="bookwyrm.FederatedServer") # pylint: disable=unused-argument -def create_connector(sender, instance, created, *args, **kwargs): +def create_connector( + sender: Any, + instance: models.FederatedServer, + created: Any, + *args: Any, + **kwargs: Any, +) -> None: """create a connector to an external bookwyrm server""" if instance.application_type == "bookwyrm": get_or_create_connector(f"https://{instance.server_name}") -def raise_not_valid_url(url): +def raise_not_valid_url(url: str) -> None: """do some basic reality checks on the url""" parsed = urlparse(url) if not parsed.scheme in ["http", "https"]: diff --git a/bookwyrm/connectors/inventaire.py b/bookwyrm/connectors/inventaire.py index f3e24c0ec5..c08bcdee14 100644 --- a/bookwyrm/connectors/inventaire.py +++ b/bookwyrm/connectors/inventaire.py @@ -1,9 +1,10 @@ """ inventaire data connector """ import re +from typing import Any, Union, Optional, Iterator, Iterable from bookwyrm import models from bookwyrm.book_search import SearchResult -from .abstract_connector import AbstractConnector, Mapping +from .abstract_connector import AbstractConnector, Mapping, JsonDict from .abstract_connector import get_data from .connector_manager import ConnectorException, create_edition_task @@ -13,7 +14,7 @@ class Connector(AbstractConnector): generated_remote_link_field = "inventaire_id" - def __init__(self, identifier): + def __init__(self, identifier: str): super().__init__(identifier) get_first = lambda a: a[0] @@ -60,13 +61,13 @@ def __init__(self, identifier): Mapping("died", remote_field="wdt:P570", formatter=get_first), ] + shared_mappings - def get_remote_id(self, value): + def get_remote_id(self, value: str) -> str: """convert an id/uri into a url""" return f"{self.books_url}?action=by-uris&uris={value}" - def get_book_data(self, remote_id): + def get_book_data(self, remote_id: str) -> JsonDict: data = get_data(remote_id) - extracted = list(data.get("entities").values()) + extracted = list(data.get("entities", {}).values()) try: data = extracted[0] except (KeyError, IndexError): @@ -74,10 +75,16 @@ def get_book_data(self, remote_id): # flatten the data so that images, uri, and claims are on the same level return { **data.get("claims", {}), - **{k: data.get(k) for k in ["uri", "image", "labels", "sitelinks", "type"]}, + **{ + k: data.get(k) + for k in ["uri", "image", "labels", "sitelinks", "type"] + if k in data + }, } - def parse_search_data(self, data, min_confidence): + def parse_search_data( + self, data: JsonDict, min_confidence: float + ) -> Iterator[SearchResult]: for search_result in data.get("results", []): images = search_result.get("image") cover = f"{self.covers_url}/img/entities/{images[0]}" if images else None @@ -96,7 +103,7 @@ def parse_search_data(self, data, min_confidence): connector=self, ) - def parse_isbn_search_data(self, data): + def parse_isbn_search_data(self, data: JsonDict) -> Iterator[SearchResult]: """got some data""" results = data.get("entities") if not results: @@ -114,35 +121,44 @@ def parse_isbn_search_data(self, data): connector=self, ) - def is_work_data(self, data): + def is_work_data(self, data: JsonDict) -> bool: return data.get("type") == "work" - def load_edition_data(self, work_uri): + def load_edition_data(self, work_uri: str) -> JsonDict: """get a list of editions for a work""" # pylint: disable=line-too-long url = f"{self.books_url}?action=reverse-claims&property=wdt:P629&value={work_uri}&sort=true" return get_data(url) - def get_edition_from_work_data(self, data): - data = self.load_edition_data(data.get("uri")) + def get_edition_from_work_data(self, data: JsonDict) -> JsonDict: + work_uri = data.get("uri") + if not work_uri: + raise ConnectorException("Invalid URI") + data = self.load_edition_data(work_uri) try: uri = data.get("uris", [])[0] except IndexError: raise ConnectorException("Invalid book data") return self.get_book_data(self.get_remote_id(uri)) - def get_work_from_edition_data(self, data): - uri = data.get("wdt:P629", [None])[0] + def get_work_from_edition_data(self, data: JsonDict) -> JsonDict: + try: + uri = data.get("wdt:P629", [])[0] + except IndexError: + raise ConnectorException("Invalid book data") + if not uri: raise ConnectorException("Invalid book data") return self.get_book_data(self.get_remote_id(uri)) - def get_authors_from_data(self, data): + def get_authors_from_data(self, data: JsonDict) -> Iterator[models.Author]: authors = data.get("wdt:P50", []) for author in authors: - yield self.get_or_create_author(self.get_remote_id(author)) + model = self.get_or_create_author(self.get_remote_id(author)) + if model: + yield model - def expand_book_data(self, book): + def expand_book_data(self, book: models.Book) -> None: work = book # go from the edition to the work, if necessary if isinstance(book, models.Edition): @@ -154,11 +170,16 @@ def expand_book_data(self, book): # who knows, man return - for edition_uri in edition_options.get("uris"): + for edition_uri in edition_options.get("uris", []): remote_id = self.get_remote_id(edition_uri) create_edition_task.delay(self.connector.id, work.id, remote_id) - def create_edition_from_data(self, work, edition_data, instance=None): + def create_edition_from_data( + self, + work: models.Work, + edition_data: Union[str, JsonDict], + instance: Optional[models.Edition] = None, + ) -> Optional[models.Edition]: """pass in the url as data and then call the version in abstract connector""" if isinstance(edition_data, str): try: @@ -168,22 +189,26 @@ def create_edition_from_data(self, work, edition_data, instance=None): return None return super().create_edition_from_data(work, edition_data, instance=instance) - def get_cover_url(self, cover_blob, *_): + def get_cover_url( + self, cover_blob: Union[list[JsonDict], JsonDict], *_: Any + ) -> Optional[str]: """format the relative cover url into an absolute one: {"url": "/img/entities/e794783f01b9d4f897a1ea9820b96e00d346994f"} """ # covers may or may not be a list - if isinstance(cover_blob, list) and len(cover_blob) > 0: + if isinstance(cover_blob, list): + if len(cover_blob) == 0: + return None cover_blob = cover_blob[0] cover_id = cover_blob.get("url") - if not cover_id: + if not isinstance(cover_id, str): return None # cover may or may not be an absolute url already if re.match(r"^http", cover_id): return cover_id return f"{self.covers_url}{cover_id}" - def resolve_keys(self, keys): + def resolve_keys(self, keys: Iterable[str]) -> list[str]: """cool, it's "wd:Q3156592" now what the heck does that mean""" results = [] for uri in keys: @@ -191,10 +216,10 @@ def resolve_keys(self, keys): data = self.get_book_data(self.get_remote_id(uri)) except ConnectorException: continue - results.append(get_language_code(data.get("labels"))) + results.append(get_language_code(data.get("labels", {}))) return results - def get_description(self, links): + def get_description(self, links: JsonDict) -> str: """grab an extracted excerpt from wikipedia""" link = links.get("enwiki") if not link: @@ -204,15 +229,15 @@ def get_description(self, links): data = get_data(url) except ConnectorException: return "" - return data.get("extract") + return data.get("extract", "") - def get_remote_id_from_model(self, obj): + def get_remote_id_from_model(self, obj: models.BookDataModel) -> str: """use get_remote_id to figure out the link from a model obj""" remote_id_value = obj.inventaire_id return self.get_remote_id(remote_id_value) -def get_language_code(options, code="en"): +def get_language_code(options: JsonDict, code: str = "en") -> Any: """when there are a bunch of translation but we need a single field""" result = options.get(code) if result: diff --git a/bookwyrm/connectors/openlibrary.py b/bookwyrm/connectors/openlibrary.py index 0fd786660c..98c1b2b7ce 100644 --- a/bookwyrm/connectors/openlibrary.py +++ b/bookwyrm/connectors/openlibrary.py @@ -1,9 +1,10 @@ """ openlibrary data connector """ import re +from typing import Any, Optional, Union, Iterator, Iterable from bookwyrm import models from bookwyrm.book_search import SearchResult -from .abstract_connector import AbstractConnector, Mapping +from .abstract_connector import AbstractConnector, Mapping, JsonDict from .abstract_connector import get_data, infer_physical_format, unique_physical_format from .connector_manager import ConnectorException, create_edition_task from .openlibrary_languages import languages @@ -14,7 +15,7 @@ class Connector(AbstractConnector): generated_remote_link_field = "openlibrary_link" - def __init__(self, identifier): + def __init__(self, identifier: str): super().__init__(identifier) get_first = lambda a, *args: a[0] @@ -94,14 +95,14 @@ def __init__(self, identifier): Mapping("inventaire_id", remote_field="links", formatter=get_inventaire_id), ] - def get_book_data(self, remote_id): + def get_book_data(self, remote_id: str) -> JsonDict: data = get_data(remote_id) if data.get("type", {}).get("key") == "/type/redirect": - remote_id = self.base_url + data.get("location") + remote_id = self.base_url + data.get("location", "") return get_data(remote_id) return data - def get_remote_id_from_data(self, data): + def get_remote_id_from_data(self, data: JsonDict) -> str: """format a url from an openlibrary id field""" try: key = data["key"] @@ -109,10 +110,10 @@ def get_remote_id_from_data(self, data): raise ConnectorException("Invalid book data") return f"{self.books_url}{key}" - def is_work_data(self, data): + def is_work_data(self, data: JsonDict) -> bool: return bool(re.match(r"^[\/\w]+OL\d+W$", data["key"])) - def get_edition_from_work_data(self, data): + def get_edition_from_work_data(self, data: JsonDict) -> JsonDict: try: key = data["key"] except KeyError: @@ -124,7 +125,7 @@ def get_edition_from_work_data(self, data): raise ConnectorException("No editions for work") return edition - def get_work_from_edition_data(self, data): + def get_work_from_edition_data(self, data: JsonDict) -> JsonDict: try: key = data["works"][0]["key"] except (IndexError, KeyError): @@ -132,7 +133,7 @@ def get_work_from_edition_data(self, data): url = f"{self.books_url}{key}" return self.get_book_data(url) - def get_authors_from_data(self, data): + def get_authors_from_data(self, data: JsonDict) -> Iterator[models.Author]: """parse author json and load or create authors""" for author_blob in data.get("authors", []): author_blob = author_blob.get("author", author_blob) @@ -144,7 +145,7 @@ def get_authors_from_data(self, data): continue yield author - def get_cover_url(self, cover_blob, size="L"): + def get_cover_url(self, cover_blob: list[str], size: str = "L") -> Optional[str]: """ask openlibrary for the cover""" if not cover_blob: return None @@ -152,8 +153,10 @@ def get_cover_url(self, cover_blob, size="L"): image_name = f"{cover_id}-{size}.jpg" return f"{self.covers_url}/b/id/{image_name}" - def parse_search_data(self, data, min_confidence): - for idx, search_result in enumerate(data.get("docs")): + def parse_search_data( + self, data: JsonDict, min_confidence: float + ) -> Iterator[SearchResult]: + for idx, search_result in enumerate(data.get("docs", [])): # build the remote id from the openlibrary key key = self.books_url + search_result["key"] author = search_result.get("author_name") or ["Unknown"] @@ -174,7 +177,7 @@ def parse_search_data(self, data, min_confidence): confidence=confidence, ) - def parse_isbn_search_data(self, data): + def parse_isbn_search_data(self, data: JsonDict) -> Iterator[SearchResult]: for search_result in list(data.values()): # build the remote id from the openlibrary key key = self.books_url + search_result["key"] @@ -188,12 +191,12 @@ def parse_isbn_search_data(self, data): year=search_result.get("publish_date"), ) - def load_edition_data(self, olkey): + def load_edition_data(self, olkey: str) -> JsonDict: """query openlibrary for editions of a work""" url = f"{self.books_url}/works/{olkey}/editions" return self.get_book_data(url) - def expand_book_data(self, book): + def expand_book_data(self, book: models.Book) -> None: work = book # go from the edition to the work, if necessary if isinstance(book, models.Edition): @@ -206,14 +209,14 @@ def expand_book_data(self, book): # who knows, man return - for edition_data in edition_options.get("entries"): + for edition_data in edition_options.get("entries", []): # does this edition have ANY interesting data? if ignore_edition(edition_data): continue create_edition_task.delay(self.connector.id, work.id, edition_data) -def ignore_edition(edition_data): +def ignore_edition(edition_data: JsonDict) -> bool: """don't load a million editions that have no metadata""" # an isbn, we love to see it if edition_data.get("isbn_13") or edition_data.get("isbn_10"): @@ -232,19 +235,19 @@ def ignore_edition(edition_data): return True -def get_description(description_blob): +def get_description(description_blob: Union[JsonDict, str]) -> Optional[str]: """descriptions can be a string or a dict""" if isinstance(description_blob, dict): return description_blob.get("value") return description_blob -def get_openlibrary_key(key): +def get_openlibrary_key(key: str) -> str: """convert /books/OL27320736M into OL27320736M""" return key.split("/")[-1] -def get_languages(language_blob): +def get_languages(language_blob: Iterable[JsonDict]) -> list[Optional[str]]: """/language/eng -> English""" langs = [] for lang in language_blob: @@ -252,14 +255,14 @@ def get_languages(language_blob): return langs -def get_dict_field(blob, field_name): +def get_dict_field(blob: Optional[JsonDict], field_name: str) -> Optional[Any]: """extract the isni from the remote id data for the author""" if not blob or not isinstance(blob, dict): return None return blob.get(field_name) -def get_wikipedia_link(links): +def get_wikipedia_link(links: list[Any]) -> Optional[str]: """extract wikipedia links""" if not isinstance(links, list): return None @@ -272,7 +275,7 @@ def get_wikipedia_link(links): return None -def get_inventaire_id(links): +def get_inventaire_id(links: list[Any]) -> Optional[str]: """extract and format inventaire ids""" if not isinstance(links, list): return None @@ -282,11 +285,13 @@ def get_inventaire_id(links): continue if link.get("title") == "inventaire.io": iv_link = link.get("url") + if not isinstance(iv_link, str): + return None return iv_link.split("/")[-1] return None -def pick_default_edition(options): +def pick_default_edition(options: list[JsonDict]) -> Optional[JsonDict]: """favor physical copies with covers in english""" if not options: return None diff --git a/bookwyrm/models/activitypub_mixin.py b/bookwyrm/models/activitypub_mixin.py index 4b53c6e872..36317ad4ed 100644 --- a/bookwyrm/models/activitypub_mixin.py +++ b/bookwyrm/models/activitypub_mixin.py @@ -6,8 +6,9 @@ import json import operator import logging -from typing import List +from typing import Any, Optional from uuid import uuid4 +from typing_extensions import Self import aiohttp from Crypto.PublicKey import RSA @@ -85,7 +86,7 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @classmethod - def find_existing_by_remote_id(cls, remote_id): + def find_existing_by_remote_id(cls, remote_id: str) -> Self: """look up a remote id in the db""" return cls.find_existing({"id": remote_id}) @@ -137,7 +138,7 @@ def broadcast(self, activity, sender, software=None, queue=BROADCAST): queue=queue, ) - def get_recipients(self, software=None) -> List[str]: + def get_recipients(self, software=None) -> list[str]: """figure out which inbox urls to post to""" # first we have to figure out who should receive this activity privacy = self.privacy if hasattr(self, "privacy") else "public" @@ -198,7 +199,14 @@ def to_activity(self, **kwargs): # pylint: disable=unused-argument class ObjectMixin(ActivitypubMixin): """add this mixin for object models that are AP serializable""" - def save(self, *args, created=None, software=None, priority=BROADCAST, **kwargs): + def save( + self, + *args: Any, + created: Optional[bool] = None, + software: Any = None, + priority: str = BROADCAST, + **kwargs: Any, + ) -> None: """broadcast created/updated/deleted objects as appropriate""" broadcast = kwargs.get("broadcast", True) # this bonus kwarg would cause an error in the base save method @@ -507,14 +515,14 @@ def unfurl_related_field(related_field, sort_field=None): @app.task(queue=BROADCAST) -def broadcast_task(sender_id: int, activity: str, recipients: List[str]): +def broadcast_task(sender_id: int, activity: str, recipients: list[str]): """the celery task for broadcast""" user_model = apps.get_model("bookwyrm.User", require_ready=True) sender = user_model.objects.select_related("key_pair").get(id=sender_id) asyncio.run(async_broadcast(recipients, sender, activity)) -async def async_broadcast(recipients: List[str], sender, data: str): +async def async_broadcast(recipients: list[str], sender, data: str): """Send all the broadcasts simultaneously""" timeout = aiohttp.ClientTimeout(total=10) async with aiohttp.ClientSession(timeout=timeout) as session: diff --git a/bookwyrm/models/book.py b/bookwyrm/models/book.py index c25f8fee2b..f4a9849032 100644 --- a/bookwyrm/models/book.py +++ b/bookwyrm/models/book.py @@ -1,6 +1,7 @@ """ database schema for books and shelves """ from itertools import chain import re +from typing import Any from django.contrib.postgres.search import SearchVectorField from django.contrib.postgres.indexes import GinIndex @@ -90,7 +91,7 @@ class Meta: abstract = True - def save(self, *args, **kwargs): + def save(self, *args: Any, **kwargs: Any) -> None: """ensure that the remote_id is within this instance""" if self.id: self.remote_id = self.get_remote_id() @@ -204,7 +205,7 @@ def alt_text(self): text += f" ({self.edition_info})" return text - def save(self, *args, **kwargs): + def save(self, *args: Any, **kwargs: Any) -> None: """can't be abstract for query reasons, but you shouldn't USE it""" if not isinstance(self, Edition) and not isinstance(self, Work): raise ValueError("Books should be added as Editions or Works") @@ -343,7 +344,7 @@ def get_rank(self): # max rank is 9 return rank - def save(self, *args, **kwargs): + def save(self, *args: Any, **kwargs: Any) -> None: """set some fields on the edition object""" # calculate isbn 10/13 if self.isbn_13 and self.isbn_13[:3] == "978" and not self.isbn_10: diff --git a/bookwyrm/models/federated_server.py b/bookwyrm/models/federated_server.py index eb03d457e0..e1081ed45c 100644 --- a/bookwyrm/models/federated_server.py +++ b/bookwyrm/models/federated_server.py @@ -61,7 +61,7 @@ def unblock(self): ).update(active=True, deactivation_reason=None) @classmethod - def is_blocked(cls, url): + def is_blocked(cls, url: str) -> bool: """look up if a domain is blocked""" url = urlparse(url) domain = url.netloc diff --git a/bookwyrm/tests/connectors/test_openlibrary_connector.py b/bookwyrm/tests/connectors/test_openlibrary_connector.py index 01b9b9f6a6..88ab09856e 100644 --- a/bookwyrm/tests/connectors/test_openlibrary_connector.py +++ b/bookwyrm/tests/connectors/test_openlibrary_connector.py @@ -233,3 +233,13 @@ def test_ignore_edition(self): self.assertFalse(ignore_edition({"languages": "languages/fr"})) self.assertTrue(ignore_edition({"languages": "languages/eng"})) self.assertTrue(ignore_edition({"format": "paperback"})) + + def test_remote_id_from_model(self): + """figure out a url from an id""" + obj = models.Author.objects.create( + name="George Elliott", openlibrary_key="OL453734A" + ) + self.assertEqual( + self.connector.get_remote_id_from_model(obj), + "https://openlibrary.org/authors/OL453734A", + ) diff --git a/mypy.ini b/mypy.ini index 39f6863fe6..2a29e314f0 100644 --- a/mypy.ini +++ b/mypy.ini @@ -10,6 +10,9 @@ django_settings_module = "bookwyrm.settings" ignore_errors = True implicit_reexport = True +[mypy-bookwyrm.connectors.*] +ignore_errors = False + [mypy-celerywyrm.*] ignore_errors = False