From 0628a56a9a962f9b239bf65ab3b637e079d7b27a Mon Sep 17 00:00:00 2001 From: Bart de Rooij Date: Mon, 2 Sep 2024 17:18:05 +0200 Subject: [PATCH] Move helper functions to utils/backends --- .mypy.ini | 3 ++ dlup/backends/common.py | 1 + dlup/backends/deepzoom_backend.py | 73 ++--------------------------- dlup/backends/slidescore_backend.py | 12 +++-- dlup/utils/backends.py | 70 +++++++++++++++++++++++++++ dlup/utils/imports.py | 1 - pyproject.toml | 6 --- 7 files changed, 85 insertions(+), 81 deletions(-) diff --git a/.mypy.ini b/.mypy.ini index 73099ef5..78a280e0 100644 --- a/.mypy.ini +++ b/.mypy.ini @@ -31,3 +31,6 @@ ignore_missing_imports = True [mypy-darwin.*] ignore_missing_imports = True + +[mypy-aiohttp.*] +ignore_missing_imports = True diff --git a/dlup/backends/common.py b/dlup/backends/common.py index eea12dba..2200bc87 100644 --- a/dlup/backends/common.py +++ b/dlup/backends/common.py @@ -1,3 +1,4 @@ +# Copyright (c) dlup contributors from __future__ import annotations import abc diff --git a/dlup/backends/deepzoom_backend.py b/dlup/backends/deepzoom_backend.py index 31c65de7..02e5e9b3 100644 --- a/dlup/backends/deepzoom_backend.py +++ b/dlup/backends/deepzoom_backend.py @@ -4,8 +4,6 @@ import io import itertools import math -import re -import xml.etree.ElementTree as ET from pathlib import Path from typing import Any, Union @@ -13,12 +11,12 @@ from dlup._types import PathLike from dlup.backends.common import AbstractSlideBackend +from dlup.utils.backends import dict_to_snake_case, parse_xml_to_dict # TODO: Fix cmyk case in read_region so we can remove PIL and numpy # import PIL # import numpy as np - METADATA_CACHE = 128 RELEVANT_VIPS_PROPERTIES = { "openslide.vendor": str, @@ -35,73 +33,6 @@ TileResponseTypes = Union[str, io.BytesIO] -def parse_xml_to_dict(file_path: PathLike | io.BytesIO, _to_snake_case: bool = True) -> dict[str, Any]: - """Parse XML file with name space. 
vips-properties.xml files will extract every property name-value pair in - `properties`. - - Parameters - ---------- - file_path : Pathlike or BytesIO - Path or BytesIO object to XML file - _to_snake_case : bool, optional - Convert keys to snake case naming convention, by default True - - Returns - ------- - dict[str, Any] - Parsed XML file as a dictionary. Name space will be replaced with an empty string. - """ - root = ET.parse(file_path).getroot() - namespace = root.tag.split("}")[0] + "}" if len(root.tag.split("}")) > 1 else "" - root_tag = root.tag.replace(namespace, "") - parsed_dict: dict[str, dict[str, Any]] = {root_tag: dict(root.attrib)} - for elem in root: - tag = elem.tag.replace(namespace, "") - if tag == "properties": - properties = {} - for prop in elem.findall(f".//{namespace}property"): - name = prop.find(f"{namespace}name") - if name is None: - continue - value = prop.find(f"{namespace}value") - properties[str(name.text)] = value.text if value is not None else value - parsed_dict["properties"] = properties - else: - parsed_dict[root_tag][tag] = dict(elem.attrib) - return dict_to_snake_case(parsed_dict) if _to_snake_case else parsed_dict - - -def dict_to_snake_case(dictionary: dict[str, Any]) -> dict[str, Any]: - """Recursively convert all keys in a dictionary to snake case naming convention. String values will be - converted to floats and integers if appropriate. - - Parameters - ---------- - dictionary : dict[str, Any] - Dictionary with keys using Camel/Pascal naming convention. - - Returns - ------- - dict[str, Any] - Dictionary with keys using Snake case naming convention and values as strings, floats and integers. - """ - return_dict = {} - for k, v in dictionary.items(): - if isinstance(v, dict): - # Recursively convert dictionary keys - v = dict_to_snake_case(v) - elif isinstance(v, str): - # Cast to float, int or leave as string - if re.compile(r"^\d+(\.\d+)?$").match(v): - v = float(v) if "." 
in v else int(v) - - # Convert key to snake_case (i.e. no dashes/spaces, lowercase and underscore before capital letters) - if isinstance(k, str): - k = re.sub("([a-z0-9])([A-Z])", r"\1_\2", re.sub("(.)([A-Z][a-z]+)", r"\1_\2", k)).lower().replace("-", "_") - return_dict[k] = v - return return_dict - - def open_slide(filename: PathLike) -> "DeepZoomSlide": """ Read slide with DeepZoomSlide backend. The input file should be a .dzi file with the deep zoom tiles @@ -116,6 +47,8 @@ def open_slide(filename: PathLike) -> "DeepZoomSlide": class DeepZoomSlide(AbstractSlideBackend): + _properties: dict[str, Any] + _dz_properties: dict[str, Any] def __init__(self, filename: PathLike): super().__init__(filename) diff --git a/dlup/backends/slidescore_backend.py b/dlup/backends/slidescore_backend.py index 8d63b256..b7bfe4b4 100644 --- a/dlup/backends/slidescore_backend.py +++ b/dlup/backends/slidescore_backend.py @@ -8,12 +8,13 @@ from io import BytesIO from typing import Any, Optional -import dlup.utils.imports from dlup._types import PathLike -from dlup.backends.deepzoom_backend import DeepZoomSlide, TileResponseTypes, dict_to_snake_case, parse_xml_to_dict +from dlup.backends.deepzoom_backend import DeepZoomSlide, TileResponseTypes from dlup.backends.remote_backends import RemoteSlideBackend +from dlup.utils.backends import dict_to_snake_case, parse_xml_to_dict +from dlup.utils.imports import AIOHTTP_AVAILABLE -if dlup.utils.imports.AIOHTTP_AVAILABLE: +if AIOHTTP_AVAILABLE: import asyncio import aiohttp @@ -42,6 +43,9 @@ def __init__(self, filename: PathLike): if isinstance(filename, pathlib.Path): raise ValueError("Filename should be SlideScore URL for SlideScoreSlide.") + if not AIOHTTP_AVAILABLE: + raise RuntimeError("`aiohttp` is not available. 
Install dlup with `slidescore_remote` dependencies.") + # Parse URL with regex parsed_url = re.search(r"(https?://[^/?]+)(?=.*\bstudyId=(\d+))(?=.*\bimageId=(\d+)).*$", filename) if parsed_url is None: @@ -174,7 +178,7 @@ async def fetch_requests( connector = aiohttp.TCPConnector(limit=self._max_async_request) async with aiohttp.ClientSession(cookies=self.cookies, headers=self.headers, connector=connector) as session: tasks = [self.fetch_request(session=session, url=url, data=data) for url, data in zip(urls, data_dicts)] - return await asyncio.gather(*tasks) + return await asyncio.gather(*tasks) # pylint: disable=possibly-used-before-assignment def run_fetch_requests( self, diff --git a/dlup/utils/backends.py b/dlup/utils/backends.py index ec83241a..f9d570db 100644 --- a/dlup/utils/backends.py +++ b/dlup/utils/backends.py @@ -2,6 +2,9 @@ """Utilities to handle backends.""" from __future__ import annotations +import io +import re +import xml.etree.ElementTree as ET from enum import Enum from typing import Any, Callable @@ -9,6 +12,73 @@ from dlup.utils.imports import AIOHTTP_AVAILABLE +def parse_xml_to_dict(file_path: PathLike | io.BytesIO, _to_snake_case: bool = True) -> dict[str, Any]: + """Parse XML file with name space. vips-properties.xml files will extract every property name-value pair in + `properties`. + + Parameters + ---------- + file_path : Pathlike or BytesIO + Path or BytesIO object to XML file + _to_snake_case : bool, optional + Convert keys to snake case naming convention, by default True + + Returns + ------- + dict[str, Any] + Parsed XML file as a dictionary. Name space will be replaced with an empty string. 
+ """ + root = ET.parse(file_path).getroot() + namespace = root.tag.split("}")[0] + "}" if len(root.tag.split("}")) > 1 else "" + root_tag = root.tag.replace(namespace, "") + parsed_dict: dict[str, dict[str, Any]] = {root_tag: dict(root.attrib)} + for elem in root: + tag = elem.tag.replace(namespace, "") + if tag == "properties": + properties = {} + for prop in elem.findall(f".//{namespace}property"): + name = prop.find(f"{namespace}name") + if name is None: + continue + value = prop.find(f"{namespace}value") + properties[str(name.text)] = value.text if value is not None else value + parsed_dict["properties"] = properties + else: + parsed_dict[root_tag][tag] = dict(elem.attrib) + return dict_to_snake_case(parsed_dict) if _to_snake_case else parsed_dict + + +def dict_to_snake_case(dictionary: dict[str, Any]) -> dict[str, Any]: + """Recursively convert all keys in a dictionary to snake case naming convention. String values will be + converted to floats and integers if appropriate. + + Parameters + ---------- + dictionary : dict[str, Any] + Dictionary with keys using Camel/Pascal naming convention. + + Returns + ------- + dict[str, Any] + Dictionary with keys using Snake case naming convention and values as strings, floats and integers. + """ + return_dict = {} + for k, v in dictionary.items(): + if isinstance(v, dict): + # Recursively convert dictionary keys + v = dict_to_snake_case(v) + elif isinstance(v, str): + # Cast to float, int or leave as string + if re.compile(r"^\d+(\.\d+)?$").match(v): + v = float(v) if "." in v else int(v) + + # Convert key to snake_case (i.e. 
no dashes/spaces, lowercase and underscore before capital letters) + if isinstance(k, str): + k = re.sub("([a-z0-9])([A-Z])", r"\1_\2", re.sub("(.)([A-Z][a-z]+)", r"\1_\2", k)).lower().replace("-", "_") + return_dict[k] = v + return return_dict + + class ImageBackend(Enum): """Available image experimental_backends.""" diff --git a/dlup/utils/imports.py b/dlup/utils/imports.py index 8c72814f..59536e21 100644 --- a/dlup/utils/imports.py +++ b/dlup/utils/imports.py @@ -23,5 +23,4 @@ def _module_available(module_path: str) -> bool: PYTORCH_AVAILABLE = _module_available("pytorch") PYHALOXML_AVAILABLE = _module_available("pyhaloxml") DARWIN_SDK_AVAILABLE = _module_available("darwin") -BOTO3_AVAILABLE = _module_available("boto3") AIOHTTP_AVAILABLE = _module_available("aiohttp") diff --git a/pyproject.toml b/pyproject.toml index ba209ad8..2241bbd4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,12 +61,6 @@ dev = [ "darwin-py>=0.8.62", ] darwin = ["darwin-py>=0.8.59"] -s3_remote = [ - "boto3", - "botocore", - "boto3-stubs", - "botocore-stubs", -] slidescore_remote = ["asyncio", "aiohttp"] [project.urls]