From 94925fbf60c7966fb6c8e4466ae5a9a9dc0ada9d Mon Sep 17 00:00:00 2001
From: Bart de Rooij <bpderooij@gmail.com>
Date: Fri, 6 Sep 2024 12:24:54 +0200
Subject: [PATCH] Insert images to fix bug with pyvips.arrayjoin & remove
 cmyk2rgb conversion

---
 dlup/backends/deepzoom_backend.py   | 65 ++++++++++++++++-------------
 dlup/backends/slidescore_backend.py |  7 ++--
 dlup/utils/backends.py              | 46 +++++++++++++-------
 3 files changed, 72 insertions(+), 46 deletions(-)

diff --git a/dlup/backends/deepzoom_backend.py b/dlup/backends/deepzoom_backend.py
index 02e5e9b3..8c93bd1b 100644
--- a/dlup/backends/deepzoom_backend.py
+++ b/dlup/backends/deepzoom_backend.py
@@ -7,16 +7,15 @@
 from pathlib import Path
 from typing import Any, Union
 
+# TODO: Fix cmyk case in read_region so we can remove PIL and numpy
+# import PIL
+import numpy as np
 import pyvips
 
 from dlup._types import PathLike
 from dlup.backends.common import AbstractSlideBackend
 from dlup.utils.backends import dict_to_snake_case, parse_xml_to_dict
 
-# TODO: Fix cmyk case in read_region so we can remove PIL and numpy
-# import PIL
-# import numpy as np
-
 METADATA_CACHE = 128
 RELEVANT_VIPS_PROPERTIES = {
     "openslide.vendor": str,
@@ -27,7 +26,9 @@
     "openslide.bounds-width": int,
     "openslide.bounds-x": int,
     "openslide.bounds-y": int,
+    "openslide.quickhash-1": str,
     "vips-loader": str,
+    "bands": int,
 }
 
 TileResponseTypes = Union[str, io.BytesIO]
@@ -100,16 +101,18 @@ def _fetch_properties(self) -> dict[str, Any]:
         if not vips_properties_file.exists():
             return {}
         # Don't convert to snake case for now to keep original vips-property names
-        vips_properties_dict = parse_xml_to_dict(vips_properties_file, _to_snake_case=False)
-        properties = {
-            relevant_key.split("openslide.")[-1]: cast_fn(vips_properties_dict["properties"][relevant_key])
+        vips_properties = parse_xml_to_dict(vips_properties_file, _to_snake_case=False)["image"]["properties"]
+        relevant_properties = {
+            relevant_key.split("openslide.")[-1]: cast_fn(vips_properties[relevant_key])
             for relevant_key, cast_fn in RELEVANT_VIPS_PROPERTIES.items()
-            if relevant_key in vips_properties_dict["properties"]
+            if relevant_key in vips_properties
         }
-        if properties.get("vips-loader") is None or properties.get("vips-loader") != "openslideload":
-            raise NotImplementedError(f"Properties not implemented for vips-loader {properties.get('vips-loader')}.")
+        if relevant_properties.get("vips-loader", "") != "openslideload":
+            raise NotImplementedError(
+                f"Properties not implemented for vips-loader {relevant_properties.get('vips-loader')}."
+            )
         # Convert to snake case naming convention in the end
-        return dict_to_snake_case(properties)
+        return dict_to_snake_case(relevant_properties)
 
     @property
     def dz_properties(self) -> dict[str, Any]:
@@ -218,6 +221,7 @@ def read_region(self, coordinates: tuple[Any, ...], level: int, size: tuple[int,
 
         x, y = (coordinates[0] // level_downsample, coordinates[1] // level_downsample)
         w, h = size
+        _overlap = self._overlap
 
         # Calculate the range of rows and columns for tiles covering the specified region
         start_row = y // tile_h
@@ -228,7 +232,12 @@ def read_region(self, coordinates: tuple[Any, ...], level: int, size: tuple[int,
         indices = list(itertools.product(range(start_row, end_row), range(start_col, end_col)))
         level_dz = self._level_count - level - 1
         tile_files = self.retrieve_deepzoom_tiles(level_dz, indices)
-        _region_tiles = []
+
+        # The number of bands can be in the vips-properties.xml, but otherwise we can interpret from image mode
+        num_bands = self.properties.get("bands", 3 if self.mode != "cmyk" else 4)
+        # We create an image from an array with zeros so we can give the correct interpretation. Arrayjoin would be
+        # faster, but does not work for unregular grids of images.
+        _region = pyvips.Image.new_from_array(np.zeros((h, w, num_bands), dtype=np.uint8), interpretation=self.mode)
         for (row, col), tile_file in zip(indices, tile_files):
             _region_tile: pyvips.Image = (
                 pyvips.Image.new_from_buffer(tile_file.getvalue(), "")
@@ -251,30 +260,28 @@ def read_region(self, coordinates: tuple[Any, ...], level: int, size: tuple[int,
 
             # All but edge tiles have overlap pixels outside of tile
             if col > 0:
-                crop_start_x += self._overlap
-                crop_end_x += self._overlap
+                crop_start_x += _overlap
+                crop_end_x += _overlap
             if col == level_end_col - 1:
-                crop_end_x -= self._overlap
+                crop_end_x -= _overlap
 
             if row > 0:
-                crop_start_y += self._overlap
-                crop_end_y += self._overlap
+                crop_start_y += _overlap
+                crop_end_y += _overlap
             if row == level_end_row - 1:
-                crop_end_y -= self._overlap
-
+                crop_end_y -= _overlap
             _cropped_region_tile = _region_tile.crop(
                 crop_start_x, crop_start_y, crop_end_x - crop_start_x, crop_end_y - crop_start_y
             )
-            _region_tiles.append(_cropped_region_tile)
-
-        _region = pyvips.Image.arrayjoin(_region_tiles, across=end_col - start_col)
-        # Convert to RGB if mode is cmyk
-        if self.mode == "cmyk":
-            # FIXME: This looks off when using pyvips but not when using PIL
-            _region = _region.colourspace("srgb")  # _region = _region.icc_transform("srgb") <- Both look off
-            # _region = pyvips.Image.new_from_array(
-            #     np.asarray(PIL.Image.fromarray(_region.numpy(), mode="CMYK").convert("RGB")), interpretation="srgb"
-            # )  # <- This looks good
+            _region = _region.insert(_cropped_region_tile, img_start_x, img_start_y)
+
+        # # Should convert from cmyk to rgb or should a user do this afterwards theirself?
+        # if self.mode == "cmyk":
+        #     # FIXME: This looks off when using pyvips (colourspace and icc_transform) but not when using PIL
+        #     _region = _region.colourspace("srgb", source_space="cmyk")  # _region = _region.icc_transform("srgb")
+        #     # _region = pyvips.Image.new_from_array(
+        #     #     np.asarray(PIL.Image.fromarray(_region.numpy(), mode="CMYK").convert("RGB")), interpretation="srgb"
+        #     # )  # <- This looks good, but seems like a roundabout way
         return _region
 
     def close(self) -> None:
diff --git a/dlup/backends/slidescore_backend.py b/dlup/backends/slidescore_backend.py
index b7bfe4b4..cd92b54f 100644
--- a/dlup/backends/slidescore_backend.py
+++ b/dlup/backends/slidescore_backend.py
@@ -22,6 +22,7 @@
 
 METADATA_CACHE = 128
 DEFAULT_ASYNC_REQUESTS = 6  # This is the number of requests to make asynchronously
+API_TOKEN_OS_VARIABLE_NAME = "SLIDESCORE_API_TOKEN"
 
 
 def open_slide(filename: PathLike) -> "SlideScoreSlide":
@@ -44,7 +45,7 @@ def __init__(self, filename: PathLike):
             raise ValueError("Filename should be SlideScore URL for SlideScoreSlide.")
 
         if not AIOHTTP_AVAILABLE:
-            raise RuntimeError("`aiohtpp` is not available. Install dlup with `slidescore_remote` dependencies.")
+            raise RuntimeError("`aiohttp` is not available. Install dlup with `slidescore_remote` dependencies.")
 
         # Parse URL with regex
         parsed_url = re.search(r"(https?://[^/?]+)(?=.*\bstudyId=(\d+))(?=.*\bimageId=(\d+)).*$", filename)
@@ -107,7 +108,7 @@ def _set_metadata(self) -> None:
         RuntimeError
             If serverside studyID is not the same as slide study_id
         """
-        api_token = os.getenv("SLIDESCORE_API_TOKEN")
+        api_token = os.getenv(API_TOKEN_OS_VARIABLE_NAME)
         if api_token is None:
             raise RuntimeError("SlideScore API token not found. Please set SLIDESCORE_API_TOKEN in os environment")
         self.headers = {"Accept": "application/json", "Authorization": f"Bearer {api_token}"}
@@ -230,7 +231,7 @@ def close(self) -> None:
         return
 
 
-def export_api_key(file_path: PathLike, os_variable_name: str = "SLIDESCORE_API_TOKEN") -> None:
+def export_api_key(file_path: PathLike, os_variable_name: str = API_TOKEN_OS_VARIABLE_NAME) -> None:
     """Reads SlideScore API key from path and exports it into operating system environment.
 
     Parameters
diff --git a/dlup/utils/backends.py b/dlup/utils/backends.py
index f9d570db..1af23a32 100644
--- a/dlup/utils/backends.py
+++ b/dlup/utils/backends.py
@@ -13,8 +13,8 @@
 
 
 def parse_xml_to_dict(file_path: PathLike | io.BytesIO, _to_snake_case: bool = True) -> dict[str, Any]:
-    """Parse XML file with name space. vips-properties.xml files will extract every property name-value pair in
-    `properties`.
+    """Parse XML file (DeepZoom DZI or vips-properties.xml) into a dictionary. `vips-properties.xml` files will extract
+    every property name-value pair in `properties` key.
 
     Parameters
     ----------
@@ -29,25 +29,43 @@ def parse_xml_to_dict(file_path: PathLike | io.BytesIO, _to_snake_case: bool = T
         Parsed XML file as a dictionary. Name space will be replaced with an empty string.
     """
     root = ET.parse(file_path).getroot()
-    namespace = root.tag.split("}")[0] + "}" if len(root.tag.split("}")) > 1 else ""
+    namespace = "".join(root.tag.partition("}")[:2]) if "}" in root.tag else ""
     root_tag = root.tag.replace(namespace, "")
     parsed_dict: dict[str, dict[str, Any]] = {root_tag: dict(root.attrib)}
     for elem in root:
         tag = elem.tag.replace(namespace, "")
-        if tag == "properties":
-            properties = {}
-            for prop in elem.findall(f".//{namespace}property"):
-                name = prop.find(f"{namespace}name")
-                if name is None:
-                    continue
-                value = prop.find(f"{namespace}value")
-                properties[str(name.text)] = value.text if value is not None else value
-            parsed_dict["properties"] = properties
-        else:
-            parsed_dict[root_tag][tag] = dict(elem.attrib)
+        attributes = extract_vips_properties(elem, namespace=namespace) if tag == "properties" else dict(elem.attrib)
+        parsed_dict[root_tag][tag] = attributes
     return dict_to_snake_case(parsed_dict) if _to_snake_case else parsed_dict
 
 
+def extract_vips_properties(properties_elem: ET.Element, namespace: str) -> dict[str, Any]:
+    """
+    Extract 'properties' section from vips-properties.xml, with name-value pairs.
+
+    Parameters
+    ----------
+    properties_elem : xml.etree.ElementTree.Element
+        The 'properties' XML element.
+    namespace : str
+        The namespace for the XML document.
+
+    Returns
+    -------
+    dict[str, Any]
+        Dictionary of properties with name-value pairs.
+    """
+    properties = {}
+    for prop in properties_elem.findall(f".//{namespace}property"):
+        name_elem = prop.find(f"{namespace}name")
+        if name_elem is None:
+            continue
+        value_elem = prop.find(f"{namespace}value")
+        properties[str(name_elem.text)] = value_elem.text if value_elem is not None else None
+
+    return properties
+
+
 def dict_to_snake_case(dictionary: dict[str, Any]) -> dict[str, Any]:
     """Recursively convert all keys in a dictionary to snake case naming convention. String values will be
     converted to floats and integers if appropriate.