Improve usability of Directory datatype #17614

Merged: 17 commits, Oct 30, 2024
11 changes: 6 additions & 5 deletions lib/galaxy/config/sample/datatypes_conf.xml.sample
@@ -288,13 +288,14 @@
         <display file="qiime/qiime2/q2view.xml"/>
     </datatype>
     <datatype extension='qiime2.tabular' type="galaxy.datatypes.qiime2:QIIME2Metadata" display_in_upload="true"/>
-    <datatype extension="zip" type="galaxy.datatypes.binary:CompressedZipArchive" display_in_upload="true"/>
-    <datatype extension="ncbi_genome_dataset.zip" type="galaxy.datatypes.binary:CompressedZipArchive" subclass="true" display_in_upload="true"/>
-    <datatype extension="tar" type="galaxy.datatypes.binary:CompressedArchive" subclass="true" display_in_upload="true">
-        <converter file="tar_to_directory.xml" target_datatype="directory"/>
+    <datatype extension="zip" type="galaxy.datatypes.binary:CompressedZipArchive" display_in_upload="true">
+        <converter file="archive_to_directory.xml" target_datatype="directory" />
     </datatype>
-    <datatype extension="directory" type="galaxy.datatypes.data:Directory">
+    <datatype extension="ncbi_genome_dataset.zip" type="galaxy.datatypes.binary:CompressedZipArchive" subclass="true" display_in_upload="true"/>
+    <datatype extension="tar" auto_compressed_types="gz,bz2" type="galaxy.datatypes.binary:CompressedArchive" subclass="true" display_in_upload="true">
+        <converter file="archive_to_directory.xml" target_datatype="directory"/>
+    </datatype>
+    <datatype extension="directory" type="galaxy.datatypes.data:Directory"/>
     <datatype extension="yaml" type="galaxy.datatypes.text:Yaml" display_in_upload="true" />
     <!-- Proteomics Datatypes -->
     <datatype extension="mrm" type="galaxy.datatypes.tabular:Tabular" display_in_upload="true" subclass="true"/>
Expand Down
37 changes: 37 additions & 0 deletions lib/galaxy/datatypes/converters/archive_to_directory.xml
@@ -0,0 +1,37 @@
<tool id="CONVERTER_archive_to_directory" name="Unpack archive to directory" version="1.0.0" profile="21.09">
<!-- Use compression_utils instead of shell commands (tar/unzip) so we can verify safety of results -->
<requirements>
<requirement type="package" version="23.2.1">galaxy-util</requirement>
</requirements>
<command><![CDATA[
mkdir '$output1.files_path' &&
cd '$output1.files_path' &&
python -c "from galaxy.util.compression_utils import CompressedFile; CompressedFile('$input1').extract('.')"
]]></command>
<configfiles>
<configfile filename="metadata_json"><![CDATA[{
"output1": {
"name": "$input1.name unpacked to $__target_datatype__",
"ext": "$__target_datatype__"
}
}]]></configfile>
davelopez marked this conversation as resolved.
Show resolved Hide resolved
</configfiles>
<inputs>
<param format="tar,zip" name="input1" type="data"/>
wm75 marked this conversation as resolved.
Show resolved Hide resolved
<param name="__target_datatype__" type="select" label="Target data type">
<option value="directory">directory</option>
</param>
</inputs>
<outputs provided_metadata_file="metadata_json">
<data format="auto" name="output1"/>
davelopez marked this conversation as resolved.
Show resolved Hide resolved
</outputs>
<tests>
<test>
<param name="input1" ftype="tar" value="testdir1.tar"/>
<param name="__target_datatype__" value="directory"/>
<output name="output1" ftype="directory" value="testdir1.tar.directory"/>
</test>
</tests>
<help>
</help>
</tool>
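For context, the converter's command line amounts to the following Python, shown here as a minimal sketch (the function name and example paths are hypothetical, not part of the PR). CompressedFile is the same galaxy-util helper the tool invokes; it is used instead of shelling out to tar/unzip so archive members can be vetted before extraction:

    import os

    from galaxy.util.compression_utils import CompressedFile

    def unpack_archive(archive_path: str, dest: str) -> None:
        # Mirror the tool's command: create the output directory, then let
        # CompressedFile extract the tar/zip members into it.
        os.makedirs(dest, exist_ok=True)
        CompressedFile(archive_path).extract(dest)

    # Hypothetical usage:
    # unpack_archive("testdir1.tar", "output1_files")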
2 changes: 1 addition & 1 deletion lib/galaxy/datatypes/converters/tar_to_directory.xml
@@ -1,7 +1,7 @@
<tool id="CONVERTER_tar_to_directory" name="Convert tar to directory" version="1.0.1" profile="17.05">
<!-- Don't use tar directly so we can verify safety of results - tar -xzf '$input1'; -->
<requirements>
<requirement type="package" version="23.1">galaxy-util</requirement>
<requirement type="package" version="23.2.1">galaxy-util</requirement>
</requirements>
<command>
mkdir '$output1.files_path';
13 changes: 13 additions & 0 deletions lib/galaxy/datatypes/data.py
@@ -465,6 +465,7 @@ def _serve_file_download(self, headers, data, trans, to_ext, file_size, **kwd):
        composite_extensions = trans.app.datatypes_registry.get_composite_extensions()
        composite_extensions.append("html")  # for archiving composite datatypes
        composite_extensions.append("data_manager_json")  # for downloading bundles if bundled.
        composite_extensions.append("directory")  # for downloading directories.

        if data.extension in composite_extensions:
            return self._archive_composite_dataset(trans, data, headers, do_action=kwd.get("do_action", "zip"))
@@ -1212,6 +1213,18 @@ def regex_line_dataprovider(
class Directory(Data):
    """Class representing a directory of files."""

    file_ext = "directory"

    def _archive_main_file(
        self, archive: ZipstreamWrapper, display_name: str, data_filename: str
    ) -> Tuple[bool, str, str]:
        """Overwrites the method to not do anything.

        No main file gets added to a directory archive.
        """
        error, msg, messagetype = False, "", ""
        return error, msg, messagetype


class GenericAsn1(Text):
    """Class for generic ASN.1 text format"""

Review thread on Directory._archive_main_file:

mvdbeek (Member): Can you add a test for roundtripping a directory via the API?

davelopez (Contributor): Sure, I can try!

davelopez (Contributor): I've added a test, but I'm not sure why I'm getting ModuleNotFoundError: No module named 'galaxy' when running the converter tool... using the UI seems to work fine 🤔

mvdbeek (Member): I don't think the API tests are setting up dependency resolution. I am mostly interested in verifying that the structure of the tar archive is the same pre- and post-upload. In fact, I think even the checksum should match if we're not compressing the archive. In either case, if you upload a tar file and download it again, that should be sufficient. The converter is tested in the test framework.

davelopez (Contributor): Ahh I see, so you mean something like this: fc7e959#diff-a6ab1700bcef9e1585a2bb0f84e8888470a770fb81c3e0337930e7cad573093fR662

I'll do that 👍

davelopez (Contributor): I've tried this:

    def test_fetch_directory(self, history_id):
        testdir = TestDataResolver().get_filename("testdir.tar")
        with open(testdir, "rb") as fh:
            details = self._upload_and_get_details(
                fh, api="fetch", history_id=history_id, ext="directory", assert_ok=True
            )
        assert details["file_ext"] == "directory"
        assert details["file_size"] == 3584
        content = self.dataset_populator.get_history_dataset_content(
            history_id, dataset=details, to_ext="directory", type="bytes"
        )
        dir_path = decompress_bytes_to_directory(cast(bytes, content))
        assert dir_path.endswith("testdir")
        for path, entry_class in EXPECTED_CONTENTS.items():
            path = os.path.join(dir_path, os.path.pardir, path)
            if entry_class == "Directory":
                assert os.path.isdir(path)
            else:
                assert os.path.isfile(path)

But unless I run the converter manually, the extra_files_path is empty despite to_ext="directory". I guess that is why you have more changes persisting extra files in the object store in your referenced branch: dev...mvdbeek:galaxy:directory_datatype_improvements#diff-8640d91ef47bca302b00039012979f4b1b79f5dbffbe2431bc9a05f19fb4c7d0R132

Should we merge your branch instead? Is something still missing in your branch, or should that be how to do it?

Sorry, I'm a bit lost 😅

davelopez (Contributor, Oct 25, 2024): @mvdbeek, re-reading your comment:

    In either case if you upload a tar file and download it again that should be sufficient.

do you mean something simpler like this instead?

    def test_upload_tar_roundtrip(self, history_id):
        testdir = TestDataResolver().get_filename("testdir.tar")
        expected_hash = md5_hash_file(testdir)
        expected_size = os.path.getsize(testdir)
        with open(testdir, "rb") as fh:
            details = self._upload_and_get_details(fh, api="fetch", history_id=history_id, assert_ok=True)
        assert details["file_ext"] == "tar"
        assert details["file_size"] == expected_size
        content = cast(
            bytes, self.dataset_populator.get_history_dataset_content(history_id, dataset=details, type="bytes")
        )
        assert len(content) == expected_size
        dir_path = decompress_bytes_to_directory(content)
        expected_contents = {
            "testdir": "Directory",
            "testdir/c": "Directory",
            "testdir/a": "File",
            "testdir/b": "File",
            "testdir/c/d": "File",
        }
        assert dir_path.endswith("testdir")
        for path, entry_class in expected_contents.items():
            path = os.path.join(dir_path, os.path.pardir, path)
            if entry_class == "Directory":
                assert os.path.isdir(path)
            else:
                assert os.path.isfile(path)

        with tempfile.NamedTemporaryFile("wb") as temp:
            temp.write(content)
            actual_hash = md5_hash_file(temp.name)
            assert actual_hash == expected_hash

If this is what you mean, the uploaded vs. downloaded tar size and contents match, but the hashes don't (not sure why). I still don't see the connection with the directory datatype or the converter changes in this PR, so I might be misunderstanding something 😞

mvdbeek (Member): You've overridden _archive_main_file; as a result, you've made sure you're not getting an (empty) extra file added to the archive.

test_upload_tar_roundtrip looks fine to me. Are you sure you don't need to flush, and that that is why the checksums don't match? You could also skip the hash and just compare the bytes. If the contents are the same, with no added or removed files or structure, then it's all good.

davelopez (Contributor): Ahhh, that was it... I was missing the flush 🙈

Thank you very much!!
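The flush pitfall from the thread above, as a standalone sketch (hypothetical helper name, not part of the PR): NamedTemporaryFile buffers writes in the process, so reopening temp.name before flush() can observe truncated contents and therefore produce a different hash:

    import hashlib
    import tempfile

    def md5_via_tempfile(content: bytes) -> str:
        with tempfile.NamedTemporaryFile("wb") as temp:
            temp.write(content)  # buffered; may not be fully on disk yet
            temp.flush()  # without this, the re-read below can see partial data
            with open(temp.name, "rb") as fh:
                return hashlib.md5(fh.read()).hexdigest()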
92 changes: 83 additions & 9 deletions lib/galaxy_test/api/test_tools_upload.py
@@ -1,13 +1,17 @@
import json
import os
import tempfile
import urllib.parse
from base64 import b64encode
from typing import cast

import pytest
from tusclient import client

from galaxy.tool_util.verify.test_data import TestDataResolver
from galaxy.util import UNKNOWN
from galaxy.util.compression_utils import decompress_bytes_to_directory
from galaxy.util.hash_util import md5_hash_file
from galaxy.util.unittest_utils import (
    skip_if_github_down,
    skip_if_site_down,
@@ -29,6 +33,14 @@
B64_FOR_1_2_3 = b64encode(b"1 2 3").decode("utf-8")
URI_FOR_1_2_3 = f"base64://{B64_FOR_1_2_3}"

EXPECTED_TAR_CONTENTS = {
    "testdir": "Directory",
    "testdir/c": "Directory",
    "testdir/a": "File",
    "testdir/b": "File",
    "testdir/c/d": "File",
}


class TestToolsUpload(ApiTestCase):
    dataset_populator: DatasetPopulator
@@ -604,18 +616,11 @@ def _check_testdir_composite(self, dataset, history_id):
         assert content.strip() == "Test123"
         extra_files = self.dataset_populator.get_history_dataset_extra_files(history_id, dataset_id=dataset["id"])
         assert len(extra_files) == 5, extra_files
-        expected_contents = {
-            "testdir": "Directory",
-            "testdir/c": "Directory",
-            "testdir/a": "File",
-            "testdir/b": "File",
-            "testdir/c/d": "File",
-        }
         found_files = set()
         for extra_file in extra_files:
             path = extra_file["path"]
-            assert path in expected_contents
-            assert extra_file["class"] == expected_contents[path]
+            assert path in EXPECTED_TAR_CONTENTS
+            assert extra_file["class"] == EXPECTED_TAR_CONTENTS[path]
             found_files.add(path)

         assert len(found_files) == 5, found_files
@@ -639,6 +644,75 @@ def test_upload_composite_from_bad_tar(self, history_id):
        details = self.dataset_populator.get_history_dataset_details(history_id, dataset=dataset, assert_ok=False)
        assert details["state"] == "error"

    def test_upload_tar_roundtrip(self, history_id):
        testdir = TestDataResolver().get_filename("testdir.tar")
        expected_size = os.path.getsize(testdir)
        with open(testdir, "rb") as fh:
            details = self._upload_and_get_details(fh, api="fetch", history_id=history_id, assert_ok=True)
        assert details["file_ext"] == "tar"
        assert details["file_size"] == expected_size
        content = cast(
            bytes, self.dataset_populator.get_history_dataset_content(history_id, dataset=details, type="bytes")
        )
        # Make sure we got the expected content size.
        assert len(content) == expected_size

        # Make sure we get the expected contents.
        dir_path = decompress_bytes_to_directory(content)
        assert dir_path.endswith("testdir")
        for path, entry_class in EXPECTED_TAR_CONTENTS.items():
            path = os.path.join(dir_path, os.path.pardir, path)
            if entry_class == "Directory":
                assert os.path.isdir(path)
            else:
                assert os.path.isfile(path)

        # Make sure the hash of the content matches the hash of the original file.
        expected_hash = md5_hash_file(testdir)
        assert expected_hash is not None
        self._assert_content_matches_hash(content, expected_hash)

    def _assert_content_matches_hash(self, content: bytes, expected_hash: str):
        with tempfile.NamedTemporaryFile("wb") as temp:
            temp.write(content)
            temp.flush()
            actual_hash = md5_hash_file(temp.name)
            assert actual_hash == expected_hash

    def test_upload_zip_roundtrip(self, history_id):
        testdir = TestDataResolver().get_filename("testdir1.zip")
        expected_size = os.path.getsize(testdir)
        with open(testdir, "rb") as fh:
            details = self._upload_and_get_details(fh, api="fetch", history_id=history_id, assert_ok=True)
        assert details["file_ext"] == "zip"
        assert details["file_size"] == expected_size
        content = cast(
            bytes, self.dataset_populator.get_history_dataset_content(history_id, dataset=details, type="bytes")
        )
        # Make sure we got the expected content size.
        assert len(content) == expected_size

        # Make sure we get the expected contents.
        dir_path = decompress_bytes_to_directory(content)
        assert dir_path.endswith("testdir1")
        EXPECTED_ZIP_CONTENTS = {
            "file1": "File",
            "file2": "File",
            "dir1/": "Directory",
            "dir1/file3": "File",
        }
        for path, entry_class in EXPECTED_ZIP_CONTENTS.items():
            path = os.path.join(dir_path, path)
            if entry_class == "Directory":
                assert os.path.isdir(path)
            else:
                assert os.path.isfile(path)

        # Make sure the hash of the content matches the hash of the original file.
        expected_hash = md5_hash_file(testdir)
        assert expected_hash is not None
        self._assert_content_matches_hash(content, expected_hash)

    def test_upload_dbkey(self):
        with self.dataset_populator.test_history() as history_id:
            payload = self.dataset_populator.upload_payload(history_id, "Test123", dbkey="hg19")