From fbbe7ee338439f91141bb32b4d7ee0ceb7ef8905 Mon Sep 17 00:00:00 2001
From: hkir-dev
Date: Mon, 8 Jul 2024 16:00:53 +0100
Subject: [PATCH] save CAS zip support

---
 setup.py               |  2 +-
 src/tdta/tdt_export.py | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 97536a9..38415b0 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@
 
 setup(
     name="tdta",
-    version="0.1.0.dev15",
+    version="0.1.0.dev16",
     description="The aim of this project is to provide taxonomy development tools custom actions.",
     long_description=README,
     long_description_content_type="text/markdown",
diff --git a/src/tdta/tdt_export.py b/src/tdta/tdt_export.py
index abf8727..9785de6 100644
--- a/src/tdta/tdt_export.py
+++ b/src/tdta/tdt_export.py
@@ -3,12 +3,15 @@
 import ast
 import json
 import typing
+import zipfile
+import shutil
 from typing import Union, List
 from contextlib import closing
 from pathlib import Path
 from datetime import datetime
 
 from tdta.utils import read_project_config
+from tdta.command_line_utils import runcmd
 from cas.model import (CellTypeAnnotation, Annotation, Labelset, AnnotationTransfer, AutomatedAnnotation, Review)
 from cas.file_utils import write_json_file
 from cas.matrix_file.resolver import resolve_matrix_file
@@ -18,6 +21,9 @@
 
 cas_table_names = ["annotation", "labelset", "metadata", "annotation_transfer", "review"]
 
+GITHUB_SIZE_LIMIT = 50 * 1000 * 1000  # 50 MB
+# GITHUB_SIZE_LIMIT = 2 * 1000  # lowered limit for local testing of the zip path
+
 
 def export_cas_data(sqlite_db: str, output_file: str, dataset_cache_folder: str = None):
     """
@@ -62,9 +68,43 @@
     write_json_file(cta, output_file, False)
     print("CAS json successfully created at: {}".format(output_file))
+    ensure_file_size_limit(output_file)
 
     return cta
 
 
+def ensure_file_size_limit(file_path):
+    """
+    Checks if the file size exceeds the GitHub size limit and zips the file if needed; inside a git repo, the zip is staged in place of the oversized file.
+    Parameters:
+        file_path: path of the file to check
+    """
+    if os.path.getsize(file_path) > GITHUB_SIZE_LIMIT:
+        zip_path = zip_file(file_path)
+        folder = os.path.dirname(file_path)
+        is_git_repo = runcmd("cd {dir} && git rev-parse --is-inside-work-tree".format(dir=folder)).strip()
+        if is_git_repo == "true":
+            runcmd("cd {dir} && git reset {file_path}".format(dir=folder, file_path=file_path))
+            runcmd("cd {dir} && git add {zip_path}".format(dir=folder, zip_path=zip_path))
+
+
+def zip_file(file_path):
+    """
+    Zips the file into a single archive created next to it, named after the file.
+    Parameters:
+        file_path: path of the file to zip
+    Returns: path of the created zip archive
+    """
+    folder = os.path.dirname(file_path)
+    base_name = os.path.basename(file_path)
+    zip_base = os.path.splitext(base_name)[0]
+
+    single_zip_path = os.path.join(folder, f"{zip_base}.zip")
+    with zipfile.ZipFile(single_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+        zipf.write(file_path, base_name)
+    print("File zipped due to GitHub size limits: " + single_zip_path)
+    return single_zip_path
+
+
 def parse_metadata_data(cta, sqlite_db, table_name):
     """
     Reads 'Metadata' table data into the CAS object
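
Note (not part of the patch): when the exported CAS JSON crosses the 50 MB
limit, the JSON stays on disk but only <name>.zip is staged, with the JSON
stored under its original basename inside the archive. Below is a minimal
sketch of how a consumer could load the export either way, assuming only
what the patch shows; load_cas_json is a hypothetical helper, not part of
tdta:

    import json
    import os
    import zipfile

    def load_cas_json(json_path):
        # Hypothetical helper, not part of tdta.
        # Prefer the raw JSON when it is present on disk.
        if os.path.exists(json_path):
            with open(json_path, encoding="utf-8") as f:
                return json.load(f)
        # Otherwise fall back to the archive written by zip_file(), which
        # stores the JSON under its original basename inside <name>.zip
        # in the same folder.
        zip_path = os.path.splitext(json_path)[0] + ".zip"
        with zipfile.ZipFile(zip_path) as zf:
            with zf.open(os.path.basename(json_path)) as member:
                return json.load(member)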