From 0c538838f5d579d690180e1129621cc19c642bc3 Mon Sep 17 00:00:00 2001
From: Pieter Robberechts
Date: Fri, 12 Apr 2024 13:55:15 +0200
Subject: [PATCH] feat(io): Allow reading data from compressed file

- Add support for opening a gzip, bzip2 or lzma-compressed file.
- Additional tests for io.open_as_file function
---
Review note: the compressed-file tests now read back the archive they
create (.gz / .xz / .bz2) instead of the uncompressed source file, so the
decompression branches of io._open are actually exercised. Line counts are
unchanged; the blob id on the test_io.py "index" line predates this fix.

 kloppy/io.py                 | 30 ++++++++----
 kloppy/tests/test_helpers.py | 37 +++++----------
 kloppy/tests/test_io.py      | 88 ++++++++++++++++++++++++++++++++++++
 3 files changed, 121 insertions(+), 34 deletions(-)
 create mode 100644 kloppy/tests/test_io.py

diff --git a/kloppy/io.py b/kloppy/io.py
index a3b43245..c213a90b 100644
--- a/kloppy/io.py
+++ b/kloppy/io.py
@@ -3,19 +3,31 @@
 import os
 import urllib.parse
 from dataclasses import dataclass, replace
-from pathlib import PurePath
-from typing import Union, IO, BinaryIO, Tuple
-
 from io import BytesIO
+from pathlib import PurePath
+from typing import IO, BinaryIO, Tuple, Union
 
 from kloppy.config import get_config
 from kloppy.exceptions import InputNotFoundError
 from kloppy.infra.io.adapters import get_adapter
 
-
 logger = logging.getLogger(__name__)
 
-_open = open
+
+def _open(file: str, mode: str):
+    if file.endswith(".gz"):
+        import gzip
+
+        return gzip.open(file, mode)
+    elif file.endswith(".xz"):
+        import lzma
+
+        return lzma.open(file, mode)
+    elif file.endswith(".bz2"):
+        import bz2
+
+        return bz2.open(file, mode)
+    return open(file, mode)
 
 
 @dataclass(frozen=True)
@@ -35,10 +47,12 @@ def create(cls, input_: "FileLike", **kwargs):
 
 
 def get_file_extension(f: FileLike) -> str:
-    if isinstance(f, str):
+    if isinstance(f, PurePath) or isinstance(f, str):
+        f = str(f)
+        for ext in [".gz", ".xz", ".bz2"]:
+            if f.endswith(ext):
+                f = f[: -len(ext)]
         return os.path.splitext(f)[1]
-    elif isinstance(f, PurePath):
-        return os.path.splitext(f.name)[1]
     elif isinstance(f, Source):
         return get_file_extension(f.data)
     else:
diff --git a/kloppy/tests/test_helpers.py b/kloppy/tests/test_helpers.py
index 65df77ad..cd634893 100644
--- a/kloppy/tests/test_helpers.py
+++ b/kloppy/tests/test_helpers.py
@@ -1,37 +1,31 @@
-import os
 import sys
-from pathlib import Path
 
 import pytest
-
-from kloppy.config import config_context
 from pandas import DataFrame
 from pandas.testing import assert_frame_equal
 
-
+from kloppy import opta, statsbomb, tracab
+from kloppy.config import config_context
 from kloppy.domain import (
-    Period,
-    DatasetFlag,
-    Point,
     AttackingDirection,
-    TrackingDataset,
-    NormalizedPitchDimensions,
+    DatasetFlag,
     Dimension,
-    Orientation,
-    Provider,
     Frame,
+    Ground,
     Metadata,
     MetricaCoordinateSystem,
-    Team,
-    Ground,
+    NormalizedPitchDimensions,
+    Orientation,
+    Period,
     Player,
     PlayerData,
+    Point,
     Point3D,
+    Provider,
+    Team,
+    TrackingDataset,
 )
 
-from kloppy import opta, tracab, statsbomb
-from kloppy.io import open_as_file
-
 
 class TestHelpers:
     def _get_tracking_dataset(self):
@@ -517,12 +511,3 @@ def test_to_df_pyarrow(self):
         df = dataset.to_df(engine="pandas[pyarrow]")
         assert isinstance(df, pd.DataFrame)
         assert isinstance(df.dtypes["ball_x"], pd.ArrowDtype)
-
-
-class TestOpenAsFile:
-    def test_path(self):
-        path = Path(__file__).parent / "files/tracab_meta.xml"
-        with open_as_file(path) as fp:
-            data = fp.read()
-
-        assert len(data) == os.path.getsize(path)
diff --git a/kloppy/tests/test_io.py b/kloppy/tests/test_io.py
new file mode 100644
index 00000000..52035f84
--- /dev/null
+++ b/kloppy/tests/test_io.py
@@ -0,0 +1,88 @@
+import os
+from pathlib import Path
+
+from kloppy.io import open_as_file, get_file_extension
+
+
+class TestOpenAsFile:
+    """Tests for the open_as_file function."""
+
+    def test_bytes(self, base_dir: Path):
+        """It should be able to open a file from a bytes object."""
+        path = base_dir / "files" / "tracab_meta.xml"
+        with open(path, "rb") as f:
+            data = f.read()
+
+        with open_as_file(data) as fp:
+            assert fp.read() == data
+
+    def test_str(self, base_dir: Path):
+        """It should be able to open a file from a string object."""
+        path = str(base_dir / "files" / "tracab_meta.xml")
+        with open_as_file(path) as fp:
+            data = fp.read()
+
+        assert len(data) == os.path.getsize(path)
+
+    def test_path(self, base_dir: Path):
+        """It should be able to open a file from a Path object."""
+        path = base_dir / "files" / "tracab_meta.xml"
+        with open_as_file(path) as fp:
+            data = fp.read()
+
+        assert len(data) == os.path.getsize(path)
+
+    def test_gzip(self, base_dir: Path, tmp_path: Path):
+        """It should be able to open a gzipped file."""
+        raw_path = base_dir / "files" / "tracab_meta.xml"
+        gz_path = tmp_path / "tracab_meta.xml.gz"
+        # Create a gzipped file
+        import gzip
+
+        with open(raw_path, "rb") as f:
+            with gzip.open(gz_path, "wb") as f_out:
+                f_out.write(f.read())
+        # Read the gzipped file back; it must decompress to the raw size
+        with open_as_file(gz_path) as fp:
+            data = fp.read()
+
+        assert len(data) == os.path.getsize(raw_path)
+
+    def test_xz(self, base_dir: Path, tmp_path: Path):
+        """It should be able to open a LZMA-compressed file."""
+        raw_path = base_dir / "files" / "tracab_meta.xml"
+        xz_path = tmp_path / "tracab_meta.xml.xz"
+        # Create a LZMA-compressed file
+        import lzma
+
+        with open(raw_path, "rb") as f:
+            with lzma.open(xz_path, "wb") as f_out:
+                f_out.write(f.read())
+        # Read the LZMA-compressed file back; it must decompress to the raw size
+        with open_as_file(xz_path) as fp:
+            data = fp.read()
+
+        assert len(data) == os.path.getsize(raw_path)
+
+    def test_bz2(self, base_dir: Path, tmp_path: Path):
+        """It should be able to open a bzip2-compressed file."""
+        raw_path = base_dir / "files" / "tracab_meta.xml"
+        bz2_path = tmp_path / "tracab_meta.xml.bz2"
+        # Create a bz2-compressed file
+        import bz2
+
+        with open(raw_path, "rb") as f:
+            with bz2.open(bz2_path, "wb") as f_out:
+                f_out.write(f.read())
+        # Read the bz2-compressed file back; it must decompress to the raw size
+        with open_as_file(bz2_path) as fp:
+            data = fp.read()
+
+        assert len(data) == os.path.getsize(raw_path)
+
+
+def test_get_file_extension():
+    assert get_file_extension(Path("data.xml")) == ".xml"
+    assert get_file_extension("data.xml") == ".xml"
+    assert get_file_extension("data.xml.gz") == ".xml"
+    assert get_file_extension("data") == ""