Skip to content

Commit

Permalink
feat(io): Allow reading data from compressed file
Browse files Browse the repository at this point in the history
- Add support for opening gzip-, bzip2-, or LZMA-compressed files.
- Additional tests for io.open_as_file function
  • Loading branch information
probberechts committed Apr 12, 2024
1 parent 92a4818 commit 0c53883
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 34 deletions.
30 changes: 22 additions & 8 deletions kloppy/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,31 @@
import os
import urllib.parse
from dataclasses import dataclass, replace
from pathlib import PurePath
from typing import Union, IO, BinaryIO, Tuple

from io import BytesIO
from pathlib import PurePath
from typing import IO, BinaryIO, Tuple, Union

from kloppy.config import get_config
from kloppy.exceptions import InputNotFoundError
from kloppy.infra.io.adapters import get_adapter


logger = logging.getLogger(__name__)

_open = open

def _open(file: str, mode: str):
    """Open ``file``, picking the opener from its compression suffix.

    Paths ending in ``.gz``, ``.xz`` or ``.bz2`` are opened with
    ``gzip.open``, ``lzma.open`` or ``bz2.open`` respectively; any other
    path is opened with the builtin ``open``.

    Args:
        file: Path of the file to open, as a string.
        mode: Mode string, forwarded unchanged to the selected opener.

    Returns:
        The file object produced by the selected opener.
    """
    # The suffixes are mutually exclusive, so the order of the checks is
    # irrelevant. Codec modules are imported lazily, only when needed.
    if file.endswith(".bz2"):
        import bz2 as codec
    elif file.endswith(".xz"):
        import lzma as codec
    elif file.endswith(".gz"):
        import gzip as codec
    else:
        return open(file, mode)
    return codec.open(file, mode)


@dataclass(frozen=True)
Expand All @@ -35,10 +47,12 @@ def create(cls, input_: "FileLike", **kwargs):


def get_file_extension(f: FileLike) -> str:
if isinstance(f, str):
if isinstance(f, PurePath) or isinstance(f, str):
f = str(f)
for ext in [".gz", ".xz", ".bz2"]:
if f.endswith(ext):
f = f[: -len(ext)]
return os.path.splitext(f)[1]
elif isinstance(f, PurePath):
return os.path.splitext(f.name)[1]
elif isinstance(f, Source):
return get_file_extension(f.data)
else:
Expand Down
37 changes: 11 additions & 26 deletions kloppy/tests/test_helpers.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,31 @@
import os
import sys
from pathlib import Path

import pytest

from kloppy.config import config_context
from pandas import DataFrame
from pandas.testing import assert_frame_equal


from kloppy import opta, statsbomb, tracab
from kloppy.config import config_context
from kloppy.domain import (
Period,
DatasetFlag,
Point,
AttackingDirection,
TrackingDataset,
NormalizedPitchDimensions,
DatasetFlag,
Dimension,
Orientation,
Provider,
Frame,
Ground,
Metadata,
MetricaCoordinateSystem,
Team,
Ground,
NormalizedPitchDimensions,
Orientation,
Period,
Player,
PlayerData,
Point,
Point3D,
Provider,
Team,
TrackingDataset,
)

from kloppy import opta, tracab, statsbomb
from kloppy.io import open_as_file


class TestHelpers:
def _get_tracking_dataset(self):
Expand Down Expand Up @@ -517,12 +511,3 @@ def test_to_df_pyarrow(self):
df = dataset.to_df(engine="pandas[pyarrow]")
assert isinstance(df, pd.DataFrame)
assert isinstance(df.dtypes["ball_x"], pd.ArrowDtype)


class TestOpenAsFile:
    def test_path(self):
        """The data read via open_as_file is as long as the file's on-disk size."""
        xml_file = Path(__file__).parent / "files/tracab_meta.xml"
        with open_as_file(xml_file) as handle:
            content = handle.read()

        assert len(content) == os.path.getsize(xml_file)
88 changes: 88 additions & 0 deletions kloppy/tests/test_io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import os
from pathlib import Path

from kloppy.io import open_as_file, get_file_extension


class TestOpenAsFile:
    """Tests for the open_as_file function."""

    def test_bytes(self, base_dir: Path):
        """It should be able to open a file from a bytes object."""
        path = base_dir / "files" / "tracab_meta.xml"
        with open(path, "rb") as f:
            data = f.read()

        with open_as_file(data) as fp:
            assert fp.read() == data

    def test_str(self, base_dir: Path):
        """It should be able to open a file from a string object."""
        path = str(base_dir / "files" / "tracab_meta.xml")
        with open_as_file(path) as fp:
            data = fp.read()

        assert len(data) == os.path.getsize(path)

    def test_path(self, base_dir: Path):
        """It should be able to open a file from a Path object."""
        path = base_dir / "files" / "tracab_meta.xml"
        with open_as_file(path) as fp:
            data = fp.read()

        assert len(data) == os.path.getsize(path)

    def test_gzip(self, base_dir: Path, tmp_path: Path):
        """It should be able to open a gzipped file."""
        raw_path = base_dir / "files" / "tracab_meta.xml"
        gz_path = tmp_path / "tracab_meta.xml.gz"
        # Create a gzipped file
        import gzip

        with open(raw_path, "rb") as f:
            with gzip.open(gz_path, "wb") as f_out:
                f_out.write(f.read())
        # Read the gzipped file (not the raw one), so that the
        # decompression path is actually exercised.
        with open_as_file(gz_path) as fp:
            data = fp.read()

        # The decompressed payload must be as long as the original file.
        assert len(data) == os.path.getsize(raw_path)

    def test_xz(self, base_dir: Path, tmp_path: Path):
        """It should be able to open a LZMA-compressed file."""
        raw_path = base_dir / "files" / "tracab_meta.xml"
        # The ".xz" suffix is what selects the LZMA opener.
        xz_path = tmp_path / "tracab_meta.xml.xz"
        # Create a LZMA-compressed file
        import lzma

        with open(raw_path, "rb") as f:
            with lzma.open(xz_path, "wb") as f_out:
                f_out.write(f.read())
        # Read the LZMA-compressed file
        with open_as_file(xz_path) as fp:
            data = fp.read()

        assert len(data) == os.path.getsize(raw_path)

    def test_bz2(self, base_dir: Path, tmp_path: Path):
        """It should be able to open a bzip2-compressed file."""
        raw_path = base_dir / "files" / "tracab_meta.xml"
        # The ".bz2" suffix is what selects the bzip2 opener.
        bz2_path = tmp_path / "tracab_meta.xml.bz2"
        # Create a bz2-compressed file
        import bz2

        with open(raw_path, "rb") as f:
            with bz2.open(bz2_path, "wb") as f_out:
                f_out.write(f.read())
        # Read the bzip2-compressed file
        with open_as_file(bz2_path) as fp:
            data = fp.read()

        assert len(data) == os.path.getsize(raw_path)


def test_get_file_extension():
    """get_file_extension returns the data suffix, ignoring a trailing ``.gz``."""
    expectations = [
        (Path("data.xml"), ".xml"),
        ("data.xml", ".xml"),
        ("data.xml.gz", ".xml"),
        ("data", ""),
    ]
    for candidate, suffix in expectations:
        assert get_file_extension(candidate) == suffix

0 comments on commit 0c53883

Please sign in to comment.