Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement FileIO.append (for CSV, JSON and NDJSON) #44

Merged
merged 4 commits into from
Nov 18, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ dist
_version.py
.idea/
venv/
tmp/
95 changes: 89 additions & 6 deletions dune_client/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
import logging
import os.path
from enum import Enum
from os.path import exists
from pathlib import Path
from typing import TextIO
from typing import TextIO, Callable, List, Tuple

# ndjson missing types: https://github.com/rhgrant10/ndjson/issues/10
import ndjson # type: ignore
Expand Down Expand Up @@ -60,14 +61,17 @@ def load(self, file: TextIO) -> list[DuneRecord]:
return list(ndjson.reader(file))
raise ValueError(f"Unrecognized FileType {self} for {file.name}")

def write(self, out_file: TextIO, data: list[DuneRecord]) -> None:
def write(
self, out_file: TextIO, data: list[DuneRecord], skip_headers: bool = False
) -> None:
"""Writes `data` to `out_file`"""
logger.debug(f"writing results to file {out_file.name}")
if self == FileType.CSV:
headers = data[0].keys()
data_tuple = [tuple(rec.values()) for rec in data]
dict_writer = csv.DictWriter(out_file, headers, lineterminator="\n")
dict_writer.writeheader()
if not skip_headers:
dict_writer.writeheader()
writer = csv.writer(out_file, lineterminator="\n")
writer.writerows(data_tuple)

Expand All @@ -82,6 +86,18 @@ def write(self, out_file: TextIO, data: list[DuneRecord]) -> None:
raise ValueError(f"Unrecognized FileType {self} for {out_file.name}")


# def skip_empty(func):
# """Decorator used on write methods"""
#
# def write_wrapper(self, data, name, ftype):
# if len(data) == 0:
# logger.info(f"Nothing to write to {name}... skipping")
# return
# func(self, data, name, ftype)
#
# return write_wrapper


bh2smith marked this conversation as resolved.
Show resolved Hide resolved
class FileIO:
"""
CSV is a more compact file type,
Expand All @@ -106,13 +122,77 @@ def _filepath(self, name: str, ftype: FileType) -> str:
"""Internal method for building absolute path."""
return os.path.join(self.path, name + str(ftype))

def _write(self, data: list[DuneRecord], name: str, ftype: FileType) -> None:
# TODO - use @skip_empty decorator here. Couldn't get the types to work.
def _write(self, data: List[DuneRecord], name: str, ftype: FileType) -> None:
    """
    Write `data` to the file `name` with format `ftype`, creating or
    overwriting it. Empty `data` is a no-op (logged and skipped).
    """
    # TODO - a @skip_empty decorator would remove this duplication with
    # _append, but typing it cleanly requires python >= 3.10:
    # https://github.com/cowprotocol/dune-client/issues/37#issuecomment-1319901120
    if not data:
        logger.info(f"Nothing to write to {name}... skipping")
        return None
    with open(self._filepath(name, ftype), "w", encoding=self.encoding) as out_file:
        ftype.write(out_file, data)
    return None

def _assert_matching_keys(
    self, keys: Tuple[str, ...], fname: str, ftype: FileType
) -> None:
    """
    Validate that `keys` equals the key set of the records already stored
    in `fname`, so appended records share the existing schema.

    Raises:
        AssertionError: when the key tuples differ (callers and the unit
            tests rely on this exception type).
        ValueError: for an unrecognized FileType.
    """
    with open(fname, "r", encoding=self.encoding) as file:
        if ftype == FileType.CSV:
            # First line of a CSV written by this class is the header row.
            existing_keys = file.readline().strip().split(",")
        elif ftype == FileType.JSON:
            # JSON files hold a single-line list of records; the first
            # record's keys define the schema.
            existing_keys = list(json.loads(file.readline())[0].keys())
        elif ftype == FileType.NDJSON:
            # NDJSON: one JSON object per line.
            existing_keys = list(json.loads(file.readline()).keys())
        else:
            # Previously `existing_keys` was left unbound here, causing a
            # confusing NameError below; fail loudly instead (matching the
            # ValueError style used by FileType.write/load).
            raise ValueError(f"Unrecognized FileType {ftype} for {fname}")

    key_tuple = tuple(existing_keys)
    # Explicit raise instead of a bare `assert`: asserts are stripped under
    # `python -O`, which would silently disable this schema check. The
    # exception type and message are preserved for existing callers/tests.
    if keys != key_tuple:
        raise AssertionError(f"{keys} != {key_tuple}")

def _append(self, data: List[DuneRecord], name: str, ftype: FileType) -> None:
    """
    Append `data` to the file `name` of format `ftype`.
    Falls back to `_write` (with a warning) when the file does not exist.
    """
    if not data:
        logger.info(f"Nothing to write to {name}... skipping")
        return None

    fname = self._filepath(name, ftype)
    if not exists(fname):
        logger.warning(
            f"File {fname} does not exist, using write instead of append!"
        )
        return self._write(data, name, ftype)

    # Incoming records must share the schema already on disk.
    # The empty-check above guarantees data[0] exists.
    self._assert_matching_keys(tuple(data[0].keys()), fname, ftype)

    if ftype == FileType.JSON:
        # A JSON file stores one list, so "append" means load the whole
        # file, concatenate, and rewrite it.
        with open(fname, "r", encoding=self.encoding) as existing_file:
            existing_data = ftype.load(existing_file)
        return self._write(existing_data + data, name, ftype)

    # CSV and NDJSON support true appends; CSV headers already exist,
    # so they are skipped here.
    with open(fname, "a+", encoding=self.encoding) as out_file:
        return ftype.write(out_file, data, skip_headers=True)

def append_csv(self, data: list[DuneRecord], name: str) -> None:
    """
    Appends `data` to csv file `name`.

    CSV is special-cased internally: when the file already exists the
    header row is skipped, and incoming headers are validated against it.
    """
    self._append(data, name, FileType.CSV)

def append_json(self, data: list[DuneRecord], name: str) -> None:
    """
    Appends `data` to json file `name`.

    Least efficient append of the three formats: the whole existing file
    must be loaded, concatenated with `data`, and rewritten, whereas CSV
    and NDJSON files can be appended to directly.
    """
    self._append(data, name, FileType.JSON)

def append_ndjson(self, data: list[DuneRecord], name: str) -> None:
    """Appends `data` to ndjson file `name` (one JSON object per line)."""
    self._append(data, name, FileType.NDJSON)

def write_csv(self, data: list[DuneRecord], name: str) -> None:
"""Writes `data` to csv file `name`"""
Expand Down Expand Up @@ -154,3 +234,6 @@ def load_singleton(
) -> DuneRecord:
"""Loads and returns single entry by index (default 0)"""
return self._load(name, self._parse_ftype(ftype))[index]


# Signature shared by FileIO's write-like methods: (self, data, name, ftype) -> None.
# NOTE(review): presumably intended for typing a future @skip_empty decorator
# over _write/_append (see the commented-out draft above) — confirm before use.
WriteLikeSignature = Callable[[FileIO, List[DuneRecord], str, FileType], None]
26 changes: 25 additions & 1 deletion tests/unit/test_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def cleanup():


def cleanup_files(func):
"""This decorator can be used for testing methods outside of this class"""
"""This decorator can be used for testing methods outside this class"""

def wrapped_func(self):
func(self)
Expand Down Expand Up @@ -48,6 +48,30 @@ def test_invertible_write_and_load(self):
f"Assert invertible failed on {ftype}",
)

def test_append_ok(self):
    """Write-then-append stores the records twice, for every file type."""
    for file_type in FileType:
        self.file_manager._write(self.dune_records, TEST_FILE, file_type)
        self.file_manager._append(self.dune_records, TEST_FILE, file_type)
        self.assertEqual(
            self.dune_records + self.dune_records,
            self.file_manager._load(TEST_FILE, file_type),
            f"test_append failed on {file_type}",
        )

def test_append_calls_write_on_new_file(self):
    """Appending to a missing file falls back to write and logs a warning."""
    for file_type in FileType:
        with self.assertLogs(level="WARNING"):
            self.file_manager._append(self.dune_records, TEST_FILE, file_type)

def test_append_error(self):
    """Appending records whose keys mismatch the file raises AssertionError."""
    # An empty dict has no keys, so it can never match the existing schema.
    invalid_records = [{}]
    for file_type in FileType:
        self.file_manager._write(self.dune_records, TEST_FILE, file_type)
        with self.assertRaises(AssertionError):
            self.file_manager._append(invalid_records, TEST_FILE, file_type)

def test_load_singleton(self):
for file_type in FileType:
self.file_manager._write(self.dune_records, TEST_FILE, file_type)
Expand Down