Skip to content

Commit

Permalink
DuckDB is super fast
Browse files Browse the repository at this point in the history
  • Loading branch information
ybressler committed May 30, 2024
1 parent 64b8d4e commit defdeb9
Show file tree
Hide file tree
Showing 4 changed files with 130 additions and 1 deletion.
58 changes: 57 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ parallel-pandas = "^0.6.2"
dask = "^2024.5.1"
dask-expr = "^1.1.1"
bs4 = "^0.0.2"
duckdb = "^0.10.3"

[tool.poetry.group.dev.dependencies]
pre-commit = "*"
Expand Down
Empty file.
72 changes: 72 additions & 0 deletions src/process_data/duckdb/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-
"""
Aggregate the measurements file (min / mean / max per station) using DuckDB.
Resources:
1. DuckDB documentation: ...
"""

import argparse
import time

import duckdb
import pandas as pd

from src.process_data.base import BaseProcessDataInterface


class DuckDBInterface(BaseProcessDataInterface):
    """
    Interface for executing duckDB transformations.

    Use as follows:
    >> df = DuckDBInterface.in_memory("foo.txt")
    """

    @classmethod
    def in_memory(cls, filename: str) -> pd.DataFrame:
        """
        Aggregate the whole measurements file with a single DuckDB query.

        The file is read as a headerless, ``;``-delimited CSV with two columns
        (``station_name`` as varchar, ``measurement`` as decimal(8, 1)) and is
        grouped by station.

        Args:
            filename: Name of the file. Should be relative path to the location
                where this script is invoked.

        Returns:
            A DataFrame with one row per ``station_name`` and the columns
            ``min_measurement``, ``mean_measurement`` (cast to decimal(8, 1))
            and ``max_measurement``, ordered by ``station_name``.
        """
        # The path is bound through the `?` placeholder instead of being
        # formatted into the SQL text, so file names containing quotes (or
        # anything else SQL-significant) cannot break or alter the query.
        query = """
            select
                station_name,
                min(measurement) as min_measurement,
                cast(avg(measurement) as decimal(8, 1)) as mean_measurement,
                max(measurement) as max_measurement
            from read_csv(
                ?,
                header=false,
                columns={'station_name': 'varchar', 'measurement': 'decimal(8, 1)'},
                delim=';',
                parallel=true
            )
            group by station_name
            order by station_name
        """

        with duckdb.connect() as conn:
            # Materialise the result while the connection is still open; the
            # DataFrame itself is independent of the connection afterwards.
            return conn.execute(query, [str(filename)]).df()

    @classmethod
    def streaming(cls, filename: str):
        """
        Streaming variant of the processing; not implemented for DuckDB.

        Args:
            filename: Name of the file to process.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError


if __name__ == "__main__":
    # Command-line entry point: aggregate the given measurements file, report
    # how long it took, and print a sample station as a quick sanity check.
    parser = argparse.ArgumentParser(description="Analyze measurement file")
    parser.add_argument(
        "-f",
        "--file_name",
        dest="file_name",
        type=str,
        help="File name",
        default="measurements.txt",
    )
    args = parser.parse_args()

    # perf_counter is monotonic and high-resolution, so the measured duration
    # is not affected by system clock adjustments (unlike time.time()).
    start = time.perf_counter()
    # in_memory is a classmethod; call it on the class, no instance needed.
    df = DuckDBInterface.in_memory(args.file_name)
    duration = time.perf_counter() - start

    print(f"Duration = {duration: .2f}s")
    print(df.query("station_name == 'Alexandria'").head())

0 comments on commit defdeb9

Please sign in to comment.