Commit
some tmp work
ybressler committed May 29, 2024
1 parent 06859ad commit 5ad576f
Showing 3 changed files with 37 additions and 11 deletions.
26 changes: 23 additions & 3 deletions README.md
@@ -1,7 +1,7 @@
# Big Data Workshop
Placeholder for something meaningful.

-## Getting Started:
+## Getting Started: (Mac Users)
1. Prerequisites:
1. poetry is installed
```bash
@@ -22,14 +22,34 @@ Placeholder for something meaningful.
```bash
brew install pyenv
```
-2. Install python 3.11
+## Getting Started: (Windows Users)
+1. Install [scoop](https://scoop.sh/)
+2. Use `scoop` to install pipx and make:
+```shell
+scoop install pipx
+scoop install make
+```
+3. Use `pipx` to install poetry:
+```shell
+pipx install poetry
+pipx ensurepath
+```
+4. Install [pyenv-win](https://github.com/pyenv-win/pyenv-win)
+5. Reload your terminal
+
+## Getting Started: (All Users)
+1. Use `pyenv` to install python 3.11
```
pyenv install 3.11
```
-2. Run `make setup`
+2. Install everything with `make`:
+```shell
+make setup
+```
+3. Install pre-commit hooks:
+```shell
+pipx install pre-commit
+pipx ensurepath
+pre-commit install
+```
+4. Run all pre-commit hooks once: `pre-commit run -a`
2 changes: 1 addition & 1 deletion src/create_data/main.py
@@ -27,7 +27,7 @@
print("finished generating the data")

dt_start = datetime.datetime.now()
-# res = s3_service.upload_file(file_name, with_percentage=True)
+res = s3_service.upload_file(file_name, with_percentage=True)
dt_end = datetime.datetime.now()

duration = (dt_end - dt_start).total_seconds()
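For reference, `s3_service.upload_file` is project code that this diff doesn't show. Below is a minimal sketch of what such a helper might look like on top of boto3 — the bucket name, key choice, and the `with_percentage` progress printing are assumptions, not the module's actual implementation:

```python
import os
import sys

import boto3


def upload_file(file_name: str, bucket: str = "some-bucket", with_percentage: bool = False) -> str:
    """Upload a local file to S3, optionally printing percent progress.

    Hypothetical sketch; the real s3_service may differ.
    """
    s3 = boto3.client("s3")
    key = os.path.basename(file_name)
    total = os.path.getsize(file_name)
    sent = 0

    def progress(n_bytes: int) -> None:
        # boto3 invokes the callback with the byte count of each transferred chunk
        nonlocal sent
        sent += n_bytes
        sys.stdout.write(f"\rupload: {sent / total:.1%}")
        sys.stdout.flush()

    s3.upload_file(file_name, bucket, key, Callback=progress if with_percentage else None)
    return key
```

Timing the call with `datetime.datetime.now()` before and after, as the script above does, then measures the full transfer.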
20 changes: 13 additions & 7 deletions src/process_data/pandas/main.py
@@ -2,7 +2,7 @@
"""
Process the stuff in pandas
"""
-
+import numpy as np
import pandas as pd


@@ -50,7 +50,7 @@ def in_chunks(cls, filename: str, chunksize: int = 100_000):
)
.query("station_name == 'Alexandria'")
)
-# df_agg.columns = df_agg.columns.droplevel(0)
+
df_result = pd.concat(
[df_result, df_agg]
) # .groupby(level=0)["count"].agg({"measurement": ["min", "mean", "max", "count"]})
@@ -64,12 +64,18 @@ def in_chunks(cls, filename: str, chunksize: int = 100_000):
# "mean": lambda s: sum(s['count'] * s['mean']) / sum(s['count']),
# }
# )
-def calc_mean():
-    ...
-# Now aggregate at the end (count * mean / count)
-df_result.groupby(level=0).agg(
-    min=("measurement", "min"),
-    mean=("measurement", "mean"),
-    max=("measurement", "max"),
-    count=("measurement", "count"),
+# tmp = df_result.groupby(level=0).agg(lambda s: pd.Series({
+#     "corr(x, y)": np.corrcoef(s["x"], s["y"])[0, 1],
+#     "corr(x, z)": np.corrcoef(s["x"], s["z"])[0, 1],
+# }))
+
+df_result = df_result.groupby(level=0).agg(
+    min=("min", "min"),
+    max=("max", "max"),
+    count=("count", "sum"),
)

return df_result
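The commented-out fragments above gesture at the key idea behind chunked aggregation: min, max, and count combine directly across chunks, but the global mean has to be re-derived as a count-weighted mean. Here is a standalone sketch of that final merge, assuming each per-chunk aggregate carries `min`, `max`, `mean`, and `count` columns (the column names are assumptions, not the module's actual schema):

```python
import pandas as pd


def merge_chunk_stats(chunk_aggs: list[pd.DataFrame]) -> pd.DataFrame:
    """Combine per-chunk (min, max, mean, count) aggregates into global stats.

    Hypothetical sketch: min/max reduce directly, counts sum, and the global
    mean is the count-weighted mean sum(count * mean) / sum(count).
    """
    stacked = pd.concat(chunk_aggs)
    grouped = stacked.groupby(level=0)
    # Weighted contribution of each chunk to the group's mean
    weighted = grouped.apply(lambda g: (g["mean"] * g["count"]).sum())
    merged = grouped.agg(
        min=("min", "min"),
        max=("max", "max"),
        count=("count", "sum"),
    )
    merged["mean"] = weighted / merged["count"]
    return merged
```

With a merge like this run once at the end, the loop only ever holds one chunk plus a small table of per-chunk statistics, so memory stays bounded by `chunksize` rather than by the input file.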
