Commit 79205fd

Merge branch 'main' into feat/custom-opt

Borda committed Nov 13, 2023
2 parents ae637a1 + 1914c91

Showing 16 changed files with 1,005 additions and 1,013 deletions.
22 changes: 22 additions & 0 deletions .github/workflows/label-conflicts.yml

@@ -0,0 +1,22 @@
+name: Label merge conflicts
+
+on:
+  push:
+    branches: ["main"]
+  pull_request_target:
+    types: ["synchronize", "reopened", "opened"]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
+  cancel-in-progress: true
+
+jobs:
+  triage-conflicts:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: mschilde/auto-label-merge-conflicts@8c6faa8a252e35ba5e15703b3d747bf726cdb95c # Oct 25, 2021
+        with:
+          CONFLICT_LABEL_NAME: "has conflicts"
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          MAX_RETRIES: 3
+          WAIT_MS: 5000
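For context, actions like this one poll GitHub's REST API for each open PR's mergeable state and toggle the label accordingly. A minimal sketch of that check, assuming the `requests` package and a `GITHUB_TOKEN` environment variable (the function name is hypothetical; the `mergeable` field and endpoint are from GitHub's public REST API, and the retry loop mirrors the `MAX_RETRIES`/`WAIT_MS` settings above):

```python
import os
import time

import requests


def pr_has_conflicts(repo: str, number: int, retries: int = 3, wait_s: float = 5.0) -> bool:
    """Return True if GitHub reports the pull request as unmergeable."""
    url = f"https://api.github.com/repos/{repo}/pulls/{number}"
    headers = {"Authorization": f"token {os.environ['GITHUB_TOKEN']}"}
    for _ in range(retries):
        # GitHub computes `mergeable` lazily; it is null until the check finishes,
        # so poll a few times before giving up.
        mergeable = requests.get(url, headers=headers, timeout=10).json().get("mergeable")
        if mergeable is not None:
            return not mergeable
        time.sleep(wait_s)
    return False  # still unknown after retries; assume no conflict
```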
17 changes: 16 additions & 1 deletion .github/workflows/testing.yml

@@ -16,6 +16,9 @@ jobs:
           - {os: 'ubuntu-20.04', python-version: "3.7", requires: 'oldest'}
           - {os: 'ubuntu-20.04', python-version: "3.8", requires: 'oldest'}
 
+    env:
+      TORCH_URL: "https://download.pytorch.org/whl/cpu/torch_stable.html"
+
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}
@@ -34,8 +37,20 @@ jobs:
 
       - name: Install main package & dependencies
        run: |
-          pip install -e .[extra] -r requirements_dev.txt -f https://download.pytorch.org/whl/cpu/torch_stable.html
+          pip install -e .[extra] -r requirements_dev.txt -f ${TORCH_URL}
           pip list
+      - name: Restore test's datasets
+        uses: actions/cache/restore@v3
+        with:
+          path: tests/.datasets
+          key: test-datasets
+
       - name: Run test-suite
         run: python -m pytest -v
+
+      - name: Save test's datasets
+        uses: actions/cache/save@v3
+        with:
+          path: tests/.datasets
+          key: test-datasets
8 changes: 4 additions & 4 deletions docs/data.md

@@ -5,16 +5,16 @@ Pytorch Tabular handles this using a `DataConfig` object.
 ## Basic Usage
 
 - `target`: List[str]: A list of strings with the names of the target column(s)
-- `continuous_columns`: List[str]: Column names of the numeric fields. Defaults to []
-- `categorical_columns`: List[str]: Column names of the categorical fields to treat differently
+- `continuous_cols`: List[str]: Column names of the numeric fields. Defaults to []
+- `categorical_cols`: List[str]: Column names of the categorical fields to treat differently
 
 ### Usage Example
 
 ```python
 data_config = DataConfig(
     target=["label"],
-    continuous_columns=["feature_1", "feature_2"],
-    categorical_columns=["cat_feature_1", "cat_feature_2"],
+    continuous_cols=["feature_1", "feature_2"],
+    categorical_cols=["cat_feature_1", "cat_feature_2"],
 )
 ```
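The docs fix above matters because the library's actual argument names are `continuous_cols`/`categorical_cols`. To make that concrete, a hedged end-to-end sketch of how a `DataConfig` typically feeds into a `TabularModel` (the model and trainer settings are illustrative, not from this diff):

```python
from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models import CategoryEmbeddingModelConfig

data_config = DataConfig(
    target=["label"],
    continuous_cols=["feature_1", "feature_2"],        # note: *_cols, not *_columns
    categorical_cols=["cat_feature_1", "cat_feature_2"],
)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=CategoryEmbeddingModelConfig(task="classification"),
    optimizer_config=OptimizerConfig(),
    trainer_config=TrainerConfig(max_epochs=5),
)
# tabular_model.fit(train=train_df)  # train_df: a pandas DataFrame with the columns above
```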
160 changes: 81 additions & 79 deletions src/pytorch_tabular/tabular_datamodule.py

@@ -160,29 +160,26 @@ def _encode_date_columns(self, data: pd.DataFrame) -> pd.DataFrame:
         return data, added_features
 
     def _encode_categorical_columns(self, data: pd.DataFrame, stage: str) -> pd.DataFrame:
-        if stage == "fit":
-            if self.do_leave_one_out_encoder():
-                logger.debug("Encoding Categorical Columns using LeaveOneOutEncoder")
-                self.categorical_encoder = ce.LeaveOneOutEncoder(
-                    cols=self.config.categorical_cols, random_state=self.seed
-                )
-                # Multi-Target Regression uses the first target to encode the categorical columns
-                if len(self.config.target) > 1:
-                    logger.warning(
-                        f"Multi-Target Regression: using the first target({self.config.target[0]})"
-                        f" to encode the categorical columns"
-                    )
-                data = self.categorical_encoder.fit_transform(data, data[self.config.target[0]])
-            else:
-                logger.debug("Encoding Categorical Columns using OrdinalEncoder")
-                self.categorical_encoder = OrdinalEncoder(
-                    cols=self.config.categorical_cols,
-                    handle_unseen="impute" if self.config.handle_unknown_categories else "error",
-                    handle_missing="impute" if self.config.handle_missing_values else "error",
-                )
-                data = self.categorical_encoder.fit_transform(data)
-        else:
-            data = self.categorical_encoder.transform(data)
+        if stage != "fit":
+            return self.categorical_encoder.transform(data)
+        if self.do_leave_one_out_encoder():
+            logger.debug("Encoding Categorical Columns using LeaveOneOutEncoder")
+            self.categorical_encoder = ce.LeaveOneOutEncoder(cols=self.config.categorical_cols, random_state=self.seed)
+            # Multi-Target Regression uses the first target to encode the categorical columns
+            if len(self.config.target) > 1:
+                logger.warning(
+                    f"Multi-Target Regression: using the first target({self.config.target[0]})"
+                    f" to encode the categorical columns"
+                )
+            data = self.categorical_encoder.fit_transform(data, data[self.config.target[0]])
+        else:
+            logger.debug("Encoding Categorical Columns using OrdinalEncoder")
+            self.categorical_encoder = OrdinalEncoder(
+                cols=self.config.categorical_cols,
+                handle_unseen="impute" if self.config.handle_unknown_categories else "error",
+                handle_missing="impute" if self.config.handle_missing_values else "error",
+            )
+            data = self.categorical_encoder.fit_transform(data)
         return data
 
     def _transform_continuous_columns(self, data: pd.DataFrame, stage: str) -> pd.DataFrame:
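Why the leave-one-out branch passes the target while the ordinal branch does not: LOO encoding is supervised. A minimal, self-contained sketch with `category_encoders` (the toy frame and column names are made up; note the `OrdinalEncoder` in the diff is pytorch_tabular's own wrapper with a `handle_unseen` argument, while the upstream `category_encoders` version shown here uses `handle_unknown`):

```python
import pandas as pd
import category_encoders as ce

df = pd.DataFrame({"city": ["a", "a", "b", "b"], "y": [1.0, 3.0, 2.0, 4.0]})

# Leave-one-out is supervised: each row's category becomes the mean target of
# the *other* rows sharing that category, hence fit_transform(X, y).
loo = ce.LeaveOneOutEncoder(cols=["city"], random_state=42)
print(loo.fit_transform(df[["city"]], df["y"]))  # row 0 -> 3.0, row 1 -> 1.0, ...

# Ordinal encoding is unsupervised: categories simply map to integer codes.
ordinal = ce.OrdinalEncoder(cols=["city"], handle_unknown="value")
print(ordinal.fit_transform(df[["city"]]))  # a -> 1, b -> 2
```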
@@ -212,30 +209,33 @@ def _normalize_continuous_columns(self, data: pd.DataFrame, stage: str) -> pd.DataFrame:
         return data
 
     def _label_encode_target(self, data: pd.DataFrame, stage: str) -> pd.DataFrame:
-        if self.config.task == "classification":
-            if stage == "fit":
-                self.label_encoder = LabelEncoder()
-                data[self.config.target[0]] = self.label_encoder.fit_transform(data[self.config.target[0]])
-            else:
-                if self.config.target[0] in data.columns:
-                    data[self.config.target[0]] = self.label_encoder.transform(data[self.config.target[0]])
+        if self.config.task != "classification":
+            return data
+        if stage == "fit":
+            self.label_encoder = LabelEncoder()
+            data[self.config.target[0]] = self.label_encoder.fit_transform(data[self.config.target[0]])
+        else:
+            if self.config.target[0] in data.columns:
+                data[self.config.target[0]] = self.label_encoder.transform(data[self.config.target[0]])
         return data
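For reference, a tiny sketch of the fit/transform split this refactor preserves (toy labels, using sklearn's `LabelEncoder`):

```python
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
print(le.fit_transform(["cat", "dog", "cat", "bird"]))  # [1 2 1 0], classes sorted
print(le.transform(["dog", "bird"]))                    # [2 0], reuses the fitted mapping

# transform() raises ValueError on unseen labels, and inference frames may not
# carry the target column at all -- hence the column-presence guard in the else
# branch above rather than an unconditional transform.
```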

     def _target_transform(self, data: pd.DataFrame, stage: str) -> pd.DataFrame:
-        if self.config.task == "regression":
-            # target transform only for regression
-            if all(col in data.columns for col in self.config.target):
-                if self.do_target_transform:
-                    if stage == "fit":
-                        target_transforms = []
-                        for col in self.config.target:
-                            _target_transform = copy.deepcopy(self.target_transform_template)
-                            data[col] = _target_transform.fit_transform(data[col].values.reshape(-1, 1))
-                            target_transforms.append(_target_transform)
-                        self.target_transforms = target_transforms
-                    else:
-                        for col, _target_transform in zip(self.config.target, self.target_transforms):
-                            data[col] = _target_transform.transform(data[col].values.reshape(-1, 1))
+        if self.config.task != "regression":
+            return data
+        # target transform only for regression
+        if not all(col in data.columns for col in self.config.target):
+            return data
+        if self.do_target_transform:
+            if stage == "fit":
+                target_transforms = []
+                for col in self.config.target:
+                    _target_transform = copy.deepcopy(self.target_transform_template)
+                    data[col] = _target_transform.fit_transform(data[col].values.reshape(-1, 1))
+                    target_transforms.append(_target_transform)
+                self.target_transforms = target_transforms
+            else:
+                for col, _target_transform in zip(self.config.target, self.target_transforms):
+                    data[col] = _target_transform.transform(data[col].values.reshape(-1, 1))
         return data
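The loop deep-copies one un-fitted template so every target column gets its own independently fitted transformer. A standalone sketch of that pattern, assuming sklearn's `PowerTransformer` as a stand-in for `target_transform_template` (data and column names are made up):

```python
import copy

import numpy as np
import pandas as pd
from sklearn.preprocessing import PowerTransformer

rng = np.random.default_rng(0)
train = pd.DataFrame({"y1": rng.exponential(2.0, 100), "y2": rng.exponential(5.0, 100)})
new = pd.DataFrame({"y1": rng.exponential(2.0, 10), "y2": rng.exponential(5.0, 10)})

template = PowerTransformer()  # one shared, un-fitted template
fitted = []
for col in ["y1", "y2"]:
    tf = copy.deepcopy(template)  # fresh copy per target -> independent parameters
    train[col] = tf.fit_transform(train[col].values.reshape(-1, 1))
    fitted.append(tf)

# Inference: reapply the stored per-column transformers in the same column order.
for col, tf in zip(["y1", "y2"], fitted):
    new[col] = tf.transform(new[col].values.reshape(-1, 1))
```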

     def preprocess_data(self, data: pd.DataFrame, stage: str = "inference") -> Tuple[pd.DataFrame, list]:
@@ -286,27 +286,28 @@ def setup(self, stage: Optional[str] = None) -> None:
             stage (Optional[str], optional):
                 Internal parameter to distinguish between fit and inference. Defaults to None.
         """
-        if stage == "fit" or stage is None:
-            logger.info(f"Setting up the datamodule for {self.config.task} task")
-            if self.validation is None:
-                logger.debug(
-                    f"No validation data provided."
-                    f" Using {self.config.validation_split*100}% of train data as validation"
-                )
-                val_idx = self.train.sample(
-                    int(self.config.validation_split * len(self.train)),
-                    random_state=self.seed,
-                ).index
-                self.validation = self.train[self.train.index.isin(val_idx)]
-                self.train = self.train[~self.train.index.isin(val_idx)]
-            else:
-                self.validation = self.validation.copy()
-            # Preprocessing Train, Validation
-            self.train, _ = self.preprocess_data(self.train, stage="fit")
-            self.validation, _ = self.preprocess_data(self.validation, stage="inference")
-            if self.test is not None:
-                self.test, _ = self.preprocess_data(self.test, stage="inference")
-            self._fitted = True
+        if not (stage is None or stage == "fit"):
+            return
+        logger.info(f"Setting up the datamodule for {self.config.task} task")
+        if self.validation is None:
+            logger.debug(
+                f"No validation data provided."
+                f" Using {self.config.validation_split*100}% of train data as validation"
+            )
+            val_idx = self.train.sample(
+                int(self.config.validation_split * len(self.train)),
+                random_state=self.seed,
+            ).index
+            self.validation = self.train[self.train.index.isin(val_idx)]
+            self.train = self.train[~self.train.index.isin(val_idx)]
+        else:
+            self.validation = self.validation.copy()
+        # Preprocessing Train, Validation
+        self.train, _ = self.preprocess_data(self.train, stage="fit")
+        self.validation, _ = self.preprocess_data(self.validation, stage="inference")
+        if self.test is not None:
+            self.test, _ = self.preprocess_data(self.test, stage="inference")
+        self._fitted = True
 
     # adapted from gluonts
     @classmethod
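The index-based split in `setup` keeps row identity instead of reindexing. A minimal pandas sketch of the same idea (toy frame; the split fraction and seed are illustrative):

```python
import numpy as np
import pandas as pd

train = pd.DataFrame({"x": np.arange(10)})
validation_split, seed = 0.2, 42

# sample() draws the validation rows; isin() on the index partitions the frame
# without resetting row labels, so the two halves are disjoint by construction.
val_idx = train.sample(int(validation_split * len(train)), random_state=seed).index
validation = train[train.index.isin(val_idx)]
train = train[~train.index.isin(val_idx)]
print(len(train), len(validation))  # 8 2
```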
@@ -561,22 +562,23 @@ def test_dataloader(self, batch_size: Optional[int] = None) -> DataLoader:
         Returns:
             DataLoader: Test dataloader
         """
-        if self.test is not None:
-            dataset = TabularDataset(
-                task=self.config.task,
-                data=self.test,
-                categorical_cols=self.config.categorical_cols,
-                continuous_cols=self.config.continuous_cols,
-                embed_categorical=(not self.do_leave_one_out_encoder()),
-                target=self.target,
-            )
-            return DataLoader(
-                dataset,
-                batch_size if batch_size is not None else self.batch_size,
-                shuffle=False,
-                num_workers=self.config.num_workers,
-                pin_memory=self.config.pin_memory,
-            )
+        if self.test is None:
+            raise RuntimeError("Undefined test attribute.")
+        dataset = TabularDataset(
+            task=self.config.task,
+            data=self.test,
+            categorical_cols=self.config.categorical_cols,
+            continuous_cols=self.config.continuous_cols,
+            embed_categorical=(not self.do_leave_one_out_encoder()),
+            target=self.target,
+        )
+        return DataLoader(
+            dataset,
+            batch_size if batch_size is not None else self.batch_size,
+            shuffle=False,
+            num_workers=self.config.num_workers,
+            pin_memory=self.config.pin_memory,
+        )
 
     def _prepare_inference_data(self, df: pd.DataFrame) -> pd.DataFrame:
         """Prepare data for inference."""
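Behavioral note: `test_dataloader()` now fails fast when no test frame was supplied, where it previously fell through and returned `None`. A hedged usage sketch (the class wiring and constructor arguments are illustrative, not verbatim from this file):

```python
# hypothetical wiring; argument names are illustrative
datamodule = TabularDatamodule(train=train_df, config=config, test=None)
datamodule.setup(stage="fit")

try:
    test_loader = datamodule.test_dataloader(batch_size=256)
except RuntimeError:
    # pre-change code returned None here, deferring the failure to the caller
    test_loader = None
```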
19 changes: 14 additions & 5 deletions tests/conftest.py

@@ -1,12 +1,23 @@
-from io import BytesIO
-from urllib.request import urlopen
+import os.path
 from zipfile import ZipFile
 
 import numpy as np
 import pandas as pd
 import pytest
 from sklearn.datasets import fetch_california_housing, fetch_covtype
 
+_PATH_TEST = os.path.dirname(__file__)
+PATH_DATASETS = os.path.join(_PATH_TEST, ".datasets")
+os.makedirs(PATH_DATASETS, exist_ok=True)
+
+DATASET_ZIP_OCCUPANCY = os.path.join(PATH_DATASETS, "occupancy_data.zip")
+if not os.path.isfile(DATASET_ZIP_OCCUPANCY):
+    import urllib.request
+
+    urllib.request.urlretrieve(
+        "https://archive.ics.uci.edu/ml/machine-learning-databases/00357/occupancy_data.zip", DATASET_ZIP_OCCUPANCY
+    )
+
 
 def load_regression_data():
     dataset = fetch_california_housing(data_home="data", as_frame=True)
@@ -34,9 +45,7 @@ def load_classification_data():
 
 
 def load_timeseries_data():
-    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00357/occupancy_data.zip"
-    resp = urlopen(url)
-    zipfile = ZipFile(BytesIO(resp.read()))
+    zipfile = ZipFile(DATASET_ZIP_OCCUPANCY)
     train = pd.read_csv(zipfile.open("datatraining.txt"), sep=",")
     val = pd.read_csv(zipfile.open("datatest.txt"), sep=",")
     test = pd.read_csv(zipfile.open("datatest2.txt"), sep=",")
(11 of the 16 changed files are not shown above.)