Try a simple test via GitHub Actions
michaelmior committed Jul 30, 2024
1 parent c4e2f5a commit ab169b3
Showing 9 changed files with 103 additions and 15 deletions.
45 changes: 45 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,45 @@
name: CI
on: [push, pull_request]
jobs:
  pre_job:
    name: Check duplicate
    runs-on: ubuntu-latest
    outputs:
      should_skip: ${{ steps.skip_check.outputs.should_skip }}
    steps:
      - id: skip_check
        uses: fkirc/skip-duplicate-actions@12aca0a884f6137d619d6a8a09fcc3406ced5281
        with:
          cancel_others: true
          concurrent_skipping: same_content
          do_not_skip: '["pull_request", "schedule", "workflow_dispatch"]'

  build:
    name: Test
    needs: pre_job
    if: ${{ needs.pre_job.outputs.should_skip != 'true' }}
    runs-on: ubuntu-latest
    steps:
      - run: git config --global core.autocrlf input
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
          cache: 'pipenv'
      - name: Install pipenv
        run: curl https://raw.githubusercontent.com/pypa/pipenv/master/get-pipenv.py | python
      - run: pipenv install
      - name: Compile database
        run: pipenv run python compile_db.py --output test/hs.db < test/patterns.json > test/patterns_final.json
      - name: Generate test data
        run: pipenv run python gen_test_data.py
      - name: Preprocess training data
        run: pipenv run python preprocess.py train --database test/hs.db --sherlock-path test/ --output-dir test/
      - name: Train the model
        run: pipenv run python train.py --sherlock-path test/ --input-dir test/ --output-dir test/
      - name: Preprocess test data
        run: pipenv run python preprocess.py test --database test/hs.db --sherlock-path test/ --output-dir test/
      - name: Evaluate the model
        run: pipenv run python test.py --sherlock-path test/ --input-dir test/ | grep "weighted avg 1.00"
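
For debugging outside Actions, the same sequence can be mirrored locally. A minimal sketch, assuming pipenv is already installed; the subprocess wrapper is an illustration, only the commands themselves come from the workflow above:

    # Hypothetical local mirror of the CI steps (assumes pipenv is installed).
    import subprocess

    steps = [
        "pipenv install",
        "pipenv run python compile_db.py --output test/hs.db"
        " < test/patterns.json > test/patterns_final.json",
        "pipenv run python gen_test_data.py",
        "pipenv run python preprocess.py train --database test/hs.db --sherlock-path test/ --output-dir test/",
        "pipenv run python train.py --sherlock-path test/ --input-dir test/ --output-dir test/",
        "pipenv run python preprocess.py test --database test/hs.db --sherlock-path test/ --output-dir test/",
        "pipenv run python test.py --sherlock-path test/ --input-dir test/",
    ]
    for step in steps:
        subprocess.run(step, shell=True, check=True)  # shell=True so the redirects work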
1 change: 1 addition & 0 deletions .gitignore
@@ -10,4 +10,5 @@ patterns.json
*.json
*.h5
*.png
+*.parquet
regex101/
7 changes: 6 additions & 1 deletion compile_db.py
@@ -1,10 +1,15 @@
+import argparse
import ast
import hyperscan
import json
import pickle
import sys


+parser = argparse.ArgumentParser()
+parser.add_argument("-o", "--output", default="hs.db")
+args = parser.parse_args()
+
sys.stderr.write("Collecting patterns...\n")
regexes = set()
for line in sys.stdin:
@@ -34,5 +39,5 @@
# Compile the final database and save to file
sys.stderr.write("Compiling %d patterns...\n" % len(patterns))
db.compile(expressions=patterns, ids=ids, flags=flags)
with open("hs.db", "wb") as f:
with open(args.output, "wb") as f:
pickle.dump([len(patterns), hyperscan.dumpb(db)], f)
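
The database is pickled together with its pattern count so downstream scripts can size their feature vectors without recompiling. A minimal sketch of that round trip, assuming the python-hyperscan dumpb/loadb API and a throwaway single-pattern database:

    # Sketch: serialize and reload a toy Hyperscan database the same way as above.
    import pickle
    import hyperscan

    db = hyperscan.Database()
    db.compile(expressions=[b"^[0-9]$"], ids=[0])
    with open("toy.db", "wb") as f:
        pickle.dump([1, hyperscan.dumpb(db)], f)

    with open("toy.db", "rb") as f:
        num_patterns, bdb = pickle.load(f)
    db = hyperscan.loadb(bdb)
    db.scratch = hyperscan.Scratch(db)  # scratch must be rebuilt after loadb (see preprocess.py)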
16 changes: 16 additions & 0 deletions gen_test_data.py
@@ -0,0 +1,16 @@
import pandas as pd


data = ["['a', 'b', 'c']", "['1', '2', '3']"]
df = pd.DataFrame(data, columns=['values'])
df.to_parquet('test/test_values.parquet', index=True)

df = pd.DataFrame(data * 100, columns=['values'])
df.to_parquet('test/train_values.parquet', index=True)

labels = ["alpha", "numeric"]
df = pd.DataFrame(labels, columns=['type'])
df.to_parquet('test/test_labels.parquet', index=True)

df = pd.DataFrame(labels * 100, columns=['type'])
df.to_parquet('test/train_labels.parquet', index=True)
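
Each value is a stringified Python list, and the training split is just the two test rows repeated 100 times, so a working model should separate them perfectly. A quick sanity check of the generated files, assuming ParquetFile here is pyarrow's (the same reader preprocess.py uses):

    # Sketch: read the generated test fixtures back (assumes pyarrow).
    from pyarrow.parquet import ParquetFile

    pq = ParquetFile("test/train_values.parquet")
    print(pq.metadata.num_rows)  # 200: two distinct values, each repeated 100 times
    table = pq.read(columns=["values"])
    print(table.column(0)[0])    # "['a', 'b', 'c']"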
9 changes: 6 additions & 3 deletions preprocess.py
@@ -23,13 +23,16 @@

parser = argparse.ArgumentParser()
parser.add_argument("dataset", choices=["train", "test"])
parser.add_argument("--database", default="hs.db")
parser.add_argument("--sherlock-path", default="../sherlock-project/data/data/raw")
parser.add_argument("--output-dir", default=".")
args = parser.parse_args()

output_file = f"preprocessed_{args.dataset}.txt"
output_file = os.path.join(args.output_dir, f"preprocessed_{args.dataset}.txt")

# Load the precompiled regular expression database
sys.stderr.write("Loading regexes from file…\n")
with open("hs.db", "rb") as f:
with open(args.database, "rb") as f:
[num_patterns, bdb] = pickle.load(f)
db = hyperscan.loadb(bdb)
# Scratch is not correctly initialized for deserialized databases
@@ -43,7 +46,7 @@ def on_match(match_id, from_idx, to_idx, flags, context):


# Load the values
pq_values = ParquetFile(f"../sherlock-project/data/data/raw/{args.dataset}_values.parquet")
pq_values = ParquetFile(os.path.join(args.sherlock_path, f"{args.dataset}_values.parquet"))

# Remove the output if it exists
if os.path.exists(output_file):
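
The part of this diff that is collapsed is where each value is scanned against the database. A hedged reconstruction of its general shape, reusing the on_match signature from the hunk header above; the per-pattern counting is an assumption, and db and num_patterns continue from the lines shown earlier in the file:

    # Hedged sketch of the collapsed scanning step (counting logic is assumed).
    counts = [0] * num_patterns

    def on_match(match_id, from_idx, to_idx, flags, context):
        counts[match_id] += 1  # tally one hit for the matching pattern

    db.scratch = hyperscan.Scratch(db)  # rebuild scratch for the deserialized database
    db.scan("['a', 'b', 'c']".encode(), match_event_handler=on_match)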
17 changes: 12 additions & 5 deletions test.py
@@ -1,3 +1,5 @@
+import argparse
+import os
import sys

import numpy as np
@@ -10,22 +12,27 @@

BATCH_SIZE = 1000

+parser = argparse.ArgumentParser()
+parser.add_argument("--sherlock-path", default="../sherlock-project/data/data/raw")
+parser.add_argument("--input-dir", default=".")
+args = parser.parse_args()
+
sys.stderr.write("Loading labels...\n")
pq_labels = ParquetFile("../sherlock-project/data/data/raw/test_labels.parquet")
pq_labels = ParquetFile(os.path.join(args.sherlock_path, "test_labels.parquet"))
labels = pd.DataFrame(
    {"type": pd.Categorical(pq_labels.read(columns=["type"]).columns[0].to_numpy())}
)
le = LabelEncoder()
le.classes_ = np.load("classes.npy", allow_pickle=True)
le.classes_ = np.load(os.path.join(args.input_dir, "classes.npy"), allow_pickle=True)
# labels = le.transform(labels.values.ravel())
num_examples = len(labels)

model = model_from_json(open("nn_model_sherlock.json", "r").read())
model.load_weights("nn_model_weights_sherlock.h5")
model = model_from_json(open(os.path.join(args.input_dir, "nn_model_sherlock.json"), "r").read())
model.load_weights(os.path.join(args.input_dir, "nn_model_weights_sherlock.h5"))

sys.stderr.write("Evaluating...\n")
labels_pred = [""] * len(labels)
preprocessed = open("preprocessed_test.txt", "r")
preprocessed = open(os.path.join(args.input_dir, "preprocessed_test.txt"), "r")
batch = 0
with tqdm(total=len(labels)) as pbar:
    while True:
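
The evaluation loop itself is collapsed in this diff; the grep for "weighted avg 1.00" in the workflow suggests it ends with scikit-learn's classification_report. A hedged sketch of one batch, continuing from the variables defined above (predict, argmax, and inverse_transform are assumptions):

    # Hedged sketch of one iteration of the collapsed evaluation loop.
    from sklearn.metrics import classification_report

    matrix = np.loadtxt(preprocessed, max_rows=BATCH_SIZE)
    pred = model.predict(matrix)
    start = batch * BATCH_SIZE
    labels_pred[start:start + len(matrix)] = le.inverse_transform(pred.argmax(axis=1))
    print(classification_report(labels["type"], labels_pred))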
1 change: 1 addition & 0 deletions test/.gitignore
@@ -0,0 +1 @@
!patterns.json
2 changes: 2 additions & 0 deletions test/patterns.json
@@ -0,0 +1,2 @@
"^[A-Za-z]$"
"^[0-9]$"
20 changes: 14 additions & 6 deletions train.py
@@ -1,3 +1,5 @@
+import argparse
+import os
import sys

import numpy as np
@@ -17,8 +19,14 @@

BATCH_SIZE = 1000

+parser = argparse.ArgumentParser()
+parser.add_argument("--sherlock-path", default="../sherlock-project/data/data/raw")
+parser.add_argument("--input-dir", default=".")
+parser.add_argument("--output-dir", default=".")
+args = parser.parse_args()
+
sys.stderr.write("Loading labels...\n")
pq_labels = ParquetFile("../sherlock-project/data/data/raw/train_labels.parquet")
pq_labels = ParquetFile(os.path.join(args.sherlock_path, "train_labels.parquet"))
labels = pd.DataFrame(
    {
        "type": pd.Categorical(
@@ -31,10 +39,10 @@
# Encode the labels as integers
le = LabelEncoder().fit(labels.values.ravel())
labels = le.transform(labels.values.ravel())
np.save("classes.npy", le.classes_)
np.save(os.path.join(args.output_dir, "classes.npy"), le.classes_)

# Load one row just to get the shape of the input
preprocessed = open("preprocessed_train.txt", "r")
preprocessed = open(os.path.join(args.input_dir, "preprocessed_train.txt"), "r")
matrix = np.loadtxt(preprocessed, max_rows=1)
regex_shape = matrix.shape[0]

@@ -78,9 +86,9 @@
loss="categorical_crossentropy",
metrics=["categorical_accuracy"],
)
open("nn_model_sherlock.json", "w").write(model.to_json())
open(os.path.join(args.output_dir, "nn_model_sherlock.json"), "w").write(model.to_json())

preprocessed = open("preprocessed_train.txt", "r")
preprocessed = open(os.path.join(args.input_dir, "preprocessed_train.txt"), "r")
i = 0
with tqdm(total=len(labels)) as pbar:
    while True:
@@ -105,4 +113,4 @@
        pbar.update(len(matrix))

# Save the trained model weights
model.save_weights("nn_model_weights_sherlock.h5")
model.save_weights(os.path.join(args.output_dir, "nn_model_weights_sherlock.h5"))
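
The streaming training loop is collapsed above; the shown lines imply it reads rows with np.loadtxt and advances i by the batch length. A hedged sketch of one pass, continuing from the variables defined in the file (train_on_batch and to_categorical are assumptions):

    # Hedged sketch of one pass of the collapsed streaming training loop.
    from tensorflow.keras.utils import to_categorical

    matrix = np.loadtxt(preprocessed, max_rows=BATCH_SIZE)
    batch_labels = labels[i:i + len(matrix)]
    model.train_on_batch(matrix, to_categorical(batch_labels, num_classes=len(le.classes_)))
    i += len(matrix)
    pbar.update(len(matrix))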
