Skip to content

Commit

Permalink
Merge pull request #12 from mlibrary/DWI-32-cronjob
Browse files Browse the repository at this point in the history
DWI-32 cronjob
  • Loading branch information
niquerio authored Oct 23, 2024
2 parents a305f2e + de530e4 commit 1c31614
Show file tree
Hide file tree
Showing 6 changed files with 475 additions and 5 deletions.
22 changes: 20 additions & 2 deletions .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name: Run Tests
on: push

jobs:
test:
test-python:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
Expand All @@ -26,4 +26,22 @@ jobs:
- name: Run tests
env:
CI: "true"
run: poetry run pytest
run: poetry run pytest

test-bash:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Setup Bats and bats libs
id: setup-bats
uses: bats-core/[email protected]
- name: Setup Shell Mock
run: wget -P /opt/ https://github.com/boschresearch/shellmock/releases/download/0.9.1/shellmock.bash
- run: ls /opt
- name: Run tests
shell: bash
env:
BATS_LIB_PATH: ${{ steps.setup-bats.outputs.lib-path }}
SHELLMOCK_PATH: /opt/shellmock.bash
TERM: xterm
run: bats ./bin/**/*_test.sh
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,6 @@ htmlcov/
.coverage
.gnupg
requirements.txt
docs/_build

docs/_build
bin/digifeeds/*.config
15 changes: 13 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,18 @@ RUN poetry export --without dev -f requirements.txt --output requirements.txt
# We want poetry on in development
FROM poetry AS development
RUN apt-get update -yqq && apt-get install -yqq --no-install-recommends \
git
git \
bats \
bats-assert \
bats-file\
wget\
zip\
unzip

RUN wget -P /opt/ https://github.com/boschresearch/shellmock/releases/download/0.9.1/shellmock.bash && \
chown ${UID}:${GID} /opt/shellmock.bash

ENV SHELLMOCK_PATH=/opt/shellmock.bash

# Switch to the non-root user "user"
USER app
Expand All @@ -84,4 +95,4 @@ COPY --chown=${UID}:${GID} --from=build "/app/requirements.txt" /app/requirement

RUN pip install -r /app/requirements.txt

USER app
USER app
3 changes: 3 additions & 0 deletions bin/digifeeds/upload_to_s3.config.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
input_directory="some/path/to/input/directory"
processed_directory="some/path/to/processed/directory"
digifeeds_bucket="rclone_remote_to_s3_bucket"
242 changes: 242 additions & 0 deletions bin/digifeeds/upload_to_s3.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
#! /bin/bash

###########
# CONSTANTS
###########

# Script start time in seconds since the epoch; handed to the push gateway
# (`pushgateway -b`) at the end of the run.
START_TIME=$(date '+%s')

# Directory this script lives in
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)

# Runtime environment; defaults to "production" when not provided.
APP_ENV=${APP_ENV:-"production"}

# Outside of the test environment the settings are read from a config file:
# the first script argument, or upload_to_s3.config next to this script.
if [[ $APP_ENV != "test" ]]; then
# CONFIG
# Variables contained in the config file:
#
# input_directory: path to the input directory
# processed_directory: path to the directory of processed files
# digifeeds_bucket: rclone remote for the digifeeds bucket
#
# timestamp: used for testing timestamps; should be omitted in production
# send_metrics: when "false" metrics don't get sent
# APP_ENV: when "test" the main script is not executed
CONFIG_FILE=${1:-$SCRIPT_DIR/upload_to_s3.config}
# shellcheck source=/dev/null
source "$CONFIG_FILE"
fi
# Required settings: `${var:?}` fails with an error message when the variable
# is unset or empty, so the script stops here if any of them is missing.
if ! input_directory=${input_directory:?}; then exit 1; fi
if ! processed_directory=${processed_directory:?}; then exit 1; fi
if ! digifeeds_bucket=${digifeeds_bucket:?}; then exit 1; fi
# Metrics are sent unless send_metrics is explicitly set to "false".
send_metrics=${send_metrics:-"true"}

# awk pattern selecting the files that belong in a barcode zip.
# Matches .tif and .jp2 files with 8 digit file names that start with 0 OR
# checksum.md5 files.
# Examples that match:
# 01234567.tif
# 01234567.jp2
# checksum.md5
IMGAWK='/^(0[0-9][0-9][0-9][0-9][0-9][0-9][0-9]\.(tif|jp2)|checksum\.md5)$/'

# Job name used for the push gateway; also the prefix of every metric name.
JOB_NAME="aim_digifeeds_upload_to_aws"

###########
# COUNTERS
###########
# Per-run counters; incremented by main and reported by print_metrics.
files_processed_total=0
image_order_errors_total=0
upload_errors_total=0
errors_total=0

###########
# FUNCTIONS
###########

# Writes an INFO line to stdout: "<RFC 3339 timestamp> - INFO: <message>".
# Arguments: the message; all arguments are joined into one string via $*.
log_info() {
  local stamp
  stamp=$(date --rfc-3339=seconds)
  printf '%s - INFO: %s\n' "$stamp" "$*"
}

# Writes an ERROR line to stdout: "<RFC 3339 timestamp> - ERROR: <message>".
# Arguments: the message; all arguments are joined into one string via $*.
log_error() {
  local stamp
  stamp=$(date --rfc-3339=seconds)
  printf '%s - ERROR: %s\n' "$stamp" "$*"
}
# Writes a DEBUG line to stdout, but only when the DEBUG variable is "true".
# Arguments: the message; all arguments are joined into one string via $*.
# Returns:   0 whether or not the message was printed. (An `[[ … ]] && echo`
#            one-liner would return 1 whenever debugging is off.)
log_debug() {
  if [[ ${DEBUG:-false} == "true" ]]; then
    echo "$(date --rfc-3339=seconds) - DEBUG: $*"
  fi
}

# Lists the names of the entries directly inside a directory, one per line
# (roughly `ls -A`: hidden entries included, order not guaranteed).
# Arguments: $1 - directory to list
# Outputs:   entry names relative to $1 on stdout
list_files() {
  local path=$1
  # -mindepth 1 skips the directory itself, which would otherwise be printed
  # as an empty line because %P strips the starting point from the path.
  find "$path" -mindepth 1 -maxdepth 1 -printf '%P\n'
}

# Gets the last count of a metric for this job from the push gateway.
# Globals:   JOB_NAME (read)
# Arguments: $1 - full metric name
# Outputs:   whatever `pushgateway_advanced -q` prints for that metric
#            (presumably the last pushed value; confirm against the tool)
last_count() {
  local metric=$1
  pushgateway_advanced -j "$JOB_NAME" -q "${metric}"
}

# Checks that image file names form the unbroken sequence 00000001,
# 00000002, ... once sorted.
# Arguments: image file names, either as separate arguments or as one
#            newline-separated string; only the first 8 characters of each
#            name are inspected.
# Returns:   0 when the sorted names number 1..N without gaps; 1 otherwise,
#            including an empty list or a name that does not start with
#            8 digits.
verify_image_order() {
  # Sort the names; mapfile splits on newlines, so a single multi-line
  # argument is handled the same way as separate arguments.
  local -a sorted
  mapfile -t sorted < <(printf '%s\n' "$@" | sort)

  local expected=0
  local name prefix
  for name in "${sorted[@]}"; do
    expected=$((expected + 1))
    prefix=${name:0:8}
    # Reject anything that is not exactly 8 digits before doing arithmetic:
    # an empty or non-numeric prefix would raise an arithmetic error
    # instead of a clean failure.
    [[ $prefix =~ ^[0-9]{8}$ ]] || return 1
    # 10# forces base 10 so leading zeros are not read as octal.
    ((10#$prefix == expected)) || return 1
  done
  return 0
}

# Creates <barcode_path>.zip containing the image and checksum files of a
# barcode directory (the entries selected by IMGAWK), stored without any
# directory prefix.
# Globals:   IMGAWK (read)
# Arguments: $1 - path to the barcode directory, absolute or relative to the
#                 current working directory
# Returns:   0 on success; 1 when the directory cannot be entered; otherwise
#            the exit status of the xargs/zip invocation
zip_it() {
  local barcode_path=$1

  # Resolve the archive location before changing directory; a relative path
  # would otherwise be interpreted relative to the barcode directory itself.
  local zip_path="${barcode_path}.zip"
  if [[ $zip_path != /* ]]; then
    zip_path="${PWD}/${zip_path}"
  fi

  # Run inside a subshell so the caller's working directory is untouched.
  # The subshell's status is that of the last pipeline stage (xargs).
  (
    cd "$barcode_path" || exit 1
    list_files . | awk "$IMGAWK" | xargs zip -rq "$zip_path"
  )
}

# Confirms that <barcode_path>.zip lists exactly the files that IMGAWK selects
# from the barcode directory.
# Globals:   IMGAWK (read)
# Arguments: $1 - path to the barcode directory (the archive is "$1".zip)
# Returns:   0 when the sorted file lists are identical, 1 otherwise
verify_zip() {
  local barcode_path=$1
  local expected_names
  local archived_names

  # Names that should be in the archive, taken from the directory itself.
  expected_names=$(list_files "$barcode_path" | awk "$IMGAWK" | sort) || return 1

  # Names actually stored in the archive.
  archived_names=$(zipinfo -1 "$barcode_path".zip | sort) || return 1

  [[ "$expected_names" == "$archived_names" ]]
}

# Prints the four job counters in the text format expected by the push
# gateway (HELP, TYPE and value lines per metric). Each reported value is the
# count last stored for the job (via last_count) plus this run's count.
# Globals:   JOB_NAME (read)
# Arguments: $1 - files processed in this run
#            $2 - image order errors in this run
#            $3 - upload errors in this run
#            $4 - all errors in this run
# Outputs:   metrics text on stdout
print_metrics() {
  local -a current=("$1" "$2" "$3" "$4")
  local -a names=(
    "${JOB_NAME}_files_processed_total"
    "${JOB_NAME}_image_order_errors_total"
    "${JOB_NAME}_upload_errors_total"
    "${JOB_NAME}_errors_total"
  )
  local -a helps=(
    "Count of digifeeds zip files sent to S3"
    "Count of folders where there are missing pages of images"
    "Count of errors when uploading digifeeds zip files to S3"
    "Count of all errors relating ot uploading digifeeds files sent to S3"
  )
  local -a totals=()
  local idx previous

  # Work out every total before printing anything, so the output is only
  # produced once all lookups and sums have been done.
  for idx in "${!names[@]}"; do
    previous=$(last_count "${names[idx]}")
    totals[idx]=$((previous + current[idx]))
  done

  for idx in "${!names[@]}"; do
    printf '# HELP %s %s\n' "${names[idx]}" "${helps[idx]}"
    printf '# TYPE %s counter\n' "${names[idx]}"
    printf '%s %s\n' "${names[idx]}" "${totals[idx]}"
  done
}

# Processes every barcode directory found directly under input_directory:
# verifies the image numbering, zips the directory, verifies the zip, copies
# it to the S3 remote with rclone, checks the copy, and finally moves both the
# zip and the directory into processed_directory with a timestamp prefix.
# A failure at any step is logged and counted, and the barcode is skipped.
# Globals:   input_directory, processed_directory, digifeeds_bucket,
#            timestamp (read); TIMESTAMP, files_processed_total,
#            image_order_errors_total, upload_errors_total, errors_total
#            (written)
# Outputs:   log lines on stdout, ending with a summary of the counters
main() {
  TIMESTAMP=${timestamp:-$(date +%F_%H-%M-%S)} #YYYY-MM-DD_hh-mm-ss

  #This is so that the script works on empty directories.
  shopt -s nullglob

  #8 digits, ends in .tif or .jp2
  local filter_regex='[[:digit:]]{8}\.tif$|[[:digit:]]{8}\.jp2$'
  local barcode_path
  local barcode
  local image_list

  for barcode_path in "${input_directory}"/*/; do
    barcode=$(basename "${barcode_path%%/}")

    log_info "Copying $barcode"

    log_debug "Verifying image order $barcode"
    image_list=$(list_files "$barcode_path" | grep -E "$filter_regex")
    if ! verify_image_order "$image_list"; then
      log_error "Image order incorrect for $barcode"
      image_order_errors_total=$((image_order_errors_total + 1))
      errors_total=$((errors_total + 1))
      continue
    fi

    log_debug "Zipping $barcode"
    if ! zip_it "$input_directory"/"$barcode"; then
      log_error "Failed to zip $barcode"
      errors_total=$((errors_total + 1))
      continue
    fi

    log_debug "Verifying zip of $barcode"
    if ! verify_zip "$input_directory"/"$barcode"; then
      log_error "$barcode.zip does not contain the correct files"
      errors_total=$((errors_total + 1))
      continue
    fi

    log_debug "Sending $barcode to S3"
    if ! rclone copy "$input_directory"/"$barcode".zip "$digifeeds_bucket":; then
      log_error "Failed to copy $barcode"
      upload_errors_total=$((upload_errors_total + 1))
      errors_total=$((errors_total + 1))
      continue
    fi

    log_debug "Verifying barcode in S3"
    if ! rclone check "$input_directory"/"$barcode".zip "$digifeeds_bucket":; then
      log_error "$barcode not found in S3"
      upload_errors_total=$((upload_errors_total + 1))
      errors_total=$((errors_total + 1))
      continue
    fi

    log_info "Moving $barcode to processed"
    # A failed move must not be reported as a processed file: the barcode is
    # still sitting in the input directory and will be picked up again.
    if ! mv "$input_directory"/"$barcode".zip "$processed_directory"/"${TIMESTAMP}"_"${barcode}".zip; then
      log_error "Failed to move $barcode.zip to processed"
      errors_total=$((errors_total + 1))
      continue
    fi
    if ! mv "$input_directory"/"$barcode" "$processed_directory"/"${TIMESTAMP}"_"${barcode}"; then
      log_error "Failed to move $barcode to processed"
      errors_total=$((errors_total + 1))
      continue
    fi
    files_processed_total=$((files_processed_total + 1))
  done

  log_info "Total files processed: $files_processed_total"
  log_info "Total errors image order: $image_order_errors_total "
  log_info "Total errors uploading to S3: $upload_errors_total"
  log_info "Total errors: $errors_total"
}

# Entry point: run the upload and report metrics. Skipped entirely when
# APP_ENV is "test", so the functions above can be loaded without side effects.
if [[ $APP_ENV != "test" ]]; then

log_info "=====Start $(date)====="

main

# Metrics are sent unless send_metrics is exactly "false".
if [ "$send_metrics" != "false" ]; then
# Pipe the metrics text for this run's counters into pushgateway_advanced.
print_metrics $files_processed_total $image_order_errors_total $upload_errors_total $errors_total | /usr/local/bin/pushgateway_advanced -j $JOB_NAME
# NOTE(review): presumably reports the job run itself, using the start time
# captured at the top of the script - confirm against the pushgateway tool.
/usr/local/bin/pushgateway -j $JOB_NAME -b "$START_TIME"
fi
log_info "=====End $(date)====="
fi
Loading

0 comments on commit 1c31614

Please sign in to comment.