# Scrape #1232
# NOTE: GitHub's web viewer flagged this file as containing bidirectional
# Unicode text that may be interpreted or compiled differently than it appears.
# To review, open the file in an editor that reveals hidden Unicode characters
# (see GitHub's documentation on bidirectional Unicode characters).
# Scrapes the official ANU timetable once a day (or on manual dispatch), one
# session at a time, and commits the refreshed JSON to the repository whenever
# it differs from the copy already stored under public/timetable_data/.
name: Scrape

on:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *"  # daily

jobs:
  scrape:
    runs-on: ubuntu-latest
    name: Official timetable
    env:
      TIMETABLE_PREFIX: https://timetabling.anu.edu.au/sws
    strategy:
      fail-fast: false  # continue scraping even if a session fails
      # conservative ratelimit to avoid serverside crashes
      # (since the script doesn't have backoff/retry)
      max-parallel: 1
      matrix:
        year: [2024]
        # each entry is "<session number> <session code>"
        session: ['1 S1', '2 S2', '3 X1', '4 X2', '5 X3', '6 X4']
    steps:
      - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871  # v4.2.1

      # NOTE(review): the exact release tag was unreadable in the reviewed copy
      # of this file; the v5 major tag is used here. Re-pin to a specific
      # release or commit SHA if stricter pinning is wanted.
      - uses: actions/setup-python@v5
        with:
          python-version: "3.8"

      - name: Install pipenv
        run: python -m pip install pipenv

      - name: Install dependencies
        working-directory: scraper
        run: python -m pipenv install --system

      - name: Run scraper script
        working-directory: scraper
        env:
          TIMETABLE_PREFIX: ${{ env.TIMETABLE_PREFIX }}
          YEAR: ${{ matrix.year }}
          SESSION: ${{ matrix.session }}
        # eg http://timetabling.anu.edu.au/sws2022/ 1 S1
        # $SESSION is intentionally left unquoted so that "1 S1" is split into
        # two separate arguments, as in the example above.
        run: python3 ./scraper.py "${TIMETABLE_PREFIX}${YEAR}/" $SESSION

      # cmp exits non-zero when the files differ or the stored file is missing;
      # continue-on-error keeps the job going so the next step can react to the
      # recorded 'failure' outcome.
      - name: Compare existing and new data
        id: compare
        continue-on-error: true
        working-directory: scraper
        env:
          YEAR: ${{ matrix.year }}
          SESSION: ${{ matrix.session }}
        # inline bash string manipulation to get string after first space
        # gets the session code (S1), eg public/timetable_data/2022/S1.json
        run: cmp timetable.json "../public/timetable_data/${YEAR}/${SESSION#* }.json"

      - name: Commit to repo if data has changed
        if: ${{ steps.compare.outcome == 'failure' }}
        env:
          YEAR: ${{ matrix.year }}
          SESSION: ${{ matrix.session }}
          GH_USER: pl4nty
          GH_PAT: ${{ secrets.GH_PAT }}
        # inline bash string manipulation to get string after first space
        # gets the session code (S1), eg public/timetable_data/2022/S1.json
        run: |
          git config user.name github-actions
          # quoted so the shell never treats [bot] as a glob pattern
          git config user.email '41898282+github-actions[bot]@users.noreply.github.com'
          git config pull.rebase true
          git remote set-url origin "https://${GH_USER}:${GH_PAT}@github.com/${GITHUB_REPOSITORY}.git"
          git pull origin
          cd scraper
          # nothing to commit if the scraper produced no output file
          test -e ./timetable.json || exit 0
          mv -f ./timetable.json "../public/timetable_data/${YEAR}/${SESSION#* }.json"
          mv -f ./timetable.min.json "../public/timetable_data/${YEAR}/${SESSION#* }.min.json"
          # refresh the "Course data updated at ..." timestamp shown by the app
          sed -Ei 's/^( +Course data updated at )[0-9]{1,2}:[0-9]{2} [ap]m, [0-9]{1,2} [A-Za-z]{3} [0-9]+ \([A-Z]+\)(\. \{)/\1'"$(date +'%-I:%M %P, %-d %b %Y (%Z)')"'\2/' ../src/App.js
          git add --all
          git commit -m "chore(data): add new timetable data"
          git push origin