# Scrape a release with typesense (#22)

# Manually triggered workflow. It copies a documentation snapshot from this
# repository into a local nginx container, runs the Typesense docsearch
# scraper, and then points the Typesense alias named after the chosen version
# at the collection the scraper created.
name: Scrape a release with typesense
on:
  workflow_dispatch:
    inputs:
      relative_path:
        type: string
        default: "snapshots/master/docs/slint"
        description: relative path to site root to scrape (like snapshots/master/docs/slint)
      version:
        type: string
        default: "master"
        description: Slint version (used as typesense index name, for example "master")
jobs:
  scrape:
    runs-on: ubuntu-latest
    steps:
      - name: Repository Checkout
        uses: actions/checkout@v4
      # Start a detached nginx container named "nginx", published on host port 80.
      - name: Run
        uses: tj-actions/docker-run@v2
        id: docker-run
        with:
          name: nginx
          image: nginx:latest
          options: "-d -p 80:80"
      - name: Populate web server
        env:
          # Passed through the environment instead of being interpolated into
          # the script text, so the input cannot inject shell syntax.
          RELATIVE_PATH: ${{ inputs.relative_path }}
        run: |
          # Replace nginx's default web root with the selected snapshot.
          docker exec nginx rm -rf /usr/share/nginx/html
          docker cp "$RELATIVE_PATH" nginx:/usr/share/nginx/html
      - name: test web server
        run: |
          # --fail turns an HTTP error (e.g. 404 for a wrong relative_path)
          # into a failed step instead of a saved error page.
          curl --fail http://localhost:80/index.html > test.html
          cat test.html
      # NOTE(review): no later step in this workflow visibly reads from ./slint;
      # confirm whether this checkout is still required.
      - name: Clone slint directory
        uses: actions/checkout@v4
        with:
          repository: slint-ui/slint
          ref: master
          path: slint
          persist-credentials: false
      - name: Prepare config
        env:
          VERSION: ${{ inputs.version }}
        run: |
          # Substitute the literal placeholder $TYPESENSE_INDEX_NAME in the
          # scraper config with the requested version.
          sed -i "s/\$TYPESENSE_INDEX_NAME/${VERSION}/g" config/typesense-scraper-config.json
      - name: run scraper
        env:
          TYPESENSE_API_KEY: ${{ secrets.TYPESENSE_API_KEY }}
          TYPESENSE_HOST: "062ykax5pgwon3q7p-1.a1.typesense.net"
          VERSION: ${{ inputs.version }}
        run: |
          # Without pipefail the exit status of `docker run` is hidden by `tee`
          # and a failed scrape would still go on to repoint the alias.
          set -o pipefail

          # `-e NAME` with no value forwards NAME from this step's environment,
          # keeping the API key off the command line.
          # presumably the config reaches the nginx container through
          # host.docker.internal - confirm in the scraper config.
          docker run -i \
            --add-host=host.docker.internal:host-gateway \
            -e TYPESENSE_API_KEY \
            -e TYPESENSE_HOST \
            -e TYPESENSE_PORT="443" \
            -e TYPESENSE_PROTOCOL="https" \
            -e CONFIG="$(jq -r tostring config/typesense-scraper-config.json)" \
            typesense/docsearch-scraper:0.10.0 2>&1 | tee temp_scraper_output.txt

          # Retrieve the collection name: first "<version>_<digits>" token in
          # the scraper output.
          collection_name=$(grep -o -m 1 "${VERSION}"'_[0-9]\+' temp_scraper_output.txt)

          # Retrieve documents from typesense server
          curl --fail \
            -H "X-TYPESENSE-API-KEY: ${TYPESENSE_API_KEY}" \
            "https://${TYPESENSE_HOST}/collections/${collection_name}/documents/export" > temp_docs.jsonl

          # Update documents in typesense server
          curl --fail \
            -H "X-TYPESENSE-API-KEY: ${TYPESENSE_API_KEY}" \
            -X POST \
            -T temp_docs.jsonl \
            "https://${TYPESENSE_HOST}/collections/${collection_name}/documents/import?action=update"

          # Set alias to the collection. The alias is the index name, i.e. the
          # requested version (the previous "$index" variable was never set,
          # so the request went to an empty alias name).
          curl --fail "https://${TYPESENSE_HOST}/aliases/${VERSION}" -X PUT \
            -H "Content-Type: application/json" \
            -H "X-TYPESENSE-API-KEY: ${TYPESENSE_API_KEY}" \
            -d "{\"collection_name\": \"${collection_name}\"}"