NOTE(review): reconstructed from a whitespace-flattened patch. Blank context
lines of the start-gleaner.sh hunk could not be recovered, so its @@ range is
recomputed for the lines shown, and the index hashes are carried over from the
original. Regenerate with `git diff` before applying.

diff --git a/configs/iow/pids-geoconnex-dev-gleanerconfig.yaml b/configs/iow/pids-geoconnex-dev-gleanerconfig.yaml
new file mode 100644
index 00000000..6cde1bff
--- /dev/null
+++ b/configs/iow/pids-geoconnex-dev-gleanerconfig.yaml
@@ -0,0 +1,68 @@
+minio:
+  address: localhost
+  port: 9000
+  accessKey: amazingaccesskey # NOTE(review): accessKey/secretKey look like placeholders - keep real credentials out of VCS
+  secretKey: amazingsecretkey
+  ssl: false
+  bucket: iow
+gleaner:
+  runid: iow # this will be the bucket the output is placed in...
+  summon: true # do we want to visit the web sites and pull down the files
+  mill: false
+context:
+  cache: true
+contextmaps:
+- prefix: "https://schema.org/"
+  file: "configs/jsonldcontext.jsonld" # wget http://schema.org/docs/jsonldcontext.jsonld
+- prefix: "http://schema.org/"
+  file: "configs/jsonldcontext.jsonld" # wget http://schema.org/docs/jsonldcontext.jsonld
+summoner:
+  after: "" # "21 May 20 10:00 UTC"
+  mode: full # full || diff: If diff compare what we have currently in gleaner to sitemap, get only new, delete missing
+  threads: 5
+  delay: null # milliseconds (1000 = 1 second) to delay between calls (will FORCE threads to 1)
+  headless: http://localhost:9222 # URL for headless see docs/headless
+millers:
+  graph: true
+sources:
+- active: 'true'
+  domain: https://pids.geoconnex.dev
+  headless: 'false'
+  name: refgages0
+  pid: https://gleaner.io/genid/geoconnex
+  propername: refgages0
+  sourcetype: sitemap
+  url: https://pids.geoconnex.dev/sitemap/ref/gages/gages__0.xml
+- active: 'true'
+  domain: https://pids.geoconnex.dev
+  headless: 'false'
+  name: refmainstems
+  pid: https://gleaner.io/genid/geoconnex
+  propername: refmainstems
+  sourcetype: sitemap
+  url: https://pids.geoconnex.dev/sitemap/ref/mainstems/mainstems__0.xml
+- active: 'true'
+  domain: https://pids.geoconnex.dev
+  headless: 'false'
+  name: dams0
+  pid: https://gleaner.io/genid/geoconnex
+  propername: dams0
+  sourcetype: sitemap
+  url: https://pids.geoconnex.dev/sitemap/ref/dams/dams__0.xml
+- active: 'true'
+  domain: https://pids.geoconnex.dev
+  headless: 'false'
+  name: cdss0
+  pid: https://gleaner.io/genid/geoconnex
+  propername: cdss0
+  sourcetype: sitemap
+  url: https://pids.geoconnex.dev/sitemap/cdss/co_gages__0.xml
+- active: 'true'
+  domain: https://pids.geoconnex.dev
+  headless: 'false'
+  name: nmwdist0
+  pid: https://gleaner.io/genid/geoconnex
+  propername: nmwdist0
+  sourcetype: sitemap
+  url: https://pids.geoconnex.dev/sitemap/nmwdi/st/nmwdi-st__0.xml
+
diff --git a/scripts/iow/start-gleaner.sh b/scripts/iow/start-gleaner.sh
index 69d3bcfc..1df45e07 100755
--- a/scripts/iow/start-gleaner.sh
+++ b/scripts/iow/start-gleaner.sh
@@ -3,13 +3,10 @@
-TS=`date +%Y-%m-%dT%H.%M.%S`
-LOGDIR="$HOME/logs/$TS"
-mkdir -p $LOGDIR || exit 1
-cd $LOGDIR || exit 1
-for src in `cat ~/conf/gleanerconfig.yaml | grep '\Wname:'|awk '{print $2}'`
+# The config is read twice: once here to list the source names, once by gleaner.
+# Both uses share one variable so the two paths cannot drift apart. The path is
+# relative, so run this script from the repository root.
+CFG="configs/iow/pids-geoconnex-dev-gleanerconfig.yaml"
+for src in `grep '\Wname:' "$CFG" | awk '{print $2}'`
 do
-OUTFILE="$LOGDIR/gleaner-$src.out"
-ERRFILE="$LOGDIR/gleaner-$src.err"
 echo "harvesting source '$src'..."
-#strace -f -o $LOGDIR/strace-$src.out gleaner -cfg $HOME/conf/gleanerconfig.yaml -source $src -rude > $OUTFILE 2>$ERRFILE
-gleaner -log debug -cfg $HOME/conf/gleanerconfig.yaml -source $src -rude > $OUTFILE 2>$ERRFILE
+gleaner -log debug -cfg "$CFG" -source "$src" -rude
 done
 echo "complete!"