#!/bin/sh
# Command-line client for the documentcloud.org documents API.
# Called without arguments: print the rate-limit notes and usage, then stop.
if test $# -eq 0; then
  cat <<EOF

10 requests per second
500 requests per day without acct
tokens last 2 weeks
usage: ${0##*/} =querystring -- search
usage: ${0##*/} resume < 1.json >> 1.json -- continuation search
usage: ${0##*/} 12345678 -- info
usage: ${0##*/} 12345678 pdf -- download pdf
usage: ${0##*/} -h -- search/api help
example: ${0##*/} =created_at%3a%5bNOW-1DAY+TO+*%5d -- all docs created in last 24h

EOF
  exit
fi

# Base endpoint that every request below is built on.
x=https://api.www.documentcloud.org/api/documents
# Scratch variable for URLs extracted from responses; starts out empty.
y=
# Search, resume and help modes. The helpers below are shared by the search
# and resume arms of the case statement at the end of this section.

# next_url: filter stdin; for every line containing "next":" print what is
# left after removing that key and the first double quote that follows it.
next_url() {
  sed -n '/\"next\":\"/{s///;s/\"//p;}'
}

# fetch_page URL: fetch URL with ftp, piping the body through the yy059
# command into the tmux buffer "dcl", then write that buffer to stdout.
fetch_page() {
  ftp -4o\|yy059 "$1" | tmux loadb -b dcl /dev/stdin
  tmux saveb -b dcl /dev/stdout
}

# follow_pages: keep fetching the "next" URL found in the buffered page.
# Leaves the script through exit as soon as the extracted value is shorter
# than 140 characters (presumably that means "no continuation URL"; the
# threshold is kept exactly as it was).
follow_pages() {
  while true; do
    y=$(tmux saveb -b dcl /dev/stdout | next_url)
    test ${#y} -ge 140 || exit
    fetch_page "$y"
  done
}

# search_new =QUERY: start a search. The argument already carries the
# leading "=", so it is appended directly after "?q".
search_new() {
  fetch_page "$x/search/?q$1&expand=user%2Corganization&version=2.0&hl=true&per_page=100"
  follow_pages
}

# search_resume: continue an earlier search. Reads previously saved output
# on stdin, takes the last "next" URL in it and carries on from that page.
# If stdin holds no usable continuation URL there is nothing to fetch, so
# the script exits instead of requesting an empty URL.
search_resume() {
  y=$(grep '"next":' | tail -1 | next_url)
  test ${#y} -ge 140 || exit
  fetch_page "$y"
  follow_pages
}

# Any first argument not matched here falls through to the code after esac.
case $1 in
=*)
  search_new "$1"
  ;;
resume)
  search_resume
  ;;
-h)
  exec echo '
https://api.www.documentcloud.org/pages/help/search/
https://api.www.documentcloud.org/pages/help/api/
https://solr.apache.org/guide/solr/latest/query-guide/common-query-parameters.html
'
  ;;
esac
# Single-document mode, reached when $1 matched none of the case arms:
#   $1  document id
#   $2  optional file extension (e.g. pdf); when present the file is saved
#       as "$1.$2", otherwise the document's JSON is printed.
# The URL is quoted because it contains "?", which the shell would otherwise
# treat as a glob character.
ftp -4o\|yy059 "$x/$1/?expand=user%2Corganization%2Cnotes%2Csections%2Cnotes.organization%2Cnotes.user" |
  if test $# -gt 1; then
    # Build the download URL from the canonical_url line: drop the key and
    # all double quotes, replace the first "www" with "s3", turn the first
    # "-" into "/" and append ".$2".
    y=$(sed -n -e "/\"canonical_url\":/{ s///;s/\"//g;s/www/s3/; s/-/\//;s/$/.$2/p;}")
    # $y stays unquoted as before, so every URL printed by sed is passed to
    # ftp as a separate argument.
    ftp -4o "$1.$2" $y
  else
    # Pass the response through unchanged; sed without a script only prints
    # a usage error.
    cat
  fi
# Stop here explicitly: an exit placed inside a pipeline element may only
# leave a subshell, and nothing below this line is meant to be executed.
exit
# can sign up for acct with unverified email addr
# could use curl, python, etc. to send token instead of haproxy:
# http-request add-header cookie "csrftoken=...; sessionid=..." if { hdr(host) api.www.documentcloud.org }
# could use curl, etc. instead of ftp (tnftp)
# yy059 is a filter that prints JSON left-justified according to personal preference
# could use jq, etc. instead of yy059
# could use a temp file instead of tmux buffers
# verdict: documentcloud.org is a welcome alternative to proprietary algo search engines and opinionated news, not to mention ads and tracking
# interesting fact: the site actually offers a free "scraping service" if you sign up (not for me; don't like python)