#!/bin/bash
#
# Fetch archived U.S. State Department documents from the Wayback Machine.
#
# A lot of the content is unclassified, though there is material that could be
# interesting from a journalistic point of view. Many of the files disappeared
# from state.gov after the Snowden leaks. In any case, all of it has been in
# the public domain for some years already.
#
# How it works:
#   1. Download a copy of the state.gov robots.txt into ./robots.txt.
#   2. Take the second space-separated field of every line as a document path.
#   3. For each snapshot timestamp, try to download every path that does not
#      yet exist in the current directory (checked by its base name, which is
#      the name wget saves it under).
#
# Usage: run from the directory where the documents should be saved.
# Requires: bash, wget, cut, tr, basename.

# Deliberately no `set -e`: an individual document download is expected to
# fail whenever a snapshot does not hold that file, and must not end the run.
set -u

# Wayback Machine snapshot timestamps (YYYYMMDDhhmmss), tried in this order.
readonly SNAPSHOTS=(
  20120713050942
  20121013154343
  20121010165822
  20120921054221
  20130413152313
  20130113162428
)

# Original source is http://state.gov/robots.txt; the pastebin copy is used
# in case they delete it. Quoted because `?` is a glob character.
readonly ROBOTS_URL='http://pastebin.com/raw.php?i=RE2tpyR3'
readonly ROBOTS_FILE='robots.txt'
readonly ARCHIVE_PREFIX='https://web.archive.org/web'
readonly DOCUMENTS_URL='http://www.state.gov/documents'

#######################################
# Print an error message to stderr and exit with status 1.
# Arguments: $* - message text
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Print the candidate document paths found in a robots.txt file.
# Arguments: $1 - path to the robots.txt file
# Outputs:   one entry per line: field 2 of each input line (the whole line
#            when it contains no space), with CR (octal 15) and SUB/Ctrl-Z
#            (octal 32) characters removed.
#######################################
list_entries() {
  cut -d ' ' -f2 <"$1" | tr -d '\15\32'
}

#######################################
# Download the robots.txt copy, then fetch every listed document from the
# first snapshot (in SNAPSHOTS order) that yields it.
# Globals:   SNAPSHOTS, ROBOTS_URL, ROBOTS_FILE, ARCHIVE_PREFIX,
#            DOCUMENTS_URL (all read)
# Outputs:   "<entry> already fetched" on stdout for skipped documents;
#            downloaded files are written to the current directory.
#######################################
main() {
  local -a entries=()
  local snapshot entry saved_as

  # wget -O leaves an empty file behind on failure, which would make the
  # loops below silently do nothing; stop with a clear message instead.
  wget --output-document="$ROBOTS_FILE" "$ROBOTS_URL" \
    || die "error: could not download ${ROBOTS_URL}"

  # Parse the list once, up front, instead of once per snapshot. Reading line
  # by line keeps entries literal (no word splitting or pathname expansion).
  while IFS= read -r entry || [[ -n "$entry" ]]; do
    # Blank fields carry no path. Entries containing `*` are robots.txt
    # patterns (e.g. the `*` of "User-agent: *"), not downloadable documents.
    [[ -n "$entry" && "$entry" != *'*'* ]] || continue
    entries+=("$entry")
  done < <(list_entries "$ROBOTS_FILE")

  ((${#entries[@]} > 0)) || die "error: no document paths found in ${ROBOTS_FILE}"

  for snapshot in "${SNAPSHOTS[@]}"; do
    for entry in "${entries[@]}"; do
      # wget stores the download under the last path component of the URL.
      saved_as=$(basename -- "$entry")
      if [[ -e "$saved_as" ]]; then
        printf '%s already fetched\n' "$entry"
      else
        # A failure here is tolerated: the file simply stays missing and is
        # retried against the next snapshot on the following pass.
        wget "${ARCHIVE_PREFIX}/${snapshot}/${DOCUMENTS_URL}/${entry}"
      fi
    done
  done
}

main "$@"