DOC: Different approach to checking links validity

Consider replacing with something standard.
This commit is contained in:
Kernc
2019-01-02 10:40:18 +01:00
parent 3ac0d1d20d
commit 4b222e94e9

View File

@@ -65,18 +65,30 @@ echo
 echo 'Testing for broken links'
 echo
 pushd "$BUILDROOT" >/dev/null
-tmpdir="$(mktemp -d)"
-python3 -m http.server 51296 & sleep 1
-trap '{ rm -r "$tmpdir"; kill %1; wait; } >/dev/null 2>&1' EXIT
-[ ! "$(jobs -p)" ] && die 'Server not running. See above.'
-find -name '*.html' -print0 |
-sed --null-data 's/^/http:\/\/127.0.0.1:51296\//' |
-xargs -0 -- \
-wget --user-agent "Mozilla/5.0 Firefox 61" -e 'robots=off' --random-wait \
---no-verbose --recursive --span-hosts --level=1 --tries=2 \
---directory-prefix "$tmpdir" --no-clobber \
---reject-regex='\bfonts\b|\.css\b|\bjs\b|\.png\b' |&
-grep -B1 'ERROR 404'
+WEBSITE='https://kernc\.github\.io/backtesting\.py'
+grep -PR '<a .*?href=' |
+sed -E "s/:.*?<a .*?href=([\"'])(.*?)/\t\2/g" |
+tr "\"'" '#' |
+cut -d'#' -f1 |
+sort -u -t$'\t' -k 2 |
+sort -u |
+python -c '
+import sys
+from urllib.parse import urljoin
+for line in sys.stdin.readlines():
+    base, url = line.split("\t")
+    print(base, urljoin(base, url.strip()), sep="\t")
+' |
+while read -r line; do
+while IFS=$'\t' read -r file url; do
+[ -f "$url" ] ||
+curl --silent --fail --retry 2 --user-agent 'Mozilla/5.0 Firefox 61' "$url" >/dev/null 2>&1 ||
+die "broken link in $file: $url"
+done
+done
+sed "s,$WEBSITE/doc/,," |
+grep -Pv "$WEBSITE"'/?$' |
+grep -v $'\t''$'
 popd >/dev/null