zips/links_and_dests.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
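
"""Check the links (and their #fragment targets) in rendered HTML/XHTML and PDF files.

Usage: ./links_and_dests.py [--check] [--print-dests] <file.pdf|file.html|file.xhtml> ...

Links between the given files are checked against the destinations (anchors or
named destinations) that those files define. With --check, external https:
links are also fetched, and any fragment is verified against the fetched
document's destinations.
"""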
from urllib.request import build_opener, HTTPCookieProcessor, HTTPSHandler, Request
from urllib.error import URLError, HTTPError
from os.path import relpath
from collections import deque
import sys
from time import sleep
import ssl
from io import BytesIO
try:
    from bs4 import BeautifulSoup
    import html5lib
    import certifi
except ImportError:
    print("Please install the BeautifulSoup, html5lib, and certifi libraries using `pip install bs4 html5lib certifi`.\n")
    raise

if [int(v) for v in certifi.__version__.split('.')] < [2021, 5, 30]:
    print("Please upgrade certifi using `pip install --upgrade certifi`.\n")
    sys.exit(1)

def get_links_and_destinations_from_pdf(f):
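    """Return (links, dests) for a PDF file object: the set of URI targets of
    link annotations, and the names of the PDF's named destinations."""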
    try:
        from PyPDF2 import PdfFileReader
    except ImportError:
        print("Please install the PyPDF2 library using `pip install PyPDF2`.\n")
        raise

    # Based on <https://stackoverflow.com/a/5978161/393146>.
    # Note: this uses the legacy PyPDF2 1.x API (PdfFileReader, getNumPages, etc.),
    # which was removed in PyPDF2 3.x.
    pdf = PdfFileReader(f)
    links = set()
    for pg in range(pdf.getNumPages()):
        obj = pdf.getPage(pg).getObject()
        for annotation in obj.get('/Annots', []):
            uri = annotation.getObject().get('/A', {}).get('/URI', None)
            if uri is not None:
                links.add(uri)

    dests = pdf.getNamedDestinations().keys()
    return (links, dests)

def get_links_and_destinations_from_html(f):
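    """Return (links, dests) for an HTML file object: the external hrefs (plus
    any internal #links not satisfied by a destination in the same document),
    and the anchors defined by name= or id= attributes."""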
    links = set()
    internal = set()
    dests = set()
    soup = BeautifulSoup(f.read(), "html5lib")
    for link in soup.find_all('a'):
        if link.has_attr('href'):
            url = link['href']
            (internal if url.startswith('#') else links).add(url)
        if link.has_attr('name'):
            dests.add(link['name'])

    for link in soup.find_all(id=True):
        dests.add(link['id'])
        # GitHub's rendering of .mediawiki files puts 'id="user-content-<ANCHOR>"' in the source
        # and dynamically creates a corresponding link #<ANCHOR>, so count the unprefixed
        # <ANCHOR> as a destination too.
        if link['id'].startswith("user-content-"):
            dests.add(link['id'][len("user-content-"):])

    internal.difference_update(['#' + d for d in dests])  # ignore internal links satisfied by a dest
    links.update(internal)
    return (links, dests)

def main(args):
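    """Collect the links and destinations defined by each file on the command
    line, then report (and, with --check, fetch and verify) every link."""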
    if len(args) < 2:
        print("Usage: ./links_and_dests.py [--check] [--print-dests] <file.pdf|file.html|file.xhtml> ...")
        return 1

    check = '--check' in args[1:]
    print_dests = '--print-dests' in args[1:]
    paths = [arg for arg in args[1:] if not arg.startswith('--')]

    all_links = {}  # url -> paths of the given files that link to it
    all_dests = {}  # url -> destinations defined by the document at that url
    errors = deque()

    print("Reading files...")
    for path in paths:
        print(path, end=" ")
        sys.stdout.flush()
        with open(path, 'rb') as f:
            if path.endswith(".html") or path.endswith(".xhtml"):
                (links, dests) = get_links_and_destinations_from_html(f)
            elif path.endswith(".pdf"):
                (links, dests) = get_links_and_destinations_from_pdf(f)
            else:
                errors.append("Unrecognized file type: " + path)
                continue

        path = relpath(path)
        for l in links:
            refs = all_links.get(l, None)
            if refs is None:
                all_links[l] = refs = deque()
            refs.append(path)

        all_dests["https://zips.z.cash/" + path] = dests
        if path.endswith(".html"):
            # Links may also refer to an .html page with the extension stripped.
            all_dests["https://zips.z.cash/" + path[:-5]] = dests

    print("\n")
print("Links:")
last_url = None
content = None
content_type = None
dests = None
    for (l, p) in sorted(all_links.items()):
        print(l, end=" ")
        sys.stdout.flush()
        what = "%s (occurs in %s)" % (l, " and ".join(p)) if len(paths) > 1 else l
        status = ""
        if ":" not in l:
            l = "https://zips.z.cash/" + l

        if l.startswith("mailto:"):
            status = "(not checked)"
        elif l.startswith("https:") or l.startswith("HTTP:"):  # use uppercase HTTP: for links with no https: equivalent
            (url, _, fragment) = l.partition("#")
            if url in all_dests:
                if fragment and fragment not in all_dests[url]:
                    errors.append("Missing link target: " + what)
                    status = "✗"
                else:
                    status = "✓"
            elif check:
                # If url == last_url, there is no need to refetch content. This is an optimization when
                # checking URLs with the same site but different fragments (which will be sorted together).
                if url != last_url:
                    headers = {"User-Agent": "Mozilla/5.0"}
                    https_handler = HTTPSHandler(context=ssl.create_default_context(cafile=certifi.where()))
                    # Some DOI links (i.e. to https://doi.org/) redirect to link.springer.com
                    # in a way that requires cookies (booo!). We allow this for DOI links,
                    # but for all other links we simulate a client that never sets cookies.
                    if l.startswith("https://doi.org/"):
                        opener = build_opener(HTTPCookieProcessor(), https_handler)
                    else:
                        opener = build_opener(https_handler)

                    for retry in range(2):
                        try:
                            response = opener.open(Request(url=l, headers=headers))
                            content_type = response.info().get_content_type()
                            content = response.read()
                            last_url = url
                            dests = None  # invalidate the dests cached for the previous URL
                        except URLError as e:
                            # On the first 429 response, honour Retry-After (default 60s) and retry once.
                            if retry == 0 and isinstance(e, HTTPError) and e.code == 429:
                                try:
                                    delay = int(e.headers['Retry-After'], 10) + 1
                                except Exception:
                                    delay = 60
                                print("(waiting %ds due to rate limiting)" % (delay,), end=" ")
                                sys.stdout.flush()
                                sleep(delay)
                                continue
                            errors.append("Could not open link: %s due to %r" % (what, e))
                            status = "✗"
                            content_type = None
                            content = None
                            last_url = None
                            dests = None
                        break
                if content is not None:
                    if fragment:
                        if dests is None:
                            if content_type in ('text/html', 'application/xhtml+xml'):
                                (_, dests) = get_links_and_destinations_from_html(BytesIO(content))
                            elif content_type == 'application/pdf':
                                (_, dests) = get_links_and_destinations_from_pdf(BytesIO(content))

                        if dests is None:
                            print("(link target not checked)", end=" ")
                            status = "✓"
                        elif fragment not in dests:
                            errors.append("Missing link target: " + what)
                            status = "✗"
                        else:
                            status = "✓"
                    else:
                        status = "✓"
        else:
            errors.append("Insecure or unrecognized protocol in link: " + what)
            status = "✗"

        print(status)
    if print_dests:
        for (url, dests) in all_dests.items():
            if url + ".html" not in all_dests:  # avoid printing both a .html URL and its extensionless alias
                print("\nDestinations for %s:" % (url,))
                for d in dests:
                    print(d)

    if errors:
        print("\nErrors:")
        for e in errors:
            print(e)
        return 1

    return 0

if __name__ == '__main__':
    sys.exit(main(sys.argv))
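
# Example invocation (file names are illustrative):
#   ./links_and_dests.py --check --print-dests rendered/zip-0032.html rendered/protocol.pdf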