diff --git a/protocol/links_and_dests.py b/protocol/links_and_dests.py
index a6242929..fa113a74 100755
--- a/protocol/links_and_dests.py
+++ b/protocol/links_and_dests.py
@@ -1,19 +1,34 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-try:
-    from PyPDF2 import PdfFileReader
-except ImportError:
-    print("Please install the PyPDF2 library using `pip3 install PyPDF2`.\n")
-    raise
-
-from urllib.request import build_opener, HTTPCookieProcessor, Request
-from urllib.error import URLError
-from os.path import basename
+from urllib.request import build_opener, HTTPCookieProcessor, HTTPSHandler, Request
+from urllib.error import URLError, HTTPError
+from os.path import relpath
 from collections import deque
 import sys
+from time import sleep
+import ssl
+from io import BytesIO
+
+try:
+    from bs4 import BeautifulSoup
+    import html5lib
+    import certifi
+except ImportError:
+    print("Please install the BeautifulSoup, html5lib, and certifi libraries using `pip install bs4 html5lib certifi`.\n")
+    raise
+
+if [int(v) for v in certifi.__version__.split('.')] < [2021, 5, 30]:
+    print("Please upgrade certifi using `pip install --upgrade certifi`.\n")
+    sys.exit(1)
+
+def get_links_and_destinations_from_pdf(f):
+    try:
+        from PyPDF2 import PdfFileReader
+    except ImportError:
+        print("Please install the PyPDF2 library using `pip install PyPDF2`.\n")
+        raise
 
 
-def get_links_and_destinations(f):
     # Based on
     pdf = PdfFileReader(f)
@@ -26,14 +41,36 @@ def get_links_and_destinations(f):
 
             if uri is not None and uri not in links:
                 links.add(uri)
 
-    dests = pdf.getNamedDestinations()
+    dests = pdf.getNamedDestinations().keys()
     return (links, dests)
 
 
+def get_links_and_destinations_from_html(f):
+    links = set()
+    internal = set()
+    dests = set()
+
+    soup = BeautifulSoup(f.read(), "html5lib")
+    for link in soup.find_all('a'):
+        if link.has_attr('href'):
+            url = link['href']
+            (internal if url.startswith('#') else links).add(url)
+
+        if link.has_attr('name'):
+            dests.add(link['name'])
+
+    for link in soup.find_all(id=True):
+        dests.add(link['id'])
+
+    internal.difference_update(['#' + d for d in dests]) # ignore internal links satisfied by a dest
+    links.update(internal)
+    return (links, dests)
+
+
 def main(args):
     if len(args) < 2:
-        print("Usage: ./links_and_dests.py [--check] [--print-dests] <pdf file>...")
+        print("Usage: ./links_and_dests.py [--check] [--print-dests] <file>...")
         return 1
 
     check = '--check' in args[1:]
@@ -43,32 +80,55 @@ def main(args):
     all_links = {} # url -> pdf_paths
     all_dests = {} # url -> dests
 
-    for pdf_path in paths:
-        with open(pdf_path, 'rb') as f:
-            (links, dests) = get_links_and_destinations(f)
+    errors = deque()
+    print("Reading files...")
+    for path in paths:
+        print(path, end=" ")
+        sys.stdout.flush()
+
+        with open(path, 'rb') as f:
+            if path.endswith(".html") or path.endswith(".xhtml"):
+                (links, dests) = get_links_and_destinations_from_html(f)
+            elif path.endswith(".pdf"):
+                (links, dests) = get_links_and_destinations_from_pdf(f)
+            else:
+                errors.append("Unrecognized file type: " + path)
+                continue
+
+        path = relpath(path)
 
         for l in links:
            refs = all_links.get(l, None)
            if refs is None:
                all_links[l] = refs = deque()
-           refs.append(pdf_path)
+           refs.append(path)
 
-        all_dests["https://zips.z.cash/protocol/" + basename(pdf_path)] = dests
-
-    errors = deque()
+        all_dests["https://zips.z.cash/" + path] = dests
+        if path.endswith(".html"):
+            all_dests["https://zips.z.cash/" + path[:-5]] = dests
 
+    print("\n")
     print("Links:")
+
+    last_url = None
+    content = None
+    content_type = None
+    dests = None
+
     for (l, p) in sorted(all_links.items()):
         print(l, end=" ")
         sys.stdout.flush()
 
         what = "%s (occurs in %s)" % (l, " and ".join(p)) if len(paths) > 1 else l
         status = ""
-        if not l.startswith("https:"):
-            errors.append("Insecure or unrecognized protocol in link: " + what)
-            status = "❌"
-        else:
+        if ":" not in l:
+            l = "https://zips.z.cash/" + l
+
+        if l.startswith("mailto:"):
+            status = "(not checked)"
+        elif l.startswith("https:") or l.startswith("HTTP:"): # use uppercase HTTP: for links with no https: equivalent
             (url, _, fragment) = l.partition("#")
+
             if url in all_dests:
                 if fragment and fragment not in all_dests[url]:
                     errors.append("Missing link target: " + what)
@@ -76,29 +136,77 @@
                 else:
                     status = "✓"
             elif check:
-                try:
+                # If url == last_url, there is no need to refetch content. This is an optimization when
+                # checking URLs with the same site but different fragments (which will be sorted together).
+                if url != last_url:
                     headers = {"User-Agent": "Mozilla/5.0"}
+                    https_handler = HTTPSHandler(context=ssl.create_default_context(cafile=certifi.where()))
+
                     # Some DOI links (i.e. to https://doi.org/) redirect to link.springer.com
                     # in a way that requires cookies (booo!). We allow this for DOI links,
                     # but for all other links we simulate a client that never sets cookies.
                     if l.startswith("https://doi.org/"):
-                        opener = build_opener(HTTPCookieProcessor())
+                        opener = build_opener(HTTPCookieProcessor(), https_handler)
                     else:
-                        opener = build_opener()
-                    response = opener.open(Request(url=l, headers=headers))
-                    response.read()
-                    status = "✓"
-                except URLError as e:
-                    errors.append("Could not open link: %s due to %r" % (what, e))
-                    status = "❌"
+                        opener = build_opener(https_handler)
+
+                    for retry in range(2):
+                        try:
+                            response = opener.open(Request(url=l, headers=headers))
+                            content_type = response.info().get_content_type()
+                            content = response.read()
+                            last_url = url
+                        except URLError as e:
+                            if retry == 0 and isinstance(e, HTTPError) and e.code == 429:
+                                try:
+                                    delay = int(e.headers['Retry-After'], 10) + 1
+                                except Exception:
+                                    delay = 60
+
+                                print("(waiting %ds due to rate limiting)" % (delay,), end=" ")
+                                sys.stdout.flush()
+                                sleep(delay)
+                                continue
+
+                            errors.append("Could not open link: %s due to %r" % (what, e))
+                            status = "❌"
+                            content_type = None
+                            content = None
+                            last_url = None
+
+                        dests = None
+                        break
+
+                if content is not None:
+                    if fragment:
+                        if dests is None:
+                            if content_type == 'text/html':
+                                (_, dests) = get_links_and_destinations_from_html(BytesIO(content))
+                            elif content_type == 'application/pdf':
+                                (_, dests) = get_links_and_destinations_from_pdf(BytesIO(content))
+
+                        if dests is None:
+                            print("(link target not checked)", end=" ")
+                            status = "✓"
+                        elif fragment not in dests:
+                            errors.append("Missing link target: " + what)
+                            status = "❌"
+                        else:
+                            status = "✓"
+                    else:
+                        status = "✓"
+        else:
+            errors.append("Insecure or unrecognized protocol in link: " + what)
+            status = "❌"
 
         print(status)
 
     if print_dests:
-        for dests in all_dests:
-            print("\nDestinations for %s:" % (dests,))
-            for d in dests:
-                print(d)
+        for (path, dests) in all_dests.items():
+            if path + ".html" not in all_dests: # avoid duplication
+                print("\nDestinations for %s:" % (path,))
+                for d in dests:
+                    print(d)
 
     if errors:
         print("\nErrors:")
@@ -107,5 +215,6 @@
 
     return 0
 
+
 if __name__ == '__main__':
     sys.exit(main(sys.argv))
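
Reviewer note (not part of the patch): a minimal, self-contained sketch of what the
new get_links_and_destinations_from_html() returns, with its logic inlined so it can
be run without the script. The sample HTML is hypothetical; it only illustrates the
patch's semantics, in which fragment-only hrefs that are satisfied by a destination
on the same page are dropped, and the rest are reported as links to check.

    from io import BytesIO
    from bs4 import BeautifulSoup

    def get_links_and_destinations_from_html(f):
        # Inlined copy of the patched function's logic, for demonstration only.
        links = set()
        internal = set()
        dests = set()

        soup = BeautifulSoup(f.read(), "html5lib")
        for link in soup.find_all('a'):
            if link.has_attr('href'):
                url = link['href']
                (internal if url.startswith('#') else links).add(url)

            if link.has_attr('name'):
                dests.add(link['name'])

        for link in soup.find_all(id=True):
            dests.add(link['id'])

        internal.difference_update(['#' + d for d in dests])
        links.update(internal)
        return (links, dests)

    html = b'''<html><body>
      <h1 id="intro">Intro</h1>
      <a name="legacy-anchor"></a>
      <a href="#intro">internal link satisfied by a dest</a>
      <a href="#missing">dangling internal link</a>
      <a href="https://zips.z.cash/protocol/protocol.pdf">external link</a>
    </body></html>'''

    (links, dests) = get_links_and_destinations_from_html(BytesIO(html))
    assert dests == {'intro', 'legacy-anchor'}
    assert links == {'#missing', 'https://zips.z.cash/protocol/protocol.pdf'}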
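
Reviewer note (not part of the patch): how the new all_dests bookkeeping resolves a
fragment link against a local file. The file name zip-0000.html and the destination
ids are hypothetical. Each file is registered under https://zips.z.cash/ plus its
relative path, and .html files are additionally registered under the extensionless
URL (path[:-5]), presumably because the site serves both forms; a link's fragment is
then checked against the dests recorded for its URL.

    all_dests = {}
    dests = {'abstract', 'motivation'}   # destination ids found in the (hypothetical) file
    path = 'zip-0000.html'

    all_dests["https://zips.z.cash/" + path] = dests
    if path.endswith(".html"):
        all_dests["https://zips.z.cash/" + path[:-5]] = dests

    # A link to the extensionless form with a fragment resolves without any fetch.
    (url, _, fragment) = "https://zips.z.cash/zip-0000#abstract".partition("#")
    assert url in all_dests and fragment in all_dests[url]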
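
Reviewer note (not part of the patch): the retry loop's Retry-After handling in
isolation. A 429 response is retried once, sleeping for the server-suggested delay
plus one second, or 60 seconds when the header is missing or unparseable; the helper
name and header values below are illustrative only.

    def retry_delay(retry_after_header):
        # Mirrors the patch: integer seconds + 1, falling back to 60 on any parse failure.
        try:
            return int(retry_after_header, 10) + 1
        except Exception:
            return 60

    assert retry_delay('30') == 31
    assert retry_delay(None) == 60              # missing header
    assert retry_delay('Wed, 21 Oct') == 60     # HTTP-date form is not parsed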