#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from urllib.request import build_opener, HTTPCookieProcessor, HTTPSHandler, Request
from urllib.error import URLError, HTTPError
from os.path import relpath
from collections import deque
import sys
from time import sleep
import ssl
from io import BytesIO

try:
    from bs4 import BeautifulSoup
    import html5lib
    import certifi
except ImportError:
    print("Please install the BeautifulSoup, html5lib, and certifi libraries using `pip install bs4 html5lib certifi`.\n")
    raise

if [int(v) for v in certifi.__version__.split('.')] < [2021, 5, 30]:
    print("Please upgrade certifi using `pip install --upgrade certifi`.\n")
    sys.exit(1)


def get_links_and_destinations_from_pdf(f):
    try:
        from PyPDF2 import PdfFileReader
    except ImportError:
        print("Please install the PyPDF2 library using `pip install PyPDF2`.\n")
        raise

    # Based on
    pdf = PdfFileReader(f)
    links = set()
    for pg in range(pdf.getNumPages()):
        obj = pdf.getPage(pg).getObject()
        for annotation in obj.get('/Annots', []):
            uri = annotation.getObject().get('/A', {}).get('/URI', None)
            if uri is not None and uri not in links:
                links.add(uri)

    dests = pdf.getNamedDestinations().keys()
    return (links, dests)


def get_links_and_destinations_from_html(f):
    links = set()
    internal = set()
    dests = set()
    soup = BeautifulSoup(f.read(), "html5lib")
    for link in soup.find_all('a'):
        if link.has_attr('href'):
            url = link['href']
            (internal if url.startswith('#') else links).add(url)
        if link.has_attr('name'):
            dests.add(link['name'])

    for link in soup.find_all(id=True):
        dests.add(link['id'])
        # GitHub's rendering of .mediawiki files puts 'id="user-content-<name>"' in the source
        # and dynamically creates a corresponding link #<name>.
        if link['id'].startswith("user-content-"):
            dests.add(link['id'][13:])

    internal.difference_update(['#' + d for d in dests])  # ignore internal links satisfied by a dest
    links.update(internal)
    return (links, dests)


def main(args):
    if len(args) < 2:
        print("Usage: ./links_and_dests.py [--check] [--print-dests] <file.html|file.pdf>...")
        return 1

    check = '--check' in args[1:]
    print_dests = '--print-dests' in args[1:]
    paths = [arg for arg in args[1:] if not arg.startswith('--')]

    all_links = {}  # url -> paths that link to it
    all_dests = {}  # url -> dests
    errors = deque()

    print("Reading files...")
    for path in paths:
        print(path, end=" ")
        sys.stdout.flush()
        with open(path, 'rb') as f:
            if path.endswith(".html") or path.endswith(".xhtml"):
                (links, dests) = get_links_and_destinations_from_html(f)
            elif path.endswith(".pdf"):
                (links, dests) = get_links_and_destinations_from_pdf(f)
            else:
                errors.append("Unrecognized file type: " + path)
                continue

        path = relpath(path)
        for l in links:
            refs = all_links.get(l, None)
            if refs is None:
                all_links[l] = refs = deque()
            refs.append(path)

        all_dests["https://zips.z.cash/" + path] = dests
        if path.endswith(".html"):
            all_dests["https://zips.z.cash/" + path[:-5]] = dests

    print("\n")
    print("Links:")

    last_url = None
    content = None
    content_type = None
    dests = None
    for (l, p) in sorted(all_links.items()):
        print(l, end=" ")
        sys.stdout.flush()
        what = "%s (occurs in %s)" % (l, " and ".join(p)) if len(paths) > 1 else l
        status = ""
        if ":" not in l:
            l = "https://zips.z.cash/" + l

        if l.startswith("mailto:"):
            status = "(not checked)"
        elif l.startswith("https:") or l.startswith("HTTP:"):  # use uppercase HTTP: for links with no https: equivalent
            (url, _, fragment) = l.partition("#")
            if url in all_dests:
                if fragment and fragment not in all_dests[url]:
                    errors.append("Missing link target: " + what)
                    status = "❌"
                else:
                    status = "✓"
            elif check:
                # If url == last_url, there is no need to refetch content. This is an optimization when
                # checking URLs with the same site but different fragments (which will be sorted together).
                if url != last_url:
                    headers = {"User-Agent": "Mozilla/5.0"}
                    https_handler = HTTPSHandler(context=ssl.create_default_context(cafile=certifi.where()))
                    # Some DOI links (i.e. to https://doi.org/) redirect to link.springer.com
                    # in a way that requires cookies (booo!). We allow this for DOI links,
                    # but for all other links we simulate a client that never sets cookies.
                    if l.startswith("https://doi.org/"):
                        opener = build_opener(HTTPCookieProcessor(), https_handler)
                    else:
                        opener = build_opener(https_handler)

                    # Reset the cached destinations; they are recomputed from the newly fetched content.
                    dests = None
                    for retry in range(2):
                        try:
                            response = opener.open(Request(url=l, headers=headers))
                            content_type = response.info().get_content_type()
                            content = response.read()
                            last_url = url
                        except URLError as e:
                            if retry == 0 and isinstance(e, HTTPError) and e.code == 429:
                                try:
                                    delay = int(e.headers['Retry-After'], 10) + 1
                                except Exception:
                                    delay = 60
                                print("(waiting %ds due to rate limiting)" % (delay,), end=" ")
                                sys.stdout.flush()
                                sleep(delay)
                                continue

                            errors.append("Could not open link: %s due to %r" % (what, e))
                            status = "❌"
                            content_type = None
                            content = None
                            last_url = None
                            dests = None
                        break

                if content is not None:
                    if fragment:
                        if dests is None:
                            if content_type in ('text/html', 'application/xhtml+xml'):
                                (_, dests) = get_links_and_destinations_from_html(BytesIO(content))
                            elif content_type == 'application/pdf':
                                (_, dests) = get_links_and_destinations_from_pdf(BytesIO(content))

                        if dests is None:
                            print("(link target not checked)", end=" ")
                            status = "✓"
                        elif fragment not in dests:
                            errors.append("Missing link target: " + what)
                            status = "❌"
                        else:
                            status = "✓"
                    else:
                        status = "✓"
        else:
            errors.append("Insecure or unrecognized protocol in link: " + what)
            status = "❌"

        print(status)

    if print_dests:
        for (path, dests) in all_dests.items():
            if path + ".html" not in all_dests:  # avoid duplication
                print("\nDestinations for %s:" % (path,))
                for d in dests:
                    print(d)

    if errors:
        print("\nErrors:")
        for e in errors:
            print(e)

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
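
# A minimal usage sketch (the file names below are placeholders, not taken from this script):
#
#     ./links_and_dests.py --check --print-dests rendered/zip-0000.html rendered/protocol.pdf
#
# With --check, each external https: link is fetched and any "#fragment" is verified against
# the destinations found in the fetched HTML or PDF; with --print-dests, the destinations
# defined by each input file are listed after the link report.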