Update links_and_dests.py to support HTML files and rate limiting (part 1).

Signed-off-by: Daira Hopwood <daira@jacaranda.org>

Author: Daira Hopwood
Date:   2021-09-09 14:57:51 +01:00
Parent: dcb4c4e89a
Commit: 4af8a9684d

1 changed file with 145 additions and 36 deletions
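
The updated usage line (see the diff below) accepts HTML and XHTML files in addition to PDFs. An example invocation, with hypothetical file names, would be:

    ./links_and_dests.py --check protocol/protocol.pdf rendered/zip-0316.html

With --check, an HTTP 429 response is now retried once after roughly the delay given by the server's Retry-After header, falling back to 60 seconds if that header is missing or unparseable.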


@@ -1,19 +1,34 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-try:
-    from PyPDF2 import PdfFileReader
-except ImportError:
-    print("Please install the PyPDF2 library using `pip3 install PyPDF2`.\n")
-    raise
-
-from urllib.request import build_opener, HTTPCookieProcessor, Request
-from urllib.error import URLError
-from os.path import basename
+from urllib.request import build_opener, HTTPCookieProcessor, HTTPSHandler, Request
+from urllib.error import URLError, HTTPError
+from os.path import relpath
 from collections import deque
 import sys
+from time import sleep
+import ssl
+from io import BytesIO
+
+try:
+    from bs4 import BeautifulSoup
+    import html5lib
+    import certifi
+except ImportError:
+    print("Please install the BeautifulSoup, html5lib, and certifi libraries using `pip install bs4 html5lib certifi`.\n")
+    raise
+
+if [int(v) for v in certifi.__version__.split('.')] < [2021, 5, 30]:
+    print("Please upgrade certifi using `pip install --upgrade certifi`.\n")
+    sys.exit(1)
+
 
-def get_links_and_destinations(f):
+def get_links_and_destinations_from_pdf(f):
+    try:
+        from PyPDF2 import PdfFileReader
+    except ImportError:
+        print("Please install the PyPDF2 library using `pip install PyPDF2`.\n")
+        raise
+
     # Based on <https://stackoverflow.com/a/5978161/393146>
     pdf = PdfFileReader(f)
@@ -26,14 +41,36 @@ def get_links_and_destinations(f):
             if uri is not None and uri not in links:
                 links.add(uri)
 
-    dests = pdf.getNamedDestinations()
+    dests = pdf.getNamedDestinations().keys()
     return (links, dests)
 
+
+def get_links_and_destinations_from_html(f):
+    links = set()
+    internal = set()
+    dests = set()
+    soup = BeautifulSoup(f.read(), "html5lib")
+    for link in soup.find_all('a'):
+        if link.has_attr('href'):
+            url = link['href']
+            (internal if url.startswith('#') else links).add(url)
+        if link.has_attr('name'):
+            dests.add(link['name'])
+
+    for link in soup.find_all(id=True):
+        dests.add(link['id'])
+
+    internal.difference_update(['#' + d for d in dests]) # ignore internal links satisfied by a dest
+    links.update(internal)
+    return (links, dests)
+
 
 def main(args):
     if len(args) < 2:
-        print("Usage: ./links_and_dests.py [--check] [--print-dests] <file.pdf>")
+        print("Usage: ./links_and_dests.py [--check] [--print-dests] <file.pdf|html|xhtml>")
         return 1
 
     check = '--check' in args[1:]
@@ -43,32 +80,55 @@ def main(args):
     all_links = {} # url -> pdf_paths
     all_dests = {} # url -> dests
+    errors = deque()
 
-    for pdf_path in paths:
-        with open(pdf_path, 'rb') as f:
-            (links, dests) = get_links_and_destinations(f)
+    print("Reading files...")
+    for path in paths:
+        print(path, end=" ")
+        sys.stdout.flush()
+        with open(path, 'rb') as f:
+            if path.endswith(".html") or path.endswith(".xhtml"):
+                (links, dests) = get_links_and_destinations_from_html(f)
+            elif path.endswith(".pdf"):
+                (links, dests) = get_links_and_destinations_from_pdf(f)
+            else:
+                errors.append("Unrecognized file type: " + path)
+                continue
 
+        path = relpath(path)
         for l in links:
             refs = all_links.get(l, None)
             if refs is None:
                 all_links[l] = refs = deque()
-            refs.append(pdf_path)
+            refs.append(path)
 
-        all_dests["https://zips.z.cash/protocol/" + basename(pdf_path)] = dests
-
-    errors = deque()
+        all_dests["https://zips.z.cash/" + path] = dests
+        if path.endswith(".html"):
+            all_dests["https://zips.z.cash/" + path[:-5]] = dests
 
+    print("\n")
     print("Links:")
+    last_url = None
+    content = None
+    content_type = None
+    dests = None
     for (l, p) in sorted(all_links.items()):
         print(l, end=" ")
         sys.stdout.flush()
         what = "%s (occurs in %s)" % (l, " and ".join(p)) if len(paths) > 1 else l
         status = ""
-        if not l.startswith("https:"):
-            errors.append("Insecure or unrecognized protocol in link: " + what)
-            status = "✗"
-        else:
+        if ":" not in l:
+            l = "https://zips.z.cash/" + l
+
+        if l.startswith("mailto:"):
+            status = "(not checked)"
+        elif l.startswith("https:") or l.startswith("HTTP:"): # use uppercase HTTP: for links with no https: equivalent
             (url, _, fragment) = l.partition("#")
             if url in all_dests:
                 if fragment and fragment not in all_dests[url]:
                     errors.append("Missing link target: " + what)
@@ -76,29 +136,77 @@ def main(args):
                 else:
                     status = "✓"
             elif check:
-                try:
-                    headers = {"User-Agent": "Mozilla/5.0"}
-                    # Some DOI links (i.e. to https://doi.org/) redirect to link.springer.com
-                    # in a way that requires cookies (booo!). We allow this for DOI links,
-                    # but for all other links we simulate a client that never sets cookies.
-                    if l.startswith("https://doi.org/"):
-                        opener = build_opener(HTTPCookieProcessor())
-                    else:
-                        opener = build_opener()
-                    response = opener.open(Request(url=l, headers=headers))
-                    response.read()
-                    status = "✓"
-                except URLError as e:
-                    errors.append("Could not open link: %s due to %r" % (what, e))
-                    status = "✗"
+                # If url == last_url, there is no need to refetch content. This is an optimization when
+                # checking URLs with the same site but different fragments (which will be sorted together).
+                if url != last_url:
+                    headers = {"User-Agent": "Mozilla/5.0"}
+                    https_handler = HTTPSHandler(context=ssl.create_default_context(cafile=certifi.where()))
+                    # Some DOI links (i.e. to https://doi.org/) redirect to link.springer.com
+                    # in a way that requires cookies (booo!). We allow this for DOI links,
+                    # but for all other links we simulate a client that never sets cookies.
+                    if l.startswith("https://doi.org/"):
+                        opener = build_opener(HTTPCookieProcessor(), https_handler)
+                    else:
+                        opener = build_opener(https_handler)
+
+                    for retry in range(2):
+                        try:
+                            response = opener.open(Request(url=l, headers=headers))
+                            content_type = response.info().get_content_type()
+                            content = response.read()
+                            last_url = url
+                        except URLError as e:
+                            if retry == 0 and isinstance(e, HTTPError) and e.code == 429:
+                                try:
+                                    delay = int(e.headers['Retry-After'], 10) + 1
+                                except Exception:
+                                    delay = 60
+                                print("(waiting %ds due to rate limiting)" % (delay,), end=" ")
+                                sys.stdout.flush()
+                                sleep(delay)
+                                continue
+
+                            errors.append("Could not open link: %s due to %r" % (what, e))
+                            status = "✗"
+                            content_type = None
+                            content = None
+                            last_url = None
+                            dests = None
+                        break
+
+                if content is not None:
+                    if fragment:
+                        if dests is None:
+                            if content_type == 'text/html':
+                                (_, dests) = get_links_and_destinations_from_html(BytesIO(content))
+                            elif content_type == 'application/pdf':
+                                (_, dests) = get_links_and_destinations_from_pdf(BytesIO(content))
+
+                        if dests is None:
+                            print("(link target not checked)", end=" ")
+                            status = "✓"
+                        elif fragment not in dests:
+                            errors.append("Missing link target: " + what)
+                            status = "✗"
+                        else:
+                            status = "✓"
+                    else:
+                        status = "✓"
+        else:
+            errors.append("Insecure or unrecognized protocol in link: " + what)
+            status = "✗"
 
         print(status)
 
     if print_dests:
-        for dests in all_dests:
-            print("\nDestinations for %s:" % (dests,))
-            for d in dests:
-                print(d)
+        for (path, dests) in all_dests.items():
+            if path + ".html" not in all_dests: # avoid duplication
+                print("\nDestinations for %s:" % (path,))
+                for d in dests:
+                    print(d)
 
     if errors:
         print("\nErrors:")
@@ -107,5 +215,6 @@ def main(args):
 
     return 0
 
+
 if __name__ == '__main__':
     sys.exit(main(sys.argv))