zips/links_and_dests.py

245 lines
8.9 KiB
Python
Executable File

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from urllib.request import build_opener, HTTPCookieProcessor, HTTPSHandler, Request
from urllib.error import URLError, HTTPError
from os.path import relpath
from collections import deque
import sys
from time import sleep
import ssl
from io import BytesIO, StringIO
import json
import re
# Third-party requirements. html5lib is only referred to by name (as the parser
# passed to BeautifulSoup), so importing it here surfaces a missing install
# up front together with the other two packages.
try:
    from bs4 import BeautifulSoup
    import html5lib
    import certifi
except ImportError:
    print("Please install the BeautifulSoup, html5lib, and certifi libraries using `pip install bs4 html5lib certifi`.\n")
    raise

# Refuse to run with a certifi release older than 2021.5.30; the dotted
# version string is compared component-wise as integers.
if list(map(int, certifi.__version__.split('.'))) < [2021, 5, 30]:
    print("Please upgrade certifi using `pip install --upgrade certifi`.\n")
    sys.exit(1)

# Matched against the *start* of a URL fragment to recognise GitHub
# line-number anchors such as "L42".
GITHUB_LINE_FRAGMENT = re.compile('L[0-9]+')
def get_links_and_destinations_from_pdf(f):
    """Collect the link URIs and named destinations of a PDF document.

    Args:
        f: Binary file-like object containing the PDF.

    Returns:
        A `(links, dests)` pair. `links` is a set holding the `/URI` value of
        the `/A` (action) dictionary of every page annotation that has one;
        `dests` is the key view of the document's named destinations.

    Raises:
        ImportError: If PyPDF2 is not installed (an install hint is printed
            before the exception propagates).
    """
    try:
        from PyPDF2 import PdfFileReader
    except ImportError:
        print("Please install the PyPDF2 library using `pip install PyPDF2`.\n")
        raise

    # Based on <https://stackoverflow.com/a/5978161/393146>
    pdf = PdfFileReader(f)

    links = set()
    for page_number in range(pdf.getNumPages()):
        page = pdf.getPage(page_number).getObject()

        # Use subscripting rather than dict.get(): PyPDF2 dictionaries resolve
        # indirect references in __getitem__, while the inherited get() hands
        # back the unresolved reference, which can neither be iterated nor
        # queried with a further .get().
        if '/Annots' not in page:
            continue
        for annotation in page['/Annots']:
            # Array elements are not resolved automatically.
            annotation = annotation.getObject()
            if '/A' not in annotation:
                continue
            action = annotation['/A']
            if '/URI' in action:
                # set.add() already ignores duplicates.
                links.add(action['/URI'])

    dests = pdf.getNamedDestinations().keys()
    return (links, dests)
def _embedded_rich_text(soup):
    """Return the HTML stored at payload.blob.richText in embedded JSON, or None.

    Looks for <script type="application/json" data-target="react-app.embeddedData">
    elements. Scripts that are empty, are not valid JSON, or lack a string at
    that path are skipped instead of aborting the whole run.
    """
    for script in soup.find_all('script'):
        if script.get('data-target') != "react-app.embeddedData":
            continue
        if script.string is None:
            # Empty element, or one with several child nodes: nothing to parse.
            continue
        try:
            node = json.loads(script.string)
        except ValueError:  # json.JSONDecodeError is a subclass of ValueError
            continue
        # Walk the path defensively: a key may be absent *or* explicitly null,
        # and `.get(key, {})` only protects against the former.
        for key in ('payload', 'blob', 'richText'):
            node = node.get(key) if isinstance(node, dict) else None
        if isinstance(node, str):
            return node
    return None


def get_links_and_destinations_from_html(f):
    """Collect the links and anchor destinations of an HTML document.

    Args:
        f: File-like object whose `read()` yields the document (text or bytes).

    Returns:
        A `(links, dests)` pair of sets. `links` holds every `href` of an
        `<a>` element, except same-page `#fragment` links that are satisfied
        by a destination in this document. `dests` holds every `name` of an
        `<a>` element and every `id` attribute, plus each `id` with a leading
        "user-content-" removed.
    """
    links = set()
    internal = set()
    dests = set()

    soup = BeautifulSoup(f.read(), "html5lib")

    # First try to find this: <script type="application/json" data-target="react-app.embeddedData">
    # If it exists, its content is some JSON that we need to parse to get the real content.
    # The links and destinations of that content seed the result sets; the
    # outer document is still scanned below and merged in.
    rich_text = _embedded_rich_text(soup)
    if rich_text is not None:
        (links, dests) = get_links_and_destinations_from_html(StringIO(rich_text))

    for link in soup.find_all('a'):
        if link.has_attr('href'):
            url = link['href']
            (internal if url.startswith('#') else links).add(url)

        if link.has_attr('name'):
            dests.add(link['name'])

    github_prefix = "user-content-"
    for element in soup.find_all(id=True):
        anchor = element['id']
        dests.add(anchor)
        # GitHub's rendering of .mediawiki files puts 'id="user-content-<ANCHOR>"' in the source
        # and dynamically creates a corresponding link #<ANCHOR>.
        if anchor.startswith(github_prefix):
            dests.add(anchor[len(github_prefix):])

    # Ignore internal links satisfied by a dest; the rest are reported as links.
    internal.difference_update('#' + d for d in dests)
    links.update(internal)
    return (links, dests)
def _fetch_link(link):
    """Download `link` and return `(content_type, body_bytes)`.

    A single retry is made after an HTTP 429 response, waiting for the
    advertised Retry-After delay (or 60 seconds if it cannot be parsed).

    Raises:
        URLError: If the link cannot be opened (including HTTPError statuses
            and a second 429).
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    https_handler = HTTPSHandler(context=ssl.create_default_context(cafile=certifi.where()))

    # Some DOI links (i.e. to https://doi.org/) redirect to link.springer.com
    # in a way that requires cookies (booo!). We allow this for DOI links,
    # but for all other links we simulate a client that never sets cookies.
    if link.startswith("https://doi.org/"):
        opener = build_opener(HTTPCookieProcessor(), https_handler)
    else:
        opener = build_opener(https_handler)

    for retry in range(2):
        try:
            # The context manager closes the connection once the body is read.
            with opener.open(Request(url=link, headers=headers)) as response:
                return (response.info().get_content_type(), response.read())
        except URLError as e:
            if retry == 0 and isinstance(e, HTTPError) and e.code == 429:
                try:
                    delay = max(int(e.headers['Retry-After'], 10) + 1, 0)
                except (KeyError, TypeError, ValueError):
                    # Header missing, or given as an HTTP date rather than seconds.
                    delay = 60

                print("(waiting %ds due to rate limiting)" % (delay,), end=" ")
                sys.stdout.flush()
                sleep(delay)
                continue

            raise


def main(args):
    """Check the links found in the given HTML/XHTML/PDF files.

    Links to files given on the command line are resolved locally under the
    "https://zips.z.cash/" prefix. Other `https:`/`HTTP:` links are fetched
    only when `--check` is given; `mailto:` links are never checked and any
    other scheme is reported as an error.

    Args:
        args: argv-style list; `args[0]` is the program name. Arguments
            starting with `--` are flags (`--check`, `--print-dests`), all
            others are input file paths.

    Returns:
        1 if no arguments were supplied (usage is printed), otherwise 0.
        NOTE(review): 0 is returned even when errors were reported; kept
        as-is because existing callers may depend on that exit status.
    """
    if len(args) < 2:
        print("Usage: ./links_and_dests.py [--check] [--print-dests] <file.pdf|html|xhtml>")
        return 1

    check = '--check' in args[1:]
    print_dests = '--print-dests' in args[1:]
    paths = [arg for arg in args[1:] if not arg.startswith('--')]

    all_links = {}  # url -> pdf_paths
    all_dests = {}  # url -> dests

    errors = deque()

    print("Reading files...")
    for path in paths:
        print(path, end=" ")
        sys.stdout.flush()

        with open(path, 'rb') as f:
            if path.endswith((".html", ".xhtml")):
                (links, dests) = get_links_and_destinations_from_html(f)
            elif path.endswith(".pdf"):
                (links, dests) = get_links_and_destinations_from_pdf(f)
            else:
                errors.append("Unrecognized file type: " + path)
                continue

        path = relpath(path)
        for l in links:
            all_links.setdefault(l, deque()).append(path)

        all_dests["https://zips.z.cash/" + path] = dests
        if path.endswith(".html"):
            # The same page is also reachable without its ".html" suffix.
            all_dests["https://zips.z.cash/" + path[:-5]] = dests

    print("\n")
    print("Links:")

    # Cache of the most recently fetched remote page. `dests` is parsed lazily
    # from `content` the first time a fragment of that page has to be checked.
    last_url = None
    content = None
    content_type = None
    dests = None

    # NOTE(review): in this copy of the file every status other than
    # "(not checked)" is the empty string; the assignments are kept so the
    # success/failure markers can be restored if they were lost in transit.
    for (l, p) in sorted(all_links.items()):
        print(l, end=" ")
        sys.stdout.flush()
        what = ("%s (occurs in %s)" % (l, " and ".join(p))) if len(paths) > 1 else l
        status = ""

        if ":" not in l:
            l = "https://zips.z.cash/" + l

        if l.startswith("mailto:"):
            status = "(not checked)"
        elif l.startswith(("https:", "HTTP:")):  # use uppercase HTTP: for links with no https: equivalent
            (url, _, fragment) = l.partition("#")

            if url in all_dests:
                if fragment and fragment not in all_dests[url]:
                    errors.append("Missing link target: " + what)
                    status = ""
                else:
                    status = ""
            elif check:
                # If url == last_url, there is no need to refetch content. This is an optimization when
                # checking URLs with the same site but different fragments (which will be sorted together).
                if url != last_url:
                    # Drop everything cached for the previous page *before*
                    # fetching, so that its destinations can never be used to
                    # judge fragments of this one.
                    last_url = content = content_type = dests = None
                    try:
                        (content_type, content) = _fetch_link(l)
                        last_url = url
                    except URLError as e:
                        errors.append("Could not open link: %s due to %r" % (what, e))
                        status = ""

                if content is not None:
                    if fragment:
                        if dests is None:
                            if content_type in ('text/html', 'application/xhtml+xml'):
                                (_, dests) = get_links_and_destinations_from_html(BytesIO(content))
                            elif content_type == 'application/pdf':
                                (_, dests) = get_links_and_destinations_from_pdf(BytesIO(content))

                        if dests is None:
                            # Content type we cannot extract destinations from.
                            print("(link target not checked)", end=" ")
                            status = ""
                        elif fragment not in dests:
                            # Filter out known false positive GitHub fragments that we can't check.
                            if last_url.startswith("https://github.com/") and (fragment.startswith('diff-') or GITHUB_LINE_FRAGMENT.match(fragment) is not None):
                                print("(link target not checked)", end=" ")
                                status = ""
                            else:
                                errors.append("Missing link target: " + what)
                                status = ""
                        else:
                            status = ""
                    else:
                        status = ""
        else:
            errors.append("Insecure or unrecognized protocol in link: " + what)
            status = ""

        print(status)

    if print_dests:
        for (path, dests) in all_dests.items():
            if path + ".html" not in all_dests:  # avoid duplication
                print("\nDestinations for %s:" % (path,))
                for d in dests:
                    print(d)

    if errors:
        print("\nErrors:")
        for e in errors:
            print(e)

    return 0
# Script entry point: pass the raw argv to main() and use its return value as
# the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv)
    sys.exit(exit_status)