Update links_and_dests.py to support HTML files and rate limiting (part 1).

Signed-off-by: Daira Hopwood <daira@jacaranda.org>

Author: Daira Hopwood
Date:   2021-09-09 14:57:51 +01:00
Parent: dcb4c4e89a
Commit: 4af8a9684d

1 changed file with 145 additions and 36 deletions
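
The updated usage line (see the diff below) accepts HTML and XHTML files in addition to PDFs. An example invocation, with hypothetical file names, would be:

    ./links_and_dests.py --check protocol/protocol.pdf rendered/zip-0316.html

With --check, an HTTP 429 response is now retried once after roughly the delay given by the server's Retry-After header, falling back to 60 seconds if that header is missing or unparseable.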


@@ -1,19 +1,34 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-try:
-    from PyPDF2 import PdfFileReader
-except ImportError:
-    print("Please install the PyPDF2 library using `pip3 install PyPDF2`.\n")
-    raise
-
-from urllib.request import build_opener, HTTPCookieProcessor, Request
-from urllib.error import URLError
-from os.path import basename
+from urllib.request import build_opener, HTTPCookieProcessor, HTTPSHandler, Request
+from urllib.error import URLError, HTTPError
+from os.path import relpath
 from collections import deque
 import sys
+from time import sleep
+import ssl
+from io import BytesIO
+
+try:
+    from bs4 import BeautifulSoup
+    import html5lib
+    import certifi
+except ImportError:
+    print("Please install the BeautifulSoup, html5lib, and certifi libraries using `pip install bs4 html5lib certifi`.\n")
+    raise
+
+if [int(v) for v in certifi.__version__.split('.')] < [2021, 5, 30]:
+    print("Please upgrade certifi using `pip install --upgrade certifi`.\n")
+    sys.exit(1)
+
 
-def get_links_and_destinations(f):
+def get_links_and_destinations_from_pdf(f):
+    try:
+        from PyPDF2 import PdfFileReader
+    except ImportError:
+        print("Please install the PyPDF2 library using `pip install PyPDF2`.\n")
+        raise
+
     # Based on <https://stackoverflow.com/a/5978161/393146>
     pdf = PdfFileReader(f)
@@ -26,14 +41,36 @@ def get_links_and_destinations(f):
             if uri is not None and uri not in links:
                 links.add(uri)
 
-    dests = pdf.getNamedDestinations()
+    dests = pdf.getNamedDestinations().keys()
     return (links, dests)
 
+
+def get_links_and_destinations_from_html(f):
+    links = set()
+    internal = set()
+    dests = set()
+    soup = BeautifulSoup(f.read(), "html5lib")
+    for link in soup.find_all('a'):
+        if link.has_attr('href'):
+            url = link['href']
+            (internal if url.startswith('#') else links).add(url)
+        if link.has_attr('name'):
+            dests.add(link['name'])
+
+    for link in soup.find_all(id=True):
+        dests.add(link['id'])
+
+    internal.difference_update(['#' + d for d in dests]) # ignore internal links satisfied by a dest
+    links.update(internal)
+    return (links, dests)
+
 
 def main(args):
     if len(args) < 2:
-        print("Usage: ./links_and_dests.py [--check] [--print-dests] <file.pdf>")
+        print("Usage: ./links_and_dests.py [--check] [--print-dests] <file.pdf|html|xhtml>")
         return 1
 
     check = '--check' in args[1:]
@@ -43,32 +80,55 @@ def main(args):
     all_links = {} # url -> pdf_paths
     all_dests = {} # url -> dests
+    errors = deque()
 
-    for pdf_path in paths:
-        with open(pdf_path, 'rb') as f:
-            (links, dests) = get_links_and_destinations(f)
+    print("Reading files...")
+    for path in paths:
+        print(path, end=" ")
+        sys.stdout.flush()
+        with open(path, 'rb') as f:
+            if path.endswith(".html") or path.endswith(".xhtml"):
+                (links, dests) = get_links_and_destinations_from_html(f)
+            elif path.endswith(".pdf"):
+                (links, dests) = get_links_and_destinations_from_pdf(f)
+            else:
+                errors.append("Unrecognized file type: " + path)
+                continue
 
+        path = relpath(path)
         for l in links:
             refs = all_links.get(l, None)
             if refs is None:
                 all_links[l] = refs = deque()
-            refs.append(pdf_path)
+            refs.append(path)
 
-        all_dests["https://zips.z.cash/protocol/" + basename(pdf_path)] = dests
-
-    errors = deque()
+        all_dests["https://zips.z.cash/" + path] = dests
+        if path.endswith(".html"):
+            all_dests["https://zips.z.cash/" + path[:-5]] = dests
 
+    print("\n")
     print("Links:")
+    last_url = None
+    content = None
+    content_type = None
+    dests = None
     for (l, p) in sorted(all_links.items()):
         print(l, end=" ")
         sys.stdout.flush()
         what = "%s (occurs in %s)" % (l, " and ".join(p)) if len(paths) > 1 else l
         status = ""
-        if not l.startswith("https:"):
-            errors.append("Insecure or unrecognized protocol in link: " + what)
-            status = "✗"
-        else:
+        if ":" not in l:
+            l = "https://zips.z.cash/" + l
+
+        if l.startswith("mailto:"):
+            status = "(not checked)"
+        elif l.startswith("https:") or l.startswith("HTTP:"): # use uppercase HTTP: for links with no https: equivalent
             (url, _, fragment) = l.partition("#")
             if url in all_dests:
                 if fragment and fragment not in all_dests[url]:
                     errors.append("Missing link target: " + what)
@@ -76,29 +136,77 @@ def main(args):
                 else:
                     status = "✓"
             elif check:
-                try:
-                    headers = {"User-Agent": "Mozilla/5.0"}
-                    # Some DOI links (i.e. to https://doi.org/) redirect to link.springer.com
-                    # in a way that requires cookies (booo!). We allow this for DOI links,
-                    # but for all other links we simulate a client that never sets cookies.
-                    if l.startswith("https://doi.org/"):
-                        opener = build_opener(HTTPCookieProcessor())
-                    else:
-                        opener = build_opener()
-                    response = opener.open(Request(url=l, headers=headers))
-                    response.read()
-                    status = "✓"
-                except URLError as e:
-                    errors.append("Could not open link: %s due to %r" % (what, e))
-                    status = "✗"
+                # If url == last_url, there is no need to refetch content. This is an optimization when
+                # checking URLs with the same site but different fragments (which will be sorted together).
+                if url != last_url:
+                    headers = {"User-Agent": "Mozilla/5.0"}
+                    https_handler = HTTPSHandler(context=ssl.create_default_context(cafile=certifi.where()))
+                    # Some DOI links (i.e. to https://doi.org/) redirect to link.springer.com
+                    # in a way that requires cookies (booo!). We allow this for DOI links,
+                    # but for all other links we simulate a client that never sets cookies.
+                    if l.startswith("https://doi.org/"):
+                        opener = build_opener(HTTPCookieProcessor(), https_handler)
+                    else:
+                        opener = build_opener(https_handler)
+
+                    for retry in range(2):
+                        try:
+                            response = opener.open(Request(url=l, headers=headers))
+                            content_type = response.info().get_content_type()
+                            content = response.read()
+                            last_url = url
+                        except URLError as e:
+                            if retry == 0 and isinstance(e, HTTPError) and e.code == 429:
+                                try:
+                                    delay = int(e.headers['Retry-After'], 10) + 1
+                                except Exception:
+                                    delay = 60
+                                print("(waiting %ds due to rate limiting)" % (delay,), end=" ")
+                                sys.stdout.flush()
+                                sleep(delay)
+                                continue
+
+                            errors.append("Could not open link: %s due to %r" % (what, e))
+                            status = "✗"
+                            content_type = None
+                            content = None
+                            last_url = None
+                            dests = None
+                        break
+
+                if content is not None:
+                    if fragment:
+                        if dests is None:
+                            if content_type == 'text/html':
+                                (_, dests) = get_links_and_destinations_from_html(BytesIO(content))
+                            elif content_type == 'application/pdf':
+                                (_, dests) = get_links_and_destinations_from_pdf(BytesIO(content))
+
+                        if dests is None:
+                            print("(link target not checked)", end=" ")
+                            status = "✓"
+                        elif fragment not in dests:
+                            errors.append("Missing link target: " + what)
+                            status = "✗"
+                        else:
+                            status = "✓"
+                    else:
+                        status = "✓"
+        else:
+            errors.append("Insecure or unrecognized protocol in link: " + what)
+            status = "✗"
 
         print(status)
 
     if print_dests:
-        for dests in all_dests:
-            print("\nDestinations for %s:" % (dests,))
-            for d in dests:
-                print(d)
+        for (path, dests) in all_dests.items():
+            if path + ".html" not in all_dests: # avoid duplication
+                print("\nDestinations for %s:" % (path,))
+                for d in dests:
+                    print(d)
 
     if errors:
         print("\nErrors:")
@@ -107,5 +215,6 @@ def main(args):
 
     return 0
 
+
 if __name__ == '__main__':
     sys.exit(main(sys.argv))