mirror of https://github.com/zcash/zips.git
Update links_and_dests.py to support HTML files and rate limiting (part 1).
Signed-off-by: Daira Hopwood <daira@jacaranda.org>
This commit is contained in:
parent
dcb4c4e89a
commit
4af8a9684d
|
@ -1,19 +1,34 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from urllib.request import build_opener, HTTPCookieProcessor, HTTPSHandler, Request
|
||||||
|
from urllib.error import URLError, HTTPError
|
||||||
|
from os.path import relpath
|
||||||
|
from collections import deque
|
||||||
|
import sys
|
||||||
|
from time import sleep
|
||||||
|
import ssl
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
try:
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import html5lib
|
||||||
|
import certifi
|
||||||
|
except ImportError:
|
||||||
|
print("Please install the BeautifulSoup, html5lib, and certifi libraries using `pip install bs4 html5lib certifi`.\n")
|
||||||
|
raise
|
||||||
|
|
||||||
|
if [int(v) for v in certifi.__version__.split('.')] < [2021, 5, 30]:
|
||||||
|
print("Please upgrade certifi using `pip install --upgrade certifi`.\n")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
def get_links_and_destinations_from_pdf(f):
|
||||||
try:
|
try:
|
||||||
from PyPDF2 import PdfFileReader
|
from PyPDF2 import PdfFileReader
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print("Please install the PyPDF2 library using `pip3 install PyPDF2`.\n")
|
print("Please install the PyPDF2 library using `pip install PyPDF2`.\n")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
from urllib.request import build_opener, HTTPCookieProcessor, Request
|
|
||||||
from urllib.error import URLError
|
|
||||||
from os.path import basename
|
|
||||||
from collections import deque
|
|
||||||
import sys
|
|
||||||
|
|
||||||
def get_links_and_destinations(f):
|
|
||||||
# Based on <https://stackoverflow.com/a/5978161/393146>
|
# Based on <https://stackoverflow.com/a/5978161/393146>
|
||||||
pdf = PdfFileReader(f)
|
pdf = PdfFileReader(f)
|
||||||
|
|
||||||
|
@ -26,14 +41,36 @@ def get_links_and_destinations(f):
|
||||||
if uri is not None and uri not in links:
|
if uri is not None and uri not in links:
|
||||||
links.add(uri)
|
links.add(uri)
|
||||||
|
|
||||||
dests = pdf.getNamedDestinations()
|
dests = pdf.getNamedDestinations().keys()
|
||||||
|
|
||||||
return (links, dests)
|
return (links, dests)
|
||||||
|
|
||||||
|
|
||||||
|
def get_links_and_destinations_from_html(f):
    """Collect outgoing links and link targets from an HTML file.

    Reads the whole of ``f`` (a binary file-like object) and parses it with
    html5lib via BeautifulSoup. Returns a pair ``(links, dests)`` where:

    * ``links`` is a set of external/relative hrefs found on ``<a>`` elements,
      plus any purely internal ``#fragment`` links that do NOT resolve to a
      destination in the same document (so they can be reported as broken);
    * ``dests`` is a set of anchor names — every ``<a name=...>`` value and
      every element ``id`` in the document.
    """
    external = set()
    fragment_only = set()
    anchors = set()

    document = BeautifulSoup(f.read(), "html5lib")

    for anchor in document.find_all('a'):
        href = anchor.get('href')
        if href is not None:
            # Links beginning with '#' stay within this document; keep them
            # separate so they can be checked against local anchors below.
            target_set = fragment_only if href.startswith('#') else external
            target_set.add(href)

        name = anchor.get('name')
        if name is not None:
            anchors.add(name)

    # Any element with an id attribute is also a valid link destination.
    for element in document.find_all(id=True):
        anchors.add(element['id'])

    # Drop internal links that are satisfied by a destination in this file;
    # whatever remains is unresolved and gets reported with the other links.
    fragment_only.difference_update('#' + name for name in anchors)
    external.update(fragment_only)

    return (external, anchors)
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
if len(args) < 2:
|
if len(args) < 2:
|
||||||
print("Usage: ./links_and_dests.py [--check] [--print-dests] <file.pdf>")
|
print("Usage: ./links_and_dests.py [--check] [--print-dests] <file.pdf|html|xhtml>")
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
check = '--check' in args[1:]
|
check = '--check' in args[1:]
|
||||||
|
@ -43,32 +80,55 @@ def main(args):
|
||||||
all_links = {} # url -> pdf_paths
|
all_links = {} # url -> pdf_paths
|
||||||
all_dests = {} # url -> dests
|
all_dests = {} # url -> dests
|
||||||
|
|
||||||
for pdf_path in paths:
|
errors = deque()
|
||||||
with open(pdf_path, 'rb') as f:
|
|
||||||
(links, dests) = get_links_and_destinations(f)
|
|
||||||
|
|
||||||
|
print("Reading files...")
|
||||||
|
for path in paths:
|
||||||
|
print(path, end=" ")
|
||||||
|
sys.stdout.flush()
|
||||||
|
|
||||||
|
with open(path, 'rb') as f:
|
||||||
|
if path.endswith(".html") or path.endswith(".xhtml"):
|
||||||
|
(links, dests) = get_links_and_destinations_from_html(f)
|
||||||
|
elif path.endswith(".pdf"):
|
||||||
|
(links, dests) = get_links_and_destinations_from_pdf(f)
|
||||||
|
else:
|
||||||
|
errors.append("Unrecognized file type: " + path)
|
||||||
|
continue
|
||||||
|
|
||||||
|
path = relpath(path)
|
||||||
for l in links:
|
for l in links:
|
||||||
refs = all_links.get(l, None)
|
refs = all_links.get(l, None)
|
||||||
if refs is None:
|
if refs is None:
|
||||||
all_links[l] = refs = deque()
|
all_links[l] = refs = deque()
|
||||||
refs.append(pdf_path)
|
refs.append(path)
|
||||||
|
|
||||||
all_dests["https://zips.z.cash/protocol/" + basename(pdf_path)] = dests
|
all_dests["https://zips.z.cash/" + path] = dests
|
||||||
|
if path.endswith(".html"):
|
||||||
errors = deque()
|
all_dests["https://zips.z.cash/" + path[:-5]] = dests
|
||||||
|
|
||||||
|
print("\n")
|
||||||
print("Links:")
|
print("Links:")
|
||||||
|
|
||||||
|
last_url = None
|
||||||
|
content = None
|
||||||
|
content_type = None
|
||||||
|
dests = None
|
||||||
|
|
||||||
for (l, p) in sorted(all_links.items()):
|
for (l, p) in sorted(all_links.items()):
|
||||||
print(l, end=" ")
|
print(l, end=" ")
|
||||||
sys.stdout.flush()
|
sys.stdout.flush()
|
||||||
what = "%s (occurs in %s)" % (l, " and ".join(p)) if len(paths) > 1 else l
|
what = "%s (occurs in %s)" % (l, " and ".join(p)) if len(paths) > 1 else l
|
||||||
status = ""
|
status = ""
|
||||||
|
|
||||||
if not l.startswith("https:"):
|
if ":" not in l:
|
||||||
errors.append("Insecure or unrecognized protocol in link: " + what)
|
l = "https://zips.z.cash/" + l
|
||||||
status = "❌"
|
|
||||||
else:
|
if l.startswith("mailto:"):
|
||||||
|
status = "(not checked)"
|
||||||
|
elif l.startswith("https:") or l.startswith("HTTP:"): # use uppercase HTTP: for links with no https: equivalent
|
||||||
(url, _, fragment) = l.partition("#")
|
(url, _, fragment) = l.partition("#")
|
||||||
|
|
||||||
if url in all_dests:
|
if url in all_dests:
|
||||||
if fragment and fragment not in all_dests[url]:
|
if fragment and fragment not in all_dests[url]:
|
||||||
errors.append("Missing link target: " + what)
|
errors.append("Missing link target: " + what)
|
||||||
|
@ -76,27 +136,75 @@ def main(args):
|
||||||
else:
|
else:
|
||||||
status = "✓"
|
status = "✓"
|
||||||
elif check:
|
elif check:
|
||||||
try:
|
# If url == last_url, there is no need to refetch content. This is an optimization when
|
||||||
|
# checking URLs with the same site but different fragments (which will be sorted together).
|
||||||
|
if url != last_url:
|
||||||
headers = {"User-Agent": "Mozilla/5.0"}
|
headers = {"User-Agent": "Mozilla/5.0"}
|
||||||
|
https_handler = HTTPSHandler(context=ssl.create_default_context(cafile=certifi.where()))
|
||||||
|
|
||||||
# Some DOI links (i.e. to https://doi.org/) redirect to link.springer.com
|
# Some DOI links (i.e. to https://doi.org/) redirect to link.springer.com
|
||||||
# in a way that requires cookies (booo!). We allow this for DOI links,
|
# in a way that requires cookies (booo!). We allow this for DOI links,
|
||||||
# but for all other links we simulate a client that never sets cookies.
|
# but for all other links we simulate a client that never sets cookies.
|
||||||
if l.startswith("https://doi.org/"):
|
if l.startswith("https://doi.org/"):
|
||||||
opener = build_opener(HTTPCookieProcessor())
|
opener = build_opener(HTTPCookieProcessor(), https_handler)
|
||||||
else:
|
else:
|
||||||
opener = build_opener()
|
opener = build_opener(https_handler)
|
||||||
|
|
||||||
|
for retry in range(2):
|
||||||
|
try:
|
||||||
response = opener.open(Request(url=l, headers=headers))
|
response = opener.open(Request(url=l, headers=headers))
|
||||||
response.read()
|
content_type = response.info().get_content_type()
|
||||||
status = "✓"
|
content = response.read()
|
||||||
|
last_url = url
|
||||||
except URLError as e:
|
except URLError as e:
|
||||||
|
if retry == 0 and isinstance(e, HTTPError) and e.code == 429:
|
||||||
|
try:
|
||||||
|
delay = int(e.headers['Retry-After'], 10) + 1
|
||||||
|
except Exception:
|
||||||
|
delay = 60
|
||||||
|
|
||||||
|
print("(waiting %ds due to rate limiting)" % (delay,), end=" ")
|
||||||
|
sys.stdout.flush()
|
||||||
|
sleep(delay)
|
||||||
|
continue
|
||||||
|
|
||||||
errors.append("Could not open link: %s due to %r" % (what, e))
|
errors.append("Could not open link: %s due to %r" % (what, e))
|
||||||
status = "❌"
|
status = "❌"
|
||||||
|
content_type = None
|
||||||
|
content = None
|
||||||
|
last_url = None
|
||||||
|
|
||||||
|
dests = None
|
||||||
|
break
|
||||||
|
|
||||||
|
if content is not None:
|
||||||
|
if fragment:
|
||||||
|
if dests is None:
|
||||||
|
if content_type == 'text/html':
|
||||||
|
(_, dests) = get_links_and_destinations_from_html(BytesIO(content))
|
||||||
|
elif content_type == 'application/pdf':
|
||||||
|
(_, dests) = get_links_and_destinations_from_pdf(BytesIO(content))
|
||||||
|
|
||||||
|
if dests is None:
|
||||||
|
print("(link target not checked)", end=" ")
|
||||||
|
status = "✓"
|
||||||
|
elif fragment not in dests:
|
||||||
|
errors.append("Missing link target: " + what)
|
||||||
|
status = "❌"
|
||||||
|
else:
|
||||||
|
status = "✓"
|
||||||
|
else:
|
||||||
|
status = "✓"
|
||||||
|
else:
|
||||||
|
errors.append("Insecure or unrecognized protocol in link: " + what)
|
||||||
|
status = "❌"
|
||||||
|
|
||||||
print(status)
|
print(status)
|
||||||
|
|
||||||
if print_dests:
|
if print_dests:
|
||||||
for dests in all_dests:
|
for (path, dests) in all_dests.items():
|
||||||
print("\nDestinations for %s:" % (dests,))
|
if path + ".html" not in all_dests: # avoid duplication
|
||||||
|
print("\nDestinations for %s:" % (path,))
|
||||||
for d in dests:
|
for d in dests:
|
||||||
print(d)
|
print(d)
|
||||||
|
|
||||||
|
@ -107,5 +215,6 @@ def main(args):
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
sys.exit(main(sys.argv))
|
sys.exit(main(sys.argv))
|
||||||
|
|
Loading…
Reference in New Issue