Add links_and_dests.py.

This can be used to print outgoing links and targets in the PDF, and detect a subset of errors.
It depends on the PyPDF2 library (pip3 install PyPDF2).

Signed-off-by: Daira Hopwood <daira@jacaranda.org>
This commit is contained in:
Daira Hopwood 2021-04-05 21:39:51 +01:00
parent 4f50d5e515
commit 1f041f955a
1 changed file with 66 additions and 0 deletions

66
protocol/links_and_dests.py Executable file
View File

@ -0,0 +1,66 @@
#!/usr/bin/env python3
try:
from PyPDF2 import PdfFileReader
except ImportError:
print("Please install the PyPDF2 library using `pip3 install PyPDF2`.\n")
raise
from collections import deque
import sys
def get_links_and_destinations(f):
    """Collect the outgoing link URIs and named destinations of a PDF.

    Args:
        f: Open binary stream of the PDF; passed straight to
            ``PdfFileReader``.

    Returns:
        A ``(links, dests, errors)`` tuple:

        * ``links`` -- deque of the distinct ``/URI`` values found in the
          ``/A`` entries of page annotations, in first-seen order.
        * ``dests`` -- whatever ``pdf.getNamedDestinations()`` returns
          (presumably a mapping keyed by destination name; it is only used
          here for ``in`` membership tests).
        * ``errors`` -- deque of human-readable messages, one for every
          link that does not start with ``https:`` and one for every
          ``https://zips.z.cash/protocol/`` link whose non-empty fragment
          is not in ``dests``.
    """
    # Based on <https://stackoverflow.com/a/5978161/393146>

    def resolve(obj):
        # NOTE(review): PyPDF2 1.x appears to dereference indirect objects
        # only on `[]` access, not on `.get()`, so values fetched with
        # `.get()` are resolved explicitly. The plain `[]` / `{}` / `None`
        # defaults have no getObject() and are passed through unchanged.
        return obj.getObject() if hasattr(obj, 'getObject') else obj

    pdf = PdfFileReader(f)
    links = deque()
    seen = set()  # O(1) duplicate check; `links` keeps first-seen order
    errors = deque()

    for page_number in range(pdf.getNumPages()):
        page = pdf.getPage(page_number).getObject()
        for annotation in resolve(page.get('/Annots', [])):
            action = resolve(resolve(annotation).get('/A', {}))
            uri = resolve(action.get('/URI', None))
            if uri is not None and uri not in seen:
                seen.add(uri)
                links.append(uri)

    dests = pdf.getNamedDestinations()

    for link in links:
        if not link.startswith("https:"):
            errors.append("Insecure or unrecognized protocol in link: " + link)
        if link.startswith("https://zips.z.cash/protocol/"):
            # Only links back into the protocol spec can be checked against
            # this document's own named destinations.
            fragment = link.partition("#")[2]
            if fragment and fragment not in dests:
                errors.append("Missing link target: " + link)

    return (links, dests, errors)
def main(args):
    """Print the links, destinations and detected errors of one PDF.

    Args:
        args: Command-line argument list in ``sys.argv`` form; ``args[1]``
            is the path of the PDF to inspect.

    Returns:
        1 when no PDF path was supplied (after printing a usage line),
        otherwise 0 -- including when link errors were reported.
    """
    if len(args) < 2:
        print("Usage: ./links_and_dests.py <file.pdf>")
        return 1

    pdf_path = args[1]
    with open(pdf_path, 'rb') as pdf_file:
        links, dests, errors = get_links_and_destinations(pdf_file)

    # These two sections are always printed, even when empty.
    for heading, items in (("Links:", links), ("\nDestinations:", dests)):
        print(heading)
        for item in items:
            print(item)

    # The error section is printed only when there is something to report.
    if errors:
        print("\nErrors:")
        for message in errors:
            print(message)

    return 0
if __name__ == '__main__':
    # Run as a script: hand main()'s return value to the shell as the
    # process exit status.
    exit_status = main(sys.argv)
    sys.exit(exit_status)