diff --git a/links_and_dests.py b/links_and_dests.py index c19a32b2..ae25fc46 100755 --- a/links_and_dests.py +++ b/links_and_dests.py @@ -8,7 +8,9 @@ from collections import deque import sys from time import sleep import ssl -from io import BytesIO +from io import BytesIO, StringIO +import json +import re try: from bs4 import BeautifulSoup @@ -22,6 +24,9 @@ if [int(v) for v in certifi.__version__.split('.')] < [2021, 5, 30]: print("Please upgrade certifi using `pip install --upgrade certifi`.\n") sys.exit(1) +GITHUB_LINE_FRAGMENT = re.compile('L[0-9]+') + + def get_links_and_destinations_from_pdf(f): try: from PyPDF2 import PdfFileReader @@ -52,13 +57,23 @@ def get_links_and_destinations_from_html(f): dests = set() soup = BeautifulSoup(f.read(), "html5lib") + + # First try to find this: