From 292053edac4e1bca3b19bff51fd1c3f022a50fb0 Mon Sep 17 00:00:00 2001
From: mi-hol <22799428+mi-hol@users.noreply.github.com>
Date: Fri, 30 Dec 2022 16:33:04 +0100
Subject: [PATCH] remove as still WIP

---
 wiki-tools/weborphans.py | 366 ---------------------------------------
 1 file changed, 366 deletions(-)
 delete mode 100644 wiki-tools/weborphans.py

diff --git a/wiki-tools/weborphans.py b/wiki-tools/weborphans.py
deleted file mode 100644
index ec0f5576..00000000
--- a/wiki-tools/weborphans.py
+++ /dev/null
@@ -1,366 +0,0 @@
-#!/usr/bin/env python
-
-# Source: https://shallowsky.com/blog/tech/web/finding-web-orphans.html
-# https://github.com/akkana/scripts/blob/master/weborphans
-
-# Check a website (perhaps localhost) against a local mirror.
-# Find broken links and orphaned files.
-# You must specify both the directory, and a web URL to a server
-# (e.g. localhost) that is serving that directory.
-
-import sys, os
-import posixpath
-from urllib.parse import urlparse, urlunparse, urlunsplit, quote
-from urllib.request import Request, build_opener
-from urllib.error import HTTPError, URLError
-
-# Tested with Python 3.10.8.
-if sys.version_info < (3, 10):
-    raise RuntimeError("This package requires Python 3.10+")
-
-from bs4 import BeautifulSoup
-
-class Spider:
-    def __init__(self, rootdir, starturl):
-        self.debug = False
-
-        self.starturl = starturl
-        self.rootdir = os.path.normpath(rootdir)
-        if not os.path.isdir(rootdir):
-            # It's not a directory, so take the dirname, but save the filename.
-            self.rootdir, rootfile = os.path.split(rootdir)
-        else:
-            # It's already a directory, so self.rootdir is fine.
-            rootfile = None
-
-        # XXX This next bit isn't platform-agnostic:
-        if not self.rootdir.endswith('/'):
-            self.rootdir += '/'
-
-        # Now we need to get the true root url. The starturl may have
-        # something like /index.html appended to it; we need something
-        # we can prepend to paths.
-
-        # Extract any path information from the root url:
-        parsed = urlparse(starturl)
-        self.scheme = parsed.scheme
-        self.host = parsed.netloc
-        self.rooturlpath = posixpath.normpath(parsed.path)
-        dirpart, basepart = posixpath.split(self.rooturlpath)
-        # If the path is a directory and ends in / (as it should),
-        # then posixpath will split on that slash, not the previous one.
-        if not basepart:
-            dirpart, basepart = posixpath.split(dirpart)
-
-        # Now basepart is the last part of the path, which might
-        # be a directory name on the server or it might be index.*.
-        # Compare it to the last part of self.rootdir, which is
-        # guaranteed to be a directory.
-        # But we have to split it twice, because self.rootdir ends in /
-        # so the first split will return '' as the basename.
-        lastdir = posixpath.basename(posixpath.dirname(self.rootdir))
-        if basepart != lastdir:
-            self.rooturlpath = posixpath.dirname(self.rooturlpath)
-
-        if not self.rooturlpath.endswith('/'):
-            self.rooturlpath += '/'
-
-        # Now we're confident self.rooturlpath is the base directory.
-        # Add the scheme and host back on.
-        self.rooturl = urlunsplit((self.scheme, self.host,
-                                   self.rooturlpath, '', ''))
-        if not self.rooturl.endswith('/'):
-            self.rooturl += '/'
-
-        print("rootdir:", self.rootdir)
-        print("rooturl:", self.rooturl)
-        print("rooturlpath:", self.rooturlpath)
-        print("scheme:", self.scheme)
-        print("host:", self.host)
-        print()
-
-        self.urls_to_check = [ self.rooturl ]
-        self.urls_succeeded = []
-        self.urls_failed = []
-        self.outside_urls = []
-        self.files_succeeded = []
-
-        # Eventually, the list of excludes should be a commandline argument.
-        # For now, let's just make sure all the .git objects aren't orphaned,
-        # nor web stats or archived files.
-        self.excludes = [ ".git", "stats", "0-pre2011", "0-calendars" ]
-
-        # Files that aren't explicitly referenced by the website,
-        # but might be needed for other purposes.
-        self.nonorphans = [ "favicon.ico", "robots.txt", ".htaccess" ]
-
-    def spide(self):
-        """Check all urls in urls_to_check, which has new urls
-           being added to it during the spidering process.
-        """
-        self.check_url(self.starturl)
-        while self.urls_to_check:
-            self.check_url(self.urls_to_check.pop())
-
-        print("Done spiding")
-
-    def check_orphans(self):
-        """Assuming we already have self.files_succeeded,
-           find all files in self.rootdir that weren't in succeeded.
-        """
-        self.orphans = []
-        for root, dirs, files in os.walk(self.rootdir, topdown=True):
-            dirs[:] = [d for d in dirs if d not in self.excludes]
-            for filename in files:
-                if filename in self.nonorphans:
-                    continue
-                f = os.path.join(root, filename)
-                if f not in self.files_succeeded:
-                    self.orphans.append(f)
-
-    def print_summary(self):
-        print()
-        print("URLs succeeded:")
-        print('\n'.join(self.urls_succeeded))
-        print()
-        print("Outside URLs:")
-        print('\n'.join(self.outside_urls))
-        print()
-        print("URLs failed:")
-        print('\n'.join(self.urls_failed))
-        print()
-        print("Orphans:")
-        print('\n'.join(self.orphans))
-        print()
-        print(len(self.urls_succeeded), "good links,",
-              len(self.outside_urls), "external urls not checked,",
-              len(self.urls_failed), "bad links,",
-              len(self.orphans), "orphaned files.")
-
-    def get_local_for_url(self, urlpath):
-        """Get a local file path for a path parsed from an absolute URL."""
-        # Compare urlpath with self.rooturlpath:
-        if self.rooturlpath not in urlpath:
-            return None
-        return os.path.normpath(urlpath.replace(self.rooturlpath,
-                                                self.rootdir,
-                                                1))
-
-    def make_absolute(self, url, relative_to):
-        """Make a URL absolute. If it's a relative path,
-           then make it relative to relative_to,
-           which must be an absolute path on the webhost.
-        """
-        parsed = urlparse(url)
-        if parsed.scheme:    # already has an http://host specified
-            # XXX If we ever extend this to check validity of
-            # external URLs, this next condition is the one to change.
-            if parsed.netloc != self.host:
-                if self.debug:
-                    print("Ignoring external link", url)
-                return None
-            return url
-
-        # So there's no scheme. Add one.
-        if parsed.path.startswith('/'):
-            # The results of urlparse() aren't modifiable, but
-            # if we turn them into a list we can modify them,
-            # then turn them back into a URL.
-            lurl = list(parsed)
-            lurl[0] = self.scheme
-            lurl[1] = self.host
-            return urlunparse(lurl)
-
-        # Otherwise it's relative to urldir. Make it absolute, normalized.
-        lurl = list(parsed)
-        lurl[0] = self.scheme
-        lurl[1] = self.host
-        lurl[2] = posixpath.normpath(posixpath.join(relative_to, parsed.path))
-        return urlunparse(lurl)
-
-    def check_url(self, url):
-        """Check a URL. This should be an absolute URL on the server."""
-        # If we got this far, we'll be comparing links.
-        # So we'll need to know the parsed parts of this url.
-        urlparsed = urlparse(url)
-        if not urlparsed.scheme or not urlparsed.path.startswith('/'):
-            print("EEK! Non-relative URL passed to check_url, bailing")
-            return
-
-        # URL-encode special characters like spaces. Use quote(), not
-        # quote_plus(), so path separators aren't escaped:
-        urlpath = quote(urlparsed.path)
-
-        # This check must come after the special char substitution.
-        if urlpath in self.urls_succeeded or urlpath in self.urls_failed:
-            return
-
-        if self.debug:
-            print("=============================== Checking", url)
-
-        # Now we need just the directory part. This might be
-        # dirname(urlparsed.path), if the url is a file, or it
-        # might just be urlparsed.path if that's already a directory.
-        # The only way to know is to check on the local filesystem.
-        # But here's the tricky part: to get the absolute path,
-        # we need to know what relative links are relative_to,
-        # but if they themselves XXX
-        localpath = self.get_local_for_url(urlparsed.path)
-        if self.debug:
-            print("=== local for", urlpath, "is", localpath)
-
-        if not localpath:
-            if self.debug:
-                print(urlparsed.path, "is outside original directory; skipping")
-            if url not in self.outside_urls:
-                self.outside_urls.append(url)
-            return
-
-        if not os.path.exists(localpath):
-            if self.debug:
-                print("Local path '%s' doesn't exist! %s" % (localpath, url))
-            self.urls_failed.append(urlpath)
-            return
-
-        # If we substituted any special characters, rebuild the URL:
-        if urlpath != urlparsed.path:
-            lurl = list(urlparsed)
-            lurl[2] = urlpath
-            url = urlunparse(lurl)
-
-            if self.debug:
-                print("Substituted characters, recombined to", url)
-
-        if os.path.isdir(localpath):
-            # The web server will substitute index.something,
-            # so we'd better do that too or else the index file
-            # will show up as an orphan.
-            localdir = localpath
-            localpath = None
-            for ext in ( "php", "cgi", "html" ):
-                indexfile = os.path.join(localdir, "index." + ext)
-                if os.path.exists(indexfile):
-                    localpath = indexfile
-                    break
-            if not localpath:
-                print("Can't find an index file inside", localdir)
-                return
-            urldir = urlpath
-        else:
-            localdir = os.path.dirname(localpath)
-            urldir = posixpath.dirname(urlpath)
-
-        if self.debug:
-            print("localpath", localpath, "localdir", localdir)
-            print("urldir:", urldir)
-
-        try:
-            request = Request(url)
-            handle = build_opener()
-        except IOError:
-            return None
-
-        if not handle:
-            print("Can't open", url)
-
-        # request.add_header("User-Agent", AGENT)
-
-        try:
-            response = handle.open(request)
-            info = response.info()
-            if 'content-type' not in info.keys() or \
-               not info['content-type'].startswith('text/html'):
-                if self.debug:
-                    print(url, "isn't HTML; skipping")
-                self.urls_succeeded.append(urlpath)
-                self.files_succeeded.append(localpath)
-                return
-            content = response.read()
-
-        except HTTPError as error:
-            if error.code == 404:
-                print("ERROR: %s -> %s" % (error, error.url))
-            else:
-                print("ERROR: %s" % error)
-            self.urls_failed.append(urlpath)
-            return
-
-        except URLError as error:
-            print("ERROR: %s" % error)
-            self.urls_failed.append(urlpath)
-            return
-
-        self.urls_succeeded.append(urlpath)
-        self.files_succeeded.append(localpath)
-
-        ctype = response.headers['content-type']
-        if not ctype.startswith("text/html"):
-            if self.debug:
-                print(url, "isn't HTML (%s); not reading content" % ctype)
-            return
-
-        soup = BeautifulSoup(content, "html.parser")
-
-        for tag in soup.find_all('a', href=True):
-            href = tag.get("href")
-            if not href:
-                continue
-            if href[0] == '#':
-                continue
-
-            href = self.make_absolute(href, urldir)
-            if not href:
-                # It's probably an external URL. Skip it.
-                href = tag.get("href")
-                if href not in self.outside_urls:
-                    self.outside_urls.append(href)
-                continue
-
-            # This check won't get everything, because href
-            # hasn't been special char substituted yet.
-            if href not in self.urls_to_check and \
-               href not in self.urls_succeeded and \
-               href not in self.urls_failed:
-                self.urls_to_check.append(href)
-
-        for tag in soup.find_all('img', src=True):
-            src = self.make_absolute(tag.get('src'), urldir)
-            if not src:
-                self.outside_urls.append(tag.get('src'))
-                continue
-            urlparsed = urlparse(src)
-            localpath = self.get_local_for_url(urlparsed.path)
-            self.urls_to_check.append(src)
-
-if __name__ == '__main__':
-    if len(sys.argv) < 3:
-        print("Usage: %s local_dir url" % os.path.basename(sys.argv[0]))
-        sys.exit(1)
-
-    spider = Spider(sys.argv[1], sys.argv[2])
-    try:
-        spider.spide()
-        spider.check_orphans()
-        spider.print_summary()
-    except KeyboardInterrupt:
-        print("Interrupt")
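
Note on the removed script, for whenever work on it resumes: its core idea (spider the site as served, e.g. on localhost, record the local file each successful URL maps to, then walk the mirror directory and report files the crawl never reached) can be sketched with the standard library alone. The sketch below is a minimal illustration under stated assumptions, not the removed implementation: it omits the index-file substitution and the special-character quoting that weborphans.py attempted, and "./site" and "http://localhost:8000/" are placeholder values.

#!/usr/bin/env python3
# Minimal sketch of the orphan check (standard library only).
# Assumption: the directory "./site" is being served at
# "http://localhost:8000/"; both are placeholder values.
import os
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse
from urllib.request import urlopen

class LinkParser(HTMLParser):
    """Collect href/src values from <a> and <img> tags."""
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        for name, value in attrs:
            if (tag, name) in (("a", "href"), ("img", "src")) and value:
                self.links.append(value)

def find_orphans(rootdir, rooturl):
    host = urlparse(rooturl).netloc
    seen, queue = set(), ["/"]
    while queue:
        path = queue.pop()
        if path in seen:
            continue
        seen.add(path)
        pageurl = urljoin(rooturl, path)
        try:
            with urlopen(pageurl) as resp:
                # Non-HTML responses (images, CSS) count as visited
                # but aren't parsed for further links.
                if resp.headers.get_content_type() != "text/html":
                    continue
                parser = LinkParser()
                parser.feed(resp.read().decode("utf-8", errors="replace"))
        except OSError as err:   # HTTPError and URLError derive from OSError
            print("broken: %s (%s)" % (pageurl, err))
            continue
        for link in parser.links:
            parsed = urlparse(urljoin(pageurl, link))  # resolve relative links
            if parsed.netloc == host:                  # stay on this host
                queue.append(parsed.path)
    # Anything on disk that no crawled URL path maps to is an orphan.
    visited = {os.path.normpath(os.path.join(rootdir, p.lstrip("/")))
               for p in seen}
    for root, dirs, files in os.walk(rootdir):
        for name in files:
            fullpath = os.path.normpath(os.path.join(root, name))
            if fullpath not in visited:
                print("orphan:", fullpath)

if __name__ == '__main__':
    find_orphans("./site", "http://localhost:8000/")

Run it against the same directory the local server is serving: "orphan:" lines are files that exist on disk but are unreachable from the start page, and "broken:" lines are links that returned an error.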