Merge pull request #322 from mi-hol/master

remove weborphans.py as still WIP
mi-hol 2022-12-30 16:35:54 +01:00 committed by GitHub
commit a99db3e0b3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 0 additions and 366 deletions

@@ -1,366 +0,0 @@
#!/usr/bin/env python
#source: https://shallowsky.com/blog/tech/web/finding-web-orphans.html
#https://github.com/akkana/scripts/blob/master/weborphans
# Check a website (perhaps localhost) against a local mirror.
# Find broken links and orphaned files.
# You must specify both the directory, and a web URL to a server
# (e.g. localhost) that is serving that directory.
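# A minimal usage sketch (hypothetical paths; adjust to your own mirror
# and server setup):
#
#     ./weborphans.py ~/public_html/mysite http://localhost/mysite/
#
# The first argument is the local directory holding the site's files,
# the second is the URL under which the same tree is served.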
import sys, os
import posixpath

from urllib.parse import urlparse
from urllib.parse import urlunparse
from urllib.parse import urlunsplit
from urllib.parse import quote
from urllib.request import Request
from urllib.request import build_opener
from urllib.error import HTTPError
from urllib.error import URLError

# Tested with Python 3.10.8
if sys.version_info < (3, 10):
    raise RuntimeError("This package requires Python 3.10+")

from bs4 import BeautifulSoup
class Spider:
    def __init__(self, rootdir, starturl):
        self.debug = False
        self.starturl = starturl
        self.rootdir = os.path.normpath(rootdir)
        if not os.path.isdir(rootdir):
            # It's not a directory, so take the dirname, but save the filename.
            self.rootdir, rootfile = os.path.split(rootdir)
        else:
            # It's already a directory, so self.rootdir is fine.
            rootfile = None

        # XXX This next bit isn't platform-agnostic:
        if not self.rootdir.endswith('/'):
            self.rootdir += '/'

        # Now we need to get the true root url. The starturl may have
        # something like /index.html appended to it; we need something
        # we can prepend to paths.
        # Extract any path information from the root url:
        parsed = urlparse(starturl)
        self.scheme = parsed.scheme
        self.host = parsed.netloc
        self.rooturlpath = posixpath.normpath(parsed.path)
        dirpart, basepart = posixpath.split(self.rooturlpath)

        # If the path is a directory and ends in / (as it should),
        # then posixpath will split on that slash, not the previous one.
        if not basepart:
            dirpart, basepart = posixpath.split(dirpart)

        # Now basepart is the last part of the path, which might
        # be a directory name on the server or it might be index.*
        # Compare it to the last part of self.rootdir, which is
        # guaranteed to be a directory.
        # But we have to split it twice, because self.rootdir ends in /
        # so the first split will return '' as the basename.
        lastdir = posixpath.basename(posixpath.dirname(self.rootdir))
        if basepart != lastdir:
            self.rooturlpath = posixpath.dirname(self.rooturlpath)
        if not self.rooturlpath.endswith('/'):
            self.rooturlpath += '/'

        # Now we're confident self.rooturlpath is the base directory.
        # Add the scheme and host back on.
        self.rooturl = urlunsplit((self.scheme, self.host,
                                   self.rooturlpath, None, None))
        if not self.rooturl.endswith('/'):
            self.rooturl += '/'

        print("rootdir:", self.rootdir)
        print("rooturl:", self.rooturl)
        print("rooturlpath:", self.rooturlpath)
        print("scheme:", self.scheme)
        print("host:", self.host)
        print()

        self.urls_to_check = [ self.rooturl ]
        self.urls_succeeded = []
        self.urls_failed = []
        self.outside_urls = []
        self.files_succeeded = []

        # Eventually, the list of excludes should be a commandline argument.
        # For now, let's just make sure all the .git objects aren't orphaned,
        # nor web stats or archived files.
        self.excludes = [ ".git", "stats", "0-pre2011", "0-calendars" ]

        # Files that aren't explicitly referenced by the website,
        # but might be needed for other purposes.
        self.nonorphans = [ "favicon.ico", "robots.txt", ".htaccess" ]
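    # Worked example of the normalization above (hypothetical values):
    #   Spider("/home/me/mysite/index.html", "http://localhost/mysite/index.html")
    #   ends up with  rootdir     = "/home/me/mysite/"
    #                 rooturlpath = "/mysite/"
    #                 rooturl     = "http://localhost/mysite/"
    # i.e. both the filename part of rootdir and the trailing index.* of the
    # start URL are stripped, so URL paths can be mapped onto local files.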
    def spide(self):
        """Check all urls in urls_to_check, which has new urls
           being added to it during the spidering process.
        """
        self.check_url(self.starturl)
        while self.urls_to_check:
            self.check_url(self.urls_to_check.pop())
        print("Done spiding")

    def check_orphans(self):
        """Assuming we already have self.files_succeeded,
           find all files in self.rootdir that weren't in succeeded.
        """
        self.orphans = []
        for root, dirs, files in os.walk(self.rootdir, topdown=True):
            dirs[:] = [d for d in dirs if d not in self.excludes]
            for filename in files:
                if filename in self.nonorphans:
                    continue
                f = os.path.join(root, filename)
                if f not in self.files_succeeded:
                    self.orphans.append(f)
    def print_summary(self):
        print()
        print("URLs succeeded:")
        print('\n'.join(self.urls_succeeded))
        print()
        print("Outside URLs:")
        print('\n'.join(self.outside_urls))
        print()
        print("URLs failed:")
        print('\n'.join(self.urls_failed))
        print()
        print("Orphans:")
        print('\n'.join(self.orphans))
        print()
        print(len(self.urls_succeeded), "good links,",
              len(self.outside_urls), "external urls not checked,",
              len(self.urls_failed), "bad links,",
              len(self.orphans), "orphaned files.")
    def get_local_for_url(self, urlpath):
        """Get a local file path for a path parsed from an absolute URL.
        """
        # Compare the URL path with self.rooturlpath:
        if self.rooturlpath not in urlpath:
            return None
        return os.path.normpath(urlpath.replace(self.rooturlpath,
                                                self.rootdir, 1))
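    # Example of the mapping above (assumed values):
    #   with rooturlpath = "/mysite/" and rootdir = "/home/me/mysite/",
    #   get_local_for_url("/mysite/blog/2020/index.html")
    #   returns "/home/me/mysite/blog/2020/index.html";
    #   URL paths that don't contain rooturlpath return None.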
    def make_absolute(self, url, relative_to):
        """Make a URL absolute. If it's a relative path,
           then make it relative to relative_to,
           which must be an absolute path on the webhost.
        """
        parsed = urlparse(url)
        if parsed.scheme:    # already has an http://host specified
            # XXX If we ever extend this to check validity of
            # external URLs, this next condition is the one to change.
            if parsed.netloc != self.host:
                if self.debug:
                    print("Ignoring external link", url)
                return None
            return url

        # So there's no scheme. Add one.
        if parsed.path.startswith('/'):
            # The results of urlparse() aren't modifiable, but
            # if we turn them into a list we can modify them,
            # then turn them back into a URL.
            lurl = list(parsed)
            lurl[0] = self.scheme
            lurl[1] = self.host
            return urlunparse(lurl)

        # Otherwise it's relative to relative_to. Make it absolute, normalized.
        lurl = list(parsed)
        lurl[0] = self.scheme
        lurl[1] = self.host
        lurl[2] = posixpath.normpath(posixpath.join(relative_to, parsed.path))
        return urlunparse(lurl)
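    # Sketch of the three cases handled above, assuming self.host is
    # "localhost" and self.scheme is "http":
    #   make_absolute("http://other.org/x.html", "/mysite/")
    #       -> None (external link, skipped)
    #   make_absolute("/mysite/a.html", "/mysite/blog/")
    #       -> "http://localhost/mysite/a.html"
    #   make_absolute("../img/pic.jpg", "/mysite/blog/")
    #       -> "http://localhost/mysite/img/pic.jpg"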
    def check_url(self, url):
        """Check a URL. This should be an absolute URL on the server."""
        # If we got this far, we'll be comparing links,
        # so we'll need the parsed parts of this url.
        urlparsed = urlparse(url)
        if not urlparsed.scheme or not urlparsed.path.startswith('/'):
            print("EEK! Non-relative URL passed to check_url, bailing")
            return

        # URL-encode special characters like spaces.
        # quote() leaves '/' alone, which is what we want for a path.
        urlpath = quote(urlparsed.path)

        # This check must come after the special char substitution.
        if urlpath in self.urls_succeeded or urlpath in self.urls_failed:
            return

        if self.debug:
            print("=============================== Checking", url)

        # Now we need just the directory part. This might be
        # dirname(urlparsed.path), if the url is a file, or it
        # might just be urlparsed.path if that's already a directory.
        # The only way to know is to check on the local filesystem.
        localpath = self.get_local_for_url(urlparsed.path)
        if self.debug:
            print("=== local for", urlpath, "is", localpath)
        if not localpath:
            if self.debug:
                print(urlparsed.path, "is outside original directory; skipping")
            if url not in self.outside_urls:
                self.outside_urls.append(url)
            return
        if not os.path.exists(localpath):
            if self.debug:
                print("Local path '%s' doesn't exist! %s" % (localpath, url))
            self.urls_failed.append(urlpath)
            return

        # If we substituted any special characters, rebuild the URL:
        if urlpath != urlparsed.path:
            lurl = list(urlparsed)
            lurl[2] = urlpath
            url = urlunparse(lurl)
            if self.debug:
                print("Substituted characters, recombined to", url)

        if os.path.isdir(localpath):
            # The web server will substitute index.something,
            # so we'd better do that too or else the index file
            # will show up as an orphan.
            localdir = localpath
            localpath = None
            for ext in ( "php", "cgi", "html" ):
                indexfile = os.path.join(localdir, "index." + ext)
                if os.path.exists(indexfile):
                    localpath = indexfile
                    break
            if not localpath:
                print("Can't find an index file inside", localdir)
                return
            urldir = urlpath
        else:
            localdir = os.path.dirname(localpath)
            urldir = posixpath.dirname(urlpath)
        if self.debug:
            print("localpath", localpath, "localdir", localdir)
            print("urldir:", urldir)
        try:
            request = Request(url)
            handle = build_opener()
        except IOError:
            return None
        if not handle:
            print("Can't open", url)
            return

        # request.add_header("User-Agent", AGENT)

        try:
            response = handle.open(request)
            info = response.info()
            if 'content-type' not in info or \
               not info['content-type'].startswith('text/html'):
                if self.debug:
                    print(url, "isn't HTML; skipping")
                self.urls_succeeded.append(urlpath)
                self.files_succeeded.append(localpath)
                return
            content = response.read()
        except HTTPError as error:
            if error.code == 404:
                print("ERROR: %s -> %s" % (error, error.url))
            else:
                print("ERROR: %s" % error)
            self.urls_failed.append(urlpath)
            return
        except URLError as error:
            print("ERROR: %s" % error)
            self.urls_failed.append(urlpath)
            return

        self.urls_succeeded.append(urlpath)
        self.files_succeeded.append(localpath)

        ctype = response.headers['content-type']
        if not ctype.startswith("text/html"):
            if self.debug:
                print(url, "isn't HTML (%s); not reading content" % ctype)
            return

        soup = BeautifulSoup(content, "html.parser")

        for tag in soup.find_all('a', href=True):
            href = tag.get("href")
            if not href:
                continue
            if href[0] == '#':
                continue
            href = self.make_absolute(href, urldir)
            if not href:
                # It's probably an external URL. Skip it.
                href = tag.get("href")
                if href not in self.outside_urls:
                    self.outside_urls.append(href)
                continue
            # This check won't get everything, because href
            # hasn't been special char substituted yet.
            if href not in self.urls_to_check and \
               href not in self.urls_succeeded and \
               href not in self.urls_failed:
                self.urls_to_check.append(href)

        for tag in soup.find_all('img', src=True):
            src = self.make_absolute(tag.get('src'), urldir)
            if not src:
                self.outside_urls.append(tag.get('src'))
                continue
            self.urls_to_check.append(src)
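    # Note on the directory handling in check_url (assuming an Apache-style
    # DirectoryIndex): a URL such as http://localhost/mysite/blog/ maps to
    # the local directory .../mysite/blog/, and the first of index.php,
    # index.cgi or index.html found there is credited as the fetched file,
    # so that index file is not later reported as an orphan.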
if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("Usage: %s local_dir url" % os.path.basename(sys.argv[0]))
        sys.exit(1)

    spider = Spider(sys.argv[1], sys.argv[2])
    try:
        spider.spide()
        spider.check_orphans()
        spider.print_summary()
    except KeyboardInterrupt:
        print("Interrupt")