remove as still WIP
This commit is contained in:
parent bc86054dbd
commit 292053edac

@@ -1,366 +0,0 @@
#!/usr/bin/env python3

# Source: https://shallowsky.com/blog/tech/web/finding-web-orphans.html
# https://github.com/akkana/scripts/blob/master/weborphans

# Check a website (perhaps localhost) against a local mirror.
# Find broken links and orphaned files.
# You must specify both the directory and a web URL to a server
# (e.g. localhost) that is serving that directory.
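#
# Example invocation (hypothetical paths; the URL must be serving the
# same files that local_dir holds):
#     weborphans ~/public_html http://localhost/~me/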

import sys, os
import posixpath

# urlunparse is needed by make_absolute() and check_url() below;
# quote (not quote_plus) is used so that slashes in paths survive.
from urllib.parse import urlparse, urlunparse, urlunsplit, quote
from urllib.request import Request, build_opener
from urllib.error import HTTPError, URLError

# Tested with Python 3.10.8.
if sys.version_info < (3, 10):
    raise RuntimeError("This package requires Python 3.10+")

from bs4 import BeautifulSoup


class Spider:
    def __init__(self, rootdir, starturl):
        self.debug = False

        self.starturl = starturl
        self.rootdir = os.path.normpath(rootdir)
        if not os.path.isdir(rootdir):
            # It's not a directory, so take the dirname, but save the filename.
            self.rootdir, rootfile = os.path.split(rootdir)
        else:
            # It's already a directory, so self.rootdir is fine.
            rootfile = None

        # XXX This next bit isn't platform-agnostic:
        if not self.rootdir.endswith('/'):
            self.rootdir += '/'

        # Now we need the true root URL. The starturl may have
        # something like /index.html appended to it; we need something
        # we can prepend to paths.

        # Extract any path information from the root url:
        parsed = urlparse(starturl)
        self.scheme = parsed.scheme
        self.host = parsed.netloc
        self.rooturlpath = posixpath.normpath(parsed.path)
        dirpart, basepart = posixpath.split(self.rooturlpath)
        # If the path is a directory and ends in / (as it should),
        # then posixpath will split on that slash, not the previous one.
        if not basepart:
            dirpart, basepart = posixpath.split(dirpart)

        # Now basepart is the last part of the path, which might be
        # a directory name on the server or it might be index.*
        # Compare it to the last part of self.rootdir, which is
        # guaranteed to be a directory. But we have to split it twice,
        # because self.rootdir ends in /, so the first split would
        # return '' as the basename.
        lastdir = posixpath.basename(posixpath.dirname(self.rootdir))
        if basepart != lastdir:
            self.rooturlpath = posixpath.dirname(self.rooturlpath)

        if not self.rooturlpath.endswith('/'):
            self.rooturlpath += '/'

        # Now we're confident self.rooturlpath is the base directory.
        # Add the scheme and host back on.
        self.rooturl = urlunsplit((self.scheme, self.host,
                                   self.rooturlpath, None, None))
        if not self.rooturl.endswith('/'):
            self.rooturl += '/'
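
        # Worked example (hypothetical values): for
        #   rootdir  = "/home/me/mysite" and
        #   starturl = "http://localhost/mysite/index.html",
        # basepart is "index.html" and lastdir is "mysite", so rooturlpath
        # is trimmed to "/mysite" and slash-terminated, giving
        #   rooturl = "http://localhost/mysite/".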
print ("rootdir:", self.rootdir)
|
||||
print ("rooturl:", self.rooturl)
|
||||
print ("rooturlpath:", self.rooturlpath)
|
||||
print ("scheme:", self.scheme)
|
||||
print ("host:", self.host)
|
||||
print
|
||||
|
||||

        self.urls_to_check = [ self.rooturl ]
        self.urls_succeeded = []
        self.urls_failed = []
        self.outside_urls = []
        self.files_succeeded = []

        # Eventually, the list of excludes should be a commandline argument.
        # For now, let's just make sure all the .git objects aren't orphaned,
        # nor web stats or archived files.
        self.excludes = [ ".git", "stats", "0-pre2011", "0-calendars" ]

        # Files that aren't explicitly referenced by the website,
        # but might be needed for other purposes.
        self.nonorphans = [ "favicon.ico", "robots.txt", ".htaccess" ]

    def spide(self):
        """Check all URLs in urls_to_check, which has new URLs
           added to it during the spidering process.
        """
        self.check_url(self.starturl)
        while self.urls_to_check:
            # pop() takes from the end of the list, so the crawl
            # is roughly depth-first.
            self.check_url(self.urls_to_check.pop())

        print("Done spidering")

    def check_orphans(self):
        """Assuming we already have self.files_succeeded,
           find all files in self.rootdir that weren't in succeeded.
        """
        self.orphans = []
        for root, dirs, files in os.walk(self.rootdir, topdown=True):
            # Prune excluded directories in place so os.walk skips them.
            dirs[:] = [d for d in dirs if d not in self.excludes]
            for filename in files:
                if filename in self.nonorphans:
                    continue
                f = os.path.join(root, filename)
                if f not in self.files_succeeded:
                    self.orphans.append(f)

    def print_summary(self):
        print()
        print("URLs succeeded:")
        print('\n'.join(self.urls_succeeded))
        print()
        print("Outside URLs:")
        print('\n'.join(self.outside_urls))
        print()
        print("URLs failed:")
        print('\n'.join(self.urls_failed))
        print()
        print("Orphans:")
        print('\n'.join(self.orphans))
        print()
        print(len(self.urls_succeeded), "good links,",
              len(self.outside_urls), "external urls not checked,",
              len(self.urls_failed), "bad links,",
              len(self.orphans), "orphaned files.")

    def get_local_for_url(self, urlpath):
        """Get a local file path for a path parsed from an absolute URL.
           Returns None if the path is outside the tree being checked.
        """
        # Compare the URL's path with self.rooturlpath:
        if self.rooturlpath not in urlpath:
            return None
        return os.path.normpath(urlpath.replace(self.rooturlpath,
                                                self.rootdir, 1))
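
    # For example (hypothetical values): with rooturlpath "/mysite/" and
    # rootdir "/home/me/mysite/", get_local_for_url("/mysite/pics/cat.jpg")
    # returns "/home/me/mysite/pics/cat.jpg"; a path outside rooturlpath
    # returns None.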

    def make_absolute(self, url, relative_to):
        """Make a URL absolute. If it's a relative path,
           make it relative to relative_to,
           which must be an absolute path on the webhost.
           Returns None for links to other hosts.
        """
        parsed = urlparse(url)
        if parsed.scheme:    # already has an http://host specified
            # XXX If we ever extend this to check validity of
            # external URLs, this next condition is the one to change.
            if parsed.netloc != self.host:
                if self.debug:
                    print("Ignoring external link", url)
                return None
            return url

        # So there's no scheme. Add one. The results of urlparse()
        # aren't modifiable, but if we turn them into a list we can
        # modify them, then turn them back into a URL.
        lurl = list(parsed)
        lurl[0] = self.scheme
        lurl[1] = self.host
        if not parsed.path.startswith('/'):
            # It's relative to relative_to. Make it absolute, normalized.
            lurl[2] = posixpath.normpath(posixpath.join(relative_to,
                                                        parsed.path))
        return urlunparse(lurl)
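
    # For example (hypothetical values): with scheme "http" and host
    # "localhost", make_absolute("../pics/cat.jpg", "/mysite/blog/")
    # returns "http://localhost/mysite/pics/cat.jpg", while a link to
    # another host, e.g. "https://example.com/x", returns None.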

    def check_url(self, url):
        """Check a URL. This should be an absolute URL on the server."""
        # We'll be comparing links, so we need the parsed parts of this URL.
        urlparsed = urlparse(url)
        if not urlparsed.scheme or not urlparsed.path.startswith('/'):
            print("EEK! Non-absolute URL passed to check_url, bailing")
            return

        # URL-encode special characters like spaces. Use quote, not
        # quote_plus, so the path's slashes are left intact.
        urlpath = quote(urlparsed.path.encode('utf-8'))
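        # For example, quote() escapes spaces but leaves '/' alone:
        #   quote("/my photos/pic 1.jpg") -> "/my%20photos/pic%201.jpg"
        # (quote_plus would have escaped the slashes too, mangling the path).
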
        # This check must come after the special-char substitution.
        if urlpath in self.urls_succeeded or urlpath in self.urls_failed:
            return

        if self.debug:
            print("=============================== Checking", url)

        # Now we need just the directory part. This might be
        # dirname(urlparsed.path), if the URL is a file, or it
        # might just be urlparsed.path if that's already a directory.
        # The only way to know is to check on the local filesystem.
        localpath = self.get_local_for_url(urlparsed.path)
        if self.debug:
            print("=== local for", urlpath, "is", localpath)

        if not localpath:
            if self.debug:
                print(urlparsed.path, "is outside original directory; skipping")
            if url not in self.outside_urls:
                self.outside_urls.append(url)
            return

        if not os.path.exists(localpath):
            if self.debug:
                print("Local path '%s' doesn't exist! %s" % (localpath, url))
            self.urls_failed.append(urlpath)
            return

        # If we substituted any special characters, rebuild the URL:
        if urlpath != urlparsed.path:
            lurl = list(urlparsed)
            lurl[2] = urlpath
            url = urlunparse(lurl)

            if self.debug:
                print("Substituted characters, recombined to", url)

        if os.path.isdir(localpath):
            # The web server will substitute index.something,
            # so we'd better do that too or else the index file
            # will show up as an orphan.
            localdir = localpath
            localpath = None
            for ext in ( "php", "cgi", "html" ):
                indexfile = os.path.join(localdir, "index." + ext)
                if os.path.exists(indexfile):
                    localpath = indexfile
                    break
            if not localpath:
                print("Can't find an index file inside", localdir)
                return
            urldir = urlpath
        else:
            localdir = os.path.dirname(localpath)
            urldir = posixpath.dirname(urlpath)
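
        # For example (hypothetical paths): a link to "/mysite/blog/" maps
        # to the local directory /home/me/mysite/blog/, and the crawl then
        # settles on index.php, index.cgi, or index.html, whichever exists.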

        if self.debug:
            print("localpath", localpath, "localdir", localdir)
            print("urldir:", urldir)

        try:
            request = Request(url)
            handle = build_opener()
        except IOError:
            return None

        # request.add_header("User-Agent", AGENT)

        try:
            response = handle.open(request)
            info = response.info()
            if not info.get('content-type', '').startswith('text/html'):
                if self.debug:
                    print(url, "isn't HTML; skipping")
                self.urls_succeeded.append(urlpath)
                self.files_succeeded.append(localpath)
                return
            content = response.read()

        except HTTPError as error:
            if error.code == 404:
                print("ERROR: %s -> %s" % (error, error.url))
            else:
                print("ERROR: %s" % error)
            self.urls_failed.append(urlpath)
            return

        except URLError as error:
            print("ERROR: %s" % error)
            self.urls_failed.append(urlpath)
            return

        self.urls_succeeded.append(urlpath)
        self.files_succeeded.append(localpath)

        soup = BeautifulSoup(content, "html.parser")

        for tag in soup.find_all('a', href=True):
            href = tag.get("href")
            if not href:
                continue
            if href[0] == '#':
                # A same-page anchor; nothing to fetch.
                continue

            href = self.make_absolute(href, urldir)
            if not href:
                # It's probably an external URL. Skip it.
                href = tag.get("href")
                if href not in self.outside_urls:
                    self.outside_urls.append(href)
                continue

            # This check won't catch everything, because href
            # hasn't been special-char substituted yet.
            if href not in self.urls_to_check and \
               href not in self.urls_succeeded and \
               href not in self.urls_failed:
                self.urls_to_check.append(href)

        for tag in soup.find_all('img', src=True):
            src = self.make_absolute(tag.get('src'), urldir)
            if not src:
                self.outside_urls.append(tag.get('src'))
                continue
            self.urls_to_check.append(src)

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("Usage: %s local_dir url" % os.path.basename(sys.argv[0]))
        sys.exit(1)

    spider = Spider(sys.argv[1], sys.argv[2])
    try:
        spider.spide()
        spider.check_orphans()
        spider.print_summary()
    except KeyboardInterrupt:
        print("Interrupted")