#!/usr/bin/env bash
##################################################################################################
# brokenlinks.sh
# 02/18/2021
# Written By David Holdeman
# Searches for broken links in a Github Wiki repo, and suggests and applies corrections.
# Usage: brokenlinks.sh [-s non-interactive] [-d debug] ...
#
# Every argument other than -s and -d is treated as the path of a file to check.
# With no file arguments, every .md file found under the current directory
# (excluding names beginning with an underscore) is checked in parallel.
##################################################################################################

# The next two functions escape arbitrary strings for use in a sed `s` command.

#######################################
# Escape a string for use as the pattern (left-hand side) of a sed `s` command.
# Every character except `^` is wrapped in a bracket expression so it matches
# literally; `^` is backslash-escaped instead.
# Arguments: $1 - the string to escape
# Outputs:   the escaped string on stdout
#######################################
escape() {
  sed 's/[^^]/[&]/g; s/\^/\\^/g' <<<"$1"
}
export -f escape

#######################################
# Escape a string for use as the replacement (right-hand side) of a sed `s`
# command that uses `/` as its delimiter: `&`, `/` and `\` are backslash-escaped.
# Arguments: $1 - the string to escape
# Outputs:   the escaped string on stdout
#######################################
escapeReplace() {
  sed 's/[&/\]/\\&/g' <<<"$1"
}
export -f escapeReplace

#######################################
# Check a single link target, offering (and, if chosen, applying) a correction.
# Globals:   LIST   (read) - newline-separated list of the repo's .md files
#            SCRIPT (read) - 1 for non-interactive mode (report only)
# Arguments: $1 - path of the file that contains the link
#            $2 - the link target, without its hash fragment
#            $3 - the hash fragment including the leading '#', or empty
# Outputs:   stdout - the (possibly corrected) link, when one can be reported
#            stderr - all user-facing messages and prompts
# Returns:   0: link points to a .md file and is good (or was fixed)
#            1: link is bad
#            2: link is not to a .md file (external link, or another file type)
# Side effects: may edit $1 in place with `sed -i`; takes an exclusive lock on
#            ./brokenlinks.lock while talking to the user (the lock file is
#            left in place afterwards).
#######################################
checkurl() {
  local file="$1"
  local link="$2"
  local fragment="$3"
  local oldlink pick replace replacewith search files choice total selected is_md

  # If it's an internet link, ignore it.
  # That's beyond the scope of this tool.
  if [[ "$link" =~ ^http ]]; then
    return 2
  fi

  # Check for links that begin in ./ or /, as they won't function as expected everywhere.
  if [[ "$link" =~ ^[.]?/ ]]; then
    # Save the link for replacement.
    oldlink="$link"
    # Correct the link.
    # The corrected value is kept in $link because the checks below need it.
    link=$(sed 's/^.\{0,1\}\///' <<<"$link")
    # Lock user-facing input/output so that the user is presented with one fix at a time.
    (
      flock -x 200
      # Print the file and the old link.
      echo "In $file:" >&2
      printf '%s\n' "$oldlink" >&2
      # Print the single option as a numbered list, to match the UI of the other correction type.
      printf '%s\n' "$link" | cat -n >&2
      # Make sure we aren't in non-interactive mode.
      if [ "$SCRIPT" -lt 1 ]; then
        echo "Type a number, then hit return to select an alternative, or just hit return to skip fixing:" >&2
        read -r pick
        if [[ $pick =~ ^[0-9]+$ ]] && [ "$pick" -eq 1 ]; then
          # Replace the old link with the new one.
          # Parentheses are placed around both the old link and the new one to ensure we replace
          # the link, and not some other place in the file that happens to use the same words.
          replace=$(escape "(${oldlink}${fragment})")
          replacewith=$(escapeReplace "${link}${fragment}")
          sed -i "s/${replace}/(${replacewith})/" "$file"
        fi
        # We don't stop here because the link we fixed might still be broken.
      fi
      # File descriptor for the lock.
    ) 200>brokenlinks.lock
  fi

  # Skip links that are to an .md file and aren't broken.
  # -F: the link is matched as a literal string, not as a regular expression.
  # This is still a substring match against the path list.
  if grep -qF -- "${link}.md" <<<"$LIST"; then
    # Print the URL for use in `checkhash`.
    printf '%s\n' "$link"
    return 0
  fi

  # Skip non-md links if they're not broken.
  if [[ "$link" != *.md ]] && [[ -e "$link" || -L "$link" ]]; then
    return 2
  fi

  # Build the search term we will look for.
  # All hyphens, underscores and spaces are replaced with asterisks, so we
  # can find files with mismatched hyphens or underscores.
  search="*$(basename -- "$link" | sed 's/[-_ ]/*/g')*"
  # Search for matching files.
  # We are using `find` here because we need to search for all files, while $LIST has only .md files.
  files=$(find . -iname "$search")

  # Lock user-facing input/output so that the user is presented with one fix at a time.
  # `exit` inside the parentheses only leaves the subshell; its status is
  # passed on by the `return $?` that follows.
  (
    flock -x 200
    # Print the filename and the broken link.
    echo "In $file:" >&2
    printf '%s\n' "$link" >&2
    # If there are no candidate files, give up on this link.
    if [[ -z "$files" ]]; then
      echo "Could not find" >&2
      exit 1
    fi
    # List the potential files, with numbers.
    printf '%s\n' "$files" | cat -n >&2
    # Make sure we aren't in non-interactive mode.
    if [ "$SCRIPT" -lt 1 ]; then
      echo "Type a number, then hit return to select an alternative, or just hit return to skip fixing:" >&2
      read -r pick
      # If the selection isn't a number, skip to the next link.
      if ! [[ $pick =~ ^[0-9]+$ ]]; then
        exit 1
      fi
      # Reject selections outside the listed range; otherwise 0 would produce
      # an empty replacement and a too-large number would pick the last entry.
      choice=$((10#$pick))
      total=$(grep -c '' <<<"$files")
      if ((choice < 1 || choice > total)); then
        exit 1
      fi
      # Get the selected file path, without the preceding ./
      selected=$(sed -n "${choice}p" <<<"$files")
      selected=${selected#./}
      # Track if the linked file is a .md file.
      is_md=0
      if [[ "$selected" == *.md ]]; then
        is_md=1
        # Drop the directory and the .md extension from the link.
        selected=$(basename -- "$selected" .md)
      fi
      # Replace the old link with the new one (parenthesised, as above).
      replace=$(escape "(${link}${fragment})")
      replacewith=$(escapeReplace "${selected}${fragment}")
      sed -i "s/${replace}/(${replacewith})/" "$file"
      # Print the corrected URL for use in `checkhash`.
      printf '%s\n' "$selected"
      if ((is_md == 1)); then
        exit 0
      fi
      exit 2
    fi
    exit 1
    # File descriptor for the lock.
  ) 200>brokenlinks.lock
  return $?
}
export -f checkurl

#######################################
# Validate a hash fragment. Currently a stub that always succeeds.
# Arguments: $1 - file
#            $2 - hash
#            $3 - url - won't always be present
# Returns:   0 always
#######################################
checkhash() {
  # TODO check hash fragment validity
  return 0
}
export -f checkhash

#######################################
# Main processing function: check every markdown link found in one file.
# Globals:   DEBUG (read) - 1 to print a timestamped line per link to stdout
# Arguments: $1 - path to a .md file
# Returns:   0 if no link was reported bad, 1 otherwise
#######################################
searchfile() {
  local file="$1"
  # All state is local so the caller's own variables (notably its aggregate
  # STATUS) are never overwritten.
  local status=0
  local link url fragment urlstatus

  # This loops for every link in the file.
  # File descriptor 3 is used because, if the loop read from stdin, the `read`
  # calls made while prompting the user would consume the link list instead.
  while IFS= read -r -u 3 link; do
    # Break the link into URL and hash fragment, if one is present.
    # Everything after the first '#' is kept as the fragment.
    if [[ "$link" == *'#'* ]]; then
      url=${link%%#*}
      fragment="#${link#*#}"
    else
      url="$link"
      fragment=""
    fi

    # We need the return status of `checkurl` to know whether to check the hash.
    urlstatus=0
    if [[ -n "$url" ]]; then
      # `checkurl` prints the URL if it is good, so that if it was fixed,
      # we have the updated version to use in `checkhash`.
      url=$(checkurl "$file" "$url" "$fragment")
      urlstatus=$?
      if [[ "$urlstatus" -eq 1 ]]; then
        status=1
      fi
    fi

    # Only check the hash if it exists and the URL was good.
    # Parameters are reversed because we won't always have a URL - we might only have a hash fragment.
    if [[ -n "$fragment" && "$urlstatus" -eq 0 ]]; then
      if ! checkhash "$file" "$fragment" "$url"; then
        status=1
      fi
    fi

    if [ "$DEBUG" -eq 1 ]; then
      echo "$(date +%T.%N) $file $url $fragment"
    fi
    # The grep below extracts the text between "](" and the next ")" for each
    # link; anything from the first double quote on is cut off and surrounding
    # whitespace is trimmed. Results are fed to file descriptor 3 as explained above.
  done 3< <(grep -oP '(?<=\]\().*?(?=[\)])' -- "$file" |
    cut -d '"' -f1 |
    sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')

  return "$status"
}
export -f searchfile

#######################################
# Entry point: parse flags, build the .md file list and run the checks.
# Globals:   SCRIPT, DEBUG, LIST (written and exported)
# Arguments: "$@" - script arguments (-s, -d, file paths)
# Returns:   with file arguments, exits 0 if every file was clean, else 1;
#            without file arguments, returns the exit status of xargs.
#######################################
main() {
  local -a targets=()
  local arg target overall

  export SCRIPT=0
  export DEBUG=0
  for arg in "$@"; do
    case "$arg" in
      -s) export SCRIPT=1 ;;
      -d) export DEBUG=1 ;;
      *) targets+=("$arg") ;;
    esac
  done

  # Split into 2 commands to avoid masking of return values https://www.shellcheck.net/wiki/SC2155
  LIST=$(find . -iname "*.md" ! -name '_*')
  export LIST

  if [[ "${#targets[@]}" -gt 0 ]]; then
    overall=0
    # Only run `searchfile` on passed-in file names.
    for target in "${targets[@]}"; do
      if ! searchfile "$target"; then
        overall=1
      fi
    done
    exit "$overall"
  fi

  # Nothing to do when the repo has no .md files.
  if [[ -z "$LIST" ]]; then
    return 0
  fi

  # Run `searchfile` on every .md file in the repo, one process per CPU.
  # The list is handed over with -a rather than through stdin.
  xargs -0 -P "$(nproc --all)" -a <(printf '%s\n' "$LIST" | tr '\n' '\0') \
    -I {} bash -c 'searchfile "$@"' _ {}
}

main "$@"