#!/bin/bash

# url-query - 2025-08-05
#
# query websites by key(s) with given query text
#
# open URL(s) for given key(s) and query text
# (see config file for details)


# config file used unless -c is given
config="${XDG_CONFIG_HOME:-$HOME/.config}/url-query.tsv"

# 1 = leave a single query-text part unquoted even if it includes whitespace
# 0 = quote it (multiple parts which include whitespace are quoted either way)
noquote=1

# default menu format - "{k}" = key, "{n}" = name
defmfmt="{n} - {k}"

# format of a category comment line:
# the string before %s marks a category; the string after %s (to end of line) is removed
catfmt="### %s - "

# command used to open each URL
opencmd="xdg-open"

# command used to run the awk scripts
# (author's note: mawk is ≈ 3x faster than sed/gawk and very well maintained)
awkcmd="awk"

############################################################

# print the help text to stdout
#
# the title and summary are taken from header lines 3 and 5 of this script
# (with the leading "# " removed); the current values of $noquote, $config
# and $defmfmt are shown as defaults
# note: the here-document is introduced with <<- so its lines must be indented with tabs
usage()
{
	cat <<- sss
		$(sed -n '3s/^..//p;5s/^../  /p;6q' "$0")

		usage: $(basename -- "$0") [opt...] key query-text...
		usage: $(basename -- "$0") [opt...] -s query-text key...

		options
		  -s      use first argument as query-text and remaining argument(s) as key(s)
		  -n      create new record for given args: key name target
		  -q      toggle quoting of query-text which includes whitespace (default: noquote=$noquote)
		  -S      suppress newline -> space substitution in query-text...
		  -v      verify config (and report any problems)
		  -c f    use config file f instead of default ($config)
		  -m [f]  menu: print list for a selector/menu, with optional format f (default: $defmfmt)
		  -b      print config in bookmarks.html format to stdout
		  -p      print URL(s) to stdout instead of opening

		note
		  - option -s allows multiple keys to be queried for given query-text (e.g. text selection)
		    in contrast to default mode which accepts multiple query-text parts for a given key
		  - if query-text... is not given, URL is reduced to base domain
		    if it contains one or more place holders
		  - option -b reads up to 3 arguments if given: prefix tags placeholder
		    (defaults: prefix="" ; tags="url-queries" ; placeholder="%s")
		  - option -m format is a string with these substitutions: {k} = key, {n} = name
		    (e.g. "{k} - {n}" -> "ia - Internet Archive")
		  - option -[nvmb] causes other options to be ignored except -c
	sss
}

# print a message to stderr, in red if stderr is a terminal which supports colour
#   $* - message (joined with spaces); with no arguments nothing is printed
#        and the return status is non-zero
# colour is only used if NO_COLOR is unset/empty, stderr is a terminal,
# tput is available and it reports more than one colour
function warn
{
	typeset r z n
	if [[ -z $NO_COLOR && -t 2 && -n $(type -p tput) ]] ; then
		# tput may fail and print nothing (e.g. TERM unset);
		# default to 0 so the arithmetic test cannot become a syntax error
		n=$(tput colors 2> /dev/null)
		if (( ${n:-0} > 1 )) ; then
			r=$(tput setaf 1) z=$(tput sgr0)
		fi
	fi
	(( $# )) && printf '%s%s%s\n' "$r" "$*" "$z" >&2
}

# exit the script, optionally after printing a message
#   $1  - exit status (1 if not given)
#   $2… - message passed to warn (skipped if $2 is empty)
# for status 2, usage is also printed to stderr if stderr is a terminal
die()
{
	if [[ -n $2 ]] ; then
		warn "${*:2}"
	fi

	if [[ $1 == 2 && -t 2 ]] ; then
		usage >&2
	fi

	exit "${1:-1}"
}

# percent-encode a string and print the result to stdout
#   $1 - text to encode
# every byte except the unreserved set (letters, digits and _ . ~ -) becomes %XX
urlencode()
{
	local awks

	# ENVIRON avoids pipe/stdin (and \ interpretation)
	# explicit character set (avoids locale trouble with ranges)
	awks='
		BEGIN {
			unreserved = "[ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_.~-]"

			# lookup table: single byte -> numeric value
			for (i = 0; i <= 255; i++)
				ord[sprintf("%c", i)] = i

			len = length(ENVIRON["t"])
			for (i = 1; i <= len; i++) {
				c = substr(ENVIRON["t"], i, 1)
				if (c !~ unreserved)
					c = sprintf("%%%02X", ord[c])
				r = r c
			}
			print r
		}
	'
	# LC_ALL=C makes awk work on bytes, so a multibyte character is encoded
	# as its individual bytes (é -> %C3%A9) whichever awk is in use
	t=$1 LC_ALL=C $awkcmd "$awks"
}

# resolve a key to its final target and print "c target" to stdout
#   $1 - key to look up in $config
# c is the count of placeholders (%s / %S) in the target
# output for the special cases:
#   "0 0"     - the key refers to itself, directly or through other keys
#   "0 "      - the key (or a key it refers to) is not defined
#   "0 k1 k2" - the key refers to a key set (space-separated keys)
gettarget()
{
	local awks
	awks='
		# read the whole file to build array[key]=target
		! /^([# \t]|$)/ { kk[$1] = $3 }
		END {
			# follow key (repeat until URL/key-set/null/recursive-key)
			while (k !~ /(:\/\/|[^ ] [^ ]|^$)/) {
				# a key seen before means the chain loops back on itself
				# (directly or via other keys) and would never end
				if (k in seen) {
					print "0 0" # recursive-key
					exit
				}
				seen[k] = 1
				k = kk[k]
			}
			# also return c count of placeholders (to avoid a subshell call later)
			c = gsub(/%[sS]/, "&", k)
			print c, k
		}
	'
	$awkcmd -v k="$1" -F'\t' "$awks" "$config"
}

# build the URL for a key and query-text, then open it (or print it if $print is 1)
#   $1  - key
#   $2… - query-text part(s), substituted for the placeholder(s) in the target URL
# - a key set is handled by calling urlquery for each of its keys
# - with no query-text, a URL with placeholders is reduced to scheme://host/
# - %s is substituted URL-encoded, %S verbatim
# returns 1 (after a warning) if the key is undefined or recursive;
# exits via die if a placeholder cannot be parsed or $opencmd fails
urlquery()
{
	local k url c z scheme head s p
	local -a pp=()

	# this is faster than loading once into an associative array (for at least 6 iterations)
	# c - count of placeholders
	k=$1
	url=$(gettarget "$k")
	c=${url%% *} url=${url#* }

	case $url in
	*'://'* ) ;;
	*[!\ ]' '[!\ ]* )
		shift
		# recur for key set (i.e. a key to multiple keys)
		for k in $url ; do
			urlquery "$k" "$@"
		done
		return
		;;
	'' )
		warn "warning - URL not found for key: $1"
		return 1
		;;
	0 )
		warn "warning - recursive key: $1"
		return 1
		;;
	esac
	shift

	# c (count of placeholders)
	case $c:$# in
	0:* ) ;;
	*:0 )
		# no query-text - just load base domain
		scheme=${url%%://*} url=${url#*://} url=$scheme://${url%%/*}/
		;;
	*:* )
		if (( c < $# )) ; then
			# unify extra args for cth (last) placeholder
			z=${*:c}
			pp=( "${@:1:c - 1}" "$z" )
		else
			(( c > $# )) && warn "warning - placeholders exceed substitution parts: $url"
			pp=( "$@" )
		fi

		# convert newline(s) -> space(s) (unless disabled)
		(( nosub )) || pp=( "${pp[@]//$'\n'/ }" )

		# substitute placeholder(s) with substitution part(s) (optionally URL-encoded)
		for p in "${pp[@]}" ; do
			# multiple parts: (re)quote all parts which include spaces, and are unquoted
			# single part: only quote if noquote=0, it has spaces, and it is unquoted
			(( ${#pp[@]} > noquote )) && [[ $p == *[$' \t']* && $p != \"*\" ]] && p=\"$p\"

			# head = text before the first remaining placeholder
			# s    = type letter of that placeholder
			head=${url%%\%[sS]*}
			s=${url:${#head} + 1:1}
			case $s in
			s ) p=$(urlencode "$p") ;;
			S ) ;;
			* ) die 1 "placeholder parse failed: $s -> $p"
			esac

			# splice the part in by position rather than with ${url/pattern/$p}:
			# since bash 5.2 an unquoted & in that replacement stands for the
			# matched text, which would corrupt verbatim (%S) parts such as "a&b"
			url=$head$p${url:${#head} + 2}
		done
		;;
	esac

	case $print in
	1 ) printf '%s\n' "$url" ;;
	* )
		${opencmd:?} "$url" || die 3 "opencmd failed: $opencmd"
		;;
	esac
}

# query several keys with the same query-text (option -s)
#   $1  - query-text
#   $2… - key(s); urlquery is called once per key, in the given order
urlquerys()
{
	t=$1
	for k in "${@:2}" ; do
		urlquery "$k" "$t"
	done
}

# escape regex metacharacters so that text can be used literally in a regex
#   $* - text (arguments are joined with spaces)
# prints the text with a backslash before each of: \ ^ $ . ? * + ( | ) { } [ ]
# done in the shell itself: this avoids a process per call, and does not depend
# on how a particular awk treats backslashes in a gsub replacement string
escre()
{
	local meta='\^$.?*+(|){}[]' s out ch i
	s="$*" out=
	for (( i = 0; i < ${#s}; i++ )) ; do
		ch=${s:i:1}
		# the quoted "$ch" is matched literally, so this is a plain membership test
		[[ $meta == *"$ch"* ]] && out+=\\
		out+=$ch
	done
	printf '%s\n' "$out"
}

# print the name of each category in $config to stdout, one per line
# a category line begins with the part of $catfmt before %s, followed by a
# visible character; the name is what remains after removing that prefix and
# everything from the part of $catfmt after %s to the end of the line
listcategories()
{
	local awks pre suf
	pre=${catfmt%%\%s*} suf=${catfmt#*%s}
	awks='
		BEGIN {
			first = "^" ENVIRON["cat_pre_re"] "[[:graph:]]"
			rest = ENVIRON["cat_suf_re"]
			# the name starts after the prefix, whatever its length
			skip = length(ENVIRON["cat_pre"]) + 1
		}
		$0 ~ first {
			# an empty suffix would match (and remove) the whole line
			if (rest != "") sub(rest ".*$", "")
			print substr($0, skip)
		}
	'
	# the escaped parts are passed through the environment and used as dynamic
	# regexes, so $catfmt itself is left untouched and cannot break the awk source
	cat_pre=$pre cat_pre_re=$(escre "$pre") cat_suf_re=$(escre "$suf") \
		$awkcmd "$awks" "$config"
}

# interactively add a record to $config (option -n)
#   $1 - key    (one or more alphanumeric characters)
#   $2 - name   (printable characters only)
#   $3 - target (printable characters only)
# - asks for confirmation if the target already exists
# - if the key already exists, asks for another key, or for the same key again
#   to replace the existing record
# - if the config has categories, asks which one to append to; the record is
#   inserted before the first blank line after the category line, otherwise
#   (or for "end-of-file") it is appended to the end of the file
# exits via die on invalid arguments or failure; exits 3 if the user declines
newrecord()
{
	local k newk yn c n tmp awks
	local -a cc=()

	[[ -t 0 ]] || die 1 "standard input is not connected to a terminal"
	(( $# == 3 )) || die 2 "3 arguments expected; $# given"
	[[ -n $1 ]] || die 1 "key is empty"
	[[ $1 != *[^[:alnum:]]* ]] || die 1 "key contains bad character(s) (allowed: [[:alnum:]]): $1"
	[[ $2 != *[^[:print:]]* ]] || die 1 "name contains bad character(s) (allowed: [[:print:]]): $2"
	[[ $3 != *[^[:print:]]* ]] || die 1 "target contains bad character(s) (allowed: [[:print:]]): $3"
	[[ -w $config ]] || die 1 "config file not writable: $config"

	# values are passed to awk via ENVIRON to avoid backslash interpretation (as with -v)
	# awk prints the matching record and exits 1 if the target is already present
	if ! rec_target=$3 $awkcmd -F'\t' '$3 == ENVIRON["rec_target"] { print; exit 1 }' "$config" ; then
		echo 'target already exists! continue anyway? [yN] '
		read -r yn && [[ $yn != [yY] ]] && exit 3
	fi

	[[ $3 == *://* && $3 != *%[sS]* ]] && warn "warning - url does not contain a placeholder: $3"

	k=$1
	# test with awk instead of grep to avoid escaping regex metachars
	until rec_key=$k $awkcmd -F'\t' '$1 == ENVIRON["rec_key"] { print; exit 1 }' "$config" ; do
		echo 'key already exists! enter new key (or the same one to replace it): '
		# without this, end of input would repeat the question forever
		read -r newk || die 1 "no key given"
		if [[ $newk == "$k" ]] ; then
			echo 'replace existing record? [yN] '
			read -r yn
			[[ $yn == [yY] ]] || continue

			tmp=$(mktemp) || die 1 "mktemp failed"
			awks='
				BEGIN { OFS = FS = "\t" }
				$1 == ENVIRON["rec_key"] { $2 = ENVIRON["rec_name"]; $3 = ENVIRON["rec_target"] }
				1
			'
			rec_key=$k rec_name=$2 rec_target=$3 $awkcmd "$awks" "$config" > "$tmp" \
				&& mv "$tmp" "$config" \
				&& return
			rm -f "$tmp"
			die 1 "failed to replace record in: $config"
		elif [[ $newk == *[^[:alnum:]]* ]] ; then
			# a key typed here must pass the same check as a key given as $1
			warn "key contains bad character(s) (allowed: [[:alnum:]]): $newk"
		elif [[ -n $newk ]] ; then
			k=$newk
		fi
	done

	while read -r c ; do
		cc+=( "$c" )
	done < <(listcategories)

	c=
	if (( ${#cc[@]} )) ; then
		echo 'append record to category'
		select c in "${cc[@]}" end-of-file ; do
			if [[ $c == end-of-file ]] ; then
				c=
				break
			fi
			# select sets c to null for an invalid choice
			[[ -n $c ]] && break
		done
	fi

	n=
	if [[ -n $c ]] ; then
		# line number before first blank line after category
		awks='
			BEGIN { first = "^" ENVIRON["cat_re"] }
			$0 ~ first, /^[\t ]*$/ { if (/^[\t ]*$/) { print NR; exit } }
		'
		n=$(cat_re=$(escre "${catfmt%%\%s*}$c") $awkcmd "$awks" "$config")
		(( n )) && n=$(( n - 1 ))
	fi

	tmp=$(mktemp) || die 1 "mktemp failed"
	# copy the file, adding the record after line n
	# (or at the end if n is not set or the file is shorter, even if it is empty)
	# awk writes the record exactly as given: no backslash processing
	awks='
		{ print }
		NR == ENVIRON["rec_line"] + 0 { print ENVIRON["rec"]; added = 1 }
		END { if (! added) print ENVIRON["rec"] }
	'
	if rec=$(printf '%s\t%s\t%s' "$k" "$2" "$3") rec_line=${n:-0} $awkcmd "$awks" "$config" > "$tmp" \
		&& mv "$tmp" "$config"
	then
		echo 'new record appended' >&2
	else
		rm -f "$tmp"
		die 1 "failed to append record to: $config"
	fi
}

# check $config and report problems to stderr (option -v)
# reported: malformed records (not exactly 3 tab-separated fields),
# non-alphanumeric characters in a key, duplicate keys, duplicate targets,
# keys which refer to themselves, and references to undefined keys
# returns 1 if any problem other than a duplicate target was found, else 0
verifyconfig()
{
	local awks
	awks='
		# print a problem report; current: also print the current record
		# status: exit status to set (0 = report only)
		function printerr(title, current, status) {
			print title
			if (current) print NR ": " $0
			# never reset a failure recorded by an earlier report
			if (status) s = status
		}

		/^(#|[ \t]*$)/ { next }

		NF != 3 {
			printerr("malformed record:", 1, 1)
			next
		}

		$1 ~ /[^[:alnum:]]/ {
			printerr("bad character in key: " $1, 1, 1)
			next
		}

		kk[$1] { printerr("duplicate key: " $1 "\n" kk[$1], 1, 1) }
		! kk[$1] { kk[$1] = NR ": " $0 }

		tt[$3] { printerr("duplicate target: " $3 "\n" tt[$3], 1, 0) }
		! tt[$3] { tt[$3] = NR ": " $0 }

		END {
			for (k in kk) {
				# target = last field of the stored record
				t = kk[k]
				sub(/.*\t/, "", t)
				# URLs are not followed
				if (t ~ /:/) continue
				n = split(t, parts, " ")
				for (i = 1; i <= n; i++) {
					t = parts[i]
					if (k == t) printerr(sprintf("recursive key: %s\n%s", k, kk[k]), 0, 1)
					# "in" tests membership without adding to the array being iterated
					else if (! (t in kk)) printerr(sprintf("target undefined for key: %s\n%s", t, kk[k]), 0, 1)
				}
			}
			exit s
		}
	'
	$awkcmd -F'\t' "$awks" "$config" >&2
}

# print the initial config file (documentation header and example records) to stdout
# note: the here-document is introduced with <<- so its lines must be indented
# with tabs, and the fields of each example record are separated by a tab
initconfig()
{
	cat <<- sss
		# ==========================================================
		# URL query substitution
		# ==========================================================
		#
		# format (tsv): key name target
		#
		#     target: url | key ...
		#
		# - blank lines and comments (i.e. lines beginning with #) are ignored
		# - url can contain one or more place holders for given query part(s)
		#
		#     %s = url-percent-encoded
		#     %S = verbatim
		#
		# to make it easier to manage a large collection of records
		# each line beginning with "### " may demark a category
		# which includes any following records until the next category
		# (the category name taken from (.+) in "^(### )(.+)( - .*)?$"
		# with the first and last subgroups removed)
		#
		# as this file contains tabular data, a text editor
		# which supports *elastic tabstops* is essential
		# https://en.wikipedia.org/wiki/Elastic_tabstops

		# examples
		w	Wikipedia	https://en.wikipedia.org/w/index.php?search=%s
		jp	Justapedia	https://justapedia.org/index.php?search=%s

		### aliases - maintain consistent keys for moving targets
		e	encyclopedia	jp

		### key sets - query multiple sources with one key (space-separated list of target keys)
		E	search both	w jp

	sss
}

# print the records of $config in bookmarks.html format to stdout (option -b)
#   $1 - prefix added to each key for the shortcut (default: none)
#   $2 - title of the enclosing folder, also used as a tag (default: url-queries)
#   $3 - text which replaces each %s / %S placeholder in a URL (default: %s)
# each bookmark is tagged with $2 and, if it follows a category line, the category
# aliases are exported with the URL they resolve to; key sets are skipped
printbookmarks()
{
	local awks
	awks='
		# escape text for use in HTML content and attribute values
		function esc(s) {
			gsub(/&/, "\\&amp;", s)
			gsub(/</, "\\&lt;", s)
			gsub(/>/, "\\&gt;", s)
			gsub(/"/, "\\&quot;", s)
			return s
		}

		# replace each placeholder in u with ph, taken literally
		function setplaceholder(u, ph,   out) {
			out = ""
			while (match(u, /%[sS]/)) {
				out = out substr(u, 1, RSTART - 1) ph
				u = substr(u, RSTART + RLENGTH)
			}
			return out u
		}

		BEGIN {
			print "<!DOCTYPE NETSCAPE-Bookmark-file-1>"
			print "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=UTF-8\">"
			print "<TITLE>Bookmarks</TITLE>"
			print "<H1>Bookmarks</H1>"
			print "<DL>"
			print "<DT><H3>" esc(ENVIRON["t"]) "</H3>"
			print "<DD>exported from url-query script - for use in browsers that support <a href=\"https://en.wikipedia.org/wiki/Smart_bookmark\">smart/keyword/shortcut bookmarks</a>"
			print "<DL><p>"

			# catfmt arrives with regex metachars already escaped
			split(ENVIRON["catfmt"], cf, "%s")
			a = "^" cf[1]
			c = a "[[:graph:]]"
			z = cf[2] ".*$"
		}

		# category line: remember its name for the records which follow
		$0 ~ c { sub(a, ""); sub(z, ""); category = $0 }

		NF == 3 && $1 !~ /^(#.*|[ \t]*)$/ {
			keys[++i] = $1
			names[$1] = $2
			targets[$1] = $3
			tags[$1] = (category) ? ENVIRON["t"] "," category : ENVIRON["t"]
		}

		END {
			len = i
			for (i = 1; i <= len; i++) {
				k = keys[i]
				n = names[k]
				u = targets[k]
				t = tags[k]
				# resolve aliases; stop at a key seen before so that a loop
				# of aliases (e.g. a -> b -> c -> b) cannot run forever
				split("", seen)
				while (u !~ /^([a-z]+:.+)?$/ && u != k && ! (u in seen)) {
					seen[u] = 1
					u = targets[u]
				}
				# skip key sets (and anything else which did not resolve to a URL)
				if (u !~ /^[a-z]+:.+$/) continue
				u = setplaceholder(u, ENVIRON["s"])
				printf("<DT><A HREF=\"%s\" SHORTCUTURL=\"%s\" TAGS=\"%s\">%s</A>\n", esc(u), esc(ENVIRON["p"] k), esc(t), esc(n))
			}
			print "</DL><p>\n</DL>"
		}
	'
	# s = placeholder for search terms
	# t = title of enclosing directory and tags for each bookmark
	catfmt=$(escre "$catfmt") p=$1 t=${2:-url-queries} s=${3:-%s} $awkcmd -F'\t' "$awks" "$config"
}

# print one line per record of $config for use in a selector/menu (option -m)
# the format is $mfmt if set and not empty, otherwise $defmfmt;
# in it {k} is replaced by the key and {n} by the name of each record
# comments, blank lines and lines beginning with whitespace are skipped
menu()
{
	local awks
	awks='
		# replace every tok in s with val, both taken literally
		# (no regex or replacement metachars, so names may contain & and \)
		function subst(s, tok, val,   out, i) {
			out = ""
			while ((i = index(s, tok)) > 0) {
				out = out substr(s, 1, i - 1) val
				s = substr(s, i + length(tok))
			}
			return out s
		}

		! /^([# \t]|$)/ {
			print subst(subst(ENVIRON["fmt"], "{k}", $1), "{n}", $2)
		}
	'
	fmt=${mfmt:-$defmfmt} $awkcmd -F'\t' "$awks" "$config"
}

############################################################

# f - function to run with the remaining arguments (default: query a single key)
f=urlquery
unset nosub print mfmt
while getopts :snqSvc:m:bp opt ; do
	case $opt in
	s ) f=urlquerys ;;
	n ) f=newrecord ;;
	q ) noquote=$(( ! noquote )) ;;
	S ) nosub=1 ;;
	v ) f=verifyconfig ;;
	c ) config=$OPTARG ;;
	m ) f=menu mfmt=$OPTARG ;;
	: )
		# only the format of -m is optional; any other option
		# which lacks its argument (i.e. -c) is an error
		[[ $OPTARG == m ]] || die 2 "option requires an argument: -$OPTARG"
		f=menu
		;;
	b ) f=printbookmarks ;;
	p ) print=1 ;;
	* ) die 2 "bad option: $OPTARG" ;;
	esac
done
shift $(( OPTIND - 1 ))

# abort if null/unset
: "${defmfmt:?}" "${catfmt:?}"

[[ -n $(type -p "${awkcmd:?}") ]] || die 3 "awkcmd not found: $awkcmd"

# create the config file (and its directory) on first use
if [[ ! -f $config ]] ; then
	mkdir -p -- "$(dirname -- "${config:?}")" && initconfig > "$config" \
		|| die 1 "config file creation failed: $config"
fi

[[ $f == urlquery* ]] && (( ! $# )) && die 2 "argument(s) expected; none given"

$f "$@"
