books/classify

#!/usr/bin/env bash
#shellcheck disable=SC2034,SC1090
#
# classify - return classification data for ISBN (etc.) or MD5 (from libgen/libgen_fiction)

shopt -s extglob

# trap_error and trap_clean are not defined in this file; presumably they
# are provided by books_functions, which is sourced further down
trap 'trap_error' TERM
trap 'trap_clean' EXIT

# PID of the top-level shell, made available to child processes
TOP_PID=$$
export TOP_PID

version="0.5.1"
release="20210601"

# the shared helper library is expected to live next to this script;
# without it nothing below can work, so give up right away
functions="$(dirname "$0")/books_functions"
if [[ ! -f $functions ]]; then
	echo "$functions not found"
	exit 1
fi
source "$functions"

# main - drive a complete classification lookup
#
# Parses the command line, queries the OCLC classify service for each
# candidate identifier in turn until one yields a usable work record, then
# either prints the requested fields or emits a CSV row (-C).
#
# Arguments: the script's command line, see help
# Globals:   most variables assigned here are deliberately not local: the
#            other functions in this file read oclc, curl, xq, torsocks,
#            xml, verbose and TMPDIR. API is declared inside this function
#            but is still visible to the functions called from here.
# Exits:     through exit_with_error on invalid usage or when no
#            identifier produces a valid response
main () {
	# PREFERENCES
	config=${XDG_CONFIG_HOME:-$HOME/.config}/books.conf
	# OCLC classify API
	oclc="http://classify.oclc.org/classify2/Classify"

	# xidel expressions used to pull the individual fields from the response
	declare -A API=(
		[response]='/classify/response/@code'
		[owi]='/classify/works/work[1]/@owi'
		[wi]='/classify/works/work[1]/@wi'
		[fast]='join(/classify/recommendations/fast/headings/heading,",")'
		[ddc]='join(/classify/recommendations/ddc/mostPopular/@nsfa)'
		[lcc]='join(/classify/recommendations/lcc/mostPopular/@nsfa)'
		[nlm]='join(/classify/recommendations/nlm/mostPopular/@sfa)'
		[author]='/classify/work/@author'
		[authors]='join(/classify/authors/author," | ")'
		[title]='/classify/work/@title'
	)

	declare -A filters=(
		[filename]="sed -e 's/[^-[:alnum:]:;?!.,+@#%]/_/g;s/^\([-_]\)*//'"
	)

	declare -A tables=(
		[libgen]="updated"
		[libgen_fiction]="fiction"
	)

	xidel=$(find_tool "xidel")
	curl=$(find_tool "curl")
	xq="$xidel -s"

	request=""

	TMPDIR="/tmp"
	# scratch file which receives every OCLC response
	xml=$(mktemp -p "$TMPDIR" classify.XXXXX) || exit_with_error "could not create temporary file in $TMPDIR"

	# source config file if it exists
	[[ -f ${config} ]] && source "${config}"

	while getopts "owdlnfatVAD:C:X:G@:h" OPTION; do
		case $OPTION in
			o)
				request="$request owi"
				;;
			w)
				request="$request wi"
				;;
			d)
				request="$request ddc"
				;;
			l)
				request="$request lcc"
				;;
			n)
				request="$request nlm"
				;;
			f)
				request="$request fast"
				;;
			a)
				request="$request author"
				;;
			t)
				request="$request title"
				;;
			V)
				verbose=1
				;;
			D)
				db="$OPTARG"
				;;
			C)
				# -C needs the database, so -D has to precede it on the command line
				[ -z "$db" ] && exit_with_error "use -D to define which database to use"
				build_csv=1
				md5="$OPTARG"
				idents=$(get_identifiers "$db" "$md5")
				[ -z "$idents" ] && exit_with_error "no identifier found in $db for MD5 = $md5"
				;;
			X)
				save_xml="$OPTARG"
				[[ ! -d "$save_xml" ]] && exit_with_error "Save XML (-X $OPTARG): directory does not exist?"
				;;
			A)
				request="author title fast owi wi ddc lcc nlm"
				verbose=1
				;;
			G)
				((debug++))
				;;
			@)
				torsocks=$(find_tool "torsocks")
				export TORSOCKS_TOR_PORT=${OPTARG}
				;;
			h)
				help
				exit
				;;

			*)
				exit_with_error "unknown option: $OPTION"
				;;
		esac
	done

	shift $((OPTIND-1))
	# identifiers found through -C take precedence over the positional argument
	[ -z "$idents" ] && idents="$1"
	[[ -z "$idents" ]] && exit_with_error "no identifier given, use -h for help"

	IFS=',' read -ra idarr <<< "$idents"

	# Response codes per the OCLC Classify API documentation: 0 and 2 carry
	# a single work record, 4 lists several candidate works, anything else
	# (100, 101, 102, 200) signals an error.
	for ident in "${idarr[@]}"; do

		[[ -n "$debug" ]] && echo "trying $ident..."

		get_xml "$xml" "stdnbr=${ident// }"
		response=$(get "response" "$xml")

		case "$response" in
			0|2)
				success=1
				break
				;;
			4)
				# several works matched: ask again for the first one listed
				wi=$(get "wi" "$xml")
				get_xml "$xml" "wi=$wi"
				# anchored on purpose: a bare 0|2 would also accept the
				# error codes 100, 101, 102 and 200
				if [[ $(get "response" "$xml") =~ ^(0|2)$ ]]; then
					success=1
					break
				fi
				;;
		esac
	done

	[[ -z "$success" ]] && exit_with_error "no valid response for identifier(s) $idents"

	if [[ -n "$save_xml" ]]; then
		[[ -z "$md5" ]] && exit_with_error "Save XML (-X) only works with a defined MD5 (-C MD5)"
		cp "$xml" "$save_xml/$md5.xml"
	fi

	if [[ -n "$debug" ]]; then
		cat "$xml"
	fi

	if [[ -n "$build_csv" ]]; then
		build_csv "$db" "$md5" "$xml"
	else
		show_data "$request"
	fi
}

# get_xml - fetch a response from the classify service into a file
#
# Arguments: $1   file which receives the response
#            $2.. query string appended to the request URL
# Globals:   reads oclc, curl and torsocks; (re)assigns xml and query
get_xml () {
	xml=$1
	query="${*:2}"
	local url="${oclc}?summary=false&${query}"
	# torsocks stays unquoted so that it vanishes completely when empty
	$torsocks "$curl" -s "$url" --output "$xml"
}

# get - extract a single field from a stored response
#
# Arguments: $1   key into the API array selecting the expression to run
#            $2   file holding the response
#            $3.. optional shell command the result is piped through
# Globals:   reads xq and API; (re)assigns parameter, xml and filter
# Outputs:   the (optionally filtered) field value on stdout
get () {
	parameter=$1
	xml=$2
	shift 2
	filter=$*
	# without a filter the value is passed through unchanged
	if [[ -z $filter ]]; then
		filter='cat -'
	fi
	local xpath=${API[$parameter]}
	# xq holds a command plus options, hence the unquoted expansion
	$xq "$xml" -e "$xpath" | eval "$filter"
}

# get_identifiers - look up the identifier(s) stored for a publication
#
# Arguments: $1 database, either libgen or libgen_fiction
#            $2 MD5 of the publication (32 hexadecimal digits)
# Outputs:   whatever dbx prints for the query on stdout; diagnostics go
#            to stderr
# Returns:   1 without touching the database when the MD5 is malformed or
#            the database is unknown, otherwise the status of dbx
get_identifiers () {
	local db="$1"
	local md5="$2"
	local sql

	# the MD5 ends up inside the SQL text, so refuse anything which is not
	# a plain hex digest instead of passing it on to the database
	if [[ ! "$md5" =~ ^[[:xdigit:]]{32}$ ]]; then
		echo "get_identifiers: invalid MD5: $md5" >&2
		return 1
	fi

	case "$db" in
		libgen)
			sql="select IdentifierWODash from updated where md5='${md5}';"
			;;
		libgen_fiction)
			sql="select Identifier from fiction where md5='${md5}';"
			;;
		*)
			echo "get_identifiers: unknown database: $db" >&2
			return 1
			;;
	esac

	dbx "$db" "$sql"
}

# show_data - print the requested fields, one per line
#
# Arguments: field names (keys understood by get), either as separate
#            arguments or as one space-separated string
# Globals:   reads xml (response file) and verbose (non-empty: prefix each
#            value with its upper-cased field name and a tab)
# Outputs:   one line per field which has a value; empty fields are skipped
# Returns:   0, also when the last field turned out to be empty - a missing
#            value is not an error and must not leak into the exit status
show_data () {
	local request="$*"
	local parameter data legend=""

	# unquoted on purpose: request is a space-separated list of field names
	for parameter in $request; do
		data=$(get "$parameter" "$xml")
		[[ -n "$data" ]] || continue
		if [[ -n "$verbose" ]]; then
			legend="${parameter^^}:"$'\t'
		fi
		printf '%s\n' "${legend}${data}"
	done

	return 0
}

# build_csv - emit one CSV row with the classification data of a response
#
# Arguments: $1 database name
#            $2 MD5 of the publication, used as first column
#            $3 file holding the response
# Globals:   (re)assigns db, md5, xml, updates, parameter and data
# Outputs:   MD5,"DDC","LCC","NLM",FAST,AUTHOR,TITLE on stdout, the last
#            three columns being piped through base64 -w0
build_csv () {
	db=$1 md5=$2 xml=$3

	updates=$md5

	for parameter in ddc lcc nlm fast author title; do
		case $parameter in
			ddc|lcc|nlm)
				# classification numbers go in verbatim, wrapped in double quotes
				data=$(get "$parameter" "$xml")
				updates="${updates},\"${data}\""
				;;
			*)
				# free-text columns are encoded to keep the CSV intact
				data=$(get "$parameter" "$xml" "base64 -w0")
				updates="${updates},${data}"
				;;
		esac
	done

	printf '%s\n' "$updates"
}

# cleanup - remove the temporary response file
#
# Globals: reads xml (path of the temporary file) and TMPDIR (directory it
#          was created in, /tmp when unset)
# Returns: 0 when there is nothing to remove, otherwise the status of rm
cleanup () {
	# xml is still unset when the script stops before the temporary file
	# was created; without this guard rm would be aimed at the directory
	[[ -n "$xml" ]] || return 0

	local base
	base=$(basename -- "$xml")
	# only ever delete inside the temporary directory, whatever xml points at
	rm -f -- "${TMPDIR:-/tmp}/$base"
}

# help - print the usage message on stdout
#
# Globals: reads version
# Note:    the here-document is introduced with <<- and therefore has to be
#          indented with tabs; its delimiter is unquoted, so $(...) is
#          expanded and a literal dollar sign in front of a name needs a
#          backslash
help () {
	cat <<-EOHELP
	$(basename "$(readlink -f "$0")") "version $version"

	Use: classify [OPTIONS] identifier[,identifier...]

	Queries OCLC classification service for available data
	Supports: DDC, LCC, NLM, FAST, Author and Title

	Valid identifiers are ISBN, ISSN, UPC and OCLC/OWI

	OPTIONS:

	 	-d	show DDC
	 	-l	show LCC
	 	-n	show NLM
	 	-f	show FAST
	 	-a	show Author
	 	-t	show Title

	 	-o	show OWI (OCLC works identifier)
	 	-w	show WI (OCLC works number)

	 	-C md5	create CSV (MD5,DDC,LCC,NLM,FAST,AUTHOR,TITLE)
	 		use -D libgen/-D libgen_fiction to indicate database

	 	-X dir	save OCLC XML response to \$dir/\$md5.xml
	 		only works with a defined MD5 (-C MD5)

	 	-D db	define which database to use (libgen/libgen_fiction)

	 	-A	show all available data for identifier

	 	-V	show labels

	 	-@ PORT	use torsocks to connect to the OCLC classify service.
	 		use this to avoid getting your IP blocked by OCLC

	 	-h	show this help message

	Examples

	$ classify -A 0199535760
	AUTHOR: Plato | Jowett, Benjamin, 1817-1893 [Translator; Editor; Other] ...
	TITLE: The republic
	DDC: 321.07
	LCC: JC71

	$ classify -D libgen -C 25b8ce971343e85dbdc3fa375804b538
	25b8ce971343e85dbdc3fa375804b538,"321.07","JC71","",UG9saXRpY2FsI\
	HNjaWVuY2UsVXRvcGlhcyxKdXN0aWNlLEV0aGljcyxQb2xpdGljYWwgZXRoaWNzLFB\
	oaWxvc29waHksRW5nbGlzaCBsYW5ndWFnZSxUaGVzYXVyaQo=,UGxhdG8gfCBKb3dl\
	dHQsIEJlbmphbWluLCAxODE3LTE4OTMgW1RyYW5zbGF0b3I7IEVkaXRvcjsgT3RoZX\
	JdIHwgV2F0ZXJmaWVsZCwgUm9iaW4sIDE5NTItIFtUcmFuc2xhdG9yOyBXcml0ZXIg\
	b2YgYWRkZWQgdGV4dDsgRWRpdG9yOyBPdGhlcl0gfCBMZWUsIEguIEQuIFAuIDE5MD\
	gtMTk5MyBbVHJhbnNsYXRvcjsgRWRpdG9yOyBBdXRob3Igb2YgaW50cm9kdWN0aW9u\
	XSB8IFNob3JleSwgUGF1bCwgMTg1Ny0xOTM0IFtUcmFuc2xhdG9yOyBBdXRob3I7IE\
	90aGVyXSB8IFJlZXZlLCBDLiBELiBDLiwgMTk0OC0gW1RyYW5zbGF0b3I7IEVkaXRv\
	cjsgT3RoZXJdCg==,VGhlIHJlcHVibGljCg==


	Classifying libgen/libgen_fiction

	This tool can be used to add classification data to libgen and
	libgen_fiction databases. It does not directly modify the database,
	instead producing CSV which can be used to apply the modifications.
	The best way to do this is to produce a list of md5 hashes for
	publications which do have Identifier values but lack values for DDC
	and/or LCC. Such lists can be produced by the following SQL:

	   libgen: select md5 from updated where IdentifierWODash<>"" and DDC="";
	   libgen_fiction: select md5 from fiction where Identifier<>"" and DDC="";

	Run these as batch jobs (mysql -B .... -e 'sql_code_here;' > md5_list), split
	the resulting file in ~1000 line sections and feed these to this tool,
	preferably with a random pause between requests to keep OCLC's intrusion
	detection systems from triggering too early. It is advisable to use
	this tool through Tor (using -@ TORPORT to enable torsocks, make sure it
	is configured correctly for your Tor instance) to avoid having too
	many requests from your IP to be registered, this again to avoid
	your IP being blocked. The OCLC classification service is not
	run as a production service (I asked them).

	Return values are stored in the following order:

	   MD5,DDC,LCC,NLM,FAST,AUTHOR,TITLE

	DDC, LCC and NLM are enclosed within double quotes and can contain
	multiple space-separated values. FAST, AUTHOR and TITLE are base64 encoded
	since these fields can contain a whole host of unwholesome characters
	which can mess up CSV. The AUTHOR field currently decodes to a pipe ('|')
	separated list of authors in the format:

	   LAST_NAME, NAME_OR_INITIALS, DATE_OF_BIRTH-[DATE_OF_DEATH] [[ROLE[[;ROLE]...]]]

	This format could change depending on what OCLC does with the
	(experimental) service.

	EOHELP
}

# entry point: hand the unmodified command line to main
main "$@"