Remove SQL generation, add XML save option, implement multiple identifier search, add CSV generation, remove filename generation

This commit is contained in:
Yetangitu 2021-05-16 14:23:57 +00:00
parent 2bd34ab5c4
commit 7da142f921

217
classify
View file

@ -1,7 +1,7 @@
#!/usr/bin/env bash
#shellcheck disable=SC2034,SC1090
#
# classify - return DDC, LCC for ISBN or MD5 (from libgen/libgen_fiction)
# classify - return classification data for ISBN (etc.) or MD5 (from libgen/libgen_fiction)
shopt -s extglob
trap "trap_error" TERM
@ -28,6 +28,8 @@ main () {
declare -A API=(
[response]='/classify/response/@code'
[owi]='/classify/works/work[1]/@owi'
[wi]='/classify/works/work[1]/@wi'
[fast]='join(/classify/recommendations/fast/headings/heading,",")'
[ddc]='join(/classify/recommendations/ddc/mostPopular/@nsfa)'
[lcc]='join(/classify/recommendations/lcc/mostPopular/@nsfa)'
[nlm]='join(/classify/recommendations/nlm/mostPopular/@sfa)'
@ -48,7 +50,6 @@ main () {
curl=$(find_tool "curl")
xq="$xidel -s"
separator="-"
request=""
TMPDIR="/tmp"
@ -57,11 +58,14 @@ main () {
# source config file if it exists
[[ -f ${config} ]] && source "${config}"
while getopts "odlnatS:FVAD:Q:C:G@h" OPTION; do
while getopts "owdlnfatVAD:C:X:G@h" OPTION; do
case $OPTION in
o)
request="$request owi"
;;
w)
request="$request wi"
;;
d)
request="$request ddc"
;;
@ -71,40 +75,34 @@ main () {
n)
request="$request nlm"
;;
f)
request="$request fast"
;;
a)
request="$request author"
;;
t)
request="$request title"
;;
S)
separator="$OPTARG"
;;
V)
verbose=1
;;
F)
build_filename=1
;;
D)
db="$OPTARG"
;;
Q)
[ -z "$db" ] && exit_with_error "use -D to define which database to use"
build_sql=1
md5="$OPTARG"
isbn=$(get_identifier "$db" "$md5")
[ -z "$isbn" ] && exit_with_error "no identifier found in $db for MD5 = $md5"
;;
C)
[ -z "$db" ] && exit_with_error "use -D to define which database to use"
build_cdf=1
build_csv=1
md5="$OPTARG"
isbn=$(get_identifier "$db" "$md5")
[ -z "$isbn" ] && exit_with_error "no identifier found in $db for MD5 = $md5"
idents=$(get_identifiers "$db" "$md5")
[ -z "$idents" ] && exit_with_error "no identifier found in $db for MD5 = $md5"
;;
X)
save_xml="$OPTARG"
[[ ! -d "$save_xml" ]] && exit_with_error "Save XML (-X $OPTARG): directory does not exist?"
;;
A)
request="author title owi ddc lcc nlm"
request="author title fast owi wi ddc lcc nlm"
verbose=1
;;
G)
@ -125,51 +123,55 @@ main () {
done
shift $((OPTIND-1))
[ -z "$isbn" ] && isbn="$1"
[ -z "$idents" ] && idents="$1"
get_xml "$xml" "stdnbr=$isbn"
response=$(get "response" "$xml")
IFS=',' read -ra idarr <<< "$idents"
case "$response" in
0)
true
;;
2)
true
;;
4)
owi=$(get "owi" "$xml")
get_xml "$xml" "owi=$owi"
;;
100)
[[ $build_sql ]] && echo "-- $md5: no input"
exit_with_error "no input"
;;
101)
[[ $build_sql ]] && echo "-- $md5: invalid input"
exit_with_error "invalid input"
;;
102)
[[ $build_sql ]] && echo "-- $md5: not found"
exit_with_error "not found"
;;
200)
[[ $build_sql ]] && echo "-- $md5: unexpected error"
exit_with_error "unexpected error"
;;
esac
for ident in "${idarr[@]}"; do
[[ -n "$debug" ]] && echo "trying $ident..."
get_xml "$xml" "stdnbr=$ident"
response=$(get "response" "$xml")
case "$response" in
0)
success=1
break
;;
2)
success=1
break
;;
4)
wi=$(get "wi" "$xml")
get_xml "$xml" "wi=$wi"
if [[ $(get "response" "$xml") =~ 0|2 ]]; then
success=1
break
else
continue
fi
;;
*)
continue
;;
esac
done
[[ -z "$success" ]] && exit_with_error "no valid response for identifier(s) $idents"
if [[ -n "$save_xml" ]]; then
[[ -z "$md5" ]] && exit_with_error "Save XML (-X) only works with a defined MD5 (-C MD5)"
cp "$xml" "$save_xml/$md5.xml"
fi
if [[ -n "$debug" ]]; then
cat "$xml"
fi
if [[ -n "$build_filename" ]]; then
build_filename "$xml"
elif [[ -n "$build_sql" ]]; then
build_sql "$db" "$md5" "$xml"
elif [[ -n "$build_cdf" ]]; then
build_cdf "$db" "$md5" "$xml"
if [[ -n "$build_csv" ]]; then
build_csv "$db" "$md5" "$xml"
else
show_data "$request"
fi
@ -179,7 +181,7 @@ get_xml () {
xml="$1"
shift
query="$*"
$torsocks "$curl" -s "${oclc}?summary=true&${query}" --output "$xml"
$torsocks "$curl" -s "${oclc}?summary=false&${query}" --output "$xml"
}
get () {
@ -191,7 +193,7 @@ get () {
$xq "$xml" -e "${API[$parameter]}"|eval "$filter"
}
get_identifier () {
get_identifiers () {
db="$1"
md5="$2"
@ -201,7 +203,7 @@ get_identifier () {
)
sql="${sql_identifier[$db]}"
dbx "$db" "$sql"|cut -d ',' -f 1
dbx "$db" "$sql"
}
show_data () {
@ -214,38 +216,7 @@ show_data () {
done
}
build_filename () {
    # Build a classification-based file path for one publication:
    #   <DDC>/<author><separator><title>
    # Arguments: $1 - path to the classify XML response file
    # Globals:   separator (read), filters['filename'] (read filter command)
    # Output:    the constructed path on stdout
    xml="$1"
    dirname=$(get "ddc" "$xml")
    filename=$(get "author" "$xml" "${filters['filename']}")${separator}$(get "title" "$xml" "${filters['filename']}")
    # fix: the original echoed "$(unknown)" — a garbled token which would try
    # to execute a command named 'unknown' — while the $filename computed
    # above was never used; emit the intended DDC/author-title path instead
    echo "${dirname}/${filename}"
}
build_sql () {
# Emit a SQL UPDATE statement that writes the DDC/LCC classification values
# from an OCLC classify XML response into the database record matching $md5.
# Arguments: $1 - database name (key into the 'tables' associative array)
#            $2 - md5 hash identifying the publication row
#            $3 - path to the classify XML response file
# Output: SQL on stdout; nothing if neither DDC nor LCC data was found.
db="$1"
md5="$2"
xml="$3"
# Accumulate "DDC='…', LCC='…'" assignments; ${updates:+, } inserts the
# comma separator only once $updates is non-empty, and ${parameter^^}
# upper-cases the parameter name into the column name.
# NOTE(review): 'updates' is never initialized here, so calling this twice
# in one shell would accumulate stale assignments — confirm callers run once.
for parameter in ddc lcc; do
data=$(get "$parameter" "$xml")
if [[ -n "$data" ]]; then
updates="${updates}${updates:+, }${parameter^^}='${data}'"
fi
done
if [[ -n "$updates" ]]; then
# With -V (verbose), prepend author/title inside a /* … */ SQL comment
# so the generated statement is self-describing.
if [ -n "$verbose" ]; then
echo '/*'
show_data "author title"
echo '*/'
fi
echo "update ${tables[$db]} set $updates where md5='$md5';"
fi
}
build_cdf () {
build_csv () {
db="$1"
md5="$2"
xml="$3"
@ -254,15 +225,17 @@ build_cdf () {
for parameter in ddc lcc nlm; do
data=$(get "$parameter" "$xml")
updates+=",\"${data}\""
done
for parameter in fast author title; do
data=$(get "$parameter" "$xml" "base64 -w0")
updates+=",${data}"
done
echo "$updates"
}
cleanup () {
base=$(basename "$xml")
rm -f "$TMPDIR/$base"
@ -284,24 +257,24 @@ help () {
-d show DDC
-l show LCC
-n show NLM
-f show FAST
-a show Author
-t show Title
-F create filename (DDC/Author-Title)
-o show OWI (OCLC works identifier)
-w show WI (OCLC works number)
-Q md5 create SQL to update database
-C md5 create CSV (MD5,DDC,LCC,NLM,FAST,AUTHOR,TITLE)
use -D libgen/-D libgen_fiction to indicate database
use with -V to add SQL comments with publication author
and title
-X dir save OCLC XML response to \$dir/\$md5.xml
only works with a defined MD5 (-C MD5)
-D db define which database to use (libgen/libgen_fiction)
-A show all available data for identifier
-o show OCLC work index (owi)
-V show labels
-S sep change separator used to build filename (default: $separator)
-@ use torsocks to connect to the OCLC classify service.
use this to avoid getting your IP blocked by OCLC
@ -316,19 +289,27 @@ help () {
DDC: 321.07
LCC: JC71
$ classify -Q 25b8ce971343e85dbdc3fa375804b538 0199535760
update updated set DDC='321.07', LCC='JC71' where md5='25b8ce971343e85dbdc3fa375804b538';
$ classify -D libgen -C 25b8ce971343e85dbdc3fa375804b538
25b8ce971343e85dbdc3fa375804b538,"321.07","JC71","",UG9saXRpY2FsI\
HNjaWVuY2UsVXRvcGlhcyxKdXN0aWNlLEV0aGljcyxQb2xpdGljYWwgZXRoaWNzLFB\
oaWxvc29waHksRW5nbGlzaCBsYW5ndWFnZSxUaGVzYXVyaQo=,UGxhdG8gfCBKb3dl\
dHQsIEJlbmphbWluLCAxODE3LTE4OTMgW1RyYW5zbGF0b3I7IEVkaXRvcjsgT3RoZX\
JdIHwgV2F0ZXJmaWVsZCwgUm9iaW4sIDE5NTItIFtUcmFuc2xhdG9yOyBXcml0ZXIg\
b2YgYWRkZWQgdGV4dDsgRWRpdG9yOyBPdGhlcl0gfCBMZWUsIEguIEQuIFAuIDE5MD\
gtMTk5MyBbVHJhbnNsYXRvcjsgRWRpdG9yOyBBdXRob3Igb2YgaW50cm9kdWN0aW9u\
XSB8IFNob3JleSwgUGF1bCwgMTg1Ny0xOTM0IFtUcmFuc2xhdG9yOyBBdXRob3I7IE\
90aGVyXSB8IFJlZXZlLCBDLiBELiBDLiwgMTk0OC0gW1RyYW5zbGF0b3I7IEVkaXRv\
cjsgT3RoZXJdCg==,VGhlIHJlcHVibGljCg==
Classifying libgen/libgen_fiction
This tool can be used to add DDC and LCC classification data
to libgen and libgen_fiction databases. It does not directly
modify the database, instead producing SQL code which can be
used to apply the modifications. The best way to do this is
to produce a list of md5 hashes for publications which do
have Identifier values but lack values for DDC and/or LCC. Such
lists can be produced by the following SQL:
This tool can be used to add classification data to libgen and
libgen_fiction databases. It does not directly modify the database,
instead producing CSV which can be used to apply the modifications.
The best way to do this is to produce a list of md5 hashes for
publications which do have Identifier values but lack values for DDC
and/or LCC. Such lists can be produced by the following SQL:
libgen: select md5 from updated where IdentifierWODash<>"" and DDC="";
libgen_fiction: select md5 from fiction where Identifier<>"" and DDC="";
@ -343,6 +324,18 @@ help () {
your IP being blocked. The OCLC classification service is not
run as a production service (I asked them).
Return values are stored in the following order:
MD5,DDC,LCC,NLM,FAST,AUTHOR,TITLE
DDC, LCC and NLM are enclosed within double quotes and can contain
multiple space-separated values. FAST, AUTHOR and TITLE are base64 encoded
since these fields can contain a whole host of unwholesome characters
which can mess up CSV. The AUTHOR field decodes to a pipe ('|') separated
list of authors in the format:
LAST_NAME, NAME_OR_INITIALS, DATE_OF_BIRTH-[DATE_OF_DEATH] [[ROLE[[;ROLE]...]]]
EOHELP
}