#!/usr/bin/env bash
#shellcheck disable=SC2034,SC1090
#
# classify - return classification data for ISBN (etc.) or MD5 (from libgen/libgen_fiction)
#
# Helpers used below but not defined in this file (find_tool, exit_with_error,
# dbx, trap_error, trap_clean) are expected to come from the sourced
# books_functions file.

shopt -s extglob
trap "trap_error" TERM
trap "trap_clean" EXIT
export TOP_PID=$$

version="0.5.1"
release="20210601"

functions="$(dirname "$0")/books_functions"
if [ -f "$functions" ]; then
  source "$functions"
else
  echo "$functions not found" >&2
  exit 1
fi

# Entry point: parse options, query the OCLC classify service for each
# identifier until one yields a usable response, then print the requested
# fields or a CSV line.
#
# Arguments: command line options followed by identifier[,identifier...]
# Globals:   most variables set here are deliberately left global because the
#            sourced config file and the helper functions read them
#            (API, oclc, xq, curl, torsocks, xml, verbose, debug, ...).
main () {
  # variables private to the identifier loop
  local ident response wi success=""
  local -a idarr=()

  # PREFERENCES
  config=${XDG_CONFIG_HOME:-$HOME/.config}/books.conf

  # OCLC classify API
  oclc="http://classify.oclc.org/classify2/Classify"

  # xidel expressions used to extract each field from the XML response
  declare -A API=(
    [response]='/classify/response/@code'
    [owi]='/classify/works/work[1]/@owi'
    [wi]='/classify/works/work[1]/@wi'
    [fast]='join(/classify/recommendations/fast/headings/heading,",")'
    [ddc]='join(/classify/recommendations/ddc/mostPopular/@nsfa)'
    [lcc]='join(/classify/recommendations/lcc/mostPopular/@nsfa)'
    [nlm]='join(/classify/recommendations/nlm/mostPopular/@sfa)'
    [author]='/classify/work/@author'
    [authors]='join(/classify/authors/author," | ")'
    [title]='/classify/work/@title'
  )

  declare -A filters=(
    [filename]="sed -e 's/[^-[:alnum:]:;?!.,+@#%]/_/g;s/^\([-_]\)*//'"
  )

  declare -A tables=(
    [libgen]="updated"
    [libgen_fiction]="fiction"
  )

  xidel=$(find_tool "xidel")
  curl=$(find_tool "curl")
  xq="$xidel -s"
  request=""

  TMPDIR="/tmp"
  xml=$(mktemp -p "$TMPDIR" classify.XXXXX) \
    || exit_with_error "could not create temporary file in $TMPDIR"

  # source config file if it exists
  [[ -f ${config} ]] && source "${config}"

  while getopts "owdlnfatVAD:C:X:G@:h" OPTION; do
    case $OPTION in
      o)
        request="$request owi"
        ;;
      w)
        request="$request wi"
        ;;
      d)
        request="$request ddc"
        ;;
      l)
        request="$request lcc"
        ;;
      n)
        request="$request nlm"
        ;;
      f)
        request="$request fast"
        ;;
      a)
        request="$request author"
        ;;
      t)
        request="$request title"
        ;;
      V)
        verbose=1
        ;;
      D)
        db="$OPTARG"
        ;;
      C)
        # -D has to precede -C since the lookup happens right here
        [[ -z "$db" ]] && exit_with_error "use -D to define which database to use"
        build_csv=1
        md5="$OPTARG"
        idents=$(get_identifiers "$db" "$md5")
        [[ -z "$idents" ]] && exit_with_error "no identifier found in $db for MD5 = $md5"
        ;;
      X)
        save_xml="$OPTARG"
        [[ ! -d "$save_xml" ]] && exit_with_error "Save XML (-X $OPTARG): directory does not exist?"
        ;;
      A)
        request="author title fast owi wi ddc lcc nlm"
        verbose=1
        ;;
      G)
        ((debug++))
        ;;
      @)
        torsocks=$(find_tool "torsocks")
        export TORSOCKS_TOR_PORT=${OPTARG}
        ;;
      h)
        help
        exit
        ;;
      *)
        exit_with_error "unknown option: $OPTION"
        ;;
    esac
  done

  shift $((OPTIND-1))

  # identifiers found through -C take precedence over the positional argument
  [[ -z "$idents" ]] && idents="$1"

  IFS=',' read -ra idarr <<< "$idents"

  for ident in "${idarr[@]}"; do

    [[ -n "$debug" ]] && echo "trying $ident..."

    get_xml "$xml" "stdnbr=${ident// }"
    response=$(get "response" "$xml")

    case "$response" in
      0|2)
        success=1
        break
        ;;
      4)
        # retry the query using the work id of the first listed work
        wi=$(get "wi" "$xml")
        get_xml "$xml" "wi=$wi"
        response=$(get "response" "$xml")
        # exact match only: an unanchored regex would also accept error
        # codes which merely contain a 0 or a 2 (100, 101, 102, 200)
        if [[ "$response" == [02] ]]; then
          success=1
          break
        else
          continue
        fi
        ;;
      *)
        continue
        ;;
    esac
  done

  [[ -z "$success" ]] && exit_with_error "no valid response for identifier(s) $idents"

  if [[ -n "$save_xml" ]]; then
    [[ -z "$md5" ]] && exit_with_error "Save XML (-X) only works with a defined MD5 (-C MD5)"
    cp "$xml" "$save_xml/$md5.xml"
  fi

  if [[ -n "$debug" ]]; then
    cat "$xml"
  fi

  if [[ -n "$build_csv" ]]; then
    build_csv "$db" "$md5" "$xml"
  else
    show_data "$request"
  fi
}

# Fetch a classify response and store it in a file.
#
# Arguments: $1  - file to write the response to
#            $2… - query string appended to the service URL
# Globals:   torsocks, curl, oclc (read)
get_xml () {
  local target="$1"
  shift
  local query="$*"

  # $torsocks is left unquoted on purpose: it is empty unless -@ was given
  # shellcheck disable=SC2086
  $torsocks "$curl" -s "${oclc}?summary=false&${query}" --output "$target"
}

# Extract one field from a stored response.
#
# Arguments: $1  - key into the API array
#            $2  - file containing the XML response
#            $3… - optional filter command the value is piped through
# Globals:   xq, API (read)
# Outputs:   the (filtered) value on stdout
get () {
  local parameter="$1"
  local source_xml="$2"
  shift 2
  local filter="$*"

  [[ -z "$filter" ]] && filter='cat -'

  # $xq holds a command plus options, hence no quotes.
  # NOTE: the filter goes through eval, only pass trusted filter strings.
  # shellcheck disable=SC2086
  $xq "$source_xml" -e "${API[$parameter]}" | eval "$filter"
}

# Look up the identifier(s) stored for a publication.
#
# Arguments: $1 - database (libgen or libgen_fiction)
#            $2 - MD5 of the publication
# Outputs:   whatever dbx prints for the query
# Returns:   1 (without querying) on a malformed MD5 or unknown database
get_identifiers () {
  local db="$1"
  local md5="$2"
  local sql=""

  # the MD5 is interpolated into SQL below, refuse anything but 32 hex digits
  if [[ ! "$md5" =~ ^[0-9a-fA-F]{32}$ ]]; then
    echo "invalid MD5: $md5" >&2
    return 1
  fi

  local -A sql_identifier=(
    [libgen]="select IdentifierWODash from updated where md5='${md5}';"
    [libgen_fiction]="select Identifier from fiction where md5='${md5}';"
  )

  [[ -n "$db" ]] && sql="${sql_identifier[$db]}"
  if [[ -z "$sql" ]]; then
    echo "unknown database: $db" >&2
    return 1
  fi

  dbx "$db" "$sql"
}

# Print the requested fields, one per line, skipping empty ones.
#
# Arguments: whitespace separated list of API keys
# Globals:   xml, verbose (read)
# Returns:   0, also when the last requested field turned out to be empty
show_data () {
  local request="$*"
  local parameter data legend=""

  # word splitting of $request is intended here
  # shellcheck disable=SC2086
  for parameter in $request; do
    data=$(get "$parameter" "$xml")
    [[ -n "$verbose" ]] && legend="${parameter^^}: "
    if [[ -n "$data" ]]; then
      printf '%s%s\n' "$legend" "$data"
    fi
  done

  return 0
}

# Print one CSV line: MD5,"DDC","LCC","NLM",FAST,AUTHOR,TITLE
# where the last three fields are base64 encoded.
#
# Arguments: $1 - database (not used in the function body)
#            $2 - MD5 of the publication
#            $3 - file containing the XML response
build_csv () {
  local db="$1"
  local md5="$2"
  local source_xml="$3"
  local parameter data
  local updates="${md5}"

  for parameter in ddc lcc nlm; do
    data=$(get "$parameter" "$source_xml")
    updates+=",\"${data}\""
  done

  for parameter in fast author title; do
    data=$(get "$parameter" "$source_xml" "base64 -w0")
    updates+=",${data}"
  done

  echo "$updates"
}

# Remove the temporary response file.
# Nothing in this file calls cleanup; presumably the EXIT trap handler does.
cleanup () {
  # remove the file mktemp actually created; rebuilding the path from TMPDIR
  # would miss it when the config file changed TMPDIR after creation
  if [[ -n "$xml" ]]; then
    rm -f -- "$xml"
  fi
  return 0
}

# Print usage information.
help () {
  cat <<EOHELP
$(basename "$(readlink -f "$0")") "version $version"

Use: classify [OPTIONS] identifier[,identifier...]

Queries OCLC classification service for available data
Supports: DDC, LCC, NLM, Author and Title

Valid identifiers are ISBN, ISSN, UPC and OCLC/OWI

OPTIONS:

    -d          show DDC
    -l          show LCC
    -n          show NLM
    -f          show FAST
    -a          show Author
    -t          show Title

    -o          show OWI (OCLC works identifier)
    -w          show WI (OCLC works number)

    -C md5      create CSV (MD5,DDC,LCC,NLM,FAST,AUTHOR,TITLE)
                use -D libgen/-D libgen_fiction to indicate database

    -X dir      save OCLC XML response to \$dir/\$md5.xml
                only works with a defined MD5 (-C MD5)

    -D db       define which database to use (libgen/libgen_fiction)

    -A          show all available data for identifier

    -V          show labels

    -G          show debug output (identifiers tried, raw XML response)

    -@ PORT     use torsocks to connect to the OCLC classify service.
                use this to avoid getting your IP blocked by OCLC

    -h          show this help message

Examples

$ classify -A 0199535760

AUTHOR: Plato | Jowett, Benjamin, 1817-1893 [Translator; Editor; Other] ...
TITLE: The republic
DDC: 321.07
LCC: JC71

$ classify -D libgen -C 25b8ce971343e85dbdc3fa375804b538

25b8ce971343e85dbdc3fa375804b538,"321.07","JC71","",UG9saXRpY2FsI\
HNjaWVuY2UsVXRvcGlhcyxKdXN0aWNlLEV0aGljcyxQb2xpdGljYWwgZXRoaWNzLFB\
oaWxvc29waHksRW5nbGlzaCBsYW5ndWFnZSxUaGVzYXVyaQo=,UGxhdG8gfCBKb3dl\
dHQsIEJlbmphbWluLCAxODE3LTE4OTMgW1RyYW5zbGF0b3I7IEVkaXRvcjsgT3RoZX\
JdIHwgV2F0ZXJmaWVsZCwgUm9iaW4sIDE5NTItIFtUcmFuc2xhdG9yOyBXcml0ZXIg\
b2YgYWRkZWQgdGV4dDsgRWRpdG9yOyBPdGhlcl0gfCBMZWUsIEguIEQuIFAuIDE5MD\
gtMTk5MyBbVHJhbnNsYXRvcjsgRWRpdG9yOyBBdXRob3Igb2YgaW50cm9kdWN0aW9u\
XSB8IFNob3JleSwgUGF1bCwgMTg1Ny0xOTM0IFtUcmFuc2xhdG9yOyBBdXRob3I7IE\
90aGVyXSB8IFJlZXZlLCBDLiBELiBDLiwgMTk0OC0gW1RyYW5zbGF0b3I7IEVkaXRv\
cjsgT3RoZXJdCg==,VGhlIHJlcHVibGljCg==

Classifying libgen/libgen_fiction

This tool can be used to add classification data to libgen and
libgen_fiction databases. It does not directly modify the database,
instead producing CSV which can be used to apply the modifications.
The best way to do this is to produce a list of md5 hashes for
publications which do have Identifier values but lack values for DDC
and/or LCC. Such lists can be produced by the following SQL:

   libgen: select md5 from updated where IdentifierWODash<>"" and DDC="";
   libgen_fiction: select md5 from fiction where Identifier<>"" and DDC="";

Run these as batch jobs (mysql -B .... -e 'sql_code_here;' > md5_list), split
the resulting file in ~1000 line sections and feed these to this tool,
preferably with a random pause between requests to keep OCLC's intrusion
detection systems from triggering too early. It is advisable to use
this tool through Tor (using -@ TORPORT to enable torsocks, make sure it
is configured correctly for your Tor instance) to avoid having too
many requests from your IP to be registered, this again to avoid
your IP being blocked. The OCLC classification service is not
run as a production service (I asked them).

Return values are stored in the following order:

MD5,DDC,LCC,NLM,FAST,AUTHOR,TITLE

DDC, LCC and NLM are enclosed within double quotes and can contain
multiple space-separated values. FAST, AUTHOR and TITLE are base64 encoded
since these fields can contain a whole host of unwholesome characters
which can mess up CSV. The AUTHOR field currently decodes to a pipe ('|')
separated list of authors in the format:

   LAST_NAME, NAME_OR_INITIALS, DATE_OF_BIRTH-[DATE_OF_DEATH] [[ROLE[[;ROLE]...]]]

This format could change depending on what OCLC does with the
(experimental) service.
EOHELP
}

main "$@"