diff --git a/classify b/classify index 3a76fbb..f08a133 100755 --- a/classify +++ b/classify @@ -1,7 +1,7 @@ #!/usr/bin/env bash #shellcheck disable=SC2034,SC1090 # -# classify - return DDC, LCC for ISBN or MD5 (from libgen/libgen_fiction) +# classify - return classification data for ISBN (etc.) or MD5 (from libgen/libgen_fiction) shopt -s extglob trap "trap_error" TERM @@ -28,6 +28,8 @@ main () { declare -A API=( [response]='/classify/response/@code' [owi]='/classify/works/work[1]/@owi' + [wi]='/classify/works/work[1]/@wi' + [fast]='join(/classify/recommendations/fast/headings/heading,",")' [ddc]='join(/classify/recommendations/ddc/mostPopular/@nsfa)' [lcc]='join(/classify/recommendations/lcc/mostPopular/@nsfa)' [nlm]='join(/classify/recommendations/nlm/mostPopular/@sfa)' @@ -48,7 +50,6 @@ main () { curl=$(find_tool "curl") xq="$xidel -s" - separator="-" request="" TMPDIR="/tmp" @@ -57,11 +58,14 @@ main () { # source config file if it exists [[ -f ${config} ]] && source "${config}" - while getopts "odlnatS:FVAD:Q:C:G@h" OPTION; do + while getopts "owdlnfatVAD:C:X:G@h" OPTION; do case $OPTION in o) request="$request owi" ;; + w) + request="$request wi" + ;; d) request="$request ddc" ;; @@ -71,40 +75,34 @@ main () { n) request="$request nlm" ;; + f) + request="$request fast" + ;; a) request="$request author" ;; t) request="$request title" ;; - S) - separator="$OPTARG" - ;; V) verbose=1 ;; - F) - build_filename=1 - ;; D) db="$OPTARG" ;; - Q) - [ -z "$db" ] && exit_with_error "use -D to define which database to use" - build_sql=1 - md5="$OPTARG" - isbn=$(get_identifier "$db" "$md5") - [ -z "$isbn" ] && exit_with_error "no identifier found in $db for MD5 = $md5" - ;; C) [ -z "$db" ] && exit_with_error "use -D to define which database to use" - build_cdf=1 + build_csv=1 md5="$OPTARG" - isbn=$(get_identifier "$db" "$md5") - [ -z "$isbn" ] && exit_with_error "no identifier found in $db for MD5 = $md5" + idents=$(get_identifiers "$db" "$md5") + [ -z "$idents" ] && 
exit_with_error "no identifier found in $db for MD5 = $md5" + ;; + X) + save_xml="$OPTARG" + [[ ! -d "$save_xml" ]] && exit_with_error "Save XML (-X $OPTARG): directory does not exist?" ;; A) - request="author title owi ddc lcc nlm" + request="author title fast owi wi ddc lcc nlm" verbose=1 ;; G) @@ -125,51 +123,55 @@ main () { done shift $((OPTIND-1)) - [ -z "$isbn" ] && isbn="$1" + [ -z "$idents" ] && idents="$1" - get_xml "$xml" "stdnbr=$isbn" - response=$(get "response" "$xml") + IFS=',' read -ra idarr <<< "$idents" - case "$response" in - 0) - true - ;; - 2) - true - ;; - 4) - owi=$(get "owi" "$xml") - get_xml "$xml" "owi=$owi" - ;; - 100) - [[ $build_sql ]] && echo "-- $md5: no input" - exit_with_error "no input" - ;; - 101) - [[ $build_sql ]] && echo "-- $md5: invalid input" - exit_with_error "invalid input" - ;; - 102) - [[ $build_sql ]] && echo "-- $md5: not found" - exit_with_error "not found" - ;; - 200) - [[ $build_sql ]] && echo "-- $md5: unexpected error" - exit_with_error "unexpected error" - ;; - esac + for ident in "${idarr[@]}"; do + + [[ -n "$debug" ]] && echo "trying $ident..." 
+ + get_xml "$xml" "stdnbr=$ident" + response=$(get "response" "$xml") + + case "$response" in + 0) + success=1 + break + ;; + 2) + success=1 + break + ;; + 4) + wi=$(get "wi" "$xml") + get_xml "$xml" "wi=$wi" + if [[ $(get "response" "$xml") =~ 0|2 ]]; then + success=1 + break + else + continue + fi + ;; + *) + continue + ;; + esac + done + + [[ -z "$success" ]] && exit_with_error "no valid response for identifier(s) $idents" + + if [[ -n "$save_xml" ]]; then + [[ -z "$md5" ]] && exit_with_error "Save XML (-X) only works with a defined MD5 (-C MD5)" + cp "$xml" "$save_xml/$md5.xml" + fi if [[ -n "$debug" ]]; then cat "$xml" fi - if [[ -n "$build_filename" ]]; then - build_filename "$xml" - - elif [[ -n "$build_sql" ]]; then - build_sql "$db" "$md5" "$xml" - elif [[ -n "$build_cdf" ]]; then - build_cdf "$db" "$md5" "$xml" + if [[ -n "$build_csv" ]]; then + build_csv "$db" "$md5" "$xml" else show_data "$request" fi @@ -179,7 +181,7 @@ get_xml () { xml="$1" shift query="$*" - $torsocks "$curl" -s "${oclc}?summary=true&${query}" --output "$xml" + $torsocks "$curl" -s "${oclc}?summary=false&${query}" --output "$xml" } get () { @@ -191,7 +193,7 @@ get () { $xq "$xml" -e "${API[$parameter]}"|eval "$filter" } -get_identifier () { +get_identifiers () { db="$1" md5="$2" @@ -201,7 +203,7 @@ get_identifier () { ) sql="${sql_identifier[$db]}" - dbx "$db" "$sql"|cut -d ',' -f 1 + dbx "$db" "$sql" } show_data () { @@ -214,38 +216,7 @@ show_data () { done } -build_filename () { - xml="$1" - - dirname=$(get "ddc" "$xml") - filename=$(get "author" "$xml" "${filters['filename']}")${separator}$(get "title" "$xml" "${filters['filename']}") - echo "${dirname}/${filename}" -} - -build_sql () { - db="$1" - md5="$2" - xml="$3" - - for parameter in ddc lcc; do - data=$(get "$parameter" "$xml") - if [[ -n "$data" ]]; then - updates="${updates}${updates:+, }${parameter^^}='${data}'" - fi - done - - if [[ -n "$updates" ]]; then - if [ -n "$verbose" ]; then - echo '/*' - show_data "author 
title" - echo '*/' - fi - - echo "update ${tables[$db]} set $updates where md5='$md5';" - fi -} - -build_cdf () { +build_csv () { db="$1" md5="$2" xml="$3" @@ -254,15 +225,17 @@ build_cdf () { for parameter in ddc lcc nlm; do data=$(get "$parameter" "$xml") + updates+=",\"${data}\"" + done + + for parameter in fast author title; do + data=$(get "$parameter" "$xml" "base64 -w0") updates+=",${data}" done echo "$updates" } - - - cleanup () { base=$(basename "$xml") rm -f "$TMPDIR/$base" @@ -284,24 +257,24 @@ help () { -d show DDC -l show LCC -n show NLM + -f show FAST -a show Author -t show Title - -F create filename (DDC/Author-Title) + -o show OWI (OCLC works identifier) + -w show WI (OCLC works number) - -Q md5 create SQL to update database + -C md5 create CSV (MD5,DDC,LCC,NLM,FAST,AUTHOR,TITLE) use -D libgen/-D libgen_fiction to indicate database - use with -V to add SQL comments with publication author - and title + -X dir save OCLC XML response to \$dir/\$md5.xml + only works with a defined MD5 (-C MD5) + -D db define which database to use (libgen/libgen_fiction) -A show all available data for identifier - -o show OCLC work index (owi) - -V show labels - -S sep change separator used to build filename (default: $separator) -@ use torsocks to connect to the OCLC classify service. 
use this to avoid getting your IP blocked by OCLC @@ -316,19 +289,27 @@ help () { DDC: 321.07 LCC: JC71 - $ classify -Q 25b8ce971343e85dbdc3fa375804b538 0199535760 - update updated set DDC='321.07', LCC='JC71' where md5='25b8ce971343e85dbdc3fa375804b538'; + $ classify -D libgen -C 25b8ce971343e85dbdc3fa375804b538 + 25b8ce971343e85dbdc3fa375804b538,"321.07","JC71","",UG9saXRpY2FsI\ + HNjaWVuY2UsVXRvcGlhcyxKdXN0aWNlLEV0aGljcyxQb2xpdGljYWwgZXRoaWNzLFB\ + oaWxvc29waHksRW5nbGlzaCBsYW5ndWFnZSxUaGVzYXVyaQo=,UGxhdG8gfCBKb3dl\ + dHQsIEJlbmphbWluLCAxODE3LTE4OTMgW1RyYW5zbGF0b3I7IEVkaXRvcjsgT3RoZX\ + JdIHwgV2F0ZXJmaWVsZCwgUm9iaW4sIDE5NTItIFtUcmFuc2xhdG9yOyBXcml0ZXIg\ + b2YgYWRkZWQgdGV4dDsgRWRpdG9yOyBPdGhlcl0gfCBMZWUsIEguIEQuIFAuIDE5MD\ + gtMTk5MyBbVHJhbnNsYXRvcjsgRWRpdG9yOyBBdXRob3Igb2YgaW50cm9kdWN0aW9u\ + XSB8IFNob3JleSwgUGF1bCwgMTg1Ny0xOTM0IFtUcmFuc2xhdG9yOyBBdXRob3I7IE\ + 90aGVyXSB8IFJlZXZlLCBDLiBELiBDLiwgMTk0OC0gW1RyYW5zbGF0b3I7IEVkaXRv\ + cjsgT3RoZXJdCg==,VGhlIHJlcHVibGljCg== Classifying libgen/libgen_fiction - This tool can be used to add DDC and LCC classification data - to libgen and libgen_fiction databases. It does not directy - modify the database, instead producing SQL code which can be - used to apply the modifications. The best way to do this is - to produce a list of md5 hashes for publications which do - have Identifier values but lack values for DDC and/or LCC. Such - lists can be produced by the following SQL: + This tool can be used to add classification data to libgen and + libgen_fiction databases. It does not directly modify the database, + instead producing CSV which can be used to apply the modifications. + The best way to do this is to produce a list of md5 hashes for + publications which do have Identifier values but lack values for DDC + and/or LCC. 
Such lists can be produced by the following SQL: libgen: select md5 from updated where IdentifierWODash<>"" and DDC=""; libgen_fiction: select md5 from fiction where Identifier<>"" and DDC=""; @@ -343,6 +324,18 @@ help () { your IP being blocked. The OCLC classification service is not run as a production service (I asked them). + Return values are stored in the following order: + + MD5,DDC,LCC,NLM,FAST,AUTHOR,TITLE + + DDC, LCC and NLM are enclosed within double quotes and can contain + multiple space-separated values. FAST, AUTHOR and TITLE are base64 encoded + since these fields can contain a whole host of unwholesome characters + which can mess up CSV. The AUTHOR field decodes to a pipe ('|') separated + list of authors in the format: + + LAST_NAME, NAME_OR_INITIALS, DATE_OF_BIRTH-[DATE_OF_DEATH] [[ROLE[[;ROLE]...]]] + EOHELP }