Remove SQL generation, add XML save option, implement multiple identifier search, add CSV generation, remove filename generation

Author: Yetangitu
Date: 2021-05-16 14:23:57 +00:00
parent 2bd34ab5c4
commit 7da142f921
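Illustrative usage of the reworked interface (a sketch only: the identifier list and the -X directory below are examples, and -C requires an MD5 present in the selected database):

    $ classify -A 0199535760,9780199535767
    $ classify -D libgen -C 25b8ce971343e85dbdc3fa375804b538 -X /tmp/oclc-xml

The first call splits the comma-separated identifier list and queries the OCLC classify service for each identifier in turn until it gets a usable response; the second emits a CSV row (MD5,DDC,LCC,NLM,FAST,AUTHOR,TITLE) for a libgen record and, with -X, keeps the raw XML response as /tmp/oclc-xml/25b8ce971343e85dbdc3fa375804b538.xml (the directory must already exist).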

--- a/classify
+++ b/classify

@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 #shellcheck disable=SC2034,SC1090
 #
-# classify - return DDC, LCC for ISBN or MD5 (from libgen/libgen_fiction)
+# classify - return classification data for ISBN (etc.) or MD5 (from libgen/libgen_fiction)
 
 shopt -s extglob
 trap "trap_error" TERM
@@ -28,6 +28,8 @@ main () {
 declare -A API=(
 [response]='/classify/response/@code'
 [owi]='/classify/works/work[1]/@owi'
+[wi]='/classify/works/work[1]/@wi'
+[fast]='join(/classify/recommendations/fast/headings/heading,",")'
 [ddc]='join(/classify/recommendations/ddc/mostPopular/@nsfa)'
 [lcc]='join(/classify/recommendations/lcc/mostPopular/@nsfa)'
 [nlm]='join(/classify/recommendations/nlm/mostPopular/@sfa)'
@@ -48,7 +50,6 @@ main () {
 curl=$(find_tool "curl")
 xq="$xidel -s"
-separator="-"
 request=""
 TMPDIR="/tmp"
@@ -57,11 +58,14 @@ main () {
 # source config file if it exists
 [[ -f ${config} ]] && source "${config}"
-while getopts "odlnatS:FVAD:Q:C:G@h" OPTION; do
+while getopts "owdlnfatVAD:C:X:G@h" OPTION; do
 case $OPTION in
 o)
 request="$request owi"
 ;;
+w)
+request="$request wi"
+;;
 d)
 request="$request ddc"
 ;;
@@ -71,40 +75,34 @@ main () {
 n)
 request="$request nlm"
 ;;
+f)
+request="$request fast"
+;;
 a)
 request="$request author"
 ;;
 t)
 request="$request title"
 ;;
-S)
-separator="$OPTARG"
-;;
 V)
 verbose=1
 ;;
-F)
-build_filename=1
-;;
 D)
 db="$OPTARG"
 ;;
-Q)
-[ -z "$db" ] && exit_with_error "use -D to define which database to use"
-build_sql=1
-md5="$OPTARG"
-isbn=$(get_identifier "$db" "$md5")
-[ -z "$isbn" ] && exit_with_error "no identifier found in $db for MD5 = $md5"
-;;
 C)
 [ -z "$db" ] && exit_with_error "use -D to define which database to use"
-build_cdf=1
+build_csv=1
 md5="$OPTARG"
-isbn=$(get_identifier "$db" "$md5")
-[ -z "$isbn" ] && exit_with_error "no identifier found in $db for MD5 = $md5"
+idents=$(get_identifiers "$db" "$md5")
+[ -z "$idents" ] && exit_with_error "no identifier found in $db for MD5 = $md5"
+;;
+X)
+save_xml="$OPTARG"
+[[ ! -d "$save_xml" ]] && exit_with_error "Save XML (-X $OPTARG): directory does not exist?"
 ;;
 A)
-request="author title owi ddc lcc nlm"
+request="author title fast owi wi ddc lcc nlm"
 verbose=1
 ;;
 G)
@@ -125,51 +123,55 @@ main () {
 done
 shift $((OPTIND-1))
-[ -z "$isbn" ] && isbn="$1"
+[ -z "$idents" ] && idents="$1"
 
-get_xml "$xml" "stdnbr=$isbn"
-response=$(get "response" "$xml")
-case "$response" in
-0)
-true
-;;
-2)
-true
-;;
-4)
-owi=$(get "owi" "$xml")
-get_xml "$xml" "owi=$owi"
-;;
-100)
-[[ $build_sql ]] && echo "-- $md5: no input"
-exit_with_error "no input"
-;;
-101)
-[[ $build_sql ]] && echo "-- $md5: invalid input"
-exit_with_error "invalid input"
-;;
-102)
-[[ $build_sql ]] && echo "-- $md5: not found"
-exit_with_error "not found"
-;;
-200)
-[[ $build_sql ]] && echo "-- $md5: unexpected error"
-exit_with_error "unexpected error"
-;;
-esac
+IFS=',' read -ra idarr <<< "$idents"
+
+for ident in "${idarr[@]}"; do
+
+[[ -n "$debug" ]] && echo "trying $ident..."
+
+get_xml "$xml" "stdnbr=$ident"
+response=$(get "response" "$xml")
+
+case "$response" in
+0)
+success=1
+break
+;;
+2)
+success=1
+break
+;;
+4)
+wi=$(get "wi" "$xml")
+get_xml "$xml" "wi=$wi"
+if [[ $(get "response" "$xml") =~ 0|2 ]]; then
+success=1
+break
+else
+continue
+fi
+;;
+*)
+continue
+;;
+esac
+done
+
+[[ -z "$success" ]] && exit_with_error "no valid response for identifier(s) $idents"
+
+if [[ -n "$save_xml" ]]; then
+[[ -z "$md5" ]] && exit_with_error "Save XML (-X) only works with a defined MD5 (-C MD5)"
+cp "$xml" "$save_xml/$md5.xml"
+fi
+
 if [[ -n "$debug" ]]; then
 cat "$xml"
 fi
 
-if [[ -n "$build_filename" ]]; then
-build_filename "$xml"
-elif [[ -n "$build_sql" ]]; then
-build_sql "$db" "$md5" "$xml"
-elif [[ -n "$build_cdf" ]]; then
-build_cdf "$db" "$md5" "$xml"
+if [[ -n "$build_csv" ]]; then
+build_csv "$db" "$md5" "$xml"
 else
 show_data "$request"
 fi
@@ -179,7 +181,7 @@ get_xml () {
 xml="$1"
 shift
 query="$*"
-$torsocks "$curl" -s "${oclc}?summary=true&${query}" --output "$xml"
+$torsocks "$curl" -s "${oclc}?summary=false&${query}" --output "$xml"
 }
 
 get () {
@@ -191,7 +193,7 @@ get () {
 $xq "$xml" -e "${API[$parameter]}"|eval "$filter"
 }
 
-get_identifier () {
+get_identifiers () {
 db="$1"
 md5="$2"
@@ -201,7 +203,7 @@ get_identifier () {
 )
 sql="${sql_identifier[$db]}"
-dbx "$db" "$sql"|cut -d ',' -f 1
+dbx "$db" "$sql"
 }
 
 show_data () {
@@ -214,38 +216,7 @@ show_data () {
 done
 }
 
-build_filename () {
-xml="$1"
-dirname=$(get "ddc" "$xml")
-filename=$(get "author" "$xml" "${filters['filename']}")${separator}$(get "title" "$xml" "${filters['filename']}")
-echo "${dirname}/${filename}"
-}
-
-build_sql () {
-db="$1"
-md5="$2"
-xml="$3"
-for parameter in ddc lcc; do
-data=$(get "$parameter" "$xml")
-if [[ -n "$data" ]]; then
-updates="${updates}${updates:+, }${parameter^^}='${data}'"
-fi
-done
-if [[ -n "$updates" ]]; then
-if [ -n "$verbose" ]; then
-echo '/*'
-show_data "author title"
-echo '*/'
-fi
-echo "update ${tables[$db]} set $updates where md5='$md5';"
-fi
-}
-
-build_cdf () {
+build_csv () {
 db="$1"
 md5="$2"
 xml="$3"
@@ -254,15 +225,17 @@ build_cdf () {
 for parameter in ddc lcc nlm; do
 data=$(get "$parameter" "$xml")
+updates+=",\"${data}\""
+done
+for parameter in fast author title; do
+data=$(get "$parameter" "$xml" "base64 -w0")
 updates+=",${data}"
 done
 echo "$updates"
 }
 
 cleanup () {
 base=$(basename "$xml")
 rm -f "$TMPDIR/$base"
@@ -284,24 +257,24 @@ help () {
 -d show DDC
 -l show LCC
 -n show NLM
+-f show FAST
 -a show Author
 -t show Title
--F create filename (DDC/Author-Title)
--Q md5 create SQL to update database
+-o show OWI (OCLC works identifier)
+-w show WI (OCLC works number)
+-C md5 create CSV (MD5,DDC,LCC,NLM,FAST,AUTHOR,TITLE)
 use -D libgen/-D libgen_fiction to indicate database
-use with -V to add SQL comments with publication author
-and title
+-X dir save OCLC XML response to \$dir/\$md5.xml
+only works with a defined MD5 (-C MD5)
 -D db define which database to use (libgen/libgen_fiction)
 -A show all available data for identifier
--o show OCLC work index (owi)
 -V show labels
--S sep change separator used to build filename (default: $separator)
 -@ use torsocks to connect to the OCLC classify service.
 use this to avoid getting your IP blocked by OCLC
@@ -316,19 +289,27 @@ help () {
 DDC: 321.07
 LCC: JC71
 
-$ classify -Q 25b8ce971343e85dbdc3fa375804b538 0199535760
-update updated set DDC='321.07', LCC='JC71' where md5='25b8ce971343e85dbdc3fa375804b538';
+$ classify -D libgen -C 25b8ce971343e85dbdc3fa375804b538
+25b8ce971343e85dbdc3fa375804b538,"321.07","JC71","",UG9saXRpY2FsI\
+HNjaWVuY2UsVXRvcGlhcyxKdXN0aWNlLEV0aGljcyxQb2xpdGljYWwgZXRoaWNzLFB\
+oaWxvc29waHksRW5nbGlzaCBsYW5ndWFnZSxUaGVzYXVyaQo=,UGxhdG8gfCBKb3dl\
+dHQsIEJlbmphbWluLCAxODE3LTE4OTMgW1RyYW5zbGF0b3I7IEVkaXRvcjsgT3RoZX\
+JdIHwgV2F0ZXJmaWVsZCwgUm9iaW4sIDE5NTItIFtUcmFuc2xhdG9yOyBXcml0ZXIg\
+b2YgYWRkZWQgdGV4dDsgRWRpdG9yOyBPdGhlcl0gfCBMZWUsIEguIEQuIFAuIDE5MD\
+gtMTk5MyBbVHJhbnNsYXRvcjsgRWRpdG9yOyBBdXRob3Igb2YgaW50cm9kdWN0aW9u\
+XSB8IFNob3JleSwgUGF1bCwgMTg1Ny0xOTM0IFtUcmFuc2xhdG9yOyBBdXRob3I7IE\
+90aGVyXSB8IFJlZXZlLCBDLiBELiBDLiwgMTk0OC0gW1RyYW5zbGF0b3I7IEVkaXRv\
+cjsgT3RoZXJdCg==,VGhlIHJlcHVibGljCg==
 
 Classifying libgen/libgen_fiction
 
-This tool can be used to add DDC and LCC classification data
-to libgen and libgen_fiction databases. It does not directy
-modify the database, instead producing SQL code which can be
-used to apply the modifications. The best way to do this is
-to produce a list of md5 hashes for publications which do
-have Identifier values but lack values for DDC and/or LCC. Such
-lists can be produced by the following SQL:
+This tool can be used to add classification data to libgen and
+libgen_fiction databases. It does not directy modify the database,
+instead producing CSV which can be used to apply the modifications.
+The best way to do this is to produce a list of md5 hashes for
+publications which do have Identifier values but lack values for DDC
+and/or LCC. Such lists can be produced by the following SQL:
 
 libgen: select md5 from updated where IdentifierWODash<>"" and DDC="";
 libgen_fiction: select md5 from fiction where Identifier<>"" and DDC="";
@@ -343,6 +324,18 @@ help () {
 your IP being blocked. The OCLC classification service is not
 run as a production service (I asked them).
 
+Return values are stored in the following order:
+
+MD5,DDC,LCC,NLM,FAST,AUTHOR,TITLE
+
+DDC, LCC and NLM are enclosed within double quotes and can contain
+multiple space-separated values. FAST, AUTHOR and TITLE are base64 encoded
+since these fields can contain a whole host of unwholesome characters
+which can mess up CSV. The AUTHOR field decodes to a pipe ('|') separated
+list of authors in the format:
+
+LAST_NAME, NAME_OR_INITIALS, DATE_OF_BIRTH-[DATE_OF_DEATH] [[ROLE[[;ROLE]...]]]
+
 EOHELP
 }
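The help text stops at describing the CSV layout; applying it to the database is left to the reader. A minimal post-processing sketch (not part of this commit) could turn classify -C output into UPDATE statements for the libgen 'updated' table; the file name classify-output.csv and the choice to apply only DDC/LCC are assumptions:

    #!/usr/bin/env bash
    # Sketch: convert classify CSV rows (MD5,DDC,LCC,NLM,FAST,AUTHOR,TITLE)
    # into SQL updates for the libgen 'updated' table.
    # The base64-encoded fields never contain commas, so splitting on ',' is safe.
    while IFS=',' read -r md5 ddc lcc nlm fast author title; do
        ddc=${ddc//\"/}   # DDC/LCC/NLM arrive wrapped in double quotes
        lcc=${lcc//\"/}
        set_clause=""
        [[ -n "$ddc" ]] && set_clause="DDC='$ddc'"
        [[ -n "$lcc" ]] && set_clause="${set_clause}${set_clause:+, }LCC='$lcc'"
        [[ -z "$set_clause" ]] && continue              # nothing to apply for this row
        title_plain=$(printf '%s' "$title" | base64 -d) # TITLE/AUTHOR/FAST are base64 encoded
        echo "-- $title_plain"
        echo "update updated set $set_clause where md5='$md5';"
    done < classify-output.csv

For libgen_fiction the same approach would target the fiction table instead.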