Remove SQL generation, add XML save option, implement multiple identifier search, add CSV generation, remove filename generation
This commit is contained in:
parent
2bd34ab5c4
commit
7da142f921
1 changed file with 105 additions and 112 deletions
217
classify
217
classify
|
@ -1,7 +1,7 @@
|
||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
#shellcheck disable=SC2034,SC1090
|
#shellcheck disable=SC2034,SC1090
|
||||||
#
|
#
|
||||||
# classify - return DDC, LCC for ISBN or MD5 (from libgen/libgen_fiction)
|
# classify - return classification data for ISBN (etc.) or MD5 (from libgen/libgen_fiction)
|
||||||
|
|
||||||
shopt -s extglob
|
shopt -s extglob
|
||||||
trap "trap_error" TERM
|
trap "trap_error" TERM
|
||||||
|
@ -28,6 +28,8 @@ main () {
|
||||||
declare -A API=(
|
declare -A API=(
|
||||||
[response]='/classify/response/@code'
|
[response]='/classify/response/@code'
|
||||||
[owi]='/classify/works/work[1]/@owi'
|
[owi]='/classify/works/work[1]/@owi'
|
||||||
|
[wi]='/classify/works/work[1]/@wi'
|
||||||
|
[fast]='join(/classify/recommendations/fast/headings/heading,",")'
|
||||||
[ddc]='join(/classify/recommendations/ddc/mostPopular/@nsfa)'
|
[ddc]='join(/classify/recommendations/ddc/mostPopular/@nsfa)'
|
||||||
[lcc]='join(/classify/recommendations/lcc/mostPopular/@nsfa)'
|
[lcc]='join(/classify/recommendations/lcc/mostPopular/@nsfa)'
|
||||||
[nlm]='join(/classify/recommendations/nlm/mostPopular/@sfa)'
|
[nlm]='join(/classify/recommendations/nlm/mostPopular/@sfa)'
|
||||||
|
@ -48,7 +50,6 @@ main () {
|
||||||
curl=$(find_tool "curl")
|
curl=$(find_tool "curl")
|
||||||
xq="$xidel -s"
|
xq="$xidel -s"
|
||||||
|
|
||||||
separator="-"
|
|
||||||
request=""
|
request=""
|
||||||
|
|
||||||
TMPDIR="/tmp"
|
TMPDIR="/tmp"
|
||||||
|
@ -57,11 +58,14 @@ main () {
|
||||||
# source config file if it exists
|
# source config file if it exists
|
||||||
[[ -f ${config} ]] && source "${config}"
|
[[ -f ${config} ]] && source "${config}"
|
||||||
|
|
||||||
while getopts "odlnatS:FVAD:Q:C:G@h" OPTION; do
|
while getopts "owdlnfatVAD:C:X:G@h" OPTION; do
|
||||||
case $OPTION in
|
case $OPTION in
|
||||||
o)
|
o)
|
||||||
request="$request owi"
|
request="$request owi"
|
||||||
;;
|
;;
|
||||||
|
w)
|
||||||
|
request="$request wi"
|
||||||
|
;;
|
||||||
d)
|
d)
|
||||||
request="$request ddc"
|
request="$request ddc"
|
||||||
;;
|
;;
|
||||||
|
@ -71,40 +75,34 @@ main () {
|
||||||
n)
|
n)
|
||||||
request="$request nlm"
|
request="$request nlm"
|
||||||
;;
|
;;
|
||||||
|
f)
|
||||||
|
request="$request fast"
|
||||||
|
;;
|
||||||
a)
|
a)
|
||||||
request="$request author"
|
request="$request author"
|
||||||
;;
|
;;
|
||||||
t)
|
t)
|
||||||
request="$request title"
|
request="$request title"
|
||||||
;;
|
;;
|
||||||
S)
|
|
||||||
separator="$OPTARG"
|
|
||||||
;;
|
|
||||||
V)
|
V)
|
||||||
verbose=1
|
verbose=1
|
||||||
;;
|
;;
|
||||||
F)
|
|
||||||
build_filename=1
|
|
||||||
;;
|
|
||||||
D)
|
D)
|
||||||
db="$OPTARG"
|
db="$OPTARG"
|
||||||
;;
|
;;
|
||||||
Q)
|
|
||||||
[ -z "$db" ] && exit_with_error "use -D to define which database to use"
|
|
||||||
build_sql=1
|
|
||||||
md5="$OPTARG"
|
|
||||||
isbn=$(get_identifier "$db" "$md5")
|
|
||||||
[ -z "$isbn" ] && exit_with_error "no identifier found in $db for MD5 = $md5"
|
|
||||||
;;
|
|
||||||
C)
|
C)
|
||||||
[ -z "$db" ] && exit_with_error "use -D to define which database to use"
|
[ -z "$db" ] && exit_with_error "use -D to define which database to use"
|
||||||
build_cdf=1
|
build_csv=1
|
||||||
md5="$OPTARG"
|
md5="$OPTARG"
|
||||||
isbn=$(get_identifier "$db" "$md5")
|
idents=$(get_identifiers "$db" "$md5")
|
||||||
[ -z "$isbn" ] && exit_with_error "no identifier found in $db for MD5 = $md5"
|
[ -z "$idents" ] && exit_with_error "no identifier found in $db for MD5 = $md5"
|
||||||
|
;;
|
||||||
|
X)
|
||||||
|
save_xml="$OPTARG"
|
||||||
|
[[ ! -d "$save_xml" ]] && exit_with_error "Save XML (-X $OPTARG): directory does not exist?"
|
||||||
;;
|
;;
|
||||||
A)
|
A)
|
||||||
request="author title owi ddc lcc nlm"
|
request="author title fast owi wi ddc lcc nlm"
|
||||||
verbose=1
|
verbose=1
|
||||||
;;
|
;;
|
||||||
G)
|
G)
|
||||||
|
@ -125,51 +123,55 @@ main () {
|
||||||
done
|
done
|
||||||
|
|
||||||
shift $((OPTIND-1))
|
shift $((OPTIND-1))
|
||||||
[ -z "$isbn" ] && isbn="$1"
|
[ -z "$idents" ] && idents="$1"
|
||||||
|
|
||||||
get_xml "$xml" "stdnbr=$isbn"
|
IFS=',' read -ra idarr <<< "$idents"
|
||||||
response=$(get "response" "$xml")
|
|
||||||
|
|
||||||
case "$response" in
|
for ident in "${idarr[@]}"; do
|
||||||
0)
|
|
||||||
true
|
[[ -n "$debug" ]] && echo "trying $ident..."
|
||||||
;;
|
|
||||||
2)
|
get_xml "$xml" "stdnbr=$ident"
|
||||||
true
|
response=$(get "response" "$xml")
|
||||||
;;
|
|
||||||
4)
|
case "$response" in
|
||||||
owi=$(get "owi" "$xml")
|
0)
|
||||||
get_xml "$xml" "owi=$owi"
|
success=1
|
||||||
;;
|
break
|
||||||
100)
|
;;
|
||||||
[[ $build_sql ]] && echo "-- $md5: no input"
|
2)
|
||||||
exit_with_error "no input"
|
success=1
|
||||||
;;
|
break
|
||||||
101)
|
;;
|
||||||
[[ $build_sql ]] && echo "-- $md5: invalid input"
|
4)
|
||||||
exit_with_error "invalid input"
|
wi=$(get "wi" "$xml")
|
||||||
;;
|
get_xml "$xml" "wi=$wi"
|
||||||
102)
|
if [[ $(get "response" "$xml") =~ 0|2 ]]; then
|
||||||
[[ $build_sql ]] && echo "-- $md5: not found"
|
success=1
|
||||||
exit_with_error "not found"
|
break
|
||||||
;;
|
else
|
||||||
200)
|
continue
|
||||||
[[ $build_sql ]] && echo "-- $md5: unexpected error"
|
fi
|
||||||
exit_with_error "unexpected error"
|
;;
|
||||||
;;
|
*)
|
||||||
esac
|
continue
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
[[ -z "$success" ]] && exit_with_error "no valid response for identifier(s) $idents"
|
||||||
|
|
||||||
|
if [[ -n "$save_xml" ]]; then
|
||||||
|
[[ -z "$md5" ]] && exit_with_error "Save XML (-X) only works with a defined MD5 (-C MD5)"
|
||||||
|
cp "$xml" "$save_xml/$md5.xml"
|
||||||
|
fi
|
||||||
|
|
||||||
if [[ -n "$debug" ]]; then
|
if [[ -n "$debug" ]]; then
|
||||||
cat "$xml"
|
cat "$xml"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ -n "$build_filename" ]]; then
|
if [[ -n "$build_csv" ]]; then
|
||||||
build_filename "$xml"
|
build_csv "$db" "$md5" "$xml"
|
||||||
|
|
||||||
elif [[ -n "$build_sql" ]]; then
|
|
||||||
build_sql "$db" "$md5" "$xml"
|
|
||||||
elif [[ -n "$build_cdf" ]]; then
|
|
||||||
build_cdf "$db" "$md5" "$xml"
|
|
||||||
else
|
else
|
||||||
show_data "$request"
|
show_data "$request"
|
||||||
fi
|
fi
|
||||||
|
@ -179,7 +181,7 @@ get_xml () {
|
||||||
xml="$1"
|
xml="$1"
|
||||||
shift
|
shift
|
||||||
query="$*"
|
query="$*"
|
||||||
$torsocks "$curl" -s "${oclc}?summary=true&${query}" --output "$xml"
|
$torsocks "$curl" -s "${oclc}?summary=false&${query}" --output "$xml"
|
||||||
}
|
}
|
||||||
|
|
||||||
get () {
|
get () {
|
||||||
|
@ -191,7 +193,7 @@ get () {
|
||||||
$xq "$xml" -e "${API[$parameter]}"|eval "$filter"
|
$xq "$xml" -e "${API[$parameter]}"|eval "$filter"
|
||||||
}
|
}
|
||||||
|
|
||||||
get_identifier () {
|
get_identifiers () {
|
||||||
db="$1"
|
db="$1"
|
||||||
md5="$2"
|
md5="$2"
|
||||||
|
|
||||||
|
@ -201,7 +203,7 @@ get_identifier () {
|
||||||
)
|
)
|
||||||
|
|
||||||
sql="${sql_identifier[$db]}"
|
sql="${sql_identifier[$db]}"
|
||||||
dbx "$db" "$sql"|cut -d ',' -f 1
|
dbx "$db" "$sql"
|
||||||
}
|
}
|
||||||
|
|
||||||
show_data () {
|
show_data () {
|
||||||
|
@ -214,38 +216,7 @@ show_data () {
|
||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
build_filename () {
|
build_csv () {
|
||||||
xml="$1"
|
|
||||||
|
|
||||||
dirname=$(get "ddc" "$xml")
|
|
||||||
filename=$(get "author" "$xml" "${filters['filename']}")${separator}$(get "title" "$xml" "${filters['filename']}")
|
|
||||||
echo "${dirname}/${filename}"
|
|
||||||
}
|
|
||||||
|
|
||||||
build_sql () {
|
|
||||||
db="$1"
|
|
||||||
md5="$2"
|
|
||||||
xml="$3"
|
|
||||||
|
|
||||||
for parameter in ddc lcc; do
|
|
||||||
data=$(get "$parameter" "$xml")
|
|
||||||
if [[ -n "$data" ]]; then
|
|
||||||
updates="${updates}${updates:+, }${parameter^^}='${data}'"
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
if [[ -n "$updates" ]]; then
|
|
||||||
if [ -n "$verbose" ]; then
|
|
||||||
echo '/*'
|
|
||||||
show_data "author title"
|
|
||||||
echo '*/'
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "update ${tables[$db]} set $updates where md5='$md5';"
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
build_cdf () {
|
|
||||||
db="$1"
|
db="$1"
|
||||||
md5="$2"
|
md5="$2"
|
||||||
xml="$3"
|
xml="$3"
|
||||||
|
@ -254,15 +225,17 @@ build_cdf () {
|
||||||
|
|
||||||
for parameter in ddc lcc nlm; do
|
for parameter in ddc lcc nlm; do
|
||||||
data=$(get "$parameter" "$xml")
|
data=$(get "$parameter" "$xml")
|
||||||
|
updates+=",\"${data}\""
|
||||||
|
done
|
||||||
|
|
||||||
|
for parameter in fast author title; do
|
||||||
|
data=$(get "$parameter" "$xml" "base64 -w0")
|
||||||
updates+=",${data}"
|
updates+=",${data}"
|
||||||
done
|
done
|
||||||
|
|
||||||
echo "$updates"
|
echo "$updates"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cleanup () {
|
cleanup () {
|
||||||
base=$(basename "$xml")
|
base=$(basename "$xml")
|
||||||
rm -f "$TMPDIR/$base"
|
rm -f "$TMPDIR/$base"
|
||||||
|
@ -284,24 +257,24 @@ help () {
|
||||||
-d show DDC
|
-d show DDC
|
||||||
-l show LCC
|
-l show LCC
|
||||||
-n show NLM
|
-n show NLM
|
||||||
|
-f show FAST
|
||||||
-a show Author
|
-a show Author
|
||||||
-t show Title
|
-t show Title
|
||||||
|
|
||||||
-F create filename (DDC/Author-Title)
|
-o show OWI (OCLC works identifier)
|
||||||
|
-w show WI (OCLC works number)
|
||||||
|
|
||||||
-Q md5 create SQL to update database
|
-C md5 create CSV (MD5,DDC,LCC,NLM,FAST,AUTHOR,TITLE)
|
||||||
use -D libgen/-D libgen_fiction to indicate database
|
use -D libgen/-D libgen_fiction to indicate database
|
||||||
use with -V to add SQL comments with publication author
|
|
||||||
and title
|
-X dir save OCLC XML response to \$dir/\$md5.xml
|
||||||
|
only works with a defined MD5 (-C MD5)
|
||||||
|
|
||||||
-D db define which database to use (libgen/libgen_fiction)
|
-D db define which database to use (libgen/libgen_fiction)
|
||||||
|
|
||||||
-A show all available data for identifier
|
-A show all available data for identifier
|
||||||
|
|
||||||
-o show OCLC work index (owi)
|
|
||||||
|
|
||||||
-V show labels
|
-V show labels
|
||||||
-S sep change separator used to build filename (default: $separator)
|
|
||||||
|
|
||||||
-@ use torsocks to connect to the OCLC classify service.
|
-@ use torsocks to connect to the OCLC classify service.
|
||||||
use this to avoid getting your IP blocked by OCLC
|
use this to avoid getting your IP blocked by OCLC
|
||||||
|
@ -316,19 +289,27 @@ help () {
|
||||||
DDC: 321.07
|
DDC: 321.07
|
||||||
LCC: JC71
|
LCC: JC71
|
||||||
|
|
||||||
$ classify -Q 25b8ce971343e85dbdc3fa375804b538 0199535760
|
$ classify -D libgen -C 25b8ce971343e85dbdc3fa375804b538
|
||||||
update updated set DDC='321.07', LCC='JC71' where md5='25b8ce971343e85dbdc3fa375804b538';
|
25b8ce971343e85dbdc3fa375804b538,"321.07","JC71","",UG9saXRpY2FsI\
|
||||||
|
HNjaWVuY2UsVXRvcGlhcyxKdXN0aWNlLEV0aGljcyxQb2xpdGljYWwgZXRoaWNzLFB\
|
||||||
|
oaWxvc29waHksRW5nbGlzaCBsYW5ndWFnZSxUaGVzYXVyaQo=,UGxhdG8gfCBKb3dl\
|
||||||
|
dHQsIEJlbmphbWluLCAxODE3LTE4OTMgW1RyYW5zbGF0b3I7IEVkaXRvcjsgT3RoZX\
|
||||||
|
JdIHwgV2F0ZXJmaWVsZCwgUm9iaW4sIDE5NTItIFtUcmFuc2xhdG9yOyBXcml0ZXIg\
|
||||||
|
b2YgYWRkZWQgdGV4dDsgRWRpdG9yOyBPdGhlcl0gfCBMZWUsIEguIEQuIFAuIDE5MD\
|
||||||
|
gtMTk5MyBbVHJhbnNsYXRvcjsgRWRpdG9yOyBBdXRob3Igb2YgaW50cm9kdWN0aW9u\
|
||||||
|
XSB8IFNob3JleSwgUGF1bCwgMTg1Ny0xOTM0IFtUcmFuc2xhdG9yOyBBdXRob3I7IE\
|
||||||
|
90aGVyXSB8IFJlZXZlLCBDLiBELiBDLiwgMTk0OC0gW1RyYW5zbGF0b3I7IEVkaXRv\
|
||||||
|
cjsgT3RoZXJdCg==,VGhlIHJlcHVibGljCg==
|
||||||
|
|
||||||
|
|
||||||
Classifying libgen/libgen_fiction
|
Classifying libgen/libgen_fiction
|
||||||
|
|
||||||
This tool can be used to add DDC and LCC classification data
|
This tool can be used to add classification data to libgen and
|
||||||
to libgen and libgen_fiction databases. It does not directly
|
libgen_fiction databases. It does not directly modify the database,
|
||||||
modify the database, instead producing SQL code which can be
|
instead producing CSV which can be used to apply the modifications.
|
||||||
used to apply the modifications. The best way to do this is
|
The best way to do this is to produce a list of md5 hashes for
|
||||||
to produce a list of md5 hashes for publications which do
|
publications which do have Identifier values but lack values for DDC
|
||||||
have Identifier values but lack values for DDC and/or LCC. Such
|
and/or LCC. Such lists can be produced by the following SQL:
|
||||||
lists can be produced by the following SQL:
|
|
||||||
|
|
||||||
libgen: select md5 from updated where IdentifierWODash<>"" and DDC="";
|
libgen: select md5 from updated where IdentifierWODash<>"" and DDC="";
|
||||||
libgen_fiction: select md5 from fiction where Identifier<>"" and DDC="";
|
libgen_fiction: select md5 from fiction where Identifier<>"" and DDC="";
|
||||||
|
@ -343,6 +324,18 @@ help () {
|
||||||
your IP being blocked. The OCLC classification service is not
|
your IP being blocked. The OCLC classification service is not
|
||||||
run as a production service (I asked them).
|
run as a production service (I asked them).
|
||||||
|
|
||||||
|
Return values are stored in the following order:
|
||||||
|
|
||||||
|
MD5,DDC,LCC,NLM,FAST,AUTHOR,TITLE
|
||||||
|
|
||||||
|
DDC, LCC and NLM are enclosed within double quotes and can contain
|
||||||
|
multiple space-separated values. FAST, AUTHOR and TITLE are base64 encoded
|
||||||
|
since these fields can contain a whole host of unwholesome characters
|
||||||
|
which can mess up CSV. The AUTHOR field decodes to a pipe ('|') separated
|
||||||
|
list of authors in the format:
|
||||||
|
|
||||||
|
LAST_NAME, NAME_OR_INITIALS, DATE_OF_BIRTH-[DATE_OF_DEATH] [[ROLE[[;ROLE]...]]]
|
||||||
|
|
||||||
EOHELP
|
EOHELP
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue