books/classify
Yetangitu 391b0189fb update_libgen: add -c option to integrate classify into the update process
config file: add these fields (values for demonstration purpose):
```
classify_xml="/home/username/Project/libgen_classify/xml"
classify_csv="/home/username/Project/libgen_classify/csv"
classify_sql="/home/username/Project/libgen_classify/sql"
classify_fields="ddc,lcc,nlm,fast,title,author"
classify_tor_ports="9100,9102,9104,9106,9108"
```
These fields are used by update_libgen to configure classify and import_metadata

All programs: the -@ switch now takes a mandatory parameter: -@ TORPORT
2021-06-01 14:12:09 +00:00

347 lines
8.1 KiB
Bash
Executable file

#!/usr/bin/env bash
#shellcheck disable=SC2034,SC1090
#
# classify - return classification data for ISBN (etc.) or MD5 (from libgen/libgen_fiction)

shopt -s extglob

# trap_error and trap_clean are not defined in this file; presumably they are
# provided by the books_functions library sourced below.
trap "trap_error" TERM
trap "trap_clean" EXIT

TOP_PID=$$
export TOP_PID

version="0.5.1"
release="20210601"

# The shared function library is expected to sit next to this script.
functions="$(dirname "$0")/books_functions"
if [ ! -f "$functions" ]; then
  echo "$functions not found"
  exit 1
fi
source "$functions"
# main - parse the command line, query the OCLC classify service for each
# candidate identifier until one yields a usable answer, then either print
# the requested fields or emit a CSV line (-C).
#
# Arguments: the script's command line ("$@"); after the options, $1 is a
#            comma-separated list of identifiers (unused when -C supplies them).
# Globals:   sets config, oclc, xidel, curl, xq, request, TMPDIR, xml and the
#            option flags (verbose, db, md5, build_csv, save_xml, debug,
#            torsocks); the API/filters/tables arrays are declared inside this
#            function and reach the helpers through dynamic scoping.
# Depends:   find_tool and exit_with_error are not defined in this file
#            (presumably they come from books_functions).
# Note:      -D must precede -C, because the identifier lookup runs while the
#            -C option is being parsed.
main () {
  # PREFERENCES
  config=${XDG_CONFIG_HOME:-$HOME/.config}/books.conf

  # OCLC classify API
  oclc="http://classify.oclc.org/classify2/Classify"

  # XPath expressions (evaluated by xidel) for every field that can be requested
  declare -A API=(
    [response]='/classify/response/@code'
    [owi]='/classify/works/work[1]/@owi'
    [wi]='/classify/works/work[1]/@wi'
    [fast]='join(/classify/recommendations/fast/headings/heading,",")'
    [ddc]='join(/classify/recommendations/ddc/mostPopular/@nsfa)'
    [lcc]='join(/classify/recommendations/lcc/mostPopular/@nsfa)'
    [nlm]='join(/classify/recommendations/nlm/mostPopular/@sfa)'
    [author]='/classify/work/@author'
    [authors]='join(/classify/authors/author," | ")'
    [title]='/classify/work/@title'
  )

  declare -A filters=(
    [filename]="sed -e 's/[^-[:alnum:]:;?!.,+@#%]/_/g;s/^\([-_]\)*//'"
  )

  declare -A tables=(
    [libgen]="updated"
    [libgen_fiction]="fiction"
  )

  xidel=$(find_tool "xidel")
  curl=$(find_tool "curl")
  xq="$xidel -s"

  request=""

  TMPDIR="/tmp"
  xml=$(mktemp -p "$TMPDIR" classify.XXXXX)

  # source config file if it exists
  [[ -f ${config} ]] && source "${config}"

  while getopts "owdlnfatVAD:C:X:G@:h" OPTION; do
    case $OPTION in
      o)
        request="$request owi"
        ;;
      w)
        request="$request wi"
        ;;
      d)
        request="$request ddc"
        ;;
      l)
        request="$request lcc"
        ;;
      n)
        request="$request nlm"
        ;;
      f)
        request="$request fast"
        ;;
      a)
        request="$request author"
        ;;
      t)
        request="$request title"
        ;;
      V)
        verbose=1
        ;;
      D)
        db="$OPTARG"
        ;;
      C)
        [ -z "$db" ] && exit_with_error "use -D to define which database to use"
        build_csv=1
        md5="$OPTARG"
        idents=$(get_identifiers "$db" "$md5")
        [ -z "$idents" ] && exit_with_error "no identifier found in $db for MD5 = $md5"
        ;;
      X)
        save_xml="$OPTARG"
        [[ ! -d "$save_xml" ]] && exit_with_error "Save XML (-X $OPTARG): directory does not exist?"
        ;;
      A)
        request="author title fast owi wi ddc lcc nlm"
        verbose=1
        ;;
      G)
        ((debug++))
        ;;
      @)
        torsocks=$(find_tool "torsocks")
        export TORSOCKS_TOR_PORT=${OPTARG}
        ;;
      h)
        help
        exit
        ;;
      *)
        exit_with_error "unknown option: $OPTION"
        ;;
    esac
  done

  shift $((OPTIND-1))

  # identifiers found through -C take precedence over the positional argument
  [ -z "$idents" ] && idents="$1"

  IFS=',' read -ra idarr <<< "$idents"

  # Try each identifier in turn and stop at the first usable response.
  for ident in "${idarr[@]}"; do
    [[ -n "$debug" ]] && echo "trying $ident..."

    get_xml "$xml" "stdnbr=${ident// }"
    response=$(get "response" "$xml")

    case "$response" in
      0|2)
        # single-work answer: the XML already holds the data
        success=1
        break
        ;;
      4)
        # multi-work answer: repeat the query for the first work listed
        wi=$(get "wi" "$xml")
        get_xml "$xml" "wi=$wi"
        # The pattern must be anchored: an unanchored 0|2 also matches the
        # error codes 100, 101, 102 and 200, which would be taken for success.
        if [[ $(get "response" "$xml") =~ ^(0|2)$ ]]; then
          success=1
          break
        fi
        ;;
    esac
    # any other response code: fall through to the next identifier
  done

  [[ -z "$success" ]] && exit_with_error "no valid response for identifier(s) $idents"

  if [[ -n "$save_xml" ]]; then
    [[ -z "$md5" ]] && exit_with_error "Save XML (-X) only works with a defined MD5 (-C MD5)"
    cp "$xml" "$save_xml/$md5.xml"
  fi

  if [[ -n "$debug" ]]; then
    cat "$xml"
  fi

  if [[ -n "$build_csv" ]]; then
    build_csv "$db" "$md5" "$xml"
  else
    show_data "$request"
  fi
}
# get_xml - fetch a response from the OCLC classify service into a file.
#
# Arguments:
#   $1     - path of the file that receives the response
#   $2...  - query string appended to the request URL (extra arguments are
#            joined with spaces)
# Globals: reads oclc, curl and torsocks; assigns xml and query.
get_xml () {
  xml="$1"
  query="${*:2}"
  local url="${oclc}?summary=false&${query}"

  # torsocks is deliberately left unquoted: when it is empty the word
  # disappears and curl is run directly.
  ${torsocks} "${curl}" -s "${url}" --output "${xml}"
}
# get - extract one field from a saved response.
#
# Arguments:
#   $1     - key into the API table of XPath expressions
#   $2     - path of the XML file to query
#   $3...  - optional shell command (joined into one string and eval'ed) that
#            post-processes the extracted value; defaults to a plain pass-through
# Globals: reads xq and API; assigns parameter, xml and filter.
# Outputs: the (filtered) value on stdout.
get () {
  parameter="$1"
  xml="$2"
  shift 2

  filter="$*"
  : "${filter:=cat -}"

  # xq is left unquoted on purpose: it holds the tool path plus its options
  $xq "$xml" -e "${API[$parameter]}" | eval "$filter"
}
# get_identifiers - look up the identifier field of a publication by MD5.
#
# Arguments:
#   $1 - database name: "libgen" or "libgen_fiction"
#   $2 - MD5 hash of the publication (32 hexadecimal digits)
# Outputs: whatever dbx prints for the query (dbx is not defined in this
#          file; presumably it comes from books_functions and runs the SQL).
# Returns: 1, without running any query, when the MD5 is malformed or the
#          database name is unknown; otherwise the exit status of dbx.
get_identifiers () {
  local db="$1"
  local md5="$2"
  local sql

  # The MD5 comes from the command line and is pasted into the SQL text
  # below, so accept nothing but a plain hexadecimal digest.
  if [[ ! "$md5" =~ ^[[:xdigit:]]{32}$ ]]; then
    printf 'invalid MD5: %s\n' "$md5" >&2
    return 1
  fi

  case "$db" in
    libgen)
      sql="select IdentifierWODash from updated where md5='${md5}';"
      ;;
    libgen_fiction)
      sql="select Identifier from fiction where md5='${md5}';"
      ;;
    *)
      # previously an unknown name produced an empty query
      printf 'unknown database: %s\n' "$db" >&2
      return 1
      ;;
  esac

  dbx "$db" "$sql"
}
# show_data - print every requested field that has a value, one per line.
#
# Arguments: the field names to show (joined and then split on whitespace).
# Globals:   reads xml and verbose; assigns request, parameter, data, legend.
# Outputs:   "VALUE", or "FIELD: VALUE" when verbose is set.
# Returns:   the status of the last test, i.e. non-zero when the last
#            requested field turned out to be empty.
show_data () {
  request="$*"

  # unquoted on purpose: request is a whitespace-separated list of field names
  for parameter in $request; do
    data=$(get "$parameter" "$xml")

    if [[ -n "$verbose" ]]; then
      legend="${parameter^^}: "
    fi

    # kept as a && list so that the function's exit status is unchanged
    [[ -n "$data" ]] && echo "${legend}${data}"
  done
}
# build_csv - emit one CSV line with the classification data for a publication.
#
# Arguments:
#   $1 - database name (stored, not otherwise used here)
#   $2 - MD5 hash, written as the first column
#   $3 - path of the XML response to read the fields from
# Outputs: MD5,"DDC","LCC","NLM",FAST,AUTHOR,TITLE on stdout; the first three
#          fields are wrapped in double quotes, the last three are passed
#          through base64 -w0.
build_csv () {
  db="$1"
  md5="$2"
  xml="$3"

  updates="${md5}"

  for parameter in ddc lcc nlm fast author title; do
    case "$parameter" in
      ddc|lcc|nlm)
        data=$(get "$parameter" "$xml")
        updates="${updates},\"${data}\""
        ;;
      *)
        data=$(get "$parameter" "$xml" "base64 -w0")
        updates="${updates},${data}"
        ;;
    esac
  done

  echo "$updates"
}
# cleanup - remove the temporary XML response file.
#
# Globals: reads xml (path of the temp file) and TMPDIR (its directory).
# Only the file's base name inside TMPDIR is removed. Nothing is done when
# xml is empty (e.g. the script exits before the temp file was created);
# without this guard rm would be pointed at the directory itself and
# complain that it cannot remove it.
cleanup () {
  [[ -n "$xml" ]] || return 0

  local base
  base=$(basename "$xml")
  rm -f -- "$TMPDIR/$base"
}
# help - print the usage text (options, examples and background notes) to
# stdout.
#
# Globals: reads version; $0 is resolved with readlink -f to show the
#          program name.
help () {
# The here-document delimiter is unquoted, so $(...) and $version are
# expanded, \$ yields a literal dollar sign, and a backslash at the end of a
# line joins it with the next one (the wrapped base64 sample is therefore
# printed as a single line). The <<- form strips leading tabs from each line.
cat <<-EOHELP
$(basename "$(readlink -f "$0")") "version $version"
Use: classify [OPTIONS] identifier[,identifier...]
Queries OCLC classification service for available data
Supports: DDC, LCC, NLM, Author and Title
Valid identifiers are ISBN, ISSN, UPC and OCLC/OWI
OPTIONS:
-d show DDC
-l show LCC
-n show NLM
-f show FAST
-a show Author
-t show Title
-o show OWI (OCLC works identifier)
-w show WI (OCLC works number)
-C md5 create CSV (MD5,DDC,LCC,NLM,FAST,AUTHOR,TITLE)
use -D libgen/-D libgen_fiction to indicate database
-X dir save OCLC XML response to \$dir/\$md5.xml
only works with a defined MD5 (-C MD5)
-D db define which database to use (libgen/libgen_fiction)
-A show all available data for identifier
-V show labels
-@ PORT use torsocks to connect to the OCLC classify service.
use this to avoid getting your IP blocked by OCLC
-h show this help message
Examples
$ classify -A 0199535760
AUTHOR: Plato | Jowett, Benjamin, 1817-1893 Translator; Editor; Other] ...
TITLE: The republic
DDC: 321.07
LCC: JC71
$ classify -D libgen -C 25b8ce971343e85dbdc3fa375804b538
25b8ce971343e85dbdc3fa375804b538,"321.07","JC71","",UG9saXRpY2FsI\
HNjaWVuY2UsVXRvcGlhcyxKdXN0aWNlLEV0aGljcyxQb2xpdGljYWwgZXRoaWNzLFB\
oaWxvc29waHksRW5nbGlzaCBsYW5ndWFnZSxUaGVzYXVyaQo=,UGxhdG8gfCBKb3dl\
dHQsIEJlbmphbWluLCAxODE3LTE4OTMgW1RyYW5zbGF0b3I7IEVkaXRvcjsgT3RoZX\
JdIHwgV2F0ZXJmaWVsZCwgUm9iaW4sIDE5NTItIFtUcmFuc2xhdG9yOyBXcml0ZXIg\
b2YgYWRkZWQgdGV4dDsgRWRpdG9yOyBPdGhlcl0gfCBMZWUsIEguIEQuIFAuIDE5MD\
gtMTk5MyBbVHJhbnNsYXRvcjsgRWRpdG9yOyBBdXRob3Igb2YgaW50cm9kdWN0aW9u\
XSB8IFNob3JleSwgUGF1bCwgMTg1Ny0xOTM0IFtUcmFuc2xhdG9yOyBBdXRob3I7IE\
90aGVyXSB8IFJlZXZlLCBDLiBELiBDLiwgMTk0OC0gW1RyYW5zbGF0b3I7IEVkaXRv\
cjsgT3RoZXJdCg==,VGhlIHJlcHVibGljCg==
Classifying libgen/libgen_fiction
This tool can be used to add classification data to libgen and
libgen_fiction databases. It does not directy modify the database,
instead producing CSV which can be used to apply the modifications.
The best way to do this is to produce a list of md5 hashes for
publications which do have Identifier values but lack values for DDC
and/or LCC. Such lists can be produced by the following SQL:
libgen: select md5 from updated where IdentifierWODash<>"" and DDC="";
libgen_fiction: select md5 from fiction where Identifier<>"" and DDC="";
Run these as batch jobs (mysql -B .... -e 'sql_code_here;' > md5_list), split
the resulting file in ~1000 line sections and feed these to this tool,
preferably with a random pause between requests to keep OCLC's intrusion
detection systems from triggering too early. It is advisable to use
this tool through Tor (using -@ TORPORT to enable torsocks, make sure it
is configured correctly for your Tor instance) to avoid having too
many requests from your IP to be registered, this again to avoid
your IP being blocked. The OCLC classification service is not
run as a production service (I asked them).
Return values are stored in the following order:
MD5,DDC,LCC,NLM,FAST,AUTHOR,TITLE
DDC, LCC and NLM are enclosed within double quotes and can contain
multiple space-separated values. FAST, AUTHOR and TITLE are base64 encoded
since these fields can contain a whole host of unwholesome characters
which can mess up CSV. The AUTHOR field currentlydecodes to a pipe ('|')
separated list of authors in the format:
LAST_NAME, NAME_OR_INITIALS, DATE_OF_BIRTH-[DATE_OF_DEATH] [[ROLE[[;ROLE]...]]]
This format could change depending on what OCLC does with the
(experimental) service.
EOHELP
}
# Script entry point: hand the unmodified command-line arguments to main.
main "$@"