books/classify
Yetangitu ce5c75a0fb classify: OCLC classifier helper script
This script uses the OCLC classifier to find the DDC (Dewey Decimal Classification) and LCC (Library of Congress Classification) codes for publications; it optionally produces SQL to update the libgen/libgen_fiction databases.
2021-05-12 15:11:11 +00:00

312 lines
6.6 KiB
Bash
Executable file

#!/usr/bin/env bash
#shellcheck disable=SC2034,SC1090
#
# classify - return DDC, LCC for ISBN or MD5 (from libgen/libgen_fiction)

shopt -s extglob

# trap_error and trap_clean are not defined in this file; they are expected
# to be provided by the books_functions library sourced below.
trap "trap_error" TERM
trap "trap_clean" EXIT

# PID of the top-level script, exported so child shells can see it
export TOP_PID=$$

version="0.3.0"
release="20210512"

# the shared helper library must live in the same directory as this script
functions="$(dirname "$0")/books_functions"
if [ -f "$functions" ]; then
  source "$functions"
else
  # diagnostics go to stderr so they can never end up in generated output
  echo "$functions not found" >&2
  exit 1
fi
# Entry point: parse the command line, query the OCLC classify service and
# print the requested fields, a filename or an SQL update statement.
#
# Arguments: the script's command-line arguments (see help)
# Globals:   the settings assigned here are intentionally NOT local; the
#            helper functions read them, and cleanup reads xml/TMPDIR.
# Outputs:   requested data on stdout
main () {
  # PREFERENCES
  config=${XDG_CONFIG_HOME:-$HOME/.config}/books.conf

  # OCLC classify API
  oclc="http://classify.oclc.org/classify2/Classify"

  # XPath expressions used to pull individual fields out of the XML reply
  declare -A API=(
    [response]='/classify/response/@code'
    [owi]='/classify/works/work[1]/@owi'
    [ddc]='/classify/recommendations/ddc/mostPopular/@nsfa'
    [lcc]='/classify/recommendations/lcc/mostPopular/@nsfa'
    [nlm]='/classify/recommendations/nlm/mostPopular/@sfa'
    [author]='/classify/work/@author'
    [title]='/classify/work/@title'
  )

  # shell filters applied (through eval in get) to raw field values
  declare -A filters=(
    [filename]="sed -e 's/[^-[:alnum:]:;?!.,+@#%]/_/g;s/^\([-_]\)*//'"
  )

  # database name -> table name used in generated SQL
  declare -A tables=(
    [libgen]="updated"
    [libgen_fiction]="fiction"
  )

  xidel=$(find_tool "xidel")
  curl=$(find_tool "curl")
  xq="$xidel -s"
  separator="-"
  request=""
  TMPDIR="/tmp"
  xml=$(mktemp -p "$TMPDIR" classify.XXXXX) \
    || exit_with_error "could not create temporary file in $TMPDIR"

  # source config file if it exists
  [[ -f ${config} ]] && source "${config}"

  while getopts "odlnatS:FVAD:Q:@h" OPTION; do
    case $OPTION in
      o)
        request="$request owi"
        ;;
      d)
        request="$request ddc"
        ;;
      l)
        request="$request lcc"
        ;;
      n)
        request="$request nlm"
        ;;
      a)
        request="$request author"
        ;;
      t)
        request="$request title"
        ;;
      S)
        separator="$OPTARG"
        ;;
      V)
        verbose=1
        ;;
      F)
        build_filename=1
        ;;
      D)
        db="$OPTARG"
        ;;
      Q)
        # -D has to appear before -Q: the identifier is looked up right here
        [ -z "$db" ] && exit_with_error "use -D to define which database to use"
        build_sql=1
        md5="$OPTARG"
        isbn=$(get_identifier "$db" "$md5")
        [ -z "$isbn" ] && exit_with_error "no identifier found in $db for MD5 = $md5"
        ;;
      A)
        request="author title owi ddc lcc nlm"
        verbose=1
        ;;
      @)
        torsocks=$(find_tool "torsocks")
        ;;
      h)
        help
        exit
        ;;
      *)
        exit_with_error "unknown option: $OPTION"
        ;;
    esac
  done
  shift $((OPTIND-1))

  # an identifier found through -Q wins over the positional argument
  [ -z "$isbn" ] && isbn="$1"

  # fail locally instead of sending an empty query to the service
  [[ -z "$isbn" ]] && exit_with_error "no input"

  get_xml "$xml" "stdnbr=$isbn"
  response=$(get "response" "$xml")

  case "$response" in
    0|2)
      # the reply is used as it is
      ;;
    4)
      # take the owi of the first listed work and query again by owi
      owi=$(get "owi" "$xml")
      get_xml "$xml" "owi=$owi"
      ;;
    100)
      exit_with_error "no input"
      ;;
    101)
      exit_with_error "invalid input"
      ;;
    102)
      exit_with_error "not found"
      ;;
    200)
      exit_with_error "unexpected error"
      ;;
  esac

  if [[ -n "$build_filename" ]]; then
    build_filename "$xml"
  elif [[ -n "$build_sql" ]]; then
    build_sql "$db" "$md5" "$xml"
  else
    show_data "$request"
  fi
}
# Download a classify reply from the OCLC service into a file.
#
# Arguments: $1   - file the reply is written to
#            $2.. - query string appended to the request, e.g. "stdnbr=<id>"
# Globals:   oclc, curl, torsocks (read)
# Returns:   exit status of the curl (or torsocks) invocation
get_xml () {
  local xml="$1"
  shift
  local query="$*"

  # torsocks is only set when -@ was given; when empty it must expand to
  # no word at all, when set it must stay a single word
  ${torsocks:+"$torsocks"} "$curl" -s "${oclc}?summary=true&${query}" --output "$xml"
}
# Extract one field from a classify reply.
#
# Arguments: $1   - key into the API table (selects the XPath expression)
#            $2   - XML file to read
#            $3.. - optional shell filter the value is piped through
# Globals:   xq, API (read)
# Outputs:   the (optionally filtered) value on stdout
get () {
  local parameter="$1"
  local xml="$2"
  shift 2
  local filter="$*"

  # without a filter the value is passed through unchanged
  [[ -z "$filter" ]] && filter='cat -'

  # $xq is left unquoted on purpose: it holds the command plus its options.
  # The filter is evaluated as shell code, so it must only ever come from
  # trusted, script-internal definitions.
  $xq "$xml" -e "${API[$parameter]}" | eval "$filter"
}
# Look up the identifier stored in a database for a given MD5.
#
# Arguments: $1 - database name (libgen or libgen_fiction)
#            $2 - MD5 of the publication (32 hexadecimal digits)
# Outputs:   first comma-separated value of the identifier column on stdout;
#            a diagnostic on stderr when the input is rejected
# Returns:   1 when the MD5 is malformed or the database is unknown,
#            otherwise the status of the dbx|cut pipeline
get_identifier () {
  local db="$1"
  local md5="$2"
  local sql=""

  # the MD5 is pasted into the SQL text below, so accept hex digits only
  if [[ ! "$md5" =~ ^[0-9A-Fa-f]{32}$ ]]; then
    printf 'invalid MD5: %s\n' "$md5" >&2
    return 1
  fi

  declare -A sql_identifier=(
    [libgen]="select IdentifierWODash from updated where md5='${md5}';"
    [libgen_fiction]="select Identifier from fiction where md5='${md5}';"
  )

  # an empty key is not a valid associative array subscript
  [[ -n "$db" ]] && sql="${sql_identifier[$db]}"
  if [[ -z "$sql" ]]; then
    printf 'unknown database: %s\n' "$db" >&2
    return 1
  fi

  # the column may hold several comma-separated values; use the first
  dbx "$db" "$sql" | cut -d ',' -f 1
}
# Print the requested fields, one per line, skipping empty ones.
#
# Arguments: $@ - whitespace-separated list of API table keys
# Globals:   xml, verbose (read)
# Outputs:   "<value>" per field, or "<KEY>: <value>" when verbose is set
# Returns:   status of the last test, i.e. non-zero when the last requested
#            field turned out empty
show_data () {
  local request="$*"
  local parameter data legend=""

  # word splitting of $request is intended: it is a list of keys
  for parameter in $request; do
    data=$(get "$parameter" "$xml")
    [[ -n "$verbose" ]] && legend="${parameter^^}: "
    [[ -n "$data" ]] && printf '%s\n' "${legend}${data}"
  done
}
# Print a "<ddc>/<author><separator><title>" path for a classify reply.
#
# Arguments: $1 - XML file holding the reply
# Globals:   filters, separator (read)
# Outputs:   the path on stdout; author and title are passed through the
#            'filename' filter, the ddc value is used as-is
# Note:      when no ddc value is available the result starts with "/"
build_filename () {
  local xml="$1"
  local dirname filename

  dirname=$(get "ddc" "$xml")
  filename=$(get "author" "$xml" "${filters['filename']}")${separator}$(get "title" "$xml" "${filters['filename']}")

  printf '%s\n' "${dirname}/${filename}"
}
# Print an SQL statement which stores the DDC and LCC values of a reply.
#
# Arguments: $1 - database name (key into the tables map)
#            $2 - MD5 of the record to update
#            $3 - XML file holding the reply
# Globals:   tables, verbose (read)
# Outputs:   an "update ... set ... where md5=..." statement on stdout,
#            preceded by an SQL comment with author and title when verbose
#            is set; nothing when neither DDC nor LCC is available
build_sql () {
  local db="$1"
  local md5="$2"
  local xml="$3"
  local parameter data
  local updates=""   # always start empty: never inherit a stale value
  local quote="'"

  for parameter in ddc lcc; do
    data=$(get "$parameter" "$xml")
    if [[ -n "$data" ]]; then
      # double embedded single quotes so the SQL string literal stays intact
      # (backslashes are not treated specially here)
      data=${data//$quote/$quote$quote}
      updates="${updates}${updates:+, }${parameter^^}='${data}'"
    fi
  done

  if [[ -n "$updates" ]]; then
    if [ -n "$verbose" ]; then
      echo '/*'
      show_data "author title"
      echo '*/'
    fi
    echo "update ${tables[$db]} set $updates where md5='$md5';"
  fi
}
# Remove the temporary XML file.
#
# Globals: xml, TMPDIR (read)
# Only the file name is taken from $xml, so the deletion always targets a
# path directly inside $TMPDIR.
cleanup () {
  # nothing to do when no temporary file name was ever recorded; without
  # this guard rm would be pointed at $TMPDIR/ itself
  [[ -n "$xml" ]] || return 0

  local base
  base=$(basename "$xml")
  rm -f -- "$TMPDIR/$base"
}
# Print the usage message on stdout.
#
# Globals: version, separator (read; expanded inside the here-document)
# The here-document delimiter is unquoted on purpose so that the command
# substitution and variables in the text are expanded.
help () {
  cat <<-EOHELP
$(basename "$(readlink -f "$0")") "version $version"
Use: classify [OPTIONS] identifier
Queries OCLC classification service for available data
Supports: DDC, LCC, NLM, Author and Title
Valid identifiers are ISBN, ISSN, UPC and OCLC/OWI
OPTIONS:
-d show DDC
-l show LCC
-n show NLM
-a show Author
-t show Title
-F create filename (DDC/Author-Title)
-Q md5 create SQL to update database
use -D libgen/-D libgen_fiction to indicate database
(-D must precede -Q on the command line)
use with -V to add SQL comments with publication author
and title
-D db define which database to use (libgen/libgen_fiction)
-A show all available data for identifier
-o show OCLC work index (owi)
-V show labels
-S sep change separator used to build filename (default: $separator)
-@ use torsocks to connect to the OCLC classify service.
use this to avoid getting your IP blocked by OCLC
-h show this help message
Examples
$ classify -A 0199535760
AUTHOR: Plato | Jowett, Benjamin, 1817-1893 [Translator; Editor; Other] ...
TITLE: The republic
DDC: 321.07
LCC: JC71
$ classify -D libgen -Q 25b8ce971343e85dbdc3fa375804b538
update updated set DDC='321.07', LCC='JC71' where md5='25b8ce971343e85dbdc3fa375804b538';
Classifying libgen/libgen_fiction
This tool can be used to add DDC and LCC classification data
to libgen and libgen_fiction databases. It does not directly
modify the database, instead producing SQL code which can be
used to apply the modifications. The best way to do this is
to produce a list of md5 hashes for publications which do
have Identifier values but lack values for DDC and/or LCC. Such
lists can be produced by the following SQL:
libgen: select md5 from updated where IdentifierWODash<>"" and DDC="";
libgen_fiction: select md5 from fiction where Identifier<>"" and DDC="";
Run these as batch jobs (mysql -B .... -e 'sql_code_here;' > md5_list), split
the resulting file in ~1000 line sections and feed these to this tool,
preferably with a random pause between requests to keep OCLC's intrusion
detection systems from triggering too early. It is advisable to use
this tool through Tor (using -@ to enable torsocks, make sure it
is configured correctly for your Tor instance) to avoid having too
many requests from your IP to be registered, this again to avoid
your IP being blocked. The OCLC classification service is not
run as a production service (I asked them).
EOHELP
}
# run the script: hand all command-line arguments to main unchanged
main "$@"