
config file: add these fields (values for demonstration purposes):

```
classify_xml="/home/username/Project/libgen_classify/xml"
classify_csv="/home/username/Project/libgen_classify/csv"
classify_sql="/home/username/Project/libgen_classify/sql"
classify_fields="ddc,lcc,nlm,fast,title,author"
classify_tor_ports="9100,9102,9104,9106,9108"
```

These fields are used by update_libgen to configure classify and import_metadata.

All programs: the -@ switch now takes a mandatory parameter: -@ TORPORT
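
A hypothetical invocation of the new switch, using one of the Tor ports from the config example above (the port and MD5 are shown only for illustration):

```
classify -D libgen -C 25b8ce971343e85dbdc3fa375804b538 -@ 9100
```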
347 lines
8.1 KiB
Bash
Executable file
#!/usr/bin/env bash
#shellcheck disable=SC2034,SC1090
#
# classify - return classification data for ISBN (etc.) or MD5 (from libgen/libgen_fiction)

shopt -s extglob
trap "trap_error" TERM
trap "trap_clean" EXIT
export TOP_PID=$$

version="0.5.1"
release="20210601"

functions="$(dirname "$0")/books_functions"
if [ -f "$functions" ]; then
	source "$functions"
else
	echo "$functions not found"
	exit 1
fi

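# main - parse the command line, query the OCLC classify service for the
# given identifier(s) or MD5, and print the results or a CSV record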
main () {
	# PREFERENCES
	config=${XDG_CONFIG_HOME:-$HOME/.config}/books.conf
	# OCLC classify API
	oclc="http://classify.oclc.org/classify2/Classify"

	declare -A API=(
		[response]='/classify/response/@code'
		[owi]='/classify/works/work[1]/@owi'
		[wi]='/classify/works/work[1]/@wi'
		[fast]='join(/classify/recommendations/fast/headings/heading,",")'
		[ddc]='join(/classify/recommendations/ddc/mostPopular/@nsfa)'
		[lcc]='join(/classify/recommendations/lcc/mostPopular/@nsfa)'
		[nlm]='join(/classify/recommendations/nlm/mostPopular/@sfa)'
		[author]='/classify/work/@author'
		[authors]='join(/classify/authors/author," | ")'
		[title]='/classify/work/@title'
	)

	declare -A filters=(
		[filename]="sed -e 's/[^-[:alnum:]:;?!.,+@#%]/_/g;s/^\([-_]\)*//'"
	)

	declare -A tables=(
		[libgen]="updated"
		[libgen_fiction]="fiction"
	)

	xidel=$(find_tool "xidel")
	curl=$(find_tool "curl")
	xq="$xidel -s"

	request=""

	TMPDIR="/tmp"
	xml=$(mktemp -p $TMPDIR classify.XXXXX)

	# source config file if it exists
	[[ -f ${config} ]] && source "${config}"

	while getopts "owdlnfatVAD:C:X:G@:h" OPTION; do
		case $OPTION in
			o)
				request="$request owi"
				;;
			w)
				request="$request wi"
				;;
			d)
				request="$request ddc"
				;;
			l)
				request="$request lcc"
				;;
			n)
				request="$request nlm"
				;;
			f)
				request="$request fast"
				;;
			a)
				request="$request author"
				;;
			t)
				request="$request title"
				;;
			V)
				verbose=1
				;;
			D)
				db="$OPTARG"
				;;
			C)
				[ -z "$db" ] && exit_with_error "use -D to define which database to use"
				build_csv=1
				md5="$OPTARG"
				idents=$(get_identifiers "$db" "$md5")
				[ -z "$idents" ] && exit_with_error "no identifier found in $db for MD5 = $md5"
				;;
			X)
				save_xml="$OPTARG"
				[[ ! -d "$save_xml" ]] && exit_with_error "Save XML (-X $OPTARG): directory does not exist?"
				;;
			A)
				request="author title fast owi wi ddc lcc nlm"
				verbose=1
				;;
			G)
				((debug++))
				;;
			@)
				torsocks=$(find_tool "torsocks")
				export TORSOCKS_TOR_PORT=${OPTARG}
				;;
			h)
				help
				exit
				;;

			*)
				exit_with_error "unknown option: $OPTION"
				;;
		esac
	done

	shift $((OPTIND-1))
	[ -z "$idents" ] && idents="$1"

	IFS=',' read -ra idarr <<< "$idents"

	for ident in "${idarr[@]}"; do

		[[ -n "$debug" ]] && echo "trying $ident..."

		get_xml "$xml" "stdnbr=${ident// }"
		response=$(get "response" "$xml")

		case "$response" in
			0)
				success=1
				break
				;;
			2)
				success=1
				break
				;;
			4)
				wi=$(get "wi" "$xml")
				get_xml "$xml" "wi=$wi"
				if [[ $(get "response" "$xml") =~ 0|2 ]]; then
					success=1
					break
				else
					continue
				fi
				;;
			*)
				continue
				;;
		esac
	done

	[[ -z "$success" ]] && exit_with_error "no valid response for identifier(s) $idents"

	if [[ -n "$save_xml" ]]; then
		[[ -z "$md5" ]] && exit_with_error "Save XML (-X) only works with a defined MD5 (-C MD5)"
		cp "$xml" "$save_xml/$md5.xml"
	fi

	if [[ -n "$debug" ]]; then
		cat "$xml"
	fi

	if [[ -n "$build_csv" ]]; then
		build_csv "$db" "$md5" "$xml"
	else
		show_data "$request"
	fi
}

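# get_xml FILE QUERY - fetch the OCLC classify response for QUERY and store it
# in FILE (through torsocks when -@ was given)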
get_xml () {
	xml="$1"
	shift
	query="$*"
	$torsocks "$curl" -s "${oclc}?summary=false&${query}" --output "$xml"
}

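# get PARAMETER FILE [FILTER] - extract PARAMETER from the XML in FILE using
# the XPath expressions in API, piping the result through FILTER when given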
get () {
	parameter="$1"
	xml="$2"
	shift 2
	filter="$*"
	[[ -z "$filter" ]] && filter='cat -'
	$xq "$xml" -e "${API[$parameter]}" | eval "$filter"
}

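# get_identifiers DB MD5 - return the identifier field stored for MD5 in DB
# (libgen or libgen_fiction)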
get_identifiers () {
	db="$1"
	md5="$2"

	declare -A sql_identifier=(
		[libgen]="select IdentifierWODash from updated where md5='${md5}';"
		[libgen_fiction]="select Identifier from fiction where md5='${md5}';"
	)

	sql="${sql_identifier[$db]}"
	dbx "$db" "$sql"
}

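# show_data FIELD... - print the requested fields, prefixed with a label in
# verbose mode (-V/-A)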
show_data () {
	request="$*"

	for parameter in $request; do
		data=$(get "$parameter" "$xml")
		[[ -n "$verbose" ]] && legend="${parameter^^}: "
		[[ -n "$data" ]] && echo "${legend}${data}"
	done
}

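# build_csv DB MD5 FILE - print a single CSV record for MD5:
# MD5,DDC,LCC,NLM,FAST,AUTHOR,TITLE (the last three fields base64 encoded)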
build_csv () {
	db="$1"
	md5="$2"
	xml="$3"

	updates="${md5}"

	for parameter in ddc lcc nlm; do
		data=$(get "$parameter" "$xml")
		updates+=",\"${data}\""
	done

	for parameter in fast author title; do
		data=$(get "$parameter" "$xml" "base64 -w0")
		updates+=",${data}"
	done

	echo "$updates"
}

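# cleanup - remove the temporary XML file from TMPDIR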
cleanup () {
	base=$(basename "$xml")
	rm -f "$TMPDIR/$base"
}

help () {
	cat <<-EOHELP
$(basename "$(readlink -f "$0")") "version $version"

Use: classify [OPTIONS] identifier[,identifier...]

Queries the OCLC classification service for available data
Supports: DDC, LCC, NLM, Author and Title

Valid identifiers are ISBN, ISSN, UPC and OCLC/OWI

OPTIONS:

-d      show DDC
-l      show LCC
-n      show NLM
-f      show FAST
-a      show Author
-t      show Title

-o      show OWI (OCLC works identifier)
-w      show WI (OCLC works number)

-C md5  create CSV (MD5,DDC,LCC,NLM,FAST,AUTHOR,TITLE)
        use -D libgen/-D libgen_fiction to indicate database

-X dir  save OCLC XML response to \$dir/\$md5.xml
        only works with a defined MD5 (-C MD5)

-D db   define which database to use (libgen/libgen_fiction)

-A      show all available data for identifier

-V      show labels

-@ PORT use torsocks to connect to the OCLC classify service.
        use this to avoid getting your IP blocked by OCLC

-h      show this help message

Examples

$ classify -A 0199535760
AUTHOR: Plato | Jowett, Benjamin, 1817-1893 [Translator; Editor; Other] ...
TITLE: The republic
DDC: 321.07
LCC: JC71

$ classify -D libgen -C 25b8ce971343e85dbdc3fa375804b538
25b8ce971343e85dbdc3fa375804b538,"321.07","JC71","",UG9saXRpY2FsI\
HNjaWVuY2UsVXRvcGlhcyxKdXN0aWNlLEV0aGljcyxQb2xpdGljYWwgZXRoaWNzLFB\
oaWxvc29waHksRW5nbGlzaCBsYW5ndWFnZSxUaGVzYXVyaQo=,UGxhdG8gfCBKb3dl\
dHQsIEJlbmphbWluLCAxODE3LTE4OTMgW1RyYW5zbGF0b3I7IEVkaXRvcjsgT3RoZX\
JdIHwgV2F0ZXJmaWVsZCwgUm9iaW4sIDE5NTItIFtUcmFuc2xhdG9yOyBXcml0ZXIg\
b2YgYWRkZWQgdGV4dDsgRWRpdG9yOyBPdGhlcl0gfCBMZWUsIEguIEQuIFAuIDE5MD\
gtMTk5MyBbVHJhbnNsYXRvcjsgRWRpdG9yOyBBdXRob3Igb2YgaW50cm9kdWN0aW9u\
XSB8IFNob3JleSwgUGF1bCwgMTg1Ny0xOTM0IFtUcmFuc2xhdG9yOyBBdXRob3I7IE\
90aGVyXSB8IFJlZXZlLCBDLiBELiBDLiwgMTk0OC0gW1RyYW5zbGF0b3I7IEVkaXRv\
cjsgT3RoZXJdCg==,VGhlIHJlcHVibGljCg==


Classifying libgen/libgen_fiction

This tool can be used to add classification data to the libgen and
libgen_fiction databases. It does not directly modify the database;
instead it produces CSV which can be used to apply the modifications.
The best way to do this is to produce a list of md5 hashes for
publications which do have Identifier values but lack values for DDC
and/or LCC. Such lists can be produced with the following SQL:

libgen:         select md5 from updated where IdentifierWODash<>"" and DDC="";
libgen_fiction: select md5 from fiction where Identifier<>"" and DDC="";

Run these as batch jobs (mysql -B .... -e 'sql_code_here;' > md5_list), split
the resulting file into ~1000-line sections and feed these to this tool,
preferably with a random pause between requests to keep OCLC's intrusion
detection systems from triggering too early. It is advisable to use this
tool through Tor (using -@ TORPORT to enable torsocks; make sure it is
configured correctly for your Tor instance) so that fewer requests are
registered as coming from your own IP, again to avoid that IP being
blocked. The OCLC classification service is not run as a production
service (I asked them).

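A minimal batch loop might look like this (database, file names, Tor port,
pause range and mysql connection options are placeholders, adjust them to
your own setup):

$ mysql -B .... -e 'select md5 from updated where IdentifierWODash<>"" and DDC="";' > md5_list
$ split -l 1000 md5_list md5_part_
$ while read -r md5; do
      classify -D libgen -C "\$md5" -@ 9100 >> libgen_classify.csv
      sleep \$((RANDOM % 20 + 10))
  done < md5_part_aa
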
Return values are stored in the following order:

MD5,DDC,LCC,NLM,FAST,AUTHOR,TITLE

DDC, LCC and NLM are enclosed within double quotes and can contain
multiple space-separated values. FAST, AUTHOR and TITLE are base64 encoded
since these fields can contain a whole host of unwholesome characters
which can mess up CSV. The AUTHOR field currently decodes to a pipe ('|')
separated list of authors in the format:

LAST_NAME, NAME_OR_INITIALS, DATE_OF_BIRTH-[DATE_OF_DEATH] [[ROLE[[;ROLE]...]]]

This format could change depending on what OCLC does with the
(experimental) service.

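To recover the plain text, decode a field with base64; the TITLE value from
the CSV example above decodes like this:

$ echo 'VGhlIHJlcHVibGljCg==' | base64 -d
The republic
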
EOHELP
}

main "$@"