update_libgen: add -c option to integrate classify into the update process

config file: add these fields (values for demonstration purposes):
```
classify_xml="/home/username/Project/libgen_classify/xml"
classify_csv="/home/username/Project/libgen_classify/csv"
classify_sql="/home/username/Project/libgen_classify/sql"
classify_fields="ddc,lcc,nlm,fast,title,author"
classify_tor_ports="9100,9102,9104,9106,9108"
```
These fields are used by update_libgen to configure classify and import_metadata

All programs: the -@ switch now takes a mandatory parameter: -@ TORPORT
This commit is contained in:
Yetangitu 2021-06-01 14:12:09 +00:00
parent b5af8887b3
commit 391b0189fb
4 changed files with 51 additions and 15 deletions

7
books
View file

@ -7,8 +7,8 @@ trap "trap_error" TERM
trap "trap_clean" EXIT
export TOP_PID=$$
version="0.7.1"
release="20200812"
version="0.7.2"
release="20210601"
functions="$(dirname "$0")/books_functions"
if [ -f "$functions" ]; then
@ -336,6 +336,7 @@ main () {
;;
@)
source "$(which torsocks)" on
export TORSOCKS_TOR_PORT=$OPTARG
;;
=)
if [ -d "${OPTARG}" ]; then
@ -1448,7 +1449,7 @@ help () {
-x skip database update
(currently only the 'libgen' database can be updated)
-@ use torsocks to connect to the libgen server(s). You'll need to install
-@ TORPORT use torsocks to connect to the libgen server(s). You'll need to install
torsocks before using this option; try this in case your ISP
(or a transit provider somewhere en-route) blocks access to libgen

View file

@ -8,8 +8,8 @@ trap "trap_error" TERM
trap "trap_clean" EXIT
export TOP_PID=$$
version="0.5.0"
release="20210516"
version="0.5.1"
release="20210601"
functions="$(dirname "$0")/books_functions"
if [ -f "$functions" ]; then
@ -59,7 +59,7 @@ main () {
# source config file if it exists
[[ -f ${config} ]] && source "${config}"
while getopts "owdlnfatVAD:C:X:G@h" OPTION; do
while getopts "owdlnfatVAD:C:X:G@:h" OPTION; do
case $OPTION in
o)
request="$request owi"
@ -111,6 +111,7 @@ main () {
;;
@)
torsocks=$(find_tool "torsocks")
export TORSOCKS_TOR_PORT=${OPTARG}
;;
h)
help
@ -132,7 +133,7 @@ main () {
[[ -n "$debug" ]] && echo "trying $ident..."
get_xml "$xml" "stdnbr=$ident"
get_xml "$xml" "stdnbr=${ident// }"
response=$(get "response" "$xml")
case "$response" in
@ -249,7 +250,7 @@ help () {
Use: classify [OPTIONS] identifier[,identifier...]
Queries OCLC classification service for available data
Supports: DDC, LCC, NLM, FAST, Author and Title
Supports: DDC, LCC, NLM, Author and Title
Valid identifiers are ISBN, ISSN, UPC and OCLC/OWI
@ -277,7 +278,7 @@ help () {
-V show labels
-@ use torsocks to connect to the OCLC classify service.
-@ PORT use torsocks to connect to the OCLC classify service.
use this to avoid getting your IP blocked by OCLC
-h show this help message
@ -319,7 +320,7 @@ help () {
the resulting file in ~1000 line sections and feed these to this tool,
preferably with a random pause between requests to keep OCLC's intrusion
detection systems from triggering too early. It is advisable to use
this tool through Tor (using -@ to enable torsocks, make sure it
this tool through Tor (using -@ TORPORT to enable torsocks, make sure it
is configured correctly for your Tor instance) to avoid having too
many requests from your IP to be registered, this again to avoid
your IP being blocked. The OCLC classification service is not

View file

@ -4,7 +4,7 @@
# refresh libgen databases from dump files
version="0.6.2"
release="20210512"
release="20210601"
trap "trap_error" TERM
trap "trap_clean" EXIT
@ -156,6 +156,7 @@ main () {
;;
@)
torsocks=$(find_tool "torsocks")
export TORSOCKS_TOR_PORT=$OPTARG
;;
k)
keep_downloaded_files=1
@ -305,7 +306,7 @@ help () {
-c create a config file using current settings (see -H, -P, -U, -R)
-e edit config file
-@ use tor (through torsocks) to connect to libgen server
-@ TORPORT use tor (through torsocks) to connect to libgen server
-k keep downloaded files after exit
-h this help message

View file

@ -66,7 +66,7 @@ main () {
declare -A values=()
declare -A upsert=()
while getopts "a:D:j:hH:i:l:nP:U:qs:t:u:v@" OPTION
while getopts "a:D:j:hH:i:l:nP:U:qs:ct:u:v@:" OPTION
do
case $OPTION in
j)
@ -75,6 +75,11 @@ main () {
s)
sql_dump="${OPTARG}"
;;
c)
classify=$(find_tool "classify")
import_metadata=$(find_tool "import_metadata")
classifile="${tmpdir}/classifile"
;;
v)
((verbose++))
;;
@ -116,7 +121,8 @@ main () {
fi
;;
@)
torsocks=$(find_toolo "torsocks")
torsocks=$(find_tool "torsocks")
export TORSOCKS_TOR_PORT=${OPTARG}
;;
q)
quiet=1
@ -197,6 +203,10 @@ main () {
echo
fi
if [[ -n "$classifile" && -n "${record['identifierwodash']}" ]]; then
echo "${record['md5']}" >> "$classifile"
fi
keys=${!record[*]}
md5="${record[md5]}"
@ -230,6 +240,7 @@ main () {
sql+="insert into $table (${columns[$table]%?}) values(${values[$table]%?}) on duplicate key update ${upsert[$table]%?};"
fi
done
echo "${sql}" >> "${update_sql}"
[[ -n $sql_dump ]] && echo "${sql}" >> "${sql_dump}"
@ -249,6 +260,28 @@ main () {
[[ $no_action == 0 ]] && dbx "$db" < "${update_sql}"
done
# optionally add classification data to new records
# this will use tor and round-robin through TOR ports if these are
# defined in classify_tor_ports in the config file
if [[ -n "$classifile" && -f $classifile ]]; then
now=$(date +%Y%m%d%H%M)
csvfile="${classify_csv:+$classify_csv/}${now}.csv"
IFS=',' read -ra torports <<< "$classify_tor_ports"
if [[ ${#torports[*]} -gt 0 ]]; then
torpc=${#torports[*]}
fi
upc=0
while read md5;do
$classify ${torpc:+-@ ${torports[$upc%$torpc]}} -D "$db" ${classify_xml:+-X $classify_xml} -C "$md5" >> "${csvfile}"
((upc++))
done < <(cat "$classifile")
if [[ -f ${csvfile} ]]; then
$import_metadata -d "$db" -f "${classify_fields:-ddc,lcc,fast}" ${classify_sql:+-s $classify_sql/$now.sql} -F "${csvfile}"
fi
fi
}
get_current_fields () {
@ -417,7 +450,7 @@ help () {
-D DATABASE database name
-a APIHOST use APIHOST as API server
-@ use tor (through torsocks) to connect to libgen API server
-@ TORPORT use tor (through torsocks) to connect to libgen API server
-q don't warn about missing fields in database or api response
-h this help message