#!/usr/bin/env bash
# shellcheck disable=SC2034,SC1090,SC2155,SC2207

version="0.6.1"
release="20210512"

trap "trap_error" TERM
trap "trap_clean" EXIT
export TOP_PID=$$

LC_ALL=C

functions="$(dirname "$0")/books_functions"
if [ -f "$functions" ]; then
    source "$functions"
else
    echo "$functions not found"
    exit 1
fi

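# Typical invocation (illustrative):
#   update_libgen -v -l 500 -s /tmp/libgen_updates.sql
# fetches pending updates from the libgen API in blocks of 500, prints basic
# progress information and appends the generated SQL to the named file.
# See help() below (or run with -h) for the full list of options.
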
main () {

    exlock now || exit 1

    # PREFERENCES
    config=${XDG_CONFIG_HOME:-$HOME/.config}/books.conf

    dbhost="localhost"
    dbport="3306"
    db="libgen"
    dbuser="libgen"
    limit=1000

    api="http://libgen.rs/json.php"

    # source config file if it exists
    [[ -f ${config} ]] && source "${config}"

    # (more or less) END OF PREFERENCES

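    # The config file sourced above can override any of the defaults; an
    # illustrative example (values are hypothetical, the classify_* settings
    # are only used when -c is given):
    #
    #   dbhost="db.example.org"
    #   dbuser="libgen"
    #   limit=5000
    #   classify_tor_ports="9050,9052,9054"
    #   classify_csv="/var/lib/libgen/classify"
    #   classify_fields="ddc,lcc,fast"
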
    jq=$(find_tool "jq")
    curl=$(find_tool "curl")

    tmpdir=$(mktemp -d '/tmp/update_libgen.XXXXXX')
    updates="${tmpdir}/updates"
    update_count="${tmpdir}/update_count"
    update_sql="${tmpdir}/update_sql"
    update_last_modified="${tmpdir}/update_last_modified"
    update_last_id="${tmpdir}/update_last_id"
    update_newer="${tmpdir}/update_newer"

    verbose=0
    no_action=0
    unknown_fields=""

    re_type='[a-z]+'
    re_int='[0-9]+'
    re_year='[0-9]{4}'
    re_timestamp='[0-9]{4}-[0-9]{2}-[0-9]{2} [0-2][0-9]:[0-5][0-9]:[0-5][0-9]'

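    # tables lists the target tables with the main table (updated) last;
    # the associative arrays below are built from the database schema and map
    # each lowercased column name to its table, its type and its maximum size.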
    declare -a tables="(description hashes updated)"
    declare -A current_fields="($(get_current_fields))"
    declare -A field_types="($(get_field_types))"
    declare -A field_sizes="($(get_field_sizes))"
    declare -A columns=()
    declare -A values=()
    declare -A upsert=()

    while getopts "a:D:j:hH:i:l:nP:U:qs:ct:u:v@:" OPTION
    do
        case $OPTION in
            j)
                json_dump="${OPTARG}"
                ;;
            s)
                sql_dump="${OPTARG}"
                ;;
            c)
                classify=$(find_tool "classify")
                import_metadata=$(find_tool "import_metadata")
                classifile="${tmpdir}/classifile"
                ;;
            v)
                ((verbose++))
                ;;
            n)
                no_action=1
                ;;
            l)
                limit="${OPTARG}"
                if [[ $limit -le 1 ]]; then
                    exit_with_error "limit too low (-l ${limit}), minimum is 2"
                fi
                ;;
            t)
                startdatetime="${OPTARG}"
                ;;
            i)
                echo "${OPTARG}" > "${update_last_id}"
                ;;
            u)
                api="${OPTARG}"
                ;;
            H)
                dbhost="${OPTARG}"
                ;;
            P)
                dbport="${OPTARG}"
                ;;
            U)
                dbuser="${OPTARG}"
                ;;
            D)
                db="${OPTARG}"
                ;;
            a)
                if url_available "${OPTARG}?fields=id&ids=0"; then
                    api="${OPTARG}"
                else
                    exit_with_error "-a ${OPTARG}: API endpoint not available"
                fi
                ;;
            @)
                torsocks=$(find_tool "torsocks")
                export TORSOCKS_TOR_PORT=${OPTARG}
                ;;
            q)
                quiet=1
                ;;
            h)
                help
                exit
                ;;
            *)
                exit_with_error "unknown option $OPTION"
                ;;

        esac
    done

    check_fields

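    # Keep asking the API for entries newer than the most recent
    # TimeLastModified (and with an id above the last one seen), in blocks of
    # $limit, until a request returns no further updates. Progress is tracked
    # in the update_last_id and update_last_modified files so each iteration
    # resumes where the previous batch ended.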
    while (
        if [[ -s ${update_last_modified} ]]; then
            last_update="$(cat "${update_last_modified}")"
            last_update_in_db="$(get_time_last_modified)"
            if [[ $last_update != "$last_update_in_db" && $no_action == 0 ]]; then
                exit_with_error "uh oh... something went wrong, last update in db does not equal last update from api response..."
            fi
        elif [[ -n $startdatetime ]]; then
            last_update="${startdatetime}"
        else
            last_update="$(get_time_last_modified)"
        fi

        last_id=$([[ -s ${update_last_id} ]] && cat "${update_last_id}" || get_max_id)

        get_updates "$last_id" "$limit" "$last_update"

        updcnt=$(get_update_count)

        [[ -n $json_dump ]] && cat "${updates}" >> "${json_dump}"

        if [[ $verbose -ge 1 ]]; then
            echo "database last modified: $last_update";
            # update counter is 0-based, humans prefer 1-based notation
            if [[ ${updcnt} -gt 0 ]]; then
                echo "$((updcnt+1)) updates";
            else
                more=$([[ -s $update_last_id ]] && echo "more " || echo "")
                echo "no ${more}updates"
            fi
            echo ;
        fi

        test "$updcnt" -gt 0
    ); do

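        # Each batch is written to a temporary SQL file wrapped in a single
        # transaction and, unless -n was given, applied with dbx at the end of
        # the batch.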
        updcnt=$(get_update_count)
        count=0
        echo "start transaction;" > "${update_sql}"

        while [[ $count -le $updcnt ]]; do
            declare -A record
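            # jq turns update number $count into "key=value" lines (values cut
            # to 4000 characters, embedded newlines escaped); read them into
            # the record array, skipping fields the database does not know.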
            while IFS="=" read -r key value; do
                # drop unknown fields
                if [[ ! $unknown_fields =~ ${key,,} ]]; then
                    # limit field size to avoid choking jq on overly long strings
                    [[ ${#value} -gt 1000 ]] && value="${value:0:997}..."
                    record[${key,,}]="$value"
                fi
            done < <($jq -r ".[$count]"'|to_entries|map("\(.key)=\(.value|tostring|.[0:4000]|gsub("\n";"\\n"))")|.[]' "${updates}")

            # record current position
            echo "${record['id']}" > "${update_last_id}"
            echo "${record['timelastmodified']}" > "${update_last_modified}"

            if [[ $verbose -ge 2 ]]; then
                echo "ID: ${record['id']}";
                echo "Author: ${record['author']}";
                echo "Title: ${record['title']}";
                echo "Modified: ${record['timelastmodified']}";
                echo
            fi

            if [[ -n "$classifile" && -n "${record['identifierwodash']}" ]]; then
                echo "${record['md5']}" >> "$classifile"
            fi

            keys=${!record[*]}

            md5="${record[md5]}"

            # split fields between tables
            for key in "${!record[@]}"; do
                table=${current_fields[$key]}
                columns[$table]+="${key},"
                value=${record[$key]}
                if [ -n "$value" ]; then
                    value=$(sanitize_field "$key" "$value")
                fi
                values[$table]+="'$value',"
                upsert[$table]+="${key} = values(${key}),"
            done

            # add md5 to secondary tables (all but the last)
            for n in $(seq 0 $((${#tables[@]}-2))); do
                table="${tables[$n]}"
                if [[ -n "${columns[$table]}" ]]; then
                    columns[$table]+="md5,"
                    values[$table]+="'$md5',"
                    upsert[$table]+="md5 = values(md5),"
                fi
            done

            # main table (last in tables array) first
            for n in $(seq $((${#tables[@]}-1)) -1 0); do
                table="${tables[$n]}"
                if [[ -n "${columns[$table]}" ]]; then
                    sql+="insert into $table (${columns[$table]%?}) values(${values[$table]%?}) on duplicate key update ${upsert[$table]%?};"
                fi
            done

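            # The resulting statement per table has this shape (illustrative):
            #   insert into updated (id,title,...,timelastmodified)
            #   values('123456','Some Title',...,'2021-05-01 12:00:00')
            #   on duplicate key update id = values(id), title = values(title), ...;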
echo "${sql}" >> "${update_sql}"
|
|
[[ -n $sql_dump ]] && echo "${sql}" >> "${sql_dump}"
|
|
|
|
unset record
|
|
unset keys
|
|
unset key
|
|
unset value
|
|
unset sql
|
|
columns=()
|
|
values=()
|
|
upsert=()
|
|
|
|
((count++))
|
|
done
|
|
|
|
echo "commit;" >> "${update_sql}"
|
|
|
|
[[ $no_action == 0 ]] && dbx "$db" < "${update_sql}"
|
|
done
|
|
|
|
    # optionally add classification data to new records
    # this will use tor and round-robin through TOR ports if these are
    # defined in classify_tor_ports in the config file
    if [[ -n "$classifile" && -f $classifile ]]; then
        now=$(date +%Y%m%d%H%M)
        csvfile="${classify_csv:+$classify_csv/}${now}.csv"
        IFS=',' read -ra torports <<< "$classify_tor_ports"
        if [[ ${#torports[*]} -gt 0 ]]; then
            torpc=${#torports[*]}
        fi
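        # each classify call below picks the next port from the list,
        # cycling through them via upc % torpc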
        upc=0
        while read -r md5; do
            $classify ${torpc:+-@ ${torports[$upc%$torpc]}} -D "$db" ${classify_xml:+-X $classify_xml} -C "$md5" >> "${csvfile}"
            ((upc++))
        done < "$classifile"

        if [[ -f ${csvfile} ]]; then
            $import_metadata -d "$db" -f "${classify_fields:-ddc,lcc,fast}" ${classify_sql:+-s $classify_sql/$now.sql} -F "${csvfile}"
        fi
    fi

}

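# get_current_fields prints one "[column]=table" pair per column of each table,
# for example (illustrative): [id]=updated [title]=updated [md5]=hashes
# main() reads this list into the current_fields associative array.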
get_current_fields () {
    for table in "${tables[@]}"; do
        dbx "$db" "describe $table;"|awk '{print "["tolower($1)"]='"$table"'"}'
    done
}

get_field_type () {
    field="$1"
    table="${current_fields[$field]}"
    dbx "$db" "show fields from $table where field=\"$field\";"|awk '{print $2}'
}

get_field_types () {
    for field in "${!current_fields[@]}"; do
        fieldtype=$(get_field_type "$field")
        [[ "$fieldtype" =~ $re_type ]]
        echo -n "[$field]=${BASH_REMATCH[0]} "
    done
}

get_field_sizes () {
    for field in "${!current_fields[@]}"; do
        fieldtype=$(get_field_type "$field")
        [[ "$fieldtype" =~ $re_int ]]
        if [[ "${BASH_REMATCH[0]}" -gt 0 ]]; then
            echo -n "[$field]=${BASH_REMATCH[0]} "
        fi
    done
}

# sanitize_field FIELD VALUE
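# escapes quotes and backslashes and truncates or normalizes the value
# according to the column type; e.g. (illustrative) a Year value of
# "2005年5月" is reduced to "2005" and a varchar(100) value is cut to
# 100 characters.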
sanitize_field () {
    field=$1
    shift
    value="$*"

    # quote values for SQL
    value=${value//\\/\\\\}
    value=${value//\'/\\\'}

    # field-type specific filters
    case "${field_types[$field]}" in
        int|bigint)
            [[ "$value" =~ $re_int ]]
            value=${BASH_REMATCH[0]}
            value=${value:0:${field_sizes[$field]}}
            ;;
        char|varchar)
            value=${value:0:${field_sizes[$field]}}
            ;;
        timestamp)
            [[ "$value" =~ $re_timestamp ]]
            value=${BASH_REMATCH[0]}
            ;;
    esac

    # field-specific filters
    case "$field" in
        year)
            # filter out Chinese date stamps
            [[ "$value" =~ $re_year ]]
            value=${BASH_REMATCH[0]}
            ;;
    esac

    echo -n "$value"
}

# libgen_api ID LIMIT TIME_LAST_MODIFIED
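# asks the JSON API for records modified after TIME_LAST_MODIFIED with an id
# above ID; the request looks like (illustrative, default endpoint):
#   http://libgen.rs/json.php?fields=*&idnewer=123456&mode=newer&limit1=1000&timenewer=2021-05-01%2012:00:00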
libgen_api () {
    id="$1"
    shift
    limit="$1"
    shift
    if ! newer=$(date -d "$*" +'%Y-%m-%d%%20%H:%M:%S'); then
        exit_with_error "date error: $* is not a valid date"
    fi

    echo "$newer" > "$update_newer"

    $torsocks "$curl" -s "${api}?"'fields=*&idnewer='"${id}"'&mode=newer&limit1='"${limit}"'&timenewer='"${newer}"
}

# get_updates ID LIMIT TIME_LAST_MODIFIED
get_updates () {
    id="$1"
    shift
    limit="$1"
    shift
    last="$*"
    libgen_api "$id" "$limit" "$last" > "${updates}"
    $jq '.|length' "${updates}" > "${update_count}"
}

get_time_last_modified () {
    dbx "$db" 'select MAX(TimeLastModified) FROM updated;'|tail -1
}

get_max_id () {
    dbx "$db" 'select MAX(id) FROM updated;'|tail -1
}

get_update_count () {
    echo $(($(cat "${update_count}")-1))
}

check_fields () {
    updates_fields=($(libgen_api 1 2 '2000-01-01'|$jq -r '.[0]|keys|@sh'))
    db_fields="${!current_fields[*]}"
    db_fields="${db_fields,,}"

    # check for extra fields in api response
    for index in "${!updates_fields[@]}"; do
        field="${updates_fields[$index]%\'}"
        field="${field#\'}"
        if [[ ! $db_fields =~ ${field,,} ]]; then
            if [[ ! -v quiet ]]; then
                echo "unknown field in api response: ${field} (consider refreshing database from dump)"
            fi
            unknown_fields+="${field,,} "
        else
            :
        fi
    done

    # check for missing fields in api response
    [[ $verbose -ge 1 ]] && {
        for field in "${!current_fields[@]}"; do
            if [[ ! -v quiet && ! ${updates_fields[*],,} =~ ${field,,} ]]; then
                echo "missing field in api response: $field"
            fi
        done
    }
}

cleanup () {
    rm -rf "${tmpdir}"
}

help () {
    echo "$(basename "$(readlink -f "$0")")" "version $version"
    cat <<- 'EOT'

Usage: update_libgen OPTIONS

-l LIMIT      get updates in blocks of LIMIT entries
-v            be verbose about what is being updated; repeat for more verbosity:
              -v:  show basic info (number of updates, etc)
              -vv: show ID, Title and TimeLastModified for each update
-n            do not update the database; use together with -v or -vv to show
              how many (-v) and which (-vv) titles would be updated
-j FILE       dump (append) JSON to FILE
-s FILE       dump (append) SQL to FILE
-u URL        use URL to access the libgen API (overrides the default)
-t DATETIME   get updates since DATETIME (ignoring TimeLastModified in the database);
              use together with -s to create an SQL update file for
              non-networked machines
-i ID         get updates starting from ID

-H DBHOST     database host
-P DBPORT     database port
-U DBUSER     database user
-D DATABASE   database name

-a APIHOST    use APIHOST as API server
-@ TORPORT    use tor (through torsocks) to connect to the libgen API server
-c            run classify over new records to get classification data
-q            don't warn about missing fields in the database or API response
-h            this help message

EOT
}

exlock prepare || exit 1

main "$@"