books/update_libgen

505 lines
11 KiB
Bash
Executable file

#!/usr/bin/env bash
# update_libgen: fetch new/changed records from the libgen JSON API and upsert
# them into a local MySQL database (see main and the help function).
version="0.6"
release="20200805"
# exit_with_error sends TERM to TOP_PID; this trap turns that signal into
# cleanup + exit 1 (see trap_error)
trap "trap_error" TERM
# NOTE: _prepare_locking installs its own EXIT trap afterwards, which replaces this one
trap "trap_exit" EXIT
# PID of the top-level shell, so helpers running in subshells can signal it
export TOP_PID=$$
# lock file and file descriptor used by the flock-based locking helpers below
LOCKFILE="/var/lock/$(basename $0)"
LOCKFD=99
# NOTE(review): assigned but not explicitly exported; child processes only see
# it if LC_ALL was already present in the environment -- confirm intent
LC_ALL=C
# main ARGS...
# Entry point. Takes the instance lock, loads the preferences (defaults, then
# the optional config file, then command line options), inspects the database
# schema, and then repeatedly fetches blocks of updates from the libgen API
# and turns every record into "insert ... on duplicate key update" statements
# which are executed inside one transaction per block.
#
# State which must survive the subshell used as loop condition (last seen id,
# last seen modification time, number of fetched records) is kept in files
# below ${tmpdir}.
main () {
    # refuse to run when another instance already holds the lock
    exlock_now || exit 1

    # PREFERENCES
    config=${XDG_CONFIG_HOME:-$HOME/.config}/books.conf

    dbhost="localhost"
    dbport="3306"
    db="libgen"
    dbuser="libgen"
    limit=1000

    api="http://gen.lib.rus.ec/json.php"
    #api="http://libgen.io/json.php"
    #api="http://libgen.lc/json.php"
    #api="http://libgen.unblocked.tv/json.php"

    # source config file if it exists
    [[ -f ${config} ]] && source "${config}"
    # (more or less) END OF PREFERENCES

    tmpdir=$(mktemp -d '/tmp/update_libgen.XXXXXX') || exit 1
    updates="${tmpdir}/updates"
    update_count="${tmpdir}/update_count"
    update_sql="${tmpdir}/update_sql"
    update_last_modified="${tmpdir}/update_last_modified"
    update_last_id="${tmpdir}/update_last_id"

    verbose=0
    no_action=0
    unknown_fields=""

    re_type='[a-z]+'
    re_int='[0-9]+'
    re_year='[0-9]{4}'
    re_timestamp='[0-9]{4}-[0-9]{2}-[0-9]{2} [0-2][0-9]:[0-5][0-9]:[0-5][0-9]'

    # the main table comes last, the secondary tables are keyed by md5
    declare -a tables="(description hashes updated)"
    declare -A columns=()
    declare -A values=()
    declare -A upsert=()

    # "U:" was missing from the option string although -U is handled below
    # and documented in the help text
    while getopts "a:D:j:hH:i:l:np:P:qs:t:u:U:v@" OPTION
    do
        case $OPTION in
            j)
                json_dump="${OPTARG}"
                ;;
            s)
                sql_dump="${OPTARG}"
                ;;
            v)
                verbose=$((verbose+1))
                ;;
            n)
                no_action=1
                ;;
            l)
                limit="${OPTARG}"
                if [[ $limit -le 1 ]]; then
                    exit_with_error "limit too low (-l ${limit}), minimum is 2"
                fi
                ;;
            t)
                startdatetime="${OPTARG}"
                ;;
            i)
                echo "${OPTARG}" > "${update_last_id}"
                ;;
            u)
                api="${OPTARG}"
                ;;
            H)
                dbhost="${OPTARG}"
                ;;
            P)
                dbport="${OPTARG}"
                ;;
            U)
                dbuser="${OPTARG}"
                ;;
            p)
                dbpass="-p${OPTARG}"
                ;;
            D)
                db="${OPTARG}"
                ;;
            a)
                if url_available "${OPTARG}?fields=id&ids=0"; then
                    api="${OPTARG}"
                else
                    exit_with_error "-a ${OPTARG}: API endpoint not available"
                fi
                ;;
            @)
                use_torsocks=1
                source $(which torsocks) on
                ;;
            q)
                quiet=1
                ;;
            h)
                help
                exit
                ;;
        esac
    done

    check_sanity

    # The mysql command line and the schema queries depend on the connection
    # options, so they are set up only after the command line has been parsed
    # (otherwise -H, -P, -U, -D and -p have no effect).
    # dbpass, when set, has to be in mysql's -pPASSWORD form; the string is
    # word-split by dbx, so none of the settings may contain whitespace.
    mysqlcmd="mysql -s --skip-column-names -N -B -h ${dbhost} -P ${dbport} -u ${dbuser} ${dbpass} ${db}"
    declare -A current_fields="($(get_current_fields))"
    declare -A field_types="($(get_field_types))"
    declare -A field_sizes="($(get_field_sizes))"

    check_fields

    # The loop condition runs in a subshell: it fetches the next block of
    # updates and succeeds when there is something to process. Variables set
    # in there are lost, hence the state files.
    while (
        if [[ -s ${update_last_modified} ]]; then
            last_update="$(cat "${update_last_modified}")"
            last_update_in_db="$(get_time_last_modified)"
            if [[ $last_update != "$last_update_in_db" && $no_action == 0 ]]; then
                exit_with_error "uh oh... something went wrong, last update in db does not equal last update from api response..."
                # the TERM signal is handled by the parent shell once this
                # subshell is done, so stop here instead of fetching more
                exit 1
            fi
        elif [[ -n $startdatetime ]]; then
            last_update="${startdatetime}"
        else
            last_update="$(get_time_last_modified)"
        fi

        last_id=$([[ -s ${update_last_id} ]] && cat "${update_last_id}" || get_max_id)

        get_updates $last_id $limit "$last_update"

        updcnt=$(get_update_count)

        [[ -n $json_dump ]] && cat "${updates}" >> "${json_dump}"

        if [[ $verbose -ge 1 ]]; then
            echo "database last modified: $last_update";
            # update counter is 0-based, humans prefer 1-based notation
            if [[ ${updcnt} -gt 0 ]]; then
                echo "$((${updcnt}+1)) updates";
            else
                more=$([[ -s $update_last_id ]] && echo "more " || echo "")
                echo "no ${more}updates"
            fi
            echo ;
        fi

        test $updcnt -gt 0
    ); do
        updcnt=$(get_update_count)
        count=0
        echo "start transaction;" > "${update_sql}"

        while [[ $count -le $updcnt ]]; do
            declare -A record
            sql=""

            while IFS="=" read -r key value; do
                [[ -n $key ]] || continue
                # drop unknown fields; compare whole names, a substring match
                # would also drop e.g. "id" when "identifier" is unknown
                if [[ " ${unknown_fields} " != *" ${key,,} "* ]]; then
                    # limit field size to avoid choking jq on overly long strings
                    [[ ${#value} -gt 1000 ]] && value="${value:0:997}..."
                    record[${key,,}]="$value"
                fi
            done < <(jq -r ".[$count]"'|to_entries|map("\(.key)=\(.value|tostring|.[0:4000]|gsub("\n";"\\n"))")|.[]' "${updates}")

            # record current position
            echo "${record['id']}" > "${update_last_id}"
            echo "${record['timelastmodified']}" > "${update_last_modified}"

            if [[ $verbose -ge 2 ]]; then
                echo "ID: ${record['id']}";
                echo "Author: ${record['author']}";
                echo "Title: ${record['title']}";
                echo "Modified: ${record['timelastmodified']}";
                echo
            fi

            md5="${record[md5]}"

            # split fields between tables
            for key in "${!record[@]}"; do
                table=${current_fields[$key]}
                # a field without a table can not be stored (and an empty
                # array subscript is an error), skip it
                [[ -n $table ]] || continue
                columns[$table]+="${key},"
                value=${record[$key]}
                if [[ -n "$value" ]]; then
                    value=$(sanitize_field "$key" "$value")
                fi
                values[$table]+="'$value',"
                upsert[$table]+="${key} = values(${key}),"
            done

            # add md5 to secondary tables (all but the last)
            for ((n=0; n<=${#tables[@]}-2; n++)); do
                table="${tables[$n]}"
                if [[ -n "${columns[$table]}" ]]; then
                    columns[$table]+="md5,"
                    values[$table]+="'$md5',"
                    upsert[$table]+="md5 = values(md5),"
                fi
            done

            # main table (last in tables array) first
            for ((n=${#tables[@]}-1; n>=0; n--)); do
                table="${tables[$n]}"
                if [[ -n "${columns[$table]}" ]]; then
                    # %? strips the trailing comma from the collected lists
                    sql+="insert into $table (${columns[$table]%?}) values(${values[$table]%?}) on duplicate key update ${upsert[$table]%?};"
                fi
            done

            echo "${sql}" >> "${update_sql}"
            [[ -n $sql_dump ]] && echo "${sql}" >> "${sql_dump}"

            unset record
            unset key
            unset value
            unset sql
            columns=()
            values=()
            upsert=()

            count=$((count+1))
        done

        echo "commit;" >> "${update_sql}"
        [[ $no_action == 0 ]] && dbx < "${update_sql}"
    done
}
# dbx [SQL]
# Run the mysql client as configured in $mysqlcmd (deliberately word-split).
# With arguments they are passed as the statement to execute (-e); without
# arguments mysql reads its statements from stdin.
dbx () {
    if [ $# -eq 0 ]; then
        $mysqlcmd
        return
    fi
    $mysqlcmd -e "$@"
}
# get_current_fields
# For every table listed in the tables array print one "[column]=table" line
# per column (column name lowercased), as reported by "describe". The output
# is meant to be used as the body of an associative array assignment.
get_current_fields () {
    for table in ${tables[@]}; do
        dbx "describe $table;" \
            | awk -v tbl="$table" '{ print "[" tolower($1) "]=" tbl }'
    done
}
# get_field_type FIELD
# Print the second column of "show fields" for FIELD, looked up in the table
# which current_fields maps FIELD to (e.g. a type such as varchar(200)).
get_field_type () {
    field="$1"
    table="${current_fields[$field]}"
    local query
    printf -v query 'show fields from %s where field="%s";' "$table" "$field"
    dbx "$query" | awk '{ print $2 }'
}
# get_field_types
# Print "[field]=type " for every known field, where type is the first run of
# lowercase letters in the reported column type (varchar(200) -> varchar).
# The output is meant to be used as the body of an associative array
# assignment.
get_field_types () {
    for field in ${!current_fields[@]}; do
        fieldtype=$(get_field_type "$field")
        # BASH_REMATCH is reset by every match attempt, so a failed match
        # yields an empty type
        [[ "$fieldtype" =~ $re_type ]]
        printf '[%s]=%s ' "$field" "${BASH_REMATCH[0]}"
    done
}
# get_field_sizes
# Print "[field]=size " for every known field whose reported column type
# contains a number greater than zero (varchar(200) -> 200). Fields without
# such a number are left out. The output is meant to be used as the body of
# an associative array assignment.
get_field_sizes () {
    for field in ${!current_fields[@]}; do
        fieldtype=$(get_field_type "$field")
        [[ "$fieldtype" =~ $re_int ]]
        [[ "${BASH_REMATCH[0]}" -gt 0 ]] || continue
        printf '[%s]=%s ' "$field" "${BASH_REMATCH[0]}"
    done
}
# sanitize_field FIELD VALUE
# Print VALUE made fit for use inside a single-quoted SQL string literal for
# column FIELD:
#   - int/bigint columns keep only the first run of digits
#   - char/varchar (and int) values are cut to the column size, when known
#   - timestamp columns keep only the first well-formed timestamp
#   - the year field keeps only the first 4-digit number
#   - backslashes and single quotes are escaped
# Reads the field_types and field_sizes arrays and the re_* patterns.
sanitize_field () {
    local field=$1
    shift
    local value="$*"
    # empty when the column has no recorded size (e.g. "int" without a
    # display width); such values must not be truncated to nothing
    local size="${field_sizes[$field]}"

    # field-type specific filters; these work on the raw value so that a
    # size limit can never cut an escape sequence in half
    case "${field_types[$field]}" in
        int|bigint)
            [[ "$value" =~ $re_int ]]
            value=${BASH_REMATCH[0]}
            [[ -n $size ]] && value=${value:0:${size}}
            ;;
        char|varchar)
            [[ -n $size ]] && value=${value:0:${size}}
            ;;
        timestamp)
            [[ "$value" =~ $re_timestamp ]]
            value=${BASH_REMATCH[0]}
            ;;
    esac

    # field-specific filters
    case "$field" in
        year)
            # filter out Chinese date stamps
            [[ "$value" =~ $re_year ]]
            value=${BASH_REMATCH[0]}
            ;;
    esac

    # quote values for SQL (last, so the result is always well-formed)
    value=${value//\\/\\\\}
    value=${value//\'/\\\'}

    printf '%s' "$value"
}
# libgen_api ID LIMIT TIME_LAST_MODIFIED
# Query the libgen JSON API (${api}) for at most LIMIT records newer than ID
# and modified after TIME_LAST_MODIFIED; the raw response goes to stdout.
# TIME_LAST_MODIFIED may be anything "date -d" understands. On an invalid
# date exit_with_error is called and 1 is returned without contacting the
# API.
libgen_api () {
    local id="$1"
    local limit="$2"
    shift 2
    local last

    # the space between date and time is URL-encoded as %20
    if ! last=$(date -d "$*" +'%Y-%m-%d%%20%H:%M:%S'); then
        exit_with_error "date error: $* is not a valid date"
        return 1
    fi

    # quoted as a whole: the URL contains ? and *, which are glob characters
    curl -s "${api}?fields=*&idnewer=${id}&mode=newer&limit1=${limit}&timenewer=${last}"
}
# get_updates ID LIMIT TIME_LAST_MODIFIED
# Store the API response for the given position in ${updates} and the number
# of elements in that response (as counted by jq) in ${update_count}.
get_updates () {
    id="$1"
    limit="$2"
    shift 2
    last="$@"

    libgen_api $id $limit "$last" > ${updates}
    jq '.|length' ${updates} > ${update_count}
}
# get_time_last_modified
# Print the highest TimeLastModified value found in the updated table (the
# last line of the query output).
get_time_last_modified () {
    local query='select MAX(TimeLastModified) FROM updated;'
    dbx "$query" | tail -n 1
}
# get_max_id
# Print the highest id found in the updated table (the last line of the
# query output).
get_max_id () {
    local query='select MAX(id) FROM updated;'
    dbx "$query" | tail -n 1
}
# get_update_count
# Print the index of the last record in the current block of updates, i.e.
# the number stored in ${update_count} minus one (0-based counter).
get_update_count () {
    local total
    total=$(cat ${update_count})
    echo $((${total}-1))
}
# find_tool NAME
# Make sure the program NAME can be found; its location is stored in the
# global variable tool. When it is missing exit_with_error is called and 1
# is returned.
# Uses the shell builtin "command -v" instead of the external "which", which
# is not installed everywhere.
find_tool () {
    if ! tool=$(command -v "$1"); then
        exit_with_error "missing program: $1; please install and try again"
        return 1
    fi
}
# check_fields
# Compare the field names of an API response with the columns known from the
# database (current_fields), ignoring case:
#   - fields only present in the API response are reported (unless quiet is
#     set) and appended, lowercased and space-terminated, to the global
#     unknown_fields
#   - with verbose >= 1 (and quiet unset) columns which the API response
#     lacks are reported as well
# Names are compared as whole words; a substring match would e.g. treat
# "ident" as known because "identifier" exists.
check_fields () {
    local -a updates_fields=()
    local field db_fields api_fields

    mapfile -t updates_fields < <(libgen_api 1 2 '2000-01-01' | jq -r '.[0]|keys[]')

    # space-delimited, lowercased name lists with sentinel spaces at both ends
    db_fields=" ${!current_fields[*]} "
    db_fields="${db_fields,,}"
    api_fields=" ${updates_fields[*],,} "

    # check for extra fields in api response
    for field in "${updates_fields[@]}"; do
        if [[ "$db_fields" != *" ${field,,} "* ]]; then
            if [[ ! -v quiet ]]; then
                echo "unknown field in api response: ${field} (consider refreshing database from dump)"
            fi
            unknown_fields+="${field,,} "
        fi
    done

    # check for missing fields in api response
    if [[ $verbose -ge 1 && ! -v quiet ]]; then
        for field in "${!current_fields[@]}"; do
            if [[ "$api_fields" != *" ${field,,} "* ]]; then
                echo "missing field in api response: $field"
            fi
        done
    fi
}
# url_available URL
# Succeed when curl can fetch the first byte of URL (HTTP errors count as
# failure); nothing is printed.
url_available () {
    url="$1"
    curl --silent --fail --range 0-0 --output /dev/null "$url"
}
# check_sanity
# Verify that all external programs this script relies on are available.
check_sanity () {
    local prog
    for prog in "jq" "awk" "date" "mysql" "curl"; do
        find_tool "$prog"
    done
}
# cleanup
# Remove the temporary work directory (if one has been created) and release
# the lock. The return status is that of unlock.
cleanup () {
    # tmpdir is unset when the script stops before creating it; the quotes
    # keep a path containing whitespace in one piece
    if [[ -n ${tmpdir:-} ]]; then
        rm -rf -- "${tmpdir}"
    fi
    unlock
}
# trap_error
# Handler for TERM (sent by exit_with_error): tidy up, then leave with
# status 1.
trap_error () {
    cleanup; exit 1
}
# trap_exit
# Handler registered for EXIT at the top of the script: run cleanup, then
# exit without an explicit status.
# NOTE(review): _prepare_locking registers its own EXIT trap later on, which
# replaces the registration of this handler -- confirm whether it still runs.
trap_exit () {
cleanup
exit
}
# _lock MODE
# Run flock on the lock file descriptor with the option letters in MODE
# (e.g. "xn" becomes -xn).
_lock() {
    local mode="-$1"
    flock $mode $LOCKFD
}
# _no_more_locking
# Release the lock; when the exclusive lock can then be taken again without
# blocking, remove the lock file.
_no_more_locking() {
    _lock u
    # a failed non-blocking exclusive lock leaves the lock file in place;
    # the status of that failed attempt is returned
    _lock xn || return
    rm -f $LOCKFILE
}
# _prepare_locking
# Open the lock file on file descriptor LOCKFD (for the flock based helpers)
# and register the EXIT handler.
_prepare_locking() {
    # eval is needed because the descriptor number comes from a variable
    eval "exec $LOCKFD>\"$LOCKFILE\""
    # There is only one EXIT trap per shell and this one replaces any set
    # earlier, so it has to do the general cleanup (temporary directory) as
    # well as the lock file removal; otherwise the temporary directory is
    # left behind on every normal exit.
    trap 'cleanup; _no_more_locking' EXIT
}
# exlock_now
# Try to take the exclusive lock without waiting (flock -xn on LOCKFD, via
# _lock); the return status is that of _lock.
exlock_now() {
_lock xn
}
# exlock
# Take the exclusive lock (flock -x on LOCKFD, via _lock).
exlock() {
_lock x
}
# shlock
# Take a shared lock (flock -s on LOCKFD, via _lock).
shlock() {
_lock s
}
# unlock
# Release the lock (flock -u on LOCKFD, via _lock).
unlock() {
_lock u
}
# exit_with_error MESSAGE...
# Write "<script name>: MESSAGE" to stderr (backslash escapes in MESSAGE are
# interpreted) and terminate the top-level shell by sending it TERM. This
# also works from within subshells, where a plain exit would not end the
# script.
exit_with_error () {
    local prog
    prog=$(basename $0)
    echo -e "${prog}: $*" >&2
    kill -s TERM $TOP_PID
}
# help
# Print the name of this script with its version, followed by the usage
# summary.
help () {
    local self
    self=$(basename $(readlink -f $0))
    echo ${self} "version $version"
    cat <<- 'EOT'
Usage: update_libgen OPTIONS
-l LIMIT get updates in blocks of LIMIT entries
-v be verbose about what is being updated; repeat for more verbosity:
-v: show basic info (number of updates, etc)
-vv: show ID, Title and TimeLastModified for each update
-n do not update database. Use together with -v or -vv to show
how many (-v) and which (-vv) titles would be updated.
-j FILE dump (append) json to FILE
-s FILE dump (append) sql to FILE
-u URL use URL to access the libgen API (overrides default)
-t DATETIME get updates since DATETIME (ignoring TimeLastModified in database)
use this option together with -s to create an sql update file to update
non-networked machines
-i ID get updates from ID
-H DBHOST database host
-P DBPORT database port
-U DBUSER database user
-p DBPASS database password (use empty string to get a password prompt)
-D DATABASE database name
-a APIHOST use APIHOST as API server
-@ use tor (through torsocks) to connect to libgen API server
-q don't warn about missing fields in database or api response
-h this help message
EOT
}
# open the lock file descriptor and register the EXIT handler before
# anything else, then hand all command line arguments to main
_prepare_locking
main "$@"