505 lines
11 KiB
Bash
Executable file
505 lines
11 KiB
Bash
Executable file
#!/usr/bin/env bash
#
# Pull record updates from the libgen JSON API and upsert them into a
# MySQL database. Only one instance runs at a time (flock on LOCKFILE).

version="0.6"
release="20200805"

# single-quoted so the handler names are resolved when the trap fires
trap 'trap_error' TERM
trap 'trap_exit' EXIT

# exit_with_error signals this PID so that errors raised inside subshells
# still terminate the top-level script
export TOP_PID=$$

# quote $0: the script may live in / be named with a path containing spaces
LOCKFILE="/var/lock/$(basename "$0")"
LOCKFD=99
LC_ALL=C
|
|
|
|
# main OPTIONS...
# Entry point: take the exclusive lock, load preferences (defaults, then the
# optional config file, then command-line options), read the table layout
# from the database and then repeatedly fetch blocks of updates from the API,
# turning each block into one SQL transaction.
main () {

  exlock_now || exit 1

  # PREFERENCES
  config=${XDG_CONFIG_HOME:-$HOME/.config}/books.conf

  dbhost="localhost"
  dbport="3306"
  db="libgen"
  dbuser="libgen"
  limit=1000

  api="http://gen.lib.rus.ec/json.php"
  #api="http://libgen.io/json.php"
  #api="http://libgen.lc/json.php"
  #api="http://libgen.unblocked.tv/json.php"

  # source config file if it exists
  [[ -f ${config} ]] && source "${config}"

  # (more or less) END OF PREFERENCES

  tmpdir=$(mktemp -d '/tmp/update_libgen.XXXXXX')
  updates="${tmpdir}/updates"
  update_count="${tmpdir}/update_count"
  update_sql="${tmpdir}/update_sql"
  update_last_modified="${tmpdir}/update_last_modified"
  update_last_id="${tmpdir}/update_last_id"

  verbose=0
  no_action=0
  unknown_fields=""

  re_type='[a-z]+'
  re_int='[0-9]+'
  re_year='[0-9]{4}'
  re_timestamp='[0-9]{4}-[0-9]{2}-[0-9]{2} [0-2][0-9]:[0-5][0-9]:[0-5][0-9]'

  # "U:" added: -U is handled below and documented in help but was missing
  # from the option string, so getopts rejected it
  while getopts "a:D:j:hH:i:l:np:P:qs:t:u:U:v@" OPTION
  do
    case $OPTION in
      j)
        json_dump="${OPTARG}"
        ;;
      s)
        sql_dump="${OPTARG}"
        ;;
      v)
        verbose=$((verbose + 1))
        ;;
      n)
        no_action=1
        ;;
      l)
        limit="${OPTARG}"
        if [[ $limit -le 1 ]]; then
          exit_with_error "limit too low (-l ${limit}), minimum is 2"
        fi
        ;;
      t)
        startdatetime="${OPTARG}"
        ;;
      i)
        echo "${OPTARG}" > "${update_last_id}"
        ;;
      u)
        api="${OPTARG}"
        ;;
      H)
        dbhost="${OPTARG}"
        ;;
      P)
        dbport="${OPTARG}"
        ;;
      U)
        dbuser="${OPTARG}"
        ;;
      p)
        dbpass="-p${OPTARG}"
        ;;
      D)
        db="${OPTARG}"
        ;;
      a)
        if url_available "${OPTARG}?fields=id&ids=0"; then
          api="${OPTARG}"
        else
          exit_with_error "-a ${OPTARG}: API endpoint not available"
        fi
        ;;
      @)
        use_torsocks=1
        source "$(command -v torsocks)" on
        ;;
      q)
        quiet=1
        ;;
      h)
        help
        exit
        ;;
    esac
  done

  check_sanity

  # The mysql command line is assembled only after option parsing so that
  # -H/-P/-U/-p/-D actually take effect; ${dbpass} is already in "-p..." form.
  # It is expanded unquoted by dbx, i.e. word-split into arguments.
  mysqlcmd="mysql -s --skip-column-names -N -B -h ${dbhost} -P ${dbport} -u ${dbuser} ${dbpass:-} ${db}"

  # table layout; the main table comes last in ${tables}
  declare -a tables=(description hashes updated)
  declare -A current_fields="($(get_current_fields))"
  declare -A field_types="($(get_field_types))"
  declare -A field_sizes="($(get_field_sizes))"
  declare -A columns=()
  declare -A values=()
  declare -A upsert=()

  check_fields

  # The loop condition runs in a subshell; state is carried between rounds
  # through the files in ${tmpdir}, not through shell variables.
  while (
    if [[ -s ${update_last_modified} ]]; then
      last_update="$(cat "${update_last_modified}")"
      last_update_in_db="$(get_time_last_modified)"
      if [[ $last_update != "$last_update_in_db" && $no_action == 0 ]]; then
        exit_with_error "uh oh... something went wrong, last update in db does not equal last update from api response..."
      fi
    elif [[ -n $startdatetime ]]; then
      last_update="${startdatetime}"
    else
      last_update="$(get_time_last_modified)"
    fi

    last_id=$([[ -s ${update_last_id} ]] && cat "${update_last_id}" || get_max_id)

    get_updates $last_id $limit "$last_update"

    updcnt=$(get_update_count)

    [[ -n $json_dump ]] && cat "${updates}" >> "${json_dump}"

    if [[ $verbose -ge 1 ]]; then
      echo "database last modified: $last_update";
      # update counter is 0-based, humans prefer 1-based notation
      if [[ ${updcnt} -gt 0 ]]; then
        echo "$((updcnt + 1)) updates";
      else
        more=$([[ -s $update_last_id ]] && echo "more " || echo "")
        echo "no ${more}updates"
      fi
      echo ;
    fi

    test $updcnt -gt 0
  ); do

    updcnt=$(get_update_count)
    count=0
    echo "start transaction;" > "${update_sql}"

    while [ $count -le $updcnt ]; do
      declare -A record
      sql=""

      while IFS="=" read -r key value; do
        # drop unknown fields; whole-word comparison so that a key which is
        # merely a substring of an unknown field name is not dropped
        if [[ " ${unknown_fields} " != *" ${key,,} "* ]]; then
          # limit field size to avoid choking jq on overly long strings
          [[ ${#value} -gt 1000 ]] && value="${value:0:997}..."
          record[${key,,}]="$value"
        fi
      done < <(jq -r ".[$count]"'|to_entries|map("\(.key)=\(.value|tostring|.[0:4000]|gsub("\n";"\\n"))")|.[]' "${updates}")

      # record current position
      echo "${record['id']}" > "${update_last_id}"
      echo "${record['timelastmodified']}" > "${update_last_modified}"

      if [[ $verbose -ge 2 ]]; then
        echo "ID: ${record['id']}";
        echo "Author: ${record['author']}";
        echo "Title: ${record['title']}";
        echo "Modified: ${record['timelastmodified']}";
        echo
      fi

      md5="${record[md5]}"

      # split fields between tables
      for key in "${!record[@]}"; do
        table=${current_fields[$key]}
        # a field without a home table cannot be stored; an empty
        # associative-array subscript would be an error
        [[ -n $table ]] || continue
        columns[$table]+="${key},"
        value=${record[$key]}
        if [ -n "$value" ]; then
          value=$(sanitize_field "$key" "$value")
        fi
        values[$table]+="'$value',"
        upsert[$table]+="${key} = values(${key}),"
      done

      # add md5 to secondary tables (all but the last)
      for (( n = 0; n < ${#tables[@]} - 1; n++ )); do
        table="${tables[$n]}"
        if [[ -n "${columns[$table]}" ]]; then
          columns[$table]+="md5,"
          values[$table]+="'$md5',"
          upsert[$table]+="md5 = values(md5),"
        fi
      done

      # main table (last in tables array) first
      for (( n = ${#tables[@]} - 1; n >= 0; n-- )); do
        table="${tables[$n]}"
        if [[ -n "${columns[$table]}" ]]; then
          # %? strips the trailing comma from each accumulated list
          sql+="insert into $table (${columns[$table]%?}) values(${values[$table]%?}) on duplicate key update ${upsert[$table]%?};"
        fi
      done

      echo "${sql}" >> "${update_sql}"
      [[ -n $sql_dump ]] && echo "${sql}" >> "${sql_dump}"

      # reset per-record state
      unset record
      unset key
      unset value
      unset sql
      columns=()
      values=()
      upsert=()

      count=$((count + 1))
    done

    echo "commit;" >> "${update_sql}"

    [[ $no_action == 0 ]] && dbx < "${update_sql}"
  done
}
|
|
|
|
# dbx [SQL...]
# Run the mysql client command line held in ${mysqlcmd}. With arguments they
# are passed after -e; without arguments mysql reads its statements from stdin.
dbx () {
  if (( $# == 0 )); then
    $mysqlcmd
    return
  fi

  $mysqlcmd -e "$@"
}
|
|
|
|
# get_current_fields
# For every table in ${tables}, print one "[column]=table" line per column
# (column name lower-cased), ready to be used as the body of an
# associative-array assignment.
get_current_fields () {
  local table

  for table in "${tables[@]}"; do
    # hand the table name to awk as a variable instead of splicing it into
    # the awk program text
    dbx "describe $table;" \
      | awk -v table="$table" '{print "[" tolower($1) "]=" table}'
  done
}
|
|
|
|
# get_field_type FIELD
# Print the column type (second column of "show fields") of FIELD, looked up
# in the table that ${current_fields} maps FIELD to.
get_field_type () {
  # locals, so a direct call cannot clobber the caller's field/table variables
  local field="$1"
  local table="${current_fields[$field]}"

  dbx "show fields from $table where field=\"$field\";" | awk '{print $2}'
}
|
|
|
|
# get_field_types
# Print "[field]=type " for every key of ${current_fields}, where type is the
# first run of lower-case letters in the column type (e.g. "varchar" from
# "varchar(200)"). The type is empty when the column type has no such run.
get_field_types () {
  local field fieldtype basetype

  for field in "${!current_fields[@]}"; do
    fieldtype=$(get_field_type "$field")
    basetype=""
    # only read BASH_REMATCH when the match actually succeeded
    if [[ "$fieldtype" =~ $re_type ]]; then
      basetype=${BASH_REMATCH[0]}
    fi
    printf '[%s]=%s ' "$field" "$basetype"
  done
}
|
|
|
|
# get_field_sizes
# Print "[field]=size " for every key of ${current_fields} whose column type
# contains a positive number (e.g. 200 from "varchar(200)"). Fields whose
# type carries no number produce no entry.
get_field_sizes () {
  local field fieldtype

  for field in "${!current_fields[@]}"; do
    fieldtype=$(get_field_type "$field")
    # only read BASH_REMATCH when the match actually succeeded
    if [[ "$fieldtype" =~ $re_int ]] && [[ "${BASH_REMATCH[0]}" -gt 0 ]]; then
      printf '[%s]=%s ' "$field" "${BASH_REMATCH[0]}"
    fi
  done
}
|
|
|
|
# sanitize_field FIELD VALUE
# Print VALUE made safe for use inside a single-quoted SQL string literal for
# column FIELD:
#   1. type filter  - int/bigint keep the first run of digits, timestamp keeps
#                     the first timestamp-shaped substring, int/bigint and
#                     char/varchar are cut to the column size when one is known
#   2. field filter - year keeps the first four-digit run
#   3. escaping     - backslashes and single quotes are backslash-escaped
# Filtering and truncation happen BEFORE escaping: truncating an already
# escaped string can cut an escape sequence in half and leave a dangling
# backslash that swallows the closing quote of the SQL literal.
sanitize_field () {
  local field=$1
  shift
  local value="$*"
  # empty when no size is known for this column (e.g. integer columns that
  # report no display width); in that case no truncation is applied
  local size=${field_sizes[$field]:-}

  # field-type specific filters
  case "${field_types[$field]:-}" in
    int|bigint)
      if [[ "$value" =~ $re_int ]]; then
        value=${BASH_REMATCH[0]}
      else
        value=""
      fi
      if [[ -n $size ]]; then
        value=${value:0:$size}
      fi
      ;;
    char|varchar)
      if [[ -n $size ]]; then
        value=${value:0:$size}
      fi
      ;;
    timestamp)
      if [[ "$value" =~ $re_timestamp ]]; then
        value=${BASH_REMATCH[0]}
      else
        value=""
      fi
      ;;
  esac

  # field-specific filters
  case "$field" in
    year)
      # filter out Chinese date stamps
      if [[ "$value" =~ $re_year ]]; then
        value=${BASH_REMATCH[0]}
      else
        value=""
      fi
      ;;
  esac

  # quote values for SQL (backslash first, then single quote)
  value=${value//\\/\\\\}
  value=${value//\'/\\\'}

  # printf, not echo -n: the value is arbitrary data and may look like an
  # echo option
  printf '%s' "$value"
}
|
|
|
|
# libgen_api ID LIMIT TIME_LAST_MODIFIED
# Query ${api} for all fields of up to LIMIT records with id newer than ID
# and modification time newer than TIME_LAST_MODIFIED; the raw response is
# written to stdout. TIME_LAST_MODIFIED is anything `date -d` accepts.
# Calls exit_with_error and returns 1 when the date cannot be parsed.
libgen_api () {
  # locals: a direct call must not overwrite the caller's ${limit}
  local id="$1"
  local limit="$2"
  shift
  shift
  local last

  # Test date's own exit status (the former debug write to /tmp/last sat
  # between the call and the check and masked it). %%20 yields a literal
  # "%20", i.e. a URL-encoded space between date and time.
  if ! last=$(date -d "$*" +'%Y-%m-%d%%20%H:%M:%S'); then
    exit_with_error "date error: $* is not a valid date"
    return 1
  fi

  # quoted as a whole: the URL contains ?, * and & which must not be
  # interpreted by the shell
  curl -s "${api}?fields=*&idnewer=${id}&mode=newer&limit1=${limit}&timenewer=${last}"
}
|
|
|
|
# get_updates ID LIMIT TIME_LAST_MODIFIED
# Store the API response for the given position in ${updates} and the number
# of elements jq counts in it in ${update_count}.
get_updates () {
  id="$1"
  limit="$2"
  shift
  shift
  # whatever remains is the timestamp (it may contain a space)
  last="$*"

  libgen_api $id $limit "$last" > ${updates}
  jq '.|length' ${updates} > ${update_count}
}
|
|
|
|
|
|
# get_time_last_modified
# Print the highest TimeLastModified value found in table "updated".
get_time_last_modified () {
  dbx 'select MAX(TimeLastModified) FROM updated;' | tail -n 1
}
|
|
|
|
# get_max_id
# Print the highest id found in table "updated".
get_max_id () {
  dbx 'select MAX(id) FROM updated;' | tail -n 1
}
|
|
|
|
# get_update_count
# Print the number stored in ${update_count} minus one, i.e. the index of the
# last record in the current block of updates.
get_update_count () {
  local total
  total=$(cat ${update_count})
  echo $((total - 1))
}
|
|
|
|
# find_tool NAME
# Look NAME up on the command search path and store the result in ${tool};
# calls exit_with_error when NAME cannot be found.
find_tool () {
  # command -v is the shell builtin lookup; its status is tested directly
  # instead of going through `which` and $?
  if ! tool=$(command -v "$1"); then
    exit_with_error "missing program: $1; please install and try again"
  fi
}
|
|
|
|
# check_fields
# Compare the field names of one sample API record with the database columns
# in ${current_fields} (case-insensitively):
#   - API fields without a database column are appended (lower-cased, space
#     separated) to ${unknown_fields} and reported unless ${quiet} is set
#   - with verbose >= 1, database columns absent from the API response are
#     reported unless ${quiet} is set
# Names are compared as whole words; a substring/regex match would treat
# e.g. "id" as present whenever any longer name merely contains "id".
check_fields () {
  local -a updates_fields
  local field lcfield db_fields
  local api_fields=" "

  # jq's @sh yields the keys as single-quoted words on one line
  updates_fields=($(libgen_api 1 2 '2000-01-01' | jq -r '.[0]|keys|@sh'))

  # space-delimited, lower-cased list of database columns
  db_fields=" ${!current_fields[*]} "
  db_fields="${db_fields,,}"

  # check for extra fields in api response
  for field in "${updates_fields[@]}"; do
    # strip the quotes added by @sh
    field="${field%\'}"
    field="${field#\'}"
    lcfield="${field,,}"
    api_fields+="${lcfield} "

    if [[ "$db_fields" != *" ${lcfield} "* ]]; then
      if [[ ! -v quiet ]]; then
        echo "unknown field in api response: ${field} (consider refreshing database from dump)"
      fi
      unknown_fields+="${lcfield} "
    fi
  done

  # check for missing fields in api response
  if [[ $verbose -ge 1 && ! -v quiet ]]; then
    for field in "${!current_fields[@]}"; do
      if [[ "$api_fields" != *" ${field,,} "* ]]; then
        echo "missing field in api response: $field"
      fi
    done
  fi
}
|
|
|
|
# url_available URL
# Succeed when curl can fetch the first byte of URL; nothing is printed.
url_available () {
  url="$1"

  curl --silent --fail --range 0-0 --output /dev/null "$url"
}
|
|
|
|
# check_sanity
# Make sure every external program this script relies on can be found.
check_sanity () {
  local needed

  for needed in "jq" "awk" "date" "mysql" "curl"; do
    find_tool "$needed"
  done
}
|
|
|
|
# cleanup
# Remove the temporary working directory (if one was created) and release
# the lock.
cleanup () {
  # tmpdir is unset when we exit before main created it; quote and guard the
  # path so rm -rf never sees an empty or word-split argument
  if [[ -n ${tmpdir:-} ]]; then
    rm -rf -- "${tmpdir}"
  fi

  unlock
}
|
|
|
|
# trap_error
# Signal handler: clean up, then leave with exit status 1.
trap_error () {
  cleanup
  exit 1
}
|
|
|
|
# trap_exit
# Exit handler: clean up, then leave without setting an explicit status.
trap_exit () {
  cleanup
  exit
}
|
|
|
|
# _lock MODE
# Apply flock option(s) MODE (the letters after the dash, e.g. "xn", "s",
# "u") to file descriptor ${LOCKFD}.
_lock() {
  local mode="-$1"
  flock $mode $LOCKFD
}
|
|
|
|
# _no_more_locking
# Release our lock; if the exclusive lock can then be re-acquired without
# waiting, remove the lock file.
_no_more_locking() {
  _lock u
  # quoted: LOCKFILE is derived from the script name, which may contain
  # spaces; unquoted, rm would act on the split words instead
  _lock xn && rm -f -- "$LOCKFILE"
}
|
|
|
|
# _prepare_locking
# Open ${LOCKFILE} for writing on file descriptor ${LOCKFD} and install the
# EXIT handler.
_prepare_locking() {
  # eval is needed because the descriptor number comes from a variable
  eval "exec $LOCKFD>\"$LOCKFILE\""

  # Setting a trap replaces whatever EXIT handler was installed before, so
  # the handler set here must do the whole job: run cleanup (temp dir + lock)
  # as well as the lock-file teardown. Without cleanup the temporary
  # directory would be left behind on every normal exit. No explicit exit is
  # issued, so the script's exit status is preserved.
  trap 'cleanup; _no_more_locking' EXIT
}
|
|
|
|
# exlock_now: request the lock with flock options "xn" (exclusive,
# non-blocking); the status of _lock is returned.
exlock_now() { _lock xn; }
|
|
|
|
# exlock: request the lock with flock option "x" (exclusive).
exlock() { _lock x; }
|
|
|
|
# shlock: request the lock with flock option "s" (shared).
shlock() { _lock s; }
|
|
|
|
# unlock: release the lock (flock option "u").
unlock() { _lock u; }
|
|
|
|
# exit_with_error MESSAGE...
# Print "<script name>: MESSAGE" to stderr and terminate the script.
# The top-level shell is sent TERM (handled by its TERM trap) because this
# function is also called from subshells and command substitutions, where a
# plain exit would only end that subshell. The calling (sub)shell is then
# ended as well, so nothing after the error keeps running while the
# top-level shell waits to act on the signal.
exit_with_error () {
  echo -e "$(basename "$0"): $*" >&2

  kill -s TERM "$TOP_PID"
  exit 1
}
|
|
|
|
# help
# Print the program name (resolved through symlinks), the version and the
# usage text to stdout.
help () {
  echo "$(basename "$(readlink -f "$0")")" "version $version"
  # The here-document body and its terminator start at column 0 so that the
  # text does not depend on leading tabs being present for <<- to strip.
  cat <<- 'EOT'

Usage: update_libgen OPTIONS

    -l LIMIT    get updates in blocks of LIMIT entries
    -v          be verbose about what is being updated; repeat for more verbosity:
                -v: show basic info (number of updates, etc)
                -vv: show ID, Title and TimeLastModified for each update
    -n          do not update database. Use together with -v or -vv to show
                how many (-v) and which (-vv) titles would be updated.
    -j FILE     dump (append) json to FILE
    -s FILE     dump (append) sql to FILE
    -u URL      use URL to access the libgen API (overrides default)
    -t DATETIME get updates since DATETIME (ignoring TimeLastModified in database)
                use this option together with -s to create an sql update file to update
                non-networked machines
    -i ID       get updates from ID

    -H DBHOST   database host
    -P DBPORT   database port
    -U DBUSER   database user
    -p DBPASS   database password (use empty string to get a password prompt)
    -D DATABASE database name

    -a APIHOST  use APIHOST as API server
    -@          use tor (through torsocks) to connect to libgen API server
    -q          don't warn about missing fields in database or api response
    -h          this help message

EOT
}
|
|
|
|
# open the lock file descriptor and install the exit handling first ...
_prepare_locking

# ... then hand all command-line arguments to main
main "$@"
|