books/update_libgen
2021-06-01 14:39:51 +00:00

463 lines
11 KiB
Bash
Executable file

#!/usr/bin/env bash
# shellcheck disable=SC2034,SC1090,SC2155,SC2207
# update_libgen: incrementally update a local libgen database mirror from the
# libgen JSON API. Shared helpers (exlock, dbx, find_tool, url_available,
# exit_with_error, trap_error, trap_clean, ...) are expected to be defined in
# the 'books_functions' file sourced below.
version="0.6.1"
release="20210512"
# trap_error / trap_clean are presumably defined in books_functions — they are
# not visible in this file; the traps only fire after the source below succeeds.
trap "trap_error" TERM
trap "trap_clean" EXIT
export TOP_PID=$$  # exported so helpers can signal the main process (e.g. kill on fatal error)
# NOTE(review): LC_ALL is set but not exported — it affects this shell only,
# child tools (awk, sort, ...) keep their inherited locale. Confirm intentional.
LC_ALL=C
# Load the shared helper library from the script's own directory.
functions="$(dirname "$0")/books_functions"
if [ -f "$functions" ]; then
source "$functions"
else
echo "$functions not found"
exit 1
fi
main () {
  # Fetch updates from the libgen API in blocks of $limit records, convert
  # every JSON record into an SQL upsert spread over the libgen tables,
  # apply each block as one transaction, and optionally run classify /
  # import_metadata over the newly seen records.
  #
  # Fixes vs. previous revision:
  #   * '-l' validation called the non-existent 'exit_wit_error' (typo), so a
  #     too-low limit printed "command not found" and the script carried on.
  #   * classify loop: 'read' without -r and a useless 'cat' process
  #     substitution; now reads the file directly with 'read -r'.
  #   * removed dead 'keys' variable (assigned, never read, then unset).
  exlock now || exit 1
  # PREFERENCES
  config=${XDG_CONFIG_HOME:-$HOME/.config}/books.conf
  dbhost="localhost"
  dbport="3306"
  db="libgen"
  dbuser="libgen"
  limit=1000
  api="http://libgen.rs/json.php"
  # source config file if it exists
  [[ -f ${config} ]] && source "${config}"
  # (more or less) END OF PREFERENCES
  jq=$(find_tool "jq")
  curl=$(find_tool "curl")
  # scratch files live in a per-run temp dir (removed by cleanup on EXIT)
  tmpdir=$(mktemp -d '/tmp/update_libgen.XXXXXX')
  updates="${tmpdir}/updates"                            # raw JSON API response
  update_count="${tmpdir}/update_count"                  # record count of last response
  update_sql="${tmpdir}/update_sql"                      # SQL transaction being built
  update_last_modified="${tmpdir}/update_last_modified"  # checkpoint: last timestamp seen
  update_last_id="${tmpdir}/update_last_id"              # checkpoint: last record id seen
  update_newer="${tmpdir}/update_newer"
  verbose=0
  no_action=0
  unknown_fields=""
  # regexes shared with sanitize_field / check_fields / get_field_*
  re_type='[a-z]+'
  re_int='[0-9]+'
  re_year='[0-9]{4}'
  re_timestamp='[0-9]{4}-[0-9]{2}-[0-9]{2} [0-2][0-9]:[0-5][0-9]:[0-5][0-9]'
  # last element is the main table; the others are secondary, keyed on md5
  declare -a tables="(description hashes updated)"
  declare -A current_fields="($(get_current_fields))"  # field -> table
  declare -A field_types="($(get_field_types))"        # field -> SQL type
  declare -A field_sizes="($(get_field_sizes))"        # field -> declared size
  declare -A columns=()
  declare -A values=()
  declare -A upsert=()
  while getopts "a:D:j:hH:i:l:nP:U:qs:ct:u:v@:" OPTION
  do
    case $OPTION in
      j)
        json_dump="${OPTARG}"
        ;;
      s)
        sql_dump="${OPTARG}"
        ;;
      c)
        classify=$(find_tool "classify")
        import_metadata=$(find_tool "import_metadata")
        classifile="${tmpdir}/classifile"
        ;;
      v)
        ((verbose++))
        ;;
      n)
        no_action=1
        ;;
      l)
        limit="${OPTARG}"
        if [[ $limit -le 1 ]]; then
          # FIX: was 'exit_wit_error' (typo), which silently failed
          exit_with_error "limit too low (-l ${limit}), minimum is 2"
        fi
        ;;
      t)
        startdatetime="${OPTARG}"
        ;;
      i)
        echo "${OPTARG}" > "${update_last_id}"
        ;;
      u)
        api="${OPTARG}"
        ;;
      H)
        dbhost="${OPTARG}"
        ;;
      P)
        dbport="${OPTARG}"
        ;;
      U)
        dbuser="${OPTARG}"
        ;;
      D)
        db="${OPTARG}"
        ;;
      a)
        if url_available "${OPTARG}?fields=id&ids=0"; then
          api="${OPTARG}"
        else
          exit_with_error "-a ${OPTARG}: API endpoint not available"
        fi
        ;;
      @)
        torsocks=$(find_tool "torsocks")
        export TORSOCKS_TOR_PORT=${OPTARG}
        ;;
      q)
        quiet=1
        ;;
      h)
        help
        exit
        ;;
      *)
        exit_with_error "unknown option $OPTION"
        ;;
    esac
  done
  check_fields
  # NB: the loop condition runs in an explicit subshell. It decides where to
  # resume (checkpoint files, -t, or the database), fetches the next block,
  # and reports progress; only its file side effects survive into the body.
  while (
    if [[ -s ${update_last_modified} ]]; then
      last_update="$(cat "${update_last_modified}")"
      last_update_in_db="$(get_time_last_modified)"
      if [[ $last_update != "$last_update_in_db" && $no_action == 0 ]]; then
        exit_with_error "uh oh... something went wrong, last update in db does not equal last update from api response..."
      fi
    elif [[ -n $startdatetime ]]; then
      last_update="${startdatetime}"
    else
      last_update="$(get_time_last_modified)"
    fi
    last_id=$([[ -s ${update_last_id} ]] && cat "${update_last_id}" || get_max_id)
    get_updates "$last_id" "$limit" "$last_update"
    updcnt=$(get_update_count)
    [[ -n $json_dump ]] && cat "${updates}" >> "${json_dump}"
    if [[ $verbose -ge 1 ]]; then
      echo "database last modified: $last_update";
      # update counter is 0-based, humans prefer 1-based notation
      if [[ ${updcnt} -gt 0 ]]; then
        echo "$((updcnt+1)) updates";
      else
        more=$([[ -s $update_last_id ]] && echo "more " || echo "")
        echo "no ${more}updates"
      fi
      echo ;
    fi
    test "$updcnt" -gt 0
  ); do
    updcnt=$(get_update_count)
    count=0
    echo "start transaction;" > "${update_sql}"
    while [[ $count -le $updcnt ]]; do
      declare -A record
      # flatten record $count into key=value lines; jq caps raw values at
      # 4000 chars and escapes embedded newlines
      while IFS="=" read -r key value; do
        # drop unknown fields
        if [[ ! $unknown_fields =~ ${key,,} ]]; then
          # limit field size to avoid choking jq on overly long strings
          [[ ${#value} -gt 1000 ]] && value="${value:0:997}..."
          record[${key,,}]="$value"
        fi
      done < <($jq -r ".[$count]"'|to_entries|map("\(.key)=\(.value|tostring|.[0:4000]|gsub("\n";"\\n"))")|.[]' "${updates}")
      # record current position
      echo "${record['id']}" > "${update_last_id}"
      echo "${record['timelastmodified']}" > "${update_last_modified}"
      if [[ $verbose -ge 2 ]]; then
        echo "ID: ${record['id']}";
        echo "Author: ${record['author']}";
        echo "Title: ${record['title']}";
        echo "Modified: ${record['timelastmodified']}";
        echo
      fi
      # remember md5 of records worth classifying (-c given and an ISBN present)
      if [[ -n "$classifile" && -n "${record['identifierwodash']}" ]]; then
        echo "${record['md5']}" >> "$classifile"
      fi
      md5="${record[md5]}"
      # split fields between tables
      for key in "${!record[@]}"; do
        table=${current_fields[$key]}
        columns[$table]+="${key},"
        value=${record[$key]}
        if [ -n "$value" ]; then
          value=$(sanitize_field "$key" "$value")
        fi
        values[$table]+="'$value',"
        upsert[$table]+="${key} = values(${key}),"
      done
      # add md5 to secondary tables (all but the last)
      for n in $(seq 0 $((${#tables[@]}-2))); do
        table="${tables[$n]}"
        if [[ -n "${columns[$table]}" ]]; then
          columns[$table]+="md5,"
          values[$table]+="'$md5',"
          upsert[$table]+="md5 = values(md5),"
        fi
      done
      # main table (last in tables array) first; '%?' strips trailing commas
      for n in $(seq $((${#tables[@]}-1)) -1 0); do
        table="${tables[$n]}"
        if [[ -n "${columns[$table]}" ]]; then
          sql+="insert into $table (${columns[$table]%?}) values(${values[$table]%?}) on duplicate key update ${upsert[$table]%?};"
        fi
      done
      echo "${sql}" >> "${update_sql}"
      [[ -n $sql_dump ]] && echo "${sql}" >> "${sql_dump}"
      unset record
      unset key
      unset value
      unset sql
      columns=()
      values=()
      upsert=()
      ((count++))
    done
    echo "commit;" >> "${update_sql}"
    [[ $no_action == 0 ]] && dbx "$db" < "${update_sql}"
  done
  # optionally add classification data to new records
  # this will use tor and round-robin through TOR ports if these are
  # defined in classify_tor_ports in the config file
  if [[ -n "$classifile" && -f $classifile ]]; then
    now=$(date +%Y%m%d%H%M)
    csvfile="${classify_csv:+$classify_csv/}${now}.csv"
    IFS=',' read -ra torports <<< "$classify_tor_ports"
    if [[ ${#torports[*]} -gt 0 ]]; then
      torpc=${#torports[*]}
    fi
    upc=0
    # FIX: 'read -r' (no backslash mangling) and read the file directly
    # instead of piping it through 'cat'
    while read -r md5; do
      $classify ${torpc:+-@ ${torports[$upc%$torpc]}} -D "$db" ${classify_xml:+-X $classify_xml} -C "$md5" >> "${csvfile}"
      ((upc++))
    done < "$classifile"
    if [[ -f ${csvfile} ]]; then
      $import_metadata -d "$db" -f "${classify_fields:-ddc,lcc,fast}" ${classify_sql:+-s $classify_sql/$now.sql} -F "${csvfile}"
    fi
  fi
}
get_current_fields () {
  # Emit "[field]=table" pairs (one per line) for every column of every
  # configured table, suitable for eval'ing into an associative array.
  local t
  for t in "${tables[@]}"; do
    dbx "$db" "describe $t;" | awk '{print "["tolower($1)"]='"$t"'"}'
  done
}
get_field_type () {
  # Print the raw SQL column type (e.g. "varchar(200)") of the given field,
  # looked up in whatever table current_fields maps it to.
  local f=$1
  local tbl=${current_fields[$f]}
  dbx "$db" "show fields from $tbl where field=\"$f\";" | awk '{print $2}'
}
get_field_types () {
  # Emit "[field]=basetype " pairs (space separated, on one line) for every
  # known column, stripping the size suffix via the $re_type regex.
  local fname ftype
  for fname in "${!current_fields[@]}"; do
    ftype=$(get_field_type "$fname")
    [[ "$ftype" =~ $re_type ]]
    printf '[%s]=%s ' "$fname" "${BASH_REMATCH[0]}"
  done
}
get_field_sizes () {
  # Emit "[field]=N " pairs for every column whose type carries a numeric
  # size (e.g. varchar(200) -> 200); columns without one are skipped.
  local fname ftype
  for fname in "${!current_fields[@]}"; do
    ftype=$(get_field_type "$fname")
    [[ "$ftype" =~ $re_int ]]
    if [[ "${BASH_REMATCH[0]}" -gt 0 ]]; then
      printf '[%s]=%s ' "$fname" "${BASH_REMATCH[0]}"
    fi
  done
}
# sanitize_field FIELD VALUE
# Escape VALUE for embedding in a single-quoted SQL literal and coerce it to
# the column type/size of FIELD (looked up in the global field_types and
# field_sizes maps). The sanitized value is written to stdout.
#
# Fixes vs. previous revision:
#   * each '=~' match is now guarded — on no match the value becomes empty
#     instead of reusing a stale BASH_REMATCH from an earlier match
#   * printf instead of 'echo -n' so values like "-n" are not eaten by echo
sanitize_field () {
  local field=$1
  shift
  local value="$*"
  # quote values for SQL: escape backslashes first, then single quotes
  value=${value//\\/\\\\}
  value=${value//\'/\\\'}
  # field-type specific filters
  case "${field_types[$field]}" in
    int|bigint)
      # keep the first run of digits, truncated to the declared width
      if [[ "$value" =~ $re_int ]]; then
        value=${BASH_REMATCH[0]}
      else
        value=""
      fi
      value=${value:0:${field_sizes[$field]}}
      ;;
    char|varchar)
      # truncate to the declared column size
      value=${value:0:${field_sizes[$field]}}
      ;;
    timestamp)
      # keep the first well-formed "YYYY-MM-DD hh:mm:ss" substring
      if [[ "$value" =~ $re_timestamp ]]; then
        value=${BASH_REMATCH[0]}
      else
        value=""
      fi
      ;;
  esac
  # field-specific filters
  case "$field" in
    year)
      # filter out Chinese date stamps: keep the first 4-digit run only
      if [[ "$value" =~ $re_year ]]; then
        value=${BASH_REMATCH[0]}
      else
        value=""
      fi
      ;;
  esac
  printf '%s' "$value"
}
# libgen_api ID LIMIT TIME_LAST_MODIFIED
# Query the libgen API for up to LIMIT records newer than ID and
# TIME_LAST_MODIFIED; the raw JSON response goes to stdout. The URL-encoded
# timestamp is also written to $update_newer as a side effect.
libgen_api () {
  local first_id="$1" block_size="$2"
  shift 2
  local newer
  # '%%20' URL-encodes the space between date and time
  if ! newer=$(date -d "$*" +'%Y-%m-%d%%20%H:%M:%S'); then
    exit_with_error "date error: $* is not a valid date"
  fi
  echo "$newer" > "$update_newer"
  $torsocks "$curl" -s "${api}?fields=*&idnewer=${first_id}&mode=newer&limit1=${block_size}&timenewer=${newer}"
}
# get_updates ID LIMIT TIME_LAST_MODIFIED
# Fetch the next block of updates into the $updates file and store the
# number of records of the response in the $update_count file.
get_updates () {
  local since_id="$1" block="$2"
  shift 2
  libgen_api "$since_id" "$block" "$*" > "${updates}"
  $jq '.|length' "${updates}" > "${update_count}"
}
get_time_last_modified () {
  # Newest TimeLastModified value present in the local 'updated' table
  # (tail skips any column-header line the db client prints).
  local query='select MAX(TimeLastModified) FROM updated;'
  dbx "$db" "$query" | tail -n 1
}
get_max_id () {
  # Highest record id present in the local 'updated' table
  # (tail skips any column-header line the db client prints).
  local query='select MAX(id) FROM updated;'
  dbx "$db" "$query" | tail -n 1
}
get_update_count () {
  # The $update_count file holds jq's array length (1-based count);
  # convert it to the 0-based index of the last record.
  printf '%s\n' "$(( $(< "${update_count}") - 1 ))"
}
# check_fields: compare the field set returned by the API against the columns
# known locally (current_fields) and warn about mismatches. Fields that only
# exist in the API response are appended to the global 'unknown_fields'
# string so main() drops them while processing records.
check_fields () {
# probe the API with a tiny request (2 records since 2000-01-01) just to
# learn the key set of one record; jq's @sh emits the keys single-quoted on
# one line, and the unquoted $( ) deliberately word-splits them into an array
updates_fields=($(libgen_api 1 2 '2000-01-01'|$jq -r '.[0]|keys|@sh'))
db_fields="${!current_fields[*]}"
db_fields="${db_fields,,}"
# check for extra fields in api response
for index in "${!updates_fields[@]}"; do
# strip the single quotes that @sh wrapped around each key
field="${updates_fields[$index]%\'}"
field="${field#\'}"
# NB: substring match against the space-joined list of db columns
if [[ ! $db_fields =~ ${field,,} ]]; then
if [[ ! -v quiet ]]; then
echo "unknown field in api response: ${field} (consider refreshing database from dump)"
fi
unknown_fields+="${field,,} "
else
:
fi
done
# check for missing fields in api response (verbose mode only)
[[ $verbose -ge 1 ]] && {
for field in "${!current_fields[@]}"; do
if [[ ! -v quiet && ! ${updates_fields[*],,} =~ ${field,,} ]]; then
echo "missing field in api response: $field"
fi
done
}
}
cleanup () {
  # Remove the per-run scratch directory; invoked from the EXIT trap path.
  rm -rf -- "${tmpdir}"
}
help () {
  # Print a version banner followed by the usage text on stdout.
  printf '%s version %s\n' "$(basename "$(readlink -f "$0")")" "$version"
  # quoted delimiter: the usage text below is literal, nothing is expanded
  cat <<- 'EOT'
Usage: update_libgen OPTIONS
-l LIMIT get updates in blocks of LIMIT entries
-v be verbose about what is being updated; repeat for more verbosity:
-v: show basic info (number of updates, etc)
-vv: show ID, Title and TimeLastModified for each update
-n do not update database. Use together with -v or -vv to show
how many (-v) and which (-vv) titles would be updated.
-j FILE dump (append) json to FILE
-s FILE dump (append) sql to FILE
-u URL use URL to access the libgen API (overrides default)
-t DATETIME get updates since DATETIME (ignoring TimeLastModified in database)
use this option together with -s to create an sql update file to update
non-networked machines
-i ID get updates from ID
-H DBHOST database host
-P DBPORT database port
-U DBUSER database user
-D DATABASE database name
-a APIHOST use APIHOST as API server
-@ TORPORT use tor (through torsocks) to connect to libgen API server
-c run classify over new records to get classification data
-q don't warn about missing fields in database or api response
-h this help message
EOT
}
# Prepare the run lock (exlock is presumably defined in books_functions;
# main() later takes it with 'exlock now'), then hand over to main.
exlock prepare || exit 1
main "$@"