books/import_metadata.sh

326 lines
6.8 KiB
Bash
Executable file

#!/bin/bash
#shellcheck disable=SC2034,SC1090
#
# import_metadata - import metadata to libgen/libgen_fiction
#
# input: a [single line of|file containing] CSV-ordered metadata

shopt -s extglob

# trap_error and trap_clean are not defined in this file; presumably they are
# provided by the books_functions library sourced below.
trap "trap_error" TERM
trap "trap_clean" EXIT
export TOP_PID=$$

version="0.1.0"
release="20210518"

# The helper library is expected to live in the same directory as this script.
functions="$(dirname "$0")/books_functions"
if [ -f "$functions" ]; then
  source "$functions"
else
  # diagnostics go to stderr so they never mix with regular output
  echo "$functions not found" >&2
  exit 1
fi
# main - parse the command line, read CSV metadata and turn it into SQL
#
# Arguments: the script's command line: options -d/-f/-F/-s/-n/-v/-h followed
#            by an optional single line of CSV data (used when -F is absent)
# Globals:   sets config, db, dbhost, dbport, dbuser, tmpdir, update_sql,
#            keys, fields, csvfile, sqlfile, dry_run, verbose, csvdata and
#            csv; the filter_* and build_sql helpers read several of these
# Outputs:   SQL statements in ${update_sql}, appended to SQLFILE when -s is
#            given; unless -n is given the statements are piped into dbx
# Returns:   0 after a dry run, otherwise the exit status of dbx
main () {
  exlock now || exit 1

  # line-oriented sed coprocesses used by filter_ddc and filter_fast
  coproc coproc_ddc { coproc_ddc; }
  coproc coproc_fast { coproc_fast; }

  # PREFERENCES
  config=${XDG_CONFIG_HOME:-$HOME/.config}/books.conf
  dbhost="localhost"
  dbport="3306"
  db="libgen"
  dbuser="libgen"

  tmpdir=$(mktemp -d '/tmp/import_metadata.XXXXXX') \
    || exit_with_error "can not create temporary directory"
  update_sql="${tmpdir}/update_sql"

  # input field filters
  declare -A filter=(
    [md5]=filter_md5
    [ddc]=filter_ddc
    [lcc]=filter_ddc
    [nlm]=filter_ddc
    [fast]=filter_fast
    [author]=filter_fast
    [title]=filter_fast
  )

  # redirect OCLC [key] to field
  declare -A redirect=(
    [fast]="tags"
  )

  # used to get index for field / field for index
  keys="md5 ddc lcc nlm fast author title"
  declare -A headers
  index=0
  for key in $keys; do
    headers["$key"]=$index
    ((index++))
  done

  declare -A tables=(
    [libgen]="updated"
    [libgen_fiction]="fiction"
  )

  # source config file if it exists
  [[ -f ${config} ]] && source "${config}"

  declare -a csvdata
  declare -a csv

  while getopts "d:f:F:ns:vh" OPTION; do
    case $OPTION in
      d)
        if [ -n "${tables[$OPTARG]}" ]; then
          db="$OPTARG"
        else
          exit_with_error "-d $OPTARG: no such database"
        fi
        ;;
      f)
        for n in $OPTARG; do
          if [ -n "${headers[$n]}" ]; then
            fields+="${fields:+ }$n"
          else
            exit_with_error "no such field: $n"
          fi
        done
        ;;
      F)
        if [ -f "$OPTARG" ]; then
          csvfile="$OPTARG"
        else
          # report the option that was actually used (-F, not -f)
          exit_with_error "-F $OPTARG: no such file"
        fi
        ;;
      s)
        sqlfile="$OPTARG"
        if ! touch "$sqlfile"; then
          exit_with_error "-s $OPTARG: can not write to file"
        fi
        ;;
      n)
        dry_run=1
        ;;
      v)
        ((verbose++))
        ;;
      h)
        help
        exit
        ;;
      *)
        exit_with_error "unknown option: -$OPTION"
        ;;
    esac
  done
  shift $((OPTIND-1))

  [[ -z "$db" ]] && exit_with_error "no database defined, use -d database"
  [[ -z "$fields" ]] && exit_with_error "no fields defined, use -f 'field1 field2' or -f field1 -f field2"

  # Verify that every requested (and possibly redirected) field exists as a
  # column in the target table; skipped for dry runs, which need no database.
  if [ -z "$dry_run" ]; then
    declare -A current_fields='('$(get_current_fields "$db")')'
    for field in $fields; do
      [[ -n "${redirect[$field]}" ]] && field="${redirect[$field]}"
      # exact key lookup: a substring match over all column names would
      # also accept partial names
      if [[ -z "${current_fields[${field,,}]}" ]]; then
        exit_with_error "field $field not in database $db"
      fi
    done
  fi

  if [[ -n "$csvfile" ]]; then
    readarray -t csvdata < "$csvfile"
  else
    readarray -t csvdata <<< "$*"
  fi

  printf 'start transaction;\n' > "${update_sql}"

  for line in "${csvdata[@]}"; do
    # split on commas; feeding the line through printf avoids the trailing
    # newline a here-string would leave in the last field
    readarray -d ',' -t csv < <(printf '%s' "$line")

    if [[ "$verbose" -ge 2 ]]; then
      index=0
      for key in $keys; do
        echo "${key^^}: $(${filter[$key]} "$key")"
        ((index++))
      done
    fi

    sql="$(build_sql)"
    # the statement is data, never a printf format: '%' and backslashes in
    # the metadata must reach the SQL file unchanged
    printf '%s\n' "$sql" >> "$update_sql"

    if [[ "$verbose" -ge 3 ]]; then
      echo "$sql"
    fi

    [[ -n "$sqlfile" ]] && printf '%s\n' "$sql" >> "$sqlfile"

    unset key
    unset sql
    csv=()
  done

  printf 'commit;\n' >> "$update_sql"

  # an if statement (rather than '[[ ... ]] && dbx') keeps the return status
  # at 0 for a successful dry run
  if [[ -z "$dry_run" ]]; then
    dbx "$db" < "$update_sql"
  fi
}
# filter_md5 - emit the raw CSV value for the given field
#
# Arguments: $1 - field name, used as key into the headers index map
# Globals:   csv (read), headers (read)
# Outputs:   the unmodified field value on stdout, without trailing newline
filter_md5 () {
  local field="$1"
  # the value is data, never a printf format ('%' and '\' stay literal)
  printf '%s' "${csv[${headers[$field]}]}"
}
# filter_ddc - normalise a classification field through the coproc_ddc
# coprocess
#
# Arguments: $1 - field name, used as key into the headers index map
# Globals:   csv (read), headers (read), coproc_ddc (read: coprocess fds)
# Outputs:   the value returned by the coprocess on stdout, without trailing
#            newline
filter_ddc () {
  local field="$1"
  local value="${csv[${headers[$field]}]}"

  # The coprocess protocol is strictly one line out, one line back. A newline
  # inside the value (e.g. the one a here-string leaves on the last CSV
  # field) would produce an extra reply line and desynchronise later reads.
  value=${value//$'\n'/}

  # without coprocess
  # echo "${csv[${headers[$field]}]}"|sed 's/"//g;s/[[:blank:]]\+/,/g'
  # with coprocess (30% faster)
  # '%s' keeps '%' and backslashes in the data literal
  printf '%s\n' "$value" >&"${coproc_ddc[1]}"
  IFS= read -ru "${coproc_ddc[0]}" value
  printf '%s' "$value"
}
# coproc_ddc - stream filter run as a coprocess on behalf of filter_ddc
#
# Reads lines on stdin and writes them unbuffered (-u) to stdout with all
# double quotes removed and every run of blanks collapsed into one comma.
coproc_ddc () {
  sed -u \
    -e 's/"//g' \
    -e 's/[[:blank:]]\+/,/g'
}
# filter_fast - decode a base64-encoded field and escape it via the
# coproc_fast coprocess
#
# Arguments: $1 - field name, used as key into the headers index map
# Globals:   csv (read), headers (read), coproc_fast (read: coprocess fds)
# Outputs:   the decoded, escaped value on stdout as a single line without
#            trailing newline; embedded newlines are emitted as the two
#            characters '\n'
filter_fast () {
  local field="$1"
  local value
  local newline_escape='\n'

  # without coprocess
  # echo "${csv[${headers[$field]}]}"|base64 -d|sed -u 's/\(["\\'\'']\)/\\\1/g;s/\r/\\r/g;s/\n/\\n/g;s/\t/\\t/g'
  # with coprocess (30% faster)
  # base64 can not be used as a coprocess due to its uncurable buffering addiction
  value=$(printf '%s' "${csv[${headers[$field]}]}" | base64 -d)

  # The coprocess protocol is one line out, one line back. Swap embedded
  # newlines for a placeholder byte (SOH) so a multi-line value can not leave
  # unread reply lines behind that would corrupt every later field.
  value=${value//$'\n'/$'\001'}

  # '%s' keeps '%' and backslashes in the decoded text literal
  printf '%s\n' "$value" >&"${coproc_fast[1]}"
  IFS= read -ru "${coproc_fast[0]}" value

  # turn the placeholder into the escaped form of a newline
  value=${value//$'\001'/$newline_escape}

  # print verbatim: using the value as a format would strip the escaping
  # that was just added
  printf '%s' "$value"
}
# coproc_fast - stream filter run as a coprocess on behalf of filter_fast
#
# Reads lines on stdin and writes them unbuffered (-u) to stdout. Double
# quotes, backslashes and single quotes are prefixed with a backslash first;
# carriage returns, newlines and tabs are then replaced by the two-character
# sequences \r, \n and \t.
coproc_fast () {
  sed -u \
    -e 's/\(["\\'\'']\)/\\\1/g' \
    -e 's/\r/\\r/g' \
    -e 's/\n/\\n/g' \
    -e 's/\t/\\t/g'
}
# get_field - print the CSV value of a field, optionally piped through a
# per-field filter command
#
# Arguments: $1 - field name, used as key into the headers index map
# Globals:   csv (read), headers (read), filters (read)
# Outputs:   the (filtered) value on stdout
#
# NOTE(review): this looks up 'filters', which is not defined anywhere in
# this file; main declares 'filter' (singular), whose entries are functions
# taking a field name rather than commands reading stdin. Unless 'filters'
# is provided elsewhere, the else branch is always taken - confirm intent.
get_field () {
  local field="$1"
  local value="${csv[${headers[$field]}]}"

  if [ -n "${filters[$field]}" ]; then
    # NOTE(review): eval runs the configured filter string as shell code;
    # only safe while 'filters' holds trusted commands.
    printf '%s' "$value" | eval "${filters[$field]}"
  else
    # the value is data, never a printf format
    printf '%s' "$value"
  fi
}
# get_current_fields - list the columns of the table belonging to a database
#
# Arguments: $1 - database name, used as key into the tables map
# Globals:   tables (read)
# Outputs:   one '[column]=table ' word per row returned by dbx for
#            'describe <table>;', with the column name lower-cased; main
#            turns this output into an associative array
get_current_fields () {
  # locals keep the caller's db/table variables intact when this function
  # is not run in a subshell
  local db="$1"
  local table="${tables[$db]}"

  # pass the table name with -v instead of splicing it into the awk program
  dbx "$db" "describe $table;" \
    | awk -v table="$table" '{ printf "[%s]=%s ", tolower($1), table }'
}
# build_sql - assemble one SQL update statement for the current CSV record
#
# Globals:   fields, filter, redirect, tables, db (all read); the filter
#            functions in turn read csv and headers
# Outputs:   'update <table> set F1='..',F2='..' where MD5='..';' on stdout
#            without trailing newline, or nothing when none of the selected
#            fields has data
build_sql () {
  local sql=""
  local field column data

  for field in $fields; do
    data=$("${filter[$field]}" "$field")
    if [ -n "$data" ]; then
      # CSV field names may map to a differently named database column
      column="${redirect[$field]:-$field}"
      sql+="${sql:+,}${column^^}='${data}'"
    fi
  done

  if [ -n "$sql" ]; then
    # Print the statement verbatim: used as a printf format it would lose
    # the backslash escaping applied by the filters and break on '%'.
    printf '%s' "update ${tables[$db]} set $sql where MD5='$("${filter[md5]}" md5)';"
  fi
}
# cleanup - remove the temporary working directory and everything in it
#
# Globals: tmpdir (read) - the directory main creates with mktemp -d
cleanup () {
  local workdir="${tmpdir}"
  rm -r -f "${workdir}"
}
# HELP
#
# help - print the program name, version and usage text to stdout
#
# Globals: version, keys, redirect (all read). keys and redirect are assigned
#          in main, which calls help from its option parser after they are set.
#
# The here-document is unquoted, so the $(...) substitutions inside it are
# expanded when help runs: the CSV column list is derived from keys and the
# redirection table from redirect. '\$f' is escaped so that the example
# shows a literal $f.
help () {
# program name (symlinks resolved via readlink -f) followed by the version
echo "$(basename "$(readlink -f "$0")")" "version $version"
# '<<-' strips leading tab characters from the here-document lines
cat <<- EOHELP
Use: import_metadata [OPTIONS] -d database -f "field1 field2" [-F CSVDATAFILE | single line of csv data ]
Taking either a single line of CSV-formatted data or a file containing
such data, this tool can be used to update a libgen / libgen_fiction
database with fresh metadata. It can also be used to produce SQL (using
the -s sqlfile option) which can be used to update multiple database
instances.
CSV data format:
$(hkeys=${keys^^};echo ${hkeys// /,})
CSV field names are subject to redirection to database field names,
currently these redirections are active (CSV -> DB):
$(for field in "${!redirect[@]}";do echo " ${field^^} -> ${redirect[$field]^^}";done)
OPTIONS:
-d DB define which database to use (libgen/libgen_fiction)
-f 'field1 field2'
-f field1 -f field2
define which fields to update
-F CSVFILE
define CSV input file
-s SQLFILE
write SQL to SQLFILE
-n do not update database
use with -s SQLFILE to produce SQL for later use
use with -vv to see data from CSVFILE
use with -vvv to see SQL
-v verbosity
repeat to increase verbosity
-h this help message
Examples
$ import_metadata -d libgen -F csv/update-0000 -f 'ddc lcc fast'
update database 'libgen' using data from CSV file csv/update-0000,
fields DDC, LCC and FAST (which is redirected to libgen.Tags)
$ for f in csv/update-*;do
import_metadata -d libgen -s sql/metadata.sql -n -f 'ddc lcc fast' -F "\$f"
done
create SQL (-s sql/metadata.sql) to update database using fields
DDC, LCC and FAST from all files matching glob csv/update-*,
do not update database (-n option)
EOHELP
}
# Script entry point. exlock is not defined in this file (presumably it comes
# from the sourced books_functions library); when its 'prepare' step fails
# the script stops with status 1, otherwise main receives the command line.
if ! exlock prepare; then
  exit 1
fi

main "$@"