#!/usr/bin/env bash
# shellcheck disable=SC2034,SC1090,SC2155,SC2207
#
# update_libgen - fetch new/changed records from the libgen JSON API and
# write them into the local database as "insert ... on duplicate key update"
# statements, optionally dumping the JSON and/or SQL to files.
#
# Helper commands used below but not defined in this file (exlock, dbx,
# find_tool, url_available, exit_with_error, trap_error, trap_clean) are
# presumably provided by the sourced books_functions file -- confirm there.

version="0.6.1"
release="20210512"

trap "trap_error" TERM
trap "trap_clean" EXIT
export TOP_PID=$$

# force the C locale for this script
LC_ALL=C

functions="$(dirname "$0")/books_functions"
if [ -f "$functions" ]; then
    source "$functions"
else
    echo "$functions not found"
    exit 1
fi

# main OPTIONS...
#
# Parses the command line, then repeatedly pulls blocks of updates from the
# API and applies them to the database until no further updates are returned.
# See help() for the list of options.
main () {
    exlock now || exit 1

    # PREFERENCES
    config=${XDG_CONFIG_HOME:-$HOME/.config}/books.conf

    dbhost="localhost"
    dbport="3306"
    db="libgen"
    dbuser="libgen"
    limit=1000

    api="http://libgen.rs/json.php"

    # source config file if it exists
    [[ -f ${config} ]] && source "${config}"
    # (more or less) END OF PREFERENCES

    jq=$(find_tool "jq")
    curl=$(find_tool "curl")

    # scratch files; the directory is removed again by cleanup()
    tmpdir=$(mktemp -d '/tmp/update_libgen.XXXXXX')
    updates="${tmpdir}/updates"
    update_count="${tmpdir}/update_count"
    update_sql="${tmpdir}/update_sql"
    update_last_modified="${tmpdir}/update_last_modified"
    update_last_id="${tmpdir}/update_last_id"
    update_newer="${tmpdir}/update_newer"

    verbose=0
    no_action=0
    unknown_fields=""

    re_type='[a-z]+'
    re_int='[0-9]+'
    re_year='[0-9]{4}'
    re_timestamp='[0-9]{4}-[0-9]{2}-[0-9]{2} [0-2][0-9]:[0-5][0-9]:[0-5][0-9]'

    # NOTE: the quoted "( ... )" form is needed so the output of the helper
    # functions is parsed as a compound array assignment.
    declare -a tables="(description hashes updated)"
    declare -A current_fields="($(get_current_fields))"
    declare -A field_types="($(get_field_types))"
    declare -A field_sizes="($(get_field_sizes))"
    declare -A columns=()
    declare -A values=()
    declare -A upsert=()

    while getopts "a:D:j:hH:i:l:nP:U:qs:ct:u:v@:" OPTION
    do
        case $OPTION in
            j)
                json_dump="${OPTARG}"
                ;;
            s)
                sql_dump="${OPTARG}"
                ;;
            c)
                classify=$(find_tool "classify")
                import_metadata=$(find_tool "import_metadata")
                classifile="${tmpdir}/classifile"
                ;;
            v)
                ((verbose++))
                ;;
            n)
                no_action=1
                ;;
            l)
                limit="${OPTARG}"
                if [[ $limit -le 1 ]]; then
                    exit_with_error "limit too low (-l ${limit}), minimum is 2"
                fi
                ;;
            t)
                startdatetime="${OPTARG}"
                ;;
            i)
                echo "${OPTARG}" > "${update_last_id}"
                ;;
            u)
                api="${OPTARG}"
                ;;
            H)
                dbhost="${OPTARG}"
                ;;
            P)
                dbport="${OPTARG}"
                ;;
            U)
                dbuser="${OPTARG}"
                ;;
            D)
                db="${OPTARG}"
                ;;
            a)
                if url_available "${OPTARG}?fields=id&ids=0"; then
                    api="${OPTARG}"
                else
                    exit_with_error "-a ${OPTARG}: API endpoint not available"
                fi
                ;;
            @)
                torsocks=$(find_tool "torsocks")
                export TORSOCKS_TOR_PORT=${OPTARG}
                ;;
            q)
                quiet=1
                ;;
            h)
                help
                exit
                ;;
            *)
                exit_with_error "unknown option $OPTION"
                ;;
        esac
    done

    check_fields

    # The loop condition runs in a subshell: it fetches the next block of
    # updates into the scratch files and succeeds while there is something
    # to process. Variables set inside it are not visible in the loop body,
    # which is why the body re-reads the update count from file.
    while (
        if [[ -s ${update_last_modified} ]]; then
            last_update="$(cat "${update_last_modified}")"
            last_update_in_db="$(get_time_last_modified)"
            if [[ $last_update != "$last_update_in_db" && $no_action == 0 ]]; then
                exit_with_error "uh oh... something went wrong, last update in db does not equal last update from api response..."
            fi
        elif [[ -n $startdatetime ]]; then
            last_update="${startdatetime}"
        else
            last_update="$(get_time_last_modified)"
        fi

        last_id=$([[ -s ${update_last_id} ]] && cat "${update_last_id}" || get_max_id)

        get_updates "$last_id" "$limit" "$last_update"
        updcnt=$(get_update_count)

        [[ -n $json_dump ]] && cat "${updates}" >> "${json_dump}"

        if [[ $verbose -ge 1 ]]; then
            echo "database last modified: $last_update"
            # update counter is 0-based, humans prefer 1-based notation
            if [[ ${updcnt} -gt 0 ]]; then
                echo "$((updcnt+1)) updates"
            else
                more=$([[ -s $update_last_id ]] && echo "more " || echo "")
                echo "no ${more}updates"
            fi
            echo
        fi

        test "$updcnt" -gt 0
    ); do

        updcnt=$(get_update_count)
        count=0
        echo "start transaction;" > "${update_sql}"

        while [[ $count -le $updcnt ]]; do
            declare -A record

            while IFS="=" read -r key value; do
                # drop unknown fields; unknown_fields is a space-separated
                # list, so match whole names only (a substring match would
                # also drop e.g. "id" when "someid" is unknown)
                if [[ -n $key && " ${unknown_fields}" != *" ${key,,} "* ]]; then
                    # limit field size to avoid choking jq on overly long strings
                    [[ ${#value} -gt 1000 ]] && value="${value:0:997}..."
                    record[${key,,}]="$value"
                fi
            done < <($jq -r ".[$count]"'|to_entries|map("\(.key)=\(.value|tostring|.[0:4000]|gsub("\n";"\\n"))")|.[]' "${updates}")

            # record current position
            echo "${record['id']}" > "${update_last_id}"
            echo "${record['timelastmodified']}" > "${update_last_modified}"

            if [[ $verbose -ge 2 ]]; then
                echo "ID: ${record['id']}"
                echo "Author: ${record['author']}"
                echo "Title: ${record['title']}"
                echo "Modified: ${record['timelastmodified']}"
                echo
            fi

            if [[ -n "$classifile" && -n "${record['identifierwodash']}" ]]; then
                echo "${record['md5']}" >> "$classifile"
            fi

            keys=${!record[*]}

            md5="${record[md5]}"

            # split fields between tables
            for key in "${!record[@]}"; do
                table=${current_fields[$key]}
                columns[$table]+="${key},"
                value=${record[$key]}
                if [ -n "$value" ]; then
                    value=$(sanitize_field "$key" "$value")
                fi
                values[$table]+="'$value',"
                upsert[$table]+="${key} = values(${key}),"
            done

            # add md5 to secondary tables (all but the last)
            for ((n = 0; n < ${#tables[@]} - 1; n++)); do
                table="${tables[$n]}"
                if [[ -n "${columns[$table]}" ]]; then
                    columns[$table]+="md5,"
                    values[$table]+="'$md5',"
                    upsert[$table]+="md5 = values(md5),"
                fi
            done

            # main table (last in tables array) first
            for ((n = ${#tables[@]} - 1; n >= 0; n--)); do
                table="${tables[$n]}"
                if [[ -n "${columns[$table]}" ]]; then
                    # "%?" strips the trailing comma from each list
                    sql+="insert into $table (${columns[$table]%?}) values(${values[$table]%?}) on duplicate key update ${upsert[$table]%?};"
                fi
            done

            echo "${sql}" >> "${update_sql}"
            [[ -n $sql_dump ]] && echo "${sql}" >> "${sql_dump}"

            unset record
            unset keys
            unset key
            unset value
            unset sql
            columns=()
            values=()
            upsert=()

            ((count++))
        done

        echo "commit;" >> "${update_sql}"
        [[ $no_action == 0 ]] && dbx "$db" < "${update_sql}"
    done

    # optionally add classification data to new records
    # this will use tor and round-robin through TOR ports if these are
    # defined in classify_tor_ports in the config file
    if [[ -n "$classifile" && -f $classifile ]]; then
        now=$(date +%Y%m%d%H%M)
        csvfile="${classify_csv:+$classify_csv/}${now}.csv"
        IFS=',' read -ra torports <<< "$classify_tor_ports"
        if [[ ${#torports[*]} -gt 0 ]]; then
            torpc=${#torports[*]}
        fi
        upc=0
        while read -r md5; do
            # optional arguments are deliberately left unquoted so they
            # vanish when the corresponding setting is empty
            $classify ${torpc:+-@ ${torports[$upc%$torpc]}} -D "$db" ${classify_xml:+-X $classify_xml} -C "$md5" >> "${csvfile}"
            ((upc++))
        done < "$classifile"

        if [[ -f ${csvfile} ]]; then
            $import_metadata -d "$db" -f "${classify_fields:-ddc,lcc,fast}" ${classify_sql:+-s $classify_sql/$now.sql} -F "${csvfile}"
        fi
    fi
}

# get_current_fields
#
# For every table in the tables array, prints one "[column]=table" entry per
# line of "describe TABLE" output (column name lowercased), in a form suitable
# for initialising an associative array.
get_current_fields () {
    local table
    for table in "${tables[@]}"; do
        dbx "$db" "describe $table;"|awk -v table="$table" '{print "[" tolower($1) "]=" table}'
    done
}

# get_field_type FIELD
#
# Prints the second column of "show fields" for FIELD in the table it
# belongs to according to current_fields.
get_field_type () {
    local field table
    field="$1"
    table="${current_fields[$field]}"
    dbx "$db" "show fields from $table where field=\"$field\";"|awk '{print $2}'
}

# get_field_types
#
# Prints "[field]=type " for every known field, where type is the first run
# of lowercase letters in the field's type description (re_type).
get_field_types () {
    local field fieldtype
    for field in "${!current_fields[@]}"; do
        fieldtype=$(get_field_type "$field")
        [[ "$fieldtype" =~ $re_type ]]
        echo -n "[$field]=${BASH_REMATCH[0]} "
    done
}

# get_field_sizes
#
# Prints "[field]=size " for every known field whose type description
# contains a positive number (re_int); fields without one are omitted.
get_field_sizes () {
    local field fieldtype
    for field in "${!current_fields[@]}"; do
        fieldtype=$(get_field_type "$field")
        [[ "$fieldtype" =~ $re_int ]]
        if [[ "${BASH_REMATCH[0]}" -gt 0 ]]; then
            echo -n "[$field]=${BASH_REMATCH[0]} "
        fi
    done
}

# sanitize_field FIELD VALUE
#
# Prints VALUE filtered according to the type and size recorded for FIELD
# and escaped for use inside a single-quoted SQL string (no trailing newline).
sanitize_field () {
    local field value size
    field=$1
    shift
    value="$*"
    # get_field_sizes omits fields without a size; in that case do not
    # truncate (an empty length in ${value:0:} would blank the value)
    size=${field_sizes[$field]:-}

    # field-type specific filters
    case "${field_types[$field]}" in
        int|bigint)
            [[ "$value" =~ $re_int ]]
            value=${BASH_REMATCH[0]}
            [[ -n $size ]] && value=${value:0:$size}
            ;;
        char|varchar)
            [[ -n $size ]] && value=${value:0:$size}
            ;;
        timestamp)
            [[ "$value" =~ $re_timestamp ]]
            value=${BASH_REMATCH[0]}
            ;;
    esac

    # field-specific filters
    case "$field" in
        year)
            # filter out Chinese date stamps
            [[ "$value" =~ $re_year ]]
            value=${BASH_REMATCH[0]}
            ;;
    esac

    # quote values for SQL; done after truncation so a cut can never split
    # an escape sequence and leave a dangling backslash before the closing
    # quote
    value=${value//\\/\\\\}
    value=${value//\'/\\\'}

    # printf instead of echo so values like "-n" or "-e" survive
    printf '%s' "$value"
}

# libgen_api ID LIMIT TIME_LAST_MODIFIED
#
# Queries the API for up to LIMIT records newer than ID and
# TIME_LAST_MODIFIED and prints the raw response. The URL-encoded timestamp
# is also written to the update_newer scratch file.
libgen_api () {
    local id limit newer
    id="$1"
    shift
    limit="$1"
    shift
    if ! newer=$(date -d "$*" +'%Y-%m-%d%%20%H:%M:%S'); then
        exit_with_error "date error: $* is not a valid date"
    fi

    echo "$newer" > "$update_newer"

    # $torsocks is intentionally unquoted: it is empty unless -@ was given
    $torsocks "$curl" -s "${api}?"'fields=*&idnewer='"${id}"'&mode=newer&limit1='"${limit}"'&timenewer='"${newer}"
}

# get_updates ID LIMIT TIME_LAST_MODIFIED
#
# Stores the API response in the updates scratch file and the number of
# entries in it in the update_count scratch file.
get_updates () {
    local id limit last
    id="$1"
    shift
    limit="$1"
    shift
    last="$*"

    libgen_api "$id" "$limit" "$last" > "${updates}"
    $jq '.|length' "${updates}" > "${update_count}"
}

# get_time_last_modified
#
# Prints the last line of the query result for MAX(TimeLastModified) in the
# updated table.
get_time_last_modified () {
    dbx "$db" 'select MAX(TimeLastModified) FROM updated;'|tail -1
}

# get_max_id
#
# Prints the last line of the query result for MAX(id) in the updated table.
get_max_id () {
    dbx "$db" 'select MAX(id) FROM updated;'|tail -1
}

# get_update_count
#
# Prints the stored update count minus one, i.e. the 0-based index of the
# last entry in the current block of updates.
get_update_count () {
    echo $(($(<"${update_count}")-1))
}

# check_fields
#
# Requests a small sample from the API and compares the keys of its first
# record with the database columns in current_fields. API fields without a
# matching column are reported (unless -q) and appended, lowercased, to the
# global unknown_fields list. With -v, columns absent from the API response
# are reported as well (unless -q).
check_fields () {
    local -a updates_fields
    local db_fields api_fields field index

    updates_fields=($(libgen_api 1 2 '2000-01-01'|$jq -r '.[0]|keys|@sh'))
    # space-padded, lowercased name lists so that whole names can be matched
    db_fields=" ${!current_fields[*]} "
    db_fields="${db_fields,,}"
    api_fields=" "

    # check for extra fields in api response
    for index in "${!updates_fields[@]}"; do
        # strip the single quotes added by jq's @sh
        field="${updates_fields[$index]%\'}"
        field="${field#\'}"
        api_fields+="${field,,} "
        if [[ $db_fields != *" ${field,,} "* ]]; then
            if [[ ! -v quiet ]]; then
                echo "unknown field in api response: ${field} (consider refreshing database from dump)"
            fi
            unknown_fields+="${field,,} "
        fi
    done

    # check for missing fields in api response
    if [[ $verbose -ge 1 ]]; then
        for field in "${!current_fields[@]}"; do
            if [[ ! -v quiet && $api_fields != *" ${field,,} "* ]]; then
                echo "missing field in api response: $field"
            fi
        done
    fi
}

# cleanup
#
# Removes the scratch directory created in main.
# NOTE(review): not called in this file; presumably invoked from the EXIT
# trap handler -- confirm in books_functions.
cleanup () {
    rm -rf -- "${tmpdir:?}"
}

# help
#
# Prints the program name, version and usage information.
help () {
    echo "$(basename "$(readlink -f "$0")")" "version $version"
    cat <<- 'EOT'

Usage: update_libgen OPTIONS

    -l LIMIT    get updates in blocks of LIMIT entries
    -v          be verbose about what is being updated; repeat for more verbosity:
                -v: show basic info (number of updates, etc)
                -vv: show ID, Title and TimeLastModified for each update
    -n          do not update database. Use together with -v or -vv to show
                how many (-v) and which (-vv) titles would be updated.
    -j FILE     dump (append) json to FILE
    -s FILE     dump (append) sql to FILE
    -u URL      use URL to access the libgen API (overrides default)
    -t DATETIME get updates since DATETIME (ignoring TimeLastModified in database)
                use this option together with -s to create an sql update file to update
                non-networked machines
    -i ID       get updates from ID

    -H DBHOST   database host
    -P DBPORT   database port
    -U DBUSER   database user
    -D DATABASE database name

    -a APIHOST  use APIHOST as API server
    -@ TORPORT  use tor (through torsocks) to connect to libgen API server
    -c          run classify over new records to get classification data
    -q          don't warn about missing fields in database or api response
    -h          this help message

EOT
}

exlock prepare || exit 1

main "$@"