books/update_libgen
2021-06-01 14:39:51 +00:00

463 lines
11 KiB
Bash
Executable file

#!/usr/bin/env bash
# shellcheck disable=SC2034,SC1090,SC2155,SC2207
# update_libgen: incrementally update a local libgen database mirror from the
# libgen JSON API. Shared helpers (exlock, dbx, find_tool, url_available,
# exit_with_error, trap_error, trap_clean, ...) are expected to be defined in
# the 'books_functions' file sourced below.
version="0.6.1"
release="20210512"
# trap_error / trap_clean are presumably defined in books_functions — they are
# not visible in this file; the traps only fire after the source below succeeds.
trap "trap_error" TERM
trap "trap_clean" EXIT
export TOP_PID=$$  # exported so helpers can signal the main process (e.g. kill on fatal error)
# NOTE(review): LC_ALL is set but not exported — it affects this shell only,
# child tools (awk, sort, ...) keep their inherited locale. Confirm intentional.
LC_ALL=C
# Load the shared helper library from the script's own directory.
functions="$(dirname "$0")/books_functions"
if [ -f "$functions" ]; then
source "$functions"
else
echo "$functions not found"
exit 1
fi
main () {
  # Fetch updates from the libgen API in blocks of $limit records, convert
  # every JSON record into an SQL upsert spread over the libgen tables,
  # apply each block as one transaction, and optionally run classify /
  # import_metadata over the newly seen records.
  #
  # Fixes vs. previous revision:
  #   * '-l' validation called the non-existent 'exit_wit_error' (typo), so a
  #     too-low limit printed "command not found" and the script carried on.
  #   * classify loop: 'read' without -r and a useless 'cat' process
  #     substitution; now reads the file directly with 'read -r'.
  #   * removed dead 'keys' variable (assigned, never read, then unset).
  exlock now || exit 1
  # PREFERENCES
  config=${XDG_CONFIG_HOME:-$HOME/.config}/books.conf
  dbhost="localhost"
  dbport="3306"
  db="libgen"
  dbuser="libgen"
  limit=1000
  api="http://libgen.rs/json.php"
  # source config file if it exists
  [[ -f ${config} ]] && source "${config}"
  # (more or less) END OF PREFERENCES
  jq=$(find_tool "jq")
  curl=$(find_tool "curl")
  # scratch files live in a per-run temp dir (removed by cleanup on EXIT)
  tmpdir=$(mktemp -d '/tmp/update_libgen.XXXXXX')
  updates="${tmpdir}/updates"                            # raw JSON API response
  update_count="${tmpdir}/update_count"                  # record count of last response
  update_sql="${tmpdir}/update_sql"                      # SQL transaction being built
  update_last_modified="${tmpdir}/update_last_modified"  # checkpoint: last timestamp seen
  update_last_id="${tmpdir}/update_last_id"              # checkpoint: last record id seen
  update_newer="${tmpdir}/update_newer"
  verbose=0
  no_action=0
  unknown_fields=""
  # regexes shared with sanitize_field / check_fields / get_field_*
  re_type='[a-z]+'
  re_int='[0-9]+'
  re_year='[0-9]{4}'
  re_timestamp='[0-9]{4}-[0-9]{2}-[0-9]{2} [0-2][0-9]:[0-5][0-9]:[0-5][0-9]'
  # last element is the main table; the others are secondary, keyed on md5
  declare -a tables="(description hashes updated)"
  declare -A current_fields="($(get_current_fields))"  # field -> table
  declare -A field_types="($(get_field_types))"        # field -> SQL type
  declare -A field_sizes="($(get_field_sizes))"        # field -> declared size
  declare -A columns=()
  declare -A values=()
  declare -A upsert=()
  while getopts "a:D:j:hH:i:l:nP:U:qs:ct:u:v@:" OPTION
  do
    case $OPTION in
      j)
        json_dump="${OPTARG}"
        ;;
      s)
        sql_dump="${OPTARG}"
        ;;
      c)
        classify=$(find_tool "classify")
        import_metadata=$(find_tool "import_metadata")
        classifile="${tmpdir}/classifile"
        ;;
      v)
        ((verbose++))
        ;;
      n)
        no_action=1
        ;;
      l)
        limit="${OPTARG}"
        if [[ $limit -le 1 ]]; then
          # FIX: was 'exit_wit_error' (typo), which silently failed
          exit_with_error "limit too low (-l ${limit}), minimum is 2"
        fi
        ;;
      t)
        startdatetime="${OPTARG}"
        ;;
      i)
        echo "${OPTARG}" > "${update_last_id}"
        ;;
      u)
        api="${OPTARG}"
        ;;
      H)
        dbhost="${OPTARG}"
        ;;
      P)
        dbport="${OPTARG}"
        ;;
      U)
        dbuser="${OPTARG}"
        ;;
      D)
        db="${OPTARG}"
        ;;
      a)
        if url_available "${OPTARG}?fields=id&ids=0"; then
          api="${OPTARG}"
        else
          exit_with_error "-a ${OPTARG}: API endpoint not available"
        fi
        ;;
      @)
        torsocks=$(find_tool "torsocks")
        export TORSOCKS_TOR_PORT=${OPTARG}
        ;;
      q)
        quiet=1
        ;;
      h)
        help
        exit
        ;;
      *)
        exit_with_error "unknown option $OPTION"
        ;;
    esac
  done
  check_fields
  # NB: the loop condition runs in an explicit subshell. It decides where to
  # resume (checkpoint files, -t, or the database), fetches the next block,
  # and reports progress; only its file side effects survive into the body.
  while (
    if [[ -s ${update_last_modified} ]]; then
      last_update="$(cat "${update_last_modified}")"
      last_update_in_db="$(get_time_last_modified)"
      if [[ $last_update != "$last_update_in_db" && $no_action == 0 ]]; then
        exit_with_error "uh oh... something went wrong, last update in db does not equal last update from api response..."
      fi
    elif [[ -n $startdatetime ]]; then
      last_update="${startdatetime}"
    else
      last_update="$(get_time_last_modified)"
    fi
    last_id=$([[ -s ${update_last_id} ]] && cat "${update_last_id}" || get_max_id)
    get_updates "$last_id" "$limit" "$last_update"
    updcnt=$(get_update_count)
    [[ -n $json_dump ]] && cat "${updates}" >> "${json_dump}"
    if [[ $verbose -ge 1 ]]; then
      echo "database last modified: $last_update";
      # update counter is 0-based, humans prefer 1-based notation
      if [[ ${updcnt} -gt 0 ]]; then
        echo "$((updcnt+1)) updates";
      else
        more=$([[ -s $update_last_id ]] && echo "more " || echo "")
        echo "no ${more}updates"
      fi
      echo ;
    fi
    test "$updcnt" -gt 0
  ); do
    updcnt=$(get_update_count)
    count=0
    echo "start transaction;" > "${update_sql}"
    while [[ $count -le $updcnt ]]; do
      declare -A record
      # flatten record $count into key=value lines; jq caps raw values at
      # 4000 chars and escapes embedded newlines
      while IFS="=" read -r key value; do
        # drop unknown fields
        if [[ ! $unknown_fields =~ ${key,,} ]]; then
          # limit field size to avoid choking jq on overly long strings
          [[ ${#value} -gt 1000 ]] && value="${value:0:997}..."
          record[${key,,}]="$value"
        fi
      done < <($jq -r ".[$count]"'|to_entries|map("\(.key)=\(.value|tostring|.[0:4000]|gsub("\n";"\\n"))")|.[]' "${updates}")
      # record current position
      echo "${record['id']}" > "${update_last_id}"
      echo "${record['timelastmodified']}" > "${update_last_modified}"
      if [[ $verbose -ge 2 ]]; then
        echo "ID: ${record['id']}";
        echo "Author: ${record['author']}";
        echo "Title: ${record['title']}";
        echo "Modified: ${record['timelastmodified']}";
        echo
      fi
      # remember md5 of records worth classifying (-c given and an ISBN present)
      if [[ -n "$classifile" && -n "${record['identifierwodash']}" ]]; then
        echo "${record['md5']}" >> "$classifile"
      fi
      md5="${record[md5]}"
      # split fields between tables
      for key in "${!record[@]}"; do
        table=${current_fields[$key]}
        columns[$table]+="${key},"
        value=${record[$key]}
        if [ -n "$value" ]; then
          value=$(sanitize_field "$key" "$value")
        fi
        values[$table]+="'$value',"
        upsert[$table]+="${key} = values(${key}),"
      done
      # add md5 to secondary tables (all but the last)
      for n in $(seq 0 $((${#tables[@]}-2))); do
        table="${tables[$n]}"
        if [[ -n "${columns[$table]}" ]]; then
          columns[$table]+="md5,"
          values[$table]+="'$md5',"
          upsert[$table]+="md5 = values(md5),"
        fi
      done
      # main table (last in tables array) first; '%?' strips trailing commas
      for n in $(seq $((${#tables[@]}-1)) -1 0); do
        table="${tables[$n]}"
        if [[ -n "${columns[$table]}" ]]; then
          sql+="insert into $table (${columns[$table]%?}) values(${values[$table]%?}) on duplicate key update ${upsert[$table]%?};"
        fi
      done
      echo "${sql}" >> "${update_sql}"
      [[ -n $sql_dump ]] && echo "${sql}" >> "${sql_dump}"
      unset record
      unset key
      unset value
      unset sql
      columns=()
      values=()
      upsert=()
      ((count++))
    done
    echo "commit;" >> "${update_sql}"
    [[ $no_action == 0 ]] && dbx "$db" < "${update_sql}"
  done
  # optionally add classification data to new records
  # this will use tor and round-robin through TOR ports if these are
  # defined in classify_tor_ports in the config file
  if [[ -n "$classifile" && -f $classifile ]]; then
    now=$(date +%Y%m%d%H%M)
    csvfile="${classify_csv:+$classify_csv/}${now}.csv"
    IFS=',' read -ra torports <<< "$classify_tor_ports"
    if [[ ${#torports[*]} -gt 0 ]]; then
      torpc=${#torports[*]}
    fi
    upc=0
    # FIX: 'read -r' (no backslash mangling) and read the file directly
    # instead of piping it through 'cat'
    while read -r md5; do
      $classify ${torpc:+-@ ${torports[$upc%$torpc]}} -D "$db" ${classify_xml:+-X $classify_xml} -C "$md5" >> "${csvfile}"
      ((upc++))
    done < "$classifile"
    if [[ -f ${csvfile} ]]; then
      $import_metadata -d "$db" -f "${classify_fields:-ddc,lcc,fast}" ${classify_sql:+-s $classify_sql/$now.sql} -F "${csvfile}"
    fi
  fi
}
get_current_fields () {
  # Emit "[field]=table" pairs (one per line) for every column of every
  # configured table, suitable for eval'ing into an associative array.
  local t
  for t in "${tables[@]}"; do
    dbx "$db" "describe $t;" | awk '{print "["tolower($1)"]='"$t"'"}'
  done
}
get_field_type () {
  # Print the raw SQL column type (e.g. "varchar(200)") of the given field,
  # looked up in whatever table current_fields maps it to.
  local f=$1
  local tbl=${current_fields[$f]}
  dbx "$db" "show fields from $tbl where field=\"$f\";" | awk '{print $2}'
}
get_field_types () {
  # Emit "[field]=basetype " pairs (space separated, on one line) for every
  # known column, stripping the size suffix via the $re_type regex.
  local fname ftype
  for fname in "${!current_fields[@]}"; do
    ftype=$(get_field_type "$fname")
    [[ "$ftype" =~ $re_type ]]
    printf '[%s]=%s ' "$fname" "${BASH_REMATCH[0]}"
  done
}
get_field_sizes () {
  # Emit "[field]=N " pairs for every column whose type carries a numeric
  # size (e.g. varchar(200) -> 200); columns without one are skipped.
  local fname ftype
  for fname in "${!current_fields[@]}"; do
    ftype=$(get_field_type "$fname")
    [[ "$ftype" =~ $re_int ]]
    if [[ "${BASH_REMATCH[0]}" -gt 0 ]]; then
      printf '[%s]=%s ' "$fname" "${BASH_REMATCH[0]}"
    fi
  done
}
# sanitize_field FIELD VALUE
# Escape VALUE for embedding in a single-quoted SQL literal and coerce it to
# the column type/size of FIELD (looked up in the global field_types and
# field_sizes maps). The sanitized value is written to stdout.
#
# Fixes vs. previous revision:
#   * each '=~' match is now guarded — on no match the value becomes empty
#     instead of reusing a stale BASH_REMATCH from an earlier match
#   * printf instead of 'echo -n' so values like "-n" are not eaten by echo
sanitize_field () {
  local field=$1
  shift
  local value="$*"
  # quote values for SQL: escape backslashes first, then single quotes
  value=${value//\\/\\\\}
  value=${value//\'/\\\'}
  # field-type specific filters
  case "${field_types[$field]}" in
    int|bigint)
      # keep the first run of digits, truncated to the declared width
      if [[ "$value" =~ $re_int ]]; then
        value=${BASH_REMATCH[0]}
      else
        value=""
      fi
      value=${value:0:${field_sizes[$field]}}
      ;;
    char|varchar)
      # truncate to the declared column size
      value=${value:0:${field_sizes[$field]}}
      ;;
    timestamp)
      # keep the first well-formed "YYYY-MM-DD hh:mm:ss" substring
      if [[ "$value" =~ $re_timestamp ]]; then
        value=${BASH_REMATCH[0]}
      else
        value=""
      fi
      ;;
  esac
  # field-specific filters
  case "$field" in
    year)
      # filter out Chinese date stamps: keep the first 4-digit run only
      if [[ "$value" =~ $re_year ]]; then
        value=${BASH_REMATCH[0]}
      else
        value=""
      fi
      ;;
  esac
  printf '%s' "$value"
}
# libgen_api ID LIMIT TIME_LAST_MODIFIED
# Query the libgen API for up to LIMIT records newer than ID and
# TIME_LAST_MODIFIED; the raw JSON response goes to stdout. The URL-encoded
# timestamp is also written to $update_newer as a side effect.
libgen_api () {
  local first_id="$1" block_size="$2"
  shift 2
  local newer
  # '%%20' URL-encodes the space between date and time
  if ! newer=$(date -d "$*" +'%Y-%m-%d%%20%H:%M:%S'); then
    exit_with_error "date error: $* is not a valid date"
  fi
  echo "$newer" > "$update_newer"
  $torsocks "$curl" -s "${api}?fields=*&idnewer=${first_id}&mode=newer&limit1=${block_size}&timenewer=${newer}"
}
# get_updates ID LIMIT TIME_LAST_MODIFIED
# Fetch the next block of updates into the $updates file and store the
# number of records of the response in the $update_count file.
get_updates () {
  local since_id="$1" block="$2"
  shift 2
  libgen_api "$since_id" "$block" "$*" > "${updates}"
  $jq '.|length' "${updates}" > "${update_count}"
}
get_time_last_modified () {
  # Newest TimeLastModified value present in the local 'updated' table
  # (tail skips any column-header line the db client prints).
  local query='select MAX(TimeLastModified) FROM updated;'
  dbx "$db" "$query" | tail -n 1
}
get_max_id () {
  # Highest record id present in the local 'updated' table
  # (tail skips any column-header line the db client prints).
  local query='select MAX(id) FROM updated;'
  dbx "$db" "$query" | tail -n 1
}
get_update_count () {
  # The $update_count file holds jq's array length (1-based count);
  # convert it to the 0-based index of the last record.
  printf '%s\n' "$(( $(< "${update_count}") - 1 ))"
}
# check_fields: compare the field set returned by the API against the columns
# known locally (current_fields) and warn about mismatches. Fields that only
# exist in the API response are appended to the global 'unknown_fields'
# string so main() drops them while processing records.
check_fields () {
# probe the API with a tiny request (2 records since 2000-01-01) just to
# learn the key set of one record; jq's @sh emits the keys single-quoted on
# one line, and the unquoted $( ) deliberately word-splits them into an array
updates_fields=($(libgen_api 1 2 '2000-01-01'|$jq -r '.[0]|keys|@sh'))
db_fields="${!current_fields[*]}"
db_fields="${db_fields,,}"
# check for extra fields in api response
for index in "${!updates_fields[@]}"; do
# strip the single quotes that @sh wrapped around each key
field="${updates_fields[$index]%\'}"
field="${field#\'}"
# NB: substring match against the space-joined list of db columns
if [[ ! $db_fields =~ ${field,,} ]]; then
if [[ ! -v quiet ]]; then
echo "unknown field in api response: ${field} (consider refreshing database from dump)"
fi
unknown_fields+="${field,,} "
else
:
fi
done
# check for missing fields in api response (verbose mode only)
[[ $verbose -ge 1 ]] && {
for field in "${!current_fields[@]}"; do
if [[ ! -v quiet && ! ${updates_fields[*],,} =~ ${field,,} ]]; then
echo "missing field in api response: $field"
fi
done
}
}
cleanup () {
  # Remove the per-run scratch directory; invoked from the EXIT trap path.
  rm -rf -- "${tmpdir}"
}
help () {
  # Print a version banner followed by the usage text on stdout.
  printf '%s version %s\n' "$(basename "$(readlink -f "$0")")" "$version"
  # quoted delimiter: the usage text below is literal, nothing is expanded
  cat <<- 'EOT'
Usage: update_libgen OPTIONS
-l LIMIT get updates in blocks of LIMIT entries
-v be verbose about what is being updated; repeat for more verbosity:
-v: show basic info (number of updates, etc)
-vv: show ID, Title and TimeLastModified for each update
-n do not update database. Use together with -v or -vv to show
how many (-v) and which (-vv) titles would be updated.
-j FILE dump (append) json to FILE
-s FILE dump (append) sql to FILE
-u URL use URL to access the libgen API (overrides default)
-t DATETIME get updates since DATETIME (ignoring TimeLastModified in database)
use this option together with -s to create an sql update file to update
non-networked machines
-i ID get updates from ID
-H DBHOST database host
-P DBPORT database port
-U DBUSER database user
-D DATABASE database name
-a APIHOST use APIHOST as API server
-@ TORPORT use tor (through torsocks) to connect to libgen API server
-c run classify over new records to get classification data
-q don't warn about missing fields in database or api response
-h this help message
EOT
}
# Prepare the run lock (exlock is presumably defined in books_functions;
# main() later takes it with 'exlock now'), then hand over to main.
exlock prepare || exit 1
main "$@"