#!/usr/bin/env python3
#
# import_metadata - import metadata to libgen/libgen_fiction
import base64
import csv
import getopt
import os
import pymysql
import re
import sys
version="0.1.0"
release="20210521"
def exit_with_error(msg):
sys.exit(os.path.basename(sys.argv[0])+" "+msg)
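# try_file: check that file f can be opened with mode p; exit with an error otherwise.
# Note that probing with mode 'w' creates or truncates the file as a side effect.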
def try_file(f,p):
try:
fp=open(f,p)
fp.close()
return True
except IOError as x:
exit_with_error(str(x))
def main():
config = {
'dbhost': 'base.unternet.org',
'dbport': '3306',
'db': '',
'dbuser': 'libgen'
}
verbose = 0
dry_run = False
sqlfile = None
csvfile = None
use_fields=[]
sql=[]
re_csv=re.compile(r'\s+')
# read books config file (a bash source file) and interpret it
# works only for single-line static declarations (no shell code)
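# Illustrative example of lines it understands (values here are hypothetical):
#   export db="libgen"
#   dbhost=base.unternet.org   # quotes and trailing comments are stripped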
def read_conf(conf):
if 'APPDATA' in os.environ:
confdir = os.environ['APPDATA']
elif 'XDG_CONFIG_HOME' in os.environ:
confdir = os.environ['XDG_CONFIG_HOME']
else:
confdir = os.path.join(os.environ['HOME'], '.config')
conffile = os.path.join(confdir, 'books.conf')
if try_file(conffile,'r'):
line_re = re.compile(r'(?:export )?(?P<name>\w+)\s*=\s*(?P<value>.+)')
value_re = re.compile(r'(?P<value>^[^#]+)(?P<comment>#.*)?$')
for line in open(conffile):
m = line_re.match(line)
if m:
name = m.group('name')
value = ''
if m.group('value'):
value = m.group('value')
m = value_re.match(value)
if m:
value=m.group('value')
conf[name]=value.strip('\"').strip("\'")
return conf
config=read_conf(config)
def to_itself(field):
return field
def to_csv(field):
return re_csv.sub(',', field)
def to_sqlescape(field):
# PyMySQL 1.0 removed the top-level escape_string alias; use the converters module
return pymysql.converters.escape_string(base64.b64decode(field).decode().rstrip())
fields=['md5','ddc','lcc','nlm','fast','author','title']
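# Expected CSV column order; a row must supply all seven columns (empty values
# are allowed and are simply skipped when the UPDATE statement is built).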
filters = {
'md5': to_itself,
'ddc': to_csv,
'lcc': to_csv,
'nlm': to_csv,
'fast': to_sqlescape,
'author': to_sqlescape,
'title': to_sqlescape
}
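# Filter summary: md5 passes through unchanged; the classification fields
# (ddc/lcc/nlm) get whitespace-separated codes rewritten as comma-separated
# lists; fast/author/title arrive base64-encoded and are decoded and SQL-escaped.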
redirects = {
'fast': 'tags'
}
tables = {
'libgen': 'updated',
'libgen_fiction': 'fiction'
}
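# tables maps the database chosen with -d to the table that receives the UPDATEs.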
def redirect(field):
if field in redirects:
return redirects[field]
else:
return field
def usage():
msg=[]
def fmt_dict(lst):
for key in lst:
msg.append(str(key+" -> "+lst[key]).upper())
return msg
print(helpmsg.format(
progname=os.path.basename(sys.argv[0]),
version="v."+version,
csvfields=','.join(fields).upper(),
redirects='\n'.join(fmt_dict(redirects))
))
sys.exit()
try:
opts, args = getopt.getopt(sys.argv[1:], "d:f:F:H:u:U:ns:vh")
except getopt.GetoptError as err:
print(str(err))
usage()
for o, a in opts:
if o == "-v":
verbose+=1
elif o in ("-h"):
usage()
elif o in ("-d"):
config['db'] = a
elif o in ("-f"):
for f in a.split(','):
if f in fields:
use_fields.append(f)
else:
exit_with_error("-f "+f+" : no such field")
elif o in ("-F"):
if try_file(a,'r'):
csvfile = a
elif o in ("-H"):
config['dbhost'] = a
elif o in ("-U"):
config['dbuser'] = a
elif o in ("-n"):
dry_run = True
elif o in ("-s"):
if try_file(a,'w'):
sqlfile = a
else:
exit_with_error("unhandled option")
if len(sys.argv) <= 2:
exit_with_error("needs at least 3 parameters: -d database -f field1,field2 -F csvfile")
if not config['db'] or config['db'] not in tables:
exit_with_error("-d "+config['db']+": no such database")
if not use_fields:
exit_with_error("no fields defined, use -f field1 -f field2")
if not csvfile:
exit_with_error("no CSV file defined, use -F CSVFILE")
with open(csvfile) as cf:
reader = csv.DictReader(cf, fieldnames=fields)
if verbose >= 1:
sys.stdout.writelines(['\n#----DATA----------------------\n\n'])
for row in reader:
if verbose >= 1:
for field in fields:
print(field.upper()+": "+filters[field](row[field]))
print("")
updates=""
comma=""
for field in use_fields:
value=filters[field](row[field])
if value:
if updates:
comma=","
updates+=comma+redirect(field).upper()+"='"+value+"'"
if updates:
sql.append("update updated set "+updates+" where md5='"+row['md5']+"';\n")
else:
if verbose:
print("-- fields "+str(use_fields)+" not defined for md5:"+row['md5'])
if sql:
if sqlfile:
fp=open(sqlfile,'a')
fp.writelines([
'-- csvfile: '+csvfile+'\n',
'-- database: '+config['db']+'\n',
'-- fields: '+str(use_fields)+'\n',
'-- command: '+' '.join(sys.argv)+'\n',
'start transaction;\n'
])
fp.writelines(sql)
fp.writelines(['commit;\n'])
fp.close()
if verbose >= 2:
sys.stdout.writelines(['\n#----SQL-----------------------\n\n'])
sys.stdout.writelines(sql)
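# Credentials (e.g. the password) are expected to come from ~/.my.cnf; the
# explicit host/port/user/database arguments below take precedence over
# values found there.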
if not dry_run:
conn=pymysql.connect(
read_default_file='~/.my.cnf',
host=config['dbhost'],
port=int(config['dbport']),
user=config['dbuser'],
database=config['db']
)
with conn:
with conn.cursor() as cursor:
for line in sql:
cursor.execute(line)
conn.commit()
helpmsg = """
{progname} {version}
Use: {progname} [OPTIONS] -d database -f "field1,field2" -F CSVDATAFILE
Given a file of CSV-formatted data, this tool updates a libgen /
libgen_fiction database with fresh metadata.
It can also write the generated SQL to a file (with the -s SQLFILE
option), which can then be used to update multiple database instances.
CSV data format:
{csvfields}
Fields FAST, AUTHOR and TITLE should be base64-encoded.
CSV field names may be redirected to database field names;
currently these redirections are active (CSV -> DB):
{redirects}
OPTIONS:
-d DB define which database to use (libgen/libgen_fiction)
-f field1,field2
-f field1 -f field2
define which fields to update
-F CSVFILE
define CSV input file
-s SQLFILE
write SQL to SQLFILE
-n do not update database
use with -s SQLFILE to produce SQL for later use
use with -v to see data from CSVFILE
use with -vv to see SQL
-v verbosity
repeat to increase verbosity
-h this help message
Examples
$ import_metadata -d libgen -F csv/update-0000 -f 'ddc,lcc,fast'
update database 'libgen' using data from CSV file csv/update-0000,
fields DDC, LCC and FAST (which is redirected to libgen.Tags)
$ for f in csv/update-*;do
{progname} -d libgen -s "$f.sql" -n -f 'ddc,lcc,fast' -F "$f"
done
create SQL files (-s "$f.sql") for updating the database with fields
DDC, LCC and FAST from all files matching the glob csv/update-*,
without updating the database itself (-n option)
"""
if __name__ == "__main__":
main()