epub.js/tools/appcache.py

#!/usr/bin/python

"""WARNING:  Script is in beta and needs to be tested thoroughly.

The script generates a rudimentary appcache file from the manifest of a content.opf file,
taken from either an uncompressed epub directory or a compressed epub file, and places it in the current directory.

Usage: acm_gen.py --input='/path/to/content.opf' (the content.opf inside an uncompressed epub directory)
OR     acm_gen.py --input='/path/to/book.epub' (a compressed epub file)
"""

# Module metadata: author name and contact address.
__author__ = 'Luis Aguilar'
__email__ = 'luis@berkeley.edu'

import os
import xml.etree.ElementTree as ET
import zipfile
import datetime
import epub
from optparse import OptionParser

def get_parameters():
    """
        Parse and validate the command-line options.

        Reads -i/--input (required) and -o/--output (defaults to '.') from
        the command line through OptionParser.

        Returns a dict with three keys:
            'input'  -- the path supplied by the user
            'output' -- the value of --output
            'file'   -- 'pub' when the input is a .epub file, 'opf' when it
                        is a .opf file

        If the input is missing, has the wrong extension, or is not an
        existing file, parser.error() prints the message to stderr and
        exits (raises SystemExit), so nothing is returned in that case.
    """
    parser = OptionParser()
    parser.add_option('-i', '--input', dest='input')
    parser.add_option('-o', '--output', dest='output', default='.')
    (options, args) = parser.parse_args()

    if not options.input:
        parser.error('input path is empty, use --input="path.to.opf.or.epub.filename"')

    # Compare the real file extension rather than the last three characters,
    # so that names that merely end in "pub" or "opf" are rejected.
    fileKinds = {'.epub': 'pub', '.opf': 'opf'}
    extension = os.path.splitext(options.input)[1].lower()
    if extension not in fileKinds:
        parser.error('Please include opf or epub filename in path')

    if not os.path.isfile(options.input):
        parser.error('input epub or content.opf file could not be found, please verify path and filename')

    return {'input': options.input, 'output': options.output, 'file': fileKinds[extension]}

def process_extracted_opf(userParams):
    """
        Parse a content.opf file and collect its manifest item hrefs.

        userParams -- dict whose 'input' entry is the path to the .opf file.

        Returns a list with the 'href' attribute of every <item> inside
        <manifest>, in document order. Raises KeyError if an item has no
        href attribute.

        ET.parse() is given a filename, so ElementTree opens and closes the
        file itself; no explicit close is needed here.
    """
    # Prefix -> URI map used to resolve the OPF namespace in the search path.
    namespaces = {'xmlns': 'http://www.idpf.org/2007/opf'}

    # Parenthesised single-argument print runs on both Python 2 and Python 3.
    print("Parsing content.opf file at " + userParams['input'])

    root = ET.parse(userParams['input']).getroot()
    manifestItems = root.findall('xmlns:manifest/xmlns:item', namespaces=namespaces)
    return [item.attrib['href'] for item in manifestItems]

def process_epub(userParams):
    """
        Collect the manifest item hrefs of a compressed epub file using the
        epub library.

        userParams -- dict whose 'input' entry is the path to the .epub file.

        Returns a list with the href of every item in book.opf.manifest.
    """
    book = epub.open_epub(userParams['input'])
    try:
        # Parenthesised single-argument print runs on both Python 2 and Python 3.
        print("Parsing epub file at " + userParams['input'])

        return [item.href for item in book.opf.manifest.values()]
    finally:
        # Release the underlying archive handle even if reading the manifest fails.
        # NOTE(review): relies on the object returned by epub.open_epub()
        # providing close() -- confirm against the installed epub library.
        book.close()

def write_appcache(itemHrefs, output='.'):
    """
        Create the 'epub.appcache' manifest from the extracted hrefs.

        itemHrefs -- iterable of href strings, written one per line.
        output    -- directory in which to create the file; defaults to the
                     current directory. The directory must already exist,
                     otherwise open() raises IOError/OSError.

        File layout: the 'CACHE MANIFEST' header, a '# <timestamp>' comment
        line, then every href. An existing epub.appcache in the target
        directory is overwritten.
    """
    fileName = 'epub.appcache'
    cacheHeader = 'CACHE MANIFEST\n'

    # Build all lines before opening, so a bad href cannot leave a
    # half-written file behind.
    lines = [cacheHeader, '# ' + str(datetime.datetime.now()) + '\n']
    lines.extend(href + '\n' for href in itemHrefs)

    # The with-block closes the file even if a write fails.
    with open(os.path.join(output, fileName), "w") as f_appcache:
        f_appcache.writelines(lines)

def main():
    """
        Script entry point: read the options, extract the manifest hrefs
        from the epub or content.opf input, and write the appcache file
        into the requested output directory.
    """
    # get user defined parameters
    userParams = get_parameters()

    # process the epub or the content file extracted from an epub
    if userParams['file'] == 'pub':
        itemHrefs = process_epub(userParams)
    elif userParams['file'] == 'opf':
        itemHrefs = process_extracted_opf(userParams)
    else:
        # Fail with a clear message instead of a NameError on itemHrefs below.
        raise ValueError('unsupported input type: %r' % (userParams['file'],))

    # take extracted items and generate the appcache where --output points
    write_appcache(itemHrefs, userParams['output'])

# Run the generator only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()