#!/usr/bin/python
"""Convert CSL XML files to the JSON tree layout consumed by citeproc-js.

Each XML element becomes a dict with three keys: ``name`` (the node name),
``attrs`` (attribute name -> value) and ``children`` (a list of nested
element dicts, or a single string for text-only ``term``/``single``/
``multiple`` elements).

The module can be run as a script; see ``main`` for the command line.
"""
import argparse
import datetime
import json
import os
import stat
from xml.dom import minidom


# Elements whose text content is kept (as a plain string) when that text is
# their only child.  Text anywhere else is dropped from the output.
_TEXT_ELEMENTS = ("term", "single", "multiple")


class JsonWalker:
    """Turn a minidom element tree into nested ``name/attrs/children`` dicts."""

    @staticmethod
    def make_doc(xml_string):
        """Parse *xml_string* (``str`` or ``bytes``) and return its root element."""
        return minidom.parseString(xml_string).documentElement

    def walk_to_json(self, elem):
        """Return the JSON-serializable dict describing *elem* and its subtree.

        Comment nodes are skipped.  A text node is kept only when it is the
        single child of a ``term``, ``single`` or ``multiple`` element; an
        empty ``term`` element gets ``[""]`` as its children.
        """
        # Non-element nodes (e.g. CDATA sections) have ``attributes`` of None.
        attrs = dict(elem.attributes.items()) if elem.attributes else {}

        nodes = elem.childNodes
        children = []
        if not nodes and elem.nodeName == "term":
            # An empty term is represented by an empty string child.
            children.append("")

        keep_text = len(nodes) == 1 and elem.nodeName in _TEXT_ELEMENTS
        for child in nodes:
            if child.nodeName == "#comment":
                continue
            if child.nodeName == "#text":
                if keep_text:
                    children.append(child.wholeText)
                continue
            children.append(self.walk_to_json(child))

        return {
            "name": elem.nodeName,
            "attrs": attrs,
            "children": children,
        }


def main():
    """Parse the command line and convert a single file or a whole directory."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description='Convert xml to json for use with citeproc-js.',
        epilog='''Examples:

  Convert all styles in ./csl that have been modified in the last 5 minutes and place them into ./csljson

    ./xmltojson.py --changed 300 ./csl ./csljson
'''
    )
    parser.add_argument(
        'source',
        type=str,
        help='Source file or directory.'
    )
    parser.add_argument(
        'dest',
        type=str,
        help='Destination file or directory.'
    )
    parser.add_argument(
        '--changed',
        nargs='?',
        metavar="N",
        type=int,
        default=0,
        help='Only convert files that have been modified within the last N seconds.'
    )
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='Show verbose progress output.'
    )
    args = parser.parse_args()

    verbose = args.verbose
    mode = os.stat(args.source).st_mode
    if stat.S_ISDIR(mode):
        # It's a directory, convert all csl files inside
        if not os.path.exists(args.dest):
            os.mkdir(args.dest)
        convert_directory(
            source_dir=args.source,
            dest_dir=args.dest,
            changed=args.changed,
            verbose=verbose,
        )
    elif stat.S_ISREG(mode):
        # It's a file, only convert this csl file
        convert_file(
            source_file=args.source,
            dest_file=args.dest,
            verbose=verbose,
        )
    else:
        raise RuntimeError("Unknown file mode.")


def _write_json(source_file, dest_file, verbose):
    """Convert the XML in *source_file* to JSON in *dest_file*, unconditionally."""
    if verbose:
        print("Converting " + source_file + " to " + dest_file)

    # Read bytes so the XML parser honours the document's own encoding
    # declaration instead of whatever the locale's default text encoding is.
    with open(source_file, 'rb') as f:
        xml_bytes = f.read()

    walker = JsonWalker()
    obj = walker.walk_to_json(walker.make_doc(xml_bytes))
    with open(dest_file, 'w') as f:
        f.write(json.dumps(obj, indent=2))


def convert_file(source_file, dest_file, verbose):
    """Convert one ``.csl`` file to JSON.

    Raises:
        RuntimeError: if *source_file* does not end in ``.csl``.
    """
    if not source_file.endswith('.csl'):
        raise RuntimeError("Unexpected file extension")

    _write_json(source_file, dest_file, verbose)


def convert_directory(source_dir, dest_dir, changed, verbose):
    """Convert every ``.csl`` and ``.xml`` file found directly in *source_dir*.

    ``.csl`` files keep their name in *dest_dir*; ``.xml`` files are written
    with a ``.json`` extension.  Other files are ignored.

    Args:
        source_dir: directory to scan (not recursive).
        dest_dir: existing directory receiving the converted files.
        changed: when non-zero, only files modified within the last
            *changed* seconds are converted.  ``0`` or ``None`` converts all.
        verbose: print one line per converted file.
    """
    changed_cutoff = None
    if changed:
        changed_cutoff = datetime.datetime.now() - datetime.timedelta(seconds=changed)

    for name in os.listdir(source_dir):
        if name.endswith('.csl'):
            dest_name = name
        elif name.endswith('.xml'):
            dest_name = name[:-3] + 'json'
        else:
            continue

        source_file = os.path.join(source_dir, name)
        if changed_cutoff is not None:
            modified = datetime.datetime.fromtimestamp(os.stat(source_file).st_mtime)
            if modified < changed_cutoff:
                # not modified recently enough; continue without converting
                continue

        # Bypass convert_file's ``.csl``-only check: ``.xml`` inputs are
        # valid in directory mode.
        _write_json(source_file, os.path.join(dest_dir, dest_name), verbose)


if __name__ == "__main__":
    main()