Here is a more complex XML parsing example that automatically formats and scales all text boxes and diagrams in an OpenOffice or LibreOffice .odt file. Before running the example script below, rename the .odt file to .zip, extract it, and place the script in the same folder as the extracted content.xml file. Then run it from a command prompt inside that folder with python <name_of_script>
import types
import re
import codecs
from xml.dom import minidom

# Formatting applied to the text boxes found in content.xml.
text_box_font = "Courier New"
text_box_font_size = "10pt"
text_box_width = "4.3in"

# Style names collected by get_style_name_list() and consumed by main().
style_list = set()
border_style_list = set()
image_style_list = set()

# Leading numeric part of a length such as "4.3in".
_LEADING_NUMBER_RE = re.compile(r'([\d.]+)')
# Image references that scale_images() is willing to resize.
_IMAGE_HREF_RE = re.compile(r'\.(?:png|jpg|svm)')


def _element_children(node):
    """Return the children of *node* that are elements.

    Text and comment nodes have no ``hasAttribute`` method, so callers that
    inspect attributes must skip them.
    """
    return [child for child in node.childNodes
            if child.nodeType == child.ELEMENT_NODE]


def _leading_number(length):
    """Return the leading number of *length* as a float, or None if absent."""
    match = _LEADING_NUMBER_RE.match(length)
    if match is None:
        return None
    try:
        return float(match.group(1))
    except ValueError:
        # e.g. "1.2.3in" matches the pattern but is not a number.
        return None


def _set_existing_attributes(node, values):
    """Overwrite each (name, value) pair on *node*, only where name exists."""
    for name, value in values:
        if node.hasAttribute(name):
            node.setAttribute(name, value)


def show_style_details(node, style_name):
    """Apply the text-box font settings to the style called *style_name*.

    Walks the tree below *node*. For every element whose ``style:name``
    equals *style_name*, each direct child element that already carries
    ``style:font-name``, ``style:font-size-complex`` or ``fo:font-size`` has
    that attribute overwritten with the module-level font name / size.
    Attributes that are not present are not added.
    """
    if node.nodeType != node.ELEMENT_NODE:
        return
    children = _element_children(node)
    if node.hasAttribute("style:name") and node.getAttribute("style:name") == style_name:
        for child_node in children:
            _set_existing_attributes(child_node, (
                ("style:font-name", text_box_font),
                ("style:font-size-complex", text_box_font_size),
                ("fo:font-size", text_box_font_size),
            ))
    for child_node in children:
        show_style_details(child_node, style_name)


def set_border_style_details(node, style_name):
    """Give the style called *style_name* a solid border stroke.

    Walks the tree below *node*. For every element whose ``style:name``
    equals *style_name*, each direct child element that already has a
    ``draw:stroke`` attribute gets it set to ``"solid"``.
    """
    if node.nodeType != node.ELEMENT_NODE:
        return
    children = _element_children(node)
    if node.hasAttribute("style:name") and node.getAttribute("style:name") == style_name:
        for child_node in children:
            _set_existing_attributes(child_node, (("draw:stroke", "solid"),))
    for child_node in children:
        set_border_style_details(child_node, style_name)


def centre_images(node, style_name, image_style_found):
    """Centre the paragraphs that use the style called *style_name*.

    *image_style_found* should be 0 on the initial call; it becomes 1 once an
    element named *style_name* has been met on the path from the start node,
    and every ``style:paragraph-properties`` element at or below that point
    gets ``fo:text-align="center"`` (added if missing).

    NOTE: nothing in this script calls this function.
    """
    if node.nodeType != node.ELEMENT_NODE:
        return
    if node.hasAttribute("style:name") and node.getAttribute("style:name") == style_name:
        image_style_found = 1
    if node.nodeName == "style:paragraph-properties" and image_style_found == 1:
        node.setAttribute("fo:text-align", "center")
    for child_node in _element_children(node):
        centre_images(child_node, style_name, image_style_found)


def get_style_name_list(node, font_style_found):
    """Collect text-box style names and force text-box frames to a set width.

    *font_style_found* should be 0 on the initial call. It counts, along the
    path from the start node, the ``text:p``, ``draw:frame`` and
    ``draw:text-box`` elements met so far (plus one for each style name
    recorded), and is passed down by value so siblings do not affect each
    other.

    Side effects:
      * a ``draw:frame`` reached with a count of at least 2 has its
        ``draw:style-name`` added to ``border_style_list``;
      * the parent of every ``draw:text-box`` that has an ``svg:width`` gets
        that width set to ``text_box_width``;
      * any element reached with a count of at least 3 has its
        ``text:style-name`` added to ``style_list``.

    NOTE(review): the nesting of the ``draw:style-name`` check under the
    ``draw:frame`` branch was reconstructed from a whitespace-mangled
    original - confirm against the author's intent.
    """
    if node.nodeType != node.ELEMENT_NODE:
        return
    if node.nodeName == "text:p":
        font_style_found += 1
    if node.nodeName == "draw:frame":
        font_style_found += 1
        if node.hasAttribute("draw:style-name") and font_style_found >= 2:
            border_style_list.add(node.getAttribute("draw:style-name"))
    if node.nodeName == "draw:text-box":
        font_style_found += 1
        parent = node.parentNode
        # The parent of the root element is the Document, which has no
        # attributes, so only element parents are inspected.
        if parent is not None and parent.nodeType == parent.ELEMENT_NODE:
            _set_existing_attributes(parent, (("svg:width", text_box_width),))
    if node.hasAttribute("text:style-name") and font_style_found >= 3:
        font_style_found += 1
        style_list.add(node.getAttribute("text:style-name"))
    for child_node in _element_children(node):
        get_style_name_list(child_node, font_style_found)


def _shrink_frame_to_text_box_width(frame):
    """Scale *frame* down to ``text_box_width`` if it is wider, keeping ratio.

    Frames without both ``svg:width`` and ``svg:height``, or whose lengths
    have no leading number, are left untouched.
    """
    if frame is None or frame.nodeType != frame.ELEMENT_NODE:
        return
    if not (frame.hasAttribute("svg:width") and frame.hasAttribute("svg:height")):
        return
    width = _leading_number(frame.getAttribute("svg:width"))
    height = _leading_number(frame.getAttribute("svg:height"))
    max_width = _leading_number(text_box_width)
    if width is None or height is None or max_width is None:
        return
    # NOTE(review): only the numbers are compared and the new height is
    # written with an "in" suffix, so this assumes the frame is measured in
    # the same unit as text_box_width (inches) - confirm for documents that
    # use cm/mm.
    if width > max_width:
        new_height = round(height / width * max_width, 3)
        frame.setAttribute("svg:width", text_box_width)
        frame.setAttribute("svg:height", str(new_height) + "in")


def scale_images(node, image_style_found):
    """Shrink over-wide image frames to ``text_box_width``.

    *image_style_found* should be 0 on the initial call; it counts the
    ``draw:frame`` elements on the path from the start node. Any element at
    or below a frame whose ``xlink:href`` contains ``.png``, ``.jpg`` or
    ``.svm`` has its parent element resized (width and height, aspect ratio
    preserved) when the parent is wider than ``text_box_width``.
    """
    if node.nodeType != node.ELEMENT_NODE:
        return
    if node.nodeName == "draw:frame":
        image_style_found += 1
    if image_style_found >= 1 and node.hasAttribute("xlink:href"):
        if _IMAGE_HREF_RE.search(node.getAttribute("xlink:href")):
            _shrink_frame_to_text_box_width(node.parentNode)
    for child_node in _element_children(node):
        scale_images(child_node, image_style_found)


def main():
    """Reformat content.xml in the current directory, in place."""
    xmldoc = minidom.parse('content.xml')
    node_list = xmldoc.getElementsByTagName('office:document-content')
    print("Number of nodes: " + str(len(node_list)) + "\n")

    for node in node_list:
        get_style_name_list(node, 0)
        scale_images(node, 0)

    for style_name in style_list:
        for node in node_list:
            show_style_details(node, style_name)

    for border_style_name in border_style_list:
        for node in node_list:
            set_border_style_details(node, border_style_name)

    # Open the output only once every modification has succeeded, so that a
    # failure above cannot leave content.xml truncated.
    with codecs.open("content.xml", "w", "utf-8") as output_file:
        xmldoc.writexml(output_file)


if __name__ == "__main__":
    main()