Complex Python XML example

Here is a more complex XML parsing example, which automatically formats and scales all text boxes and diagrams in an OpenOffice or LibreOffice .odt file. Before running the example script below, rename the .odt file to .zip, extract it, and place the script in the same folder as the extracted content.xml file. Then run it from a command prompt inside that folder with: python <name_of_script>


import types
import re
import codecs

from xml.dom import minidom

# Font family written into the "style:font-name" attribute of text-box styles.
text_box_font = "Courier New"
# Font size written into the "fo:font-size" and "style:font-size-complex"
# attributes of text-box styles.
text_box_font_size = "10pt"
# Width assigned to the "svg:width" of frames that contain a text box; also the
# threshold above which scale_images() shrinks an image frame.
text_box_width = "4.3in"

#-----------------------------------------------------------------------------------------------------------------------------
def show_style_details(node, style_name):
	"""Apply the configured text-box font to the style called style_name.

	Despite its name, this function modifies the tree. It walks node and all
	of its descendants; for every element whose "style:name" attribute equals
	style_name, each direct child element that already carries one of the
	attributes "style:font-name", "style:font-size-complex" or "fo:font-size"
	has that attribute overwritten with the module-level text_box_font /
	text_box_font_size settings. Attributes that are absent are not created.

	node       -- minidom node at which to start the walk.
	style_name -- value of "style:name" identifying the style to update.
	"""
	replacements = (
		("style:font-name", text_box_font),
		("style:font-size-complex", text_box_font_size),
		("fo:font-size", text_box_font_size),
	)

	if node.nodeType == node.ELEMENT_NODE:

		if node.hasAttribute("style:name") and node.attributes["style:name"].value == style_name:

			for child_node in node.childNodes:

				# Text, comment and other non-element children have no
				# attribute API; calling hasAttribute() on them raises
				# AttributeError (e.g. whitespace in pretty-printed XML).
				if child_node.nodeType != child_node.ELEMENT_NODE:
					continue

				for attr_name, new_value in replacements:
					if child_node.hasAttribute(attr_name):
						child_node.setAttribute(attr_name, new_value)

	# Non-element nodes simply have no children, so this is safe for any node.
	for child_node in node.childNodes:
		show_style_details(child_node, style_name)

#-----------------------------------------------------------------------------------------------------------------------------
def set_border_style_details(node, style_name):
	"""Give the style called style_name a solid border stroke.

	Walks node and all of its descendants; for every element whose
	"style:name" attribute equals style_name, each direct child element that
	already has a "draw:stroke" attribute gets it set to "solid". The
	attribute is not created where it is absent.

	node       -- minidom node at which to start the walk.
	style_name -- value of "style:name" identifying the style to update.
	"""
	if node.nodeType == node.ELEMENT_NODE:

		if node.hasAttribute("style:name") and node.attributes["style:name"].value == style_name:

			for child_node in node.childNodes:

				# Only elements have attributes; text/comment children would
				# raise AttributeError on hasAttribute().
				if child_node.nodeType != child_node.ELEMENT_NODE:
					continue

				if child_node.hasAttribute("draw:stroke"):
					child_node.setAttribute("draw:stroke", "solid")

	# Non-element nodes have no children, so recursing from any node is safe.
	for child_node in node.childNodes:
		set_border_style_details(child_node, style_name)

#-----------------------------------------------------------------------------------------------------------------------------
def centre_images(node, style_name, image_style_found):
	"""Centre-align the paragraph properties belonging to one named style.

	Recursively visits node and its element descendants. Once an element
	whose "style:name" equals style_name has been reached, the flag is raised
	for that element and everything beneath it; any
	"style:paragraph-properties" element seen while the flag is 1 gets
	fo:text-align="center" (the attribute is created if it is missing).

	node              -- minidom node to start from; non-elements are ignored.
	style_name        -- "style:name" value of the style to centre.
	image_style_found -- 1 if an enclosing element already matched, else 0.
	"""
	# Only elements carry attributes or relevant children.
	if node.nodeType != node.ELEMENT_NODE:
		return

	is_target_style = (
		node.hasAttribute("style:name")
		and node.getAttribute("style:name") == style_name
	)
	if is_target_style:
		image_style_found = 1

	if image_style_found == 1 and node.nodeName == "style:paragraph-properties":
		# setAttribute() both overwrites an existing value and creates a new one.
		node.setAttribute("fo:text-align", "center")

	# The flag is passed by value, so it only reaches this element's subtree.
	for child in node.childNodes:
		centre_images(child, style_name, image_style_found)
		
#-----------------------------------------------------------------------------------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------
def get_style_name_list(node, font_style_found):
	"""Collect the style names used by text boxes and resize their frames.

	Recursively walks node and its element descendants while counting, in
	font_style_found, how many "text:p", "draw:frame" and "draw:text-box"
	elements enclose the current position (the element itself included).

	Side effects:
	- a "draw:frame" reached with a count of at least 2 has its
	  "draw:style-name" added to the module-level set border_style_list;
	- the parent of every "draw:text-box" that already has an "svg:width"
	  attribute gets it set to the module-level text_box_width;
	- any element with a "text:style-name" reached with a count of at least 3
	  has that name added to the module-level set style_list.

	node             -- minidom node to start from; non-elements are ignored.
	font_style_found -- nesting count inherited from the caller (0 at the top).
	"""
	if node.nodeType != node.ELEMENT_NODE:
		return

	if node.nodeName == "text:p":
		font_style_found = font_style_found + 1

	if node.nodeName == "draw:frame":
		font_style_found = font_style_found + 1

		if node.hasAttribute("draw:style-name") and font_style_found >= 2:
			border_style_list.add(node.attributes["draw:style-name"].value)

	if node.nodeName == "draw:text-box":
		font_style_found = font_style_found + 1

		# The parent is normally the enclosing draw:frame, but it could be the
		# Document node, which has no attribute API.
		parent = node.parentNode
		if (parent is not None and parent.nodeType == parent.ELEMENT_NODE
				and parent.hasAttribute("svg:width")):
			parent.setAttribute("svg:width", text_box_width)

	if node.hasAttribute("text:style-name") and font_style_found >= 3:
		font_style_found = font_style_found + 1
		style_list.add(node.attributes["text:style-name"].value)

	# The count is passed by value, so siblings do not affect one another.
	for child_node in node.childNodes:
		get_style_name_list(child_node, font_style_found)

#-----------------------------------------------------------------------------------------------------------
def _split_length(length):
	"""Split a length such as "4.3in" into (4.3, "in"); None if it has no leading number."""
	match = re.match(r'[\d.]+', length)
	if match is None:
		return None
	try:
		return float(match.group(0)), length[match.end():]
	except ValueError:
		# e.g. "1.2.3" or a lone "."
		return None


def scale_images(node, image_style_found):
	"""Shrink image frames that are wider than text_box_width.

	Recursively walks node and its element descendants. An element is treated
	as an image when it lies in (or is) a "draw:frame" and its "xlink:href"
	contains ".png", ".jpg" or ".svm". If the image's parent element has both
	"svg:width" and "svg:height" and the numeric part of the width exceeds the
	numeric part of the module-level text_box_width, the parent's width is set
	to text_box_width and its height is scaled by the same factor (rounded to
	three decimals) so the aspect ratio is kept. Frames whose sizes cannot be
	parsed are left untouched.

	NOTE(review): as in the original script, the numeric parts are compared
	without unit conversion, so frame sizes are assumed to use the same unit
	as text_box_width.

	node              -- minidom node to start from; non-elements are ignored.
	image_style_found -- number of enclosing "draw:frame" elements (0 at the top).
	"""
	if node.nodeType != node.ELEMENT_NODE:
		return

	if node.nodeName == "draw:frame":
		image_style_found = image_style_found + 1

	if image_style_found >= 1 and node.hasAttribute("xlink:href"):
		href = node.getAttribute("xlink:href")
		frame = node.parentNode

		if (re.search(r'\.(?:png|jpg|svm)', href)
				and frame is not None
				and frame.nodeType == frame.ELEMENT_NODE
				and frame.hasAttribute("svg:width")
				and frame.hasAttribute("svg:height")):

			width = _split_length(frame.getAttribute("svg:width"))
			height = _split_length(frame.getAttribute("svg:height"))
			limit = _split_length(text_box_width)

			if None not in (width, height, limit) and width[0] > limit[0]:
				frame.setAttribute("svg:width", text_box_width)
				new_height = round(height[0] / width[0] * limit[0], 3)
				# The new height is expressed in text_box_width's unit; fall
				# back to inches, the original behaviour, if it has none.
				frame.setAttribute("svg:height", str(new_height) + (limit[1] or "in"))

	for child_node in node.childNodes:
		scale_images(child_node, image_style_found)

#-----------------------------------------------------------------------------------------------------------------------------
# Style names collected by get_style_name_list(): text styles used inside text
# boxes, and frame (border) styles. image_style_list is declared for
# scale_images() but nothing in this script fills it.
style_list = set()
border_style_list = set()
image_style_list = set()

xmldoc = minidom.parse('content.xml')

node_list = xmldoc.getElementsByTagName('office:document-content')

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print("Number of nodes: " + str(len(node_list)) + "\n")

# Pass 1: collect style names, resize text-box frames and scale large images.
for node in node_list:
	get_style_name_list(node, 0)
	scale_images(node, 0)

# Pass 2: rewrite the font of every collected text-box style.
for style_name in style_list:
	for node in node_list:
		show_style_details(node, style_name)

# Pass 3: give every collected frame style a solid border.
for border_style_name in border_style_list:
	for node in node_list:
		set_border_style_details(node, border_style_name)

# Open the output only after every pass has succeeded: opening with "w"
# truncates content.xml immediately, so doing it earlier would destroy the
# document if any pass raised. The with-block closes the file even if writing
# fails.
with codecs.open("content.xml", "w", "utf-8") as output_file:
	# encoding= only adds the encoding to the XML declaration; the codecs
	# writer performs the actual UTF-8 encoding.
	xmldoc.writexml(output_file, encoding="utf-8")

Leave a comment

This site uses Akismet to reduce spam. Learn how your comment data is processed.