Source code file content
subversion / lib / rudi / xml_transform.rb
Size: 16841 bytes, 1 line
# Input processor
require 'rexml/document'
# Output processor
require 'rudi/XML_Builder'
module RuDI
# = XML_Transform: An XML Transformation Engine
#
# Generally, a Transform (xfm) will be used to process one variant of XML
# into another (most typically, xHTML). To do that in the easiest and most
# readable way, a syntax similar to XSL is needed--one in which you create
# "templates" that are applied to XML elements as they arrive.
#
# But while XSL provides a terrific templating facility, it has many drawbacks,
# most notably with respect to the ability to invoke functions and do other
# processing. Ruby is much better at that kind of thing, as Martin Fowler
# explains in "Moving away from XSLT", at
# http://www.martinfowler.com/bliki/MovingAwayFromXslt.html
#
# But while Ruby is great for doing calculations, most XML builders available
# in Rubyland have you writing code that looks more like Ruby than XML. That's
# great for flexibility, not so great for readability.
#
# It's hard to modify code when you can't see what it's doing, so readability
# is pretty important at the application level. The XML_Builder class provides
# that kind of readability, making it possible to achieve a blend that combines
# Ruby's processing flexibility with the XSL's all-XML templates.
#
# To make it work, the trick is to divide each transform into two parts:
#
# a) The first part calls Ruby functions and does any other processing it needs
# to do, using _REXML to gather information needed from the node that is
# currently being processed.
#
# b) The second part uses XML_Builder to generate output, using variables
# defined in the first part (frequently by way of string interpolation).
#
# == Conceptual Overview
#
# A Transform (or transformation) fits the model of a prototypical system
# process:
# |
# | control
# |
# +-------------v-------------+
# input | | output
# -------->| Process |-------->
# | |
# +---------------------------+
#
# In this case, the input is the XML you want to transform; the output is the
# XML you're generating. Control, meanwhile, is exercised by defining element
# transforms and doing other configuration steps when initializing the Transform,
# or by changing Transform operation "on the fly", in response to inputs.
#
# == Usage
# You define a "transform" object by extending this class and adding the
# transformations you need. You specify inputs and outputs for that transform
# in one of three ways, in any combination that's appropriate:
#
# 1) I/O using files:
# transform.xfm_file(input_file)
# transform.to_file(output_file) or transform.write_to_stdout
#
# 2) I/O using strings:
# doc_string = "<html><body><p>text</p></body></html>"
# doc = REXML::Document.new doc_string
# transform.xfm_node(doc) --or doc.root (CONFIRM)
# output_string = transform.to_s
#
# 3) I/O using REXML nodes:
# require "rexml/document"
# file = File.new("someFile.xml" )
# doc = REXML::Document.new file
# transform.xfm_node(doc.root)
# new_xml_root_node = transform.to_xml (aka to_s)
#
# where you define your transform like this:
# require 'xml_transform'
# transform = XML_Transform.new do
#
# # Define Ruby utility methods you need. Those methods can:
# # * Call existing methods defined in the XML_Builder class:
# # - "pre!" or "cdata!" to add the specified tags
# # - "instruct!" or "doctype!" to generate xml/xhtml headers
# # - "text!" to insert text
# # * Call methods you have defined
# # * Specify an undefined method like "head" to generate an
# # element with that name.
#
# def create_declarations
# instruct! # Add an XML declaration (defined in xml_builder)
# doctype! # Add an xHTML doctype declaration (ditto)
# end
#
# def generate_header
# head {
# title document_title
# }
# end
#
# # Define new transforms or redefine existing ones
# # This transform applies to an incoming <html> node.
# xfm :html do |node|
# create_declarations
# html {
# generate_header
# body {
# xfm_node(node)
# }
# }
# end
#
# xfm :foo do |node|
# # Transformation method for <foo> element
#
# # Ruby Code (extract info from node here)
# # Example: Include node attributes in the output
# attrs = node.attributes # Attribute list
#
# # Template Code (define element template for output here)
# ui attrs do # Pass attributes to the template
# ...template code (generate desired xml here)...
# li text
# end
# end # :foo
#
# xfm :bar do |node|
# ...
# end
# ...
# end # transform
#
# <b>Notes</b>
# * The term "transform" is hierarchical in nature (and consequently
# overloaded).
# * In the aggregate, it refers to an XML_Transform object,
# which is a collection of (mini) transforms for individual elements.
# (XSL uses the term "template" for the mini transforms. But that's
# only part of an XML_Transform.)
# * The complete transformation ontology looks something like this:
# * A Transform (or transformation) is an object that contains multiple
# (micro/mini) transforms.
# * A (micro/mini) transform is a function that has two parts:
# a) A Ruby part that does processing
# b) A template part that produces output
# * Both parts are optional, in reality. Output could be done with
# Ruby code, or a template could be used without any pre-processing.
# * The "template" part is defined using XML_Builder. In reality, it's
# still Ruby code, but XML_Builder uses Ruby's wicked cool
# metaprogramming features and syntax-freedom to write code that
# looks as much as possible like plain XML, so it becomes akin to
# an XSL "template".
# * REXML has some element methods that are useful when writing transforms:
# * has_attributes?
# * has_elements?
# * has_text?
# * Converting CDATA to HTML <pre>
# With incoming HTML, <pre> tags are visible, so user can control output
# with normal tag processing.
#
# But with incoming XML, CDATA isn't normally visible. To take control,
# override the special transform: _cdata_xfm
#
# * Parallel operations
# These pairs of operations are inverses of each other. Since XML_Transform
# includes XML_Builder, they can be used in any combination:
#
# XML_Transform (process inputs) XML_Builder (generate outputs)
# ------------------------------ ------------------------------
# xfm_xml to_xml / to_s
# xfm_node to_node
# xfm_file to_file
#
# Predefined Transforms
# * By convention, pre-defined transforms start with underscore and end with
# _xfm. So for example the predefined cdata transform is :_cdata_xfm.
# * Another predefined transform is :_cdata_to_pre_xfm. For XML to HTML
# transformations, copy :_cdata_to_pre_xfm to :_cdata_xfm, using copy_xfm().
# * Finally, the :_default_xfm is a copy of the :_identity_xfm. That behavior
# can be changed at any time by defining a different transform and copying
# it to :_default_xfm. (If you save the original first, you can then
# dynamically restore it later.)
#
# Author: Eric Armstrong
#
#--
# Implementation note:
# So far, the XML_Builder functionality has been accreted into the
# Transform by way of a module-mixin. An alternative implementation
# would be to delegate to an instance of XML_Builder.
#
# Implementation Options:
# Delegation: Transform delegates to builder instance
# [+] Flexible. Powerful.
# [?] XML_Builder can be used as an object--but it may be that
# every realistic use, like Transform, needs to decorate it
# with additional methods, the way Transform does
# [+] Can have separate initializers and method_missing
# [-] Need delegators for to_s, initialize, and method_missing
# >>Module mixin: Transform acquires the builder methods via include
# (The chosen implementation.)
# [-] Single wad of code. Can't have two methods with the same name.
# [+] Can define any additional methods we need and have them do the
# right thing, without extra work. Fast.
# Subclass: Transform extends builder
# [+] Transform acquires methods like to_s and values like @out
#--
# Author: Eric Armstrong
#
class XML_Transform
include XML_Builder
# Hash table containing defined transforms
@hash
# Determines how incoming whitespace is handled
# Set to :preserve or :ignore (the default)
@whitespace
# XML_Builder already defines initialize (necessarily). Can't declare it again.
# But it will call a configure() method, if one is defined.
#
# Creates the transform table, sets whitespace handling to :ignore, installs
# the predefined transforms (:_identity_xfm, :_default_xfm, :_cdata_xfm and
# :_cdata_to_pre_xfm), and then evaluates the optional block so that callers
# can add or override transforms.
#
# block:: optional configuration code, run with instance_eval
def configure(&block)
  @hash = Hash.new
  @whitespace = :ignore

  # Set up identity transform, make it the default
  xfm(:_identity_xfm) do |node|
    tag = node.name.to_sym
    if node.has_non_attribute_children?
      # Two-part tag: the block transforms the element's children
      send(tag, atts(node)) { xfm_node(node) }
    else
      # Singleton tag. A plain "else" is required here: a bare "elsif" would
      # turn the call below into a condition instead of a branch body.
      send(tag, atts(node))
    end
  end
  copy_xfm(:_identity_xfm, :_default_xfm)

  # Set up the XML CDATA transform, compensating for a REXML bug that returns
  # the NL after <![CDATA[ and the NL before ]]> as parts of the string.
  # (They shouldn't be.)
  # Override it to generate HTML <pre> text, when needed.
  xfm(:_cdata_xfm) do |node|
    # sub and chomp build new strings, so the string handed back by
    # node.to_s is never modified in place.
    text = node.to_s.sub(/\A\n/, "").chomp
    cdata! text
  end
  xfm(:_cdata_to_pre_xfm) do |node|
    # Only the leading NL is dropped here; a trailing NL is kept.
    text = node.to_s.sub(/\A\n/, "")
    pre! text
  end

  # Process new transforms and transform overrides
  # Can't yield, because the xfm method isn't defined in the calling context.
  # Use instance_eval instead, as did the code XML_Builder was based on.
  # (instance_eval calls methods defined in this class, but uses
  # variables defined in the calling context--the perfect blend.)
  instance_eval(&block) if block_given?
end
def inspect
@hash.each_pair {|key, proc| "#{key}: #{proc.to_s}" }.collect.join("\n")
end
# Internal method for testing.
# Returns the transform table itself (not a copy).
def _get_hash #:nodoc:
  @hash
end
# Return the attribute list for a REXML element node.
#
# element:: the element whose attributes are wanted
#
# Returns nil when the element has no attributes; otherwise a hash that maps
# each attribute's expanded name (as a symbol) to its value.
def atts(element)
  return nil unless element.has_attributes?

  {}.tap do |collected|
    element.attributes.each_attribute do |attribute|
      collected[attribute.expanded_name.to_sym] = attribute.value
    end
  end
end
# Keep the whitespace embedded in the incoming XML, and switch off
# automated formatting by clearing the indent setting.
# Returns nil.
def preserve_ws
  @indent = nil
  @whitespace = :preserve
  nil
end
# Drop the whitespace embedded in the incoming XML.
# The current format (indent) setting is not touched.
# Returns :ignore.
def ignore_ws
  @whitespace = :ignore
end
# Internal method for testing.
# Returns true when a transform is stored under elem_sym, false otherwise.
def xfm_defined?(elem_sym)
  @hash.key?(elem_sym)
end
# Define a transformation for an element.
# * With a block: store it under elem_sym, replacing any existing transform.
# * Without a block: remove the transform stored under elem_sym.
#
# Note:
# The ability to replace a transform means that you can dynamically
# modify behavior to create a state-smart, context-aware transformation.
#
# elem_sym:: key the transform is stored under
# block::    the transform body; omit it to delete the transform
#
# Returns the stored block, or the removed transform (nil if there was none).
def xfm(elem_sym, &block)
  return @hash.delete(elem_sym) if block.nil?

  @hash[elem_sym] = block
end
# Convenience method: delete an existing transformation.
# Delegates to xfm with no block and returns whatever xfm returns.
def del_xfm(elem_sym)
  xfm(elem_sym)
end
# Return a stored transform so it can be processed or examined.
# Returns nil when nothing is stored under elem_sym.
def get_xfm(elem_sym)
  @hash[elem_sym]
end
# Copy an existing transform under a new name.
# Needed internally to single source the default & identity transforms,
# but potentially useful for applications, as well.
# TODO: An alias operation would be nice. Not sure how to implement it.
# alias_xfm(existing_elem_sym, new_elem_sym)
#
# from_elem:: key of the transform to copy
# to_elem::   key to store the copy under
#
# Returns the copied transform. If nothing is stored under from_elem, nil is
# stored under to_elem.
def copy_xfm(from_elem, to_elem)
  @hash.store(to_elem, @hash[from_elem])
end
# Apply transform to contents of file.
#
# filename:: path of the XML file to parse
#
# Returns the result of xfm_root for the document's root element, or nil
# when the parsed document has no root element.
def xfm_file(filename)
  # The block form of File.open closes the file as soon as the document has
  # been built, even if parsing raises.
  doc = File.open(filename) { |file| REXML::Document.new(file) }
  root = doc.root
  xfm_root(root) unless root.nil?
end
# Parse the text. Apply transform to root node.
#
# aString:: XML source text
#
# Returns the result of xfm_root, or nil when the parsed document has no
# root element.
def xfm_xml(aString)
  root = REXML::Document.new(aString).root
  return nil if root.nil?

  xfm_root(root)
end
# Experimental helper used for testing: flattens a node into a string.
# * A text node yields its to_s value.
# * Any other node yields its name, then the flattened form of each child,
#   then "/" and its name again.
#
# The children are gathered with collect so that the recursive results are
# the strings that get joined.
def special_xfm(node) #:nodoc:
  return node.to_s if node.kind_of? REXML::Text

  node.ignore_whitespace_nodes # No difference, with or without
  name = node.name.to_s
  name + node.collect { |child| special_xfm(child) }.join + "/" + name
end
# Experimental helper used for testing: parses xmlString and returns the
# result of special_xfm for the document's root element.
def special_test(xmlString) #:nodoc:
  root = REXML::Document.new(xmlString).root
  root.ignore_whitespace_nodes # No difference, with or without
  special_xfm(root)
end
# Applies the transformation to the designated root node itself.
# (Differs from xfm_node, which only handles a node's children.)
# Generally used for a document root node, but could also be used
# to transform a subtree.
#
# Whether the children are processed is up to the transform that handles
# the element: it recurses by calling xfm_node. No block is passed to
# _handle_element, because _handle_element never yields.
#
# element:: the node to transform
#
# Returns the result of _handle_element.
def xfm_root(element)
  # NOTE(review): other comments in this file suggest this call has no
  # observable effect on an already-parsed document -- confirm before
  # relying on it.
  element.ignore_whitespace_nodes if @whitespace == :ignore
  _handle_element(element)
end
# Transform the children of a node.
# Use this method inside a transform to recurse on the current node,
# or use it to process an entirely different node (which makes it
# possible to drastically rearrange the original structure).
#
# node:: the node whose children are handed to _handle, one at a time
#
# Returns whatever node.each returns.
def xfm_node(node)
  node.each do |child|
    _handle(child)
  end
end
# Handle an XML node. (Not its children.)
# Dispatches on the node's REXML class:
# * Element     -> _handle_element
# * CData       -> _handle_cdata
# * Text        -> _handle_text_node
# * Comment     -> comment!
# * Instruction -> _handle_pi
# * Attribute   -> ignored (returns nil)
#
# Raises RuntimeError for any other kind of node. The message is built
# without assuming that the node responds to name.
def _handle(node) #:nodoc:
  case node
  when REXML::Element
    _handle_element(node)
  when REXML::CData
    # CDATA is a subclass of Text. Have to test for it first.
    _handle_cdata(node)
  when REXML::Text
    _handle_text_node(node)
  when REXML::Comment
    comment! node.to_s
  when REXML::Instruction
    _handle_pi(node)
  when REXML::Attribute
    # Do nothing. The processing model we've defined in XML_Builder
    # requires them to be specified outside the block, not in it,
    # so it's up to the transform to extract them and pass them on,
    # the same way the identity transform does.
    nil
  else
    # Not every node class has a name; asking blindly would replace this
    # diagnostic with a NoMethodError.
    name = node.respond_to?(:name) ? node.name : "(none)"
    fail "Unexpected node: #{node.class}\n" +
         "Name: #{name}\nContent: #{node}"
  end
end
# Handle a processing instruction <?foo ...?> by passing its to_s form
# to text!. (to_s returns the <? & ?>, in this case.)
def _handle_pi(piNode) #:nodoc:
  text!(piNode.to_s)
end
# Run the transform registered for the element's name, falling back to
# :_default_xfm when none is stored under that name.
# Returns the transform's result.
def _handle_element(anElement) #:nodoc:
  transform = @hash[anElement.name.to_sym]
  transform = @hash[:_default_xfm] if transform.nil?
  transform.call(anElement)
end
# Run the :_cdata_xfm transform on a CDATA node, falling back to
# :_default_xfm when no CDATA transform is stored.
# Returns the transform's result.
def _handle_cdata(cdataNode) #:nodoc:
  transform = @hash[:_cdata_xfm]
  transform = @hash[:_default_xfm] if transform.nil?
  transform.call(cdataNode)
end
# Emit a text node via text!.
# When whitespace handling is :ignore, the text is stripped first and
# nothing is emitted (nil is returned) if it was only whitespace.
#
# textNode:: the node whose to_s value is emitted
def _handle_text_node(textNode) #:nodoc:
  # PATCH FOR REXML BUG
  # REXML::Element#ignore_whitespace_nodes only works when adding nodes
  # to a REXML doc, not when parsing it. So nodes containing nothing but
  # NL's and blanks arrive as "text"
  text = textNode.to_s
  if @whitespace == :ignore
    # strip (not strip!) so that the string returned by the node is never
    # modified in place
    text = text.strip
    return if text.empty?
  end
  text! text
end
# Transform node name into a valid proc name.
# NOTE: No longer needed, since we're using a hash table for procs
# def _name anElement :nodoc:
# anElement.name.tr("-","_")
# end
end # XML_Transform class
end # RuDI module
#####################################################################
# Defining this extension outside of the module, so it doesn't get included
# as part of the module when generating documentation.
#####################################################################
module REXML #:nodoc:
  class Element
    # Tells whether this element has children that are something other
    # than attributes: text, elements, comments, CDATA sections or
    # processing instructions. (If it does, we need to generate a standard
    # two-part tag. Otherwise, we should generate a singleton.)
    #
    # Returns true or false.
    def has_non_attribute_children?
      return true if has_text? || has_elements?

      any? do |child|
        case child
        when REXML::Comment, REXML::CData, REXML::Instruction then true
        else false
        end
      end
    end
  end # REXML::Element
end





