module Rum
# A single element of a document tree: a label (tag name), a Hash of attributes,
# a text body and an ordered list of child nodes.
#
# Children are exposed through Enumerable. Ordering and equality (Comparable,
# #eql? and #hash) look only at the label and the attributes; the parent, the
# body and the children never take part in a comparison.
class Node
  include Enumerable
  include Comparable

  attr_reader :parent, :label, :body, :attributes

  # parent     - the owning Node, or nil for a root node.
  # label      - the tag name.
  # attributes - Hash of attribute name => value.
  def initialize(parent, label, attributes = {})
    @parent = parent
    @label = label
    @attributes = attributes
    @body = +'' # unary plus keeps the body appendable even under frozen string literals
    @children = []
  end

  # Compares by label first and, when the labels are equal, by attributes.
  #
  # Returns nil (rather than raising) when +other+ is not a Node, which is what
  # Comparable expects: `node == "text"` is then simply false, while `<`, `>` and
  # friends still raise ArgumentError. Hash defines no ordering, so two nodes with
  # the same label yield 0 for equal attribute hashes and nil otherwise.
  def <=>(other)
    return nil unless other.is_a?(Node)
    return @label <=> other.label unless @label == other.label

    @attributes <=> other.attributes
  end

  # Hash-style equality so that Array#uniq, Hash keys and set operations treat two
  # nodes with the same label and attributes as duplicates.
  def eql?(other)
    other.is_a?(Node) && @label.eql?(other.label) && @attributes.eql?(other.attributes)
  end

  # Hash code consistent with #eql?.
  def hash
    [@label, @attributes].hash
  end

  # Returns the value of the attribute named +attr_name+ (nil when absent).
  def [](attr_name)
    attributes[attr_name]
  end

  # Yields every direct child in insertion order; returns an Enumerator when no
  # block is given.
  def each
    return enum_for(:each) unless block_given?

    @children.each { |node| yield node }
  end

  # Returns the first direct child that satisfies every criterion supplied:
  #
  # label      - required label, or nil to accept any label.
  # attributes - Hash of attribute values that must all match, or nil.
  # block      - optional extra predicate called with the candidate node.
  #
  # Returns nil when no child matches.
  def find(label = nil, attributes = nil, &block)
    super() do |node|
      (label.nil? || node.label == label) &&
        (attributes.nil? || attributes.all? { |key, value| node[key] == value }) &&
        (block.nil? || block.call(node))
    end
  end

  # Returns an Array of the direct children matching every argument: a String
  # argument must equal the child's label, a Hash argument must match the child's
  # attributes pair by pair. Arguments of any other type are ignored.
  #
  # When a block is given it is called once per matching node; its result does NOT
  # filter the returned Array.
  def select(*args, &block)
    subset = args.inject(@children.dup) do |nodes, arg|
      case arg
      when String then nodes.select { |node| node.label == arg }
      when Hash then nodes.select { |node| arg.all? { |key, value| node[key] == value } }
      else nodes
      end
    end
    subset.each { |node| block.call(node) } if block
    subset
  end

  # Appends +object+ to the children. Raises RuntimeError unless it is a Node.
  def push(object)
    raise "Cannot add non-node #{object} as child of node #{self}" unless object.is_a?(Node)

    @children.push(object)
  end

  # Number of direct children.
  def child_count
    @children.count
  end

  # Serialises this node and its descendants, one tag per line, prefixing each
  # line with +indent+ spaces. Every emitted line, including a self-closing leaf
  # tag, ends with a newline so that siblings never run together.
  # Values are written verbatim; no XML escaping is applied.
  def to_xml(indent = 0)
    tab = ' ' * indent
    opening = "#{tab}<#{@label}#{attributes_to_xml_string}"
    return "#{opening} />\n" if @body.empty? && @children.empty?

    lines = ["#{opening}>\n"]
    lines << "#{tab}\t#{@body}\n" unless @body.empty? # no blank line for an empty body
    @children.each { |child| lines << child.to_xml(indent + 1) }
    lines << "#{tab}</#{@label}>\n"
    lines.join
  end

  # Creates a child Node with the given label and attributes, appends it to this
  # node and returns the new child.
  def spawn(label, attributes = {})
    node = Node.new(self, label, attributes)
    push(node)
    node
  end

  protected

  # Renders the attributes as ` name="value"` pairs, each preceded by a single
  # space; returns an empty String when there are no attributes.
  def attributes_to_xml_string
    @attributes.map { |name, value| " #{name}=\"#{value}\"" }.join
  end
end
# The root of a parsed document: a Node with an empty label and no parent whose
# children are the top-level elements found in the source text.
#
# The parser is a small regex tokenizer, not a full XML parser: a tag ends at the
# first '>' (so attribute values may not contain '>'), entity references are not
# decoded, and tags that are still open at the end of the input are not reported.
class Document < Node
  # Characters accepted in tag and attribute names.
  NAME = /[\w:.\-]+/
  # One token: a comment, a processing instruction, any other tag, or a run of text.
  TOKEN = /<!--.*?-->|<\?.*?\?>|<[^>]+>|[^<]+/m
  # Markup that carries no element: comments, processing instructions, DOCTYPE.
  IGNORED_TAG = /\A<(?:!--|\?|!DOCTYPE)/i
  # <name attrs /> and <name/>
  SELF_CLOSING_TAG = %r{\A<(#{NAME})(\s[^>]*?)?\s*/>\z}m
  # </name>
  CLOSING_TAG = %r{\A</(#{NAME})\s*>\z}
  # <name attrs>
  OPENING_TAG = /\A<(#{NAME})(\s[^>]*)?>\z/m
  # name="value" or name='value'; the value may be empty and may contain spaces.
  ATTRIBUTE = /(#{NAME})\s*=\s*(?:"([^"]*)"|'([^']*)')/

  # The text the document was built from.
  attr_reader :source

  # Reads +filename+ and parses its contents as XML. The new document is yielded
  # to the block, if one is given, and returned.
  def self.open_xml(filename, &block)
    new(:xml, File.read(filename), &block)
  end

  # Parses the String +source+ as XML. The new document is yielded to the block,
  # if one is given, and returned.
  def self.xml(source, &block)
    new(:xml, source, &block)
  end

  # type   - input format; only :xml triggers parsing, any other value leaves the
  #          document empty.
  # source - the text to parse.
  # block  - optional; called with the finished document.
  #
  # Raises RuntimeError when a closing tag does not match the element that is
  # currently open.
  def initialize(type = :xml, source = '', &block)
    super(nil, '')
    @source = source
    tags_from_xml(@source) if type == :xml
    block.call(self) if block
  end

  # Serialises the top-level elements only; the document itself has no tag, so no
  # enclosing <></> pair is written. +indent+ is forwarded to the children so the
  # signature stays compatible with Node#to_xml.
  def to_xml(indent = 0)
    @children.map { |node| node.to_xml(indent) }.join
  end

  protected

  # Extracts name/value pairs from the attribute part of a tag. Returns an empty
  # Hash for nil or attribute-less input.
  def xml_attributes_from_string(string)
    return {} if string.nil?

    string.scan(ATTRIBUTE).each_with_object({}) do |(name, double_quoted, single_quoted), found|
      found[name] = double_quoted || single_quoted
    end
  end

  # Tokenizes +string+ and builds the node tree beneath this document, tracking the
  # currently open element in +node+. Text between tags is stripped and appended
  # to the body of the open element. Returns self.
  def tags_from_xml(string)
    node = self
    string.scan(TOKEN) do |token|
      case token
      when IGNORED_TAG
        # comments, declarations and DOCTYPE contribute nothing to the tree
      when SELF_CLOSING_TAG # must be tested before OPENING_TAG, which also matches "<a />"
        node.spawn(Regexp.last_match(1), xml_attributes_from_string(Regexp.last_match(2)))
      when CLOSING_TAG
        name = Regexp.last_match(1)
        # identity check: the document itself can never be closed by a tag
        unless name == node.label && !node.equal?(self)
          raise "Attempted to close tag named '#{name}'; current tag is named '#{node.label}'"
        end

        node = node.parent
      when OPENING_TAG
        node = node.spawn(Regexp.last_match(1), xml_attributes_from_string(Regexp.last_match(2)))
      else # text content (or markup this tokenizer does not recognise)
        node.body << token.strip
      end
    end
    self
  end
end
end
Rum is a pure-Ruby library I wrote today that can be used to load, manipulate, and save XML files. What makes it different from REXML and similar libraries? This:
#! /usr/bin/ruby
# Example: querying a small class roster with Rum.
require 'rum'
include Rum

XML = <<END_OF_STRING
<class id="101">
<student id="99887" name="John" grade="11" />
<student id="44753" name="Jenny" grade="10" />
</class>
<class id="102">
<student id="99887" name="John" grade="11" />
<student id="68532" name="Jenny" grade="10" />
</class>
END_OF_STRING

Document.xml XML do |document|
  # Want to find all unique students in all classes?
  # Node#select only searches direct children, so collect the students class by
  # class; deduplicate on the attribute hash so a student enrolled in two classes
  # is listed once.
  document.flat_map { |klass| klass.select('student') }.uniq(&:attributes)

  # Want to find all classes with a grade 11 student?
  # The block given to Node#select is only called for each match and does not
  # filter, so filter the Array it returns instead.
  document.select('class').select { |klass|
    klass.select('student', 'grade' => '11').any?
  }

  # How many juniors (grade 11) are in the school?
  # Students are children of the classes, not of the document, so search inside
  # each class.
  document.flat_map { |klass|
    klass.select('student', 'grade' => '11')
  }.uniq(&:attributes).count

  # Want a pretty list of all classes and the students in them?
  document.select('class') do |klass|
    print "In class #{klass['id']}:\n"
    klass.select('student') do |student|
      print "\t#{student['name']} (student #{student['id']}, grade #{student['grade']})\n"
    end
  end
end
I still need to do some work on it: for example, getting the label/attribute search used by select and find to work with more of Enumerable, and replacing the simple node-label matching with path-style matching (think CSS selectors, I guess?), among other things.
In the end I want it to be able to do things more like:
# Proposed API, not implemented yet: a path-style selector ('class/student')
# combined with an attribute filter, searching through nested nodes.
# Are any seniors (grade 12) in any classes?
document.any? 'class/student', 'grade' => '12'
Do whatever you want with the code.