You can also use gsub! (for multiple matches) or sub! (for one match) and then use $1 and $2. Otherwise, yes, this is as simple as it gets.
In Cateia we use this kind of format for localization files:
Key1
{
Value1
}
Key2
{
Part1 of Value2
Part2 of Value2
Part3 of Value2
}
This is a Ruby script that parses such files and gives some statistics:
# Parses a localization file made of entries shaped like
#
#   Key
#   {
#   Value line 1
#   Value line 2
#   }
#
# Carriage returns are stripped before matching, so CRLF files work too.
# Any error while reading or parsing is reported on stdout and whatever
# was collected so far (possibly nothing) is returned.
#
# @param filename [String] path of the file to parse
# @return [Hash{String => String}] key line => value text (value lines
#   joined by "\n"); a key that appears twice keeps its last value
def parseFile(filename)
  data = {}
  begin
    # The block form closes the file even when reading raises.
    text = File.open(filename, 'r') do |f|
      # The first line of a UTF file can start with a byte order mark:
      # count the leading non-ASCII bytes so they can be skipped.
      # `gets` returns nil on an empty file, hence the to_s.
      skip = 0
      f.gets.to_s.each_byte do |byte|
        break if byte < 128

        skip += 1
      end
      f.seek(skip)
      f.read.gsub("\r", '')
    end
    # scan walks every non-overlapping match in a single pass, so the
    # text does not have to be shrunk with sub! after each match.
    text.scan(/(.+)\n\{\n((?:.|\n)+?)\n\}/) { |key, value| data[key] = value }
  rescue StandardError => e
    puts "exception: #{e.message}"
    puts e.backtrace.join("\n")
  end
  data
end
# Parse every regular file in the current directory as a localization
# file and print the total number of entries, the total number of words,
# and the word count of the longest entry.

# File.file? is already false for '.' and '..', so no extra filtering
# is needed.
entries = Dir.entries('.').select { |name| File.file?(name) }
size = 0
count = 0
longest = 0
entries.each do |filename|
  data = parseFile(filename)
  size += data.size
  # TODO: check for multi-byte characters properly -- \w matches only
  # ASCII word characters here.
  data.each_value do |text|
    # scan counts the words without modifying the parsed value.
    length = text.scan(/\w+/).size
    count += length
    longest = length if length > longest
  end
end
puts "Statistics:"
puts "-- Entries: #{size}"
puts "-- Words: #{count}"
puts "-- Longest entry: #{longest}"
# Wait for Enter before exiting.
gets
With match alone, I have to replace the matched string with an empty string if I want to do multiple matches like this; String#scan avoids that. :/ Python has a different way of doing it: re.findall returns every match in one call.
import sys
import re
def parseFile(filename):
    """Parse a localization file into a dict.

    The file consists of entries shaped like::

        Key
        {
        Value line 1
        Value line 2
        }

    Carriage returns are stripped before matching. Any error while reading
    or parsing is printed together with its traceback, and whatever was
    collected so far (possibly nothing) is returned.

    :param filename: path of the file to parse
    :return: dict mapping each key line to its value text (value lines
        joined by a newline); a repeated key keeps its last value
    """
    data = {}
    try:
        # 'with' closes the file even when reading raises
        with open(filename, 'r') as f:
            # the first line of a UTF file can be "screwed up":
            # count the leading non-ASCII characters so they can be dropped
            first = f.readline()
            skip = 0
            for ch in first:
                if ord(ch) < 128:
                    break
                skip += 1
            # slicing the first line replaces seek(skip); it drops exactly
            # the counted prefix without relying on text-mode seek offsets
            string = (first[skip:] + f.read()).replace('\r', '')
        # regular expressions are awesome
        for key, value in re.findall(r'(.+)\n\{\n((?:.|\n)+?)\n\}', string):
            data[key] = value
    except Exception as e:
        # local import: traceback is only needed on the error path
        import traceback
        print('exception: ' + str(e))
        print(traceback.format_exc())
    return data
# Print entry and word statistics for one localization file: the file
# named by the first command-line argument, or 'English.lang' by default.
filename = 'English.lang'
if len(sys.argv) > 1:
    filename = sys.argv[1]
data = parseFile(filename)
count = 0
longest = 0
for key in data:
    # number of \w+ runs (words) in this entry's value
    length = len(re.findall(r'\w+', data[key]))
    count += length
    longest = max(longest, length)
# single-argument print(...) behaves the same on Python 2 and Python 3
print("Statistics for %s:" % filename)
print("-- Entries: %d" % len(data))
print("-- Words: %d" % count)
print("-- Longest entry: %d" % longest)
Also, you should use \d+ for numbers. I think that \d{1} will match only a single digit, i.e. the numbers 0 to 9.