1
2 """ Copyright (C) 2003 Peter Ohler
3
4 XMLite is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 XMLite is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You download a copy of the GNU General Public License at
15 http://www.gnu.org/licenses/gpl.txt or obtain a copy of the GNU General
16 Public License by writing to the Free Software Foundation, Inc., 59 Temple
17 Place - Suite 330, Boston, MA 02111-1307, USA.
18
19 XMLite - extremely light weight XML parser and printer
20
21 The xmlite module is an extremely light weight XML parser and printer. It
22 does not use the DOM or SAX interfaces but instead works with a simple
23 list or rather nested lists to represent an XML document. The parser takes
24 as input a string or filename and returns a list with all the elements of
25 the XML file.
26
27 The first item in the top level XML list is a dict object with 'version',
28 'encoding', and 'standalone' keys. If there are any decl tags such as
29 'DOCTYPE' they will be next in the list and will be tuples with the decl
30 tag name and the value of the tag as the second item of the tuple.
31
32 Comments are included as lists of two items. The first item is None and
33 the second is a string which is the comment text.
34
35 CDATA are tuples of two items. The first item is 'CDATA' and the second is
36 the CDATA content.
37
38 XML elements are lists. The first item in the list is the element tag or
39 name. The second item is a dict object which includes all the attributes of
40 the element. Any remaining list items are either comments, strings, CDATA,
41 or more elements as lists.
42
43 Author: Peter Ohler, peter@ohler.com
44 $Id: xmlite.py,v 1.1.1.1 2004/05/12 09:26:11 sam Exp $
45 """
46
47 import os
48 import sys
49 import string
50
class XmlException(Exception):
    """ XML Exception for reporting errors in parsing of an XML file or
    string.

    Attributes:
      msg  - the error message
      line - 1 based line number of the error, or -1 when no source
             string was supplied
      char - position of the error on that line, or -1 when no source
             string was supplied. On the first line this is the offset
             from the start of the string, on later lines it is the
             distance from the preceding newline.
    """

    def __init__(self, msg, s, pos):
        """ Pass in the error message, string being parsed, and the position
        in the string where the error was detected. Pass None as the string
        when there is no position to report.
        """
        Exception.__init__(self, msg)
        self.msg = msg
        if s is None:
            self.line = -1
            self.char = -1
        else:
            # Count '\n' and not os.linesep. Text read in text mode only
            # ever contains '\n', and '\r\n' content still has exactly one
            # '\n' per line, so this is right on every platform.
            self.line = 1 + s.count('\n', 0, pos)
            if self.line > 1:
                self.char = pos - s.rfind('\n', 0, pos)
            else:
                self.char = pos

    def __str__(self):
        """ Return the message, followed by the position when one is known.
        """
        if self.line < 0:
            return self.msg
        return "%s at %d of line %d" % (self.msg, self.char, self.line)
75
def printXml(xml, indent = 0):
    """ Print out a list that matches the expected XML list format. Other
    formats may not print out correctly. The output format is XML.

    xml    - one item of an XML list: a str (text, printed with coded
             characters expanded), a tuple ('CDATA', content) or
             (declName, value), or a list. A list whose first item is None
             is a comment, a list whose first item is a dict is a whole
             document starting with the prolog, any other list is an
             element [tag, attrs, child, ...].
    indent - number of spaces to put in front of the item.

    Raises XmlException if xml is not a str, tuple, or list.
    """
    # sys.stdout.write is used instead of the print statement so that the
    # function behaves the same on Python 2 and Python 3.
    out = sys.stdout.write
    istr = ' ' * indent
    if isinstance(xml, str):
        out("%s%s\n" % (istr, expandCodedChars(xml)))
    elif isinstance(xml, tuple):
        if 'CDATA' == xml[0]:
            out("%s<![CDATA[%s]]>\n" % (istr, xml[1]))
        else:
            out("%s<!%s %s>\n" % (istr, xml[0], xml[1]))
    elif isinstance(xml, list):
        tag = xml[0]
        if tag is None:
            out("%s<!-- %s -->\n" % (istr, xml[1]))
        elif isinstance(tag, dict):
            # Prolog: only the keys that have a value are written. The
            # prolog itself is never indented, the rest of the document
            # is printed two columns in.
            fields = ['%s="%s"' % (k, tag[k]) for k in tag if tag[k] is not None]
            out(" ".join(["<?xml"] + fields + ["?>"]) + "\n")
            for e in xml[1:]:
                printXml(e, indent + 2)
        else:
            _printElement(xml, tag, indent)
    else:
        raise XmlException("Invalid format", None, 0)


def _printElement(xml, tag, indent):
    """ Print the element list xml, whose tag is tag, at the given indent.
    """
    out = sys.stdout.write
    istr = ' ' * indent
    attrs = None
    if len(xml) > 1:
        attrs = xml[1]
    children = xml[2:]
    # An element with no children is written as an empty-element tag.
    if children:
        closer = ">"
    else:
        closer = "/>"

    if attrs is None:
        out("%s<%s%s\n" % (istr, tag, closer))
    else:
        out("%s<%s\n" % (istr, tag))
        text = _formatAttrs(attrs, indent + 3)
        if text:
            # The tag is closed on the line of the last attribute,
            # separated from it by one space.
            out(text + " ")
        out(closer + "\n")

    for e in children:
        printXml(e, indent + 2)
    if children:
        out("%s</%s>\n" % (istr, tag))


def _formatAttrs(attrs, indent):
    """ Return the attributes one per line, each indented by indent spaces,
    with no newline after the last one. Returns "" when attrs is None and
    raises XmlException when attrs is neither None nor a dict.
    """
    if attrs is None:
        return ""
    if not isinstance(attrs, dict):
        raise XmlException("Invalid format", None, 0)
    istr = ' ' * indent
    return "\n".join(['%s%s="%s"' % (istr, a, expandCodedChars(attrs[a]))
                      for a in attrs])


def printAttrs(attrs, indent = 0):
    """ Print the attributes in the attrs dict, one per line, each indented
    by indent spaces. No newline is written after the last attribute so
    the caller can close the tag on that line. Nothing is printed when
    attrs is None.

    Raises XmlException when attrs is neither None nor a dict.
    """
    sys.stdout.write(_formatAttrs(attrs, indent))
145
def toStr(xml, s = "", indent = 0):
    """ Return a string that is an XML document.

    xml    - one item of an XML list: a str (text, written as is), a tuple
             ('CDATA', content) or (declName, value), or a list. A list
             whose first item is None is a comment, a list whose first
             item is a dict is a whole document starting with the prolog,
             any other list is an element [tag, attrs, child, ...].
    s      - text generated so far. The new text is appended to it.
    indent - number of spaces to put in front of the item.

    Returns s with the text for xml appended.
    Raises XmlException if xml is not a str, tuple, or non-empty list.
    """
    istr = ' ' * indent
    if isinstance(xml, str):
        return s + "%s%s\n" % (istr, xml)
    if isinstance(xml, tuple):
        if 'CDATA' == xml[0]:
            return s + "%s<![CDATA[%s]]>\n" % (istr, xml[1])
        return s + "%s<!%s %s>\n" % (istr, xml[0], xml[1])
    if not isinstance(xml, list) or len(xml) == 0:
        # There is no source position to report for a bad XML list.
        raise XmlException("Invalid format", None, 0)

    tag = xml[0]
    if tag is None:
        return s + "%s<!-- %s -->\n" % (istr, xml[1])
    if isinstance(tag, dict):
        # Prolog: only the keys that have a value are written. The prolog
        # itself is never indented, the rest of the document is written
        # two columns in.
        s = s + "<?xml"
        for k in tag:
            if tag[k] is not None:
                s = s + ' %s="%s"' % (k, tag[k])
        s = s + " ?>\n"
        for e in xml[1:]:
            s = toStr(e, s, indent + 2)
        return s

    attrs = None
    if len(xml) > 1:
        attrs = xml[1]
    children = xml[2:]
    # An element with no children is written as an empty-element tag.
    if children:
        closer = ">\n"
    else:
        closer = "/>\n"

    if attrs is None:
        s = s + "%s<%s%s" % (istr, tag, closer)
    else:
        s = s + "%s<%s\n" % (istr, tag)
        s = attrsToStr(attrs, s, indent + 3)
        s = s + closer

    for e in children:
        s = toStr(e, s, indent + 2)
    if children:
        s = s + "%s</%s>\n" % (istr, tag)
    return s
184
def attrsToStr(attrs, s = "", indent = 0):
    """ Append the attributes in the attrs dict to s and return the result.

    Each attribute is written on its own line as name="value", indented by
    indent spaces, with coded characters expanded in the value. No newline
    is written after the last attribute so the caller can close the tag on
    that line. s is returned unchanged when attrs is None.

    Raises XmlException when attrs is neither None nor a dict.
    """
    if not isinstance(attrs, dict):
        if attrs is None:
            return s
        # There is no source position to report for a bad XML list.
        raise XmlException("Invalid format", None, 0)
    istr = ' ' * indent
    lines = ['%s%s="%s"' % (istr, a, expandCodedChars(attrs[a])) for a in attrs]
    return s + "\n".join(lines)
201
203 """ Load complete file into memory and then parse the string.
204 """
205 f = open(filename, "r")
206 if f == None:
207 return None
208 s = f.read()
209 f.close()
210
211 return parse(s)
212
def parse(s):
    """ Make one pass and parse directly into an XML list.

    s - the XML document as a string.

    Returns the top level XML list. Each prolog, comment, DECL, and
    element found at the top level is appended to it in document order.
    An empty or all-whitespace string gives an empty list.

    Raises XmlException when the document is not well formed.
    """
    # phase 0: nothing but comments seen so far
    # phase 1: a prolog or DECL has been seen
    # phase 2: an element has been seen
    phase = 0
    x = []
    i = 0
    end = len(s)

    while 1:
        while i < end and s[i] in string.whitespace:
            i += 1
        if i >= end:
            break

        if '<' != s[i]:
            raise XmlException("Expected a '<' character", s, i)
        i += 1
        if i >= end:
            raise XmlException("Unexpected end of document after '<'", s, i)
        c = s[i]
        if c == '?':
            if phase != 0:
                raise XmlException("Prolog must be the first element", s, i)
            i += 1
            i = readProlog(s, i, x)
            phase = 1
        elif c == '!':
            i += 1
            if '--' == s[i:i + 2]:
                # Comments are allowed anywhere and do not change the phase.
                i = readComment(s, i + 2, x)
            elif phase > 1:
                raise XmlException("DECLs must appear before other element", s, i)
            else:
                i = readDecl(s, i, x)
                phase = 1
        else:
            i = readElement(s, i, x)
            phase = 2
    return x
251
def readProlog(s, i, x):
    """ Read the prolog that starts at s[i], which is the first character
    after the opening '<?'.

    A dict with 'version', 'encoding', and 'standalone' keys is appended
    to x. Keys not present in the prolog have a value of None.

    Returns the index of the first character after the closing '?>'.
    Raises XmlException when the prolog is malformed or contains any
    other attribute.
    """
    version, encoding, standalone = None, None, None

    if 'xml' != s[i:i + 3]:
        raise XmlException("Expected 'xml' in prolog", s, i)
    i += 3
    while '?' != s[i]:
        token, i = readNameToken(s, i)

        while s[i] in string.whitespace:
            i += 1
        c = s[i]
        if '=' == c:
            i += 1
            if token == "version":
                version, i = readQuotedValue(s, i)
            elif token == "encoding":
                encoding, i = readQuotedValue(s, i)
            elif token == "standalone":
                standalone, i = readQuotedValue(s, i)
            else:
                # token is None when '=' has no name in front of it, so
                # format it instead of concatenating.
                raise XmlException("Invalid prolog attribute: '%s'" % token, s, i)
        elif '?' == c:
            break
        else:
            raise XmlException("Expected '=' or '?' in prolog", s, i)

    i += 1
    if '>' != s[i]:
        raise XmlException("Expected '>' after '?' in prolog", s, i)
    i += 1
    x.append({ 'version': version, 'encoding': encoding, 'standalone': standalone })

    return i
286
# Characters that end a name token.
nonNameStr = " \t\n\r?=/><\x0b\x0c"

def readNameToken(s, i):
    """ Skip white space starting at s[i] and then read a name token.

    The token runs up to, but does not include, the next character found
    in nonNameStr or the end of the string.

    Returns a tuple of the token and the index of the first character
    after it. The token is None when there is no name at that position.
    """
    end = len(s)
    while i < end and s[i] in string.whitespace:
        i += 1
    start = i
    while i < end and s[i] not in nonNameStr:
        i += 1
    if start == i:
        return None, i
    return s[start:i], i
298
def readQuotedValue(s, i):
    """ Skip white space starting at s[i] and then read a quoted value.

    The value may be quoted with either double or single quotes. It ends
    at the next occurrence of the quote character that opened it.

    Returns a tuple of the value, with coded characters replaced, and the
    index of the first character after the closing quote. The value is
    None when there is nothing between the quotes.

    Raises XmlException when there is no opening quote or no closing
    quote.
    """
    end = len(s)
    while i < end and s[i] in string.whitespace:
        i += 1
    if i >= end or (s[i] != '"' and s[i] != "'"):
        raise XmlException("Expected '\"' character", s, i)
    quote = s[i]
    start = i + 1
    close = s.find(quote, start)
    if close < 0:
        raise XmlException("Quoted value is not terminated", s, i)
    if start == close:
        return None, close + 1
    return replaceCodedChars(s[start:close]), close + 1
311
319
def readDecl(s, i, x):
    """ Read a DECL, such as a DOCTYPE, that starts at s[i], which is the
    first character after the opening '<!'.

    A tuple of the DECL name and the DECL value is appended to x. The
    value is everything between the name and the '>' that closes the
    DECL. Nested '<' and '>' pairs are part of the value.

    Returns the index of the first character after the closing '>'.
    Raises XmlException when the DECL is not closed.
    """
    name, i = readNameToken(s, i)
    end = len(s)
    while i < end and s[i] in string.whitespace:
        i += 1
    start = i
    # depth is the number of '<' that are still waiting for a '>'. The
    # '<' that opened the DECL counts as the first one.
    depth = 1
    while i < end:
        c = s[i]
        if '<' == c:
            depth += 1
        elif '>' == c:
            depth -= 1
            if depth == 0:
                x.append((name, s[start:i]))
                return i + 1
        i += 1
    raise XmlException("Expected '>' to close DECL", s, start)
338
def readElement(s, i, x):
    """ Read an element that starts at s[i], which is the first character
    after the opening '<'.

    The element is appended to x as a list. The first item is the tag,
    the second is a dict of the attributes or None if there are none, and
    any remaining items are the comments, CDATA, text, and child elements
    in the order they were found.

    Returns the index of the first character after the element.
    Raises XmlException when the element is malformed or the end tag does
    not match the start tag.
    """
    tag, i = readNameToken(s, i)
    element = [tag, None]

    while s[i] in string.whitespace:
        i += 1
    if '/' == s[i]:
        i += 1
        if '>' == s[i]:
            x.append(element)
            return i + 1
        raise XmlException("Expected '>' after '/'", s, i)

    # Read attributes until the start tag is closed. The dict is only
    # created once an attribute is found so an element with no attributes
    # keeps None as its second item.
    attrs = None
    while 1:
        name, i = readNameToken(s, i)
        while s[i] in string.whitespace:
            i += 1
        c = s[i]
        i += 1
        if '=' == c:
            while s[i] in string.whitespace:
                i += 1
            value, i = readQuotedValue(s, i)
            if attrs is None:
                attrs = {}
                element[1] = attrs
            attrs[name] = value
        elif '/' == c:
            if '>' != s[i]:
                raise XmlException("Expected '>' after '/'", s, i)
            i += 1
            x.append(element)
            return i
        elif '>' == c:
            break
        else:
            raise XmlException("Format error", s, i)

    # Read the content until the matching end tag is found.
    while 1:
        while s[i] in string.whitespace:
            i += 1
        if '<' == s[i]:
            i += 1
            c = s[i]
            if '!' == c:
                i += 1
                if '--' == s[i:i + 2]:
                    i = readComment(s, i + 2, element)
                elif '[CDATA[' == s[i:i + 7]:
                    # Skip all 7 characters of '[CDATA[' so that only the
                    # content ends up in the CDATA tuple.
                    i = readCData(s, i + 7, element)
                else:
                    raise XmlException("Comment format error", s, i)
            elif '/' == c:
                i += 1
                endTag, i = readNameToken(s, i)
                while s[i] in string.whitespace:
                    i += 1
                if '>' != s[i]:
                    raise XmlException("Expected '>' to close element end tag", s, i)
                if endTag != element[0]:
                    raise XmlException("Element end tag name mismatch", s, i)
                i += 1
                break
            else:
                i = readElement(s, i, element)
        else:
            i = readText(s, i, element)

    x.append(element)
    return i
414
def readCData(s, i, x):
    """ Read CDATA content that starts at s[i] and runs up to the next
    ']]>'.

    A tuple of 'CDATA' and the content is appended to x.

    Returns the index of the first character after the closing ']]>'.
    Raises XmlException when there is no closing ']]>'.
    """
    end = s.find(']]>', i)
    if 0 > end:
        raise XmlException("No CDATA closure", s, i)
    x.append(('CDATA', s[i:end]))

    return end + 3
423
def readText(s, i, x):
    """ Read text that starts at s[i] and runs up to the next '<'.

    The text is appended to x with leading and trailing white space
    removed and coded characters replaced.

    Returns the index of the '<' that ended the text.
    Raises XmlException when there is no '<' after the text.
    """
    end = s.find('<', i)
    if 0 > end:
        raise XmlException("No text closure", s, i)
    x.append(replaceCodedChars(s[i:end].strip()))

    return end
432
def replaceCodedChars(text):
    """ Return text with each coded character, such as '&lt;' or '&#65;',
    replaced by the character it stands for.

    The text is returned unchanged when it does not contain a '&'.
    Raises XmlException, by way of readCodedChar, when a '&' does not
    start a valid coded character.
    """
    if '&' not in text:
        return text

    # Collect the pieces and join them once at the end.
    pieces = []
    prev = 0
    while 1:
        amp = text.find('&', prev)
        if 0 > amp:
            pieces.append(text[prev:])
            break
        c, after = readCodedChar(text, amp)
        pieces.append(text[prev:amp])
        pieces.append(c)
        prev = after
    return "".join(pieces)
451
# Named coded characters and the characters they stand for.
_codedChars = {
    'nbsp': ' ',
    'lt': '<',
    'gt': '>',
    'amp': '&',
    'quot': '"',
    'apos': "'",
}

def readCodedChar(s, i):
    """ Read the coded character that starts with the '&' at s[i].

    Named codes (nbsp, lt, gt, amp, quot, apos), decimal codes such as
    '&#65;', and hexadecimal codes such as '&#x41;' are supported.

    Returns a tuple of the character and the index of the first character
    after the closing ';'.
    Raises XmlException when the ';' is missing or the code is not valid.
    """
    # The ';' is only looked for close to the '&' so that a stray '&' is
    # not paired with a ';' much later in the text. 12 characters is
    # enough for the longest code, '&#x10FFFF;'.
    end = s.find(';', i, i + 12)
    if 0 > end:
        raise XmlException("Invalid coded character. Not terminated by ';'", None, -1)
    code = s[i + 1:end]
    if '#' == code[:1]:
        digits = code[1:]
        try:
            if 'x' == digits[:1]:
                c = chr(int(digits[1:], 16))
            else:
                c = chr(int(digits))
        except ValueError:
            # Not a number, or a number chr() can not convert.
            raise XmlException("Invalid coded character '%s'" % code, None, -1)
    elif code in _codedChars:
        c = _codedChars[code]
    else:
        raise XmlException("Invalid coded character '%s'" % code, None, -1)

    return c, end + 1
477
def expandCodedChars(s):
    """ Return s with the characters that have a special meaning in XML
    replaced by their coded form: '&' by '&amp;', '<' by '&lt;', '>' by
    '&gt;', '"' by '&quot;', and "'" by '&apos;'.

    None is treated as an empty string so that an attribute with no value
    is written as name="".
    """
    if s is None:
        return ""

    # '&' has to be first. Otherwise the '&' of the codes added for the
    # other characters would be expanded a second time.
    codes = (('&', '&amp;'),
             ('<', '&lt;'),
             ('>', '&gt;'),
             ('"', '&quot;'),
             ("'", '&apos;'))
    for raw, coded in codes:
        if raw in s:
            s = s.replace(raw, coded)

    return s
496