#!/usr/bin/env python
import string

#Copyright (c) 2008, Patrick
#
#All rights reserved.
#
#Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
# * Neither the names of ohbah.com, secondpagemedia.com nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
#
#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
#"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
#A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
#CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
#EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
#PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
#PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
#LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
#NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import sys

try:
    import urllib2  # Python 2
except ImportError:
    import urllib.request as urllib2  # Python 3 home of build_opener()


def WSorText(String):
    """Wrap String as a WS object if it is all whitespace, else as Text."""
    if String.isspace():
        return WS(String)
    return Text(String)


def HasAttrs(String):
    """Return True if a tag's contents hold anything after the tag name.

    String is the text between the angle brackets.  Any whitespace
    (space, tab, newline) may separate the name from the attributes.
    """
    return len(String.split(None, 1)) > 1


def GrabAttrs(String):
    """Extract the attributes from a tag's contents and return them as a dict.

    String is the text between the angle brackets, tag name first.  Keys
    are lower-cased attribute names.  Values may be single-quoted,
    double-quoted or bare; an attribute with no value maps to ''.
    Whitespace around '=' is tolerated, and an unterminated quoted value
    runs to the end of the string.
    """
    Attr = {}
    Length = len(String)

    # Step over the tag name; everything after it is attribute text.
    ii = 0
    while ii < Length and not String[ii].isspace():
        ii += 1

    while ii < Length:
        # Cut through the whitespace in front of the next attribute.
        while ii < Length and String[ii].isspace():
            ii += 1
        if ii >= Length:
            break

        # The name runs up to '=' or whitespace.
        NStart = ii
        while ii < Length and String[ii] != "=" and not String[ii].isspace():
            ii += 1
        AttrName = String[NStart:ii]

        # Look past optional whitespace for an '='.  Only commit to the
        # look-ahead if one is found; otherwise the attribute has no value
        # and the next word is a new attribute.
        Peek = ii
        while Peek < Length and String[Peek].isspace():
            Peek += 1

        AttrValue = ''
        if Peek < Length and String[Peek] == "=":
            ii = Peek + 1
            while ii < Length and String[ii].isspace():
                ii += 1
            if ii < Length and String[ii] in "'\"":
                Quote = String[ii]
                VStart = ii + 1
                VEnd = String.find(Quote, VStart)
                if VEnd == -1:
                    VEnd = Length  # unterminated quote: take the rest
                AttrValue = String[VStart:VEnd]
                ii = VEnd + 1
            else:
                # Bare value: runs to the next whitespace.
                VStart = ii
                while ii < Length and not String[ii].isspace():
                    ii += 1
                AttrValue = String[VStart:ii]

        if AttrName:  # a stray '=' yields an empty name; ignore it
            Attr[AttrName.lower()] = AttrValue
    return Attr


# Predefined XML entities.  "&amp;" must be decoded last so that
# "&amp;lt;" becomes the literal text "&lt;" rather than "<".
_ENTITIES = (
    ("&lt;", "<"),
    ("&gt;", ">"),
    ("&apos;", "'"),
    ("&quot;", '"'),
    ("&amp;", "&"),
)


#The following classes represent XML datatypes
class XMLStr(str):
    """A string whose predefined entities are decoded on creation.

    Other than that it is identical to a normal string.
    """

    def __new__(Cls, Value=''):
        # str is immutable, so the decoded text has to be supplied in
        # __new__; doing the work in __init__ cannot change the value.
        # XXX Need to work with &#xxx; numeric references
        for Entity, Char in _ENTITIES:
            Value = Value.replace(Entity, Char)
        return str.__new__(Cls, Value)


class WS(XMLStr):
    """Text consisting entirely of whitespace; essentially a string."""
    pass


class Text(XMLStr):
    """Text found between tags; essentially a string."""
    pass


class StartTag(XMLStr):
    """Name of an opening tag without attributes; essentially a string."""
    pass


class EndTag(XMLStr):
    """Name of a closing tag; essentially a string."""
    pass


class EETag(XMLStr):
    """Name of an empty-element tag without attributes; essentially a string."""
    pass


class ProcessingInstruction(XMLStr):
    """Body of a processing instruction; essentially a string."""
    pass


class Comment(XMLStr):
    """Body of a comment; essentially a string."""
    pass


class TagAttr(str):
    """A tag name that also carries its attributes.

    Behaves like a string holding the tag name; TagAttr.Attr is a dict of
    the attributes.  Comparisons between two TagAttr objects (and their
    subclasses) compare only the string, not the dict.
    """

    def __init__(Self, Value=''):
        # The string value itself is set by str.__new__.
        Self.Attr = {}

    def __repr__(Self):
        return "'%s', %s" % (Self, Self.Attr)


class StartTagAttr(TagAttr):
    """An opening tag that has attributes."""
    pass


class EETagAttr(TagAttr):
    """An empty-element tag that has attributes."""
    pass


class Declaration(str):
    """A declaration such as a DOCTYPE.

    Behaves like a string holding the first word of the declaration;
    Declaration.Data holds everything to the right of the first run of
    whitespace.
    """

    def __init__(Self, Value=''):
        # The string value itself is set by str.__new__.
        Self.Data = ''

    def __repr__(Self):
        return "'%s', '%s'" % (Self, Self.Data)


class Parse(object):
    """Simple pull parser for XML-like markup.

    Data is the markup as a string, or a file-like object supporting
    read().  Each call to Read() returns the next element, False at the
    end of the data, and -1 on an error, in which case Parse.Error
    describes the problem.  Iterating over the parser yields elements
    until the end of the data or the first error.

    Entities (&amp; and friends) in XMLStr-based elements are decoded.

    Possible elements are:
        StartTag:              opening tag without attributes
        EndTag:                closing tag
        EETag:                 empty-element tag without attributes
        Text:                  strings found between the tags
        WS:                    strings consisting entirely of whitespace
        StartTagAttr:          opening tag with attributes
        EETagAttr:             empty-element tag with attributes
        ProcessingInstruction: body of a processing instruction
        Comment:               body of a comment
        Declaration:           declaration such as a DOCTYPE
    """

    def __init__(Self, Data):
        if hasattr(Data, "read"):
            Data = Data.read()
        # On Python 3 a binary stream yields bytes, which cannot be
        # searched with str patterns; on Python 2 bytes is str, so this
        # branch is skipped.
        if isinstance(Data, bytes) and not isinstance(Data, str):
            Data = Data.decode("utf-8", "replace")
        Self.Data = Data
        Self.Index = 0
        Self.Error = ""

    def __repr__(Self):
        # Slicing past the end is safe, so no length check is needed.
        return "%s:'%s'" % (Self.Index, Self.Data[Self.Index:Self.Index + 20])

    def Read(Self):
        """Return the next element, False at end of data, or -1 on error."""
        if Self.Index >= len(Self.Data):
            return False  # We're at the end of the stream

        Spot = Self.Data.find("<", Self.Index)  # Find the next tag
        if Spot == -1:
            # No tag found, must be text from here to the end.
            Ret = WSorText(Self.Data[Self.Index:])
            Self.Index = len(Self.Data)
            return Ret

        if Self.Data[Self.Index:Spot] == '\n':
            Self.Index = Spot  # A single newline before a tag doesn't count

        if Spot > Self.Index:
            # The tag is further on, so everything up to it is text.
            Ret = WSorText(Self.Data[Self.Index:Spot])
            Self.Index = Spot
            return Ret

        return Self._ReadMarkup(Spot)

    def _Fail(Self, Message):
        """Record Message (formatted with the current index); return -1."""
        Self.Error = Message % Self.Index
        return -1

    def _ReadMarkup(Self, Spot):
        """Parse the markup construct whose '<' sits at Spot."""
        Data = Self.Data

        if Data.startswith("<!--", Spot):
            End = Data.find("-->", Spot + 4)
            if End == -1:
                return Self._Fail(
                    "Could not find closing bracket for a comment at %s")
            Self.Index = End + 3
            return Comment(Data[Spot + 4:End].strip())

        if Data.startswith("<?", Spot):
            End = Data.find("?>", Spot + 2)
            if End == -1:
                return Self._Fail(
                    "Could not find closing bracket for a "
                    "Programming Instruction at %s")
            Self.Index = End + 2
            # End is the index of "?>", so the body stops exactly there.
            return ProcessingInstruction(Data[Spot + 2:End].strip())

        if Data.startswith("<!", Spot):
            End = Data.find(">", Spot + 2)
            if End == -1:
                return Self._Fail(
                    "Could not find closing bracket for a declaration at %s")
            Content = Data[Spot + 2:End].split()
            if not Content:
                return Self._Fail("Empty declaration found at %s")
            Ret = Declaration(Content[0])
            Ret.Data = " ".join(Content[1:])
            Self.Index = End + 1
            return Ret

        return Self._ReadTag(Spot)

    def _ReadTag(Self, Spot):
        """Parse a start, end or empty-element tag whose '<' sits at Spot."""
        End = Self.Data.find(">", Spot + 1)
        if End == -1:
            return Self._Fail("Could not find closing bracket for a tag at %s")
        Contents = Self.Data[Spot + 1:End].strip()
        if not Contents:
            return Self._Fail("Empty tag found at %s")
        Self.Index = End + 1

        if Contents[0] == '/':  # It's a closing tag
            return EndTag(Contents[1:].strip())

        if Contents[-1] == '/':  # It's an empty element
            Inner = Contents[:-1].strip()
            if HasAttrs(Inner):
                Ret = EETagAttr(Inner.split()[0])
                Ret.Attr = GrabAttrs(Inner)
                return Ret
            return EETag(Inner)

        # It's a start tag
        if HasAttrs(Contents):
            Ret = StartTagAttr(Contents.split()[0])
            Ret.Attr = GrabAttrs(Contents)
            return Ret
        return StartTag(Contents)

    def __iter__(Self):
        return Self

    def next(Self):
        """Return the next element; stop at end of data or on an error."""
        Tag = Self.Read()
        if Tag is False or Tag == -1:
            raise StopIteration
        return Tag

    __next__ = next  # Python 3 spelling of the iterator protocol


curl = urllib2.build_opener()


def _FormatAttrs(Attr):
    """Render an attribute dict as ' name="value"' pairs."""
    return "".join([' %s="%s"' % (Name, Attr[Name]) for Name in Attr])


def _FormatElements(Elements):
    """Yield one indented output line per non-whitespace element."""
    Depth = 0
    for Element in Elements:
        Kind = Element.__class__
        if Kind is WS:
            continue  # Ignore whitespace, we make our own
        if Kind is EndTag:
            # Step back out first so the end tag lines up with its start tag.
            Depth = max(Depth - 1, 0)
        Indent = " " * Depth

        if Kind is Text:
            yield Indent + Element
        elif Kind is StartTag:
            yield Indent + "<" + Element + ">"
            Depth += 1
        elif Kind is EndTag:
            yield Indent + "</" + Element + ">"
        elif Kind is EETag:
            yield Indent + "<" + Element + " />"
        elif Kind is ProcessingInstruction:
            yield Indent + "<?" + Element + "?>"
        elif Kind is Comment:
            yield Indent + "<!-- " + Element + " -->"
        elif Kind is StartTagAttr:
            yield Indent + "<" + Element + _FormatAttrs(Element.Attr) + ">"
            Depth += 1
        elif Kind is EETagAttr:
            yield Indent + "<" + Element + _FormatAttrs(Element.Attr) + " />"
        elif Kind is Declaration:
            yield Indent + "<!" + Element + " " + Element.Data + ">"


def _Main(Argv):
    """Demo: re-print a document with one element per line.

    With a file name argument that file is parsed; with no argument a
    sample page is fetched over HTTP.  Returns a process exit status.
    """
    if len(Argv) > 2:
        sys.stderr.write("Quick demo: psxml.py [file.xml]\n")
        return 1
    if len(Argv) == 2:
        with open(Argv[1]) as F:
            XML = Parse(F.read())
    else:
        Response = curl.open("http://www.xkcd.com/10/")
        try:
            XML = Parse(Response.read())
        finally:
            Response.close()

    for Line in _FormatElements(XML):
        print(Line)
    if XML.Error:
        sys.stderr.write(XML.Error + "\n")
        return 1
    return 0


#Testline: reload(psxml); F = open("test.xml"); Self = psxml.Parse(F)
if __name__ == "__main__":
    sys.exit(_Main(sys.argv))