#!/usr/bin/env python
import string
#Copyright (c) 2008, Patrick
#
#All rights reserved.
#
#Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
#
#    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
#    * Neither the names of ohbah.com, secondpagemedia.com nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
#
#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
#"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
#A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
#CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
#EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
#PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
#PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
#LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
#NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


def WSorText(String):
	"""Wraps String in a WS object when it is made up of nothing but 
	whitespace, and in a Text object otherwise (an empty string counts as 
	Text)"""
	Kind = WS if String.isspace() else Text
	return Kind(String)
def HasAttrs(String):
	"""Reports whether the contents of an xml tag look like they carry 
	attributes.  The only thing checked is whether String contains a space 
	character; tabs and newlines do not count."""
	#Probably need to add stuff here
	return " " in String

def GrabAttrs(String):
	"""Strips the attributes from an HTML tag and returns them as a hash.

	String is the text between the angle brackets, tag name included (for 
	example 'a href="x" id=main').  Parsing starts at the first whitespace 
	character; what follows is read as a sequence of name, name=value, 
	name='value' or name="value" items.  Whitespace is allowed on either 
	side of the "=".  

	Returns a dict mapping each lower-cased attribute name to its value.  
	An attribute without a value maps to ''.  A bare "/" (left over from an 
	empty-element tag such as "br /") and items without a name are ignored.  
	A quoted value whose closing quote is missing runs to the end of 
	String.  A String without whitespace yields an empty dict."""
	Length = len(String)
	Attr = {}
	#The tag name ends at the first whitespace; the attributes follow it
	ii = 0
	while ii < Length and not String[ii].isspace():
		ii += 1
	while ii < Length:
		while ii < Length and String[ii].isspace(): #Cut through the whitespace
			ii += 1
		if ii == Length:
			break

		#Find the attribute's name
		NStart = ii
		while ii < Length and String[ii] != "=" and not String[ii].isspace():
			ii += 1
		AttrName = String[NStart:ii]

		#Look past any whitespace for the "=" that introduces a value
		Peek = ii
		while Peek < Length and String[Peek].isspace():
			Peek += 1
		AttrValue = ''
		if Peek < Length and String[Peek] == "=":
			ii = Peek + 1
			while ii < Length and String[ii].isspace():
				ii += 1
			if ii < Length and String[ii] in "'\"":
				#Quoted value: everything up to the matching quote
				VStart = ii + 1
				VEnd = String.find(String[ii], VStart)
				if VEnd == -1: #No closing quote, take what is left
					VEnd = Length
				ii = VEnd + 1
			else:
				#Unquoted value: starts right here (its first character 
				#is part of the value) and runs to the next whitespace
				VStart = ii
				while ii < Length and not String[ii].isspace():
					ii += 1
				VEnd = ii
			AttrValue = String[VStart:VEnd]
		#Stored here rather than after a terminator was seen, so the last 
		#attribute in String is kept as well
		if AttrName not in ("", "/"):
			Attr[AttrName.lower()] = AttrValue
	return Attr


#The following classes represent XML datatypes
class XMLStr(str):
	"""Converts &stuff; things into their correct values on creation, 
	other than that identical to a normal string.

	Handles the five predefined entities &lt; &gt; &apos; &quot; and &amp;.  
	Numeric character references (&#xxx;) are left untouched."""
	def __new__(Cls, Value):
		#str is immutable, so the converted text has to be handed to 
		#__new__; by the time __init__ runs the contents are already fixed.  
		#&amp; goes last so that "&amp;lt;" becomes "&lt;" and not "<".  
		Converted = Value.replace("&lt;", "<").replace("&gt;", ">")
		Converted = Converted.replace("&apos;", "'").replace("&quot;", '"')
		Converted = Converted.replace("&amp;", "&") #XXX Need to work with &#xxx;
		return str.__new__(Cls, Converted)

class WS(XMLStr):
	"""A string made up of whitespace only.  Essentially a string, treat 
	it as such"""

class Text(XMLStr):
	"""A string found between the tags.  Essentially a string, treat it 
	as such"""

class StartTag(XMLStr):
	"""The name of an opening tag without attributes, as in <sample>.  
	Essentially a string, treat it as such"""

class EndTag(XMLStr):
	"""The name of a closing tag, as in </sample>.  Essentially a string, 
	treat it as such"""

class EETag(XMLStr):
	"""The name of an empty-element tag without attributes, as in 
	<sample />.  Essentially a string, treat it as such"""

class ProcessingInstruction(XMLStr):
	"""The contents of a processing instruction, as in <?sample?>.  
	Essentially a string, treat it as such"""

class Comment(XMLStr):
	"""The contents of a comment, as in <!--sample-->.  Essentially a 
	string, treat it as such"""

class TagAttr(str):
	"""Much like a string, except that every instance also carries 
	TagAttr.Attr, a hash that starts out empty.  Note that comparisons 
	between two TagAttr (and their children) don't compare the hash, just 
	the string"""
	def __init__(Self, Value = ''):
		#The text itself was already fixed by str.__new__; all that is 
		#left to do here is to give the instance its own hash
		Self.Attr = dict()
	def __repr__(Self):
		return "'" + str(Self) + "', " + str(Self.Attr)
	
class StartTagAttr(TagAttr):
	"""A TagAttr under another name, for opening tags that have 
	attributes, as in <sample attribute='value'>"""
class EETagAttr(TagAttr):
	"""A TagAttr under another name, for empty-element tags that have 
	attributes, as in <sample attribute='value' />"""

class Declaration(str):
	"""Much like a string, meant to hold the FOO in <!FOO BAR> for XML 
	declarations.  Declaration.Data is a plain attribute that starts out 
	empty; it is there for everything to the right of the first bit of 
	whitespace (the BAR) and has to be filled in by whoever creates the 
	object.  """
	def __init__(Self, Value = ''):
		#The text itself was already fixed by str.__new__; only the Data 
		#attribute needs setting up
		Self.Data = ''
	def __repr__(Self):
		return "'" + str(Self) + "', '" + str(Self.Data) + "'"
	

class Parse:
	"""Simple parser.  Works on the complete text of a document held in a 
	string (for a file pass F.read(); the data is indexed, sliced and 
	searched, so a file object will not do).  Returns another element every 
	time that Read() is called, False if you're at the end of the data, and 
	-1 on an Error.  If there's an error, Parse.Error will be updated to 
	reflect the error.  A Parse object can also be iterated over; iteration 
	stops at the end of the data or at the first error.  
	Entity (&sample;) handling is whatever the XMLStr based element classes 
	provide.  
	Possible elements are:
		StartTag: <sample>
		EndTag: </sample>
		EETag: <sample />
		Text: Strings found between the tags.
		WS: Strings consisting entirely of tab, cr or spaces
		StartTagAttr: <sample attribute='value'>
		EETagAttr: <sample attribute='value' />
		ProcessingInstruction: <?sample?>
		Comment: <!--sample-->
		Declaration: <!SAMPLE data>"""
	def __init__(Self, Data):
		Self.Data = Data  #The whole document
		Self.Index = 0    #Where the next Read() starts
		Self.Error = ""   #Description of the last error, if any
	def __repr__(Self):
		#Slices are clipped to the end of the data, no need to check
		return "%s:'%s'" % (Self.Index, Self.Data[Self.Index:Self.Index + 20])
	def Read(Self):
		"""Returns a single element, False at the end of the data, or -1 
		on an error (Self.Error then says what went wrong and Self.Index is 
		left at the offending spot)"""
		if Self.Index >= len(Self.Data):
			#We're at the end of the stream
			return False

		Spot = Self.Data.find("<", Self.Index) #Find the next tag
		#A single newline right in front of a tag doesn't count.  Only 
		#checked when a tag was found; slicing with -1 would cut off the 
		#last character instead.  
		if Spot != -1 and Self.Data[Self.Index:Spot] == '\n':
			Self.Index = Spot

		if Spot == -1: #No tag found, must be text from here to the end
			Ret = WSorText(Self.Data[Self.Index:])
			Self.Index = len(Self.Data)
			return Ret

		if Spot != Self.Index:
			#The tag is somewhere in the future, ergo, everything 
			#up to the tag must be text
			Ret = WSorText(Self.Data[Self.Index:Spot])
			Self.Index = Spot
			return Ret

		#Found a tag right under the index.  Basic flow for tags: 
		#Identify the tag type -> Find the close bracket -> extract the 
		#data, update the index
		if Self.Data.startswith("<!--", Spot):#It's a comment
			End = Self.Data.find("-->", Spot + 4)
			if End == -1:
				Self.Error = "Could not find closing bracket for a comment at %s" % Self.Index
				return -1
			Ret = Comment(Self.Data[Spot + 4:End].strip())
			Self.Index = End + 3

		elif Self.Data.startswith("<?", Spot):#It's a PI
			End = Self.Data.find("?>", Spot + 2)
			if End == -1:
				Self.Error = "Could not find closing bracket for a Programming Instruction at %s" % Self.Index
				return -1
			#End points at the "?" of "?>", so the slice stops right there
			Ret = ProcessingInstruction(Self.Data[Spot + 2:End].strip())
			Self.Index = End + 2

		elif Self.Data.startswith("<!", Spot):#Declaration
			End = Self.Data.find(">", Spot + 2)
			if End == -1:
				Self.Error = "Could not find closing bracket for a declaration at %s" % Self.Index
				return -1
			#End points at the ">", so the slice stops right there
			Content = Self.Data[Spot + 2:End].split()
			if not Content:
				Self.Error = "Empty declaration found at %s" % Self.Index
				return -1
			Ret = Declaration(Content[0])
			Ret.Data = " ".join(Content[1:])
			Self.Index = End + 1

		else: #Standard tag of some sort.  
			End = Self.Data.find(">", Spot + 1)
			if End == -1:
				Self.Error = "Could not find closing bracket for a tag at %s" % Self.Index
				return -1
			Contents = Self.Data[Spot + 1:End].strip()
			if not Contents:
				Self.Error = "Empty tag found at %s" % Self.Index
				return -1
			Self.Index = End + 1
			if Contents[0] == '/':#It's a closing tag
				Ret = EndTag(Contents[1:].strip())
			elif Contents[-1] == '/':#It's an empty element
				#Leave out the slash and the whitespace in front of it, 
				#neither belongs to the name or to the attributes
				Body = Contents[:-1].strip()
				if HasAttrs(Body):
					Ret = EETagAttr(Body.split()[0])
					Ret.Attr = GrabAttrs(Body)
				else:
					Ret = EETag(Body)
			else:#It's a start tag 
				if HasAttrs(Contents):
					Ret = StartTagAttr(Contents.split()[0])
					Ret.Attr = GrabAttrs(Contents)
				else:
					Ret = StartTag(Contents)
		return Ret

	def __iter__(Self):
		return Self

	def next(Self):
		"""Iterator step: the next element, or StopIteration once Read() 
		reports the end of the data or an error"""
		Tag = Self.Read()
		if Tag is False or Tag == -1: #End of data, or there has been an error
			raise StopIteration
		return Tag
	#Same method under the name newer Pythons look for
	__next__ = next
#ADD SAMPLE CODE TO DEMONSTRATE PARSER BY READING AND OUTPUTTING A BIT OF INFORMATION

import urllib2
curl = urllib2.build_opener()
#Testline: reload(psxml);  F = open("test.xml"); Self = psxml.Parse(F.read())
if __name__ == "__main__":
	#Example.  Fetches a web page and reprints its markup, one element per 
	#line, indented two spaces for every tag that is still open.  
	import sys
	#To reprint a local file instead (psxml.py <filename>):
	#	F = open(sys.argv[1])
	#	XML = Parse(F.read())
	#	F.close()
	XML = Parse(curl.open("http://www.xkcd.com/10/").read())
	TabSpot = 0
	for ii in XML:
		#check the type:
		TabFill = "  " * TabSpot
		if ii.__class__ is WS:
			pass #Ignore whitespace, we'll make our own
		elif ii.__class__ is Text:
			print(TabFill + ii)
		elif ii.__class__ is StartTag:
			print(TabFill + "<" + ii + ">")
			TabSpot += 1
		elif ii.__class__ is EndTag:
			print(TabFill + "</" + ii + ">")
			TabSpot -= 1
		elif ii.__class__ is EETag:
			print(TabFill + "<" + ii + " />")
		elif ii.__class__ is ProcessingInstruction:
			print(TabFill + "<?" + ii + "?>")
		elif ii.__class__ is Comment:
			print(TabFill + "<!--" + ii + "-->")
		elif ii.__class__ is StartTagAttr:
			#Tag name, attributes and closing bracket, a space between each
			Pieces = [TabFill + "<" + ii]
			for jj in ii.Attr:
				Pieces.append('%s = "%s"' % (jj, ii.Attr[jj]))
			Pieces.append(">")
			print(" ".join(Pieces))
			TabSpot += 1
		elif ii.__class__ is EETagAttr:
			Pieces = [TabFill + "<" + ii]
			for jj in ii.Attr:
				#The values live on the tag (ii); jj is only the key
				Pieces.append('%s = "%s"' % (jj, ii.Attr[jj]))
			Pieces.append("/>")
			print(" ".join(Pieces))
		elif ii.__class__ is Declaration:
			print(TabFill + "<!" + ii + " " + ii.Data + ">")