Code archives/File Utilities/HTML/XML Parser

This code has been declared by its author to be Public Domain code.

Download source code

HTML/XML Parser by Perturbatio2005
(BMX)
example is at the bottom of the code.
Strict
Rem
bbdoc: Pert.HTML
End Rem
Module Pert.html
ModuleInfo "Module: Perturbatio's HTML Mod"
ModuleInfo "Version: 0.01"
ModuleInfo "Author: Kris Kelly (Perturbatio), portions converted from delphi source, author unknown"
ModuleInfo "License: Public Domain"
Import Pub.stdc
Import BRL.FileSystem
Import BRL.LinkedList


Global Entities:String[][] =[[""",  """],["&",   "&"],["<",    "<"],[">",    ">"],[" ",  " "],["¡", "¡"],["¢",  "¢"],["£", "£"],["¤","¤"],["¥",   "¥"],["¦","¦"],["§",  "§"],["¨",   "¨"],["©",  "©"],["ª",  "ª"],["«", "«"],["¬",   "¬"],["­",   "­"],["®",   "®"],["¯",  "¯"],["°",   "°"],["±","±"],["²",  "²"],["³",  "³"],["´", "´"],["µ", "µ"],["¶",  "¶"],["·","·"],["¸", "¸"],["¹",  "¹"],["º",  "º"],["»", "»"],["¼","¼"],["½","½"],["¾","¾"],["¿","¿"],["À","À"],["Á","Á"],["Â", "Â"],["Ã","Ã"],["Ä",  "Ä"],["Å", "Å"],["Æ", "Æ"],["Ç","Ç"],["È","È"],["É","É"],["Ê", "Ê"],["Ë",  "Ë"],["Ì","Ì"],["Í","Í"],["Î", "Î"],["Ï",  "Ï"],["Ð",   "Ð"],["Ñ","Ñ"],["Ò","Ò"],["Ó","Ó"],["Ô", "Ô"],["Õ","Õ"],["Ö",  "Ö"],["×", "×"],["Ø","Ø"],["Ù","Ù"],["Ú","Ú"],["Û", "Û"],["Ü",  "Ü"],["Ý","Ý"],["Þ", "Þ"],["ß", "ß"],["à","à"],["á","á"],["â", "â"],["ã","ã"],["ä",  "ä"],["å", "å"],["æ", "æ"],["ç","ç"],["è","è"],["é","é"],["ê", "ê"],["ë",  "ë"],["ì","ì"],["í","í"],["î", "î"],["ï",  "ï"],["ð",   "ð"],["ñ","ñ"],["ò","ò"],["ó","ó"],["ô", "ô"],["õ","õ"],["ö",  "ö"],["÷","÷"],["ø","ø"],["ù","ù"],["ú","ú"],["û", "û"],["ü",  "ü"],["ý","ý"],["þ", "þ"],["ÿ",  "ÿ"]];

Global CharSet:String[] = [" ","!","~q","#","$","%","&","(",")"..
,"*","+",",","-",".","/","0","1","2","3","4","5","6","7","8"..
,"9",":",";","<","=",">","?","@","A","B","C","D","E","F","G"..
,"H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V"..
,"W","X","Y","Z","[","\","]","^","_","`","a","b","c","d","e"..
,"f","g","h","i","j","k","l","m","n","o","p","q","r","s","t"..
,"u","v","w","x","y","z","{","|","}","","_","_","~q","ƒ"..
,".","?","?","^","%","S","<","O","_"..
,"Z","_","_","","-","-","~~","T","s",">","o"..
,"_","z","Y"," ","¡","¢","£","¤","¥","¦","§","¨","©","ª","«"..
,"¬","­","®","¯","°","±","²","³","´","µ","¶","·","¸","¹","º"..
,"»","¼","½","¾","¿","À","Á","Â","Ã","Ä","Å","Æ","Ç","È","É"..
,"Ê","Ë","Ì","Í","Î","Ï","Ð","Ñ","Ò","Ó","Ô","Õ","Ö","×","Ø"..
,"Ù","Ú","Û","Ü","Ý","Þ","ß","à","á","â","ã","ä","å","æ","ç"..
,"è","é","ê","ë","ì","í","î","ï","ð","ñ","ò","ó","ô","õ","ö"..
,"÷","ø","ù","ú","û","ü","ý","þ","ÿ"]
Rem
bbdoc: THTMLParam type
End Rem
Type THTMLParam

	Field fRaw:String
	Field fKey:String
	Field fValue:String
	
	Rem
	bbdoc: Sets a key passed as a string i.e. "color=~qBlack~q"
	End Rem
	Method SetKey(Key:String)
		fValue = ""
		fRaw = Key
		'DebugStop
		If Key.Find("=")>-1 Then
			fValue = Key
			'fValue = fValue[0..Key.Find("=")]
			fValue = fValue[Key.Find("=")+1..]
			key = Key[0..Key.Find("=")]
			
			If Len(fValue)>1 Then
				If (fValue[0..1] = "~q") And (fValue[Len(fValue)-1..]="~q") Then
					fValue = fValue[1..Len(fValue)-1]
				EndIf
			EndIf
		EndIf
		
		fKey = Key.ToUpper()
	End Method
	Rem
	bbdoc: Creates a new THTMLParam instance
	About: Usage: THTMLParam.Create()
	End Rem
	Function Create:THTMLParam()
		Local tempHTMLParam:THTMLParam = New THTMLParam
		Return tempHTMLParam
	End Function
	Rem
	bbdoc: Destroy function frees the passed type and flushes the memory
	about: pass an instance of the THTMLParam type, no return value<br> usage:THTMLParam.Destroy(HTMLParam)
	End Rem
	Function Destroy(HTMLParam : THTMLParam Var)
		HTMLParam = Null
		
	End Function
End Type

Rem
bbdoc: THTMLtag type
End Rem
Type THTMLTag

	Field fName:String
	Field fRaw:String
	Field Params:TList
	Rem
	bbdoc: Sets the tag name (should not be called directly)
	End Rem
	Method SetName(Name:String)
		Local Tag : String
		Local param : String
		Local HTMLParam : THTMLParam
		Local isQuote : Int
		
		fRaw = Name
		Params.Clear()
		'DebugStop
		
		While (Len(Name)>0) And (Name[0..1] <> " ")
			Tag = Tag + Name[0..1]
			Name = Name[1..]
		Wend

		fName = Tag.ToUpper()
		
		While (Len(Name)>0)
			param = ""
			isQuote = False
			While (Len(Name)>0) And ( Not ((Name[0..1]=" ") And (isQuote=False)))
				If Name[0..1] = "~q" Then IsQuote = Not(IsQuote)
					param = param + Name[0..1]
					Name = Name[1..]
			Wend
				
				If (Len(Name)>0) And (Name[0..1]=" ") Then Name = Name[1..]
				If (param <> "") Then
					HTMLParam = THTMLParam.Create()
					HTMLParam.SetKey(param)
					params.AddLast(HTMLParam)
				EndIf
			
		Wend
	GCCollect()
	End Method
	
	Method GetName()
	End Method
	Rem
	bbdoc: returns the raw HTML code for this tag
	End Rem
	Method GetRaw:String()
		Return fRaw
	End Method
		
	Function Create:THTMLTag()
		Local tempHTMLTag:THTMLTag = New THTMLTag
			tempHTMLTag.Params = New TList
		Return tempHTMLTag
	End Function
	
	Function Destroy(TAG:THTMLTag)
		TAG.Params.Clear()
		TAG = Null
		
	End Function

End Type

Rem
bbdoc: THTMLText type
about: Contains any text blocks within the supplied document
End Rem
Type THTMLText

	Field fLine:String
	Field fRawLine:String

	Method SetLine(Line:String)
		Local j : Int
		Local i : Int
		Local Entity : String
		Local isEntity : Int
		Local EnLen : Int
		Local EnPos : Int
		Local d : Int
		Local c : Int
		
		fRawLine = Line
		
		Line = Line.Replace(Chr(10), " ")
		Line = Line.Replace("  ", " ")
		
		i = 0
		isEntity = False
		EnPos = -1

		While i <= Len(Line)
			If Line[i..i+1] = "&" Then 
				EnPos = 1
				isEntity = True
				Entity = ""
			EndIf
			If isEntity Then Entity = Entity+Line[i..i+1]
			
			If isEntity Then
				If (Line[i..i+1]=";") Or (Line[i..i+1]=" ") Then
					EnLen = Len(Entity)
					
					If (EnLen > 2) And (Entity[1..2] = "#") Then
						Entity = Entity[..EnLen-1] 'remove semicolon
						Entity = Entity[2..] 'remove &#
						If Entity[0..1].ToUpper()="X" Then Entity = "$" + Entity[1..]
						If (Len(Entity)<=3) Then 
							d = Int(entity)
							If d <> Null Then
								Line = Line[0..EnPos]+Line[EnPos+EnLen..]
								StrInsert(CharSet[d], Line, EnPos)
								i = EnPos
							EndIf
						EndIf
					Else
					
						j = 1
					
						While (j<=100)
							If Entity = (Entities[j][1]) Then
								Line = Line[0..EnPos]+Line[EnPos+EnLen..]
								StrInsert(Line, Entities[j][2], EnPos)
								j = 102
							EndIf
							j:+1
						Wend
						
						If j=103 Then 
							i = enPos-1
						Else 
							i = EnPos
						EndIf
						
						
					EndIf
					
				EndIf
				IsEntity=False
			EndIf
			i:+1
		Wend
	fLine=Line;
	
	End Method
	Rem
	bbdoc: returns the raw HTML code for this text portion
	End Rem
	Method GetRaw:String()
		Return fRawLine
	End Method
	Rem
	bbdoc: returns a new THTMLText instance
	End Rem
	Function Create:THTMLText()
		Local tempHTMLText : THTMLText = New THTMLText
		
		Return tempHTMLText
	End Function
	Rem
	bbdoc: Destroy a THTMLText instance
	End Rem
	Function Destroy(HTMLText : THTMLText Var)
		HTMLText = Null
		
	End Function

End Type

Rem
bbdoc: THTMLParser Type
End Rem
Type THTMLParser

	Field Text:String
	Field Tag:String
	Field isTag:Int
	Field parsed:TList
	Field Lines:TList

	Method AddText()
		Local HTMLText:THTMLText
		If Not isTag Then
			If Text <> "" Then
				
				HTMLText = THTMLText.Create()
				HTMLText.SetLine(Text)
				Text = ""
				parsed.AddLast(HTMLText)
				
			EndIf
		EndIf
		
	End Method
	
	Rem
	bbdoc: Pass a filename to load (can specify a url by prefixing with "http::" )
	End Rem
	Method LoadFile(FileName:String)
		Lines.Clear()
		Local HTMLFile:TStream 
		Try
			HTMLFile = ReadStream(FileName)
			While Not Eof(HTMLFile)
				Lines.AddLast(ReadLine(HTMLFile))
			Wend
		Catch a$
			CloseStream(HTMLFile)
			RuntimeError(a$)
		EndTry
		CloseStream(HTMLFile)
		
	End Method
	
	Method AddTag()
		Local HTMLTag:THTMLTag;

		isTag = False
		HTMLTag = THTMLTag.Create()
		
		HTMLTag.SetName(Tag)
		
		Tag = ""
		parsed.AddLast(HTMLTag)
		
	End Method

	
	Function Create:THTMLParser()
		Local tempParser : THTMLParser = New THTMLParser
			'initialize the lists
			tempParser.parsed:TList = New TList
			tempParser.Lines:TList = New TList
		Return tempParser
	End Function
	

	Function Destroy(parser:THTMLParser Var)
		parser.parsed.clear()
		parser.lines.clear()
		parser.parsed = Null
		parser.lines = Null
		parser = Null
		
	End Function

	Rem
	bbdoc: Call execute to parse the file (NOTE: You MUST call LoadFile first)
	End Rem
	Method Execute()
		Local s:String
		Text = ""
		Tag = ""
		isTag =False;
		
		For s = EachIn Lines
			While Len(s) > 0
				If s[0..1] = "<" Then 
					AddText()
					isTag=True
				Else If s[0..1] = ">" Then 
					AddTag()
				Else If isTag Then 
					Tag = Tag + s[0..1]
				Else 
					Text = Text + s[0..1]
			    End If
				
				s=s[1..] 'slice the first character off
				
			Wend
			
				If (Not isTag) And (Text <> "") Then Text = Text + Chr(10)
				
		Next
					
		If (isTag) And (Tag <> "") Then AddTag()
		If (Not isTag) And (Text <> "") Then AddText()
		
	End Method

End Type

Rem
bbdoc: insert inString into SourceStr at the specified index
End Rem
Function StrInsert(SourceStr:String Var, inString:String, Index:Int)
	SourceStr = SourceStr[..Index] + inString + SourceStr[Index..]
End Function


'test
Rem
Print MemAlloced()

Local myParser : THTMLParser = THTMLParser.Create()

myParser.LoadFile("http::www.blitzbasic.com")

myParser.Execute()

For Local a:Object = EachIn myParser.parsed
	If THTMLTag(a) Then
		Print THTMLTag(a).fName
		For Local b:THTMLParam = EachIn THTMLTag(a).params
			Print b.fKey
			Print b.fValue
		Next
	Else
		Print THTMLText(a).fLine
	EndIf
Next

myParser.Destroy(myParser)
FlushMem
Print MemAlloced()
End
EndRem

Comments

None.

Code Archives Forum