Code archives/File Utilities/HTML/XML Parser
This code has been declared by its author to be Public Domain code.
Download source code
| |||||
(BMX) example is at the bottom of the code. | |||||
Strict Rem bbdoc: Pert.HTML End Rem Module Pert.html ModuleInfo "Module: Perturbatio's HTML Mod" ModuleInfo "Version: 0.01" ModuleInfo "Author: Kris Kelly (Perturbatio), portions converted from delphi source, author unknown" ModuleInfo "License: Public Domain" Import Pub.stdc Import BRL.FileSystem Import BRL.LinkedList Global Entities:String[][] =[[""", """],["&", "&"],["<", "<"],[">", ">"],[" ", " "],["¡", "¡"],["¢", "¢"],["£", "£"],["¤","¤"],["¥", "¥"],["¦","¦"],["§", "§"],["¨", "¨"],["©", "©"],["ª", "ª"],["«", "«"],["¬", "¬"],["­", "­"],["®", "®"],["¯", "¯"],["°", "°"],["±","±"],["²", "²"],["³", "³"],["´", "´"],["µ", "µ"],["¶", "¶"],["·","·"],["¸", "¸"],["¹", "¹"],["º", "º"],["»", "»"],["¼","¼"],["½","½"],["¾","¾"],["¿","¿"],["À","À"],["Á","Á"],["Â", "Â"],["Ã","Ã"],["Ä", "Ä"],["Å", "Å"],["Æ", "Æ"],["Ç","Ç"],["È","È"],["É","É"],["Ê", "Ê"],["Ë", "Ë"],["Ì","Ì"],["Í","Í"],["Î", "Î"],["Ï", "Ï"],["Ð", "Ð"],["Ñ","Ñ"],["Ò","Ò"],["Ó","Ó"],["Ô", "Ô"],["Õ","Õ"],["Ö", "Ö"],["×", "×"],["Ø","Ø"],["Ù","Ù"],["Ú","Ú"],["Û", "Û"],["Ü", "Ü"],["Ý","Ý"],["Þ", "Þ"],["ß", "ß"],["à","à"],["á","á"],["â", "â"],["ã","ã"],["ä", "ä"],["å", "å"],["æ", "æ"],["ç","ç"],["è","è"],["é","é"],["ê", "ê"],["ë", "ë"],["ì","ì"],["í","í"],["î", "î"],["ï", "ï"],["ð", "ð"],["ñ","ñ"],["ò","ò"],["ó","ó"],["ô", "ô"],["õ","õ"],["ö", "ö"],["÷","÷"],["ø","ø"],["ù","ù"],["ú","ú"],["û", "û"],["ü", "ü"],["ý","ý"],["þ", "þ"],["ÿ", "ÿ"]]; Global CharSet:String[] = [" ","!","~q","#","$","%","&","(",")".. ,"*","+",",","-",".","/","0","1","2","3","4","5","6","7","8".. ,"9",":",";","<","=",">","?","@","A","B","C","D","E","F","G".. ,"H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V".. ,"W","X","Y","Z","[","\","]","^","_","`","a","b","c","d","e".. ,"f","g","h","i","j","k","l","m","n","o","p","q","r","s","t".. ,"u","v","w","x","y","z","{","|","}","","_","_","~q","ƒ".. ,".","?","?","^","%","S","<","O","_".. ,"Z","_","_","","-","-","~~","T","s",">","o".. ,"_","z","Y"," ","¡","¢","£","¤","¥","¦","§","¨","©","ª","«".. ,"¬","","®","¯","°","±","²","³","´","µ","¶","·","¸","¹","º".. ,"»","¼","½","¾","¿","À","Á","Â","Ã","Ä","Å","Æ","Ç","È","É".. ,"Ê","Ë","Ì","Í","Î","Ï","Ð","Ñ","Ò","Ó","Ô","Õ","Ö","×","Ø".. ,"Ù","Ú","Û","Ü","Ý","Þ","ß","à","á","â","ã","ä","å","æ","ç".. ,"è","é","ê","ë","ì","í","î","ï","ð","ñ","ò","ó","ô","õ","ö".. ,"÷","ø","ù","ú","û","ü","ý","þ","ÿ"] Rem bbdoc: THTMLParam type End Rem Type THTMLParam Field fRaw:String Field fKey:String Field fValue:String Rem bbdoc: Sets a key passed as a string i.e. "color=~qBlack~q" End Rem Method SetKey(Key:String) fValue = "" fRaw = Key 'DebugStop If Key.Find("=")>-1 Then fValue = Key 'fValue = fValue[0..Key.Find("=")] fValue = fValue[Key.Find("=")+1..] key = Key[0..Key.Find("=")] If Len(fValue)>1 Then If (fValue[0..1] = "~q") And (fValue[Len(fValue)-1..]="~q") Then fValue = fValue[1..Len(fValue)-1] EndIf EndIf EndIf fKey = Key.ToUpper() End Method Rem bbdoc: Creates a new THTMLParam instance About: Usage: THTMLParam.Create() End Rem Function Create:THTMLParam() Local tempHTMLParam:THTMLParam = New THTMLParam Return tempHTMLParam End Function Rem bbdoc: Destroy function frees the passed type and flushes the memory about: pass an instance of the THTMLParam type, no return value<br> usage:THTMLParam.Destroy(HTMLParam) End Rem Function Destroy(HTMLParam : THTMLParam Var) HTMLParam = Null End Function End Type Rem bbdoc: THTMLtag type End Rem Type THTMLTag Field fName:String Field fRaw:String Field Params:TList Rem bbdoc: Sets the tag name (should not be called directly) End Rem Method SetName(Name:String) Local Tag : String Local param : String Local HTMLParam : THTMLParam Local isQuote : Int fRaw = Name Params.Clear() 'DebugStop While (Len(Name)>0) And (Name[0..1] <> " ") Tag = Tag + Name[0..1] Name = Name[1..] Wend fName = Tag.ToUpper() While (Len(Name)>0) param = "" isQuote = False While (Len(Name)>0) And ( Not ((Name[0..1]=" ") And (isQuote=False))) If Name[0..1] = "~q" Then IsQuote = Not(IsQuote) param = param + Name[0..1] Name = Name[1..] Wend If (Len(Name)>0) And (Name[0..1]=" ") Then Name = Name[1..] If (param <> "") Then HTMLParam = THTMLParam.Create() HTMLParam.SetKey(param) params.AddLast(HTMLParam) EndIf Wend GCCollect() End Method Method GetName() End Method Rem bbdoc: returns the raw HTML code for this tag End Rem Method GetRaw:String() Return fRaw End Method Function Create:THTMLTag() Local tempHTMLTag:THTMLTag = New THTMLTag tempHTMLTag.Params = New TList Return tempHTMLTag End Function Function Destroy(TAG:THTMLTag) TAG.Params.Clear() TAG = Null End Function End Type Rem bbdoc: THTMLText type about: Contains any text blocks within the supplied document End Rem Type THTMLText Field fLine:String Field fRawLine:String Method SetLine(Line:String) Local j : Int Local i : Int Local Entity : String Local isEntity : Int Local EnLen : Int Local EnPos : Int Local d : Int Local c : Int fRawLine = Line Line = Line.Replace(Chr(10), " ") Line = Line.Replace(" ", " ") i = 0 isEntity = False EnPos = -1 While i <= Len(Line) If Line[i..i+1] = "&" Then EnPos = 1 isEntity = True Entity = "" EndIf If isEntity Then Entity = Entity+Line[i..i+1] If isEntity Then If (Line[i..i+1]=";") Or (Line[i..i+1]=" ") Then EnLen = Len(Entity) If (EnLen > 2) And (Entity[1..2] = "#") Then Entity = Entity[..EnLen-1] 'remove semicolon Entity = Entity[2..] 'remove &# If Entity[0..1].ToUpper()="X" Then Entity = "$" + Entity[1..] If (Len(Entity)<=3) Then d = Int(entity) If d <> Null Then Line = Line[0..EnPos]+Line[EnPos+EnLen..] StrInsert(CharSet[d], Line, EnPos) i = EnPos EndIf EndIf Else j = 1 While (j<=100) If Entity = (Entities[j][1]) Then Line = Line[0..EnPos]+Line[EnPos+EnLen..] StrInsert(Line, Entities[j][2], EnPos) j = 102 EndIf j:+1 Wend If j=103 Then i = enPos-1 Else i = EnPos EndIf EndIf EndIf IsEntity=False EndIf i:+1 Wend fLine=Line; End Method Rem bbdoc: returns the raw HTML code for this text portion End Rem Method GetRaw:String() Return fRawLine End Method Rem bbdoc: returns a new THTMLText instance End Rem Function Create:THTMLText() Local tempHTMLText : THTMLText = New THTMLText Return tempHTMLText End Function Rem bbdoc: Destroy a THTMLText instance End Rem Function Destroy(HTMLText : THTMLText Var) HTMLText = Null End Function End Type Rem bbdoc: THTMLParser Type End Rem Type THTMLParser Field Text:String Field Tag:String Field isTag:Int Field parsed:TList Field Lines:TList Method AddText() Local HTMLText:THTMLText If Not isTag Then If Text <> "" Then HTMLText = THTMLText.Create() HTMLText.SetLine(Text) Text = "" parsed.AddLast(HTMLText) EndIf EndIf End Method Rem bbdoc: Pass a filename to load (can specify a url by prefixing with "http::" ) End Rem Method LoadFile(FileName:String) Lines.Clear() Local HTMLFile:TStream Try HTMLFile = ReadStream(FileName) While Not Eof(HTMLFile) Lines.AddLast(ReadLine(HTMLFile)) Wend Catch a$ CloseStream(HTMLFile) RuntimeError(a$) EndTry CloseStream(HTMLFile) End Method Method AddTag() Local HTMLTag:THTMLTag; isTag = False HTMLTag = THTMLTag.Create() HTMLTag.SetName(Tag) Tag = "" parsed.AddLast(HTMLTag) End Method Function Create:THTMLParser() Local tempParser : THTMLParser = New THTMLParser 'initialize the lists tempParser.parsed:TList = New TList tempParser.Lines:TList = New TList Return tempParser End Function Function Destroy(parser:THTMLParser Var) parser.parsed.clear() parser.lines.clear() parser.parsed = Null parser.lines = Null parser = Null End Function Rem bbdoc: Call execute to parse the file (NOTE: You MUST call LoadFile first) End Rem Method Execute() Local s:String Text = "" Tag = "" isTag =False; For s = EachIn Lines While Len(s) > 0 If s[0..1] = "<" Then AddText() isTag=True Else If s[0..1] = ">" Then AddTag() Else If isTag Then Tag = Tag + s[0..1] Else Text = Text + s[0..1] End If s=s[1..] 'slice the first character off Wend If (Not isTag) And (Text <> "") Then Text = Text + Chr(10) Next If (isTag) And (Tag <> "") Then AddTag() If (Not isTag) And (Text <> "") Then AddText() End Method End Type Rem bbdoc: insert inString into SourceStr at the specified index End Rem Function StrInsert(SourceStr:String Var, inString:String, Index:Int) SourceStr = SourceStr[..Index] + inString + SourceStr[Index..] End Function 'test Rem Print MemAlloced() Local myParser : THTMLParser = THTMLParser.Create() myParser.LoadFile("http::www.blitzbasic.com") myParser.Execute() For Local a:Object = EachIn myParser.parsed If THTMLTag(a) Then Print THTMLTag(a).fName For Local b:THTMLParam = EachIn THTMLTag(a).params Print b.fKey Print b.fValue Next Else Print THTMLText(a).fLine EndIf Next myParser.Destroy(myParser) FlushMem Print MemAlloced() End EndRem |
Comments
None.
Code Archives Forum