Code archives/Algorithms/Regular expressions

This code has been declared by its author to be Public Domain code.

Download source code

Regular expressions by Warpy2008
I saw this a few days ago, and it looked very appealing. My master's project involved a lot of FSAs, so I thought this would be well within my grasp.

Turns out the FSAs aren't the hard part, it's interpreting a pattern correctly to *make* an FSA that's hard. Anyway, after a good day or so of despairing at the computer, I think I've got it working. This should accept all the patterns that the reAnimator machine does.

The code is not pretty, even by my standards, this is what I hacked together while trying to sort out how it all fits together. I'm sure in the future I'll actually need a regular expression matcher for something and I'll write a nice one. Anyway, enjoy!
Type charset
	Field pattern$
	Field ranges:TList
	
	Method New()
		ranges=New TList
	End Method
	
	Function Create:charset(pattern$)
		If Not pattern Return emptyset
		cs:charset=New charset
		cs.pattern=pattern
		While Len(pattern)
			If Len(pattern)>2 And Chr(pattern[1])="-"
				low=pattern[0]
				high=pattern[2]
				cs.addrange(low,high)
				pattern=pattern[3..]
			Else
				c=pattern[0]
				cs.addrange(c,c)
				pattern=pattern[1..]
			EndIf
		Wend
		Return cs
	End Function
	
	Method addrange(low,high)
		Local range[]=[low,high]
		ranges.addlast range
		pattern:+Chr(low)+"-"+Chr(high)
	End Method
	
	Method match(cr$)
		c=Asc(cr)
		Local range[]
		For range=EachIn ranges
			If c>=range[0] And c<=range[1]
				Return True
			EndIf
		Next
		Return False
	End Method
	
	Method repr$()
		Local range[]
		txt$=""
		For range=EachIn ranges
			If range[0]=range[1]
				txt:+Chr(range[0])
			Else
				txt:+Chr(range[0])+"-"+Chr(range[1])
			EndIf
		Next
		Return txt
	End Method
End Type

Global emptyset:charset=New charset
Global digits:charset=charset.Create("0-9")
Global alphanum:charset=charset.Create("0-9A-Za-z")
Global nonalphanum:charset=New charset
Global nondigits:charset=New charset
Global fullset:charset=New charset

fullset.addrange 0,255

nondigits.addrange 0,47
nondigits.addrange 58,255 
nonalphanum.addrange 0,47
nonalphanum.addrange 58,64
nonalphanum.addrange 91,96
nonalphanum.addrange 123,255



Function listsame(l1:TList,l2:TList)
	If l1.count()<>l2.count() Return False
	
	For o:Object=EachIn l1
		If Not l2.contains(o) Return False
	Next

	Return True
End Function

Type fsa
	Field accepting
	Field transitions:TList
	Field name$
	
	Method New()
		transitions=New TList
	End Method
	
	Method addtransition(symbol$,dest:fsa)
		transitions.addlast transition.Create(symbol,dest)
	End Method
	
	Method matches:TList(symbol$,l:TList=Null)
		If Not l l=New TList
		For t:transition=EachIn transitions
			If t.cs.match(symbol)
				l.addlast t.dest
			EndIf
		Next
		Return l
	End Method
	
	Method evaluate(pattern$,spaces$="")
		Print spaces+name+"?"+pattern
		If Not pattern
			Print "end at "+name
			Return accepting
		EndIf
		symbol$=Chr(pattern[0])
		npattern$=pattern[1..]
		For f:fsa=EachIn matches(symbol)
			If f.evaluate(npattern,spaces+" ") Return True
		Next
		For t:transition=EachIn transitions
			If t.cs=emptyset
				Print spaces+"empty move to "+t.dest.name
				If t.dest.evaluate(pattern,spaces) Return True
			EndIf
		Next
		
		Print spaces+"fail at "+name

		Return False
	End Method
	
	Method emptymoves:TList(l:TList=Null,addself=0)
		If Not l l=New TList
		If addself l.addlast(Self)
		For t:transition=EachIn transitions
			If t.cs=emptyset
				If Not l.contains(t.dest)
					l.addlast t.dest
					t.dest.emptymoves(l)
				EndIf
			EndIf
		Next
		Return l
	End Method
	
	Method repr$()
		txt$=name+"~n"
		If accepting txt:+"accepting~n"
		For t:transition=EachIn transitions
			txt:+"  "+t.repr()+"~n"
		Next
		Return txt
	End Method
	
	Function collapse:tmap(f:fsa,checked:tmap=Null)
		If Not checked checked=New tmap
		If checked.contains(f) Return checked

		ntransitions:tmap=New tmap
		destinations:TList=New TList
		
		For t:transition=EachIn f.transitions
			If Not ntransitions.contains(t.cs.pattern)
				ntransitions.insert t.cs.pattern,New TList
			EndIf
			tl:TList=TList(ntransitions.valueforkey(t.cs.pattern))
			If Not tl.contains(t.dest)
				tl.addlast t.dest
			EndIf
			
			If Not destinations.contains(t.dest)
				destinations.addlast t.dest
			EndIf
		Next
		
		'Rem
		of:fsa=New fsa
		'Print of.name
		For key$=EachIn ntransitions.keys()
			If key
				tl:TList=TList(ntransitions.valueforkey(key))
				tname$=""
				For fp:fsa=EachIn tl
					For f2:fsa=EachIn fp.emptymoves()
						If Not tl.contains(f2)
							tl.addlast f2
						EndIf
					Next
				Next
				For f2:fsa=EachIn tl
					If tname tname:+","
					tname:+f2.name
				Next
				tname="{"+tname+"}"
				'Print "  "+key+" -> "+tname
			EndIf
		Next
		'EndRem
		
		checked.insert f,ntransitions
		
		For f2:fsa=EachIn destinations
			fsa.collapse f2,checked
		Next
		
		Return checked
	End Function
	
	Function powerset:fsa(startnode:fsa)
		Global allnodes:tmap,newnodes:TList
		
		maps:tmap=collapse(startnode)
		
		allnodes:tmap=New tmap
		
		Function findnode:fsa(l:TList,adding=1)
			For l2:TList=EachIn allnodes.keys()
				If listsame(l,l2) Return fsa(allnodes.valueforkey(l2))
			Next
			nf:fsa=New fsa
			nf.name=ziplist(l)
			'Print "new node "+nf.name
			allnodes.insert l,nf
			If adding
				newnodes.addlast l
			EndIf
			Return nf
		End Function
		
		Function ziplist$(l:TList)
			l=l.copy()
			l.sort
			txt$=""
			For f:fsa=EachIn l
				If txt txt:+","
				txt:+f.name
			Next
			Return "{"+txt+"}"
		End Function
		
		outstart:fsa=Null
		newnodes:TList=New TList
		newnodes.addlast startnode.emptymoves(Null,1)
		
		While newnodes.count()
		
			'get the new state we're making. It's a list of the old nodes
			l:TList=TList(newnodes.removefirst()) 
			
			nf:fsa=findnode(l,False)
			If Not outstart outstart=nf
			
			nf.name=ziplist(l)
			'Print "constructing "+nf.name
			
			ntransitions:tmap=New tmap
			
			'for each old node in the list, work out the transitions
			For f:fsa=EachIn l
				If f.accepting nf.accepting=1
			
				'get transition map for this nfsa node
				fnt:tmap=tmap(maps.valueforkey(f))
				
				'for each symbol, add all the destinations
				For key$=EachIn fnt.keys()
					If key
						If Not ntransitions.contains(key)
							ntransitions.insert key,New TList
						EndIf
						otl:TList=TList(fnt.valueforkey(key))
						ntl:TList=TList(ntransitions.valueforkey(key))
						For f2:fsa=EachIn otl
							If Not ntl.contains(f2) ntl.addlast f2
						Next
					EndIf
				Next
			Next
			
			'we now have a full set of transitions, make the node
			For key$=EachIn ntransitions.keys()
				df:fsa=findnode(TList(ntransitions.valueforkey(key)))
				nf.addtransition key,df
			Next
			'Print nf.repr()
		Wend
		
		Print "ALL NODES"
		For f:fsa=EachIn allnodes.values()
			Print f.repr()
		Next
		
		Return outstart
	End Function
	
End Type

Type transition
	Field cs:charset
	Field dest:fsa
	
	Function Create:transition(pattern$,dest:fsa)
		t:transition=New transition
		t.cs=charset.Create(pattern)
		t.dest=dest
		Return t
	End Function
	
	Method repr$()
		Return cs.repr()+" -> "+dest.name
	End Method
End Type


Global numnodes=0
Function compile:fsa[](pattern$,starts:fsa[],spaces$="")
	Print spaces+"compile: "+pattern
	

	Local bits$[]
	Local ends:fsa[],ostarts:fsa[],nends:fsa[]
	bits=splitpipes(pattern)
	
	'note that starts is not really a set of starting nodes for the whole machine,
	'but in fact the previous set of finals.

	If Len(bits)=1
		While Len(pattern)
			ostarts=starts
			symbol$=Chr(pattern[0])
			'Print spaces+">"+symbol
			nends=Null
			Select symbol
			Case "(" 'start brackets
				Print spaces+"brackets"
				inparens=1
				i=1
				While inparens
					Select Chr(pattern[i])
					Case "("
						inparens:+1
					Case ")"
						inparens:-1
					End Select
					i:+1
				Wend
				bit$=pattern[1..i-1]
				ends=compile(bit,starts,spaces+"  ")
				pattern=pattern[i..]
				
			Case "["
				Print spaces+"squares"
				i=1
				While Chr(pattern[i])<>"]"
					i:+1
				Wend
				bit$=pattern[0..i+1]
				'Print spaces+bit
				pattern=pattern[i+1..]
				nf:fsa=New fsa
				For f:fsa=EachIn starts
					f.addtransition bit,nf
				Next
				ends=[nf]
			Case "\"
				symbol=Chr(pattern[1])
				Print "special character "+symbol
				Local cs:charset
				Select symbol
				Case "d" 'digit
					cs=digits
				Case "D"
					cs=nondigits
				Case "w"
					cs=alphanum
				Case "W"
					cs=nonalphanum
				Default
					cs=charset.Create(symbol)
				End Select
				Print cs.repr()
				mf:fsa=New fsa
				mf.name="m"
				nf:fsa=New fsa
				numnodes:+1
				nf.name=String(numnodes)
				For f:fsa=EachIn starts
					tr:transition=New transition
					tr.cs=cs
					tr.dest=mf
					f.transitions.addlast tr
				Next
				mf.addtransition "",nf
				ends=[mf]
				nends=[nf]
				pattern=pattern[2..]
			Case "."
				Print "any character"
				mf:fsa=New fsa
				nf:fsa=New fsa
				mf.name="m"
				numnodes:+1
				nf.name=String(numnodes)
				For f:fsa=EachIn starts
					tr:transition=New transition
					tr.cs=fullset
					tr.dest=mf
					f.transitions.addlast tr
				Next
				mf.addtransition "",nf
				ends=[mf]
				nends=[nf]
				pattern=pattern[1..]
			Default 'normal character
				Print spaces+"character: "+symbol
				mf:fsa=New fsa
				mf.name="m"
				nf:fsa=New fsa
				numnodes:+1
				nf.name=String(numnodes)
				For f:fsa=EachIn starts
					f.addtransition symbol,mf
				Next
				mf.addtransition "",nf
				ends=[mf]
				nends=[nf]
				pattern=pattern[1..]
				
			End Select
			
			If Len(pattern)
				op$=Chr(pattern[0])
			Else
				op$=""
			EndIf
			Select op
			Case "*" 'kleene star closure - none or more times
				fin:fsa=New fsa
				For f:fsa=EachIn ends
					f.addtransition "",fin
				Next
				For f:fsa=EachIn starts
					f.addtransition "",fin
					fin.addtransition "",f
					If nends
						For f2:fsa=EachIn nends
							f.addtransition "",f2
						Next
					EndIf
				Next
				fin2:fsa=New fsa
				fin.addtransition "",fin2
				ends=[fin2]
				pattern=pattern[1..]
			Case "?" 'zero or one times
				fin:fsa=New fsa
				For f:fsa=EachIn starts
					f.addtransition "",fin
					If nends
						For f2:fsa=EachIn nends
							f.addtransition "",f2
						Next
					EndIf
				Next
				For f:fsa=EachIn ends
					f.addtransition "",fin
				Next
				ends=[fin]
				pattern=pattern[1..]
			Case "+" 'one or more times
				fin:fsa=New fsa
				For f:fsa=EachIn ends
					f.addtransition "",fin
				Next
				For f:fsa=EachIn starts
					fin.addtransition "",f
				Next
				ends=[fin]
				pattern=pattern[1..]
			End Select
			If nends
				ends=nends
			EndIf
			starts=ends
		Wend
	Else
		Print spaces+"pipes"
		For bit$=EachIn bits
			ends:+compile(bit,starts,spaces+"  ")
		Next
	EndIf
	
	Return ends
End Function

Function splitpipes$[](pattern$)
	'Print pattern
	inparens=0
	i=0
	Local bits$[0]
	starti=0
	While i<Len(pattern)
		Select Chr(pattern[i])
		Case "\"
			i:+1
		Case "("
			inparens:+1
		Case ")"
			inparens:-1
		Case "|"
			If Not inparens
				bits:+[pattern[starti..i]]
				starti=i+1
			EndIf
		End Select
		i:+1
	Wend
	If starti<Len(pattern)
		bits=bits+[pattern[starti..]]
	EndIf
	
	Return bits
End Function

start:fsa=New fsa
start.name="s"
're$="(([1-9]+[0-9]*)|0)(.[0-9]+)?"
're$="[a-z]*(,? [a-z]*)*"
re$=".*"
're$=Input("re> ")
For f:fsa=EachIn compile(re,[start])
	f.accepting=1
Next
start=fsa.powerset(start)


in$=""
While in<>"quit"
	in$=Input(">")
	If start.evaluate(in)
		Print "Yes"
	Else
		Print "No"
	EndIf
Wend

Comments

xlsior2008
Interesting!


Code Archives Forum