On Feb 21, 10:34 am, garri...@gmail. com wrote:
On Feb 20, 6:14 pm, Pop User <popu...@christ est2.dc.k12us.c omwrote:
http://swtch.com/~rsc/regexp/regexp1.html
Going back a bit on a tangent, the author of this citation states that
any regex can be expressed as a DFA machine. However, while
investigating this more I appear to have found one example of a regex
which breaks this assumption.
"ab+c|abd"
Am I correct? Can you think of a deterministic method of computing
this expression? It would be easier with a NFA machine, but given that
the Python method of computing RE's involves pre-compiling a re
object, optimizing the matching engine would make the most sense to
me.
Here's what I have so far:
class State(object):
def __init__(self):
self.nextState = {}
self.nextStateK eys = []
self.prevState = None
self.isMatchSta te = True
def setNextState(se lf, chars, iNextState):
self.nextState[chars] = iNextState
self.nextStateK eys = self.nextState. keys()
self.isMatchSta te = False
def setPrevState(se lf, iPrevState):
self.prevState = iPrevState
def moveToNextState (self, testChar):
if testChar in self.nextStateK eys:
return self.nextState[testChar]
else:
return None
class CompiledRegex(o bject):
def __init__(self, startState):
self.startState = startState
def match(self, matchStr):
match_set = []
currentStates = [self.startState]
nextStates = [self.startState]
for character in matchStr:
for state in currentStates:
nextState = state.moveToNex tState(characte r)
if nextState is not None:
nextStates.appe nd(nextState)
if nextState.isMat chState:
print "Match!"
return
currentStates = nextStates
nextStates = [self.startState]
print "No Match!"
def compile(regexSt r):
startState = State()
currentState = startState
backRefState = None
lastChar = ""
for character in regexStr:
if character == "+":
currentState.se tNextState(last Char, currentState)
elif character == "|":
currentState = startState
elif character == "?":
backRefState = currentState.pr evState
elif character == "(":
# Implement "("
pass
elif character == ")":
# Implement ")"
pass
elif character == "*":
currentState = currentState.pr evState
currentState.se tNextState(last Char, currentState)
else:
testRepeatState = currentState.mo veToNextState(c haracter)
if testRepeatState is None:
newState = State()
newState.setPre vState(currentS tate)
currentState.se tNextState(char acter, newState)
if backRefState is not None:
backRefState.se tNextState(char acter, newState)
backRefState = None
currentState = newState
else:
currentState = testRepeatState
lastChar = character
return CompiledRegex(s tartState)
>>a = compile("ab+c")
a.match("abc" )
Match!
>>a.match("abbc ")
Match!
>>a.match("ac ")
No Match!
>>a = compile("ab+c|a bd")
a.match("abc" )
Match!
>>a.match("abbc ")
Match!
>>a.match("ac ")
No Match!
>>a.match("abd" )
Match!
>>a.match("abbd ")
Match!
>>>