#
# python code for building a parser from a grammar
#  Copyright Aaron Robert Watters, 1994
#

# BUGS:
#  A bad grammar that has no derivations for
#  the root nonterminal may cause a name error
#  on the variable "GoodStartingPlace"

# this needs to be modified so the RULEGRAM is loaded from a
# compiled representation if available.

import string
import kjSet
import kjParser
import re

# import some constants
from kjParser import \
    TERMFLAG, NOMATCHFLAG, MOVETOFLAG, REDUCEFLAG, TRANSFLAG, KEYFLAG, \
    NONTERMFLAG, EOFFLAG, ENDOFFILETOKEN

PMODULE = kjParser.THISMODULE

# errors raised here
TokenError = "TokenError"    # may happen on autogen with bad grammar
NotSLRError = "NotSLRError"  # may happen for nonSLR grammar
FlowError = "FlowError"      # unexpected form passed to a reduction function below

# set this flag for regression testing at each load
RUNTESTS = 0

# set this flag to abort automatic generation on Errors
ABORTONERROR = 0

# token used to mark null productions
NULLTOKEN = (None, None)


# a derived FSM class, with closure computation methods defined
# (compilable FSMachine)
#
class CFSMachine(kjParser.FSMachine):

    def __init__(self, nonterm):
        kjParser.FSMachine.__init__(self, nonterm)

    # return the epsilon closure of the FSM as a new FSM
    #
    # DoNullMaps, if set, will map unexpected tokens to
    # the "empty" state (usually creating a really big fsm)
    #
    def Eclosure(self, Epsilon, DoNullMaps=0):
        Closure = CFSMachine( self.root_nonTerminal )

        # compute the Epsilon Graph between states
        EGraph = kjSet.NewDG([])
        for State in range(0, self.maxState+1):
            # every state is E-connected to self
            kjSet.AddArc( EGraph, State, State )
            # add possible transition on epsilon (ONLY ONE SUPPORTED!)
            key = (State, Epsilon)
            if self.StateTokenMap.has_key(key):
                keymap = self.StateTokenMap[key]
                if keymap[0][0] != MOVETOFLAG:
                    raise TypeError, "unexpected map type in StateTokenMap"
                for (Flag, ToState) in keymap:
                    kjSet.AddArc( EGraph, State, ToState )
        #endfor
        # transitively close EGraph
        kjSet.TransClose( EGraph )

        # Translate EGraph into a dictionary of lists
        EMap = {}
        for State in range(0, self.maxState+1):
            EMap[State] = kjSet.Neighbors( EGraph, State )

        # make each e-closure of each self.state a state of the closure FSM.
        # here closure states assumed transient -- reset elsewhere.
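        # Illustration (not executed): if state 1 has an epsilon arc to 2 and
        # state 2 has one to 3, transitive closure gives EMap[1] = [1, 2, 3];
        # each such set of NFA states becomes a single state of the closure
        # FSM in the subset construction below.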
        # first do the initial state
        Closure.States[ Closure.initial_state ] = \
            [TRANSFLAG, kjSet.NewSet(EMap[self.initial_state]) ]

        # do all other states (save initial and successful final states)
        #for State in range(0,self.maxState+1):
        #   if State != self.initial_state \
        #    and State != self.successful_final_state:
        #      Closure.NewSetState(TRANSFLAG, kjSet.NewSet(EMap[State]) )
        ##endfor

        # compute set of all known tokens EXCEPT EPSILON
        Tokens = kjSet.NewSet( [] )
        for (State, Token) in self.StateTokenMap.keys():
            if Token != Epsilon:
                kjSet.addMember(Token, Tokens)
        # transform it into a list
        Tokens = kjSet.get_elts(Tokens)

        # for each state of the closure FSM (past final) add transitions
        # and add new states as needed until all states are processed
        # (uses convention that states are allocated sequentially)
        ThisClosureState = 1
        while ThisClosureState <= Closure.maxState:
            MemberStates = kjSet.get_elts(Closure.States[ThisClosureState][1])
            # for each possible Token, compute the union UTrans of all
            # e-closures for all transitions for all member states,
            # on the Token, make UTrans a new state (if needed),
            # and transition ThisClosureState to UTrans on Token
            for Token in Tokens:
                UTrans = kjSet.NewSet( [] )
                for MState in MemberStates:
                    # if MState has a transition on Token, include
                    # EMap for the destination state
                    key = (MState, Token)
                    if self.StateTokenMap.has_key(key):
                        DStateTup = self.StateTokenMap[key]
                        if DStateTup[0][0] != MOVETOFLAG:
                            raise TypeError, "unknown map type"
                        for (DFlag, DState) in DStateTup:
                            for EDState in EMap[DState]:
                                kjSet.addMember(EDState, UTrans)
                    #endif
                #endfor MState
                # register UTrans as a new state if needed
                UTState = Closure.NewSetState(TRANSFLAG, UTrans)
                # record transition from
                # ThisClosureState to UTState on Token
                if DoNullMaps:
                    Closure.SetMap( ThisClosureState, Token, UTState)
                else:
                    if not kjSet.Empty(UTrans):
                        Closure.SetMap( ThisClosureState, Token, UTState)
            #endfor Token
            ThisClosureState = ThisClosureState + 1
        #endwhile
        return Closure
    #enddef Eclosure

    # add a set-marked state to self if not present
    # uses self.States[s][1] as the set marking the state s
    #
    # only used by Eclosure above
    #
    def NewSetState(self, kind, InSet):
        # return existing state if one is present that matches the set
        LastState = self.maxState
        # skip state 0 (successful final state)???
        for State in range(1, LastState+1):
            MarkSet = self.States[State][1]
            if kjSet.Same(InSet, MarkSet):
                return State  # nonlocal
        #endfor
        # if not exited then allocate a new state
        LastState = LastState + 1
        self.States[LastState] = [ kind, InSet ]
        self.maxState = LastState
        return LastState
    #enddef NewSetState

#endclass CFSMachine


# Ruleset class, used to compute NFA and then DFA for
# parsing based on a list of rules.
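# Typical usage (sketch, using only names defined in this module):
#   rs = ruleset(StartNonterm, ListOfParseRules)
#   rs.DoSLRGeneration()      # First, Follow, the item NFA, then the SLR DFA
#   dfa = rs.DFA              # suitable for kjParser.Grammar / ParserObj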
#
class ruleset:

    def __init__(self, StartNonterm, Rulelist):
        # initialize the ruleset
        self.StartNonterm = StartNonterm
        self.Rules = Rulelist

    # method to compute prefixes and First sets for nonterminals
    def CompFirst(self):
        # uses the special null production token NULLTOKEN
        # snarfed directly from Aho+Ullman (terminals glossed)
        First = kjSet.NewDG( [] )
        # repeat the while loop until no change is made to First
        done = 0
        while not done:
            done = 1  # assume we're done until a change is made to First
            # iterate through all rules looking for a new arc to add
            # indicating nonterminal > possible first token derivation
            #
            for R in self.Rules:
                GoalNonterm = R.Nonterm
                Bodylength = len(R.Body)
                # look through the body of the rule up to the token with
                # no epsilon production (yet seen)
                Bodyindex = 0
                Processindex = 1
                while Processindex:
                    # unless otherwise indicated below, don't go to next token
                    Processindex = 0
                    # if index is past end of body then record
                    # an epsilon production for this nonterminal
                    if Bodyindex >= Bodylength:
                        if not kjSet.HasArc(First, GoalNonterm, NULLTOKEN ):
                            kjSet.AddArc( First, GoalNonterm, NULLTOKEN )
                            done = 0  # change made to First
                    else:
                        # otherwise try to add firsts of this token
                        # to firsts of the Head of the rule.
                        Token = R.Body[Bodyindex]
                        (type, name) = Token
                        if type in (KEYFLAG, TERMFLAG):
                            # try to add this terminal to First for GoalNonterm
                            if not kjSet.HasArc(First, GoalNonterm, Token):
                                kjSet.AddArc( First, GoalNonterm, Token)
                                done = 0
                        elif type == NONTERMFLAG:
                            # try to add each First entry for nonterminal
                            # to First entry for GoalNonterm
                            for FToken in kjSet.Neighbors( First, Token ):
                                if not kjSet.HasArc(First, GoalNonterm, FToken):
                                    kjSet.AddArc( First, GoalNonterm, FToken)
                                    done = 0
                            # does this nonterminal have a known e production?
                            if kjSet.HasArc( First, Token, NULLTOKEN ):
                                # if so, process next token in rule
                                Processindex = 1
                        else:
                            raise TokenError, "unknown token type in rule body"
                    #endif
                    Bodyindex = Bodyindex + 1
                #endwhile Processindex
            #endfor R in self.Rules
        #endwhile not done
        self.First = First
    #enddef CompFirst

    # computing the Follow set for the ruleset
    # the good news: I think it's correct.
    # the bad news: It's slower than it needs to be for epsilon cases.
    def CompFollow(self):
        Follow = kjSet.NewDG( [] )
        # put end marker on follow of start nonterminal
        kjSet.AddArc(Follow, self.StartNonterm, kjParser.ENDOFFILETOKEN)
        # now compute other follows using the rules;
        # repeat the loop until no change to Follow.
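        # Illustration (not executed): for a rule  F >> ( E ) , the token ")"
        # ends up in Follow(E); for  E >> T Ep , Follow(E) is copied into
        # Follow(Ep), and First(Ep) minus epsilon is copied into Follow(T).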
        done = 0
        while not done:
            done = 1  # assume done unless Follow changes
            for R in self.Rules:
                #print R
                # work backwards in the rule body to
                # avoid retesting for epsilon nonterminals
                Bodylength = len(R.Body)
                EpsilonTail = 1  # the tail of rule may expand to null
                BodyIndex = Bodylength - 1
                Last = 1  # loop starts at the last
                from types import TupleType
                while BodyIndex >= 0:
                    Token = R.Body[BodyIndex]
                    (Ttype, Tname) = Token
                    if Ttype in (KEYFLAG, TERMFLAG):
                        # keywords etc cancel epsilon tail, otherwise ignore
                        EpsilonTail = 0
                    elif Ttype == NONTERMFLAG:
                        # if the tail expands to epsilon, map
                        # follow for the goal nonterminal to this token
                        # and also follow for the tail nonterms
                        if EpsilonTail:
                            # add follow for goal
                            for FToken in kjSet.Neighbors(Follow, R.Nonterm):
                                if not kjSet.HasArc(Follow, Token, FToken):
                                    kjSet.AddArc(Follow, Token, FToken)
                                    #if type(FToken[0])==TupleType:
                                    #   raise ValueError, "bad FToken"+`FToken`
                                    #print "new", Token, FToken
                                    done = 0  # follow changed, loop again
                            # add follow for tail members
                            #for Index2 in range(BodyIndex+1, Bodylength):
                            #   TailToken = R.Body[Index2]
                            #   for FToken in kjSet.Neighbors(Follow,TailToken):
                            #       if not kjSet.HasArc(Follow,Token,FToken):
                            #           kjSet.AddArc(Follow,Token,FToken)
                            #           done = 0
                        #endif EpsilonTail
                        # if we are not at the end use First set for next token
                        if not Last:
                            NextToken = R.Body[BodyIndex+1]
                            (NTtype, NTname) = NextToken
                            if NTtype in (KEYFLAG, TERMFLAG):
                                if not kjSet.HasArc(Follow, Token, NextToken):
                                    kjSet.AddArc(Follow, Token, NextToken)
                                    #print "next", Token, NextToken
                                    done = 0
                            elif NTtype == NONTERMFLAG:
                                for FToken in kjSet.Neighbors(self.First, NextToken):
                                    if FToken != NULLTOKEN:
                                        if not kjSet.HasArc(Follow, Token, FToken):
                                            kjSet.AddArc(Follow, Token, FToken)
                                            #print "neighbor", Token, FToken
                                            done = 0
                                    else:
                                        # next token expands to epsilon:
                                        # add its follow, unless already done above
                                        #if not EpsilonTail:
                                        for FToken in kjSet.Neighbors(Follow, NextToken):
                                            if not kjSet.HasArc(Follow, Token, FToken):
                                                kjSet.AddArc(Follow, Token, FToken)
                                                #print "epsilon", Token, FToken
                                                done = 0
                            else:
                                raise TokenError, "unknown token type in rule body"
                        #endif not Last
                        # finally, check whether next iteration has epsilon tail
                        if not kjSet.HasArc(self.First, Token, NULLTOKEN):
                            EpsilonTail = 0
                    else:
                        raise TokenError, "unknown token type in rule body"
                    BodyIndex = BodyIndex - 1
                    Last = 0  # no longer at the last token of the rule
                #endwhile
            #endfor
        #endwhile
        self.Follow = Follow
    #enddef CompFollow

    def DumpFirstFollow(self):
        First = self.First
        Follow = self.Follow
        print "First:"
        for key in First.keys():
            name = key[1]
            print name, " :: ",
            for (flag2, name2) in First[key].keys():
                print name2, ", ",
            print
        print "Follow:"
        for key in Follow.keys():
            name = key[1]
            print name, " :: ",
            for (flag2, name2) in Follow[key].keys():
                print name2, ", ",
            print

    # computing the "first" of the tail of a rule followed by an
    # optional terminal
    #   doesn't include NULLTOKEN
    #   requires self.First to be computed
    #
    def FirstOfTail(self, Rule, TailIndex, Token=None):
        Result = kjSet.NewSet( [] )
        # go through all tokens in rule tail so long as there is a
        # null derivation for the remainder
        Nullprefix = 1
        BodyLength = len(Rule.Body)
        ThisIndex = TailIndex
        while Nullprefix and ThisIndex < BodyLength:
            RToken = Rule.Body[ThisIndex]
            (RTtype, RTname) = RToken
            if RTtype == NONTERMFLAG:
                for FToken in kjSet.Neighbors(self.First, RToken):
                    if FToken != NULLTOKEN:
                        kjSet.addMember(FToken, Result)
                #endfor
                # check whether this symbol might have a null production
                if not kjSet.HasArc(self.First, RToken, NULLTOKEN):
                    Nullprefix = 0
            elif RTtype in [KEYFLAG, TERMFLAG]:
                kjSet.addMember(RToken, Result)
                Nullprefix = 0
            else:
                raise TokenError, "unknown token type in rule body"
            ThisIndex = ThisIndex + 1
        #endwhile
        # add the optional token if given and Nullprefix still set
        if Nullprefix and Token != None:
            kjSet.addMember(Token, Result)
        return Result
    #enddef FirstOfTail
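    # Illustration (not executed): for a rule  X >> A b C  with TailIndex at
    # the position of  b , FirstOfTail returns {b}; if instead the whole tail
    # from TailIndex can derive epsilon and the optional Token argument is
    # given, Token itself is included in the result.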
    # compute an SLR NFA for the ruleset with states for each SLR "item"
    # and transitions, eg:
    #     X > .AB
    #      on A maps to X > A.B
    #      on epsilon maps to A > .ZC
    #                     and A > .WK
    # an item is a pair (rulenumber, bodyposition)
    # where body position 0 is interpreted to point before the
    # beginning of the body.
    #
    # SLR = "simple LR" in Aho+Ullman terminology
    #
    def CompSLRNFA(self):
        NFA = CFSMachine(self.StartNonterm)
        Nrules = len(self.Rules)
        itemStateMap = {}
        for Ruleindex in range(0, Nrules):
            Rule = self.Rules[Ruleindex]
            # make an item for each "dot" position in the body
            for DotPos in range(0, len(Rule.Body) + 1):
                item = (Ruleindex, DotPos)
                itemState = NFA.NewState(TRANSFLAG, [item])
                itemStateMap[item] = itemState
            #endfor DotPos
        #endfor Ruleindex
        # now that the states are initialized
        # compute transitions except for the last item of a rule
        # (which has none)
        for Ruleindex in range(0, Nrules):
            Rule = self.Rules[Ruleindex]
            for DotPos in range(0, len(Rule.Body)):
                item = (Ruleindex, DotPos)
                CurrentToken = Rule.Body[DotPos]
                ThisState = itemStateMap[item]
                NextState = itemStateMap[ (Ruleindex, DotPos + 1) ]
                NFA.SetMap( ThisState, CurrentToken, NextState )
                # if the current token is a nonterminal
                # add epsilon transitions to first item for any
                # rule that derives this nonterminal
                (CTtype, CTname) = CurrentToken
                if CTtype == NONTERMFLAG:
                    for Rule2index in range(0, Nrules):
                        Rule2 = self.Rules[Rule2index]
                        Head = Rule2.Nonterm
                        if Head == CurrentToken:
                            NextState = itemStateMap[ (Rule2index, 0) ]
                            NFA.SetMap( ThisState, NULLTOKEN, NextState )
                    #endfor Rule2index
                #endif CTtype == NONTERMFLAG
            #endfor DotPos
        #endfor Ruleindex
        # must handle the initial state properly here!
        # Make a dummy state with e-transitions to all first items
        # for rules that derive the initial nonterminal
        ThisState = NFA.initial_state
        GoodStartingPlace = None
        for Ruleindex in range(0, Nrules):
            Rule = self.Rules[Ruleindex]
            Head = Rule.Nonterm
            if Head == self.StartNonterm:
                GoodStartingPlace = (Ruleindex, 0)
                NextState = itemStateMap[ GoodStartingPlace ]
                NFA.SetMap( ThisState, NULLTOKEN, NextState )
        # fix the NFA.States entry
        if GoodStartingPlace == None:
            raise NotSLRError, "No derivation for root nonterminal."
        NFA.States[ NFA.initial_state ] = \
            [ 'transient', GoodStartingPlace ]
        self.SLRNFA = NFA
    #enddef CompSLRNFA
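    # Illustration (not executed): for a rule  E >> T Ep  at rule index r,
    # the items are (r,0) "E >> . T Ep", (r,1) "E >> T . Ep", and the final
    # item (r,2) "E >> T Ep ." -- final items are the ones SLRFixDFA below
    # turns into reductions.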
    # dump an item
    def ItemDump(self, item):
        (ruleindex, position) = item
        Rule = self.Rules[ruleindex]
        print Rule.Nonterm[1], ' >> ',
        for bindex in range(0, len(Rule.Body)):
            if position == bindex:
                print " (*) ",
            print Rule.Body[bindex][1],
        if position == len(Rule.Body):
            print " (*) "
        else:
            print

    # utility function -- returns true if an item is a final item
    def SLRItemIsFinal(self, item):
        (ruleindex, position) = item
        Rule = self.Rules[ruleindex]
        if position == len(Rule.Body):
            return 1
        else:
            return 0

    # dump the NFA
    def DumpSLRNFA(self):
        NFA = self.SLRNFA
        print "root: ", NFA.root_nonTerminal
        for key in NFA.StateTokenMap.keys():
            map = NFA.StateTokenMap[key]
            (fromstate, token) = key
            fromitem = NFA.States[ fromstate ][1]
            self.ItemDump(fromitem)
            print " on ", token[1], " maps "
            for Tostate in map:
                Toitem = NFA.States[Tostate][1]
                print "    ",
                self.ItemDump(Toitem)

    # compute DFA for ruleset by computing the E-closure of the
    # NFA
    def CompDFA(self):
        self.DFA = self.SLRNFA.Eclosure(NULLTOKEN)

    def DumpDFAsets(self):
        DFA = self.DFA
        print "root: ", DFA.root_nonTerminal
        for State in range(1, len(DFA.States) ):
            self.DumpItemSet(State)

    def DumpItemSet(self, State):
        DFA = self.DFA
        NFA = self.SLRNFA
        print
        print "STATE ", State, " *******"
        fromNFAindices = kjSet.get_elts(DFA.States[State][1])
        for NFAindex in fromNFAindices:
            item = NFA.States[NFAindex][1]
            print " ", NFAindex, ": ",
            self.ItemDump(item)

    # this function completes the computation of an SLR DFA
    # by adding reduction states for each DFA state S containing
    # item   H > B.
    # which reduces rule H > B
    # for each token T in Follow of H.
    # if S already has a transition for T then there is a conflict!
    #
    # assumes DFA and SLRNFA and Follow have been computed.
    #
    def SLRFixDFA(self):
        DFA = self.DFA
        NFA = self.SLRNFA
        # look through the states (except 0=success) of the DFA
        # initially don't add any new states, just record
        # actions to be done
        #   uses convention that 0 is successful final state
        # ToDo is a dictionary which maps
        #   (State, Token) to an item to reduce
        ToDo = {}
        Error = None
        for State in range(1, len(DFA.States) ):
            # look for a final item for a rule in this state
            fromNFAindices = kjSet.get_elts(DFA.States[State][1])
            for NFAindex in fromNFAindices:
                item = NFA.States[NFAindex][1]
                # if the item is final remember to do the reductions...
                if self.SLRItemIsFinal(item):
                    (ruleindex, position) = item
                    Rule = self.Rules[ruleindex]
                    Head = Rule.Nonterm
                    Following = kjSet.Neighbors( self.Follow, Head )
                    for Token in Following:
                        key = (State, Token)
                        if not ToDo.has_key(key):
                            ToDo[ key ] = item
                        else:
                            # it might be okay if the items are identical?
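                            # (two *different* final items on the same
                            # (State, Token) pair is a real reduce/reduce
                            # conflict; identical items are just duplicates
                            # introduced by the subset construction.)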
                            item2 = ToDo[key]
                            if item != item2:
                                print "reduce/reduce conflict on ", key
                                self.ItemDump(item)
                                self.ItemDump(item2)
                                Error = " apparent reduce/reduce conflict"
                            #endif
                    #endfor
                #endif
            #endfor NFAindex
        #endfor State

        # for each (State,Token) pair which indicates a reduction
        # record the reduction UNLESS the map is already set for the pair
        for key in ToDo.keys():
            (State, Token) = key
            item = ToDo[key]
            (rulenum, dotpos) = item
            ExistingMap = DFA.map( State, Token )
            if ExistingMap[0] == NOMATCHFLAG:
                DFA.SetReduction( State, Token, rulenum )
            else:
                print "apparent shift/reduce conflict"
                print "reduction: ", key, ": "
                self.ItemDump(item)
                print "existing map ", ExistingMap
                Error = " apparent shift/reduce conflict"
        #endfor
        if Error and ABORTONERROR:
            raise NotSLRError, Error
    #enddef SLRFixDFA

    # do complete SLR DFA creation starting after initialization
    def DoSLRGeneration(self):
        self.CompFirst()
        self.CompFollow()
        self.CompSLRNFA()
        self.CompDFA()
        self.SLRFixDFA()

#endclass ruleset


################ the following are interpretation functions
################ used by RULEGRAM meta grammar

# some constants used here
COMMENTFORM = "##.*\n"
RSKEY = "@R"
COLKEY = "::"
LTKEY = ">>"
IDNAME = "ident"
# an identifier in the meta grammar is any nonwhite string
# except the keywords @R :: >> or comment flag ##
IDFORM = "[^" + string.whitespace + "]+"

# for identifiers simply return the string
def IdentFun(string):
    return string

# RootReduction should receive list of form
#   [ nontermtoken, keyword COLKEY, RuleList ]
def RootReduction(list, ObjectGram):
    if len(list) != 3 or list[1] != COLKEY:
        raise FlowError, "unexpected metagrammar root reduction"
    return (list[0], list[2])

# NullRuleList should receive list of form
#   []
def NullRuleList(list, ObjectGram):
    if list != []:
        raise FlowError, "unexpected null RuleList form"
    return []

# FullRuleList should receive list of form
#   [ Rule, RuleList ]
def FullRuleList(list, ObjectGram):
    if type(list) != type([]) or len(list) != 2:
        raise FlowError, "unexpected full RuleList form"
    NewRule = list[0]
    OldRules = list[1]
    return [NewRule] + OldRules

# InterpRule should receive list of form
#   [keyword RSKEY,
#    RuleNameStr,
#    keyword COLKEY,
#    Nontermtoken,
#    keyword LTKEY,
#    Bodylist]
#
def InterpRule(list, ObjectGram):
    # check keywords:
    if len(list) != 6 or \
       list[0] != RSKEY or \
       list[2] != COLKEY or \
       list[4] != LTKEY:
        raise FlowError, "unexpected meta rule reduction form"
    ruleName = list[1]
    ruleNonterm = list[3]
    ruleBody = list[5]
    # upcase the representation of keywords if needed
    if not ObjectGram.LexD.isCaseSensitive():
        for i in range(0, len(ruleBody)):
            (flag, name) = ruleBody[i]
            if flag == KEYFLAG:
                ruleBody[i] = (KEYFLAG, string.upper(name))
            elif not flag in (TERMFLAG, NONTERMFLAG):
                raise FlowError, "unexpected rule body member"
    rule = kjParser.ParseRule( ruleNonterm, ruleBody )
    rule.Name = ruleName
    return rule

# InterpRuleName should receive
#   [ string ]
def InterpRuleName(list, ObjectGram):
    #print list
    # add error checking?
    return list[0]
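
# For orientation (illustrative input only): a meta-grammar rule such as
#
#    @R plusrule :: E >> E + T
#
# is parsed by RULEGRAM below; InterpRule builds the ParseRule, InterpRuleName
# supplies the name "plusrule", InterpNonTerm resolves E against NonTermDict,
# and FullBody/NullBody classify and collect the body tokens E, +, T.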
# InterpNonTerm should receive
#   [ string ]
def InterpNonTerm(list, ObjectGram):
    #print list
    if type(list) != type([]) or len(list) != 1:
        raise FlowError, "unexpected rulename form"
    Name = list[0]
    # determine whether this is a valid nonterminal
    if not ObjectGram.NonTermDict.has_key(Name):
        #print Name
        raise TokenError, "LHS of Rule must be nonterminal: " + Name
    return ObjectGram.NonTermDict[Name]

# NullBody should receive []
def NullBody(list, ObjectGram):
    #print list
    if list != []:
        raise FlowError, "unexpected null Body form"
    return []

# FullBody should receive
#   [ string, Bodylist]
# must determine whether the string represents
# a keyword, a nonterminal, or a terminal of the object
# grammar.
# returns (KEYFLAG, string) (TERMFLAG, string) or
# (NONTERMFLAG, string) respectively
#
def FullBody(list, ObjectGram):
    #print list
    if type(list) != type([]) or len(list) != 2:
        raise FlowError, "unexpected body form"
    Name = list[0]
    # Does the Name rep a nonterm, keyword or term
    # of the object grammar (in that order).
    if ObjectGram.NonTermDict.has_key(Name):
        kind = NONTERMFLAG
    elif ObjectGram.LexD.keywordmap.has_key(Name):
        kind = KEYFLAG
    elif ObjectGram.TermDict.has_key(Name):
        kind = TERMFLAG
    else:
        raise TokenError, "Rule body contains unregistered string: " + Name
    restOfBody = list[1]
    return [(kind, Name)] + restOfBody

# function to generate a grammar for parsing grammar rules
#
def ruleGrammar():
    LexD = kjParser.LexDictionary()
    # use SQL/Ansi style comments
    LexD.comment( COMMENTFORM )
    # declare keywords
    RStart = LexD.keyword( RSKEY )
    TwoColons = LexD.keyword( COLKEY )
    LeadsTo = LexD.keyword( LTKEY )
    # declare terminals
    ident = LexD.terminal(IDNAME, IDFORM, IdentFun )
    # declare nonterminals
    Root = kjParser.nonterminal("Root")
    Rulelist = kjParser.nonterminal("RuleList")
    Rule = kjParser.nonterminal("Rule")
    RuleName = kjParser.nonterminal("RuleName")
    NonTerm = kjParser.nonterminal("NonTerm")
    Body = kjParser.nonterminal("Body")
    # declare rules
    #   Root >> NonTerm :: Rulelist
    InitRule = kjParser.ParseRule( Root, \
        [NonTerm, TwoColons, Rulelist], RootReduction )
    #   Rulelist >>
    RLNull = kjParser.ParseRule( Rulelist, [], NullRuleList)
    #   Rulelist >> Rule Rulelist
    RLFull = kjParser.ParseRule( Rulelist, [Rule, Rulelist], FullRuleList)
    #   Rule >> @R RuleName :: NonTerm >> Body
    RuleR = kjParser.ParseRule( Rule, \
        [RStart, RuleName, TwoColons, NonTerm, LeadsTo, Body], \
        InterpRule)
    #   RuleName >> ident
    RuleNameR = kjParser.ParseRule( RuleName, [ident], InterpRuleName)
    #   NonTerm >> ident
    NonTermR = kjParser.ParseRule( NonTerm, [ident], InterpNonTerm)
    #   Body >>
    BodyNull = kjParser.ParseRule( Body, [], NullBody)
    #   Body >> ident Body
    BodyFull = kjParser.ParseRule( Body, [ident, Body], FullBody)
    # declare Rules list and associated Name dictionary
    Rules = [RLNull, RLFull, RuleR, RuleNameR, NonTermR, \
             BodyNull, BodyFull, InitRule]
    RuleDict = \
      { "RLNull": 0, "RLFull": 1, "RuleR": 2, "RuleNameR": 3, \
        "NonTermR": 4, "BodyNull": 5, "BodyFull": 6, "InitRule": 7 }
    # make the RuleSet and compute the associated DFA
    RuleSet = ruleset( Root, Rules )
    RuleSet.DoSLRGeneration()
    # construct the Grammar object
    Result = kjParser.Grammar( LexD, RuleSet.DFA, Rules, RuleDict )
    return Result
#enddef ruleGrammar

# this is the rule grammar object for
# parsing
RULEGRAM = ruleGrammar()


# a derived grammar class (object oriented programming is cool!)
# this is a compilable grammar for automatic parser generation.
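# Typical use of CGrammar (sketch; the terminal regexp, the rule text and all
# names other than the methods themselves are made up, and the base
# kjParser.Grammar is assumed to initialize RuleString, NonTermDict and
# TermDict):
#   G = NullCGrammar()
#   G.Addterm("id", "[a-z]+", lambda s: s)
#   G.Nonterms("E T")
#   G.Keywords("begin end")
#   G.punct("+*()")
#   G.Declarerules(" E :: @R rule1 :: E >> begin E end  @R rule2 :: E >> id ")
#   G.Compile()
# after which reduction functions are attached with G.Bind(rulename, function).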
#
class CGrammar(kjParser.Grammar):

    # initialization is handled by the base class

    # insert a white separated list of keywords into the LexD
    # THIS SHOULD CHECK FOR KEYWORD/NONTERMINAL/PUNCT NAME
    # COLLISIONS (BUT DOESN'T YET).
    def Keywords(self, Stringofkeys):
        keywordlist = string.split(Stringofkeys)
        for keyword in keywordlist:
            self.LexD.keyword( keyword )

    # insert a string of punctuations into the LexD
    def punct(self, Stringofpuncts):
        for p in Stringofpuncts:
            self.LexD.punctuation(p)

    # register a list of regular expression strings
    # to represent comments in LexD
    def comments(self, listOfCommentStrings):
        for str in listOfCommentStrings:
            self.LexD.comment(str)

    # register a white separated list of nonterminal strings
    def Nonterms(self, StringofNonterms):
        nonTermlist = string.split(StringofNonterms)
        for NonTerm in nonTermlist:
            self.NonTermDict[NonTerm] = kjParser.nonterminal(NonTerm)

    # initialize or add more rules to the RuleString
    def Declarerules(self, StringWithRules):
        self.RuleString = self.RuleString + "\n" + StringWithRules

    # The compilation function assumes
    #   NonTermDict
    #   RuleString
    #   LexD
    #   TermDict
    # have all been set up properly
    # (at least if the default MetaGrammar is used).
    # On successful completion it will set up
    #   DFA
    #   RuleL
    #   RuleNameToIndex
    def Compile(self, MetaGrammar=RULEGRAM):
        # the following should return a list of rules
        # with punctuations of self.LexD interpreted as trivial keywords,
        # keywords of self.LexD interpreted as keywords,
        # and nonterminals registered in NonTermDict interpreted as
        # nonterms.
        #   ParseResult should be of form (rootNT, RuleL)
        ParseResult = MetaGrammar.DoParse1( self.RuleString, self )
        (RootNonterm, Rulelist) = ParseResult

        # make a ruleset and compute its DFA
        RuleS = ruleset( RootNonterm, Rulelist )
        RuleS.DoSLRGeneration()

        # make the rulename to index map to allow future bindings
        for i in range(0, len(Rulelist)):
            Rule = Rulelist[i]
            self.RuleNameToIndex[ Rule.Name ] = i

        # fill in the blanks
        self.DFA = RuleS.DFA
        self.RuleL = Rulelist

        # FOR DEBUG AND TESTING
        self.Ruleset = RuleS

        # DON'T clean up the grammar (misc structures are used)
        # in future bindings
    #enddef Compile

    # Write a reconstructable representation for this grammar
    # to a file
    # EXCEPT:
    #  - rule associations to reduction functions
    #    will be lost (must be reset elsewhere)
    #  - terminals in the lexical dictionary
    #    will not be initialized
    #
    # IND is used for indentation, should be whitespace (add check!)
    #
    # FName if given will cause the reconstructed grammar to be placed
    # inside a function `FName`+"()" returning the grammar object
    #
    # NOTE: this function violates information hiding principles;
    #  in particular it "knows" the guts of the FSM and LexD classes
    #
    def Reconstruct(self, VarName, Tofile, FName=None, indent=""):
        Reconstruction = codeReconstruct(VarName, Tofile, self, FName, indent)
        GrammarDumpSequence(Reconstruction)
    #enddef Reconstruct

    # marshalling of a grammar to a file
    def MarshalDump(self, Tofile):
        Reconstruction = marshalReconstruct(self, Tofile)
        GrammarDumpSequence(Reconstruction)

#endclass CGrammar
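
# Archiving usage (sketch): given a compiled CGrammar g and an open file f,
#   g.Reconstruct("MYGRAM", f, "GRAMMAR")   writes Python source that rebuilds g
#   g.MarshalDump(f)                        writes a marshalled structure instead
# either way the reloaded grammar still needs its rule names re-Bound and its
# terminals re-Added (see the warnings emitted by codeReconstruct.Cleanup below).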

# general procedure for different types of archiving for grammars
def GrammarDumpSequence(ReconstructObj):
    # assume an initialized Reconstruct Object with appropriate grammar etc.
    # put the lexical part
    ReconstructObj.PutLex()
    # put the rules
    ReconstructObj.PutRules()
    # put transitions
    ReconstructObj.PutTransitions()
    # finish up
    ReconstructObj.Cleanup()

# function to create a "null CGrammar"
def NullCGrammar():
    return CGrammar(None, None, None, {})


# utility classes -- Grammar reconstruction objects
#   encapsulate the process of grammar archiving.
#
class Reconstruct:
    # this "virtual class" is only for common behaviors of subclasses.

    def MakeTokenArchives(self):
        # make a list of all tokens and
        # initialize token > int dictionary
        keys = self.Gram.DFA.StateTokenMap.keys()
        tokenToInt = {}
        tokenSet = kjSet.NewSet([])
        for k in keys:
            kjSet.addMember(k[1], tokenSet)
        tokens = kjSet.get_elts(tokenSet)
        for i in range(0, len(tokens)):
            tokenToInt[ tokens[i] ] = i
        self.keys = keys
        self.tokens = tokens  # global sub
        self.tokInt = tokenToInt  # global sub

# grammar reconstruction to a file
class codeReconstruct(Reconstruct):

    def __init__(self, VarName, Tofile, Grammar, FName=None, indent=""):
        # do global subs for each of these
        self.Var = VarName
        self.File = Tofile
        self.FName = FName
        self.Gram = Grammar
        # put the reconstruction in a function if FName is given
        if FName != None:
            Tofile.write("\n\n")
            Tofile.write(indent+"def "+FName+"():\n")
            IND = indent+" "
        else:
            IND = indent
        self.I = IND  # global sub!
        Tofile.write("\n\n")
        Tofile.write(IND+"# ******************************BEGIN RECONSTRUCTION\n")
        Tofile.write(IND+"# Python declaration of Grammar variable "+VarName+".\n")
        Tofile.write(IND+"# automatically generated by module "+PMODULE+".\n")
        Tofile.write(IND+"# Altering this sequence by hand will probably\n")
        Tofile.write(IND+"# leave it unusable.\n")
        Tofile.write(IND+"#\n")
        Tofile.write(IND+"import "+PMODULE+"\n\n")
        Tofile.write(IND+"# variable declaration:\n")
        Tofile.write(IND+VarName+"= "+PMODULE+".NullGrammar()\n\n")
        # make self.keys list of dfa keys,
        #      self.tokens list of grammar tokens,
        #      self.tokInt inverted dictionary for self.tokens
        self.MakeTokenArchives()
        Tofile.write("\n\n"+IND+"# case sensitivity behavior for keywords.\n")
        if self.Gram.LexD.isCaseSensitive():
            Tofile.write(IND+VarName+".SetCaseSensitivity(1)\n")
        else:
            Tofile.write(IND+VarName+".SetCaseSensitivity(0)\n")
    #enddef __init__

    def PutLex(self):
        IND = self.I
        Tofile = self.File
        VarName = self.Var
        LexD = self.Gram.LexD
        tokens = self.tokens
        Tofile.write("\n\n"+IND+"# declaration of lexical dictionary.\n")
        Tofile.write(IND+"# EXCEPT FOR TERMINALS\n")
        Tofile.write(IND+VarName+".LexD.punctuationlist = ")
        Tofile.write(`LexD.punctuationlist`+"\n")
        Tofile.write(IND+"# now comment patterns\n")
        for comment in LexD.commentstrings:
            Tofile.write(IND+VarName+".LexD.comment("+`comment`+")\n")
        Tofile.write(IND+"# now define tokens\n")
        for i in range(0, len(tokens)):
            tok = tokens[i]
            (kind, name) = tok
            if kind == TERMFLAG:
                # put warning at end!
                #   terminal not installed in lexical dictionary here!
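                # terminals are emitted as bare termrep placeholders; the
                # regenerated grammar must re-register each of them with
                # Addterm (see the warnings written by Cleanup below).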
                Tofile.write(IND+VarName+".IndexToToken["+`i`+"] = ")
                Tofile.write(PMODULE+".termrep("+`name`+")\n")
            elif kind == KEYFLAG:
                Tofile.write(IND+VarName+".IndexToToken["+`i`+"] = ")
                Tofile.write(VarName+".LexD.keyword("+`name`+")\n")
            elif kind == NONTERMFLAG:
                Tofile.write(IND+VarName+".IndexToToken["+`i`+"] = ")
                Tofile.write(PMODULE+".nonterminal("+`name`+")\n")
            else:
                raise FlowError, "unknown token type"
    #enddef PutLex

    def PutRules(self):
        IND = self.I
        VarName = self.Var
        Rules = self.Gram.RuleL
        Tofile = self.File
        Root = self.Gram.DFA.root_nonTerminal
        Tofile.write("\n\n"+IND+"# declaration of rule list with names.\n")
        Tofile.write(IND+"# EXCEPT FOR INTERP FUNCTIONS\n")
        nrules = len(Rules)
        Tofile.write(IND+VarName+".RuleL = [None] * "+`nrules`+"\n")
        for i in range(0, nrules):
            # put warning at end:
            #   rule reduction function not initialized here!
            rule = Rules[i]
            name = rule.Name
            Tofile.write(IND+"rule = "+`rule`+"\n")
            Tofile.write(IND+"name = "+`name`+"\n")
            Tofile.write(IND+"rule.Name = name\n")
            Tofile.write(IND+VarName+".RuleL["+`i`+"] = rule\n")
            Tofile.write(IND+VarName+".RuleNameToIndex[name] = "+`i`+"\n")
        Tofile.write("\n\n"+IND+"# DFA root nonterminal.\n")
        Tofile.write(IND+VarName+".DFA.root_nonTerminal =")
        Tofile.write(`Root`+"\n")
    #enddef PutRules

    def PutTransitions(self):
        IND = self.I
        Tofile = self.File
        VarName = self.Var
        maxState = self.Gram.DFA.maxState
        tokenToInt = self.tokInt
        StateTokenMap = self.Gram.DFA.StateTokenMap
        keys = self.keys
        Tofile.write("\n\n"+IND+"# DFA state declarations.\n")
        for state in range(1, maxState+1):
            Tofile.write(IND+VarName+".DFA.States["+`state`+"] = ")
            Tofile.write('['+`TRANSFLAG`+']\n')
        Tofile.write(IND+VarName+".DFA.maxState = "+`maxState`+"\n")
        Tofile.write("\n\n"+IND+"# DFA transition declarations.\n")
        for key in keys:
            (fromState, TokenRep) = key
            TokenIndex = tokenToInt[TokenRep]
            TokenArg = VarName+".IndexToToken["+`TokenIndex`+"]"
            TMap = StateTokenMap[key]
            TMaptype = TMap[0][0]
            if TMaptype == REDUCEFLAG:
                # reduction
                rulenum = TMap[0][1]
                Args = "("+`fromState`+","+TokenArg+","+`rulenum`+")"
                Tofile.write(IND+VarName+".DFA.SetReduction"+Args+"\n")
            elif TMaptype == MOVETOFLAG:
                # MoveTo
                Args = "("+`fromState`+","+TokenArg+","+`TMap[0][1]`+")"
                Tofile.write(IND+VarName+".DFA.SetMap"+Args+"\n")
            else:
                raise FlowError, "unexpected else (2)"
    #enddef PutTransitions

    def Cleanup(self):
        Tofile = self.File
        RuleL = self.Gram.RuleL
        tokens = self.tokens
        VarName = self.Var
        IND = self.I
        FName = self.FName
        Tofile.write("\n\n"+IND+"# Clean up the grammar.\n")
        Tofile.write(IND+VarName+".CleanUp()\n")
        # if the FName was given return the grammar as function result
        if FName != None:
            Tofile.write("\n\n"+IND+"# return the grammar.\n")
            Tofile.write(IND+"return "+VarName+"\n")
        Tofile.write("\n\n"+IND+"# WARNINGS ****************************** \n")
        Tofile.write(IND+"# You must bind the following rule names \n")
        Tofile.write(IND+"# to reduction interpretation functions \n")
        for R in RuleL:
            Tofile.write(IND+"# "+VarName+".Bind("+`R.Name`+", ??function??)\n")
        Tofile.write(IND+"#(last rule)\n")
        Tofile.write("\n\n"+IND+"# WARNINGS ****************************** \n")
        Tofile.write(IND+"# You must bind the following terminals \n")
        Tofile.write(IND+"# to regular expressions and interpretation functions \n")
        warningPrinted = 0
        for tok in tokens:
            (kind, name) = tok
            if kind == TERMFLAG and tok != ENDOFFILETOKEN:
                Tofile.write(IND+"# "+VarName+\
                    ".Addterm("+`name`+", ??regularExp??, ??function??)\n")
                warningPrinted = 1
        if not warningPrinted:
            Tofile.write(IND+"# ***NONE** \n")
        Tofile.write(IND+"#(last terminal)\n")
Tofile.write(IND+"# ******************************END RECONSTRUCTION\n") #enddef #endclass # reconstruction using marshalling to a file # encodes internal structures for grammar using marshal-able # objects. Final marshalling to the file is done at CleanUp() # storing one big tuple. # class marshalReconstruct(Reconstruct): def __init__(self, Grammar, Tofile): self.Gram = Grammar self.File = Tofile # should archive self.tokens structure self.MakeTokenArchives() # archive this self.CaseSensitivity = Grammar.LexD.isCaseSensitive() def PutLex(self): LexD = self.Gram.LexD # archive these self.punct = LexD.punctuationlist self.comments = LexD.commentstrings def PutRules(self): # archive this self.Root = self.Gram.DFA.root_nonTerminal # make a list of tuples that can be used with # rule = apply(ParseRule, tuple[1]) # rule.Name = tuple[0] Rules = self.Gram.RuleL nrules = len(Rules) RuleTuples = [None] * nrules for i in range(nrules): rule = Rules[i] RuleTuples[i] = (rule.Name, rule.components()) #archive this self.RuleTups = RuleTuples def PutTransitions(self): keys = self.keys tokenToInt = self.tokInt StateTokenMap = self.Gram.DFA.StateTokenMap # archive this self.MaxStates = self.Gram.DFA.maxState # create two lists, # one for reductions with contents (fromState, tokennumber, rulenum) # one for movetos with contents (fromstate, tokennumber, tostate) # (note: token number not token itself to allow sharing) # to allow arbitrary growing, first use dicts: reductDict = {} nreducts = 0 moveToDict = {} nmoveTos = 0 for key in self.keys: (fromState, TokenRep) = key TokenIndex = tokenToInt[TokenRep] TMap = StateTokenMap[key] TMaptype = TMap[0][0] if TMaptype == REDUCEFLAG: rulenum = TMap[0][1] reductDict[nreducts] = (fromState, TokenIndex, rulenum) nreducts = nreducts + 1 elif TMaptype == MOVETOFLAG: ToState = TMap[0][1] moveToDict[nmoveTos] = (fromState, TokenIndex, ToState) nmoveTos = nmoveTos + 1 else: raise FlowError, "unexpected else" #endfor # translate dicts to lists reducts = [None] * nreducts for i in range(nreducts): reducts[i] = reductDict[i] moveTos = [None] * nmoveTos for i in range(nmoveTos): moveTos[i] = moveToDict[i] # archive these self.reducts = reducts self.moveTos = moveTos # this is the function that does the marshalling def Cleanup(self): import marshal # make the big list to marshal BigList = [None] * 9 BigList[0] = self.tokens BigList[1] = self.punct BigList[2] = self.comments BigList[3] = self.RuleTups BigList[4] = self.MaxStates BigList[5] = self.reducts BigList[6] = self.moveTos BigList[7] = self.Root BigList[8] = self.CaseSensitivity # dump the big list to the file marshal.dump( BigList, self.File ) #end class #######################testing stuff if RUNTESTS: def echo(x): return x # simple grammar stolen from a text LD0 = kjParser.LexDictionary() id = LD0.terminal("id","id",echo) plus = LD0.punctuation("+") star = LD0.punctuation("*") oppar = LD0.punctuation("(") clpar = LD0.punctuation(")") equals = LD0.punctuation("=") E = kjParser.nonterminal("E") T = kjParser.nonterminal("T") Tp = kjParser.nonterminal("Tp") Ep = kjParser.nonterminal("Ep") F = kjParser.nonterminal("F") rule1 = kjParser.ParseRule( E, [ T, Ep ] ) rule2 = kjParser.ParseRule( Ep, [ plus, T, Ep ] ) rule3 = kjParser.ParseRule( Ep, [ ] ) rule4 = kjParser.ParseRule( T, [ F, Tp ] ) rule5 = kjParser.ParseRule( Tp, [ star, F, Tp ] ) rule6 = kjParser.ParseRule( Tp, [ ] ) rule7 = kjParser.ParseRule( F, [ oppar, E, clpar ] ) rule8 = kjParser.ParseRule( F, [ id ] ) rl0 = [ rule1, rule2, rule3, rule4, rule5, rule6, 
    rs0 = ruleset(E, rl0)
    rs0.CompFirst()
    Firstpairs = kjSet.GetPairs(rs0.First)
    rs0.CompFollow()
    Followpairs = kjSet.GetPairs(rs0.Follow)
    rs0.CompSLRNFA()
    NFA0 = rs0.SLRNFA
    rs0.CompDFA()
    rs0.SLRFixDFA()
    DFA0 = rs0.DFA

    class dummy: pass
    ttt0 = dummy()

    def TESTDFA( STRING, ttt, DFA, Rulelist, DOREDUCTIONS=1):
        ttt.STRING = STRING
        #ttt.List = kjParser.LexList(LD0, ttt.STRING)
        ttt.Stream = kjParser.LexStringWalker( ttt.STRING, LD0 )
        ttt.Stack = {-1: 0}  # Walkers.SimpleStack()
        ttt.ParseObj = kjParser.ParserObj( Rulelist, \
            ttt.Stream, DFA, ttt.Stack, DOREDUCTIONS)
        ttt.RESULT = ttt.ParseObj.GO()
        #ttt.Stack.Dump(10)
        return ttt.RESULT

    def TESTDFA0( STRING, DOREDUCTIONS=1):
        return TESTDFA( STRING, ttt0, DFA0, rl0, DOREDUCTIONS )

    TESTDFA0( " id + id * id ")

    # an even simpler grammar
    S = kjParser.nonterminal("S")
    M = kjParser.nonterminal("M")
    A = kjParser.nonterminal("A")
    rr1 = kjParser.ParseRule( S, [M] )
    #rr2 = kjParser.ParseRule( A, [A, plus, M])
    #rr3 = kjParser.ParseRule( A, [M], echo)
    #rr4 = kjParser.ParseRule( M, [M, star, M])
    rr5 = kjParser.ParseRule( M, [oppar, M, clpar])
    rr6 = kjParser.ParseRule( M, [id])
    rl1 = [rr1, rr5, rr6]
    rs1 = ruleset(S, rl1)
    rs1.CompFirst()
    rs1.CompFollow()
    rs1.CompSLRNFA()
    rs1.CompDFA()
    rs1.SLRFixDFA()
    DFA1 = rs1.DFA

    ttt1 = dummy()

    def TESTDFA1( STRING, DOREDUCTIONS=1):
        return TESTDFA( STRING, ttt1, DFA1, rl1, DOREDUCTIONS )

    X = kjParser.nonterminal("X")
    Y = kjParser.nonterminal("Y")
    RX = kjParser.ParseRule( X, [ oppar, Y, clpar ] )
    RY = kjParser.ParseRule( Y, [] )
    rl2 = [RX, RY]
    rs2 = ruleset(X, rl2)
    rs2.CompFirst()
    rs2.CompFollow()
    rs2.CompSLRNFA()
    rs2.CompDFA()
    rs2.SLRFixDFA()
    DFA2 = rs2.DFA

    ttt2 = dummy()

    def TESTDFA2( STRING, DOREDUCTIONS=1):
        return TESTDFA( STRING, ttt2, DFA2, rl2, DOREDUCTIONS )

    # the following grammar should fail to be slr
    # (Aho,Ullman p. 213)
    S = kjParser.nonterminal("S")
    L = kjParser.nonterminal("L")
    R = kjParser.nonterminal("R")
    RS1 = kjParser.ParseRule( S, [L, equals, R] )
    RS2 = kjParser.ParseRule( S, [R], echo )
    RL1 = kjParser.ParseRule( L, [star, R])
    RL2 = kjParser.ParseRule( L, [id])
    RR1 = kjParser.ParseRule( R, [L] )
    rs3 = ruleset(S, [RS1, RS2, RL1, RL2, RR1])
    rs3.CompFirst()
    rs3.CompFollow()
    rs3.CompSLRNFA()
    rs3.CompDFA()
    #rs3.SLRFixDFA()  # should fail and does.

    # testing RULEGRAM
    ObjG = NullCGrammar()
    ObjG.Addterm("id", "id", echo)
    ObjG.Nonterms("T E Ep F Tp")
    ObjG.Keywords("begin end")
    ObjG.punct("+*()")
    ObjG.comments(["--.*\n"])
    # PROBLEM WITH COMMENTS???
    Rulestr = """
      ## what a silly grammar!
      T ::
      @R One :: T >> begin E end
      @R Three :: E >>
      @R Two :: E >> E + T
      @R Four :: E >> ( T )
      """
    RL = RULEGRAM.DoParse1( Rulestr, ObjG )
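    # if the parse succeeds, RL should be the pair produced by RootReduction:
    # the root nonterminal token for T and the list of the four ParseRules
    # named One, Three, Two and Four (their reduction functions are not bound).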