#!/usr/bin/env python
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
"""\
Very first try to parse delphi source
"""

__author__ = "Benoit Kogut-Kubiak"
__email__ = "benoit.kogutkubiak@netasq.com"
__version__ = "$Revision: 0.0 $"[11:-2]

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
import string

import tpg

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Templates used by the grammar actions to render the parsed expression.
PARSE_FORMAT_NUMBER = '[NUM %s]'
PARSE_FORMAT_BOOLEAN = '[BOOL %s]'
PARSE_FORMAT_STRING = '[STR %s]'
# func name + args
PARSE_FORMAT_CALL = '[CALL %s%s]'
PARSE_FORMAT_KEYWORD = '[KEYWRD %s]'


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# The class docstring below is not documentation: it is the TPG grammar.
# Inside it, '#' starts a grammar comment and '$ ...' is a one-line Python
# action, so both must be terminated by a real end of line.
class Expression(tpg.Parser):
    r"""
    set word_boundary= True
    set lexer_ignorecase= True

    ## SEPARATORS ===========================================================
    separator spaces        '[\s\n]+' ;
    separator sngleComment  '//.*?(?:\n|$)' ;
    # (?s) => DOTALL
    separator multiComment  '(?s)(?:\(\*.*?\*\)|\{.*?\})' ;

    ## TOKENS ===============================================================
    # Hopefully we won't need to parse assembler within delphi code; Sadly
    # we have to EXPLICTLY take out comments to avoid any wrong 'end' match
    token ASSEMBLER '(?s)\bASM\b(?:(?://|;).*?(?:\n|$)|\(\*.*?\*\)|\{.*?\}|.)*?\bEND\b' ;

    token VREAL     '\d+\.\d+(?:e[-+]?\d+)?|\d+e[-+]?\d+' ;
    token VINT      '\d+' ;
    token VBOOL     '\b(?:TRUE|FALSE)\b' ;
    # no backslashing madness :)
    token VSTR      '(?:\'[^\n]*?\')+' ;

    # parenthesis tokens should not be confused with multi line comment delimiters
    token LP        '\((?!\*)' ;
    # NOTE(review): the tail of RP and the whole definitions of LB, RB, SLASH
    # and STAR were lost when this file was mangled (every "<!...>" span was
    # stripped). They are rebuilt here from the way the rules below use them;
    # confirm against the original source.
    token RP        '(?<!\*)\)' ;
    token LB        '\[' ;
    token RB        '\]' ;
    token SLASH     '/' ;
    token STAR      '\*' ;

    # <=, >=, <>, <, >, =
    # NOTE(review): the look-behind parts of GE, GT and EQ were stripped the
    # same way and are restored as '(?<!<)' / '(?<![<>:])' - confirm.
    token LE        '<=' ;
    token GE        '(?<!<)>=' ;
    token NE        '<>(?!=)' ;
    token LS        '<(?![>=])' ;
    token GT        '(?<!<)>(?!=)' ;
    token EQ        '(?<![<>:])=' ;

    token CIRC      '\^' ;
    token AT        '@' ;
    token COMMA     ',' ;
    token AFFECT    ':=' ;
    token COLON     ':(?!=)' ;
    token SCOLN     ';' ;
    # .. must be matched before .
    token RANGE     '\.\.' ;
    # avoid .. and . confusion
    # NOTE(review): everything from the middle of DOT up to the head of the
    # START rule was stripped. DOT, NOT, NIL, ID and 'START/s ->' below are
    # reconstructions (other tokens may have been lost too) - confirm.
    token DOT       '(?<!\.)\.(?!\.)' ;
    token NOT       '\bNOT\b' ;
    token NIL       '\bNIL\b' ;
    token ID        '[a-zA-Z_]\w*' ;

    ## RULES ================================================================
    START/s ->
        Expression/s
        ;

    # Expression - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    Expression/e ->
        SimpleExpr/e
        ( RelOp/op SimpleExpr/se $ e = '[E: %s.%s.%s]' % (op, e, se)
        )*
        ;

    ExprList/le ->
        Expression/le
        ( COMMA Expression/e $ le = '%s, %s' % (le, e)
        )*
        ;

    # A constant expression is an expression that the compiler can evaluate
    # without executing the program in which it occurs.
    # Constant expressions include numerals, character strings, true
    # constants, values of enumerated types, the special constants True,
    # False, and nil, and expressions built exclusively from these
    # elements with operators, typecasts, and set constructors.
    # Constant expressions cannot include variables, pointers, or function
    # calls, except calls to the following predefined functions:
    #
    # Abs, Chr, Hi High, Length, Lo Low, Odd, Ord Pred, Round, SizeOf,
    # Succ, Swap, Trunc
    ConstExpr/ce ->
        # Blame me, I choosed the easy way
        Expression/ce
        ;

    SimpleExpr/se ->
        '[+-]'/op SimpleExpr/se $ se = '%s%s' % (op, se)
        | Term/se
          ( AddOp/op Term/t $ se = '[SE: %s.%s.%s]' % (op, se, t)
          )*
        ;

    Term/t ->
        Factor/t
        ( MulOp/op Factor/f $ t = '[T: %s.%s.%s]' % (op, t, f)
        )*
        ;

    Factor/f ->
        # procedure call
        Designator/pc Arguments/a $ f = PARSE_FORMAT_CALL % (pc, a)
        # variable
        | Designator/f $ f = '[VAR %s]' % (f,)
        | Address/f
        | Number/f $ f = PARSE_FORMAT_NUMBER % (f,)
        | Boolean/f $ f = PARSE_FORMAT_BOOLEAN % (f,)
        | String/f $ f = PARSE_FORMAT_STRING % (f,)
        | NIL/f $ f = PARSE_FORMAT_KEYWORD % (f,)
        | LP Expression/f RP $ f = '(%s)' % (f,)
        | NOT Factor/f $ f = 'not %s' % (f,)
        | SetConstructor/f
        | TypeId/tid LP Expression/e RP $ f = '%s(%s)' % (tid, e)
        ;

    # Sets - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    SetConstructor/sc ->
        LB SetElement/sc RB $ sc = '[%s]' % (sc,)
        | LB SetElement/sc
          ( COMMA SetElement/se $ sc = '[%s, %s]' % (sc, se)
          )+ RB
        ;

    SetElement/se ->
        Expression/se
        ( RANGE Expression/e $se = '%s..%s' % (se, e)
        )?
        ;

    # ProcedureCall - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    ProcedureCall/pc ->
        Designator/pc
        ( Arguments/a $ pc = '%s%s' % (pc, a)
        )?
        ;

    Arguments/a ->
        LP ArgsTail/at $ a = '(%s' % (at,)
        ;

    ArgsTail/at ->
        ExprList/le RP $ at = '%s)' % (le,)
        | RP/at
        ;

    # Operators - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    RelOp/op ->
        LE/op
        | GE/op
        | NE/op
        | LS/op
        | GT/op
        | EQ/op
        | 'IN'/op
        | 'IS'/op
        | 'AS'/op
        ;

    AddOp/op ->
        '[+-]'/op
        | 'OR'/op
        | 'XOR'/op
        ;

    MulOp/op ->
        SLASH/op
        | STAR/op
        | 'DIV'/op
        | 'MOD'/op
        | 'AND'/op
        | 'SHL'/op
        | 'SHR'/op
        ;

    # Idents and Misc. Parsing - - - - - - - - - - - - - - - - - - - - - - -
    Address/a ->
        AT Designator/d $ a = '@%s' % (d,)
        ;

    Designator/d ->
        QualId/d
        ( DesignatorTail/dt $ d = '%s%s' % (d,dt)
        )*
        ;

    DesignatorTail/dt ->
        DOT Ident/id $ dt = '.%s' % (id,)
        | LB ExprList/el RB $ dt = '[%s]' % (el,)
        | CIRC/dt
        ;

    LabelId/lid ->
        ID/lid
        ;

    QualId/qid ->
        UnitId/uid DOT QualId/qid $ qid = '%s.%s' % (uid, qid)
        | Ident/qid
        ;

    UnitId/uid ->
        ID/uid
        ;

    TypeId/tid ->
        UnitId/uid DOT TypeId/tid $ tid = '%s.%s' % (uid, tid)
        | ID/tid
        ;

    Ident/id ->
        ID/id
        ( DOT ID/i $ id = '%s.%s' % (id, i)
        )*
        ;

    IdentList/lid ->
        Ident/lid
        ( COMMA Ident/id $ lid = '%s, %s' % (lid, id)
        )*
        ;

    Number/n ->
        VINT/n
        | VREAL/n
        ;

    Boolean/b ->
        VBOOL/b
        ;

    String/s ->
        VSTR/s
        ;
    """


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def main(path='./expression.txt'):
    """Parse the expression stored in *path* and print the rendered result.

    Any error raised while reading or parsing is printed instead of being
    propagated, as in the original script; a failure to open the file still
    propagates. The file is closed in every case by the ``with`` block.
    """
    parser = Expression()
    with open(path) as source:
        try:
            # Single-argument print() behaves the same on Python 2 and 3.
            print(parser(source.read()))
        except Exception as e:
            # Top-level boundary of the script: report the error and carry on.
            print(e)


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
if __name__ == '__main__':
    main()
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -