"""Parse a Python source file and dump its AST as JSON.

Every AST node is annotated with ``start``/``end`` character offsets into the
source text, and a few synthetic ``Name`` nodes are attached (function and
class names, vararg/kwarg names, attribute names, operators) so that a
consumer of the JSON can locate them in the source.

Most end offsets are heuristics derived by scanning the source text; they are
not guaranteed to be exact for every construct.
"""
import ast
import bisect
import re
import sys
import codecs
from json import JSONEncoder
from ast import *

# Is it Python 3?
is_python3 = hasattr(sys.version_info, 'major') and (sys.version_info.major == 3)

# Since Python 3.8 the parser produces ast.Constant for literals; ast.Str and
# ast.Num are deprecated aliases that are no longer importable via
# ``from ast import *`` on recent interpreters, so never name them there.
_USES_CONSTANT = sys.version_info >= (3, 8)

# Node types that only exist on some Python versions (None when absent).
_Print = getattr(ast, 'Print', None)
_TRY_TYPES = tuple(t for t in (getattr(ast, 'Try', None),
                               getattr(ast, 'TryExcept', None),
                               getattr(ast, 'TryStar', None))
                   if t is not None)

# ``long`` only exists on Python 2; the conditional keeps it from being
# evaluated on Python 3.
_INT_TYPES = (int,) if is_python3 else (int, long)  # noqa: F821

# A source-encoding declaration comment; group 1 is the encoding name.
_CODING_DECL_RE = re.compile(r'#.*coding\s*[:=]\s*([\w\d\-]+)')
_DEF_RE = re.compile(r'\bdef\b')
_CLASS_RE = re.compile(r'\bclass\b')

# State shared between parse_file / the encoder / the offset helpers.
enc = 'latin1'      # encoding of the file parsed most recently
lines = ''          # full text of the file parsed most recently
line_starts = [0]   # offset of the first character of each line (see build_index_map)


def _is_str_node(node):
    """Return True if *node* is a string literal node."""
    if _USES_CONSTANT:
        return isinstance(node, ast.Constant) and isinstance(node.value, str)
    return isinstance(node, ast.Str)


def _is_num_node(node):
    """Return True if *node* is a numeric literal node (bools excluded)."""
    if _USES_CONSTANT:
        return (isinstance(node, ast.Constant)
                and isinstance(node.value, (int, float, complex))
                and not isinstance(node.value, bool))
    return isinstance(node, ast.Num)


def _num_value(node):
    """Return the Python value held by a numeric literal node."""
    return node.value if _USES_CONSTANT else node.n


def _set_num_value(node, value):
    """Store *value* in the field a numeric literal node keeps its value in."""
    if _USES_CONSTANT:
        node.value = value
    else:
        node.n = value


class AstEncoder(JSONEncoder):
    """JSON encoder that serialises AST nodes as dicts of their attributes."""

    def default(self, o):
        """Return a JSON-serialisable stand-in for *o*.

        Objects with a ``__dict__`` become a dict of their attributes plus a
        ``'type'`` key holding the class name; anything else becomes
        ``str(o)``.
        """
        if not hasattr(o, '__dict__'):
            return str(o)

        # Work on a copy so that encoding does not modify the node itself.
        d = dict(o.__dict__)

        # workaround: decode strings if it's not Python3 code
        if not is_python3:
            for k, v in list(d.items()):
                if isinstance(v, str):
                    if k == 's':
                        # take the literal's text straight from the source
                        d[k] = lines[d['start']:d['end']]
                    else:
                        d[k] = v.decode(enc)
        d['type'] = o.__class__.__name__
        return d


def _make_encoder():
    """Build an AstEncoder suitable for the running Python version."""
    if is_python3:
        return AstEncoder()
    return AstEncoder(encoding=enc)


def parse_dump(filename, output, end_mark):
    """Parse *filename* and write its JSON-encoded AST to *output*.

    The (empty) file *end_mark* is always created afterwards, even when
    parsing or writing fails, to signal that this call has finished. Any
    exception from parsing or writing propagates to the caller.
    """
    try:
        tree = parse_file(filename)
        # Build the encoder after parsing so it sees this file's encoding.
        encoded = _make_encoder().encode(tree)
        with open(output, "w") as f:
            f.write(encoded)
    finally:
        # write marker file to signal write end
        with open(end_mark, "w"):
            pass


def parse_json(filename):
    """Return the JSON-encoded AST of *filename*, or ``""`` on any error."""
    try:
        tree = parse_file(filename)
        return _make_encoder().encode(tree)
    except Exception:
        # Best effort by design: callers get an empty string on failure.
        return ""


def parse_file(filename):
    """Read *filename*, parse it and return the improved AST.

    Updates the module globals ``enc`` and ``lines`` with the file's encoding
    and decoded text. The returned tree has a ``filename`` attribute.
    """
    global enc, lines
    enc, enc_len = detect_encoding(filename)
    with codecs.open(filename, 'r', enc) as f:
        lines = f.read()

    # remove BOM (replaced by a space so offsets do not shift)
    lines = re.sub(u'\ufeff', ' ', lines)

    # replace the encoding decl by spaces to fool python parser
    # otherwise you get 'encoding decl in unicode string' syntax error
    if enc_len > 0:
        # Blank only the first declaration, at exactly its own width, so no
        # source offset moves.
        lines = _CODING_DECL_RE.sub(
            lambda m: '#' + ' ' * (len(m.group(0)) - 1), lines, count=1)

    return parse_string(lines, filename)


def parse_string(string, filename=None):
    """Parse source text *string* and return its improved AST.

    If *filename* is given it is stored on the tree as ``tree.filename``.
    """
    tree = ast.parse(string)
    improve_ast(tree, string)
    if filename:
        tree.filename = filename
    return tree


# short function for experiments
def p(filename):
    """Dump *filename* to ``json1`` with end marker ``end1`` (for experiments)."""
    parse_dump(filename, "json1", "end1")


def detect_encoding(path):
    """Look for an encoding declaration in the first 80 bytes of *path*.

    Returns ``(encoding, decl_len)`` where ``decl_len`` is the length of the
    declaration comment that was matched. ``encoding`` is ``'latin1'`` when
    the declared codec is unknown, and ``('latin1', -1)`` is returned when no
    declaration is found.
    """
    with open(path, 'rb') as fin:
        # latin1 maps every byte to one character, so the regex sees the
        # real text (including real newlines) on both Python 2 and 3.
        prefix = fin.read(80).decode('latin1')

    match = _CODING_DECL_RE.search(prefix)
    if match is None:
        return 'latin1', -1

    enc1 = str(match.group(1))
    enc_len = len(match.group(0))
    try:
        codecs.lookup(enc1)
    except LookupError:
        # declared encoding does not exist
        return 'latin1', enc_len
    return enc1, enc_len


#-------------------------------------------------------------
#                   improvements to the AST
#-------------------------------------------------------------
def improve_ast(node, s):
    """Annotate the tree rooted at *node*, parsed from source text *s*."""
    build_index_map(s)
    improve_node(node, s)


# build global table 'line_starts' for lineno <-> index conversion
def build_index_map(s):
    """Record in ``line_starts`` the offset at which each line of *s* begins."""
    global line_starts
    line_starts = [0]
    line_starts.extend(i + 1 for i, ch in enumerate(s) if ch == '\n')


# convert (line, col) to offset index
def map_idx(line, col):
    """Return the offset of 1-based *line* and 0-based *col*."""
    return line_starts[line - 1] + col


# convert offset index into (line, col)
def map_line_col(idx):
    """Return the ``(line, col)`` pair (1-based line) for offset *idx*."""
    # number of line starts that are <= idx
    line = bisect.bisect_right(line_starts, idx)
    col = idx - line_starts[line - 1]
    return (line, col)


def improve_node(node, s):
    """Recursively add start/end offsets and extra name nodes below *node*."""
    if isinstance(node, list):
        for n in node:
            improve_node(n, s)
    elif isinstance(node, AST):
        find_start(node, s)
        find_end(node, s)
        add_missing_names(node, s)
        for f in node_fields(node):
            improve_node(f, s)


def find_start(node, s):
    """Return the start offset of *node* in *s*, caching it as ``node.start``.

    Returns None when no start can be determined (empty list, empty module,
    or an object without position information).

    Raises:
        TypeError: if a node that has ``lineno`` still yields no start.
    """
    ret = None  # default value

    if hasattr(node, 'start'):
        ret = node.start

    elif isinstance(node, list):
        if node:
            ret = find_start(node[0], s)

    elif isinstance(node, Module):
        if node.body:
            ret = find_start(node.body[0], s)

    elif isinstance(node, BinOp):
        # prefer the start of the left operand
        leftstart = find_start(node.left, s)
        if leftstart is not None:
            ret = leftstart
        else:
            ret = map_idx(node.lineno, node.col_offset)

    elif hasattr(node, 'lineno'):
        if node.col_offset >= 0:
            ret = map_idx(node.lineno, node.col_offset)
        else:
            # special case for """ strings: a negative col_offset is
            # reported, so scan backwards for the opening triple quote
            i = map_idx(node.lineno, node.col_offset)
            while (i > 0 and i + 2 < len(s)
                   and s[i:i + 3] != '"""' and s[i:i + 3] != "'''"):
                i -= 1
            ret = i
    else:
        return None

    if ret is None and hasattr(node, 'lineno'):
        raise TypeError("got None for node that has lineno", node)

    if isinstance(node, AST) and ret is not None:
        node.start = ret

    return ret


def _string_end(node, s):
    """Return the offset just past the closing quote of string literal *node*."""
    begin = find_start(node, s)
    if begin is None:
        return None

    # skip any prefix characters up to the opening quote
    i = begin
    while i < len(s) and s[i] != '"' and s[i] != "'":
        i += 1
    if i >= len(s):
        # no quote found: use the same fallback as unknown node types
        return begin + 3

    if s.startswith('"""', i):
        q = '"""'
    elif s.startswith("'''", i):
        q = "'''"
    else:
        q = s[i]
    # NOTE: the next occurrence of the quote is taken as the end; escaped
    # quotes inside the literal are not recognised.
    return end_seq(s, q, i + len(q))


def find_end(node, s):
    """Return the end offset of *node* in *s*, caching it as ``node.end``.

    The end is derived per node type, mostly from the end of the last child
    or by scanning the source text. For node types without a rule the end is
    set to 3 characters after the start. Returns None when nothing can be
    determined.
    """
    the_end = None

    if hasattr(node, 'end'):
        return node.end

    elif isinstance(node, list):
        if node:
            the_end = find_end(node[-1], s)

    elif isinstance(node, Module):
        if node.body:
            the_end = find_end(node.body[-1], s)

    elif isinstance(node, Expr):
        the_end = find_end(node.value, s)

    elif _is_str_node(node):
        the_end = _string_end(node, s)

    elif isinstance(node, Name):
        the_end = find_start(node, s) + len(node.id)

    elif isinstance(node, Attribute):
        the_end = end_seq(s, node.attr, find_end(node.value, s))

    elif isinstance(node, (FunctionDef, Lambda, ClassDef)):
        the_end = find_end(node.body, s)

    # print will be a Call in Python 3
    elif _Print is not None and isinstance(node, _Print):
        the_end = start_seq(s, '\n', find_start(node, s))

    elif isinstance(node, Call):
        func_end = find_end(node.func, s)
        if func_end is not None:
            the_end = match_paren(s, '(', ')', func_end)

    elif isinstance(node, Yield):
        the_end = find_end(node.value, s)

    elif isinstance(node, Return):
        if node.value is not None:
            the_end = find_end(node.value, s)
        else:
            the_end = find_start(node, s) + len('return')

    elif isinstance(node, (For, While, If, IfExp)):
        # IfExp.orelse is a single node, the others hold a list
        if node.orelse != []:
            the_end = find_end(node.orelse, s)
        else:
            the_end = find_end(node.body, s)

    elif isinstance(node, (Assign, AugAssign)):
        the_end = find_end(node.value, s)

    elif isinstance(node, BinOp):
        the_end = find_end(node.right, s)

    elif isinstance(node, BoolOp):
        the_end = find_end(node.values[-1], s)

    elif isinstance(node, Compare):
        the_end = find_end(node.comparators[-1], s)

    elif isinstance(node, UnaryOp):
        the_end = find_end(node.operand, s)

    elif _is_num_node(node):
        the_end = find_start(node, s) + len(str(_num_value(node)))

    elif isinstance(node, (List, Subscript)):
        the_end = match_paren(s, '[', ']', find_start(node, s))

    elif isinstance(node, Tuple):
        if node.elts:
            the_end = find_end(node.elts[-1], s)

    elif isinstance(node, Dict):
        the_end = match_paren(s, '{', '}', find_start(node, s))

    elif _TRY_TYPES and isinstance(node, _TRY_TYPES):
        # the textually last clause present decides where the statement ends
        if getattr(node, 'finalbody', None):
            the_end = find_end(node.finalbody, s)
        elif node.orelse:
            the_end = find_end(node.orelse, s)
        elif node.handlers:
            the_end = find_end(node.handlers, s)
        else:
            the_end = find_end(node.body, s)

    elif isinstance(node, ExceptHandler):
        the_end = find_end(node.body, s)

    elif isinstance(node, Pass):
        the_end = find_start(node, s) + len('pass')

    elif isinstance(node, Break):
        the_end = find_start(node, s) + len('break')

    elif isinstance(node, Continue):
        the_end = find_start(node, s) + len('continue')

    elif isinstance(node, Global):
        the_end = start_seq(s, '\n', find_start(node, s))

    elif isinstance(node, Import):
        # only the keyword is covered, not the imported names
        the_end = find_start(node, s) + len('import')

    elif isinstance(node, ImportFrom):
        # only the keyword is covered, not the module or names
        the_end = find_start(node, s) + len('from')

    else:  # can't determine node end, set to 3 chars after start
        begin = find_start(node, s)
        if begin is not None:
            the_end = begin + 3

    if isinstance(node, AST) and the_end is not None:
        node.end = the_end

    return the_end


def _name_after_keyword(s, keyword_re, head):
    """Return a Name for the identifier after the first *keyword_re* match.

    The search starts at offset *head* (0 when *head* is None). Returns None
    when the keyword or a following identifier cannot be found.
    """
    match = keyword_re.search(s, head or 0)
    if match is None:
        return None
    return str_to_name(s, match.end())


def _param_name_node(param, s, fallback_start):
    """Return a Name node for a ``*vararg`` / ``**kwarg`` parameter.

    When *param* carries its own position (``lineno``/``col_offset``) that
    position is used. Otherwise (the parameter is a bare string) the first
    identifier at or after *fallback_start* is used; None if there is no
    fallback position.
    """
    if hasattr(param, 'lineno') and hasattr(param, 'col_offset'):
        return str_to_name(s, map_idx(param.lineno, param.col_offset))
    if fallback_start is not None:
        return str_to_name(s, fallback_start)
    return None


def add_missing_names(node, s):
    """Attach synthetic name/operator nodes to *node* and extend ``_fields``.

    Adds, depending on the node type: ``name_node`` (class/function name),
    ``vararg_name`` / ``kwarg_name``, ``attr_name``, ``opsName``, ``op_node``
    and, for numbers, ``num_type`` (plus ``real``/``imag`` for complex).
    Integer and float values are replaced by their string form. Each node is
    processed once; ``extra_attr`` marks it as done.
    """
    if hasattr(node, 'extra_attr'):
        return

    if isinstance(node, list):
        for n in node:
            add_missing_names(n, s)

    elif isinstance(node, ClassDef):
        # skip to "class" because the start may be at a decorator
        node.name_node = _name_after_keyword(s, _CLASS_RE, find_start(node, s))
        node._fields += ('name_node',)

    elif isinstance(node, FunctionDef):
        # skip to "def" because it may contain decorators like @property
        node.name_node = _name_after_keyword(s, _DEF_RE, find_start(node, s))
        node._fields += ('name_node',)

        args = node.args

        if args.vararg is not None:
            # fallback position: after the last positional arg, else after
            # the function name
            if args.args:
                vstart = find_end(args.args[-1], s)
            else:
                vstart = find_end(node.name_node, s)
            node.vararg_name = _param_name_node(args.vararg, s, vstart)
            node._fields += ('vararg_name',)

        if args.kwarg is not None:
            # fallback position: after the vararg if there is one, so that
            # the vararg's own name is not picked up again
            vararg_name = getattr(node, 'vararg_name', None)
            if vararg_name is not None:
                kstart = vararg_name.end
            elif args.args:
                kstart = find_end(args.args[-1], s)
            else:
                kstart = find_end(node.name_node, s)
            node.kwarg_name = _param_name_node(args.kwarg, s, kstart)
            node._fields += ('kwarg_name',)

    elif isinstance(node, Attribute):
        value_end = find_end(node.value, s)
        if value_end is not None:
            node.attr_name = str_to_name(s, value_end)
            node._fields = ('value', 'attr_name')  # remove attr for node size accuracy

    elif isinstance(node, Compare):
        begin = find_start(node, s)
        if begin is not None:
            node.opsName = convert_ops(node.ops, s, begin)
            node._fields += ('opsName',)

    elif isinstance(node, (BoolOp, BinOp, UnaryOp, AugAssign)):
        # for BinOp the operator follows the left operand; for the others
        # the search starts at the node's own start
        if hasattr(node, 'left'):
            begin = find_end(node.left, s)
        else:
            begin = find_start(node, s)
        ops = convert_ops([node.op], s, begin) if begin is not None else []
        if ops:
            node.op_node = ops[0]
            node._fields += ('op_node',)

    elif _is_num_node(node):
        value = _num_value(node)
        num_type = None
        if isinstance(value, _INT_TYPES):
            num_type = 'int'
            _set_num_value(node, str(value))
        elif isinstance(value, float):
            num_type = 'float'
            _set_num_value(node, str(value))
        elif isinstance(value, complex):
            num_type = 'complex'
            node.real = value.real
            node.imag = value.imag
            node._fields += ('real', 'imag')

        node.num_type = num_type
        node._fields += ('num_type',)

    node.extra_attr = True


#-------------------------------------------------------------
#              utilities used by improve AST functions
#-------------------------------------------------------------

# find a sequence in a string s, returning the start point
def start_seq(s, pat, start):
    """Return the index of *pat* in *s* at or after *start*, or ``len(s)``."""
    try:
        return s.index(pat, start)
    except ValueError:
        return len(s)


# find a sequence in a string s, returning the end point
def end_seq(s, pat, start):
    """Return the index just past *pat* in *s* (searched from *start*), or ``len(s)``."""
    try:
        return s.index(pat, start) + len(pat)
    except ValueError:
        return len(s)


# find matching close paren from start
def match_paren(s, open, close, start):
    """Return the index just past the *close* matching the next *open*.

    Scans forward from *start* to the first *open* character, then counts
    nesting until it is balanced. Returns ``len(s)`` if no *open* is found or
    the brackets never balance. Bracket characters inside string literals
    or comments are counted like any other.

    (The parameter name ``open`` shadows the builtin; it is kept so existing
    keyword callers continue to work.)
    """
    while start < len(s) and s[start] != open:
        start += 1

    if start >= len(s):
        return len(s)

    depth = 1
    i = start + 1

    while depth > 0 and i < len(s):
        if s[i] == open:
            depth += 1
        elif s[i] == close:
            depth -= 1
        i += 1

    return i


# convert string to Name
def str_to_name(s, start):
    """Return a Name node for the first identifier in *s* at or after *start*.

    The node gets ``start``/``end`` offsets and ``lineno``/``col_offset``.
    Returns None if no identifier characters follow. Relies on the table
    built by ``build_index_map``.
    """
    i = start
    while i < len(s) and not is_alpha(s[i]):
        i += 1
    name_start = i

    while i < len(s) and is_alpha(s[i]):
        i += 1
    name_end = i

    id1 = s[name_start:name_end]
    if id1 == '':
        return None

    name = Name(id1, None)
    name.start = name_start
    name.end = name_end
    name.lineno, name.col_offset = map_line_col(name_start)
    return name


def convert_ops(ops, s, start):
    """Return Name nodes locating the operators *ops* in *s* from *start* on.

    Operators are looked up in ``ops_map`` and searched for in order; ones
    missing from the table are reported on stdout and skipped. The result
    may be shorter than *ops* if some symbols are not found.
    """
    syms = []
    for op in ops:
        if type(op) in ops_map:
            syms.append(ops_map[type(op)])
        else:
            print("[WARNING] operator %s is missing from ops_map, "
                  "please report the bug on GitHub" % op)

    i = start
    j = 0
    ret = []
    while i < len(s) and j < len(syms):
        sym = syms[j]
        if s[i:i + len(sym)] == sym:
            op_node = Name(sym, None)
            op_node.start = i
            op_node.end = i + len(sym)
            op_node.lineno, op_node.col_offset = map_line_col(i)
            ret.append(op_node)
            j += 1
            i = op_node.end
        else:
            i += 1
    return ret


# lookup table for operators for convert_ops
ops_map = {
    # compare:
    Eq: '==',
    NotEq: '!=',
    LtE: '<=',
    Lt: '<',
    GtE: '>=',
    Gt: '>',
    NotIn: 'not in',
    In: 'in',
    IsNot: 'is not',
    Is: 'is',

    # BoolOp
    Or: 'or',
    And: 'and',
    Not: 'not',
    Invert: '~',

    # bit operators
    BitOr: '|',
    BitAnd: '&',
    BitXor: '^',
    RShift: '>>',
    LShift: '<<',

    # BinOp
    Add: '+',
    Sub: '-',
    Mult: '*',
    Div: '/',
    FloorDiv: '//',
    Mod: '%',
    Pow: '**',

    # UnaryOp
    USub: '-',
    UAdd: '+',
}

# matrix multiplication only exists on newer Python versions
if hasattr(ast, 'MatMult'):
    ops_map[ast.MatMult] = '@'


# get list of fields from a node
def node_fields(node):
    """Return the values of *node*'s ``_fields``, skipping ``ctx`` and unset ones."""
    return [getattr(node, field)
            for field in node._fields
            if field != 'ctx' and hasattr(node, field)]


# get full source text where the node is from
def node_source(node):
    """Return ``node.node_source`` if the attribute exists, else None."""
    if hasattr(node, 'node_source'):
        return node.node_source
    else:
        return None


# utility for getting exact source code part of the node
def src(node):
    """Return the slice of ``node.node_source`` between ``node.start`` and ``node.end``.

    NOTE(review): nothing in this file sets ``node_source``; the caller must
    have assigned it.
    """
    return node.node_source[node.start: node.end]


def start(node):
    """Return ``node.start`` if set, else 0."""
    if hasattr(node, 'start'):
        return node.start
    else:
        return 0


def end(node):
    """Return ``node.end`` if set, else None."""
    if hasattr(node, 'end'):
        return node.end
    else:
        return None


def is_alpha(c):
    """Return True if *c* is an ASCII letter, an ASCII digit or ``_``."""
    return (c == '_'
            or ('0' <= c <= '9')
            or ('a' <= c <= 'z')
            or ('A' <= c <= 'Z'))


# Experiments only run when this file is executed, not when it is imported.
if __name__ == '__main__':
    # p('/Users/yinwang/Code/django/tests/invalid_models/invalid_models/models.py')
    # p('/Users/yinwang/Dropbox/prog/pysonar2/tests/test-unicode/test1.py')
    # p('/Users/yinwang/Code/cpython/Lib/lib2to3/tests/data/bom.py')
    # p('/Users/yinwang/Code/cpython/Lib/test/test_decimal.py')
    # p('/Users/yinwang/Code/cpython/Lib/test/test_pep3131.py')
    # p('/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/tarfile.py')
    # p('/Users/yinwang/Code/cpython/Lib/lib2to3/tests/data/false_encoding.py')
    # p('/System/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/test/test_marshal.py')
    # p('/System/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/lib-tk/Tix.py')
    #p('/home/liaoxinxi/trunk/src/bvs/login/test_system.py')
    #p('/home/liaoxinxi/trunk/src/www/npai/systemforpa.py')
    p('libssh2_login_test.py.bak')
    #p('arg.py')