Michael White | 18c2844 | 2017-02-02 20:02:20 -0800 | [diff] [blame] | 1 | #------------------------------------------------------------------------------ |
| 2 | # pycparser: c_json.py |
| 3 | # |
| 4 | # by Michael White (@mypalmike) |
| 5 | # |
| 6 | # This example includes functions to serialize and deserialize an ast |
| 7 | # to and from json format. Serializing involves walking the ast and converting |
| 8 | # each node from a python Node object into a python dict. Deserializing |
| 9 | # involves the opposite conversion, walking the tree formed by the |
| 10 | # dict and converting each dict into the specific Node object it represents. |
| 11 | # The dict itself is serialized and deserialized using the python json module. |
| 12 | # |
| 13 | # The dict representation is a fairly direct transformation of the object |
| 14 | # attributes. Each node in the dict gets one metadata field referring to the |
| 15 | # specific node class name, _nodetype. Each local attribute (i.e. not linking |
| 16 | # to child nodes) has a string value or array of string values. Each child |
| 17 | # attribute is either another dict or an array of dicts, exactly as in the |
| 18 | # Node object representation. The "coord" attribute, representing the |
| 19 | # node's location within the source code, is serialized/deserialized from |
| 20 | # a Coord object into a string of the format "filename:line[:column]". |
| 21 | # |
| 22 | # Example TypeDecl node, with IdentifierType child node, represented as a dict: |
| 23 | # "type": { |
| 24 | # "_nodetype": "TypeDecl", |
| 25 | # "coord": "c_files/funky.c:8", |
| 26 | # "declname": "o", |
| 27 | # "quals": [], |
| 28 | # "type": { |
| 29 | # "_nodetype": "IdentifierType", |
| 30 | # "coord": "c_files/funky.c:8", |
| 31 | # "names": [ |
| 32 | # "char" |
| 33 | # ] |
| 34 | # } |
| 35 | # } |
| 36 | #------------------------------------------------------------------------------ |
| 37 | from __future__ import print_function |
| 38 | |
| 39 | import json |
| 40 | import sys |
| 41 | import re |
| 42 | |
| 43 | # This is not required if you've installed pycparser into |
| 44 | # your site-packages/ with setup.py |
| 45 | # |
| 46 | sys.path.extend(['.', '..']) |
| 47 | |
| 48 | from pycparser import parse_file, c_ast |
| 49 | from pycparser.plyparser import Coord |
| 50 | |
| 51 | |
# Matches array-style child names such as 'block_items[1]'; group 1 is the
# array name and group 2 is the text between the brackets (the index).
RE_CHILD_ARRAY = re.compile(r'(.*)\[(.*)\]')
# Used with .match() (anchored at the start only) to recognise dunder-style
# names, i.e. names that begin with '__' and contain a later '__'.
RE_INTERNAL_ATTR = re.compile('__.*__')
| 54 | |
| 55 | |
class CJsonError(Exception):
    """ Error raised by this module when an ast cannot be converted. """
| 58 | |
| 59 | |
def memodict(fn):
    """
        Memoization decorator for a function taking a single (hashable)
        argument.

        The cache is a dict subclass: a lookup of an unseen key lands in
        __missing__, which calls fn, stores the result and returns it.
        The decorated callable is simply the cache's bound __getitem__.

    """
    class _MemoCache(dict):
        def __missing__(self, key):
            value = fn(key)
            self[key] = value
            return value

    cache = _MemoCache()
    return cache.__getitem__
| 67 | |
| 68 | |
@memodict
def child_attrs_of(klass):
    """
        Return the set of child attribute names of a Node class.

        These are the entries of klass.__slots__ that are neither
        dunder-style internal names nor listed in klass.attr_names.
        The result is memoized per class to avoid repeating the same
        string matching for every node.

    """
    public_slots = {
        slot for slot in klass.__slots__
        if RE_INTERNAL_ATTR.match(slot) is None
    }
    return public_slots.difference(klass.attr_names)
| 79 | |
| 80 | |
def to_dict(node):
    """
        Recursively convert an ast into dict representation.

        The resulting dict holds the node's class name under '_nodetype',
        one entry per name in the class's attr_names, the stringified
        coord (or None), and one entry per child attribute: a nested dict,
        a list of nested dicts, or None when the child is absent.

        Raises CJsonError if array children are not yielded in index order.

    """
    node_cls = node.__class__

    # Metadata first, followed by the node's local (non-child) attributes.
    result = {'_nodetype': node_cls.__name__}
    for attr_name in node_cls.attr_names:
        result[attr_name] = getattr(node, attr_name)

    # Coord objects are stored in their string form.
    result['coord'] = str(node.coord) if node.coord else None

    # Child names are either simple (e.g. 'value') or array elements
    # (e.g. 'block_items[1]').
    for child_name, child in node.children():
        array_match = RE_CHILD_ARRAY.match(child_name)
        if array_match is None:
            result[child_name] = to_dict(child)
            continue

        array_name, index_text = array_match.groups()
        position = int(index_text)
        # Array elements are expected in order, so verify and append.
        siblings = result.setdefault(array_name, [])
        if position != len(siblings):
            raise CJsonError('Internal ast error. Array {} out of order. '
                             'Expected index {}, got {}'.format(
                                 array_name, len(siblings), position))
        siblings.append(to_dict(child))

    # Child attributes that produced no entry still need a None value.
    for child_attr in child_attrs_of(node_cls):
        result.setdefault(child_attr, None)

    return result
| 123 | |
| 124 | |
def to_json(node, **kwargs):
    """
        Convert ast node to json string.

        Any keyword arguments are forwarded unchanged to json.dumps.

    """
    ast_dict = to_dict(node)
    return json.dumps(ast_dict, **kwargs)
| 128 | |
| 129 | |
def file_to_dict(filename):
    """
        Parse the C file at filename (parse_file with use_cpp=True) and
        return the dict representation of its ast.

    """
    return to_dict(parse_file(filename, use_cpp=True))
| 134 | |
| 135 | |
def file_to_json(filename, **kwargs):
    """
        Parse the C file at filename (parse_file with use_cpp=True) and
        return its ast as a json string.

        Keyword arguments are passed through to to_json.

    """
    return to_json(parse_file(filename, use_cpp=True), **kwargs)
| 140 | |
| 141 | |
def _parse_coord(coord_str):
    """
        Parse coord string (file:line[:column]) into Coord object.

        Returns None when coord_str is None.

        The line and column are taken from the trailing numeric fields of
        the string and converted to int, so a filename that itself contains
        a colon (e.g. a drive letter) stays intact. Strings without a
        numeric line field fall back to a plain split on ':' with the
        missing parts set to None.

    """
    if coord_str is None:
        return None

    # Lazily match the filename, then one or two trailing numeric fields.
    match = re.match(r'(.*?):(\d+)(?::(\d+))?\Z', coord_str)
    if match:
        filename, line, column = match.groups()
        if column is not None:
            column = int(column)
        return Coord(filename, int(line), column)

    # No numeric line present: keep the lenient positional behaviour.
    vals = coord_str.split(':')
    vals.extend([None] * 3)
    filename, line, column = vals[:3]
    return Coord(filename, line, column)
| 151 | |
| 152 | |
| 153 | def _convert_to_obj(value): |
| 154 | """ |
| 155 | Convert an object in the dict representation into an object. |
| 156 | Note: Mutually recursive with from_dict. |
| 157 | |
| 158 | """ |
| 159 | value_type = type(value) |
| 160 | if value_type == dict: |
| 161 | return from_dict(value) |
| 162 | elif value_type == list: |
| 163 | return [_convert_to_obj(item) for item in value] |
| 164 | else: |
| 165 | # String |
| 166 | return value |
| 167 | |
| 168 | |
def from_dict(node_dict):
    """
        Recursively build an ast from dict representation.

        node_dict must contain a '_nodetype' key naming a class in c_ast
        (KeyError if the key is missing, AttributeError if c_ast has no
        such name). Every other key is passed to that class's constructor
        as a keyword argument: 'coord' is parsed with _parse_coord and all
        other values go through _convert_to_obj.

        The input dict is left unmodified.

    """
    # Read (rather than pop) the metadata key so the caller's dict can be
    # reused after the conversion.
    class_name = node_dict['_nodetype']

    klass = getattr(c_ast, class_name)

    # Create a new dict containing the key-value pairs which we can pass
    # to node constructors.
    objs = {}
    for key, value in node_dict.items():
        if key == '_nodetype':
            continue
        if key == 'coord':
            objs[key] = _parse_coord(value)
        else:
            objs[key] = _convert_to_obj(value)

    # Use keyword parameters, which works thanks to beautifully consistent
    # ast Node initializers.
    return klass(**objs)
| 187 | |
| 188 | |
| 189 | def from_json(ast_json): |
| 190 | """ Build an ast from json string representation """ |
| 191 | return from_dict(json.loads(ast_json)) |
| 192 | |
| 193 | |
| 194 | #------------------------------------------------------------------------------ |
if __name__ == "__main__":
    if len(sys.argv) <= 1:
        print("Please provide a filename as argument")
    else:
        # Some test code...
        # Round trip C -> ast -> dict -> ast -> json, then print the json.
        ast_dict = file_to_dict(sys.argv[1])
        ast = from_dict(ast_dict)
        print(to_json(ast, sort_keys=True, indent=4))