comment_splicer.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365
  1. # Copyright 2015 Google Inc. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """Comment splicer for lib2to3 trees.
  15. The lib2to3 syntax tree produced by the parser holds comments and whitespace in
  16. prefix attributes of nodes, rather than nodes themselves. This module provides
  17. functionality to splice comments out of prefixes and into nodes of their own,
  18. making them easier to process.
  19. SpliceComments(): the main function exported by this module.
  20. """
  21. from yapf_third_party._ylib2to3 import pygram
  22. from yapf_third_party._ylib2to3 import pytree
  23. from yapf_third_party._ylib2to3.pgen2 import token
  24. from yapf.pytree import pytree_utils
def SpliceComments(tree):
  """Given a pytree, splice comments into nodes of their own right.

  Extract comments from the prefixes where they are housed after parsing.
  The prefixes that previously housed the comments become empty.

  The tree is first annotated by _AnnotateIndents() and then walked
  recursively. For every leaf whose prefix, ignoring leading whitespace,
  starts with '#', the prefix is cleared and the nodes built by
  _CreateCommentsFromPrefix() are inserted into the tree. Where they are
  inserted depends on the leaf: NEWLINE leaves, DEDENT leaves and all other
  leaves are each handled by a separate branch below.

  Args:
    tree: a pytree.Node - the tree to work on. The tree is modified by this
      function.

  Returns:
    None. The tree is modified in place.
  """
  # The previous leaf node encountered in the traversal.
  # This is a list because Python 2.x doesn't have 'nonlocal' :)
  prev_leaf = [None]
  # _FindAncestorAtIndent() reads the annotations written here, so this has to
  # run before the traversal.
  _AnnotateIndents(tree)

  def _VisitNodeRec(node):
    """Recursively visit each node to splice comments into the AST."""
    # This loop may insert into node.children, so we'll iterate over a copy.
    for child in node.children[:]:
      if isinstance(child, pytree.Node):
        # Nodes don't have prefixes.
        _VisitNodeRec(child)
      else:
        if child.prefix.lstrip().startswith('#'):
          # We have a comment prefix in this child, so splicing is needed.
          comment_prefix = child.prefix
          # The prefix carries no line number of its own; step back one line
          # per newline it contains to get the line on which it starts.
          comment_lineno = child.lineno - comment_prefix.count('\n')
          comment_column = child.column

          # Remember the leading indentation of this prefix and clear it.
          # Mopping up the prefix is important because we may go over this same
          # child in the next iteration...
          # NOTE(review): prefix_indent is computed here but is not read
          # anywhere else in this function.
          child_prefix = child.prefix.lstrip('\n')
          prefix_indent = child_prefix[:child_prefix.find('#')]
          if '\n' in prefix_indent:
            prefix_indent = prefix_indent[prefix_indent.rfind('\n') + 1:]
          child.prefix = ''

          if child.type == token.NEWLINE:
            # If the prefix was on a NEWLINE leaf, it's part of the line so it
            # will be inserted after the previously encountered leaf.
            # We can't just insert it before the NEWLINE node, because as a
            # result of the way pytrees are organized, this node can be under
            # an inappropriate parent.
            #
            # The column is moved left by the length of the prefix text with
            # its leading whitespace removed.
            comment_column -= len(comment_prefix.lstrip())
            pytree_utils.InsertNodesAfter(
                _CreateCommentsFromPrefix(
                    comment_prefix,
                    comment_lineno,
                    comment_column,
                    standalone=False), prev_leaf[0])
          elif child.type == token.DEDENT:
            # Comment prefixes on DEDENT nodes also deserve special treatment,
            # because their final placement depends on their prefix.
            # We'll look for an ancestor of this child with a matching
            # indentation, and insert the comment before it if the ancestor is
            # on a DEDENT node and after it otherwise.
            #
            # lib2to3 places comments that should be separated into the same
            # DEDENT node. For example, "comment 1" and "comment 2" will be
            # combined.
            #
            #   def _():
            #     for x in y:
            #       pass
            #       # comment 1
            #
            #     # comment 2
            #     pass
            #
            # In this case, we need to split them up ourselves.

            # Split into groups of comments at decreasing levels of indentation.
            # Each group is a (column, indent string, lines) tuple; a new group
            # is opened whenever a comment starts at a smaller column than the
            # current group's.
            comment_groups = []
            comment_column = None
            for cmt in comment_prefix.split('\n'):
              col = cmt.find('#')
              if col < 0:
                if comment_column is None:
                  # Skip empty lines at the top of the first comment group
                  comment_lineno += 1
                  continue
              elif comment_column is None or col < comment_column:
                comment_column = col
                comment_indent = cmt[:comment_column]
                comment_groups.append((comment_column, comment_indent, []))
              # Lines without '#' that follow a comment stay in the current
              # group, so they are counted when comment_lineno is advanced
              # below.
              comment_groups[-1][-1].append(cmt)

            # Insert a node for each group
            for comment_column, comment_indent, comment_group in comment_groups:
              ancestor_at_indent = _FindAncestorAtIndent(child, comment_indent)
              if ancestor_at_indent.type == token.DEDENT:
                InsertNodes = pytree_utils.InsertNodesBefore  # pylint: disable=invalid-name # noqa
              else:
                InsertNodes = pytree_utils.InsertNodesAfter  # pylint: disable=invalid-name # noqa
              InsertNodes(
                  _CreateCommentsFromPrefix(
                      '\n'.join(comment_group) + '\n',
                      comment_lineno,
                      comment_column,
                      standalone=True), ancestor_at_indent)
              # The next group starts right after the lines of this one.
              comment_lineno += len(comment_group)
          else:
            # Otherwise there are two cases.
            #
            # 1. The comment is on its own line
            # 2. The comment is part of an expression.
            #
            # Unfortunately, it's fairly difficult to distinguish between the
            # two in lib2to3 trees. The algorithm here is to determine whether
            # child is the first leaf in the statement it belongs to. If it is,
            # then the comment (which is a prefix) belongs on a separate line.
            # If it is not, it means the comment is buried deep in the statement
            # and is part of some expression.
            stmt_parent = _FindStmtParent(child)

            # Only the first leaf that is not a NEWLINE matters: both branches
            # that handle it end with 'break'.
            for leaf_in_parent in stmt_parent.leaves():
              if leaf_in_parent.type == token.NEWLINE:
                continue
              elif id(leaf_in_parent) == id(child):
                # This comment stands on its own line, and it has to be inserted
                # into the appropriate parent. We'll have to find a suitable
                # parent to insert into. See comments above
                # _STANDALONE_LINE_NODES for more details.
                node_with_line_parent = _FindNodeWithStandaloneLineParent(child)

                if pytree_utils.NodeName(
                    node_with_line_parent.parent) in {'funcdef', 'classdef'}:
                  # Keep a comment that's not attached to a function or class
                  # next to the object it is attached to.
                  # comment_end is the line of the last comment line, with
                  # trailing newlines of the prefix ignored. If it is more than
                  # one line above the node, the comment is inserted before the
                  # funcdef/classdef itself instead.
                  comment_end = (
                      comment_lineno + comment_prefix.rstrip('\n').count('\n'))
                  if comment_end < node_with_line_parent.lineno - 1:
                    node_with_line_parent = node_with_line_parent.parent

                pytree_utils.InsertNodesBefore(
                    _CreateCommentsFromPrefix(
                        comment_prefix, comment_lineno, 0, standalone=True),
                    node_with_line_parent)
                break
              else:
                # The child is not the first leaf of its statement, so the
                # comment sits inside the statement.
                if comment_lineno == prev_leaf[0].lineno:
                  # The prefix starts on the same line as the previous leaf:
                  # its first line becomes a COMMENT leaf placed right after
                  # that leaf, and only the remaining lines are handled below.
                  comment_lines = comment_prefix.splitlines()
                  value = comment_lines[0].lstrip()
                  if value.rstrip('\n'):
                    # Column = end of the previous leaf plus the whitespace
                    # that precedes the comment text.
                    comment_column = prev_leaf[0].column
                    comment_column += len(prev_leaf[0].value)
                    comment_column += (
                        len(comment_lines[0]) - len(comment_lines[0].lstrip()))
                    comment_leaf = pytree.Leaf(
                        type=token.COMMENT,
                        value=value.rstrip('\n'),
                        context=('', (comment_lineno, comment_column)))
                    pytree_utils.InsertNodesAfter([comment_leaf], prev_leaf[0])
                    comment_prefix = '\n'.join(comment_lines[1:])
                    comment_lineno += 1

                # The column for the remaining comments is the width of the
                # leading whitespace on the last line of the prefix (trailing
                # whitespace of the prefix is ignored when locating that line).
                rindex = (0 if '\n' not in comment_prefix.rstrip() else
                          comment_prefix.rstrip().rindex('\n') + 1)
                comment_column = (
                    len(comment_prefix[rindex:]) -
                    len(comment_prefix[rindex:].lstrip()))
                comments = _CreateCommentsFromPrefix(
                    comment_prefix,
                    comment_lineno,
                    comment_column,
                    standalone=False)
                pytree_utils.InsertNodesBefore(comments, child)
                break

        # Every leaf, with or without a comment prefix, becomes the "previous
        # leaf" for the one visited next.
        prev_leaf[0] = child

  _VisitNodeRec(tree)
  185. def _CreateCommentsFromPrefix(comment_prefix,
  186. comment_lineno,
  187. comment_column,
  188. standalone=False):
  189. """Create pytree nodes to represent the given comment prefix.
  190. Args:
  191. comment_prefix: (unicode) the text of the comment from the node's prefix.
  192. comment_lineno: (int) the line number for the start of the comment.
  193. comment_column: (int) the column for the start of the comment.
  194. standalone: (bool) determines if the comment is standalone or not.
  195. Returns:
  196. The simple_stmt nodes if this is a standalone comment, otherwise a list of
  197. new COMMENT leafs. The prefix may consist of multiple comment blocks,
  198. separated by blank lines. Each block gets its own leaf.
  199. """
  200. # The comment is stored in the prefix attribute, with no lineno of its
  201. # own. So we only know at which line it ends. To find out at which line it
  202. # starts, look at how many newlines the comment itself contains.
  203. comments = []
  204. lines = comment_prefix.split('\n')
  205. index = 0
  206. while index < len(lines):
  207. comment_block = []
  208. while index < len(lines) and lines[index].lstrip().startswith('#'):
  209. comment_block.append(lines[index].strip())
  210. index += 1
  211. if comment_block:
  212. new_lineno = comment_lineno + index - 1
  213. comment_block[0] = comment_block[0].strip()
  214. comment_block[-1] = comment_block[-1].strip()
  215. comment_leaf = pytree.Leaf(
  216. type=token.COMMENT,
  217. value='\n'.join(comment_block),
  218. context=('', (new_lineno, comment_column)))
  219. comment_node = comment_leaf if not standalone else pytree.Node(
  220. pygram.python_symbols.simple_stmt, [comment_leaf])
  221. comments.append(comment_node)
  222. while index < len(lines) and not lines[index].lstrip():
  223. index += 1
  224. return comments
  225. # "Standalone line nodes" are tree nodes that have to start a new line in Python
  226. # code (and cannot follow a ';' or ':'). Other nodes, like 'expr_stmt', serve as
  227. # parents of other nodes but can come later in a line. This is a list of
  228. # standalone line nodes in the grammar. It is meant to be exhaustive
  229. # *eventually*, and we'll modify it with time as we discover more corner cases
  230. # in the parse tree.
  231. #
  232. # When splicing a standalone comment (i.e. a comment that appears on its own
  233. # line, not on the same line with other code), it's important to insert it into
  234. # an appropriate parent of the node it's attached to. An appropriate parent
  235. # is the first "standalone line node" in the parent chain of a node.
  236. _STANDALONE_LINE_NODES = frozenset([
  237. 'suite', 'if_stmt', 'while_stmt', 'for_stmt', 'try_stmt', 'with_stmt',
  238. 'funcdef', 'classdef', 'decorated', 'file_input'
  239. ])
  240. def _FindNodeWithStandaloneLineParent(node):
  241. """Find a node whose parent is a 'standalone line' node.
  242. See the comment above _STANDALONE_LINE_NODES for more details.
  243. Arguments:
  244. node: node to start from
  245. Returns:
  246. Suitable node that's either the node itself or one of its ancestors.
  247. """
  248. if pytree_utils.NodeName(node.parent) in _STANDALONE_LINE_NODES:
  249. return node
  250. else:
  251. # This is guaranteed to terminate because 'file_input' is the root node of
  252. # any pytree.
  253. return _FindNodeWithStandaloneLineParent(node.parent)
  254. # "Statement nodes" are standalone statements. The don't have to start a new
  255. # line.
  256. _STATEMENT_NODES = frozenset(['simple_stmt']) | _STANDALONE_LINE_NODES
  257. def _FindStmtParent(node):
  258. """Find the nearest parent of node that is a statement node.
  259. Arguments:
  260. node: node to start from
  261. Returns:
  262. Nearest parent (or node itself, if suitable).
  263. """
  264. if pytree_utils.NodeName(node) in _STATEMENT_NODES:
  265. return node
  266. else:
  267. return _FindStmtParent(node.parent)
  268. def _FindAncestorAtIndent(node, indent):
  269. """Find an ancestor of node with the given indentation.
  270. Arguments:
  271. node: node to start from. This must not be the tree root.
  272. indent: indentation string for the ancestor we're looking for.
  273. See _AnnotateIndents for more details.
  274. Returns:
  275. An ancestor node with suitable indentation. If no suitable ancestor is
  276. found, the closest ancestor to the tree root is returned.
  277. """
  278. if node.parent.parent is None:
  279. # Our parent is the tree root, so there's nowhere else to go.
  280. return node
  281. # If the parent has an indent annotation, and it's shorter than node's
  282. # indent, this is a suitable ancestor.
  283. # The reason for "shorter" rather than "equal" is that comments may be
  284. # improperly indented (i.e. by three spaces, where surrounding statements
  285. # have either zero or two or four), and we don't want to propagate them all
  286. # the way to the root.
  287. parent_indent = pytree_utils.GetNodeAnnotation(
  288. node.parent, pytree_utils.Annotation.CHILD_INDENT)
  289. if parent_indent is not None and indent.startswith(parent_indent):
  290. return node
  291. else:
  292. # Keep looking up the tree.
  293. return _FindAncestorAtIndent(node.parent, indent)
  294. def _AnnotateIndents(tree):
  295. """Annotate the tree with child_indent annotations.
  296. A child_indent annotation on a node specifies the indentation (as a string,
  297. like " ") of its children. It is inferred from the INDENT child of a node.
  298. Arguments:
  299. tree: root of a pytree. The pytree is modified to add annotations to nodes.
  300. Raises:
  301. RuntimeError: if the tree is malformed.
  302. """
  303. # Annotate the root of the tree with zero indent.
  304. if tree.parent is None:
  305. pytree_utils.SetNodeAnnotation(tree, pytree_utils.Annotation.CHILD_INDENT,
  306. '')
  307. for child in tree.children:
  308. if child.type == token.INDENT:
  309. child_indent = pytree_utils.GetNodeAnnotation(
  310. tree, pytree_utils.Annotation.CHILD_INDENT)
  311. if child_indent is not None and child_indent != child.value:
  312. raise RuntimeError('inconsistent indentation for child', (tree, child))
  313. pytree_utils.SetNodeAnnotation(tree, pytree_utils.Annotation.CHILD_INDENT,
  314. child.value)
  315. _AnnotateIndents(child)