pyparser.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. # Copyright 2022 Bill Wendling, All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """Simple Python Parser
  15. Parse Python code into a list of logical lines, represented by LogicalLine
  16. objects. This uses Python's tokenizer to generate the tokens. As such, YAPF must
  17. be run with the appropriate Python version---Python >=3.7 for Python 3.7 code,
  18. Python >=3.8 for Python 3.8 code, etc.
  19. This parser uses Python's native "tokenizer" module to generate a list of tokens
  20. for the source code. It then uses Python's native "ast" module to assign
  21. subtypes, calculate split penalties, etc.
  22. A "logical line" produced by Python's "tokenizer" module ends with a
  23. tokenize.NEWLINE, rather than a tokenize.NL, making it easy to separate them
out. Comments all end with a tokenize.NL, so we need to make sure we don't
  25. errantly pick up non-comment tokens when parsing comment blocks.
  26. ParseCode(): parse the code producing a list of logical lines.
  27. """
  28. # TODO: Call from yapf_api.FormatCode.
  29. import ast
  30. import codecs
  31. import os
  32. import token
  33. import tokenize
  34. from io import StringIO
  35. from tokenize import TokenInfo
  36. from yapf.pyparser import split_penalty_visitor
  37. from yapf.yapflib import format_token
  38. from yapf.yapflib import logical_line
  39. CONTINUATION = token.N_TOKENS
  40. def ParseCode(unformatted_source, filename='<unknown>'):
  41. """Parse a string of Python code into logical lines.
  42. This provides an alternative entry point to YAPF.
  43. Arguments:
  44. unformatted_source: (unicode) The code to format.
  45. filename: (unicode) The name of the file being reformatted.
  46. Returns:
  47. A list of LogicalLines.
  48. Raises:
  49. An exception is raised if there's an error during AST parsing.
  50. """
  51. if not unformatted_source.endswith(os.linesep):
  52. unformatted_source += os.linesep
  53. try:
  54. ast_tree = ast.parse(unformatted_source, filename)
  55. ast.fix_missing_locations(ast_tree)
  56. readline = StringIO(unformatted_source).readline
  57. tokens = tokenize.generate_tokens(readline)
  58. except Exception:
  59. raise
  60. logical_lines = _CreateLogicalLines(tokens)
  61. # Process the logical lines.
  62. split_penalty_visitor.SplitPenalty(logical_lines).visit(ast_tree)
  63. return logical_lines
  64. def _CreateLogicalLines(tokens):
  65. """Separate tokens into logical lines.
  66. Arguments:
  67. tokens: (list of tokenizer.TokenInfo) Tokens generated by tokenizer.
  68. Returns:
  69. A list of LogicalLines.
  70. """
  71. formatted_tokens = []
  72. # Convert tokens into "TokenInfo" and add tokens for continuation markers.
  73. prev_tok = None
  74. for tok in tokens:
  75. tok = TokenInfo(*tok)
  76. if (prev_tok and prev_tok.line.rstrip().endswith('\\') and
  77. prev_tok.start[0] < tok.start[0]):
  78. ctok = TokenInfo(
  79. type=CONTINUATION,
  80. string='\\',
  81. start=(prev_tok.start[0], prev_tok.start[1] + 1),
  82. end=(prev_tok.end[0], prev_tok.end[0] + 2),
  83. line=prev_tok.line)
  84. ctok.lineno = ctok.start[0]
  85. ctok.column = ctok.start[1]
  86. ctok.value = '\\'
  87. formatted_tokens.append(format_token.FormatToken(ctok, 'CONTINUATION'))
  88. tok.lineno = tok.start[0]
  89. tok.column = tok.start[1]
  90. tok.value = tok.string
  91. formatted_tokens.append(
  92. format_token.FormatToken(tok, token.tok_name[tok.type]))
  93. prev_tok = tok
  94. # Generate logical lines.
  95. logical_lines, cur_logical_line = [], []
  96. depth = 0
  97. for tok in formatted_tokens:
  98. if tok.type == tokenize.ENDMARKER:
  99. break
  100. if tok.type == tokenize.NEWLINE:
  101. # End of a logical line.
  102. logical_lines.append(logical_line.LogicalLine(depth, cur_logical_line))
  103. cur_logical_line = []
  104. elif tok.type == tokenize.INDENT:
  105. depth += 1
  106. elif tok.type == tokenize.DEDENT:
  107. depth -= 1
  108. elif tok.type == tokenize.NL:
  109. pass
  110. else:
  111. if (cur_logical_line and not tok.type == tokenize.COMMENT and
  112. cur_logical_line[0].type == tokenize.COMMENT):
  113. # We were parsing a comment block, but now we have real code to worry
  114. # about. Store the comment and carry on.
  115. logical_lines.append(logical_line.LogicalLine(depth, cur_logical_line))
  116. cur_logical_line = []
  117. cur_logical_line.append(tok)
  118. # Link the FormatTokens in each line together to form a doubly linked list.
  119. for line in logical_lines:
  120. previous = line.first
  121. bracket_stack = [previous] if previous.OpensScope() else []
  122. for tok in line.tokens[1:]:
  123. tok.previous_token = previous
  124. previous.next_token = tok
  125. previous = tok
  126. # Set up the "matching_bracket" attribute.
  127. if tok.OpensScope():
  128. bracket_stack.append(tok)
  129. elif tok.ClosesScope():
  130. bracket_stack[-1].matching_bracket = tok
  131. tok.matching_bracket = bracket_stack.pop()
  132. return logical_lines