format_str_tokenizer.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250
  1. """Tokenizers for three string formatting methods"""
  2. from __future__ import annotations
  3. from enum import Enum, unique
  4. from typing import Final
  5. from mypy.checkstrformat import (
  6. ConversionSpecifier,
  7. parse_conversion_specifiers,
  8. parse_format_value,
  9. )
  10. from mypy.errors import Errors
  11. from mypy.messages import MessageBuilder
  12. from mypy.nodes import Context, Expression
  13. from mypy.options import Options
  14. from mypyc.ir.ops import Integer, Value
  15. from mypyc.ir.rtypes import (
  16. c_pyssize_t_rprimitive,
  17. is_bytes_rprimitive,
  18. is_int_rprimitive,
  19. is_short_int_rprimitive,
  20. is_str_rprimitive,
  21. )
  22. from mypyc.irbuild.builder import IRBuilder
  23. from mypyc.primitives.bytes_ops import bytes_build_op
  24. from mypyc.primitives.int_ops import int_to_str_op
  25. from mypyc.primitives.str_ops import str_build_op, str_op
  26. @unique
  27. class FormatOp(Enum):
  28. """FormatOp represents conversion operations of string formatting during
  29. compile time.
  30. Compare to ConversionSpecifier, FormatOp has fewer attributes.
  31. For example, to mark a conversion from any object to string,
  32. ConversionSpecifier may have several representations, like '%s', '{}'
  33. or '{:{}}'. However, there would only exist one corresponding FormatOp.
  34. """
  35. STR = "s"
  36. INT = "d"
  37. BYTES = "b"
  38. def generate_format_ops(specifiers: list[ConversionSpecifier]) -> list[FormatOp] | None:
  39. """Convert ConversionSpecifier to FormatOp.
  40. Different ConversionSpecifiers may share a same FormatOp.
  41. """
  42. format_ops = []
  43. for spec in specifiers:
  44. # TODO: Match specifiers instead of using whole_seq
  45. if spec.whole_seq == "%s" or spec.whole_seq == "{:{}}":
  46. format_op = FormatOp.STR
  47. elif spec.whole_seq == "%d":
  48. format_op = FormatOp.INT
  49. elif spec.whole_seq == "%b":
  50. format_op = FormatOp.BYTES
  51. elif spec.whole_seq:
  52. return None
  53. else:
  54. format_op = FormatOp.STR
  55. format_ops.append(format_op)
  56. return format_ops
  57. def tokenizer_printf_style(format_str: str) -> tuple[list[str], list[FormatOp]] | None:
  58. """Tokenize a printf-style format string using regex.
  59. Return:
  60. A list of string literals and a list of FormatOps.
  61. """
  62. literals: list[str] = []
  63. specifiers: list[ConversionSpecifier] = parse_conversion_specifiers(format_str)
  64. format_ops = generate_format_ops(specifiers)
  65. if format_ops is None:
  66. return None
  67. last_end = 0
  68. for spec in specifiers:
  69. cur_start = spec.start_pos
  70. literals.append(format_str[last_end:cur_start])
  71. last_end = cur_start + len(spec.whole_seq)
  72. literals.append(format_str[last_end:])
  73. return literals, format_ops
# An empty Context, passed as the context argument to parse_format_value().
# It is not expected to be used, since the code has already passed
# type checking.
EMPTY_CONTEXT: Final = Context()
  77. def tokenizer_format_call(format_str: str) -> tuple[list[str], list[FormatOp]] | None:
  78. """Tokenize a str.format() format string.
  79. The core function parse_format_value() is shared with mypy.
  80. With these specifiers, we then parse the literal substrings
  81. of the original format string and convert `ConversionSpecifier`
  82. to `FormatOp`.
  83. Return:
  84. A list of string literals and a list of FormatOps. The literals
  85. are interleaved with FormatOps and the length of returned literals
  86. should be exactly one more than FormatOps.
  87. Return None if it cannot parse the string.
  88. """
  89. # Creates an empty MessageBuilder here.
  90. # It wouldn't be used since the code has passed the type-checking.
  91. specifiers = parse_format_value(
  92. format_str, EMPTY_CONTEXT, MessageBuilder(Errors(Options()), {})
  93. )
  94. if specifiers is None:
  95. return None
  96. format_ops = generate_format_ops(specifiers)
  97. if format_ops is None:
  98. return None
  99. literals: list[str] = []
  100. last_end = 0
  101. for spec in specifiers:
  102. # Skip { and }
  103. literals.append(format_str[last_end : spec.start_pos - 1])
  104. last_end = spec.start_pos + len(spec.whole_seq) + 1
  105. literals.append(format_str[last_end:])
  106. # Deal with escaped {{
  107. literals = [x.replace("{{", "{").replace("}}", "}") for x in literals]
  108. return literals, format_ops
  109. def convert_format_expr_to_str(
  110. builder: IRBuilder, format_ops: list[FormatOp], exprs: list[Expression], line: int
  111. ) -> list[Value] | None:
  112. """Convert expressions into string literal objects with the guidance
  113. of FormatOps. Return None when fails."""
  114. if len(format_ops) != len(exprs):
  115. return None
  116. converted = []
  117. for x, format_op in zip(exprs, format_ops):
  118. node_type = builder.node_type(x)
  119. if format_op == FormatOp.STR:
  120. if is_str_rprimitive(node_type):
  121. var_str = builder.accept(x)
  122. elif is_int_rprimitive(node_type) or is_short_int_rprimitive(node_type):
  123. var_str = builder.call_c(int_to_str_op, [builder.accept(x)], line)
  124. else:
  125. var_str = builder.call_c(str_op, [builder.accept(x)], line)
  126. elif format_op == FormatOp.INT:
  127. if is_int_rprimitive(node_type) or is_short_int_rprimitive(node_type):
  128. var_str = builder.call_c(int_to_str_op, [builder.accept(x)], line)
  129. else:
  130. return None
  131. else:
  132. return None
  133. converted.append(var_str)
  134. return converted
  135. def join_formatted_strings(
  136. builder: IRBuilder, literals: list[str] | None, substitutions: list[Value], line: int
  137. ) -> Value:
  138. """Merge the list of literals and the list of substitutions
  139. alternatively using 'str_build_op'.
  140. `substitutions` is the result value of formatting conversions.
  141. If the `literals` is set to None, we simply join the substitutions;
  142. Otherwise, the `literals` is the literal substrings of the original
  143. format string and its length should be exactly one more than
  144. substitutions.
  145. For example:
  146. (1) 'This is a %s and the value is %d'
  147. -> literals: ['This is a ', ' and the value is', '']
  148. (2) '{} and the value is {}'
  149. -> literals: ['', ' and the value is', '']
  150. """
  151. # The first parameter for str_build_op is the total size of
  152. # the following PyObject*
  153. result_list: list[Value] = [Integer(0, c_pyssize_t_rprimitive)]
  154. if literals is not None:
  155. for a, b in zip(literals, substitutions):
  156. if a:
  157. result_list.append(builder.load_str(a))
  158. result_list.append(b)
  159. if literals[-1]:
  160. result_list.append(builder.load_str(literals[-1]))
  161. else:
  162. result_list.extend(substitutions)
  163. # Special case for empty string and literal string
  164. if len(result_list) == 1:
  165. return builder.load_str("")
  166. if not substitutions and len(result_list) == 2:
  167. return result_list[1]
  168. result_list[0] = Integer(len(result_list) - 1, c_pyssize_t_rprimitive)
  169. return builder.call_c(str_build_op, result_list, line)
  170. def convert_format_expr_to_bytes(
  171. builder: IRBuilder, format_ops: list[FormatOp], exprs: list[Expression], line: int
  172. ) -> list[Value] | None:
  173. """Convert expressions into bytes literal objects with the guidance
  174. of FormatOps. Return None when fails."""
  175. if len(format_ops) != len(exprs):
  176. return None
  177. converted = []
  178. for x, format_op in zip(exprs, format_ops):
  179. node_type = builder.node_type(x)
  180. # conversion type 's' is an alias of 'b' in bytes formatting
  181. if format_op == FormatOp.BYTES or format_op == FormatOp.STR:
  182. if is_bytes_rprimitive(node_type):
  183. var_bytes = builder.accept(x)
  184. else:
  185. return None
  186. else:
  187. return None
  188. converted.append(var_bytes)
  189. return converted
  190. def join_formatted_bytes(
  191. builder: IRBuilder, literals: list[str], substitutions: list[Value], line: int
  192. ) -> Value:
  193. """Merge the list of literals and the list of substitutions
  194. alternatively using 'bytes_build_op'."""
  195. result_list: list[Value] = [Integer(0, c_pyssize_t_rprimitive)]
  196. for a, b in zip(literals, substitutions):
  197. if a:
  198. result_list.append(builder.load_bytes_from_str_literal(a))
  199. result_list.append(b)
  200. if literals[-1]:
  201. result_list.append(builder.load_bytes_from_str_literal(literals[-1]))
  202. # Special case for empty bytes and literal
  203. if len(result_list) == 1:
  204. return builder.load_bytes_from_str_literal("")
  205. if not substitutions and len(result_list) == 2:
  206. return result_list[1]
  207. result_list[0] = Integer(len(result_list) - 1, c_pyssize_t_rprimitive)
  208. return builder.call_c(bytes_build_op, result_list, line)