| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250 |
- """Tokenizers for three string formatting methods"""
- from __future__ import annotations
- from enum import Enum, unique
- from typing import Final
- from mypy.checkstrformat import (
- ConversionSpecifier,
- parse_conversion_specifiers,
- parse_format_value,
- )
- from mypy.errors import Errors
- from mypy.messages import MessageBuilder
- from mypy.nodes import Context, Expression
- from mypy.options import Options
- from mypyc.ir.ops import Integer, Value
- from mypyc.ir.rtypes import (
- c_pyssize_t_rprimitive,
- is_bytes_rprimitive,
- is_int_rprimitive,
- is_short_int_rprimitive,
- is_str_rprimitive,
- )
- from mypyc.irbuild.builder import IRBuilder
- from mypyc.primitives.bytes_ops import bytes_build_op
- from mypyc.primitives.int_ops import int_to_str_op
- from mypyc.primitives.str_ops import str_build_op, str_op
@unique
class FormatOp(Enum):
    """Compile-time conversion operation used by string formatting.

    Compared to ConversionSpecifier, a FormatOp carries far less
    information. A conversion from an arbitrary object to a string, for
    instance, can be spelled as '%s', '{}' or '{:{}}' at the
    ConversionSpecifier level, yet all of those spellings correspond to
    one and the same FormatOp.
    """

    # Conversion to a string.
    STR = "s"
    # Integer conversion.
    INT = "d"
    # Bytes conversion.
    BYTES = "b"
def generate_format_ops(specifiers: list[ConversionSpecifier]) -> list[FormatOp] | None:
    """Map every ConversionSpecifier to its FormatOp.

    Several specifier spellings may map to the same FormatOp.

    A specifier whose whole_seq is empty is treated as a plain string
    conversion. Return None as soon as a specifier with a non-empty,
    unrecognized whole_seq is encountered.
    """
    # TODO: Match specifiers instead of using whole_seq
    known_ops = {
        "%s": FormatOp.STR,
        "{:{}}": FormatOp.STR,
        "%d": FormatOp.INT,
        "%b": FormatOp.BYTES,
    }

    ops: list[FormatOp] = []
    for specifier in specifiers:
        seq = specifier.whole_seq
        op = known_ops.get(seq)
        if op is None:
            if seq:
                # Unsupported conversion: give up on the whole format string.
                return None
            # Empty sequence falls back to string conversion.
            op = FormatOp.STR
        ops.append(op)
    return ops
def tokenizer_printf_style(format_str: str) -> tuple[list[str], list[FormatOp]] | None:
    """Split a printf-style format string into literals and FormatOps.

    The conversion specifiers are located by parse_conversion_specifiers().

    Return:
        A pair (literals, format_ops): the literal substrings found
        before, between and after the specifiers, and one FormatOp per
        specifier. Return None if generate_format_ops() cannot convert
        one of the specifiers.
    """
    specs: list[ConversionSpecifier] = parse_conversion_specifiers(format_str)
    ops = generate_format_ops(specs)
    if ops is None:
        return None

    pieces: list[str] = []
    cursor = 0
    for spec in specs:
        begin = spec.start_pos
        # Literal text between the previous specifier and this one.
        pieces.append(format_str[cursor:begin])
        cursor = begin + len(spec.whole_seq)
    # Whatever follows the last specifier (possibly an empty string).
    pieces.append(format_str[cursor:])

    return pieces, ops
# An empty Context, passed as an argument to parse_format_value().
# It is not expected to be used, since the code being compiled has
# already passed type checking.
EMPTY_CONTEXT: Final = Context()
def tokenizer_format_call(format_str: str) -> tuple[list[str], list[FormatOp]] | None:
    """Split a str.format() format string into literals and FormatOps.

    The specifiers come from parse_format_value(), which is shared with
    mypy. Using their positions, the literal substrings of the original
    format string are cut out and every `ConversionSpecifier` is
    converted to a `FormatOp`.

    Return:
        A pair (literals, format_ops). The literals are interleaved with
        the FormatOps, so there should be exactly one more literal than
        there are FormatOps.
        Return None if the string cannot be parsed or converted.
    """
    # A throwaway MessageBuilder: it is not expected to be used, since
    # the code has already passed type checking.
    throwaway_messages = MessageBuilder(Errors(Options()), {})
    specs = parse_format_value(format_str, EMPTY_CONTEXT, throwaway_messages)
    if specs is None:
        return None

    ops = generate_format_ops(specs)
    if ops is None:
        return None

    raw_pieces: list[str] = []
    cursor = 0
    for spec in specs:
        # The -1 / +1 offsets skip the '{' and '}' around the specifier.
        raw_pieces.append(format_str[cursor : spec.start_pos - 1])
        cursor = spec.start_pos + len(spec.whole_seq) + 1
    raw_pieces.append(format_str[cursor:])

    # Collapse the escaped braces '{{' and '}}' into single braces.
    unescaped = [piece.replace("{{", "{").replace("}}", "}") for piece in raw_pieces]

    return unescaped, ops
def convert_format_expr_to_str(
    builder: IRBuilder, format_ops: list[FormatOp], exprs: list[Expression], line: int
) -> list[Value] | None:
    """Turn expressions into string values as directed by the FormatOps.

    Each expression is paired with the FormatOp at the same index:
      * STR: str values are used as is, ints go through int_to_str_op,
        and anything else goes through str_op.
      * INT: only ints are supported; they go through int_to_str_op.

    Return None if the two lists differ in length, or if some
    (FormatOp, expression type) combination is not supported.
    """
    if len(exprs) != len(format_ops):
        return None

    results = []
    for expr, op in zip(exprs, format_ops):
        rtype = builder.node_type(expr)
        if op not in (FormatOp.STR, FormatOp.INT):
            return None

        is_integer = is_int_rprimitive(rtype) or is_short_int_rprimitive(rtype)
        if op == FormatOp.INT and not is_integer:
            return None

        if op == FormatOp.STR and is_str_rprimitive(rtype):
            # Already a string: no conversion call is needed.
            value = builder.accept(expr)
        else:
            # Ints get the dedicated int-to-str primitive; everything
            # else (only reachable for STR) falls back to str_op.
            conversion = int_to_str_op if is_integer else str_op
            value = builder.call_c(conversion, [builder.accept(expr)], line)
        results.append(value)
    return results
def join_formatted_strings(
    builder: IRBuilder, literals: list[str] | None, substitutions: list[Value], line: int
) -> Value:
    """Interleave literals and substitutions and join them with 'str_build_op'.

    `substitutions` holds the result values of the formatting conversions.

    If `literals` is None, the substitutions are simply joined. Otherwise
    `literals` holds the literal substrings of the original format string,
    and its length should be exactly one more than that of `substitutions`.
    Empty literals are skipped.

    For example:
    (1) 'This is a %s and the value is %d'
        -> literals: ['This is a ', ' and the value is ', '']
    (2) '{} and the value is {}'
        -> literals: ['', ' and the value is ', '']
    """
    pieces: list[Value] = []
    if literals is None:
        pieces.extend(substitutions)
    else:
        for text, value in zip(literals, substitutions):
            if text:
                pieces.append(builder.load_str(text))
            pieces.append(value)
        trailing = literals[-1]
        if trailing:
            pieces.append(builder.load_str(trailing))

    # Special cases: nothing to join, or a single literal string.
    if not pieces:
        return builder.load_str("")
    if len(pieces) == 1 and not substitutions:
        return pieces[0]

    # The first argument of str_build_op is the number of PyObject*
    # values that follow it.
    count = Integer(len(pieces), c_pyssize_t_rprimitive)
    return builder.call_c(str_build_op, [count, *pieces], line)
def convert_format_expr_to_bytes(
    builder: IRBuilder, format_ops: list[FormatOp], exprs: list[Expression], line: int
) -> list[Value] | None:
    """Turn expressions into bytes values as directed by the FormatOps.

    Only expressions of a bytes type paired with a BYTES or STR FormatOp
    are supported; they are used as is.

    Return None if the two lists differ in length, or if any pair is
    not supported.
    """
    if len(exprs) != len(format_ops):
        return None

    results = []
    for expr, op in zip(exprs, format_ops):
        rtype = builder.node_type(expr)
        # conversion type 's' is an alias of 'b' in bytes formatting
        if op not in (FormatOp.BYTES, FormatOp.STR):
            return None
        if not is_bytes_rprimitive(rtype):
            return None
        results.append(builder.accept(expr))
    return results
def join_formatted_bytes(
    builder: IRBuilder, literals: list[str], substitutions: list[Value], line: int
) -> Value:
    """Interleave literals and substitutions and join them with 'bytes_build_op'.

    Each non-empty literal is loaded with load_bytes_from_str_literal()
    and placed before the substitution at the same index; the last
    literal, if non-empty, is appended at the end.
    """
    pieces: list[Value] = []
    for text, value in zip(literals, substitutions):
        if text:
            pieces.append(builder.load_bytes_from_str_literal(text))
        pieces.append(value)
    trailing = literals[-1]
    if trailing:
        pieces.append(builder.load_bytes_from_str_literal(trailing))

    # Special cases: nothing to join, or a single literal.
    if not pieces:
        return builder.load_bytes_from_str_literal("")
    if len(pieces) == 1 and not substitutions:
        return pieces[0]

    # The leading argument is the number of values that follow it.
    count = Integer(len(pieces), c_pyssize_t_rprimitive)
    return builder.call_c(bytes_build_op, [count, *pieces], line)
|