- # Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
- # For details: https://github.com/PyCQA/pylint/blob/main/LICENSE
- # Copyright (c) https://github.com/PyCQA/pylint/blob/main/CONTRIBUTORS.txt
- """Unicode and some other ASCII characters can be used to create programs that run
- much different compared to what a human reader would expect from them.
- PEP 672 lists some examples.
- See: https://www.python.org/dev/peps/pep-0672/
- The following checkers are intended to make users are aware of these issues.
- """
- from __future__ import annotations
- import codecs
- import contextlib
- import io
- import re
- from collections import OrderedDict
- from collections.abc import Iterable
- from functools import lru_cache
- from tokenize import detect_encoding
- from typing import NamedTuple, TypeVar
- from astroid import nodes
- import pylint.interfaces
- import pylint.lint
- from pylint import checkers
- _StrLike = TypeVar("_StrLike", str, bytes)
- # Based on:
- # https://golangexample.com/go-linter-which-checks-for-dangerous-unicode-character-sequences/
- # We use '\uXXXX' escapes instead of '\N{...}' named escapes because they
- # don't require a name lookup and are therefore faster
- BIDI_UNICODE = [
- "\u202A", # \N{LEFT-TO-RIGHT EMBEDDING}
- "\u202B", # \N{RIGHT-TO-LEFT EMBEDDING}
- "\u202C", # \N{POP DIRECTIONAL FORMATTING}
- "\u202D", # \N{LEFT-TO-RIGHT OVERRIDE}
- "\u202E", # \N{RIGHT-TO-LEFT OVERRIDE}
- "\u2066", # \N{LEFT-TO-RIGHT ISOLATE}
- "\u2067", # \N{RIGHT-TO-LEFT ISOLATE}
- "\u2068", # \N{FIRST STRONG ISOLATE}
- "\u2069", # \N{POP DIRECTIONAL ISOLATE}
- # The following character is listed in PEP 672:
- # https://www.python.org/dev/peps/pep-0672/
- # so the list above might not be complete
- "\u200F", # \N{RIGHT-TO-LEFT MARK}
- # We don't use
- # "\u200E" # \N{LEFT-TO-RIGHT MARK}
- # as this is the default direction for Latin-script text and can't be used
- # to hide code
- ]
- class _BadChar(NamedTuple):
- """Representation of an ASCII char considered bad."""
- name: str
- unescaped: str
- escaped: str
- code: str
- help_text: str
- def description(self) -> str:
- """Used for the detailed error message description."""
- return (
- f"Invalid unescaped character {self.name}, "
- f'use "{self.escaped}" instead.'
- )
- def human_code(self) -> str:
- """Used to generate the human readable error message."""
- return f"invalid-character-{self.name}"
- # Based on https://www.python.org/dev/peps/pep-0672/
- BAD_CHARS = [
- _BadChar(
- "backspace",
- "\b",
- "\\b",
- "E2510",
- (
- "Moves the cursor back, so the character after it will overwrite the "
- "character before."
- ),
- ),
- _BadChar(
- "carriage-return",
- "\r",
- "\\r",
- "E2511",
- (
- "Moves the cursor to the start of line, subsequent characters overwrite "
- "the start of the line."
- ),
- ),
- _BadChar(
- "sub",
- "\x1A",
- "\\x1A",
- "E2512",
- (
- 'Ctrl+Z "End of text" on Windows. Some programs (such as type) ignore '
- "the rest of the file after it."
- ),
- ),
- _BadChar(
- "esc",
- "\x1B",
- "\\x1B",
- "E2513",
- (
- "Commonly initiates escape codes which allow arbitrary control "
- "of the terminal."
- ),
- ),
- _BadChar(
- "nul",
- "\0",
- "\\0",
- "E2514",
- "Mostly end of input for python.",
- ),
- _BadChar(
- # Zero Width Space. At the time of writing not accepted by Python.
- # But used in Trojan Source Examples, so still included and tested for.
- "zero-width-space",
- "\u200B", # \n{ZERO WIDTH SPACE}
- "\\u200B",
- "E2515",
- "Invisible space character could hide real code execution.",
- ),
- ]
- BAD_ASCII_SEARCH_DICT = {char.unescaped: char for char in BAD_CHARS}
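- # Illustrative example (not part of the original module): the "backspace" entry
- # above produces the message parts used in UnicodeChecker.msgs further down:
- #     >>> BAD_ASCII_SEARCH_DICT["\b"].description()
- #     'Invalid unescaped character backspace, use "\\b" instead.'
- #     >>> BAD_ASCII_SEARCH_DICT["\b"].human_code()
- #     'invalid-character-backspace'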
- def _line_length(line: _StrLike, codec: str) -> int:
- """Get the length of a string like line as displayed in an editor."""
- if isinstance(line, bytes):
- decoded = _remove_bom(line, codec).decode(codec, "replace")
- else:
- decoded = line
- stripped = decoded.rstrip("\n")
- if stripped != decoded:
- stripped = stripped.rstrip("\r")
- return len(stripped)
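- # Illustrative example (not part of the original module): the trailing line
- # ending is not counted, and bytes input is decoded first, so the result is a
- # character count rather than a byte count:
- #     >>> _line_length("abc\r\n", "utf-8")
- #     3
- #     >>> _line_length("abc\r\n".encode("utf-16-le"), "utf-16le")
- #     3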
- def _map_positions_to_result(
- line: _StrLike,
- search_dict: dict[_StrLike, _BadChar],
- new_line: _StrLike,
- byte_str_length: int = 1,
- ) -> dict[int, _BadChar]:
- """Get all occurrences of search dict keys within line.
- Ignores the Windows end-of-line sequence and can handle bytes as well as str.
- Also takes care of encodings for which an encoded code point is longer than
- a single byte (8 bits).
- """
- result: dict[int, _BadChar] = {}
- for search_for, char in search_dict.items():
- if search_for not in line:
- continue
- # Special Handling for Windows '\r\n'
- if char.unescaped == "\r" and line.endswith(new_line):
- ignore_pos = len(line) - 2 * byte_str_length
- else:
- ignore_pos = None
- start = 0
- pos = line.find(search_for, start)
- while pos >= 0: # >= so a match at column 0 is reported as well
- if pos != ignore_pos:
- # Calculate the column
- col = int(pos / byte_str_length)
- result[col] = char
- start = pos + 1
- pos = line.find(search_for, start)
- return result
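- # Illustrative example (not part of the original module):
- #     _map_positions_to_result("x = 1\b\r\n", BAD_ASCII_SEARCH_DICT, "\n")
- # returns a dict mapping column 5 to the "backspace" _BadChar defined above,
- # while the '\r' that belongs to the Windows line ending is ignored.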
- UNICODE_BOMS = {
- "utf-8": codecs.BOM_UTF8,
- "utf-16": codecs.BOM_UTF16,
- "utf-32": codecs.BOM_UTF32,
- "utf-16le": codecs.BOM_UTF16_LE,
- "utf-16be": codecs.BOM_UTF16_BE,
- "utf-32le": codecs.BOM_UTF32_LE,
- "utf-32be": codecs.BOM_UTF32_BE,
- }
- BOM_SORTED_TO_CODEC = OrderedDict(
- # Sorted by decreasing BOM length so that longer BOMs are matched first
- # (e.g. the UTF-32-LE BOM starts with the bytes of the UTF-16-LE BOM)
- (UNICODE_BOMS[codec], codec)
- for codec in ("utf-32le", "utf-32be", "utf-8", "utf-16le", "utf-16be")
- )
- UTF_NAME_REGEX_COMPILED = re.compile(
- "utf[ -]?(8|16|32)[ -]?(le|be|)?(sig)?", flags=re.IGNORECASE
- )
- def _normalize_codec_name(codec: str) -> str:
- """Make sure the codec name is always given as defined in the BOM dict."""
- return UTF_NAME_REGEX_COMPILED.sub(r"utf-\1\2", codec).lower()
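- # Illustrative example (not part of the original module): common spellings are
- # mapped onto the keys used in UNICODE_BOMS above:
- #     >>> _normalize_codec_name("UTF8")
- #     'utf-8'
- #     >>> _normalize_codec_name("UTF-16LE")
- #     'utf-16le'
- #     >>> _normalize_codec_name("utf-8-sig")
- #     'utf-8'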
- def _remove_bom(encoded: bytes, encoding: str) -> bytes:
- """Remove the bom if given from a line."""
- if encoding not in UNICODE_BOMS:
- return encoded
- bom = UNICODE_BOMS[encoding]
- if encoded.startswith(bom):
- return encoded[len(bom) :]
- return encoded
- def _encode_without_bom(string: str, encoding: str) -> bytes:
- """Encode a string but remove the BOM."""
- return _remove_bom(string.encode(encoding), encoding)
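- # Illustrative example (not part of the original module):
- #     >>> _remove_bom(codecs.BOM_UTF8 + b"x = 1\n", "utf-8")
- #     b'x = 1\n'
- #     >>> _remove_bom(b"x = 1\n", "utf-8")
- #     b'x = 1\n'
- # _encode_without_bom("\n", "utf-16") likewise returns just the encoded newline
- # (byte order depends on the platform), without the leading BOM.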
- def _byte_to_str_length(codec: str) -> int:
- """Return how many byte are usually(!) a character point."""
- if codec.startswith("utf-32"):
- return 4
- if codec.startswith("utf-16"):
- return 2
- return 1
- @lru_cache(maxsize=1000)
- def _cached_encode_search(string: str, encoding: str) -> bytes:
- """A cached version of encode used for search pattern."""
- return _encode_without_bom(string, encoding)
- def _fix_utf16_32_line_stream(stream: Iterable[bytes], codec: str) -> Iterable[bytes]:
- """Handle line endings for UTF-16 and UTF-32 correctly.
- Currently, Python's binary line splitting cuts right after the raw \n byte and
- so strips the zero bytes that complete the encoded line ending, leading to
- lines that can't be decoded properly on their own.
- """
- if not codec.startswith("utf-16") and not codec.startswith("utf-32"):
- yield from stream
- else:
- # First we get all the bytes in memory
- content = b"".join(line for line in stream)
- new_line = _cached_encode_search("\n", codec)
- # Now we split the content on the real newline in the correct encoding;
- # we can't use split() as it would strip the \n that we need
- start = 0
- while True:
- pos = content.find(new_line, start)
- if pos >= 0:
- yield content[start : pos + len(new_line)]
- else:
- # Yield the rest and finish
- if content[start:]:
- yield content[start:]
- break
- start = pos + len(new_line)
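- # Illustrative example (not part of the original module): "a\nb\n" encoded as
- # UTF-16-LE is b"a\x00\n\x00b\x00\n\x00". Naive binary line iteration splits
- # right after each raw b"\n", yielding b"a\x00\n", b"\x00b\x00\n", b"\x00",
- # none of which decode cleanly. Re-splitting on the encoded newline fixes this:
- #     >>> list(_fix_utf16_32_line_stream([b"a\x00\n", b"\x00b\x00\n", b"\x00"], "utf-16le"))
- #     [b'a\x00\n\x00', b'b\x00\n\x00']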
- def extract_codec_from_bom(first_line: bytes) -> str:
- """Try to extract the codec (unicode only) by checking for the BOM.
- For details about BOM see https://unicode.org/faq/utf_bom.html#BOM
- Args:
- first_line: the first line of a file
- Returns:
- a codec name
- Raises:
- ValueError: if no codec was found
- """
- for bom, codec in BOM_SORTED_TO_CODEC.items():
- if first_line.startswith(bom):
- return codec
- raise ValueError("No BOM found. Could not detect Unicode codec.")
- class UnicodeChecker(checkers.BaseRawFileChecker):
- """Check characters that could be used to hide bad code to humans.
- This includes:
- - Bidirectional Unicode (see https://trojansource.codes/)
- - Bad ASCII characters (see PEP672)
- If a programmer requires to use such a character they should use the escaped
- version, that is also much easier to read and does not depend on the editor used.
- The Checker also includes a check that UTF-16 and UTF-32 are not used to encode
- Python files.
- At the time of writing Python supported only UTF-8. See
- https://stackoverflow.com/questions/69897842/ and https://bugs.python.org/issue1503789
- for background.
- """
- name = "unicode_checker"
- msgs = {
- "E2501": (
- # This error will only be displayed to users once Python supports
- # UTF-16/UTF-32 (if at all)
- "UTF-16 and UTF-32 aren't backward compatible. Use UTF-8 instead",
- "invalid-unicode-codec",
- (
- "For compatibility use UTF-8 instead of UTF-16/UTF-32. "
- "See also https://bugs.python.org/issue1503789 for a history "
- "of this issue. And "
- "https://softwareengineering.stackexchange.com/questions/102205/ "
- "for some possible problems when using UTF-16 for instance."
- ),
- ),
- "E2502": (
- (
- "Contains control characters that can permit obfuscated code "
- "executed differently than displayed"
- ),
- "bidirectional-unicode",
- (
- "bidirectional unicode are typically not displayed characters required "
- "to display right-to-left (RTL) script "
- "(i.e. Chinese, Japanese, Arabic, Hebrew, ...) correctly. "
- "So can you trust this code? "
- "Are you sure it displayed correctly in all editors? "
- "If you did not write it or your language is not RTL,"
- " remove the special characters, as they could be used to trick you into "
- "executing code, "
- "that does something else than what it looks like.\n"
- "More Information:\n"
- "https://en.wikipedia.org/wiki/Bidirectional_text\n"
- "https://trojansource.codes/"
- ),
- ),
- "C2503": (
- "PEP8 recommends UTF-8 as encoding for Python files",
- "bad-file-encoding",
- (
- "PEP8 recommends UTF-8 default encoding for Python files. See "
- "https://peps.python.org/pep-0008/#source-file-encoding"
- ),
- ),
- **{
- bad_char.code: (
- bad_char.description(),
- bad_char.human_code(),
- bad_char.help_text,
- )
- for bad_char in BAD_CHARS
- },
- }
- @staticmethod
- def _is_invalid_codec(codec: str) -> bool:
- return codec.startswith("utf-16") or codec.startswith("utf-32")
- @staticmethod
- def _is_unicode(codec: str) -> bool:
- return codec.startswith("utf")
- @classmethod
- def _find_line_matches(cls, line: bytes, codec: str) -> dict[int, _BadChar]:
- """Find all matches of BAD_CHARS within line.
- Args:
- line: the input
- codec: the codec used to decode the line or to encode the search strings
- Return:
- A dictionary mapping the column offset to the matching _BadChar
- """
- # We try to decode the line to str to get the correct column offset;
- # if we worked on bytes, the offset could be off because UTF-8 code points
- # have no fixed length
- try:
- line_search = line.decode(codec, errors="strict")
- search_dict = BAD_ASCII_SEARCH_DICT
- return _map_positions_to_result(line_search, search_dict, "\n")
- except UnicodeDecodeError:
- # If we can't decode properly, we simply fall back to bytes; the column
- # offsets might be slightly off, but that is still better than nothing
- line_search_byte = line
- search_dict_byte: dict[bytes, _BadChar] = {}
- for char in BAD_CHARS:
- # Some characters might not exist in all encodings
- with contextlib.suppress(UnicodeDecodeError):
- search_dict_byte[
- _cached_encode_search(char.unescaped, codec)
- ] = char
- return _map_positions_to_result(
- line_search_byte,
- search_dict_byte,
- _cached_encode_search("\n", codec),
- byte_str_length=_byte_to_str_length(codec),
- )
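- # Illustrative example (not part of the original module): a backspace hidden in
- # an otherwise plain UTF-8 line is reported with its column offset:
- #     UnicodeChecker._find_line_matches(b"x = 1\b\n", "utf-8")
- # returns a dict mapping column 5 to the "backspace" _BadChar.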
- @staticmethod
- def _determine_codec(stream: io.BytesIO) -> tuple[str, int]:
- """Determine the codec from the given stream.
- First tries https://www.python.org/dev/peps/pep-0263/
- and, if this fails, also checks for BOMs of UTF-16 and UTF-32
- to be future-proof.
- Args:
- stream: The byte stream to analyse
- Returns: A tuple consisting of:
- - normalized codec name
- - the line number on which the codec was found
- Raises:
- SyntaxError: if failing to detect codec
- """
- try:
- # First try to detect encoding with PEP 263
- # Doesn't work with UTF-16/32 at the time of writing
- # see https://bugs.python.org/issue1503789
- codec, lines = detect_encoding(stream.readline)
- # lines are empty if UTF-8 BOM is found
- codec_definition_line = len(lines) or 1
- except SyntaxError as e:
- # The codec could not be detected by Python, so we manually check for
- # UTF-16/32 BOMs, which aren't supported by Python at the time of writing.
- # This is only included to be future-proof and handle these codecs as well
- stream.seek(0)
- try:
- codec = extract_codec_from_bom(stream.readline())
- codec_definition_line = 1
- except ValueError as ve:
- # Failed to detect a codec, so the syntax error did not originate from
- # UTF-16/32 codec usage. Simply re-raise the original error.
- raise e from ve
- return _normalize_codec_name(codec), codec_definition_line
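- # Illustrative example (not part of the original module): a PEP 263 coding
- # declaration other than UTF-8 is detected (tokenize normalizes the cookie name)
- # and is later flagged by _check_codec:
- #     >>> UnicodeChecker._determine_codec(io.BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n"))
- #     ('iso-8859-1', 1)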
- def _check_codec(self, codec: str, codec_definition_line: int) -> None:
- """Check validity of the codec."""
- if codec != "utf-8":
- msg = "bad-file-encoding"
- if self._is_invalid_codec(codec):
- msg = "invalid-unicode-codec"
- self.add_message(
- msg,
- # Currently Nodes will lead to crashes of pylint
- # node=node,
- line=codec_definition_line,
- end_lineno=codec_definition_line,
- confidence=pylint.interfaces.HIGH,
- col_offset=None,
- end_col_offset=None,
- )
- def _check_invalid_chars(self, line: bytes, lineno: int, codec: str) -> None:
- """Look for chars considered bad."""
- matches = self._find_line_matches(line, codec)
- for col, char in matches.items():
- self.add_message(
- char.human_code(),
- # Currently Nodes will lead to crashes of pylint
- # node=node,
- line=lineno,
- end_lineno=lineno,
- confidence=pylint.interfaces.HIGH,
- col_offset=col + 1,
- end_col_offset=col + len(char.unescaped) + 1,
- )
- def _check_bidi_chars(self, line: bytes, lineno: int, codec: str) -> None:
- """Look for Bidirectional Unicode, if we use unicode."""
- if not self._is_unicode(codec):
- return
- for dangerous in BIDI_UNICODE:
- if _cached_encode_search(dangerous, codec) in line:
- # Note that we deliberately do not point at the offending character:
- # with these Unicode characters, how the location of characters in the
- # line is displayed depends on the editor.
- # So we mark the complete line.
- self.add_message(
- "bidirectional-unicode",
- # Currently Nodes will lead to crashes of pylint
- # node=node,
- line=lineno,
- end_lineno=lineno,
- # We mark the complete line, as bidi controls make it hard
- # to determine the correct cursor position within an editor
- col_offset=0,
- end_col_offset=_line_length(line, codec),
- confidence=pylint.interfaces.HIGH,
- )
- # We look for bidirectional unicode only once per line
- # as we mark the complete line anyway
- break
- def process_module(self, node: nodes.Module) -> None:
- """Perform the actual check by checking module stream."""
- with node.stream() as stream:
- codec, codec_line = self._determine_codec(stream)
- self._check_codec(codec, codec_line)
- stream.seek(0)
- # Check for invalid content (controls/chars)
- for lineno, line in enumerate(
- _fix_utf16_32_line_stream(stream, codec), start=1
- ):
- if lineno == 1:
- line = _remove_bom(line, codec)
- self._check_bidi_chars(line, lineno, codec)
- self._check_invalid_chars(line, lineno, codec)
- def register(linter: pylint.lint.PyLinter) -> None:
- linter.register_checker(UnicodeChecker(linter))
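- # Usage note (assumption, not part of the original module): pylint calls a
- # module-level register() like the one above when the module is loaded as a
- # plugin, e.g.
- #     pylint --load-plugins=my_unicode_plugin some_module.py
- # where "my_unicode_plugin" is a hypothetical plugin module name; for a checker
- # shipped with pylint itself this registration happens automatically.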