encoding.py 1.5 KB

123456789101112131415161718192021222324252627282930313233343536
  1. import tokenize
  2. from pathlib import Path
  3. from prospector.exceptions import CouldNotHandleEncoding, PermissionMissing
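  4. # note: the return type cannot be annotated with AnyStr: AnyStr is a TypeVar constrained to (str, bytes), so mypy
  5. # also checks the body with it bound to bytes, while tokenize.open() is text-mode and this function always returns str;
  5. # mypy therefore complains with 'Incompatible return value type (got "str", expected "bytes")'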
  6. def read_py_file(filepath: Path):
  7. # see https://docs.python.org/3/library/tokenize.html#tokenize.detect_encoding
  8. # first just see if the file is properly encoded
  9. try:
  10. with open(filepath, "rb") as bfile_:
  11. tokenize.detect_encoding(bfile_.readline)
  12. except PermissionError as err:
  13. raise PermissionMissing(filepath) from err
  14. except SyntaxError as err:
  15. # this warning is issued:
  16. # (1) in badly authored files (contains non-utf8 in a comment line)
  17. # (2) a coding is specified, but wrong and
  18. # (3) no coding is specified, and the default
  19. # 'utf-8' fails to decode.
  20. # (4) the encoding specified by a pep263 declaration did not match
  21. # with the encoding detected by inspecting the BOM
  22. raise CouldNotHandleEncoding(filepath) from err
  23. try:
  24. with tokenize.open(filepath) as file_:
  25. return file_.read()
  26. # this warning is issued:
  27. # (1) if utf-8 is specified, but latin1 is used with something like \x0e9 appearing
  28. # (see http://stackoverflow.com/a/5552623)
  29. except UnicodeDecodeError as err:
  30. raise CouldNotHandleEncoding(filepath) from err