  1. """Checker Manager and Checker classes."""
  2. import argparse
  3. import collections
  4. import errno
  5. import itertools
  6. import logging
  7. import multiprocessing.pool
  8. import signal
  9. import tokenize
  10. from typing import Any
  11. from typing import Dict
  12. from typing import List
  13. from typing import Optional
  14. from typing import Tuple
  15. from flake8 import defaults
  16. from flake8 import exceptions
  17. from flake8 import processor
  18. from flake8 import utils
  19. from flake8.discover_files import expand_paths
  20. from flake8.plugins.finder import Checkers
  21. from flake8.plugins.finder import LoadedPlugin
  22. from flake8.style_guide import StyleGuideManager
# A single check result: (error code, line number, column, text, physical line).
Results = List[Tuple[str, int, int, str, Optional[str]]]

LOG = logging.getLogger(__name__)

# OSError errnos for which we fall back to serial processing instead of
# propagating the failure out of the multiprocessing pool setup.
SERIAL_RETRY_ERRNOS = {
    # ENOSPC: Added by sigmavirus24
    # > On some operating systems (OSX), multiprocessing may cause an
    # > ENOSPC error while trying to create a Semaphore.
    # > In those cases, we should replace the customized Queue Report
    # > class with pep8's StandardReport class to ensure users don't run
    # > into this problem.
    # > (See also: https://github.com/pycqa/flake8/issues/117)
    errno.ENOSPC,
    # NOTE(sigmavirus24): When adding to this list, include the reasoning
    # on the lines before the error code and always append your error
    # code. Further, please always add a trailing `,` to reduce the visual
    # noise in diffs.
}
class Manager:
    """Manage the parallelism and checker instances for each plugin and file.

    This class will be responsible for the following:

    - Determining the parallelism of Flake8, e.g.:

      * Do we use :mod:`multiprocessing` or is it unavailable?

      * Do we automatically decide on the number of jobs to use or did the
        user provide that?

    - Falling back to a serial way of processing files if we run into an
      OSError related to :mod:`multiprocessing`

    - Organizing the results of each checker so we can group the output
      together and make our output deterministic.
    """

    def __init__(
        self,
        style_guide: StyleGuideManager,
        plugins: Checkers,
    ) -> None:
        """Initialize our Manager instance.

        :param style_guide:
            The style guide manager; its ``options`` namespace is reused as
            this manager's options.
        :param plugins:
            The loaded checker plugins to run against each file.
        """
        self.style_guide = style_guide
        self.options = style_guide.options
        self.plugins = plugins
        # Number of worker processes; 0/1 causes run() to take the serial
        # path.
        self.jobs = self._job_count()
        # Every checker constructed by make_checkers(), including files that
        # end up ignored.
        self._all_checkers: List[FileChecker] = []
        # Subset of _all_checkers whose files should actually be processed.
        self.checkers: List[FileChecker] = []
        self.statistics = {
            "files": 0,
            "logical lines": 0,
            "physical lines": 0,
            "tokens": 0,
        }
        self.exclude = tuple(
            itertools.chain(self.options.exclude, self.options.extend_exclude)
        )

    def _process_statistics(self) -> None:
        # Aggregate per-file statistics into the manager-wide totals.
        for checker in self.checkers:
            for statistic in defaults.STATISTIC_NAMES:
                self.statistics[statistic] += checker.statistics[statistic]
        self.statistics["files"] += len(self.checkers)

    def _job_count(self) -> int:
        """Return the number of jobs to use, or 0 to force serial checking."""
        # First we walk through all of our error cases:
        # - multiprocessing library is not present
        # - we're running on windows in which case we know we have significant
        #   implementation issues
        # - the user provided stdin and that's not something we can handle
        #   well
        # - we're processing a diff, which again does not work well with
        #   multiprocessing and which really shouldn't require multiprocessing
        # - the user provided some awful input

        # class state is only preserved when using the `fork` strategy.
        if multiprocessing.get_start_method() != "fork":
            LOG.warning(
                "The multiprocessing module is not available. "
                "Ignoring --jobs arguments."
            )
            return 0

        if utils.is_using_stdin(self.options.filenames):
            LOG.warning(
                "The --jobs option is not compatible with supplying "
                "input using - . Ignoring --jobs arguments."
            )
            return 0

        if self.options.diff:
            LOG.warning(
                "The --diff option was specified with --jobs but "
                "they are not compatible. Ignoring --jobs arguments."
            )
            return 0

        jobs = self.options.jobs

        # If the value is "auto", we want to let the multiprocessing library
        # decide the number based on the number of CPUs. However, if that
        # function is not implemented for this particular value of Python we
        # default to 1
        if jobs.is_auto:
            try:
                return multiprocessing.cpu_count()
            except NotImplementedError:
                return 0

        # Otherwise, we know jobs should be an integer and we can just convert
        # it to an integer
        return jobs.n_jobs

    def _handle_results(self, filename: str, results: Results) -> int:
        # Feed each stored result through the style guide and count how many
        # were actually reported (handle_error's return value contributes to
        # the sum).
        style_guide = self.style_guide
        reported_results_count = 0
        for (error_code, line_number, column, text, physical_line) in results:
            reported_results_count += style_guide.handle_error(
                code=error_code,
                filename=filename,
                line_number=line_number,
                column_number=column,
                text=text,
                physical_line=physical_line,
            )
        return reported_results_count

    def make_checkers(self, paths: Optional[List[str]] = None) -> None:
        """Create checkers for each file.

        :param paths:
            Paths to expand into files; defaults to the configured filenames.
        """
        if paths is None:
            paths = self.options.filenames
        self._all_checkers = [
            FileChecker(
                filename=filename,
                plugins=self.plugins,
                options=self.options,
            )
            for filename in expand_paths(
                paths=paths,
                stdin_display_name=self.options.stdin_display_name,
                filename_patterns=self.options.filename,
                exclude=self.exclude,
                is_running_from_diff=self.options.diff,
            )
        ]
        # Files the processor decided to ignore are kept in _all_checkers
        # (so their results, e.g. E902, still get reported) but excluded
        # from the set that run() will process.
        self.checkers = [c for c in self._all_checkers if c.should_process]
        LOG.info("Checking %d files", len(self.checkers))

    def report(self) -> Tuple[int, int]:
        """Report all of the errors found in the managed file checkers.

        This iterates over each of the checkers and reports the errors sorted
        by line number.

        :returns:
            A tuple of the total results found and the results reported.
        """
        results_reported = results_found = 0
        for checker in self._all_checkers:
            # Sort by (line, column) to make the output deterministic.
            results = sorted(checker.results, key=lambda tup: (tup[1], tup[2]))
            filename = checker.display_name
            with self.style_guide.processing_file(filename):
                results_reported += self._handle_results(filename, results)
            results_found += len(results)
        return (results_found, results_reported)

    def run_parallel(self) -> None:
        """Run the checkers in parallel."""
        # fmt: off
        final_results: Dict[str, List[Tuple[str, int, int, str, Optional[str]]]] = collections.defaultdict(list)  # noqa: E501
        final_statistics: Dict[str, Dict[str, int]] = collections.defaultdict(dict)  # noqa: E501
        # fmt: on

        pool = _try_initialize_processpool(self.jobs)

        # Pool creation can fail for known errnos; fall back to serial.
        if pool is None:
            self.run_serial()
            return

        pool_closed = False
        try:
            pool_map = pool.imap_unordered(
                _run_checks,
                self.checkers,
                chunksize=calculate_pool_chunksize(
                    len(self.checkers), self.jobs
                ),
            )
            for ret in pool_map:
                filename, results, statistics = ret
                final_results[filename] = results
                final_statistics[filename] = statistics
            pool.close()
            pool.join()
            pool_closed = True
        finally:
            # If anything raised before the orderly close/join above,
            # terminate the workers instead of leaking them.
            if not pool_closed:
                pool.terminate()
                pool.join()

        # Copy the results computed in subprocesses back onto the local
        # checker objects so report() works the same as in the serial case.
        for checker in self.checkers:
            filename = checker.display_name
            checker.results = final_results[filename]
            checker.statistics = final_statistics[filename]

    def run_serial(self) -> None:
        """Run the checkers in serial."""
        for checker in self.checkers:
            checker.run_checks()

    def run(self) -> None:
        """Run all the checkers.

        This will intelligently decide whether to run the checks in parallel
        or whether to run them in serial.

        If running the checks in parallel causes a problem (e.g.,
        :issue:`117`) this also implements fallback to serial processing.
        """
        try:
            if self.jobs > 1 and len(self.checkers) > 1:
                self.run_parallel()
            else:
                self.run_serial()
        except KeyboardInterrupt:
            LOG.warning("Flake8 was interrupted by the user")
            raise exceptions.EarlyQuit("Early quit while running checks")

    def start(self, paths: Optional[List[str]] = None) -> None:
        """Start checking files.

        :param paths:
            Path names to check. This is passed directly to
            :meth:`~Manager.make_checkers`.
        """
        LOG.info("Making checkers")
        self.make_checkers(paths)

    def stop(self) -> None:
        """Stop checking files."""
        self._process_statistics()
class FileChecker:
    """Manage running checks for a file and aggregate the results."""

    def __init__(
        self,
        *,
        filename: str,
        plugins: Checkers,
        options: argparse.Namespace,
    ) -> None:
        """Initialize our file checker.

        :param filename:
            Name of the file to check.
        :param plugins:
            The loaded plugins (tree, logical-line, physical-line checks).
        :param options:
            Parsed option namespace forwarded to the file processor.
        """
        self.options = options
        self.filename = filename
        self.plugins = plugins
        self.results: Results = []
        self.statistics = {
            "tokens": 0,
            "logical lines": 0,
            "physical lines": 0,
        }
        # May be None if the file could not be read; in that case an E902
        # has already been recorded by _make_processor().
        self.processor = self._make_processor()
        self.display_name = filename
        self.should_process = False
        if self.processor is not None:
            self.display_name = self.processor.filename
            self.should_process = not self.processor.should_ignore_file()
            self.statistics["physical lines"] = len(self.processor.lines)

    def __repr__(self) -> str:
        """Provide helpful debugging representation."""
        return f"FileChecker for {self.filename}"

    def _make_processor(self) -> Optional[processor.FileProcessor]:
        """Create the file processor, recording an E902 on read failure."""
        try:
            return processor.FileProcessor(self.filename, self.options)
        except OSError as e:
            # If we can not read the file due to an IOError (e.g., the file
            # does not exist or we do not have the permissions to open it)
            # then we need to format that exception for the user.
            # NOTE(sigmavirus24): Historically, pep8 has always reported this
            # as an E902. We probably *want* a better error code for this
            # going forward.
            self.report("E902", 0, 0, f"{type(e).__name__}: {e}")
            return None

    def report(
        self,
        error_code: Optional[str],
        line_number: int,
        column: int,
        text: str,
    ) -> str:
        """Report an error by storing it in the results list.

        :param error_code:
            The code; if None, it is parsed from the first word of ``text``.
        :returns:
            The error code that was stored.
        """
        if error_code is None:
            error_code, text = text.split(" ", 1)

        # If we're recovering from a problem in _make_processor, we will not
        # have this attribute.
        if hasattr(self, "processor") and self.processor is not None:
            line = self.processor.noqa_line_for(line_number)
        else:
            line = None

        self.results.append((error_code, line_number, column, text, line))
        return error_code

    def run_check(self, plugin: LoadedPlugin, **arguments: Any) -> Any:
        """Run the check in a single plugin."""
        assert self.processor is not None
        try:
            params = self.processor.keyword_arguments_for(
                plugin.parameters, arguments
            )
        except AttributeError as ae:
            # The plugin asked for a parameter name the processor does not
            # provide.
            raise exceptions.PluginRequestedUnknownParameters(
                plugin_name=plugin.display_name, exception=ae
            )
        try:
            return plugin.obj(**arguments, **params)
        except Exception as all_exc:
            LOG.critical(
                "Plugin %s raised an unexpected exception",
                plugin.display_name,
                exc_info=True,
            )
            raise exceptions.PluginExecutionFailed(
                filename=self.filename,
                plugin_name=plugin.display_name,
                exception=all_exc,
            )

    @staticmethod
    def _extract_syntax_information(exception: Exception) -> Tuple[int, int]:
        """Best-effort extraction of a (row, column) from a syntax error."""
        if (
            len(exception.args) > 1
            and exception.args[1]
            and len(exception.args[1]) > 2
        ):
            token = exception.args[1]
            row, column = token[1:3]
        elif (
            isinstance(exception, tokenize.TokenError)
            and len(exception.args) == 2
            and len(exception.args[1]) == 2
        ):
            token = ()
            row, column = exception.args[1]
        else:
            token = ()
            row, column = (1, 0)

        if (
            column > 0
            and token
            and isinstance(exception, SyntaxError)
            and len(token) == 4  # Python 3.9 or earlier
        ):
            # NOTE(sigmavirus24): SyntaxErrors report 1-indexed column
            # numbers. We need to decrement the column number by 1 at
            # least.
            column_offset = 1
            row_offset = 0
            # See also: https://github.com/pycqa/flake8/issues/169,
            # https://github.com/PyCQA/flake8/issues/1372
            # On Python 3.9 and earlier, token will be a 4-item tuple with the
            # last item being the string. Starting with 3.10, they added to
            # the tuple so now instead of it ending with the code that failed
            # to parse, it ends with the end of the section of code that
            # failed to parse. Luckily the absolute position in the tuple is
            # stable across versions so we can use that here
            physical_line = token[3]

            # NOTE(sigmavirus24): Not all "tokens" have a string as the last
            # argument. In this event, let's skip trying to find the correct
            # column and row values.
            if physical_line is not None:
                # NOTE(sigmavirus24): SyntaxErrors also don't exactly have a
                # "physical" line so much as what was accumulated by the point
                # tokenizing failed.
                # See also: https://github.com/pycqa/flake8/issues/169
                lines = physical_line.rstrip("\n").split("\n")
                row_offset = len(lines) - 1
                logical_line = lines[0]
                logical_line_length = len(logical_line)
                if column > logical_line_length:
                    column = logical_line_length
            row -= row_offset
            column -= column_offset
        return row, column

    def run_ast_checks(self) -> None:
        """Run all checks expecting an abstract syntax tree."""
        assert self.processor is not None
        ast = self.processor.build_ast()

        for plugin in self.plugins.tree:
            checker = self.run_check(plugin, tree=ast)
            # If the plugin uses a class, call the run method of it, otherwise
            # the call should return something iterable itself
            try:
                runner = checker.run()
            except AttributeError:
                runner = checker
            for (line_number, offset, text, _) in runner:
                self.report(
                    error_code=None,
                    line_number=line_number,
                    column=offset,
                    text=text,
                )

    def run_logical_checks(self) -> None:
        """Run all checks expecting a logical line."""
        assert self.processor is not None
        comments, logical_line, mapping = self.processor.build_logical_line()
        if not mapping:
            return

        self.processor.update_state(mapping)

        LOG.debug('Logical line: "%s"', logical_line.rstrip())

        for plugin in self.plugins.logical_line:
            self.processor.update_checker_state_for(plugin)
            results = self.run_check(plugin, logical_line=logical_line) or ()
            for offset, text in results:
                line_number, column_offset = find_offset(offset, mapping)
                if line_number == column_offset == 0:
                    LOG.warning("position of error out of bounds: %s", plugin)
                self.report(
                    error_code=None,
                    line_number=line_number,
                    column=column_offset,
                    text=text,
                )

        self.processor.next_logical_line()

    def run_physical_checks(self, physical_line: str) -> None:
        """Run all checks for a given physical line.

        A single physical check may return multiple errors.
        """
        assert self.processor is not None
        for plugin in self.plugins.physical_line:
            self.processor.update_checker_state_for(plugin)
            result = self.run_check(plugin, physical_line=physical_line)
            if result is not None:
                # This is a single result if first element is an int
                column_offset = None
                try:
                    column_offset = result[0]
                except (IndexError, TypeError):
                    pass

                if isinstance(column_offset, int):
                    # If we only have a single result, convert to a collection
                    result = (result,)

                for result_single in result:
                    column_offset, text = result_single
                    self.report(
                        error_code=None,
                        line_number=self.processor.line_number,
                        column=column_offset,
                        text=text,
                    )

    def process_tokens(self) -> None:
        """Process tokens and trigger checks.

        Instead of using this directly, you should use
        :meth:`flake8.checker.FileChecker.run_checks`.
        """
        assert self.processor is not None
        # Track open parentheses: logical-line boundaries only matter at
        # paren depth 0.
        parens = 0
        statistics = self.statistics
        file_processor = self.processor
        prev_physical = ""
        for token in file_processor.generate_tokens():
            statistics["tokens"] += 1
            self.check_physical_eol(token, prev_physical)
            token_type, text = token[0:2]
            if token_type == tokenize.OP:
                parens = processor.count_parentheses(parens, text)
            elif parens == 0:
                if processor.token_is_newline(token):
                    self.handle_newline(token_type)
            prev_physical = token[4]

        if file_processor.tokens:
            # If any tokens are left over, process them
            self.run_physical_checks(file_processor.lines[-1])
            self.run_logical_checks()

    def run_checks(self) -> Tuple[str, Results, Dict[str, int]]:
        """Run checks against the file.

        :returns:
            A tuple of (filename, results, statistics) — also the shape
            returned to the parent process when run in a pool worker.
        """
        assert self.processor is not None
        try:
            self.run_ast_checks()
            self.process_tokens()
        except (SyntaxError, tokenize.TokenError) as e:
            code = "E902" if isinstance(e, tokenize.TokenError) else "E999"
            row, column = self._extract_syntax_information(e)
            self.report(code, row, column, f"{type(e).__name__}: {e.args[0]}")
            return self.filename, self.results, self.statistics

        logical_lines = self.processor.statistics["logical lines"]
        self.statistics["logical lines"] = logical_lines
        return self.filename, self.results, self.statistics

    def handle_newline(self, token_type: int) -> None:
        """Handle the logic when encountering a newline token."""
        assert self.processor is not None
        if token_type == tokenize.NEWLINE:
            self.run_logical_checks()
            self.processor.reset_blank_before()
        elif len(self.processor.tokens) == 1:
            # The physical line contains only this token.
            self.processor.visited_new_blank_line()
            self.processor.delete_first_token()
        else:
            self.run_logical_checks()

    def check_physical_eol(
        self, token: tokenize.TokenInfo, prev_physical: str
    ) -> None:
        """Run physical checks if and only if it is at the end of the line."""
        assert self.processor is not None
        # a newline token ends a single physical line.
        if processor.is_eol_token(token):
            # if the file does not end with a newline, the NEWLINE
            # token is inserted by the parser, but it does not contain
            # the previous physical line in `token[4]`
            if token[4] == "":
                self.run_physical_checks(prev_physical)
            else:
                self.run_physical_checks(token[4])
        elif processor.is_multiline_string(token):
            # Less obviously, a string that contains newlines is a
            # multiline string, either triple-quoted or with internal
            # newlines backslash-escaped. Check every physical line in the
            # string *except* for the last one: its newline is outside of
            # the multiline string, so we consider it a regular physical
            # line, and will check it like any other physical line.
            #
            # Subtleties:
            # - have to wind self.line_number back because initially it
            #   points to the last line of the string, and we want
            #   check_physical() to give accurate feedback
            line_no = token[2][0]
            with self.processor.inside_multiline(line_number=line_no):
                for line in self.processor.split_line(token):
                    self.run_physical_checks(line)
  517. def _pool_init() -> None:
  518. """Ensure correct signaling of ^C using multiprocessing.Pool."""
  519. signal.signal(signal.SIGINT, signal.SIG_IGN)
  520. def _try_initialize_processpool(
  521. job_count: int,
  522. ) -> Optional[multiprocessing.pool.Pool]:
  523. """Return a new process pool instance if we are able to create one."""
  524. try:
  525. return multiprocessing.Pool(job_count, _pool_init)
  526. except OSError as err:
  527. if err.errno not in SERIAL_RETRY_ERRNOS:
  528. raise
  529. except ImportError:
  530. pass
  531. return None
  532. def calculate_pool_chunksize(num_checkers: int, num_jobs: int) -> int:
  533. """Determine the chunksize for the multiprocessing Pool.
  534. - For chunksize, see: https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.Pool.imap # noqa
  535. - This formula, while not perfect, aims to give each worker two batches of
  536. work.
  537. - See: https://github.com/pycqa/flake8/issues/829#note_18878876
  538. - See: https://github.com/pycqa/flake8/issues/197
  539. """
  540. return max(num_checkers // (num_jobs * 2), 1)
  541. def _run_checks(checker: FileChecker) -> Tuple[str, Results, Dict[str, int]]:
  542. return checker.run_checks()
  543. def find_offset(
  544. offset: int, mapping: processor._LogicalMapping
  545. ) -> Tuple[int, int]:
  546. """Find the offset tuple for a single offset."""
  547. if isinstance(offset, tuple):
  548. return offset
  549. for token in mapping:
  550. token_offset = token[0]
  551. if offset <= token_offset:
  552. position = token[1]
  553. break
  554. else:
  555. position = (0, 0)
  556. offset = token_offset = 0
  557. return (position[0], position[1] + offset - token_offset)