From: Richard Si <63936253+ichard26@users.noreply.github.com>
Date: Thu, 4 Aug 2022 00:18:33 +0000 (-0400)
Subject: Lazily import parallelized format modules
X-Git-Url: https://git.madduck.net/etc/vim.git/commitdiff_plain/e269f44b25737360e0dc65379f889dfa931dc68a

Lazily import parallelized format modules

`black.reformat_many` depends on a lot of slow-to-import modules. When
formatting just a single file, the time paid to import those modules is
totally wasted. So I moved `black.reformat_many` and its helpers to
`black.concurrency`, which is now *only* imported if there's more than one
file to reformat. This way, running Black over a single file is snappier.

Here are the numbers before and after this patch, running `python -m black --version`:

- interpreted: 411 ms +- 9 ms -> 342 ms +- 7 ms: 1.20x faster
- compiled: 365 ms +- 15 ms -> 304 ms +- 7 ms: 1.20x faster

Co-authored-by: Fabio Zadrozny
---

diff --git a/CHANGES.md b/CHANGES.md
index 1765952..34c5471 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -87,6 +87,8 @@
 
+- Reduce Black's startup time when formatting a single file by 15-30% (#3211)
+
 ## 22.6.0
 
 ### Style
diff --git a/docs/contributing/reference/reference_functions.rst b/docs/contributing/reference/reference_functions.rst
index 01ffe44..50eaeb3 100644
--- a/docs/contributing/reference/reference_functions.rst
+++ b/docs/contributing/reference/reference_functions.rst
@@ -52,7 +52,7 @@ Formatting
 
 .. autofunction:: black.reformat_one
 
-.. autofunction:: black.schedule_formatting
+.. autofunction:: black.concurrency.schedule_formatting
 
 File operations
 ---------------
@@ -173,7 +173,7 @@ Utilities
 
 .. autofunction:: black.linegen.should_split_line
 
-.. autofunction:: black.shutdown
+.. autofunction:: black.concurrency.shutdown
 
 .. autofunction:: black.strings.sub_twice
 
diff --git a/src/black/__init__.py b/src/black/__init__.py
index a0c1ad4..afc76e1 100644
--- a/src/black/__init__.py
+++ b/src/black/__init__.py
@@ -1,10 +1,8 @@
-import asyncio
 import io
 import json
 import os
 import platform
 import re
-import signal
 import sys
 import tokenize
 import traceback
@@ -13,10 +11,8 @@ from dataclasses import replace
 from datetime import datetime
 from enum import Enum
 from json.decoder import JSONDecodeError
-from multiprocessing import Manager, freeze_support
 from pathlib import Path
 from typing import (
-    TYPE_CHECKING,
     Any,
     Dict,
     Generator,
@@ -32,15 +28,19 @@ from typing import (
     Union,
 )
 
+if sys.version_info >= (3, 8):
+    from typing import Final
+else:
+    from typing_extensions import Final
+
 import click
 from click.core import ParameterSource
 from mypy_extensions import mypyc_attr
 from pathspec.patterns.gitwildmatch import GitWildMatchPatternError
 
 from _black_version import version as __version__
-from black.cache import Cache, filter_cached, get_cache_info, read_cache, write_cache
+from black.cache import Cache, get_cache_info, read_cache, write_cache
 from black.comments import normalize_fmt_off
-from black.concurrency import cancel, maybe_install_uvloop, shutdown
 from black.const import (
     DEFAULT_EXCLUDES,
     DEFAULT_INCLUDES,
@@ -91,10 +91,8 @@ from black.trans import iter_fexpr_spans
 from blib2to3.pgen2 import token
 from blib2to3.pytree import Leaf, Node
 
-if TYPE_CHECKING:
-    from concurrent.futures import Executor
-
 COMPILED = Path(__file__).suffix in (".pyd", ".so")
+DEFAULT_WORKERS: Final = os.cpu_count()
 
 # types
 FileContent = str
@@ -125,8 +123,6 @@ class WriteBack(Enum):
 # Legacy name, left for integrations.
 FileMode = Mode
 
-DEFAULT_WORKERS = os.cpu_count()
-
 
 def read_pyproject_toml(
     ctx: click.Context, param: click.Parameter, value: Optional[str]
@@ -592,6 +588,8 @@ def main(  # noqa: C901
                 report=report,
             )
         else:
+            from black.concurrency import reformat_many
+
             reformat_many(
                 sources=sources,
                 fast=fast,
@@ -776,132 +774,6 @@ def reformat_one(
         report.failed(src, str(exc))
 
 
-# diff-shades depends on being to monkeypatch this function to operate. I know it's
-# not ideal, but this shouldn't cause any issues ... hopefully. ~ichard26
-@mypyc_attr(patchable=True)
-def reformat_many(
-    sources: Set[Path],
-    fast: bool,
-    write_back: WriteBack,
-    mode: Mode,
-    report: "Report",
-    workers: Optional[int],
-) -> None:
-    """Reformat multiple files using a ProcessPoolExecutor."""
-    from concurrent.futures import Executor, ProcessPoolExecutor, ThreadPoolExecutor
-
-    executor: Executor
-    worker_count = workers if workers is not None else DEFAULT_WORKERS
-    if sys.platform == "win32":
-        # Work around https://bugs.python.org/issue26903
-        assert worker_count is not None
-        worker_count = min(worker_count, 60)
-    try:
-        executor = ProcessPoolExecutor(max_workers=worker_count)
-    except (ImportError, NotImplementedError, OSError):
-        # we arrive here if the underlying system does not support multi-processing
-        # like in AWS Lambda or Termux, in which case we gracefully fallback to
-        # a ThreadPoolExecutor with just a single worker (more workers would not do us
-        # any good due to the Global Interpreter Lock)
-        executor = ThreadPoolExecutor(max_workers=1)
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-    try:
-        loop.run_until_complete(
-            schedule_formatting(
-                sources=sources,
-                fast=fast,
-                write_back=write_back,
-                mode=mode,
-                report=report,
-                loop=loop,
-                executor=executor,
-            )
-        )
-    finally:
-        try:
-            shutdown(loop)
-        finally:
-            asyncio.set_event_loop(None)
-        if executor is not None:
-            executor.shutdown()
-
-
-async def schedule_formatting(
-    sources: Set[Path],
-    fast: bool,
-    write_back: WriteBack,
-    mode: Mode,
-    report: "Report",
-    loop: asyncio.AbstractEventLoop,
-    executor: "Executor",
-) -> None:
-    """Run formatting of `sources` in parallel using the provided `executor`.
-
-    (Use ProcessPoolExecutors for actual parallelism.)
-
-    `write_back`, `fast`, and `mode` options are passed to
-    :func:`format_file_in_place`.
-    """
-    cache: Cache = {}
-    if write_back not in (WriteBack.DIFF, WriteBack.COLOR_DIFF):
-        cache = read_cache(mode)
-        sources, cached = filter_cached(cache, sources)
-        for src in sorted(cached):
-            report.done(src, Changed.CACHED)
-    if not sources:
-        return
-
-    cancelled = []
-    sources_to_cache = []
-    lock = None
-    if write_back in (WriteBack.DIFF, WriteBack.COLOR_DIFF):
-        # For diff output, we need locks to ensure we don't interleave output
-        # from different processes.
-        manager = Manager()
-        lock = manager.Lock()
-    tasks = {
-        asyncio.ensure_future(
-            loop.run_in_executor(
-                executor, format_file_in_place, src, fast, mode, write_back, lock
-            )
-        ): src
-        for src in sorted(sources)
-    }
-    pending = tasks.keys()
-    try:
-        loop.add_signal_handler(signal.SIGINT, cancel, pending)
-        loop.add_signal_handler(signal.SIGTERM, cancel, pending)
-    except NotImplementedError:
-        # There are no good alternatives for these on Windows.
-        pass
-    while pending:
-        done, _ = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
-        for task in done:
-            src = tasks.pop(task)
-            if task.cancelled():
-                cancelled.append(task)
-            elif task.exception():
-                report.failed(src, str(task.exception()))
-            else:
-                changed = Changed.YES if task.result() else Changed.NO
-                # If the file was written back or was successfully checked as
-                # well-formatted, store this information in the cache.
-                if write_back is WriteBack.YES or (
-                    write_back is WriteBack.CHECK and changed is Changed.NO
-                ):
-                    sources_to_cache.append(src)
-                report.done(src, changed)
-    if cancelled:
-        if sys.version_info >= (3, 7):
-            await asyncio.gather(*cancelled, return_exceptions=True)
-        else:
-            await asyncio.gather(*cancelled, loop=loop, return_exceptions=True)
-    if sources_to_cache:
-        write_cache(cache, sources_to_cache, mode)
-
-
 def format_file_in_place(
     src: Path,
     fast: bool,
@@ -1506,8 +1378,11 @@ def patch_click() -> None:
 
 
 def patched_main() -> None:
-    maybe_install_uvloop()
-    freeze_support()
+    if sys.platform == "win32" and getattr(sys, "frozen", False):
+        from multiprocessing import freeze_support
+
+        freeze_support()
+
     patch_click()
     main()
 
diff --git a/src/black/concurrency.py b/src/black/concurrency.py
index 24f67b6..d77ea40 100644
--- a/src/black/concurrency.py
+++ b/src/black/concurrency.py
@@ -1,9 +1,25 @@
+"""
+Formatting many files at once via multiprocessing. Contains entrypoint and utilities.
+
+NOTE: this module is only imported if we need to format several files at once.
+"""
+
 import asyncio
 import logging
+import signal
 import sys
-from typing import Any, Iterable
+from concurrent.futures import Executor, ProcessPoolExecutor, ThreadPoolExecutor
+from multiprocessing import Manager
+from pathlib import Path
+from typing import Any, Iterable, Optional, Set
+
+from mypy_extensions import mypyc_attr
 
+from black import DEFAULT_WORKERS, WriteBack, format_file_in_place
+from black.cache import Cache, filter_cached, read_cache, write_cache
+from black.mode import Mode
 from black.output import err
+from black.report import Changed, Report
 
 
 def maybe_install_uvloop() -> None:
@@ -11,7 +27,6 @@ def maybe_install_uvloop() -> None:
 
     This is called only from command-line entry points to avoid
     interfering with the parent process if Black is used as a library.
-
     """
     try:
         import uvloop
@@ -55,3 +70,129 @@ def shutdown(loop: asyncio.AbstractEventLoop) -> None:
         cf_logger = logging.getLogger("concurrent.futures")
         cf_logger.setLevel(logging.CRITICAL)
         loop.close()
+
+
+# diff-shades depends on being to monkeypatch this function to operate. I know it's
+# not ideal, but this shouldn't cause any issues ... hopefully. ~ichard26
+@mypyc_attr(patchable=True)
+def reformat_many(
+    sources: Set[Path],
+    fast: bool,
+    write_back: WriteBack,
+    mode: Mode,
+    report: Report,
+    workers: Optional[int],
+) -> None:
+    """Reformat multiple files using a ProcessPoolExecutor."""
+    maybe_install_uvloop()
+
+    executor: Executor
+    worker_count = workers if workers is not None else DEFAULT_WORKERS
+    if sys.platform == "win32":
+        # Work around https://bugs.python.org/issue26903
+        assert worker_count is not None
+        worker_count = min(worker_count, 60)
+    try:
+        executor = ProcessPoolExecutor(max_workers=worker_count)
+    except (ImportError, NotImplementedError, OSError):
+        # we arrive here if the underlying system does not support multi-processing
+        # like in AWS Lambda or Termux, in which case we gracefully fallback to
+        # a ThreadPoolExecutor with just a single worker (more workers would not do us
+        # any good due to the Global Interpreter Lock)
+        executor = ThreadPoolExecutor(max_workers=1)
+
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    try:
+        loop.run_until_complete(
+            schedule_formatting(
+                sources=sources,
+                fast=fast,
+                write_back=write_back,
+                mode=mode,
+                report=report,
+                loop=loop,
+                executor=executor,
+            )
+        )
+    finally:
+        try:
+            shutdown(loop)
+        finally:
+            asyncio.set_event_loop(None)
+        if executor is not None:
+            executor.shutdown()
+
+
+async def schedule_formatting(
+    sources: Set[Path],
+    fast: bool,
+    write_back: WriteBack,
+    mode: Mode,
+    report: "Report",
+    loop: asyncio.AbstractEventLoop,
+    executor: "Executor",
+) -> None:
+    """Run formatting of `sources` in parallel using the provided `executor`.
+
+    (Use ProcessPoolExecutors for actual parallelism.)
+
+    `write_back`, `fast`, and `mode` options are passed to
+    :func:`format_file_in_place`.
+    """
+    cache: Cache = {}
+    if write_back not in (WriteBack.DIFF, WriteBack.COLOR_DIFF):
+        cache = read_cache(mode)
+        sources, cached = filter_cached(cache, sources)
+        for src in sorted(cached):
+            report.done(src, Changed.CACHED)
+    if not sources:
+        return
+
+    cancelled = []
+    sources_to_cache = []
+    lock = None
+    if write_back in (WriteBack.DIFF, WriteBack.COLOR_DIFF):
+        # For diff output, we need locks to ensure we don't interleave output
+        # from different processes.
+        manager = Manager()
+        lock = manager.Lock()
+    tasks = {
+        asyncio.ensure_future(
+            loop.run_in_executor(
+                executor, format_file_in_place, src, fast, mode, write_back, lock
+            )
+        ): src
+        for src in sorted(sources)
+    }
+    pending = tasks.keys()
+    try:
+        loop.add_signal_handler(signal.SIGINT, cancel, pending)
+        loop.add_signal_handler(signal.SIGTERM, cancel, pending)
+    except NotImplementedError:
+        # There are no good alternatives for these on Windows.
+        pass
+    while pending:
+        done, _ = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
+        for task in done:
+            src = tasks.pop(task)
+            if task.cancelled():
+                cancelled.append(task)
+            elif task.exception():
+                report.failed(src, str(task.exception()))
+            else:
+                changed = Changed.YES if task.result() else Changed.NO
+                # If the file was written back or was successfully checked as
+                # well-formatted, store this information in the cache.
+                if write_back is WriteBack.YES or (
+                    write_back is WriteBack.CHECK and changed is Changed.NO
+                ):
+                    sources_to_cache.append(src)
+                report.done(src, changed)
+    if cancelled:
+        if sys.version_info >= (3, 7):
+            await asyncio.gather(*cancelled, return_exceptions=True)
+        else:
+            await asyncio.gather(*cancelled, loop=loop, return_exceptions=True)
+    if sources_to_cache:
+        write_cache(cache, sources_to_cache, mode)
diff --git a/tests/test_black.py b/tests/test_black.py
index c76b3fa..5da247b 100644
--- a/tests/test_black.py
+++ b/tests/test_black.py
@@ -1763,7 +1763,9 @@ class TestCaching:
             src = (workspace / f"test{tag}.py").resolve()
             with src.open("w") as fobj:
                 fobj.write("print('hello')")
-            with patch("black.Manager", wraps=multiprocessing.Manager) as mgr:
+            with patch(
+                "black.concurrency.Manager", wraps=multiprocessing.Manager
+            ) as mgr:
                 cmd = ["--diff", str(workspace)]
                 if color:
                     cmd.append("--color")
@@ -1810,7 +1812,7 @@ class TestCaching:
                 str(cached): black.get_cache_info(cached),
                 str(cached_but_changed): (0.0, 0),
             }
-            todo, done = black.filter_cached(
+            todo, done = black.cache.filter_cached(
                 cache, {uncached, cached, cached_but_changed}
             )
             assert todo == {uncached, cached_but_changed}
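
The sketch below is not part of the patch; it is a minimal, self-contained illustration of the deferred-import pattern the commit message describes, in which the heavy concurrency machinery is only imported on the multi-file code path. The `format_files` name and the use of `print` as a stand-in for the real formatting work are illustrative only, not Black's API beyond the `from black.concurrency import reformat_many` line shown in `main()` above.

```python
# Illustrative sketch of the lazy-import pattern used in this patch
# (hypothetical names; not Black's actual code).
import sys
from typing import List


def format_files(paths: List[str]) -> None:
    if len(paths) <= 1:
        # Single-file fast path: the concurrency machinery is never imported.
        print(f"formatting {paths[0] if paths else '-'} in-process")
    else:
        # The heavy import is deferred to the branch that actually needs it,
        # mirroring `from black.concurrency import reformat_many` in main().
        from concurrent.futures import ProcessPoolExecutor

        with ProcessPoolExecutor() as executor:
            # print stands in for the real per-file formatting work.
            list(executor.map(print, paths))


if __name__ == "__main__":
    format_files(sys.argv[1:])
```

The per-module import cost that motivates this change can be inspected with CPython's `-X importtime` flag, e.g. `python -X importtime -m black --version`.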