diff --git a/dev/archery/archery/benchmark/runner.py b/dev/archery/archery/benchmark/runner.py index 571a5e58393b..346ff6783bdf 100644 --- a/dev/archery/archery/benchmark/runner.py +++ b/dev/archery/archery/benchmark/runner.py @@ -15,10 +15,13 @@ # specific language governing permissions and limitations # under the License. +import datetime import glob import json import os +import random import re +import subprocess from .core import BenchmarkSuite from .google import GoogleBenchmarkCommand, GoogleBenchmark @@ -26,8 +29,28 @@ from ..lang.cpp import CppCMakeDefinition, CppConfiguration from ..lang.java import JavaMavenDefinition, JavaConfiguration from ..utils.cmake import CMakeBuild +from ..utils.git import git from ..utils.maven import MavenBuild from ..utils.logger import logger +from ..utils.source import ArrowSources + + +def _rev_or_path_dirname(src, rev_or_path): + """Return a directory-name-safe identifier for a revision or a path. + + If ``rev_or_path`` resolves to a git revision, its full SHA is returned. + Otherwise (e.g. it is a filesystem path), a sanitized form with path + separators replaced is returned. + """ + if rev_or_path == ArrowSources.WORKSPACE: + return rev_or_path + try: + sha = git.rev_parse(rev_or_path, git_dir=src.path) + if isinstance(sha, bytes): + sha = sha.decode("ascii") + return sha + except subprocess.CalledProcessError: + return rev_or_path.replace("/", "_") def regex_filter(re_expr): @@ -47,6 +70,7 @@ def __init__(self, suite_filter=None, benchmark_filter=None, self.benchmark_filter = benchmark_filter self.repetitions = repetitions self.repetition_min_time = repetition_min_time + self.results_dir = None @property def suites(self): @@ -108,11 +132,14 @@ def __repr__(self): class CppBenchmarkRunner(BenchmarkRunner): """ Run suites from a CMakeBuild. """ - def __init__(self, build, benchmark_extras, **kwargs): + def __init__(self, build, benchmark_extras, run_id=None, results_dir=None, + **kwargs): """ Initialize a CppBenchmarkRunner. """ + super().__init__(**kwargs) self.build = build self.benchmark_extras = benchmark_extras - super().__init__(**kwargs) + self.run_id = run_id + self.results_dir = results_dir @staticmethod def default_configuration(**kwargs): @@ -217,19 +244,33 @@ def from_rev_or_path(src, root, rev_or_path, cmake_conf, **kwargs): build = CMakeBuild.from_path(rev_or_path) return CppBenchmarkRunner(build, **kwargs) else: - # Revisions can references remote via the `/` character, ensure - # that the revision is path friendly - path_rev = rev_or_path.replace("/", "_") + path_rev = _rev_or_path_dirname(src, rev_or_path) root_rev = os.path.join(root, path_rev) - os.mkdir(root_rev) + os.makedirs(root_rev, exist_ok=True) + # Clone dir is reused when there is known revision (git sha) + # in /sha/arrow clone_dir = os.path.join(root_rev, "arrow") - # Possibly checkout the sources at given revision, no need to - # perform cleanup on cloned repository as root_rev is reclaimed. - src_rev, _ = src.at_revision(rev_or_path, clone_dir) + if os.path.isdir(clone_dir): + src_rev = ArrowSources(clone_dir) + else: + src_rev, _ = src.at_revision(rev_or_path, clone_dir) cmake_def = CppCMakeDefinition(src_rev.cpp, cmake_conf) - build_dir = os.path.join(root_rev, "build") - return CppBenchmarkRunner(cmake_def.build(build_dir), **kwargs) + run_root = os.path.join(root_rev, "build") + os.makedirs(run_root, exist_ok=True) + # run_id is a path-safe ISO-8601 UTC timestamp down to the second + # so users get a brief idea of what run they are looking at, plus a + # random suffix to avoid collisions between runs in the same second. + now = datetime.datetime.now(datetime.timezone.utc) + run_id = now.strftime("%Y-%m-%dT%H-%M-%SZ") + \ + f"-{random.randrange(16**8):08x}" + build_dir = os.path.join(run_root, run_id) + build = cmake_def.build(build_dir) + results_dir = os.path.join(root_rev, "bench", run_id) + os.makedirs(results_dir, exist_ok=True) + return CppBenchmarkRunner( + build, run_id=run_id, results_dir=results_dir, **kwargs + ) class JavaBenchmarkRunner(BenchmarkRunner): @@ -310,16 +351,15 @@ def from_rev_or_path(src, root, rev_or_path, maven_conf, **kwargs): maven_def = JavaMavenDefinition(rev_or_path, maven_conf) return JavaBenchmarkRunner(maven_def.build(rev_or_path), **kwargs) else: - # Revisions can references remote via the `/` character, ensure - # that the revision is path friendly - path_rev = rev_or_path.replace("/", "_") + path_rev = _rev_or_path_dirname(src, rev_or_path) root_rev = os.path.join(root, path_rev) - os.mkdir(root_rev) + os.makedirs(root_rev, exist_ok=True) clone_dir = os.path.join(root_rev, "arrow") - # Possibly checkout the sources at given revision, no need to - # perform cleanup on cloned repository as root_rev is reclaimed. - src_rev, _ = src.at_revision(rev_or_path, clone_dir) + if os.path.isdir(clone_dir): + src_rev = ArrowSources(clone_dir) + else: + src_rev, _ = src.at_revision(rev_or_path, clone_dir) maven_def = JavaMavenDefinition(src_rev.java, maven_conf) build_dir = os.path.join(root_rev, "arrow/java") return JavaBenchmarkRunner(maven_def.build(build_dir), **kwargs) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index e70cc3874f22..e7dcc819500f 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -17,10 +17,12 @@ from io import StringIO import click +import datetime import json import logging import os import pathlib +import platform import sys from .benchmark.codec import JsonEncoder @@ -41,6 +43,27 @@ BOOL = ArrowBool() +def _run_metadata(ctx): + uname = platform.uname() + return { + "time": datetime.datetime.now(datetime.timezone.utc).isoformat(), + "cmd": { + "argv": sys.argv, + "params": ctx.params, + }, + "machine_info": { + "platform": platform.platform(), + "system": uname.system, + "release": uname.release, + "version": uname.version, + "machine": platform.machine(), + "processor": platform.processor(), + "architecture": platform.architecture()[0], + "logical_cores": os.cpu_count(), + }, + } + + @click.group(context_settings={"help_option_names": ["-h", "--help"]}) @click.option("--debug", type=BOOL, is_flag=True, default=False, envvar='ARCHERY_DEBUG', @@ -305,6 +328,11 @@ def check_language(ctx, param, value): click.option("--preserve", type=BOOL, default=False, show_default=True, is_flag=True, help="Preserve workspace for investigation."), + click.option("--preserve-dir", metavar="", + type=click.Path(file_okay=False, resolve_path=True), + default=None, + help="Parent directory in which to create the preserved " + "workspace. Implies --preserve."), click.option("--output", metavar="", type=click.File("w", encoding="utf8"), default=None, help="Capture output result into file."), @@ -347,12 +375,14 @@ def benchmark_filter_options(cmd): default="WORKSPACE", required=False) @benchmark_common_options @click.pass_context -def benchmark_list(ctx, rev_or_path, src, preserve, output, cmake_extras, - java_home, java_options, build_extras, benchmark_extras, - cpp_benchmark_extras, language, **kwargs): +def benchmark_list(ctx, rev_or_path, src, preserve, preserve_dir, output, + cmake_extras, java_home, java_options, build_extras, + benchmark_extras, cpp_benchmark_extras, language, **kwargs): """ List benchmark suite. """ - with tmpdir(preserve=preserve) as root: + # A preserve_dir implies preserving the workspace. + preserve = preserve or preserve_dir is not None + with tmpdir(preserve=preserve, preserve_dir=preserve_dir) as root: logger.debug(f"Running benchmark {rev_or_path}") if language == "cpp": @@ -392,10 +422,11 @@ def benchmark_list(ctx, rev_or_path, src, preserve, output, cmake_extras, "Currently only supported for language=cpp. " "[default: use runner-specific defaults]")) @click.pass_context -def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras, - java_home, java_options, build_extras, benchmark_extras, - language, suite_filter, benchmark_filter, repetitions, - repetition_min_time, cpp_benchmark_extras, **kwargs): +def benchmark_run(ctx, rev_or_path, src, preserve, preserve_dir, output, + cmake_extras, java_home, java_options, build_extras, + benchmark_extras, language, suite_filter, benchmark_filter, + repetitions, repetition_min_time, cpp_benchmark_extras, + **kwargs): """ Run benchmark suite. This command will run the benchmark suite for a single build. This is @@ -433,7 +464,9 @@ def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras, \b archery benchmark run --output=run.json """ - with tmpdir(preserve=preserve) as root: + # A preserve_dir implies preserving the workspace. + preserve = preserve or preserve_dir is not None + with tmpdir(preserve=preserve, preserve_dir=preserve_dir) as root: logger.debug(f"Running benchmark {rev_or_path}") if language == "cpp": @@ -465,6 +498,17 @@ def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras, # when asked to JSON-serialize the results, so produce a JSON # output even when none is requested. json_out = json.dumps(runner_base, cls=JsonEncoder) + if runner_base.results_dir is not None: + results_path = os.path.join(runner_base.results_dir, + "benchmark.json") + with open(results_path, "w") as f: + f.write(json_out) + # Store some run metadata to make it hard to confuse runs, for instance + # remembering what was the SIMD level set on this run. + metadata_path = os.path.join(runner_base.results_dir, + "metadata.json") + with open(metadata_path, "w") as f: + json.dump(_run_metadata(ctx), f, indent=2, default=str) if output is not None: output.write(json_out) @@ -486,11 +530,11 @@ def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras, @click.argument("baseline", metavar="[]]", default="origin/HEAD", required=False) @click.pass_context -def benchmark_diff(ctx, src, preserve, output, language, cmake_extras, - suite_filter, benchmark_filter, repetitions, no_counters, - java_home, java_options, build_extras, benchmark_extras, - cpp_benchmark_extras, threshold, contender, baseline, - **kwargs): +def benchmark_diff(ctx, src, preserve, preserve_dir, output, language, + cmake_extras, suite_filter, benchmark_filter, repetitions, + no_counters, java_home, java_options, build_extras, + benchmark_extras, cpp_benchmark_extras, threshold, + contender, baseline, **kwargs): """Compare (diff) benchmark runs. This command acts like git-diff but for benchmark results. @@ -564,7 +608,9 @@ def benchmark_diff(ctx, src, preserve, output, language, cmake_extras, # This should not recompute the benchmark from run.json archery --quiet benchmark diff WORKSPACE run.json > result.json """ - with tmpdir(preserve=preserve) as root: + # A preserve_dir implies preserving the workspace. + preserve = preserve or preserve_dir is not None + with tmpdir(preserve=preserve, preserve_dir=preserve_dir) as root: logger.debug(f"Comparing {contender} (contender) with {baseline} (baseline)") if language == "cpp": diff --git a/dev/archery/archery/utils/tmpdir.py b/dev/archery/archery/utils/tmpdir.py index 07d7355c87fb..06eb9a6ba627 100644 --- a/dev/archery/archery/utils/tmpdir.py +++ b/dev/archery/archery/utils/tmpdir.py @@ -15,13 +15,17 @@ # specific language governing permissions and limitations # under the License. +import os from contextlib import contextmanager from tempfile import mkdtemp, TemporaryDirectory @contextmanager -def tmpdir(preserve=False, prefix="arrow-archery-"): - if preserve: +def tmpdir(preserve=False, prefix="arrow-archery-", preserve_dir=None): + if preserve and preserve_dir is not None: + os.makedirs(preserve_dir, exist_ok=True) + yield preserve_dir + elif preserve: yield mkdtemp(prefix=prefix) else: with TemporaryDirectory(prefix=prefix) as tmp: