Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 58 additions & 18 deletions dev/archery/archery/benchmark/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,42 @@
# specific language governing permissions and limitations
# under the License.

import datetime
import glob
import json
import os
import random
import re
import subprocess

from .core import BenchmarkSuite
from .google import GoogleBenchmarkCommand, GoogleBenchmark
from .jmh import JavaMicrobenchmarkHarnessCommand, JavaMicrobenchmarkHarness
from ..lang.cpp import CppCMakeDefinition, CppConfiguration
from ..lang.java import JavaMavenDefinition, JavaConfiguration
from ..utils.cmake import CMakeBuild
from ..utils.git import git
from ..utils.maven import MavenBuild
from ..utils.logger import logger
from ..utils.source import ArrowSources


def _rev_or_path_dirname(src, rev_or_path):
Comment thread
AntoinePrv marked this conversation as resolved.
"""Return a directory-name-safe identifier for a revision or a path.

If ``rev_or_path`` resolves to a git revision, its full SHA is returned.
Otherwise (e.g. it is a filesystem path), a sanitized form with path
separators replaced is returned.
"""
if rev_or_path == ArrowSources.WORKSPACE:
return rev_or_path
try:
sha = git.rev_parse(rev_or_path, git_dir=src.path)
if isinstance(sha, bytes):
sha = sha.decode("ascii")
return sha
except subprocess.CalledProcessError:
return rev_or_path.replace("/", "_")


def regex_filter(re_expr):
Expand All @@ -47,6 +70,7 @@ def __init__(self, suite_filter=None, benchmark_filter=None,
self.benchmark_filter = benchmark_filter
self.repetitions = repetitions
self.repetition_min_time = repetition_min_time
self.results_dir = None

@property
def suites(self):
Expand Down Expand Up @@ -108,11 +132,14 @@ def __repr__(self):
class CppBenchmarkRunner(BenchmarkRunner):
""" Run suites from a CMakeBuild. """

def __init__(self, build, benchmark_extras, **kwargs):
def __init__(self, build, benchmark_extras, run_id=None, results_dir=None,
**kwargs):
""" Initialize a CppBenchmarkRunner. """
super().__init__(**kwargs)
self.build = build
self.benchmark_extras = benchmark_extras
super().__init__(**kwargs)
self.run_id = run_id
self.results_dir = results_dir

@staticmethod
def default_configuration(**kwargs):
Expand Down Expand Up @@ -217,19 +244,33 @@ def from_rev_or_path(src, root, rev_or_path, cmake_conf, **kwargs):
build = CMakeBuild.from_path(rev_or_path)
return CppBenchmarkRunner(build, **kwargs)
else:
# Revisions can references remote via the `/` character, ensure
# that the revision is path friendly
path_rev = rev_or_path.replace("/", "_")
path_rev = _rev_or_path_dirname(src, rev_or_path)
root_rev = os.path.join(root, path_rev)
os.mkdir(root_rev)
os.makedirs(root_rev, exist_ok=True)

# Clone dir is reused when there is known revision (git sha)
# in <preserve-dir>/sha/arrow
clone_dir = os.path.join(root_rev, "arrow")
# Possibly checkout the sources at given revision, no need to
# perform cleanup on cloned repository as root_rev is reclaimed.
src_rev, _ = src.at_revision(rev_or_path, clone_dir)
if os.path.isdir(clone_dir):
src_rev = ArrowSources(clone_dir)
Comment thread
AntoinePrv marked this conversation as resolved.
else:
src_rev, _ = src.at_revision(rev_or_path, clone_dir)
cmake_def = CppCMakeDefinition(src_rev.cpp, cmake_conf)
build_dir = os.path.join(root_rev, "build")
return CppBenchmarkRunner(cmake_def.build(build_dir), **kwargs)
run_root = os.path.join(root_rev, "build")
os.makedirs(run_root, exist_ok=True)
# run_id is a path-safe ISO-8601 UTC timestamp down to the second
# so users get a brief idea of what run they are looking at, plus a
# random suffix to avoid collisions between runs in the same second.
now = datetime.datetime.now(datetime.timezone.utc)
run_id = now.strftime("%Y-%m-%dT%H-%M-%SZ") + \
f"-{random.randrange(16**8):08x}"
build_dir = os.path.join(run_root, run_id)
build = cmake_def.build(build_dir)
results_dir = os.path.join(root_rev, "bench", run_id)
os.makedirs(results_dir, exist_ok=True)
return CppBenchmarkRunner(
build, run_id=run_id, results_dir=results_dir, **kwargs
)


class JavaBenchmarkRunner(BenchmarkRunner):
Expand Down Expand Up @@ -310,16 +351,15 @@ def from_rev_or_path(src, root, rev_or_path, maven_conf, **kwargs):
maven_def = JavaMavenDefinition(rev_or_path, maven_conf)
return JavaBenchmarkRunner(maven_def.build(rev_or_path), **kwargs)
else:
# Revisions can references remote via the `/` character, ensure
# that the revision is path friendly
path_rev = rev_or_path.replace("/", "_")
path_rev = _rev_or_path_dirname(src, rev_or_path)
root_rev = os.path.join(root, path_rev)
os.mkdir(root_rev)
os.makedirs(root_rev, exist_ok=True)

clone_dir = os.path.join(root_rev, "arrow")
# Possibly checkout the sources at given revision, no need to
# perform cleanup on cloned repository as root_rev is reclaimed.
src_rev, _ = src.at_revision(rev_or_path, clone_dir)
if os.path.isdir(clone_dir):
src_rev = ArrowSources(clone_dir)
else:
src_rev, _ = src.at_revision(rev_or_path, clone_dir)
maven_def = JavaMavenDefinition(src_rev.java, maven_conf)
build_dir = os.path.join(root_rev, "arrow/java")
return JavaBenchmarkRunner(maven_def.build(build_dir), **kwargs)
76 changes: 61 additions & 15 deletions dev/archery/archery/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,12 @@

from io import StringIO
import click
import datetime
import json
import logging
import os
import pathlib
import platform
import sys

from .benchmark.codec import JsonEncoder
Expand All @@ -41,6 +43,27 @@
BOOL = ArrowBool()


def _run_metadata(ctx):
uname = platform.uname()
return {
"time": datetime.datetime.now(datetime.timezone.utc).isoformat(),
"cmd": {
"argv": sys.argv,
"params": ctx.params,
},
"machine_info": {
"platform": platform.platform(),
"system": uname.system,
"release": uname.release,
"version": uname.version,
"machine": platform.machine(),
"processor": platform.processor(),
"architecture": platform.architecture()[0],
"logical_cores": os.cpu_count(),
},
}


@click.group(context_settings={"help_option_names": ["-h", "--help"]})
@click.option("--debug", type=BOOL, is_flag=True, default=False,
envvar='ARCHERY_DEBUG',
Expand Down Expand Up @@ -305,6 +328,11 @@ def check_language(ctx, param, value):
click.option("--preserve", type=BOOL, default=False, show_default=True,
is_flag=True,
help="Preserve workspace for investigation."),
click.option("--preserve-dir", metavar="<path>",
type=click.Path(file_okay=False, resolve_path=True),
default=None,
help="Parent directory in which to create the preserved "
"workspace. Implies --preserve."),
click.option("--output", metavar="<output>",
type=click.File("w", encoding="utf8"), default=None,
help="Capture output result into file."),
Expand Down Expand Up @@ -347,12 +375,14 @@ def benchmark_filter_options(cmd):
default="WORKSPACE", required=False)
@benchmark_common_options
@click.pass_context
def benchmark_list(ctx, rev_or_path, src, preserve, output, cmake_extras,
java_home, java_options, build_extras, benchmark_extras,
cpp_benchmark_extras, language, **kwargs):
def benchmark_list(ctx, rev_or_path, src, preserve, preserve_dir, output,
cmake_extras, java_home, java_options, build_extras,
benchmark_extras, cpp_benchmark_extras, language, **kwargs):
""" List benchmark suite.
"""
with tmpdir(preserve=preserve) as root:
# A preserve_dir implies preserving the workspace.
preserve = preserve or preserve_dir is not None
with tmpdir(preserve=preserve, preserve_dir=preserve_dir) as root:
logger.debug(f"Running benchmark {rev_or_path}")

if language == "cpp":
Expand Down Expand Up @@ -392,10 +422,11 @@ def benchmark_list(ctx, rev_or_path, src, preserve, output, cmake_extras,
"Currently only supported for language=cpp. "
"[default: use runner-specific defaults]"))
@click.pass_context
def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras,
java_home, java_options, build_extras, benchmark_extras,
language, suite_filter, benchmark_filter, repetitions,
repetition_min_time, cpp_benchmark_extras, **kwargs):
def benchmark_run(ctx, rev_or_path, src, preserve, preserve_dir, output,
cmake_extras, java_home, java_options, build_extras,
benchmark_extras, language, suite_filter, benchmark_filter,
repetitions, repetition_min_time, cpp_benchmark_extras,
**kwargs):
""" Run benchmark suite.

This command will run the benchmark suite for a single build. This is
Expand Down Expand Up @@ -433,7 +464,9 @@ def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras,
\b
archery benchmark run --output=run.json
"""
with tmpdir(preserve=preserve) as root:
# A preserve_dir implies preserving the workspace.
preserve = preserve or preserve_dir is not None
with tmpdir(preserve=preserve, preserve_dir=preserve_dir) as root:
logger.debug(f"Running benchmark {rev_or_path}")

if language == "cpp":
Expand Down Expand Up @@ -465,6 +498,17 @@ def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras,
# when asked to JSON-serialize the results, so produce a JSON
# output even when none is requested.
json_out = json.dumps(runner_base, cls=JsonEncoder)
if runner_base.results_dir is not None:
results_path = os.path.join(runner_base.results_dir,
"benchmark.json")
with open(results_path, "w") as f:
Comment on lines 500 to +504
f.write(json_out)
# Store some run metadata to make it hard to confuse runs, for instance
# remembering what was the SIMD level set on this run.
metadata_path = os.path.join(runner_base.results_dir,
"metadata.json")
Comment thread
AntoinePrv marked this conversation as resolved.
with open(metadata_path, "w") as f:
json.dump(_run_metadata(ctx), f, indent=2, default=str)
if output is not None:
output.write(json_out)

Expand All @@ -486,11 +530,11 @@ def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras,
@click.argument("baseline", metavar="[<baseline>]]", default="origin/HEAD",
required=False)
@click.pass_context
def benchmark_diff(ctx, src, preserve, output, language, cmake_extras,
suite_filter, benchmark_filter, repetitions, no_counters,
java_home, java_options, build_extras, benchmark_extras,
cpp_benchmark_extras, threshold, contender, baseline,
**kwargs):
def benchmark_diff(ctx, src, preserve, preserve_dir, output, language,
cmake_extras, suite_filter, benchmark_filter, repetitions,
no_counters, java_home, java_options, build_extras,
benchmark_extras, cpp_benchmark_extras, threshold,
contender, baseline, **kwargs):
"""Compare (diff) benchmark runs.

This command acts like git-diff but for benchmark results.
Expand Down Expand Up @@ -564,7 +608,9 @@ def benchmark_diff(ctx, src, preserve, output, language, cmake_extras,
# This should not recompute the benchmark from run.json
archery --quiet benchmark diff WORKSPACE run.json > result.json
"""
with tmpdir(preserve=preserve) as root:
# A preserve_dir implies preserving the workspace.
preserve = preserve or preserve_dir is not None
with tmpdir(preserve=preserve, preserve_dir=preserve_dir) as root:
logger.debug(f"Comparing {contender} (contender) with {baseline} (baseline)")

if language == "cpp":
Expand Down
8 changes: 6 additions & 2 deletions dev/archery/archery/utils/tmpdir.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,17 @@
# specific language governing permissions and limitations
# under the License.

import os
from contextlib import contextmanager
from tempfile import mkdtemp, TemporaryDirectory


@contextmanager
def tmpdir(preserve=False, prefix="arrow-archery-"):
if preserve:
def tmpdir(preserve=False, prefix="arrow-archery-", preserve_dir=None):
if preserve and preserve_dir is not None:
os.makedirs(preserve_dir, exist_ok=True)
yield preserve_dir
elif preserve:
yield mkdtemp(prefix=prefix)
else:
with TemporaryDirectory(prefix=prefix) as tmp:
Expand Down
Loading